[med-svn] [iqtree] 01/02: Imported Upstream version 1.3.8+dfsg

Mon Aug 31 08:57:30 UTC 2015

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository iqtree.

commit 8bc47bab876f13c9b173edbdc6e175d38efdd0b5
Author: Andreas Tille <tille at debian.org>
Date:   Mon Aug 31 10:55:00 2015 +0200

    Imported Upstream version 1.3.8+dfsg
---
 CMakeLists.txt                              |  537 ++
 Documents/iqtree-manual-1.0.pdf             |  Bin 0 -> 158099 bytes
 License.txt                                 |  340 ++
 alignment.cpp                               | 3446 +++++++++++
 alignment.h                                 |  694 +++
 alignmentpairwise.cpp                       |  319 +
 alignmentpairwise.h                         |  100 +
 bionj.h                                     |  790 +++
 candidateset.cpp                            |  298 +
 candidateset.h                              |  287 +
 checkpoint.cpp                              |  169 +
 checkpoint.h                                |   65 +
 circularnetwork.cpp                         |  622 ++
 circularnetwork.h                           |  253 +
 ecopd.cpp                                   | 1400 +++++
 ecopd.h                                     |  276 +
 ecopdmtreeset.cpp                           |   36 +
 ecopdmtreeset.h                             |   35 +
 eigendecomposition.cpp                      | 1121 ++++
 eigendecomposition.h                        |  185 +
 fmemopen.c                                  |  147 +
 fmemopen.h                                  |   60 +
 graph.cpp                                   |   63 +
 graph.h                                     |   29 +
 greedy.cpp                                  |  202 +
 greedy.h                                    |  112 +
 gss.cpp                                     |  345 ++
 gss.h                                       |   96 +
 guidedbootstrap.cpp                         | 1301 ++++
 guidedbootstrap.h                           |   95 +
 gurobiwrapper.cpp                           |  135 +
 gurobiwrapper.h                             |   42 +
 gzstream.cpp                                |  170 +
 gzstream.h                                  |  124 +
 hashsplitset.cpp                            |   72 +
 hashsplitset.h                              |  128 +
 iqtree.cpp                                  | 3207 ++++++++++
 iqtree.h                                    |  822 +++
 iqtree_config.h.in                          |   14 +
 lbfgsb/CMakeLists.txt                       |    3 +
 lbfgsb/lbfgsb_new.cpp                       | 4605 ++++++++++++++
 lbfgsb/lbfgsb_new.h                         |  232 +
 likelihood.c                                |  176 +
 lpwrapper.c                                 |  104 +
 lpwrapper.h                                 |   56 +
 maalignment.cpp                             |  253 +
 maalignment.h                               |  116 +
 matree.cpp                                  |  192 +
 matree.h                                    |   97 +
 mexttree.cpp                                |  498 ++
 mexttree.h                                  |  161 +
 model/CMakeLists.txt                        |   23 +
 model/modelbin.cpp                          |   44 +
 model/modelbin.h                            |   55 +
 model/modelcodon.cpp                        |  851 +++
 model/modelcodon.h                          |  179 +
 model/modelcodonempirical.cpp               |  270 +
 model/modelcodonempirical.h                 |   48 +
 model/modelcodonparametric.cpp              |  113 +
 model/modelcodonparametric.h                |   56 +
 model/modelcodonsemiempirical.cpp           |   31 +
 model/modelcodonsemiempirical.h             |   40 +
 model/modeldna.cpp                          |  409 ++
 model/modeldna.h                            |  121 +
 model/modelfactory.cpp                      | 1025 ++++
 model/modelfactory.h                        |  237 +
 model/modelgtr.cpp                          |  789 +++
 model/modelgtr.h                            |  341 ++
 model/modelmixture.cpp                      | 1481 +++++
 model/modelmixture.h                        |  145 +
 model/modelmorphology.cpp                   |   39 +
 model/modelmorphology.h                     |   45 +
 model/modelnonrev.cpp                       |  258 +
 model/modelnonrev.h                         |   92 +
 model/modelpomo.cpp                         |   18 +
 model/modelpomo.h                           |   19 +
 model/modelprotein.cpp                      | 3242 ++++++++++
 model/modelprotein.h                        |   62 +
 model/models.nex                            |  955 +++
 model/modelset.cpp                          |  127 +
 model/modelset.h                            |  182 +
 model/modelsubst.cpp                        |  183 +
 model/modelsubst.h                          |  299 +
 model/partitionmodel.cpp                    |  113 +
 model/partitionmodel.h                      |   66 +
 model/ratefree.cpp                          |  567 ++
 model/ratefree.h                            |  139 +
 model/ratefreeinvar.cpp                     |   82 +
 model/ratefreeinvar.h                       |  116 +
 model/rategamma.cpp                         |  435 ++
 model/rategamma.h                           |  284 +
 model/rategammainvar.cpp                    |  186 +
 model/rategammainvar.h                      |  145 +
 model/rateheterogeneity.cpp                 |   96 +
 model/rateheterogeneity.h                   |  247 +
 model/rateinvar.cpp                         |   98 +
 model/rateinvar.h                           |  154 +
 model/ratekategory.cpp                      |  181 +
 model/ratekategory.h                        |  132 +
 model/ratemeyerdiscrete.cpp                 |  536 ++
 model/ratemeyerdiscrete.h                   |  166 +
 model/ratemeyerhaeseler.cpp                 |  496 ++
 model/ratemeyerhaeseler.h                   |  162 +
 modelsblock.cpp                             |   90 +
 modelsblock.h                               |   63 +
 mpdablock.cpp                               |  393 ++
 mpdablock.h                                 |  153 +
 msetsblock.cpp                              |  283 +
 msetsblock.h                                |  145 +
 msplitsblock.cpp                            |  318 +
 msplitsblock.h                              |  106 +
 mtree.cpp                                   | 2037 +++++++
 mtree.h                                     |  788 +++
 mtreeset.cpp                                |  760 +++
 mtreeset.h                                  |  181 +
 myreader.h                                  |  144 +
 ncbitree.cpp                                |  255 +
 ncbitree.h                                  |   97 +
 ncl/CMakeLists.txt                          |   17 +
 ncl/ncl.h                                   |  108 +
 ncl/nxsassumptionsblock.cpp                 |  535 ++
 ncl/nxsassumptionsblock.h                   |   89 +
 ncl/nxsblock.cpp                            |  193 +
 ncl/nxsblock.h                              |   74 +
 ncl/nxscharactersblock.cpp                  | 2951 +++++++++
 ncl/nxscharactersblock.h                    |  800 +++
 ncl/nxsdatablock.cpp                        |   54 +
 ncl/nxsdatablock.h                          |   39 +
 ncl/nxsdefs.h                               |   79 +
 ncl/nxsdiscretedatum.cpp                    |   88 +
 ncl/nxsdiscretedatum.h                      |   71 +
 ncl/nxsdiscretematrix.cpp                   |  637 ++
 ncl/nxsdiscretematrix.h                     |   90 +
 ncl/nxsdistancedatum.cpp                    |   36 +
 ncl/nxsdistancedatum.h                      |   44 +
 ncl/nxsdistancesblock.cpp                   |  896 +++
 ncl/nxsdistancesblock.h                     |  128 +
 ncl/nxsemptyblock.cpp                       |  174 +
 ncl/nxsemptyblock.h                         |   77 +
 ncl/nxsexception.cpp                        |   49 +
 ncl/nxsexception.h                          |   42 +
 ncl/nxsindent.h                             |   56 +
 ncl/nxsreader.cpp                           |  492 ++
 ncl/nxsreader.h                             |   77 +
 ncl/nxssetreader.cpp                        |  273 +
 ncl/nxssetreader.h                          |   79 +
 ncl/nxsstring.cpp                           |  877 +++
 ncl/nxsstring.h                             |  610 ++
 ncl/nxstaxablock.cpp                        |  352 ++
 ncl/nxstaxablock.h                          |   69 +
 ncl/nxstoken.cpp                            |  622 ++
 ncl/nxstoken.h                              |  533 ++
 ncl/nxstreesblock.cpp                       |  557 ++
 ncl/nxstreesblock.h                         |   83 +
 ngs.cpp                                     | 1226 ++++
 ngs.h                                       |  430 ++
 node.cpp                                    |  249 +
 node.h                                      |  416 ++
 optimization.cpp                            | 1083 ++++
 optimization.h                              |  195 +
 parsmultistate.cpp                          |   36 +
 parsmultistate.h                            |   28 +
 pattern.cpp                                 |   59 +
 pattern.h                                   |   74 +
 pda.cpp                                     | 2400 ++++++++
 pdnetwork.cpp                               | 1964 ++++++
 pdnetwork.h                                 |  416 ++
 pdtree.cpp                                  |  400 ++
 pdtree.h                                    |  171 +
 pdtreeset.cpp                               |  135 +
 pdtreeset.h                                 |   94 +
 phyloanalysis.cpp                           | 2760 +++++++++
 phyloanalysis.h                             |   92 +
 phylokernel.h                               | 1535 +++++
 phylokernelmixrate.h                        | 1113 ++++
 phylokernelmixture.h                        | 1197 ++++
 phylolib.h                                  |   26 +
 phylonode.cpp                               |   94 +
 phylonode.h                                 |  201 +
 phylosupertree.cpp                          | 1449 +++++
 phylosupertree.h                            |  322 +
 phylosupertreeplen.cpp                      | 2039 +++++++
 phylosupertreeplen.h                        |  322 +
 phylotesting.cpp                            | 2029 +++++++
 phylotesting.h                              |  113 +
 phylotree.cpp                               | 5076 ++++++++++++++++
 phylotree.h                                 | 1774 ++++++
 phylotreeavx.cpp                            |  109 +
 phylotreeeigen.cpp                          |   11 +
 phylotreepars.cpp                           |  463 ++
 phylotreesse.cpp                            | 2743 +++++++++
 pllnni.cpp                                  | 1094 ++++
 pllnni.h                                    |  254 +
 pllrepo/AUTHORS                             |    0
 pllrepo/COPYING                             |  674 +++
 pllrepo/ChangeLog                           |    0
 pllrepo/Doxyfile                            | 2299 +++++++
 pllrepo/INSTALL                             |  370 ++
 pllrepo/Makefile.am                         |    7 +
 pllrepo/NEWS                                |    0
 pllrepo/README                              |    0
 pllrepo/configure.ac                        |  123 +
 pllrepo/sources.am                          |    2 +
 pllrepo/src/CMakeLists.txt                  |   67 +
 pllrepo/src/Makefile.ALL                    |   54 +
 pllrepo/src/Makefile.ARM                    |   51 +
 pllrepo/src/Makefile.AVX                    |   60 +
 pllrepo/src/Makefile.AVX-MPI                |   59 +
 pllrepo/src/Makefile.AVX-PTHREADS           |   61 +
 pllrepo/src/Makefile.AVX.clang              |   57 +
 pllrepo/src/Makefile.AVX.shared             |   68 +
 pllrepo/src/Makefile.MIC-PTHREADS           |   62 +
 pllrepo/src/Makefile.SSE3                   |   52 +
 pllrepo/src/Makefile.SSE3-MPI               |   50 +
 pllrepo/src/Makefile.SSE3-PTHREADS          |   52 +
 pllrepo/src/Makefile.am                     |   53 +
 pllrepo/src/alignment.c                     |  754 +++
 pllrepo/src/avxLikelihood.c                 | 4111 +++++++++++++
 pllrepo/src/bipartitionList.c               |  434 ++
 pllrepo/src/cycle.h                         |  516 ++
 pllrepo/src/errcodes.h                      |   69 +
 pllrepo/src/evaluateGenericSpecial.c        | 3321 ++++++++++
 pllrepo/src/evaluatePartialGenericSpecial.c | 1378 +++++
 pllrepo/src/fastDNAparsimony.c              | 1941 ++++++
 pllrepo/src/genericParallelization.c        | 2283 +++++++
 pllrepo/src/genericParallelization.h        |  127 +
 pllrepo/src/globalVariables.h               |  170 +
 pllrepo/src/hardware.c                      |  165 +
 pllrepo/src/hardware.h                      |   48 +
 pllrepo/src/hash.c                          |  219 +
 pllrepo/src/hash.h                          |   50 +
 pllrepo/src/lexer.c                         |  299 +
 pllrepo/src/lexer.h                         |   88 +
 pllrepo/src/makenewzGenericSpecial.c        | 3145 ++++++++++
 pllrepo/src/mem_alloc.c                     |  228 +
 pllrepo/src/mem_alloc.h                     |   70 +
 pllrepo/src/mic_native.h                    |   56 +
 pllrepo/src/mic_native_aa.c                 | 1254 ++++
 pllrepo/src/mic_native_dna.c                |  676 +++
 pllrepo/src/models.c                        | 4377 ++++++++++++++
 pllrepo/src/newick.c                        |  583 ++
 pllrepo/src/newick.h                        |   61 +
 pllrepo/src/newviewGenericSpecial.c         | 8736 +++++++++++++++++++++++++++
 pllrepo/src/optimizeModel.c                 | 3145 ++++++++++
 pllrepo/src/parsePartition.c                |  388 ++
 pllrepo/src/parsePartition.h                |   51 +
 pllrepo/src/parsimony.c                     |  865 +++
 pllrepo/src/pll.h                           | 1692 ++++++
 pllrepo/src/pllInternal.h                   |  313 +
 pllrepo/src/pthread.h                       | 1368 +++++
 pllrepo/src/queue.c                         |   96 +
 pllrepo/src/queue.h                         |   48 +
 pllrepo/src/randomTree.c                    |  177 +
 pllrepo/src/recom.c                         |  689 +++
 pllrepo/src/restartHashTable.c              |  357 ++
 pllrepo/src/sched.h                         |  183 +
 pllrepo/src/searchAlgo.c                    | 3310 ++++++++++
 pllrepo/src/semaphore.h                     |  169 +
 pllrepo/src/ssort.c                         |  121 +
 pllrepo/src/stack.c                         |   85 +
 pllrepo/src/stack.h                         |   48 +
 pllrepo/src/topologies.c                    |  778 +++
 pllrepo/src/trash.c                         |  129 +
 pllrepo/src/treeIO.c                        |  236 +
 pllrepo/src/treeIO.h                        |   23 +
 pllrepo/src/utils.c                         | 3734 ++++++++++++
 pruning.cpp                                 |  183 +
 pruning.h                                   |  112 +
 split.cpp                                   |  572 ++
 split.h                                     |  298 +
 splitgraph.cpp                              |  703 +++
 splitgraph.h                                |  412 ++
 splitset.cpp                                |   59 +
 splitset.h                                  |   57 +
 stoprule.cpp                                |  501 ++
 stoprule.h                                  |  176 +
 superalignment.cpp                          |  610 ++
 superalignment.h                            |  229 +
 superalignmentpairwise.cpp                  |   74 +
 superalignmentpairwise.h                    |   66 +
 supernode.cpp                               |   56 +
 supernode.h                                 |  114 +
 timeutil.h                                  |  287 +
 tinatree.cpp                                |  141 +
 tinatree.h                                  |   61 +
 tools.cpp                                   | 3654 +++++++++++
 tools.h                                     | 2246 +++++++
 upperbounds.cpp                             |  907 +++
 upperbounds.h                               |   89 +
 vectorclass/CMakeLists.txt                  |    3 +
 vectorclass/changelog.txt                   |  110 +
 vectorclass/dispatch_example.cpp            |   99 +
 vectorclass/instrset.h                      |  203 +
 vectorclass/instrset_detect.cpp             |  153 +
 vectorclass/license.txt                     |  619 ++
 vectorclass/special.zip                     |  Bin 0 -> 34472 bytes
 vectorclass/vectorclass.h                   |   69 +
 vectorclass/vectorclass.pdf                 |  Bin 0 -> 466946 bytes
 vectorclass/vectorf128.h                    | 2619 ++++++++
 vectorclass/vectorf256.h                    | 3166 ++++++++++
 vectorclass/vectorf256e.h                   | 2069 +++++++
 vectorclass/vectorf512.h                    | 2366 ++++++++
 vectorclass/vectorf512e.h                   | 2127 +++++++
 vectorclass/vectori128.h                    | 6146 +++++++++++++++++++
 vectorclass/vectori256.h                    | 5591 +++++++++++++++++
 vectorclass/vectori256e.h                   | 4332 +++++++++++++
 vectorclass/vectori512.h                    | 2733 +++++++++
 vectorclass/vectori512e.h                   | 2545 ++++++++
 vectorclass/vectormath_common.h             |  310 +
 vectorclass/vectormath_exp.h                | 1995 ++++++
 vectorclass/vectormath_hyp.h                |  736 +++
 vectorclass/vectormath_lib.h                | 2107 +++++++
 vectorclass/vectormath_trig.h               | 1041 ++++
 whtest/CMakeLists.txt                       |   14 +
 whtest/eigen.c                              |  926 +++
 whtest/eigen.h                              |   33 +
 whtest/eigen_sym.c                          |  316 +
 whtest/eigen_sym.h                          |   48 +
 whtest/random.c                             |  356 ++
 whtest/random.h                             |   37 +
 whtest/weisslambda.c                        |   67 +
 whtest/weisslambda_sub.c                    |  369 ++
 whtest/weisslambda_sub.h                    |   53 +
 whtest/whtest.c                             |  658 ++
 whtest/whtest.h                             |   39 +
 whtest/whtest_sub.c                         | 1176 ++++
 whtest/whtest_sub.h                         |   85 +
 whtest/whtools.h                            |   40 +
 whtest_wrapper.cpp                          |   73 +
 whtest_wrapper.h                            |   29 +
 330 files changed, 207216 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..ccfa868
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,537 @@
+##################################################################
+# IQ-TREE cmake build definition
+# Copyright (c) 2012-2014 Bui Quang Minh, Lam Tung Nguyen
+##################################################################
+
+# Windows example usages:
+# cmake -G "Visual Studio 12" <source_dir>			(32-bit SSE3 version, compiled with MSVC)
+# cmake -G "Visual Studio 12 Win64" <source_dir>	(64-bit SSE3 version, compiled with MSVC)
+# cmake -G "Visual Studio 12 Win64" -T "Intel C++ Compiler XE 15.0" <source_dir>		(64-bit SSE3 version, compiled with ICC)
+# cmake -G "Visual Studio 12 Win64" -T "Intel C++ Compiler XE 15.0" -DIQTREE_FLAGS="avx" <source_dir>		(64-bit AVX version, compiled with ICC)
+# cmake -G "Visual Studio 12 Win64" -T "Intel C++ Compiler XE 15.0" -DIQTREE_FLAGS="omp" <source_dir>		(64-bit SSE3+OpenMP version, compiled with ICC)
+# cmake -G "Visual Studio 12 Win64" -T "Intel C++ Compiler XE 15.0" -DIQTREE_FLAGS="avx omp" <source_dir>	(64-bit AVX+OpenMP version, compiled with ICC)
+#
+# Linux example usages:
+# cmake <source_dir>						   (SSE3 version)
+# cmake -DIQTREE_FLAGS="avx" <source_dir>      (AVX version)
+# cmake -DIQTREE_FLAGS="omp" <source_dir>      (OpenMP version)
+# cmake -DIQTREE_FLAGS="omp avx" <source_dir>  (AVX+OpenMP version)
+# cmake -DIQTREE_FLAGS="m32" <source_dir>      (32-bit SSE3 version)
+# cmake -DIQTREE_FLAGS="m32 omp" <source_dir>  (32-bit SSE3+OpenMP version)
+#
+# Mac OSX example usages:
+# cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ <source_dir>							(SSE3 version)
+# cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DIQTREE_FLAGS="avx" <source_dir>		(AVX version)
+# To build the OpenMP version, one needs to download and compile Clang-OpenMP.
+# Assuming clang2 and clang2++ are the newly built compilers, run:
+# cmake -DCMAKE_C_COMPILER=clang2 -DCMAKE_CXX_COMPILER=clang2++ -DIQTREE_FLAGS="omp" <source_dir>		(SSE3+OpenMP version)
+# cmake -DCMAKE_C_COMPILER=clang2 -DCMAKE_CXX_COMPILER=clang2++ -DIQTREE_FLAGS="omp avx" <source_dir>	(AVX+OpenMP version)
+#
+
+
+cmake_minimum_required(VERSION 2.8)
+set(CMAKE_LEGACY_CYGWIN_WIN32 0)
+
+project(iqtree)
+add_definitions(-DIQ_TREE)
+# The version number.
+set (iqtree_VERSION_MAJOR 1)
+set (iqtree_VERSION_MINOR 3)
+set (iqtree_VERSION_PATCH 8) 
+
+set(BUILD_SHARED_LIBS OFF)
+
+message("IQ-TREE flags : ${IQTREE_FLAGS}")
+
+if (NOT CMAKE_BUILD_TYPE) 
+	set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+if (CMAKE_BUILD_TYPE STREQUAL "Release") # report the build mode for Release builds
+	message("Build mode    : Release")
+endif()
+
+include_directories("${PROJECT_SOURCE_DIR}")
+
+
+##################################################################
+# Detect target platforms
+##################################################################
+if (WIN32)
+	message("Target OS     : Windows")
+	# build as static binary to run on most machines
+    if (IQTREE_FLAGS MATCHES "static")
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
+    endif()
+    SET(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+    add_definitions(-DWIN32)
+elseif (APPLE) 
+	message("Target OS     : Mac OS X")
+	# to be compatible back to Mac OS X 10.6 (or 10.5 when the "oldmac" flag is given)
+	if (IQTREE_FLAGS MATCHES "oldmac") 
+		add_definitions("-mmacosx-version-min=10.5") 
+		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mmacosx-version-min=10.5")
+	else()
+		add_definitions("-mmacosx-version-min=10.6") 
+		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mmacosx-version-min=10.6")
+	endif()
+    SET(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
+elseif (UNIX) 
+	message("Target OS     : Unix")
+	# build as static binary to run on most machines
+    if (NOT IQTREE_FLAGS MATCHES "static")
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic")
+    else()
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
+    endif()
+else()
+	# Note that IQ-TREE has NOT been tested on other platforms
+	message("Target OS     : Unknown and untested yet")
+endif()
+
+##################################################################
+# Set up the compiler; currently supported: GCC, CLANG, MSVC, and ICC
+##################################################################
+
+set(GCC "FALSE")   #  GNU compiler
+set(CLANG "FALSE") # Clang compiler
+set(ICC "FALSE")   # Intel compiler
+set(VCC "FALSE")   # MS Visual C Compiler, note that it is different from MSVC variable
+
+if (CMAKE_COMPILER_IS_GNUCXX) 	
+	message("Compiler      : GNU Compiler (gcc)")
+	set(GCC "TRUE")
+#	set(COMBINED_FLAGS "-Wall -Wno-unused-function -Wno-sign-compare -pedantic -D_GNU_SOURCE -fms-extensions -Wno-deprecated")
+#	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++98")
+	set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g")
+	set(CMAKE_C_FLAGS_RELEASE "-O3 -g")
+elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+	message("Compiler      : Clang")
+	set(CLANG "TRUE")
+#	set(COMBINED_FLAGS "-Wall -Wno-unused-function -Wno-sign-compare -pedantic -D_GNU_SOURCE -Wno-nested-anon-types")
+	set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+	set(CMAKE_C_FLAGS_RELEASE "-O3")	
+elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+	set(VCC "TRUE")
+	message("Compiler      : MS Visual C++ Compiler")
+elseif (CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+	message("Compiler      : Intel C++ Compiler (icc)")
+	set(ICC "TRUE")
+	set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Qstd=c99")
+else()
+	message("Compiler      : Unknown and untested yet")
+endif()
+
+set(EXE_SUFFIX "")
+
+if (MSVC) 
+	# MS Visual Studio environment
+	message("Exporting MS Visual Studio projects...")
+	add_definitions(/MP) # enable multi-processor compilation
+	if (CMAKE_BUILD_TYPE STREQUAL "Release")
+		add_definitions(/Ot /Oi)
+		if (VCC)
+			add_definitions(/O2)
+		elseif (ICC)
+			add_definitions(/O3)
+		endif()
+	endif()
+endif()
+
+
+
+##################################################################
+# Configure PLL build
+##################################################################
+if (IQTREE_FLAGS MATCHES "pll")
+	add_definitions(-DUSING_PLL)
+	set(EXE_SUFFIX "${EXE_SUFFIX}-pll")
+endif()
+
+##################################################################
+# detect 32 or 64 bit binary (4-byte pointers or the "m32" flag select 32-bit)
+##################################################################
+set (BINARY32 "FALSE")
+if(CMAKE_SIZEOF_VOID_P EQUAL 4 OR IQTREE_FLAGS MATCHES "m32")
+	set(BINARY32 "TRUE")
+	message("Target binary : 32-bit")
+	if (CMAKE_GENERATOR MATCHES "Win64")
+		message(FATAL_ERROR "Both 32-bit and 64-bit mode cannot be specified") # abort: a Win64 generator contradicts a 32-bit target
+	endif()
+	SET(EXE_SUFFIX "${EXE_SUFFIX}32")
+	if (GCC OR CLANG) 
+		set(COMBINED_FLAGS "${COMBINED_FLAGS} -m32")
+  	endif()
+    add_definitions(-DBINARY32)
+else()
+	message("Target binary : 64-bit")
+endif()
+
+if(IQTREE_FLAGS MATCHES "novx") 
+    add_definitions(-D__NOAVX__)
+endif()
+
+##################################################################
+# configure OpenMP/PThreads compilation
+# change the executable name if compiled for OpenMP parallel version
+##################################################################
+if (IQTREE_FLAGS MATCHES "omp")
+	message("Parallel      : OpenMP/PThreads")
+	SET(EXE_SUFFIX "${EXE_SUFFIX}-omp")	
+	add_definitions(-D_USE_PTHREADS)
+	if (MSVC) 
+		add_definitions(/MT)
+	endif()
+	
+	if (VCC) 
+  		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+  		include_directories("${PROJECT_SOURCE_DIR}/pllrepo/src") # for PThreads headers 
+	elseif (ICC)
+  		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qopenmp")
+  		if (WIN32)
+  			include_directories("${PROJECT_SOURCE_DIR}/pllrepo/src") # for PThreads headers
+  		endif() 
+  	elseif (GCC)
+		set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
+  		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+  	elseif (CLANG) 
+  		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+  	endif()
+else()
+	message("Parallel      : None")
+endif()
+
+##################################################################
+# configure SSE/AVX/FMA instructions
+##################################################################
+
+SET(AVX_FLAGS "-D__AVX")
+if (VCC) 
+	set(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
+elseif (CLANG)
+	set(AVX_FLAGS "${AVX_FLAGS} -mavx")
+elseif (GCC)
+	set(AVX_FLAGS "${AVX_FLAGS} -mavx -fabi-version=0")
+elseif (ICC) 
+	if (WIN32)
+		 set(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
+	else()
+		 set(AVX_FLAGS "${AVX_FLAGS} -mavx")
+	endif()
+endif()
+
+SET(SSE_FLAGS "")
+if (VCC)
+	set(SSE_FLAGS "/arch:SSE2 -D__SSE3__")
+elseif (GCC OR CLANG)
+	set(SSE_FLAGS "-msse3")
+elseif (ICC)
+	if (WIN32)
+		set(SSE_FLAGS "/arch:SSE3")
+	else()
+		set(SSE_FLAGS "-msse3")
+	endif()
+endif()
+
+if (IQTREE_FLAGS MATCHES "fma") # AVX+FMA instruction set
+ 	message("Vectorization : AVX+FMA")
+	add_definitions(-D__SSE3 -D__AVX) # define both the SSE3 and AVX macros
+	if (VCC)
+		# Visual C++ has no dedicated FMA flag; FMA is only enabled as part of /arch:AVX2
+		set(COMBINED_FLAGS "${COMBINED_FLAGS} /arch:AVX2")
+	elseif (CLANG)
+		set(COMBINED_FLAGS "${COMBINED_FLAGS} -mavx -mfma")
+	elseif (GCC)
+		set(COMBINED_FLAGS "${COMBINED_FLAGS} -mavx -fabi-version=0 -mfma")
+	elseif (ICC) 
+		if (WIN32)
+			 set(COMBINED_FLAGS "${COMBINED_FLAGS} /arch:AVX /Qfma")
+		else()
+			 set(COMBINED_FLAGS "${COMBINED_FLAGS} -mavx -mfma")
+		endif()
+	endif()
+
+	SET(EXE_SUFFIX "${EXE_SUFFIX}-fma")
+
+elseif (IQTREE_FLAGS MATCHES "avx") # AVX instruction set
+ 	message("Vectorization : AVX")
+	add_definitions(-D__SSE3 -D__AVX) # define both the SSE3 and AVX macros
+	set(COMBINED_FLAGS "${COMBINED_FLAGS} ${AVX_FLAGS}")
+
+	SET(EXE_SUFFIX "${EXE_SUFFIX}-avx")
+else() # SSE instruction set (used when neither "fma" nor "avx" is requested)
+	message("Vectorization : SSE3")
+	add_definitions(-D__SSE3)
+
+endif()
+
+
+##################################################################
+# Setup compiler flags
+##################################################################
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMBINED_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMBINED_FLAGS}")
+
+if (CMAKE_BUILD_TYPE STREQUAL "Release")
+	message("C flags    : ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_RELEASE}") 
+	message("CXX flags  : ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}") 
+endif()
+
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+	message("C flags    : ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_DEBUG}") 
+	message("CXX flags  : ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}") 
+endif()
+
+set(CMAKE_CXX_FLAGS_PROFILE "-fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls -fno-default-inline -fno-inline -O0 -fno-omit-frame-pointer -pg")
+set(CMAKE_C_FLAGS_PROFILE "-fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls -O0 -fno-omit-frame-pointer -pg")
+
+if (GCC) 
+	set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-inline-functions -fno-inline-functions-called-once -fno-default-inline -fno-inline")
+	set(CMAKE_C_FLAGS_DEBUG "-O0 -g -fno-inline-functions -fno-inline-functions-called-once -fno-default-inline -fno-inline")
+	set(CMAKE_CXX_FLAGS_MEM "-g -O1")
+	set(CMAKE_C_FLAGS_MEM "-g -O1")
+elseif (CLANG)
+	set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fno-inline-functions -fno-inline")
+	set(CMAKE_C_FLAGS_DEBUG "-O0 -g -fno-inline-functions -fno-inline")
+	set(CMAKE_CXX_FLAGS_MEM "-g -O1")
+	set(CMAKE_C_FLAGS_MEM "-g -O1")
+endif()
+
+##################################################################
+# check existence of a few basic functions
+##################################################################
+include (${CMAKE_ROOT}/Modules/CheckFunctionExists.cmake)
+check_function_exists (gettimeofday HAVE_GETTIMEOFDAY)
+check_function_exists (getrusage HAVE_GETRUSAGE)
+check_function_exists (GlobalMemoryStatusEx HAVE_GLOBALMEMORYSTATUSEX)
+check_function_exists (strndup HAVE_STRNDUP)
+
+# configure a header file to pass some of the CMake settings
+# to the source code
+configure_file (
+  "${PROJECT_SOURCE_DIR}/iqtree_config.h.in"
+  "${PROJECT_BINARY_DIR}/iqtree_config.h"
+  )
+
+# add the binary tree to the search path for include files
+# so that we will find iqtree_config.h
+include_directories("${PROJECT_BINARY_DIR}")
+include_directories("${PROJECT_BINARY_DIR}/zlib-1.2.7")
+
+
+##################################################################
+# subdirectories containing necessary libraries for the build 
+##################################################################
+add_subdirectory(pllrepo/src)
+add_subdirectory(ncl)
+add_subdirectory(lbfgsb)
+add_subdirectory(whtest)
+add_subdirectory(sprng)
+add_subdirectory(zlib-1.2.7)
+add_subdirectory(vectorclass)
+add_subdirectory(model)
+
+##################################################################
+# the main executable
+##################################################################
+
+if (NOT BINARY32 AND NOT IQTREE_FLAGS MATCHES "novx") # skipped for 32-bit binaries and "novx" builds
+add_library(avxkernel phylotreeavx.cpp) # separate target; its COMPILE_FLAGS are set independently further down
+endif()
+
+add_executable(iqtree
+alignment.cpp
+alignmentpairwise.cpp
+circularnetwork.cpp
+eigendecomposition.cpp
+greedy.cpp
+gss.cpp
+guidedbootstrap.cpp
+gurobiwrapper.cpp
+gzstream.cpp
+hashsplitset.cpp
+iqtree.cpp
+maalignment.cpp
+matree.cpp
+mexttree.cpp
+mpdablock.cpp
+msetsblock.cpp
+msplitsblock.cpp
+modelsblock.cpp
+mtree.cpp
+mtreeset.cpp
+ncbitree.cpp
+ngs.cpp
+node.cpp
+optimization.cpp
+parsmultistate.cpp
+pattern.cpp
+pda.cpp
+pdnetwork.cpp
+pdtree.cpp
+pdtreeset.cpp
+phyloanalysis.cpp
+phylonode.cpp
+phylosupertree.cpp
+phylotree.cpp
+phylotreesse.cpp
+phylotreepars.cpp
+#phylotreeavx.cpp  -- not listed here: compiled into the avxkernel library above
+pruning.cpp
+split.cpp
+splitgraph.cpp
+splitset.cpp
+stoprule.cpp
+superalignment.cpp
+superalignmentpairwise.cpp
+supernode.cpp
+tinatree.cpp
+tools.cpp
+whtest_wrapper.cpp
+lpwrapper.c
+pllnni.cpp
+phylosupertreeplen.cpp
+phylotesting.cpp
+ecopd.cpp
+ecopdmtreeset.cpp
+graph.cpp
+candidateset.cpp
+checkpoint.cpp
+upperbounds.cpp
+)
+
+if (NOT (IQTREE_FLAGS MATCHES "avx" OR IQTREE_FLAGS MATCHES "fma")) # neither "avx" nor "fma" requested
+	set_property(TARGET iqtree pll ncl lbfgsb whtest zlibstatic sprng vectorclass model PROPERTY COMPILE_FLAGS "${SSE_FLAGS}")
+	if (NOT (BINARY32 OR IQTREE_FLAGS MATCHES "novx")) # same condition that creates avxkernel
+		set_property(TARGET avxkernel pllavx PROPERTY COMPILE_FLAGS "${AVX_FLAGS}")
+	endif()
+endif()
+
+##################################################################
+# setup linking flags
+##################################################################
+
+# platform library: ws2_32 on WIN32, m everywhere else
+set(PLATFORM_LIB "m")
+if (WIN32)
+	set(PLATFORM_LIB "ws2_32")
+endif()
+
+# threading library and extra library search paths, only for "omp" builds
+set(THREAD_LIB "")
+if (IQTREE_FLAGS MATCHES "omp")
+	link_directories(${PROJECT_SOURCE_DIR}/lib)
+	if (MSVC) # BINARY32 builds search lib32, all others lib
+		set(MSVC_LIB_DIR "${PROJECT_SOURCE_DIR}/lib")
+		if (BINARY32)
+			set(MSVC_LIB_DIR "${PROJECT_SOURCE_DIR}/lib32")
+		endif()
+		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LIBPATH:${MSVC_LIB_DIR}")
+		set(THREAD_LIB "pthreadVC2")
+	elseif(CLANG AND APPLE)
+		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${PROJECT_SOURCE_DIR}/lib")
+	endif()
+endif()
+
+# pllavx and avxkernel are linked under the same condition that creates avxkernel
+if (NOT (BINARY32 OR IQTREE_FLAGS MATCHES "novx"))
+	target_link_libraries(iqtree pll pllavx ncl lbfgsb whtest zlibstatic sprng vectorclass model avxkernel ${PLATFORM_LIB} ${THREAD_LIB})
+else()
+	target_link_libraries(iqtree pll ncl lbfgsb whtest zlibstatic sprng vectorclass model ${PLATFORM_LIB} ${THREAD_LIB})
+endif()
+
+##################################################################
+# setup the executable name
+##################################################################
+set_property(TARGET iqtree PROPERTY OUTPUT_NAME "iqtree${EXE_SUFFIX}")
+
+# strip the release build (GCC/Clang only, not necessary for MSVC); WIN32 calls plain "strip"
+if (CMAKE_BUILD_TYPE STREQUAL "Release" AND (GCC OR CLANG))
+	set(STRIP_TOOL ${CMAKE_STRIP})
+	if (WIN32)
+		set(STRIP_TOOL strip)
+	endif()
+	add_custom_command(TARGET iqtree POST_BUILD COMMAND ${STRIP_TOOL} $<TARGET_FILE:iqtree>)
+endif()
+
+# directory the install rules read the "-click" executable from (Release subdirectory under MSVC)
+set (BINARY_DIR "${PROJECT_BINARY_DIR}")
+if (MSVC)
+	set (BINARY_DIR "${PROJECT_BINARY_DIR}/Release")
+endif()
+
+# On WIN32 also create a "-click" copy of the executable in the same directory.
+# "cmake -E copy" plus generator expressions replace the cmd.exe builtin "copy"
+# and the hard-coded Release path, so any shell and build configuration works.
+if (WIN32)
+	ADD_CUSTOM_COMMAND(TARGET iqtree POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy
+		"$<TARGET_FILE:iqtree>" "$<TARGET_FILE_DIR:iqtree>/iqtree${EXE_SUFFIX}-click.exe")
+endif()
+
+##############################################################
+# add the install targets
+##############################################################
+install (TARGETS iqtree DESTINATION bin)
+install (FILES "${PROJECT_SOURCE_DIR}/model/models.nex" DESTINATION bin)
+install (FILES "${PROJECT_SOURCE_DIR}/examples/example.phy"
+               "${PROJECT_SOURCE_DIR}/Documents/iqtree-manual-1.0.pdf" DESTINATION .)
+
+if (WIN32)
+	install (FILES "${BINARY_DIR}/iqtree${EXE_SUFFIX}-click.exe" DESTINATION bin)
+endif()
+if (WIN32 AND MSVC AND EXE_SUFFIX MATCHES "omp")
+	# pthreadVC2.dll comes from lib32 for BINARY32 builds, otherwise from lib
+	set (PTHREAD_DLL_DIR "${PROJECT_SOURCE_DIR}/lib")
+	if (BINARY32)
+		set (PTHREAD_DLL_DIR "${PROJECT_SOURCE_DIR}/lib32")
+	endif()
+	install (FILES "${PTHREAD_DLL_DIR}/pthreadVC2.dll" DESTINATION bin)
+	install (FILES "${PROJECT_SOURCE_DIR}/lib/libiomp5md.dll" DESTINATION bin)
+	# pthreadGC2.dll and pthreadGC2_64.dll from lib are currently not installed (rules disabled)
+endif()
+
+##############################################################
+# build a CPack driven installer package
+##############################################################
+include (InstallRequiredSystemLibraries)
+set (CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/License.txt")
+# package version comes from the iqtree_VERSION_* variables
+set (CPACK_PACKAGE_VERSION_MAJOR "${iqtree_VERSION_MAJOR}")
+set (CPACK_PACKAGE_VERSION_MINOR "${iqtree_VERSION_MINOR}")
+set (CPACK_PACKAGE_VERSION_PATCH "${iqtree_VERSION_PATCH}")
+# ZIP archives when WIN32 or APPLE is set, TGZ otherwise (binary and source packages alike)
+set (ARCHIVE_FORMAT "TGZ")
+if (WIN32 OR APPLE)
+	set (ARCHIVE_FORMAT "ZIP")
+endif()
+set (CPACK_GENERATOR "${ARCHIVE_FORMAT}")
+set (CPACK_SOURCE_GENERATOR "${ARCHIVE_FORMAT}")
+
+#set(CPACK_SOURCE_PACKAGE_FILE_NAME
+#  "${CMAKE_PROJECT_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}")
+set(CPACK_SOURCE_IGNORE_FILES
+  "/build.*/;/debug.*/;/examples/;/test_scripts/;/manual/;/.bzr/;~$;/\\\\.svn/;/\\\\.git/;/pll/;/pllrepo.dox/;/pllrepo.examples/;/pllrepo.figures/;/pllrepo.legacy/;/pllrepo.m4/;/pllrepo.man/;/pllrepo.MPI/;/pllrepo.sMSA/;/pllrepo.testdata/;${CPACK_SOURCE_IGNORE_FILES}")
+
+set (SYSTEM_NAME "${CMAKE_SYSTEM_NAME}") # platform label used in the package file name
+if (CMAKE_SYSTEM_NAME STREQUAL "Darwin") # bare variable name: if() dereferences it itself
+	if (IQTREE_FLAGS MATCHES "oldmac")
+		set (SYSTEM_NAME "MacOS10.5")
+	else()
+		set (SYSTEM_NAME "MacOSX")
+	endif()
+endif()
+
+set (PROJECT_NAME_SUFFIX "${EXE_SUFFIX}")
+#if (NOT IQTREE_FLAGS MATCHES "omp" AND NOT IQTREE_FLAGS MATCHES "avx" AND NOT IQTREE_FLAGS MATCHES "fma")  
+#	set (PROJECT_NAME_SUFFIX "${PROJECT_NAME_SUFFIX}-sse") 
+#endif()
+
+# resulting name: <project><suffix>-<major>.<minor>.<patch>-<system>
+set(CPACK_PACKAGE_FILE_NAME
+	"${CMAKE_PROJECT_NAME}${PROJECT_NAME_SUFFIX}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}-${SYSTEM_NAME}")
+
+set(CPACK_STRIP_FILES TRUE)
+include (CPack) # must come after all CPACK_* settings above
+
+#add_custom_target(dist COMMAND ${CMAKE_MAKE_PROGRAM} package_source)
diff --git a/Documents/iqtree-manual-1.0.pdf b/Documents/iqtree-manual-1.0.pdf
new file mode 100644
index 0000000..1a6e27a
Binary files /dev/null and b/Documents/iqtree-manual-1.0.pdf differ
diff --git a/License.txt b/License.txt
new file mode 100644
index 0000000..5b6e7c6
--- /dev/null
+++ b/License.txt
@@ -0,0 +1,340 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+

+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+

+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+

+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+

+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+

+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/alignment.cpp b/alignment.cpp
new file mode 100644
index 0000000..e90cc8d
--- /dev/null
+++ b/alignment.cpp
@@ -0,0 +1,3446 @@
+//
+// C++ Implementation: alignment
+//
+// Description:
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#include "alignment.h"
+#include "myreader.h"
+#include <numeric>
+#include <sstream>
+#include "model/rategamma.h"
+using namespace std;
+
+// Alphabets of the supported data types. The position of a symbol inside
+// its string is used as the numeric state ID when characters are converted.
+char symbols_protein[] = "ARNDCQEGHILKMFPSTWYVX"; // X for unknown AA
+char symbols_dna[]     = "ACGT";
+char symbols_rna[]     = "ACGU";
+//char symbols_binary[]  = "01";
+char symbols_morph[] = "0123456789ABCDEFGHIJKLMNOPQRSTUV";
+// genetic code from tri-nucleotides (AAA, AAC, AAG, AAT, ..., TTT) to amino-acids
+// Each table below has one entry per codon, in the order given by the
+// Base1/Base2/Base3 rows; '*' marks a stop codon.
+// Source: http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
+// Base1:                AAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTT
+// Base2:                AAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTTAAAACCCCGGGGTTTT
+// Base3:                ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
+char genetic_code1[]  = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"; // Standard
+char genetic_code2[]  = "KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Vertebrate Mitochondrial
+char genetic_code3[]  = "KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Yeast Mitochondrial
+char genetic_code4[]  = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Mold, Protozoan, etc.
+char genetic_code5[]  = "KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Invertebrate Mitochondrial
+char genetic_code6[]  = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSS*CWCLFLF"; // Ciliate, Dasycladacean and Hexamita Nuclear
+// note: tables 7 and 8 are not available in NCBI
+char genetic_code9[]  = "NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Echinoderm and Flatworm Mitochondrial
+char genetic_code10[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSCCWCLFLF"; // Euplotid Nuclear
+char genetic_code11[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"; // Bacterial, Archaeal and Plant Plastid
+char genetic_code12[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLSLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF"; // Alternative Yeast Nuclear
+char genetic_code13[] = "KNKNTTTTGSGSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Ascidian Mitochondrial
+char genetic_code14[] = "NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF"; // Alternative Flatworm Mitochondrial
+char genetic_code15[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YQYSSSS*CWCLFLF"; // Blepharisma Nuclear
+char genetic_code16[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLYSSSS*CWCLFLF"; // Chlorophycean Mitochondrial
+// note: tables 17-20 are not available in NCBI
+char genetic_code21[] = "NNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Trematode Mitochondrial
+char genetic_code22[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLY*SSS*CWCLFLF"; // Scenedesmus obliquus mitochondrial
+char genetic_code23[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWC*FLF"; // Thraustochytrium Mitochondrial
+char genetic_code24[] = "KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF"; // Pterobranchia mitochondrial
+char genetic_code25[] = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF"; // Candidate Division SR1 and Gracilibacteria
+
+/**
+    Default constructor: creates an empty alignment with an unknown sequence type.
+*/
+Alignment::Alignment()
+        : vector<Pattern>()
+{
+    // nothing has been read yet
+    seq_type = SEQ_UNKNOWN;
+    num_states = 0;
+    frac_const_sites = 0.0;
+    // initial code for the unknown state; computeUnknownState() assigns a type-specific one
+    STATE_UNKNOWN = 126;
+    // optional tables are not set yet
+    genetic_code = NULL;
+    pars_lower_bound = NULL;
+}
+
+/**
+    @param i index of the sequence, must satisfy 0 <= i < number of sequence names
+    @return reference to the name of sequence i
+*/
+string &Alignment::getSeqName(int i) {
+    const int num_names = (int)seq_names.size();
+    assert(0 <= i && i < num_names);
+    return seq_names[i];
+}
+
+/**
+    @return reference to the vector holding all sequence names
+*/
+vector<string>& Alignment::getSeqNames() {
+    vector<string> &all_names = seq_names;
+    return all_names;
+}
+
+/**
+    Linear search for a sequence by name.
+    @param seq_name name to look for
+    @return index of the first sequence with that name, or -1 if there is none
+*/
+int Alignment::getSeqID(string &seq_name) {
+    int id = 0;
+    while (id < getNSeq()) {
+        if (getSeqName(id) == seq_name)
+            return id;
+        id++;
+    }
+    return -1;
+}
+
+/**
+    @return length of the longest sequence name, 0 if there are no sequences
+*/
+int Alignment::getMaxSeqNameLength() {
+    int longest = 0;
+    for (int id = 0; id < getNSeq(); id++) {
+        int cur_len = getSeqName(id).length();
+        if (cur_len > longest)
+            longest = cur_len;
+    }
+    return longest;
+}
+
+/**
+   p-value of a chi-square statistic: probability that the observed
+   chi-square exceeds chi2 even if the model is correct.
+   Computed as one minus the incomplete gamma value returned by
+   RateGamma::cmpIncompleteGamma for (chi2/2, deg/2).
+   @param deg degree of freedom
+   @param chi2 chi-square value
+   @return p-value
+   */
+double chi2prob (int deg, double chi2)
+{
+    double half_deg = 0.5*deg;
+    double half_chi2 = 0.5*chi2;
+    double ln_gamma = RateGamma::cmpLnGamma(half_deg);
+    double inc_gamma = RateGamma::cmpIncompleteGamma (half_chi2, half_deg, ln_gamma);
+    return 1.0-inc_gamma;
+} /* chi2prob */
+
+
+/**
+    Sanitize and validate the sequence names, then print a per-sequence report.
+
+    1. Every character of a name that is not alphanumeric, '_', '-' or '.'
+       is replaced by '_'; all renamings are reported in one warning.
+    2. Duplicated names are listed on stdout and then reported via outError().
+    3. For every sequence the percentage of gap/ambiguous characters and the
+       result of a chi-square test of its state composition against the
+       alignment-wide state frequencies (df = num_states-1, failure if the
+       p-value is below 0.05) are printed, followed by a summary line.
+*/
+void Alignment::checkSeqName() {
+    ostringstream warn_str;
+    StrVector::iterator it;
+    for (it = seq_names.begin(); it != seq_names.end(); it++) {
+        string orig_name = (*it);
+        for (string::iterator i = it->begin(); i != it->end(); i++) {
+            // cast to unsigned char: calling isalnum() with a negative char value
+            // (e.g. a non-ASCII byte in the name) is undefined behaviour
+            if (!isalnum((unsigned char)(*i)) && (*i) != '_' && (*i) != '-' && (*i) != '.') {
+                (*i) = '_';
+            }
+        }
+        if (orig_name != (*it))
+            warn_str << orig_name << " -> " << (*it) << endl;
+    }
+    if (warn_str.str() != "") {
+        string str = "Some sequence names are changed as follows:\n";
+        outWarning(str + warn_str.str());
+    }
+    // now check that sequence names are different: after sorting a copy,
+    // duplicates are adjacent
+    StrVector names;
+    names.insert(names.begin(), seq_names.begin(), seq_names.end());
+    sort(names.begin(), names.end());
+    bool ok = true;
+    for (it = names.begin(); it != names.end(); it++) {
+        if (it+1==names.end()) break;
+        if (*it == *(it+1)) {
+            cout << "ERROR: Duplicated sequence name " << *it << endl;
+            ok = false;
+        }
+    }
+    if (!ok) outError("Please rename sequences listed above!");
+
+    // state_freq: alignment-wide state frequencies
+    // freq_per_sequence: scratch buffer holding the frequencies of the current sequence
+    // count_per_seq: state counts, laid out as [sequence*num_states + state]
+    double *state_freq = new double[num_states];
+    double *freq_per_sequence = new double[num_states];
+    unsigned *count_per_seq = new unsigned[num_states*getNSeq()];
+    computeStateFreq(state_freq);
+    countStatePerSequence(count_per_seq);
+
+    {
+        int max_len = getMaxSeqNameLength()+1;
+        cout.width(max_len+14);
+        cout << right << "Gap/Ambiguity" << "  Composition  p-value"<< endl;
+        int num_problem_seq = 0;
+        int total_gaps = 0;
+        cout.precision(2);
+        int num_failed = 0;
+        for (int i = 0; i < seq_names.size(); i++) {
+            int j;
+            int num_gaps = getNSite() - countProperChar(i);
+            total_gaps += num_gaps;
+            double percent_gaps = ((double)num_gaps / getNSite())*100.0;
+            cout.width(4);
+            cout << right << i+1 << "  ";
+            cout.width(max_len);
+            cout << left << seq_names[i] << " ";
+            cout.width(6);
+            cout << right << percent_gaps << "%";
+            if (percent_gaps > 50) {
+                num_problem_seq++;
+            }
+
+            // chi-square statistic of this sequence's composition against the overall one
+            // NOTE(review): if a sequence has no counted state (sum_count == 0) or an
+            // overall frequency is 0, the divisions below yield inf/NaN - confirm
+            // whether callers can reach this with such data
+            double chi2 = 0.0;
+            unsigned sum_count = 0;
+            for (j = 0; j < num_states; j++)
+                sum_count += count_per_seq[i*num_states+j];
+            double sum_inv = 1.0/sum_count;
+            for (j = 0; j < num_states; j++)
+                freq_per_sequence[j] = count_per_seq[i*num_states+j]*sum_inv;
+            for (j = 0; j < num_states; j++)
+                chi2 += (state_freq[j] - freq_per_sequence[j]) * (state_freq[j] - freq_per_sequence[j]) / state_freq[j];
+
+            // scale by the number of counted characters of this sequence
+            chi2 *= sum_count;
+            double pvalue = chi2prob(num_states-1, chi2);
+            if (pvalue < 0.05) {
+                cout << "    failed ";
+                num_failed++;
+            } else
+                cout << "    passed ";
+            cout.width(9);
+            cout << right << pvalue*100 << "%";
+            cout << endl;
+        }
+        if (num_problem_seq) cout << "WARNING: " << num_problem_seq << " sequences contain more than 50% gaps/ambiguity" << endl;
+        cout << "**** ";
+        cout.width(max_len+2);
+        cout << left << " TOTAL  ";
+        cout.width(6);
+        cout << right << ((double)total_gaps/getNSite())/getNSeq()*100 << "% ";
+        cout << " " << num_failed << " sequences failed composition chi2 test (p-value<5%; df=" << num_states-1 << ")" << endl;
+        cout.precision(3);
+    }
+    delete [] count_per_seq;
+    delete [] freq_per_sequence;
+    delete [] state_freq;
+}
+
+/**
+    Report groups of identical sequences on stderr.
+    Two sequences are identical if they carry the same state in every pattern.
+    @return number of sequences that duplicate an earlier sequence
+*/
+int Alignment::checkIdenticalSeq()
+{
+    int num_identical = 0;
+    IntVector checked;
+    checked.resize(getNSeq(), 0);
+    for (int seq1 = 0; seq1 < getNSeq(); seq1++) {
+        // sequences already reported as a duplicate do not start a new group
+        if (checked[seq1]) continue;
+        bool found_dup = false;
+        for (int seq2 = seq1+1; seq2 < getNSeq(); seq2++) {
+            // advance while both sequences agree
+            iterator it = begin();
+            while (it != end() && !((*it)[seq1] != (*it)[seq2]))
+                it++;
+            if (it != end()) continue; // they differ in some pattern
+            if (!found_dup)
+                cerr << "WARNING: Identical sequences " << getSeqName(seq1); 
+            cerr << ", " << getSeqName(seq2);
+            num_identical++;
+            checked[seq2] = 1;
+            found_dup = true;
+        }
+        checked[seq1] = 1;
+        if (found_dup) cerr << endl;
+    }
+    if (num_identical)
+        outWarning("Some identical sequences found that should be discarded before the analysis");
+    return num_identical;
+}
+
+/**
+    Build an alignment without sequences that duplicate an earlier sequence.
+    @param not_remove name of a sequence that is never removed
+    @param keep_two if true, the first duplicate found for each sequence is kept,
+           so that two copies of identical sequences remain
+    @param[in,out] removed_seqs names of removed sequences are appended here
+    @param[in,out] target_seqs for every removed sequence, the name of the identical
+           sequence it was compared against is appended here (parallel to removed_seqs)
+    @return this object if nothing was removed, otherwise a newly allocated
+            alignment containing the remaining sequences
+*/
+Alignment *Alignment::removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs)
+{
+    IntVector checked;
+    vector<bool> removed;
+    checked.resize(getNSeq(), 0);
+    removed.resize(getNSeq(), false);
+    int seq1;
+
+	for (seq1 = 0; seq1 < getNSeq(); seq1++) {
+        // a sequence already matched to an earlier one does not start a new group
+        if (checked[seq1]) continue;
+        bool first_ident_seq = true;
+		for (int seq2 = seq1+1; seq2 < getNSeq(); seq2++) {
+			if (getSeqName(seq2) == not_remove) continue;
+			// identical means: same state in every pattern
+			bool equal_seq = true;
+			for (iterator it = begin(); it != end(); it++)
+				if  ((*it)[seq1] != (*it)[seq2]) {
+					equal_seq = false;
+					break;
+				}
+			if (equal_seq) {
+				// stop removing once removed_seqs holds getNSeq()-3 entries
+				if (removed_seqs.size() < getNSeq()-3 && (!keep_two || !first_ident_seq)) {
+					removed_seqs.push_back(getSeqName(seq2));
+					target_seqs.push_back(getSeqName(seq1));
+					removed[seq2] = true;
+				}
+				checked[seq2] = 1;
+				first_ident_seq = false;
+			}
+		}
+		checked[seq1] = 1;
+	}
+
+	if (removed_seqs.size() > 0) {
+		if (removed_seqs.size() >= getNSeq()-3)
+			outWarning("Your alignment contains too many identical sequences!");
+		// collect the indices of all sequences that survive
+		IntVector keep_seqs;
+		for (seq1 = 0; seq1 < getNSeq(); seq1++)
+			if (!removed[seq1]) keep_seqs.push_back(seq1);
+		Alignment *aln = new Alignment;
+		aln->extractSubAlignment(this, keep_seqs, 0);
+		return aln;
+	} else return this;
+}
+
+
+/**
+    @param seq_id index of the sequence
+    @return true if the sequence carries STATE_UNKNOWN in every pattern
+*/
+bool Alignment::isGapOnlySeq(int seq_id) {
+    assert(seq_id < getNSeq());
+    bool only_gaps = true;
+    for (iterator it = begin(); only_gaps && it != end(); it++) {
+        if ((*it)[seq_id] != STATE_UNKNOWN)
+            only_gaps = false;
+    }
+    return only_gaps;
+}
+
+/**
+    Drop all sequences that consist only of unknown states.
+    @return this object if no sequence was dropped, otherwise a newly
+            allocated alignment containing the remaining sequences
+*/
+Alignment *Alignment::removeGappySeq() {
+    IntVector keep_seqs;
+    int nseq = getNSeq();
+    for (int seq = 0; seq < nseq; seq++) {
+        if (isGapOnlySeq(seq)) continue;
+        keep_seqs.push_back(seq);
+    }
+    // nothing to remove
+    if (keep_seqs.size() == nseq)
+        return this;
+    Alignment *sub_aln = new Alignment;
+    sub_aln->extractSubAlignment(this, keep_seqs, 0);
+    return sub_aln;
+}
+
+/**
+    Warn about every sequence that contains only gaps or missing data.
+    @param force_error if true and at least one such sequence exists,
+           outError() is called after all warnings were printed
+*/
+void Alignment::checkGappySeq(bool force_error) {
+    int nseq = getNSeq();
+    bool found_gappy = false;
+    for (int seq = 0; seq < nseq; seq++) {
+        if (!isGapOnlySeq(seq)) continue;
+        outWarning("Sequence " + getSeqName(seq) + " contains only gaps or missing data");
+        found_gappy = true;
+    }
+    if (found_gappy && force_error) {
+        outError("Some sequences (see above) are problematic, please check your alignment again");
+    }
+}
+
+/**
+    Construct an alignment by reading it from a file.
+    The file format is detected with detectInputFile() and the matching reader
+    is invoked; any read failure is reported through outError().
+    After reading, constant sites are counted, per-sequence state lists are
+    built and the sequence names are checked.
+    @param filename path of the alignment file
+    @param sequence_type sequence type passed on to the non-NEXUS readers
+    @param[out] intype receives the detected input format
+*/
+Alignment::Alignment(char *filename, char *sequence_type, InputType &intype) : vector<Pattern>() {
+    num_states = 0;
+    frac_const_sites = 0.0;
+    genetic_code = NULL;
+    seq_type = SEQ_UNKNOWN;
+    STATE_UNKNOWN = 126;
+    pars_lower_bound = NULL;
+    cout << "Reading alignment file " << filename << " ... ";
+    intype = detectInputFile(filename);
+
+    try {
+
+        if (intype == IN_NEXUS) {
+            cout << "Nexus format detected" << endl;
+            readNexus(filename);
+        } else if (intype == IN_FASTA) {
+            cout << "Fasta format detected" << endl;
+            readFasta(filename, sequence_type);
+        } else if (intype == IN_PHYLIP) {
+            cout << "Phylip format detected" << endl;
+            readPhylip(filename, sequence_type);
+        } else if (intype == IN_CLUSTAL) {
+            cout << "Clustal format detected" << endl;
+            readClustal(filename, sequence_type);
+        } else if (intype == IN_MSF) {
+            cout << "MSF format detected" << endl;
+            readMSF(filename, sequence_type);
+        } else {
+            outError("Unknown sequence format, please use PHYLIP, FASTA, CLUSTAL, MSF, or NEXUS format");
+        }
+    // class-type exceptions are caught by const reference to avoid copying/slicing
+    } catch (const ios::failure &) {
+        outError(ERR_READ_INPUT);
+    } catch (const char *str) {
+        outError(str);
+    } catch (const string &str) {
+        outError(str);
+    }
+
+    if (getNSeq() < 3)
+        outError("Alignment must have at least 3 sequences");
+
+    countConstSite();
+
+    cout << "Alignment has " << getNSeq() << " sequences with " << getNSite() <<
+         " columns and " << getNPattern() << " patterns (" << num_informative_sites << " informative sites)" << endl;
+    buildSeqStates();
+    checkSeqName();
+    // identical sequences are not checked here; they are handled later
+
+}
+
+/**
+    @param state codon state ID
+    @return true if this is a codon alignment, state is below num_states and
+            the genetic code maps the state to '*'
+*/
+bool Alignment::isStopCodon(int state) {
+    if (seq_type == SEQ_CODON && state < num_states) {
+        assert(genetic_code);
+        return genetic_code[state] == '*';
+    }
+    return false;
+}
+
+/**
+    @return for codon data the number of entries of the genetic code that are
+            not '*'; for any other data type num_states
+*/
+int Alignment::getNumNonstopCodons() {
+    if (seq_type != SEQ_CODON) return num_states;
+    assert(genetic_code);
+    int num_nonstop = 0;
+    for (int pos = 0; genetic_code[pos] != 0; pos++) {
+        if (genetic_code[pos] == '*') continue;
+        num_nonstop++;
+    }
+    return num_nonstop;
+}
+
+/**
+    @return true if this is a codon alignment whose genetic code points to
+            the standard table genetic_code1
+*/
+bool Alignment::isStandardGeneticCode() {
+    if (seq_type == SEQ_CODON)
+        return (genetic_code == genetic_code1);
+    return false;
+}
+
+/**
+    Rebuild seq_states: for every sequence, the sorted list of state IDs
+    (below STATE_UNKNOWN) that occur in it.
+    @param add_unobs_const if true, the states returned by
+           getUnobservedConstPatterns() are added to every sequence
+*/
+void Alignment::buildSeqStates(bool add_unobs_const) {
+    string unobs_const;
+    if (add_unobs_const) unobs_const = getUnobservedConstPatterns();
+    seq_states.clear();
+    seq_states.resize(getNSeq());
+    for (int seq = 0; seq < getNSeq(); seq++) {
+        // one flag per possible state code, including STATE_UNKNOWN itself
+        vector<bool> seen;
+        seen.resize(STATE_UNKNOWN+1, false);
+        for (int ptn = 0; ptn < getNPattern(); ptn++)
+            seen[at(ptn)[seq]] = true;
+        for (string::iterator ch = unobs_const.begin(); ch != unobs_const.end(); ch++)
+            seen[*ch] = true;
+        // collect in increasing order; STATE_UNKNOWN is deliberately left out
+        for (int state = 0; state < STATE_UNKNOWN; state++) {
+            if (!seen[state]) continue;
+            seq_states[seq].push_back(state);
+        }
+    }
+}
+
+/**
+    Read the alignment from a NEXUS file.
+    The data are taken from the CHARACTERS block if it has taxa, otherwise
+    from the DATA block, and handed to extractDataBlock().
+    @param filename path of the NEXUS file
+    @return 1 on success; the "return 0" statements follow outError() calls
+*/
+int Alignment::readNexus(char *filename) {
+    NxsTaxaBlock *taxa_block;
+    NxsAssumptionsBlock *assumptions_block;
+    NxsDataBlock *data_block = NULL;
+    NxsTreesBlock *trees_block = NULL;
+    NxsCharactersBlock *char_block = NULL;
+
+    // NOTE(review): none of these blocks is deleted in this function - confirm
+    // whether they are released elsewhere or leaked
+    taxa_block = new NxsTaxaBlock();
+    assumptions_block = new NxsAssumptionsBlock(taxa_block);
+    data_block = new NxsDataBlock(taxa_block, assumptions_block);
+    char_block = new NxsCharactersBlock(taxa_block, assumptions_block);
+    trees_block = new TreesBlock(taxa_block);
+
+    MyReader nexus(filename);
+
+    // register every block type the reader should fill
+    nexus.Add(taxa_block);
+    nexus.Add(assumptions_block);
+    nexus.Add(data_block);
+	nexus.Add(char_block);
+    nexus.Add(trees_block);
+
+    MyToken token(nexus.inf);
+    nexus.Execute(token);
+
+	// having taxa in both blocks is ambiguous and rejected
+	if (data_block->GetNTax() && char_block->GetNTax()) { 
+		outError("I am confused since both DATA and CHARACTERS blocks were specified");
+		return 0;
+	}
+
+    // fall back to the DATA block if the CHARACTERS block is empty
+    if (char_block->GetNTax() == 0) { char_block = data_block; }
+
+    if (char_block->GetNTax() == 0) {
+        outError("No data is given in the input file");
+        return 0;
+    }
+    if (verbose_mode >= VB_DEBUG)
+        char_block->Report(cout);
+
+
+    extractDataBlock(char_block);
+
+    return 1;
+}
+
+/**
+    Set STATE_UNKNOWN according to the sequence type:
+    18 for DNA, 23 for protein, num_states for all other types.
+*/
+void Alignment::computeUnknownState() {
+    if (seq_type == SEQ_DNA)
+        STATE_UNKNOWN = 18;
+    else if (seq_type == SEQ_PROTEIN)
+        STATE_UNKNOWN = 23;
+    else
+        STATE_UNKNOWN = num_states;
+}
+
+/**
+    Rank of a morphological state symbol.
+    @param ch state symbol
+    @return 1..10 for '0'..'9', 11..36 for 'A'..'Z', 0 for any other character
+*/
+static int morphSymbolRank(char ch) {
+    if (ch >= '0' && ch <= '9')
+        return ch - '0' + 1;
+    if (ch >= 'A' && ch <= 'Z')
+        return ch - 'A' + 11;
+    return 0;
+}
+
+/**
+    Determine the number of states of a morphological data block as the
+    highest rank of any state symbol found in the matrix
+    ('0'..'9' rank 1..10, 'A'..'Z' rank 11..36).
+    Non-alphanumeric symbols are ignored; an alphanumeric symbol outside the
+    two ranges (e.g. a lower-case letter) is reported via outError().
+    Cells with several states (polymorphism) are scanned state by state.
+    @param data_block the characters block to scan
+    @return highest symbol rank found, 0 if no valid symbol is present
+*/
+int getDataBlockMorphStates(NxsCharactersBlock *data_block) {
+    int nseq = data_block->GetNTax();
+    int nsite = data_block->GetNCharTotal();
+    int nstates = 0;
+
+    for (int site = 0; site < nsite; site++)
+        for (int seq = 0; seq < nseq; seq++) {
+            // an empty cell (nstate == 0) simply skips the loop below
+            int nstate = data_block->GetNumStates(seq, site);
+            for (int state = 0; state < nstate; state++) {
+                char ch = data_block->GetState(seq, site, state);
+                // cast: isalnum() is undefined for negative char values
+                if (!isalnum((unsigned char)ch)) continue;
+                // convert exactly once; converting twice made every valid
+                // symbol of a multi-state cell look invalid
+                int rank = morphSymbolRank(ch);
+                if (rank == 0)
+                    outError(data_block->GetTaxonLabel(seq) + " has invalid state at site " + convertIntToString(site));
+                if (rank > nstates) nstates = rank;
+            }
+        }
+    return nstates;
+}
+
+/**
+    Fill this alignment from a NEXUS characters/data block.
+    Determines the sequence type and number of states from the block's data
+    type, stores the taxon labels as sequence names and converts every site
+    (column) into a pattern that is added with addPattern().
+    @param data_block the block to read from
+*/
+void Alignment::extractDataBlock(NxsCharactersBlock *data_block) {
+    int nseq = data_block->GetNTax();
+    int nsite = data_block->GetNCharTotal();
+    char *symbols = NULL;
+    //num_states = strlen(symbols);
+    // lookup tables between input characters and internal state IDs
+    char char_to_state[NUM_CHAR];
+    char state_to_char[NUM_CHAR];
+
+    NxsCharactersBlock::DataTypesEnum data_type = (NxsCharactersBlock::DataTypesEnum)data_block->GetDataType();
+    if (data_type == NxsCharactersBlock::continuous) {
+        outError("Continuous characters not supported");
+    } else if (data_type == NxsCharactersBlock::dna || data_type == NxsCharactersBlock::rna ||
+               data_type == NxsCharactersBlock::nucleotide)
+    {
+        num_states = 4;
+        if (data_type == NxsCharactersBlock::rna)
+            symbols = symbols_rna;
+        else
+            symbols = symbols_dna;
+        seq_type = SEQ_DNA;
+    } else if (data_type == NxsCharactersBlock::protein) {
+        num_states = 20;
+        symbols = symbols_protein;
+        seq_type = SEQ_PROTEIN;
+    } else {
+    	// standard morphological character
+//        num_states = data_block->GetMaxObsNumStates();
+        num_states = getDataBlockMorphStates(data_block);
+        if (num_states > 32)
+        	outError("Number of states can not exceed 32");
+        if (num_states < 2)
+        	outError("Number of states can not be below 2");
+        if (num_states == 2)
+        	seq_type = SEQ_BINARY;
+        else
+    		seq_type = SEQ_MORPH;
+        symbols = symbols_morph;
+    }
+
+    computeUnknownState();
+    // every character not listed in 'symbols' maps to STATE_UNKNOWN
+    memset(char_to_state, STATE_UNKNOWN, NUM_CHAR);
+    memset(state_to_char, '?', NUM_CHAR);
+    // NOTE(review): symbols_protein has 21 entries, so 'X' is mapped to state 20
+    // here, whereas buildStateMap() maps 'X' to STATE_UNKNOWN - confirm which is intended
+    for (int i = 0; i < strlen(symbols); i++) {
+        char_to_state[(int)symbols[i]] = i;
+        state_to_char[i] = symbols[i];
+    }
+    state_to_char[(int)STATE_UNKNOWN] = '-';
+
+
+    int seq, site;
+
+    for (seq = 0; seq < nseq; seq++) {
+        seq_names.push_back(data_block->GetTaxonLabel(seq));
+    }
+
+    site_pattern.resize(nsite, -1);
+
+    int num_gaps_only = 0;
+
+    for (site = 0; site < nsite; site++) {
+        Pattern pat;
+        for (seq = 0; seq < nseq; seq++) {
+            int nstate = data_block->GetNumStates(seq, site);
+            if (nstate == 0)
+                pat += STATE_UNKNOWN;
+            else if (nstate == 1) {
+                pat += char_to_state[(int)data_block->GetState(seq, site, 0)];
+            } else {
+                // NOTE(review): this condition is always true (a value cannot equal all
+                // three enumerators at once), so the assert checks nothing - the
+                // bit-mask encoding below matches the DNA ambiguity codes, so probably
+                // a check for nucleotide data was intended; confirm
+                assert(data_type != NxsCharactersBlock::dna || data_type != NxsCharactersBlock::rna || data_type != NxsCharactersBlock::nucleotide);
+                // ambiguous cell: one bit per possible state, shifted by 3
+                char pat_ch = 0;
+                for (int state = 0; state < nstate; state++) {
+                    pat_ch |= (1 << char_to_state[(int)data_block->GetState(seq, site, state)]);
+                }
+                pat_ch += 3;
+                pat += pat_ch;
+            }
+        }
+        // addPattern() returns true if the site contains only unknown states
+        num_gaps_only += addPattern(pat, site);
+    }
+    if (num_gaps_only)
+        cout << "WARNING: " << num_gaps_only << " sites contain only gaps or ambiguous characters." << endl;
+    // at maximum verbosity dump all patterns with their frequencies
+    if (verbose_mode >= VB_MAX)
+        for (site = 0; site < size(); site++) {
+            for (seq = 0; seq < nseq; seq++)
+                cout << state_to_char[(int)(*this)[site][seq]];
+            cout << "  " << (*this)[site].frequency << endl;
+        }
+}
+
+/**
+	determine if the pattern is constant. update the is_const variable.
+	Also sets const_char, num_chars (number of distinct states that appear)
+	and is_informative (at least two states appear at least twice each).
+	@param pat pattern to analyse; its flags are updated in place
+*/
+void Alignment::computeConst(Pattern &pat) {
+    pat.is_const = false;
+    pat.is_informative = false;
+    // critical fix: const_char was set wrongly to num_states in some data type (binary, codon),
+    // causing wrong log-likelihood computation for +I or +I+G model
+    if (STATE_UNKNOWN == num_states)
+    	pat.const_char = STATE_UNKNOWN+1;
+    else
+    	pat.const_char = STATE_UNKNOWN;
+    // state_app: intersection over all sequences of the states each character
+    // may represent; starts with all num_states states set
+    StateBitset state_app;
+    state_app.reset();
+    int j;
+    for (j = 0; j < num_states; j++)
+    	state_app[j] = 1;
+
+    // number of appearance for each state, to compute is_informative
+    int *num_app = new int[num_states];
+    memset(num_app, 0, num_states*sizeof(int));
+
+    for (Pattern::iterator i = pat.begin(); i != pat.end(); i++) {
+    	StateBitset this_app;
+    	// presumably fills this_app with the states compatible with *i - see getAppearance()
+    	getAppearance(*i, this_app);
+    	state_app &= this_app;
+        // unambiguous state: count it directly
+        if (*i < num_states) { 
+            num_app[(int)(*i)]++;
+            continue;
+        }
+        // unknown characters do not count as an appearance of any state
+        if (*i == STATE_UNKNOWN) continue;
+        // ambiguous character: counts for every state it may represent
+        for (j = 0; j < num_states; j++)
+            if (this_app[j])
+                num_app[j]++;
+    }
+    int count = 0;
+    pat.num_chars = 0;
+    for (j = 0; j < num_states; j++) if (num_app[j]) {
+        pat.num_chars++;
+        if (num_app[j] >= 2) {
+            count++;
+        }
+    }
+    // at least 2 states, each appearing at least twice
+    if (count >= 2) pat.is_informative = true;
+    delete [] num_app;
+    
+    // number of states compatible with every sequence of the pattern
+    count = state_app.count();
+    if (count == 0) {
+    	return;
+    }
+    if (count == num_states) {
+    	// all-gap pattern
+    	pat.is_const = true;
+    	pat.const_char = num_states;
+    	return;
+    }
+    if (count == 1) {
+    	// exactly one common state: constant pattern of that state
+    	for (j = 0; j < num_states; j++)
+    		if (state_app.test(j)) {
+    			pat.is_const = true;
+    			pat.const_char = j;
+    			return;
+    		}
+    }
+}
+
+
+/**
+    Register the pattern observed at a site.
+    A pattern not seen before is appended (after computeConst()) with the
+    given frequency; otherwise the frequency of the existing pattern is
+    increased. In both cases site_pattern[site] is set to the pattern index.
+    @param pat pattern of the site
+    @param site site index
+    @param freq frequency to add
+    @return true if the pattern consists only of STATE_UNKNOWN
+*/
+bool Alignment::addPattern(Pattern &pat, int site, int freq) {
+    // scan for the first character that is not unknown
+    Pattern::iterator ch = pat.begin();
+    while (ch != pat.end() && !((*ch) != STATE_UNKNOWN))
+        ch++;
+    bool gaps_only = (ch == pat.end());
+    // gap-only sites are reported but still stored
+    if (gaps_only && verbose_mode >= VB_DEBUG)
+        cout << "Site " << site << " contains only gaps or ambiguous characters" << endl;
+
+    PatternIntMap::iterator found = pattern_index.find(pat);
+    if (found != pattern_index.end()) {
+        // known pattern: only bump its frequency
+        int index = found->second;
+        at(index).frequency += freq;
+        site_pattern[site] = index;
+        return gaps_only;
+    }
+    // new pattern
+    pat.frequency = freq;
+    computeConst(pat);
+    push_back(pat);
+    pattern_index[back()] = size()-1;
+    site_pattern[site] = size()-1;
+    return gaps_only;
+}
+
+/**
+    Append constant sites to the alignment.
+    @param freq_const_patterns text that convert_int_vec() parses into one
+           count per state; entry i gives how many sites consisting only of
+           state i are appended. The number of entries must equal num_states
+           and no entry may be negative.
+*/
+void Alignment::addConstPatterns(char *freq_const_patterns) {
+	IntVector vec;
+	convert_int_vec(freq_const_patterns, vec);
+	if (vec.size() != num_states)
+		outError("Const pattern frequency vector has different number of states: ", freq_const_patterns);
+
+	// compute the new total number of sites and validate the counts
+	int nsite = getNSite(), orig_nsite = getNSite();
+	int i;
+	for (i = 0; i < vec.size(); i++) {
+		nsite += vec[i];
+		if (vec[i] < 0)
+			outError("Const pattern frequency must be non-negative");
+	}
+    site_pattern.resize(nsite, -1);
+	int nseq = getNSeq();
+	// new sites are appended after the original ones
+	nsite = orig_nsite;
+	for (i = 0; i < vec.size(); i++) if (vec[i] > 0) {
+		// pattern with state i in every sequence
+		Pattern pat;
+		pat.resize(nseq, i);
+//		if (pattern_index.find(pat) != pattern_index.end()) {
+//			outWarning("Constant pattern of all " + convertStateBackStr(i) + " already exists");
+//		}
+		for (int j = 0; j < vec[i]; j++)
+			addPattern(pat, nsite++, 1);
+	}
+    // refresh the statistics that depend on the set of patterns
+    countConstSite();
+    buildSeqStates();
+}
+
+/**
+    Fill ordered_pattern with the informative patterns and compute
+    pars_lower_bound.
+    Each informative site contributes (num_chars - 1) to a block sum, with
+    sizeof(UINT)*8 sites per block; afterwards pars_lower_bound[j] holds the
+    sum over blocks j, j+1, ... (a suffix sum).
+*/
+void Alignment::orderPatternByNumChars() {
+    int nptn = getNPattern();
+    int ptn, site, i = 0;
+    int *num_chars = new int[nptn];
+    int *ptn_order = new int[nptn];
+    const int UINT_BITS = sizeof(UINT)*8;
+    // number of blocks needed for all informative sites
+    int maxi = (num_informative_sites+UINT_BITS-1)/UINT_BITS;
+    // NOTE(review): a previously allocated pars_lower_bound is not freed here -
+    // confirm this function is called at most once per object
+    pars_lower_bound = new UINT[maxi+1];
+    UINT sum = 0;
+    memset(pars_lower_bound, 0, (maxi+1)*sizeof(UINT));
+    // sort key: negated num_chars, plus 1024 for non-informative patterns
+    for (ptn = 0; ptn < nptn; ptn++) {
+        num_chars[ptn] =  -at(ptn).num_chars + (!at(ptn).is_informative)*1024;
+        ptn_order[ptn] = ptn;
+    }
+    // presumably sorts num_chars ascending and permutes ptn_order alongside - see quicksort()
+    quicksort(num_chars, 0, nptn-1, ptn_order);
+    ordered_pattern.clear();
+    for (ptn = 0, site = 0, i = 0; ptn < nptn; ptn++) {
+        // stop at the first non-informative pattern
+        if (!at(ptn_order[ptn]).is_informative)
+            break;
+        ordered_pattern.push_back(at(ptn_order[ptn]));
+        int freq = ordered_pattern.back().frequency;
+        UINT num = ordered_pattern.back().num_chars - 1;
+        for (int j = 0; j < freq; j++, site++) {
+            // current block is full: move on to the next one
+            if (site == UINT_BITS) {
+                sum += pars_lower_bound[i];
+                i++;
+                site = 0;
+            }
+            pars_lower_bound[i] += num;
+        }
+    }
+    sum += pars_lower_bound[i];
+    // now transform lower_bound
+//    assert(i == maxi-1);
+    
+    // turn the per-block sums into suffix sums
+    for (int j = 0; j <= i; j++) {
+        UINT newsum = sum - pars_lower_bound[j];
+        pars_lower_bound[j] = sum;
+        sum = newsum;
+    }
+    
+    if (verbose_mode >= VB_MAX) {
+//        for (ptn = 0; ptn < nptn; ptn++)
+//            cout << at(ptn_order[ptn]).num_chars << " ";
+        for (int j = 0; j <= i; j++) {
+            cout << pars_lower_bound[j] << " ";
+        }
+        cout << endl << sum << endl;
+    }
+    delete [] ptn_order;
+    delete [] num_chars;
+}
+
+/**
+    Undo the grouping of identical sites: afterwards every site has its own
+    pattern with frequency 1 and site i refers to pattern i.
+    The pattern index is cleared.
+*/
+void Alignment::ungroupSitePattern()
+{
+    // keep a copy of the grouped patterns before emptying the container
+    vector<Pattern> grouped = (*this);
+    clear();
+    for (int site = 0; site < getNSite(); site++) {
+        // look up the old pattern before site_pattern[site] is overwritten
+        Pattern single = grouped[getPatternID(site)];
+        single.frequency = 1;
+        push_back(single);
+        site_pattern[site] = site;
+    }
+    pattern_index.clear();
+}
+
+/**
+    Rebuild the patterns so that identical sites are only merged within the
+    same group.
+    @param groups number of groups
+    @param site_group group ID (0..groups-1) of every site
+*/
+void Alignment::regroupSitePattern(int groups, IntVector& site_group)
+{
+    // remember the current state, then start from scratch
+    vector<Pattern> old_patterns = (*this);
+    IntVector old_site_pattern = site_pattern;
+    clear();
+    site_pattern.clear();
+    site_pattern.resize(old_site_pattern.size(), -1);
+    int count = 0;
+    for (int g = 0; g < groups; g++) {
+        // a fresh index per group prevents merging across groups
+        pattern_index.clear();
+        for (int site = 0; site < site_group.size(); site++) {
+            if (site_group[site] != g) continue;
+            count++;
+            Pattern pat = old_patterns[old_site_pattern[site]];
+            addPattern(pat, site);
+        }
+    }
+    // every site must have been assigned to exactly one group
+    assert(count == old_site_pattern.size());
+    count = 0;
+    for (iterator it = begin(); it != end(); it++)
+        count += it->frequency;
+    assert(count == getNSite());
+    pattern_index.clear();
+}
+
+
+/**
+	detect the data type of the input sequences.
+	The decision is based on the share of characters of a given class among
+	all characters that are not '?', '-', '.', 'N' or 'X'; a share above 90%
+	selects, in this order, DNA (A,C,G,T,U), binary (0,1), protein (letters)
+	or morphological data (letters and digits).
+	@param sequences vector of strings
+	@return the data type of the input sequences, SEQ_UNKNOWN if none matches
+*/
+SeqType Alignment::detectSequenceType(StrVector &sequences) {
+    int num_nuc = 0;
+    int num_ungap = 0;
+    int num_bin = 0;
+    int num_alpha = 0;
+    int num_digit = 0;
+
+    for (StrVector::iterator it = sequences.begin(); it != sequences.end(); it++)
+        for (string::iterator i = it->begin(); i != it->end(); i++) {
+            if ((*i) != '?' && (*i) != '-' && (*i) != '.' && *i != 'N' && *i != 'X') num_ungap++;
+            if ((*i) == 'A' || (*i) == 'C' || (*i) == 'G' || (*i) == 'T' || (*i) == 'U')
+                num_nuc++;
+            if ((*i) == '0' || (*i) == '1')
+                num_bin++;
+            // cast to unsigned char: isalpha()/isdigit() are undefined for
+            // negative char values (e.g. non-ASCII bytes in the input)
+            if (isalpha((unsigned char)(*i))) num_alpha++;
+            if (isdigit((unsigned char)(*i))) num_digit++;
+        }
+    // NOTE(review): num_ungap can be 0 (empty input or only gap/N/X characters),
+    // which makes the ratios below NaN or infinite - confirm the intended result
+    if (((double)num_nuc) / num_ungap > 0.9)
+        return SEQ_DNA;
+    if (((double)num_bin) / num_ungap > 0.9)
+        return SEQ_BINARY;
+    if (((double)num_alpha) / num_ungap > 0.9)
+        return SEQ_PROTEIN;
+    if (((double)(num_alpha+num_digit)) / num_ungap > 0.9)
+        return SEQ_MORPH;
+    return SEQ_UNKNOWN;
+}
+
+/**
+    Build a lookup table from raw input characters to state IDs.
+    Characters that are not valid for the data type map to STATE_INVALID;
+    '?', '-' and '.' always map to STATE_UNKNOWN.
+    DNA ambiguity codes are encoded as a bit mask (A=1, C=2, G=4, T=8) plus 3.
+    @param[out] map table with NUM_CHAR entries, indexed by character code
+    @param seq_type data type (SEQ_DNA, etc.)
+*/
+void Alignment::buildStateMap(char *map, SeqType seq_type) {
+    memset(map, STATE_INVALID, NUM_CHAR);
+    assert(STATE_UNKNOWN < 126);
+    map[(unsigned char)'?'] = STATE_UNKNOWN;
+    map[(unsigned char)'-'] = STATE_UNKNOWN;
+    map[(unsigned char)'.'] = STATE_UNKNOWN;
+    int len;
+    switch (seq_type) {
+    case SEQ_BINARY:
+        map[(unsigned char)'0'] = 0;
+        map[(unsigned char)'1'] = 1;
+        return;
+    case SEQ_DNA: // DNA
+    case SEQ_CODON:
+        map[(unsigned char)'A'] = 0;
+        map[(unsigned char)'C'] = 1;
+        map[(unsigned char)'G'] = 2;
+        map[(unsigned char)'T'] = 3;
+        map[(unsigned char)'U'] = 3;
+        map[(unsigned char)'R'] = 1+4+3; // A or G, Purine
+        map[(unsigned char)'Y'] = 2+8+3; // C or T, Pyrimidine
+        // 'O' is treated as unknown, consistent with convertState()
+        map[(unsigned char)'O'] = STATE_UNKNOWN;
+        map[(unsigned char)'N'] = STATE_UNKNOWN;
+        map[(unsigned char)'X'] = STATE_UNKNOWN;
+        map[(unsigned char)'W'] = 1+8+3; // A or T, Weak
+        map[(unsigned char)'S'] = 2+4+3; // G or C, Strong
+        map[(unsigned char)'M'] = 1+2+3; // A or C, Amino
+        map[(unsigned char)'K'] = 4+8+3; // G or T, Keto
+        map[(unsigned char)'B'] = 2+4+8+3; // C or G or T
+        map[(unsigned char)'H'] = 1+2+8+3; // A or C or T
+        map[(unsigned char)'D'] = 1+4+8+3; // A or G or T
+        map[(unsigned char)'V'] = 1+2+4+3; // A or G or C
+        return;
+    case SEQ_PROTEIN: // Protein
+        // the 20 amino acids; entry 20 of symbols_protein ('X') is unknown
+        for (int i = 0; i < 20; i++)
+            map[(int)symbols_protein[i]] = i;
+        map[(int)symbols_protein[20]] = STATE_UNKNOWN;
+        map[(unsigned char)'B'] = 20; // N or D
+        map[(unsigned char)'Z'] = 21; // Q or E
+        map[(unsigned char)'J'] = 22; // I or L
+        map[(unsigned char)'*'] = STATE_UNKNOWN; // stop codon
+        map[(unsigned char)'U'] = STATE_UNKNOWN; // 21st amino acid
+
+        return;
+    case SEQ_MULTISTATE:
+        // identity mapping for all codes up to STATE_UNKNOWN
+        for (int i = 0; i <= STATE_UNKNOWN; i++)
+            map[i] = i;
+        return;
+    case SEQ_MORPH: // Morphological
+        len = strlen(symbols_morph);
+        for (int i = 0; i < len; i++)
+            map[(int)symbols_morph[i]] = i;
+        return;
+    default:
+        return;
+    }
+}
+
+
+/**
+	Convert a raw character state into its state ID, indexed from 0.
+	The gap-like characters '?', '-' and '.' map to STATE_UNKNOWN for every
+	data type. A character that is not part of the alphabet of the requested
+	type, or a type without a character table here, yields STATE_INVALID.
+	@param state input raw state
+	@param seq_type data type (SEQ_DNA, etc.)
+	@return state ID, STATE_UNKNOWN or STATE_INVALID
+*/
+char Alignment::convertState(char state, SeqType seq_type) {
+    const bool gap_like = (state == '?' || state == '-' || state == '.');
+    if (gap_like)
+        return STATE_UNKNOWN;
+
+    if (seq_type == SEQ_BINARY) {
+        if (state == '0') return 0;
+        if (state == '1') return 1;
+        return STATE_INVALID;
+    }
+
+    if (seq_type == SEQ_DNA) {
+        switch (state) {
+        // unambiguous nucleotides
+        case 'A': return 0;
+        case 'C': return 1;
+        case 'G': return 2;
+        case 'T': return 3;
+        case 'U': return 3;
+        // completely undetermined
+        case 'O':
+        case 'N':
+        case 'X': return STATE_UNKNOWN;
+        // two-way ambiguities
+        case 'R': return 1+4+3; // A or G, Purine
+        case 'Y': return 2+8+3; // C or T, Pyrimidine
+        case 'W': return 1+8+3; // A or T, Weak
+        case 'S': return 2+4+3; // G or C, Strong
+        case 'M': return 1+2+3; // A or C, Amino
+        case 'K': return 4+8+3; // G or T, Keto
+        // three-way ambiguities
+        case 'B': return 2+4+8+3; // C or G or T
+        case 'H': return 1+2+8+3; // A or C or T
+        case 'D': return 1+4+8+3; // A or G or T
+        case 'V': return 1+2+4+3; // A or G or C
+        default:  return STATE_INVALID; // unrecognized character
+        }
+    }
+
+    if (seq_type == SEQ_PROTEIN) {
+        // characters handled before the lookup in symbols_protein
+        switch (state) {
+        case 'B': return 20; // N or D
+        case 'Z': return 21; // Q or E
+        case 'J': return 22; // I or L
+        case '*': return STATE_UNKNOWN; // stop codon
+        case 'U': return STATE_UNKNOWN; // 21st amino-acid
+        default:  break;
+        }
+        const char *hit = strchr(symbols_protein, state);
+        if (hit == NULL)
+            return STATE_INVALID; // unrecognized character
+        const char index = hit - symbols_protein;
+        // only the first 20 symbols are real amino-acid states
+        return (index < 20) ? index : STATE_UNKNOWN;
+    }
+
+    if (seq_type == SEQ_MORPH) {
+        // standard morphological character: its position in symbols_morph
+        const char *hit = strchr(symbols_morph, state);
+        if (hit == NULL)
+            return STATE_INVALID; // unrecognized character
+        const char index = hit - symbols_morph;
+        return index;
+    }
+
+    // every other data type has no per-character conversion here
+    return STATE_INVALID;
+}
+
+/**
+	Convert a raw character state into its state ID using the data type
+	stored in this alignment (seq_type).
+	@param state input raw state
+	@return whatever convertState(state, seq_type) returns
+*/
+char Alignment::convertState(char state) {
+	const SeqType own_type = seq_type;
+	return convertState(state, own_type);
+}
+
+
+
+/**
+	Convert a state ID back into a printable character, according to the
+	data type stored in this alignment (seq_type).
+	STATE_UNKNOWN is printed as '-' and STATE_INVALID as '?' for every type.
+	@param state state ID
+	@return the character for that state; '?' for a state that is not valid
+		for binary or DNA data, '-' for an out-of-range protein or
+		morphological state, '*' when seq_type has no character table here
+*/
+char Alignment::convertStateBack(char state) {
+    if (state == STATE_UNKNOWN) return '-';
+    if (state == STATE_INVALID) return '?';
+
+    switch (seq_type) {
+    case SEQ_BINARY:
+        switch (state) {
+        case 0:
+            return '0';
+        case 1:
+            return '1';
+        default:
+            // FIX: this used to return the state code STATE_INVALID itself,
+            // which is not a printable character; use the same '?' that is
+            // returned for STATE_INVALID above.
+            return '?';
+        }
+    case SEQ_DNA: // DNA
+        switch (state) {
+        case 0:
+            return 'A';
+        case 1:
+            return 'C';
+        case 2:
+            return 'G';
+        case 3:
+            return 'T';
+        case 1+4+3:
+            return 'R'; // A or G, Purine
+        case 2+8+3:
+            return 'Y'; // C or T, Pyrimidine
+        case 1+8+3:
+            return 'W'; // A or T, Weak
+        case 2+4+3:
+            return 'S'; // G or C, Strong
+        case 1+2+3:
+            return 'M'; // A or C, Amino
+        case 4+8+3:
+            return 'K'; // G or T, Keto
+        case 2+4+8+3:
+            return 'B'; // C or G or T
+        case 1+2+8+3:
+            return 'H'; // A or C or T
+        case 1+4+8+3:
+            return 'D'; // A or G or T
+        case 1+2+4+3:
+            return 'V'; // A or G or C
+        default:
+            return '?'; // unrecognized state
+        }
+    case SEQ_PROTEIN: // Protein
+        // FIX: char may be signed, so reject negative values before using
+        // the state as an array index.
+        if (state >= 0 && state < 20)
+            return symbols_protein[(int)state];
+        if (state == 20) return 'B';
+        if (state == 21) return 'Z';
+        if (state == 22) return 'J';
+        return '-';
+    case SEQ_MORPH:
+        // morphological state: index into symbols_morph
+        if (state >= 0 && (size_t)state < strlen(symbols_morph))
+            return symbols_morph[(int)state];
+        return '-';
+    default:
+        // no character table for this data type
+        return '*';
+    }
+}
+
+/**
+	Convert a state ID into its textual form.
+	For every data type except SEQ_CODON this is the single character given
+	by convertStateBack(). For codon data the state is split into three
+	base-4 digits, each printed through symbols_dna; a state that is not
+	below num_states is printed as "???".
+	@param state state ID
+	@return one character, or three characters for codon data
+*/
+string Alignment::convertStateBackStr(char state) {
+	// non-codon data: exactly one character
+	if (seq_type != SEQ_CODON)
+		return string(1, convertStateBack(state));
+
+	// codon data
+	if (state >= num_states)
+		return "???";
+
+	const char first  = symbols_dna[state/16];
+	const char second = symbols_dna[(state%16)/4];
+	const char third  = symbols_dna[state%4];
+
+	string codon;
+	codon += first;
+	codon += second;
+	codon += third;
+	return codon;
+}
+
+/**
+	Replace, in place, every character of a string by its state ID.
+	@param str (IN/OUT) raw characters on input, state IDs on output
+	@param seq_type data type (SEQ_DNA, etc.) passed to convertState()
+*/
+void Alignment::convertStateStr(string &str, SeqType seq_type) {
+    const size_t length = str.length();
+    for (size_t pos = 0; pos < length; pos++)
+        str[pos] = convertState(str[pos], seq_type);
+}
+
+/**
+	Select the genetic code table and set num_states to its length.
+	An empty ID selects table 1. Otherwise the ID is parsed as an integer;
+	accepted values are 1-6, 9-16 and 21-25. A value that cannot be parsed
+	or is not in that list is reported through outError().
+	Every entry of the table counts as a state; nothing is filtered here.
+	@param gene_code_id text of the genetic code ID, possibly empty
+*/
+void Alignment::initCodon(char *gene_code_id) {
+	if (gene_code_id[0] == '\0') {
+		// no ID given
+		genetic_code = genetic_code1;
+	} else {
+		int table_id = 1;
+		try {
+			table_id = convert_int(gene_code_id);
+		} catch (string &) {
+			outError("Wrong genetic code ", gene_code_id);
+		}
+
+		switch (table_id) {
+		case 1:  genetic_code = genetic_code1;  break;
+		case 2:  genetic_code = genetic_code2;  break;
+		case 3:  genetic_code = genetic_code3;  break;
+		case 4:  genetic_code = genetic_code4;  break;
+		case 5:  genetic_code = genetic_code5;  break;
+		case 6:  genetic_code = genetic_code6;  break;
+		case 9:  genetic_code = genetic_code9;  break;
+		case 10: genetic_code = genetic_code10; break;
+		case 11: genetic_code = genetic_code11; break;
+		case 12: genetic_code = genetic_code12; break;
+		case 13: genetic_code = genetic_code13; break;
+		case 14: genetic_code = genetic_code14; break;
+		case 15: genetic_code = genetic_code15; break;
+		case 16: genetic_code = genetic_code16; break;
+		case 21: genetic_code = genetic_code21; break;
+		case 22: genetic_code = genetic_code22; break;
+		case 23: genetic_code = genetic_code23; break;
+		case 24: genetic_code = genetic_code24; break;
+		case 25: genetic_code = genetic_code25; break;
+		default:
+			outError("Wrong genetic code ", gene_code_id);
+			break;
+		}
+	}
+
+	// a table must describe all 64 nucleotide triplets
+	assert(strlen(genetic_code) == 64);
+
+	num_states = strlen(genetic_code);
+}
+
+/**
+	Derive the number of morphological states from the largest alphanumeric
+	character found in the sequences.
+	@param sequences raw sequences
+	@return (max - '0' + 1) when the largest character is a digit,
+		(max - 'A' + 11) when it lies in 'A'..'V', 0 otherwise
+*/
+int getMorphStates(StrVector &sequences) {
+	char highest = 0;
+	for (size_t s = 0; s < sequences.size(); s++) {
+		const string &seq = sequences[s];
+		for (size_t p = 0; p < seq.length(); p++) {
+			const char c = seq[p];
+			if (c > highest && isalnum(c))
+				highest = c;
+		}
+	}
+
+	if ('0' <= highest && highest <= '9')
+		return highest - '0' + 1;
+	if ('A' <= highest && highest <= 'V')
+		return highest - 'A' + 11;
+	return 0;
+}
+
+/**
+	Translate a user-supplied sequence type name into a SeqType value.
+	"NT2AA" and "CODON" are matched on their first five characters only;
+	all other names must match exactly.
+	@param sequence_type type name (BIN, NT, DNA, AA, PROT, NT2AA..., NUM,
+		MORPH, MULTI, TINA, CODON...)
+	@return the matching SeqType, or SEQ_UNKNOWN when no name matches
+*/
+SeqType Alignment::getSeqType(const char *sequence_type) {
+    if (strcmp(sequence_type, "BIN") == 0)
+        return SEQ_BINARY;
+
+    if (strcmp(sequence_type, "NT") == 0 || strcmp(sequence_type, "DNA") == 0)
+        return SEQ_DNA;
+
+    if (strcmp(sequence_type, "AA") == 0 || strcmp(sequence_type, "PROT") == 0)
+        return SEQ_PROTEIN;
+
+    // nucleotide input reported as protein
+    if (strncmp(sequence_type, "NT2AA", 5) == 0)
+        return SEQ_PROTEIN;
+
+    if (strcmp(sequence_type, "NUM") == 0 ||
+        strcmp(sequence_type, "MORPH") == 0 ||
+        strcmp(sequence_type, "MULTI") == 0)
+        return SEQ_MORPH;
+
+    if (strcmp(sequence_type, "TINA") == 0)
+        return SEQ_MULTISTATE;
+
+    if (strncmp(sequence_type, "CODON", 5) == 0)
+        return SEQ_CODON;
+
+    return SEQ_UNKNOWN;
+}
+
+/**
+	Validate the raw sequences, determine the data type and convert the
+	sequences into site patterns of this alignment.
+	Steps: (1) check the number of sequences and their names, (2) check
+	that every sequence has nsite characters, (3) detect the data type and
+	let a non-empty sequence_type override it, (4) convert every site (or
+	every triplet of sites for CODON / NT2AA) into a pattern.
+	@param sequences raw sequences, indexed like seq_names
+	@param sequence_type user-specified type (BIN, NT, DNA, AA, PROT,
+		NT2AA<code>, NUM, MORPH, MULTI, TINA, CODON<code>); NULL or ""
+		keeps the detected type
+	@param nseq expected number of sequences
+	@param nsite expected number of characters per sequence
+	@return 1 on success; problems are thrown as string or const char*
+*/
+int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite) {
+    int seq_id;
+    ostringstream err_str;
+    genetic_code = NULL;
+
+    if (nseq != seq_names.size()) throw "Different number of sequences than specified";
+
+    /* now check that all sequence names are correct */
+    // FIX: a second ostringstream named err_str used to be declared inside
+    // this loop. It shadowed the one above, so the messages written here
+    // were destroyed at the end of each iteration and never thrown.
+    for (seq_id = 0; seq_id < nseq; seq_id ++) {
+        if (seq_names[seq_id] == "")
+            err_str << "Sequence number " << seq_id+1 << " has no names\n";
+        // check that all the names are different
+        for (int i = 0; i < seq_id; i++)
+            if (seq_names[i] == seq_names[seq_id])
+                err_str << "The sequence name " << seq_names[seq_id] << " is dupplicated\n";
+    }
+    if (err_str.str() != "")
+        throw err_str.str();
+
+
+    /* now check that all sequences have the same length */
+    for (seq_id = 0; seq_id < nseq; seq_id ++) {
+        if (sequences[seq_id].length() != nsite) {
+            err_str << "Sequence " << seq_names[seq_id] << " contains ";
+            if (sequences[seq_id].length() < nsite)
+                err_str << "not enough";
+            else
+                err_str << "too many";
+
+            err_str << " characters (" << sequences[seq_id].length() << ")\n";
+        }
+    }
+
+    if (err_str.str() != "")
+        throw err_str.str();
+
+    /* now check data type */
+    seq_type = detectSequenceType(sequences);
+    switch (seq_type) {
+    case SEQ_BINARY:
+        num_states = 2;
+        cout << "Alignment most likely contains binary sequences" << endl;
+        break;
+    case SEQ_DNA:
+        num_states = 4;
+        cout << "Alignment most likely contains DNA/RNA sequences" << endl;
+        break;
+    case SEQ_PROTEIN:
+        num_states = 20;
+        cout << "Alignment most likely contains protein sequences" << endl;
+        break;
+    case SEQ_MORPH:
+        num_states = getMorphStates(sequences);
+        if (num_states < 2 || num_states > 32) throw "Invalid number of states.";
+        cout << "Alignment most likely contains " << num_states << "-state morphological data" << endl;
+        break;
+    default:
+        // undetected type is only acceptable when the user names one below
+        if (!sequence_type)
+            throw "Unknown sequence type.";
+    }
+
+    /* a non-empty user-specified type overrides the detected one */
+    bool nt2aa = false;
+    if (sequence_type && strcmp(sequence_type,"") != 0) {
+        SeqType user_seq_type;
+        if (strcmp(sequence_type, "BIN") == 0) {
+            num_states = 2;
+            user_seq_type = SEQ_BINARY;
+        } else if (strcmp(sequence_type, "NT") == 0 || strcmp(sequence_type, "DNA") == 0) {
+            num_states = 4;
+            user_seq_type = SEQ_DNA;
+        } else if (strcmp(sequence_type, "AA") == 0 || strcmp(sequence_type, "PROT") == 0) {
+            num_states = 20;
+            user_seq_type = SEQ_PROTEIN;
+        } else if (strncmp(sequence_type, "NT2AA", 5) == 0) {
+            // nucleotide input translated to amino acids; the text after
+            // "NT2AA" is the genetic code ID
+            if (seq_type != SEQ_DNA)
+                outWarning("Sequence type detected as non DNA!");
+            initCodon(&sequence_type[5]);
+            seq_type = user_seq_type = SEQ_PROTEIN;
+            num_states = 20;
+            nt2aa = true;
+            cout << "Translating to amino-acid sequences with genetic code " << &sequence_type[5] << " ..." << endl;
+        } else if (strcmp(sequence_type, "NUM") == 0 || strcmp(sequence_type, "MORPH") == 0 || strcmp(sequence_type, "MULTI") == 0) {
+            num_states = getMorphStates(sequences);
+            if (num_states < 2 || num_states > 32) throw "Invalid number of states";
+            user_seq_type = SEQ_MORPH;
+        } else if (strcmp(sequence_type, "TINA") == 0) {
+            // num_states is not set in this branch; its current value is used
+            cout << "Multi-state data with " << num_states << " alphabets" << endl;
+            user_seq_type = SEQ_MULTISTATE;
+        } else if (strncmp(sequence_type, "CODON", 5) == 0) {
+            // the text after "CODON" is the genetic code ID
+            if (seq_type != SEQ_DNA)
+                outWarning("You want to use codon models but the sequences were not detected as DNA");
+            seq_type = user_seq_type = SEQ_CODON;
+            initCodon(&sequence_type[5]);
+            cout << "Converting to codon sequences with genetic code " << &sequence_type[5] << " ..." << endl;
+        } else
+            throw "Invalid sequence type.";
+        if (user_seq_type != seq_type && seq_type != SEQ_UNKNOWN)
+            outWarning("Your specified sequence type is different from the detected one");
+        seq_type = user_seq_type;
+    }
+
+    // now convert to patterns
+    int site, seq, num_gaps_only = 0;
+
+    char char_to_state[NUM_CHAR];
+    char AA_to_state[NUM_CHAR];
+    computeUnknownState();
+    if (nt2aa) {
+        // characters are read as DNA, the translated residue as protein
+        buildStateMap(char_to_state, SEQ_DNA);
+        buildStateMap(AA_to_state, SEQ_PROTEIN);
+    } else
+        buildStateMap(char_to_state, seq_type);
+
+    Pattern pat;
+    pat.resize(nseq);
+    // codon and translated data consume three characters per pattern site
+    int step = ((seq_type == SEQ_CODON || nt2aa) ? 3 : 1);
+    if (nsite % step != 0)
+        outError("Number of sites is not multiple of 3");
+    site_pattern.resize(nsite/step, -1);
+    clear();
+    pattern_index.clear();
+    int num_error = 0;
+    for (site = 0; site < nsite; site+=step) {
+        for (seq = 0; seq < nseq; seq++) {
+            char state = char_to_state[(int)(sequences[seq][site])];
+            if (seq_type == SEQ_CODON || nt2aa) {
+                // special treatment for codon
+                char state2 = char_to_state[(int)(sequences[seq][site+1])];
+                char state3 = char_to_state[(int)(sequences[seq][site+2])];
+                if (state < 4 && state2 < 4 && state3 < 4) {
+                    // three unambiguous nucleotides: index into genetic_code
+                    state = state*16 + state2*4 + state3;
+                    if (genetic_code[(int)state] == '*') {
+                        err_str << "Sequence " << seq_names[seq] << " has stop codon " <<
+                                sequences[seq][site] << sequences[seq][site+1] << sequences[seq][site+2] <<
+                                " at site " << site+1 << endl;
+                        num_error++;
+                        state = STATE_UNKNOWN;
+                    } else if (nt2aa) {
+                        state = AA_to_state[(int)genetic_code[(int)state]];
+                    }
+                } else if (state == STATE_INVALID || state2 == STATE_INVALID || state3 == STATE_INVALID) {
+                    state = STATE_INVALID;
+                } else {
+                    // at least one ambiguous or unknown nucleotide: the
+                    // whole triplet becomes unknown, with a warning unless
+                    // all three are unknown
+                    if (state != STATE_UNKNOWN || state2 != STATE_UNKNOWN || state3 != STATE_UNKNOWN) {
+                        ostringstream warn_str;
+                        warn_str << "Sequence " << seq_names[seq] << " has ambiguous character " <<
+                                sequences[seq][site] << sequences[seq][site+1] << sequences[seq][site+2] <<
+                                " at site " << site+1 << endl;
+                        outWarning(warn_str.str());
+                    }
+                    state = STATE_UNKNOWN;
+                }
+            }
+            if (state == STATE_INVALID) {
+                // only the first 100 errors are spelled out
+                if (num_error < 100) {
+                    err_str << "Sequence " << seq_names[seq] << " has invalid character " << sequences[seq][site];
+                    if (seq_type == SEQ_CODON)
+                        err_str << sequences[seq][site+1] << sequences[seq][site+2];
+                    err_str << " at site " << site+1 << endl;
+                } else if (num_error == 100)
+                    err_str << "...many more..." << endl;
+                num_error++;
+            }
+            pat[seq] = state;
+        }
+        // once an error has been seen no further patterns are added
+        if (!num_error)
+            num_gaps_only += addPattern(pat, site/step);
+    }
+    if (num_gaps_only)
+        cout << "WARNING: " << num_gaps_only << " sites contain only gaps or ambiguous characters." << endl;
+    if (err_str.str() != "")
+        throw err_str.str();
+    return 1;
+}
+
+/**
+	Read an alignment in PHYLIP format and build its patterns.
+	The first non-empty line must hold the number of sequences and sites.
+	Each following non-empty line is appended to the current sequence; the
+	current sequence index advances after every line that contributed
+	characters and wraps to 0 after the last sequence, so the data may be
+	split over several blocks. When sequence_type is "TINA" every line is
+	read as whitespace-separated non-negative integers instead of characters.
+	@param filename path of the PHYLIP file
+	@param sequence_type user-specified sequence type, passed on to buildPattern(); may be NULL
+	@return result of buildPattern(); format errors are thrown as string or const char*
+*/
+int Alignment::readPhylip(char *filename, char *sequence_type) {
+
+    StrVector sequences;
+    ostringstream err_str;
+    ifstream in;
+    int line_num = 1;
+    // set the failbit and badbit
+    in.exceptions(ios::failbit | ios::badbit);
+    in.open(filename);
+    int nseq = 0, nsite = 0;
+    int seq_id = 0;
+    string line;
+    // remove the failbit
+    in.exceptions(ios::badbit);
+    bool tina_state = (sequence_type && strcmp(sequence_type,"TINA") == 0);
+    num_states = 0;
+
+    for (; !in.eof(); line_num++) {
+        getline(in, line);
+        // cut the line at the first CR/LF character, if any
+        line = line.substr(0, line.find_first_of("\n\r"));
+        if (line == "") continue;
+
+        //cout << line << endl;
+        if (nseq == 0) { // read number of sequences and sites
+            istringstream line_in(line);
+            if (!(line_in >> nseq >> nsite))
+                throw "Invalid PHYLIP format. First line must contain number of sequences and sites";
+            //cout << "nseq: " << nseq << "  nsite: " << nsite << endl;
+            if (nseq < 3)
+                throw "There must be at least 3 sequences";
+            if (nsite < 1)
+                throw "No alignment columns";
+
+            seq_names.resize(nseq, "");
+            sequences.resize(nseq, "");
+
+        } else { // read sequence contents
+            // a sequence whose name is still empty gets its name from the
+            // start of this line
+            if (seq_names[seq_id] == "") { // cut out the sequence name
+                string::size_type pos = line.find_first_of(" \t");
+                if (pos == string::npos) pos = 10; //  assume standard phylip
+                seq_names[seq_id] = line.substr(0, pos);
+                line.erase(0, pos);
+            }
+            // remember the length to detect whether this line added anything
+            int old_len = sequences[seq_id].length();
+            if (tina_state) {
+                // numeric states: each integer is stored as one char and
+                // num_states tracks the largest value seen plus one
+                stringstream linestr(line);
+                int state;
+                while (!linestr.eof() ) {
+                    state = -1;
+                    linestr >> state;
+                    // state stays negative when no integer could be read
+                    if (state < 0) break;
+                    sequences[seq_id].append(1, state);
+                    if (num_states < state+1) num_states = state+1;
+                }
+            } else
+                for (string::iterator it = line.begin(); it != line.end(); it++) {
+                    // skip blanks and control characters
+                    if ((*it) <= ' ') continue;
+                    // accepted characters are stored in upper case
+                    if (isalnum(*it) || (*it) == '-' || (*it) == '?'|| (*it) == '.' || (*it) == '*')
+                        sequences[seq_id].append(1, toupper(*it));
+                    else {
+                        err_str << "Line " << line_num <<": Unrecognized character " << *it;
+                        throw err_str.str();
+                    }
+                }
+            // after every line the current sequence must be as long as the first one
+            if (sequences[seq_id].length() != sequences[0].length()) {
+                err_str << "Line " << line_num << ": Sequence " << seq_names[seq_id] << " has wrong sequence length " << sequences[seq_id].length() << endl;
+                throw err_str.str();
+            }
+            // only move to the next sequence when this line added characters
+            if (sequences[seq_id].length() > old_len)
+                seq_id++;
+            if (seq_id == nseq) {
+                seq_id = 0;
+                // make sure that all sequences have the same length at this moment
+            }
+        }
+        //sequences.
+    }
+    in.clear();
+    // set the failbit again
+    in.exceptions(ios::failbit | ios::badbit);
+    in.close();
+
+    return buildPattern(sequences, sequence_type, nseq, nsite);
+}
+
+/**
+	Read an alignment in FASTA format and build its patterns.
+	A line starting with '>' opens a new sequence and gives its name; all
+	other non-empty lines are appended to the most recent sequence.
+	After reading, names are shortened to their first whitespace-separated
+	word; while that produces duplicates, the next word is appended with an
+	underscore, for at most four rounds.
+	@param filename path of the FASTA file
+	@param sequence_type user-specified sequence type, passed on to buildPattern()
+	@return result of buildPattern(); format errors are thrown as string or const char*
+*/
+int Alignment::readFasta(char *filename, char *sequence_type) {
+
+    StrVector sequences;
+    ostringstream err_str;
+    ifstream in;
+    int line_num = 1;
+    string line;
+
+    // set the failbit and badbit
+    in.exceptions(ios::failbit | ios::badbit);
+    in.open(filename);
+    // remove the failbit
+    in.exceptions(ios::badbit);
+
+    for (; !in.eof(); line_num++) {
+        getline(in, line);
+        if (line == "") continue;
+
+        if (line[0] == '>') { // next sequence
+            // the name is everything after '>' up to a CR/LF character
+            string::size_type pos = line.find_first_of("\n\r");
+            seq_names.push_back(line.substr(1, pos-1));
+            trimString(seq_names.back());
+            sequences.push_back("");
+            continue;
+        }
+        // read sequence contents
+        if (sequences.empty()) throw "First line must begin with '>' to define sequence name";
+        for (string::iterator it = line.begin(); it != line.end(); it++) {
+            // skip blanks and control characters
+            if ((*it) <= ' ') continue;
+            // accepted characters are stored in upper case
+            if (isalnum(*it) || (*it) == '-' || (*it) == '?'|| (*it) == '.' || (*it) == '*')
+                sequences.back().append(1, toupper(*it));
+            else {
+                err_str << "Line " << line_num <<": Unrecognized character " << *it;
+                throw err_str.str();
+            }
+        }
+    }
+    in.clear();
+    // set the failbit again
+    in.exceptions(ios::failbit | ios::badbit);
+    in.close();
+
+    // FIX: a file without any '>' record leaves the vector empty, and
+    // calling front() on an empty vector below is undefined behaviour.
+    if (sequences.empty())
+        throw "No sequences found in FASTA file";
+
+    // now try to cut down sequence name if possible
+    int i, j, step = 0;
+    StrVector new_seq_names, remain_seq_names;
+    new_seq_names.resize(seq_names.size());
+    remain_seq_names = seq_names;
+
+    for (step = 0; step < 4; step++) {
+        bool duplicated = false;
+        for (i = 0; i < seq_names.size(); i++) {
+            if (remain_seq_names[i].empty()) continue;
+            // move the next word from the remaining text to the new name
+            size_t pos = remain_seq_names[i].find_first_of(" \t");
+            if (pos == string::npos) {
+                new_seq_names[i] += remain_seq_names[i];
+                remain_seq_names[i] = "";
+            } else {
+                new_seq_names[i] += remain_seq_names[i].substr(0, pos);
+                remain_seq_names[i] = "_" + remain_seq_names[i].substr(pos+1);
+            }
+            // now check for duplication
+            // FIX: the bound used to be i-1, which never compared a name
+            // with its immediate predecessor.
+            if (!duplicated)
+            for (j = 0; j < i; j++)
+                if (new_seq_names[j] == new_seq_names[i]) {
+                    duplicated = true;
+                    break;
+                }
+        }
+        if (!duplicated) break;
+    }
+
+    // step > 0 means at least one extra word had to be appended
+    if (step > 0) {
+        for (i = 0; i < seq_names.size(); i++)
+            if (seq_names[i] != new_seq_names[i]) {
+                cout << "NOTE: Change sequence name '" << seq_names[i] << "' -> " << new_seq_names[i] << endl;
+            }
+    }
+
+    seq_names = new_seq_names;
+
+    return buildPattern(sequences, sequence_type, seq_names.size(), sequences.front().length());
+}
+
+/**
+	Read an alignment in ClustalW format and build its patterns.
+	The first line must start with "CLUSTAL". Sequences come in blocks
+	separated by empty lines; each data line is "name content", and within
+	every block the names must appear in the order of the first block.
+	Lines starting with '*', ':' or '.' are skipped as conservation lines.
+	@param filename path of the ClustalW file
+	@param sequence_type user-specified sequence type, passed on to buildPattern()
+	@return result of buildPattern(); format errors are thrown as string or const char*
+*/
+int Alignment::readClustal(char *filename, char *sequence_type) {
+
+
+    StrVector sequences;
+    ifstream in;
+    int line_num = 1;
+    string line;
+    num_states = 0;
+
+
+    // set the failbit and badbit
+    in.exceptions(ios::failbit | ios::badbit);
+    in.open(filename);
+    // remove the failbit
+    in.exceptions(ios::badbit);
+    getline(in, line);
+    if (line.substr(0, 7) != "CLUSTAL") {
+        throw "ClustalW file does not start with 'CLUSTAL'";
+    }
+
+    // position of the current line within its block
+    int seq_count = 0;
+    for (line_num = 2; !in.eof(); line_num++) {
+        getline(in, line);
+        trimString(line);
+        if (line == "") {
+            // an empty line ends the block
+            seq_count = 0;
+            continue;
+        }
+        if (line[0] == '*' || line[0] == ':' || line[0] == '.') continue; // ignore conservation line
+
+        size_t pos = line.find_first_of(" \t");
+        if (pos == string::npos) {
+            throw "Line " + convertIntToString(line_num) + ": whitespace not found between sequence name and content";
+        }
+        string seq_name = line.substr(0, pos);
+        if (seq_count == seq_names.size()) {
+            // first time this position is seen: register a new sequence
+            seq_names.push_back(seq_name);
+            sequences.push_back("");
+        } else if (seq_count > seq_names.size()){
+            throw "Line " + convertIntToString(line_num) + ": New sequence name is not allowed here";
+        } else if (seq_name != seq_names[seq_count]) {
+            throw "Line " + convertIntToString(line_num) + ": Sequence name " + seq_name + " does not match previously declared " +seq_names[seq_count];
+        }
+
+        // keep only the first word after the name; anything after further
+        // whitespace is dropped
+        line = line.substr(pos+1);
+        trimString(line);
+        pos = line.find_first_of(" \t");
+        line = line.substr(0, pos);
+        // read sequence contents
+        for (string::iterator it = line.begin(); it != line.end(); it++) {
+            if ((*it) <= ' ') continue;
+            // accepted characters are stored in upper case
+            if (isalnum(*it) || (*it) == '-' || (*it) == '?'|| (*it) == '.' || (*it) == '*')
+                sequences[seq_count].append(1, toupper(*it));
+            else {
+                throw "Line " +convertIntToString(line_num) + ": Unrecognized character " + *it;
+            }
+        }
+        seq_count++;
+    }
+    in.clear();
+    // set the failbit again
+    in.exceptions(ios::failbit | ios::badbit);
+    in.close();
+
+    // FIX: a file that holds only the header leaves the vector empty, and
+    // calling front() on an empty vector below is undefined behaviour.
+    if (sequences.empty())
+        throw "No sequences found in ClustalW file";
+
+    return buildPattern(sequences, sequence_type, seq_names.size(), sequences.front().length());
+
+
+}
+
+
+/**
+	Read an alignment in MSF format and build its patterns.
+	The first line must contain "MULTIPLE_ALIGNMENT". Before the "//" line,
+	every "Name:" line declares a sequence and must carry a positive "Len:"
+	value that is the same for all sequences. After "//", each data line is
+	"name content" and the names must cycle in the declared order; lines
+	starting with a digit are skipped and '~' is stored as '-'.
+	@param filename path of the MSF file
+	@param sequence_type user-specified sequence type, passed on to buildPattern()
+	@return result of buildPattern(); format errors are thrown as string or const char*
+*/
+int Alignment::readMSF(char *filename, char *sequence_type) {
+
+
+    StrVector sequences;
+    ifstream in;
+    int line_num = 1;
+    string line;
+    num_states = 0;
+
+
+    // set the failbit and badbit
+    in.exceptions(ios::failbit | ios::badbit);
+    in.open(filename);
+    // remove the failbit
+    in.exceptions(ios::badbit);
+    getline(in, line);
+    if (line.find("MULTIPLE_ALIGNMENT") == string::npos) {
+        throw "MSF file must start with header line MULTIPLE_ALIGNMENT";
+    }
+
+    // seq_len: length declared by the first Name: line (0 = none yet)
+    // seq_count: index of the sequence the next data line belongs to
+    int seq_len = 0, seq_count = 0;
+    bool seq_started = false;
+
+    for (line_num = 2; !in.eof(); line_num++) {
+        getline(in, line);
+        trimString(line);
+        if (line == "") {
+            continue;
+        }
+        size_t pos;
+
+        if (line.substr(0, 2) == "//") {
+            // end of the header, sequence data follows
+            seq_started = true;
+            continue;
+        }
+
+        if (line.substr(0,5) == "Name:") {
+            if (seq_started)
+                throw "Line " + convertIntToString(line_num) + ": Cannot declare sequence name here";
+            line = line.substr(5);
+            trimString(line);
+            pos = line.find_first_of(" \t");
+            if (pos == string::npos)
+                throw "Line " + convertIntToString(line_num) + ": No whitespace found after sequence name";
+            string seq_name = line.substr(0,pos);
+            seq_names.push_back(seq_name);
+            sequences.push_back("");
+            pos = line.find("Len:");
+            if (pos == string::npos)
+                throw "Line " + convertIntToString(line_num) + ": Sequence description does not contain 'Len:'";
+            line = line.substr(pos+4);
+            trimString(line);
+            pos = line.find_first_of(" \t");
+            if (pos == string::npos)
+                throw "Line " + convertIntToString(line_num) + ": No whitespace found after sequence length";
+
+            int len;
+            line = line.substr(0, pos);
+            try {
+                len = convert_int(line.c_str());
+            } catch (string &str) {
+                throw "Line " + convertIntToString(line_num) + ": " + str;
+            }
+            if (len <= 0)
+                throw "Line " + convertIntToString(line_num) + ": Non-positive sequence length not allowed";
+            // all declared lengths must agree with the first one
+            if (seq_len == 0)
+                seq_len = len;
+            else if (seq_len != len)
+                throw "Line " + convertIntToString(line_num) + ": Sequence length " + convertIntToString(len) + " is different from previously defined " + convertIntToString(seq_len);
+            continue;
+        }
+
+        // any other header line is ignored
+        if (!seq_started) continue;
+
+        if (seq_names.empty())
+            throw "No sequence name declared in header";
+
+        // skip lines that start with a digit
+        if (isdigit(line[0])) continue;
+        pos = line.find_first_of(" \t");
+        if (pos == string::npos)
+            throw "Line " + convertIntToString(line_num) + ": whitespace not found between sequence name and content - " + line;
+
+        string seq_name = line.substr(0, pos);
+        if (seq_name != seq_names[seq_count])
+            throw "Line " + convertIntToString(line_num) + ": Sequence name " + seq_name + " does not match previously declared " +seq_names[seq_count];
+
+        line = line.substr(pos+1);
+        // read sequence contents
+        for (string::iterator it = line.begin(); it != line.end(); it++) {
+            if ((*it) <= ' ') continue;
+            // accepted characters are stored in upper case
+            if (isalnum(*it) || (*it) == '-' || (*it) == '?'|| (*it) == '.' || (*it) == '*')
+                sequences[seq_count].append(1, toupper(*it));
+            else  if ((*it) == '~')
+                sequences[seq_count].append(1, '-');
+            else {
+                throw "Line " +convertIntToString(line_num) + ": Unrecognized character " + *it;
+            }
+        }
+        // data lines cycle through the declared sequences
+        seq_count++;
+        if (seq_count == seq_names.size())
+            seq_count = 0;
+    }
+    in.clear();
+    // set the failbit again
+    in.exceptions(ios::failbit | ios::badbit);
+    in.close();
+
+    // FIX: a file without any "Name:" line leaves the vector empty, and
+    // calling front() on an empty vector below is undefined behaviour.
+    if (sequences.empty())
+        throw "No sequences found in MSF file";
+
+    return buildPattern(sequences, sequence_type, seq_names.size(), sequences.front().length());
+
+
+}
+
+
+/**
+	Translate a residue range of one sequence into an alignment site range.
+	Residues are the sites at which the sequence is not STATE_UNKNOWN,
+	counted from 0. If the left residue is never reached, outError() is
+	called; if the right residue is never reached, a warning is issued and
+	the alignment length is used instead.
+	@param seq_id index of the reference sequence
+	@param residue_left (IN/OUT) left residue index on input, left site on output
+	@param residue_right (IN/OUT) right residue bound on input, right site bound on output
+	@return always true
+*/
+bool Alignment::getSiteFromResidue(int seq_id, int &residue_left, int &residue_right) {
+    int first_site = -1;
+    int end_site = -1;
+    int residue_idx = -1; // index of the most recent residue seen so far
+
+    for (int site = 0; site < getNSite(); site++) {
+        const bool is_residue = (at(site_pattern[site])[seq_id] != STATE_UNKNOWN);
+        if (is_residue)
+            residue_idx++;
+        // both bounds keep moving while the residue index stays on target
+        if (residue_idx == residue_left)
+            first_site = site;
+        if (residue_idx == residue_right-1)
+            end_site = site+1;
+    }
+
+    const bool left_missing = (first_site == -1);
+    const bool right_missing = (end_site == -1);
+
+    if (left_missing || right_missing)
+        cout << "Out of range: Maxmimal residue number is " << residue_idx+1 << endl;
+    if (left_missing)
+        outError("Left residue range is too high");
+    if (right_missing) {
+        outWarning("Right residue range is set to alignment length");
+        end_site = getNSite();
+    }
+
+    residue_left = first_site;
+    residue_right = end_site;
+    return true;
+}
+
+/**
+	Mark which alignment sites are kept.
+	With a site list file, only the 1-based inclusive ranges "left right"
+	read from it are kept; when a reference sequence is named, the ranges
+	are first translated through getSiteFromResidue(). Without a file all
+	sites start as kept. Sites can then be dropped when they contain
+	ambiguous characters and/or when their pattern is constant.
+	@param aln_site_list file with pairs of range bounds, or NULL
+	@param kept_sites (OUT) one entry per site, 1 = kept, 0 = dropped
+	@param exclude_gaps drop kept sites whose pattern reports ambiguous characters
+	@param exclude_const_sites drop sites whose pattern is flagged is_const
+	@param ref_seq_name reference sequence name, or NULL; only used with a site list
+	@return number of kept sites
+*/
+int Alignment::buildRetainingSites(const char *aln_site_list, IntVector &kept_sites,
+		bool exclude_gaps, bool exclude_const_sites, const char *ref_seq_name)
+{
+    if (!aln_site_list) {
+        // no list: everything is kept to begin with
+        kept_sites.resize(getNSite(), 1);
+    } else {
+        int seq_id = -1;
+        if (ref_seq_name) {
+            string ref_seq = ref_seq_name;
+            seq_id = getSeqID(ref_seq);
+            if (seq_id < 0)
+                outError("Reference sequence name not found: ", ref_seq_name);
+        }
+        cout << "Reading site position list " << aln_site_list << " ..." << endl;
+        kept_sites.resize(getNSite(), 0);
+        try {
+            ifstream in;
+            // failbit is only armed while opening the file
+            in.exceptions(ios::failbit | ios::badbit);
+            in.open(aln_site_list);
+            in.exceptions(ios::badbit);
+
+            for (;;) {
+                if (in.eof()) break;
+                int range_begin = 0;
+                int range_end = 0;
+                in >> range_begin;
+                if (in.eof()) break;
+                in >> range_end;
+                cout << range_begin << "-" << range_end << endl;
+                if (range_begin <= 0 || range_end <= 0) throw "Range must be positive";
+                if (range_begin > range_end) throw "Left range is bigger than right range";
+                // switch to a 0-based, half-open range
+                range_begin--;
+                if (range_end > getNSite()) throw "Right range is bigger than alignment size";
+                if (seq_id >= 0)
+                    getSiteFromResidue(seq_id, range_begin, range_end);
+                for (int site = range_begin; site < range_end; site++)
+                    kept_sites[site] = 1;
+            }
+            in.close();
+        } catch (ios::failure &) {
+            outError(ERR_READ_INPUT, aln_site_list);
+        } catch (const char* str) {
+            outError(str);
+        }
+    }
+
+    const size_t num_sites = kept_sites.size();
+
+    if (exclude_gaps) {
+        for (size_t site = 0; site < num_sites; site++) {
+            if (!kept_sites[site]) continue;
+            if (at(site_pattern[site]).computeAmbiguousChar(num_states) > 0)
+                kept_sites[site] = 0;
+        }
+    }
+
+    if (exclude_const_sites) {
+        for (size_t site = 0; site < num_sites; site++) {
+            if (at(site_pattern[site]).is_const)
+                kept_sites[site] = 0;
+        }
+    }
+
+    int final_length = 0;
+    for (size_t site = 0; site < num_sites; site++)
+        final_length += (kept_sites[site] ? 1 : 0);
+    return final_length;
+}
+
+/**
+    Write the alignment in PHYLIP format to an output stream.
+    @param out stream to write to
+    @param append not used by this overload
+    @param aln_site_list, exclude_gaps, exclude_const_sites, ref_seq_name
+        site selection, forwarded to buildRetainingSites()
+*/
+void Alignment::printPhylip(ostream &out, bool append, const char *aln_site_list,
+                            bool exclude_gaps, bool exclude_const_sites, const char *ref_seq_name) {
+    IntVector kept_sites;
+    int final_length = buildRetainingSites(aln_site_list, kept_sites, exclude_gaps, exclude_const_sites, ref_seq_name);
+    // the header length is tripled for codon data
+    if (seq_type == SEQ_CODON)
+        final_length *= 3;
+
+    out << getNSeq() << " " << final_length << endl;
+
+    // names are left-justified in a column at least 10 characters wide
+    int name_width = getMaxSeqNameLength();
+    if (name_width < 10) name_width = 10;
+
+    for (size_t seq = 0; seq < seq_names.size(); seq++) {
+        out.width(name_width);
+        out << left << seq_names[seq] << "  ";
+        for (size_t site = 0; site < site_pattern.size(); site++) {
+            if (!kept_sites[site]) continue;
+            out << convertStateBackStr(at(site_pattern[site])[seq]);
+        }
+        out << endl;
+    }
+}
+
+/**
+    Write the alignment in PHYLIP format to a file.
+    @param file_name output file name
+    @param append true to append to the file instead of overwriting it
+    @param aln_site_list, exclude_gaps, exclude_const_sites, ref_seq_name
+        site selection, forwarded to buildRetainingSites()
+    Any stream failure is reported through outError(ERR_WRITE_OUTPUT, file_name).
+*/
+void Alignment::printPhylip(const char *file_name, bool append, const char *aln_site_list,
+                            bool exclude_gaps, bool exclude_const_sites, const char *ref_seq_name) {
+    IntVector kept_sites;
+    int final_length = buildRetainingSites(aln_site_list, kept_sites, exclude_gaps, exclude_const_sites, ref_seq_name);
+    // the header length is tripled for codon data
+    if (seq_type == SEQ_CODON)
+        final_length *= 3;
+
+    try {
+        ofstream out;
+        out.exceptions(ios::failbit | ios::badbit);
+
+        if (append)
+            out.open(file_name, ios_base::out | ios_base::app);
+        else
+            out.open(file_name);
+        out << getNSeq() << " " << final_length << endl;
+
+        // names are left-justified in a column at least 10 characters wide
+        int max_len = getMaxSeqNameLength();
+        if (max_len < 10) max_len = 10;
+
+        for (size_t seq = 0; seq < seq_names.size(); seq++) {
+            out.width(max_len);
+            out << left << seq_names[seq] << "  ";
+            for (size_t site = 0; site < site_pattern.size(); site++)
+                if (kept_sites[site])
+                    out << convertStateBackStr(at(site_pattern[site])[seq]);
+            out << endl;
+        }
+        out.close();
+        if (verbose_mode >= VB_MED)
+            cout << "Alignment was printed to " << file_name << endl;
+    } catch (const ios::failure &) {
+        // caught by const reference: no copy and no slicing of the exception
+        outError(ERR_WRITE_OUTPUT, file_name);
+    }
+}
+
+/**
+    Write the alignment in FASTA format to a file: one ">name" line followed
+    by one line with the retained sites of that sequence.
+    @param file_name output file name
+    @param append true to append to the file instead of overwriting it
+    @param aln_site_list, exclude_gaps, exclude_const_sites, ref_seq_name
+        site selection, forwarded to buildRetainingSites()
+    Any stream failure is reported through outError(ERR_WRITE_OUTPUT, file_name).
+*/
+void Alignment::printFasta(const char *file_name, bool append, const char *aln_site_list
+                           , bool exclude_gaps, bool exclude_const_sites, const char *ref_seq_name)
+{
+    IntVector kept_sites;
+    buildRetainingSites(aln_site_list, kept_sites, exclude_gaps, exclude_const_sites, ref_seq_name);
+    try {
+        ofstream out;
+        out.exceptions(ios::failbit | ios::badbit);
+        if (append)
+            out.open(file_name, ios_base::out | ios_base::app);
+        else
+            out.open(file_name);
+
+        for (size_t seq = 0; seq < seq_names.size(); seq++) {
+            out << ">" << seq_names[seq] << endl;
+            for (size_t site = 0; site < site_pattern.size(); site++)
+                if (kept_sites[site])
+                    out << convertStateBackStr(at(site_pattern[site])[seq]);
+            out << endl;
+        }
+        out.close();
+        cout << "Alignment was printed to " << file_name << endl;
+    } catch (const ios::failure &) {
+        // caught by const reference: no copy and no slicing of the exception
+        outError(ERR_WRITE_OUTPUT, file_name);
+    }
+}
+
+
+/**
+    Fill this alignment with a subset of the sequences of another alignment.
+    @param aln source alignment
+    @param seq_id indices (into aln) of the sequences to extract
+    @param min_true_char patterns with fewer than this many characters
+        different from STATE_UNKNOWN among the selected sequences are skipped
+*/
+void Alignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char) {
+    for (size_t k = 0; k < seq_id.size(); k++) {
+        assert(seq_id[k] >= 0 && seq_id[k] < aln->getNSeq());
+        seq_names.push_back(aln->getSeqName(seq_id[k]));
+    }
+
+    // inherit the data-type description of the source alignment
+    num_states = aln->num_states;
+    seq_type = aln->seq_type;
+    STATE_UNKNOWN = aln->STATE_UNKNOWN;
+    genetic_code = aln->genetic_code;
+
+    site_pattern.resize(aln->getNSite(), -1);
+    clear();
+    pattern_index.clear();
+
+    int next_site = 0;
+    VerboseMode save_mode = verbose_mode;
+    verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+    for (iterator src = aln->begin(); src != aln->end(); ++src) {
+        // restrict the source pattern to the selected sequences
+        Pattern sub_pat;
+        int num_true = 0;
+        for (size_t k = 0; k < seq_id.size(); k++) {
+            char state = (*src)[seq_id[k]];
+            if (state != STATE_UNKNOWN) num_true++;
+            sub_pat.push_back(state);
+        }
+        if (num_true >= min_true_char) {
+            addPattern(sub_pat, next_site, src->frequency);
+            // NOTE(review): size()-1 is the last stored pattern; if addPattern
+            // merges sub_pat into an already existing pattern this may not be
+            // the pattern just added -- confirm against addPattern
+            for (int rep = 0; rep < src->frequency; rep++)
+                site_pattern[next_site++] = size()-1;
+        }
+    }
+    // skipped patterns leave unused slots at the end
+    site_pattern.resize(next_site);
+    verbose_mode = save_mode;
+    countConstSite();
+    buildSeqStates();
+    assert(size() <= aln->size());
+}
+
+
+/**
+    Fill this alignment with selected patterns of another alignment,
+    keeping all sequences and the frequency of every selected pattern.
+    @param aln source alignment
+    @param ptn_id indices (into aln) of the patterns to extract
+*/
+void Alignment::extractPatterns(Alignment *aln, IntVector &ptn_id) {
+    for (int seq = 0; seq < aln->getNSeq(); seq++)
+        seq_names.push_back(aln->getSeqName(seq));
+
+    // inherit the data-type description of the source alignment
+    num_states = aln->num_states;
+    seq_type = aln->seq_type;
+    STATE_UNKNOWN = aln->STATE_UNKNOWN;
+    genetic_code = aln->genetic_code;
+
+    site_pattern.resize(aln->getNSite(), -1);
+    clear();
+    pattern_index.clear();
+
+    int next_site = 0;
+    VerboseMode save_mode = verbose_mode;
+    verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+    for (size_t k = 0; k != ptn_id.size(); k++) {
+        assert(ptn_id[k] >= 0 && ptn_id[k] < aln->getNPattern());
+        Pattern pat = aln->at(ptn_id[k]);
+        addPattern(pat, next_site, aln->at(ptn_id[k]).frequency);
+        // one site per occurrence of the pattern in the source
+        for (int rep = 0; rep < aln->at(ptn_id[k]).frequency; rep++)
+            site_pattern[next_site++] = size()-1;
+    }
+    site_pattern.resize(next_site);
+    verbose_mode = save_mode;
+    countConstSite();
+    buildSeqStates();
+    assert(size() <= aln->size());
+}
+
+/**
+    Fill this alignment with the patterns of another alignment, using a new
+    frequency for each pattern; patterns with frequency 0 are left out.
+    @param aln source alignment
+    @param ptn_freq new frequency of pattern i of aln, for i < ptn_freq.size()
+*/
+void Alignment::extractPatternFreqs(Alignment *aln, IntVector &ptn_freq) {
+    assert(ptn_freq.size() <= aln->getNPattern());
+    for (int seq = 0; seq < aln->getNSeq(); seq++)
+        seq_names.push_back(aln->getSeqName(seq));
+
+    // inherit the data-type description of the source alignment
+    num_states = aln->num_states;
+    seq_type = aln->seq_type;
+    genetic_code = aln->genetic_code;
+    STATE_UNKNOWN = aln->STATE_UNKNOWN;
+
+    // the number of sites is the sum of all requested frequencies
+    site_pattern.resize(accumulate(ptn_freq.begin(), ptn_freq.end(), 0), -1);
+    clear();
+    pattern_index.clear();
+
+    int next_site = 0;
+    VerboseMode save_mode = verbose_mode;
+    verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+    for (size_t ptn = 0; ptn != ptn_freq.size(); ptn++) {
+        if (!ptn_freq[ptn]) continue;
+        assert(ptn_freq[ptn] > 0);
+        Pattern pat = aln->at(ptn);
+        addPattern(pat, next_site, ptn_freq[ptn]);
+        for (int rep = 0; rep < ptn_freq[ptn]; rep++)
+            site_pattern[next_site++] = size()-1;
+    }
+    site_pattern.resize(next_site);
+    verbose_mode = save_mode;
+    countConstSite();
+    buildSeqStates();
+    assert(size() <= aln->size());
+}
+
+/**
+    Fill this alignment with selected sites of another alignment, in the
+    order given; all sequences are kept.
+    @param aln source alignment
+    @param site_id indices of the sites of aln to extract
+*/
+void Alignment::extractSites(Alignment *aln, IntVector &site_id) {
+    for (int seq = 0; seq < aln->getNSeq(); seq++)
+        seq_names.push_back(aln->getSeqName(seq));
+
+    // inherit the data-type description of the source alignment
+    num_states = aln->num_states;
+    seq_type = aln->seq_type;
+    STATE_UNKNOWN = aln->STATE_UNKNOWN;
+    genetic_code = aln->genetic_code;
+
+    site_pattern.resize(site_id.size(), -1);
+    clear();
+    pattern_index.clear();
+
+    VerboseMode save_mode = verbose_mode;
+    verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+    for (int out_site = 0; out_site != site_id.size(); out_site++) {
+        Pattern pat = aln->getPattern(site_id[out_site]);
+        addPattern(pat, out_site);
+    }
+    verbose_mode = save_mode;
+    countConstSite();
+    buildSeqStates();
+
+    // sanity check: no stored pattern may start with the value -1
+    for (iterator ptn = begin(); ptn != end(); ++ptn) {
+        if (ptn->at(0) == -1)
+            assert(0);
+    }
+}
+
+
+
+/**
+    Fill this alignment by translating a DNA alignment, three nucleotide
+    sites at a time, into codon states or (if nt2aa is true) amino-acid states.
+    @param aln source alignment; must have seq_type SEQ_DNA and a number of
+        sites divisible by 3, otherwise outError() is called
+    @param gene_code_id genetic code identifier, passed to initCodon()
+    @param nt2aa true to produce a SEQ_PROTEIN alignment with 20 states,
+        false to produce a SEQ_CODON alignment
+    Stop codons and invalid characters are collected into one message that
+    is reported through outError() after all sites were scanned.
+*/
+void Alignment::convertToCodonOrAA(Alignment *aln, char *gene_code_id, bool nt2aa) {
+    if (aln->seq_type != SEQ_DNA)
+        outError("Cannot convert non-DNA alignment into codon alignment");
+    if (aln->getNSite() % 3 != 0)
+        outError("Sequence length is not divisible by 3 when converting to codon sequences");
+    int i, site;
+    // lookup table filled by buildStateMap(); only used when nt2aa is true
+    char AA_to_state[NUM_CHAR];
+    for (i = 0; i < aln->getNSeq(); i++) {
+        seq_names.push_back(aln->getSeqName(i));
+    }
+//    num_states = aln->num_states;
+    // initCodon() is always run with seq_type SEQ_CODON; for nt2aa the type
+    // and number of states are switched to protein afterwards
+    seq_type = SEQ_CODON;
+    initCodon(gene_code_id);
+    if (nt2aa) {
+        seq_type = SEQ_PROTEIN;
+        num_states = 20;
+    }
+
+    computeUnknownState();
+
+    if (nt2aa) {
+        buildStateMap(AA_to_state, SEQ_PROTEIN);
+    }
+    
+    site_pattern.resize(aln->getNSite()/3, -1);
+    clear();
+    pattern_index.clear();
+    // at this point seq_type is SEQ_CODON unless nt2aa is set, so step is 3
+    int step = ((seq_type == SEQ_CODON || nt2aa) ? 3 : 1);
+
+    VerboseMode save_mode = verbose_mode;
+    verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+    int nsite = aln->getNSite();
+    int nseq = aln->getNSeq();
+    Pattern pat;
+    pat.resize(nseq);
+    int num_error = 0;
+    ostringstream err_str;
+
+    for (site = 0; site < nsite; site+=step) {
+        for (int seq = 0; seq < nseq; seq++) {
+            //char state = convertState(sequences[seq][site], seq_type);
+            char state = aln->at(aln->getPatternID(site))[seq];
+            // special treatment for codon
+            char state2 = aln->at(aln->getPatternID(site+1))[seq];
+            char state3 = aln->at(aln->getPatternID(site+2))[seq];
+            if (state < 4 && state2 < 4 && state3 < 4) {
+                // all three positions hold a state below 4: combine them
+                // into a base-4 codon index in the range 0..63
+//            		state = non_stop_codon[state*16 + state2*4 + state3];
+                state = state*16 + state2*4 + state3;
+                if (genetic_code[(int)state] == '*') {
+                    // NOTE(review): unlike the invalid-character messages
+                    // below, stop-codon messages are not capped at 100
+                    err_str << "Sequence " << seq_names[seq] << " has stop codon "
+                            << " at site " << site+1 << endl;
+                    num_error++;
+                    state = STATE_UNKNOWN;
+                } else if (nt2aa) {
+                    // translate the codon to the state of its amino acid
+                    state = AA_to_state[(int)genetic_code[(int)state]];
+                }
+            } else if (state == STATE_INVALID || state2 == STATE_INVALID || state3 == STATE_INVALID) {
+                state = STATE_INVALID;
+            } else {
+                // warn unless all three positions are STATE_UNKNOWN
+                if (state != STATE_UNKNOWN || state2 != STATE_UNKNOWN || state3 != STATE_UNKNOWN) {
+                    ostringstream warn_str;
+                    warn_str << "Sequence " << seq_names[seq] << " has ambiguous character " <<
+                        " at site " << site+1 << endl;
+                    outWarning(warn_str.str());
+                }
+                state = STATE_UNKNOWN;
+            }
+            if (state == STATE_INVALID) {
+                // only the first 100 errors are spelled out
+                if (num_error < 100) {
+                    err_str << "Sequence " << seq_names[seq] << " has invalid character ";
+                    err_str << " at site " << site+1 << endl;
+                } else if (num_error == 100)
+                    err_str << "...many more..." << endl;
+                num_error++;
+            }
+            pat[seq] = state;
+        }
+        // once an error was seen no further pattern is added
+        if (!num_error)
+            addPattern(pat, site/step);
+    }
+    if (num_error)
+        outError(err_str.str());
+    verbose_mode = save_mode;
+    countConstSite();
+    buildSeqStates();
+    // sanity check
+    for (iterator it = begin(); it != end(); it++)
+    	if (it->at(0) == -1)
+    		assert(0);
+    
+}
+
+/**
+    Parse one base-10 integer for convert_range().
+    @param str text to parse
+    @param endptr (OUT) first character after the parsed number
+    @return the parsed value
+    @throw string if no digits were found or the value does not fit in an int
+*/
+static int parse_range_number(const char *str, char* &endptr) {
+    long value = strtol(str, &endptr, 10);
+    if (endptr == str) {
+        string err = "Expecting integer, but found \"";
+        err += str;
+        err += "\" instead";
+        throw err;
+    }
+    // strtol clamps to LONG_MIN/LONG_MAX on overflow; reject everything
+    // that would change value when stored in an int
+    if (value != (long)(int)value) {
+        string err = "Integer out of range: \"";
+        err += str;
+        err += "\"";
+        throw err;
+    }
+    return (int)value;
+}
+
+/**
+    Parse a range of the form "lower", "lower-upper" or "lower-upper\step".
+    @param str text to parse
+    @param lower (OUT) lower bound
+    @param upper (OUT) upper bound; equal to lower if no "-upper" part is given
+    @param step_size (OUT) step size; 1 if no "\step" part is given
+    @param endptr (OUT) first character after the parsed range
+    @throw string if a number is missing or out of range
+*/
+void convert_range(const char *str, int &lower, int &upper, int &step_size, char* &endptr) throw (string) {
+    // parse the lower bound of the range
+    lower = parse_range_number(str, endptr);
+    upper = lower;
+    step_size = 1;
+    if (*endptr != '-') return;
+
+    // parse the upper bound of the range
+    upper = parse_range_number(endptr + 1, endptr);
+    if (*endptr != '\\') return;
+
+    // parse the step size of the range
+    step_size = parse_range_number(endptr + 1, endptr);
+}
+
+/**
+    Expand a site specification into a list of 0-based site indices.
+    The specification is a list of ranges understood by convert_range(),
+    separated by ',' or ' ', with 1-based positions. For codon alignments
+    the positions are divided by 3 and the total number of selected
+    characters must be a multiple of 3.
+    @param aln alignment the specification refers to
+    @param spec specification string
+    @param site_id (OUT) site indices are appended to this vector
+    Errors are reported through outError().
+*/
+void extractSiteID(Alignment *aln, const char* spec, IntVector &site_id) {
+    char *str = (char*)spec;
+    int nchars = 0;
+    try {
+        while (*str != 0) {
+            int lower, upper, step;
+            convert_range(str, lower, upper, step, str);
+            lower--;
+            upper--;
+            // remember the range in character units before codon scaling
+            const int raw_lower = lower;
+            const int raw_span = upper - lower + 1;
+            if (aln->seq_type == SEQ_CODON) {
+                lower /= 3;
+                upper /= 3;
+            }
+            if (upper >= aln->getNSite()) throw "Too large site ID";
+            // tested on the unscaled value: -1/3 == 0 would hide a position 0
+            if (raw_lower < 0) throw "Negative site ID";
+            if (lower > upper) throw "Wrong range";
+            if (step < 1) throw "Wrong step size";
+            // only divide once step is known to be positive
+            nchars += raw_span / step;
+            for (int i = lower; i <= upper; i += step)
+                site_id.push_back(i);
+            if (*str == ',' || *str == ' ') str++;
+            else break;
+        }
+        if (aln->seq_type == SEQ_CODON && nchars % 3 != 0)
+            throw (string)"Range " + spec + " length is not multiple of 3 (necessary for codon data)";
+    } catch (const char* err) {
+        outError(err);
+    } catch (const string &err) {
+        outError(err);
+    }
+}
+
+/**
+    Fill this alignment with the sites of another alignment selected by a
+    specification string; see extractSiteID() for the format.
+    @param aln source alignment
+    @param spec site specification
+*/
+void Alignment::extractSites(Alignment *aln, const char* spec) {
+    IntVector selected_sites;
+    extractSiteID(aln, spec, selected_sites);
+    extractSites(aln, selected_sites);
+}
+
+/**
+    Fill this alignment with a bootstrap resampling of another alignment.
+    @param aln source alignment; must not be a super alignment
+    @param pattern_freq if not NULL, (OUT) resized to the number of patterns
+        of aln and filled with how often each pattern was drawn
+    @param spec resampling scheme:
+        NULL: draw getNSite() sites with replacement;
+        "GENESITE,l1,l2,...": draw genes of the given lengths with
+        replacement, then draw sites within each drawn gene;
+        "GENE,l1,l2,...": draw whole genes of the given lengths;
+        otherwise a list of pairs "len,n": from each consecutive segment of
+        len sites draw n sites
+*/
+void Alignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq, const char *spec) {
+    if (aln->isSuperAlignment()) outError("Internal error: ", __func__);
+    int site, nsite = aln->getNSite();
+    seq_names.insert(seq_names.begin(), aln->seq_names.begin(), aln->seq_names.end());
+    num_states = aln->num_states;
+    seq_type = aln->seq_type;
+    genetic_code = aln->genetic_code;
+    STATE_UNKNOWN = aln->STATE_UNKNOWN;
+    site_pattern.resize(nsite, -1);
+    clear();
+    pattern_index.clear();
+    VerboseMode save_mode = verbose_mode;
+    verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+    if (pattern_freq) {
+        // resize to 0 first so that every counter starts at 0
+        pattern_freq->resize(0);
+        pattern_freq->resize(aln->getNPattern(), 0);
+    }
+	IntVector site_vec;
+    if (!spec) {
+		// standard bootstrap
+		for (site = 0; site < nsite; site++) {
+			int site_id = random_int(nsite);
+			int ptn_id = aln->getPatternID(site_id);
+			Pattern pat = aln->at(ptn_id);
+			addPattern(pat, site);
+			if (pattern_freq) ((*pattern_freq)[ptn_id])++;
+		}
+    } else if (strncmp(spec, "GENESITE,", 9) == 0) {
+		// resampling genes, then resampling sites within resampled genes
+		convert_int_vec(spec+9, site_vec);
+		int i;
+		// begin_site[i] = first site of gene i, genes being laid out consecutively
+		IntVector begin_site;
+		for (i = 0, site = 0; i < site_vec.size(); i++) {
+			begin_site.push_back(site);
+			site += site_vec[i];
+			//cout << "site = " << site_vec[i] << endl;
+		}
+		if (site > getNSite())
+			outError("Sum of lengths exceeded alignment length");
+
+		for (i = 0; i < site_vec.size(); i++) {
+			int part = random_int(site_vec.size());
+			for (int j = 0; j < site_vec[part]; j++) {
+				site = random_int(site_vec[part]) + begin_site[part];
+				int ptn = aln->getPatternID(site);
+				Pattern pat = aln->at(ptn);
+				// NOTE(review): addPattern receives the sampled source site,
+				// not a running output position as in the standard branch
+				// -- confirm this is intended
+				addPattern(pat, site);
+				if (pattern_freq) ((*pattern_freq)[ptn])++;
+			}
+		}
+    } else if (strncmp(spec, "GENE,", 5) == 0) {
+		// resampling genes instead of sites
+		convert_int_vec(spec+5, site_vec);
+		int i;
+		// begin_site[i] = first site of gene i, genes being laid out consecutively
+		IntVector begin_site;
+		for (i = 0, site = 0; i < site_vec.size(); i++) {
+			begin_site.push_back(site);
+			site += site_vec[i];
+			//cout << "site = " << site_vec[i] << endl;
+		}
+		if (site > getNSite())
+			outError("Sum of lengths exceeded alignment length");
+
+		for (i = 0; i < site_vec.size(); i++) {
+			int part = random_int(site_vec.size());
+			for (site = begin_site[part]; site < begin_site[part] + site_vec[part]; site++) {
+				int ptn = aln->getPatternID(site);
+				Pattern pat = aln->at(ptn);
+				// NOTE(review): as above, the source site index is used as
+				// the output position -- confirm this is intended
+				addPattern(pat, site);
+				if (pattern_freq) ((*pattern_freq)[ptn])++;
+			}
+		}
+    } else {
+    	// special bootstrap
+    	convert_int_vec(spec, site_vec);
+    	if (site_vec.size() % 2 != 0)
+    		outError("Bootstrap specification length is not divisible by 2");
+    	// the output length is the sum of the second entries of all pairs
+    	nsite = 0;
+    	int part, begin_site = 0, out_site = 0;
+    	for (part = 0; part < site_vec.size(); part+=2)
+    		nsite += site_vec[part+1];
+    	site_pattern.resize(nsite, -1);
+    	for (part = 0; part < site_vec.size(); part += 2) {
+    		if (begin_site + site_vec[part] > aln->getNSite())
+    			outError("Sum of lengths exceeded alignment length");
+    		for (site = 0; site < site_vec[part+1]; site++) {
+    			int site_id = random_int(site_vec[part]) + begin_site;
+    			int ptn_id = aln->getPatternID(site_id);
+    			Pattern pat = aln->at(ptn_id);
+    			addPattern(pat, site + out_site);
+    			if (pattern_freq) ((*pattern_freq)[ptn_id])++;
+    		}
+    		begin_site += site_vec[part];
+    		out_site += site_vec[part+1];
+    	}
+    }
+    verbose_mode = save_mode;
+    countConstSite();
+    buildSeqStates();
+}
+
+/**
+    Resample the pattern frequencies of this alignment.
+    @param pattern_freq (OUT) resized to the number of patterns; entry i
+        receives the resampled frequency of pattern i
+    @param spec resampling scheme, see createBootstrapAlignment(int*, const char*)
+*/
+void Alignment::createBootstrapAlignment(IntVector &pattern_freq, const char *spec) {
+    int nptn = getNPattern();
+    pattern_freq.resize(nptn, 0);
+    // a vector instead of new[]/delete[]: the buffer is released even when
+    // the call below leaves through an exception; at least one element so
+    // that &internal_freq[0] is always a valid pointer
+    std::vector<int> internal_freq((size_t)(nptn > 0 ? nptn : 1), 0);
+    createBootstrapAlignment(&internal_freq[0], spec);
+    for (int i = 0; i < nptn; i++)
+        pattern_freq[i] = internal_freq[i];
+}
+
+/**
+    Resample the pattern frequencies of this alignment into a plain array.
+    @param pattern_freq (OUT) array with getNPattern() entries; zeroed, then
+        entry i counts how often pattern i was drawn
+    @param spec resampling scheme:
+        NULL: draw getNSite() sites with replacement;
+        "GENESITE,l1,l2,...": draw genes of the given lengths with
+        replacement, then draw sites within each drawn gene;
+        "GENE,l1,l2,...": draw whole genes of the given lengths;
+        otherwise a list of pairs "len,n": from each consecutive segment of
+        len sites draw n sites
+*/
+void Alignment::createBootstrapAlignment(int *pattern_freq, const char *spec) {
+    const int nsite = getNSite();
+    memset(pattern_freq, 0, getNPattern()*sizeof(int));
+    IntVector site_vec;
+
+    if (!spec) {
+        // standard bootstrap
+        for (int draw = 0; draw < nsite; draw++) {
+            int site_id = random_int(nsite);
+            pattern_freq[getPatternID(site_id)]++;
+        }
+        return;
+    }
+
+    const bool by_gene_site = (strncmp(spec, "GENESITE,", 9) == 0);
+    const bool by_gene = !by_gene_site && (strncmp(spec, "GENE,", 5) == 0);
+
+    if (by_gene_site || by_gene) {
+        // gene lengths follow the keyword
+        convert_int_vec(spec + (by_gene_site ? 9 : 5), site_vec);
+
+        // begin_site[g] = first site of gene g, genes being laid out consecutively
+        IntVector begin_site;
+        int total = 0;
+        for (int g = 0; g < site_vec.size(); g++) {
+            begin_site.push_back(total);
+            total += site_vec[g];
+        }
+        if (total > getNSite())
+            outError("Sum of lengths exceeded alignment length");
+
+        for (int g = 0; g < site_vec.size(); g++) {
+            int part = random_int(site_vec.size());
+            if (by_gene_site) {
+                // resampling sites within the resampled gene
+                for (int j = 0; j < site_vec[part]; j++) {
+                    int site = random_int(site_vec[part]) + begin_site[part];
+                    pattern_freq[getPatternID(site)]++;
+                }
+            } else {
+                // taking every site of the resampled gene
+                for (int site = begin_site[part]; site < begin_site[part] + site_vec[part]; site++)
+                    pattern_freq[getPatternID(site)]++;
+            }
+        }
+        return;
+    }
+
+    // resampling sites within genes
+    convert_int_vec(spec, site_vec);
+    if (site_vec.size() % 2 != 0)
+        outError("Bootstrap specification length is not divisible by 2");
+    int begin_site = 0;
+    for (int part = 0; part < site_vec.size(); part += 2) {
+        if (begin_site + site_vec[part] > getNSite())
+            outError("Sum of lengths exceeded alignment length");
+        for (int draw = 0; draw < site_vec[part+1]; draw++) {
+            int site_id = random_int(site_vec[part]) + begin_site;
+            pattern_freq[getPatternID(site_id)]++;
+        }
+        begin_site += site_vec[part];
+    }
+}
+
+/**
+    Fill this alignment with a copy of aln in which every character is set
+    to STATE_UNKNOWN wherever the same sequence (matched by name) has
+    STATE_UNKNOWN at the same site in masked_aln.
+    @param masked_aln alignment providing the mask; must have the same
+        number of sequences and sites as aln
+    @param aln alignment providing the characters
+*/
+void Alignment::createGapMaskedAlignment(Alignment *masked_aln, Alignment *aln) {
+    if (masked_aln->getNSeq() != aln->getNSeq()) outError("Different number of sequences in masked alignment");
+    if (masked_aln->getNSite() != aln->getNSite()) outError("Different number of sites in masked alignment");
+
+    const int nsite = aln->getNSite();
+    const int nseq = aln->getNSeq();
+    seq_names.insert(seq_names.begin(), aln->seq_names.begin(), aln->seq_names.end());
+
+    // inherit the data-type description of the source alignment
+    num_states = aln->num_states;
+    seq_type = aln->seq_type;
+    genetic_code = aln->genetic_code;
+    STATE_UNKNOWN = aln->STATE_UNKNOWN;
+
+    site_pattern.resize(nsite, -1);
+    clear();
+    pattern_index.clear();
+
+    // name_map[k] = index in masked_aln of the sequence named seq_names[k]
+    IntVector name_map;
+    for (size_t k = 0; k < seq_names.size(); k++) {
+        int masked_id = masked_aln->getSeqID(seq_names[k]);
+        if (masked_id < 0) outError("Masked alignment does not contain taxon ", seq_names[k]);
+        name_map.push_back(masked_id);
+    }
+
+    VerboseMode save_mode = verbose_mode;
+    verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+    for (int site = 0; site < nsite; site++) {
+        Pattern pat = aln->at(aln->getPatternID(site));
+        Pattern mask = masked_aln->at(masked_aln->getPatternID(site));
+        for (int seq = 0; seq < nseq; seq++) {
+            if (mask[name_map[seq]] == STATE_UNKNOWN)
+                pat[seq] = STATE_UNKNOWN;
+        }
+        addPattern(pat, site);
+    }
+    verbose_mode = save_mode;
+    countConstSite();
+    buildSeqStates();
+}
+
+/**
+    Randomly permute the order of the sites (the site-to-pattern map).
+    Calls outError() if this is a super alignment.
+*/
+void Alignment::shuffleAlignment() {
+    if (isSuperAlignment()) {
+        outError("Internal error: ", __func__);
+    }
+    my_random_shuffle(site_pattern.begin(), site_pattern.end());
+}
+
+
+/**
+    Append the sites of another alignment after the sites of this one.
+    Sequences are matched by name, so aln may list them in another order.
+    @param aln alignment to append; must have the same number of sequences,
+        the same number of states and the same data type
+*/
+void Alignment::concatenateAlignment(Alignment *aln) {
+    if (getNSeq() != aln->getNSeq()) outError("Different number of sequences in two alignments");
+    if (num_states != aln->num_states) outError("Different number of states in two alignments");
+    if (seq_type != aln->seq_type) outError("Different data type in two alignments");
+
+    const int nsite = aln->getNSite();
+    const int cur_sites = getNSite();
+    site_pattern.resize(cur_sites + nsite , -1);
+
+    // name_map[k] = index in aln of the sequence named seq_names[k]
+    IntVector name_map;
+    for (size_t k = 0; k < seq_names.size(); k++) {
+        int other_id = aln->getSeqID(seq_names[k]);
+        if (other_id < 0) outError("The other alignment does not contain taxon ", seq_names[k]);
+        name_map.push_back(other_id);
+    }
+
+    VerboseMode save_mode = verbose_mode;
+    verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+    for (int site = 0; site < nsite; site++) {
+        Pattern pat = aln->at(aln->getPatternID(site));
+        // reorder the characters into the sequence order of this alignment
+        Pattern new_pat = pat;
+        for (int k = 0; k < name_map.size(); k++)
+            new_pat[k] = pat[name_map[k]];
+        addPattern(new_pat, site + cur_sites);
+    }
+    verbose_mode = save_mode;
+    countConstSite();
+    buildSeqStates();
+}
+
+/**
+    Fill this alignment with a site-by-site copy of another alignment.
+    @param aln source alignment
+*/
+void Alignment::copyAlignment(Alignment *aln) {
+    const int nsite = aln->getNSite();
+    seq_names.insert(seq_names.begin(), aln->seq_names.begin(), aln->seq_names.end());
+
+    // inherit the data-type description of the source alignment
+    num_states = aln->num_states;
+    seq_type = aln->seq_type;
+    genetic_code = aln->genetic_code;
+    STATE_UNKNOWN = aln->STATE_UNKNOWN;
+
+    site_pattern.resize(nsite, -1);
+    clear();
+    pattern_index.clear();
+
+    VerboseMode save_mode = verbose_mode;
+    verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+    for (int site = 0; site < nsite; site++) {
+        Pattern pat = aln->at(aln->getPatternID(site));
+        addPattern(pat, site);
+    }
+    verbose_mode = save_mode;
+    countConstSite();
+    buildSeqStates();
+}
+
+/**
+    Recompute frac_const_sites and num_informative_sites from the is_const /
+    is_informative flags and the frequencies of all patterns.
+*/
+void Alignment::countConstSite() {
+    int num_const_sites = 0;
+    num_informative_sites = 0;
+    for (iterator ptn = begin(); ptn != end(); ++ptn) {
+        if (ptn->is_const)
+            num_const_sites += ptn->frequency;
+        if (ptn->is_informative)
+            num_informative_sites += ptn->frequency;
+    }
+    frac_const_sites = ((double)num_const_sites) / getNSite();
+}
+
+/**
+    @return a string with one character per state (stop codons excluded)
+        whose constant pattern, i.e. that state in every sequence, is not
+        present in pattern_index
+*/
+string Alignment::getUnobservedConstPatterns() {
+    string unobserved = "";
+    for (char state = 0; state < num_states; state++) {
+        if (isStopCodon(state))
+            continue;
+        // the constant pattern made of this state only
+        string const_pat;
+        const_pat.resize(getNSeq(), state);
+        if (pattern_index.find(const_pat) == pattern_index.end())
+            unobserved.push_back(state);
+    }
+    return unobserved;
+}
+
+/**
+    @param seq_id sequence index
+    @return number of sites at which the sequence has a state in the range
+        0 .. num_states-1
+*/
+int Alignment::countProperChar(int seq_id) {
+    int num_proper_chars = 0;
+    for (iterator ptn = begin(); ptn != end(); ++ptn) {
+        if ((*ptn)[seq_id] >= 0 && (*ptn)[seq_id] < num_states)
+            num_proper_chars += ptn->frequency;
+    }
+    return num_proper_chars;
+}
+
+/**
+    Destructor: releases the pars_lower_bound array.
+    The clean-up of codon_table and non_stop_codon was already commented
+    out in the original code and is not performed.
+*/
+Alignment::~Alignment()
+{
+    // delete[] on a null pointer is a no-op, so no guard is needed
+    delete [] pars_lower_bound;
+    pars_lower_bound = NULL;
+}
+
+/**
+    Observed distance between two sequences: the fraction of differing
+    sites among the sites where both states are below num_states.
+    @param seq1, seq2 sequence indices
+    @return the fraction, or MAX_GENETIC_DIST if no site qualifies
+*/
+double Alignment::computeObsDist(int seq1, int seq2) {
+    int num_diff = 0, num_compared = 0;
+    for (iterator ptn = begin(); ptn != end(); ++ptn) {
+        if (!((*ptn)[seq1] < num_states && (*ptn)[seq2] < num_states))
+            continue;
+        num_compared += ptn->frequency;
+        if ((*ptn)[seq1] != (*ptn)[seq2])
+            num_diff += ptn->frequency;
+    }
+    if (!num_compared)
+        return MAX_GENETIC_DIST; // return +INF if no overlap between two sequences
+    return ((double)num_diff) / num_compared;
+}
+
+/**
+    Distance between two sequences computed from the observed distance p as
+    -log(1 - z*p) / z with z = num_states / (num_states - 1).
+    @param seq1, seq2 sequence indices
+    @return the distance, or MAX_GENETIC_DIST if 1 - z*p is not positive
+*/
+double Alignment::computeJCDist(int seq1, int seq2) {
+    const double obs_dist = computeObsDist(seq1, seq2);
+    const double z = (double)num_states / (num_states-1);
+    const double log_arg = 1.0 - (z * obs_dist);
+
+    // the logarithm is undefined for a non-positive argument
+    if (log_arg <= 0)
+        return MAX_GENETIC_DIST;
+
+    return -log(log_arg) / z;
+}
+
+/**
+    Write a distance matrix to a stream: the number of sequences on the
+    first line, then one line per sequence with its name followed by its
+    row of distances in fixed notation with 6 decimals.
+    @param out stream to write to
+    @param dist_mat row-major nseqs x nseqs matrix
+*/
+void Alignment::printDist(ostream &out, double *dist_mat) {
+    const int nseqs = getNSeq();
+    // names are left-justified in a column at least 10 characters wide
+    int name_width = getMaxSeqNameLength();
+    if (name_width < 10) name_width = 10;
+
+    out << nseqs << endl;
+    out.precision(6);
+    out << fixed;
+    for (int row = 0; row < nseqs; row++) {
+        out.width(name_width);
+        out << left << getSeqName(row) << " ";
+        for (int col = 0; col < nseqs; col++)
+            out << dist_mat[row * nseqs + col] << " ";
+        out << endl;
+    }
+}
+
+/**
+    Write a distance matrix to a file, overwriting it.
+    @param file_name output file name
+    @param dist_mat row-major matrix, see printDist(ostream&, double*)
+    Any stream failure is reported through outError(ERR_WRITE_OUTPUT, file_name).
+*/
+void Alignment::printDist(const char *file_name, double *dist_mat) {
+    try {
+        ofstream out;
+        out.exceptions(ios::failbit | ios::badbit);
+        out.open(file_name);
+        printDist(out, dist_mat);
+        out.close();
+    } catch (const ios::failure &) {
+        // caught by const reference: no copy and no slicing of the exception
+        outError(ERR_WRITE_OUTPUT, file_name);
+    }
+}
+
+/**
+    Read a square distance matrix from a stream. The expected layout is the
+    number of sequences, then for every sequence its name followed by one
+    row of distances. Rows may come in any order; they are matched to the
+    sequences of this alignment by name.
+    @param in stream to read from
+    @param dist_mat (OUT) row-major nseqs x nseqs matrix in the sequence
+        order of this alignment
+    @return the largest distance read
+    @throw const char* or string if the number of sequences differs, a
+        sequence name is missing, a diagonal element is not zero or the
+        matrix is not symmetric
+*/
+double Alignment::readDist(istream &in, double *dist_mat) {
+    double longest_dist = 0.0;
+    int nseqs = 0;
+    in >> nseqs;
+    if (nseqs != getNSeq())
+        throw "Distance file has different number of taxa";
+
+    // matrix in file order; a vector so that it is released on every exit
+    // path, including the throws below
+    std::vector<double> tmp_dist_mat((size_t)nseqs * nseqs);
+    std::map< string, int > map_seqName_ID;
+    int seq1, seq2;
+    size_t pos = 0;
+    // read in distances to a temporary array
+    for (seq1 = 0; seq1 < nseqs; seq1++)  {
+        string seq_name;
+        in >> seq_name;
+        // assign taxa name to its row number in the file
+        map_seqName_ID[seq_name] = seq1;
+        for (seq2 = 0; seq2 < nseqs; seq2++, pos++) {
+            in >> tmp_dist_mat[pos];
+            if (tmp_dist_mat[pos] > longest_dist)
+                longest_dist = tmp_dist_mat[pos];
+        }
+    }
+
+    // file_row[s] = row in the file of sequence s of this alignment,
+    // looked up once per sequence
+    std::vector<int> file_row((size_t)nseqs);
+    for (seq1 = 0; seq1 < nseqs; seq1++) {
+        string name = getSeqName(seq1);
+        std::map< string, int >::const_iterator found = map_seqName_ID.find(name);
+        if (found == map_seqName_ID.end())
+            throw "Could not find taxa name " + name;
+        file_row[seq1] = found->second;
+    }
+
+    // Now initialize the internal distance matrix, in which the sequence order is the same
+    // as in the alignment
+    for (seq1 = 0; seq1 < nseqs; seq1++)
+        for (seq2 = 0; seq2 < nseqs; seq2++)
+            dist_mat[seq1 * nseqs + seq2] = tmp_dist_mat[(size_t)file_row[seq1] * nseqs + file_row[seq2]];
+
+    // check for zero diagonal and symmetry; the loop covers every row so
+    // that the last diagonal element is checked as well
+    for (seq1 = 0; seq1 < nseqs; seq1++) {
+        if (dist_mat[seq1*nseqs+seq1] != 0.0)
+            throw "Diagonal elements of distance matrix is not ZERO";
+        for (seq2 = seq1+1; seq2 < nseqs; seq2++)
+            if (dist_mat[seq1*nseqs+seq2] != dist_mat[seq2*nseqs+seq1])
+                throw "Distance between " + getSeqName(seq1) + " and " + getSeqName(seq2) + " is not symmetric";
+    }
+
+    return longest_dist;
+}
+
+/**
+    Read a distance matrix from a file.
+    @param file_name input file name
+    @param dist_mat (OUT) row-major matrix, see readDist(istream&, double*)
+    @return the largest distance read
+    Format errors and stream failures are reported through outError().
+*/
+double Alignment::readDist(const char *file_name, double *dist_mat) {
+    double longest_dist = 0.0;
+
+    try {
+        ifstream in;
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(file_name);
+        longest_dist = readDist(in, dist_mat);
+        in.close();
+        cout << "Distance matrix was read from " << file_name << endl;
+    } catch (const char *str) {
+        outError(str);
+    } catch (const string &str) {
+        // exceptions of class type are caught by const reference: no copy
+        outError(str);
+    } catch (const ios::failure &) {
+        outError(ERR_READ_INPUT, file_name);
+    }
+    return longest_dist;
+}
+
+/**
+    Estimate the state frequencies of the whole alignment.
+    Starting from equal frequencies, NUM_TIME rounds redistribute the count of
+    every character (including ambiguous and unknown ones) among the states it is
+    compatible with, in proportion to the current frequency estimates. The result
+    is finally passed through convfreq().
+    @param state_freq (OUT) array of num_states frequencies
+    @param num_unknown_states initial count assigned to STATE_UNKNOWN before the
+           characters of the alignment are tallied
+ */
+void Alignment::computeStateFreq (double *state_freq, size_t num_unknown_states) {
+    int i, j;
+    // states_app[s*num_states + j] is 1.0 if character s is compatible with state j
+    double *states_app = new double[num_states*(STATE_UNKNOWN+1)];
+    double *new_freq = new double[num_states];
+    unsigned *state_count = new unsigned[STATE_UNKNOWN+1];
+    double *new_state_freq = new double[num_states];
+
+    memset(state_count, 0, sizeof(unsigned)*(STATE_UNKNOWN+1));
+    state_count[(int)STATE_UNKNOWN] = num_unknown_states;
+
+    for (i = 0; i <= STATE_UNKNOWN; i++)
+        getAppearance(i, &states_app[i*num_states]);
+
+    // tally every character of every pattern, weighted by the pattern frequency
+    for (iterator it = begin(); it != end(); it++)
+        for (Pattern::iterator it2 = it->begin(); it2 != it->end(); it2++)
+            state_count[(int)*it2] += it->frequency;
+
+    for (i = 0; i < num_states; i++)
+        state_freq[i] = 1.0/num_states;
+
+    const int NUM_TIME = 8;
+    for (int k = 0; k < NUM_TIME; k++) {
+        memset(new_state_freq, 0, sizeof(double)*num_states);
+
+        for (i = 0; i <= STATE_UNKNOWN; i++) {
+            if (state_count[i] == 0) continue;
+            double sum_freq = 0.0;
+            for (j = 0; j < num_states; j++) {
+                new_freq[j] = state_freq[j] * states_app[i*num_states+j];
+                sum_freq += new_freq[j];
+            }
+            // a character compatible with no state of positive frequency would
+            // give 1/0 below and turn every frequency into NaN: skip it
+            if (sum_freq <= 0.0) continue;
+            sum_freq = 1.0/sum_freq;
+            for (j = 0; j < num_states; j++) {
+                new_state_freq[j] += new_freq[j]*sum_freq*state_count[i];
+            }
+        }
+
+        double sum_freq = 0.0;
+        for (j = 0; j < num_states; j++)
+            sum_freq += new_state_freq[j];
+        // nothing was counted: keep the current estimate instead of dividing by zero
+        if (sum_freq <= 0.0) break;
+        sum_freq = 1.0/sum_freq;
+        for (j = 0; j < num_states; j++)
+            state_freq[j] = new_state_freq[j]*sum_freq;
+    }
+
+    convfreq(state_freq);
+
+    if (verbose_mode >= VB_MED) {
+        cout << "Empirical state frequencies: ";
+        for (i = 0; i < num_states; i++)
+            cout << state_freq[i] << " ";
+        cout << endl;
+    }
+
+    delete [] new_state_freq;
+    delete [] state_count;
+    delete [] new_freq;
+    delete [] states_app;
+}
+
+/**
+    Count how often each of the num_states states occurs in every sequence.
+    Characters whose state is not below num_states are not counted.
+    @param count_per_sequence (OUT) table of getNSeq()*num_states counters;
+           entry [seq*num_states + state] receives the summed pattern frequencies
+ */
+void Alignment::countStatePerSequence (unsigned *count_per_sequence) {
+    const int nseqs = getNSeq();
+    memset(count_per_sequence, 0, sizeof(unsigned)*num_states*nseqs);
+    for (iterator pat = begin(); pat != end(); pat++) {
+        for (int seq = 0; seq != nseqs; seq++) {
+            if (!(pat->at(seq) < num_states))
+                continue;
+            count_per_sequence[seq*num_states + pat->at(seq)] += pat->frequency;
+        }
+    }
+}
+
+/**
+    Estimate the state frequencies separately for every sequence.
+    Uses the same NUM_TIME rounds of redistribution as computeStateFreq(), applied
+    to the character counts of one sequence at a time. Unlike computeStateFreq(),
+    the result is not passed through convfreq().
+    @param freq_per_sequence (OUT) table of getNSeq()*num_states frequencies;
+           the frequencies of sequence s start at index s*num_states
+ */
+void Alignment::computeStateFreqPerSequence (double *freq_per_sequence) {
+    int i, j;
+    int nseqs = getNSeq();
+    // states_app[s*num_states + j] is 1.0 if character s is compatible with state j
+    double *states_app = new double[num_states*(STATE_UNKNOWN+1)];
+    double *new_freq = new double[num_states];
+    unsigned *state_count = new unsigned[(STATE_UNKNOWN+1)*nseqs];
+    double *new_state_freq = new double[num_states];
+
+    memset(state_count, 0, sizeof(unsigned)*(STATE_UNKNOWN+1)*nseqs);
+
+    for (i = 0; i <= STATE_UNKNOWN; i++)
+        getAppearance(i, &states_app[i*num_states]);
+
+    // per-sequence tally of every character, weighted by the pattern frequency
+    for (iterator it = begin(); it != end(); it++)
+        for (i = 0; i != nseqs; i++) {
+            state_count[i*(STATE_UNKNOWN+1) + it->at(i)] += it->frequency;
+        }
+    double equal_freq = 1.0/num_states;
+    for (i = 0; i < num_states*nseqs; i++)
+        freq_per_sequence[i] = equal_freq;
+
+    const int NUM_TIME = 8;
+    for (int k = 0; k < NUM_TIME; k++) {
+        for (int seq = 0; seq < nseqs; seq++) {
+            double *state_freq = &freq_per_sequence[seq*num_states];
+            memset(new_state_freq, 0, sizeof(double)*num_states);
+            for (i = 0; i <= STATE_UNKNOWN; i++) {
+                if (state_count[seq*(STATE_UNKNOWN+1)+i] == 0) continue;
+                double sum_freq = 0.0;
+                for (j = 0; j < num_states; j++) {
+                    new_freq[j] = state_freq[j] * states_app[i*num_states+j];
+                    sum_freq += new_freq[j];
+                }
+                // a character compatible with no state of positive frequency would
+                // give 1/0 below and turn every frequency into NaN: skip it
+                if (sum_freq <= 0.0) continue;
+                sum_freq = 1.0/sum_freq;
+                for (j = 0; j < num_states; j++) {
+                    new_state_freq[j] += new_freq[j]*sum_freq*state_count[seq*(STATE_UNKNOWN+1)+i];
+                }
+            }
+
+            double sum_freq = 0.0;
+            for (j = 0; j < num_states; j++)
+                sum_freq += new_state_freq[j];
+            // nothing was counted for this sequence: keep its current estimate
+            if (sum_freq <= 0.0) continue;
+            sum_freq = 1.0/sum_freq;
+            for (j = 0; j < num_states; j++)
+                state_freq[j] = new_state_freq[j]*sum_freq;
+        }
+    }
+
+    delete [] new_state_freq;
+    delete [] state_count;
+    delete [] new_freq;
+    delete [] states_app;
+}
+
+//void Alignment::computeStateFreq (double *stateFrqArr) {
+//    int stateNo_;
+//    int nState_ = num_states;
+//    int nseqs = getNSeq();
+//    double *timeAppArr_ = new double[num_states];
+//    double *siteAppArr_ = new double[num_states]; //App = appearance
+//    double *newSiteAppArr_ = new double[num_states];
+//
+//    for (stateNo_ = 0; stateNo_ < nState_; stateNo_ ++)
+//        stateFrqArr [ stateNo_ ] = 1.0 / nState_;
+//
+//    int NUM_TIME = 8;
+//    //app = appeareance
+//    if (verbose_mode >= VB_MED)
+//        cout << "Computing state frequencies..." << endl;
+//    for (int time_ = 0; time_ < NUM_TIME; time_ ++)
+//    {
+//        for (stateNo_ = 0; stateNo_ < nState_; stateNo_ ++)
+//            timeAppArr_[stateNo_] = 0.0;
+//
+//        for (iterator it = begin(); it != end(); it++)
+//            for (int i = 0; i < (*it).frequency; i++)
+//            {
+//                for (int seq = 0; seq < nseqs; seq++) {
+//                    int stateNo_ = (*it)[seq];
+//
+//                    getAppearance (stateNo_, siteAppArr_);
+//
+//                    double totalSiteApp_ = 0.0;
+//                    for (stateNo_ = 0; stateNo_ < nState_; stateNo_ ++) {
+//                        newSiteAppArr_[stateNo_] = stateFrqArr[stateNo_] * siteAppArr_[stateNo_];
+//                        totalSiteApp_ += newSiteAppArr_[stateNo_];
+//                    }
+//                    totalSiteApp_ = 1.0 / totalSiteApp_;
+//
+//                    for (stateNo_ = 0; stateNo_ < nState_; stateNo_ ++)
+//                        timeAppArr_[stateNo_] += newSiteAppArr_[stateNo_] * totalSiteApp_;
+//                }
+//            }
+//
+//        double totalTimeApp_ = 0.0;
+//        int stateNo_;
+//        for (stateNo_ = 0; stateNo_ < nState_; stateNo_ ++)
+//            totalTimeApp_ += timeAppArr_[stateNo_];
+//
+//
+//        for (stateNo_ = 0; stateNo_ < nState_; stateNo_ ++)
+//            stateFrqArr[stateNo_] = timeAppArr_[stateNo_] / totalTimeApp_;
+//
+//    } //end of for time_
+//
+//    //  std::cout << "state frequency ..." << endl;
+//    // for (stateNo_ = 0; stateNo_ < nState_; stateNo_ ++)
+//    // std::cout << stateFrqArr[stateNo_] << endl;
+//
+//	convfreq(stateFrqArr);
+//
+//    if (verbose_mode >= VB_MED) {
+//        cout << "Empirical state frequencies: ";
+//        for (stateNo_ = 0; stateNo_ < nState_; stateNo_ ++)
+//            cout << stateFrqArr[stateNo_] << " ";
+//        cout << endl;
+//    }
+//	delete [] newSiteAppArr_;
+//	delete [] siteAppArr_;
+//	delete [] timeAppArr_;
+//	
+//}
+
+/**
+    Expand a character state into a 0/1 compatibility vector.
+    STATE_UNKNOWN is compatible with every state, a regular state only with
+    itself, and an ambiguous DNA or protein character with the states encoded in
+    its bit mask.
+    @param state character state to expand
+    @param state_app (OUT) array of num_states entries, set to 1.0 for compatible
+           states and 0.0 otherwise
+ */
+void Alignment::getAppearance(char state, double *state_app) {
+    int i;
+    if (state == STATE_UNKNOWN) {
+        for (i = 0; i < num_states; i++)
+            state_app[i] = 1.0;
+        return;
+    }
+
+    memset(state_app, 0, num_states * sizeof(double));
+    if (state < num_states) {
+        state_app[(int)state] = 1.0;
+        return;
+    }
+    // ambiguous characters
+    // bit i of an entry marks protein state i as compatible: states {2,3}, {5,6}
+    // and {9,10} (presumably the ambiguity codes B, Z and J -- TODO confirm)
+    int ambi_aa[] = {4+8, 32+64, 512+1024};
+    switch (seq_type) {
+    case SEQ_DNA:
+        // what remains after the offset is a bit mask over the num_states states
+        state -= (num_states-1);
+        for (i = 0; i < num_states; i++)
+            if (state & (1 << i)) {
+                state_app[i] = 1.0;
+            }
+        break;
+    case SEQ_PROTEIN:
+        assert(state<23);
+        // the assert disappears under NDEBUG; without this guard ambi_aa would be
+        // read out of bounds. Same behaviour as the StateBitset overload:
+        // state_app is left all-zero.
+        if (state >= 23) return;
+        state -= 20;
+        for (i = 0; i < 11; i++)
+            if (ambi_aa[(int)state] & (1<<i)) {
+                state_app[i] = 1.0;
+            }
+        break;
+    default: assert(0); break;
+    }
+}
+
+/**
+    Expand a character state into a bitset of compatible states.
+    STATE_UNKNOWN sets every bit, a regular state sets only its own bit, and an
+    ambiguous DNA or protein character sets the bits encoded in its mask.
+    Protein characters of 23 and above leave the bitset empty.
+    @param state character state to expand
+    @param state_app (OUT) bitset with one bit per compatible state
+ */
+void Alignment::getAppearance(char state, StateBitset &state_app) {
+
+    if (state == STATE_UNKNOWN) {
+        state_app.set();
+        return;
+    }
+
+    state_app.reset();
+    if (state < num_states) {
+        state_app[(int)state] = 1;
+        return;
+    }
+
+    // from here on the character is ambiguous
+    if (seq_type == SEQ_DNA) {
+        // what remains after the offset is a bit mask over the num_states states
+        const int dna_mask = state - (num_states-1);
+        for (int bit = 0; bit < num_states; bit++) {
+            if (dna_mask & (1 << bit))
+                state_app[bit] = 1;
+        }
+    } else if (seq_type == SEQ_PROTEIN) {
+        if (state >= 23)
+            return;
+        // bit i of an entry marks protein state i as compatible
+        const int ambi_aa[] = {4+8, 32+64, 512+1024};
+        const int aa_mask = ambi_aa[state - 20];
+        for (int bit = 0; bit < 11; bit++) {
+            if (aa_mask & (1<<bit))
+                state_app[bit] = 1;
+        }
+    } else {
+        assert(0);
+    }
+}
+
+/**
+    Compute codon state frequencies under the requested frequency model.
+    A codon state c is decoded into its three nucleotides as c/16, (c%16)/4 and c%4.
+    - FREQ_CODON_1x4: one set of nucleotide frequencies pooled over the three
+      codon positions; it is copied into all three blocks of ntfreq.
+    - FREQ_CODON_3x4: a separate set of nucleotide frequencies per codon position.
+    - FREQ_EMPIRICAL / FREQ_ESTIMATE: observed codon proportions; ntfreq is not touched.
+    - FREQ_CODON_3x4C and any other value are reported through outError().
+    For F1x4 and F3x4 every stop codon is pinned to MIN_FREQUENCY and the other
+    codons are rescaled so that all frequencies sum to one.
+    The result is finally passed through convfreq().
+    @param freq frequency model
+    @param state_freq (OUT) array of num_states codon frequencies
+    @param ntfreq (OUT) array of 12 nucleotide frequencies, 4 per codon position
+ */
+void Alignment::computeCodonFreq(StateFreqType freq, double *state_freq, double *ntfreq) {
+    int nseqs = getNSeq();
+    int i, j;
+
+    if (freq == FREQ_CODON_1x4 || freq == FREQ_CODON_3x4) {
+        // F1x4 counts all three codon positions into ntfreq[0..3];
+        // F3x4 uses ntfreq[0..3], ntfreq[4..7] and ntfreq[8..11] for positions 1, 2 and 3
+        const bool pooled = (freq == FREQ_CODON_1x4);
+        const int num_ntfreq = pooled ? 4 : 12;
+        const int pos2 = pooled ? 0 : 4;
+        const int pos3 = pooled ? 0 : 8;
+
+        memset(ntfreq, 0, sizeof(double)*num_ntfreq);
+        for (iterator it = begin(); it != end(); it++) {
+            for (int seq = 0; seq < nseqs; seq++) {
+                if ((*it)[seq] == STATE_UNKNOWN) continue;
+                int codon = (int)(*it)[seq];
+                ntfreq[codon / 16] += (*it).frequency;
+                ntfreq[pos2 + (codon % 16) / 4] += (*it).frequency;
+                ntfreq[pos3 + codon % 4] += (*it).frequency;
+            }
+        }
+
+        // turn every block of four counts into frequencies
+        for (j = 0; j < num_ntfreq; j+=4) {
+            double sum = 0;
+            for (i = 0; i < 4; i++)
+                sum += ntfreq[i+j];
+            for (i = 0; i < 4; i++)
+                ntfreq[i+j] /= sum;
+            if (verbose_mode >= VB_MED) {
+                for (i = 0; i < 4; i++)
+                    cout << "  " << symbols_dna[i] << ": " << ntfreq[i+j];
+                cout << endl;
+            }
+        }
+        if (pooled) {
+            // the same frequencies apply to the second and third codon position
+            memcpy(ntfreq+4, ntfreq, sizeof(double)*4);
+            memcpy(ntfreq+8, ntfreq, sizeof(double)*4);
+        }
+
+        // codon frequency = product of its three position-specific nucleotide frequencies
+        double sum_stop = 0.0; // probability mass the product formula gives to stop codons
+        double sum = 0.0;      // mass actually assigned to stop codons (MIN_FREQUENCY each)
+        for (i = 0; i < num_states; i++) {
+            state_freq[i] = ntfreq[i/16] * ntfreq[4+(i%16)/4] * ntfreq[8+i%4];
+            if (isStopCodon(i)) {
+                sum_stop += state_freq[i];
+                state_freq[i] = MIN_FREQUENCY;
+                sum += MIN_FREQUENCY;
+            }
+        }
+        // rescale the remaining codons so that everything sums to one again
+        sum = (1.0-sum)/(1.0-sum_stop);
+        for (i = 0; i < num_states; i++)
+            if (!isStopCodon(i))
+                state_freq[i] *= sum;
+        sum = 0.0;
+        for (i = 0; i < num_states; i++)
+            sum += state_freq[i];
+        assert(fabs(sum-1.0)<1e-5);
+    } else if (freq == FREQ_CODON_3x4C) {
+        outError("F3X4C not yet implemented. Contact authors if you really need it.");
+    } else if (freq == FREQ_EMPIRICAL || freq == FREQ_ESTIMATE) {
+        // observed proportions; characters that are not one of the num_states states are ignored
+        memset(state_freq, 0, num_states*sizeof(double));
+        for (iterator it = begin(); it != end(); it++)
+            for (int seq = 0; seq < nseqs; seq++) {
+                int state = it->at(seq);
+                if (state >= num_states) continue;
+                state_freq[state] += it->frequency;
+            }
+        double sum = 0.0;
+        for (i = 0; i < num_states; i++)
+            sum += state_freq[i];
+        for (i = 0; i < num_states; i++)
+            state_freq[i] /= sum;
+    } else {
+        outError("Unsupported codon frequency");
+    }
+    convfreq(state_freq);
+}
+
+/**
+    Derive initial substitution rates from the co-occurrence of states within
+    alignment patterns.
+    For every pattern and every pair of states i < j, the product of the two
+    state counts times the pattern frequency is accumulated. The totals are
+    divided by the total of the last pair (num_states-2, num_states-1), or by 1
+    if that total is zero. Rates of at most 0.0001 are replaced by 0.01 and rates
+    above 100 by 50; the last rate is set to 1.
+    @param rates (OUT) array of num_states*(num_states-1)/2 rates, one per state
+           pair i < j, ordered by i and then by j
+ */
+void Alignment::computeEmpiricalRate (double *rates) {
+    int i, j, k;
+    assert(rates);
+    int nseqs = getNSeq();
+    // accumulated in double: count*count*frequency summed over all patterns can
+    // exceed the range of unsigned on large alignments and would silently wrap
+    double *pair_rates = new double[num_states*num_states];
+    memset(pair_rates, 0, sizeof(double)*num_states*num_states);
+
+    unsigned *state_freq = new unsigned[STATE_UNKNOWN+1];
+
+    for (iterator it = begin(); it != end(); it++) {
+        // number of sequences showing each character in this pattern
+        memset(state_freq, 0, sizeof(unsigned)*(STATE_UNKNOWN+1));
+        for (i = 0; i < nseqs; i++) {
+            state_freq[(int)it->at(i)]++;
+        }
+        // only the upper triangle (i < j) is filled; the diagonal is never read
+        for (i = 0; i < num_states; i++) {
+            if (state_freq[i] == 0) continue;
+            for (j = i+1; j < num_states; j++)
+                pair_rates[i*num_states+j] += (double)state_freq[i]*state_freq[j]*it->frequency;
+        }
+    }
+
+    k = 0;
+    double last_rate = pair_rates[(num_states-2)*num_states+num_states-1] + pair_rates[(num_states-1)*num_states+num_states-2];
+    if (last_rate == 0) last_rate = 1;
+    for (i = 0; i < num_states-1; i++)
+        for (j = i+1; j < num_states; j++) {
+            rates[k++] = (pair_rates[i*num_states+j] + pair_rates[j*num_states+i]) / last_rate;
+            // BIG WARNING: zero rates might cause numerical instability!
+            if (rates[k-1] <= 0.0001) rates[k-1] = 0.01;
+            if (rates[k-1] > 100.0) rates[k-1] = 50.0;
+        }
+    rates[k-1] = 1;
+    if (verbose_mode >= VB_MAX) {
+        cout << "Empirical rates: ";
+        for (k = 0; k < num_states*(num_states-1)/2; k++)
+            cout << rates[k] << " ";
+        cout << endl;
+    }
+
+    delete [] state_freq;
+    delete [] pair_rates;
+}
+
+/**
+    Empirical rates for a non-reversible model.
+    Calls computeEmpiricalRate() and then writes every rate twice, once for each
+    direction of its state pair.
+    @param rates (OUT) array of num_states*(num_states-1) rates, ordered by
+           source state and then by target state, skipping identical states
+ */
+void Alignment::computeEmpiricalRateNonRev (double *rates) {
+    double *full_mat = new double[num_states*num_states];
+
+    computeEmpiricalRate(rates);
+
+    // unpack the pairwise rates into both triangles of a square matrix;
+    // the diagonal is left unset and never read
+    int pos = 0;
+    for (int row = 0; row < num_states-1; row++) {
+        for (int col = row+1; col < num_states; col++) {
+            const double pair_rate = rates[pos++];
+            full_mat[col*num_states+row] = pair_rate;
+            full_mat[row*num_states+col] = pair_rate;
+        }
+    }
+
+    // read the matrix back row by row without the diagonal
+    pos = 0;
+    for (int row = 0; row < num_states; row++) {
+        for (int col = 0; col < num_states; col++) {
+            if (col == row) continue;
+            rates[pos++] = full_mat[row*num_states+col];
+        }
+    }
+    delete [] full_mat;
+
+}
+
+/**
+    Make state frequencies numerically safe.
+    Every frequency below MIN_FREQUENCY is raised to MIN_FREQUENCY (with a warning
+    unless the state is a stop codon). The state with the largest original
+    frequency then absorbs the difference so that the frequencies sum to one.
+    @param stateFrqArr (IN/OUT) array of num_states frequencies
+ */
+void Alignment::convfreq(double *stateFrqArr) {
+    int i, maxi=0;
+    double freq, maxfreq, sum;
+
+    sum = 0.0;
+    maxfreq = 0.0;
+    for (i = 0; i < num_states; i++)
+    {
+        freq = stateFrqArr[i];
+        if (freq < MIN_FREQUENCY) {
+            stateFrqArr[i] = MIN_FREQUENCY;
+            if (!isStopCodon(i))
+                cout << "WARNING: " << convertStateBackStr(i) << " is not present in alignment that may cause numerical problems" << endl;
+        }
+        // the maximum is taken over the frequencies as passed in
+        if (freq > maxfreq) {
+            maxfreq = freq;
+            maxi = i;
+        }
+
+        sum += stateFrqArr[i];
+    }
+    stateFrqArr[maxi] += 1.0 - sum;
+} /* convfreq */
+
+/**
+    Log-likelihood of the alignment under the unconstrained model, in which
+    every pattern gets its observed proportion as probability.
+    @return sum over all patterns of frequency * log(frequency / number of sites)
+ */
+double Alignment::computeUnconstrainedLogL() {
+    int npattern = size();
+    int nsite = getNSite();
+    double log_nsite = log(nsite);
+    double total = 0.0;
+    int ptn = 0;
+    while (ptn < npattern) {
+        total += (log(at(ptn).frequency) - log_nsite) * at(ptn).frequency;
+        ptn++;
+    }
+    return total;
+}
+
+/**
+    Write per-site gap and ambiguity information to a file.
+    The file holds the number of sites, a "Site_Gap" row with computeGapChar()
+    of every site's pattern and a "Site_Ambi" row with computeAmbiguousChar().
+    A write failure is reported through outError().
+    @param filename output file name
+ */
+void Alignment::printSiteGaps(const char *filename) {
+    try {
+        ofstream out;
+        out.exceptions(ios::failbit | ios::badbit);
+        out.open(filename);
+
+        const int nsite = getNSite();
+        int site;
+
+        out << nsite << endl << "Site_Gap  ";
+        for (site = 0; site < nsite; site++)
+            out << " " << at(getPatternID(site)).computeGapChar(num_states, STATE_UNKNOWN);
+
+        out << endl << "Site_Ambi ";
+        for (site = 0; site < nsite; site++)
+            out << " " << at(getPatternID(site)).computeAmbiguousChar(num_states);
+
+        out << endl;
+        out.close();
+        cout << "Site gap-counts printed to " << filename << endl;
+    } catch (ios::failure &) {
+        outError(ERR_WRITE_OUTPUT, filename);
+    }
+}
+
+/**
+    Copy the frequency of every pattern into a vector.
+    @param freq (OUT) resized to getNPattern(); entry i receives the frequency of pattern i
+ */
+void Alignment::getPatternFreq(IntVector &freq) {
+    freq.resize(getNPattern());
+    int idx = 0;
+    for (iterator pat = begin(); pat != end(); ++pat) {
+        freq[idx] = pat->frequency;
+        ++idx;
+    }
+}
+
+//added by MA
+/**
+    Log-probability of this alignment under the multinomial distribution whose
+    pattern probabilities are the pattern proportions of a reference alignment.
+    Both alignments must have the same number of sites, and every pattern of this
+    alignment must occur in the reference; otherwise outError() is called.
+    @param refAlign reference alignment
+    @param prob (OUT) log(n!) - sum_i log(f_i!) + sum_i f_i * log(g_i / n), where
+           n is the number of sites, f_i the frequency of pattern i here and g_i
+           its frequency in refAlign
+ */
+void Alignment::multinomialProb(Alignment refAlign, double &prob)
+{
+    const int nsite = getNSite();
+    assert(nsite == refAlign.getNSite());
+
+    const double log_nsite_fac = logFac(nsite);
+    double log_freq_fac = 0;
+    double log_pattern_prob = 0;
+
+    for (iterator pat = begin(); pat != end(); ++pat) {
+        PatternIntMap::iterator found = refAlign.pattern_index.find(*pat);
+        if (found == refAlign.pattern_index.end()) //not found ==> error
+            outError("Pattern in the current alignment is not found in the reference alignment!");
+        log_freq_fac += logFac(pat->frequency);
+        const int ref_id = found->second;
+        log_pattern_prob += (double)pat->frequency*log((double)refAlign.at(ref_id).frequency/(double)nsite);
+    }
+    prob = log_nsite_fac - log_freq_fac + log_pattern_prob;
+}
+
+/**
+    Log-probability of the expected alignment, derived from per-pattern
+    log-likelihoods, given the pattern proportions of this alignment.
+    Checks the input and forwards to the overload taking a plain array, which
+    performs the identical computation.
+    @param logLL per-pattern log-likelihoods, one entry per pattern; an empty
+           vector is reported through outError()
+    @param prob (OUT) resulting log-probability
+ */
+void Alignment::multinomialProb (DoubleVector logLL, double &prob)
+{
+    if ( logLL.empty())
+        outError("Error: log likelihood of patterns are not given!");
+
+    assert((int)logLL.size() == getNPattern());
+
+    // the elements of a vector are contiguous, so it can be passed as an array
+    multinomialProb(&logLL[0], prob);
+}
+
+/**
+    Log-probability of the expected alignment, derived from per-pattern
+    log-likelihoods, given the pattern proportions of this alignment.
+    The likelihoods are rescaled to expected site counts that are rounded to
+    integers while carrying the rounding remainder to the next pattern. The
+    multinomial log-probability of these counts is then evaluated with the
+    observed pattern proportions as probabilities.
+    @param logLL array of getNPattern() per-pattern log-likelihoods
+    @param prob (OUT) resulting log-probability
+ */
+void Alignment::multinomialProb (double *logLL, double &prob)
+{
+    //The expected normalized frequencies
+    IntVector expectedNorFre;
+
+    int patNum = getNPattern();
+    int alignLen = getNSite();
+    expectedNorFre.resize(patNum,-1);
+
+    // likelihood of every pattern relative to the largest one, which avoids underflow
+    DoubleVector relLL(patNum,-1.0);
+    double sumLL = 0;
+    double max_logl = *max_element(logLL, logLL + patNum);
+    for ( int i = 0; i < patNum; i++ )
+    {
+        relLL[i] = exp(logLL[i]-max_logl);
+        sumLL += relLL[i];
+    }
+
+    // l_i = p_i*len/sum_i(p_i); r_0 = l_0; r_{i+1} = l_{i+1} + r_i - round(r_i)
+    // expected count of pattern i = round(r_i), where round(x) = floor(x+0.5)
+    double r_prev = 0.0;
+    for ( int j = 0; j < patNum; j++ )
+    {
+        double ell_j = (double)alignLen * relLL[j] / sumLL;
+        double r_j;
+        if (j == 0)
+            r_j = ell_j;
+        else
+            r_j = ell_j + r_prev - floor(r_prev+0.5);
+        expectedNorFre[j] = (int)floor(r_j+0.5);
+        r_prev = r_j;
+    }
+
+    //compute the probability of having expectedNorFre given the observed pattern frequencies of THIS alignment
+    double fac = logFac(alignLen);
+    double sumFac = 0;
+    double sumProb = 0;
+    for ( int patID = 0; patID < patNum; patID++ )
+    {
+        int patFre = expectedNorFre[patID];
+        sumFac += logFac(patFre);
+        sumProb += (double)patFre*log((double)at(patID).frequency/(double)alignLen);
+    }
+    prob = fac - sumFac + sumProb;
+}
+
+/**
+    Multinomial log-probability of given pattern counts, with the observed
+    pattern proportions of this alignment as probabilities.
+    @param pattern_freq one count per pattern of this alignment
+    @return log(n!) - sum_i log(c_i!) + sum_i c_i * log(f_i / n), where n is the
+            number of sites, c_i = pattern_freq[i] and f_i the frequency of pattern i
+ */
+double Alignment::multinomialProb (IntVector &pattern_freq)
+{
+    assert(size() == pattern_freq.size());
+    const int patNum = getNPattern();
+    const int alignLen = getNSite();
+    const double fac = logFac(alignLen);
+
+    double sumFac = 0;
+    double sumProb = 0;
+    int patID = 0;
+    while (patID < patNum) {
+        const int count = pattern_freq[patID];
+        sumFac += logFac(count);
+        sumProb += (double)count*log((double)at(patID).frequency/(double)alignLen);
+        ++patID;
+    }
+    return (fac - sumFac + sumProb);
+}
diff --git a/alignment.h b/alignment.h
new file mode 100644
index 0000000..b75b073
--- /dev/null
+++ b/alignment.h
@@ -0,0 +1,694 @@
+//
+// C++ Interface: alignment
+//
+// Description: 
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#ifndef ALIGNMENT_H
+#define ALIGNMENT_H
+
+#include <vector>
+#include <bitset>
+#include "pattern.h"
+#include "ncl/ncl.h"
+#include "tools.h"
+
+// IMPORTANT: refactor STATE_UNKNOWN. The former global constant (next line) is disabled; Alignment::STATE_UNKNOWN is a data member instead.
+//const char STATE_UNKNOWN = 126;
+const char STATE_INVALID = 127; // code for an invalid state (see the @return of Alignment::convertState)
+const int NUM_CHAR = 256; // number of bits in a StateBitset
+const double MIN_FREQUENCY          = 0.0001; // presumably a lower bound for state frequencies - TODO confirm
+const double MIN_FREQUENCY_DIFF     = 0.00001; // presumably the minimum gap kept between two frequencies (cf. Alignment::convfreq) - TODO confirm
+// set of states, one bit per position; filled in by Alignment::getAppearance(char, StateBitset&)
+typedef bitset<NUM_CHAR> StateBitset;
+// kind of sequence data stored in an alignment (see Alignment::seq_type)
+enum SeqType {
+    SEQ_DNA, SEQ_PROTEIN, SEQ_BINARY, SEQ_MORPH, SEQ_MULTISTATE, SEQ_CODON, SEQ_UNKNOWN
+};
+// String-keyed lookup tables: unordered_map if USE_HASH_MAP is defined, map otherwise.
+// Note that StringDoubleHashMap is only a hash map in the first case.
+#ifdef USE_HASH_MAP
+typedef unordered_map<string, int> StringIntMap;
+typedef unordered_map<string, double> StringDoubleHashMap;
+typedef unordered_map<string, int> PatternIntMap;
+#else
+typedef map<string, int> StringIntMap;
+typedef map<string, double> StringDoubleHashMap;
+typedef map<string, int> PatternIntMap;
+#endif
+
+/**
+Multiple sequence alignment, stored as a vector of site patterns (the class derives from vector<Pattern>).
+Every Pattern carries its frequency; site_pattern maps each site to its pattern index.
+        @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
+ */
+class Alignment : public vector<Pattern> {
+    friend class SuperAlignment;
+
+public:
+
+    /**
+            constructor
+     */
+    Alignment();
+
+    /**
+            constructor
+            @param filename file name
+            @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
+            @param intype (OUT) input format of the file
+     */
+    Alignment(char *filename, char *sequence_type, InputType &intype);
+
+    /**
+            destructor
+     */
+    virtual ~Alignment();
+
+
+    /****************************************************************************
+            input alignment reader
+     ****************************************************************************/
+
+    /** get the SeqType for a given string */
+    static SeqType getSeqType(const char *sequence_type);
+
+
+    /**
+            add a pattern into the alignment
+            @param pat the pattern
+            @param site the site index of the pattern from the alignment
+            @param freq frequency of pattern
+            @return TRUE if pattern contains only gaps or unknown char. 
+                            In that case, the pattern won't be added.
+     */
+    bool addPattern(Pattern &pat, int site, int freq = 1);
+
+	/**
+		determine if the pattern is constant. update the is_const variable.
+	*/
+	void computeConst(Pattern &pat);
+
+
+    /**
+     * add const patterns into the alignment
+     * @param freq_const_patterns comma-separated list of const pattern frequencies
+     */
+    void addConstPatterns(char *freq_const_patterns);
+
+    /**
+            read the alignment in NEXUS format
+            @param filename file name
+            @return 1 on success, 0 on failure
+     */
+    int readNexus(char *filename);
+
+    int buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite);
+
+    /**
+            read the alignment in PHYLIP format
+            @param filename file name
+            @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
+            @return 1 on success, 0 on failure
+     */
+    int readPhylip(char *filename, char *sequence_type);
+
+    /**
+            read the alignment in FASTA format
+            @param filename file name
+            @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
+            @return 1 on success, 0 on failure
+     */
+    int readFasta(char *filename, char *sequence_type);
+
+    /**
+            read the alignment in CLUSTAL format
+            @param filename file name
+            @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
+            @return 1 on success, 0 on failure
+     */
+    int readClustal(char *filename, char *sequence_type);
+
+    /**
+            read the alignment in MSF format
+            @param filename file name
+            @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
+            @return 1 on success, 0 on failure
+     */
+    int readMSF(char *filename, char *sequence_type);
+
+    /**
+            extract the alignment from a nexus data block, called by readNexus()
+            @param data_block data block of nexus file
+     */
+    void extractDataBlock(NxsCharactersBlock *data_block);
+
+    vector<Pattern> ordered_pattern;
+    
+    /** lower bound of sum parsimony scores for remaining pattern in ordered_pattern */
+    UINT *pars_lower_bound;
+
+    /** order patterns by number of character states. NOTE(review): this comment used to say the result is
+        returned "in ptn_order"; the member visible here is ordered_pattern - confirm against the implementation */
+    void orderPatternByNumChars();
+
+    /**
+     * un-group site-patterns, i.e., making #sites = #patterns and pattern frequency = 1 for all patterns
+     */
+    void ungroupSitePattern();
+
+
+    /**
+     * re-group site-patterns
+     * @param groups number of groups
+     * @param site_group group ID (0, 1, ...ngroups-1; must be continuous) of all sites
+     */
+    void regroupSitePattern(int groups, IntVector &site_group);
+
+
+    /****************************************************************************
+            output alignment 
+     ****************************************************************************/
+    SeqType detectSequenceType(StrVector &sequences);
+
+    void computeUnknownState();
+
+    void buildStateMap(char *map, SeqType seq_type);
+
+    virtual char convertState(char state, SeqType seq_type);
+
+    /** 
+     * convert state if the number of states (num_states) is known
+     * @param state input char to convert
+     * @return output char, presumably from 0 to num_states-1 (the text used to read "0 to 0-num_states"), or STATE_INVALID or STATE_UNKNOWN
+     */
+    char convertState(char state);
+
+    virtual void convertStateStr(string &str, SeqType seq_type);
+
+	/**
+	 * convert from internal state to user-readable state (e.g., to ACGT for DNA)
+	 * Note: does not work for codon data
+	 * @param state internal state code
+	 * @return user-readable state
+	 */
+    char convertStateBack(char state);
+
+    /**
+	 * convert from internal state to user-readable state (e.g., to ACGT for DNA)
+	 * Note: works for all data
+	 * @param state internal state code
+	 * @return user-readable state string
+	 */
+	string convertStateBackStr(char state);
+
+	/**
+            get alignment site range from the residue range relative to a sequence
+            @param seq_id reference sequence
+            @param residue_left (IN/OUT) left of range
+            @param residue_right (IN/OUT) right of range [left,right)
+            @return TRUE if success, FALSE if out of range
+     */
+    bool getSiteFromResidue(int seq_id, int &residue_left, int &residue_right);
+
+    int buildRetainingSites(const char *aln_site_list, IntVector &kept_sites,
+            bool exclude_gaps, bool exclude_const_sites, const char *ref_seq_name);
+
+    void printPhylip(const char *filename, bool append = false, const char *aln_site_list = NULL,
+    		bool exclude_gaps = false, bool exclude_const_sites = false, const char *ref_seq_name = NULL);
+
+    void printPhylip(ostream &out, bool append = false, const char *aln_site_list = NULL,
+    		bool exclude_gaps = false, bool exclude_const_sites = false, const char *ref_seq_name = NULL);
+
+    void printFasta(const char *filename, bool append = false, const char *aln_site_list = NULL,
+    		bool exclude_gaps = false, bool exclude_const_sites = false, const char *ref_seq_name = NULL);
+
+    /**
+            Print the number of gaps per site
+            @param filename output file name
+     */
+    void printSiteGaps(const char *filename);
+
+    /****************************************************************************
+            get general information from alignment
+     ****************************************************************************/
+
+    /**
+            @return number of sequences
+     */
+    inline int getNSeq() {
+        return seq_names.size();
+    }
+
+    /**
+            @return number of sites (alignment columns)
+     */
+    inline int getNSite() {
+        return site_pattern.size();
+    }
+
+    /**
+             @return number of patterns
+     */
+    inline int getNPattern() {
+        return size();
+    }
+
+    inline int getPatternID(int site) { // index of the pattern found at the given site
+        return site_pattern[site];
+    }
+
+    inline Pattern getPattern(int site) { // returns a copy of the pattern found at the given site
+        return at(site_pattern[site]);
+    }
+
+    /**
+     * @param pattern_index (OUT) vector of size = alignment length storing pattern index of all sites
+     */
+    virtual void getSitePatternIndex(IntVector &pattern_index) {
+        pattern_index = site_pattern;
+    }
+
+    /**
+     * @param freq (OUT) vector of site-pattern frequencies
+     */
+    virtual void getPatternFreq(IntVector &freq);
+
+    /**
+            @param i sequence index
+            @return sequence name
+     */
+    string &getSeqName(int i);
+
+    /**
+     *  Get a list of all sequence names
+     *  @return vector containing the sequence names
+     */
+    vector<string>& getSeqNames();
+
+    /**
+            @param seq_name sequence name
+            @return corresponding ID, -1 if not found
+     */
+    int getSeqID(string &seq_name);
+
+    /**
+            @return length of the longest sequence name
+     */
+    int getMaxSeqNameLength();
+
+    /**
+            check for proper and unduplicated sequence names
+     */
+    void checkSeqName();
+
+    /**
+     * check identical sequences
+     * @return the number of sequences that are identical to one of the sequences
+     */
+    int checkIdenticalSeq();
+
+    /**
+     * remove identical sequences from alignment
+     * @param not_remove name of sequence where removal is avoided
+     * @param keep_two TRUE to keep 2 out of k identical sequences, false to keep only 1
+     * @param removed_seqs (OUT) name of removed sequences
+     * @param target_seqs (OUT) corresponding name of kept sequence that is identical to the removed sequences
+     * @return this if no sequences were removed, or new alignment if at least 1 sequence was removed
+     */
+    virtual Alignment *removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs);
+
+    /**
+            Quit if some sequences contain only gaps or missing data
+     */
+	virtual void checkGappySeq(bool force_error = true);
+
+	/**
+	 * return a new alignment if some sequence is totally gappy, or this if all sequences are okay
+	 */
+	Alignment *removeGappySeq();
+
+    /**
+            @return TRUE if seq_id contains only gaps or missing characters
+            @param seq_id sequence ID
+     */
+    bool isGapOnlySeq(int seq_id);
+
+    virtual bool isSuperAlignment() {
+        return false;
+    }
+
+    /****************************************************************************
+            alignment general processing
+     ****************************************************************************/
+
+    /**
+            extract sub-alignment of a sub-set of sequences
+            @param aln original input alignment
+            @param seq_id ID of sequences to extract from
+            @param min_true_char the minimum number of non-gap characters, true_char<min_true_char -> delete the sequence
+     */
+    virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char);
+
+    /**
+            extract a sub-set of patterns
+            @param aln original input alignment
+            @param ptn_id ID of patterns to extract from
+     */
+    void extractPatterns(Alignment *aln, IntVector &ptn_id);
+
+    /**
+            extract a sub-set of patterns
+            @param aln original input alignment
+            @param ptn_freq pattern frequency to extract from
+     */
+    void extractPatternFreqs(Alignment *aln, IntVector &ptn_freq);
+
+    /**
+            create a non-parametric bootstrap alignment from an input alignment
+            @param aln input alignment
+            @param pattern_freq (OUT) resampled pattern frequencies if not NULL
+            @param spec bootstrap specification of the form "l1:b1,l2:b2,...,lk:bk"
+            	to randomly draw b1 sites from the first l1 sites, etc. Note that l1+l2+...+lk
+            	must equal m, where m is the alignment length. Otherwise, an error will occur.
+            	If spec == NULL, a standard procedure is applied, i.e., randomly draw m sites.
+     */
+    virtual void createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq = NULL, const char *spec = NULL);
+
+    /**
+            resampling pattern frequency by a non-parametric bootstrap 
+            @param pattern_freq (OUT) resampled pattern frequencies
+            @param spec bootstrap specification, see above
+     */
+    virtual void createBootstrapAlignment(IntVector &pattern_freq, const char *spec = NULL);
+
+    /**
+            resampling pattern frequency by a non-parametric bootstrap
+            @param pattern_freq (OUT) resampled pattern frequencies
+            @param spec bootstrap specification, see above
+     */
+    virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL);
+
+    /**
+            create a gap masked alignment from an input alignment. Gap patterns of masked_aln 
+                    will be superimposed into aln to create the current alignment object.
+            @param aln input alignment
+            @param masked_aln gappy alignment of the same size with aln
+     */
+    void createGapMaskedAlignment(Alignment *masked_aln, Alignment *aln);
+
+    /**
+	 * shuffle alignment by randomizing the order of sites
+	 */
+	virtual void shuffleAlignment();
+
+	/**
+            concatenate an alignment into the current alignment object
+            @param aln an alignment of the same number of sequences and sequence names    
+     */
+    void concatenateAlignment(Alignment *aln);
+
+    /**
+            copy the input alignment into the current alignment object
+            @param aln input alignment
+     */
+    void copyAlignment(Alignment *aln);
+
+    /**
+            extract a sub-set of sites
+            @param aln original input alignment
+            @param site_id ID of sites to extract (starting from 0)
+     */
+    void extractSites(Alignment *aln, IntVector &site_id);
+
+    /**
+            extract a sub-set of sites
+            @param aln original input alignment
+            @param spec specification of positions, e.g. "1-100,101-200\2"
+     */
+    void extractSites(Alignment *aln, const char* spec);
+
+    /**
+        convert a DNA alignment into codon or AA alignment
+    */
+    void convertToCodonOrAA(Alignment *aln, char *gene_code_id, bool nt2aa = false);
+
+    /****************************************************************************
+            Distance functions
+     ****************************************************************************/
+
+
+    /**
+            compute the observed distance (number of different pairs of positions per site) 
+                    between two sequences
+            @param seq1 index of sequence 1
+            @param seq2 index of sequence 2
+            @return the observed distance between seq1 and seq2 (between 0.0 and 1.0)
+     */
+    virtual double computeObsDist(int seq1, int seq2);
+
+    /**
+            @param seq1 index of sequence 1
+            @param seq2 index of sequence 2
+            @return Jukes-Cantor correction distance between seq1 and seq2
+     */
+    double computeJCDist(int seq1, int seq2);
+
+    /**
+            virtual function to compute the distance between 2 sequences. By default it returns
+            the Jukes-Cantor corrected distance (see computeJCDist).
+            @param seq1 index of sequence 1
+            @param seq2 index of sequence 2		
+            @return any distance between seq1 and seq2
+     */
+    virtual double computeDist(int seq1, int seq2) {
+        return computeJCDist(seq1, seq2);
+    }
+
+
+    /**
+            write distance matrix into a file in PHYLIP distance format
+            @param file_name distance file name
+            @param dist_mat distance matrix
+     */
+    void printDist(const char *file_name, double *dist_mat);
+
+    /**
+            write distance matrix into a stream in PHYLIP distance format
+            @param out output stream
+            @param dist_mat distance matrix
+     */
+    void printDist(ostream &out, double *dist_mat);
+
+    /**
+            read distance matrix from a file in PHYLIP distance format
+            @param file_name distance file name
+            @param dist_mat distance matrix
+            @return the longest distance
+     */
+    double readDist(const char *file_name, double *dist_mat);
+
+    /**
+            read distance matrix from a stream in PHYLIP distance format
+            @param in input stream
+            @param dist_mat distance matrix
+     */
+    double readDist(istream &in, double *dist_mat);
+
+
+    /****************************************************************************
+            some statistics
+     ****************************************************************************/
+
+    /**
+            compute empirical state frequencies from the alignment
+            @param state_freq (OUT) is filled with state frequencies, assuming state_freq was allocated with 
+                    at least num_states entries.
+     */
+    virtual void computeStateFreq(double *state_freq, size_t num_unknown_states = 0);
+
+    /**
+            compute empirical state frequencies for each sequence 
+            @param freq_per_sequence (OUT) state frequencies for each sequence, of size num_states*num_freq
+     */
+    void computeStateFreqPerSequence (double *freq_per_sequence);
+
+    void countStatePerSequence (unsigned *count_per_sequence);
+
+    /**
+     * Make all frequencies a little different and non-zero
+     * @param stateFrqArr (IN/OUT) state frequencies
+     */
+    void convfreq(double *stateFrqArr);
+
+    /**
+	 * compute special empirical frequencies for codon alignment: 1x4, 3x4, 3x4C
+	 * @param state_freq (OUT) is filled with state frequencies, assuming state_freq was allocated with
+	 * at least num_states entries.
+	 * @param freq either FREQ_CODON_1x4, FREQ_CODON_3x4, or FREQ_CODON_3x4C
+	 * @param ntfreq (OUT) nucleotide frequencies, assuming of size 4 for F1x4 and of size 12 for F3x4.
+	 */
+	void computeCodonFreq(StateFreqType freq, double *state_freq, double *ntfreq);
+
+	/**
+            compute empirical rates between state pairs
+            @param rates (OUT) vector of size num_states*(num_states-1)/2 for the rates
+     */
+    virtual void computeEmpiricalRate(double *rates);
+
+    /**
+            compute non-reversible empirical rates between state pairs
+            @param rates (OUT) vector of size num_states*(num_states-1) for the rates
+     */
+    virtual void computeEmpiricalRateNonRev(double *rates);
+
+    /**
+            count the fraction of constant sites in the alignment, update the variable frac_const_sites
+     */
+    virtual void countConstSite();
+
+    /**
+     * @return unobserved constant patterns, each entry encoding for one constant character
+     */
+    string getUnobservedConstPatterns();
+
+    /**
+            @return the number of ungappy and unambiguous characters from a sequence
+            @param seq_id sequence ID
+     */
+    int countProperChar(int seq_id);
+
+    /**
+            @return unconstrained log-likelihood (without a tree)
+     */
+    virtual double computeUnconstrainedLogL();
+
+    /** either SEQ_BINARY, SEQ_DNA, SEQ_PROTEIN, SEQ_MORPH, or SEQ_CODON */
+    SeqType seq_type;
+
+    char STATE_UNKNOWN;
+
+    /**
+            number of states
+     */
+    int num_states;
+
+    /**
+            fraction of constant sites
+     */
+    double frac_const_sites;
+
+    /** number of informative sites */
+    int num_informative_sites;
+    
+	/**
+	 *  map from 64 codon to non-stop codon index
+	 */
+//    char *non_stop_codon;
+
+	/**
+	 * For codon sequences: index of 61 non-stop codons to 64 codons
+	 * For other sequences: NULL
+	 */
+//	char *codon_table;
+
+	/**
+	 * For codon_sequences: 64 amino-acid letters for genetic code of AAA,AAC,AAG,AAT,...,TTT
+	 * For other sequences: NULL
+	 */
+	char *genetic_code;
+
+    vector<vector<int> > seq_states; // state set for each sequence in the alignment
+
+    /**
+     * @return true if data type is SEQ_CODON and state is a stop codon
+     */
+    bool isStopCodon(int state);
+
+    bool isStandardGeneticCode();
+
+	/**
+	 * @return number of non-stop codons in the genetic code
+	 */
+	int getNumNonstopCodons();
+
+    /* build seq_states containing set of states per sequence
+     * @param add_unobs_const TRUE to add all unobserved constant states (for +ASC model)
+     */
+    void buildSeqStates(bool add_unobs_const = false);
+
+
+    /** Added by MA
+            Compute the probability of this alignment according to the multinomial distribution with parameters determined by the reference alignment
+            @param refAlign the reference alignment
+            @param prob (OUT) the returned probability
+		
+            The probability is computed as follows:
+            - From the reference alignment, we count the relative pattern frequencies p_1 ... p_k (sum = 1)
+            - From THIS alignment, we have frequencies d_1 ... d_k (sum = len = nsite)
+            - Prob(THIS | refAlign) = nsite!/(d_1! * ... * d_k!) product(p_i^d_i)
+     */
+    void multinomialProb(Alignment refAlign, double &prob); // NOTE(review): refAlign is taken by value, so the reference alignment is copied for every call
+
+    /** Added by MA
+            Compute the probability of the `expected alignment' according to the multinomial distribution with parameters determined by the pattern's observed frequencies in THIS alignment.
+            The `expected alignment' consists of patterns with log-likelihoods (under some model+tree) given in the input vector (logLL).
+            Note that the order of the log-likelihoods in logLL must correspond to the patterns in THIS alignment.
+
+            @param logLL the input patterns log-likelihood vector
+            @param prob (OUT) the returned probability
+     */
+    void multinomialProb(DoubleVector logLL, double &prob);
+    void multinomialProb(double *logLL, double &prob);
+
+    /** Adapted from MA
+            compute the probability (on log scale, judging by the logFac/log terms of the implementation) of the alignment defined by pattern_freq given this alignment
+     */
+    double multinomialProb(IntVector &pattern_freq);
+
+
+    /**
+            get the appearance for a state, helpful for ambiguous states
+            @param state the state index
+            @param state_app (OUT) state appearance
+     */
+    void getAppearance(char state, double *state_app);
+
+    void getAppearance(char state, StateBitset &state_app);
+
+protected:
+
+
+    /**
+            sequence names
+     */
+    vector<string> seq_names;
+
+    /**
+            Site to pattern index
+     */
+    IntVector site_pattern;
+
+    /**
+            hash map from pattern to index in the vector of patterns (the alignment)
+     */
+    PatternIntMap pattern_index;
+
+
+    /**
+	 * special initialization for codon sequences, e.g., setting #states, genetic_code
+	 * @param gene_code_id presumably the ID of the genetic code to use (this line used to document a "sequence_type" parameter) - TODO confirm
+	 */
+	void initCodon(char *gene_code_id);
+
+};
+
+
+void extractSiteID(Alignment *aln, const char* spec, IntVector &site_id);
+
+#endif
diff --git a/alignmentpairwise.cpp b/alignmentpairwise.cpp
new file mode 100644
index 0000000..efc3c80
--- /dev/null
+++ b/alignmentpairwise.cpp
@@ -0,0 +1,319 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "alignmentpairwise.h"
+#include "phylosupertree.h"
+
+// Default constructor: builds the object without a pair-frequency table.
+AlignmentPairwise::AlignmentPairwise() : Alignment(), Optimization()
+{
+    pair_freq = NULL;  // nothing has been tabulated yet
+}
+
+AlignmentPairwise::AlignmentPairwise(PhyloTree *atree, int seq1, int seq2) : Alignment(), Optimization() {
+    // Tabulate how often each pair of states occurs at sequences seq1 and seq2
+    // of the alignment attached to atree; the counts end up in pair_freq.
+    tree = atree;
+    seq_id1 = seq1;
+    seq_id2 = seq2;
+    num_states = tree->aln->num_states;  // state space comes from the tree's alignment
+    STATE_UNKNOWN = tree->aln->STATE_UNKNOWN;
+    pair_freq = NULL;
+
+    // With site-specific rates or models no table is built: computeFunction()
+    // then walks over the patterns of the tree's alignment directly.
+    if (tree->getRate()->isSiteSpecificRate() || tree->getModel()->isSiteSpecificModel())
+        return;
+
+    if (tree->getRate()->getPtnCat(0) < 0) {
+        // no categorized rates: one num_states x num_states table holds all counts
+        const int table_size = num_states * num_states;
+        pair_freq = new double[table_size];
+        memset(pair_freq, 0, sizeof(double) * table_size);
+        for (Alignment::iterator ptn = tree->aln->begin(); ptn != tree->aln->end(); ptn++) {
+            int first = (*ptn)[seq_id1], second = (*ptn)[seq_id2];
+            addPattern(first, second, ptn->frequency);
+        }
+        return;
+    }
+
+    // categorized rates: one table per discrete rate category, stored back to
+    // back; each pattern is counted in the table of its own category
+    const int total_size = num_states * num_states * tree->getRate()->getNDiscreteRate();
+    pair_freq = new double[total_size];
+    memset(pair_freq, 0, sizeof(double)*total_size);
+    int ptn_id = 0;
+    for (Alignment::iterator ptn = tree->aln->begin(); ptn != tree->aln->end(); ptn++, ptn_id++) {
+        int first = (*ptn)[seq_id1], second = (*ptn)[seq_id2];
+        addPattern(first, second, ptn->frequency, tree->getRate()->getPtnCat(ptn_id));
+    }
+}
+
+/**
+    Count one state pair in the pair-frequency table.
+    @param state1 state of the first sequence
+    @param state2 state of the second sequence
+    @param freq amount added to the table cell of (state1, state2)
+    @param cat selects the num_states x num_states sub-table that is updated
+            (the constructor passes the rate category of the pattern here)
+    @return FALSE if the pair was counted, TRUE if it was skipped
+
+    A pair is counted only if both states are unambiguous, i.e. below num_states.
+    It is skipped if either state equals STATE_UNKNOWN or is >= num_states.
+    NOTE(review): negative state codes are not rejected and would index in front
+    of the table; callers appear to pass non-negative codes only - confirm.
+
+    NOTE: an earlier revision carried code after an unconditional "return true"
+    that would have spread an ambiguous DNA/RNA state over the states it is
+    compatible with. That code could never run; it was removed together with
+    its loop variable. What the function does for any input is unchanged.
+ */
+bool AlignmentPairwise::addPattern(int state1, int state2, int freq, int cat) {
+    // unknown characters never contribute
+    if (state1 == STATE_UNKNOWN || state2 == STATE_UNKNOWN)
+        return true;
+
+    // neither do ambiguous (or any other out-of-range) state codes
+    if (state1 >= num_states || state2 >= num_states)
+        return true;
+
+    // unambiguous case: find the sub-table of this category and update its cell
+    double *pair_pos = pair_freq + (cat*num_states*num_states);
+    pair_pos[state1*num_states + state2] += freq;
+    return false;
+}
+
+/**
+    @return negative log-likelihood (for minimization) of the pair (seq_id1, seq_id2) for the given value
+    @throw const char* (categorized rates only) if an observed pair has a transition probability <= 0
+ */
+double AlignmentPairwise::computeFunction(double value) {
+    RateHeterogeneity *site_rate = tree->getRate();
+    int ncat = site_rate->getNDiscreteRate();
+    ModelSubst *model = tree->getModel();
+    int trans_size = tree->getModel()->getTransMatrixSize();
+    int cat, i;
+    int nptn = tree->aln->getNPattern();
+    double lh = 0.0;
+
+    // site-specific rates: go through the patterns, scaling value by each pattern's rate
+    if (site_rate->isSiteSpecificRate()) {
+        for (i = 0; i < nptn; i++) {
+            int state1 = tree->aln->at(i)[seq_id1];
+            int state2 = tree->aln->at(i)[seq_id2];
+            if (state1 >= num_states || state2 >= num_states) continue; // not an unambiguous pair
+            double trans = tree->getModelFactory()->computeTrans(value * site_rate->getPtnRate(i), state1, state2);
+            lh -= log(trans) * tree->aln->at(i).frequency;
+        }
+        return lh;
+    }
+
+    // site-specific models: likewise pattern by pattern, each with its own model ID
+    if (tree->getModel()->isSiteSpecificModel()) {
+        for (i = 0; i < nptn; i++) {
+            int state1 = tree->aln->at(i)[seq_id1];
+            int state2 = tree->aln->at(i)[seq_id2];
+            if (state1 >= num_states || state2 >= num_states) continue; // not an unambiguous pair
+            double trans = tree->getModel()->computeTrans(value, model->getPtnModelID(i), state1, state2);
+            lh -= log(trans) * tree->aln->at(i).frequency;
+        }
+        return lh;
+    }
+
+    double *trans_mat = new double[trans_size];
+    // categorized rates: one pair table and one transition matrix per category
+    if (site_rate->getPtnCat(0) >= 0) {
+        for (cat = 0; cat < ncat; cat++) {
+            tree->getModelFactory()->computeTransMatrix(value*site_rate->getRate(cat), trans_mat);
+            double *pair_pos = pair_freq + cat*trans_size;
+            for (i = 0; i < trans_size; i++) if (pair_pos[i] > 1e-6) {
+                    // free the work buffer first so that the exception does not leak it
+                    if (trans_mat[i] <= 0) { delete [] trans_mat; throw "Negative transition probability"; }
+                    lh -= pair_pos[i] * log(trans_mat[i]);
+                }
+        }
+        delete [] trans_mat;
+        return lh;
+    }
+    // remaining case: one pair table, scored against the matrices summed (not averaged) over the categories
+    double *sum_trans_mat = new double[trans_size];
+    if (tree->getModelFactory()->site_rate->getGammaShape() == 0.0)
+        tree->getModelFactory()->computeTransMatrix(value, sum_trans_mat);
+    else {
+        tree->getModelFactory()->computeTransMatrix(value * site_rate->getRate(0), sum_trans_mat);
+        for (cat = 1; cat < ncat; cat++) {
+            tree->getModelFactory()->computeTransMatrix(value * site_rate->getRate(cat), trans_mat);
+            for (i = 0; i < trans_size; i++)
+                sum_trans_mat[i] += trans_mat[i];
+        }
+    }
+    for (i = 0; i < trans_size; i++)
+        if (pair_freq[i] > 0) // unobserved pairs add nothing; skipping them also avoids 0 * log(0) = NaN
+            lh -= pair_freq[i] * log(sum_trans_mat[i]);
+    delete [] sum_trans_mat;
+    delete [] trans_mat;
+    return lh;
+}
+
+/**
+    Evaluate the first and second derivative, with respect to the distance,
+    of the negative log-likelihood of the sequence pair. Every term has the
+    form -freq * log(P(value * rate)), which is why the rate appears as a
+    factor rate (first derivative) and rate^2 (second derivative).
+    @param value distance at which the derivatives are evaluated
+    @param df (OUT) first derivative
+    @param ddf (OUT) second derivative
+    @throw const char* when a transition probability that carries a positive
+           pair frequency is not positive (categorized-rate branch only)
+*/
+void AlignmentPairwise::computeFuncDerv(double value, double &df, double &ddf) {
+    RateHeterogeneity *site_rate = tree->getRate();
+    int ncat = site_rate->getNDiscreteRate();
+    ModelSubst *model = tree->getModel();
+    int trans_size = tree->getModel()->getTransMatrixSize();
+    int cat, i;
+    int nptn = tree->aln->getNPattern();
+    df = 0.0;
+    ddf = 0.0;
+
+    // site-specific rates: walk the patterns, each with its own rate
+    if (site_rate->isSiteSpecificRate()) {
+        for (i = 0; i < nptn; i++) {
+            int state1 = tree->aln->at(i)[seq_id1];
+            int state2 = tree->aln->at(i)[seq_id2];
+            // patterns where either sequence has a state index >= num_states are skipped
+            if (state1 >= num_states || state2 >= num_states) continue;
+            double rate_val = site_rate->getPtnRate(i);
+            double rate_sqr = rate_val * rate_val;
+            double derv1, derv2;
+            // presumably derv1/derv2 receive the derivatives of the returned probability
+            double trans = tree->getModelFactory()->computeTrans(value * rate_val, state1, state2, derv1, derv2);
+            double d1 = derv1 / trans;
+            df -= rate_val * d1 * tree->aln->at(i).frequency;
+            ddf -= rate_sqr * (derv2/trans - d1*d1) * tree->aln->at(i).frequency;
+        }
+        return;
+    }
+
+    // site-specific models: same walk, but the model ID of the pattern is passed along
+    if (tree->getModel()->isSiteSpecificModel()) {
+        for (i = 0; i < nptn; i++) {
+            int state1 = tree->aln->at(i)[seq_id1];
+            int state2 = tree->aln->at(i)[seq_id2];
+            if (state1 >= num_states || state2 >= num_states) continue;
+            double rate_val = site_rate->getPtnRate(i);
+            double rate_sqr = rate_val * rate_val;
+            double derv1, derv2;
+            double trans = tree->getModel()->computeTrans(value * rate_val,model->getPtnModelID(i), state1, state2, derv1, derv2);
+            double d1 = derv1 / trans;
+            df -= rate_val * d1 * tree->aln->at(i).frequency;
+            ddf -= rate_sqr * (derv2/trans - d1*d1) * tree->aln->at(i).frequency;
+        }
+        return;
+    }
+
+    // work buffers shared by the two remaining branches
+    double *trans_mat = new double[trans_size];
+    double *trans_derv1 = new double[trans_size];
+    double *trans_derv2 = new double[trans_size];
+
+    // categorized rates: pair_freq holds one block of trans_size counts per category
+    if (site_rate->getPtnCat(0) >= 0) {
+        for (cat = 0; cat < ncat; cat++) {
+            double rate_val = site_rate->getRate(cat);
+            double derv1 = 0.0, derv2 = 0.0;
+            tree->getModelFactory()->computeTransDerv(value*rate_val, trans_mat, trans_derv1, trans_derv2);
+            double *pair_pos = pair_freq + cat*trans_size;
+            for (i = 0; i < trans_size; i++) if (pair_pos[i] > 0) {
+                    if (trans_mat[i] <= 0) {
+                        // release the work buffers first so the throw does not leak them
+                        delete [] trans_derv2;
+                        delete [] trans_derv1;
+                        delete [] trans_mat;
+                        throw "Negative transition probability";
+                    }
+                    double d1 = trans_derv1[i] / trans_mat[i];
+                    derv1 += pair_pos[i] * d1;
+                    derv2 += pair_pos[i] * (trans_derv2[i]/trans_mat[i] - d1 * d1);
+                }
+            df -= derv1 * rate_val;
+            ddf -= derv2 * rate_val * rate_val;
+        }
+        delete [] trans_derv2;
+        delete [] trans_derv1;
+        delete [] trans_mat;
+        return;
+    }
+
+    // remaining case: sum the matrices and their derivatives over all categories,
+    // then differentiate the log of the sum
+    double *sum_trans = new double[trans_size];
+    double *sum_derv1 = new double[trans_size];
+    double *sum_derv2 = new double[trans_size];
+    memset(sum_trans, 0, sizeof(double) * trans_size);
+    memset(sum_derv1, 0, sizeof(double) * trans_size);
+    memset(sum_derv2, 0, sizeof(double) * trans_size);
+
+    for (cat = 0; cat < ncat; cat++) {
+        double rate_val = site_rate->getRate(cat);
+        // a gamma shape of exactly 0.0 makes every category use rate 1.0
+        if (tree->getModelFactory()->site_rate->getGammaShape() == 0.0)
+            rate_val = 1.0;
+
+        double rate_sqr = rate_val * rate_val;
+        tree->getModelFactory()->computeTransDerv(value * rate_val, trans_mat, trans_derv1, trans_derv2);
+        for (i = 0; i < trans_size; i++) {
+            sum_trans[i] += trans_mat[i];
+            sum_derv1[i] += trans_derv1[i] * rate_val;
+            sum_derv2[i] += trans_derv2[i] * rate_sqr;
+        }
+    }
+    // entries whose pair frequency is at most 1e-6 do not contribute
+    for (i = 0; i < trans_size; i++) if (pair_freq[i] > 1e-6) {
+            double d1 = sum_derv1[i] / sum_trans[i];
+            df -= pair_freq[i] * d1;
+            ddf -= pair_freq[i] * (sum_derv2[i]/sum_trans[i] - d1 * d1);
+        }
+    delete [] sum_derv2;
+    delete [] sum_derv1;
+    delete [] sum_trans;
+    delete [] trans_derv2;
+    delete [] trans_derv1;
+    delete [] trans_mat;
+}
+
+/**
+    Optimize the distance between the two sequences, starting from initial_dist.
+    @param initial_dist starting value handed to the optimizer
+    @param d2l (OUT) set to -1.0 on entry; passed by reference to
+           minimizeNewton() when the Newton-Raphson branch is taken
+    @return the optimized distance, or initial_dist when no model factory or
+            no rate model is attached to the tree
+*/
+double AlignmentPairwise::optimizeDist(double initial_dist, double &d2l) {
+    d2l = -1.0;
+
+    // nothing to optimize against: hand the initial guess straight back
+    if (!tree->getModelFactory() || !tree->getRate())
+        return initial_dist;
+
+    if (tree->optimize_by_newton) {
+        // Newton-Raphson method
+        return minimizeNewton(1e-6, initial_dist, MAX_GENETIC_DIST, 1e-6, d2l);
+    }
+
+    // Brent method; the two extra outputs are required by the call but not used here
+    double brent_value, brent_error;
+    return minimizeOneDimen(1e-6, initial_dist, MAX_GENETIC_DIST, 1e-6, &brent_value, &brent_error);
+}
+
+/**
+    Convenience overload: optimize the distance and drop the d2l output.
+    @param initial_dist starting value handed to the optimizer
+    @return result of the two-argument optimizeDist()
+*/
+double AlignmentPairwise::optimizeDist(double initial_dist) {
+    double ignored_d2l;   // receives the d2l output, which this overload does not report
+    const double best_dist = optimizeDist(initial_dist, ignored_d2l);
+    return best_dist;
+}
+
+
+/** Release the pair-frequency array. */
+AlignmentPairwise::~AlignmentPairwise()
+{
+    // delete[] on a null pointer does nothing, so no guard is needed
+    delete [] pair_freq;
+}
+
+
diff --git a/alignmentpairwise.h b/alignmentpairwise.h
new file mode 100644
index 0000000..22faf22
--- /dev/null
+++ b/alignmentpairwise.h
@@ -0,0 +1,100 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef ALIGNMENTPAIRWISE_H
+#define ALIGNMENTPAIRWISE_H
+
+#include "optimization.h"
+#include "phylotree.h"
+
+/**
+Pairwise alignment: two sequences of a tree's alignment together with the
+distance likelihood function and its derivatives used to optimize their
+pairwise distance.
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class AlignmentPairwise : public Alignment, public Optimization
+{
+public:
+    /**
+        default constructor
+    */
+    AlignmentPairwise();
+
+	/**
+		construct the pairwise alignment from two sequences of a multiple alignment
+		@param atree tree whose alignment, model and rates are used
+			(presumably stored in the member tree; constructor body not shown here)
+		@param seq1 ID of the first sequence
+		@param seq2 ID of the second sequence
+	*/
+    AlignmentPairwise(PhyloTree *atree, int seq1, int seq2);
+
+	/**
+		compute the likelihood for a distance between two sequences. Used for the ML optimization of the distance.
+		@param value x-value of the function
+		@return negative log-likelihood (the function is minimized)
+	*/
+	virtual double computeFunction(double value);
+
+
+	/**
+		This function calculates the first derivative f'(value) and the 2nd derivative f''(value)
+		of the function f being minimized. Used by the Newton-Raphson method.
+		Nothing is returned; both results are delivered through the OUT parameters.
+		@param value x-value of the function
+		@param df (OUT) first derivative
+		@param ddf (OUT) second derivative
+	*/
+	virtual void computeFuncDerv(double value, double &df, double &ddf);
+
+	/**
+		compute the ML distance between two sequences;
+		forwards to the two-argument overload and discards its d2l output
+		@param initial_dist initial guess
+		@return the ML distance
+	*/
+	double optimizeDist(double initial_dist);
+
+	/**
+		compute the ML distance between two sequences
+		@param initial_dist initial guess
+		@param d2l (OUT) set to -1.0 on entry and handed to the Newton-Raphson
+			minimizer when that method is selected; stays -1.0 otherwise
+		@return the ML distance, or initial_dist if the tree has no model factory or no rate
+	*/
+	double optimizeDist(double initial_dist, double &d2l);
+
+
+	/**
+		add a pattern into the alignment
+		@param state1
+		@param state2 states of the pattern
+		@param freq frequency of pattern
+		@param cat category for the pattern (for the discrete model)
+		@return TRUE if pattern contains only gaps or unknown char. 
+				In that case, the pattern won't be added.
+	*/
+	bool addPattern(int state1, int state2, int freq, int cat = 0);
+
+
+	/**
+		destructor; frees pair_freq with delete[]
+	*/
+    virtual ~AlignmentPairwise();
+
+	/**
+		pairwise state frequencies, indexed like a transition matrix;
+		with categorized rates there is one block of getTransMatrixSize()
+		entries per rate category
+	*/
+	double *pair_freq;
+
+	/** tree that supplies the alignment, the model and the rates */
+	PhyloTree *tree;
+
+	/** positions of the two sequences inside each alignment pattern */
+	int seq_id1, seq_id2;
+};
+
+#endif
diff --git a/bionj.h b/bionj.h
new file mode 100644
index 0000000..1552550
--- /dev/null
+++ b/bionj.h
@@ -0,0 +1,790 @@
+#ifndef BIONJ_H
+#define BIONJ_H
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;                                                                           ;
+;                         BIONJ program                                     ;
+;       was obtained from http://www.lirmm.fr/~w3ifa/MAAS/BIONJ/BIONJ.html  ;                                    ;
+;                                                                           ;
+;                         Olivier Gascuel                                   ;
+;                                                                           ;
+;                         GERAD - Montreal- Canada                          ;
+;                         olivierg at crt.umontreal.ca                         ;
+;                                                                           ;
+;                         LIRMM - Montpellier- France                       ;
+;                         gascuel at lirmm.fr                                  ;
+;                                                                           ;
+;                         UNIX version, written in C                        ;
+;                         by Hoa Sien Cuong (Univ. Montreal)                ; 
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define PREC 8                             /* precision of branch-lengths  */
+#define PRC  100
+#define LEN  1000                            /* length of taxon names        */
+
+class BioNj {
+/* one node of a singly linked list of text fragments */
+typedef struct word
+{
+  char name[LEN];              /* text of this fragment, NUL-terminated       */
+  struct word *suiv;           /* next node of the list, NULL on the last one */
+}WORD;
+
+/* both ends of one WORD list; the trees array holds one of these per index */
+typedef struct pointers
+{
+  WORD *head;                  /* first node, where printing starts           */
+  WORD *tail;                  /* last node, where text is appended           */
+}POINTERS;
+
+/*
+void   Initialize(float **delta, FILE *input, int n, POINTERS *trees);
+
+void   Compute_sums_Sx(float **delta, int n);
+
+void   Best_pair(float **delta, int r, int *a, int *b, int n);
+
+void   Finish(float **delta, int n, POINTERS *trees, FILE *output);
+
+void   Concatenate(char chain1[LEN], int ind, POINTERS *trees, int post);
+
+void   Print_output(int i, POINTERS *trees, FILE *output);
+
+float Distance(int i, int j, float **delta);
+
+float Variance(int i, int j, float **delta);
+
+float Sum_S(int i, float **delta);
+
+float Agglomerative_criterion(int i, int j, float **delta, int r);
+
+float Branch_length(int a, int b, float **delta, int r);
+
+float Reduction4(int a, float la, int b, float lb, int i, float lamda,
+		 float **delta);
+
+float Reduction10(int a, int b, int i, float lamda, float vab, float
+		  **delta);
+float Lamda(int a, int b, float vab, float **delta, int n, int r);
+
+float Finish_branch_length(int i, int j, int k, float **delta);
+
+int    Emptied(int i, float **delta);
+
+int    Symmetrize(float **delta, int n);
+
+*/
+/*;;;;;;;;;;;  INPUT, OUTPUT, INITIALIZATION ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;                                                                           ;
+;                                                                           ;
+;              The delta matrix is read from the input-file.                ;
+;              It is recommended to put it and the executable in            ;
+;              a special directory. The input-file and output-file          ;
+;              can be given as arguments to the executable by               ;
+;              typing them after the executable (Bionj input-file           ;
+;              output-file) or by typing them when asked by the             ;
+;              program. The input-file has to be formatted according to     ;
+;              the PHYLIP standard. The output file is formatted            ;
+;              according to the Newick standard.                            ;
+;                                                                           ;
+;              The lower-half of the delta matrix is occupied by            ;
+;              dissimilarities. The upper-half of the matrix is             ;
+;              occupied by variances. The first column                      ;
+;              is initialized as 0; during the algorithm some               ;
+;              indices are no more used, and the corresponding              ;
+;              positions in the first column are set to 1.                  ;
+;                                                                           ;
+;              This delta matrix is made symmetrical using the rule:        ;
+;              Dij = Dji <- (Dij + Dji)/2. The diagonal is set to 0;        ;
+;              during the further steps of the algorithm, it is used        ;
+;              to store the sums Sx.                                        ;
+;                                                                           ;
+;              A second array, trees, is used to store taxon names.         ;
+;              During the further steps of the algorithm, some               ;
+;              positions in this array are emptied while the others         ;
+;              are used to store subtrees.                                  ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;; Initialize        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+; Description : This function reads an input file and return the            ;
+;               delta matrix and trees: the list of taxa.                   ;
+;                                                                           ;
+; input       :                                                             ;
+;              float **delta : delta matrix                                 ;
+;              FILE *input    : pointer to input file                       ;
+;              int n          : number of taxa                              ;
+;              char **trees   : list of taxa                                ;
+;                                                                           ;
+; return value:                                                             ;
+;              float **delta : delta matrix                                 ;
+;              char *trees    : list of taxa                                ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Read n rows from input: each row is a taxon name followed by n distances.
+   The name becomes a one-node list in trees[row] and the distances fill
+   delta[row][1..n]; rows and columns are numbered from 1.
+   Stops the program if a name or a distance cannot be read. */
+void Initialize(float **delta, FILE *input, int n, POINTERS *trees)
+{
+  int lig;                                          /* matrix line       */
+  int col;                                          /* matrix column     */
+  float distance;
+  char name_taxon[LEN];                             /* taxon's name      */
+  char name_format[32];                             /* "%<LEN-1>s"       */
+  WORD *name;
+
+  /* give %s a maximum width so an over-long name cannot overrun name_taxon */
+  snprintf(name_format,sizeof name_format,"%%%ds",LEN-1);
+
+  for(lig=1; lig <= n; lig++)
+    {
+      if(fscanf(input,name_format,name_taxon) != 1)   /* read taxon's name */
+        {
+          fprintf(stderr,"Cannot read the name of taxon %d\n",lig);
+          exit(EXIT_FAILURE);
+        }
+      name=(WORD *)calloc(1,sizeof(WORD));            /* taxon's name is   */
+      if(name == NULL)                                /* put in trees      */
+        {
+          printf("Out of memories !!");
+          exit(0);
+        }
+      strcpy(name->name,name_taxon);        /* fits: at most LEN-1 characters */
+      name->suiv=NULL;
+      trees[lig].head=name;
+      trees[lig].tail=name;
+      for(col= 1; col <= n; col++)
+        {
+          /* a failed read would otherwise store an uninitialized value */
+          if(fscanf(input,"%f",&distance) != 1)     /* read the distance  */
+            {
+              fprintf(stderr,"Cannot read distance (%d,%d)\n",lig,col);
+              exit(EXIT_FAILURE);
+            }
+          delta[lig][col]=distance;
+        }
+    }
+}
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;; Print_output;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+;                                                                           ;
+; Description : This function prints out the tree in the output file.       ;
+;                                                                           ;
+; input       :                                                             ;
+;              POINTERS *trees : pointer to the subtrees.                   ;
+;              int i          : indicate the subtree i to be printed.       ;
+;              FILE *output   : pointer to the output file.                 ;
+;                                                                           ;
+; return value: The phylogenetic tree in the output file.                   ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+
+/* Write every text fragment of subtree i to output, from head to the end
+   of the list, with nothing in between. */
+void Print_output(int i, POINTERS *trees, FILE *output)
+{
+  WORD *node;
+
+  for(node=trees[i].head; node != NULL; node=node->suiv)
+    fprintf(output,"%s",node->name);
+}
+
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+;                             Utilities                                     ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;; Symmetrize  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+; Description : This function verifies if the delta matrix is symmetric;    ;
+;               if not the matrix is made symmetric.                        ;
+;                                                                           ;
+; input       :                                                             ;
+;              float **delta : delta matrix                                 ;
+;              int n          : number of taxa                              ;
+;                                                                           ;
+; return value:                                                             ;
+;              int symmetric  : indicate if the matrix has been made        ;
+;                               symmetric or not                            ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Compare every cell below the diagonal (rows and columns 1..n) with its
+   mirror above it; a differing pair is replaced on both sides by its mean.
+   Prints a notice and returns 0 if anything was changed, 1 otherwise. */
+int Symmetrize(float **delta, int n)
+{
+  int was_symmetric=1;                 /* cleared once any pair disagrees */
+  int row;
+  int column;
+
+  for(row=1; row <= n; row++)
+    for(column=1; column < row; column++)
+      {
+        float below=delta[row][column];
+        float above=delta[column][row];
+
+        if(below == above)
+          continue;
+        delta[row][column]=delta[column][row]=(below+above)/2;
+        was_symmetric=0;
+      }
+
+  if(was_symmetric == 0)
+    printf("The matrix is not symmetric");
+  return was_symmetric;
+}
+
+
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;; Concatenate ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+;                                                                           ;
+; Description : This function concatenates a string to another.             ;
+;                                                                           ;
+; input       :                                                             ;
+;      char *chain1    : the string to be concatenated.                     ;
+;      int ind         : indicate the subtree to which concatenate the      ;
+;                        string                                             ;
+;      POINTERS *trees  : pointer to subtrees.                              ;
+;      int post        : position to which concatenate (front (0) or        ;
+;                        end (1))                                           ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Copy chain1 into a newly allocated node and link it into subtree ind:
+   in front of the current head when post is 0, after the current tail
+   otherwise. The append case dereferences the tail, so the list must
+   already have one. */
+void Concatenate(char chain1[LEN], int ind, POINTERS *trees, int post)
+{
+  POINTERS *target=&trees[ind];
+  WORD *node=(WORD *)calloc(1,sizeof(WORD));
+
+  if(node == NULL)
+    {
+      printf("Out of memories");
+      exit(0);
+    }
+  strcpy(node->name,chain1);
+  node->suiv=NULL;
+
+  if(post != 0)
+    {
+      /* append: the old tail points at the node, which becomes the tail */
+      target->tail->suiv=node;
+      target->tail=node;
+      return;
+    }
+
+  /* prepend: the node takes over as head */
+  node->suiv=target->head;
+  target->head=node;
+}
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Distance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+; Description : This function retrieves and returns the distance between    ;
+;               taxa i and j from the delta matrix.                         ;
+;                                                                           ;
+; input       :                                                             ;
+;               int i          : taxon i                                    ;
+;               int j          : taxon j                                    ;
+;               float **delta : the delta matrix                            ;
+;                                                                           ;
+; return value:                                                             ;
+;               float distance : dissimilarity between the two taxa         ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Read the cell whose row is the larger of i and j and whose column is
+   the smaller; for i == j this is the diagonal cell. */
+float Distance(int i, int j, float **delta)
+{
+  const int row=(i > j) ? i : j;
+  const int col=(i > j) ? j : i;
+
+  return delta[row][col];
+}
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Variance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+; Description : This function retrieve and return the variance of the       ;
+;               distance between i and j, from the delta matrix.            ;
+;                                                                           ;
+; input       :                                                             ;
+;               int i           : taxon i                                   ;
+;               int j           : taxon j                                   ;
+;               float **delta  : the delta matrix                           ;
+;                                                                           ;
+; return value:                                                             ;
+;               float distance : the variance of  Dij                       ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Read the cell whose row is the smaller of i and j and whose column is
+   the larger; for i == j this is the diagonal cell. */
+float Variance(int i, int j, float **delta)
+{
+  const int row=(i > j) ? j : i;
+  const int col=(i > j) ? i : j;
+
+  return delta[row][col];
+}
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Emptied ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+; Description : This function checks whether a line is emptied or not.      ;
+;                                                                           ;
+; input       :                                                             ;
+;               int i          : subtree (or line) i                        ;
+;               float **delta : the delta matrix                            ;
+;                                                                           ;
+; return value:                                                             ;
+;               0              : if not emptied.                            ;
+;               1              : if emptied.                                ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Return the marker kept in column 0 of line i, truncated to int;
+   callers treat a non-zero result as "emptied". */
+int Emptied(int i, float **delta)
+{
+  const float marker=delta[i][0];
+
+  return (int)marker;
+}
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Sum_S;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+;  Description : This function retrieves the sum Sx from the diagonal       ;
+;                of the delta matrix.                                       ;
+;                                                                           ;
+;  input       :                                                            ;
+;               int i          : subtree i                                  ;
+;               float **delta : the delta matrix                            ;
+;                                                                           ;
+;  return value:                                                            ;
+;                float delta[i][i] : sum Si                                 ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Return the diagonal cell of line i, where the sum Si is kept. */
+float Sum_S(int i, float **delta)
+{
+  const float *line=delta[i];
+
+  return line[i];
+}
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;Compute_sums_Sx;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+; Description : This function computes the sums Sx and stores them in the   ;
+;               diagonal of the delta matrix.                               ;
+;                                                                           ;
+; input       :                                                             ;
+;     	         float **delta : the delta matrix.                      ;
+;     	         int n          : the number of taxa                    ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* For every line 1..n that is not emptied, add up its distances to all
+   other non-emptied lines and store the total on the diagonal. */
+void Compute_sums_Sx(float **delta, int n)
+{
+  float running=0.0;         /* deliberately declared outside the loop */
+  int row;
+
+  for(row=1; row <= n; row++)
+    {
+      if(!Emptied(row,delta))
+        {
+          int other;
+
+          running=0;
+          for(other=1; other <= n; other++)
+            {
+              if(other == row || Emptied(other,delta))
+                continue;
+              running+=Distance(row,other,delta);      /* compute the sum Si */
+            }
+        }
+      /* an emptied line is not recomputed: its diagonal receives whatever
+         total is left over from the previous line (0 before any) */
+      delta[row][row]=running;                   /* store the sum Si in */
+    }                                            /* delta's diagonal    */
+}
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Best_pair;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+;  Description : This function finds the best pair to be agglomerated by    ;
+;                minimizing the agglomerative criterion (1).                ;
+;                                                                           ;
+;  input       :                                                            ;
+;                float **delta : the delta matrix                           ;
+;                int r          : number of subtrees                        ;
+;                int *a         : contain the first taxon of the pair       ;
+;                int *b         : contain the second taxon of the pair      ;
+;                int n          : number of taxa                            ;
+;                                                                           ;
+;  return value:                                                            ;
+;                int *a         : the first taxon of the pair               ;
+;                int *b         : the second taxon of the pair              ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Scan every pair (x,y) with 1 <= y < x <= n of non-emptied lines and
+   report through *a (= x) and *b (= y) the pair with the smallest
+   agglomerative criterion. A candidate replaces the current best only when
+   it is smaller by more than 0.000001, so near-ties keep the earlier pair.
+   *a and *b are left untouched when no pair qualifies. */
+void Best_pair(float **delta, int r, int *a, int *b, int n)
+{
+  float Qxy;                         /* value of the criterion calculated*/
+  int x,y;                           /* the pair which is tested         */
+  float Qmin;                        /* current minimum of the criterion */
+
+  /* start from the largest finite float; the constant 1.0e300 used to be
+     assigned here, which does not fit into a float */
+  Qmin=FLT_MAX;
+  for(x=1; x <= n; x++)
+    {
+      if(!Emptied(x,delta))
+        {
+          for(y=1; y < x; y++)
+            {
+              if(!Emptied(y,delta))
+                {
+                  Qxy=Agglomerative_criterion(x,y,delta,r);
+                  if(Qxy < Qmin-0.000001)
+                    {
+                      Qmin=Qxy;
+                      *a=x;
+                      *b=y;
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;Finish_branch_length;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+;  Description :  Compute the length of the branch attached                 ;
+;                 to the subtree i, during the final step                   ;
+;                                                                           ;
+;  input       :                                                            ;
+;                int i          : position of subtree i                     ;
+;                int j          : position of subtree j                     ;
+;                int k          : position of subtree k                     ;
+;                float **delta :                                            ;
+;                                                                           ;
+;  return value:                                                            ;
+;                float length  : The length of the branch                   ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Half of (d(i,j) + d(i,k) - d(j,k)), the three distances being read
+   through Distance(). */
+float Finish_branch_length(int i, int j, int k, float **delta)
+{
+  const float d_ij=Distance(i,j,delta);
+  const float d_ik=Distance(i,k,delta);
+  const float d_jk=Distance(j,k,delta);
+  const float half_excess=0.5*(d_ij + d_ik - d_jk);
+
+  return half_excess;
+}
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Finish;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+;  Description : This function computes the lengths of the last three       ;
+;                subtrees and writes the tree to the output file.           ;
+;                                                                           ;
+;  input       :                                                            ;
+;                float **delta  : the delta matrix                          ;
+;                int n           : the number of taxa                       ;
+;                WORD *trees   : list of subtrees                           ;
+;                                                                           ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+/* Final step: find the three lines 1..n that are not emptied, write them
+   as "(A:la,B:lb,C:lc);" followed by a newline to output, then free the
+   text lists of those three subtrees. If the number of remaining lines is
+   not exactly three, a message goes to stderr and nothing is written or
+   freed. */
+void Finish(float **delta, int n, POINTERS *trees, FILE *output)
+{
+  int l;
+  int i;
+  int count=0;                            /* remaining subtrees found    */
+  float length;
+  WORD *bidon;
+  WORD *ele;
+  int last[3];                            /* the last three subtrees     */
+
+  for(l=1; l <= n; l++)
+    {                                     /* find the last three subtrees */
+      if(!Emptied(l, delta))
+        {
+          if(count < 3)                   /* never write past last[2]    */
+            last[count]=l;
+          count++;
+        }
+    }
+
+  /* with fewer than three, last[] would be read uninitialized; with more,
+     it used to be overrun */
+  if(count != 3)
+    {
+      fprintf(stderr,"Finish: expected 3 remaining subtrees, found %d\n",count);
+      return;
+    }
+
+  length=Finish_branch_length(last[0],last[1],last[2],delta);
+  fprintf(output,"(");
+  Print_output(last[0],trees,output);
+  fprintf(output,":");
+  fprintf(output,"%f,",length);
+
+  length=Finish_branch_length(last[1],last[0],last[2],delta);
+  Print_output(last[1],trees,output);
+  fprintf(output,":");
+  fprintf(output,"%f,",length);
+
+  length=Finish_branch_length(last[2],last[1],last[0],delta);
+  Print_output(last[2],trees,output);
+  fprintf(output,":");
+  fprintf(output,"%f",length);
+  fprintf(output,");");
+  fprintf(output,"\n");
+
+  for(i=0; i < 3; i++)
+    {
+      bidon=trees[last[i]].head;
+      ele=bidon;
+      while(bidon!=NULL)
+        {
+          ele=ele->suiv;                  /* step ahead before freeing   */
+          free(bidon);
+          bidon=ele;
+        }
+    }
+}
+
+
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
+;                                                                           ;
+;                          Formulae                                         ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+
+/* Formula (1): (r-2)*Distance(i,j) - Sum_S(i) - Sum_S(j).
+   create() comments Best_pair() as minimizing this value. */
+float Agglomerative_criterion(int i, int j, float **delta, int r)
+{
+  return (r-2)*Distance(i,j,delta)
+    -Sum_S(i,delta)
+    -Sum_S(j,delta);
+}
+
+
+/* Formula (2): 0.5 * (Distance(a,b) + (Sum_S(a) - Sum_S(b)) / (r-2)).
+   The divisor is r-2, so callers must not pass r == 2. */
+float Branch_length(int a, int b, float **delta, int r)
+{
+  const int divisor = r-2;
+
+  return 0.5*(Distance(a,b,delta)
+	      +(Sum_S(a,delta)
+		-Sum_S(b,delta))/divisor);
+}
+
+
+/* Formula (4): lamda*(Distance(a,i) - la) + (1-lamda)*(Distance(b,i) - lb).
+   create() stores the result as the distance entry for the merged pair and i. */
+float Reduction4(int a, float la, int b, float lb, int i, float lamda,
+		 float **delta)
+{
+  return lamda*(Distance(a,i,delta)-la)
+    +(1-lamda)*(Distance(b,i,delta)-lb);
+}
+
+
+/* Formula (10): lamda*Variance(a,i) + (1-lamda)*Variance(b,i)
+                 - lamda*(1-lamda)*vab. */
+float Reduction10(int a, int b, int i, float lamda, float vab,
+		  float **delta)
+{
+  return lamda*Variance(a,i,delta)+(1-lamda)*Variance(b,i,delta)
+    -lamda*(1-lamda)*vab;
+}
+
+/* Formula (9) with the constraint that the result belongs to [0,1].
+   When vab is zero the result is 0.5; otherwise the differences
+   Variance(b,k) - Variance(a,k) are summed over every non-empty row k other
+   than a and b, scaled by 2*(r-2)*vab and added to 0.5. */
+float Lamda(int a, int b, float vab, float **delta, int n, int r)
+{
+  float weight;
+
+  if(vab==0.0)
+    {
+      weight=0.5;
+    }
+  else
+    {
+      float diff_sum=0.0;
+      int k=1;
+
+      while(k <= n)
+	{
+	  if(!(a == k || b == k || Emptied(k,delta)))
+	    diff_sum=diff_sum + (Variance(b,k,delta) - Variance(a,k,delta));
+	  k++;
+	}
+      weight=0.5 + diff_sum/(2*(r-2)*vab);
+    }
+
+  /* clamp to [0,1] */
+  if(weight > 1.0)
+    weight=1.0;
+  if(weight < 0.0)
+    weight=0.0;
+  return(weight);
+}
+/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;                                                                           ;
+;                         Main program                                      ;
+;                                                                           ;
+;                         argc is the number of arguments                   ;
+;                         **argv contains the arguments:                    ;
+;                         the first argument has to be BIONJ;               ;
+;                         the second is the input-file;                     ;
+;                         the third is the output-file.                     ;
+;                         When the input and output files are               ;
+;                         not given, the user is asked for them.            ;
+;                                                                           ;
+\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
+
+public :
+/**
+ * Build one BIONJ tree for every distance matrix found in inputFile and
+ * write each tree to outputFile.
+ *
+ * The work buffers are sized from the first matrix dimension in the file.
+ * A later matrix that is larger than that, or any matrix with fewer than 3
+ * taxa (Finish() needs exactly three remaining subtrees), stops processing
+ * with an error.
+ *
+ * @param inputFile   path of the distance-matrix file to read
+ * @param outputFile  path of the tree file to write (opened with "w")
+ * @return 0 on success; 1 if a file cannot be opened or closed, memory
+ *         cannot be allocated, or the input is not usable
+ */
+int create(const char *inputFile, const char *outputFile) {
+
+  FILE *input = NULL;                     /* pointer to input file       */
+  FILE *output = NULL;                    /* pointer to output file      */
+  POINTERS *trees = NULL;                 /* list of subtrees            */
+  char *chain1 = NULL;                    /* stringized branch-length    */
+  float **delta = NULL;                   /* delta matrix                */
+  int a, b;                               /* pair to be agglomerated     */
+  float la;                               /* first taxon's branch-length */
+  float lb;                               /* second taxon's branch-length*/
+  float vab;                              /* variance of Dab             */
+  float lamda;
+  int i;
+  int ok;
+  int r;                                  /* number of subtrees          */
+  int n = 0;                              /* number of taxa              */
+  int nalloc = 0;                         /* dimension the buffers hold  */
+  int x, y;
+  int status = 1;                         /* set to 0 only on success    */
+
+  input = fopen(inputFile,"r");
+  if(input == NULL)
+    {
+      fprintf(stderr,"BIONJ: cannot open input file %s\n",inputFile);
+      goto cleanup;
+    }
+  if(fscanf(input,"%d",&n) != 1 || n < 3)
+    {
+      fprintf(stderr,"BIONJ: %s does not start with a taxon count >= 3\n",
+	      inputFile);
+      goto cleanup;
+    }
+  nalloc = n;
+
+  output = fopen(outputFile,"w");
+  if(output == NULL)
+    {
+      fprintf(stderr,"BIONJ: cannot open output file %s\n",outputFile);
+      goto cleanup;
+    }
+
+  /*   Allocation of memories    */
+
+  chain1 = (char *)calloc(LEN,sizeof(char));
+  delta = (float **)calloc(nalloc+1,sizeof(float*));
+  trees = (POINTERS *)calloc(nalloc+1,sizeof(POINTERS));
+  if(chain1 == NULL || delta == NULL || trees == NULL)
+    {
+      fprintf(stderr,"Out of memories!!\n");
+      goto cleanup;
+    }
+  for(i=1; i <= nalloc; i++)              /* row 0 is never allocated    */
+    {
+      delta[i] = (float *)calloc(nalloc+1, sizeof(float));
+      if(delta[i] == NULL)
+	{
+	  fprintf(stderr,"Out of memories!!\n");
+	  goto cleanup;
+	}
+    }
+
+  /*   initialise and symmetrize the running delta matrix    */
+
+  rewind(input);
+  /* "== 1" rather than "!= EOF": a non-numeric token ends the loop instead
+     of re-running the body with the previous value of n */
+  while(fscanf(input,"%d",&n) == 1)
+    {
+      if(n < 3 || n > nalloc)
+	{
+	  fprintf(stderr,
+		  "BIONJ: matrix of size %d not supported (buffers hold 3..%d)\n",
+		  n, nalloc);
+	  goto cleanup;
+	}
+      r = n;
+      a = 0;
+      b = 0;
+      Initialize(delta, input, n, trees);
+      ok = Symmetrize(delta, n);
+      if(!ok)
+	printf("\n The matrix  is not symmetric.\n ");
+      while (r > 3)                             /* until r=3                 */
+	{
+	  Compute_sums_Sx(delta, n);             /* compute the sum Sx       */
+	  Best_pair(delta, r, &a, &b, n);        /* find the best pair by    */
+	  vab = Variance(a, b, delta);           /* minimizing (1)           */
+	  la = Branch_length(a, b, delta, r);    /* compute branch-lengths   */
+	  lb = Branch_length(b, a, delta, r);    /* using formula (2)        */
+	  lamda = Lamda(a, b, vab, delta, n, r); /* compute lambda* using (9)*/
+	  for(i=1; i <= n; i++)
+	    {
+	      if(!Emptied(i,delta) && (i != a) && (i != b))
+		{
+		  if(a > i)
+		    {
+		      x = a;
+		      y = i;
+		    }
+		  else
+		    {
+		      x = i;
+		      y = a;
+		    }
+		  /* apply reduction formulae 4 and 10 to delta; the first
+		     assignment must stay first, as in the original order */
+		  delta[x][y] = Reduction4(a, la, b, lb, i, lamda, delta);
+		  delta[y][x] = Reduction10(a, b, i, lamda, vab, delta);
+		}
+	    }
+
+	  /* agglomerate the subtrees a and b together with the
+	     branch-lengths according to the Newick format */
+	  strcpy(chain1,"(");
+	  Concatenate(chain1, a, trees, 0);
+	  sprintf(chain1,":%f,",la);
+	  Concatenate(chain1, a, trees, 1);
+	  trees[a].tail->suiv = trees[b].head;
+	  trees[a].tail = trees[b].tail;
+	  sprintf(chain1,":%f)",lb);
+	  Concatenate(chain1, a, trees, 1);
+	  delta[b][0] = 1.0;                    /* make the b line empty     */
+	  trees[b].head = NULL;
+	  trees[b].tail = NULL;
+	  r = r-1;                              /* decrease r                */
+	}
+      Finish(delta, n, trees, output);       /* compute the branch-lengths */
+      for(i=1; i <= n; i++)                  /* of the last three subtrees */
+	{                                    /* and print the tree in the  */
+	  delta[i][0] = 0.0;                 /* output-file                */
+	  trees[i].head = NULL;
+	  trees[i].tail = NULL;
+	}
+    }
+  status = 0;
+
+cleanup:
+  free(trees);
+  if(delta != NULL)
+    {
+      /* release exactly the rows that were allocated, whatever n was last */
+      for(i=nalloc; i >= 1; i--)
+	free(delta[i]);
+      free(delta);
+    }
+  free(chain1);
+  if(input != NULL)
+    fclose(input);
+  if(output != NULL && fclose(output) != 0)
+    status = 1;                           /* buffered output was lost    */
+
+  return status;
+}
+};
+#endif
diff --git a/candidateset.cpp b/candidateset.cpp
new file mode 100644
index 0000000..0962161
--- /dev/null
+++ b/candidateset.cpp
@@ -0,0 +1,298 @@
+/*
+ * candidateset.cpp
+ *
+ *  Created on: Jun 1, 2014
+ *      Author: Tung Nguyen
+ */
+
+#include "phylotree.h"
+#include "candidateset.h"
+
+/**
+ * Store the alignment and parameter pointers used by the other methods.
+ * The pointers are kept as given; nothing is copied.
+ */
+void CandidateSet::init(Alignment* aln, Params *params) {
+    this->params = params;
+    this->aln = aln;
+}
+
+/** Destructor; performs no explicit clean-up of its own. */
+CandidateSet::~CandidateSet() {}
+
+/** Create an empty set with no parameters and no alignment attached. */
+CandidateSet::CandidateSet() : params(NULL), aln(NULL) {
+}
+
+/**
+ * Collect every tree whose stored score equals the best key of the set,
+ * walking from the best entry downwards.
+ *
+ * @return the tree strings sharing the best score; empty if the set is empty
+ */
+vector<string> CandidateSet::getBestTrees() {
+	vector<string> res;
+	// rbegin() must not be dereferenced on an empty container
+	if (empty())
+		return res;
+	double bestScore = rbegin()->first;
+	for (reverse_iterator rit = rbegin(); rit != rend() && rit->second.score == bestScore; rit++) {
+		res.push_back(rit->second.tree);
+	}
+	return res;
+}
+
+/**
+ * Pick one tree at random: random_int() is called with
+ * min(params->popSize, size()) and the result is used as an offset counted
+ * from the best-scoring entry.
+ *
+ * @return the chosen tree string, or "" if the set is empty
+ */
+string CandidateSet::getRandCandTree() {
+	assert(!empty());
+	if (empty())
+		return "";
+	int stepsLeft = random_int(min(params->popSize, (int)size()) );
+	reverse_iterator pick = rbegin();
+	while (pick != rend()) {
+		if (stepsLeft == 0)
+			return pick->second.tree;
+		pick++;
+		stepsLeft--;
+	}
+	// only reached if the offset was never consumed
+	assert(0);
+	return "";
+}
+
+/**
+ * Return up to \a numTree tree strings, best score first.
+ * A value of 0 means params->maxCandidates.
+ */
+vector<string> CandidateSet::getTopTrees(int numTree) {
+	assert(numTree <= params->maxCandidates);
+	int remaining = (numTree == 0) ? params->maxCandidates : numTree;
+	vector<string> topTrees;
+	reverse_iterator cur = rbegin();
+	while (cur != rend() && remaining > 0) {
+		topTrees.push_back(cur->second.tree);
+		cur++;
+		remaining--;
+	}
+	return topTrees;
+}
+
+/**
+ * Return up to \a numTree tree strings whose localOpt flag is set, best
+ * score first.  Entries without the flag are skipped and do not count.
+ * A value of 0 means params->maxCandidates.
+ */
+vector<string> CandidateSet::getBestLocalOptimalTrees(int numTree) {
+	assert(numTree <= params->maxCandidates);
+	int remaining = (numTree == 0) ? params->maxCandidates : numTree;
+	vector<string> localOptTrees;
+	reverse_iterator cur = rbegin();
+	while (cur != rend() && remaining > 0) {
+		if (cur->second.localOpt) {
+			localOptTrees.push_back(cur->second.tree);
+			remaining--;
+		}
+		cur++;
+	}
+	return localOptTrees;
+}
+/*
+bool CandidateSet::replaceTree(string tree, double score) {
+    CandidateTree candidate;
+    candidate.tree = tree;
+    candidate.score = score;
+    candidate.topology = getTopology(tree);
+    if (treeTopologyExist(candidate.topology)) {
+        topologies[candidate.topology] = score;
+        for (reverse_iterator i = rbegin(); i != rend(); i++) {
+            if (i->second.topology == candidate.topology) {
+                erase( --(i.base()) );
+                break;
+            }
+            insert(CandidateSet::value_type(score, candidate));
+        }
+    } else {
+        return false;
+    }
+    return true;
+}
+
+string CandidateSet::getNextCandTree() {
+    string tree;
+    assert(!empty());
+    if (parentTrees.empty()) {
+        initParentTrees();
+    }
+    tree = parentTrees.top();
+    parentTrees.pop();
+    return tree;
+}
+
+void CandidateSet::initParentTrees() {
+    if (parentTrees.empty()) {
+        int count = params->popSize;
+        for (reverse_iterator i = rbegin(); i != rend() && count >0 ; i++, count--) {
+            parentTrees.push(i->second.tree);
+            //cout << i->first << endl;
+        }
+    }
+}
+*/
+/**
+ * Insert \a tree into the set, or refresh the entry that already has the
+ * same topology.
+ *
+ * - Known topology, better score: the old entry is replaced by the new tree.
+ * - Known topology, score not better: only the localOpt flag of the stored
+ *   entry is set, and only when \a localOpt is true.
+ * - New topology: the tree is always inserted; beforehand the worst entry is
+ *   evicted if the set already holds params->maxCandidates trees and the new
+ *   score beats the worst one.
+ *
+ * @param tree      tree string (with branch lengths)
+ * @param score     score of \a tree
+ * @param localOpt  whether \a tree is locally optimal
+ * @return true if the topology was not in the set before, false otherwise
+ */
+bool CandidateSet::update(string tree, double score, bool localOpt) {
+	bool newTree = true;
+	CandidateTree candidate;
+	candidate.score = score;
+	candidate.topology = getTopology(tree);
+	candidate.localOpt = localOpt;
+	candidate.tree = tree;
+
+	if (treeTopologyExist(candidate.topology)) {
+		newTree = false;
+	    /* If tree topology already exist but the score is better, we replace the old one
+	    by the new one (with new branch lengths) and update the score */
+		if (topologies[candidate.topology] < score) {
+			removeCandidateTree(candidate.topology);
+			topologies[candidate.topology] = score;
+			// insert tree into candidate set
+			insert(CandidateSet::value_type(score, candidate));
+		} else if (candidate.localOpt) {
+			CandidateSet::iterator treePtr = getCandidateTree(candidate.topology);
+			// getCandidateTree() returns end() when nothing matches
+			if (treePtr != end())
+				treePtr->second.localOpt = candidate.localOpt;
+		}
+	} else {
+		// Test emptiness and size first: getWorstScore() reads begin(),
+		// which must not be dereferenced while the set is empty.
+		if (!empty() && size() >= params->maxCandidates && getWorstScore() < score) {
+			// remove the worst-scoring tree
+			topologies.erase(begin()->second.topology);
+			erase(begin());
+		}
+		CandidateSet::iterator it = insert(CandidateSet::value_type(score, candidate));
+		topologies[candidate.topology] = score;
+		if (params->fix_stable_splits && getNumLocalOptTrees() >= params->numSupportTrees) {
+			int it_pos = distance(it, end());
+			// The new tree is one of the numSupportTrees best trees.
+			// Thus recompute supported splits
+			if (it_pos <= params->numSupportTrees) {
+				int nSupportedSplits = computeSplitSupport(params->numSupportTrees);
+				cout << ((double) nSupportedSplits / (aln->getNSeq() - 3)) * 100
+						<< " % of the splits have 100% support and can be fixed." << endl;
+			}
+		}
+	}
+	assert(topologies.size() == size());
+	return newTree;
+}
+
+/**
+ * Return up to \a numBestScore score keys, best first.
+ * A value of 0 means all scores in the set.
+ */
+vector<double> CandidateSet::getBestScores(int numBestScore) {
+	if (numBestScore == 0)
+		numBestScore = size();
+	vector<double> scores;
+	reverse_iterator cur = rbegin();
+	while (cur != rend() && numBestScore > 0) {
+		scores.push_back(cur->first);
+		cur++;
+		numBestScore--;
+	}
+	return scores;
+}
+
+/** Highest score key in the set, or -DBL_MAX when the set is empty. */
+double CandidateSet::getBestScore() {
+	if (size() != 0)
+		return rbegin()->first;
+	return -DBL_MAX;
+}
+
+/**
+ * Lowest score key in the set.
+ *
+ * @return the worst score, or -DBL_MAX when the set is empty (the same
+ *         sentinel getBestScore() uses), instead of dereferencing begin()
+ *         of an empty container
+ */
+double CandidateSet::getWorstScore() {
+	if (empty())
+		return -DBL_MAX;
+	return begin()->first;
+}
+
+/**
+ * Parse \a tree into a temporary PhyloTree that uses this set's alignment
+ * and parameters, root it at params->root, and print it back with the
+ * WT_TAXON_ID | WT_SORT_TAXA flags.
+ *
+ * @param tree  tree string to parse
+ * @return the string produced by printTree() with those flags
+ */
+string CandidateSet::getTopology(string tree) {
+	PhyloTree parsed;
+	parsed.aln = this->aln;
+	parsed.setParams(params);
+
+	// stream positioned at the start of the tree string
+	stringstream newick(tree);
+	parsed.readTree(newick, params->is_rooted);
+	parsed.setAlignment(aln);
+	parsed.setRootNode(params->root);
+
+	ostringstream sorted;
+	parsed.printTree(sorted, WT_TAXON_ID | WT_SORT_TAXA);
+	return sorted.str();
+}
+
+/**
+ * Score recorded for \a topology in the topologies map.
+ * The topology is asserted to be present.
+ */
+double CandidateSet::getTopologyScore(string topology) {
+	assert(topologies.find(topology) != topologies.end());
+	double recorded = topologies[topology];
+	return recorded;
+}
+
+/** Remove every candidate tree and every recorded topology. */
+void CandidateSet::clear() {
+	multimap<double, CandidateTree>::clear();
+	topologies.clear();
+}
+
+/** Empty the topologies map only; the candidate trees themselves are kept. */
+void CandidateSet::clearTopologies() {
+	this->topologies.clear();
+}
+
+
+/**
+ * Copy the \a numTrees best entries into a new, default-constructed
+ * CandidateSet.  Only the multimap entries are copied: the returned set has
+ * no alignment or parameters attached and its topologies map is empty.
+ *
+ * @param numTrees  number of entries wanted; capped at size()
+ */
+CandidateSet CandidateSet::getBestCandidateTrees(int numTrees) {
+	CandidateSet best;
+	if (numTrees >= size())
+		numTrees = size();
+	reverse_iterator cur = rbegin();
+	while (cur != rend() && numTrees > 0) {
+		best.insert(*cur);
+		cur++;
+		numTrees--;
+	}
+	return best;
+}
+
+/** True if \a topo is a key of the topologies map. */
+bool CandidateSet::treeTopologyExist(string topo) {
+	bool missing = (topologies.find(topo) == topologies.end());
+	return !missing;
+}
+
+/** True if the topology derived from \a tree is already recorded. */
+bool CandidateSet::treeExist(string tree) {
+	string topo = getTopology(tree);
+	return treeTopologyExist(topo);
+}
+
+/**
+ * Search from the best entry downwards for a tree with the given topology.
+ *
+ * @return iterator to the first match found, or end() if there is none
+ */
+CandidateSet::iterator CandidateSet::getCandidateTree(string topology) {
+	CandidateSet::reverse_iterator cur = rbegin();
+	while (cur != rend()) {
+		if (cur->second.topology == topology) {
+			// base() points one past the element the reverse iterator refers to
+			CandidateSet::iterator hit = cur.base();
+			--hit;
+			return hit;
+		}
+		cur++;
+	}
+	return end();
+}
+
+/**
+ * Erase the first entry (searching from the best score downwards) whose
+ * topology equals \a topology, together with its topologies-map record.
+ * A match is asserted to exist.
+ */
+void CandidateSet::removeCandidateTree(string topology) {
+	CandidateSet::reverse_iterator cur = rbegin();
+	while (cur != rend() && cur->second.topology != topology)
+		cur++;
+
+	bool removed = (cur != rend());
+	if (removed) {
+		// base() points one past the element the reverse iterator refers to
+		CandidateSet::iterator victim = cur.base();
+		--victim;
+		erase(victim);
+		topologies.erase(topology);
+	}
+	assert(removed);
+}
+
+/** True if stableSplit.containSplit() reports \a sp as present. */
+bool CandidateSet::isStableSplit(Split& sp) {
+	bool stable = stableSplit.containSplit(sp);
+	return stable;
+}
+
+/**
+ * Rebuild stableSplit from the \a numTree best locally optimal trees: a
+ * split is kept when its count equals the number of trees used and
+ * countTaxa() is greater than 1.
+ *
+ * @param numTree  number of trees to use; 0 means getNumLocalOptTrees()
+ * @return number of splits added to stableSplit
+ */
+int CandidateSet::computeSplitSupport(int numTree) {
+	stableSplit.clear();
+	if (numTree == 0)
+		numTree = getNumLocalOptTrees();
+	SplitIntMap hash_ss;
+	SplitGraph sg;
+	MTreeSet boot_trees;
+	vector<string> usedTrees = getBestLocalOptimalTrees(numTree);
+	assert(usedTrees.size() > 1);
+	int fullSupport = usedTrees.size();
+	boot_trees.init(usedTrees, aln->getSeqNames(), params->is_rooted);
+	boot_trees.convertSplits(aln->getSeqNames(), sg, hash_ss, SW_COUNT, -1, NULL, false);
+
+	int numKept = 0;
+	SplitIntMap::iterator cur = hash_ss.begin();
+	while (cur != hash_ss.end()) {
+		if (cur->second == fullSupport && cur->first->countTaxa() > 1) {
+			numKept++;
+			// stableSplit stores its own copy of the split
+			stableSplit.push_back(new Split(*(cur->first)));
+		}
+		cur++;
+	}
+	return numKept;
+}
+
+/** Replace the stored alignment pointer. */
+void CandidateSet::setAln(Alignment* aln) {
+	CandidateSet::aln = aln;
+}
+
+/** Count the entries whose localOpt flag is set. */
+int CandidateSet::getNumLocalOptTrees() {
+	int flagged = 0;
+	iterator cur = begin();
+	while (cur != end()) {
+		if (cur->second.localOpt)
+			flagged++;
+		cur++;
+	}
+	return flagged;
+}
diff --git a/candidateset.h b/candidateset.h
new file mode 100644
index 0000000..aaa3392
--- /dev/null
+++ b/candidateset.h
@@ -0,0 +1,287 @@
+/*
+ * candidateset.h
+ *
+ *  Created on: Jun 1, 2014
+ *      Author: Tung Nguyen
+ */
+
+#ifndef CANDIDATESET_H_
+#define CANDIDATESET_H_
+#include "tools.h"
+#include "alignment.h"
+#include "mtreeset.h"
+#include <stack>
+
+/**
+ * One entry of a CandidateSet: a tree string, its topology key and its score.
+ * The struct declares no constructor, so score and localOpt hold no defined
+ * value until they are assigned.
+ */
+struct CandidateTree {
+
+	/**
+	 * Tree string with branch lengths.
+	 * Empty for an intermediate NNI tree.
+	 */
+	string tree;
+
+
+	/**
+	 * Tree topology WITHOUT branch lengths
+	 * and WITH TAXON ID (instead of taxon names),
+	 * for sorting purposes.
+	 */
+	string topology;
+
+	/**
+	 * Log-likelihood or parsimony score.
+	 */
+	double score;
+
+	/**
+	 *  Indicates whether the tree is NNI locally optimal.
+	 *  The reason to have this variable is that if the -reduction option is
+	 *  enabled, non-locally optimal trees are also stored in the set.
+	 *  This is done to identify trees that belong to the same basin of attraction.
+	 */
+	bool localOpt;
+
+};
+
+
+/**
+ * Candidate tree set, sorted in ascending order of scores, i.e. the last element is the highest scoring tree
+ */
+class CandidateSet : public multimap<double, CandidateTree> {
+
+public:
+    /**
+     * Initialization: store the alignment and parameter pointers.
+     */
+	void init(Alignment* aln, Params *params);
+
+	CandidateSet();
+
+    /**
+     * Return one randomly chosen candidate tree.
+     * NOTE(review): the implementation in candidateset.cpp draws among the
+     * min(popSize, size()) best trees, not among max_candidate trees.
+     */
+    string getRandCandTree();
+
+    /**
+     * return the next parent tree for reproduction.
+     * Here we always maintain a list of candidate trees which have not
+     * been used for reproduction. If all candidate trees have been used, we select the
+     * current best trees as the new parent trees
+     * (declaration currently disabled)
+     */
+//    string getNextCandTree();
+
+    /**
+     *  Replace an existing tree in the candidate set
+     *  (declaration currently disabled)
+     *  @param tree the new tree string that will replace the existing tree
+     *  @param score the score of the new tree
+     *  @return true if the topology of \a tree exist in the candidate set
+     */
+//    bool replaceTree(string tree, double score);
+
+    /**
+     *  create the parent tree set containing top trees
+     *  (declaration currently disabled)
+     */
+//    void initParentTrees();
+
+    /**
+     * Update/insert \a tree into the candidate set.
+     * NOTE(review): the implementation in candidateset.cpp inserts every new
+     * topology; the worst tree is evicted first only when the set is full
+     * and the new score is higher than the worst score.
+     *
+     * @param tree
+     * 	The new tree string (with branch lengths)
+     * @param score
+     * 	The score (ML or parsimony) of \a tree
+     * @param localOpt
+     * 	Tells whether \a tree is locally optimal (DEFAULT: true)
+     * @return false if tree topology already exists, true otherwise
+     *
+     */
+    bool update(string tree, double score, bool localOpt = true);
+
+    /**
+     *  Get the \a numBestScores best scores in the candidate set
+     *
+     *  @param numBestScores
+     *  	Number of best scores (DEFAULT 0: all scores)
+     *  @return
+     *  	Vector containing \a numBestScores best scores, best first
+     */
+    vector<double> getBestScores(int numBestScores = 0);
+
+    /**
+     * Get the worst score
+     *
+     * @return the worst score
+     */
+    double getWorstScore();
+
+    /**
+     * Get best score
+     *
+     * @return the best score
+     */
+    double getBestScore();
+
+    /**
+     *  Get \a numTree top scoring trees
+     *
+     *  @param numTree
+     *  	Number of top scoring trees
+     *  @return
+     *  	Vector of current best trees
+     */
+    vector<string> getTopTrees(int numTree = 0);
+
+    /**
+     * 	Get \a numTree best locally optimal trees
+     * 	@param numTree
+     * 		Number of locally optimal trees
+     * 	@return
+     * 		Vector of current best locally optimal trees
+     */
+    vector<string> getBestLocalOptimalTrees(int numTree = 0);
+
+    /**
+     * 	Get tree(s) with the best score. There could be more than one
+     * 	tree that share the best score (this happens frequently with parsimony)
+     * 	@return
+     * 		A vector containing trees with the best score
+     */
+    vector<string> getBestTrees();
+
+    /**
+     * destructor
+     */
+    virtual ~CandidateSet();
+
+    /**
+     * 	Check if tree topology \a topo already exists
+     *
+     * 	@param topo
+     * 		Newick string of the tree topology
+     */
+    bool treeTopologyExist(string topo);
+
+    /**
+     * 	Check if the topology of \a tree already exists
+     *
+     * 	@param tree
+     * 		Newick string of the tree
+     */
+    bool treeExist(string tree);
+
+    /**
+     * 	Return a unique topology (sorted by taxon names, rooted at taxon with alphabetically smallest name)
+     * 	without branch lengths
+     *
+     * 	@param tree
+     * 		The newick tree string, from which the topology string will be generated
+     * 	@return
+     * 		Newick string of the tree topology
+     */
+    string getTopology(string tree);
+
+    /**
+     * return the score of \a topology
+     *
+     * @param topology
+     * 		Newick topology
+     * @return
+     * 		Score of the topology
+     */
+    double getTopologyScore(string topology);
+
+    /**
+     *  Empty the candidate set
+     */
+    void clear();
+
+    /**
+     *  Empty the \a topologies data structure;
+     */
+    void clearTopologies();
+
+    /**
+     * Compute the split support from the \a numTree best local optimal trees in the candidate sets
+     * @param numTree the number of best trees used to calculate support values
+     * @return number of splits with 100% support value
+     */
+    int computeSplitSupport(int numTree = 0);
+
+    /**
+     * Check whether split \a sp is one of the stable splits.
+     * @param sp the split to check, must have the same taxon set as the trees in CandidateSet.
+     * @return true if the stable splits contain \a sp, false otherwise.
+     */
+    bool isStableSplit(Split& sp);
+
+    /**
+     * Return an iterator to the \a CandidateTree that has topology equal to \a topology
+     * @param topology
+     * @return iterator to the matching entry, or end() if there is none
+     */
+    iterator getCandidateTree(string topology);
+
+    /**
+     * Remove the \a CandidateTree with topology equal to \a topology
+     * @param topology
+     */
+    void removeCandidateTree(string topology);
+
+    /* Getter and Setter function */
+	void setAln(Alignment* aln);
+	// NOTE(review): candidateset.cpp contains no definition for the five
+	// accessors below - confirm they are defined elsewhere before calling them.
+	int getMaxCandidates() const;
+	void setMaxCandidates(int maxCandidates);
+	int getPopSize() const;
+	void setPopSize(int popSize);
+	void setIsRooted(bool isRooted);
+	/** Read-only access to the <topology_string, score> map. */
+	const StringDoubleHashMap& getTopologies() const {
+		return topologies;
+	}
+
+	/**
+	 * get number of locally optimal trees in the set
+	 * @return number of entries whose localOpt flag is set
+	 */
+	int getNumLocalOptTrees();
+
+    /**
+     * Return a CandidateSet containing \a numTrees of current best candidate trees
+     * @param numTrees number of trees wanted
+     * @return a new set holding copies of the best entries
+     */
+    CandidateSet getBestCandidateTrees(int numTrees);
+
+	/** Access to the splits computed by computeSplitSupport(). */
+	SplitGraph& getStableSplits() {
+		return stableSplit;
+	}
+
+private:
+
+    /**
+     *  Set of supported splits by the best trees
+     */
+    SplitGraph stableSplit;
+
+    /**
+     *  Shared params pointing to the global params
+     */
+    Params* params;
+
+    /**
+     *  Map data structure storing <topology_string, score>
+     */
+    StringDoubleHashMap topologies;
+
+    /**
+     *  Trees used for reproduction.
+     *  NOTE(review): only referenced by commented-out code in candidateset.cpp.
+     */
+    stack<string> parentTrees;
+
+    /**
+     * pointer to alignment, just to assign correct IDs for taxa
+     */
+    Alignment *aln;
+
+};
+
+#endif /* CANDIDATESET_H_ */
diff --git a/checkpoint.cpp b/checkpoint.cpp
new file mode 100644
index 0000000..a85a479
--- /dev/null
+++ b/checkpoint.cpp
@@ -0,0 +1,169 @@
+/*
+ * checkpoint.cpp
+ *
+ *  Created on: Jun 12, 2014
+ *      Author: minh
+ */
+
+#include "checkpoint.h"
+
+/*
+ * The following parameters have been saved for checkpoint in IQPNNI
+ *
+Number iterations: 200
+Maximum number iterations: 2000
+Current number iterations: 139
+Probability of deleting a sequence: 0.5
+Number representatives: 4
+Stopping rule (0: YES, 1: YES_MIN_ITER, 2: YES_MAX_ITER, 3: NO): 3
+Type of data (0:NUCLEOTIDE, 1:AMINO_ACID): 0
+Substitution model (0:HKY85, 1: TN93, 2:GTR, 3:WAG, 4:JTT, 5:VT, 6:MtREV24, 7:Blosum62, 8:Dayhoff, 9:rtREV, 10: User-defined): 0
+Frequency of Base A: 0.248672
+Frequency of Base C: 0.261687
+Frequency of Base G: 0.250996
+Frequency of Base T: 0.238645
+Type of parameters (0:ESTIMATE,  1:USER_DEFINED, 2: EQUAL): 0
+Transition/transversion ratito: 0.766912
+Type of parameters (0:ESTIMATE,  1:USER_DEFINED): 0
+Pyridimine/purine ratito: 1
+Type of parameters (0:ESTIMATE,  1:USER_DEFINED): 0
+Transition rate from A to G: -1
+Transition rate from C to T: -1
+Transversion rate from A to C: -1
+Transversion rate from A to T: -1
+Transversion rate from C to G: -1
+Transversion rate from G to T: -1
+Type of parameters (0:ESTIMATE,  1:USER_DEFINED): 0
+Type of rate heterogeneity (0:UNIFORM, 1:SITE_SPECIFIC, 2:GAMMA): 0
+Number rates: 1
+Gamma distribution parameter alpha: 1
+Type of parameters (0:ESTIMATE,  1:USER_DEFINED): 0
+Invariant type (0: NONE, 1:ESTIMATE, 2: USER_DEFINED): 0
+Proportion of invariable sites: 0
+Out group sequence: 0
+Bootstrap sample: 0
+Current bootstrap sample: 0
+Build consensus: 0
+Current best log-likelihood: -11833.35062
+Elapsed time: 23
+Finished: 0
+ */
+
+/** Create an empty checkpoint with no file name set. */
+Checkpoint::Checkpoint() : filename("") {
+}
+
+/** Set the file used by load() and commit(). */
+void Checkpoint::setFileName(string filename) {
+	this->filename.assign(filename);
+}
+/**
+ * Read "key := value" lines from the checkpoint file into this map.
+ *
+ * The first line must be the header written by commit().  Empty lines are
+ * skipped.  Any other line without the " := " separator is reported through
+ * outError(), as are read failures and a wrong header.
+ */
+void Checkpoint::load() {
+	assert(filename != "");
+    try {
+        ifstream in;
+        // set the failbit and badbit
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(filename.c_str());
+        string line;
+        getline(in, line);
+        if (line != "Checkpoint file for IQ-TREE")
+        	throw ("Invalid checkpoint file");
+        // remove the failbit, so that reaching end-of-file does not throw
+        in.exceptions(ios::badbit);
+        const string separator = " := ";
+        // Loop on the result of getline() rather than on eof(): the file
+        // ends with a newline, so an eof() test would yield one extra,
+        // empty line.
+        while (getline(in, line)) {
+        	if (line.empty())
+        		continue;
+        	size_t pos = line.find(separator);
+        	if (pos == string::npos)
+        		throw "':=' is expected between key and value";
+        	// skip the whole separator so that the value has no leading blank
+        	(*this)[line.substr(0, pos)] = line.substr(pos + separator.length());
+        }
+        in.clear();
+        // set the failbit again
+        in.exceptions(ios::failbit | ios::badbit);
+        in.close();
+    } catch (ios::failure &) {
+        outError(ERR_READ_INPUT);
+    } catch (const char *str) {
+        outError(str);
+    } catch (string &str) {
+        outError(str);
+    }
+}
+
+/**
+ * Write the header line followed by one "key := value" line per map entry
+ * to the checkpoint file.  Stream failures are reported through outError().
+ */
+void Checkpoint::commit() {
+	assert(filename != "");
+    try {
+        ofstream out;
+        out.exceptions(ios::failbit | ios::badbit);
+        out.open(filename.c_str());
+        out << "Checkpoint file for IQ-TREE" << endl;
+        iterator entry = begin();
+        while (entry != end()) {
+        	out << entry->first << " := " << entry->second << endl;
+        	entry++;
+        }
+        out.close();
+    } catch (ios::failure &) {
+        outError(ERR_WRITE_OUTPUT, filename.c_str());
+    }
+}
+
+/** True if \a key is present in the map. */
+bool Checkpoint::containsKey(string key) {
+	iterator pos = find(key);
+	return pos != end();
+}
+
+/**
+ * series of get functions
+ */
+
+/**
+ * Extract the value stored under \a key into \a value with operator>>.
+ * The key is asserted to be present.
+ */
+template<class T>
+void Checkpoint::get(string key, T& value) {
+	assert(containsKey(key));
+	string &stored = (*this)[key];
+	stringstream parser(stored);
+	parser >> value;
+}
+
+/** True only if the stored string is exactly "1". The key is asserted to be present. */
+bool Checkpoint::getBool(string key) {
+	assert(containsKey(key));
+	return (*this)[key] == "1";
+}
+
+/** First character of the stored string. The key is asserted to be present. */
+char Checkpoint::getChar(string key) {
+	assert(containsKey(key));
+	string &stored = (*this)[key];
+	return stored[0];
+}
+
+/** Stored string passed through convert_double(). The key is asserted to be present. */
+double Checkpoint::getDouble(string key) {
+	assert(containsKey(key));
+	string &stored = (*this)[key];
+	return convert_double(stored.c_str());
+}
+
+/** Stored string passed through convert_int(). The key is asserted to be present. */
+int Checkpoint::getInt(string key) {
+	assert(containsKey(key));
+	string &stored = (*this)[key];
+	return convert_int(stored.c_str());
+}
+
+/**
+ * series of put functions
+ */
+
+/** Store \a value under \a key, formatted with operator<<. */
+template<class T>
+void Checkpoint::put(string key, T value) {
+	stringstream formatted;
+	formatted << value;
+	string text = formatted.str();
+	(*this)[key] = text;
+}
+
+/**
+ * Store the first \a num elements of \a value under \a key as one string,
+ * with a ',' between consecutive elements.
+ */
+template<class T>
+void Checkpoint::putArray(string key, int num, T* value) {
+	stringstream joined;
+	if (num > 0) {
+		joined << value[0];
+		for (int k = 1; k < num; k++)
+			joined << ',' << value[k];
+	}
+	(*this)[key] = joined.str();
+}
+
+
+/** Destructor; performs no explicit clean-up of its own. */
+Checkpoint::~Checkpoint() {}
+
diff --git a/checkpoint.h b/checkpoint.h
new file mode 100644
index 0000000..f92c3a4
--- /dev/null
+++ b/checkpoint.h
@@ -0,0 +1,65 @@
+/*
+ * checkpoint.h
+ *
+ *  Created on: Jun 12, 2014
+ *      Author: minh
+ */
+
+#ifndef CHECKPOINT_H_
+#define CHECKPOINT_H_
+
+#include "tools.h"
+
+/**
+ * Checkpoint as map from key strings to value strings
+ */
+class Checkpoint : public map<string, string> {
+public:
+	/** Create an empty checkpoint with an empty file name. */
+	Checkpoint();
+	/**
+	 * Set the file used by load() and commit().
+	 * @param filename file name
+	 */
+	void setFileName(string filename);
+	/**
+	 * load checkpoint information from file
+	 */
+	void load();
+
+	/**
+	 * commit checkpoint information into file
+	 */
+	void commit();
+
+	/**
+	 * @return true if checkpoint contains the key
+	 * @param key key to search for
+	 */
+	bool containsKey(string key);
+
+	/**
+	 * series of get functions
+	 *
+	 * NOTE(review): the template members get/put/putArray are defined in
+	 * checkpoint.cpp, not in this header - confirm that every use from
+	 * another source file can be instantiated.
+	 */
+	template<class T>
+	void get(string key, T& value);
+
+	bool getBool(string key);
+	char getChar(string key);
+	double getDouble(string key);
+	int getInt(string key);
+
+
+	/**
+	 * series of put functions
+	 */
+	template<class T>
+	void put(string key, T value);
+
+	/**
+	 * Store \a num elements of \a value under \a key as one string.
+	 */
+	template<class T>
+	void putArray(string key, int num, T* value);
+
+	virtual ~Checkpoint();
+
+	/** name of the checkpoint file read by load() and written by commit() */
+	string filename;
+};
+
+#endif /* CHECKPOINT_H_ */
diff --git a/circularnetwork.cpp b/circularnetwork.cpp
new file mode 100644
index 0000000..735b957
--- /dev/null
+++ b/circularnetwork.cpp
@@ -0,0 +1,622 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "circularnetwork.h"
+
+CircularNetwork::CircularNetwork()
+ : PDNetwork() // nothing to set up beyond the default-constructed PDNetwork base
+{
+}
+
+CircularNetwork::CircularNetwork(Params &params) : PDNetwork(params) { // params is forwarded to PDNetwork
+}
+
+/********************************************************
+	MAIN FUNCTION
+********************************************************/
+
+void CircularNetwork::findPD(Params &params, vector<SplitSet> &taxa_set, 
+	vector<int> &taxa_order) {
+
+	// The dynamic programming below needs a circular network and is skipped when
+	// another algorithm was requested or isPDArea() holds: use the inherited findPD.
+	const bool use_inherited = !isCircular() || params.run_mode == EXHAUSTIVE
+		|| params.run_mode == GREEDY || params.run_mode == LINEAR_PROGRAMMING
+		|| isPDArea();
+	if (use_inherited) {
+		PDNetwork::findPD(params, taxa_set, taxa_order);
+		return;
+	}
+
+	// call the entering function
+	enterFindPD(params);
+	params.detected_mode = DYNAMIC_PROGRAMMING;
+
+	// a non-empty initial set makes its first taxon the root (-1: unrooted)
+	const int root_id = (initialset.size() > 0) ? initialset[0] : -1;
+	const bool budgeted = isBudgetConstraint();
+	// one result slot per requested budget (or per requested subset size)
+	if (budgeted)
+		taxa_set.resize(params.budget - params.min_budget + 1);
+	else
+		taxa_set.resize(params.sub_size - params.min_size + 1);
+	cout << endl << "Dynamic programming on circular split network..." << endl;
+
+	if (budgeted && root_id < 0)
+		findCircularPDBudget(params, taxa_set, taxa_order);
+	else if (budgeted)
+		findCircularRootedPDBudget(params, taxa_set, taxa_order, root_id);
+	else if (root_id < 0)
+		findCircularPD(params, taxa_set, taxa_order);
+	else
+		findCircularRootedPD(params, taxa_set, taxa_order, root_id);
+
+	// call the leaving function
+	leaveFindPD(taxa_set);
+}
+
+
+
+
+/**
+	display the matrix into out (another version)
+*/
+template <class T>
+void reportMyMat(ostream &out, matrix(T) &mat) {
+	// One output line per matrix row.
+	// Cells are joined with " & " and every row but the last ends with " \\".
+	// A cell equal to 0 is printed as the placeholder " - &  " instead of
+	// its value (this also applies to the last cell of a row).
+	for (unsigned int row = 0; row < mat.size(); row++) {
+		const bool last_row = (row + 1 == mat.size());
+		for (unsigned int col = 0; col < mat[row].size(); col++) {
+			if (mat[row][col] == 0) { out << " - &  "; continue; }
+			out << mat[row][col];
+			if (col + 1 < mat[row].size()) out << " & ";
+		}
+		if (!last_row) out << " \\\\";
+		out << endl;
+	}
+}
+
+
+
+void CircularNetwork::computePDInfo(Params &params, DoubleMatrix &table, 
+		 DoubleMatrix &dist, int root) {
+	int ntaxa = getNTaxa();
+	int v, k, w;
+	// allocate the solution table; cells right of root start at INT_MIN (acts as -infinity)
+	table.resize(params.sub_size-1);
+	for (k = 0; k < params.sub_size-1; k++) {
+		table[k].resize(ntaxa);
+		for (v = root+1; v < ntaxa; v++)
+			table[k][v] = INT_MIN;
+	}
+	// table[k][v]: largest sum of consecutive distances over chains
+	// root < w_1 < ... < w_k < v, i.e. with exactly k taxa between root and v
+	// (columns v <= root are not written by this function)
+	for (v = root+1; v < ntaxa; v++) {
+		// initialize cube[0] to distance matrix
+		table[0][v] = dist[root][v];
+		// now iteratively calculate cube[k]
+		for (k = 1; k < params.sub_size-1 && k < v-root; k++) {
+			for (w = k+root; w < v; w++) { // w is the last taxon before v
+				double sum = table[k-1][w] + dist[v][w];
+				if (table[k][v] < sum) {
+					table[k][v] = sum;
+				}
+			}
+		}
+	}
+	//cout << table;
+}
+
+double CircularNetwork::computePDScore(int sub_size, DoubleMatrix &table, int root) {
+	// Scan every end taxon v after the root and keep the largest value of
+	// table[0][v] + table[sub_size-2][v]; half of that maximum is returned.
+	const int last = getNTaxa();
+	const int row = sub_size - 2;
+	double best = INT_MIN;
+	for (int v = root + 1; v < last; v++) {
+		const double candidate = table[0][v] + table[row][v];
+		if (best < candidate) best = candidate;
+	}
+	return best / 2.0;
+}
+
+void rotateTaxaOrder(vector<int> &origin_order, vector<int> &new_order, int root) {
+	// locate the first position holding root, then copy the order starting there
+	const int ntaxa = origin_order.size();
+	int start = 0;
+	while (start < ntaxa && origin_order[start] != root) start++;
+	assert(start < ntaxa);
+	new_order.resize(ntaxa);
+	for (int pos = 0; pos < ntaxa; pos++)
+		new_order[pos] = origin_order[(start + pos) % ntaxa];
+}
+
+
+void CircularNetwork::findCircularPD(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order) {
+
+	// Unrooted search: try every admissible position of the circular order as
+	// root, and for each subset size keep only the sets reaching the best score so far.
+	const int last_root = getNTaxa() - params.min_size;
+	// calculate the distance matrix
+	DoubleMatrix dist;
+	calcDistance(dist, taxa_order);
+
+	DoubleMatrix table;
+	for (int root = 0; root <= last_root; root++) {
+		// dynamic programming main procedure: fill the table for this root
+		computePDInfo(params, table, dist, root);
+
+		// now construct the optimal PD sets for every requested size
+		for (int size = params.min_size; size <= params.sub_size; size++) {
+			SplitSet &best_sets = taxa_set[size - params.min_size];
+			const double pd_score = computePDScore(size, table, root);
+			const double old_score = best_sets.getWeight();
+
+			if (old_score > pd_score)
+				continue; // the sets found from earlier roots are better
+			if (old_score < pd_score)
+				best_sets.removeAll(); // strictly better: drop the old sets
+			else if (!params.find_all)
+				continue; // tie, but only one optimal set was requested
+
+			constructPD(size, params.find_all, params.pd_limit, table, dist, best_sets, taxa_order, root);
+		}
+	}
+}
+
+
+void CircularNetwork::findCircularRootedPD(Params &params, vector<SplitSet> &taxa_set, 
+	vector<int> &origin_order, int root) {
+
+	// rotate the circular order so that the root taxon sits at position 0
+	vector<int> taxa_order;
+	rotateTaxaOrder(origin_order, taxa_order, root);
+
+	// calculate the distance matrix for the rotated order
+	DoubleMatrix dist;
+	calcDistance(dist, taxa_order);
+
+	// dynamic programming main procedure, rooted at position 0
+	DoubleMatrix table;
+	computePDInfo(params, table, dist, 0);
+
+	// build the optimal sets for every size from min_size to sub_size
+	const int first_size = params.min_size;
+	for (int size = first_size; size <= params.sub_size; size++) {
+		SplitSet &result = taxa_set[size - first_size];
+		constructPD(size, params.find_all, params.pd_limit, table, dist, result, taxa_order, 0);
+	}
+}
+
+
+/* Build the optimal PD set(s) of sub_size taxa for the given root from the
+ * DP table and append them to taxa_set. With find_all, every end taxon that
+ * ties for the best score is expanded by the recursive overload (which checks
+ * pd_limit); otherwise a single set is traced back greedily.
+ * Returns without adding anything when no end taxon qualifies. */
+void CircularNetwork::constructPD(int sub_size, bool find_all, int pd_limit, DoubleMatrix &table, 
+	DoubleMatrix  &dist, SplitSet &taxa_set, vector<int> &taxa_order, int root) {
+	int ntaxa = getNTaxa();
+	double max_pd = INT_MIN;
+	vector<int> vec_v;
+	// max_v stays -1 when no end taxon beats the INT_MIN sentinel
+	int max_v = -1, k, v, w;
+	// first calculate the PD score: best end taxon v for sub_size taxa
+	for (v = root+1; v < ntaxa; v++) {
+		if (max_pd < table[0][v] + table[sub_size-2][v]) {
+			max_pd = table[0][v] + table[sub_size-2][v];
+			max_v = v;
+		}
+	}
+	// nothing found (e.g. no taxon after root): never index with an unset max_v
+	if (max_v < 0) return;
+	vec_v.push_back(max_v);
+	if (find_all) {
+		// find all v with the same max_pd
+		for (v = max_v+1; v < ntaxa; v++) {
+			if (max_pd == table[0][v] + table[sub_size-2][v]) {
+				vec_v.push_back(v);
+			}
+		}
+	}
+	// now try all v
+	for (unsigned int sp = 0; sp < vec_v.size(); sp++) {
+		max_v = vec_v[sp];
+		// the new set is created with the halved maximum as second argument
+		Split *pd_set = new Split(ntaxa, max_pd / 2.0);
+		pd_set->addTaxon(taxa_order[root]);
+		pd_set->addTaxon(taxa_order[max_v]);
+		if (!find_all) {
+			// trace back one optimal chain of intermediate taxa
+			for (k = sub_size-2; k >= 1; k--) {
+				double max = INT_MIN;
+				int max_w = 0;
+				for (w = root+1; w < max_v; w++) 
+					if (max < table[k-1][w] + dist[max_v][w]) {
+						max = table[k-1][w] + dist[max_v][w];
+						max_w = w;
+					}
+				pd_set->addTaxon(taxa_order[max_w]);
+				max_v = max_w;
+			}
+			taxa_set.push_back(pd_set);
+		} else 
+			constructPD(sub_size-2, max_v, pd_limit, pd_set, table, dist, taxa_set, taxa_order, root);
+	}
+}
+
+void CircularNetwork::constructPD(int sub_size, int max_v, int pd_limit, Split *pd_set, DoubleMatrix &table,
+	DoubleMatrix &dist, SplitSet &taxa_set, vector<int> &taxa_order, int root) {
+	// adds sub_size more taxa (positions between root and max_v) to pd_set and stores it in taxa_set
+	if (sub_size == 0) { // nothing left to choose: pd_set is complete
+		taxa_set.push_back(pd_set);
+		return;
+	}
+	// pd_set itself follows the first best w at every step; each later w that ties
+	int k;
+	// is explored on a copy of pd_set by a recursive call while taxa_set.size() < pd_limit
+	for (k = sub_size; k >= 1; k--) {
+		int w, max_w = 0;
+		double max = INT_MIN;
+		// first w reaching the best extension score for the current end taxon max_v
+		for (w = root+1; w < max_v; w++) 
+			if (max < table[k-1][w] + dist[max_v][w]) {
+				max = table[k-1][w] + dist[max_v][w];
+				max_w = w;
+			}
+
+		// recursive if find another PD set	
+		for (w = max_w+1; w < max_v && taxa_set.size() < pd_limit; w++) 
+			if (max == table[k-1][w] + dist[max_v][w]) {
+				Split *new_pd = new Split(*pd_set);
+				new_pd->addTaxon(taxa_order[w]);
+				constructPD(k-1, w, pd_limit, new_pd, table, dist, taxa_set, taxa_order, root);
+			}
+		// continue the main chain with the first best w
+		pd_set->addTaxon(taxa_order[max_w]);
+		max_v = max_w;
+		// (the next loop iteration handles the remaining k-1 taxa before max_w)
+	}	
+	taxa_set.push_back(pd_set);
+
+}
+
+/********************************************************
+	CIRCULAR NETWORKS WITH BUDGET CONSTRAINT
+********************************************************/
+
+void CircularNetwork::calcMaxBudget(int budget, matrix(int) &max_b, vector<int> &taxa_order) {
+	// max_b[u][v] (u < v): cost of all taxa from u to v in the given order,
+	// capped at budget, minus the costs of the two end taxa u and v themselves
+	const int ntaxa = getNTaxa();
+	max_b.resize(ntaxa-1);
+	for (int first = 0; first < ntaxa-1; first++) {
+		max_b[first].resize(ntaxa);
+		// capped running total of the costs along the order, starting at first
+		max_b[first][first] = pda->costs[taxa_order[first]];
+		if (max_b[first][first] > budget) max_b[first][first] = budget;
+		for (int last = first+1; last < ntaxa; last++) {
+			max_b[first][last] = max_b[first][last-1] + pda->costs[taxa_order[last]];
+			if (max_b[first][last] > budget) max_b[first][last] = budget;
+		}
+		// exclude both end taxa; the diagonal entry keeps the capped cost
+		for (int last = first+1; last < ntaxa; last++)
+			max_b[first][last] -= (pda->costs[taxa_order[first]] + pda->costs[taxa_order[last]]);
+	}
+}
+
+
+
+void CircularNetwork::constructPDBudget(int budget, bool find_all, matrix(double) &table,
+	matrix(double) &dist, SplitSet &taxa_set, vector<int> &taxa_order, 
+	matrix(int) &max_b, int root) {
+	// builds the best set(s) for this budget and root from the DP table and appends them to taxa_set
+	int ntaxa = getNTaxa();
+	// now trace back to get the maximum pd_k
+	double max_pd = INT_MIN;
+	int v, s, b, sp;
+	int max_v = -1, total_b;
+	vector<int> vec_v;
+	// reduce the budget
+	budget -= pda->costs[taxa_order[root]];
+	// pick the end taxon v with the largest dist[root][v] + table[v][budget left for inner taxa]
+	for (v = root+1; v < ntaxa; v++) {
+		total_b = budget - pda->costs[taxa_order[v]];
+		if (total_b > max_b[root][v]) total_b = max_b[root][v];
+		if (total_b < 0) continue;
+		if (max_pd < dist[root][v] + table[v][total_b]) {
+			max_pd = dist[root][v] + table[v][total_b];
+			max_v = v;
+		}
+	}
+
+	// check if find something...
+	if (max_v < 0)
+		return;
+
+	if (find_all) {
+		// find all u,v with the same max_pd
+		vec_v.push_back(max_v);
+		for (v = max_v+1; v < ntaxa; v++) {
+			total_b = budget - pda->costs[taxa_order[v]];
+			if (total_b > max_b[root][v]) total_b = max_b[root][v];
+			if (total_b < 0) continue;
+			if (max_pd == dist[root][v] + table[v][total_b]) {
+				vec_v.push_back(v);
+			}
+		}
+	} else {
+		// otherwise, only use max_v
+		vec_v.push_back(max_v);
+	}
+
+	// now try all u,v
+	for (sp = 0; sp < vec_v.size(); sp ++) {
+		// each candidate end taxon gets its own Split, created with max_pd / 2.0
+		max_v = vec_v[sp];
+
+		Split* pd_set = new Split(ntaxa, max_pd / 2.0);
+		pd_set->addTaxon(taxa_order[root]);
+		pd_set->addTaxon(taxa_order[max_v]);
+		if (!find_all) {
+			b = budget - pda->costs[taxa_order[max_v]];
+			if (b > max_b[root][max_v]) b = max_b[root][max_v];
+			// now trace to the minimum budget required
+			while (b > 0 && table[max_v][b] == table[max_v][b-1]) b--;
+
+			// iteratively find taxa inbetween
+			while (b >= 0) {
+				double max = INT_MIN;
+				int max_s = -1;
+				for (s = root+1; s < max_v; s++) 
+					if (b >= pda->costs[taxa_order[s]]) {
+						int sub_b = b - pda->costs[taxa_order[s]];
+						if (sub_b > max_b[root][s]) sub_b = max_b[root][s];
+						if (sub_b < 0) continue;
+						if (max < dist[s][max_v] + table[s][sub_b]) {
+							max = dist[s][max_v] + table[s][sub_b];
+							max_s = s;
+						}
+					}
+				if (max_s == -1) break; // no affordable taxon before max_v is left
+				pd_set->addTaxon(taxa_order[max_s]);
+				b -= pda->costs[taxa_order[max_s]];
+				if (b > max_b[root][max_s])
+					b = max_b[root][max_s];
+				max_v = max_s;
+			}
+			taxa_set.push_back(pd_set);
+		} else {
+			// let the recursive overload enumerate the tying sets
+			b = budget - pda->costs[taxa_order[max_v]];
+			if (b > max_b[root][max_v]) b = max_b[root][max_v];
+			constructPDBudget(b, max_v, pd_set, table, dist, taxa_set, taxa_order, max_b, root);
+		}
+	}
+}
+
+
+void CircularNetwork::constructPDBudget(int budget, int max_v, Split *pd_set, 
+	matrix(double) &table, matrix(double) &dist, SplitSet &taxa_set, 
+	vector<int> &taxa_order, matrix(int) &max_b, int root) {
+	// extends pd_set with taxa before max_v within budget and appends it to taxa_set
+	int b = budget;
+	// pd_set follows the first best s at every step; later ties continue on copies
+	while (b >= 0) {
+		int s, max_s = -1;
+		double max = INT_MIN;
+		// first affordable s with the best dist[s][max_v] + table[s][remaining budget]
+		for (s = root+1; s < max_v; s++) 
+			if (b >= pda->costs[taxa_order[s]]) {
+				int sub_b = b - pda->costs[taxa_order[s]];
+				if (sub_b > max_b[root][s]) sub_b = max_b[root][s];
+				if (sub_b < 0) continue;
+				if (max < dist[s][max_v] + table[s][sub_b]) {
+					max = dist[s][max_v] + table[s][sub_b];
+					max_s = s;
+				}
+			}
+
+		if (max_s < 0) break; // nothing affordable is left before max_v
+
+		// recursive if find another PD set	
+		for (s = max_s+1; s < max_v; s++) 
+			if (b >= pda->costs[taxa_order[s]]) {
+				int sub_b = b - pda->costs[taxa_order[s]];
+				if (sub_b > max_b[root][s]) sub_b = max_b[root][s];
+				if (sub_b < 0) continue;
+				if (max == dist[s][max_v] + table[s][sub_b]) {
+					Split *new_pd = new Split(*pd_set);
+					new_pd->addTaxon(taxa_order[s]);
+					constructPDBudget(sub_b, s, new_pd, table, dist, 
+						taxa_set, taxa_order, max_b, root);
+				}
+			}
+		// continue the main chain with the first best s
+		pd_set->addTaxon(taxa_order[max_s]);
+		b -= pda->costs[taxa_order[max_s]];
+		if (b > max_b[root][max_s]) b = max_b[root][max_s];
+		max_v = max_s;
+	}
+
+	taxa_set.push_back(pd_set);
+}
+
+/* Fill the budget DP table for root: table[v][b] is the best chain value
+ * root -> ... -> v whose inner taxa cost at most b. With verbose_mode >=
+ * VB_DEBUG, id[v][b] holds the last inner taxon (1-based) and both are printed. */
+void CircularNetwork::computePDBudgetInfo(Params &params, matrix(double) &table, matrix(int) &id, 
+	matrix(double) &dist, vector<int> &taxa_order, matrix(int) &max_b, int root)
+{
+	int ntaxa = getNTaxa();
+	int v, s, b, total_b;
+
+	// allocate memory and initialize table
+	table.resize(ntaxa);
+	if (verbose_mode >= VB_DEBUG) {
+		id.resize(ntaxa);
+		for (v = 0; v <= root; v++) {
+			for (b = 0; b < (int) table[v].size(); b++)
+				table[v][b] = 0;			
+			for (b = 0; b < (int) id[v].size(); b++)
+				id[v][b] = 0;
+		}
+	}
+	for (v = root+1; v < ntaxa; v++) {
+		total_b = max_b[root][v];
+		if (total_b < 0) continue; // root and v alone exceed the budget: row is not sized
+		table[v].resize(total_b + 1, 0);
+		if (verbose_mode >= VB_DEBUG) {
+			id[v].resize(total_b + 1, 0);
+			for (b = 0; b <= total_b; b++) 
+				id[v][b] = 0;
+		}
+		for (b = 0; b <= total_b; b++) // base case: no inner taxon at all
+			table[v][b] = dist[root][v];
+	}
+
+	// dynamic programming
+	for (v = root+2; v < ntaxa; v++) {
+		total_b = max_b[root][v];
+		if (total_b < 0) continue;
+		for (s = root+1; s < v; s++) {
+			// root and s together exceed the budget: table[s] was never sized, do not read it
+			if (max_b[root][s] < 0) continue;
+			for (b = pda->costs[taxa_order[s]]; b <= total_b; b++) {
+				int sub_b = b - pda->costs[taxa_order[s]];
+				if (sub_b > max_b[root][s]) sub_b = max_b[root][s];
+				double sum = dist[v][s] + table[s][sub_b];
+				if (table[v][b] < sum) {
+					table[v][b] = sum;
+					if (verbose_mode >= VB_DEBUG)
+						id[v][b] = s+1;
+				}
+			}
+		}
+	}
+
+	if (verbose_mode >= VB_DEBUG)	{
+		reportMyMat(cout, table);
+		reportMyMat(cout, id);
+	}
+}
+
+
+double CircularNetwork::computePDBudgetScore(int budget, matrix(double) &table,
+	matrix(double) &dist, vector<int> &taxa_order, matrix(int) &max_b, int root) {
+
+	// Best value of dist[root][v] + table[v][b] over all end taxa v after the
+	// root, where b is the budget left for inner taxa; half of it is returned.
+	// budget left once the root taxon itself has been paid for
+	const int after_root = budget - pda->costs[taxa_order[root]];
+	const int ntaxa = getNTaxa();
+	double best = INT_MIN;
+	for (int v = root+1; v < ntaxa; v++) {
+		// budget available between root and v, capped by max_b
+		int inner_b = after_root - pda->costs[taxa_order[v]];
+		if (inner_b > max_b[root][v]) inner_b = max_b[root][v];
+		if (inner_b < 0) continue; // root and v alone are unaffordable
+		const double score = dist[root][v] + table[v][inner_b];
+		if (best < score) best = score;
+	}
+	return best / 2.0;
+}
+
+
+/* Rooted budget search: rotate the order so the root is at position 0, run
+ * the budget DP once and build the optimal set(s) for every budget from
+ * params.min_budget to params.budget. */
+void CircularNetwork::findCircularRootedPDBudget(Params &params, vector<SplitSet> &taxa_set, 
+	vector<int> &origin_order, int root)
+{
+	vector<int> taxa_order;
+	// rotate the position of root to 0 in the taxa_order
+	rotateTaxaOrder(origin_order, taxa_order, root);
+
+	matrix(double) dist;
+	// calculate the distance matrix
+	calcDistance(dist, taxa_order);
+
+	matrix(int) max_b;
+	// calculate maximum required budget from u to v
+	calcMaxBudget(params.budget, max_b, taxa_order);
+
+	matrix(double) table;
+	matrix(int) id;
+	// compute table and id information
+	computePDBudgetInfo(params, table, id, dist, taxa_order, max_b, 0);
+
+	// now construct the optimal PD sets
+	for (int b = params.min_budget; b <= params.budget; b++) {
+		SplitSet &sets = taxa_set[b-params.min_budget];
+		constructPDBudget(b, params.find_all, table, dist, sets, taxa_order, max_b, 0);
+		if (verbose_mode >= VB_MAX) {
+			// report the sets of this budget; taxa_set.size() is only the
+			// (constant) number of budget slots and must not be used here
+			cout << "budget " << b << ": " << sets.size() << " set(s)" << endl;
+		}
+	}
+}
+
+
+void CircularNetwork::findCircularPDBudget(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order) {
+
+	// Unrooted budget search: every position but the last one of the circular
+	// order is tried as root, and for each budget only the sets reaching the
+	// best score seen so far are kept.
+	const int ntaxa = getNTaxa();
+
+	// calculate the distance matrix
+	matrix(double) dist;
+	calcDistance(dist, taxa_order);
+	if (verbose_mode >= VB_DEBUG)
+		reportMyMat(cout, dist);
+
+	// calculate maximum required budget from u to v
+	matrix(int) max_b;
+	calcMaxBudget(params.budget, max_b, taxa_order);
+
+	matrix(double) table;
+	matrix(int) id;
+
+	for (int root = 0; root < ntaxa-1; root++) {
+		// compute table and id information
+		computePDBudgetInfo(params, table, id, dist, taxa_order, max_b, root);
+
+		// now construct the optimal PD sets
+		for (int budget = params.min_budget; budget <= params.budget; budget++) {
+			SplitSet &best_sets = taxa_set[budget - params.min_budget];
+			const double pd_score = computePDBudgetScore(budget, table, dist, taxa_order, max_b, root);
+			const double old_score = best_sets.getWeight();
+
+			if (old_score > pd_score)
+				continue; // the sets found from earlier roots are better
+			if (old_score < pd_score)
+				best_sets.removeAll(); // strictly better: drop the old sets
+			else if (!params.find_all)
+				continue; // tie, but only one optimal set was requested
+
+			constructPDBudget(budget, params.find_all, table, dist, best_sets, taxa_order, max_b, root);
+		}
+	}
+}
+
diff --git a/circularnetwork.h b/circularnetwork.h
new file mode 100644
index 0000000..d0c90af
--- /dev/null
+++ b/circularnetwork.h
@@ -0,0 +1,253 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef CIRCULARNETWORK_H
+#define CIRCULARNETWORK_H
+
+#include "pdnetwork.h"
+
+/**
+Circular Network for PDA algorithm
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class CircularNetwork : public PDNetwork
+{
+public:
+
+	/**
+		empty constructor
+	*/
+    CircularNetwork();
+
+	/**
+		construct network from a NEXUS file, e.g. produced by SplitsTree
+		@param params program parameters
+	*/
+    CircularNetwork(Params &params);
+
+
+	/**
+		MAIN FUNCTION which will call the other findPD methods depending on the input splits.
+		Search for maximal phylogenetic diversity of a given size 
+		@param params program parameters
+		@param taxa_set (OUT) the set of taxa in the maximal PD set
+		@param taxa_order (OUT) order of inserted taxa
+	*/
+	virtual void findPD(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order);
+
+/********************************************************
+	Dynamic programming strategy
+********************************************************/
+
+	/**
+		dynamic programming algorithm in UNROOTED circular splits graph 
+			for phylogenetic diversity of a given size 
+		@param params parameters
+		@param taxa_set (OUT) the set of taxa in the PD-set
+		@param taxa_order (IN) order of inserted taxa
+	*/
+	void findCircularPD(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order);
+
+	/**
+		dynamic programming algorithm in ROOTED circular splits graph 
+			for phylogenetic diversity of a given size 
+		@param params parameters
+		@param taxa_set (OUT) the set of taxa in the PD-set
+		@param taxa_order (IN) order of inserted taxa
+		@param root the root taxon; taxa_order is rotated so that it comes first
+	*/
+	void findCircularRootedPD(Params &params, vector<SplitSet> &taxa_set, 
+		vector<int> &taxa_order, int root);
+
+	/**
+		dynamic programming algorithm with cost-constrained in UNROOTED circular splits graph 
+			for phylogenetic diversity under budget constraint
+		@param params program parameters
+		@param taxa_set (OUT) the set of taxa in the PD-set
+		@param taxa_order (IN) order of inserted taxa
+		(returns nothing; the results are delivered through taxa_set)
+	*/
+	void findCircularPDBudget(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order);
+	
+	/**
+		dynamic programming algorithm with cost-constrained in ROOTED circular splits graph 
+			for phylogenetic diversity under budget constraint
+		@param params program parameters
+		@param taxa_set (OUT) the set of taxa in the PD-set
+		@param taxa_order (IN) order of inserted taxa
+		(returns nothing; the results are delivered through taxa_set)
+		@param root the root taxon; taxa_order is rotated so that it comes first
+	*/
+	void findCircularRootedPDBudget(Params &params, vector<SplitSet> &taxa_set, 
+		vector<int> &taxa_order, int root);
+
+
+protected:
+
+/********************************************************
+	CIRCULAR NETWORKS
+********************************************************/
+
+	/**
+		compute the PD information table
+		@param params program parameters
+		@param table (OUT) computed information
+		@param dist distance matrix
+		@param root index of the root taxon
+	*/
+	void computePDInfo(Params &params, DoubleMatrix &table, DoubleMatrix  &dist, int root);
+
+	/**
+		compute the PD score
+		@param sub_size the subset size
+		@param table computed information
+		@param root index of the root taxon
+	*/
+	double computePDScore(int sub_size, DoubleMatrix &table, int root);
+
+
+	/**
+		construct optimal PD set from computed information for ROOTED case circular network
+		@param sub_size the subset size
+		@param find_all TRUE if we want to find all PD sets
+		@param pd_limit maximum number of returned PD sets
+		@param table computed information
+		@param dist distance matrix
+		@param taxa_set (OUT) sets of taxa with optimal PD
+		@param taxa_order circular order
+		@param root the root
+	*/
+	void constructPD(int sub_size, bool find_all, int pd_limit, DoubleMatrix &table, DoubleMatrix  &dist, 
+		SplitSet &taxa_set, vector<int> &taxa_order, int root);
+
+	/**
+		construct optimal PD set from computed information for ROOTED case circular network
+		@param sub_size the subset size
+		@param max_v end taxon
+		@param pd_limit maximum number of returned PD sets
+		@param pd_set the current constructed PD set
+		@param table computed information
+		@param dist distance matrix
+		@param taxa_set (OUT) sets of taxa with optimal PD
+		@param taxa_order circular order
+		@param root the root
+	*/
+	void constructPD(int sub_size, int max_v, int pd_limit, Split *pd_set, DoubleMatrix &table, 
+		DoubleMatrix  &dist, SplitSet &taxa_set, vector<int> &taxa_order, int root);
+
+
+/********************************************************
+	CIRCULAR NETWORKS WITH BUDGET CONSTRAINT
+********************************************************/
+
+
+	/**
+		calculate the maximum budget required from u to v (excluding u and v)
+		@param budget total budget
+		@param max_b (OUT) max budget matrix between taxa
+		@param taxa_order circular order		
+	*/
+	void calcMaxBudget(int budget, matrix(int) &max_b, vector<int> &taxa_order);
+
+	/**
+		construct optimal PD set from computed information for budget constraint (ROOTED case)
+		@param budget total budget
+		@param find_all TRUE if we want to find all PD sets
+		@param table computed information
+		@param dist distance matrix
+		@param taxa_set (OUT) sets of taxa with optimal PD
+		@param taxa_order circular order
+		@param max_b max budget matrix between taxa
+		@param root the root
+	*/
+	void constructPDBudget(int budget, bool find_all, matrix(double) &table, 
+		matrix(double) &dist,SplitSet &taxa_set, 
+		vector<int> &taxa_order, matrix(int) &max_b, int root);
+
+
+	/**
+		construct optimal PD set from computed information for budget constraint (ROOTED case)
+		@param budget total budget
+		@param max_v end taxon
+		@param pd_set the current constructed PD set
+		@param table computed information
+		@param dist distance matrix
+		@param taxa_set (OUT) sets of taxa with optimal PD
+		@param taxa_order circular order
+		@param max_b max budget matrix between taxa
+		@param root the root
+	*/
+	void constructPDBudget(int budget, int max_v, Split *pd_set, 
+		matrix(double) &table, matrix(double) &dist, SplitSet &taxa_set, 
+		vector<int> &taxa_order, matrix(int) &max_b, int root);
+
+	/**
+		compute the PD information table with budget
+		@param params program parameters
+		@param table (OUT) computed information
+		@param id (OUT) computed information
+		@param dist distance matrix
+		@param taxa_order circular order
+		@param max_b max budget matrix between taxa
+		@param root index of the root taxon
+	*/
+	void computePDBudgetInfo(Params &params, matrix(double) &table, matrix(int) &id, 
+		matrix(double) &dist, vector<int> &taxa_order, matrix(int) &max_b, int root);
+
+	/**
+		compute the PD score with budget
+		@param budget total budget
+		@param table computed information (only read here)
+		@param dist distance matrix
+		@param taxa_order circular order
+		@param max_b max budget matrix between taxa
+		@param root index of the root taxon
+	*/
+	double computePDBudgetScore(int budget, matrix(double) &table,
+		matrix(double) &dist, vector<int> &taxa_order, matrix(int) &max_b, int root);
+
+
+};
+
+/**
+	display the matrix into out
+*/
+template <class T>
+ostream &operator<<(ostream &out, matrix(T) &mat) {
+	// One output line per matrix row.
+	// Every row but the last one ends with " \\".
+	// Cells left of the diagonal (column < row) are printed blank as " &  ",
+	// the remaining cells print their value and are joined with " & ".
+	for (unsigned int row = 0; row < mat.size(); row++) {
+		for (unsigned int col = 0; col < mat[row].size(); col++) {
+			if (col < row) { out << " &  "; continue; }
+			out << mat[row][col];
+			if (col + 1 < mat[row].size()) out << " & ";
+		}
+		if (row + 1 < mat.size())
+			out << " \\\\";
+		out << endl;
+	}
+	return out;
+}
+
+
+
+#endif
diff --git a/ecopd.cpp b/ecopd.cpp
new file mode 100644
index 0000000..bca5f14
--- /dev/null
+++ b/ecopd.cpp
@@ -0,0 +1,1400 @@
+/*
+ * ecopd.cpp
+ *
+ *  Created on: Oct 30, 2013
+ *      Author: Olga
+ */
+
+#include <sstream>
+#include "ecopd.h"
+#include "split.h"
+#include "pdnetwork.h"
+#include "mtreeset.h"
+#include "ecopdmtreeset.h"
+#include "graph.h"
+
+
+// Construct from a user tree file; both arguments are forwarded unchanged to the MTree base constructor.
+ECOpd::ECOpd(const char *userTreeFile, bool &is_rooted) : MTree(userTreeFile,is_rooted) {}
+// Default constructor: only default-constructs the MTree base.
+ECOpd::ECOpd() :MTree(){}
+// Destructor with an empty body. NOTE(review): readDAG() stores new[]-allocated rows in DAG - confirm where they are released.
+ECOpd::~ECOpd(){}
+// Placeholder: the body is empty, params is not used.
+void ECOpd::initializeEcoPD(Params &params){
+}
+// Read the taxa to be included in the final optimal subset from file `infile` (delegates to the istream overload).
+void ECOpd::readInitialTaxa(const char *infile){
+	ifstream in;
+	// failbit is armed only around open(); while reading, only badbit raises ios::failure
+	try {
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(infile);
+		in.exceptions(ios::badbit);
+		readInitialTaxa(in);
+		in.close();
+	} catch (const char* str) {
+		outError(str);
+	} catch (const ios::failure &) { // caught by const reference: no copy and no slicing of the exception object
+		outError(ERR_READ_INPUT, infile);
+	}
+}
+void ECOpd::readInitialTaxa(istream &in){
+	// Append every whitespace-separated name found in `in` to initialTaxa.
+	// The extraction itself is the loop condition (instead of testing eof() before reading), so a
+	// failed final read never pushes a stale duplicate and nothing has to be removed afterwards.
+	// The previous clean-up, initialTaxa.erase(initialTaxa.end()), erased a non-dereferenceable
+	// iterator (undefined behaviour) and lost a valid last name when the input had no trailing whitespace.
+	string name;
+	while(in>>name)
+		initialTaxa.push_back(name);
+}
+// Tell whether taxon id `i` is listed in OUTtreeTaxa.
+// (Presumably the taxa that are not present in the tree - confirm against the code that fills OUTtreeTaxa.)
+bool ECOpd::OUT_tree(int i){
+	const int total=OUTtreeTaxa.size();
+	for(int pos=0;pos!=total;++pos){
+		if(OUTtreeTaxa[pos]==i)
+			return true;	// one match is enough
+	}
+	return false;
+}
+// Read the food web matrix (or the diet composition matrix when `weighted` is set) from file `infile`.
+void ECOpd::readDAG(const char* infile) {
+	ifstream in;
+	// tell the user which kind of matrix is about to be read
+	if(weighted)
+		cout<<"Reading Diet Composition matrix from file: "<<infile<<endl;
+	else
+		cout<<"Reading Food Web matrix from file: "<<infile<<endl;
+	try {
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(infile);
+		in.exceptions(ios::badbit);
+		readDAG(in);
+		in.close();
+	} catch (const char* str) {
+		outError(str);
+	} catch (const ios::failure &) { // caught by const reference: no copy and no slicing of the exception object
+		outError(ERR_READ_INPUT, infile);
+	}
+}
+// Parse a food web / diet composition matrix from `in`, clean it up, and fill DAG, taxaDAG and levelDAG from it.
+void ECOpd::readDAG(istream &in) {
+	int i=0,j=0;
+// Expected input: the number of species first, then one row per species: its name followed by the matrix entries.
+/* ---------------------------------------------------------------------------------------------------------
+ * Reading Diet Composition matrix from the input file (throws a const char* message on malformed input)
+ * ---------------------------------------------------------------------------------------------------------*/
+	if(!(in>>SpeciesNUM)) throw "The first line must contain the number of species in this Food Web!!";
+	string str_rest, speciesName;
+	getline(in, str_rest);
+	// For rooted trees one extra slot is reserved for the artificial "_root" species (undone at the very end).
+	if(rooted)
+		SpeciesNUM++;
+	// MM: temporary SpeciesNUM x SpeciesNUM matrix in food-web order; its rows are deleted after the copy into DAG.
+	vector<double*> MM;
+	for(i=0;i<SpeciesNUM;i++){
+		MM.push_back(new double [SpeciesNUM]);
+	}
+	// DAG: nvar x nvar matrix with nvar = max(TaxaNUM, SpeciesNUM), zero-filled here.
+	nvar = (TaxaNUM > SpeciesNUM) ? TaxaNUM : SpeciesNUM;
+	for(i=0;i<nvar;i++){
+		DAG.push_back(new double [nvar]);
+		for(j=0; j<nvar; j++){
+			DAG[i][j] = 0.0;
+		}
+	}
+	i = 0;
+	j = 0;
+	if(rooted){
+		while(i != SpeciesNUM-1){
+			if(!(in >> speciesName)) throw "Each line should start with a species name!";
+			dagNames.push_back(speciesName);
+			j = 0;
+			while(j != SpeciesNUM-1){
+				if(!(in >> MM[i][j])) throw "Could not read matrix entry! For each species make sure there are as many entries as the number of species specified in the file. Only square matrices are accepted.";
+				if(MM[i][j] < 0) throw "The Food Web matrix should not contain negative values.Use either 0, 1 or a positive number to indicate the portion of diet.";
+				j++;
+			}
+			MM[i][SpeciesNUM-1] = 0;
+			i++;
+		}
+		for(j=0; j<SpeciesNUM; j++)
+			MM[SpeciesNUM-1][j] = 0;
+		dagNames.push_back("_root");
+	} else {
+		while(i != SpeciesNUM){
+				if(!(in >> speciesName)) throw "Each line should start with a species name!";
+				dagNames.push_back(speciesName);
+				j = 0;
+				while(j != SpeciesNUM){
+					if(!(in >> MM[i][j])) throw "Could not read matrix entry! For each species make sure there are as many entries as the number of species specified in the file. Only square matrices are accepted.";
+					if(MM[i][j] < 0) throw "The Food Web matrix should not contain negative values.Use either 0, 1 or a positive number to indicate the portion of diet.";
+					j++;
+				}
+				i++;
+			}
+	}
+
+/* ---------------------------------------------------------------------------------------------------------
+ * Input data (echoed only when verbose_mode == VB_MAX)
+ * ---------------------------------------------------------------------------------------------------------*/
+	if(verbose_mode == VB_MAX){
+		cout<<endl<<"Food web is defined by the following matrix"<<endl;
+		for(i=0;i<SpeciesNUM;i++) {
+			cout<<dagNames[i]<<"\t";
+			for(j=0;j<SpeciesNUM;j++)
+				cout<<MM[i][j]<<"\t";
+			cout<<endl;
+		}
+		// Species in the food web and their ids
+		for(i=0; i<SpeciesNUM;i++)
+			cout<<"["<<i<<"] "<<dagNames[i]<<endl;
+	}
+/* ---------------------------------------------------------------------------------------------------------
+ * Processing the input data
+ * ---------------------------------------------------------------------------------------------------------*/
+//Ignoring cannibalism: non-zero diagonal entries are reported and then reset to 0 ----------------
+	int cannibals=0;
+	for(i=0;i<SpeciesNUM;i++)
+		if(MM[i][i]!=0){
+			cannibals++;
+			if(weighted){
+				if(cannibals == 1){
+					cout<<"------------------------------------------"<<endl;
+					cout<<"    Cannibal species         link weight  "<<endl;
+					cout<<"------------------------------------------"<<endl;
+				}
+				cout.width(30);
+				cout<<left<<dagNames[i];
+				cout<<" | "<<MM[i][i]<<endl;
+			}else{
+				if(cannibals == 1){
+					cout<<"-----------------------------"<<endl;
+					cout<<"       Cannibal species      "<<endl;
+					cout<<"-----------------------------"<<endl;
+				}
+				cout<<dagNames[i]<<endl;
+			}
+			MM[i][i]=0;
+		}
+	if(cannibals!=0){
+		cout<<endl<<"Deleted "<<cannibals;
+		if(cannibals == 1)
+			cout<<" cannibalistic link."<<endl;
+		else
+			cout<<" cannibalistic links."<<endl;
+	}
+
+//Check whether the graph is acyclic or not. NOTE(review): the error path below ends with exit(0), i.e. a success status.
+	Graph g(SpeciesNUM);
+	for(i=0; i<SpeciesNUM; i++)
+		for(j=0; j<SpeciesNUM; j++)
+			if(MM[i][j]>0)
+				g.addEdge(i,j);
+	if(g.isCyclic()){
+		if(cannibals != 0)
+			cout<<endl<<"ERROR: Even after deleting cannibalistic links, there are still some cycles present."<<endl;
+		else
+			cout<<endl<<"ERROR: ";
+		cout<<"Cyclic food webs are not supported. Delete the links which cause cycles and run again."<<endl;
+		cout<<"SOLUTION:"<<endl;
+		cout<<"Detect species in the cycle and choose one link to be deleted in order to break the cycle."<<endl;
+		cout<<"One possibility is to delete the link with least weight. This can be done by setting the corresponding value in the matrix to 0."<<endl;
+		exit(0);
+	}
+
+// The number of links: count of strictly positive entries of MM --------------------------------
+	linksNUM = 0;
+	for(i = 0; i<SpeciesNUM; i++)
+		for(j = 0; j<SpeciesNUM; j++)
+			if(MM[i][j]>0)
+				linksNUM++;
+
+//Rescaling the diet if necessary: a column whose sum is neither 0 nor exactly 1 is divided by its sum
+	if(weighted){
+		int dietReScaled = 0;
+		vector<double> colsum;
+		//cout<<"Food web is weighted."<<endl;
+		for(j=0;j<SpeciesNUM;j++){
+
+			colsum.push_back(0);
+			for(i=0;i<SpeciesNUM;i++)
+				colsum[j]=colsum[j]+MM[i][j];
+			if(colsum[j]!=1 && colsum[j]!=0){
+				dietReScaled++;
+				//cout<<"    WARNING: rescaled diet composition of species "<<j<<". Column sum = "<<colsum[j]<<endl;
+				for(i=0;i<SpeciesNUM;i++)
+					MM[i][j]=MM[i][j]/colsum[j];
+			}
+			colsum[j]=0;
+			//for(i=0;i<SpeciesNUM;i++)
+			//	colsum[j]=colsum[j]+MM[i][j];
+			//cout<<j<<"  Column sum = "<<colsum[j]<<endl;
+		}
+		cout<<"Rescaled diet composition of "<<dietReScaled<<" species."<<endl;
+	}else{
+		for(i=0; i<SpeciesNUM; i++)
+			for(j=0; j<SpeciesNUM; j++)
+				if( MM[i][j] > 0)
+					MM[i][j] = 1;
+		//cout<<"Since the -eco option was chosen, the entries of Food Web matrix will be converted to 0/1 [not prey / prey]. You can use -ecoW option to account for the Diet Composition."<<endl;
+	}
+
+// Technical: in case of rooted trees, we check which species are basal ones, i.e. whose column sum (check) is 0, and make them "feed on" the root: MM[SpeciesNUM-1][j] = 1
+	if(rooted){
+		vector<double> check;
+		for(j=0;j<SpeciesNUM-1;j++){
+			check.push_back(0);
+			for(i=0;i<SpeciesNUM-1;i++)
+				check[j]=check[j]+MM[i][j];
+			if(check[j]==0)
+				MM[SpeciesNUM-1][j]=1;
+		}
+	}
+
+//Detecting which species are not present in either FoodWeb or Tree/SplitNetwork-----------------
+	detectMissingSpecies();
+
+//Check whether all the species from initialTaxa set are actually present on Tree/SplitSys or in Food Web
+	checkInitialTaxa();
+
+// Synchronization of species in Tree/SplitSys and species in FoodWeb ---------------------------
+	synchronizeSpecies();
+	// Copy MM into DAG, translating food-web indices through phylo_order.
+	for(i=0; i<SpeciesNUM; i++){
+		for(j=0; j<SpeciesNUM; j++){
+			DAG[phylo_order[i]][phylo_order[j]]=MM[i][j];
+		}
+	}
+	// MM is no longer needed. NOTE(review): these rows leak if one of the throw statements above fires first.
+	for(i=SpeciesNUM-1;i>=0;i--)
+		delete[] MM[i];
+
+	if(verbose_mode == VB_MAX){
+	// Print info about synchronization
+		cout<<endl<<"Synchronization:"<<endl;
+		cout<<"PhyloInfo id | FoodWeb id, name"<<endl;
+		for(i=0; i<SpeciesNUM; i++){
+			cout<<"["<<phylo_order[i]<<"] | ["<<i<<"] "<<dagNames[i]<<endl;
+		}
+		cout<<"PhyloInfo id | name"<<endl;
+		for(i=0; i<TaxaNUM;i++){
+			cout<<"["<<i<<"] "<<findNodeID(i)->name<<endl;
+		}
+
+	 // Input data after processing: cannibalism, rescaling, reordering
+		cout<<endl<<"Food web is defined by the following matrix"<<endl;
+		for(i=0;i<nvar;i++) {
+			if(findFoodWebID(i) != -1)
+				cout<<dagNames[findFoodWebID(i)]<<"\t";
+			else
+				cout<<"\t\t";
+			for(j=0;j<nvar;j++)
+				cout<<DAG[i][j]<<"\t";
+			cout<<endl;
+		}
+	}
+/* ---------------------------------------------------------------------------------------------------------
+ * Filling out taxaDAG vector: node corresponds to taxa, neighbors to preys, length (node-neighbor) to weight
+ * ---------------------------------------------------------------------------------------------------------*/
+ 	vector<int> vec2;//vec2[j] is the working value of the level (height) of species j in the DAG
+	taxaDAG.resize(nvar,NULL);
+	for(j=0;j<nvar;j++){
+		taxaDAG[j] = newNode(j,j);
+		//cout<<"taxonDAG[j="<<j+1<<"]->id="<<taxaDAG[j]->id<<endl;
+	}
+	// Node j gets node i as a neighbor whenever DAG[i][j] > 0; the entry becomes the link length.
+	for(j=0;j<nvar;j++){
+		for(i=0;i<nvar;i++)
+			if(DAG[i][j]>0){
+				//cout<<"checking matrix"<<i<<j<<endl;
+				taxaDAG[j]->addNeighbor(taxaDAG[i], DAG[i][j], taxaDAG[i]->id);
+				//cout<<"neighbors[i="<<taxaDAG[j]->degree()-1<<"]->id="<<taxaDAG[j]->neighbors[taxaDAG[j]->degree()-1]->node->id<<endl;
+			}
+		//cout<<endl;
+	}
+
+/* ---------------------------------------------------------------------------------------------------------
+ * Defining levels in the Food Web based on the longest food chain of predators
+ * ---------------------------------------------------------------------------------------------------------*/
+	for(j=0;j<nvar;j++){
+		levelDAG.push_back(0);
+		if(taxaDAG[j]->degree()>0)
+			vec2.push_back(1);
+		else
+			vec2.push_back(0);
+//  		if(taxaDAG[j]->degree()>0){
+//  			cout<<"Children of taxonDAG[j="<<j<<"]->id="<<taxaDAG[j]->id<<":"<<endl;
+// 			for(i=0;i<taxaDAG[j]->degree();i++)
+// 				cout<<"taxaDAG["<<j<<"]->neighbors["<<i<<"]->node->id "<<taxaDAG[j]->neighbors[i]->node->id<<endl;
+// 				//cout<<"id of the child "<<i<<" node id "<<taxaDAG[j]->neighbors[i]->node->id+1<<" "<<endl;
+// 				//cout<<"           neighbors[i="<<i<<"]->id="<<taxaDAG[j]->neighbors[i]->node->id<<endl;
+// 			cout<<endl;
+//
+//  		}
+	}
+//	for(j=0;j<nvar;j++)
+//		cout<<j<<" "<<levelDAG[j]<<" "<<vec2[j]<<endl;
+	// Repeat until levelDAG equals vec2: every species with prey gets 1 + the largest vec2 value among its prey.
+	int eq=0,step=0;
+	//cout<<"Starting while..."<<endl;
+	while(eq!=1){
+		eq=1;
+		step++;
+// 		if(step==1 or step==2 or step==3)
+//		cout<<"-------STEP "<<step<<"-------"<<endl<<"j v1 v2"<<endl;
+		for(j=0;j<nvar;j++){
+			if(levelDAG[j]!=vec2[j])
+				eq=0;
+// 			if(step==1 or step==2 or step==3)
+//			cout<<j<<" "<<levelDAG[j]<<" "<<vec2[j]<<endl;
+			levelDAG[j]=vec2[j];
+		}
+		for(j=0;j<nvar;j++){
+			if(taxaDAG[j]->degree()>0){
+			//cout<<"taxaDAG["<<j<<"]->neighbors[0]->node->id "<<taxaDAG[j]->neighbors[0]->node->id<<endl;
+			vec2[j]=vec2[taxaDAG[j]->neighbors[0]->node->id]+1;
+			for(i=1;i<taxaDAG[j]->degree();i++)
+				if(vec2[taxaDAG[j]->neighbors[i]->node->id]>=vec2[j])
+					vec2[j]=vec2[taxaDAG[j]->neighbors[i]->node->id]+1;
+			}
+		}
+	}
+
+// For each predator the level corresponds to its longest food chain----------------------------
+	if(verbose_mode == VB_MAX){
+		cout<<"For each species its longest chain according to a food web"<<endl;
+		for(j=0;j<nvar;j++)
+			//if(findFoodWebID(j) != -1)
+			//	cout<<dagNames[findFoodWebID(j)]<<"\t| "<<levelDAG[j]<<endl;
+			//else
+				cout<<*names[j]<<"\t| "<<levelDAG[j]<<endl;
+	}
+ 	//cout<<"Species - level"<<endl;
+	//ofstream levelF;
+	//levelF.open("Level",ios::app);
+ 	//for(j=0;j<SpeciesNUM;j++)
+ 	//	levelF<<j+1<<" "<<levelDAG[j]<<endl;
+	// 	for(i=0;i<tree.leafNum;i++)
+	// 	myfile<<"taxon id: "<<taxaTree[i]->id<<" | taxon name: "<<taxaTree[i]->name<<endl;
+	// 	myfile<<"root  id: "<<root->id<<" | root  name: "<<root->name<<endl;
+	// // myfile.close();
+
+// The maximum level is the longest food chain of the food web ---------------------------------
+//	 int maxlevel;
+//	 maxlevel=0;
+//	 for(i=0;i<SpeciesNUM;i++)
+//		if(maxlevel<levelDAG[i])
+//			maxlevel=levelDAG[i];
+
+// Decrease SpeciesNUM since you do not need to include the root to the Species anymore---------
+	if(rooted)
+		SpeciesNUM--;
+}
+
+/* =========================================================================================================
+ *	ROOTED TREES: write the integer program for `tree` to fileOUT (Maximize / Subject To / Bounds / Generals / End)
+ * =========================================================================================================*/
+void ECOpd::printECOlpRooted(const char* fileOUT,ECOpd &tree){
+	ofstream out;
+	out.exceptions(ios::failbit | ios::badbit);
+	out.open(fileOUT);
+	int m,i,j;
+	//int step=0,step_all=0;
+	//cout<<"# of species to conserve:"<<nspecies<<endl;
+	int nspecies=k;
+	nspecies++; //you have to include also one place for the root
+//----------------------------------------------------------------------------------------------------------------
+// Dealing with d levels
+//----------------------------------------------------------------------------------------------------------------
+// max d level-------------------------------------------------
+	int maxlevel;
+	maxlevel=levelDAG[0]; //i=0;
+// 	cout<<"DAG levels:"<<endl;
+// 	cout<<"LevelDAG[0]:"<<levelDAG[0]<<endl;
+ 	for(i=1;i<nvar;i++){
+//  		//cout<<"LevelDAG["<<i<<"]:"<<levelDAG[i]<<endl;
+ 		if(maxlevel<levelDAG[i])
+ 			maxlevel=levelDAG[i];
+	}
+	//NOTE(review): maxlevel is not read by any live code further down in this function.
+//# of species at each d level---------------------------------
+// 	int hit=0;
+// 	hvec.resize(maxlevel+1,0);
+// 	while(hit<=maxlevel){
+// 		for(i=0;i<nvar;i++)
+// 			if(levelDAG[i]==hit) hvec[hit]++;
+// 		hit++;
+// 	}
+	/*cout<<"# of species at each level"<<endl;
+	for(i=0;i<hvec.size();i++)
+		cout<<hvec[i]<<" ";
+	cout<<endl;
+	*/
+
+ /****************************************************************************************************************
+  * Integer Programming formulation
+  ****************************************************************************************************************/
+//----------------------------------------------------------------------------------------------------------------
+// Printing objective function: sum over branches of (branch length * y<i>); branch i gets id i on both neighbor records
+//----------------------------------------------------------------------------------------------------------------
+	out<<"Maximize"<<endl;
+	tree.getBranchOrdered(nodes1,nodes2);
+	for(i=0;i<tree.branchNum;i++){
+		nodes1[i]->findNeighbor(nodes2[i])->id=i;
+		nodes2[i]->findNeighbor(nodes1[i])->id=i;
+		if(i<tree.branchNum-1)
+			out<<nodes1[i]->findNeighbor(nodes2[i])->length<<" "<<"y"<<i<<" + ";
+		else
+			out<<nodes1[i]->findNeighbor(nodes2[i])->length<<" "<<"y"<<i<<endl;
+		}
+//----------------------------------------------------------------------------------------------------------------
+// Printing constraints
+//----------------------------------------------------------------------------------------------------------------
+	out<<"Subject To"<<endl;
+//----------------------------------------------------------------------------------------------------------------
+// 1. constraint: species present in the set
+	if(initialTaxa.size()!=0)
+		for(m=0;m<initialTaxa.size();m++)
+			out<<"x"<<findSpeciesIDname(&initialTaxa[m])<<" = 1"<<endl;
+//----------------------------------------------------------------------------------------------------------------
+// 2. constraint: the sum of all species is <= nspecies, i.e. k plus the one place kept for the root
+	for(i=0;i<nvar-1;i++)
+		out<<"x"<<i<<" + ";
+	out<<"x"<<nvar-1<<" <= "<<nspecies<<endl;
+
+//----------------------------------------------------------------------------------------------------------------
+// 3. constraint: the sum of leaves in the DAG is >= to 1
+// 	SpeciesNUM++;
+// 	int nleafDAG=0,nleaf=0;
+// 	for(i=0;i<SpeciesNUM;i++)
+// 		if(levelDAG[i]==0)
+// 			nleafDAG++;
+// 		for(j=0;j<SpeciesNUM;j++){
+// 			if(taxaDAG[j]->degree()==0){
+// 				nleaf++;
+// 			if(nleaf<nleafDAG)
+// 				out<<"x"<<taxaDAG[j]->id<<" + ";
+// 			else
+// 				out<<"x"<<taxaDAG[j]->id<<" >= 1"<<endl;
+// 			}
+// 		}
+
+//----------------------------------------------------------------------------------------------------------------
+// 4. constraints: SURVIVAL CONSTRAINT
+	if(weighted){
+		//weighted food web: sum of weights is greater than a given threshold--------------------------------
+		for(j=0;j<nvar;j++)
+			if(taxaDAG[j]->degree()>0){//the ones that have children in the DAG
+				for(i=0;i<taxaDAG[j]->degree();i++){
+					if(i<taxaDAG[j]->degree()-1){
+						out<<taxaDAG[j]->neighbors[i]->length<<" x"<<taxaDAG[j]->neighbors[i]->node->id<<" + ";
+					} else {
+						out<<taxaDAG[j]->neighbors[i]->length<<" x"<<taxaDAG[j]->neighbors[i]->node->id<<" - "<<T<<" x"<<taxaDAG[j]->id<<" >= 0"<<endl;
+					}
+				}
+			}
+	}else{
+		//for each predator the sum of children in the DAG is >= to its value-------------------------------
+		for(j=0;j<nvar;j++)
+			if(taxaDAG[j]->degree()>0){//the ones that have children in the DAG
+				for(i=0;i<taxaDAG[j]->degree();i++){
+					if(i<taxaDAG[j]->degree()-1){
+						out<<"x"<<taxaDAG[j]->neighbors[i]->node->id<<" + ";
+					} else {
+						out<<"x"<<taxaDAG[j]->neighbors[i]->node->id<<" - x"<<taxaDAG[j]->id<<" >= 0"<<endl;
+					}
+				}
+			}
+	}
+//----------------------------------------------------------------------------------------------------------------
+// 5. constraints for edges in the PhyloTree
+	//NOTE(review): the leaf test below uses the unqualified `root` (presumably this->root) while the branches come from `tree` - confirm both refer to the same tree.
+	vector<int> taxaBelow;
+	for(i=0;i<tree.branchNum;i++)
+		//constraints: SUM{Xv in T(e)}(Xv)>=Ye -----------------------------------------------
+		if((nodes1[i]->isLeaf()) && (nodes1[i]!=root))
+			out<<"x"<<nodes1[i]->id<<" - y"<<nodes1[i]->findNeighbor(nodes2[i])->id<<" >= 0"<<endl;
+		else {
+			tree.getTaxaID(taxaBelow,nodes2[i],nodes1[i]);
+			for(j=0;j<taxaBelow.size();j++)
+				if(j<taxaBelow.size()-1)
+					out<<"x"<<taxaBelow[j]<<" + ";
+				else
+					out<<"x"<<taxaBelow[j];
+			taxaBelow.clear();
+			out<<" - y"<<nodes1[i]->findNeighbor(nodes2[i])->id<<" >= 0"<<endl;
+		}
+//----------------------------------------------------------------------------------------------------------------
+// Printing bounds for variables: every x and y is limited to [0,1]
+//----------------------------------------------------------------------------------------------------------------
+	out<<"Bounds"<<endl;
+	for(j=0;j<nvar;j++)
+			out<<"0 <= x"<<taxaDAG[j]->id<<" <= 1"<<endl;
+	for(i=0;i<tree.branchNum;i++)
+		out<<"0 <= y"<<i<<" <= 1"<<endl;
+//----------------------------------------------------------------------------------------------------------------
+// Printing variables (For IP model): all x and y are listed under "Generals"
+//----------------------------------------------------------------------------------------------------------------
+	out<<"Generals"<<endl;
+	for(j=0;j<nvar;j++)
+		out<<"x"<<taxaDAG[j]->id<<" ";
+	for(i=0;i<tree.branchNum;i++)
+		out<<"y"<<i<<" ";
+//----------------------------------------------------------------------------------------------------------------
+	out<<endl<<"End"<<endl;
+	out.close();
+}
+
+/* =========================================================================================================
+ *	UNROOTED TREES and d-levels: write the integer program for `tree` to fileOUT (one model per call, see below)
+ * =========================================================================================================*/
+void ECOpd::printECOlpUnrooted(const char* fileOUT,ECOpd &tree){
+	ofstream myfile;
+	string str_out = fileOUT;
+	string str_out_1,str_out_2;
+	//NOTE(review): no exceptions() mask is set on myfile in this function, so a failed open() below is not reported.
+
+	int i,m,j,step=0,step_all=0;
+	int nspecies=k;
+//---------------------------------------------Dealing with d levels---------------------------------------------
+	{
+//--------------------------------------------------max d level--------------------------------------------------
+	int maxlevel;
+	maxlevel=levelDAG[0];
+// 	cout<<"DAG levels:"<<endl;
+// 	cout<<"LevelDAG[0]:"<<levelDAG[0]<<endl;
+	for(i=1;i<TaxaNUM;i++){
+// 		cout<<"LevelDAG["<<i<<"]:"<<levelDAG[i]<<endl;
+		if(maxlevel<levelDAG[i])
+			maxlevel=levelDAG[i];
+	}
+// 	cout<<"max DAG level:"<<maxlevel+1<<endl;
+//-----------------------------------------------------end-------------------------------------------------------
+
+
+//---------------------------------------generating first vector of d levels-------------------------------------
+	generateFirstMultinorm(dvec, nspecies-1, maxlevel+1); //nspecies-1 - we are saving at least 1 place at d[0] level
+							      //maxlevel+1 - as we start counting levels from 0 one
+// 	cout<<"vector of d levels:"<<endl;
+// 	dvec[0]++;
+// 	for(i=0;i<dvec.size();i++)
+// 		cout<<dvec[i]<<" ";
+// 	cout<<endl;
+// 	dvec[0]--;
+//-----------------------------------------------------end-------------------------------------------------------
+
+
+//------------------------------------------# of species at each d level-----------------------------------------
+	int hit=0;
+	hvec.resize(maxlevel+1,0);
+	while(hit<=maxlevel){
+		for(i=0;i<TaxaNUM;i++)
+			if(levelDAG[i]==hit) hvec[hit]++;
+		hit++;
+	}
+	/*cout<<"# of species at each level"<<endl;
+	for(i=0;i<hvec.size();i++)
+		cout<<hvec[i]<<" ";
+	cout<<endl;
+	*/
+//-----------------------------------------------------end-------------------------------------------------------
+	}
+//-----------------------------------------END:Dealing with d levels---------------------------------------------
+
+	int DAGlevels=1,check_print=1;
+
+//--------------------------------------Printing all cases for different d levels--------------------------------
+//NOTE: DAGlevels is forced to 0 at the bottom of the loop body, so this loop currently runs exactly once.
+	while(DAGlevels==1) {
+		step_all++;
+		//cout<<endl<<"STEP ALL:"<<step_all<<endl;
+// 		cout<<"vector of d levels:"<<endl;
+ 		dvec[0]++;
+// 		for(i=0;i<dvec.size();i++)
+// 			cout<<dvec[i]<<" ";
+// 		cout<<endl;
+
+//--------------------------------------CHECKPOINT:is vector d good or we should ignore it?----------------------
+{
+//Print only if d[i] is <= than the # of species on this level, otherwise it's a waste of places for conservation
+		check_print=1;
+		for(i=0;i<hvec.size();i++){
+			//cout<<"dvec["<<i<<"]="<<dvec[i]<<"   hvec["<<i<<"]="<<hvec[i]<<endl;
+			if(dvec[i]>hvec[i])
+				check_print=0;
+		}
+		//cout<<"CHECKPOINT="<<check_print<<endl<<endl;
+
+		check_print=1;//IGNORE checkpoint: when only one model for each run is needed, used together with DAGlevels=0; (below)
+
+		if(check_print==1){
+			step++;
+			//cout<<"Vector d is SUITABLE, step="<<step<<endl;
+			str_out_1 = convertIntToString(step);
+			str_out_2 = str_out  + str_out_1 + ".lp";
+			//myfile.open(str_out_2.c_str());
+
+			//The numbered "<fileOUT><step>.lp" name built above is overwritten: the model goes to fileOUT itself.
+			str_out_2 = str_out;
+			myfile.open(str_out_2.c_str());
+
+/**----------------------------------------------Printing objective function---------------------------------------*/
+	{
+
+			myfile<<"Maximize"<<endl;
+			tree.getBranchOrdered(nodes1,nodes2);
+	{
+			for(i=0;i<tree.branchNum;i++){
+				nodes1[i]->findNeighbor(nodes2[i])->id=i;
+				nodes2[i]->findNeighbor(nodes1[i])->id=i;
+				if(i<tree.branchNum-1)
+					myfile<<nodes1[i]->findNeighbor(nodes2[i])->length<<" "<<"y"<<i<<" + ";
+				else
+					myfile<<nodes1[i]->findNeighbor(nodes2[i])->length<<" "<<"y"<<i<<endl;
+			}
+	}
+			//IDEA: objective**********************************************************************
+	{
+// 			double lambda_sum=0;
+// 			for(i=0;i<tree.branchNum;i++){
+// 				nodes1[i]->findNeighbor(nodes2[i])->id=i;
+// 				nodes2[i]->findNeighbor(nodes1[i])->id=i;
+// 				lambda_sum = lambda_sum + nodes1[i]->findNeighbor(nodes2[i])->length;
+// 				if(i<tree.branchNum-1)
+// 					myfile<<nodes1[i]->findNeighbor(nodes2[i])->length<<" "<<"y"<<i<<" + ";
+// 				else
+// 					myfile<<nodes1[i]->findNeighbor(nodes2[i])->length<<" "<<"y"<<i;
+// 			}
+//
+// 			for(j=0;j<tree.leafNum;j++)
+// 				if(taxaDAG[j]->degree()>0)
+// 					myfile<<" - "<<lambda_sum<<" z"<<j;
+// 			myfile<<endl;
+	}
+			//*************************************************************************************
+}
+
+/**--------------------------------------------------Printing constraints------------------------------------------*/
+	{
+			myfile<<"Subject To"<<endl;
+			int c=0;
+/**species present in the set-----------------------------------------------*/
+	if(initialTaxa.size()!=0)
+		for(m=0;m<initialTaxa.size();m++)
+			myfile<<"x"<<findSpeciesIDname(&initialTaxa[m])<<" = 1"<<endl;
+/**the sum of all species is <= k (taxaTree is fetched below but not used afterwards)----------------*/
+	{
+			NodeVector taxaTree;
+			tree.getTaxa(taxaTree);
+			myfile<<"c"<<c<<": ";
+			c++;
+
+			for(i=0;i<nvar-1;i++)
+				myfile<<"x"<<i<<" + ";
+			myfile<<"x"<<nvar-1<<" <= "<<nspecies<<endl;
+	}
+/**the sum of leaves in the DAG is >= to 1----------------------------------------------------------*/
+	{ //it might be incorrect, so check it out before using...
+// 			myfile<<"c"<<c<<": ";
+// 			c++;
+// 			int nleafDAG=0,nleaf=0;
+// 			for(i=0;i<TaxaNUM;i++)
+// 				if(levelDAG[i]==0)
+// 					nleafDAG++;
+// 			for(j=0;j<TaxaNUM;j++){
+// 				if(taxaDAG[j]->degree()==0){
+// 				nleaf++;
+// 				if(nleaf<nleafDAG)
+// 					myfile<<"x"<<taxaDAG[j]->id<<" + ";
+// 				else
+// 					myfile<<"x"<<taxaDAG[j]->id<<" >= 1"<<endl;
+// 				}
+// 			}
+	}
+
+//constraints: SURVIVAL CONSTRAINT
+//NOTE(review): the weighted branch loops over nvar and prints no "c<n>:" labels; the unweighted one loops over TaxaNUM and labels each row - confirm this is intended.
+if(weighted){//weighted food web: sum of weights is greater than a given threshold--------------------------------
+
+	for(j=0;j<nvar;j++){
+		if(taxaDAG[j]->degree()>0){//the ones that have children in the DAG
+			for(i=0;i<taxaDAG[j]->degree();i++)
+				if(i<taxaDAG[j]->degree()-1)
+					myfile<<taxaDAG[j]->neighbors[i]->length<<" x"<<taxaDAG[j]->neighbors[i]->node->id<<" + ";
+				else
+					myfile<<taxaDAG[j]->neighbors[i]->length<<" x"<<taxaDAG[j]->neighbors[i]->node->id<<" - "<<T<<" x"<<taxaDAG[j]->id<<" >= 0"<<endl;
+			}
+
+	}
+}
+else {//for each predator the sum of children in the DAG is >= to its value-----------------------------
+			for(j=0;j<TaxaNUM;j++){
+				if(taxaDAG[j]->degree()>0){//the ones that have children in the DAG
+					myfile<<"c"<<c<<": ";
+					c++;
+					for(i=0;i<taxaDAG[j]->degree();i++)
+						if(i<taxaDAG[j]->degree()-1)
+							myfile<<"x"<<taxaDAG[j]->neighbors[i]->node->id<<" + ";
+						else
+							myfile<<"x"<<taxaDAG[j]->neighbors[i]->node->id<<" - x"<<taxaDAG[j]->id<<" >= 0"<<endl;
+				}
+			}
+
+
+}
+			//IDEA:new variables Z'tas*************************************************************
+	{
+
+// 			for(j=0;j<tree.leafNum;j++){
+// 				if(taxaDAG[j]->degree()>0){
+// 					myfile<<"c"<<c<<": ";
+// 					c++;
+// 					myfile<<"z"<<j<<" - x"<<j<<" >= 0"<<endl;
+// 					for(i=0;i<taxaDAG[j]->degree();i++){
+// 						myfile<<"c"<<c<<": ";
+// 						c++;
+// 						myfile<<"z"<<j<<" - x"<<taxaDAG[j]->neighbors[i]->node->id<<" >= 0"<<endl;
+// 					}
+// 				}
+// 			}
+
+}
+			//*************************************************************************************
+
+//d levels----------------------------------------------------------------------------------------
+	{
+
+
+// 			int hit=0;
+// 			int h=0;
+// 			int maxlevel=dvec.size()-1;
+// 			while(hit<=maxlevel){
+// 				myfile<<"c"<<c<<": ";
+// 				c++;
+// 				h=0;
+// 				for(i=0;i<SpeciesNUM;i++)
+// 					if(levelDAG[i]==hit){
+// 						h++;
+// 						if(h<hvec[hit])
+// 							myfile<<"x"<<i<<" + ";
+// 						else {
+// 							//myfile<<"x"<<i<<" = "<<dvec[hit];
+// 							myfile<<"x"<<i<<" - d"<<hit<<" = 0";
+//  							if(step==1)
+//  								cout<<"dvec["<<hit<<"]="<<dvec[hit]<<endl;
+// 						}
+// 					}
+// 				myfile<<endl;
+// 				hit++;
+// 			}
+// 			myfile<<"c"<<c<<": ";
+// 			c++;
+// 			for(i=0;i<=maxlevel;i++){
+// 				if(i<maxlevel)
+// 					myfile<<"d"<<i<<" + ";
+// 				else
+// 					myfile<<"d"<<i<<" = "<<nspecies<<endl;
+// 			}
+
+
+}
+
+//for edges in the PhyloTree: two constraints per branch, one for the taxa on each side of it----------
+	{
+
+			vector<int> taxaBelow;
+			for(i=0;i<tree.branchNum;i++){
+
+		//constraints: SUM{Xv in T(e)}(Xv)>=Ye -----------------------------------------------
+				myfile<<"c"<<c<<": ";
+				c++;
+				tree.getTaxaID(taxaBelow,nodes2[i],nodes1[i]);
+				for(j=0;j<taxaBelow.size();j++)
+					if(j<taxaBelow.size()-1)
+						myfile<<"x"<<taxaBelow[j]<<" + ";
+					else
+						myfile<<"x"<<taxaBelow[j];
+				taxaBelow.clear();
+				myfile<<" - y"<<nodes1[i]->findNeighbor(nodes2[i])->id<<" >= 0"<<endl;
+		//constraints: SUM{Xv not in T(e)}(Xv)>=Ye -------------------------------------------
+				myfile<<"c"<<c<<": ";
+				c++;
+				tree.getTaxaID(taxaBelow,nodes1[i],nodes2[i]);
+				for(j=0;j<taxaBelow.size();j++)
+					if(j<taxaBelow.size()-1)
+						myfile<<"x"<<taxaBelow[j]<<" + ";
+					else
+						myfile<<"x"<<taxaBelow[j];
+				taxaBelow.clear();
+				myfile<<" - y"<<nodes1[i]->findNeighbor(nodes2[i])->id<<" >= 0"<<endl;
+			}
+
+	}//end printing constraints for edges (PhyloTree)
+
+
+
+}//-------------------------------------END printing constraints ALL--------------------------------------
+	//int maxlevel=dvec.size()-1;
+
+
+//---------------------------------------------Printing bounds for variables--------------------------------------
+	{
+			myfile<<"Bounds"<<endl;
+			for(j=0;j<nvar;j++)
+				myfile<<"0 <= x"<<taxaDAG[j]->id<<" <= 1"<<endl;
+			for(i=0;i<tree.branchNum;i++)
+				myfile<<"0 <= y"<<i<<" <= 1"<<endl;
+	}
+//------------------------------------------Printing variables (For IP model)---------------------------------
+	{
+ 			myfile<<"Generals"<<endl;
+			for(j=0;j<nvar;j++)
+				myfile<<"x"<<taxaDAG[j]->id<<" ";
+			for(i=0;i<tree.branchNum;i++)
+				myfile<<"y"<<i<<" ";
+
+//			myfile<<"Generals"<<endl;
+// 			int maxlevel=dvec.size()-1;
+// 			for(j=0;j<=maxlevel;j++)
+// 				myfile<<"d"<<j<<" ";
+// 			if(fractVAR.size()!=0)
+// 				for(i=0;i<fractVAR.size();i++)
+// 					myfile<<fractVAR[i]<<" ";
+			myfile<<endl;
+	}
+ 			myfile<<"End"<<endl;
+			myfile.close();
+		}//checkpoint "if" ends here
+}
+//----------------------------------------------CHECKPOINT ENDs here----------------------------------------------
+
+
+//---------------------------------------generating next vector of d levels--------------------------------------
+		dvec[0]--;
+		if(generateNextMultinorm(dvec))
+			DAGlevels=1;
+		else
+			DAGlevels=0;
+//-----------------------------------------------------end-------------------------------------------------------
+
+		DAGlevels=0;//IGNORE d levels: only one model to solve for each run
+	}
+//----------------------------------END:Printing all cases for different d levels--------------------------------
+
+//cout<<"ALL STEPS:"<<step_all<<endl;
+//cout<<"STEPs:"<<step<<endl;
+
+}
+
+
+
+
+/* =========================================================================================================
+ * SPLIT systems
+ * =========================================================================================================*/
+/**
+ * Append the ecological constraints, the variable bounds and the "Generals"
+ * section of the IP model for a split system to an LP file.
+ *
+ * The file is opened in append mode, so whatever was written to it before
+ * (presumably the objective and the split constraints - confirm with the
+ * caller) is kept. Stream exceptions are enabled for failbit and badbit.
+ *
+ * @param fileOUT  path of the LP file to append to
+ * @param splitsys split network; supplies the number of splits and is asked
+ *                 to emit the variable bounds through lpVariableBound()
+ * @param params   program parameters, forwarded to lpVariableBound()
+ *
+ * Side effect: hvec is resized to maxlevel+1 and hvec[h] is incremented once
+ * for every taxon whose levelDAG value is h.
+ *
+ * NOTE(review): hvec.resize() keeps existing entries, so calling this twice on
+ * the same object would add the counts up - confirm it is called only once.
+ * NOTE(review): the leaf constraint counts leaves via levelDAG[i]==0 but emits
+ * terms via taxaDAG[j]->degree()==0; if the two counts ever differ, the
+ * ">= 1" terminator is never written - verify that both criteria agree.
+ */
+void ECOpd::printInfDAG (const char* fileOUT,PDNetwork &splitsys, Params &params) {
+	ofstream out;
+	out.exceptions(ios::failbit | ios::badbit);
+	out.open(fileOUT,ios::app);
+	int i,j,nspecies=k;
+	// Largest level found among the first TaxaNUM entries of levelDAG
+	int maxlevel;
+	maxlevel=levelDAG[0];
+	for(i=1;i<TaxaNUM;i++)
+		if(maxlevel<levelDAG[i])
+			maxlevel=levelDAG[i];
+	// hvec[h] = number of taxa sitting at level h
+	int hit=0;
+	hvec.resize(maxlevel+1,0);
+	while(hit<=maxlevel){
+		for(i=0;i<TaxaNUM;i++)
+			if(levelDAG[i]==hit) hvec[hit]++;
+		hit++;
+	}
+//Constraints----------------------------------------------------------------------
+	//species present in the set-----------------------------------------------
+	// every taxon of the initial set is forced into the solution (x = 1)
+	if(initialTaxa.size()!=0)
+		for(i=0;i<initialTaxa.size();i++)
+			out<<"x"<<findSpeciesIDname(&initialTaxa[i])<<" = 1"<<endl;
+	//the sum of all species is <= k-------------------------------------------
+			for(i=0;i<nvar-1;i++)
+				out<<"x"<<i<<" + ";
+			out<<"x"<<nvar-1<<" <= "<<nspecies<<endl;
+	//the sum of leaves in the DAG is >= to 1----------------------------------
+			int nleafDAG=0,nleaf=0;
+			for(i=0;i<nvar;i++)
+				if(levelDAG[i]==0)
+					nleafDAG++;
+			// the last leaf term closes the constraint with ">= 1"
+			for(j=0;j<nvar;j++){
+				if(taxaDAG[j]->degree()==0){
+				nleaf++;
+				if(nleaf<nleafDAG)
+					out<<"x"<<taxaDAG[j]->id<<" + ";
+				else
+					out<<"x"<<taxaDAG[j]->id<<" >= 1"<<endl;
+				}
+			}
+	//SURVIVAL CONSTRAINT
+	if(weighted){
+		//constraint: Weighted food web. Sum of weights is greater than a given threshold--------------------------------
+		// per node with neighbors: sum(length * x_neighbor) - T * x_node >= 0
+		for(j=0;j<nvar;j++){
+			if(taxaDAG[j]->degree()>0){//the ones that have children in the DAG
+				for(i=0;i<taxaDAG[j]->degree();i++)
+					if(i<taxaDAG[j]->degree()-1)
+						out<<taxaDAG[j]->neighbors[i]->length<<" x"<<taxaDAG[j]->neighbors[i]->node->id<<" + ";
+					else
+						out<<taxaDAG[j]->neighbors[i]->length<<" x"<<taxaDAG[j]->neighbors[i]->node->id<<" - "<<T<<" x"<<taxaDAG[j]->id<<" >= 0"<<endl;
+				}
+		}
+	} else {
+		//for each predator the sum of children in the DAG is >= to its value-------
+		// per node with neighbors: sum(x_neighbor) - x_node >= 0
+		for(j=0;j<nvar;j++)
+			if(taxaDAG[j]->degree()>0){//the ones that have children in the DAG
+				for(i=0;i<taxaDAG[j]->degree();i++){
+					if(i<taxaDAG[j]->degree()-1){
+						out<<"x"<<taxaDAG[j]->neighbors[i]->node->id<<" + ";
+					} else {
+						out<<"x"<<taxaDAG[j]->neighbors[i]->node->id<<" - x"<<taxaDAG[j]->id<<" >= 0"<<endl;
+					}
+				}
+			}
+	}
+
+//Bounds-----------------------------------------------------------------------------
+	// included_tax stays empty and every y_value is -1 because the code that
+	// used to fill them is commented out; both are handed to lpVariableBound()
+	Split included_tax(TaxaNUM);
+	IntVector::iterator it2;
+// 	for (it2 = splitsys.initialset.begin(); it2 != splitsys.initialset.end(); it2++)
+// 		included_tax.addTaxon(*it2);
+	vector<int> y_value;
+	y_value.resize(splitsys.getNSplits(), -1);
+	//splitsys.checkYValue(nspecies, y_value);
+	splitsys.lpVariableBound(out, params, included_tax, y_value);
+
+//Generals for IP or MIP--------------------------------------------------------------
+			// all x (species) and y (split) variables are listed under "Generals"
+			out<<"Generals"<<endl;
+ 			for(i=0;i<nvar;i++)
+ 				out<<"x"<<i<<" ";
+ 			for(i=0;i<splitsys.getNSplits();i++)
+ 				out<<"y"<<i<<" ";
+// 			for(j=0;j<=maxlevel;j++)
+// 				out<<"d"<<j<<" ";
+/*			if(fractVAR.size()!=0)
+				for(i=0;i<fractVAR.size();i++)
+					out<<fractVAR[i]<<" ";*/
+			out<<endl;
+			out<<"End"<<endl;
+
+	out.close();
+
+//	ofstream out1;
+//	out1.exceptions(ios::failbit | ios::badbit);
+//	out1.open("variablesNUM.data",ios::app);
+//	out1<<this->k<<" "<<nvar<<" "<<splitsys.getNSplits()<<endl;//" "<<maxlevel+1<<endl;
+//	out1.close();
+}
+
+//Fractional stuff-----------------------------------------------------------------
+/**
+ * Open the file with fractional variables and pass it to readREC(istream&).
+ *
+ * failbit is only armed while opening the file, so a missing file is reported,
+ * but the normal end-of-file hit by getline() while reading is not.
+ *
+ * @param infile path of the file to read
+ *
+ * Errors are reported through outError(): string exceptions with their own
+ * text, stream failures with ERR_READ_INPUT and the file name.
+ */
+void ECOpd::readREC(const char* infile) {
+	ifstream in;
+	cout<<endl<<"-----------------------------------------------------"<<endl;
+	cout<<"Reading file with fractional variables from "<<infile<<endl;
+	try {
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(infile);
+		in.exceptions(ios::badbit);
+		readREC(in);
+		in.close();
+	} catch (const char* str) {
+		outError(str);
+	} catch (const ios::failure &) {
+		// caught by reference: catching by value copies and slices the exception
+		outError(ERR_READ_INPUT, infile);
+	}
+}
+
+/**
+ * Read variable names from a stream, one per line.
+ *
+ * Only the text in front of the first ':' of each line is kept; it is appended
+ * to fractVAR. Afterwards the complete content of fractVAR is echoed to stdout.
+ *
+ * @param in stream to read from until getline() fails
+ */
+void ECOpd::readREC(istream &in) {
+	string line, name;
+	while (getline(in, line)) {
+		stringstream fields(line);
+		getline(fields, name, ':');
+		fractVAR.push_back(name);
+	}
+	// echo everything collected so far
+	for (vector<string>::const_iterator var = fractVAR.begin(); var != fractVAR.end(); ++var) {
+		cout << *var << endl;
+	}
+}
+
+//Generating all d vectors ---------------------------------------------------------
+/**
+ * Initialise x with the first vector of the enumeration: k entries, all zero
+ * except the last one, which holds n.
+ *
+ * @param x vector to (re)initialise; previous content is discarded
+ * @param n value stored in the last entry
+ * @param k number of entries; for k <= 0 the vector is left empty
+ */
+void ECOpd::generateFirstMultinorm(vector<int> &x, int n, int k) {
+     // back() on an empty vector is undefined, so there is nothing to set for k <= 0
+     if (k <= 0) {
+          x.clear();
+          return;
+     }
+     // assign() rather than resize(): resize() would keep stale entries of a reused vector
+     x.assign(k, 0);
+     x.back() = n;
+}
+
+/**
+ * Advance x to the next vector of the enumeration started by
+ * generateFirstMultinorm().
+ *
+ * The right-most non-zero entry gives one unit to its left neighbour; what is
+ * left of that entry is moved to the last position.
+ *
+ * @param x current vector, modified in place
+ * @return false if x has fewer than two entries or if no non-zero entry exists
+ *         to the right of position 0 (x is then left unchanged), true otherwise
+ */
+bool ECOpd::generateNextMultinorm(vector<int> &x) {
+     const int last = (int) x.size() - 1;
+     if (last < 1) {
+          return false;
+     }
+     // right-most non-zero position
+     int pos = last;
+     while (pos >= 0 && x[pos] == 0) {
+          --pos;
+     }
+     if (pos <= 0) {
+          return false;
+     }
+     const int rest = x[pos] - 1;
+     ++x[pos-1];
+     if (pos != last) {
+          x[pos] = 0;
+     }
+     x[last] = rest;
+     return true;
+}
+
+/**
+ * Collect the two end nodes of every branch by a depth-first walk.
+ *
+ * For each neighbor visited by FOR_NEIGHBOR_IT the current node is appended to
+ * nodes and the neighbor's node to nodes2, so equal positions in both vectors
+ * describe one branch; the walk then continues from the neighbor's node.
+ *
+ * @param nodes  receives the end of each branch the walk came from
+ * @param nodes2 receives the end of each branch the walk goes to
+ * @param node   node to start from; root is used when this is NULL
+ * @param dad    node the walk arrived from, passed on to FOR_NEIGHBOR_IT
+ */
+void ECOpd::getBranchOrdered(NodeVector &nodes, NodeVector &nodes2, Node *node, Node *dad){
+	if (node == NULL) {
+		node = root;
+	}
+	FOR_NEIGHBOR_IT(node, dad, it){
+		Node *next = (*it)->node;
+		nodes.push_back(node);
+		nodes2.push_back(next);
+		getBranchOrdered(nodes, nodes2, next, node);
+	}
+}
+
+/**
+ * Give the leaves of a tree the ids of the matching food web species.
+ *
+ * Every leaf whose name equals dagNames[i] gets id i. For a rooted analysis
+ * the root of the tree gets the id SpeciesNUM.
+ *
+ * @param tree tree whose node ids are overwritten
+ */
+void ECOpd::synchTreeDAG(ECOpd &tree){
+	if(rooted)
+		tree.root->id=SpeciesNUM;
+
+	for(int i=0; i<SpeciesNUM;i++){
+		// look the leaf up once instead of searching the tree twice per species
+		Node *leaf = tree.findLeafName(dagNames[i]);
+		if(leaf)
+			leaf->id = i;
+	}
+}
+
+/**
+ * Look a name up among the first TaxaNUM entries of phyloNames.
+ *
+ * @param name name to search for
+ * @return index of the first entry equal to name, or -1 if there is none
+ */
+int ECOpd::findPhyloID(string name){
+	int pos = 0;
+	while (pos < TaxaNUM) {
+		if (phyloNames[pos] == name) {
+			return pos;
+		}
+		++pos;
+	}
+	return -1;
+}
+
+
+/**
+ * Inverse lookup in phylo_order.
+ *
+ * @param id value to search for
+ * @return position of the first entry of phylo_order equal to id, or -1
+ */
+int ECOpd::findFoodWebID(int id){
+	const int total = (int) phylo_order.size();
+	for (int pos = 0; pos < total; ++pos) {
+		if (phylo_order[pos] == id) {
+			return pos;
+		}
+	}
+	return -1;
+}
+
+/**
+ * Print a copy of this tree in which the branch lengths are replaced.
+ *
+ * Works on a copy, so this object is not modified. Branch i (in the order
+ * given by getBranchOrdered) gets id i on both of its neighbor records. A
+ * branch that touches the root gets length 1, every other branch the value
+ * returned by randomLen(params).
+ *
+ * The tree is printed to "<user_file>.<k>.<diet_max>.<eco_run>", where <k> is
+ * params.k_percent if that is non-zero and params.sub_size otherwise.
+ *
+ * @param params supplies user_file, eco_run, k_percent, sub_size and diet_max
+ *               and is passed to randomLen()
+ */
+void ECOpd::randomBranLenTrees(Params &params){
+	ECOpd tree = *this;
+
+	NodeVector tails, heads;
+	tree.getBranchOrdered(tails, heads);
+	for (int b = 0; b < tree.branchNum; b++) {
+		Node *tail = tails[b];
+		Node *head = heads[b];
+		// same branch id in both directions
+		tail->findNeighbor(head)->id = b;
+		head->findNeighbor(tail)->id = b;
+		if (tail == tree.root || head == tree.root) {
+			// ROOT edge
+			tail->findNeighbor(head)->length = 1;
+		} else {
+			tail->findNeighbor(head)->length = randomLen(params);
+		}
+		// mirror the length onto the opposite direction
+		head->findNeighbor(tail)->length = tail->findNeighbor(head)->length;
+	}
+
+	// output name: <user_file>.<k>.<diet_max>.<eco_run>
+	const string runTag = convertIntToString(params.eco_run);
+	string sizeTag;
+	if (params.k_percent) {
+		sizeTag = convertIntToString(params.k_percent);
+	} else {
+		sizeTag = convertIntToString(params.sub_size);
+	}
+	const string dietTag = convertIntToString(params.diet_max);
+	string outName = params.user_file;
+	outName = outName + "." + sizeTag + "." + dietTag + "." + runTag;
+	tree.printTree(outName.c_str());
+}
+
+/**
+ * Find the species that occur in only one of the two inputs and report them.
+ *
+ * Taxa without a match in the food web are stored in missInDAG (names) and
+ * OUTdagTaxa (index + 1). Food web species without a match among the taxa are
+ * stored in missInPhylo (names) and OUTtreeTaxa (index + 1); pointers to the
+ * latter names are also appended to names. Both lists are printed to stdout
+ * when they are not empty.
+ *
+ * NOTE(review): names keeps pointers into missInPhylo, so missInPhylo must not
+ * grow or be reallocated afterwards - verify this is called only once.
+ */
+void ECOpd::detectMissingSpecies(){
+	int idx;
+
+	// taxa that have no counterpart in the food web
+	for (idx = 0; idx < TaxaNUM; idx++) {
+		if (findTaxaDAG(idx)) continue;
+		missInDAG.push_back(phyloNames[idx]);
+		OUTdagTaxa.push_back(idx+1);
+	}
+	if (!missInDAG.empty()) {
+		cout<<endl<<"-------------------------------------------------------------------"<<endl;
+		cout<<" There are "<<missInDAG.size()<<" species missing in the Food Web: "<<endl;
+		cout<<"-------------------------------------------------------------------"<<endl;
+		for (idx = 0; idx < missInDAG.size(); idx++) {
+			cout<<missInDAG[idx]<<endl;
+		}
+		cout<<endl;
+	}
+
+	// food web species that have no counterpart among the taxa
+	for (idx = 0; idx < SpeciesNUM; idx++) {
+		if (findSpeciesPhylo(idx)) continue;
+		missInPhylo.push_back(dagNames[idx]);
+		OUTtreeTaxa.push_back(idx+1);
+	}
+	if (!missInPhylo.empty()) {
+		cout<<endl<<"-----------------------------------------------------------------------------"<<endl;
+		cout<<" There are "<<missInPhylo.size()<<" species missing on the Tree/SplitSystem: "<<endl;
+		cout<<"-----------------------------------------------------------------------------"<<endl;
+		for (idx = 0; idx < missInPhylo.size(); idx++) {
+			cout<<missInPhylo[idx]<<endl;
+			names.push_back(&missInPhylo[idx]);
+		}
+		cout<<endl;
+	}
+}
+
+/**
+ * Tell whether phyloNames[i] equals one of the first SpeciesNUM entries of
+ * dagNames.
+ *
+ * @param i index into phyloNames
+ * @return true if a food web species with the same name exists
+ */
+bool ECOpd::findTaxaDAG(int i){
+	bool found = false;
+	for (int sp = 0; sp < SpeciesNUM && !found; sp++) {
+		found = (phyloNames[i] == dagNames[sp]);
+	}
+	return found;
+}
+/**
+ * Tell whether dagNames[i] equals one of the first TaxaNUM entries of
+ * phyloNames.
+ *
+ * @param i index into dagNames
+ * @return true if a taxon with the same name exists
+ */
+bool ECOpd::findSpeciesPhylo(int i){
+	bool found = false;
+	for (int tx = 0; tx < TaxaNUM && !found; tx++) {
+		found = (dagNames[i] == phyloNames[tx]);
+	}
+	return found;
+}
+
+/**
+ * Build phylo_order and translate OUTtreeTaxa into the new numbering.
+ *
+ * For every food web species i one entry is appended to phylo_order:
+ *  - if OUT_tree(i+1) is false: the id of the leaf called dagNames[i] (trees,
+ *    phyloType == "t") or the index returned by findPhyloID() (otherwise);
+ *  - if OUT_tree(i+1) is true: TaxaNUM + num, where num counts such species.
+ * For a rooted analysis the last food web entry is skipped in the loop and
+ * TaxaNUM-1 is appended for it afterwards.
+ *
+ * Finally every entry e of OUTtreeTaxa is replaced by phylo_order[e-1].
+ *
+ * NOTE(review): when OUT_tree(i+1) is false but the name lookup fails, nothing
+ * is appended and all later positions of phylo_order shift by one - confirm
+ * that this combination cannot occur.
+ */
+void ECOpd::synchronizeSpecies(){
+	int i;
+	//cout<<"Synchronization starts..."<<endl;
+	// temporarily leave the last entry out of the loop; restored below
+	if(rooted)
+		SpeciesNUM--;
+	int num = 0;
+	for(i=0; i<SpeciesNUM; i++){
+		if(!OUT_tree(i+1)){
+			if(phyloType == "t"){
+				if(findLeafName(dagNames[i])){
+					phylo_order.push_back(findLeafName(dagNames[i])->id);
+				}
+			}else{
+				if(findPhyloID(dagNames[i]) != -1){
+					phylo_order.push_back(findPhyloID(dagNames[i]));
+				}
+			}
+		}else{
+			// fresh ids are handed out starting at TaxaNUM
+			phylo_order.push_back(TaxaNUM + num);
+			num++;
+		}
+	}
+	if(rooted){
+		phylo_order.push_back(TaxaNUM-1);
+		SpeciesNUM++;
+	}
+	// Filling out OUTtreeTaxa vector with new ids, based on phylo_order
+	//cout<<"OUTtreeTaxa after reordering:"<<endl;
+	for(i=0; i<OUTtreeTaxa.size(); i++){
+		// entries are stored as index+1, hence the -1
+		OUTtreeTaxa[i] = phylo_order[OUTtreeTaxa[i]-1];
+		//cout<<OUTtreeTaxa[i]<<" ";
+	}
+	//cout<<endl;
+}
+
+/**
+ * Look a species name up among the first nvar entries of names.
+ *
+ * @param name pointer to the name to search for
+ * @return index of the first entry whose string equals *name, or -1
+ */
+int ECOpd::findSpeciesIDname(string *name){
+	int pos = 0;
+	while (pos < nvar) {
+		if (*name == *names[pos]) {
+			return pos;
+		}
+		++pos;
+	}
+	return -1;
+}
+
+/**
+ * Set the subset size k from the program parameters and validate it.
+ *
+ * k is params.k_percent percent of the number of species (truncated to int)
+ * if k_percent is non-zero, otherwise params.sub_size if that is non-zero;
+ * if both are zero k keeps its previous value. For rooted analyses the number
+ * of species used here is nvar-1.
+ *
+ * The program is terminated with a failure status when k < 2, when k exceeds
+ * the number of species, or when the initial set is larger than k.
+ *
+ * @param params supplies k_percent and sub_size
+ */
+void ECOpd::defineK(Params &params){
+	cout<<"Defining the subset size, k..."<<endl;
+	// a local count is used instead of temporarily decrementing nvar
+	const int speciesTotal = rooted ? nvar-1 : nvar;
+
+	if(params.k_percent)
+		k = params.k_percent*0.01*speciesTotal;
+	else if(params.sub_size)
+		k = params.sub_size;
+
+	if(k<2){
+		cout<<"k = "<<k<<endl;
+		cout<<"ERROR: Wrong value of parameter k. The subset size must be larger than 1."<<endl;
+		// errors must not be reported as success to the calling shell
+		exit(EXIT_FAILURE);
+	}else if(k>speciesTotal){
+		cout<<"k = "<<k<<endl;
+		cout<<"Total number of species in the analysis | "<<speciesTotal<<endl;
+		cout<<"ERROR: Wrong value of parameter k. The subset size must be less or equal to the number of all species in the analysis."<<endl;
+		exit(EXIT_FAILURE);
+	}
+	cout<<"k = "<<k<<endl;
+	if(initialTaxa.size() > k){
+		cout<<endl<<"Initial set "<<initialTaxa.size()<<" taxa | Subset size k = "<<k<<endl;
+		cout<<"ERROR: the initial set is already larger than the specified subset size! Increase k or reduce the initial set."<<endl;
+		exit(EXIT_FAILURE);
+	}
+
+	if(T != 0){
+		cout<<"Defining the minimum diet, d..."<<endl;
+		cout<<"d = "<<(int) (T*100)<<endl;
+	}
+
+}
+
+/**
+ * Drop the entries of initialTaxa that findSpeciesIDname() does not know.
+ *
+ * Does nothing when initialTaxa is empty. Otherwise the unknown names are
+ * listed on stdout (with a banner in front of the first one), removed from
+ * initialTaxa, and the remaining names are printed.
+ */
+void ECOpd::checkInitialTaxa(){
+	if (initialTaxa.empty()) {
+		return;
+	}
+	int idx = 0;
+	vector<int> unknown;	// positions of names to drop, ascending
+
+	cout<<"Reading taxa to be included in the final optimal subset.."<<endl;
+	for (idx = 0; idx < initialTaxa.size(); idx++) {
+		if (findSpeciesIDname(&initialTaxa[idx]) != -1) {
+			continue;
+		}
+		if (unknown.empty()) {
+			// banner goes in front of the first unknown name only
+			cout<<"---------------------------------------------------------------------------------------------------------"<<endl;
+			cout<<"The following species are not present on Tree/SplitSystem nor in the Food Web, therefore will be ignored:"<<endl;
+			cout<<"---------------------------------------------------------------------------------------------------------"<<endl;
+		}
+		cout<<initialTaxa[idx]<<endl;
+		unknown.push_back(idx);
+	}
+	cout<<endl;
+
+	// erase from the back so that the remaining positions stay valid
+	while (!unknown.empty()) {
+		initialTaxa.erase(initialTaxa.begin() + unknown.back());
+		unknown.pop_back();
+	}
+
+	cout<<"------------------------------------------"<<endl;
+	cout<<"Taxa to be included in the optimal subset:"<<endl;
+	cout<<"------------------------------------------"<<endl;
+	for (idx = 0; idx < initialTaxa.size(); idx++) {
+		cout<<initialTaxa[idx]<<endl;
+	}
+	cout<<endl;
+}
+
+/**
+ * Write the part of the DAG matrix that belongs to the selected species.
+ *
+ * The first line holds k. Each species i with variables[i] == 1 gets a line
+ * with its name followed by DAG[i][j] for every j with variables[j] == 1.
+ * Stream exceptions (failbit, badbit) are enabled on the output file.
+ *
+ * @param fileOUT   path of the file to (over)write
+ * @param variables array of at least nvar values; exactly 1 marks a selected species
+ */
+void ECOpd::printSubFoodWeb(char* fileOUT, double* variables){
+	ofstream out;
+	out.exceptions(ios::failbit | ios::badbit);
+	out.open(fileOUT);
+
+	out<<k<<endl;
+	for (int row = 0; row < nvar; row++) {
+		if (variables[row] != 1) {
+			continue;
+		}
+		out<<*(names[row])<<" ";
+		for (int col = 0; col < nvar; col++) {
+			if (variables[col] != 1) {
+				continue;
+			}
+			out<<DAG[row][col]<<" ";
+		}
+		out<<endl;
+	}
+	out.close();
+}
+
+/**
+ * Append one value per species to dietVAL.
+ *
+ * For a species i with variables[i] == 1 the value is 100 times the sum of
+ * DAG[j][i] over all j with variables[j] == 1; for every other species it is 0.
+ *
+ * NOTE(review): values are appended, dietVAL is not cleared first - presumably
+ * this is meant to be called once per object.
+ *
+ * @param variables array of at least nvar values; exactly 1 marks a selected species
+ */
+void ECOpd::dietConserved(double *variables){
+	for (int sp = 0; sp < nvar; sp++) {
+		double share = 0;
+		if (variables[sp] == 1) {
+			for (int other = 0; other < nvar; other++) {
+				if (variables[other] == 1) {
+					share += DAG[other][sp];
+				}
+			}
+		}
+		share *= 100;
+		dietVAL.push_back(share);
+	}
+}
+
+/**
+ * Write the report of the analysis to a file.
+ *
+ * The report consists of the header produced by summarizeHeader(), a summary
+ * of the food web and of the tree or split system, the subset size and the
+ * viability constraint, the score of the optimal subset and the list of the
+ * selected species, followed by summarizeFooter(). For weighted food webs the
+ * non-zero dietVAL entry is printed next to each species.
+ *
+ * For rooted trees the species count shown is TaxaNUM-1 and the species whose
+ * index equals root->id is left out of the list. The object is not modified:
+ * the reduced count is kept in a local variable, so calling this function
+ * more than once does not change TaxaNUM.
+ *
+ * Stream exceptions (failbit, badbit) are enabled on the output file.
+ *
+ * @param fileOUT   path of the report file to (over)write
+ * @param variables array of at least nvar values; exactly 1 marks a selected species
+ * @param score     objective value of the optimal subset
+ * @param params    program parameters, passed to summarizeHeader/Footer
+ */
+void ECOpd::printResults(char* fileOUT,double* variables, double score,Params &params){
+	cout<<endl<<"Results of the analysis are printed to "<<fileOUT<<endl<<endl;
+	ofstream out;
+	out.exceptions(ios::failbit | ios::badbit);
+	out.open(fileOUT);
+	int i;
+	const bool isTree = (phyloType == "t");
+
+	if(isTree)
+		summarizeHeader(out, params, false, IN_NEWICK);
+	else
+		summarizeHeader(out, params, false, IN_NEXUS);
+
+	// Analyze the results of IP and print the information
+	out<<endl<<"------Results of biodiversity analysis--------------------------------------------------"<<endl;
+	out<<endl;
+	out<<"Food Web  | # of species "<<SpeciesNUM<<"\t| # of links    "<<linksNUM;
+	if(weighted)
+		out<<"\t| weighted \t|"<<endl;
+	else
+		out<<"\t| non weighted \t|"<<endl;
+
+	if(isTree){
+		// TaxaNUM counts the root of a rooted tree; show the count without it
+		const int taxaShown = rooted ? TaxaNUM-1 : TaxaNUM;
+		out<<"PhyloTree | # of species "<<taxaShown<<"\t| # of branches "<<branchNum;
+		if(rooted)
+			out<<"\t| rooted \t|";
+		else
+			out<<"\t| unrooted \t|";
+		out<<" total PD "<<treeLength()<<endl;
+	}else{
+		out<<"SplitSys  | # of species "<<TaxaNUM<<"\t| # of splits   "<<splitsNUM<<"\t|\t\t| total SD "<<totalSD<<endl;
+	}
+
+	out<<endl;
+	out<<"SubsetSize| k = "<<k<<endl;
+	if(T!= 0)
+		out<<"Constraint| "<<(int) (T*100)<<"%-viability"<<endl;
+	else
+		out<<"Constraint| naive viability"<<endl;
+	out<<endl;
+
+	if(isTree){
+		out<<"PD of the optimal subset: "<<score<<" (constitutes "<< score / treeLength() * 100<<"% of the total PD)"<<endl;
+	} else {
+		out<<"SD of the optimal subset: "<<score<<" (constitutes "<< score / totalSD * 100<<"% of the total SD)"<<endl;
+	}
+
+	if(weighted){
+		out<<"--------------------------------------------------"<<endl;
+		out<<" Optimal subset of species  (% of diet conserved) "<<endl;
+		out<<"--------------------------------------------------"<<endl;
+	}else{
+		out<<"-----------------------------"<<endl;
+		out<<" Optimal subset of species  "<<endl;
+		out<<"-----------------------------"<<endl;
+	}
+
+	// one loop for all four weighted/rooted combinations
+	for(i=0; i<nvar; i++){
+		if(variables[i] != 1)
+			continue;
+		// root is only dereferenced for rooted trees
+		if(rooted && i==root->id)
+			continue;
+		if(weighted && dietVAL[i]!=0){
+			out<<" ";
+			out.width(30);
+			out<<left<<*(names[i]);
+			out<<"\t("<<dietVAL[i]<<"%)"<<endl;
+		}else{
+			out<<" "<<*(names[i])<<endl;
+		}
+	}
+
+	summarizeFooter(out, params);
+	out.close();
+}
diff --git a/ecopd.h b/ecopd.h
new file mode 100644
index 0000000..60f5662
--- /dev/null
+++ b/ecopd.h
@@ -0,0 +1,276 @@
+/*
+ * ecopd.h
+ *
+ *  Created on: Oct 30, 2013
+ *      Author: Olga
+ */
+
+#ifndef ECOPD_H
+#define ECOPD_H
+
+#include "mtree.h"
+#include "mtreeset.h"
+#include "ecopdmtreeset.h"
+#include "pdnetwork.h"
+
+/* ===============================================================================
+ *	Class for processing IP problem - PD/SD with ecological constraints
+ * ===============================================================================*/
+
+class ECOpd : public MTree
+{
+public:
+
+	/**
+		CONSTRUCTORs, INITIALIZATION and DESTRUCTOR
+	*/
+    ECOpd(const char *userTreeFile, bool &is_rooted);
+    ECOpd();
+   ~ECOpd();
+
+   void initializeEcoPD();
+   void initializeEcoPD(Params &params);
+
+   /*
+    * Membership test for the taxon with "real" id i (callers pass index+1).
+    * NOTE(review): judging by synchronizeSpecies(), a true result means the
+    * species is NOT on the tree/split network - confirm against the definition.
+    */
+   bool OUT_tree(int i);
+
+   /*
+    * Reading and processing Diet Composition matrix
+    */
+   void readDAG(const char *infile);
+   void readDAG(istream &in);
+
+   /*
+    * Transform problem into IP problem and print it to .lp file, for rooted trees
+    */
+   void printECOlpRooted(const char* fileOUT,ECOpd &tree);
+
+   /*
+    * Transform problem into IP problem and print it to .lp file, for UNrooted trees
+    */
+   void printECOlpUnrooted(const char* fileOUT,ECOpd &tree);
+
+   /*
+    * Transform problem into IP problem and print it to .lp file, for split system
+    * (the file is opened in append mode)
+    */
+	void printInfDAG (const char* fileOUT,PDNetwork &splitsys,Params &params);
+
+	/*
+	 * Synchronization of species in the food web with species on the tree
+	 */
+	void synchTreeDAG(ECOpd &tree);
+
+	/*
+	 * Synchronization of species in the food web with species in the split network
+	 */
+	void synchSplitDAG(PDNetwork &system);
+
+	/*
+	 * Left-over from the mtree class, where this function no longer exists:
+	 * collects the two end nodes of every branch into nodes and nodes2
+	 */
+	void getBranchOrdered(NodeVector &nodes, NodeVector &nodes2,Node *node = NULL, Node *dad = NULL);
+
+	/*
+	 * Find the id of the species on tree by name (-1 if not found)
+	 */
+	int findPhyloID(string name);
+
+	/*
+	 * Find the id of the species in the food web by their phylo id (id of this species in the tree);
+	 * -1 if not found
+	 */
+	int findFoodWebID(int id);
+
+	/*
+	 * checks whether there are some species present on the tree/splitSys, but not in the foodWeb, and vice versa.
+	 */
+	void detectMissingSpecies();
+
+	/*
+	 * List of species missing either on the tree/splitSys (missInPhylo), or in the food web (missInDAG)
+	 */
+	vector<string> missInPhylo,missInDAG;
+
+	/*
+	 * Finding Taxon from tree/splitSys among DAG Species
+	 */
+	bool findTaxaDAG(int i);
+
+	/*
+	 * Finding Species from DAG among Taxa on tree/splitSys
+	 */
+	bool findSpeciesPhylo(int i);
+
+	/*
+	 * the number of links in the food web
+	 */
+	int linksNUM;
+
+	/*
+	 * synchronization of species on Tree/SplitSys and in Food Web
+	 */
+	void synchronizeSpecies();
+
+	/*
+	 * Reading taxa to be included in the final optimal subset
+	 */
+	void readInitialTaxa(const char *infile);
+	void readInitialTaxa(istream &in);
+
+	/*
+	 * list of taxa (names) to be included in the final optimal set
+	 */
+	vector<string> initialTaxa;
+
+	/*
+	 * Check if the species in InitialTaxa are actually present either on tree/network or in the food web;
+	 * species that are not found are removed from the list
+	 */
+	void checkInitialTaxa();
+
+	/*
+	 * find an id (among nvar) of a given species by name (-1 if not found)
+	 */
+	int findSpeciesIDname(string *name);
+
+	/*
+	 * Define the subset size and check if it's >1 and <=nvar (#of all species in the analysis = (TaxaNUM > SpeciesNUM) ? TaxaNUM : SpeciesNUM)
+	 */
+	void defineK(Params &params);
+
+	/*
+	 * Check whether the food web is acyclic or not
+	 */
+	void checkGraph();
+
+	/*
+	 * Diet Composition Matrix (entries either 0/1 or [0,100] )
+	 */
+	vector<double*> DAG;
+
+	/*
+	 * Prints the sub food web corresponding to the optimal subset
+	 */
+	void printSubFoodWeb(char* fileOUT, double* variables);
+
+	/*
+	 * t for tree or n for networks
+	 */
+	string phyloType;
+
+	/*
+	 * Structure of the DAG: taxa with neighbors being their preys
+	 */
+	NodeVector taxaDAG;
+
+	/*
+	 * two vectors of nodes, corresponding to ends of branches
+	 */
+	NodeVector nodes1,nodes2;
+
+	/*
+	 * Ids of species not present on tree/split network
+	 */
+	vector<int> OUTtreeTaxa;
+
+	/*
+	 * Ids of species not present in the food web
+	 */
+	vector<int> OUTdagTaxa;
+
+	/*
+	 * contains the ids of species based on tree/split network
+	 * (phylo_order[i] = j: species with id=i in the food web has id=j on tree/split network)
+	 */
+	vector<int> phylo_order;
+
+	/*
+	 * for each species contains information about its longest food chain (excluding species itself)
+	 */
+	vector<int> levelDAG;
+
+	/*
+	 * the names of species present in the food web and on the tree/split network respectively
+	 */
+	vector<string> dagNames,phyloNames;
+
+	/*
+	 * names of all species: union of species on the tree/network and in the food web
+	 * (pointers to strings owned elsewhere)
+	 */
+	vector<string*> names;
+
+	/*
+	 * flag for whether to treat the food web as weighted or not weighted
+	 */
+	bool weighted;
+
+	/*
+	 * the size of an optimal subset to be chosen
+	 */
+	int k;
+
+	/*
+	 * the diet portion to be conserved for each predator, when equals to 0 corresponds to a naive viability
+	 */
+	double T;
+
+	/*
+	 * the number of species in the food web (if rooted tree counts also the root, technical)
+	 */
+	int SpeciesNUM;
+
+	/*
+	 * the number of species on the tree/split network (if rooted tree counts also the root)
+	 */
+	int TaxaNUM;
+
+	/*
+	 * the number of all species: union of species on the tree/network and in the food web
+	 */
+	int nvar;
+
+	/*
+	 * calculates for each predator the proportion of its diet that is conserved
+	 * (results are appended to dietVAL)
+	 */
+	void dietConserved(double *variables);
+
+	/*
+	 * for each predator the proportion of its diet that is conserved, in percent
+	 */
+	vector<double> dietVAL;
+
+	/*
+	 * print the results
+	 */
+
+	void printResults(char* fileOUT,double* variables, double score, Params &params);
+
+	/*
+	 * Splits number and total SD
+	 */
+	int splitsNUM;
+	double totalSD;
+
+	/**************************
+	 * Miscellaneous
+	 **************************/
+
+	/*
+	 * These functions were used when we analyzed results from LP problems, to set the fractional values to be integers.
+	 * They are not used anymore.
+	 */
+	void readREC(const char *infile);
+	void readREC(istream &in);
+	void generateFirstMultinorm(vector<int> &x, int n, int k);
+	bool generateNextMultinorm(vector<int> &x);
+
+	vector<string> fractVAR;
+	vector<int> dvec,hvec;
+
+	/*
+	 * Assigns to a given tree topology random branch lengths
+	 */
+	void randomBranLenTrees(Params &params);
+
+};
+
+#endif
diff --git a/ecopdmtreeset.cpp b/ecopdmtreeset.cpp
new file mode 100644
index 0000000..b3ac748
--- /dev/null
+++ b/ecopdmtreeset.cpp
@@ -0,0 +1,36 @@
+/*
+ * EcoPDmtreeset.cpp
+ *
+ *  Created on: Nov 4, 2013
+ *      Author: olga
+ */
+
+#include "ecopdmtreeset.h"
+#include "mtreeset.h"
+
+/** Default constructor: no trees are read. */
+EcoPDmtreeset::EcoPDmtreeset()
+{
+}
+
+/** Destructor: nothing to release in this class itself. */
+EcoPDmtreeset::~EcoPDmtreeset()
+{
+}
+
+/**
+ * Construct the set by reading trees from a file; all arguments are handed
+ * to initEcoSD() unchanged.
+ */
+EcoPDmtreeset::EcoPDmtreeset(const char *userTreeFile, bool &is_rooted,
+	int burnin, int max_count, const char *tree_weight_file)
+{
+	initEcoSD(userTreeFile, is_rooted, burnin, max_count, tree_weight_file);
+}
+
+/**
+ * Read the trees and, if a weight file is given, the tree weights.
+ *
+ * readTrees() receives every argument except tree_weight_file. When
+ * tree_weight_file is not NULL, readIntVector() fills tree_weights from it.
+ * Afterwards the number of trees must equal the number of weights, otherwise
+ * outError() is called.
+ *
+ * NOTE(review): without a weight file nothing in this function fills
+ * tree_weights (the default-weights branch is disabled) - presumably
+ * readTrees() takes care of it; verify.
+ */
+void EcoPDmtreeset::initEcoSD(const char *userTreeFile, bool &is_rooted, int burnin, int max_count,
+	const char *tree_weight_file, IntVector *weights, bool compressed)
+{
+	readTrees(userTreeFile, is_rooted, burnin, max_count, weights, compressed);
+
+	if (tree_weight_file != NULL) {
+		readIntVector(tree_weight_file, burnin, max_count, tree_weights);
+	}
+
+	const bool countsAgree = (size() == tree_weights.size());
+	if (!countsAgree) {
+		outError("Tree file and tree weight file have different number of entries");
+	}
+}
diff --git a/ecopdmtreeset.h b/ecopdmtreeset.h
new file mode 100644
index 0000000..21439eb
--- /dev/null
+++ b/ecopdmtreeset.h
@@ -0,0 +1,35 @@
+/*
+ * EcoPDmtreeset.h
+ *
+ *  Created on: Nov 4, 2013
+ *      Author: olga
+ */
+
+#ifndef ECOPDMTREESET_H_
+#define ECOPDMTREESET_H_
+
+#include "mtreeset.h"
+
+/**
+	Set of trees for the ecological PD/SD analysis; extends MTreeSet with an
+	initialisation routine of its own (initEcoSD).
+*/
+class EcoPDmtreeset : public MTreeSet
+{
+public:
+	/** default constructor, reads nothing */
+	EcoPDmtreeset();
+
+	/**
+		constructor, read trees from user file
+		@param userTreeFile the name of the user trees
+		@param is_rooted (IN/OUT) true if tree is rooted
+		@param burnin the number of beginning trees to be discarded
+		@param max_count max number of trees to load
+		@param tree_weight_file file with the tree weights, or NULL for none
+	*/
+	EcoPDmtreeset(const char *userTreeFile, bool &is_rooted, int burnin, int max_count,
+		const char *tree_weight_file = NULL);
+
+	/**
+		read the trees and, if tree_weight_file is given, the tree weights;
+		reports an error if the number of trees and of weights differ
+		@param userTreeFile the name of the user trees
+		@param is_rooted (IN/OUT) true if tree is rooted
+		@param burnin the number of beginning trees to be discarded
+		@param max_count max number of trees to load
+		@param tree_weight_file file with the tree weights, or NULL for none
+		@param weights passed on to readTrees()
+		@param compressed passed on to readTrees()
+	*/
+	void initEcoSD(const char *userTreeFile, bool &is_rooted, int burnin, int max_count,
+		const char *tree_weight_file = NULL, IntVector *weights = NULL, bool compressed = false);
+
+
+	virtual ~EcoPDmtreeset();
+};
+
+#endif /* ECOPDMTREESET_H_ */
diff --git a/eigendecomposition.cpp b/eigendecomposition.cpp
new file mode 100644
index 0000000..4b61549
--- /dev/null
+++ b/eigendecomposition.cpp
@@ -0,0 +1,1121 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "eigendecomposition.h"
+#include "optimization.h"
+#include <math.h>
+#include <string.h>
+#include <iostream>
+#include <stdlib.h>
+#include "tools.h"
+
+const double ZERO = 0.000001;
+using namespace std;
+
+/**
+ * Set the defaults: state frequencies are used, the matrix is normalized and
+ * the total number of substitutions is 1.
+ */
+EigenDecomposition::EigenDecomposition()
+{
+	ignore_state_freq = false;
+	normalize_matrix = true;
+	total_num_subst = 1.0;
+}
+
+/**
+ * Compute eigenvalues, eigenvectors and inverse eigenvectors of the rate
+ * matrix that computeRateMatrix() builds from rate_params and state_freq.
+ *
+ * States whose frequency is not larger than ZERO are removed by
+ * eliminateZero() before the decomposition (elmhes, eltran, hqr2). In the
+ * result such a state gets eigenvalue 0 and a unit vector as eigenvector.
+ * The result is checked against the eigenvalue equation; a violation only
+ * produces a warning on stdout. Finally the eigenvector matrix is inverted
+ * with luinverse().
+ *
+ * @param rate_params num_state x num_state rate parameters (not modified)
+ * @param state_freq  num_state state frequencies (not modified)
+ * @param eval        (OUT) num_state eigenvalues
+ * @param evec        (OUT) num_state x num_state eigenvectors
+ * @param inv_evec    (OUT) num_state x num_state inverse of evec
+ * @param num_state   number of states
+ */
+void EigenDecomposition::eigensystem(
+	double **rate_params, double *state_freq,
+	double *eval, double **evec, double **inv_evec, int num_state)
+{
+	double *forg = new double[num_state];
+	double *evali = new double[num_state];
+	double *new_forg = new double[num_state];
+	double *eval_new = new double[num_state];
+	// arrays of row pointers must be allocated as double*[]: allocating
+	// double[] and casting is too small wherever a pointer is wider than a
+	// double and mismatches the delete[] below
+	double **a = new double*[num_state];
+	double **b = new double*[num_state];
+	double **evec_new = new double*[num_state];
+	int *ordr = new int[num_state + 1];
+	int i, j, k, error, new_num, inew, jnew;
+	double zero;
+
+
+	for (i=0; i < num_state; i++)
+		a[i] = new double[num_state];
+	for (i=0; i < num_state; i++)
+		b[i] = new double[num_state];
+	for (i=0; i < num_state; i++)
+		evec_new[i] = new double[num_state];
+
+	/* get relative transition matrix and frequencies */
+	memcpy(forg, state_freq, num_state * sizeof(double));
+	for (i = 0; i < num_state; i++)
+		memcpy(a[i], rate_params[i], num_state * sizeof(double));
+
+	computeRateMatrix(a, forg, num_state); /* make 1 PAM rate matrix */
+
+	/* copy a to b; b is kept for the check of the eigenvalue equation */
+	for (i = 0; i < num_state; i++)
+		for (j = 0; j < num_state; j++)
+			b[i][j] = a[i][j];
+
+	/* a and new_forg receive the reduced problem of size new_num */
+	eliminateZero(b, forg, num_state, a, new_forg, new_num);
+
+	elmhes(a, ordr, new_num); /* compute eigenvalues and eigenvectors */
+
+	eltran(a, evec_new, ordr, new_num);
+
+	hqr2(new_num, 1, new_num, a, evec_new, eval_new, evali);
+
+
+	// now get back eigen: walk backwards and give the removed states eigenvalue 0
+	for (i = num_state-1,inew = new_num-1; i >= 0; i--)
+		eval[i] = (forg[i] > ZERO) ? eval_new[inew--] : 0;
+
+	// calculate the actual eigenvectors of Q and its inverse matrix:
+	// rows/columns of removed states are filled from the identity matrix
+	for (i = num_state-1,inew = new_num-1; i >= 0; i--)
+		if (forg[i] > ZERO) {
+			for (j = num_state-1, jnew = new_num-1; j >= 0; j--)
+				if (forg[j] > ZERO) {
+					evec[i][j] = evec_new[inew][jnew];
+					jnew--;
+				} else {
+					evec[i][j] = (i == j);
+				}
+ 			inew--;
+		} else
+		for (j=0; j < num_state; j++) {
+			evec[i][j] = (i==j);
+		}
+
+	/* check eigenvalue equation */
+	// NOTE(review): zero is reset once per column j, not per row i, so the
+	// residuals of the rows are accumulated - confirm this is intended
+	error = 0;
+	for (j = 0; j < num_state; j++) {
+		for (i = 0, zero = 0.0; i < num_state; i++) {
+			for (k = 0; k < num_state; k++) zero += b[i][k] * evec[k][j];
+			zero -= eval[j] * evec[i][j];
+			if (fabs(zero) > 1.0e-5) {
+				error = 1;
+				break;
+			}
+		}
+	}
+	if (error) {
+		cout << "\nWARNING: Eigensystem doesn't satisfy eigenvalue equation!\n";
+		cout << "Rate matrix R: " << endl;
+		for (i = 0; i < num_state; i++) {
+			for (j = 0; j < num_state; j++) cout << rate_params[i][j] << " ";
+			cout << endl;
+		}
+		cout << "State frequencies: " << endl;
+		for (i = 0; i < num_state; i++) cout << state_freq[i] << " ";
+		cout << endl;
+	}
+
+	for (i=num_state-1; i>= 0; i--)
+		delete [] evec_new[i];
+	for (i=num_state-1; i>= 0; i--)
+		delete [] b[i];
+	for (i=num_state-1; i>= 0; i--)
+		delete [] a[i];
+	delete [] ordr;
+	delete [] evec_new;
+	delete [] b;
+	delete [] a;
+	delete [] eval_new;
+	delete [] new_forg;
+	delete [] evali;
+	delete [] forg;
+
+	luinverse(evec, inv_evec, num_state); /* inverse eigenvectors are in inv_evec */
+
+} /* eigensystem */
+
+
+/**
+ * Variant of eigensystem() that symmetrizes the rate matrix first.
+ *
+ * The rate matrix is built by computeRateMatrix(), states whose frequency is
+ * not larger than ZERO are removed by eliminateZero(), and the reduced matrix
+ * is symmetrized by symmetrizeRateMatrix(), made tridiagonal (tred2) and
+ * decomposed (tqli). Eigenvectors and inverse eigenvectors are then obtained
+ * by dividing resp. multiplying with forg_sqrt; no matrix inversion is
+ * needed. A removed state gets eigenvalue 0 and unit vectors.
+ *
+ * The result is checked against the eigenvalue equation: a gap above 1e-4
+ * prints a warning, a gap of 1e-2 or more triggers an assertion.
+ *
+ * @param rate_params num_state x num_state rate parameters (not modified)
+ * @param state_freq  num_state state frequencies (not modified)
+ * @param eval        (OUT) num_state eigenvalues
+ * @param evec        (OUT) eigenvectors, row-major array of num_state*num_state
+ * @param inv_evec    (OUT) inverse of evec, same layout
+ * @param num_state   number of states
+ */
+void EigenDecomposition::eigensystem_sym(double **rate_params, double *state_freq,
+	double *eval, double *evec, double *inv_evec, int num_state)
+{
+	double *forg = new double[num_state];
+	double *new_forg = new double[num_state];
+	double *forg_sqrt = new double[num_state];
+	double *off_diag = new double[num_state];
+	double *eval_new = new double[num_state];
+	// arrays of row pointers must be allocated as double*[]: allocating
+	// double[] and casting is too small wherever a pointer is wider than a
+	// double and mismatches the delete[] below
+	double **a = new double*[num_state];
+	double **b = new double*[num_state];
+	int i, j, k, new_num, inew, jnew;
+	double error = 0.0;
+	double zero;
+
+	for (i=0; i < num_state; i++)
+		a[i] = new double[num_state];
+	for (i=0; i < num_state; i++)
+		b[i] = new double[num_state];
+
+	/* get relative transition matrix and frequencies */
+	memcpy(forg, state_freq, num_state * sizeof(double));
+
+	for (i = 0; i < num_state; i++)
+		memcpy(a[i], rate_params[i], num_state * sizeof(double));
+
+	computeRateMatrix(a, forg, num_state); /* make 1 PAM rate matrix */
+
+	/* copy a to b; b is kept for the check of the eigenvalue equation */
+	for (i = 0; i < num_state; i++)
+		for (j = 0; j < num_state; j++)
+			b[i][j] = a[i][j];
+
+	/* a and new_forg receive the reduced problem of size new_num */
+	eliminateZero(b, forg, num_state, a, new_forg, new_num);
+
+	symmetrizeRateMatrix(a, new_forg, forg_sqrt, new_num);
+
+	// make this matrix tridiagonal
+	tred2(a, new_num, eval_new, off_diag);
+	// compute eigenvalues and eigenvectors
+	tqli(eval_new, off_diag, new_num, a);
+
+	// now get back eigen: walk backwards and give the removed states eigenvalue 0
+	for (i = num_state-1,inew = new_num-1; i >= 0; i--)
+		eval[i] = (forg[i] > ZERO) ? eval_new[inew--] : 0;
+
+	// every reduced eigenvalue must have been consumed
+	assert(inew == -1);
+	// calculate the actual eigenvectors of Q and its inverse matrix:
+	// rows/columns of removed states are filled from the identity matrix
+	for (i = num_state-1,inew = new_num-1; i >= 0; i--)
+		if (forg[i] > ZERO) {
+			for (j = num_state-1, jnew = new_num-1; j >= 0; j--)
+				if (forg[j] > ZERO) {
+					evec[i*num_state+j] = a[inew][jnew] / forg_sqrt[inew];
+					inv_evec[i*num_state+j] = a[jnew][inew] * forg_sqrt[jnew];
+					jnew--;
+				} else {
+					evec[i*num_state+j] = (i == j);
+					inv_evec[i*num_state+j] = (i == j);
+				}
+ 			inew--;
+		} else
+		for (j=0; j < num_state; j++) {
+			evec[i*num_state+j] = (i==j);
+			inv_evec[i*num_state+j] = (i==j);
+		}
+
+
+
+	/* check eigenvalue equation */
+	// NOTE(review): zero is reset once per column j, not per row i, so the
+	// residuals of the rows are accumulated - confirm this is intended
+	error = 0.0;
+	for (j = 0; j < num_state; j++) {
+		for (i = 0, zero = 0.0; i < num_state; i++) {
+			for (k = 0; k < num_state; k++) zero += b[i][k] * evec[k*num_state+j];
+			zero -= eval[j] * evec[i*num_state+j];
+			if (fabs(zero) > 1.0e-3) {
+				error = max(error, fabs(zero));
+				break;
+			}
+		}
+	}
+	if (error > 1e-4) {
+		cout.precision(5);
+		cout << "\nWARNING: Eigensystem doesn't satisfy eigenvalue equation! (gap=" << error << ")" << endl;
+		assert(error < 1e-2);
+		cout << "State frequencies: " << endl;
+		for (i = 0; i < num_state; i++) cout << state_freq[i] << " ";
+		cout << endl;
+	}
+
+	for (i=num_state-1; i>= 0; i--)
+		delete [] b[i];
+
+	for (i=num_state-1; i>= 0; i--)
+		delete [] a[i];
+
+	delete [] b;
+	delete [] a;
+	delete [] eval_new;
+	delete [] off_diag;
+	delete [] forg_sqrt;
+	delete [] new_forg;
+	delete [] forg;
+
+} // eigensystem_sym
+
+
+
+
+/** Destructor: the body is empty, no clean-up is performed here. */
+EigenDecomposition::~EigenDecomposition()
+{
+}
+
+/*
+	Turn the rate parameters in a into a rate matrix, in place.
+	Unless ignore_state_freq is set, entry (i,j) is first multiplied by the
+	frequency of state j. The diagonal is then set to minus the row sum (the
+	sum includes whatever was stored on the diagonal on entry). When
+	normalize_matrix is set, all entries are additionally scaled so that
+	sum_i freq[i] * rowsum[i] equals total_num_subst.
+*/
+void EigenDecomposition::computeRateMatrix(double **a, double *stateFrqArr_, int num_state) {
+	int row, col;
+	double expected_rate = 0.0;
+	double *row_sum = new double[num_state];
+
+	if (!ignore_state_freq) {
+		for (row = 0; row < num_state; row++)
+			for (col = 0; col < num_state; col++)
+				a[row][col] = stateFrqArr_[col]*a[row][col];
+	}
+
+	for (row = 0; row < num_state; row++) {
+		double acc = 0.0;
+		for (col = 0; col < num_state; col++)
+			acc += a[row][col];
+		row_sum[row] = acc;
+		expected_rate += acc*stateFrqArr_[row]; /* contribution to the expected rate */
+	}
+
+	if (!normalize_matrix) {
+		// only the diagonal changes
+		for (row = 0; row < num_state; row++)
+			a[row][row] = -row_sum[row];
+	} else {
+		const double scale = total_num_subst / expected_rate;
+
+		for (row = 0; row < num_state; row++) {
+			for (col = 0; col < num_state; col++) {
+				if (row == col)
+					a[row][col] = scale * (-row_sum[row]);
+				else
+					a[row][col] = scale * a[row][col];
+			}
+		}
+	}
+	delete [] row_sum;
+} /* computeRateMatrix */
+
+/*
+	Copy the states whose frequency is above ZERO into new_forg / new_mat.
+	new_num receives the number of kept states. When every state is kept the
+	function returns right after filling new_forg and new_mat is NOT written.
+*/
+void EigenDecomposition::eliminateZero(double **mat, double *forg, int num,
+	double **new_mat, double *new_forg, int &new_num) {
+	int src_row, src_col;
+
+	new_num = 0;
+	for (src_row = 0; src_row < num; src_row++) {
+		if (forg[src_row] > ZERO) {
+			new_forg[new_num] = forg[src_row];
+			new_num++;
+		}
+	}
+	if (new_num == num)
+		return;
+
+	// compact rows and columns of the kept states into the top-left corner
+	int dst_row = 0;
+	for (src_row = 0; src_row < num; src_row++) {
+		if (!(forg[src_row] > ZERO))
+			continue;
+		int dst_col = 0;
+		for (src_col = 0; src_col < num; src_col++) {
+			if (!(forg[src_col] > ZERO))
+				continue;
+			new_mat[dst_row][dst_col] = mat[src_row][src_col];
+			dst_col++;
+		}
+		dst_row++;
+	}
+	if (verbose_mode >= VB_MED)
+		cout << "new_num_states = " << new_num << endl;
+}
+
+/*
+	Symmetrise the rate matrix a in place.
+	stateFrq_sqrt is filled with sqrt(stateFrq[i]); every upper-triangle entry
+	a[j][i] (j < i) is scaled by stateFrq_sqrt[j] / stateFrq_sqrt[i] and then
+	mirrored into a[i][j]. The diagonal is left unchanged.
+*/
+void EigenDecomposition::symmetrizeRateMatrix(double **a, double *stateFrq, double *stateFrq_sqrt, int num_state) {
+	int row, col;
+
+	for (row = 0; row < num_state; row++)
+		stateFrq_sqrt[row] = sqrt(stateFrq[row]);
+
+	for (row = 0; row < num_state; row++) {
+		const double inv_root = 1.0/stateFrq_sqrt[row];
+		for (col = 0; col < row; col++) {
+			const double scaled = a[col][row] * (stateFrq_sqrt[col]*inv_root);
+			a[col][row] = scaled;
+			a[row][col] = scaled;
+		}
+	}
+}
+
+
+/*
+	Householder reduction of the symmetric n x n matrix a to tridiagonal form.
+	On return d[0..n-1] holds the diagonal of the tridiagonal matrix and
+	e[0..n-1] its off-diagonal elements (e[0] is set to 0); a is overwritten
+	with the accumulated transformation matrix, as expected by tqli.
+	NOTE(review): the layout follows the Numerical Recipes routine of the same
+	name, shifted to 0-based indexing -- confirm against that reference.
+*/
+void EigenDecomposition::tred2(double **a, int n, double *d, double *e)
+{
+	int l,k,j,i;
+	double scale,hh,h,g,f;
+
+	/* phase 1: eliminate row i below the sub-diagonal, from the last row upwards */
+	for (i=n-1;i>0;i--) {
+		l=i-1;
+		h=scale=0.0;
+		if (l > 0) {
+			for (k=0;k<=l;k++)
+				scale += fabs(a[i][k]);
+			if (scale == 0.0)
+				/* row is already zero: skip the transformation */
+				e[i]=a[i][l];
+			else {
+				/* scale the row before forming the sum of squares h */
+				for (k=0;k<=l;k++) {
+					a[i][k] /= scale;
+					h += a[i][k]*a[i][k];
+				}
+				f=a[i][l];
+				g=(f >= 0.0 ? -sqrt(h) : sqrt(h));
+				e[i]=scale*g;
+				h -= f*g;
+				a[i][l]=f-g;
+				f=0.0;
+				for (j=0;j<=l;j++) {
+					/* store the scaled vector in column i for the accumulation phase */
+					a[j][i]=a[i][j]/h;
+					g=0.0;
+					for (k=0;k<=j;k++)
+						g += a[j][k]*a[i][k];
+					for (k=j+1;k<=l;k++)
+						g += a[k][j]*a[i][k];
+					e[j]=g/h;
+					f += e[j]*a[i][j];
+				}
+				hh=f/(h+h);
+				/* update the lower triangle of the leading sub-matrix */
+				for (j=0;j<=l;j++) {
+					f=a[i][j];
+					e[j]=g=e[j]-hh*f;
+					for (k=0;k<=j;k++)
+						a[j][k] -= (f*e[k]+g*a[i][k]);
+				}
+			}
+		} else
+			e[i]=a[i][l];
+		/* d[i] temporarily holds h; a zero marks "no transformation for row i" */
+		d[i]=h;
+	}
+	d[0]=0.0;
+	e[0]=0.0;
+	/* Contents of this loop can be omitted if eigenvectors not
+			wanted except for statement d[i]=a[i][i]; */
+	for (i=0;i<n;i++) {
+		l=i;
+		if (d[i] != 0.0) {
+			for (j=0;j<l;j++) {
+				g=0.0;
+				for (k=0;k<l;k++)
+					g += a[i][k]*a[k][j];
+				for (k=0;k<l;k++)
+					a[k][j] -= g*a[k][i];
+			}
+		}
+		d[i]=a[i][i];
+		a[i][i]=1.0;
+		for (j=0;j<l;j++) a[j][i]=a[i][j]=0.0;
+	}
+}
+
+/**
+	Square of a; an exact zero is returned as 0.0 without multiplying.
+*/
+inline double sqr(double a) {
+	if (a == 0.0)
+		return 0.0;
+	return a*a;
+}
+
+/*
+	sqrt(a*a + b*b), evaluated by factoring out the larger magnitude so that
+	the intermediate square is taken of a ratio not exceeding 1.
+*/
+double pythag(double a, double b)
+{
+	const double mag_a = fabs(a);
+	const double mag_b = fabs(b);
+
+	if (mag_a > mag_b)
+		return mag_a*sqrt(1.0+sqr(mag_b/mag_a));
+	if (mag_b == 0.0)
+		return 0.0;
+	return mag_b*sqrt(1.0+sqr(mag_a/mag_b));
+}
+
+#define NRANSI
+#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
+
+/*
+	QL iteration with implicit shifts on a symmetric tridiagonal matrix.
+	d[0..n-1] holds the diagonal on entry and the eigenvalues on return;
+	e[0..n-1] holds the off-diagonal (e[0] unused on entry) and is destroyed.
+	z must be the matrix produced by tred2; every rotation is applied to its
+	columns i and i+1, so the eigenvector belonging to d[k] ends up in column k.
+	nrerror() is called when a single eigenvalue needs more than 100 iterations.
+*/
+void EigenDecomposition::tqli(double *d, double *e, int n, double **z) 
+{
+	int m,l,iter,i,k;
+	double s,r,p,g,f,dd,c,b;
+
+	/* renumber the off-diagonal so that e[i] couples d[i] and d[i+1] */
+	for (i=1;i<n;i++) e[i-1]=e[i];
+	e[n-1]=0.0;
+	for (l=0;l<n;l++) {
+		iter=0;
+		do {
+			/* look for a negligible off-diagonal element to split the matrix */
+			for (m=l;m<n-1;m++) {
+				dd=fabs(d[m])+fabs(d[m+1]);
+				if ((double)(fabs(e[m])+dd) == dd) break;
+			}
+			if (m != l) {
+				if (iter++ == 100) 
+					nrerror("Too many iterations in tqli");
+ 
+				/* form the shift */
+				g=(d[l+1]-d[l])/(2.0*e[l]);
+				r=pythag(g,1.0);
+				g=d[m]-d[l]+e[l]/(g+SIGN(r,g));
+				s=c=1.0;
+				p=0.0;
+				/* plane rotations restoring tridiagonal form */
+				for (i=m-1;i>=l;i--) {
+					f=s*e[i];
+					b=c*e[i];
+					e[i+1]=(r=pythag(f,g));
+					if (r == 0.0) {
+						/* recover from underflow */
+						d[i+1] -= p;
+						e[m]=0.0;
+						break;
+					}
+					s=f/r;
+					c=g/r;
+					g=d[i+1]-p;
+					r=(d[i]-g)*s+2.0*c*b;
+					d[i+1]=g+(p=s*r);
+					g=c*r-b;
+					/* apply the same rotation to the eigenvector columns */
+					for (k=0;k<n;k++) {
+						f=z[k][i+1];
+						z[k][i+1]=s*z[k][i]+c*f;
+						z[k][i]=c*z[k][i]-s*f;
+					}
+				}
+				/* the rotation loop was left early through the break above */
+				if (r == 0.0 && i >= l) continue;
+				d[l] -= p;
+				e[l]=g;
+				e[m]=0.0;
+			}
+		} while (m != l);
+	}
+}
+#undef SIGN
+#undef NRANSI
+
+/*
+	Reduce the n x n matrix a to Hessenberg form in place by elimination with
+	row/column interchanges. The loop counters m, i and j are 1-based and are
+	converted to array indices by subtracting 1. For every step m the 1-based
+	index of the chosen pivot row is recorded in ordr[m - 1]; the elimination
+	multipliers are left in a below the sub-diagonal for later use by eltran.
+*/
+void EigenDecomposition::elmhes(double **a, int *ordr, int n) {
+	int m, j, i;
+	double y, x;
+
+
+	for (i = 0; i < n; i++)
+		ordr[i] = 0;
+	for (m = 2; m < n; m++) {
+		/* find the entry of largest magnitude in column m-2, rows m-1..n-1 */
+		x = 0.0;
+		i = m;
+		for (j = m; j <= n; j++) {
+			if (fabs(a[j - 1][m - 2]) > fabs(x)) {
+				x = a[j - 1][m - 2];
+				i = j;
+			}
+		}
+		ordr[m - 1] = i;      /* vector */
+
+		if (i != m) {
+			/* interchange rows, then the matching columns */
+			for (j = m - 2; j < n; j++) {
+				y = a[i - 1][j];
+				a[i - 1][j] = a[m - 1][j];
+				a[m - 1][j] = y;
+			}
+			for (j = 0; j < n; j++) {
+				y = a[j][i - 1];
+				a[j][i - 1] = a[j][m - 1];
+				a[j][m - 1] = y;
+			}
+		}
+		if (x != 0.0) {
+			/* eliminate below the sub-diagonal, keeping the multiplier in place */
+			for (i = m; i < n; i++) {
+				y = a[i][m - 2];
+				if (y != 0.0) {
+					y /= x;
+					a[i][m - 2] = y;
+					for (j = m - 1; j < n; j++)
+						a[i][j] -= y * a[m - 1][j];
+					for (j = 0; j < n; j++)
+						a[j][m - 1] += y * a[j][i];
+				}
+			}
+		}
+	}
+} /* elmhes */
+
+
+/*
+	Build in zz the transformation matrix that corresponds to the reduction
+	done by elmhes: zz starts as the identity, then the multipliers stored in
+	a below the sub-diagonal and the interchanges recorded in ordr (1-based
+	row indices) are applied for m = n-1 down to 2. a and ordr are only read.
+*/
+void EigenDecomposition::eltran(double **a, double **zz, int *ordr, int n) {
+	int i, j, m;
+
+
+	/* zz = identity */
+	for (i = 0; i < n; i++) {
+		for (j = i + 1; j < n; j++) {
+			zz[i][j] = 0.0;
+			zz[j][i] = 0.0;
+		}
+		zz[i][i] = 1.0;
+	}
+	if (n <= 2)
+		return;
+	for (m = n - 1; m >= 2; m--) {
+		/* copy the multipliers of step m */
+		for (i = m; i < n; i++)
+			zz[i][m - 1] = a[i][m - 2];
+		i = ordr[m - 1];
+		if (i != m) {
+			/* replay the row interchange of step m */
+			for (j = m - 1; j < n; j++) {
+				zz[m - 1][j] = zz[i - 1][j];
+				zz[i - 1][j] = 0.0;
+			}
+			zz[i - 1][m - 1] = 1.0;
+		}
+	}
+} /* eltran */
+
+
+/*
+	Complex division: (*cr + i * *ci) = (ar + i * ai) / (br + i * bi).
+	Numerator and denominator are first divided by |br| + |bi|.
+*/
+void EigenDecomposition::mcdiv(double ar, double ai, double br, double bi,
+                  double *cr, double *ci) {
+	const double scale = fabs(br) + fabs(bi);
+	const double num_re = ar / scale;
+	const double num_im = ai / scale;
+	const double den_re = br / scale;
+	const double den_im = bi / scale;
+	const double den_norm = den_re * den_re + den_im * den_im;
+
+	*cr = (num_re * den_re + num_im * den_im) / den_norm;
+	*ci = (num_im * den_re - num_re * den_im) / den_norm;
+} /* mcdiv */
+
+
+/*
+	QR iteration on the n x n Hessenberg matrix h, producing eigenvalues and
+	eigenvectors.
+	low and hgh are 1-based bounds of the active block: rows/columns outside
+	low..hgh are treated as isolated roots whose eigenvalue is the diagonal entry.
+	en, na, l and m are 1-based as well and are converted with "- 1" on access.
+	wr / wi receive the real and imaginary parts of the eigenvalues.
+	zz is updated with every transformation and finally multiplied with the
+	back-substituted vectors, so it should hold the transformation matrix of
+	the preceding reduction on entry (presumably the output of eltran -- confirm).
+	h is destroyed. If the iteration budget of 30*n is used up, the function
+	prints a message and calls exit(1).
+	NOTE(review): the structure follows the EISPACK routine hqr2 -- confirm.
+*/
+void EigenDecomposition::hqr2(int n, int low, int hgh, double **h,
+                 double **zz, double *wr, double *wi) {
+	int i, j, k, l=0, m, en, na, itn, its;
+	double p=0, q=0, r=0, s=0, t, w, x=0, y, ra, sa, vi, vr, z=0, norm, tst1, tst2;
+	int notlas; /* boolean */
+
+
+	norm = 0.0;
+	k = 1;
+	/* store isolated roots and compute matrix norm */
+	for (i = 0; i < n; i++) {
+		for (j = k - 1; j < n; j++)
+			norm += fabs(h[i][j]);
+		k = i + 1;
+		if (i + 1 < low || i + 1 > hgh) {
+			wr[i] = h[i][i];
+			wi[i] = 0.0;
+		}
+	}
+	en = hgh;
+	t = 0.0;
+	itn = n * 30;
+	while (en >= low) {    /* search for next eigenvalues */
+		its = 0;
+		na = en - 1;
+		while (en >= 1) {
+			/* look for single small sub-diagonal element */
+			for (l = en; l > low; l--) {
+				s = fabs(h[l - 2][l - 2]) + fabs(h[l - 1][l - 1]);
+
+				if (s == 0.0)
+					s = norm;
+				tst1 = s;
+				tst2 = tst1 + fabs(h[l - 1][l - 2]);
+				if (tst2 == tst1)
+					goto L100;
+			}
+			l = low;
+		L100:
+			x = h[en - 1][en - 1];    /* form shift */
+			/* one root (l == en) or a pair (l == na) has converged */
+			if (l == en || l == na)
+				break;
+			if (itn == 0) {
+				/* all eigenvalues have not converged */
+				cout << "\n\n\nHALT: PLEASE REPORT ERROR B TO DEVELOPERS\n\n\n";
+				exit(1);
+			}
+			y = h[na - 1][na - 1];
+			w = h[en - 1][na - 1] * h[na - 1][en - 1];
+			/* form exceptional shift */
+			if (its == 10 || its == 20) {
+				t += x;
+				for (i = low - 1; i < en; i++)
+					h[i][i] -= x;
+				s = fabs(h[en - 1][na - 1]) + fabs(h[na - 1][en - 3]);
+				x = 0.75 * s;
+				y = x;
+				w = -0.4375 * s * s;
+			}
+			its++;
+			itn--;
+			/* look for two consecutive small sub-diagonal elements */
+			for (m = en - 2; m >= l; m--) {
+				z = h[m - 1][m - 1];
+				r = x - z;
+				s = y - z;
+				p = (r * s - w) / h[m][m - 1] + h[m - 1][m];
+				q = h[m][m] - z - r - s;
+				r = h[m + 1][m];
+				s = fabs(p) + fabs(q) + fabs(r);
+				p /= s;
+				q /= s;
+				r /= s;
+				if (m == l)
+					break;
+				tst1 = fabs(p) *
+				       (fabs(h[m - 2][m - 2]) + fabs(z) + fabs(h[m][m]));
+				tst2 = tst1 + fabs(h[m - 1][m - 2]) * (fabs(q) + fabs(r));
+				if (tst2 == tst1)
+					break;
+			}
+			for (i = m + 2; i <= en; i++) {
+				h[i - 1][i - 3] = 0.0;
+				if (i != m + 2)
+					h[i - 1][i - 4] = 0.0;
+			}
+			/* double QR step on rows l..en and columns m..en */
+			for (k = m; k <= na; k++) {
+				notlas = (k != na);
+				if (k != m) {
+					p = h[k - 1][k - 2];
+					q = h[k][k - 2];
+					r = 0.0;
+					if (notlas)
+						r = h[k + 1][k - 2];
+					x = fabs(p) + fabs(q) + fabs(r);
+					if (x != 0.0) {
+						p /= x;
+						q /= x;
+						r /= x;
+					}
+				}
+				if (x != 0.0) {
+					if (p < 0.0) /* sign */
+						s = - sqrt(p * p + q * q + r * r);
+					else
+						s = sqrt(p * p + q * q + r * r);
+					if (k != m)
+						h[k - 1][k - 2] = -s * x;
+					else {
+						if (l != m)
+							h[k - 1][k - 2] = -h[k - 1][k - 2];
+					}
+					p += s;
+					x = p / s;
+					y = q / s;
+					z = r / s;
+					q /= p;
+					r /= p;
+					if (!notlas) {
+						for (j = k - 1; j < n; j++) {    /* row modification */
+							p = h[k - 1][j] + q * h[k][j];
+							h[k - 1][j] -= p * x;
+							h[k][j] -= p * y;
+						}
+						j = (en < (k + 3)) ? en : (k + 3); /* min */
+						for (i = 0; i < j; i++) {    /* column modification */
+							p = x * h[i][k - 1] + y * h[i][k];
+							h[i][k - 1] -= p;
+							h[i][k] -= p * q;
+						}
+						/* accumulate transformations */
+						for (i = low - 1; i < hgh; i++) {
+							p = x * zz[i][k - 1] + y * zz[i][k];
+							zz[i][k - 1] -= p;
+							zz[i][k] -= p * q;
+						}
+					} else {
+						for (j = k - 1; j < n; j++) {    /* row modification */
+							p = h[k - 1][j] + q * h[k][j] + r * h[k + 1][j];
+							h[k - 1][j] -= p * x;
+							h[k][j] -= p * y;
+							h[k + 1][j] -= p * z;
+						}
+						j = (en < (k + 3)) ? en : (k + 3); /* min */
+						for (i = 0; i < j; i++) {    /* column modification */
+							p = x * h[i][k - 1] + y * h[i][k] + z * h[i][k + 1];
+							h[i][k - 1] -= p;
+							h[i][k] -= p * q;
+							h[i][k + 1] -= p * r;
+						}
+						/* accumulate transformations */
+						for (i = low - 1; i < hgh; i++) {
+							p = x * zz[i][k - 1] + y * zz[i][k] +
+							    z * zz[i][k + 1];
+							zz[i][k - 1] -= p;
+							zz[i][k] -= p * q;
+							zz[i][k + 1] -= p * r;
+						}
+					}
+				}
+			}           /* for k */
+		}               /* while infinite loop */
+		if (l == en) {           /* one root found */
+			h[en - 1][en - 1] = x + t;
+			wr[en - 1] = h[en - 1][en - 1];
+			wi[en - 1] = 0.0;
+			en = na;
+			continue;
+		}
+		/* two roots found: solve the trailing 2 x 2 block */
+		y = h[na - 1][na - 1];
+		w = h[en - 1][na - 1] * h[na - 1][en - 1];
+		p = (y - x) / 2.0;
+		q = p * p + w;
+		z = sqrt(fabs(q));
+		h[en - 1][en - 1] = x + t;
+		x = h[en - 1][en - 1];
+		h[na - 1][na - 1] = y + t;
+		if (q >= 0.0) {           /* real pair */
+			if (p < 0.0) /* sign */
+				z = p - fabs(z);
+			else
+				z = p + fabs(z);
+			wr[na - 1] = x + z;
+			wr[en - 1] = wr[na - 1];
+			if (z != 0.0)
+				wr[en - 1] = x - w / z;
+			wi[na - 1] = 0.0;
+			wi[en - 1] = 0.0;
+			x = h[en - 1][na - 1];
+			s = fabs(x) + fabs(z);
+			p = x / s;
+			q = z / s;
+			r = sqrt(p * p + q * q);
+			p /= r;
+			q /= r;
+			for (j = na - 1; j < n; j++) {    /* row modification */
+				z = h[na - 1][j];
+				h[na - 1][j] = q * z + p * h[en - 1][j];
+				h[en - 1][j] = q * h[en - 1][j] - p * z;
+			}
+			for (i = 0; i < en; i++) {    /* column modification */
+				z = h[i][na - 1];
+				h[i][na - 1] = q * z + p * h[i][en - 1];
+				h[i][en - 1] = q * h[i][en - 1] - p * z;
+			}
+			/* accumulate transformations */
+			for (i = low - 1; i < hgh; i++) {
+				z = zz[i][na - 1];
+				zz[i][na - 1] = q * z + p * zz[i][en - 1];
+				zz[i][en - 1] = q * zz[i][en - 1] - p * z;
+			}
+		} else {           /* complex pair */
+			wr[na - 1] = x + p;
+			wr[en - 1] = x + p;
+			wi[na - 1] = z;
+			wi[en - 1] = -z;
+		}
+		en -= 2;
+	}                   /* while en >= low */
+	/* backsubstitute to find vectors of upper triangular form */
+	if (norm != 0.0) {
+		for (en = n; en >= 1; en--) {
+			p = wr[en - 1];
+			q = wi[en - 1];
+			na = en - 1;
+			if (q == 0.0) {/* real vector */
+				m = en;
+				h[en - 1][en - 1] = 1.0;
+				if (na != 0) {
+					for (i = en - 2; i >= 0; i--) {
+						w = h[i][i] - p;
+						r = 0.0;
+						for (j = m - 1; j < en; j++)
+							r += h[i][j] * h[j][en - 1];
+						if (wi[i] < 0.0) {
+							z = w;
+							s = r;
+						} else {
+							m = i + 1;
+							if (wi[i] == 0.0) {
+								t = w;
+								if (t == 0.0) {
+									/* replace a zero divisor by a tiny multiple of norm */
+									tst1 = norm;
+									t = tst1;
+									do {
+										t = 0.01 * t;
+										tst2 = norm + t;
+									} while (tst2 > tst1);
+								}
+								h[i][en - 1] = -(r / t);
+							} else {    /* solve real equations */
+								x = h[i][i + 1];
+								y = h[i + 1][i];
+								q = (wr[i] - p) * (wr[i] - p) + wi[i] * wi[i];
+								t = (x * s - z * r) / q;
+								h[i][en - 1] = t;
+								if (fabs(x) > fabs(z))
+									h[i + 1][en - 1] = (-r - w * t) / x;
+								else
+									h[i + 1][en - 1] = (-s - y * t) / z;
+							}
+							/* overflow control */
+							t = fabs(h[i][en - 1]);
+							if (t != 0.0) {
+								tst1 = t;
+								tst2 = tst1 + 1.0 / tst1;
+								if (tst2 <= tst1) {
+									for (j = i; j < en; j++)
+										h[j][en - 1] /= t;
+								}
+							}
+						}
+					}
+				}
+			} else if (q > 0.0) {
+				/* complex vector; handled once per pair, at the root with positive imaginary part */
+				m = na;
+				if (fabs(h[en - 1][na - 1]) > fabs(h[na - 1][en - 1])) {
+					h[na - 1][na - 1] = q / h[en - 1][na - 1];
+					h[na - 1][en - 1] = (p - h[en - 1][en - 1]) /
+					                    h[en - 1][na - 1];
+				} else
+					mcdiv(0.0, -h[na - 1][en - 1], h[na - 1][na - 1] - p, q,
+					      &h[na - 1][na - 1], &h[na - 1][en - 1]);
+				h[en - 1][na - 1] = 0.0;
+				h[en - 1][en - 1] = 1.0;
+				if (en != 2) {
+					for (i = en - 3; i >= 0; i--) {
+						w = h[i][i] - p;
+						ra = 0.0;
+						sa = 0.0;
+						for (j = m - 1; j < en; j++) {
+							ra += h[i][j] * h[j][na - 1];
+							sa += h[i][j] * h[j][en - 1];
+						}
+						if (wi[i] < 0.0) {
+							z = w;
+							r = ra;
+							s = sa;
+						} else {
+							m = i + 1;
+							if (wi[i] == 0.0)
+								mcdiv(-ra, -sa, w, q, &h[i][na - 1],
+								      &h[i][en - 1]);
+							else {    /* solve complex equations */
+								x = h[i][i + 1];
+								y = h[i + 1][i];
+								vr = (wr[i] - p) * (wr[i] - p);
+								vr = vr + wi[i] * wi[i] - q * q;
+								vi = (wr[i] - p) * 2.0 * q;
+								if (vr == 0.0 && vi == 0.0) {
+									tst1 = norm * (fabs(w) + fabs(q) + fabs(x) +
+									               fabs(y) + fabs(z));
+									vr = tst1;
+									do {
+										vr = 0.01 * vr;
+										tst2 = tst1 + vr;
+									} while (tst2 > tst1);
+								}
+								mcdiv(x * r - z * ra + q * sa,
+								      x * s - z * sa - q * ra, vr, vi,
+								      &h[i][na - 1], &h[i][en - 1]);
+								if (fabs(x) > fabs(z) + fabs(q)) {
+									h[i + 1]
+									[na - 1] = (q * h[i][en - 1] -
+									            w * h[i][na - 1] - ra) / x;
+									h[i + 1][en - 1] = (-sa - w * h[i][en - 1] -
+									                    q * h[i][na - 1]) / x;
+								} else
+									mcdiv(-r - y * h[i][na - 1],
+									      -s - y * h[i][en - 1], z, q,
+									      &h[i + 1][na - 1], &h[i + 1][en - 1]);
+							}
+							/* overflow control */
+							t = (fabs(h[i][na - 1]) > fabs(h[i][en - 1])) ?
+							    fabs(h[i][na - 1]) : fabs(h[i][en - 1]);
+							if (t != 0.0) {
+								tst1 = t;
+								tst2 = tst1 + 1.0 / tst1;
+								if (tst2 <= tst1) {
+									for (j = i; j < en; j++) {
+										h[j][na - 1] /= t;
+										h[j][en - 1] /= t;
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+		/* end back substitution. vectors of isolated roots */
+		for (i = 0; i < n; i++) {
+			if (i + 1 < low || i + 1 > hgh) {
+				for (j = i; j < n; j++)
+					zz[i][j] = h[i][j];
+			}
+		}
+		/* multiply by transformation matrix to give vectors of
+		 * original full matrix. */
+		for (j = n - 1; j >= low - 1; j--) {
+			m = ((j + 1) < hgh) ? (j + 1) : hgh; /* min */
+			for (i = low - 1; i < hgh; i++) {
+				z = 0.0;
+				for (k = low - 1; k < m; k++)
+					z += zz[i][k] * h[k][j];
+				zz[i][j] = z;
+			}
+		}
+	}
+	return;
+} /* hqr2 */
+
+/*
+	Invert the size x size matrix inmat by LU decomposition with implicitly
+	scaled partial pivoting, followed by one forward/back substitution per
+	column of the identity. inmat is not modified (a private copy is
+	decomposed); the inverse is written to imtrx.
+	A row consisting only of zeros is reported and the program exits with
+	status 1; a pivot that is exactly zero is replaced by eps.
+*/
+void EigenDecomposition::luinverse(double **inmat, double **imtrx, int size) {
+	double eps = 1.0e-20; /* ! */
+	int i, j, k, l, maxi=0, idx, ix, jx;
+	double sum, tmp, maxb;
+	int *index = new int[size];
+	/* per-row scaling first, then the right-hand side / solution vector;
+	   every entry is assigned before it is read */
+	double *wk = new double[size];
+	/* table of row pointers: allocate double*, not double */
+	double **omtrx = new double*[size];
+
+	for (i = 0; i < size; i++)
+		omtrx[i] = new double[size];
+
+	/* copy inmat to omtrx */
+	for (i = 0; i < size; i++)
+		for (j = 0; j < size; j++)
+			omtrx[i][j] = inmat[i][j];
+
+	/* wk[i] = 1 / (largest magnitude in row i) */
+	for (i = 0; i < size; i++) {
+		maxb = 0.0;
+		for (j = 0; j < size; j++) {
+			if (fabs(omtrx[i][j]) > maxb)
+				maxb = fabs(omtrx[i][j]);
+		}
+		if (maxb == 0.0) {
+			/* Singular matrix */
+			cout << "\n\n\nHALT: PLEASE REPORT ERROR C TO DEVELOPERS\n\n\n";
+			exit(1);
+		}
+		wk[i] = 1.0 / maxb;
+	}
+	/* column-wise LU decomposition */
+	for (j = 0; j < size; j++) {
+		for (i = 0; i < j; i++) {
+			sum = omtrx[i][j];
+			for (k = 0; k < i; k++)
+				sum -= omtrx[i][k] * omtrx[k][j];
+			omtrx[i][j] = sum;
+		}
+		maxb = 0.0;
+		for (i = j; i < size; i++) {
+			sum = omtrx[i][j];
+			for (k = 0; k < j; k++)
+				sum -= omtrx[i][k] * omtrx[k][j];
+			omtrx[i][j] = sum;
+			tmp = wk[i] * fabs(sum);
+			if (tmp >= maxb) {
+				maxb = tmp;
+				maxi = i;
+			}
+		}
+		if (j != maxi) {
+			/* bring the pivot row into position j */
+			for (k = 0; k < size; k++) {
+				tmp = omtrx[maxi][k];
+				omtrx[maxi][k] = omtrx[j][k];
+				omtrx[j][k] = tmp;
+			}
+			wk[maxi] = wk[j];
+		}
+		index[j] = maxi;
+		if (omtrx[j][j] == 0.0)
+			omtrx[j][j] = eps;
+		if (j != size - 1) {
+			tmp = 1.0 / omtrx[j][j];
+			for (i = j + 1; i < size; i++)
+				omtrx[i][j] *= tmp;
+		}
+	}
+	/* solve L U x = e_jx for every unit vector e_jx */
+	for (jx = 0; jx < size; jx++) {
+		for (ix = 0; ix < size; ix++)
+			wk[ix] = 0.0;
+		wk[jx] = 1.0;
+		l = -1; /* index of the first non-zero entry of the permuted right-hand side */
+		for (i = 0; i < size; i++) {
+			idx = index[i];
+			sum = wk[idx];
+			wk[idx] = wk[i];
+			if (l != -1) {
+				for (j = l; j < i; j++)
+					sum -= omtrx[i][j] * wk[j];
+			} else if (sum != 0.0)
+				l = i;
+			wk[i] = sum;
+		}
+		for (i = size - 1; i >= 0; i--) {
+			sum = wk[i];
+			for (j = i + 1; j < size; j++)
+				sum -= omtrx[i][j] * wk[j];
+			wk[i] = sum / omtrx[i][i];
+		}
+		for (ix = 0; ix < size; ix++)
+			imtrx[ix][jx] = wk[ix];
+	}
+	delete [] wk;
+	for (i = size-1; i >= 0; i--)
+		delete [] omtrx[i];
+	delete [] omtrx;
+	delete [] index;
+} /* luinverse */
+
+/*
+	Sanity check: multiply evec by ivec (both nn x nn, flat, indexed
+	[row*nn+col]) and print a warning if the product deviates from the
+	identity matrix by more than 1e-5 in any entry.
+*/
+void EigenDecomposition::checkevector(double *evec, double *ivec, int nn) {
+	int i, j, ia, ib, ic, error;
+	/* table of row pointers: allocate double*, not double */
+	double **matx = new double*[nn];
+	double sum;
+
+	for (i = 0; i < nn; i++)
+		matx[i] = new double[nn];
+
+	/* multiply matrix of eigenvectors and its inverse */
+	for (ia = 0; ia < nn; ia++) {
+		for (ic = 0; ic < nn; ic++) {
+			sum = 0.0;
+			for (ib = 0; ib < nn; ib++) sum += evec[ia*nn+ib] * ivec[ib*nn+ic];
+			matx[ia][ic] = sum;
+		}
+	}
+	/* check whether the identity matrix is obtained */
+	error = 0;
+	for (i = 0; i < nn; i++) {
+		for (j = 0; j < nn; j++) {
+			if (i == j) {
+				if (fabs(matx[i][j] - 1.0) > 1.0e-5)
+					error = 1;
+			} else {
+				if (fabs(matx[i][j]) > 1.0e-5)
+					error = 1;
+			}
+		}
+	}
+	if (error) {
+		cout << "\nWARNING: Inversion of eigenvector matrix not perfect!\n";
+	}
+
+	for (i = nn-1; i >= 0; i--)
+		delete [] matx[i];
+	delete [] matx;
+} /* checkevector */
diff --git a/eigendecomposition.h b/eigendecomposition.h
new file mode 100644
index 0000000..2f62f5f
--- /dev/null
+++ b/eigendecomposition.h
@@ -0,0 +1,185 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef EIGENDECOMPOSITION_H
+#define EIGENDECOMPOSITION_H
+
+/**
+Eigenvalues, eigenvectors decomposition
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class EigenDecomposition{
+public:
+    EigenDecomposition();
+
+    ~EigenDecomposition();
+
+	/**
+		EigenSystem for symmetric matrix
+		@param rate_params rate parameters (not the rate matrix)
+		@param state_freq state frequencies
+		@param eval (OUT) eigenvalues
+		@param evec (OUT) eigenvectors, flat array indexed [i*num_state+j]
+		@param inv_evec (OUT) inverse matrix of eigenvectors, same flat layout
+		@param num_state (IN) number of states
+	*/
+	void eigensystem_sym(double **rate_params, double *state_freq, 
+	double *eval, double *evec, double *inv_evec, int num_state);
+
+	/**
+		EigenSystem for general non-symmetric matrix
+		@param rate_params rate parameters (not the rate matrix)
+		@param state_freq state frequencies
+		@param eval (OUT) eigenvalues
+		@param evec (OUT) eigenvectors
+		@param inv_evec (OUT) inverse matrix of eigenvectors
+		@param num_state (IN) number of states
+	*/
+	void eigensystem(double **rate_params, double *state_freq, 
+	double *eval, double **evec, double **inv_evec, int num_state);
+
+	/**
+		EigenSystem for general non-symmetric matrix without state frequencies
+		@param rate_params rate parameters (not the rate matrix)
+		@param eval (OUT) eigenvalues
+		@param evec (OUT) eigenvectors
+		@param inv_evec (OUT) inverse matrix of eigenvectors
+		@param num_state (IN) number of states
+	*/
+    void eigensystem(double **rate_params, double *eval, double **evec, double **inv_evec, int num_state);
+
+
+	/** TRUE to rescale the rate matrix so that the expected rate equals total_num_subst */
+	bool normalize_matrix;
+
+	/**
+		the total number of substitutions per unit time
+		(target of the rescaling done when normalize_matrix is TRUE)
+	*/
+	double total_num_subst;
+	
+    /** TRUE to ignore state_freq in computation, default: FALSE */
+    bool ignore_state_freq;
+
+
+protected:
+
+	/**
+		compute the rate matrix and, if normalize_matrix is set, rescale it
+		such that the expected number of substitutions is total_num_subst.
+		@param rate_matrix (IN/OUT) As input, it contains rate parameters. On output it is filled with rate matrix entries
+		@param state_freq state frequencies
+		@param num_state number of states
+	*/
+	void computeRateMatrix(double **rate_matrix, double *state_freq, int num_state);
+
+	/**
+		Eliminate the states whose frequency is (close to) zero.
+		Return the new matrix with possibly reduced dimension.
+		If no state is eliminated, new_mat is left untouched.
+		@param mat input rate matrix
+		@param forg state frequencies
+		@param num number of states
+		@param new_mat (OUT) the new rate matrix
+		@param new_forg (OUT) new state frequencies
+		@param new_num (OUT) new number of states
+	*/
+	void eliminateZero(double **mat, double *forg, int num, 
+		double **new_mat, double *new_forg, int &new_num);
+
+/*********************************************************
+* aided function for symmetric matrix
+*********************************************************/
+
+	/**
+		transform the rate matrix into symmetric form, used for subsequent eigen decomposition
+		@param a (IN/OUT) rate matrix
+		@param stateFrq state frequencies
+		@param stateFrq_sqrt (OUT) square root of state frequencies
+		@param num_state number of states
+	*/
+	void symmetrizeRateMatrix(double **a, double *stateFrq, double *stateFrq_sqrt, int num_state);
+
+
+	/**
+		Householder transformation of symmetric matrix A
+		to tridiagonal form
+		@param a the input matrix, must be symmetric. On output,
+			a is replaced by the orthogonal matrix effecting the transformation
+		@param  n the size of matrix a
+		@param d [0..n-1] returned the diagonal elements of the tridiagonal matrix
+		@param e [0..n-1] returned the off-diagonal elements with e[0]=0
+	*/
+	void tred2(double **a, int n, double *d, double *e);
+
+	/**
+		QL algorithm with implicit shifts to determine eigenvalues and
+		eigenvectors of a real tridiagonal symmetric matrix.
+		@param d [0..n-1] diagonal elements of the tridiagonal matrix.
+			On output d return the eigenvalues.
+		@param e [0..n-1] off-diagonal elements of the tridiagonal matrix, e[0] arbitrary.
+			On output e is destroyed.
+		@param n matrix size
+		@param z must be input as the matrix returned by tred2
+			On output column k of z (z[0..n-1][k]) is the normalized
+			eigenvector corresponding to d[k]
+	*/
+	void tqli(double *d, double *e, int n, double **z);
+
+/*********************************************************
+* aided function for non-symmetric matrix
+*********************************************************/
+
+	/**
+		convert a non-symmetric matrix into Hessenberg form with zeros everywhere
+		below the diagonal except for the first sub-diagonal row
+		@param a (IN-OUT) the matrix
+		@param ordr (OUT) for every elimination step, the 1-based index of the pivot row
+		@param n (IN) size of matrix
+	*/
+	void elmhes(double **a, int *ordr, int n);
+
+	/**
+		accumulate the transformations performed by elmhes
+		@param a (IN) the matrix as left by elmhes (multipliers below the sub-diagonal)
+		@param zz (OUT) the transformation matrix
+		@param ordr (IN) the pivot rows recorded by elmhes
+		@param n (IN) size of matrix
+	*/
+	void eltran(double **a, double **zz, int *ordr, int n);
+
+	/**
+		complex division (cr + i*ci) = (ar + i*ai) / (br + i*bi)
+		@param ar, ai (IN) real and imaginary part of the numerator
+		@param br, bi (IN) real and imaginary part of the denominator
+		@param cr, ci (OUT) real and imaginary part of the quotient
+	*/
+	void mcdiv(double ar, double ai, double br, double bi,
+	           double *cr, double *ci);
+
+	/**
+		QR algorithm for non-symmetric matrix to calculate eigenvectors and eigenvalues
+		of a Hessenberg matrix (should be preceded by elmhes function)
+		@param n (IN) size of matrix
+		@param low (IN) 1-based index of the first row/column of the active block
+		@param hgh (IN) 1-based index of the last row/column of the active block
+		@param h (IN-OUT) the Hessenberg matrix, destroyed on output
+		@param zz (IN-OUT) transformation matrix on input, eigenvectors on output
+		@param wr (OUT) real parts of the eigenvalues
+		@param wi (OUT) imaginary parts of the eigenvalues
+	*/
+	void hqr2(int n, int low, int hgh, double **h, double **zz, double *wr, double *wi);
+
+	/**
+		compute the inverse of a square matrix
+		@param inmat (IN) the matrix
+		@param imtrx (OUT) the inverse of the input matrix
+		@param size the size of matrix
+	*/
+	void luinverse(double **inmat, double **imtrx, int size);
+
+	/**
+		print a warning if evec * ivec is not (numerically) the identity matrix
+		@param evec (IN) eigenvector matrix, flat array indexed [i*nn+j]
+		@param ivec (IN) its supposed inverse, same layout
+		@param nn (IN) size of matrix
+	*/
+	void checkevector(double *evec, double *ivec, int nn);
+
+};
+
+#endif
diff --git a/fmemopen.c b/fmemopen.c
new file mode 100644
index 0000000..bfd3c65
--- /dev/null
+++ b/fmemopen.c
@@ -0,0 +1,147 @@
+//
+// Copyright 2012 Jeff Verkoeyen
+// Originally ported from https://github.com/ingenuitas/python-tesseract/blob/master/fmemopen.c
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#if defined __APPLE__ || defined __MACH__
+
+
+/*--------------------------------------------------------------*/
+/* portable version for fmemopen for MAC OSX */
+/*--------------------------------------------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+/* Bookkeeping for one memory-backed stream; passed to the funopen callbacks as their cookie. */
+struct fmem {
+  size_t pos;    /* current read/write offset into buffer */
+  size_t size;   /* total number of bytes available in buffer */
+  char *buffer;  /* caller-supplied backing storage; closefn does not free it */
+};
+typedef struct fmem fmem_t;
+
+/* funopen read callback: copy up to `size` bytes from the current position
+   into buf, advance the position and return the number of bytes copied. */
+static int readfn(void *handler, char *buf, int size) {
+  fmem_t *mem = handler;
+  size_t remaining = mem->size - mem->pos;
+  const char *src = mem->buffer + mem->pos;
+
+  /* never read past the end of the backing buffer */
+  if ((size_t)size > remaining)
+    size = remaining;
+
+  memcpy(buf, src, sizeof(char) * size);
+  mem->pos += size;
+  return size;
+}
+
+/* funopen write callback: copy up to `size` bytes from buf to the current
+   position, advance the position and return the number of bytes stored. */
+static int writefn(void *handler, const char *buf, int size) {
+  fmem_t *mem = handler;
+  size_t remaining = mem->size - mem->pos;
+  char *dst = mem->buffer + mem->pos;
+
+  /* never write past the end of the backing buffer */
+  if ((size_t)size > remaining)
+    size = remaining;
+
+  memcpy(dst, buf, sizeof(char) * size);
+  mem->pos += size;
+  return size;
+}
+
+/* funopen seek callback: move the position relative to start, current
+   position or end. Returns the new position, or -1 for an unknown `whence`
+   or a target beyond the end of the buffer. */
+static fpos_t seekfn(void *handler, fpos_t offset, int whence) {
+  fmem_t *mem = handler;
+  size_t target;
+
+  if (whence == SEEK_SET) {
+    target = offset;
+  } else if (whence == SEEK_CUR) {
+    target = mem->pos + offset;
+  } else if (whence == SEEK_END) {
+    target = mem->size + offset;
+  } else {
+    return -1;
+  }
+
+  if (target > mem->size) {
+    return -1;
+  }
+
+  mem->pos = target;
+  return (fpos_t)target;
+}
+
+/* funopen close callback: releases the cookie passed as handler and reports success. */
+static int closefn(void *handler) {
+  free(handler);
+  return 0;
+}
+
+/*
+ * Open a stream backed by the caller's buffer, implemented with funopen.
+ *
+ * buf   backing storage of at least `size` bytes; it is not copied and not freed
+ * size  number of usable bytes in buf
+ * mode  accepted for interface compatibility; not used by this implementation
+ *
+ * Returns the stream, or NULL if the bookkeeping record cannot be allocated
+ * or funopen fails. The record is released by closefn on fclose.
+ */
+FILE *fmemopen(void *buf, size_t size, const char *mode) {
+  (void)mode;
+
+  // calloc zero-fills the record, so pos starts at 0.
+  fmem_t *mem = calloc(1, sizeof *mem);
+  if (mem == NULL) {
+    return NULL;
+  }
+
+  mem->size = size;
+  mem->buffer = buf;
+
+  // funopen's man page: https://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/funopen.3.html
+  FILE *stream = funopen(mem, readfn, writefn, seekfn, closefn);
+  if (stream == NULL) {
+    // closefn will never run for this record, release it here.
+    free(mem);
+  }
+  return stream;
+}
+
+#elif defined WIN32 || defined _WIN32 || defined __WIN32__
+
+/*--------------------------------------------------------------*/
+/* SLOW portable version for fmemopen for WIN32 (using temp file) */
+/*--------------------------------------------------------------*/
+#include <windows.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Emulate fmemopen with a temporary file: the `size` bytes at buf are written
+ * to a fresh temp file, which is then reopened with `mode` and returned.
+ * Changes made through the returned stream are NOT reflected in buf.
+ * Returns NULL when the temp file cannot be created, written or reopened.
+ * NOTE(review): the temporary file is never deleted -- consider removing it
+ * once the stream is closed.
+ */
+FILE *fmemopen(void *buf, size_t size, const char *mode) {
+    char temppath[MAX_PATH - 13];
+    if (0 == GetTempPath(sizeof(temppath), temppath)) {
+        puts("Can't get temp path");
+        return NULL;
+    }
+    char filename[MAX_PATH + 1];
+    if (0 == GetTempFileName(temppath, "IQT", 0, filename)) {
+        puts("Can't get file name");
+        return NULL;
+    }
+    printf("file::%s\n",filename);
+
+    /* dump the buffer into the temp file */
+    FILE *f = NULL;
+    if (fopen_s(&f, filename, "wb") != 0 || f == NULL) {
+        printf( "The file '%s' was not opened\n", filename );
+        return NULL;
+    }
+    size_t written = fwrite(buf, 1, size, f);
+    if (fclose(f) != 0 || written != size)
+        return NULL;
+
+    /* reopen with the mode requested by the caller */
+    FILE *f2 = NULL;
+    if (fopen_s(&f2, filename, mode) != 0)
+        return NULL;
+    return f2;
+}
+
+
+#endif
diff --git a/fmemopen.h b/fmemopen.h
new file mode 100644
index 0000000..b7ec325
--- /dev/null
+++ b/fmemopen.h
@@ -0,0 +1,60 @@
+//
+// Copyright 2012 Jeff Verkoeyen
+// Originally ported from https://github.com/ingenuitas/python-tesseract/blob/master/fmemopen.c
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef FMEMOPEN_H_
+#define FMEMOPEN_H_
+
+/* only define fmemopen for MAC OSX and WIN32 */
+#if defined __APPLE__ || defined __MACH__ || defined WIN32 || defined _WIN32 || defined __WIN32__
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * A BSD port of the fmemopen Linux method using funopen; WIN32 builds get a slower temp-file emulation.
+ *
+ * man docs for fmemopen:
+ * http://linux.die.net/man/3/fmemopen
+ *
+ * man docs for funopen:
+ * https://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/funopen.3.html
+ *
+ * This method is ported from ingenuitas' python-tesseract project.
+ *
+ * You must call fclose on the returned file pointer or memory will be leaked.
+ *
+ *      @param buf The data that will be used to back the FILE* methods. Must be at least
+ *                 @c size bytes.
+ *      @param size The size of the @c buf data.
+ *      @param mode The permitted stream operation modes.
+ *      @returns A pointer that can be used in the fread/fwrite/fseek/fclose family of methods.
+ *               If a failure occurred NULL will be returned.
+ */
+FILE *fmemopen(void *buf, size_t size, const char *mode);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif // #ifndef FMEMOPEN_H_
diff --git a/graph.cpp b/graph.cpp
new file mode 100644
index 0000000..6deae86
--- /dev/null
+++ b/graph.cpp
@@ -0,0 +1,63 @@
+/*
+ * graph.cpp
+ *
+ *  Created on: Nov 14, 2013
+ *      Author: olga
+ */
+
+#include "graph.h"
+#include <iostream>
+#include <list>
+#include <limits.h>
+
+Graph::Graph(int V)
+    : V(V),                    // number of vertices
+      adj(new list<int>[V])    // one empty adjacency list per vertex
+{}
+
+void Graph::addEdge(int v, int w){ // records the directed edge v -> w (v is not range-checked)
+    adj[v].push_back(w); // Add w to v's list.
+}
+// DFS helper for isCyclic(): true when a cycle is reachable from v.
+bool Graph::isCyclicUtil(int v, bool visited[], bool *recStack){
+    if (!visited[v])
+    {
+        // v is now visited and sits on the current DFS path
+        visited[v] = true;
+        recStack[v] = true;
+
+        // Examine every successor of v
+        for (list<int>::iterator succ = adj[v].begin(); succ != adj[v].end(); ++succ)
+        {
+            const int w = *succ;
+            // descend into unvisited successors first; after that (or if w was
+            // already visited) an edge into the current DFS path closes a cycle
+            const bool closes = (!visited[w] && isCyclicUtil(w, visited, recStack)) || recStack[w];
+            if (closes)
+                return true;
+        }
+    }
+    recStack[v] = false;  // v leaves the DFS path
+    return false;
+}
+
+// Returns true if the graph contains a cycle, else false.
+bool Graph::isCyclic()
+{
+    // Scratch arrays for the DFS; new bool[V]() value-initialises every entry
+    // to false: nothing visited, nothing on the recursion stack.
+    bool *visited = new bool[V]();
+    bool *recStack = new bool[V]();
+
+    // Run the recursive helper from every vertex so that all DFS trees are
+    // covered, stopping at the first cycle.
+    bool cyclic = false;
+    for(int i = 0; i < V && !cyclic; i++)
+        cyclic = isCyclicUtil(i, visited, recStack);
+
+    // Free the scratch arrays on every path so that no call leaks them.
+    delete[] visited;
+    delete[] recStack;
+
+    return cyclic;
+}
diff --git a/graph.h b/graph.h
new file mode 100644
index 0000000..5e4e90c
--- /dev/null
+++ b/graph.h
@@ -0,0 +1,29 @@
+/*
+ * graph.h
+ *
+ *  Created on: Nov 14, 2013
+ *      Author: olga
+ */
+
+#include <iostream>
+#include <list>
+#include <limits.h>
+
+#ifndef GRAPH_H_
+#define GRAPH_H_
+
+using namespace std;
+
+class Graph // directed graph stored as one adjacency list per vertex, with cycle detection
+{
+    int V;    			// No. of vertices
+    list<int> *adj;		// Array of V adjacency lists; NOTE(review): no destructor is declared, so it is never freed
+    bool isCyclicUtil(int v, bool visited[], bool *rs);  // recursive DFS step used by isCyclic()
+
+public:
+    Graph(int V);   // Constructor: allocates V empty adjacency lists
+    void addEdge(int v, int w);   // appends w to the adjacency list of v
+    bool isCyclic();    // returns true if there is a cycle in this graph
+};
+
+#endif
diff --git a/greedy.cpp b/greedy.cpp
new file mode 100644
index 0000000..1e05ec6
--- /dev/null
+++ b/greedy.cpp
@@ -0,0 +1,202 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "greedy.h"
+
+/*********************************************
+	class Greedy
+*********************************************/
+/**
+	run the algorithm: seed the subtree from initialset, then extend it greedily; results go to taxa_set
+*/
+void Greedy::run(Params &params, vector<PDTaxaSet> &taxa_set)
+{
+	Node *node1, *node2;
+	NodeVector subtree; // NOTE(review): hides the member `subtree` declared in greedy.h
+	subtree.resize(nodeNum, NULL);
+
+	//if (params.is_rooted) subsize++;
+
+	if (params.min_size < 2) // then only one set (of size sub_size) is produced, see resize below
+		params.min_size = params.sub_size;
+
+	taxa_set.resize(params.sub_size - params.min_size + 1);
+
+	if (initialset.empty()) {
+		taxa_set[0].score = root->longestPath2(node1, node2);
+		root = node1;
+		// initialize the subtree length
+		subtree[root->id] = root;
+		// update the current PD set
+		taxa_set[0].push_back(root);
+		// root plus the leaf appended by updateOnLongestPath are in the set; list_size counts the rest
+		list_size = params.sub_size-2;
+		updateOnLongestPath(root->highestNei->node, subtree, taxa_set[0]);
+	} else if (initialset.size() == 1) {
+		root = initialset[0];
+		subtree[root->id] = root;
+		root->calcHeight();
+		// initialize the subtree length
+		taxa_set[0].score = root->height;
+		taxa_set[0].push_back(root);
+		list_size = params.sub_size-2;
+		updateOnLongestPath(root->highestNei->node, subtree, taxa_set[0]);
+	} else {
+		root = initialset[0];
+		int included = initialset.size();
+		// put all taxa on the initial set into subtree
+		for (NodeVector::iterator it = initialset.begin(); it != initialset.end(); it++) {
+			if (subtree[(*it)->id]) {
+				cout << "Duplicated " << (*it)->name << endl;
+				included--;
+				continue;
+			}
+			subtree[(*it)->id] = (*it);
+			taxa_set[0].push_back(*it);
+		}
+		list_size = params.sub_size - included;
+		cout << included - rooted << " distinct taxa included, adding " << list_size << " more taxa" << endl;
+		// initialize maximal distance set
+		root->calcHeight();
+		// connect the initial taxa into one subtree, then queue its outgoing edges and measure it
+		NodeVector nodestack;
+		buildOnInitialSet(subtree, nodestack);
+		taxa_set[0].score = updateOnInitialSet(subtree);
+		//taxa_set[0].insert(taxa_set[0].end(), initialset.begin(), initialset.end());
+	}
+
+	// greedy step
+	// each pass removes the first edge of neighset, follows it down to a leaf and decrements list_size
+	if (list_size < 0) outError("Too small k");
+
+	int ts;
+	for (ts = 0; list_size > 0; list_size--)
+	{
+		if (params.sub_size - list_size >= params.min_size) { // current size reached min_size: finish this set, continue on a copy
+			taxa_set[ts].setSubTree(*this, subtree);
+			ts++;
+			taxa_set[ts] = taxa_set[ts-1];
+		}
+		NeighborSet::iterator itneigh = highestNeighbor();
+		Neighbor* neigh = *itneigh;
+		neighset.erase(itneigh);
+		// update the subtree length
+		taxa_set[ts].score += neigh->length + neigh->node->height;
+		// update the subtree
+		updateOnLongestPath(neigh->node, subtree, taxa_set[ts]);
+	}
+	// the last set is still open when the loop ends, so close it here
+	taxa_set[ts].setSubTree(*this, subtree);
+
+}
+
+/**
+	recursive walk that adds to subtree every node lying on a path from the start node to a leaf already in subtree
+*/
+void Greedy::buildOnInitialSet(NodeVector &subtree, NodeVector &nodestack, Node *node, Node *dad) {
+	if (!node) node = root; // NULL means: start the walk at the root
+	FOR_NEIGHBOR_IT(node, dad, it) { // macro defined elsewhere; presumably visits the neighbors of node other than dad
+		Node *next = (*it)->node;
+		nodestack.push_back(next);
+		// nodestack now holds the nodes stepped through to reach next
+		if (next->isLeaf() && subtree[next->id] != NULL) {
+			// the next node is a leaf and is in the initial set
+			// put all node on the stack into the subtree
+			for (NodeVector::iterator itnode = nodestack.begin(); itnode != nodestack.end(); itnode++) {
+				subtree[(*itnode)->id] = (*itnode);
+			}
+		}
+		buildOnInitialSet(subtree, nodestack, next, node);
+		nodestack.pop_back();
+	}
+}
+
+/**
+	initialize the ordered list based on the initial subtree structure
+	@param subtree vector containing nodes in the subtree
+	@return the subtree length
+*/
+double Greedy::updateOnInitialSet(NodeVector &subtree) {
+	// pass 1: every edge leading from an interior subtree node (ids leafNum..nodeNum-1)
+	// to a node outside the subtree is offered to the neighbor set
+	for (int id = leafNum; id < nodeNum; id++) {
+		Node *inner = subtree[id];
+		if (inner == NULL) continue;
+		for (NeighborVec::iterator nei = inner->neighbors.begin(); nei != inner->neighbors.end(); nei++) {
+			if (subtree[(*nei)->node->id] == NULL)
+				addNeighbor((*nei));
+		}
+	}
+	// pass 2: add up the lengths of all edges joining two subtree nodes; the sum is
+	// halved on return (presumably because each edge is listed at both of its ends)
+	double total = 0.0;
+	for (int id = 0; id < nodeNum; id++) {
+		Node *member = subtree[id];
+		if (member == NULL) continue;
+		for (NeighborVec::iterator nei = member->neighbors.begin(); nei != member->neighbors.end(); nei++)
+			if (subtree[(*nei)->node->id] != NULL) total += (*nei)->length;
+	}
+	return total / 2.0;
+}
+
+void Greedy::updateOnLongestPath(Node *node, NodeVector &subtree, PDTaxaSet &cur_set)
+{
+	// Walk from node along highestNei links until a leaf is reached, adding every node on the way to subtree.
+	Node* next;
+	Node* current;
+	// Side edges that leave the walk towards nodes not yet in subtree are offered to addNeighbor.
+	for (current = node; !current->isLeaf(); current = next)
+	{
+		subtree[current->id] = current;
+		next = current->highestNei->node;
+		// redirect the highest neighbor of the current
+		//for (int i = 0; i < current->neighbors.size(); i++)
+			//if (subtree[current->neighbors[i]->node->id] == NULL && current->neighbors[i]->node != next)
+		FOR_NEIGHBOR_IT(current, next, it) // macro defined elsewhere; presumably skips the neighbor `next`
+			if (subtree[(*it)->node->id] == NULL) {
+				addNeighbor((*it));
+			}
+	}
+	subtree[current->id] = current; // the leaf that ended the walk
+	cur_set.push_back(current);
+}
+// First element of neighset in the set's own ordering (NOTE(review): presumably the best edge; confirm the comparator).
+NeighborSet::iterator Greedy::highestNeighbor()
+{
+	return neighset.begin();
+}
+
+/**
+	add an edge into the NeighborSet; an insertion never takes the set beyond list_size entries
+*/
+void Greedy::addNeighbor(Neighbor* neigh) {
+	if (list_size <= 0) return;
+	if (neighset.size() < list_size) {
+		// still room: the candidate is kept unconditionally
+		neighset.insert(neigh);
+		return;
+	}
+	// full: the candidate may only displace the element at the end of the set
+	NeighborSet::iterator tail = neighset.end();
+	--tail;
+	if ((neigh->length + neigh->node->height) > ((*tail)->length + (*tail)->node->height)) {
+		neighset.erase(tail);
+		neighset.insert(neigh);
+	}
+}
diff --git a/greedy.h b/greedy.h
new file mode 100644
index 0000000..04c7a03
--- /dev/null
+++ b/greedy.h
@@ -0,0 +1,112 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef GREEDY_H
+#define GREEDY_H
+
+#include "pdtree.h"
+
+/**
+Implementation of greedy algorithm with complexity O(n*logk)
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class Greedy : public PDTree
+{
+public:
+	/**
+		construct from program parameters
+		@param params program parameters
+	*/
+    Greedy(Params &params) : 
+		PDTree(params) {}
+
+	/**
+		construct from a tree
+		@param tree a tree class
+	*/
+    Greedy(PDTree &tree) : 
+		PDTree(tree) {}
+
+	/**
+		default constructor, forwards to the default PDTree constructor
+	*/
+	Greedy() : PDTree() {};
+
+	/**
+		run the algorithm
+		@param params program parameters; min_size is overwritten with sub_size when it is below 2
+		@param taxa_set (OUT) vector of PD sets
+	*/
+	void run(Params &params, vector<PDTaxaSet> &taxa_set);
+
+	/**
+		update the ordered list based on the recent longest path
+		@param node the starting node
+		@param subtree (OUT) resulted subtree
+		@param cur_set the current set; the leaf reached at the end of the path is appended to it
+	*/
+	void updateOnLongestPath(Node *node, NodeVector &subtree, PDTaxaSet &cur_set);
+
+	/**
+		build the initial subtree based on the initial set of taxa
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+		@param subtree (OUT) resulted subtree
+		@param nodestack (TEMP) stack of node, used only by function
+	*/
+	void buildOnInitialSet(NodeVector &subtree, NodeVector &nodestack, Node *node = NULL, Node *dad = NULL);
+
+	/**
+		initialize the ordered list based on the initial subtree structure
+		@param subtree vector containing nodes in the subtree
+		@return the subtree length
+	*/
+	double updateOnInitialSet(NodeVector &subtree);
+
+	/**
+		@return neighset.begin().
+	*/
+	NeighborSet::iterator highestNeighbor();
+
+	/**
+		add an edge into the NeighborSet (ignored when list_size is not positive)
+	*/
+	void addNeighbor(Neighbor* neigh);
+
+	//NodeSet innodes;
+
+	/**
+		neighbor set
+	*/
+	NeighborSet neighset;
+
+	/**
+		list of nodes in the subtree; NOTE(review): run() works on a local vector of the same name instead
+	*/
+	NodeVector subtree;
+
+private:
+
+	/**
+		size of list of nodes, used internally during greedy search
+	*/
+	int list_size;
+};
+
+#endif
diff --git a/gss.cpp b/gss.cpp
new file mode 100644
index 0000000..b6d54a2
--- /dev/null
+++ b/gss.cpp
@@ -0,0 +1,345 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+/*
+	Geneset selection (GSS) for Roland
+*/
+
+#include "gss.h"
+#include "lpwrapper.h"
+#include "gurobiwrapper.h"
+#include "mtreeset.h"
+
+// Build the underlying PDNetwork from params, then load the gene sets and gene p-values.
+GSSNetwork::GSSNetwork(Params &params) : PDNetwork(params) {
+    readGenePValues(params);
+}
+// Always false for this class.
+bool GSSNetwork::isPDArea() {
+    return false;
+}
+// Index all genes of the gene sets, build one Split per gene set, read the p-value file and rescale weights.
+void GSSNetwork::readGenePValues(Params &params) {
+
+    //taxa->Report(cout);
+    // first build the gene list
+    TaxaSetNameVector *allsets = sets->getSets();
+    TaxaSetNameVector::iterator i;
+    for (i = allsets->begin(); i != allsets->end(); i++) {
+        for (vector<string>::iterator it2 = (*i)->taxlist.begin(); it2 != (*i)->taxlist.end(); it2++) {
+            if (gene_index.find(*it2) == gene_index.end()) {
+                gene_index[*it2] = genes.size(); // position the name is about to take in genes
+                genes.push_back(*it2);
+            }
+        }
+    }
+    int ntaxa = genes.size(); // number of distinct genes (not the number of taxa of the network)
+
+    // build the area_taxa structure
+    if (allsets->size() != getNTaxa())
+        outError("Number of gene sets do not match between tree file and set file");
+    area_taxa.resize(getNTaxa(), NULL);
+    for (i = allsets->begin(); i != allsets->end(); i++) {
+        int id = -1;
+        try {
+            id = taxa->FindTaxon(NxsString((*i)->name.c_str()));
+        } catch (NxsTaxaBlock::NxsX_NoSuchTaxon) {
+            outError(ERR_NO_TAXON, (*i)->name); // NOTE(review): id stays -1 below unless outError does not return; confirm
+        }
+        if (area_taxa[id]) outError("Duplicated set name in set file", (*i)->name);
+        Split *sp = new Split(ntaxa);
+        for (vector<string>::iterator it2 = (*i)->taxlist.begin(); it2 != (*i)->taxlist.end(); it2++) {
+            sp->addTaxon(gene_index[*it2]);
+        }
+        area_taxa[id] = sp;
+        cout << id << "\t" << (*i)->name << endl;
+    }
+    cout << ntaxa << " genes and " << area_taxa.size() << " gene sets detected" << endl;
+
+    cout << "Reading p-values file " << params.gene_pvalue_file << " ..." << endl;
+    gene_pvalues.resize(ntaxa, -1); // -1 marks "no p-value read yet"
+    try {
+        ifstream in;
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(params.gene_pvalue_file);
+        string name, tmp;
+        // at most ntaxa "name value" pairs are read; ntaxa is consumed as the countdown
+        for (; !in.eof() && ntaxa > 0; ntaxa--) {
+            // remove the failbit
+            in.exceptions(ios::badbit);
+            if (!(in >> name)) break;
+            // set the failbit again
+            in.exceptions(ios::failbit | ios::badbit);
+            if (gene_index.find(name) == gene_index.end()) // a name from the file that no gene set mentions
+                outError("A gene not found in gene p-values file");
+            // read the sequence weight
+            in >> tmp;
+            double pval = convert_double(tmp.c_str());
+            if (pval < 0 || pval > 1) outError("Some pvalue is out of range [0, 1]");
+            if (gene_pvalues[gene_index[name]] != -1) outError("Duplicated p-value entry");
+            gene_pvalues[gene_index[name]] = pval;
+        }
+        in.clear();
+        // set the failbit again
+        in.exceptions(ios::failbit | ios::badbit);
+        in.close();
+    } catch (ios::failure) {
+        outError(ERR_READ_INPUT);
+    } catch (string str) {
+        outError(str);
+    }
+
+    if (params.gene_scale_factor < 0 || params.gene_scale_factor > 1)
+        outError("gene_scale_factor must be in range [0,1]");
+    cout << "Rescaling split weights with " << params.gene_scale_factor <<
+         " and gene p-values with " << 1 - params.gene_scale_factor << endl;
+    // incoporate into the split system
+    for (iterator it = begin(); it != end(); it++) {
+        // first, multiply split weight with the coefficient
+        (*it)->setWeight((*it)->getWeight() * params.gene_scale_factor);
+    }
+    // NOTE(review): genes absent from the file still hold -1 here and are transformed as well; confirm intended
+    for (DoubleVector::iterator it2 = gene_pvalues.begin(); it2 != gene_pvalues.end(); it2++)
+        if (params.gene_pvalue_loga)
+            (*it2) = (-log(*it2)) * (1 - params.gene_scale_factor);
+        else
+            (*it2) = (1 - (*it2)) * (1 - params.gene_scale_factor);
+
+}
+
+void GSSNetwork::checkZValue(int total_size, vector<int> &z_value) {
+    // total_size is not used here; new entries default to -1
+    z_value.resize(genes.size(), -1);
+    for (int gene = 0; gene < genes.size(); gene++) {
+        int hits = 0;      // gene sets found to contain this gene (counting stops at 2)
+        int owner = -1;    // index of the last such gene set
+        for (int area = 0; area < area_taxa.size() && hits < 2; area++) {
+            if (area_taxa[area]->containTaxon(gene)) {
+                owner = area;
+                hits++;
+            }
+        }
+        // a gene covered by exactly one gene set is encoded as that set's index + 2
+        if (hits == 1)
+            z_value[gene] = owner + 2;
+    }
+}
+
+// Write the objective row of the LP: x terms first, then the remaining y and z terms, then the section break.
+void GSSNetwork::lpObjectiveGSS(ostream &out, Params &params, IntVector &y_value, IntVector &z_value, int total_size) {
+    //IntVector y_value, count1, count2;
+    iterator spit;
+    int i;
+    // define the objective function
+    if (params.gurobi_format)
+        out << "Maximize" << endl;
+    else
+        out << "max: ";
+
+    // first compute the coefficient for x variable
+    DoubleVector xweights;
+    xweights.resize(getNTaxa(), 0.0);
+    for (spit = begin(),i=0; spit != end(); spit++,i++)	{
+        if (y_value[i] >= 2) // value v >= 2 folds this split's weight into x(v-2)
+            xweights[y_value[i] - 2] += (*spit)->getWeight();
+    }
+    for (i = 0; i < gene_pvalues.size(); i++)
+        if (z_value[i] >= 2) // same encoding for genes: their score is added to x(v-2)
+            xweights[z_value[i]-2] += gene_pvalues[i];
+
+    // now write down the objective function
+    // one "+weight x<i>" term per x variable
+    for (i = 0; i < xweights.size(); i++)
+        out << " +" << xweights[i] << " x" << i;
+
+    // y terms only for splits whose y_value is negative
+    for (spit = begin(),i=0; spit != end(); spit++,i++)	{
+        if (y_value[i] < 0)
+            out << " +" << (*spit)->getWeight() << " y" << i;
+    }
+    // z terms only for genes whose z_value is negative
+    for (i = 0; i < gene_pvalues.size(); i++)
+        if (z_value[i] < 0)
+            out << " +" << gene_pvalues[i] << " z" << i;
+
+    if (params.gurobi_format)
+        out << endl << "Subject to" << endl;
+    else
+        out << ";" << endl;
+}
+
+
+void GSSNetwork::lpVariableBound(ostream &out, Params &params, Split &included_vars, IntVector &y_value, IntVector &z_value) {
+    // the base class writes its own bounds first; this override then appends
+    // a "z <= 1" bound for every z variable that was kept
+    PDNetwork::lpVariableBound(out, params, included_vars, y_value);
+
+    for (int gene = 0; gene < gene_pvalues.size(); gene++) {
+        // only genes with a negative z_value get a bound line
+        if (z_value[gene] < 0) {
+            if (params.gurobi_format)
+                out << "0 <= " << "z" << gene << " <= 1" << endl;
+            else
+                out << "z" << gene << " <= 1" << ";" << endl;
+        }
+    }
+}
+
+void GSSNetwork::lpGeneConstraint(ostream &out, Params &params, IntVector &z_value) {
+    // one row "z<g> -x<s1> -x<s2> ... <= 0" per gene with a negative z_value
+    for (int gene = 0; gene < genes.size(); gene++) {
+        if (z_value[gene] < 0) {
+            out << "z" << gene;
+            for (int area = 0; area < area_taxa.size(); area++) {
+                if (!area_taxa[area]->containTaxon(gene)) continue;
+                out << " -x" << area;
+            }
+            out << " <= 0";
+            if (!params.gurobi_format) out << ";"; // non-Gurobi rows end with ';'
+            out << endl;
+        }
+    }
+}
+// Write the whole LP for size total_size to outfile; make_bin additionally emits the binary-variable section.
+void GSSNetwork::transformLP_GSS(Params &params, const char *outfile, int total_size, bool make_bin) {
+    Split included_tax(getNTaxa());
+    IntVector::iterator it2;
+    for (it2 = initialset.begin(); it2 != initialset.end(); it2++)
+        included_tax.addTaxon(*it2);
+    try {
+        ofstream out;
+        out.exceptions(ios::failbit | ios::badbit); // any stream failure throws and ends in outError below
+        out.open(outfile);
+        vector<int> y_value;
+        vector<int> z_value;
+        checkYValue(total_size, y_value);
+        checkZValue(total_size, z_value);
+
+        lpObjectiveGSS(out, params, y_value, z_value, total_size);
+        lpSplitConstraint_TS(out, params, y_value, total_size);
+        lpK_BudgetConstraint(out, params, total_size);
+        lpGeneConstraint(out, params, z_value);
+        lpVariableBound(out, params, included_tax, y_value, z_value);
+        if (make_bin)
+            lpVariableBinary(out, params, included_tax);
+
+        out.close();
+        //cout << "Transformed LP problem printed to " << outfile << endl;
+    } catch (ios::failure) {
+        outError(ERR_WRITE_OUTPUT, outfile);
+    }
+}
+// For every k from min_k to max_k: write the LP, solve it (binary fallback on code 7) and store the chosen set.
+void GSSNetwork::findPD(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order) {
+    // call the entering function
+    if (isBudgetConstraint()) { // budget mode is refused here, only a plain size k is handled
+        cout << "Please specify k";
+        return;
+    }
+    enterFindPD(params);
+    if (params.find_all)
+        outError("Current linear programming does not support multiple optimal sets!");
+
+    string ofile = params.out_prefix;
+    ofile += ".lp";
+    double score;
+    int lp_ret, i, ntaxa = getNTaxa();
+    int k, min_k, max_k, step_k, index;
+
+    double *variables = new double[ntaxa];
+
+    if (isBudgetConstraint()) { // budget case; not reached after the early return unless enterFindPD changes the mode
+        min_k = params.min_budget;
+        max_k = params.budget;
+        step_k = params.step_budget;
+    } else {
+        min_k = params.min_size;
+        max_k = params.sub_size;
+        step_k = params.step_size;
+    }
+    taxa_set.resize((max_k - min_k)/step_k + 1);
+
+    // now construction the optimal PD sets
+    if (isBudgetConstraint())
+        cout << "running budget = ";
+    else
+        cout << "running k = ";
+    for (k = min_k; k <= max_k; k += step_k) {
+        index = (k - min_k) / step_k;
+        if (!params.binary_programming) {
+            transformLP_GSS(params, ofile.c_str(), k, false);
+            cout << " " << k;
+            cout.flush();
+            if (params.gurobi_format)
+                lp_ret = gurobi_solve((char*)ofile.c_str(), ntaxa, &score, variables, verbose_mode, params.gurobi_threads);
+            else
+                lp_ret = lp_solve((char*)ofile.c_str(), ntaxa, &score, variables, verbose_mode);
+        } else lp_ret = 7;
+        if (lp_ret != 0 && lp_ret != 7)
+            outError("Something went wrong with LP solver!");
+        if (lp_ret == 7) { // fail with non-binary case, do again with strict binary
+            if (params.binary_programming)
+                transformLP_GSS(params, ofile.c_str(), k, true);
+            else
+                lpVariableBinary(ofile.c_str(), params, initialset);
+            cout << " " << k << "(bin)";
+            cout.flush();
+            if (params.gurobi_format)
+                lp_ret = gurobi_solve((char*)ofile.c_str(), ntaxa, &score, variables, verbose_mode, params.gurobi_threads);
+            else
+                lp_ret = lp_solve((char*)ofile.c_str(), ntaxa, &score, variables, verbose_mode);
+            if (lp_ret != 0) // check error again without allowing non-binary
+                outError("Something went wrong with LP solver!");
+        }
+
+        Split *pd_set = new Split(ntaxa, score);
+        for (i = 0; i < ntaxa; i++)
+            if (1.0 - variables[i] < tolerance) { // variable counts as selected when within tolerance of 1
+                //pd_set->addTaxon(taxa_order[i]);
+                pd_set->addTaxon(i);
+            }
+        calcPD(*pd_set);
+        taxa_set[index].push_back(pd_set);
+    }
+    cout << endl;
+    delete[] variables; // allocated with new[], so it must be released with delete[]
+    // call the leaving function
+    leaveFindPD(taxa_set);
+}
+
+extern void summarizeSplit(Params &params, PDNetwork &sg, vector<SplitSet> &pd_set, PDRelatedMeasures &pd_more, bool full_report);
+
+// Entry point of the GSS analysis: build the network, search the sets, report them.
+void runGSSAnalysis(Params &params) {
+    cout << "Dedicated for Roland..." << endl;
+    // outputs of findPD / inputs of summarizeSplit
+    vector<SplitSet> taxa_set;
+    IntVector taxa_order;
+    PDRelatedMeasures pd_more;
+
+    params.intype = detectInputFile(params.user_file);
+
+    // the constructor already reads the gene sets and the gene p-values
+    GSSNetwork sg(params);
+    sg.findPD(params, taxa_set, taxa_order);
+
+    summarizeSplit(params, sg, taxa_set, pd_more, true);
+}
+
diff --git a/gss.h b/gss.h
new file mode 100644
index 0000000..b6687f9
--- /dev/null
+++ b/gss.h
@@ -0,0 +1,96 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+/*
+	Geneset selection (GSS) for Roland 
+*/
+
+#ifndef GSS_H
+#define GSS_H
+
+
+#include "tools.h"
+#include "pdnetwork.h"
+
+class GSSNetwork : public PDNetwork {
+public:
+	/**
+		construct PD network from a NEXUS or NEWICK file, e.g. produced by SplitsTree
+		@param params program parameters
+	*/
+    GSSNetwork(Params &params);
+
+	/**
+		transform the problem into an Integer Linear Programming and write to .lp file
+		@param params program parameters
+		@param outfile name of output file in LP format
+		@param total_size k for PD_k or total budget
+		@param make_bin TRUE if creating binary programming
+	*/
+	void transformLP_GSS(Params &params, const char *outfile, int total_size, bool make_bin);
+
+	/**
+		main function to search for maximal phylogenetic diversity
+		@param params program parameters
+		@param taxa_set (OUT) the vector of set of taxa in the maximal PD set
+		@param taxa_order (OUT) order of inserted taxa
+	*/
+	virtual void findPD(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order);
+
+	/**
+		@return TRUE if we are doing PD area optimization
+	*/
+	virtual bool isPDArea();
+	/** read the gene sets and the gene p-values file, then rescale split weights and p-values */
+	void readGenePValues(Params &params);
+
+protected:
+
+	/**
+		names of the genes
+	*/
+	StrVector genes;
+	/** position of each gene name inside genes */
+	map<string, int> gene_index;
+	
+	/**
+		p-values of the genes; readGenePValues replaces them by their rescaled transform
+	*/
+	DoubleVector gene_pvalues;
+
+
+	/**
+		z variables for genes in the LP formulation, check if it can be dropped or equals some x variable.
+		@param total_size k for PD_k or total budget (not used by the current implementation)
+		@param z_value (OUT): vector of: -1 if cannot reduce, or id+2 where id is the index of the only gene set containing the gene
+	*/
+	void checkZValue(int total_size, vector<int> &z_value);
+	/** write the objective function of the LP */
+	void lpObjectiveGSS(ostream &out, Params &params, IntVector &y_value, IntVector &z_value, int total_size);
+	/** write the base-class variable bounds followed by the bounds of the z variables */
+	void lpVariableBound(ostream &out, Params &params, Split &included_vars, IntVector &y_value, IntVector &z_value);
+	/** write one constraint per remaining z variable, tying it to the x variables of its gene sets */
+	void lpGeneConstraint(ostream &out, Params &params, IntVector &z_value);
+
+};
+
+void runGSSAnalysis(Params &params);
+
+#endif
diff --git a/guidedbootstrap.cpp b/guidedbootstrap.cpp
new file mode 100644
index 0000000..0b28139
--- /dev/null
+++ b/guidedbootstrap.cpp
@@ -0,0 +1,1301 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <numeric>
+#include "phylotree.h"
+#include "phylosupertree.h"
+#include "phyloanalysis.h"
+#include "alignment.h"
+#include "superalignment.h"
+#include "iqtree.h"
+#include "model/modelgtr.h"
+#include "model/modeldna.h"
+#include "myreader.h"
+#include "model/rateheterogeneity.h"
+#include "model/rategamma.h"
+#include "model/rateinvar.h"
+#include "model/rategammainvar.h"
+//#include "modeltest_wrapper.h"
+#include "model/modelprotein.h"
+#include "stoprule.h"
+
+#include "mtreeset.h"
+#include "mexttree.h"
+#include "model/ratemeyerhaeseler.h"
+#include "whtest_wrapper.h"
+#include "model/partitionmodel.h"
+
+//#include "zpipe.h"
+#include "gzstream.h"
+#include "guidedbootstrap.h"
+#include "timeutil.h"
+
+/**
+    Read per-site log-likelihoods of several trees from a text file and collapse them per pattern.
+    The first line of the file is skipped; every following record is a label followed by getNSite() values.
+    @param aln alignment supplying site count, pattern count and the site-to-pattern mapping
+    @param fileName name of the site log-likelihood file
+    @param logLLs (OUT) one new[]-allocated array of getNPattern() values is appended per tree
+    @param trees_logl (OUT) the sum of the site log-likelihoods is appended per tree
+*/
+void readPatternLogLL(Alignment* aln, char *fileName, vector<double*> &logLLs, DoubleVector &trees_logl)
+{
+    string currentString;
+    cout << "\nReading file containing site's loglikelihood: " << fileName << "...." << endl;
+    ifstream inFile;
+    try {
+        inFile.exceptions (ios::failbit | ios::badbit);
+        inFile.open(fileName);
+        // skip the header line
+        getline(inFile,currentString);
+        // from here on a failed extraction is handled by hand (end of file, bad entry)
+        inFile.exceptions (ios::badbit);
+        while (!inFile.eof())
+        {
+            // leading label of the record; none left means we are done
+            if ( !(inFile >> currentString) ) break;
+            DoubleVector siteLogl;
+            double logl = 0.0;
+            for (int site = 0; site < aln->getNSite(); site++) {
+                double ll;
+                if (!(inFile >> ll)) throw "Wrong logLL entry";
+                siteLogl.push_back(ll);
+                logl += ll;
+            }
+            double *logLL = new double[aln->getNPattern()];
+            memset(logLL, 0, sizeof(double) * aln->getNPattern());
+            for (int site = 0; site < (int)siteLogl.size(); site++)
+            {
+                int patIndex = aln->getPatternID(site);
+                // 0 marks a pattern that has not been assigned yet
+                if ( logLL[patIndex] == 0 )
+                    logLL[patIndex] = siteLogl[site];
+                else if ( logLL[patIndex] != siteLogl[site] )
+                    // at() is indexed by pattern, not by site
+                    outError("Conflicting between the likelihoods reported for pattern", aln->at(patIndex));
+            }
+            logLLs.push_back(logLL);
+            trees_logl.push_back(logl);
+        }
+        inFile.clear();
+        inFile.exceptions (ios::failbit | ios::badbit);
+        inFile.close();
+    } catch (const bad_alloc &) {
+        outError(ERR_NO_MEMORY);
+    } catch (const char *str) {
+        outError(str);
+    } catch (string &str) {
+        outError(str);
+    } catch (const ios::failure &) {
+        outError(ERR_READ_INPUT);
+    } catch (...) {
+        outError(ERR_READ_ANY);
+    }
+}
+
+/**
+    Convert the pattern log-likelihoods of one tree into expected integer pattern frequencies.
+    Every likelihood exp(logLL[i]) is rescaled so that all patterns together sum to the alignment
+    length; the values are then rounded one by one, carrying the rounding remainder forward
+    into the next pattern.
+    @param aln alignment supplying the number of patterns and the number of sites
+    @param logLL pattern log-likelihoods, read at indices 0 .. aln->getNPattern()-1
+    @param expectedNorFre (OUT) expected frequency of every pattern; empty if there are no patterns
+*/
+void computeExpectedNorFre(Alignment *aln, double *logLL, IntVector &expectedNorFre)
+{
+    int patNum = aln->getNPattern();
+    int alignLen = aln->getNSite();
+
+    // without patterns there is no element 0 to start the rounding chain from
+    if (patNum <= 0) {
+        expectedNorFre.clear();
+        return;
+    }
+    expectedNorFre.resize(patNum,-1);
+
+    // likelihood p_i of every pattern and the sum over all patterns
+    DoubleVector LL(patNum,-1.0);
+    double sumLL = 0;
+    for ( int i = 0; i < patNum; i++ )
+    {
+        LL[i] = exp(logLL[i]);
+        sumLL += LL[i];
+    }
+
+    // l_i = p_i*ell/sum_i(p_i), with ell the alignment length
+    // NOTE(review): if every exp() underflows to 0, sumLL is 0 and the division yields NaN
+    DoubleVector ell(patNum, -1.0);
+    for ( int i = 0; i < patNum; i++ )
+    {
+        ell[i] = (double)alignLen * LL[i] / sumLL;
+    }
+
+    // r_0 = l_0; r_{i+1} = l_{i+1} + r_i - round(r_i); floor(x+0.5) is the ordinary rounding of x
+    // only the previous r is needed at each step, so a scalar suffices
+    double r = ell[0];
+    expectedNorFre[0] = (int)floor(r+0.5);
+    for (int j = 1; j < patNum; j++ )
+    {
+        r = ell[j] + r - floor(r+0.5);
+        expectedNorFre[j] = (int)floor(r+0.5);
+    }
+}
+
+/**
+    Turn log-probabilities into integer weights: exp(reProb[i]) normalised to a total of 1,000,000 and rounded.
+    @param reProb log-probabilities, one per entry
+    @param reW (OUT) rounded weight of every entry; empty when reProb is empty
+*/
+void computeTreeWeights(DoubleVector &reProb, IntVector &reW) {
+    int nDiff = reProb.size();
+    reW.resize(nDiff,-1);
+    if (nDiff == 0) return; // max_element below needs a non-empty range
+    // subtract the maximum before exp() so that the largest ratio is exactly 1
+    double max_prob = *max_element(reProb.begin(), reProb.end());
+    DoubleVector ratio(nDiff,-1.0);
+    double sumRatio = 0;
+    for (int i = 0; i < nDiff; i++) {
+        ratio[i] = exp(reProb[i]-max_prob);
+        sumRatio += ratio[i];
+    }
+    // floor(x+0.5) rounds to the nearest integer
+    for (int i = 0; i < nDiff; i++)
+        reW[i] = (int) floor((ratio[i]/sumRatio)*1000000 + 0.5);
+}
+
+double euclideanDist(IntVector &vec1, IntVector &vec2) { // Euclidean distance of two equally sized vectors
+    if (vec1.size() != vec2.size()) outError("Different vector size ", __func__);
+    double dist = 0.0;
+    for (size_t i = 0; i < vec1.size(); i++) // squares are formed in double: the int product could overflow
+        dist += ((double)vec1[i]-vec2[i])*((double)vec1[i]-vec2[i]);
+    return sqrt(dist);
+}
+
+inline double computeRELL(double *pattern_lh, IntVector &pattern_freq) {
+    // frequency-weighted sum of the pattern log-likelihoods, accumulated in pattern order
+    double total = 0.0;
+    for (IntVector::iterator freq = pattern_freq.begin(); freq != pattern_freq.end(); ++freq)
+        total += *freq * pattern_lh[freq - pattern_freq.begin()];
+    return total;
+}
+
+/**
+    Compute Expected Likelihood Weights (ELW) of trees by Strimmer & Rambaut (2002) and, optionally,
+    p-values of the SH test, both from RELL scores of bootstrap replicates.
+    @param aln alignment used to draw the bootstrap pattern frequencies
+    @param pattern_lhs pattern log-likelihood vectors, indexed by tree id
+    @param treeids ids (indices into pattern_lhs) of the trees to compare
+    @param num_replicates number of bootstrap replicates
+    @param elw (OUT) ELW of every tree in treeids, averaged over the replicates
+    @param spec bootstrap specification handed to createBootstrapAlignment()
+    @param sh_pval (OUT) if not NULL, p-value of every tree in treeids
+*/
+void computeExpectedLhWeights(Alignment *aln, vector<double*> &pattern_lhs,
+                              IntVector &treeids, int num_replicates, DoubleVector &elw,
+                              const char* spec, DoubleVector *sh_pval = NULL) {
+    cout << "Computing Expected Likelihood Weights (ELW) with " << num_replicates << " replicates ..." << endl;
+    int i, j, ntrees = treeids.size();
+    // assign() rather than resize(): values left in the output vectors by the caller must not enter the sums
+    elw.assign(ntrees, 0.0);
+    if (sh_pval) sh_pval->assign(ntrees, 0.0);
+    if (ntrees == 0 || num_replicates <= 0) return; // the maxima and averages below would be undefined
+    vector<DoubleVector> all_logl;
+    // general RELL logl
+    for (i = 0; i < num_replicates; i++) {
+        IntVector pattern_freq;
+        aln->createBootstrapAlignment(pattern_freq, spec);
+        DoubleVector logl(ntrees, 0.0);
+        for (j = 0; j < ntrees; j++)
+            logl[j] = computeRELL(pattern_lhs[treeids[j]], pattern_freq);
+        if (sh_pval) all_logl.push_back(logl);
+        // weights relative to the best tree of this replicate, so that the largest exp() argument is 0
+        double max_logl = *max_element(logl.begin(), logl.end());
+        double sum = 0.0;
+        for (j = 0; j < ntrees; j++) {
+            logl[j] = exp(logl[j] - max_logl);
+            sum += logl[j];
+        }
+        for (j = 0; j < ntrees; j++)
+            elw[j] += (logl[j]/sum);
+    }
+    // average over the replicates, so that the ELW sum to 1
+    for (j = 0; j < ntrees; j++)
+        elw[j] /= num_replicates;
+
+    if (!sh_pval) return;
+
+    // centering step in SH test
+    DoubleVector mean_logl(ntrees, 0.0);
+    for (i = 0; i < num_replicates; i++)
+        for (j = 0; j < ntrees; j++)
+            mean_logl[j] += all_logl[i][j];
+    for (j = 0; j < ntrees; j++)
+        mean_logl[j] /= num_replicates;
+    for (i = 0; i < num_replicates; i++)
+        for (j = 0; j < ntrees; j++)
+            all_logl[i][j] -= mean_logl[j];
+
+    // computing delta: distance of every tree to the best tree of the same replicate
+    for (i = 0; i < num_replicates; i++) {
+        double max_logl = *max_element(all_logl[i].begin(), all_logl[i].end());
+        for (j = 0; j < ntrees; j++) all_logl[i][j] = max_logl - all_logl[i][j];
+    }
+
+    // computing original delta from the pattern frequencies of the alignment itself
+    DoubleVector orig_logl(ntrees, 0.0);
+    for (j = 0; j < ntrees; j++) {
+        int tree_id = treeids[j];
+        i = 0;
+        for (Alignment::iterator it = aln->begin(); it != aln->end(); it++, i++)
+            orig_logl[j] += pattern_lhs[tree_id][i] * it->frequency;
+    }
+    double max_logl = *max_element(orig_logl.begin(), orig_logl.end());
+    for (j = 0; j < ntrees; j++) orig_logl[j] = max_logl - orig_logl[j];
+    for (i = 0; i < num_replicates; i++) // p-value: fraction of replicates whose delta exceeds the original delta
+        for (j = 0; j < ntrees; j++)
+            if (orig_logl[j] < all_logl[i][j]) (*sh_pval)[j] += 1.0;
+    for (j = 0; j < ntrees; j++)
+        (*sh_pval)[j] /= num_replicates;
+}
+
+/**
+    Write the trees stored in tree.treels to a file, one "[ lh=... w=... ] newick" line per tree.
+    @param ofile output file name
+    @param tree supplies treels, treels_logl, treels_newick and len_scale
+    @param weights if not NULL, weight of every tree by id; trees with weight 0 are skipped
+    @param compression TRUE to write through ogzstream and report the compression ratio
+*/
+void printTrees(const char *ofile, IQTree &tree, IntVector *weights, bool compression)
+{
+    int count = 0;
+    try {
+        ostream *out;
+        if (compression) out = new ogzstream;
+        else out = new ofstream;
+        out->exceptions(ios::failbit | ios::badbit);
+        if (compression) ((ogzstream*)out)->open(ofile);
+        else ((ofstream*)out)->open(ofile);
+        (*out) << "[ scale=" << tree.len_scale << " ]" << endl;
+        out->precision(10); // set after the scale header, which is written with the default precision
+        for (StringIntMap::iterator it = tree.treels.begin(); it != tree.treels.end(); it++) {
+            int id = it->second;
+            if (weights && !weights->at(id)) continue;
+            (*out) << "[ lh=" << tree.treels_logl[id];
+            if (weights) (*out) << " w=" << weights->at(id);
+            (*out) << " ] " << tree.treels_newick[id] << endl;
+            count++;
+        }
+        cout << count << " tree(s) printed to " << ofile << endl;
+        if (compression) {
+            z_off_t uncompress = ((ogzstream*)out)->get_raw_bytes();
+            ((ogzstream*)out)->close();
+            struct stat st; // only read if stat() succeeds; the ratio also needs a non-zero raw size
+            if (stat(ofile, &st) == 0 && uncompress > 0)
+                cout << "Compression ratio: " << ((double)st.st_size/uncompress) << " (" << uncompress << " -> " << st.st_size << " bytes)" << endl;
+        } else ((ofstream*)out)->close();
+        delete out;
+    } catch (ios::failure &) {
+        outError(ERR_WRITE_OUTPUT, ofile);
+    }
+}
+
+/**
+    Write the pattern log-likelihood vectors of all trees in tree->treels to a text file.
+    Header line: number of trees, sites and patterns, and the scale; second line: pattern id of every site.
+    Then one line per tree: the negated tree log-likelihood, followed for the first tree by its negated pattern
+    log-likelihoods and for every other tree by the differences to the first tree, multiplied by scale and rounded.
+    @param ofile output file name
+    @param tree supplies treels, treels_logl, treels_ptnlh and the alignment
+    @param compression TRUE to write through ogzstream
+*/
+void printPatternLh(const char *ofile, IQTree *tree, bool compression) {
+    int count = 0, i;
+    int scale = 1000;
+    try {
+        ostream *out;
+        if (compression) out = new ogzstream;
+        else out = new ofstream;
+        out->exceptions(ios::failbit | ios::badbit);
+        if (compression) ((ogzstream*)out)->open(ofile);
+        else ((ofstream*)out)->open(ofile);
+        int idfirst = tree->treels.empty() ? -1 : tree->treels.begin()->second; // reference tree; unused if there is none
+        (*out) << tree->treels.size() << " " << tree->aln->getNSite() << " " << tree->aln->getNPattern() << " " << scale << endl;
+        for (i = 0; i < tree->aln->getNSite(); i++)
+            (*out) << " " << tree->aln->getPatternID(i);
+        (*out) << endl;
+        // DO NOT CHANGE: readPatternLh() parses exactly this layout
+        for (StringIntMap::iterator it = tree->treels.begin(); it != tree->treels.end(); it++) {
+            int id = it->second;
+            assert(id < tree->treels_ptnlh.size());
+            out->precision(10);
+            (*out) << -tree->treels_logl[id];
+            if (id == idfirst) {
+                out->precision(6);
+                for (i = 0; i < tree->aln->size(); i++)
+                    (*out) << " " << -tree->treels_ptnlh[id][i];
+            } else {
+                for (i = 0; i < tree->aln->size(); i++)
+                    (*out) << " " << (int)round((tree->treels_ptnlh[id][i]-tree->treels_ptnlh[idfirst][i])*scale);
+            }
+            (*out) << endl;
+            count++;
+        }
+        if (compression) ((ogzstream*)out)->close();
+        else ((ofstream*)out)->close();
+        delete out;
+        cout << count << " pattern log-likelihood vector(s) printed to " << ofile << endl;
+    } catch (ios::failure &) {
+        outError(ERR_WRITE_OUTPUT, ofile);
+    }
+}
+
+/**
+    Read pattern log-likelihood vectors in the layout written by printPatternLh() into tree->treels_logl and tree->treels_ptnlh.
+    The header must agree with tree->aln in the number of sites and patterns and in the pattern id of every site.
+    @param infile input file name
+    @param tree receives the values; every pattern vector is allocated with new[]
+    @param compression TRUE to read through igzstream
+*/
+void readPatternLh(const char *infile, IQTree *tree, bool compression) {
+    int i, ntrees, nsite, nptn, scale;
+    double max_tol = 0.0;
+    try {
+        istream *in;
+        if (compression) in = new igzstream;
+        else in = new ifstream;
+        in->exceptions(ios::failbit | ios::badbit);
+        if (compression) ((igzstream*)in)->open(infile);
+        else ((ifstream*)in)->open(infile);
+        (*in) >> ntrees >> nsite >> nptn >> scale;
+        if (nsite != tree->aln->getNSite()) outError("Number of sites does not match");
+        if (nptn !=  tree->aln->getNPattern()) outError("Number of patterns does not match");
+        if (ntrees < 0 || scale == 0) outError("Invalid number of trees or scale"); // sizes a vector / is a divisor below
+        for (i = 0; i < nsite; i++) {
+            int id;
+            (*in) >> id;
+            if (id != tree->aln->getPatternID(i)) outError("Pattern ID does not match");
+        }
+        tree->treels_logl.resize(ntrees, 0.0);
+        tree->treels_ptnlh.resize(ntrees, NULL);
+        for (int id = 0; id < ntrees; id++) {
+            // the file stores the negated tree log-likelihood
+            (*in) >> tree->treels_logl[id];
+            tree->treels_logl[id] = -tree->treels_logl[id];
+            const double logl = tree->treels_logl[id];
+            double *pattern_lh = new double[nptn];
+            if (id == 0) { // first tree: negated pattern log-likelihoods
+                for (i = 0; i < nptn; i++) {
+                    (*in) >> pattern_lh[i];
+                    pattern_lh[i] = -pattern_lh[i];
+                }
+            } else { // other trees: scaled integer differences to the first tree
+                double sum = 0.0;
+                for (i = 0; i < nptn; i++) {
+                    int diff;
+                    (*in) >> diff;
+                    pattern_lh[i] = tree->treels_ptnlh[0][i]+(double)diff/scale;
+                    sum += pattern_lh[i] * tree->aln->at(i).frequency;
+                }
+                max_tol = max(max_tol, fabs(sum-logl));
+            }
+            tree->treels_ptnlh[id] = pattern_lh;
+        }
+        cout << "max tolerance = " << max_tol << endl;
+        if (compression) ((igzstream*)in)->close();
+        else ((ifstream*)in)->close();
+        delete in;
+        cout << ntrees << " pattern log-likelihood vector(s) read from " << infile << endl;
+    } catch (ios::failure &) {
+        outError(ERR_READ_INPUT, infile);
+    }
+}
+
+void computeAllPatternLh(Params &params, IQTree &tree) {
+    /* Optimize the model on the current tree, then compute the pattern log-likelihoods of every tree in params.user_file and append them to tree.treels_ptnlh / tree.treels_logl. The model setup in this part was copied from phyloanalysis.cpp */
+    tree.optimize_by_newton = params.optimize_by_newton;
+    ModelsBlock *models_block = new ModelsBlock;
+
+    try {
+        if (!tree.getModelFactory()) {
+            if (tree.isSuperTree())
+                tree.setModelFactory(new PartitionModel(params, (PhyloSuperTree*)&tree, models_block));
+            else
+                tree.setModelFactory(new ModelFactory(params, &tree, models_block));
+        }
+    } catch (string &str) {
+        outError(str);
+    }
+    delete models_block; // NOTE(review): assumes the model factory does not keep this pointer - confirm
+    tree.setModel(tree.getModelFactory()->model);
+    tree.setRate(tree.getModelFactory()->site_rate);
+    if (tree.isSuperTree()) ((PhyloSuperTree*)&tree)->mapTrees();
+    tree.setLikelihoodKernel(params.SSE);
+
+    int model_df = tree.getModel()->getNDim() + tree.getRate()->getNDim();
+    cout << endl;
+    cout << "Estimating model parameters for: " << tree.getModelName() << " (" << model_df << " free parameters)" << endl;
+    cout << "Fixed branch lengths: " << ((params.fixed_branch_length) ? "Yes" : "No") << endl;
+    /* optimize model parameters */
+    cout << endl;
+    cout << "Optimizing model parameters" << endl;
+    double bestTreeScore = tree.getModelFactory()->optimizeParameters(params.fixed_branch_length, true, TOL_LIKELIHOOD);
+    cout << "Log-likelihood of the current tree: " << bestTreeScore << endl;
+
+    //Update tree score
+    tree.setCurScore(bestTreeScore);
+    if (tree.isSuperTree()) ((PhyloSuperTree*)&tree)->computeBranchLengths();
+    stringstream best_tree_string; // keeps the current tree so that it can be read back after the loop below
+    tree.printTree(best_tree_string, WT_TAXON_ID + WT_BR_LEN);
+
+    cout << "Computing pattern log-likelihoods for trees in " << params.user_file << " ..." << endl;
+    /* now compute the treels_ptnlh */
+    try {
+        istream *in;
+        if (params.do_compression) in = new igzstream;
+        else in = new ifstream;
+        in->exceptions(ios::failbit | ios::badbit);
+        if (params.do_compression)
+            ((igzstream*)in)->open(params.user_file);
+        else
+            ((ifstream*)in)->open(params.user_file);
+        double max_logl_diff = 0.0; // largest |computed - annotated| log-likelihood seen so far
+        char ch;
+        (*in) >> ch; // the file may start with a "[ scale=... ]" header
+        if (ch == '[') {
+            string str;
+            (*in) >> str;
+            if (str.substr(0,6) == "scale=") {
+                tree.len_scale = convert_double(str.substr(6).c_str());
+            }
+            do { // skip the rest of the bracketed header
+                (*in) >> ch;
+            } while (!in->eof() && ch != ']');
+        } else in->unget(); // no header: put the character back for the tree reader
+        cout << "Applying branch length scaling: " << tree.len_scale << endl;
+
+        while (!in->eof()) {
+            in->exceptions(ios::goodbit); // probe for the end of input without throwing
+            (*in) >> ch;
+            if (in->eof()) break;
+            in->exceptions(ios::failbit | ios::badbit); // from here on read errors throw again
+            double expected_lh = 0.0; // stays 0.0 when the tree carries no "lh=" annotation
+            if (ch == '[') {
+                string str;
+                (*in) >> str;
+                if (str.substr(0,3) == "lh=") {
+                    expected_lh = convert_double(str.substr(3).c_str());
+                }
+                do  { // skip the rest of the bracketed annotation
+                    (*in) >> ch;
+                } while (!in->eof() && ch != ']');
+            } else in->unget(); // no annotation: the character belongs to the tree string
+
+            tree.freeNode();
+            tree.readTree(*in, tree.rooted);
+            tree.scaleLength(1.0/tree.len_scale); // scale the branch length
+            tree.assignLeafNames();
+            tree.initializeAllPartialLh();
+            tree.clearAllPartialLH();
+            if (tree.isSuperTree()) ((PhyloSuperTree*)&tree)->mapTrees();
+            double *pattern_lh = new double [tree.aln->getNPattern()]; // stored in tree.treels_ptnlh below, not freed here
+            if (!params.fixed_branch_length) {
+                tree.setCurScore(tree.optimizeAllBranches());
+                tree.computePatternLikelihood(pattern_lh);
+            } else {
+                tree.setCurScore(tree.computeLikelihood(pattern_lh));
+            }
+            if (expected_lh != 0.0)
+                max_logl_diff = max(max_logl_diff, fabs(tree.getCurScore()-expected_lh));
+            tree.treels_ptnlh.push_back(pattern_lh);
+            tree.treels_logl.push_back(tree.getCurScore());
+			cout << "Tree " << tree.treels_logl.size() << ": " << tree.getCurScore() << endl;
+            if (tree.treels_ptnlh.size() % 500 == 0)
+                cout << tree.treels_ptnlh.size() << " trees evaluated" << endl;
+        }
+
+        cout << tree.treels_ptnlh.size() << " trees evaluated in total" << endl;
+        cout << "Maximal log-likelihood error is " << max_logl_diff << endl << endl;
+
+        if (params.do_compression) ((igzstream*)in)->close();
+        else ((ifstream*)in)->close();
+        delete in;
+    } catch (ios::failure&) {
+        outError(ERR_READ_INPUT, params.user_file);
+    }
+
+    /* take back the current best tree */
+    best_tree_string.seekg(0, ios::beg);
+    tree.freeNode();
+    tree.readTree(best_tree_string, tree.rooted);
+    tree.assignLeafNames();
+    tree.initializeAllPartialLh();
+    tree.clearAllPartialLH();
+}
+
+/**
+    Load the target tree and obtain the pattern log-likelihoods of the trees to be evaluated.
+    @param params program parameters; user_file, second_tree, is_rooted, siteLL_file and do_compression are read
+    @param alignment alignment attached to the tree when the log-likelihoods have to be computed
+    @param tree (OUT) receives the target tree and the pattern log-likelihoods
+*/
+void readTrees(Params &params, Alignment *alignment, IQTree &tree) {
+    if (!params.user_file)
+        outError("You have to specify user tree file");
+    if (!params.second_tree)
+        outError("Please provide target tree file via -sup option");
+
+    // load the target tree
+    cout << "Reading tree file " << params.second_tree << endl;
+    tree.readTree(params.second_tree, params.is_rooted);
+
+    // renumber the taxa 0, 1, 2, ... in the order produced by sorting with nodenamecmp
+    NodeVector leaves;
+    tree.getTaxa(leaves);
+    sort(leaves.begin(), leaves.end(), nodenamecmp);
+    for (int rank = 0; rank < (int)leaves.size(); rank++)
+        leaves[rank]->id = rank;
+
+    // pattern log-likelihoods of all trees: computed here unless a file with them was given
+    if (!params.siteLL_file) {
+        // nothing precomputed: attach the alignment and evaluate every tree
+        tree.setAlignment(alignment);
+        computeAllPatternLh(params, tree);
+    } else {
+        // take the precomputed values from the file
+        readPatternLh(params.siteLL_file, &tree, params.do_compression);
+    }
+}
+
+void runGuidedBootstrapReal(Params &params, Alignment *alignment, IQTree &tree) {
+
+    int i, j;
+
+    double begin_time = getCPUTime();
+
+    MTreeSet trees;
+    vector<double*> *pattern_lhs = NULL;
+    vector<IntVector> expected_freqs;
+    DoubleVector *trees_logl = NULL;
+    IntVector diff_tree_ids;
+    int ntrees = 0;
+    IntVector::iterator it;
+
+    if (!tree.save_all_trees) {
+        readTrees(params, alignment, tree);
+        pattern_lhs = &tree.treels_ptnlh;
+        trees_logl = &tree.treels_logl;
+        if (!params.distinct_trees) {
+            // read in trees file
+            trees.init(params.user_file, params.is_rooted, params.tree_burnin, params.tree_max_count);
+
+            if (pattern_lhs->size() != trees.size())
+                outError("Different number of sitelh vectors");
+            // get distinct trees
+            ntrees = trees.size();
+            IntVector tree_category;
+            trees.categorizeDistinctTrees(tree_category);
+            for (i = 0; i < ntrees; i++) {
+                int cat = tree_category[i];
+                if (diff_tree_ids.empty() || tree_category[diff_tree_ids.back()] < cat)
+                    diff_tree_ids.push_back(i);
+            }
+            cout << diff_tree_ids.size() << " distinct trees detected" << endl;
+        }
+
+    } else {
+        if (tree.treels_ptnlh.empty()) {
+            cout << "New bootstrap is not applicable due to no candiate trees" << endl;
+            return;
+        }
+        pattern_lhs = &tree.treels_ptnlh;
+        trees_logl = &tree.treels_logl;
+        //cout << "logl_cutoff = " << tree.logl_cutoff << " after " << tree.max_candidate_trees <<" trees" << endl;
+    }
+
+    if (diff_tree_ids.empty()) {
+        diff_tree_ids.resize(pattern_lhs->size());
+        ntrees = pattern_lhs->size();
+        for (i = 0; i < ntrees; i++) diff_tree_ids[i] = i;
+    }
+
+    IntVector origin_freq;
+    for (i = 0; i < alignment->getNPattern(); i++)
+        origin_freq.push_back(alignment->at(i).frequency);
+
+
+    if (verbose_mode >= VB_DEBUG) {
+        cout << "Original pattern freq: ";
+        for (i = 0; i < alignment->getNPattern(); i++)
+            cout << alignment->at(i).frequency << " ";
+        cout << endl;
+    }
+
+    cout << pattern_lhs->size() << " log-likelihood vectors loaded" << endl;
+
+    int ndiff = diff_tree_ids.size();
+
+    // consider only 10,000 trees with highest likelihoods
+    if (params.max_candidate_trees > 0 && ndiff > params.max_candidate_trees) {
+        DoubleVector neg_logl;
+        neg_logl.resize(ndiff);
+        for (i = 0; i < ndiff; i++)
+            neg_logl[i] = -trees_logl->at(diff_tree_ids[i]);
+        nth_element(neg_logl.begin(), neg_logl.begin() + params.max_candidate_trees, neg_logl.end());
+        double logl_cutoff = -neg_logl[params.max_candidate_trees];
+        IntVector diff_tree_ids_new;
+        diff_tree_ids_new.reserve(params.max_candidate_trees);
+        for (i = 0; i < ndiff; i++)
+            if (trees_logl->at(diff_tree_ids[i]) > logl_cutoff)
+                diff_tree_ids_new.push_back(diff_tree_ids[i]);
+        diff_tree_ids = diff_tree_ids_new;
+        ndiff = diff_tree_ids.size();
+        cout << "Reduce to " << ndiff << " highest likelihood trees with cutoff " << logl_cutoff << endl;
+    }
+
+    IntVector orig_diff_tree_ids = diff_tree_ids;
+
+    // compute multinomial probability for every distinct tree
+    DoubleVector prob_vec;
+    for (it = diff_tree_ids.begin(); it != diff_tree_ids.end(); it++) {
+        double prob;
+        alignment->multinomialProb((*pattern_lhs)[*it], prob);
+        prob_vec.push_back(prob);
+        IntVector expected_freq;
+        computeExpectedNorFre(alignment, (*pattern_lhs)[*it], expected_freq);
+        expected_freqs.push_back(expected_freq);
+        if (verbose_mode >= VB_DEBUG) {
+            for (i = 0; i < expected_freq.size(); i++)
+                cout << expected_freq[i] << " ";
+            cout << endl;
+        }
+    }
+
+    IntVector diff_tree_weights;
+
+    if (params.use_elw_method) { 	// compute ELW weights
+
+        DoubleVector elw, sh_pval;
+        computeExpectedLhWeights(alignment, (*pattern_lhs), diff_tree_ids, params.gbo_replicates, elw, params.bootstrap_spec, &sh_pval);
+        string elw_file_name = params.out_prefix;
+        elw_file_name += ".elw";
+        ofstream elw_file(elw_file_name.c_str());
+        elw_file << "Treeid\tELW\tSH-pval" << endl;
+        for (i = 0; i < elw.size(); i++)
+            elw_file << diff_tree_ids[i]+1 << "\t" << elw[i] << "\t" << sh_pval[i] << endl;
+        elw_file.close();
+        cout << "ELW printed to " << elw_file_name << endl;
+        diff_tree_weights.resize(diff_tree_ids.size(), 0);
+        for (i = 0; i < diff_tree_ids.size(); i++)
+            diff_tree_weights[i] = round(elw[i]*1000000);
+    } else {
+        double own_prob;
+        alignment->multinomialProb(*alignment, own_prob);
+        //cout << "Own prob: " << own_prob << endl;
+
+        cout << "Conducting " << params.gbo_replicates << " non-parametric resampling ";
+        if (params.use_rell_method)
+            cout << "using RELL" << endl;
+        else
+            cout << "using Euclidean distance" << endl;
+        if (params.use_weighted_bootstrap)
+            cout << "Multinomial weighting for bootstrap sample ";
+        else
+            cout << "Equal weighting for bootstrap sample ";
+
+        if (params.use_max_tree_per_bootstrap)
+            cout << "and selecting one tree per bootstrap" << endl;
+        else
+            cout << "and selecting multiple trees per bootstrap" << endl;
+
+        double accepted_diff = 0.5;
+        cout << "Accepted logl difference: " << accepted_diff << endl;
+
+        // generate bootstrap samples
+        for (i = 0; i < params.gbo_replicates; i++) {
+            IntVector pattern_freq;
+            alignment->createBootstrapAlignment(pattern_freq, params.bootstrap_spec);
+            double prob;
+            if (params.use_weighted_bootstrap)
+                prob = alignment->multinomialProb(pattern_freq);
+            else
+                prob = 0;
+            if (params.use_rell_method) {
+                // select best-fit tree by RELL method
+                DoubleVector logl;
+                logl.resize(ndiff);
+                for (j = 0; j < ndiff; j++) {
+                    int tree_id = diff_tree_ids[j];
+                    logl[j] = computeRELL((*pattern_lhs)[tree_id], pattern_freq);
+                    //if (verbose_mode >= VB_MAX) cout << logl << endl;
+                }
+                DoubleVector::iterator max_logl = max_element(logl.begin(), logl.end());
+                int k = 0;
+                if (params.use_max_tree_per_bootstrap) {
+                    double logl_cutoff = *max_logl - accepted_diff;
+                    int num_max = 0;
+                    for (j = 0; j < ndiff; j++)
+                        if (logl[j] >= logl_cutoff) num_max++;
+                    if (num_max == 1) {
+                        diff_tree_ids.push_back(diff_tree_ids[max_logl - logl.begin()]);
+                        prob_vec.push_back(prob);
+                    } else {
+                        int max_rand = random_int(num_max);
+                        for (j = 0; j < ndiff && max_rand >= 0; j++)
+                            if (logl[j] >= logl_cutoff) {
+                                max_rand--;
+                                if (max_rand < 0) {
+                                    diff_tree_ids.push_back(diff_tree_ids[j]);
+                                    prob_vec.push_back(prob);
+                                    break;
+                                }
+                            }
+                    }
+                    if (verbose_mode >= VB_MAX) {
+                        cout << "Bootstrap " << i+1 <<  " lprob=" << prob << " max_logl=" <<
+                             *max_logl << " select " << diff_tree_ids[j]+1;
+                        if (num_max > 1)
+                            cout << "  tie broken " << num_max << endl;
+                        else
+                            cout << endl;
+                    }
+                } else {
+                    DoubleVector weights;
+                    weights.resize(ndiff);
+                    for (j = 0; j < ndiff; j++) weights[j] = exp(logl[j] - *max_logl);
+                    double sum = accumulate(weights.begin(), weights.end(), 0.0);
+                    for (j = 0; j < ndiff; j++) weights[j] /= sum;
+                    int max_id = max_element(weights.begin(), weights.end()) - weights.begin();
+
+                    double weight_cutoff = weights[max_id] * 0.001;
+                    for (j = 0; j < ndiff; j++) {
+                        if (weights[j] >= weight_cutoff) {
+                            diff_tree_ids.push_back(diff_tree_ids[j]);
+                            prob_vec.push_back(prob + log(weights[j]));
+                            k++;
+                        }
+                    }
+                    if (verbose_mode >= VB_MAX)
+                        cout << "Bootstrap " << i+1 <<  " lprob=" << prob << " max_id=" << max_id << " max_w=" <<
+                             weights[max_id] << " " << k << " trees" << endl;
+                }
+            }
+            else {
+                // select best-fit tree by euclidean distance
+                double min_dist = -1.0;
+                int chosen_id = -1;
+                for (j = 0; j < expected_freqs.size(); j++) {
+                    double dist = euclideanDist(pattern_freq, expected_freqs[j]);
+                    //cout << dist << " ";
+                    if (dist < min_dist || min_dist < 0) {
+                        min_dist = dist;
+                        chosen_id = j;
+                    }
+                }
+                diff_tree_ids.push_back(diff_tree_ids[chosen_id]);
+                prob_vec.push_back(prob);
+                if (verbose_mode >= VB_MAX) {
+                    cout << "Bootstrap " << i+1 << " choose id=" << diff_tree_ids[chosen_id]+1 // <<" dist=" << min_dist
+                         << " lprob=" << prob << endl;
+                }
+            }
+
+            if (verbose_mode >= VB_DEBUG) {
+                for (j = 0; j < pattern_freq.size(); j++)
+                    cout << pattern_freq[j] << " ";
+                cout << endl;
+            }
+        }
+
+        // compute tree weights from the log-probability
+        computeTreeWeights(prob_vec, diff_tree_weights);
+
+    } // end of Arndt's method
+
+    IntVector final_tree_weights;
+    final_tree_weights.resize(ntrees, 0);
+    //for (i = 0; i < ntrees; i++) trees.tree_weights[i] = 0;
+    for (it = diff_tree_ids.begin(), i = 0; it != diff_tree_ids.end(); it++, i++) {
+        final_tree_weights[*it] += diff_tree_weights[i];
+    }
+
+    // now load in the trees
+    if (tree.save_all_trees) {
+        trees.init(tree.treels, tree.rooted, final_tree_weights);
+        string out_file = params.out_prefix;
+        if (params.do_compression) {
+            out_file += ".btrees.gz";
+            printTrees(out_file.c_str(), tree, &final_tree_weights, params.do_compression);
+            out_file = params.out_prefix;
+            out_file += ".alltrees.gz";
+            printTrees(out_file.c_str(), tree, NULL, params.do_compression);
+            if (params.print_site_lh) {
+                out_file = params.out_prefix;
+                out_file += ".ptnlh.gz";
+                printPatternLh(out_file.c_str(), &tree, params.do_compression);
+            }
+        }
+    } else if (params.distinct_trees) {
+        trees.init(params.user_file, params.is_rooted, params.tree_burnin, params.tree_max_count, NULL, &final_tree_weights, params.do_compression);
+        // assuming user_file contains species ID (instead of full name)
+        trees.assignLeafID();
+        //trees.init(params.user_file, params.is_rooted, params.tree_burnin, NULL);
+        /*		if (pattern_lhs->size() != trees.size())
+        			outError("Different number of sitelh vectors");*/
+    }
+
+	tree.summarizeBootstrap(params, trees);
+/*    int sum_weights = trees.sumTreeWeights();
+    if (verbose_mode >= VB_MED) {
+        for (i = 0; i < trees.size(); i++)
+            if (trees.tree_weights[i] > 0)
+                cout << "Tree " << i+1 << " weight= " << trees.tree_weights[i] * 100 / sum_weights << endl;
+    }
+    int max_tree_id = max_element(trees.tree_weights.begin(), trees.tree_weights.end()) - trees.tree_weights.begin();
+    cout << "max_tree_id = " << max_tree_id+1 << "   max_weight = " << trees.tree_weights[max_tree_id];
+    cout << " (" << trees.tree_weights[max_tree_id] * 100 / sum_weights << "%)"<< endl;
+    // assign bootstrap support
+    SplitGraph sg;
+    SplitIntMap hash_ss;
+    // make the taxa name
+    vector<string> taxname;
+    taxname.resize(tree.leafNum);
+    tree.getTaxaName(taxname);
+
+    trees.convertSplits(taxname, sg, hash_ss, SW_COUNT, -1, false); // do not sort taxa
+
+    cout << sg.size() << " splits found" << endl;
+    // compute the percentage of appearance
+    sg.scaleWeight(100.0 / trees.sumTreeWeights(), true);
+    //	printSplitSet(sg, hash_ss);
+    //sg.report(cout);
+    cout << "Creating bootstrap support values..." << endl;
+    stringstream tree_stream;
+    tree.printTree(tree_stream, WT_TAXON_ID |  WT_BR_LEN);
+    MExtTree mytree;
+    mytree.readTree(tree_stream, tree.rooted);
+    mytree.assignLeafID();
+    mytree.createBootstrapSupport(taxname, trees, sg, hash_ss);
+
+    // now write resulting tree with supports
+    tree_stream.seekp(0, ios::beg);
+    mytree.printTree(tree_stream);
+
+    // now read resulting tree
+    tree_stream.seekg(0, ios::beg);
+    tree.freeNode();
+    tree.readTree(tree_stream, tree.rooted);
+    tree.assignLeafNames();
+    tree.initializeAllPartialLh();
+    tree.clearAllPartialLH();
+
+    string out_file;
+
+    if (!tree.save_all_trees) {
+        out_file = params.out_prefix;
+        out_file += ".suptree";
+
+        tree.printTree(out_file.c_str());
+        cout << "Tree with assigned bootstrap support written to " << out_file << endl;
+    }
+
+    out_file = params.out_prefix;
+    out_file += ".splits";
+
+    sg.saveFile(out_file.c_str(), true);
+    cout << "Split supports printed to NEXUS file " << out_file << endl;
+
+    out_file = params.out_prefix;
+    out_file += ".supval";
+    tree.writeInternalNodeNames(out_file);
+
+    cout << "Support values written to " << out_file << endl;*/
+    
+    /*
+    if (!tree.save_all_trees) {
+    	for (vector<double* >::reverse_iterator it = pattern_lhs->rbegin(); it != pattern_lhs->rend(); it++)
+    		delete [] (*it);
+    	delete pattern_lhs;
+    	delete trees_logl;
+    }*/
+
+    double end_time = getCPUTime();
+
+    cout << "Time for guided bootstrap: " << (end_time-begin_time) << " seconds" << endl << endl;
+    //delete [] rfdist;
+}
+
+/**
+	Entry point of the guided bootstrap.
+	Without params.check_gbo_sample_size this is a single call of
+	runGuidedBootstrapReal(). Otherwise the real routine is run repeatedly with
+	params.max_candidate_trees set to check_gbo_sample_size, doubled each round,
+	for as long as the value does not exceed the upper bound (number of saved
+	trees if tree.save_all_trees, else the original max_candidate_trees).
+	@param params program parameters; max_candidate_trees and out_prefix are
+		put back to their previous values after every round
+	@param alignment input alignment, forwarded unchanged
+	@param tree tree object, forwarded unchanged
+*/
+void runGuidedBootstrap(Params &params, Alignment *alignment, IQTree &tree) {
+    if (!params.check_gbo_sample_size) {
+        runGuidedBootstrapReal(params, alignment, tree);
+        return;
+    }
+
+    const int upper_bound = tree.save_all_trees ? (int)tree.treels.size()
+                                                : params.max_candidate_trees;
+
+    int sample_size = params.check_gbo_sample_size;
+    while (sample_size <= upper_bound) {
+        cout << "CHECKING SAMPLING SIZE " << sample_size << endl;
+
+        // remember the settings that are overridden for this round
+        char *old_prefix = params.out_prefix;
+        int old_max_candidates = params.max_candidate_trees;
+
+        // per-round prefix "<out_prefix>.S<sample_size>"; it is built but not
+        // installed into params.out_prefix (the original left that step disabled)
+        stringstream suffix;
+        suffix << ".S" << sample_size;
+        string prefix = params.out_prefix;
+        prefix += suffix.str();
+
+        params.max_candidate_trees = sample_size;
+        runGuidedBootstrapReal(params, alignment, tree);
+
+        // restore parameters
+        params.max_candidate_trees = old_max_candidates;
+        params.out_prefix = old_prefix;
+
+        sample_size *= 2;
+    }
+}
+
+/**
+	compute the natural logarithm of the binomial coefficient (n choose k)
+	@param n size of the set
+	@param k size of the chosen subset
+	@return log(n! / (k! (n-k)!)); minus infinity if k < 0 or k > n, because
+		the coefficient is then 0
+*/
+double logNchooseK(int n, int k) {
+    // outside 0 <= k <= n the coefficient is 0; log(0.0) yields -infinity
+    // instead of the NaN/garbage produced by summing log() of non-positive values
+    if (k < 0 || k > n) return log(0.0);
+    // (n choose k) == (n choose n-k): keep the LARGER of the two so that both
+    // loops below run over as few terms as possible
+    if (k < n-k) k = n-k;
+    double ret = 0.0;
+    int i;
+    // log( n! / k! )
+    for (i = k+1; i <= n; i++) ret += log((double)i);
+    // minus log( (n-k)! )
+    for (i = 2; i <= n-k; i++) ret -= log((double)i);
+    return ret;
+}
+
+/**
+	Set x to the first vector of the enumeration continued by
+	generateNextMultinorm(): k entries, all zero except the last one, which is n.
+	@param x (OUT) vector to initialise; any previous content is discarded
+	@param n value stored in the last entry
+	@param k number of entries; for k <= 0 the vector is left empty
+*/
+void generateFirstMultinorm(IntVector &x, int n, int k) {
+    // assign() also zeroes entries x already held (resize() would keep them)
+    x.assign(k > 0 ? k : 0, 0);
+    // back() on an empty vector is undefined, so guard the k <= 0 case
+    if (!x.empty()) x.back() = n;
+}
+
+/**
+	Advance x to the next vector of the enumeration started by
+	generateFirstMultinorm().
+	Let p be the right-most non-zero entry: one unit is moved from x[p] to
+	x[p-1] and what remains of x[p] is moved to the last entry.
+	@param x (IN/OUT) current vector, modified in place
+	@return false if x has fewer than 2 entries or no non-zero entry exists
+		right of position 0 (x is then left untouched), true otherwise
+*/
+bool generateNextMultinorm(IntVector &x) {
+    const int last = (int)x.size() - 1;
+    if (last < 1) return false;
+
+    // locate the right-most non-zero entry
+    int pos = last;
+    while (pos >= 0 && x[pos] == 0) --pos;
+    if (pos <= 0) return false;
+
+    const int remainder = x[pos] - 1;
+    ++x[pos-1];
+    if (pos != last) x[pos] = 0;
+    x[last] = remainder;
+    return true;
+}
+
+/**
+	Recursively print to cout every vector of k non-negative integers whose
+	entries from position i onwards add up to sum, one vector per line.
+	@param x work vector; resized to k zeros if it is passed in empty
+	@param n not used by this function
+	@param k number of entries of x
+	@param i position filled by this recursion level
+	@param sum amount still to be distributed over positions i..k-1
+*/
+void generateMultinorm(IntVector &x, int n, int k, int i, int sum) {
+    if (x.empty())
+        x.resize(k, 0);
+
+    if (i != k-1) {
+        // try every possible value at position i, the rest goes to the right
+        for (int value = 0; value <= sum; ++value) {
+            x[i] = value;
+            generateMultinorm(x, n, k, i+1, sum-value);
+        }
+        return;
+    }
+
+    // last position: it must take whatever is left, then print the vector
+    x[i] = sum;
+    for (int pos = 0; pos < k; ++pos)
+        cout << x[pos] << " ";
+    cout << endl;
+}
+
+/**
+	Compare tree support obtained from params.avh_test ordinary bootstrap
+	replicates with the support obtained when every possible site-pattern
+	frequency vector is enumerated and weighted by its multinomial probability.
+	Steps: (1) enumerate all frequency vectors with generateFirstMultinorm() /
+	generateNextMultinorm(); (2) draw the bootstrap replicates and map each one
+	to its enumerated vector; (3) reconstruct a tree for every enumerated
+	vector; (4) evaluate the distinct trees on the original alignment;
+	(5) write the summary.
+	Output files: <out_prefix>.bootmap, <out_prefix>.trees, <out_prefix>.avh
+	@param params program parameters. aLRT_replicates and treeset_file are
+		changed temporarily and restored; min_iterations is set to 0 and NOT
+		restored (NOTE(review): confirm that callers do not rely on it)
+	@param alignment original alignment
+	@param tree tree object for the original alignment
+*/
+void runAvHTest(Params &params, Alignment *alignment, IQTree &tree) {
+    // collection of distinct bootstrapped site-pattern frequency vectors
+    IntVectorCollection boot_freqs;
+    // number of times the bootstrap alignments were resampled
+    IntVector boot_times;
+    // hash_map to quick search through the collection
+    IntVectorMap boot_map;
+    // multinomial probability of distinct bootstrap alignments
+    DoubleVector boot_prob;
+
+    // index from bootstrap number b to distinct bootstrap alignment
+    IntVector boot_index;
+    // number of distinct bootstrap alignments per B
+    IntVector diff_boot_alns;
+
+    // map from distinct alignment to tree
+    IntVector aln_tree_map;
+    StringIntMap tree_map;
+    // vector of all distinct reconstructed trees
+    MTreeSet boot_trees;
+    int id;
+
+    vector<ModelInfo> model_info;
+
+    cout << "Checking Arndt curiosity for " << params.avh_test << " bootstrap replicates ..." << endl;
+
+    // generate all distinct bootstrap alignments
+    cout << "Theoretical number of distinct alignments = " <<
+         exp(logNchooseK(alignment->getNSite()+alignment->getNPattern()-1, alignment->getNPattern()-1)) << endl;
+    IntVector afreq;
+    //generateMultinorm(x, alignment->getNSite(), alignment->getNPattern(), 0, alignment->getNSite());
+    generateFirstMultinorm(afreq, alignment->getNSite(), alignment->getNPattern());
+    int num_multi = 0;
+    do {
+        num_multi++;
+        cout << num_multi << ": ";
+        for (id = 0; id < afreq.size(); id++) cout << afreq[id] << " ";
+        cout << endl;
+        // boot_freqs owns the copy; it is released at the end of this function
+        IntVector *boot_freq = new IntVector;
+        *boot_freq = afreq;
+        boot_map[boot_freq] = boot_freqs.size();
+        boot_freqs.push_back(boot_freq);
+        boot_times.push_back(0);
+        boot_prob.push_back(alignment->multinomialProb(*boot_freq));
+    } while (generateNextMultinorm(afreq));
+    cout << num_multi << " distinct bootstrap alignments" << endl;
+
+    // generate usual bootstrap alignments
+    int diff_boot_aln = 0;
+    for (id = 0; id < params.avh_test; id++) {
+        IntVector *boot_freq = new IntVector;
+        alignment->createBootstrapAlignment(*boot_freq, params.bootstrap_spec);
+        IntVectorMap::iterator it = boot_map.find(boot_freq);
+        if (it == boot_map.end()) { // not found
+            // every vector was enumerated above, so a miss is reported as an error
+            // NOTE(review): the statements after outError() only run if outError() returns - confirm
+            outError(__func__);
+            boot_index.push_back(boot_freqs.size());
+            boot_map[boot_freq] = boot_freqs.size();
+            boot_freqs.push_back(boot_freq);
+            boot_times.push_back(1);
+            boot_prob.push_back(alignment->multinomialProb(*boot_freq));
+        } else {
+            // first hit of this vector: one more distinct alignment was sampled
+            if (boot_times[it->second] == 0) diff_boot_aln++;
+            boot_times[it->second]++;
+            boot_index.push_back(it->second);
+            delete boot_freq;
+        }
+        diff_boot_alns.push_back(diff_boot_aln);
+    }
+
+    cout << boot_freqs.size() << " distinct alignments have been sampled" << endl;
+
+    // reconstruct tree for each distinct alignment
+    string orig_model = params.model_name;
+    int saved_aLRT_replicates = params.aLRT_replicates;
+    params.aLRT_replicates = 0;
+    for (id = 0; id < boot_freqs.size(); id++) {
+        cout << endl << "===> COMPUTING TREE FOR ALIGNMENT " << id << endl;
+        // NOTE(review): boot_aln is never deleted (the delete below is disabled) - confirm who owns it
+        Alignment *boot_aln = new Alignment;
+        boot_aln->extractPatternFreqs(alignment, *boot_freqs[id]);
+
+        IQTree boot_tree(boot_aln);
+        runTreeReconstruction(params, orig_model, boot_tree, model_info);
+        boot_tree.setRootNode(params.root);
+        // the printed tree string (taxa sorted) is the key identifying a distinct tree
+        stringstream ss;
+        boot_tree.printTree(ss, WT_SORT_TAXA);
+        string str = ss.str();
+        StringIntMap::iterator it = tree_map.find(str);
+        if (it == tree_map.end()) { // not found
+            tree_map[str] = boot_trees.size();
+            aln_tree_map.push_back(boot_trees.size());
+            MTree *tree = new MTree;
+            tree->readTree(ss, params.is_rooted);
+            boot_trees.push_back(tree);
+        } else {
+            aln_tree_map.push_back(it->second);
+        }
+        //delete boot_aln;
+    }
+    cout << boot_trees.size() << " distinct trees have been reconstructed" << endl;
+    // one line per distinct alignment: frequency vector, probability, tree index
+    string out_file = params.out_prefix;
+    out_file += ".bootmap";
+    ofstream out;
+    out.open(out_file.c_str());
+    for (id = 0; id < boot_freqs.size(); id++) {
+        for (int i = 0; i < boot_freqs[id]->size(); i++) out << boot_freqs[id]->at(i) << " ";
+        out << boot_prob[id] << " " << aln_tree_map[id] << endl;
+    }
+    out.close();
+    cout << "===> EVALUATING TREES ON ORIGINAL ALIGNMENT" << endl;
+    out_file = params.out_prefix;
+    out_file += ".trees";
+    boot_trees.printTrees(out_file.c_str(),WT_SORT_TAXA);
+    params.min_iterations = 0;
+    runTreeReconstruction(params, orig_model, tree, model_info);
+    // params.treeset_file is pointed at the buffer of the local string out_file
+    // only for the duration of evaluateTrees(); it is restored right afterwards
+    // so that it does not dangle once out_file is reassigned or destroyed
+    char *saved_treeset_file = params.treeset_file;
+    params.treeset_file = (char*)out_file.c_str();
+    evaluateTrees(params, &tree);
+    params.treeset_file = saved_treeset_file;
+
+    params.aLRT_replicates = saved_aLRT_replicates;
+
+    double logn = log(params.avh_test);
+    if (verbose_mode >= VB_MED) {
+        for (int j = 0; j < boot_freqs.size(); j++) {
+            cout << "p=" << alignment->multinomialProb(*boot_freqs[j])
+                 << " p_obs=" << log(boot_times[j]) - logn << " tree=" << aln_tree_map[j] << " ";
+            //boot_trees[aln_tree_map[j]]->printTree(cout,WT_SORT_TAXA);
+            cout << " ";
+
+            for (int i = 0; i < boot_freqs[j]->size(); i++)
+                cout << boot_freqs[j]->at(i) << " ";
+            cout << endl;
+        }
+    }
+
+    // computing weights: the maximum of boot_prob is subtracted before exp(),
+    // so the largest weight is exp(0) = 1
+    double max_prob = *max_element(boot_prob.begin(), boot_prob.end());
+    DoubleVector boot_weight;
+    boot_weight.resize(boot_prob.size(), 0.0);
+    for (id = 0; id < boot_freqs.size(); id++)
+        boot_weight[id] = exp(boot_prob[id] - max_prob);
+
+
+    // summarize results
+    out_file = params.out_prefix;
+    out_file += ".avh";
+    out.open(out_file.c_str());
+    out << boot_trees.size() << endl;
+    //boot_trees.printTrees(out, WT_SORT_TAXA);
+
+    /* computing true bootstrap probabilities based on all distinct bootstrap alignments */
+    DoubleVector tree_weights;
+    tree_weights.resize(boot_trees.size(), 0);
+    for (id = 0; id < boot_freqs.size(); id++)
+        tree_weights[aln_tree_map[id]] += boot_weight[id];
+    double sum_weight = accumulate(tree_weights.begin(), tree_weights.end(), 0.0);
+    for (id = 0; id < boot_trees.size(); id++) {
+        tree_weights[id] /= sum_weight;
+        out << tree_weights[id] << endl;
+    }
+
+    out << "B\tTree\tpB_T\tDiff_B\tpwB_T" << endl;
+    for (int sample = 1; sample <= params.avh_test; sample++) {
+        // weighted bootstrap version: each distinct alignment among the first
+        // 'sample' replicates contributes its weight exactly once
+        tree_weights.resize(0);
+        tree_weights.resize(boot_trees.size(), 0);
+        vector<bool> duplicated;
+        duplicated.resize(boot_freqs.size(), false);
+        for (id = 0; id < sample; id++)
+            if (!duplicated[boot_index[id]]) {
+                tree_weights[aln_tree_map[boot_index[id]]] += boot_weight[boot_index[id]];
+                duplicated[boot_index[id]] = true;
+            }
+        // separate name: the original redeclared sum_weight here, hiding the outer one
+        double sample_sum = accumulate(tree_weights.begin(), tree_weights.end(), 0.0);
+        for (id = 0; id < boot_trees.size(); id++) {
+            tree_weights[id] /= sample_sum;
+        }
+
+        // by standard bootstrap: every replicate counts 1 for its tree
+        DoubleVector normal_tree_weights;
+        normal_tree_weights.resize(boot_trees.size(), 0);
+        for (id = 0; id < sample; id++) {
+            normal_tree_weights[aln_tree_map[boot_index[id]]] += 1;
+        }
+        sample_sum = accumulate(normal_tree_weights.begin(), normal_tree_weights.end(), 0.0);
+        for (id = 0; id < boot_trees.size(); id++)
+            normal_tree_weights[id] /= sample_sum;
+        // print results
+        for (id = 0; id < boot_trees.size(); id++)
+        {
+            out << sample << "\t" << id << "\t" << normal_tree_weights[id] << "\t"
+            << diff_boot_alns[sample-1] << "\t" << tree_weights[id] << endl;
+        }
+    }
+
+    out.close();
+    cout << "Results printed to " << out_file << endl;
+    // release the frequency vectors owned by boot_freqs
+    for (IntVectorCollection::reverse_iterator rit = boot_freqs.rbegin(); rit != boot_freqs.rend(); rit++)
+        delete (*rit);
+}
+
+/**
+	Produce the data for the likelihood-bootstrap plot.
+	params.bootlh_test alignments are processed in turn:
+	  - id 0 is the original alignment;
+	  - ids 1..partitions.size() (only if params.bootlh_partitions is given) are
+	    consecutive blocks of sites whose lengths are the entries of partitions,
+	    with all pattern frequencies multiplied by the number of partitions;
+	  - all remaining ids are bootstrap resamplings of the original alignment.
+	For every alignment the pattern frequencies go to <out_prefix>.bootfreq and
+	the line "id+1 distance [logl of each evaluated tree]" goes to
+	<out_prefix>.bootlhtest. Tree log-likelihoods are only computed when
+	params.treeset_file is set.
+	@param params program parameters; min_iterations is set to 0 and not restored
+	@param alignment original alignment
+	@param tree tree of the original alignment; only getAlnNSite() is used here
+*/
+void runBootLhTest(Params &params, Alignment *alignment, IQTree &tree) {
+    // collection of distinct bootstrapped site-pattern frequency vectors
+    cout << "Doing likelihood-bootstrap plot using Kullback-Leibler distance with " << params.bootlh_test << " bootstrap replicates ..." << endl;
+    int id, ptn;
+    // pattern frequencies of the original alignment (reference for the distance)
+    IntVector ptnfreq;
+    alignment->getPatternFreq(ptnfreq);
+    string orig_model = params.model_name;
+    vector<ModelInfo> model_info;
+    // block lengths (in sites) parsed from params.bootlh_partitions
+    IntVector partitions;
+
+    if (params.bootlh_partitions) {
+    	convert_int_vec(params.bootlh_partitions, partitions);
+    	cout << "Using " << partitions.size() << " partitions" << endl;
+    }
+
+    string outfile = params.out_prefix;
+    outfile += ".bootlhtest";
+    ofstream out;
+    out.open(outfile.c_str());
+    string bootfreqfile = params.out_prefix; // bootstrap pattern frequency vector file
+    bootfreqfile += ".bootfreq";
+    ofstream outfreq;
+    outfreq.open(bootfreqfile.c_str());
+    //out << "ID KLdist" << endl;
+
+    out.precision(8);
+    params.min_iterations = 0; // do not do tree search
+    // first site (0-based) of the next partition block; advanced after each block
+    int start_site = 0;
+    for (id = 0; id < params.bootlh_test; id++) {
+    	Alignment *boot_aln;
+        IntVector boot_freq;
+        if (id==0) {
+        	// include original alignment
+        	boot_aln = alignment;
+        	boot_freq = ptnfreq;
+        } else if (id <= partitions.size()) {
+        	// partition block number id-1: sites [start_site, end_site)
+        	int end_site = start_site + partitions[id-1];
+        	boot_freq.resize(ptnfreq.size(), 0);
+        	for (int site = start_site; site < end_site; site++)
+        		boot_freq[alignment->getPatternID(site)]++;
+        	// now multiplying the frequencies
+        	for ( ptn = 0; ptn < boot_freq.size(); ptn++)
+        		boot_freq[ptn]*=partitions.size();
+    		if (alignment->isSuperAlignment())
+    			boot_aln = new SuperAlignment;
+    		else
+    			boot_aln = new Alignment;
+    		// the site range is passed to extractSites() as the 1-based string "first-last"
+    		stringstream sitestr;
+    		sitestr << start_site+1 << "-" << end_site;
+        	cout << "-->Extracting sites " << sitestr.str() << endl;
+    		boot_aln->extractSites(alignment, sitestr.str().c_str());
+        	// now multiplying the frequencies
+    		for (ptn = 0; ptn < boot_aln->size(); ptn++)
+    			boot_aln->at(ptn).frequency *= partitions.size();
+    		start_site = end_site;
+        } else {
+    		// ordinary bootstrap resampling; boot_freq is filled by createBootstrapAlignment()
+    		if (alignment->isSuperAlignment())
+    			boot_aln = new SuperAlignment;
+    		else
+    			boot_aln = new Alignment;
+        	boot_aln->createBootstrapAlignment(alignment, &boot_freq, params.bootstrap_spec);
+        }
+        for ( ptn = 0; ptn < boot_freq.size(); ptn++)
+        	outfreq << "\t" << boot_freq[ptn];
+        outfreq << endl;
+        // computing Kullback-Leibler distance
+        // sum over patterns of boot_freq * log(boot_freq / ptnfreq), divided by
+        // tree.getAlnNSite(); patterns with boot_freq == 0 are skipped
+        double dist = 0.0;
+        for ( ptn = 0; ptn < ptnfreq.size(); ptn++)
+        	if (boot_freq[ptn]) {
+        		dist += log(((double)boot_freq[ptn])/ptnfreq[ptn]) * boot_freq[ptn];
+        	}
+        dist /= tree.getAlnNSite();
+        out << id+1 << " " << dist;
+        // now run analysis and compute tree likelihood for params.treeset_file
+        if (params.treeset_file) {
+			// boot_tree lives only inside this block, so it is destroyed
+			// before boot_aln is deleted below
+			IQTree boot_tree(boot_aln);
+			runTreeReconstruction(params, orig_model, boot_tree, model_info);
+        	vector<TreeInfo> info;
+        	IntVector distinct_ids;
+        	evaluateTrees(params, &boot_tree, info, distinct_ids);
+            for (int i = 0; i < info.size(); i++)
+            	out << " " << info[i].logl;
+        }
+        out << endl;
+        // for id 0 boot_aln is the caller's alignment and must not be freed
+        if (id != 0)
+        	delete boot_aln;
+    }
+    out.close();
+    outfreq.close();
+}
diff --git a/guidedbootstrap.h b/guidedbootstrap.h
new file mode 100644
index 0000000..8291800
--- /dev/null
+++ b/guidedbootstrap.h
@@ -0,0 +1,95 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef GUIDED_BOOTSTRAP_H
+#define GUIDED_BOOTSTRAP_H
+
+#include "tools.h"
+#include "iqtree.h"
+#include "alignment.h"
+
+/**
+	Hash functor for IntVector pointers used as keys of a hash map.
+	The hash is computed from the elements of the pointed-to vector, not from
+	the pointer value.
+*/
+struct hashfunc_IntVector {
+	/**
+		@param sp vector to hash (dereferenced, so it must point to a vector)
+		@return hash value built from all elements of *sp, 0 for an empty vector
+	*/
+	size_t operator()(const IntVector* sp) const {
+		const IntVector &vec = *sp;
+		size_t hash = 0;
+		for (size_t pos = 0; pos != vec.size(); ++pos) {
+			// the shifts amount to hash * 65599 (64 + 65536 - 1) plus the element
+			hash = vec[pos] + (hash << 6) + (hash << 16) - hash;
+		}
+		return hash;
+	}
+};
+
+// NOTE(review): IntVector is declared outside this header. If it is a plain
+// typedef of a standard container, these specializations do not involve a
+// program-defined type, which the C++ standard does not allow - confirm.
+namespace std {
+	/**
+		equal_to for IntVector pointers: two keys are equal when the vectors
+		they point to have equal content (used by hash_set / hash_map)
+	*/
+	template<>
+	struct equal_to<IntVector*> {
+		/**
+			@param s1 first IntVector
+			@param s2 second IntVector
+			@return true if *s1 == *s2
+		*/
+		bool operator()(const IntVector* s1, const IntVector* s2) const{
+			const IntVector &lhs = *s1;
+			const IntVector &rhs = *s2;
+			return lhs == rhs;
+		}
+	};
+	/**
+		less for IntVector pointers: orders the keys by the content of the
+		vectors they point to (used by set / map)
+	*/
+	template<>
+	struct less<IntVector*> {
+		/**
+			@param s1 first IntVector
+			@param s2 second IntVector, asserted to have the same size as *s1
+			@return true if *s1 precedes *s2 in element-wise (lexicographic) order
+		*/
+		bool operator()(const IntVector *s1, const IntVector *s2) const {
+			assert(s1->size() == s2->size());
+			IntVector::const_iterator lhs = s1->begin();
+			IntVector::const_iterator rhs = s2->begin();
+			// the first position where the vectors differ decides the order
+			for (; lhs != s1->end(); ++lhs, ++rhs) {
+				if (*lhs != *rhs)
+					return *lhs < *rhs;
+			}
+			// identical content: neither is less than the other
+			return false;
+		}
+	};
+} // namespace std
+
+
+// Map from a frequency vector (passed by pointer) to an integer.
+// With USE_HASH_MAP the keys are hashed with hashfunc_IntVector; no key-equality
+// argument is given, so the container's default for IntVector* applies (the
+// equal_to<IntVector*> specialization above). Without USE_HASH_MAP a map with
+// its default ordering for IntVector* is used (the less<IntVector*>
+// specialization above).
+#ifdef USE_HASH_MAP
+typedef unordered_map<IntVector*, int, hashfunc_IntVector> IntVectorMap;
+#else
+typedef map<IntVector*, int> IntVectorMap;
+#endif
+
+// plain list of IntVector pointers
+typedef vector<IntVector*> IntVectorCollection;
+
+/**
+	OBSOLETE: run guided bootstrap (this function was only used at the beginning of the UFBoot project)
+*/
+void runGuidedBootstrap(Params &params, Alignment *alignment, IQTree &tree);
+
+void runAvHTest(Params &params, Alignment *alignment, IQTree &tree);
+
+/**
+ * make the plot with x-axis being the alignments and y-axis being the likelihood of all trees
+ * Right now we use Kullback-Leibler distance to arrange alignments on the x-axis
+ */
+void runBootLhTest(Params &params, Alignment *alignment, IQTree &tree);
+
+#endif
diff --git a/gurobiwrapper.cpp b/gurobiwrapper.cpp
new file mode 100644
index 0000000..c4f187c
--- /dev/null
+++ b/gurobiwrapper.cpp
@@ -0,0 +1,135 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+#include <string.h>
+#include <sstream>
+#include "tools.h"
+#include "gurobiwrapper.h"
+
+#define tolerance 0.000001
+
+/**
+	interface to call GUROBI LP solver
+	Runs "gurobi_cl" through system(), with the solution written to
+	<filename>.sol and the solver's standard output redirected to
+	<filename>.log, then parses both files.
+	@param filename name of input lp file. It is inserted into the shell command
+		as is (no quoting), so it must not contain blanks or shell metacharacters
+	@param ntaxa number of taxa; valid variable indices are 0..ntaxa-1
+	@param score (OUT) returned optimal score; stays -1 if no objective value
+		is found in the log file
+	@param variables (OUT) array of returned solution, indexed by the number
+		following 'x' in the variable name
+	@param verbose_mode verbose mode
+	@param num_threads value passed to gurobi_cl as Threads=
+	@return 
+		-1 if gurobi was not installed properly or does not exist at all
+		0 if everything works fine, 
+		5 if solution is not optimal (no code path of this function returns 5), 
+		6 if some variable has wrong name, 
+		7 if returned solution is not binary. In this case, one should run the solver 
+		again with strict binary variable constraint.
+*/
+int gurobi_solve(char *filename, int ntaxa, double *score, double *variables, int verbose_mode, int num_threads) {
+	int ret = 0;
+	*score = -1;
+	string command;
+	ostringstream ss;
+
+	ss << "gurobi_cl Threads=" << num_threads << " ResultFile=" << filename
+		<< ".sol MIPGap=0 "<< filename  << " >" << filename << ".log ";
+	command = ss.str();
+	if (verbose_mode >= VB_MED)
+		cout << command << endl;
+	int sys_ret = system(command.c_str());
+	if (sys_ret != 0) {
+		cout << "gurobi_cl could not be executed. Make sure it was installed with proper license." << endl;
+		cout << command << endl;
+		return -1;
+	}
+
+	// ---- read the variable values from <filename>.sol ----
+	command = filename;
+	command += ".sol";
+
+	try {
+		ifstream in;
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(command.c_str());
+		string str;
+
+		while (!in.eof()) {
+			// remove the failbit
+			// (so that hitting end of file while reading the next token
+			// ends the loop instead of throwing)
+			in.exceptions(ios::badbit);
+			if(!(in >> str)) break;
+			// set the failbit again
+			in.exceptions(ios::failbit | ios::badbit);
+			// only tokens starting with 'x' are variable names
+			if (str[0] != 'x') continue;
+			int index = convert_int(str.substr(1).c_str());
+			if (index < 0 || index >= ntaxa) {
+				cout << "Index x_" << index << " is not in the range!" << endl;
+				ret = 6;
+				break;
+			}
+			double value;
+			in >> value;
+			// value is neither (close to) 0 nor (close to) 1: not binary
+			if (value > tolerance && (1.0 - value) > tolerance) {
+				if (verbose_mode >= VB_MED) cout << endl << str << " = " << value;
+				ret = 7;
+				// with verbose_mode == 0 stop at the first non-binary value,
+				// otherwise keep reading the remaining variables
+				if (!verbose_mode) break;
+			}
+			variables[index] = value;
+		}
+		in.clear();
+		// set the failbit again
+		in.exceptions(ios::failbit | ios::badbit);
+		in.close();
+	} catch (const ios::failure &) {
+		// caught by reference: no copy and no slicing of the exception object
+		outError(ERR_READ_INPUT);
+	} catch (const string &str) {
+		outError(str);
+	}
+
+	// ---- read the objective value from <filename>.log ----
+	command = filename;
+	command += ".log";
+	try {
+		ifstream in;
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(command.c_str());
+		string str;
+
+		while (!in.eof()) {
+			in.exceptions(ios::badbit);
+			if(!(in >> str)) break;
+			// set the failbit again
+			in.exceptions(ios::failbit | ios::badbit);
+			// look for "Best objective <value>" or "Optimal objective <value>"
+			if (str != "Best" && str != "Optimal") continue;
+			in >> str;
+			if (str != "objective") continue;
+			in >> str;
+			// remove the ending comma ,
+			if (*str.rbegin() == ',') str.erase(str.length()-1);
+			*score = convert_double(str.c_str());
+			break;
+		}
+		in.clear();
+		// set the failbit again
+		in.exceptions(ios::failbit | ios::badbit);
+		in.close();
+	} catch (const ios::failure &) {
+		outError(ERR_READ_INPUT);
+	} catch (const string &str) {
+		outError(str);
+	}
+	return ret;
+}
diff --git a/gurobiwrapper.h b/gurobiwrapper.h
new file mode 100644
index 0000000..3f9969c
--- /dev/null
+++ b/gurobiwrapper.h
@@ -0,0 +1,42 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef GUROBI_WRAPPER_H
+#define GUROBI_WRAPPER_H
+
+
+/**
+	interface to call GUROBI LP solver
+	@param filename name of input lp file
+	@param ntaxa number of taxa
+	@param score (OUT) returned optimal score
+	@param variables (OUT) array of returned solution
+	@param verbose_mode verbose mode
+	@param num_threads number of threads requested from the solver
+	@return 
+		-1 if gurobi was not installed properly or does not exist at all
+		0 if everything works fine, 
+		5 if solution is not optimal, 
+		6 if some variable has wrong name, 
+		7 if returned solution is not binary. In this case, one should run the solver 
+		again with strict binary variable constraint.
+*/
+int gurobi_solve(char *filename, int ntaxa, double *score, double *variables, int verbose_mode, int num_threads);
+
+
+#endif
diff --git a/gzstream.cpp b/gzstream.cpp
new file mode 100644
index 0000000..d9c81a3
--- /dev/null
+++ b/gzstream.cpp
@@ -0,0 +1,170 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// ============================================================================
+//
+// File          : gzstream.C
+// Revision      : $Revision: 1.7 $
+// Revision_date : $Date: 2003/01/08 14:41:27 $
+// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
+// 
+// Standard streambuf implementation following Nicolai Josuttis, "The 
+// Standard C++ Library".
+// ============================================================================
+
+#include "gzstream.h"
+#include <iostream>
+#include <string.h>  // for memcpy
+
+#ifdef GZSTREAM_NAMESPACE
+namespace GZSTREAM_NAMESPACE {
+#endif
+
+// ----------------------------------------------------------------------------
+// Internal classes to implement gzstream. See header file for user classes.
+// ----------------------------------------------------------------------------
+
+// --------------------------------------
+// class gzstreambuf:
+// --------------------------------------
+
+gzstreambuf* gzstreambuf::open( const char* name, int open_mode) {
+    if ( is_open())
+        return (gzstreambuf*)0;
+    mode = open_mode;
+    const bool reading = (mode & std::ios::in) != 0;
+    const bool writing = (mode & std::ios::out) != 0;
+    // append/at-end positioning and combined read/write are rejected
+    if ((mode & std::ios::ate) || (mode & std::ios::app) || (reading && writing))
+        return (gzstreambuf*)0;
+    // gzopen() mode string: optional 'r' or 'w', then 'b', then level '9'
+    char fmode[10];
+    int  len = 0;
+    if ( reading)
+        fmode[len++] = 'r';
+    else if ( writing)
+        fmode[len++] = 'w';
+    fmode[len++] = 'b';
+    fmode[len++] = '9'; // best compression ratio
+    fmode[len] = '\0';
+    file = gzopen( name, fmode);
+    opened = (file != 0) ? 1 : 0;    // stays 0 when gzopen() fails
+    return opened ? this : (gzstreambuf*)0;
+}
+
+gzstreambuf * gzstreambuf::close() {
+    // nothing to close: report failure with a null pointer
+    if ( ! is_open())
+        return (gzstreambuf*)0;
+    sync();        // hand pending output to gzwrite() first (result ignored)
+    opened = 0;    // marked closed even when gzclose() reports an error
+    const bool closed_ok = ( gzclose( file) == Z_OK);
+    return closed_ok ? this : (gzstreambuf*)0;
+}
+
+int gzstreambuf::underflow() { // input only: refill the get area, return next char or EOF
+    if ( gptr() && ( gptr() < egptr()))
+        return * reinterpret_cast<unsigned char *>( gptr());
+
+    if ( ! (mode & std::ios::in) || ! opened)
+        return EOF;
+    // Josuttis' implementation of inbuf: keep at most 4 already-read chars for putback
+    int n_putback = gptr() - eback();
+    if ( n_putback > 4)
+        n_putback = 4;
+    // both ranges lie in buffer and may overlap after a short read: memmove, not memcpy
+    memmove( buffer + (4 - n_putback), gptr() - n_putback, n_putback);
+    int num = gzread( file, buffer+4, bufferSize-4);
+    if (num <= 0) // ERROR or EOF
+        return EOF;
+
+    // reset buffer pointers
+    setg( buffer + (4 - n_putback),   // beginning of putback area
+          buffer + 4,                 // read position
+          buffer + 4 + num);          // end of buffer
+
+    // return next character
+    return * reinterpret_cast<unsigned char *>( gptr());    
+}
+
+int gzstreambuf::flush_buffer() {
+    // Shared by overflow() and sync(): write the pending put area, then rewind it.
+    const int pending = pptr() - pbase();
+    const int written = gzwrite( file, pbase(), pending);
+    if ( written != pending)
+        return EOF;    // short or failed write; put pointer left untouched
+    pbump( -pending);
+    return pending;
+}
+
+int gzstreambuf::overflow( int c) { // used for output buffer only
+    const bool writable = ( mode & std::ios::out) && opened;
+    if ( ! writable)
+        return EOF;
+    // append c (when it is a real character), then flush the whole put area
+    if (c != EOF) {
+        *pptr() = c;
+        pbump(1);
+    }
+    return ( flush_buffer() == EOF) ? EOF : c;
+}
+
+int gzstreambuf::sync() {
+    // Uses flush_buffer() rather than overflow( EOF); the latter caused
+    // improper behavior with std::endl and flush() (bug reported by
+    // Vincent Ricard). Returns 0 on success, -1 when the flush fails.
+    if ( ! pptr() || pptr() <= pbase())
+        return 0;          // nothing buffered
+    if ( flush_buffer() == EOF)
+        return -1;
+    return 0;
+}
+
+// --------------------------------------
+// class gzstreambase:
+// --------------------------------------
+
+gzstreambase::gzstreambase( const char* name, int mode) { // attach buf, then open name
+    init( &buf);          // register buf as this stream's buffer
+    open( name, mode);    // open() sets badbit when buf.open() fails
+}
+
+gzstreambase::~gzstreambase() {
+    buf.close();    // result ignored; close() returns null when buf is not open or gzclose() fails
+}
+
+void gzstreambase::open( const char* name, int open_mode) {
+    if ( buf.open( name, open_mode) == 0)     // null result: the open failed
+        setstate( std::ios::badbit);          // same effect as clear( rdstate() | badbit)
+}
+
+void gzstreambase::close() {
+    // closing an unopened buffer is a no-op; a failed close sets badbit
+    const bool failed = buf.is_open() && buf.close() == 0;
+    if ( failed) setstate( std::ios::badbit);
+}
+
+z_off_t gzstreambase::get_raw_bytes() {
+	return gztell(buf.file);    // current offset reported by zlib's gztell() for buf's file handle
+}
+
+#ifdef GZSTREAM_NAMESPACE
+} // namespace GZSTREAM_NAMESPACE
+#endif
+
+// ============================================================================
+// EOF //
diff --git a/gzstream.h b/gzstream.h
new file mode 100644
index 0000000..f7f971f
--- /dev/null
+++ b/gzstream.h
@@ -0,0 +1,124 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001  Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// ============================================================================
+//
+// File          : gzstream.h
+// Revision      : $Revision: 1.5 $
+// Revision_date : $Date: 2002/04/26 23:30:15 $
+// Author(s)     : Deepak Bandyopadhyay, Lutz Kettner
+// 
+// Standard streambuf implementation following Nicolai Josuttis, "The 
+// Standard C++ Library".
+// ============================================================================
+
+#ifndef GZSTREAM_H
+#define GZSTREAM_H 1
+
+// standard C++ with new header file names and std:: namespace
+#include <iostream>
+#include <fstream>
+#include "zlib-1.2.7/zlib.h"
+
+#ifdef GZSTREAM_NAMESPACE
+namespace GZSTREAM_NAMESPACE {
+#endif
+
+// ----------------------------------------------------------------------------
+// Internal classes to implement gzstream. See below for user classes.
+// ----------------------------------------------------------------------------
+
+class gzstreambuf : public std::streambuf {   // streambuf that reads/writes through a zlib gzFile
+	friend class gzstreambase;
+private:
+    static const int bufferSize = 47+256;    // size of data buff
+    // totals 512 bytes under g++ for igzstream at the end.
+
+    gzFile           file;               // file handle for compressed file
+    char             buffer[bufferSize]; // data buffer
+    char             opened;             // open/close state of stream
+    int              mode;               // I/O mode
+
+    int flush_buffer();                  // writes the put area with gzwrite(); EOF on failure
+public:
+    // file and mode start zeroed so a never-opened buffer holds no indeterminate values
+    gzstreambuf() : file(0), opened(0), mode(0) {
+        setp( buffer, buffer + (bufferSize-1));
+        setg( buffer + 4,     // beginning of putback area
+              buffer + 4,     // read position
+              buffer + 4);    // end position      
+        // ASSERT: both input & output capabilities will not be used together
+    }
+    int is_open() { return opened; }                      // non-zero while a file is open
+    gzstreambuf* open( const char* name, int open_mode);  // returns this, or null on failure
+    gzstreambuf* close();                                 // returns this, or null on failure
+    ~gzstreambuf() { close(); }
+    virtual int     overflow( int c = EOF);
+    virtual int     underflow();
+    virtual int     sync();
+};
+
+class gzstreambase : virtual public std::ios {   // common base that owns the gzstreambuf
+protected:
+    gzstreambuf buf;                                 // buffer also passed to the istream/ostream base by the user classes
+public:
+    gzstreambase() { init(&buf); }                   // stream with no file opened
+    gzstreambase( const char* name, int open_mode);  // opens name on construction
+    ~gzstreambase();
+    void open( const char* name, int open_mode);
+    void close();
+	z_off_t get_raw_bytes(); // BQM: return number of uncompressed bytes
+
+    gzstreambuf* rdbuf() { return &buf; }            // direct access to the owned buffer
+};
+
+// ----------------------------------------------------------------------------
+// User classes. Use igzstream and ogzstream analogously to ifstream and
+// ofstream respectively. They read and write files based on the gz* 
+// function interface of the zlib. Files are compatible with gzip compression.
+// ----------------------------------------------------------------------------
+
+class igzstream : public gzstreambase, public std::istream {   // input stream reading through buf
+public:
+    igzstream() : std::istream( &buf) {}                         // no file opened yet
+    igzstream( const char* name, int open_mode = std::ios::in)   // opens name on construction
+        : gzstreambase( name, open_mode), std::istream( &buf) {}  
+    gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }       // forwards to gzstreambase::rdbuf()
+    void open( const char* name, int open_mode = std::ios::in) { // same as gzstreambase::open, with a default mode
+        gzstreambase::open( name, open_mode);
+    }
+};
+
+class ogzstream : public gzstreambase, public std::ostream {   // output stream writing through buf
+public:
+    ogzstream() : std::ostream( &buf) {}                         // no file opened yet
+    ogzstream( const char* name, int mode = std::ios::out)       // opens name on construction
+        : gzstreambase( name, mode), std::ostream( &buf) {}  
+    gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); }       // forwards to gzstreambase::rdbuf()
+    void open( const char* name, int open_mode = std::ios::out) { // same as gzstreambase::open, with a default mode
+        gzstreambase::open( name, open_mode);
+    }
+};
+
+#ifdef GZSTREAM_NAMESPACE
+} // namespace GZSTREAM_NAMESPACE
+#endif
+
+#endif // GZSTREAM_H
+// ============================================================================
+// EOF //
+
diff --git a/hashsplitset.cpp b/hashsplitset.cpp
new file mode 100644
index 0000000..93a2399
--- /dev/null
+++ b/hashsplitset.cpp
@@ -0,0 +1,72 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "hashsplitset.h"
+#include "splitgraph.h"
+
+Split *SplitIntMap::findSplit(Split *sp) {
+    // Returns the stored key that matches sp, or NULL when there is none.
+    iterator pos = find(sp);
+    if (pos == end())
+        return NULL;
+    return pos->first;
+}
+
+
+Split *SplitIntMap::findSplit(Split *sp, int &value) {
+    iterator pos = find(sp);
+    if (pos == end()) {
+        value = 0;    // a miss resets the output value
+        return NULL;
+    }
+    value = pos->second;
+    return pos->first;
+}
+
+int SplitIntMap::getValue(Split *sp) {
+    int stored;    // findSplit() always assigns it (0 on a miss)
+    if (findSplit(sp, stored) == NULL) outError(__func__);
+    return stored;
+}
+
+void SplitIntMap::setValue(Split *sp, int value) {
+    if (findSplit(sp) == NULL) outError(__func__);   // sp is expected to be a key already
+    operator[](sp) = value;
+}
+
+void SplitIntMap::eraseSplit(Split *sp) {
+    if (findSplit(sp) == NULL) outError(__func__);   // erasing an unknown split is reported
+    this->erase(sp);
+}
+
+void SplitIntMap::insertSplit(Split *sp, int value) {
+    if (findSplit(sp) != NULL) outError(__func__);   // a matching key already exists
+    if (VB_MAX <= verbose_mode) sp->report(cout);    // printed only at VB_MAX verbosity or above
+    operator[](sp) = value;
+}
+
+void SplitIntMap::buildMap(SplitGraph &sg, bool use_index) {
+	clear();    // start from an empty map
+	for (int idx = 0; idx < sg.size(); ++idx) {
+		if (!use_index)
+			insertSplit(sg[idx], sg[idx]->getWeight());    // value = split weight
+		else
+			insertSplit(sg[idx], idx);                      // value = position in sg
+	}
+}
diff --git a/hashsplitset.h b/hashsplitset.h
new file mode 100644
index 0000000..f86fdd0
--- /dev/null
+++ b/hashsplitset.h
@@ -0,0 +1,128 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef HASHSPLITSET_H
+#define HASHSPLITSET_H
+
+
+#include "tools.h"
+#include "split.h"
+
+using namespace std;
+
+class SplitGraph;
+
+#ifdef USE_HASH_MAP
+/*
+	Define the hash function of Split
+*/
+struct hashfunc_Split {
+	size_t operator()(const Split* sp) const {
+		size_t acc = 0;    // running hash folded over the elements of *sp
+		for (Split::const_iterator elem = sp->begin(); elem != sp->end(); ++elem)
+			acc = (*elem) + (acc << 6) + (acc << 16) - acc;
+		return acc;
+	}
+};
+#endif // USE_HASH_MAP
+
+namespace std {
+	/**
+		Equality of two splits given by pointer, used for hash_set (or hash_map) template
+	*/
+	template<>
+	struct equal_to<Split*> {
+		/**
+			@param s1 first split
+			@param s2 second split
+			@return true if *s1 == *s2
+		*/
+		bool operator()(const Split* s1, const Split* s2) const{
+			return (*s1) == (*s2);
+		}
+	};
+	/**
+		Ordering of two splits given by pointer, used for set (or map) template
+	*/
+	template<>
+	struct less<Split*> {
+		/**
+			@param s1 first split
+			@param s2 second split (asserted to have the same size as s1)
+			@return true if *s1 < *s2 element by element, false otherwise (also when equal)
+		*/
+		bool operator()(const Split *s1, const Split *s2) const {
+			assert(s1->size() == s2->size());
+			for (int pos = 0; pos < s1->size(); pos++) {
+				if ((*s1)[pos] != (*s2)[pos])        // first differing element decides
+					return (*s1)[pos] < (*s2)[pos];
+			}
+			return false;
+		}
+	};
+} // namespace std
+
+
+/**
+SplitSet for quick search purpose (publicly derived from the container in both
+build configurations, so container members are equally accessible either way)
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+#ifdef USE_HASH_MAP
+class SplitIntMap : public unordered_map<Split*, int, hashfunc_Split>
+#else
+class SplitIntMap : public map<Split*, int, std::less<Split*> > 
+#endif
+{
+public:
+
+	/**
+		find a split
+		@param sp target split
+		@return the split containing the same set of taxa with sp, NULL if not found
+	*/
+	Split *findSplit(Split *sp);
+
+	/**
+		find a split
+		@param sp target split
+		@param value (OUT) associated value
+		@return the split containing the same set of taxa with sp, NULL if not found
+	*/
+	Split *findSplit(Split *sp, int &value);
+	/** @return value associated with sp; outError() is called when sp is not found */
+	int getValue(Split *sp);
+	/** overwrite the value associated with sp; outError() is called when sp is not found */
+	void setValue(Split *sp, int value);
+	/** remove sp from the map; outError() is called when sp is not found */
+	void eraseSplit(Split *sp);
+	/** add sp with the given value; outError() is called when sp is already present */
+	void insertSplit(Split *sp, int value);
+
+	/**
+	 * build a map from the input split graph
+	 * @param sg input split graph
+	 * @param use_index TRUE to map to index of splits in sg, FALSE to map to split weights
+	*/
+	void buildMap(SplitGraph &sg, bool use_index = true);
+	
+};
+
+#endif
diff --git a/iqtree.cpp b/iqtree.cpp
new file mode 100644
index 0000000..22bf83e
--- /dev/null
+++ b/iqtree.cpp
@@ -0,0 +1,3207 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "iqtree.h"
+#include "phylosupertree.h"
+#include "phylosupertreeplen.h"
+#include "mexttree.h"
+#include "timeutil.h"
+#include "model/modelgtr.h"
+#include "model/rategamma.h"
+#include <numeric>
+#include "pllrepo/src/pllInternal.h"
+#include "pllrepo/src/pll.h"
+#include "pllnni.h"
+#include "vectorclass/vectorclass.h"
+#include "vectorclass/vectormath_common.h"
+
+
+Params *globalParam;
+Alignment *globalAlignment;
+extern StringIntMap pllTreeCounter;
+
+
+IQTree::IQTree() : PhyloTree() {
+    IQTree::init();    // qualified call: runs IQTree's own init() to reset the search fields
+}
+
+void IQTree::init() {
+    // NOTE: the call to PhyloTree::init() is disabled in this version.
+    k_represent = 0;
+    k_delete = k_delete_min = k_delete_max = k_delete_stay = 0;
+    // pointers start out null
+    dist_matrix = NULL;
+    var_matrix = NULL;
+    pll2iqtree_pattern_index = NULL;
+    // NNI-related settings
+    nni_count_est = 0.0;
+    nni_delta_est = 0;
+    estimate_nni_cutoff = false;
+    nni_cutoff = -1e6;
+    nni_sort = false;
+    testNNI = false;
+    fastNNI = true;
+    // scores, output and bookkeeping
+    cur_pars_score = -1;
+    logl_cutoff = 0.0;
+    len_scale = 10000;
+    print_tree_lh = false;
+    write_intermediate_trees = 0;
+    max_candidate_trees = 0;
+    save_all_br_lens = false;
+    duplication_counter = 0;
+}
+
+IQTree::IQTree(Alignment *aln) : PhyloTree(aln) {
+    IQTree::init();    // qualified call: runs IQTree's own init() after the base is built from aln
+}
+
+void IQTree::initSettings(Params &params) {
+    // Copies search, bootstrap and output settings from params into this tree,
+    // fills in defaults for values still at their -1 sentinels (written back into
+    // params), and pre-generates bootstrap samples when online bootstrap is on.
+    searchinfo.speednni = params.speednni;
+    searchinfo.nni_type = params.nni_type;
+    optimize_by_newton = params.optimize_by_newton;
+    setLikelihoodKernel(params.SSE);
+    candidateTrees.init(this->aln, &params);
+    if (params.min_iterations == -1) {
+        if (!params.gbo_replicates) {
+            if (params.stop_condition == SC_UNSUCCESS_ITERATION) {
+                params.min_iterations = aln->getNSeq() * 100;
+            } else if (aln->getNSeq() < 100) {
+                params.min_iterations = 200;
+            } else {
+                params.min_iterations = aln->getNSeq() * 2;
+            }
+            if (params.iteration_multiple > 1)
+                params.min_iterations = aln->getNSeq() * params.iteration_multiple;
+        } else {
+            params.min_iterations = 100;
+        }
+    }
+    if (params.treeset_file && params.min_iterations == -1) { // NOTE(review): min_iterations was defaulted above, so this looks unreachable - confirm
+        params.min_iterations = 1;
+		params.stop_condition = SC_FIXED_ITERATION;
+		params.numInitTrees = 1;
+    }
+    if (params.gbo_replicates)
+        params.max_iterations = max(params.max_iterations, max(params.min_iterations, 1000));
+
+    k_represent = params.k_representative;
+
+    if (params.p_delete == -1.0) {
+        if (aln->getNSeq() < 4)
+            params.p_delete = 0.0; // delete nothing
+        else if (aln->getNSeq() == 4)
+            params.p_delete = 0.25; // just delete 1 leaf
+        else if (aln->getNSeq() == 5)
+            params.p_delete = 0.4; // just delete 2 leaves
+        else if (aln->getNSeq() < 51)
+            params.p_delete = 0.5;
+        else if (aln->getNSeq() < 100)
+            params.p_delete = 0.3;
+        else if (aln->getNSeq() < 200)
+            params.p_delete = 0.2;
+        else if (aln->getNSeq() < 400)
+            params.p_delete = 0.1;
+        else
+            params.p_delete = 0.05;
+    }
+    // NOTE(review): p_delete was defaulted just above, so the else branch below looks unreachable - confirm
+    if (params.p_delete != -1.0) {
+        k_delete = k_delete_min = k_delete_max = ceil(params.p_delete * leafNum);
+    } else {
+        k_delete = k_delete_min = 10;
+        k_delete_max = leafNum / 2;
+        if (k_delete_max > 100)
+            k_delete_max = 100;
+        if (k_delete_max < 20)
+            k_delete_max = 20;
+        k_delete_stay = ceil(leafNum / k_delete);
+    }
+
+    // the stopping rule takes its settings from params
+
+    stop_rule.initialize(params);
+
+    // NNI / IQP options copied verbatim from params
+    iqp_assess_quartet = params.iqp_assess_quartet;
+    estimate_nni_cutoff = params.estimate_nni_cutoff;
+    nni_cutoff = params.nni_cutoff;
+    nni_sort = params.nni_sort;
+    testNNI = params.testNNI;
+
+    globalParam = &params;    // file-level pointers to the active parameters and alignment
+    globalAlignment = aln;
+
+    write_intermediate_trees = params.write_intermediate_trees;
+
+    if (write_intermediate_trees > 2 || params.gbo_replicates > 0) {
+        save_all_trees = 1;
+    }
+    if (params.gbo_replicates > 0) {
+        if (params.iqp_assess_quartet != IQP_BOOTSTRAP) {
+            save_all_trees = 2;
+        }
+    }
+    if (params.gbo_replicates > 0 && params.do_compression)
+        save_all_br_lens = true;
+    print_tree_lh = params.print_tree_lh;
+    max_candidate_trees = params.max_candidate_trees;
+    if (max_candidate_trees == 0)
+        max_candidate_trees = aln->getNSeq() * params.step_iterations;
+    setRootNode(params.root);
+
+    string bootaln_name = params.out_prefix;
+    bootaln_name += ".bootaln";
+    if (params.print_bootaln) {
+        ofstream bootalnout;    // open and close at once: creates/truncates the file
+    	bootalnout.open(bootaln_name.c_str());
+    	bootalnout.close();
+    }
+    size_t i;
+
+    if (params.online_bootstrap && params.gbo_replicates > 0) {
+        cout << "Generating " << params.gbo_replicates << " samples for ultrafast bootstrap..." << endl;
+        // allocate memory for boot_samples
+        boot_samples.resize(params.gbo_replicates);
+        size_t orig_nptn = getAlnNPattern();
+#ifdef BOOT_VAL_FLOAT
+        size_t nptn = get_safe_upper_limit_float(orig_nptn);
+#else
+        size_t nptn = get_safe_upper_limit(orig_nptn);
+#endif
+        BootValType *mem = aligned_alloc<BootValType>(nptn * (size_t)(params.gbo_replicates));
+        memset(mem, 0, nptn * (size_t)(params.gbo_replicates) * sizeof(BootValType));
+        for (i = 0; i < params.gbo_replicates; i++)
+        	boot_samples[i] = mem + i*nptn;    // every replicate is a slice of the single block mem
+
+        boot_logl.resize(params.gbo_replicates, -DBL_MAX);
+        boot_trees.resize(params.gbo_replicates, -1);
+        boot_counts.resize(params.gbo_replicates, 0);
+        VerboseMode saved_mode = verbose_mode;
+        verbose_mode = VB_QUIET;    // silenced while sampling, restored after the loop
+        for (i = 0; i < params.gbo_replicates; i++) {
+        	if (params.print_bootaln) {
+    			Alignment* bootstrap_alignment;
+    			if (aln->isSuperAlignment())
+    				bootstrap_alignment = new SuperAlignment;
+    			else
+    				bootstrap_alignment = new Alignment;
+    			IntVector this_sample;
+    			bootstrap_alignment->createBootstrapAlignment(aln, &this_sample, params.bootstrap_spec);
+    			for (size_t j = 0; j < orig_nptn; j++)
+    				boot_samples[i][j] = this_sample[j];
+				bootstrap_alignment->printPhylip(bootaln_name.c_str(), true);
+				delete bootstrap_alignment;
+        	} else {
+    			IntVector this_sample;
+        		aln->createBootstrapAlignment(this_sample, params.bootstrap_spec);
+    			for (size_t j = 0; j < orig_nptn; j++)
+    				boot_samples[i][j] = this_sample[j];
+        	}
+        }
+        verbose_mode = saved_mode;
+        if (params.print_bootaln) {
+        	cout << "Bootstrap alignments printed to " << bootaln_name << endl;
+        }
+
+        cout << "Max candidate trees (tau): " << max_candidate_trees << endl;
+    }
+
+    if (params.root_state) {
+        if (strlen(params.root_state) != 1)
+            outError("Root state must have exactly 1 character");
+        root_state = aln->convertState(params.root_state[0]);
+        if (root_state < 0 || root_state >= aln->num_states)
+            outError("Invalid root state");
+    }
+}
+
+void myPartitionsDestroy(partitionList *pl) {
+	// Release every partition (its name first), then the array, then the list itself.
+	for (int part = 0; part < pl->numberOfPartitions; ++part) {
+		rax_free(pl->partitionData[part]->partitionName);
+		rax_free(pl->partitionData[part]);
+	}
+	rax_free(pl->partitionData);
+	rax_free(pl);
+}
+
+IQTree::~IQTree() {
+    // distance/variance matrices (delete[] on a null pointer is a no-op)
+    delete[] dist_matrix;
+    dist_matrix = NULL;
+    delete[] var_matrix;
+    var_matrix = NULL;
+
+    // arrays held in treels_ptnlh, released back to front
+    vector<double*>::reverse_iterator lh_it = treels_ptnlh.rbegin();
+    while (lh_it != treels_ptnlh.rend()) {
+        delete[] (*lh_it);
+        ++lh_it;
+    }
+    treels_ptnlh.clear();
+
+    // split graphs held in boot_splits, released back to front
+    vector<SplitGraph*>::reverse_iterator sg_it = boot_splits.rbegin();
+    for (; sg_it != boot_splits.rend(); ++sg_it)
+        delete (*sg_it);
+
+    // PLL objects are destroyed only when present
+    if (pllPartitions != NULL) myPartitionsDestroy(pllPartitions);
+    if (pllAlignment != NULL) pllAlignmentDataDestroy(pllAlignment);
+    if (pllInst != NULL) pllDestroyInstance(pllInst);
+
+    // boot_samples[0] is the base of the block allocated in initSettings()
+    if (!boot_samples.empty()) aligned_free(boot_samples[0]);
+}
+
+extern const char *aa_model_names_rax[];
+
+void IQTree::createPLLPartition(Params &params, ostream &pllPartitionFileHandle) {
+    // Writes one "<model>, p<k> = <first>-<last>" line per partition to the stream.
+    if (!isSuperTree()) {
+        /* single alignment: one partition spanning every site */
+        string model = "WAG";    // used unless a branch below overrides it
+        if (aln->seq_type == SEQ_DNA) {
+            model = "DNA";
+        } else if (aln->seq_type == SEQ_PROTEIN) {
+            if (params.pll && params.model_name != "" && params.model_name.substr(0, 4) != "TEST")
+                model = params.model_name.substr(0, params.model_name.find_first_of("+{"));
+        }
+        //outError("PLL currently only supports DNA/protein alignments");
+        pllPartitionFileHandle << model << ", p1 = " << "1-" << getAlnNSite() << endl;
+        return;
+    }
+
+    PhyloSuperTree *siqtree = (PhyloSuperTree*) this;
+    // additional check for PLL hard limit
+    if (siqtree->size() > PLL_NUM_BRANCHES)
+    	outError("Number of partitions exceeds PLL limit, please increase PLL_NUM_BRANCHES constant in pll.h");
+    int partNo = 0;       // 1-based number written as p<partNo>
+    int firstSite = 1;    // 1-based first column of the current partition
+    for (PhyloSuperTree::iterator it = siqtree->begin(); it != siqtree->end(); it++) {
+        partNo++;
+        const int nsite = (*it)->getAlnNSite();
+
+        if ((*it)->aln->seq_type == SEQ_DNA) {
+            pllPartitionFileHandle << "DNA";
+        } else if ((*it)->aln->seq_type == SEQ_PROTEIN) {
+            string pllName = "WAG";    // kept when the partition model is empty, TEST* or not recognised
+            const string &fullName = siqtree->part_info[partNo - 1].model_name;
+            if (fullName != "" && fullName.substr(0, 4) != "TEST") {
+                // keep only the part before the first '+' or '{'
+                string baseName = fullName.substr(0, fullName.find_first_of("+{"));
+                if (baseName == "LG4")
+                    baseName = "LG4M";
+                // accept the name only if it equals one of the first 18 aa_model_names_rax entries
+                for (int j = 0; j < 18; j++) {
+                    if (baseName == aa_model_names_rax[j]) {
+                        pllName = baseName;
+                        break;
+                    }
+                }
+            }
+            pllPartitionFileHandle << pllName;
+        } else {
+            outError("PLL only works with DNA/protein alignments");
+        }
+
+        // site range of this partition, then advance to the next one
+        const int lastSite = firstSite + nsite - 1;
+        pllPartitionFileHandle << ", p" << partNo << " = " << firstSite << "-" << lastSite << endl;
+        firstSite = firstSite + nsite;
+    }
+}
+
+/**
+ * Build the starting tree of the search and load it into this object.
+ *
+ * A user tree file (params->user_file) takes precedence; otherwise the tree
+ * is created according to params->start_tree (IQ-TREE parsimony, PLL
+ * parsimony, BIONJ or a random Yule-Harding tree).
+ *
+ * @param dist_file distance file name handed on to computeBioNJ()
+ * @param kernel    kernel passed to setParsimonyKernel(); it also selects the
+ *                  progress message printed for the IQ-TREE parsimony tree
+ */
+void IQTree::computeInitialTree(string &dist_file, LikelihoodKernel kernel) {
+    double start = getRealTime();
+    string out_file = params->out_prefix;
+    int score;
+    // with a fixed iteration count, never run NNI on more trees than iterations
+    if (params->stop_condition == SC_FIXED_ITERATION && params->numNNITrees > params->min_iterations)
+        params->numNNITrees = params->min_iterations;
+    int fixed_number = 0;
+    setParsimonyKernel(kernel);
+
+    if (params->user_file) {
+        // start the search with user-defined tree
+        cout << "Reading input tree file " << params->user_file << " ..." << endl;
+        bool myrooted = params->is_rooted;
+        readTree(params->user_file, myrooted);
+        setAlignment(aln);
+        if (isSuperTree())
+            wrapperFixNegativeBranch(!params->fixed_branch_length);
+        else
+            fixed_number = wrapperFixNegativeBranch(false);
+        // a user tree is the only starting tree
+        params->numInitTrees = 1;
+        params->numNNITrees = 1;
+        // change to old kernel if tree is multifurcating
+        if ((params->SSE == LK_EIGEN || params->SSE == LK_EIGEN_SSE) && !isBifurcating()) {
+            cout << "NOTE: Changing to old kernel as input tree is multifurcating" << endl;
+            params->SSE = LK_SSE;
+        }
+        if (params->pll)
+            pllReadNewick(getTreeString());
+    } else switch (params->start_tree) {
+    case STT_PARSIMONY:
+        // Create parsimony tree using IQ-Tree kernel
+        if (kernel == LK_EIGEN_SSE)
+            cout << "Creating fast SIMD initial parsimony tree by random order stepwise addition..." << endl;
+        else if (kernel == LK_EIGEN)
+            cout << "Creating fast initial parsimony tree by random order stepwise addition..." << endl;
+        else
+            cout << "Creating initial parsimony tree by random order stepwise addition..." << endl;
+        start = getRealTime();
+        score = computeParsimonyTree(params->out_prefix, aln);
+        cout << getRealTime() - start << " seconds, parsimony score: " << score
+            << " (based on " << aln->num_informative_sites << " informative sites)"<< endl;
+        wrapperFixNegativeBranch(false);
+
+        break;
+    case STT_PLL_PARSIMONY:
+        cout << endl;
+        cout << "Create initial parsimony tree by phylogenetic likelihood library (PLL)... ";
+        pllInst->randomNumberSeed = params->ran_seed;
+        pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInst, pllPartitions, params->sprDist);
+        resetBranches(pllInst);
+        pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back,
+                PLL_FALSE, PLL_TRUE, PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE);
+        PhyloTree::readTreeString(string(pllInst->tree_string));
+        cout << getRealTime() - start << " seconds" << endl;
+        wrapperFixNegativeBranch(true);
+        break;
+    case STT_BIONJ:
+        // This is the old default option: using BIONJ as starting tree
+        computeBioNJ(*params, aln, dist_file);
+        cout << getRealTime() - start << " seconds" << endl;
+        params->numInitTrees = 1;
+        if (isSuperTree())
+            wrapperFixNegativeBranch(true);
+        else
+            fixed_number = wrapperFixNegativeBranch(false);
+        break;
+    case STT_RANDOM_TREE:
+        cout << "Generate random initial Yule-Harding tree..." << endl;
+        generateRandomTree(YULE_HARDING);
+        wrapperFixNegativeBranch(true);
+        break;
+    }
+
+    if (fixed_number) {
+        cout << "WARNING: " << fixed_number << " undefined/negative branch lengths are initialized with parsimony" << endl;
+    }
+
+    if (params->root) {
+        string str = params->root;
+        if (!findNodeName(str)) {
+            // a separating space was missing before "not found"
+            str = "Specified root name " + str + " not found";
+            outError(str);
+        }
+    }
+    if (params->write_init_tree) {
+        out_file += ".init_tree";
+        printTree(out_file.c_str(), WT_NEWLINE);
+    }
+}
+
+/**
+ * Fill candidateTrees with starting trees.
+ *
+ * Steps: (1) generate nParTrees parsimony trees and store the distinct ones,
+ * (2) optimize branch lengths of each stored tree to obtain its
+ * log-likelihood, (3) keep the best nNNITrees and run an NNI search on each,
+ * re-optimizing model parameters whenever a better score is found.
+ *
+ * @param nParTrees number of parsimony trees to generate
+ * @param nNNITrees number of best trees passed on to the NNI search
+ */
+void IQTree::initCandidateTreeSet(int nParTrees, int nNNITrees) {
+    cout << "--------------------------------------------------------------------" << endl;
+    cout << "|             INITIALIZING CANDIDATE TREE SET                      |" << endl;
+    cout << "--------------------------------------------------------------------" << endl;
+
+    cout << "Generating " << nParTrees  << " parsimony trees... ";
+    cout.flush();
+    double startTime = getRealTime();
+    int numDupPars = 0;
+#ifdef _OPENMP
+    // Pre-compute the IQ-TREE parsimony trees in parallel. The vector holds one
+    // entry per requested tree so that every index read below is valid.
+    StrVector pars_trees;
+    if (params->start_tree == STT_PARSIMONY && nParTrees > 0) {
+        pars_trees.resize(nParTrees);
+        #pragma omp parallel
+        {
+            PhyloTree tree;
+            tree.setParams(params);
+            tree.setParsimonyKernel(params->SSE);
+            #pragma omp for
+            for (int i = 0; i < nParTrees; i++) {
+                tree.computeParsimonyTree(NULL, aln);
+                pars_trees[i] = tree.getTreeString();
+            }
+        }
+    }
+#endif
+    for (int treeNr = 1; treeNr <= nParTrees; treeNr++) {
+        string curParsTree;
+
+        /********* Create parsimony tree using PLL *********/
+        if (params->start_tree == STT_PLL_PARSIMONY) {
+            pllInst->randomNumberSeed = params->ran_seed + treeNr * 12345;
+            pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInst, pllPartitions, params->sprDist);
+            resetBranches(pllInst);
+            pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions,
+                    pllInst->start->back, PLL_FALSE, PLL_TRUE, PLL_FALSE,
+                    PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE);
+            curParsTree = string(pllInst->tree_string);
+            PhyloTree::readTreeString(curParsTree);
+            wrapperFixNegativeBranch(true);
+            curParsTree = getTreeString();
+        } else {
+            /********* Create parsimony tree using IQ-TREE *********/
+#ifdef _OPENMP
+            if ((size_t) (treeNr - 1) < pars_trees.size()) {
+                curParsTree = pars_trees[treeNr - 1];
+            } else
+#endif
+            {
+                // serial path; also used with OpenMP when nothing was pre-computed
+                computeParsimonyTree(NULL, aln);
+                curParsTree = getTreeString();
+            }
+        }
+
+        if (candidateTrees.treeExist(curParsTree)) {
+            numDupPars++;
+            continue;
+        } else {
+            if (params->count_trees) {
+                // NOTE(review): the lookup key is the topology string but the
+                // counter is indexed by curParsTree - confirm this is intended
+                string tree = getTopology();
+                if (pllTreeCounter.find(tree) == pllTreeCounter.end()) {
+                    // not found in hash_map
+                    pllTreeCounter[curParsTree] = 1;
+                } else {
+                    // found in hash_map
+                    pllTreeCounter[curParsTree]++;
+                }
+            }
+            // -DBL_MAX marks a tree whose likelihood has not been computed yet
+            candidateTrees.update(curParsTree, -DBL_MAX, false);
+        }
+    }
+    double parsTime = getRealTime() - startTime;
+    cout << parsTime << " seconds ";
+    cout << candidateTrees.size() << " distinct starting trees" << endl;
+
+    /****************************************************************************************
+                      Compute logl of all parsimony trees
+    *****************************************************************************************/
+
+    cout << "Computing log-likelihood of parsimony trees ... ";
+    startTime = getRealTime();
+    // iterate over a copy because candidateTrees is updated inside the loop
+    CandidateSet candTrees = candidateTrees;
+
+    for (CandidateSet::iterator it = candTrees.begin(); it != candTrees.end(); ++it) {
+        string treeString;
+        double score;
+        if (it->first == -DBL_MAX) {
+            readTreeString(it->second.tree);
+            treeString = optimizeBranches(2);
+            score = getCurScore();
+        } else {
+            treeString = it->second.tree;
+            score = it->first;
+        }
+        candidateTrees.update(treeString, score);
+    }
+
+    if (verbose_mode >= VB_MED) {
+        vector<double> bestScores = candidateTrees.getBestScores(candidateTrees.size());
+        for (vector<double>::iterator it = bestScores.begin(); it != bestScores.end(); it++)
+            cout << (*it) << " ";
+        cout << endl;
+    }
+
+
+    double loglTime = getRealTime() - startTime;
+    cout << loglTime << " seconds" << endl;
+
+    // Only select the best nNNITrees for doing NNI search
+    CandidateSet initParsimonyTrees = candidateTrees.getBestCandidateTrees(nNNITrees);
+    candidateTrees.clear();
+
+    cout << "Optimizing top parsimony trees with NNI..." << endl;
+    startTime = getCPUTime();
+    /*********** START: Do NNI on the best parsimony trees ************************************/
+    CandidateSet::reverse_iterator rit;
+    stop_rule.setCurIt(1);
+    for (rit = initParsimonyTrees.rbegin(); rit != initParsimonyTrees.rend(); ++rit, stop_rule.setCurIt(
+            stop_rule.getCurIt() + 1)) {
+        int nniCount, nniStep;
+        double initLogl, nniLogl;
+        string tree;
+        readTreeString(rit->second.tree);
+        computeLogL();
+        initLogl = getCurScore();
+        tree = doNNISearch(nniCount, nniStep);
+        nniLogl = getCurScore();
+        cout << "Iteration " << stop_rule.getCurIt() << " / LogL: " << getCurScore();
+        if (verbose_mode >= VB_MED) {
+            cout << " / NNI count, steps: " << nniCount << "," << nniStep;
+            cout << " / Parsimony logl " << initLogl << " / NNI logl: " << nniLogl;
+        }
+        cout << " / Time: " << convert_time(getRealTime() - params->start_real_time) << endl;
+
+        bool betterScore = false;
+        // Better tree or score is found
+        if (getCurScore() > candidateTrees.getBestScore() + params->modeps) {
+            // Re-optimize model parameters (the sNNI algorithm)
+            tree = optimizeModelParameters(false, params->modeps * 10);
+            betterScore = true;
+        }
+        bool newTree = candidateTrees.update(tree, getCurScore());
+        if (betterScore) {
+            if (newTree && nniCount != 0)
+                cout << "BETTER TREE FOUND at iteration " << stop_rule.getCurIt() << ": "
+                        << getCurScore() << endl;
+            else
+                cout << "BETTER SCORE FOUND at iteration " << stop_rule.getCurIt() << ": "
+                        << getCurScore() << endl;
+        }
+    }
+    double nniTime = getCPUTime() - startTime;
+    cout << "Average CPU time for 1 NNI search: " << nniTime / initParsimonyTrees.size() << endl;
+}
+
+/**
+ * (Re)create the PLL instance and load alignment and partitions into it.
+ *
+ * The alignment is printed to an in-memory PHYLIP string and the partition
+ * definition to an in-memory string, so no intermediate files are written.
+ *
+ * @param params program parameters (random seed, thread count, partition info)
+ */
+void IQTree::initializePLL(Params &params) {
+  pllAttr.rateHetModel = PLL_GAMMA;
+  pllAttr.fastScaling = PLL_FALSE;
+  pllAttr.saveMemory = PLL_FALSE;
+  pllAttr.useRecom = PLL_FALSE;
+  pllAttr.randomNumberSeed = params.ran_seed;
+  pllAttr.numberOfThreads = params.num_threads; /* This only affects the pthreads version */
+  if (pllInst != NULL) {
+    pllDestroyInstance(pllInst);
+  }
+  /* Create a PLL instance */
+  pllInst = pllCreateInstance(&pllAttr);
+
+  /* Read in the alignment file */
+  stringstream pllAln;
+  if (aln->isSuperAlignment()) {
+    ((SuperAlignment*) aln)->printCombinedAlignment(pllAln);
+  } else {
+    aln->printPhylip(pllAln);
+  }
+  string pllAlnStr = pllAln.str();
+  pllAlignment = pllParsePHYLIPString(pllAlnStr.c_str(), pllAlnStr.length());
+  /* Stop here instead of handing a NULL alignment to the calls below */
+  if (pllAlignment == NULL) {
+    outError("PLL could not parse the alignment");
+  }
+
+  /* Read in the partition information */
+  // BQM: to avoid printing file
+  stringstream pllPartitionFileHandle;
+  createPLLPartition(params, pllPartitionFileHandle);
+  pllQueue *partitionInfo = pllPartitionParseString(pllPartitionFileHandle.str().c_str());
+
+  /* Validate the partitions */
+  if (!pllPartitionsValidate(partitionInfo, pllAlignment)) {
+    outError("pllPartitionsValidate");
+  }
+
+  /* Commit the partitions and build a partitions structure */
+  pllPartitions = pllPartitionsCommit(partitionInfo, pllAlignment);
+
+  /* We don't need the intermediate partition queue structure anymore */
+  pllQueuePartitionsDestroy(&partitionInfo);
+
+  /* eliminate duplicate sites from the alignment and update weights vector */
+  pllAlignmentRemoveDups(pllAlignment, pllPartitions);
+
+  pllTreeInitTopologyForAlignment(pllInst, pllAlignment);
+
+  /* Connect the alignment and partition structure with the tree structure */
+  if (!pllLoadAlignment(pllInst, pllAlignment, pllPartitions)) {
+    outError("Incompatible tree/alignment combination");
+  }
+}
+
+
+/**
+ * Create the model factory (unless one is already set) and attach its
+ * substitution model and site-rate model to this tree. When PLL is enabled,
+ * unsupported model settings are rejected before pllInitModel() is called.
+ *
+ * @param params program parameters
+ */
+void IQTree::initializeModel(Params &params) {
+    ModelsBlock *models_block = readModelsDefinition(params);
+    try {
+        if (!getModelFactory()) {
+            // pick the factory type that matches the kind of tree
+            if (!isSuperTree()) {
+                setModelFactory(new ModelFactory(params, this, models_block));
+            } else if (params.partition_type) {
+                setModelFactory(new PartitionModelPlen(params, (PhyloSuperTreePlen*) this, models_block));
+            } else {
+                setModelFactory(new PartitionModel(params, (PhyloSuperTree*) this, models_block));
+            }
+        }
+    } catch (string & str) {
+        outError(str);
+    }
+    setModel(getModelFactory()->model);
+    setRate(getModelFactory()->site_rate);
+
+    if (params.pll) {
+        // TODO: change rateHetModel to PLL_CAT in case of non-Gamma model
+        if (getRate()->getNDiscreteRate() == 1)
+            outError("Non-Gamma model is not yet supported by PLL.");
+        if (getRate()->name.substr(0,2) == "+I")
+            outError("+Invar model is not yet supported by PLL.");
+        if (aln->seq_type == SEQ_DNA && getModel()->name != "GTR")
+            outError("non GTR model for DNA is not yet supported by PLL.");
+        pllInitModel(pllInst, pllPartitions);
+    }
+    delete models_block;
+}
+/** @return k_delete divided by leafNum, computed in floating point */
+double IQTree::getProbDelete() {
+    double num_deleted = (double) k_delete;
+    return num_deleted / leafNum;
+}
+
+/** Reset k_delete to k_delete_min and recompute k_delete_stay from it. */
+void IQTree::resetKDelete() {
+    k_delete = k_delete_min;
+    // NOTE(review): if both operands are integers the division truncates
+    // before ceil() is applied - confirm that is intended
+    k_delete_stay = ceil(leafNum / k_delete);
+}
+
+/**
+ * Count down k_delete_stay; once it reaches zero, increment k_delete and
+ * recompute k_delete_stay. Does nothing once k_delete has reached
+ * k_delete_max.
+ */
+void IQTree::increaseKDelete() {
+    if (k_delete >= k_delete_max)
+        return;
+    // still staying at the current k_delete
+    if (--k_delete_stay > 0)
+        return;
+    ++k_delete;
+    k_delete_stay = ceil(leafNum / k_delete);
+    if (verbose_mode >= VB_MED) {
+        cout << "Increase k_delete to " << k_delete << endl;
+    }
+}
+
+//void IQTree::setIQPIterations(STOP_CONDITION stop_condition, double stop_confidence, int min_iterations,
+//        int max_iterations) {
+//    stop_rule.setStopCondition(stop_condition);
+//    stop_rule.setConfidenceValue(stop_confidence);
+//    stop_rule.setIterationNum(min_iterations, max_iterations);
+//}
+
+/**
+ * Return the set of representative leaves of the subtree reached from dad
+ * through its neighbor number nei_id. Results are cached in leaves_vec at
+ * index dad->id * 3 + nei_id.
+ *
+ * For a leaf the set holds that leaf with height 0. For an internal node the
+ * sets of its two child subtrees are merged in order of increasing height
+ * (ties broken at random), each height incremented by one, until k_represent
+ * leaves are collected or both child sets are exhausted.
+ *
+ * @param leaves_vec cache of already computed sets; receives the new set.
+ *                   The sets and their RepLeaf entries are allocated with new
+ *                   here and are not freed by this function.
+ * @param nei_id     index into dad->neighbors selecting the subtree
+ * @param dad        node from which the subtree is entered
+ * @return the (possibly cached) representative leaf set
+ */
+RepresentLeafSet* IQTree::findRepresentLeaves(vector<RepresentLeafSet*> &leaves_vec, int nei_id, PhyloNode *dad) {
+    PhyloNode *node = (PhyloNode*) (dad->neighbors[nei_id]->node);
+    int set_id = dad->id * 3 + nei_id;
+    if (leaves_vec[set_id])
+        return leaves_vec[set_id];
+    RepresentLeafSet *leaves = new RepresentLeafSet;
+    leaves_vec[set_id] = leaves;
+
+    if (node->isLeaf()) {
+        // a leaf represents itself at depth zero
+        leaves->insert(new RepLeaf(node, 0));
+    } else {
+        RepresentLeafSet * leaves_it[2] = { NULL, NULL };
+        int j = 0;
+        for (size_t i = 0; i < node->neighbors.size(); i++) {
+            if (node->neighbors[i]->node == dad)
+                continue;
+            if (j >= 2) {
+                // more than two children: never write past leaves_it
+                assert(0 && "findRepresentLeaves requires a bifurcating tree");
+                break;
+            }
+            leaves_it[j++] = findRepresentLeaves(leaves_vec, (int) i, node);
+        }
+        assert(j == 2 && leaves_it[0] && leaves_it[1]);
+        if (leaves_it[0]->empty() && leaves_it[1]->empty()) {
+            cout << "wrong";
+        }
+        RepresentLeafSet::iterator lit[2] = { leaves_it[0]->begin(), leaves_it[1]->begin() };
+        while (leaves->size() < k_represent) {
+            int id = -1;
+            if (lit[0] != leaves_it[0]->end() && lit[1] != leaves_it[1]->end()) {
+                if ((*lit[0])->height < (*lit[1])->height)
+                    id = 0;
+                else if ((*lit[0])->height > (*lit[1])->height)
+                    id = 1;
+                else { // tie, choose at random
+                    id = random_int(2);
+                }
+            } else if (lit[0] != leaves_it[0]->end())
+                id = 0;
+            else if (lit[1] != leaves_it[1]->end())
+                id = 1;
+            else
+                break;
+            assert(id < 2 && id >= 0);
+            // one level further away from the leaf than in the child set
+            leaves->insert(new RepLeaf((*lit[id])->leaf, (*lit[id])->height + 1));
+            lit[id]++;
+        }
+    }
+    assert(!leaves->empty());
+    return leaves;
+}
+
+/*
+ void IQPTree::clearRepresentLeaves(vector<RepresentLeafSet*> &leaves_vec, Node *node, Node *dad) {
+ int nei_id;
+ for (nei_id = 0; nei_id < node->neighbors.size(); nei_id++)
+ if (node->neighbors[nei_id]->node == dad) break;
+ assert(nei_id < node->neighbors.size());
+ int set_id = node->id * 3 + nei_id;
+ if (leaves_vec[set_id]) {
+ for (RepresentLeafSet::iterator rlit = leaves_vec[set_id]->begin(); rlit != leaves_vec[set_id]->end(); rlit++)
+ delete (*rlit);
+ delete leaves_vec[set_id];
+ leaves_vec[set_id] = NULL;
+ }
+ FOR_NEIGHBOR_IT(node, dad, it) {
+ clearRepresentLeaves(leaves_vec, (*it)->node, node);
+ }
+ }*/
+
+/**
+ * Delete up to k_delete leaves (at most number of sequences minus 4), taking
+ * randomly chosen non-cherry leaves first and randomly chosen cherry leaves
+ * only when the non-cherry ones are used up. The root is then set to a cherry
+ * leaf that was not deleted.
+ *
+ * @param del_leaves receives the deleted leaves in deletion order
+ */
+void IQTree::deleteNonCherryLeaves(PhyloNodeVector &del_leaves) {
+    NodeVector cherry_taxa;
+    NodeVector noncherry_taxa;
+    // get the vector of non cherry taxa
+    getNonCherryLeaves(noncherry_taxa, cherry_taxa);
+    root = NULL;
+    int num_taxa = aln->getNSeq();
+    int num_delete = k_delete;
+    if (num_delete > num_taxa - 4)
+        num_delete = num_taxa - 4;
+    if (verbose_mode >= VB_DEBUG) {
+        cout << "Deleting " << num_delete << " leaves" << endl;
+    }
+    // random order in which the non-cherry leaves are removed
+    vector<unsigned int> indices_noncherry(noncherry_taxa.size());
+    for (unsigned int k = 0; k < indices_noncherry.size(); k++)
+        indices_noncherry[k] = k;
+    my_random_shuffle(indices_noncherry.begin(), indices_noncherry.end());
+    int i;
+    for (i = 0; i < num_delete && i < (int) noncherry_taxa.size(); i++) {
+        PhyloNode *taxon = (PhyloNode*) noncherry_taxa[indices_noncherry[i]];
+        del_leaves.push_back(taxon);
+        deleteLeaf(taxon);
+    }
+    // identity order unless cherry leaves have to be removed as well
+    vector<unsigned int> indices_cherry(cherry_taxa.size());
+    for (unsigned int k = 0; k < indices_cherry.size(); k++)
+        indices_cherry[k] = k;
+    size_t j = 0;
+    if (i < num_delete) {
+        my_random_shuffle(indices_cherry.begin(), indices_cherry.end());
+        while (i < num_delete && j < indices_cherry.size()) {
+            PhyloNode *taxon = (PhyloNode*) cherry_taxa[indices_cherry[j]];
+            del_leaves.push_back(taxon);
+            deleteLeaf(taxon);
+            i++;
+            j++;
+        }
+    }
+    // Entries indices_cherry[0..j-1] were deleted, so the next entry in the
+    // same order is a surviving cherry leaf; indexing cherry_taxa with j
+    // directly could select a leaf that was just removed.
+    if (j < indices_cherry.size())
+        root = cherry_taxa[indices_cherry[j]];
+    assert(root && "no cherry leaf left to serve as root");
+}
+
+/**
+ * Delete k_delete randomly chosen leaves (at most number of taxa minus 4)
+ * and set the root to the first taxon that was not deleted.
+ *
+ * @param del_leaves receives the deleted leaves in deletion order
+ */
+void IQTree::deleteLeaves(PhyloNodeVector &del_leaves) {
+    NodeVector taxa;
+    // get the vector of taxa
+    getTaxa(taxa);
+    root = NULL;
+    int num_delete = k_delete;
+    // Clamp in signed arithmetic: taxa.size() - 4 wraps around for fewer than
+    // four taxa, and the loop below would then never terminate.
+    int max_delete = (int) taxa.size() - 4;
+    if (max_delete < 0)
+        max_delete = 0;
+    if (num_delete > max_delete)
+        num_delete = max_delete;
+    if (verbose_mode >= VB_DEBUG) {
+        cout << "Deleting " << num_delete << " leaves" << endl;
+    }
+    // draw random taxa until num_delete distinct ones have been deleted
+    int i;
+    for (i = 0; i < num_delete;) {
+        int id = random_int(taxa.size());
+        if (!taxa[id])
+            continue; // already deleted, draw again
+        i++;
+        PhyloNode *taxon = (PhyloNode*) taxa[id];
+        del_leaves.push_back(taxon);
+        deleteLeaf(taxon);
+        taxa[id] = NULL;
+    }
+    // set root to the first taxon which was not deleted
+    for (size_t k = 0; k < taxa.size(); k++)
+        if (taxa[k]) {
+            root = taxa[k];
+            break;
+        }
+}
+
+/**
+ * Choose, from pairwise distances in dist_matrix, which of the three leaves
+ * pairs best with del_leaf: for each candidate the distance candidate-del_leaf
+ * plus the distance between the other two leaves is summed and the smallest
+ * sum wins.
+ *
+ * @return 0, 1 or 2 for leaf0, leaf1 or leaf2
+ */
+int IQTree::assessQuartet(Node *leaf0, Node *leaf1, Node *leaf2, Node *del_leaf) {
+    assert(dist_matrix);
+    int nseq = aln->getNSeq();
+    int row0 = leaf0->id * nseq;
+    int row1 = leaf1->id * nseq;
+    int row2 = leaf2->id * nseq;
+    int del_id = del_leaf->id;
+    double sum_with0 = dist_matrix[row0 + del_id] + dist_matrix[row1 + leaf2->id];
+    double sum_with1 = dist_matrix[row1 + del_id] + dist_matrix[row0 + leaf2->id];
+    double sum_with2 = dist_matrix[row2 + del_id] + dist_matrix[row0 + leaf1->id];
+    if (sum_with0 < sum_with1 && sum_with0 < sum_with2)
+        return 0;
+    return (sum_with1 < sum_with2) ? 1 : 2;
+}
+
+/**
+ * Choose which of the three leaves pairs best with del_leaf by counting
+ * supporting alignment patterns: a pattern supports grouping k when del_leaf
+ * has the same state as leaf k and the other two leaves agree with each
+ * other. Patterns where any of the four states is >= aln->num_states are
+ * skipped; each pattern counts with its frequency.
+ *
+ * @return 0, 1 or 2; drawn at random when all three counts are equal
+ */
+int IQTree::assessQuartetParsimony(Node *leaf0, Node *leaf1, Node *leaf2, Node *del_leaf) {
+    int support[3] = { 0, 0, 0 };
+    for (Alignment::iterator pat = aln->begin(); pat != aln->end(); pat++) {
+        char ch0 = (*pat)[leaf0->id];
+        char ch1 = (*pat)[leaf1->id];
+        char ch2 = (*pat)[leaf2->id];
+        char chd = (*pat)[del_leaf->id];
+        bool all_known = ch0 < aln->num_states && ch1 < aln->num_states
+                && ch2 < aln->num_states && chd < aln->num_states;
+        if (!all_known)
+            continue;
+        if (chd == ch0 && ch1 == ch2)
+            support[0] += pat->frequency;
+        if (chd == ch1 && ch0 == ch2)
+            support[1] += pat->frequency;
+        if (chd == ch2 && ch0 == ch1)
+            support[2] += pat->frequency;
+    }
+    if (support[0] == support[1] && support[0] == support[2])
+        return random_int(3);
+    if (support[0] > support[1] && support[0] > support[2])
+        return 0;
+    return (support[1] < support[2]) ? 2 : 1;
+}
+
+/**
+ * Recursively zero lh_scale_factor and partial_lh_computed on both
+ * directions of every branch below node. These two fields hold the bonus
+ * value and its "computed" flag in the bonus functions of this file.
+ *
+ * @param node start node; the root is used when NULL
+ * @param dad  node the traversal came from; NULL at the start
+ */
+void IQTree::initializeBonus(PhyloNode *node, PhyloNode *dad) {
+    if (!node)
+        node = (PhyloNode*) root;
+    if (dad) {
+        PhyloNeighbor *to_dad = (PhyloNeighbor*) node->findNeighbor(dad);
+        PhyloNeighbor *to_node = (PhyloNeighbor*) dad->findNeighbor(node);
+        to_dad->lh_scale_factor = 0.0;
+        to_dad->partial_lh_computed = 0;
+        to_node->lh_scale_factor = 0.0;
+        to_node->partial_lh_computed = 0;
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        initializeBonus((PhyloNode*) ((*it)->node), node);
+    }
+}
+
+/**
+ * Add bonus to the lh_scale_factor of the given neighbor.
+ *
+ * @param nei   neighbor whose bonus is raised
+ * @param dad   node owning the neighbor; only used for the debug message
+ * @param bonus amount to add
+ */
+void IQTree::raiseBonus(Neighbor *nei, Node *dad, double bonus) {
+    PhyloNeighbor *target = (PhyloNeighbor*) nei;
+    target->lh_scale_factor += bonus;
+    if (verbose_mode >= VB_DEBUG) {
+        cout << dad->id << " - " << nei->node->id << " : " << bonus << endl;
+    }
+}
+
+/**
+ * Accumulate into the neighbor node->dad the bonuses of all branches on the
+ * far side of node (seen from dad). The result is cached: once
+ * partial_lh_computed is set the stored value is returned directly.
+ *
+ * @return the accumulated lh_scale_factor of the neighbor node->dad
+ */
+double IQTree::computePartialBonus(Node *node, Node* dad) {
+    PhyloNeighbor *to_dad = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!to_dad->partial_lh_computed) {
+        FOR_NEIGHBOR_IT(node, dad, it) {
+            to_dad->lh_scale_factor += computePartialBonus((*it)->node, node);
+        }
+        to_dad->partial_lh_computed = 1;
+    }
+    return to_dad->lh_scale_factor;
+}
+
+/**
+ * Traverse the tree and collect every branch whose total bonus (sum of the
+ * partial bonuses of both directions) equals the maximum found.
+ *
+ * @param best_score receives the best bonus; reset to 0 at the top-level call
+ * @param best_nodes receives one end of each best branch
+ * @param best_dads  receives the other end, parallel to best_nodes
+ * @param node       current node; the root is used when NULL
+ * @param dad        node the traversal came from; NULL at the top-level call
+ */
+void IQTree::findBestBonus(double &best_score, NodeVector &best_nodes, NodeVector &best_dads, Node *node, Node *dad) {
+    if (!node)
+        node = root;
+    if (dad) {
+        double branch_bonus = computePartialBonus(node, dad) + computePartialBonus(dad, node);
+        if (branch_bonus > best_score) {
+            // strictly better: forget the branches collected so far
+            best_score = branch_bonus;
+            best_nodes.clear();
+            best_dads.clear();
+        }
+        if (branch_bonus >= best_score) {
+            best_nodes.push_back(node);
+            best_dads.push_back(dad);
+        }
+    } else {
+        best_score = 0;
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        findBestBonus(best_score, best_nodes, best_dads, (*it)->node, node);
+    }
+}
+
+/**
+ * For the degree-3 node cur_root, evaluate every triple made of one
+ * representative leaf from each of its three subtrees together with del_leaf,
+ * and raise the bonus of the branch towards the subtree that wins each
+ * quartet.
+ *
+ * @param leaves_vec cache of representative leaf sets (see findRepresentLeaves)
+ * @param cur_root   internal node whose three subtrees are compared
+ * @param del_leaf   leaf to be placed
+ */
+void IQTree::assessQuartets(vector<RepresentLeafSet*> &leaves_vec, PhyloNode *cur_root, PhyloNode *del_leaf) {
+    const int MAX_DEGREE = 3;
+    RepresentLeafSet * leaves[MAX_DEGREE];
+    double bonus[MAX_DEGREE];
+    for (int k = 0; k < MAX_DEGREE; k++)
+        bonus[k] = 0.0;
+
+    // only works for bifurcating trees
+    assert(cur_root->degree() == MAX_DEGREE);
+
+    // find the representative leaf set for three subtrees
+    int cnt = 0;
+    FOR_NEIGHBOR_IT(cur_root, NULL, it) {
+        leaves[cnt] = findRepresentLeaves(leaves_vec, cnt, cur_root);
+        cnt++;
+    }
+
+    RepresentLeafSet::iterator i0, i1, i2;
+    for (i0 = leaves[0]->begin(); i0 != leaves[0]->end(); i0++) {
+        for (i1 = leaves[1]->begin(); i1 != leaves[1]->end(); i1++) {
+            for (i2 = leaves[2]->begin(); i2 != leaves[2]->end(); i2++) {
+                int best_id;
+                if (iqp_assess_quartet == IQP_DISTANCE)
+                    best_id = assessQuartet((*i0)->leaf, (*i1)->leaf, (*i2)->leaf, del_leaf);
+                else
+                    best_id = assessQuartetParsimony((*i0)->leaf, (*i1)->leaf, (*i2)->leaf, del_leaf);
+                bonus[best_id] += 1.0;
+            }
+        }
+    }
+
+    for (cnt = 0; cnt < MAX_DEGREE; cnt++) {
+        if (bonus[cnt] > 0.0)
+            raiseBonus(cur_root->neighbors[cnt], cur_root, bonus[cnt]);
+    }
+
+}
+
+/**
+ * OBSOLETE: the first statement is an assertion that always fails.
+ * NOTE(review): the call that would determine target_node and target_dad is
+ * commented out, so both are still NULL when they are dereferenced below.
+ *
+ * @param del_leaves leaves to be re-inserted
+ */
+void IQTree::reinsertLeavesByParsimony(PhyloNodeVector &del_leaves) {
+    assert(0 && "this function is obsolete");
+    assert(root->isLeaf());
+    for (PhyloNodeVector::iterator it_leaf = del_leaves.begin(); it_leaf != del_leaves.end(); it_leaf++) {
+        initializeAllPartialPars();
+        clearAllPartialLH();
+        Node *target_node = NULL;
+        Node *target_dad = NULL;
+        Node *added_node = (*it_leaf)->neighbors[0]->node;
+        Node *node1 = NULL;
+        Node *node2 = NULL;
+        // the two neighbors of added_node other than the leaf itself
+        for (int i = 0; i < 3; i++) {
+            Node *nei_node = added_node->neighbors[i]->node;
+            if (nei_node->id == (*it_leaf)->id)
+                continue;
+            if (!node1)
+                node1 = nei_node;
+            else
+                node2 = nei_node;
+        }
+
+        // replace both neighbors by the placeholder pointers 1 and 2
+        added_node->updateNeighbor(node1, (Node*) 1);
+        added_node->updateNeighbor(node2, (Node*) 2);
+
+        best_pars_score = INT_MAX;
+        // TODO: this needs to be adapted
+//        addTaxonMPFast(added_node, target_node, target_dad, NULL, root->neighbors[0]->node, root);
+        target_node->updateNeighbor(target_dad, added_node, -1.0);
+        target_dad->updateNeighbor(target_node, added_node, -1.0);
+        added_node->updateNeighbor((Node*) 1, target_node, -1.0);
+        added_node->updateNeighbor((Node*) 2, target_dad, -1.0);
+
+    }
+
+}
+
+/**
+ * Re-insert the deleted leaves one after another. For each leaf the bonuses
+ * are reset, quartets are assessed at every internal node, and the leaf is
+ * attached to a branch with the highest bonus (chosen at random among ties).
+ * The representative leaf sets built for a leaf are freed before the next
+ * leaf is processed. Finally the tree is re-initialized.
+ *
+ * @param del_leaves leaves to be re-inserted, processed in order
+ */
+void IQTree::reinsertLeaves(PhyloNodeVector &del_leaves) {
+    assert(root->isLeaf());
+
+    PhyloNodeVector::iterator it_leaf;
+    for (it_leaf = del_leaves.begin(); it_leaf != del_leaves.end(); it_leaf++) {
+        PhyloNode *leaf = *it_leaf;
+        if (verbose_mode >= VB_DEBUG)
+            cout << "Reinserting " << leaf->name << " (" << leaf->id << ")" << endl;
+
+        // fresh cache of representative leaf sets for this leaf
+        vector<RepresentLeafSet*> leaves_vec;
+        leaves_vec.resize(nodeNum * 3, NULL);
+        initializeBonus();
+        NodeVector internal_nodes;
+        getInternalNodes(internal_nodes);
+        if (verbose_mode >= VB_DEBUG)
+            drawTree(cout, WT_BR_SCALE | WT_INT_NODE | WT_TAXON_ID | WT_NEWLINE | WT_BR_ID);
+        for (NodeVector::iterator nit = internal_nodes.begin(); nit != internal_nodes.end(); nit++) {
+            assessQuartets(leaves_vec, (PhyloNode*) (*nit), leaf);
+        }
+
+        NodeVector best_nodes, best_dads;
+        double best_bonus;
+        findBestBonus(best_bonus, best_nodes, best_dads);
+        if (verbose_mode >= VB_DEBUG)
+            cout << "Best bonus " << best_bonus << " " << best_nodes[0]->id << " " << best_dads[0]->id << endl;
+        assert(best_nodes.size() == best_dads.size());
+        int node_id = random_int(best_nodes.size());
+        if (best_nodes.size() > 1 && verbose_mode >= VB_DEBUG)
+            cout << best_nodes.size() << " branches show the same best bonus, branch nr. " << node_id << " is chosen"
+                    << endl;
+
+        reinsertLeaf(leaf, best_nodes[node_id], best_dads[node_id]);
+
+        // release every representative set together with its entries
+        for (vector<RepresentLeafSet*>::iterator sit = leaves_vec.begin(); sit != leaves_vec.end(); sit++) {
+            RepresentLeafSet *leaf_set = *sit;
+            if (!leaf_set)
+                continue;
+            for (RepresentLeafSet::iterator rlit = leaf_set->begin(); rlit != leaf_set->end(); rlit++)
+                delete (*rlit);
+            delete leaf_set;
+        }
+    }
+    initializeTree(); // BQM: re-index nodes and branches s.t. ping-pong neighbors have the same ID
+
+    if (verbose_mode >= VB_DEBUG)
+        drawTree(cout, WT_BR_SCALE | WT_INT_NODE | WT_TAXON_ID | WT_NEWLINE | WT_BR_ID);
+}
+
+/**
+ * Delete random leaves, re-insert them with reinsertLeavesByParsimony() and
+ * then call fixNegativeBranch(false).
+ */
+void IQTree::doParsimonyReinsertion() {
+    PhyloNodeVector removed_leaves;
+    deleteLeaves(removed_leaves);
+    reinsertLeavesByParsimony(removed_leaves);
+    fixNegativeBranch(false);
+}
+
+
+/**
+ * Remove from the parallel branch lists every branch whose split is
+ * contained in splits.
+ *
+ * @param nodes1 one end of each branch; filtered in place
+ * @param nodes2 other end of each branch; filtered in place
+ * @param splits splits whose branches are to be removed
+ * @return previous size of nodes1 minus its new size (0 when splits is empty)
+ */
+int IQTree::removeBranches(NodeVector& nodes1, NodeVector& nodes2, SplitGraph& splits) {
+    if (splits.size() == 0)
+        return 0;
+    // work on copies so that the originals can be refilled
+    NodeVector old_nodes1 = nodes1;
+    NodeVector old_nodes2 = nodes2;
+    nodes1.clear();
+    nodes2.clear();
+    NodeVector::iterator it1 = old_nodes1.begin();
+    NodeVector::iterator it2 = old_nodes2.begin();
+    while (it1 != old_nodes1.end() && it2 != old_nodes2.end()) {
+        Split* sp = getSplit(*it1, *it2);
+        bool keep = !splits.containSplit(*sp);
+        delete sp;
+        if (keep) {
+            nodes1.push_back(*it1);
+            nodes2.push_back(*it2);
+        }
+        it1++;
+        it2++;
+    }
+    return (old_nodes1.size() - nodes1.size());
+}
+
+/**
+ * Perform up to numNNI random NNIs. Before each one the list of inner
+ * branches is fetched again, passing the stable splits of the candidate set
+ * to getAllInnerBranches(); the loop ends early when that list is empty.
+ * Afterwards the alignment, root, super-tree mapping and PLL tree are
+ * refreshed and the current score is reset.
+ *
+ * @param numNNI maximum number of random NNIs to perform
+ */
+void IQTree::doRandomNNIs(int numNNI) {
+    NodeVector nodes1, nodes2;
+    for (int cntNNI = 0; cntNNI < numNNI; cntNNI++) {
+        nodes1.clear();
+        nodes2.clear();
+        getAllInnerBranches(nodes1, nodes2, &candidateTrees.getStableSplits());
+        if (nodes1.size() == 0) {
+            assert(nodes2.size() == 0);
+            break;
+        }
+        // randomly take an inner branch and do a random NNI
+        int branch = random_int(nodes1.size());
+        doOneRandomNNI(nodes1[branch], nodes2[branch]);
+    }
+    setAlignment(aln);
+    setRootNode(params->root);
+
+    if (isSuperTree())
+        ((PhyloSuperTree*) this)->mapTrees();
+
+    if (params->pll)
+        pllReadNewick(getTreeString());
+
+    resetCurScore();
+}
+
+/*
+void IQTree::doRandomNNIs(int numNNI) {
+    map<int, Node*> usedNodes;
+    NodeVector nodeList1, nodeList2;
+    getInternalBranches(nodeList1, nodeList2);
+    int numInBran = nodeList1.size();
+    assert(numInBran == aln->getNSeq() - 3);
+    for (int i = 0; i < numNNI; i++) {
+        int index = random_int(numInBran);
+        if (usedNodes.find(nodeList1[index]->id) == usedNodes.end()
+                && usedNodes.find(nodeList2[index]->id) == usedNodes.end()) {
+            doOneRandomNNI(nodeList1[index], nodeList2[index]);
+            usedNodes.insert(map<int, Node*>::value_type(nodeList1[index]->id, nodeList1[index]));
+            usedNodes.insert(map<int, Node*>::value_type(nodeList2[index]->id, nodeList2[index]));
+        } else {
+            usedNodes.clear();
+            nodeList1.clear();
+            nodeList2.clear();
+            getInternalBranches(nodeList1, nodeList2);
+            doOneRandomNNI(nodeList1[index], nodeList2[index]);
+            usedNodes.insert(map<int, Node*>::value_type(nodeList1[index]->id, nodeList1[index]));
+            usedNodes.insert(map<int, Node*>::value_type(nodeList2[index]->id, nodeList2[index]));
+        }
+    }
+    setAlignment(aln);
+    setRootNode(params->root);
+
+    if (isSuperTree()) {
+        ((PhyloSuperTree*) this)->mapTrees();
+}
+    if (params->pll) {
+    	pllReadNewick(getTreeString());
+    }
+
+    lhComputed = false;
+}
+*/
+
+/**
+ * One IQP perturbation: delete random leaves and re-insert them, then
+ * refresh the alignment, the PLL tree (if enabled) and the super-tree
+ * mapping, and reset the current score.
+ */
+void IQTree::doIQP() {
+    if (verbose_mode >= VB_DEBUG) {
+        drawTree(cout, WT_BR_SCALE | WT_INT_NODE | WT_TAXON_ID | WT_NEWLINE | WT_BR_ID);
+    }
+    PhyloNodeVector removed_leaves;
+    deleteLeaves(removed_leaves);
+    reinsertLeaves(removed_leaves);
+
+    // just to make sure IQP does it right
+    setAlignment(aln);
+    if (params->pll)
+        pllReadNewick(getTreeString());
+
+    resetCurScore();
+
+    if (isSuperTree())
+        ((PhyloSuperTree*) this)->mapTrees();
+}
+
+/**
+ * Load a Newick tree string into the PLL instance and optionally evaluate
+ * its likelihood.
+ * @param treestring tree in Newick format
+ * @param computeLH if true, evaluate the likelihood of the loaded tree
+ * @return pllInst->likelihood if computeLH is true, otherwise 0.0
+ */
+double IQTree::inputTree2PLL(string treestring, bool computeLH) {
+    // read in the tree string from IQTree kernel
+    pllNewickTree *newick = pllNewickParseString(treestring.c_str());
+    if (newick == NULL) {
+        // a malformed string would otherwise be dereferenced inside PLL
+        outError("PLL cannot parse the Newick tree string");
+        return 0.0;
+    }
+    pllTreeInitTopologyNewick(pllInst, newick, PLL_FALSE);
+    pllNewickParseDestroy(&newick);
+
+    if (!computeLH) {
+        return 0.0;
+    }
+    pllEvaluateLikelihood(pllInst, pllPartitions, pllInst->start, PLL_TRUE, PLL_FALSE);
+    return pllInst->likelihood;
+}
+
+/**
+ * Copy the substitution rates of the first PLL partition into a new array.
+ * Asserts that the alignment has 4 states.
+ * @return array of (states^2 - states) / 2 rates allocated with new[];
+ *         no reference to it is kept here, so the caller has to delete[] it
+ */
+double* IQTree::getModelRatesFromPLL() {
+    assert(aln->num_states == 4);
+    const int nstates = pllPartitions->partitionData[0]->states;
+    // number of off-diagonal entries of one triangle of the rate matrix
+    const int nrates = (nstates * nstates - nstates) / 2;
+    double *rates = new double[nrates];
+    int r = 0;
+    while (r < nrates) {
+        rates[r] = pllPartitions->partitionData[0]->substRates[r];
+        ++r;
+    }
+    return rates;
+}
+
+/**
+ * Print alpha of every PLL partition to cout and, for 4-state alignments,
+ * also its substitution rates and empirical frequencies.
+ * Output uses fixed notation with precision 6; on return cout is left in
+ * fixed notation with precision 3.
+ */
+void IQTree::pllPrintModelParams() {
+    cout.precision(6);
+    cout << fixed;
+    const bool isDNA = (aln->num_states == 4);
+    for (int p = 0; p < pllPartitions->numberOfPartitions; p++) {
+        cout << "Alpha[" << p << "]" << ": " << pllPartitions->partitionData[p]->alpha << endl;
+        if (!isDNA) {
+            continue;
+        }
+        const int nstates = pllPartitions->partitionData[p]->states;
+        const int nrates = ((nstates * nstates - nstates) / 2);
+        cout << "Rates[" << p << "]: " << " ac ag at cg ct gt: ";
+        for (int r = 0; r < nrates; r++) {
+            cout << pllPartitions->partitionData[p]->substRates[r] << " ";
+        }
+        cout << endl;
+        cout <<  "Frequencies: ";
+        for (int s = 0; s < 4; s++) {
+            cout << pllPartitions->partitionData[p]->empiricalFrequencies[s] << " ";
+        }
+        cout << endl;
+    }
+    cout.precision(3);
+    cout << fixed;
+}
+
+/** @return the alpha value stored in the first PLL partition */
+double IQTree::getAlphaFromPLL() {
+    const double alpha = pllPartitions->partitionData[0]->alpha;
+    return alpha;
+}
+
+/**
+ * Copy the model parameters of the first PLL partition (alpha, state
+ * frequencies and, for 4-state data, substitution rates) into the IQ-TREE
+ * rate and substitution models.
+ */
+void IQTree::inputModelPLL2IQTree() {
+    // TODO add support for partitioned model
+    RateGamma *gammaRate = dynamic_cast<RateGamma*>(getRate());
+    if (gammaRate == NULL) {
+        // the cast fails for non-Gamma rate models; do not dereference NULL
+        outError("Cannot import PLL alpha: the rate model is not a Gamma model");
+        return;
+    }
+    gammaRate->setGammaShape(pllPartitions->partitionData[0]->alpha);
+
+    // Set the frequencies before the rate matrix is decomposed.
+    // NOTE(review): presumably decomposeRateMatrix() reads the state
+    // frequencies, so setting them afterwards (as before) would leave the
+    // decomposition based on the old frequencies -- confirm against ModelGTR.
+    ((ModelGTR*) getModel())->setStateFrequency(pllPartitions->partitionData[0]->empiricalFrequencies);
+    if (aln->num_states == 4) {
+        ((ModelGTR*) getModel())->setRateMatrix(pllPartitions->partitionData[0]->substRates);
+        getModel()->decomposeRateMatrix();
+    }
+}
+
+/**
+ * Push the IQ-TREE model parameters into PLL as fixed values for every
+ * partition: alpha and state frequencies for 4- and 20-state data, plus the
+ * 6 substitution rates for 4-state data. For 4-state data the alpha,
+ * frequency and rate parameters are first linked with one index per
+ * partition ("0,1,...").
+ * Any other number of states raises an error when params->pll is set.
+ */
+void IQTree::inputModelIQTree2PLL() {
+    // an alpha of 0.0 is replaced by PLL's maximum alpha
+    double alpha = getRate()->getGammaShape();
+    if (alpha == 0.0)
+        alpha = PLL_ALPHA_MAX;
+
+    if (aln->num_states == 4) {
+        vector<double> rates(6);
+        getModel()->getRateMatrix(&rates[0]);
+        vector<double> freqs(aln->num_states);
+        getModel()->getStateFrequency(&freqs[0]);
+
+        /* build the linkage pattern "0,1,...,n-1" ("0" if n <= 1) */
+        stringstream linkage;
+        linkage << 0;
+        for (int p = 1; p < pllPartitions->numberOfPartitions; p++) {
+            linkage << "," << p;
+        }
+        const string linkageStr = linkage.str();
+        // PLL takes a writable char*, so copy into a NUL-terminated buffer
+        vector<char> pattern(linkageStr.begin(), linkageStr.end());
+        pattern.push_back('\0');
+        pllLinkAlphaParameters(&pattern[0], pllPartitions);
+        pllLinkFrequencies(&pattern[0], pllPartitions);
+        pllLinkRates(&pattern[0], pllPartitions);
+
+        for (int p = 0; p < pllPartitions->numberOfPartitions; p++) {
+            pllSetFixedAlpha(alpha, p, pllPartitions, pllInst);
+            pllSetFixedBaseFrequencies(&freqs[0], 4, p, pllPartitions, pllInst);
+            pllSetFixedSubstitutionMatrix(&rates[0], 6, p, pllPartitions, pllInst);
+        }
+    } else if (aln->num_states == 20) {
+        vector<double> freqs(aln->num_states);
+        getModel()->getStateFrequency(&freqs[0]);
+        for (int p = 0; p < pllPartitions->numberOfPartitions; p++) {
+            pllSetFixedAlpha(alpha, p, pllPartitions, pllInst);
+            pllSetFixedBaseFrequencies(&freqs[0], 20, p, pllPartitions, pllInst);
+        }
+    } else if (params->pll) {
+        outError("Phylogenetic likelihood library current does not support data type other than DNA or Protein");
+    }
+}
+
+/**
+ * Build pll2iqtree_pattern_index: for every column of the PLL alignment
+ * (partitions concatenated in partition order) store the index of the
+ * IQ-TREE pattern whose PLL encoding equals that column. If several patterns
+ * match, the one with the largest index is stored; columns without a match
+ * get -1.
+ *
+ * Each IQ-TREE pattern is encoded once up front instead of once per site,
+ * at the cost of one extra buffer of size(patterns) * (sequences + 1) bytes.
+ */
+void IQTree::pllBuildIQTreePatternIndex(){
+    const int numSeqs = pllAlignment->sequenceCount;
+    const int numSites = pllAlignment->sequenceLength;
+    const int numPatterns = (int) aln->size();
+
+    pll2iqtree_pattern_index = new int[numSites];
+    for (int site = 0; site < numSites; site++)
+        pll2iqtree_pattern_index[site] = -1;
+    if (numSeqs <= 0 || numSites <= 0 || numPatterns <= 0)
+        return;
+
+    // PLL sequences with the partitions concatenated; row i starts at i * numSites
+    vector<char> pllRows((size_t) numSeqs * numSites);
+    for (int seq = 0; seq < numSeqs; seq++) {
+        char *row = &pllRows[(size_t) seq * numSites];
+        int pos = 0;
+        for (int model = 0; model < pllPartitions->numberOfPartitions; model++) {
+            // sequenceData is indexed from 1 here
+            memcpy(row + pos,
+                    &pllAlignment->sequenceData[seq + 1][pllPartitions->partitionData[model]->lower],
+                    pllPartitions->partitionData[model]->width);
+            pos += pllPartitions->partitionData[model]->width;
+        }
+    }
+
+    // encode every IQ-TREE pattern once; each row is NUL-terminated because
+    // pllBaseSubstitute() determines the length with strlen()
+    const size_t stride = (size_t) numSeqs + 1;
+    vector<char> encoded(stride * numPatterns);
+    for (int k = 0; k < numPatterns; k++) {
+        char *pattern = &encoded[stride * k];
+        for (int p = 0; p < numSeqs; p++)
+            pattern[p] = aln->convertStateBack(aln->at(k)[p]);
+        pattern[numSeqs] = '\0';
+        pllBaseSubstitute(pattern, pllPartitions->partitionData[0]->dataType);
+    }
+
+    vector<char> column(numSeqs);
+    for (int site = 0; site < numSites; site++) {
+        for (int seq = 0; seq < numSeqs; seq++)
+            column[seq] = pllRows[(size_t) seq * numSites + site];
+
+        // scan from the back so that the last matching pattern wins
+        for (int k = numPatterns - 1; k >= 0; k--) {
+            if (memcmp(&column[0], &encoded[stride * k], numSeqs) == 0) {
+                pll2iqtree_pattern_index[site] = k;
+                break;
+            }
+        }
+    }
+}
+
+
+/**
+ * DTH:
+ * Substitute bases in seq according to PLL's rules
+ * This function should be updated if PLL's rules change.
+ * Characters without a code in the selected table are replaced by (char) -1.
+ * @param seq: NUL-terminated data of some sequence, substituted in place
+ * @param dataType: PLL_DNA_DATA selects the DNA table, anything else the AA table
+ */
+void IQTree::pllBaseSubstitute (char *seq, int dataType)
+{
+    char meaningDNA[256];
+    char  meaningAA[256];
+
+    /* every byte becomes (char) -1, i.e. "no meaning" */
+    memset(meaningDNA, -1, sizeof(meaningDNA));
+    memset(meaningAA,  -1, sizeof(meaningAA));
+
+    /* DNA data: symbol i (either case) maps to dnaCodes[i] */
+    static const char dnaUpper[] = "ABCDGHKMRSTUVWY";
+    static const char dnaLower[] = "abcdghkmrstuvwy";
+    static const char dnaCodes[] = { 1, 14, 2, 13, 4, 11, 12, 3, 5, 6, 8, 8, 7, 9, 10 };
+    for (int i = 0; dnaUpper[i] != '\0'; ++ i)
+    {
+        meaningDNA[(unsigned char) dnaUpper[i]] = dnaCodes[i];
+        meaningDNA[(unsigned char) dnaLower[i]] = dnaCodes[i];
+    }
+    /* undetermined / gap */
+    static const char dnaUnknown[] = "NnOoXx-?";
+    for (int i = 0; dnaUnknown[i] != '\0'; ++ i)
+    {
+        meaningDNA[(unsigned char) dnaUnknown[i]] = 15;
+    }
+
+    /* AA data: symbol i (either case) maps to i.
+     * 0 alanine, 1 arginine, 2 asparagine, 3 aspartic, 4 cysteine,
+     * 5 glutamine, 6 glutamic, 7 glycine, 8 histidine, 9 isoleucine,
+     * 10 leucine, 11 lysine, 12 methionine, 13 phenylalanine, 14 proline,
+     * 15 serine, 16 threonine, 17 tryptophan, 18 tyrosine, 19 valine,
+     * 20 B (asparagine, aspartic 2 and 3), 21 Z (glutamine glutamic 5 and 6) */
+    static const char aaUpper[] = "ARNDCQEGHILKMFPSTWYVBZ";
+    static const char aaLower[] = "arndcqeghilkmfpstwyvbz";
+    for (int i = 0; aaUpper[i] != '\0'; ++ i)
+    {
+        meaningAA[(unsigned char) aaUpper[i]] = (char) i;
+        meaningAA[(unsigned char) aaLower[i]] = (char) i;
+    }
+    /* undetermined / stop / gap */
+    static const char aaUnknown[] = "Xx?*-";
+    for (int i = 0; aaUnknown[i] != '\0'; ++ i)
+    {
+        meaningAA[(unsigned char) aaUnknown[i]] = 22;
+    }
+
+    const char *d = (dataType == PLL_DNA_DATA) ? meaningDNA : meaningAA;
+    /* length is taken before substituting because code 0 is a valid result */
+    const size_t seq_len = strlen(seq);
+    for (size_t i = 0; i < seq_len; ++ i)
+    {
+        /* index as unsigned char: a plain char may be negative for bytes
+         * >= 0x80 and would read in front of the table */
+        seq[i] = d[(unsigned char) seq[i]];
+    }
+}
+
+/**
+ * Exchange the positions of two leaves in the tree, re-optimize the two
+ * affected branches and recompute the likelihood.
+ * @param node1 first leaf (asserted to be a leaf)
+ * @param node2 second leaf (asserted to be a leaf)
+ * @return the new value of curScore
+ */
+double IQTree::swapTaxa(PhyloNode *node1, PhyloNode *node2) {
+    assert(node1->isLeaf());
+    assert(node2->isLeaf());
+
+    // the neighbor record of each leaf before the swap
+    PhyloNeighbor *oldNei1 = (PhyloNeighbor*) *(node1->neighbors.begin());
+    PhyloNeighbor *oldNei2 = (PhyloNeighbor*) *(node2->neighbors.begin());
+
+    // let the former attachment nodes point to the other leaf
+    oldNei2->node->updateNeighbor(node2, node1);
+    oldNei1->node->updateNeighbor(node1, node2);
+
+    // update the new neighbors of the 2 leaves
+    node1->updateNeighbor(node1->neighbors.begin(), oldNei2);
+    node2->updateNeighbor(node2->neighbors.begin(), oldNei1);
+
+    PhyloNode *attach1 = (PhyloNode*) ((PhyloNeighbor*) *(node1->neighbors.begin()))->node;
+    PhyloNode *attach2 = (PhyloNode*) ((PhyloNeighbor*) *(node2->neighbors.begin()))->node;
+
+    // reoptimize the branch lengths of the two moved leaves
+    optimizeOneBranch(node1, attach1);
+    optimizeOneBranch(node2, attach2);
+
+    curScore = computeLikelihoodFromBuffer();
+    return curScore;
+}
+
+/**
+ * Perturb the tree by swapping pairs of taxa. In each round a random taxon
+ * is swapped with the taxon that has the smallest calDist() value among
+ * those with a value of at least 7 (the last one in taxon order on ties).
+ * A round in which no such partner exists performs no swap.
+ * Afterwards all branches are optimized once.
+ * @param times number of rounds
+ * @return the new value of curScore
+ */
+double IQTree::perturb(int times) {
+    // smallest calDist() value accepted for a swap partner
+    const int MIN_SWAP_DIST = 7;
+
+    for (; times > 0; times--) {
+        NodeVector taxa;
+        // get the vector of taxa
+        getTaxa(taxa);
+        const int numTaxa = (int) taxa.size();
+        if (numTaxa < 2)
+            break; // nothing to swap
+
+        const int taxonid1 = random_int(numTaxa);
+        PhyloNode *taxon1 = (PhyloNode*) taxa[taxonid1];
+
+        // Find the closest admissible partner in a single pass. The taxon
+        // itself is never a candidate, and "<=" keeps the last of several
+        // equally distant taxa.
+        int taxonid2 = -1;
+        int minDist = 1000000;
+        for (int i = 0; i < numTaxa; i++) {
+            if (i == taxonid1)
+                continue;
+            PhyloNode *candidate = (PhyloNode*) taxa[i];
+            int dist = taxon1->calDist(candidate);
+            if (dist >= MIN_SWAP_DIST && dist <= minDist) {
+                minDist = dist;
+                taxonid2 = i;
+            }
+        }
+        if (taxonid2 < 0)
+            continue; // no taxon is far enough away; avoid indexing taxa[-1]
+
+        PhyloNode *taxon2 = (PhyloNode*) taxa[taxonid2];
+
+        cout << "Swapping node " << taxon1->id << " and node " << taxon2->id << endl;
+        cout << "Distance " << minDist << endl;
+        curScore = swapTaxa(taxon1, taxon2);
+    }
+    curScore = optimizeAllBranches(1);
+    return curScore;
+}
+
+//extern "C" pllUFBootData * pllUFBootDataPtr;
+extern pllUFBootData * pllUFBootDataPtr;
+
+/**
+ * Optimize the model parameters, either through PLL (params->pll) or through
+ * the IQ-TREE model factory, and update curScore.
+ * @param printInfo passed on to the optimizer; with PLL it also enables
+ *        printing of the model parameters and of the elapsed time
+ * @param logl_epsilon log-likelihood epsilon; -1 selects params->modeps
+ * @return the tree string after optimization
+ */
+string IQTree::optimizeModelParameters(bool printInfo, double logl_epsilon) {
+    if (logl_epsilon == -1)
+        logl_epsilon = params->modeps;
+    cout << "Estimate model parameters (epsilon = " << logl_epsilon << ")" << endl;
+    const double startTime = getRealTime();
+
+    if (params->pll) {
+        // PLL_TRUE as 4th argument is used only while no score is known yet
+        pllEvaluateLikelihood(pllInst, pllPartitions, pllInst->start,
+                (curScore == -DBL_MAX) ? PLL_TRUE : PLL_FALSE, PLL_FALSE);
+        pllOptimizeModelParameters(pllInst, pllPartitions, logl_epsilon);
+        curScore = pllInst->likelihood;
+        pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions,
+                pllInst->start->back, PLL_TRUE,
+                PLL_TRUE, PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH,
+                PLL_FALSE, PLL_FALSE);
+        if (printInfo) {
+            pllPrintModelParams();
+        }
+        string pllTree = string(pllInst->tree_string);
+        const double endTime = getRealTime();
+        if (printInfo)
+            cout << endTime - startTime << " seconds (logl: " << curScore << ")" << endl;
+        return pllTree;
+    }
+
+    const double modOptScore =
+            getModelFactory()->optimizeParameters(params->fixed_branch_length, printInfo, logl_epsilon);
+    if (isSuperTree()) {
+        ((PhyloSuperTree*) this)->computeBranchLengths();
+    }
+    if (getModelFactory()->isUnstableParameters() && aln->seq_type != SEQ_CODON) {
+        cout << endl;
+        outWarning("Estimated model parameters are at boundary that can cause numerical instability!");
+        cout << endl;
+    }
+
+    // a drop of more than one log-likelihood unit is treated as a bug
+    if (modOptScore < curScore - 1.0) {
+        cout << "  BUG: Tree logl gets worse after model optimization!" << endl;
+        cout << "  Old logl: " << curScore << " / " << "new logl: " << modOptScore << endl;
+        printTree("debug.tree");
+        abort();
+    }
+
+    curScore = modOptScore;
+    string newTree = getTreeString();
+    if (params->print_site_posterior)
+        computePatternCategories();
+    return newTree;
+}
+
+/**
+ * Print the best scores of the candidate tree set on one line of cout,
+ * separated by blanks.
+ * @param numBestScore number of scores requested from the candidate set
+ *        (previously ignored in favour of params->popSize)
+ */
+void IQTree::printBestScores(int numBestScore) {
+    vector<double> bestScores = candidateTrees.getBestScores(numBestScore);
+    for (vector<double>::iterator it = bestScores.begin(); it != bestScores.end(); ++it)
+        cout << (*it) << " ";
+    cout << endl;
+}
+
+/**
+ * Recompute curScore for the current tree, using PLL when params->pll is
+ * set and computeLikelihood() otherwise.
+ */
+void IQTree::computeLogL() {
+    if (!params->pll) {
+        curScore = computeLikelihood();
+        return;
+    }
+    // PLL_TRUE as 4th argument is used only while no score is known yet
+    pllEvaluateLikelihood(pllInst, pllPartitions, pllInst->start,
+            (curScore == -DBL_MAX) ? PLL_TRUE : PLL_FALSE, PLL_FALSE);
+    curScore = pllInst->likelihood;
+}
+
+/**
+ * Optimize the branch lengths of the current tree and update curScore,
+ * using PLL when params->pll is set and optimizeAllBranches() otherwise.
+ * @param maxTraversal passed on to the branch length optimizer
+ * @return the tree string after optimization
+ */
+string IQTree::optimizeBranches(int maxTraversal) {
+    if (!params->pll) {
+        curScore = optimizeAllBranches(maxTraversal);
+        return getTreeString();
+    }
+
+    if (curScore == -DBL_MAX) {
+        // no score known yet: evaluate the likelihood first
+        pllEvaluateLikelihood(pllInst, pllPartitions, pllInst->start, PLL_TRUE, PLL_FALSE);
+    }
+    pllOptimizeBranchLengths(pllInst, pllPartitions, maxTraversal);
+    curScore = pllInst->likelihood;
+    pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back,
+            PLL_TRUE, PLL_TRUE, PLL_FALSE, PLL_FALSE, PLL_FALSE,
+            PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE);
+    return string(pllInst->tree_string);
+}
+
+/**
+ * Main loop of the tree search. Until stop_rule reports that the stop
+ * condition is met, each iteration
+ *  - perturbs a tree (bootstrap alignment, random NNIs or IQP),
+ *  - optimizes it with doNNISearch(),
+ *  - prints progress information and updates the candidate tree set,
+ *  - and, for the bootstrap-correlation stop rule, summarizes the bootstrap
+ *    splits every half step and checks convergence every full step.
+ * Finally the top candidate tree is read back and the output streams opened
+ * here are closed.
+ * @return candidateTrees.getBestScore()
+ */
+double IQTree::doTreeSearch() {
+    cout << "--------------------------------------------------------------------" << endl;
+    cout << "|               OPTIMIZING CANDIDATE TREE SET                      |" << endl;
+    cout << "--------------------------------------------------------------------" << endl;
+    // PLEASE PRINT TREE HERE!
+    printResultTree();
+    string treels_name = params->out_prefix;
+    treels_name += ".treels";
+    string out_lh_file = params->out_prefix;
+    out_lh_file += ".treelh";
+    string site_lh_file = params->out_prefix;
+    site_lh_file += ".sitelh";
+
+    if (params->print_tree_lh) {
+        out_treelh.open(out_lh_file.c_str());
+        out_sitelh.open(site_lh_file.c_str());
+    }
+
+    if (params->write_intermediate_trees)
+        out_treels.open(treels_name.c_str());
+
+    if (params->write_intermediate_trees && save_all_trees != 2) {
+        printIntermediateTree(WT_NEWLINE | WT_APPEND | WT_SORT_TAXA | WT_BR_LEN);
+    }
+
+    setRootNode(params->root);
+    string imd_tree;
+
+    stop_rule.addImprovedIteration(1);
+    searchinfo.curPerStrength = params->initPS;
+
+    double cur_correlation = 0.0;
+
+    /*====================================================
+     * MAIN LOOP OF THE IQ-TREE ALGORITHM
+     *====================================================*/
+    for (; !stop_rule.meetStopCondition(stop_rule.getCurIt(), cur_correlation); stop_rule.setCurIt(
+            stop_rule.getCurIt() + 1)) {
+        searchinfo.curIter = stop_rule.getCurIt();
+        // estimate logl_cutoff for bootstrap
+        if (params->avoid_duplicated_trees && max_candidate_trees > 0 && treels_logl.size() > 1000) {
+            int predicted_iteration = ((stop_rule.getCurIt()+params->step_iterations-1)/params->step_iterations)*params->step_iterations;
+            int num_entries = floor(max_candidate_trees * ((double) stop_rule.getCurIt() / predicted_iteration));
+            if (num_entries < treels_logl.size() * 0.9) {
+                DoubleVector logl = treels_logl;
+                nth_element(logl.begin(), logl.begin() + (treels_logl.size() - num_entries), logl.end());
+                logl_cutoff = logl[treels_logl.size() - num_entries] - 1.0;
+            } else
+                logl_cutoff = 0.0;
+            if (verbose_mode >= VB_MED) {
+                if (stop_rule.getCurIt() % 10 == 0) {
+                    cout << treels.size() << " trees, " << treels_logl.size() << " logls, logl_cutoff= " << logl_cutoff;
+                    if (params->store_candidate_trees)
+                        cout << " duplicates= " << duplication_counter << " ("
+                                << (int) round(100 * ((double) duplication_counter / treels_logl.size())) << "%)" << endl;
+                    else
+                        cout << endl;
+                }
+            }
+        }
+
+        if (estimate_nni_cutoff && nni_info.size() >= 500) {
+            estimate_nni_cutoff = false;
+            estimateNNICutoff(params);
+        }
+
+        Alignment *saved_aln = aln;
+
+        /*----------------------------------------
+         * Perturb the tree
+         *---------------------------------------*/
+        // score right after the perturbation, reported in verbose mode
+        double perturbScore = 0.0;
+        if (iqp_assess_quartet == IQP_BOOTSTRAP) {
+            // create bootstrap sample
+            Alignment* bootstrap_alignment;
+            if (aln->isSuperAlignment())
+                bootstrap_alignment = new SuperAlignment;
+            else
+                bootstrap_alignment = new Alignment;
+            bootstrap_alignment->createBootstrapAlignment(aln, NULL, params->bootstrap_spec);
+            setAlignment(bootstrap_alignment);
+            initializeAllPartialLh();
+            clearAllPartialLH();
+            curScore = optimizeAllBranches();
+        } else {
+            if (params->snni) {
+                int numStableBranches = aln->getNSeq() - 3 - candidateTrees.getStableSplits().size();
+                int numNNI = floor(searchinfo.curPerStrength * numStableBranches);
+                readTreeString(candidateTrees.getRandCandTree());
+                if (params->iqp) {
+                    doIQP();
+                } else {
+                    doRandomNNIs(numNNI);
+                }
+            } else {
+                readTreeString(candidateTrees.getBestTrees()[0]);
+                doIQP();
+            }
+            if (params->count_trees) {
+                string perturb_tree_topo = getTopology();
+                if (pllTreeCounter.find(perturb_tree_topo) == pllTreeCounter.end()) {
+                    // not found in hash_map
+                    pllTreeCounter[perturb_tree_topo] = 1;
+                } else {
+                    // found in hash_map
+                    pllTreeCounter[perturb_tree_topo]++;
+                }
+            }
+
+            computeLogL();
+        }
+        // previously never assigned, so the log always showed "0 -> score"
+        perturbScore = curScore;
+
+        /*----------------------------------------
+         * Optimize tree with NNI
+         *---------------------------------------*/
+        int nni_count = 0;
+        int nni_steps = 0;
+
+        imd_tree = doNNISearch(nni_count, nni_steps);
+
+        if (iqp_assess_quartet == IQP_BOOTSTRAP) {
+            // restore alignment (aln is the bootstrap sample created above)
+            delete aln;
+            setAlignment(saved_aln);
+            initializeAllPartialLh();
+            clearAllPartialLH();
+        }
+
+        if (isSuperTree()) {
+            ((PhyloSuperTree*) this)->computeBranchLengths();
+        }
+
+        /*----------------------------------------
+         * Print information
+         *---------------------------------------*/
+        double realtime_remaining = stop_rule.getRemainingTime(stop_rule.getCurIt(), cur_correlation);
+        cout.setf(ios::fixed, ios::floatfield);
+
+        // only print every 10th iteration or high verbose mode
+        if (stop_rule.getCurIt() % 10 == 0 || verbose_mode >= VB_MED) {
+            cout << ((iqp_assess_quartet == IQP_BOOTSTRAP) ? "Bootstrap " : "Iteration ") << stop_rule.getCurIt() << " / LogL: ";
+            if (verbose_mode >= VB_MED)
+                cout << perturbScore << " -> ";
+            cout << curScore;
+            if (verbose_mode >= VB_MED)
+                cout << " / (NNIs, Steps): (" << nni_count << "," << nni_steps << ")";
+            cout << " / Time: " << convert_time(getRealTime() - params->start_real_time);
+
+            if (stop_rule.getCurIt() > 10) {
+                cout << " (" << convert_time(realtime_remaining) << " left)";
+            }
+            cout << endl;
+        }
+
+        if (params->write_intermediate_trees && save_all_trees != 2) {
+            printIntermediateTree(WT_NEWLINE | WT_APPEND | WT_SORT_TAXA | WT_BR_LEN);
+        }
+
+        /*----------------------------------------
+         * Update if better tree is found
+         *---------------------------------------*/
+        if (curScore > candidateTrees.getBestScore() + params->modeps) {
+            if (params->snni) {
+                imd_tree = optimizeModelParameters();
+            }
+            if (!candidateTrees.treeExist(imd_tree)) {
+                stop_rule.addImprovedIteration(stop_rule.getCurIt());
+                cout << "BETTER TREE FOUND at iteration " << stop_rule.getCurIt() << ": " << curScore << endl;
+            } else {
+                cout << "UPDATE BEST LOG-LIKELIHOOD: " << curScore << endl;
+            }
+            printResultTree();
+        }
+
+        candidateTrees.update(imd_tree, curScore);
+        if (params->snni && verbose_mode >= VB_MED) {
+            printBestScores(params->popSize);
+        }
+
+        // DTH: make pllUFBootData usable in summarizeBootstrap
+        if(params->pll && params->online_bootstrap && (params->gbo_replicates > 0))
+            pllConvertUFBootData2IQTree();
+        // DTH: Carefully watch the -pll case here
+
+
+        /*----------------------------------------
+         * convergence criterion for ultrafast bootstrap
+         *---------------------------------------*/
+        // Test the stop condition first and require a positive half step:
+        // the modulo used to be evaluated unconditionally and divided by
+        // zero whenever step_iterations was smaller than 2.
+        const int half_step = params->step_iterations / 2;
+        if (params->stop_condition == SC_BOOTSTRAP_CORRELATION && half_step > 0
+                && stop_rule.getCurIt() % half_step == 0) {
+            // compute split support every half step
+            SplitGraph *sg = new SplitGraph;
+            summarizeBootstrap(*sg);
+            boot_splits.push_back(sg);
+            if (params->max_candidate_trees == 0)
+                max_candidate_trees = treels_logl.size() * (stop_rule.getCurIt() + (params->step_iterations / 2)) /
+                                                           stop_rule.getCurIt();
+            cout << "NOTE: " << treels_logl.size() << " bootstrap candidate trees evaluated (logl-cutoff: " << logl_cutoff << ")" << endl;
+
+            // check convergence every full step
+            if (stop_rule.getCurIt() % params->step_iterations == 0) {
+                cur_correlation = computeBootstrapCorrelation();
+                cout << "NOTE: Bootstrap correlation coefficient of split occurrence frequencies: " << cur_correlation << endl;
+                if (!stop_rule.meetStopCondition(stop_rule.getCurIt(), cur_correlation)) {
+                    if (params->max_candidate_trees == 0) {
+                        max_candidate_trees = treels_logl.size() * (stop_rule.getCurIt() + params->step_iterations) /
+                                                                   stop_rule.getCurIt();
+                    }
+                }
+            }
+        } // end of bootstrap convergence test
+
+        // print UFBoot trees every 10 iterations
+        if (params->gbo_replicates && params->online_bootstrap && params->print_ufboot_trees &&
+                                                                  stop_rule.getCurIt() % 10 == 0)
+            writeUFBootTrees(*params);
+    }
+
+    readTreeString(candidateTrees.getTopTrees()[0]);
+
+    if (testNNI)
+        outNNI.close();
+    if (params->write_intermediate_trees)
+        out_treels.close();
+    if (params->print_tree_lh) {
+        out_treelh.close();
+        out_sitelh.close();
+    }
+
+    // DTH: pllUFBoot deallocation
+    if(params->pll) {
+        pllDestroyUFBootData();
+    }
+
+    return candidateTrees.getBestScore();
+}
+
+/****************************************************************************
+ Fast Nearest Neighbor Interchange by maximum likelihood
+ ****************************************************************************/
+/**
+ * Run one NNI search on the current tree, through PLL when params->pll is
+ * set and through optimizeNNI() otherwise, and update curScore.
+ * @param nniCount passed by reference to the NNI optimizer
+ * @param nniSteps passed by reference to the NNI optimizer
+ * @return the tree string after the search
+ */
+string IQTree::doNNISearch(int& nniCount, int& nniSteps) {
+    if (!params->pll) {
+        curScore = optimizeNNI(nniCount, nniSteps);
+        if (isSuperTree()) {
+            ((PhyloSuperTree*) this)->computeBranchLengths();
+        }
+        string iqtreeResult = getTreeString();
+        if (params->print_site_posterior)
+            computePatternCategories();
+        return iqtreeResult;
+    }
+
+    // PLL cannot be combined with a partition file
+    if (params->partition_file)
+        outError("Unsupported -pll -sp combination!");
+    curScore = pllOptimizeNNI(nniCount, nniSteps, searchinfo);
+    pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back, PLL_TRUE,
+            PLL_TRUE, 0, 0, 0, PLL_SUMMARIZE_LH, 0, 0);
+    return string(pllInst->tree_string);
+}
+
+double IQTree::optimizeNNI(int &nni_count, int &nni_steps) {
+    bool rollBack = false;
+    nni_count = 0;
+    int numNNIs = 0; // number of NNI to be applied in each step
+    const int MAXSTEPS = aln->getNSeq(); // maximum number of NNI steps
+    NodeVector nodes1, nodes2;
+    for (nni_steps = 1; nni_steps <= MAXSTEPS; nni_steps++) {
+        double oldScore = curScore;
+        if (!rollBack) { // tree get improved and was not rollbacked
+            if (save_all_trees == 2) {
+                saveCurrentTree(curScore); // BQM: for new bootstrap
+            }
+            if (verbose_mode >= VB_DEBUG) {
+                cout << "Doing NNI round " << nni_steps << endl;
+                if (isSuperTree()) {
+                    ((PhyloSuperTree*) this)->printMapInfo();
+                }
+            }
+
+            nonConfNNIs.clear(); // Vector containing non-conflicting positive NNIs
+            optBrans.clear(); // Vector containing branch length of the positive NNIs
+            orgBrans.clear(); // Vector containing all current branch of the tree
+            plusNNIs.clear(); // Vector containing all positive NNIs
+            saveBranches(); // save all current branch lengths
+            initPartitionInfo(); // for super tree
+            int numRemoved;
+            if (nodes1.size() == 0) {
+            	assert (nodes2.size() == 0);
+            	getAllInnerBranches(nodes1, nodes2, &candidateTrees.getStableSplits());
+            	assert(nodes1.size() == (aln->getNSeq() - 3 - candidateTrees.getStableSplits().size()));
+            } else {
+            	// exclude stable splits from NNI evaluation
+                numRemoved = removeBranches(nodes1, nodes2, candidateTrees.getStableSplits());
+            }
+//            cout << "Number of splits removed: " << numRemoved << endl;
+            assert(nodes1.size() == nodes2.size());
+//            for (int i = 0; i < nodes1.size(); i++) {
+//            	cout << "(" << nodes1[i]->id << "," << nodes2[i]->id << ") ; ";
+//            }
+//            cout << endl;
+//            printTree(cout, WT_TAXON_ID + WT_INT_NODE + WT_NEWLINE);
+            evalNNIs(nodes1, nodes2);
+
+//            if (!nni_sort) {
+//                evalNNIs(); // generate all positive NNI moves
+//            } else {
+//                evalNNIsSort(params->approximate_nni);
+//            }
+
+            /* sort all positive NNI moves (descending) */
+            sort(plusNNIs.begin(), plusNNIs.end());
+            if (verbose_mode >= VB_DEBUG) {
+                cout << "curScore: " << curScore << endl;
+                for (int i = 0; i < plusNNIs.size(); i++) {
+                    cout << "Logl of positive NNI " << i << " : " << plusNNIs[i].newloglh << endl;
+                }
+            }
+
+            if (plusNNIs.size() == 0) {
+                break;
+            }
+
+            /* remove conflicting NNIs */
+            genNonconfNNIs();
+            numNNIs = nonConfNNIs.size();
+            if (verbose_mode >= VB_DEBUG) {
+                for (int i = 0; i < nonConfNNIs.size(); i++) {
+                    cout << "Log-likelihood of non-conflicting NNI " << i << " : " << nonConfNNIs[i].newloglh << endl;
+                }
+            }
+        }
+        // Apply all non-conflicting positive NNIs
+        doNNIs(numNNIs);
+
+        if (verbose_mode >= VB_DEBUG) {
+        	cout << "NNI step: " << nni_steps << " / Number of NNIs applied: " << numNNIs << endl;
+        }
+    	nodes1.clear();
+    	nodes2.clear();
+
+        if (searchinfo.speednni) {
+        	getBranchesForNNI(nodes1, nodes2, appliedNNIs);
+            appliedNNIs.clear();
+        }
+
+        // FOR TUNG: If you want to introduce this heuristic, please confirm with reevaluation again.
+//        if (numNNIs > 1) {
+            // Re-estimate branch lengths of the new tree
+            curScore = optimizeAllBranches(1, params->loglh_epsilon, PLL_NEWZPERCYCLE);
+//        } else {
+//        	curScore = computeLikelihood();
+//        }
+
+
+		// curScore should be larger than score of the best NNI
+        if (curScore >= nonConfNNIs.at(0).newloglh - params->loglh_epsilon) {
+            nni_count += numNNIs;
+            rollBack = false;
+        	if (params->reduction) {
+        		string newickToplogy = getTopology();
+        		string newickString = getTreeString();
+            	if (candidateTrees.treeTopologyExist(newickToplogy)) {
+            		double oldScore = candidateTrees.getTopologyScore(newickToplogy);
+            		if (curScore > oldScore)
+    					candidateTrees.update(newickString, curScore, false);
+            		break;
+            	} else {
+					candidateTrees.update(newickString, curScore, false);
+            	}
+        	}
+        } else {
+            /* tree cannot be worse if only 1 NNI is applied */
+            if (numNNIs == 1 && curScore < nonConfNNIs.at(0).newloglh - 1.0) {
+            	cout.precision(15);
+                cout << "BUG: current logl=" << curScore << " < " << nonConfNNIs.at(0).newloglh
+                        << "(best NNI)" << endl;
+                assert(0);
+            }
+            if (verbose_mode >= VB_MED) {
+                cout << "New score = " << curScore << " after applying " << numNNIs <<
+                        " is worse than score = " << nonConfNNIs.at(0).newloglh
+                        << " of the best NNI. Roll back tree ..." << endl;
+            }
+
+            // restore the tree by reverting all NNIs
+            for (int i = 0; i < numNNIs; i++)
+                doNNI(nonConfNNIs.at(i));
+            // restore the branch lengths
+            restoreAllBrans();
+            // This is important because after restoring the branch lengths, all partial
+            // likelihood need to be cleared.
+//            if (params->lh_mem_save == LM_PER_NODE) {
+//                initializeAllPartialLh();
+//            } else
+            clearAllPartialLH();
+            
+            // UPDATE: the following is not needed as clearAllPartialLH() is now also defined for SuperTree
+            // BQM: This was missing: one should also clear all subtrees of a supertree
+//            if (isSuperTree()) {
+//            	PhyloSuperTree *stree = (PhyloSuperTree*)this;
+//            	for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++) {
+//            		(*it)->clearAllPartialLH();
+//            	}
+//            }
+            rollBack = true;
+            // only apply the best NNI
+            numNNIs = 1;
+            curScore = oldScore;
+        }
+        if (curScore - oldScore < 0.1)
+        	break;
+    }
+
+    if (nni_count == 0 && verbose_mode >= VB_MED) {
+        cout << "NOTE: Tree is already NNI-optimized" << endl;
+    }
+    if (nni_steps == MAXSTEPS) {
+    	cout << "WARNING: NNI search needs unusual large number of steps (" << MAXSTEPS << ") to converge!" << endl;
+    }
+    return curScore;
+}
+
+/**
+ * Gather the branches to be considered for NNI around a list of NNI moves.
+ * For each move in nnis, the move's own branch is appended to (nodes1, nodes2)
+ * unless branchExist() reports that it is already listed; afterwards
+ * getInnerBranches() is called from both ends of that branch with the argument 2.
+ * @param nodes1 [IN/OUT] first end nodes of the collected branches
+ * @param nodes2 [IN/OUT] second end nodes of the collected branches
+ * @param nnis   NNI moves whose surroundings are collected
+ */
+void IQTree::getBranchesForNNI(NodeVector& nodes1, NodeVector& nodes2, vector<NNIMove>& nnis) {
+    assert(nodes1.size() == nodes2.size());
+    for (size_t k = 0; k < nnis.size(); ++k) {
+        NNIMove &move = nnis[k];
+        const bool alreadyListed = branchExist(move.node1, move.node2, nodes1, nodes2);
+        if (!alreadyListed) {
+            assert(isInnerBranch(move.node1, move.node2));
+            nodes1.push_back(move.node1);
+            nodes2.push_back(move.node2);
+        }
+        // expand from both ends of the branch
+        getInnerBranches(nodes1, nodes2, 2, move.node1, move.node2);
+        getInnerBranches(nodes1, nodes2, 2, move.node2, move.node1);
+    }
+}
+
+/**
+ * Repeat pllDoNNISearch() until a round applies no NNI or the step limit
+ * (aln->getNSeq()) is exceeded.
+ * @param totalNNICount [OUT] total number of NNIs applied over all rounds
+ * @param nniSteps      [OUT] index of the last round executed (limit + 1 if the limit was hit)
+ * @param searchinfo    [IN/OUT] search state; curLogl and numAppliedNNIs are updated
+ * @return the log-likelihood stored in searchinfo.curLogl after the last round
+ */
+double IQTree::pllOptimizeNNI(int &totalNNICount, int &nniSteps, SearchInfo &searchinfo) {
+    const bool ufbootActive = (globalParam->online_bootstrap == PLL_TRUE) && (globalParam->gbo_replicates > 0);
+    if (ufbootActive)
+        pllInitUFBootData();
+
+    searchinfo.numAppliedNNIs = 0;
+    searchinfo.curLogl = curScore;
+    const int stepLimit = aln->getNSeq();
+    totalNNICount = 0;
+
+    nniSteps = 1;
+    while (nniSteps <= stepLimit) {
+        searchinfo.curNumNNISteps = nniSteps;
+        searchinfo.posNNIList.clear();
+        const double roundLogl = pllDoNNISearch(pllInst, pllPartitions, searchinfo);
+        searchinfo.curLogl = roundLogl;
+        if (searchinfo.curNumAppliedNNIs == 0)
+            break; // no positive NNI was found in this round
+        searchinfo.numAppliedNNIs += searchinfo.curNumAppliedNNIs;
+        nniSteps++;
+    }
+
+    if (nniSteps == (stepLimit + 1)) {
+        cout << "WARNING: NNI search needs unusual large number of steps (" << stepLimit << ") to converge!" << endl;
+    }
+
+    if (searchinfo.numAppliedNNIs == 0) {
+        cout << "NOTE: Tree is already NNI-optimized" << endl;
+    }
+
+    totalNNICount = searchinfo.numAppliedNNIs;
+    pllInst->likelihood = searchinfo.curLogl;
+    return searchinfo.curLogl;
+}
+
+/**
+ * Dump the original pattern weights (pllInst->aliaswgt) and the given
+ * bootstrap sample weights, each followed by its sum, to "boot_samples.log".
+ * @param pll_boot_samples nsamples rows of npatterns weights each
+ * @param nsamples         number of rows to print
+ * @param npatterns        number of entries printed per row
+ */
+void IQTree::pllLogBootSamples(int** pll_boot_samples, int nsamples, int npatterns){
+    ofstream log("boot_samples.log");
+
+    log << "Original freq:" << endl;
+    int total = 0;
+    for (int p = 0; p < pllAlignment->sequenceLength; ++p) {
+        log << setw(4) << pllInst->aliaswgt[p];
+        total += pllInst->aliaswgt[p];
+    }
+    log << endl << "sum = " << total << endl;
+
+    log << "Bootstrap freq:" << endl;
+
+    for (int s = 0; s < nsamples; ++s) {
+        int *row = pll_boot_samples[s];
+        total = 0;
+        for (int p = 0; p < npatterns; ++p) {
+            log << setw(4) << row[p];
+            total += row[p];
+        }
+        log << endl << "sum = "  << total << endl;
+    }
+    log.close();
+}
+
+/**
+ * Allocate (once) and fill the pllUFBootData structure used by the PLL NNI search.
+ *
+ * The structure is only allocated when pllUFBootDataPtr is NULL. When online
+ * bootstrap is enabled with at least one replicate, the per-tree and
+ * per-replicate arrays are allocated as well and boot_samples is copied into
+ * PLL pattern order through pll2iqtree_pattern_index. The scalar settings at
+ * the end are refreshed on every call.
+ *
+ * Every allocation failure is reported through outError().
+ */
+void IQTree::pllInitUFBootData(){
+    if(pllUFBootDataPtr == NULL){
+        pllUFBootDataPtr = (pllUFBootData *) malloc(sizeof(pllUFBootData));
+        // the structure is dereferenced right below, so its allocation must be checked too
+        if(!pllUFBootDataPtr) outError("Not enough dynamic memory!");
+        pllUFBootDataPtr->boot_samples = NULL;
+        pllUFBootDataPtr->candidate_trees_count = 0;
+
+        if(params->online_bootstrap && params->gbo_replicates > 0){
+        	if(!pll2iqtree_pattern_index) pllBuildIQTreePatternIndex();
+
+            pllUFBootDataPtr->treels = pllHashInit(max_candidate_trees);
+            pllUFBootDataPtr->treels_size = max_candidate_trees; // track size of treels_logl, treels_newick, treels_ptnlh
+
+            pllUFBootDataPtr->treels_logl =
+                (double *) malloc(max_candidate_trees * (sizeof(double)));
+            if(!pllUFBootDataPtr->treels_logl) outError("Not enough dynamic memory!");
+
+            // newick and pattern-likelihood slots start out as NULL pointers
+            pllUFBootDataPtr->treels_newick =
+                (char **) malloc(max_candidate_trees * (sizeof(char *)));
+            if(!pllUFBootDataPtr->treels_newick) outError("Not enough dynamic memory!");
+            memset(pllUFBootDataPtr->treels_newick, 0, max_candidate_trees * (sizeof(char *)));
+
+            pllUFBootDataPtr->treels_ptnlh =
+                (double **) malloc(max_candidate_trees * (sizeof(double *)));
+            if(!pllUFBootDataPtr->treels_ptnlh) outError("Not enough dynamic memory!");
+            memset(pllUFBootDataPtr->treels_ptnlh, 0, max_candidate_trees * (sizeof(double *)));
+
+            // aln->createBootstrapAlignment() must be called before this fragment
+            pllUFBootDataPtr->boot_samples =
+                (int **) malloc(params->gbo_replicates * sizeof(int *));
+            if(!pllUFBootDataPtr->boot_samples) outError("Not enough dynamic memory!");
+            for(int i = 0; i < params->gbo_replicates; i++){
+                pllUFBootDataPtr->boot_samples[i] =
+                    (int *) malloc(pllAlignment->sequenceLength * sizeof(int));
+                if(!pllUFBootDataPtr->boot_samples[i]) outError("Not enough dynamic memory!");
+                // reorder the sample weights from IQ-TREE pattern order to PLL pattern order
+                for(int j = 0; j < pllAlignment->sequenceLength; j++){
+                    pllUFBootDataPtr->boot_samples[i][j] =
+                        boot_samples[i][pll2iqtree_pattern_index[j]];
+                }
+            }
+
+            // best log-likelihood per replicate starts at the lowest possible value
+            pllUFBootDataPtr->boot_logl =
+                (double *) malloc(params->gbo_replicates * (sizeof(double)));
+            if(!pllUFBootDataPtr->boot_logl) outError("Not enough dynamic memory!");
+            for(int i = 0; i < params->gbo_replicates; i++)
+                pllUFBootDataPtr->boot_logl[i] = -DBL_MAX;
+
+            pllUFBootDataPtr->boot_counts =
+                (int *) malloc(params->gbo_replicates * (sizeof(int)));
+            if(!pllUFBootDataPtr->boot_counts) outError("Not enough dynamic memory!");
+            memset(pllUFBootDataPtr->boot_counts, 0, params->gbo_replicates * (sizeof(int)));
+
+            pllUFBootDataPtr->boot_trees =
+                (int *) malloc(params->gbo_replicates * (sizeof(int)));
+            if(!pllUFBootDataPtr->boot_trees) outError("Not enough dynamic memory!");
+
+            pllUFBootDataPtr->duplication_counter = 0;
+        }
+    }
+    // settings that are refreshed on every call
+    pllUFBootDataPtr->max_candidate_trees = max_candidate_trees;
+    pllUFBootDataPtr->save_all_trees = save_all_trees;
+    pllUFBootDataPtr->save_all_br_lens = save_all_br_lens;
+    pllUFBootDataPtr->logl_cutoff = logl_cutoff;
+    pllUFBootDataPtr->n_patterns = pllAlignment->sequenceLength;
+}
+
+/**
+ * Release everything owned by pllUFBootDataPtr and the PLL-to-IQ-TREE
+ * pattern index, then reset both pointers to NULL.
+ *
+ * Safe to call when pllInitUFBootData() was never run or when the data was
+ * already destroyed: in that case only the pattern index is released.
+ */
+void IQTree::pllDestroyUFBootData(){
+    if(pll2iqtree_pattern_index){
+        delete [] pll2iqtree_pattern_index;
+        pll2iqtree_pattern_index = NULL;
+    }
+
+    // nothing was allocated (or it was already freed): avoid dereferencing NULL
+    if(pllUFBootDataPtr == NULL)
+        return;
+
+    // the arrays below only exist when they were allocated under the same condition
+    if(params->online_bootstrap && params->gbo_replicates > 0){
+        pllHashDestroy(&(pllUFBootDataPtr->treels), rax_free);
+
+        free(pllUFBootDataPtr->treels_logl);
+
+        // free(NULL) is a no-op, so unused slots need no guard
+        for(int i = 0; i < pllUFBootDataPtr->candidate_trees_count; i++)
+            free(pllUFBootDataPtr->treels_newick[i]);
+        free(pllUFBootDataPtr->treels_newick);
+
+        for(int i = 0; i < pllUFBootDataPtr->treels_size; i++)
+            free(pllUFBootDataPtr->treels_ptnlh[i]);
+        free(pllUFBootDataPtr->treels_ptnlh);
+
+        for(int i = 0; i < params->gbo_replicates; i++)
+            free(pllUFBootDataPtr->boot_samples[i]);
+        free(pllUFBootDataPtr->boot_samples);
+
+        free(pllUFBootDataPtr->boot_logl);
+
+        free(pllUFBootDataPtr->boot_counts);
+
+        free(pllUFBootDataPtr->boot_trees);
+    }
+    free(pllUFBootDataPtr);
+    pllUFBootDataPtr = NULL;
+}
+
+
+/**
+ * Apply the first nni2apply moves of nonConfNNIs to the tree and record each
+ * of them in appliedNNIs.
+ * @param nni2apply  number of leading moves of nonConfNNIs to apply
+ * @param changeBran if true (and params->leastSquareNNI is off), changeNNIBrans()
+ *                   is also called for every applied move
+ */
+void IQTree::doNNIs(int nni2apply, bool changeBran) {
+    int applied = 0;
+    while (applied < nni2apply) {
+        NNIMove &move = nonConfNNIs.at(applied);
+        doNNI(move);
+        appliedNNIs.push_back(move);
+        if (changeBran && !params->leastSquareNNI) {
+            // apply new branch lengths
+            changeNNIBrans(move);
+        }
+        ++applied;
+    }
+}
+
+
+/**
+ * Append to nonConfNNIs every move of plusNNIs (in order) that shares neither
+ * of its two end nodes with a move already present in nonConfNNIs.
+ */
+void IQTree::genNonconfNNIs() {
+    for (size_t c = 0; c < plusNNIs.size(); ++c) {
+        const NNIMove &candidate = plusNNIs[c];
+        bool conflicts = false;
+        for (size_t k = 0; k < nonConfNNIs.size() && !conflicts; ++k) {
+            const NNIMove &kept = nonConfNNIs[k];
+            // two moves conflict as soon as they have an end node in common
+            conflicts = candidate.node1 == kept.node1 || candidate.node2 == kept.node1
+                     || candidate.node1 == kept.node2 || candidate.node2 == kept.node2;
+        }
+        if (!conflicts)
+            nonConfNNIs.push_back(candidate);
+    }
+}
+
+//double IQTree::estN95() {
+//    if (vecNumNNI.size() == 0) {
+//        return 0;
+//    } else {
+//        sort(vecNumNNI.begin(), vecNumNNI.end());
+//        int index = floor(vecNumNNI.size() * speed_conf);
+//        return vecNumNNI[index];
+//    }
+//}
+
+/**
+ * Median of the values stored in vecNumNNI.
+ * Sorts vecNumNNI in place as a side effect.
+ * @return 0 if vecNumNNI is empty; the middle element for an odd number of
+ *         entries; the mean of the two middle elements for an even number
+ */
+double IQTree::getAvgNumNNI() {
+    if (vecNumNNI.empty())
+        return 0;
+
+    sort(vecNumNNI.begin(), vecNumNNI.end());
+    const size_t size = vecNumNNI.size();
+    const size_t mid = size / 2;
+    if (size % 2 == 0) {
+        // the two middle elements are mid-1 and mid; indexing mid+1 would run
+        // past the end for size == 2. Divide in floating point so that the
+        // mean is not truncated when the elements are integers.
+        return ((double) vecNumNNI[mid - 1] + (double) vecNumNNI[mid]) / 2.0;
+    }
+    return vecNumNNI[mid];
+}
+
+/**
+ * Median of the values stored in vecImpProNNI.
+ * Sorts vecImpProNNI in place as a side effect.
+ * @return 0 if vecImpProNNI is empty; the middle element for an odd number of
+ *         entries; the mean of the two middle elements for an even number
+ */
+double IQTree::estDeltaMedian() {
+    if (vecImpProNNI.empty())
+        return 0;
+
+    sort(vecImpProNNI.begin(), vecImpProNNI.end());
+    const size_t size = vecImpProNNI.size();
+    const size_t mid = size / 2;
+    if (size % 2 == 0) {
+        // the two middle elements are mid-1 and mid; indexing mid+1 would run
+        // past the end for size == 2
+        return ((double) vecImpProNNI[mid - 1] + (double) vecImpProNNI[mid]) / 2.0;
+    }
+    return vecImpProNNI[mid];
+}
+
+//inline double IQTree::estDelta95() {
+//    if (vecImpProNNI.size() == 0) {
+//        return 0;
+//    } else {
+//        sort(vecImpProNNI.begin(), vecImpProNNI.end());
+//        int index = floor(vecImpProNNI.size() * speed_conf);
+//        return vecImpProNNI[index];
+//    }
+//}
+
+/** @return the current value of k_delete */
+int IQTree::getDelete() const {
+    const int current = k_delete;
+    return current;
+}
+
+/**
+ * Overwrite k_delete.
+ * @param _delete new value of k_delete
+ */
+void IQTree::setDelete(int _delete) {
+    this->k_delete = _delete;
+}
+
+/**
+ * Set the length of the branch (node1, node2) in both directions and clear
+ * the reverse partial likelihoods on both sides of it.
+ * @param node1  one end of the branch
+ * @param node2  the other end of the branch
+ * @param newlen length to store on both neighbor records
+ */
+void IQTree::changeBranLen(PhyloNode *node1, PhyloNode *node2, double newlen) {
+    // the length is stored once per direction
+    Neighbor *forward = node1->findNeighbor(node2);
+    forward->length = newlen;
+    Neighbor *backward = node2->findNeighbor(node1);
+    backward->length = newlen;
+
+    node1->clearReversePartialLh(node2);
+    node2->clearReversePartialLh(node1);
+}
+
+/**
+ * @param node1 one end of the branch
+ * @param node2 the other end of the branch
+ * @return the length stored on node1's neighbor record pointing to node2
+ */
+double IQTree::getBranLen(PhyloNode *node1, PhyloNode *node2) {
+    Neighbor *link = node1->findNeighbor(node2);
+    return link->length;
+}
+
+/**
+ * Recursively store the length of every branch below (node, dad) in orgBrans,
+ * keyed by getBranchID(). Entries already present in orgBrans are not
+ * overwritten (map insert semantics).
+ * @param node current node; NULL means start from the root
+ * @param dad  node we came from; NULL at the start of the traversal
+ */
+void IQTree::saveBranches(PhyloNode *node, PhyloNode *dad) {
+    if (node == NULL)
+        node = (PhyloNode*) root;
+
+    if (dad != NULL) {
+        const double branchLen = getBranLen(node, dad);
+        const string branchKey = getBranchID(node, dad);
+        orgBrans.insert(mapString2Double::value_type(branchKey, branchLen));
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        saveBranches((PhyloNode*) (*it)->node, node);
+    }
+}
+
+/**
+ * Recursively write the lengths stored in orgBrans back onto every branch
+ * below (node, dad), in both directions.
+ * @param node current node; NULL means start from the root
+ * @param dad  node we came from; NULL at the start of the traversal
+ */
+void IQTree::restoreAllBrans(PhyloNode *node, PhyloNode *dad) {
+    if (node == NULL)
+        node = (PhyloNode*) root;
+
+    if (dad != NULL) {
+        const string branchKey = getBranchID(node, dad);
+        Neighbor *toDad = node->findNeighbor(dad);
+        assert(toDad);
+        Neighbor *toNode = dad->findNeighbor(node);
+        assert(toNode);
+        // the branch is expected to have been saved beforehand
+        assert(orgBrans.count(branchKey));
+        toDad->length = orgBrans[branchKey];
+        toNode->length = orgBrans[branchKey];
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        restoreAllBrans((PhyloNode*) (*it)->node, node);
+    }
+}
+
+/**
+ * Traverse the tree and, for every branch whose two ends are both non-leaf
+ * nodes, ask getBestNNIForBran() for its best NNI; moves scoring more than
+ * curScore + params->loglh_epsilon are handed to addPositiveNNIMove().
+ * @param node current node; NULL means start from the root
+ * @param dad  node we came from; NULL at the start of the traversal
+ */
+void IQTree::evalNNIs(PhyloNode *node, PhyloNode *dad) {
+    if (node == NULL)
+        node = (PhyloNode*) root;
+
+    const bool internalBranch = !node->isLeaf() && dad && !dad->isLeaf();
+    if (internalBranch) {
+        NNIMove best = getBestNNIForBran(node, dad, NULL);
+        if (best.newloglh > curScore + params->loglh_epsilon)
+            addPositiveNNIMove(best);
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        evalNNIs((PhyloNode*) (*it)->node, node);
+    }
+}
+
+/**
+ * Evaluate NNIs on an explicit list of branches (nodes1[i], nodes2[i]).
+ * When nodes1 is empty the whole tree is evaluated via evalNNIs() instead.
+ * Moves scoring more than curScore + params->loglh_epsilon are handed to
+ * addPositiveNNIMove().
+ * @param nodes1 first end nodes of the branches
+ * @param nodes2 second end nodes of the branches
+ */
+void IQTree::evalNNIs(NodeVector& nodes1, NodeVector& nodes2) {
+    if (nodes1.empty()) {
+        // no explicit branch list: fall back to the full traversal
+        evalNNIs();
+        return;
+    }
+
+    assert(!nodes2.empty());
+    assert(nodes1.size() == nodes2.size());
+    for (size_t b = 0; b < nodes1.size() && b < nodes2.size(); ++b) {
+        assert(isInnerBranch(nodes1[b], nodes2[b]));
+        NNIMove best = getBestNNIForBran((PhyloNode*) nodes1[b], (PhyloNode*) nodes2[b], NULL);
+        if (best.newloglh > curScore + params->loglh_epsilon)
+            addPositiveNNIMove(best);
+    }
+}
+
+/**
+ *  Currently not used, commented out to simplify the interface of getBestNNIForBran
+void IQTree::evalNNIsSort(bool approx_nni) {
+        if (myMove.newloglh > curScore + params->loglh_epsilon) {
+        if (myMove.newloglh > curScore + params->loglh_epsilon) {
+            addPositiveNNIMove(myMove);
+        }
+            addPositiveNNIMove(myMove);
+        }
+    NodeVector nodes1, nodes2;
+    int i;
+    double cur_lh = curScore;
+    vector<IntBranchInfo> int_branches;
+
+    getInternalBranches(nodes1, nodes2);
+    assert(nodes1.size() == leafNum - 3 && nodes2.size() == leafNum - 3);
+
+    for (i = 0; i < leafNum - 3; i++) {
+        IntBranchInfo int_branch;
+        PhyloNeighbor *node12_it = (PhyloNeighbor*) nodes1[i]->findNeighbor(nodes2[i]);
+        //PhyloNeighbor *node21_it = (PhyloNeighbor*) nodes2[i]->findNeighbor(nodes1[i]);
+        int_branch.lh_contribution = cur_lh - computeLikelihoodZeroBranch(node12_it, (PhyloNode*) nodes1[i]);
+        if (int_branch.lh_contribution < 0.0)
+            int_branch.lh_contribution = 0.0;
+        if (int_branch.lh_contribution < fabs(nni_cutoff)) {
+            int_branch.node1 = (PhyloNode*) nodes1[i];
+            int_branch.node2 = (PhyloNode*) nodes2[i];
+            int_branches.push_back(int_branch);
+        }
+    }
+    std::sort(int_branches.begin(), int_branches.end(), int_branch_cmp);
+    for (vector<IntBranchInfo>::iterator it = int_branches.begin(); it != int_branches.end(); it++)
+        if (it->lh_contribution >= 0.0) // evaluate NNI if branch contribution is big enough
+                {
+            NNIMove myMove = getBestNNIForBran(it->node1, it->node2, NULL, approx_nni, it->lh_contribution);
+            if (myMove.newloglh > curScore) {
+                addPositiveNNIMove(myMove);
+                if (!estimate_nni_cutoff)
+                    for (vector<IntBranchInfo>::iterator it2 = it + 1; it2 != int_branches.end(); it2++) {
+                        if (it2->node1 == it->node1 || it2->node2 == it->node1 || it2->node1 == it->node2
+                                || it2->node2 == it->node2)
+                            it2->lh_contribution = -1.0; // do not evaluate this branch later on
+                    }
+            }
+        } else { // otherwise, only optimize the branch length
+            PhyloNode *node1 = it->node1;
+            PhyloNode *node2 = it->node2;
+            PhyloNeighbor *node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
+            PhyloNeighbor *node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
+            double stored_len = node12_it->length;
+            curScore = optimizeOneBranch(node1, node2, false);
+            string key("");
+            if (node1->id < node2->id) {
+                key += convertIntToString(node1->id) + "->" + convertIntToString(node2->id);
+            } else {
+                key += convertIntToString(node2->id) + "->" + convertIntToString(node1->id);
+            }
+
+            optBrans.insert(mapString2Double::value_type(key, node12_it->length));
+            node12_it->length = stored_len;
+            node21_it->length = stored_len;
+        }
+}
+*/
+
+/**
+ * Estimate nni_cutoff from the likelihood scores collected in nni_info and
+ * write all deltas to "<out_prefix>.nnidelta".
+ *
+ * For each entry, lh_score[1..3] are sorted ascending and the delta is
+ * lh_score[0] minus the middle one of those three. The cutoff is the element
+ * at position size/20 of the ascending deltas.
+ *
+ * If nni_info is empty there is nothing to estimate: a warning is printed and
+ * nni_cutoff is left unchanged.
+ *
+ * @param params program parameters; only out_prefix is read here
+ */
+void IQTree::estimateNNICutoff(Params* params) {
+    const size_t numInfo = nni_info.size();
+    if (numInfo == 0) {
+        // indexing the delta array below would read past a zero-length buffer
+        cout << "WARNING: No NNI information available, NNI cutoff not estimated" << endl;
+        return;
+    }
+
+    // vector instead of new[]: released on every exit path
+    vector<double> delta(numInfo);
+    for (size_t i = 0; i < numInfo; i++) {
+        double lh_score[4];
+        memmove(lh_score, nni_info[i].lh_score, 4 * sizeof(double));
+        std::sort(lh_score + 1, lh_score + 4); // sort in ascending order
+        delta[i] = lh_score[0] - lh_score[2];
+        if (verbose_mode >= VB_MED)
+            cout << i << ": " << lh_score[0] << " " << lh_score[1] << " " << lh_score[2] << " " << lh_score[3] << endl;
+    }
+    std::sort(delta.begin(), delta.end());
+    nni_cutoff = delta[numInfo / 20];
+    cout << endl << "Estimated NNI cutoff: " << nni_cutoff << endl;
+
+    string file_name = params->out_prefix;
+    file_name += ".nnidelta";
+    try {
+        ofstream out;
+        out.exceptions(ios::failbit | ios::badbit);
+        out.open(file_name.c_str());
+        for (size_t i = 0; i < numInfo; i++) {
+            out << delta[i] << endl;
+        }
+        out.close();
+        cout << "NNI delta printed to " << file_name << endl;
+    } catch (ios::failure &) {
+        outError(ERR_WRITE_OUTPUT, file_name);
+    }
+}
+
+/**
+ * Register the current tree, with log-likelihood cur_logl, in the candidate
+ * tree bookkeeping (treels, treels_logl, treels_newick, treels_ptnlh) and,
+ * when boot_samples is non-empty, update the best score / tree index kept for
+ * every bootstrap sample (boot_logl, boot_counts, boot_trees).
+ * Depending on write_intermediate_trees and print_tree_lh the tree and its
+ * likelihoods are also written to out_treels, out_treelh and out_sitelh.
+ * @param cur_logl log-likelihood of the current tree
+ */
+void IQTree::saveCurrentTree(double cur_logl) {
+    ostringstream ostr;
+    string tree_str;
+    StringIntMap::iterator it = treels.end();
+    // the topology string is only built and looked up when candidate trees are stored
+    if (params->store_candidate_trees) {
+        printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
+        tree_str = ostr.str();
+        it = treels.find(tree_str);
+    }
+    int tree_index = -1;
+    if (it != treels.end()) { // already in treels
+        duplication_counter++;
+        tree_index = it->second;
+        // nothing to do unless the new score improves on the stored one by more than 1e-4
+        if (cur_logl <= treels_logl[it->second] + 1e-4) {
+            if (cur_logl < treels_logl[it->second] - 5.0)
+                if (verbose_mode >= VB_MED)
+                    cout << "Current lh " << cur_logl << " is much worse than expected " << treels_logl[it->second]
+                            << endl;
+            return;
+        }
+        if (verbose_mode >= VB_MAX)
+            cout << "Updated logl " << treels_logl[it->second] << " to " << cur_logl << endl;
+        treels_logl[it->second] = cur_logl;
+        if (save_all_br_lens) {
+            ostr.seekp(ios::beg);
+            printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA | WT_BR_LEN | WT_BR_SCALE | WT_BR_LEN_ROUNDING);
+            treels_newick[it->second] = ostr.str();
+        }
+        // without bootstrap samples only the stored pattern likelihoods are refreshed
+        if (boot_samples.empty()) {
+            computePatternLikelihood(treels_ptnlh[it->second], &cur_logl);
+            return;
+        }
+        if (verbose_mode >= VB_MAX)
+            cout << "Update treels_logl[" << tree_index << "] := " << cur_logl << endl;
+    } else {
+        // new tree: skipped when a cutoff is set and the score does not exceed it
+        if (logl_cutoff != 0.0 && cur_logl <= logl_cutoff + 1e-4)
+            return;
+        tree_index = treels_logl.size();
+        if (params->store_candidate_trees)
+            treels[tree_str] = tree_index;
+        treels_logl.push_back(cur_logl);
+        if (verbose_mode >= VB_MAX)
+            cout << "Add    treels_logl[" << tree_index << "] := " << cur_logl << endl;
+    }
+
+    if (write_intermediate_trees)
+        printTree(out_treels, WT_NEWLINE | WT_BR_LEN);
+
+    int nptn = getAlnNPattern();
+
+    // pattern log-likelihoods go into a zero-filled buffer of maxnptn entries;
+    // with BOOT_VAL_FLOAT a double copy (pattern_lh_orig) is kept as well
+#ifdef BOOT_VAL_FLOAT
+    int maxnptn = get_safe_upper_limit_float(nptn);
+    BootValType *pattern_lh = aligned_alloc<BootValType>(maxnptn);
+    memset(pattern_lh, 0, maxnptn*sizeof(BootValType));
+    double *pattern_lh_orig = aligned_alloc<double>(nptn);
+    computePatternLikelihood(pattern_lh_orig, &cur_logl);
+    for (int i = 0; i < nptn; i++)
+    	pattern_lh[i] = (float)pattern_lh_orig[i];
+#else
+    int maxnptn = get_safe_upper_limit(nptn);
+    BootValType *pattern_lh = aligned_alloc<BootValType>(maxnptn);
+    memset(pattern_lh, 0, maxnptn*sizeof(BootValType));
+    computePatternLikelihood(pattern_lh, &cur_logl);
+#endif
+
+
+    if (boot_samples.empty()) {
+        // for runGuidedBootstrap
+        // the pushed buffer stays referenced by treels_ptnlh and is not freed below
+#ifdef BOOT_VAL_FLOAT
+        treels_ptnlh.push_back(pattern_lh_orig);
+#else
+        treels_ptnlh.push_back(pattern_lh);
+#endif
+    } else {
+        // online bootstrap
+        int ptn;
+//        int updated = 0;
+        int nsamples = boot_samples.size();
+
+        #ifdef _OPENMP
+        #pragma omp parallel for
+        #endif
+        for (int sample = 0; sample < nsamples; sample++) {
+            double rell = 0.0;
+
+            // the plain loop is disabled; it is kept as a reference for the dotProduct call
+            if (false) {
+            	BootValType *boot_sample = boot_samples[sample];
+            	BootValType rellll = 0.0;
+				for (ptn = 0; ptn < nptn; ptn++)
+					rellll += pattern_lh[ptn] * boot_sample[ptn];
+				rell = (double)rellll;
+            } else {
+            	// SSE optimized version of the above loop
+				BootValType *boot_sample = boot_samples[sample];
+
+				BootValType res = (this->*dotProduct)(pattern_lh, boot_sample, nptn);
+
+				rell = res;
+            }
+
+            // clearly better, or within ufboot_epsilon of the best and picked at random
+            // with probability 1 / (boot_counts[sample] + 1)
+            bool better = rell > boot_logl[sample] + params->ufboot_epsilon;
+            if (!better && rell > boot_logl[sample] - params->ufboot_epsilon) {
+                #ifdef _OPENMP
+                #pragma omp critical
+                #endif
+                better = random_double() <= 1.0 / (boot_counts[sample] + 1);
+            }
+            if (better) {
+                // NOTE(review): tree_str is tested outside the critical section and not
+                // re-tested inside it, so with OpenMP several threads may enter the
+                // block below one after another and print to ostr again - confirm
+                if (tree_str == "") 
+                #ifdef _OPENMP
+                #pragma omp critical
+                #endif
+                {
+                    printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
+                    tree_str = ostr.str();
+                    it = treels.find(tree_str);
+                    if (it != treels.end()) {
+                        tree_index = it->second;
+                    } else {
+                        tree_index = treels.size();
+                        treels[tree_str] = tree_index;
+                    }
+                }
+                // a tie increases the tie counter, a strict improvement resets it to 1
+                if (rell <= boot_logl[sample] + params->ufboot_epsilon) {
+                    boot_counts[sample]++;
+                } else {
+                    boot_counts[sample] = 1;
+                }
+                boot_logl[sample] = max(boot_logl[sample], rell);
+                boot_trees[sample] = tree_index;
+//                updated++;
+            } /*else if (verbose_mode >= VB_MED && rell > boot_logl[sample] - 0.01) {
+             cout << "Info: multiple RELL score trees detected" << endl;
+             }*/
+        }
+//        if (updated && verbose_mode >= VB_MAX)
+//            cout << updated << " boot trees updated" << endl;
+        /*
+         if (tree_index >= max_candidate_trees/2 && boot_splits->empty()) {
+         // summarize split support half way for stopping criterion
+         cout << "Summarizing current bootstrap supports..." << endl;
+         summarizeBootstrap(*boot_splits);
+         }*/
+    }
+    if (save_all_br_lens) {
+        ostr.seekp(ios::beg);
+        printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA | WT_BR_LEN | WT_BR_SCALE | WT_BR_LEN_ROUNDING);
+        treels_newick.push_back(ostr.str());
+    }
+    if (print_tree_lh) {
+        out_treelh << cur_logl;
+        double prob;
+#ifdef BOOT_VAL_FLOAT
+        aln->multinomialProb(pattern_lh_orig, prob);
+#else
+        aln->multinomialProb(pattern_lh, prob);
+#endif
+        out_treelh << "\t" << prob << endl;
+
+        // expand the per-pattern values to one value per alignment site
+        IntVector pattern_index;
+        aln->getSitePatternIndex(pattern_index);
+        out_sitelh << "Site_Lh   ";
+        for (int i = 0; i < getAlnNSite(); i++)
+            out_sitelh << " " << pattern_lh[pattern_index[i]];
+        out_sitelh << endl;
+    }
+
+    // free only the buffers that were not handed over to treels_ptnlh above
+    if (!boot_samples.empty()) {
+#ifdef BOOT_VAL_FLOAT
+    	aligned_free(pattern_lh_orig);
+#endif
+    	aligned_free(pattern_lh);
+    } else {
+#ifdef BOOT_VAL_FLOAT
+    	aligned_free(pattern_lh);
+#endif
+    }
+
+}
+
+/**
+ * Traverse the tree and call computeNNIPatternLh() on every branch whose two
+ * ends are both non-leaf nodes. The pattern likelihood buffers are temporary
+ * and released right after the call.
+ * @param node current node; NULL means start from the root
+ * @param dad  node we came from; NULL at the start of the traversal
+ */
+void IQTree::saveNNITrees(PhyloNode *node, PhyloNode *dad) {
+    if (node == NULL)
+        node = (PhyloNode*) root;
+
+    const bool internalBranch = dad && !node->isLeaf() && !dad->isLeaf();
+    if (internalBranch) {
+        double *firstPtnLh = new double[aln->getNPattern()];
+        double *secondPtnLh = new double[aln->getNPattern()];
+        double firstLh, secondLh;
+        computeNNIPatternLh(curScore, firstLh, firstPtnLh, secondLh, secondPtnLh, node, dad);
+        delete[] secondPtnLh;
+        delete[] firstPtnLh;
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        saveNNITrees((PhyloNode*) (*it)->node, node);
+    }
+}
+
+/**
+ * Turn a weighted set of bootstrap trees into split supports, annotate the
+ * current tree with them and write the result files:
+ * "<out_prefix>.splits" (only if params.print_splits_file),
+ * "<out_prefix>.suptree" (only if save_all_trees is off) and
+ * "<out_prefix>.splits.nex".
+ * The current tree is re-read from its annotated text form, so all nodes are
+ * freed and rebuilt in the process.
+ * @param params program parameters (out_prefix, print_splits_file, min_correlation)
+ * @param trees  bootstrap trees together with their weights
+ */
+void IQTree::summarizeBootstrap(Params &params, MTreeSet &trees) {
+    int totalWeight = trees.sumTreeWeights();
+    int idx;
+    if (verbose_mode >= VB_MAX) {
+        for (idx = 0; idx < trees.size(); idx++) {
+            if (trees.tree_weights[idx] > 0)
+                cout << "Tree " << idx + 1 << " weight= " << (double) trees.tree_weights[idx] * 100 / totalWeight << endl;
+        }
+    }
+    int heaviest = max_element(trees.tree_weights.begin(), trees.tree_weights.end()) - trees.tree_weights.begin();
+    if (verbose_mode >= VB_MED) {
+        cout << "max_tree_id = " << heaviest + 1 << "   max_weight = " << trees.tree_weights[heaviest];
+        cout << " (" << (double) trees.tree_weights[heaviest] * 100 / totalWeight << "%)" << endl;
+    }
+
+    // assign bootstrap support
+    SplitGraph splitGraph;
+    SplitIntMap splitIndex;
+
+    // taxon names come from the latest boot split set if there is one,
+    // otherwise from this tree
+    vector<string> taxaNames;
+    taxaNames.resize(leafNum);
+    if (!boot_splits.empty()) {
+        boot_splits.back()->getTaxaName(taxaNames);
+    } else {
+        getTaxaName(taxaNames);
+    }
+
+    trees.convertSplits(taxaNames, splitGraph, splitIndex, SW_COUNT, -1, NULL, false); // do not sort taxa
+
+    if (verbose_mode >= VB_MED)
+        cout << splitGraph.size() << " splits found" << endl;
+
+    if (!boot_splits.empty()) {
+        // check the stopping criterion for ultra-fast bootstrap
+        if (computeBootstrapCorrelation() < params.min_correlation)
+            cout << "WARNING: bootstrap analysis did not converge. You should rerun with higher number of iterations (-nm option)" << endl;
+    }
+
+    splitGraph.scaleWeight(1.0 / trees.sumTreeWeights(), false, 4);
+    string outPath;
+    outPath = params.out_prefix;
+    outPath += ".splits";
+    if (params.print_splits_file) {
+        splitGraph.saveFile(outPath.c_str(), IN_OTHER, true);
+        cout << "Split supports printed to star-dot file " << outPath << endl;
+    }
+
+    // compute the percentage of appearance
+    splitGraph.scaleWeight(100.0, true);
+    cout << "Creating bootstrap support values..." << endl;
+
+    // copy the current tree into a scratch tree and attach the supports there
+    stringstream treeBuf;
+    printTree(treeBuf, WT_TAXON_ID | WT_BR_LEN);
+    MExtTree supportTree;
+    supportTree.readTree(treeBuf, rooted);
+    supportTree.assignLeafID();
+    supportTree.createBootstrapSupport(taxaNames, trees, splitGraph, splitIndex, NULL);
+
+    // now write resulting tree with supports
+    treeBuf.seekp(0, ios::beg);
+    supportTree.printTree(treeBuf);
+
+    // now read resulting tree
+    treeBuf.seekg(0, ios::beg);
+    freeNode();
+    // RARE BUG FIX: to avoid cases that identical seqs were removed and leaf name happens to be IDs
+    MTree::readTree(treeBuf, rooted);
+
+    assignLeafNames();
+    if (!isSuperTree()) {
+        initializeAllPartialLh();
+        clearAllPartialLH();
+    } else {
+        ((PhyloSuperTree*) this)->mapTrees();
+    }
+
+    if (!save_all_trees) {
+        outPath = params.out_prefix;
+        outPath += ".suptree";
+
+        printTree(outPath.c_str());
+        cout << "Tree with assigned bootstrap support written to " << outPath << endl;
+    }
+
+    outPath = params.out_prefix;
+    outPath += ".splits.nex";
+    splitGraph.saveFile(outPath.c_str(), IN_NEXUS, false);
+    cout << "Split supports printed to NEXUS file " << outPath << endl;
+}
+
+/**
+ * Write the UFBoot trees to the file <out_prefix>.ufboot.
+ * Every distinct candidate tree is printed once per bootstrap replicate
+ * that selected it (its weight), using the real sequence names.
+ * @param params program parameters; only out_prefix is read here
+ */
+void IQTree::writeUFBootTrees(Params &params) {
+    MTreeSet trees;
+    IntVector tree_weights;
+    int sample, i, j;
+    // weight of a candidate tree = number of bootstrap replicates pointing to it
+    tree_weights.resize(treels_logl.size(), 0);
+    for (sample = 0; sample < boot_trees.size(); sample++)
+        tree_weights[boot_trees[sample]]++;
+    trees.init(treels, rooted, tree_weights);
+    string filename = params.out_prefix;
+    filename += ".ufboot";
+    ofstream out(filename.c_str());
+    if (!out.is_open()) {
+        // do not claim success below when the file could not be created;
+        // outError is expected to abort, the return is only a safety net
+        outError(ERR_WRITE_OUTPUT, filename);
+        return;
+    }
+    for (i = 0; i < trees.size(); i++) {
+        NodeVector taxa;
+        // change the taxa name from ID to real name
+        trees[i]->getOrderedTaxa(taxa);
+        for (j = 0; j < taxa.size(); j++)
+            taxa[j]->name = aln->getSeqName(taxa[j]->id);
+        if (removed_seqs.size() > 0) {
+            // reinsert removed seqs into each tree
+            trees[i]->insertTaxa(removed_seqs, twin_seqs);
+        }
+        // now print to file, repeated according to the tree's weight
+        for (j = 0; j < trees.tree_weights[i]; j++)
+            trees[i]->printTree(out, WT_NEWLINE);
+    }
+    out.close();
+    cout << "UFBoot trees printed to " << filename << endl;
+}
+
+/**
+ * Build the weighted set of candidate trees from the bootstrap replicates
+ * and hand it to summarizeBootstrap(params, trees).
+ * @param params program parameters; field root is used to re-root the tree
+ */
+void IQTree::summarizeBootstrap(Params &params) {
+    setRootNode(params.root);
+    if (verbose_mode >= VB_MED) {
+        cout << "Summarizing from " << treels.size() << " candidate trees..." << endl;
+    }
+
+    // one counter per candidate tree: how many replicates selected it
+    IntVector weights;
+    weights.resize(treels_logl.size(), 0);
+    for (IntVector::iterator rep = boot_trees.begin(); rep != boot_trees.end(); ++rep)
+        ++weights[*rep];
+
+    MTreeSet tree_set;
+    tree_set.init(treels, rooted, weights);
+    summarizeBootstrap(params, tree_set);
+}
+
+/**
+ * Summarize the bootstrap trees into a set of splits.
+ * @param sg (OUT) split graph filled by MTreeSet::convertSplits
+ */
+void IQTree::summarizeBootstrap(SplitGraph &sg) {
+    // one counter per candidate tree: how many replicates selected it
+    IntVector weights;
+    weights.resize(treels_logl.size(), 0);
+    for (IntVector::iterator rep = boot_trees.begin(); rep != boot_trees.end(); ++rep)
+        ++weights[*rep];
+
+    MTreeSet tree_set;
+    tree_set.init(treels, rooted, weights);
+
+    // taxon names of the current tree, one slot per leaf
+    vector<string> taxon_names;
+    taxon_names.resize(leafNum);
+    getTaxaName(taxon_names);
+
+    SplitIntMap split_index;
+    // final argument false: do not sort taxa
+    tree_set.convertSplits(taxon_names, sg, split_index, SW_COUNT, -1, NULL, false);
+}
+
+/**
+ * Copy the UFBoot bookkeeping kept in pllUFBootDataPtr into the IQTree
+ * members duplication_counter, treels_logl, boot_trees and treels.
+ */
+void IQTree::pllConvertUFBootData2IQTree(){
+    duplication_counter = pllUFBootDataPtr->duplication_counter;
+
+    // log-likelihood of every candidate tree
+    treels_logl.clear();
+    for (int tree = 0; tree < pllUFBootDataPtr->candidate_trees_count; ++tree)
+        treels_logl.push_back(pllUFBootDataPtr->treels_logl[tree]);
+
+    // one entry per bootstrap replicate
+    boot_trees.clear();
+    for (int rep = 0; rep < params->gbo_replicates; ++rep)
+        boot_trees.push_back(pllUFBootDataPtr->boot_trees[rep]);
+
+    // tree string -> integer stored in the PLL hash table
+    treels.clear();
+    if (pllUFBootDataPtr->candidate_trees_count > 0) {
+        struct pllHashTable *table = pllUFBootDataPtr->treels;
+        for (int bucket = 0; bucket < table->size; ++bucket) {
+            // walk the chain of items hanging off this bucket
+            for (struct pllHashItem *item = table->Items[bucket]; item; item = item->next) {
+                string key(item->str);
+                treels[key] = *((int *) item->data);
+            }
+        }
+    }
+}
+
+/**
+ * Pearson correlation coefficient of two integer vectors of equal length.
+ *
+ * The values are centred by subtracting the mean. The previous form,
+ * x/mean - 1, divided by the mean: an all-zero vector produced NaN instead
+ * of reaching the zero-variance case, and a negative mean flipped the sign.
+ * For positive means both forms are mathematically the same, because the
+ * coefficient does not change under positive rescaling.
+ *
+ * @param ix first vector
+ * @param iy second vector, must have the same size as ix
+ * @return the correlation coefficient; 1.0 if either vector has zero
+ *         variance (this includes empty vectors)
+ */
+double computeCorrelation(IntVector &ix, IntVector &iy) {
+
+    assert(ix.size() == iy.size());
+    const size_t n = ix.size();
+    if (n == 0)
+        return 1.0; // nothing to compare: same answer as the zero-variance case
+
+    double mx = 0.0, my = 0.0; // mean values
+    size_t i;
+    for (i = 0; i < n; i++) {
+        mx += ix[i];
+        my += iy[i];
+    }
+    mx /= n;
+    my /= n;
+
+    // f1: co-variation, f2/f3: variation of x and y (all unnormalised)
+    double f1 = 0.0, f2 = 0.0, f3 = 0.0;
+    for (i = 0; i < n; i++) {
+        const double dx = ix[i] - mx;
+        const double dy = iy[i] - my;
+        f1 += dx * dy;
+        f2 += dx * dx;
+        f3 += dy * dy;
+    }
+    if (f2 == 0.0 || f3 == 0.0)
+        return 1.0;
+    return f1 / (sqrt(f2) * sqrt(f3));
+}
+
+/**
+ * Correlate the split supports of the newest entry of boot_splits with
+ * those of the entry at index (size - 1) / 2.
+ * @return 0.0 when fewer than two split sets are stored, otherwise the
+ *         value returned by computeCorrelation()
+ */
+double IQTree::computeBootstrapCorrelation() {
+    if (boot_splits.size() < 2)
+        return 0.0;
+
+    SplitGraph *latest = boot_splits.back();
+    SplitGraph *earlier = boot_splits[(boot_splits.size() - 1) / 2];
+    int s;
+
+    // index the splits with trivial() == -1 of the earlier set and record their weights
+    SplitIntMap split_index;
+    IntVector old_supports;
+    for (s = 0; s < earlier->size(); s++) {
+        if (earlier->at(s)->trivial() != -1)
+            continue;
+        split_index.insertSplit(earlier->at(s), old_supports.size());
+        old_supports.push_back((int) (earlier->at(s)->getWeight()));
+    }
+
+    // weights of the same splits in the latest set; unseen splits are appended
+    IntVector new_supports;
+    new_supports.resize(old_supports.size(), 0);
+    for (s = 0; s < latest->size(); s++) {
+        if ((*latest)[s]->trivial() != -1)
+            continue;
+        int pos;
+        if (split_index.findSplit((*latest)[s], pos)) {
+            // split already present in the earlier set
+            new_supports[pos] = (int) ((*latest)[s]->getWeight());
+        } else {
+            // split only present in the latest set
+            new_supports.push_back((int) ((*latest)[s]->getWeight()));
+        }
+    }
+    if (verbose_mode >= VB_MED)
+        cout << new_supports.size() - old_supports.size() << " new splits compared to old boot_splits" << endl;
+
+    // pad the earlier vector with zeros so that both have the same length
+    if (new_supports.size() > old_supports.size())
+        old_supports.resize(new_supports.size(), 0);
+
+    return computeCorrelation(old_supports, new_supports);
+}
+
+void IQTree::addPositiveNNIMove(NNIMove myMove) { // myMove is taken by value
+    plusNNIs.push_back(myMove); // append a copy of the move to the plusNNIs list
+}
+
+/**
+ * Print the tree to <out_prefix>.treefile, or to
+ * <out_prefix>.treefile.<suffix> when a non-empty suffix is given.
+ * @param suffix optional extension appended after ".treefile"
+ */
+void IQTree::printResultTree(string suffix) {
+    setRootNode(params->root);
+    string out_name = params->out_prefix;
+    out_name += ".treefile";
+    if (!suffix.empty())
+        out_name += "." + suffix;
+    printTree(out_name.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
+}
+
+/**
+ * Print the tree to a stream, using the same flags as the .treefile output.
+ * @param out (OUT) output stream
+ */
+void IQTree::printResultTree(ostream &out) {
+    const int print_flags = WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE;
+    setRootNode(params->root);
+    printTree(out, print_flags);
+}
+
+/*
+void IQTree::printPhylolibModelParams(const char* suffix) {
+    char phyloliModelFile[1024];
+    strcpy(phyloliModelFile, params->out_prefix);
+    strcat(phyloliModelFile, suffix);
+    ofstream modelfile;
+    modelfile.open(phyloliModelFile);
+    for (int model = 0; model < pllInst->NumberOfModels; model++) {
+        cout << "Rate parameters: ";
+        for (int i = 0; i < 6; i++) {
+            cout << pllInst->partitionData[model].substRates[i] << " ";
+            modelfile << pllInst->partitionData[model].substRates[i] << " ";
+        }
+        cout << endl;
+        modelfile << endl;
+        cout << "Base frequencies: ";
+        for (int i = 0; i < aln->num_states; i++) {
+            cout << pll_tree->partitionData[model].frequencies[i] << " ";
+            modelfile << pll_tree->partitionData[model].frequencies[i] << " ";
+        }
+        cout << endl;
+        modelfile << endl;
+        cout << "Gamma shape :" << pll_tree->partitionData[model].alpha << endl;
+        modelfile << pll_tree->partitionData[model].alpha << endl;
+    }
+}
+*/
+
+/**
+ * Write the current PLL tree (as produced by pllTreeToNewick) to the file
+ * <out_prefix><suffix>.
+ *
+ * The file name is built in a std::string rather than a fixed 1024-byte
+ * buffer, so a long prefix can no longer overflow. The result of fopen is
+ * checked, and the file is closed so the tree is flushed to disk and the
+ * handle is not leaked.
+ *
+ * @param suffix suffix string for the tree file
+ */
+void IQTree::printPhylolibTree(const char* suffix) {
+    pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back, PLL_TRUE, 1, 0, 0, 0,
+            PLL_SUMMARIZE_LH, 0, 0);
+    string tree_path = params->out_prefix;
+    tree_path += suffix;
+    FILE *tree_file = fopen(tree_path.c_str(), "w");
+    if (!tree_file) {
+        // outError is expected to abort; the return is only a safety net
+        outError(ERR_WRITE_OUTPUT, tree_path);
+        return;
+    }
+    fprintf(tree_file, "%s", pllInst->tree_string);
+    fclose(tree_file);
+    cout << "Tree optimized by Phylolib was written to " << tree_path << endl;
+}
+
+/**
+ * Record and/or print the current tree as an intermediate tree.
+ *
+ * With params->avoid_duplicated_trees set, the topology string of the tree
+ * is looked up in treels. A known topology only has its stored
+ * log-likelihood (and optionally its NEWICK string) refreshed when curScore
+ * improved by more than 1e-4. A new topology is stored unless curScore does
+ * not exceed a non-zero logl_cutoff by more than 1e-4.
+ * Trees that are not flagged as duplicates are written to out_treels and,
+ * if requested, their likelihoods to out_treelh / out_sitelh.
+ *
+ * @param brtype flags passed to printTree when writing to out_treels;
+ *        WT_APPEND suppresses the site-count header line of out_sitelh
+ */
+void IQTree::printIntermediateTree(int brtype) {
+    setRootNode(params->root);
+    bool duplicated_tree = false;
+    double *pattern_lh = NULL;
+    double logl = curScore;
+    if (params->avoid_duplicated_trees) {
+        // topology-only string (taxon IDs, sorted taxa) used as the lookup key
+        stringstream ostr;
+        printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
+        string tree_str = ostr.str();
+        StringIntMap::iterator it = treels.find(tree_str);
+        if (it != treels.end()) { // already in treels
+            duplicated_tree = true;
+            if (curScore > treels_logl[it->second] + 1e-4) {
+                if (verbose_mode >= VB_MAX)
+                    cout << "Updated logl " << treels_logl[it->second] << " to " << curScore << endl;
+                treels_logl[it->second] = curScore;
+                computeLikelihood(treels_ptnlh[it->second]);
+                if (save_all_br_lens) {
+                    // rewind to position 0; seekp(ios::beg) passed a seek
+                    // direction where a stream position is expected
+                    ostr.seekp(0, ios::beg);
+                    printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA | WT_BR_LEN | WT_BR_SCALE | WT_BR_LEN_ROUNDING);
+                    treels_newick[it->second] = ostr.str();
+                }
+            }
+        } else {
+            if (logl_cutoff != 0.0 && curScore <= logl_cutoff + 1e-4)
+                duplicated_tree = true; // below the cutoff: handled like a duplicate
+            else {
+                // new topology: its index is the current number of stored trees
+                treels[tree_str] = treels_ptnlh.size();
+                pattern_lh = new double[getAlnNPattern()];
+                computePatternLikelihood(pattern_lh);
+                treels_ptnlh.push_back(pattern_lh); // the array is kept in treels_ptnlh
+                treels_logl.push_back(logl);
+                if (save_all_br_lens) {
+                    ostr.seekp(0, ios::beg);
+                    printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA | WT_BR_LEN | WT_BR_SCALE | WT_BR_LEN_ROUNDING);
+                    treels_newick.push_back(ostr.str());
+                }
+            }
+        }
+    } else {
+        if (params->print_tree_lh) {
+            // temporary array, released below after it has been printed
+            pattern_lh = new double[getAlnNPattern()];
+            computePatternLikelihood(pattern_lh, &logl);
+        }
+    }
+
+    if (!duplicated_tree) {
+        if (write_intermediate_trees)
+            printTree(out_treels, brtype);
+        if (params->print_tree_lh) {
+            out_treelh.precision(10);
+            out_treelh << logl;
+            double prob;
+            aln->multinomialProb(pattern_lh, prob);
+            out_treelh << "\t" << prob << endl;
+            if (!(brtype & WT_APPEND))
+                out_sitelh << aln->getNSite() << endl;
+            out_sitelh << "Site_Lh   ";
+            for (int i = 0; i < aln->getNSite(); i++)
+                out_sitelh << "\t" << pattern_lh[aln->getPatternID(i)];
+            out_sitelh << endl;
+            // only free the array when it was not stored in treels_ptnlh
+            if (!params->avoid_duplicated_trees)
+                delete[] pattern_lh;
+        }
+    }
+    if (params->write_intermediate_trees == 1 && save_all_trees != 1) {
+        return;
+    }
+    // run evalNNIs() with save_all_trees temporarily set to 2, then restore it
+    int x = save_all_trees;
+    save_all_trees = 2;
+    evalNNIs();
+    save_all_trees = x;
+}
+
+
diff --git a/iqtree.h b/iqtree.h
new file mode 100644
index 0000000..94e8fe9
--- /dev/null
+++ b/iqtree.h
@@ -0,0 +1,822 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef IQPTREE_H
+#define IQPTREE_H
+
+#include <set>
+#include <map>
+#include <stack>
+#include <vector>
+#include "phylotree.h"
+#include "phylonode.h"
+#include "stoprule.h"
+#include "mtreeset.h"
+#include "node.h"
+#include "candidateset.h"
+#include "pllnni.h"
+
+// std::less is the default comparator of std::multiset, so it is not spelled out
+typedef std::map<string, double> mapString2Double;
+typedef std::multiset<double> multiSetDB;
+typedef std::multiset<int> MultiSetInt;
+
+/**
+        a leaf together with its height, the element type of RepresentLeafSet
+ */
+class RepLeaf {
+public:
+    Node *leaf;
+    int height;
+
+    /**
+            @param aleaf the leaf node
+            @param aheight height of the leaf, 0 by default
+     */
+    RepLeaf(Node *aleaf, int aheight = 0) : leaf(aleaf), height(aheight) {
+    }
+};
+
+/**
+        nodeheightcmp, for building k-representative leaf set:
+        strict ordering of RepLeaf pointers by ascending height
+ */
+struct nodeheightcmp {
+
+    bool operator()(const RepLeaf *lhs, const RepLeaf *rhs) const {
+        return lhs->height < rhs->height;
+    }
+};
+
+struct IntBranchInfo { // per-branch record; presumably "Int" stands for internal branch - TODO confirm
+    PhyloNode *node1; // presumably one end of the branch - verify against the code that fills this struct
+    PhyloNode *node2; // presumably the other end of the branch
+    double lh_contribution; // log-likelihood contribution of this branch: L(T)-L(T|e=0)
+};
+
+inline int int_branch_cmp(const IntBranchInfo a, const IntBranchInfo b) { // both records are passed by value
+    return (a.lh_contribution < b.lh_contribution); // 1 if a contributes less log-likelihood than b, else 0
+}
+
+/**
+        Representative Leaf Set, stored as a multiset template of STL,
+        sorted in ascending order of leaf's height
+ */
+typedef multiset<RepLeaf*, nodeheightcmp> RepresentLeafSet;
+
+/**
+Important Quartet Puzzling
+
+        @author BUI Quang Minh <minh.bui at univie.ac.at>
+ */
+class IQTree : public PhyloTree {
+public:
+    /**
+            default constructor
+     */
+    IQTree();
+
+    IQTree(Alignment *aln);
+
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+    /**
+            destructor
+     */
+    virtual ~IQTree();
+
+    void init();
+
+    /**
+     * setup all necessary parameters  (declared as virtual needed for phylosupertree)
+     */
+    virtual void initSettings(Params& params);
+
+    void createPLLPartition(Params &params, ostream &pllPartitionFileHandle);
+
+    void initializePLL(Params &params);
+
+    void initializeModel(Params &params);
+
+    /**
+            print tree to .treefile
+            @param params program parameters, field root is taken
+     */
+    void printResultTree(string suffix = "");
+    /**
+            print tree to out
+            @param params program parameters, field root is taken
+            @param out (OUT) output stream
+     */
+    void printResultTree(ostream &out);
+
+    /**
+     * print phylolib tree to a file.
+     * @param suffix suffix string for the tree file
+     */
+    void printPhylolibTree(const char* suffix);
+
+
+    /**
+     *  print model parameters of Phylolib(rates, base frequencies, alpha) to stdout and
+     *  to file
+     */
+    //void printPhylolibModelParams(const char* suffix);
+
+    /**
+        print intermediate tree
+     */
+    void printIntermediateTree(int brtype);
+
+    /**
+            set k-representative parameter
+            @param k_rep k-representative
+     */
+    // void setRepresentNum(int k_rep);
+
+    /**
+            set the probability of deleting sequences for IQP algorithm
+            @param p_del probability of deleting sequences
+     */
+    //void setProbDelete(double p_del);
+
+    double getProbDelete();
+
+    void resetKDelete();
+    void increaseKDelete();
+
+    /**
+            set the number of iterations for the IQPNNI algorithm
+            @param stop_condition stop condition (SC_FIXED_ITERATION, SC_STOP_PREDICT)
+            @param min_iterations the min number of iterations
+            @param max_iterations the maximum number of iterations
+     */
+//    void setIQPIterations(STOP_CONDITION stop_condition, double stop_confidence, int min_iterations, int max_iterations);
+
+    /**
+            @param assess_quartet the quartet assessment, either IQP_DISTANCE or IQP_PARSIMONY
+     */
+    //void setIQPAssessQuartet(IQP_ASSESS_QUARTET assess_quartet);
+
+    /**
+            find the k-representative leaves under the node
+            @param node the node at which the subtree is rooted
+            @param dad the dad node of the considered subtree, to direct the search
+            @param leaves (OUT) the k-representative leaf set
+     */
+    RepresentLeafSet* findRepresentLeaves(vector<RepresentLeafSet*> &leaves, int nei_id,
+            PhyloNode *dad);
+
+    /**
+            clear representative leave sets iteratively, called once a leaf is re-inserted into the tree
+            @param node the node at which the subtree is rooted
+            @param dad the dad node of the considered subtree, to direct the search
+            @param leaves (OUT) the k-representative leaf set
+     */
+    void clearRepresentLeaves(vector<RepresentLeafSet*> &leaves_vec, Node *node, Node *dad);
+
+    /**
+            remove a portion of leaves and reinsert them using the IQP algorithm
+     */
+    void doIQP();
+
+    /**
+     *  @brief remove all branches mapped to splits in \a split
+     *  @param nodes1 node vector containing one end of the branches
+     *  @param nodes2 node vector containing the other end of the branches
+     *  @return number of branches removed
+     */
+    int removeBranches(NodeVector& nodes1, NodeVector& nodes2, SplitGraph& splits);
+
+    /**
+     * 		Perform a series of random NNI moves
+     * 		@param numNNI number of random NNIs
+     */
+    void doRandomNNIs(int numNNI);
+
+    /**
+     *   input model parameters from IQ-TREE to PLL
+     */
+    void inputModelIQTree2PLL();
+
+    /**
+     *  input model parameters from PLL to IQ-TREE
+     */
+    void inputModelPLL2IQTree();
+
+    /**
+     *  get the rate parameters from PLL
+     *  @return double array containing the 6 rates
+     */
+    double* getModelRatesFromPLL();
+
+    /**
+     *  get the alpha parameter from PLL for the GAMMA distribution of rate heterogeneity
+     *  @return alpha parameter
+     */
+    double getAlphaFromPLL();
+
+    /**
+     *  print model parameters from PLL
+     */
+    void pllPrintModelParams();
+
+    /**
+     * input the tree string from IQTree kernel to PLL kernel
+     * @return
+     */
+    double inputTree2PLL(string treestring, bool computeLH = true);
+
+    //bool containPosNNI(vector<NNIMove> posNNIs);
+
+    /**
+     * Perturb the tree for the next round of local search by swapping the positions of 2 random leaves
+     * @param nbDist The minimum distance between the 2 nodes that are swapped
+     * @param nbTimes Number of times that the swap operations are carried out
+     * @return The new loglikelihood of the tree
+     */
+    double perturb(int times);
+
+    /**
+     * TODO
+     * @param node1
+     * @param node2
+     * @return
+     */
+    double swapTaxa(PhyloNode *node1, PhyloNode *node2);
+
+    /**
+            perform tree search
+            @return best likelihood found
+     */
+    double doTreeSearch();
+
+    /**
+     *  Wrapper function that uses either PLL or IQ-TREE to optimize the branch length
+     *  @param maxTraversal
+     *  	maximum number of tree traversal for branch length optimization
+     *  @return NEWICK tree string
+     */
+    string optimizeBranches(int maxTraversal = 100);
+
+    /**
+     *  Wrapper function to compute tree log-likelihood.
+     *  This function with call either PLL or IQ-TREE to compute tree log-likelihood
+     */
+    void computeLogL();
+
+    /**
+     *	Print numBestScore found so far, starting from the highest
+     */
+    void printBestScores(int numBestScore);
+
+    /****************************************************************************
+            Fast Nearest Neighbor Interchange by maximum likelihood
+     ****************************************************************************/
+
+
+    /**
+            This implement the fastNNI algorithm proposed in PHYML paper
+            TUNG: this is a virtual function, so it will be called automatically by optimizeNNIBranches()
+            @return best likelihood found
+            @param skipped (OUT) 1 if current iteration is skipped, otherwise 0
+            @param nni_count (OUT) the number of single NNI moves proceeded so far
+     */
+    double optimizeNNI(int &nni_count, int &nni_steps);
+
+    /**
+     * 		Do fastNNI using PLL
+     *
+     *      @param nniCount (OUT) number of NNIs applied
+     * 		@param nniSteps (OUT) number of NNI steps done
+     */
+    double pllOptimizeNNI(int &nniCount, int &nniSteps, SearchInfo &searchinfo);
+
+    /**
+     * 		@brief Perform NNI search on the current tree topology
+     * 		This function will automatically use the selected kernel (either PLL or IQ-TREE)
+     *
+     * 		@param nniCount (OUT) number of NNIs applied
+     * 		@param nniSteps (OUT) number of NNI steps done
+     * 		@return the new NEWICK string
+     */
+    string doNNISearch(int &nniCount, int &nniSteps);
+
+    /**
+            @brief evaluate all NNIs and store them in possilbleNNIMoves list
+            @param  node    evaluate all NNIs of the subtree rooted at node
+            @param  dad     a neighbor of \p node which does not belong to the subtree
+                            being considered (used for traverse direction)
+
+     */
+    void evalNNIs(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+     * @brief Evaluate all NNIs on branch defined by \a nodes1 and \a nodes2
+     *
+     * @param[in] nodes1 contains one ends of the branches for NNI evaluation
+     * @param[in] nodes2 contains the other ends of the branches for NNI evaluation
+     */
+    void evalNNIs(NodeVector &nodes1, NodeVector &nodes2);
+
+    /**
+            search all positive NNI move on the current tree and save them
+            on the possilbleNNIMoves list
+     */
+    void evalNNIsSort(bool approx_nni);
+
+    /**
+            apply nni2apply NNIs from the non-conflicting NNI list
+            @param nni2apply number of NNIs to apply from the list
+            @param changeBran whether or not the computed branch lengths should be applied
+     */
+    virtual void doNNIs(int nni2apply, bool changeBran = true);
+
+    /**
+     *  Restore the old 5 branch lengths stored in the NNI move.
+     *  This is called after an NNI is reverted.
+     *  @param nnimove the NNI move currently in consideration
+     */
+    //void restoreNNIBranches(NNIMove nnimove);
+
+    /**
+            generate non conflicting NNI moves.
+            moves are saved in vec_nonconf_nni
+     */
+    void genNonconfNNIs();
+
+    /**
+            add a NNI move to the list of possible NNI moves;
+     */
+    void addPositiveNNIMove(NNIMove myMove);
+
+    /**
+     * 	Save all the current branch lengths
+     */
+    void saveBranches(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+     * 	 Restore the branch lengths from the saved values
+     */
+    virtual void restoreAllBrans(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+     * Get the branch length of the branch node1-node2
+     * @param node1
+     * @param node2
+     * @return the branch length
+     */
+    double getBranLen(PhyloNode *node1, PhyloNode *node2);
+
+
+    /**
+            Described in PhyML paper: apply change to branch that does not
+            correspond to a swap with the following formula l = l + lamda(la - l)
+            @param node1 the first node of the branch
+            @param node2 the second node of the branch
+     */
+    void changeBranLen(PhyloNode *node1, PhyloNode *node2, double branLen);
+
+    /**
+     * Estimate the 95% quantile of the distribution of N (see paper for more
+                                                           details)
+     * @return the estimated value
+     */
+    inline double estN95(void);
+
+    /**
+     * Estimate the median of the distribution of N (see paper for more
+                                                           details)
+     * @return the estimated value
+     */
+    double getAvgNumNNI(void);
+
+    /**
+     * Estimate the median of the distribution of N (see paper for more
+                                                          details)
+     * @return the estimated value
+     */
+    double estDeltaMedian(void);
+
+    /**
+     * Estimate the 95% quantile of the distribution of DELTA (see paper for
+                                                               more detail)
+     * @return the estimated value
+     */
+    inline double estDelta95(void);
+
+    /**
+            current parsimony score of the tree
+     */
+    int cur_pars_score;
+
+//    bool enable_parsimony;
+    /**
+            stopping rule
+     */
+    StopRule stop_rule;
+
+    /**
+     *      Parsimony scores, used for linear regression
+     */
+    double* pars_scores;
+
+    /**
+        Log-likelihood variance
+     */
+    double logl_variance;
+
+    /**
+     *      The corresponding log-likelihood scores, computed independently from the parsimony
+     *      scores
+     */
+    double* lh_scores;
+
+    Linear* linRegModel;
+
+
+    inline double getNNICutoff() { // read-only accessor
+        return nni_cutoff; // current value of the nni_cutoff member
+    }
+
+    /*
+     *  Contains a sorted list of all NNIs (2n-6) evaluated for the current best tree
+     *  The last element (nniListOfBestTree.back()) is the best NNI
+     */
+    vector<pllNNIMove> nniListOfBestTree;
+
+
+    /**
+     *  information and parameters for the tree search procedure
+     */
+    SearchInfo searchinfo;
+
+    /**
+     *  Vector contains the number of NNIs used at each iteration
+     */
+    vector<int> vecNumNNI;
+
+    /**
+     * Do memory allocation and initialize parameter for UFBoot to run with PLL
+     */
+    void pllInitUFBootData();
+
+    /**
+     * Do memory deallocation for UFBoot data (PLL mode)
+     */
+    void pllDestroyUFBootData();
+
+    /**
+     * DTH:
+     * Substitute bases in seq according to PLL's rules
+     * This function should be updated if PLL's rules change.
+     * @param seq: data of some sequence to be substituted
+     * @param dataType: PLL_DNA_DATA or PLL_AA_DATA
+     */
+   void pllBaseSubstitute (char *str, int dataType);
+
+   /*
+    * An array to map site index in pllAlignment into IQTree pattern index
+    * Born due to the order difference of these two
+    * Will be deallocated in pllDestroyUFBootData()
+    */
+   int * pll2iqtree_pattern_index;
+
+   /*
+    * Build pll2iqtree_pattern_index
+    * Must be called AFTER initializing PLL model
+    */
+   void pllBuildIQTreePatternIndex();
+
+   /**
+    * FOR TESTING:
+    * Write to log file the freq of pllAlignment sites, and
+    * freq of bootstrap site stored in pllUFBootDataPtr->boot_samples
+    */
+   void pllLogBootSamples(int** pll_boot_samples, int nsamples, int npatterns);
+
+   /**
+    * Convert certain arrays in pllUFBootDataPtr
+    * into IQTree data structures
+    * to be usable in IQTree::summarizeBootstrap()
+    */
+   void pllConvertUFBootData2IQTree();
+
+protected:
+    /**
+            criterion to assess important quartet
+     */
+    IQP_ASSESS_QUARTET iqp_assess_quartet;
+
+
+    /**
+     * Taxa set
+     */
+    NodeVector taxaSet;
+
+    /**
+     * confidence value for number of NNIs found in one iteration
+     */
+    int nni_count_est;
+
+    /**
+     * confidence value for likelihood improvement made by one NNI
+     */
+    double nni_delta_est;
+
+
+    /**
+     *  Vector contains the approximated improvement per NNI at each iteration
+     */
+    vector<double> vecImpProNNI;
+
+    /**
+        List of positive NNI for the current tree;
+     */
+    vector<NNIMove> plusNNIs;
+
+    /**
+        List of non-conflicting NNIs for the current tree;
+     */
+    vector<NNIMove> nonConfNNIs;
+
+    /**
+     *  NNIs that have been applied in the previous step
+     */
+    vector<NNIMove> appliedNNIs;
+
+    /**
+        Optimal branch lengths
+     */
+    mapString2Double optBrans;
+
+    /**
+     *  @brief get branches, on which NNIs are evaluated for the next NNI step.
+     *  @param[out] nodes1 one ends of the branches
+     *  @param[out] nodes2 the other ends of the branches
+     *  @param[in] nnis NNIs that have been previously applied
+     */
+    void getBranchesForNNI(NodeVector& nodes1, NodeVector& nodes2, vector<NNIMove>& nnis);
+
+    /**
+     *  Use fastNNI heuristic
+     */
+    bool fastNNI;
+
+    /**
+            Original branch lengths
+     */
+    mapString2Double orgBrans;
+
+    int k_delete, k_delete_min, k_delete_max, k_delete_stay;
+
+    /**
+            number of representative leaves for IQP step
+     */
+    int k_represent;
+
+public:
+
+    /**
+     *  Generate the initial candidate tree set
+     *  @param nParTrees number of parsimony trees to generate
+     *  @param nNNITrees number of NNI locally optimal trees to generate
+     */
+    void initCandidateTreeSet(int nParTrees, int nNNITrees);
+
+
+    /**
+     * Generate the initial tree (usually used for model parameter estimation)
+     * @param dist_file only needed for BIONJ tree
+     */
+    void computeInitialTree(string &dist_file, LikelihoodKernel kernel);
+
+    /**
+     *  @brief: optimize model parameters on the current tree
+     *  either IQ-TREE or PLL
+     *  @param printInfo to print model parameters to the screen or not
+     *  @param epsilon likelihood epsilon for optimization
+     *
+     */
+    string optimizeModelParameters(bool printInfo = false, double epsilon = -1);
+
+    /**
+     *  variable storing the current best tree topology
+     */
+    topol* pllBestTree;
+
+    CandidateSet candidateTrees;
+
+
+    /****** following variables are for ultra-fast bootstrap *******/
+
+    /** TRUE to save also branch lengths into treels_newick */
+    bool save_all_br_lens;
+
+    /**
+        this keeps the list of intermediate trees.
+        it will be activated if params.avoid_duplicated_trees is TRUE.
+     */
+    StringIntMap treels;
+
+    /** pattern log-likelihood vector for each treels */
+    vector<double* > treels_ptnlh;
+
+    /** tree log-likelihood for each treels */
+    DoubleVector treels_logl;
+
+    /** NEWICK string for each treels */
+    StrVector treels_newick;
+
+    /** maximum number of distinct candidate trees (tau parameter) */
+    int max_candidate_trees;
+
+    /** log-likelihood threshold (l_min) */
+    double logl_cutoff;
+
+    /** vector of bootstrap alignments generated */
+    vector<BootValType* > boot_samples;
+
+    /** newick string of corresponding bootstrap trees */
+    IntVector boot_trees;
+
+	/** number of multiple optimal trees per replicate */
+	IntVector boot_counts;
+
+    /** corresponding RELL log-likelihood */
+    DoubleVector boot_logl;
+
+    /** Set of splits occurring in bootstrap trees */
+    vector<SplitGraph*> boot_splits;
+
+    /** log-likelihood of bootstrap consensus tree */
+    double boot_consense_logl;
+
+    /** Corresponding map for set of splits occurring in bootstrap trees */
+    //SplitIntMap boot_splits_map;
+
+    /** summarize all bootstrap trees */
+    void summarizeBootstrap(Params &params, MTreeSet &trees);
+
+    void summarizeBootstrap(Params &params);
+
+    /** summarize bootstrap trees into split set */
+    void summarizeBootstrap(SplitGraph &sg);
+
+    void writeUFBootTrees(Params &params);
+
+    /** @return bootstrap correlation coefficient for assessing convergence */
+    double computeBootstrapCorrelation();
+
+	int getDelete() const;
+	void setDelete(int _delete);
+
+protected:
+    /**** NNI cutoff heuristic *****/
+    /**
+     */
+    vector<NNIInfo> nni_info;
+
+
+    bool estimate_nni_cutoff;
+
+    double nni_cutoff;
+
+    bool nni_sort;
+
+    bool testNNI;
+
+    ofstream outNNI;
+protected:
+
+    bool print_tree_lh;
+
+    int write_intermediate_trees;
+
+    ofstream out_treels, out_treelh, out_sitelh, out_treebetter;
+
+    void estimateNNICutoff(Params* params);
+
+    virtual void saveCurrentTree(double logl); // save current tree
+
+    void saveNNITrees(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    int duplication_counter;
+
+    /**
+            number of IQPNNI iterations
+     */
+    //int iqpnni_iterations;
+
+    /**
+            bonus values of all branches, used for IQP algorithm
+     */
+    //double *bonus_values;
+
+    /**
+            delete a set of leaves from tree (with the probability p_delete), assume tree is bifurcating
+            @param del_leaves (OUT) the list of deleted leaves
+     */
+    void deleteLeaves(PhyloNodeVector &del_leaves);
+
+    void deleteNonTabuLeaves(PhyloNodeVector &del_leaves);
+
+    /**
+     * 		delete a set of leaves from tree
+     * 		non-cherry leaves are selected first
+     * 		@param del_leaves (OUT) the list of deleted leaves
+     */
+    void deleteNonCherryLeaves(PhyloNodeVector &del_leaves);
+
+    /**
+            reinsert the whole list of leaves back into the tree
+            @param del_leaves the list of deleted leaves, returned by deleteLeaves() function
+     */
+    virtual void reinsertLeaves(PhyloNodeVector &del_leaves);
+
+    void reinsertLeavesByParsimony(PhyloNodeVector &del_leaves);
+
+
+    void doParsimonyReinsertion();
+
+
+    /**
+            assess a quartet with four taxa. Current implementation uses the four-point condition
+            based on distance matrix for quick evaluation.
+            @param leaf0 one of the leaves in the existing sub-tree
+            @param leaf1 one of the leaves in the existing sub-tree
+            @param leaf2 one of the leaves in the existing sub-tree
+            @param del_leaf a leaf that was deleted (not in the existing sub-tree)
+            @return 0, 1, or 2 depending on whether del_leaf should be in the subtree containing leaf0, leaf1, or leaf2, respectively
+     */
+    int assessQuartet(Node *leaf0, Node *leaf1, Node *leaf2, Node *del_leaf);
+
+    /**
+            assess a quartet with four taxa using parsimony
+            @param leaf0 one of the leaves in the existing sub-tree
+            @param leaf1 one of the leaves in the existing sub-tree
+            @param leaf2 one of the leaves in the existing sub-tree
+            @param del_leaf a leaf that was deleted (not in the existing sub-tree)
+            @return 0, 1, or 2 depending on whether del_leaf should be in the subtree containing leaf0, leaf1, or leaf2, respectively
+     */
+    int assessQuartetParsimony(Node *leaf0, Node *leaf1, Node *leaf2,
+            Node *del_leaf);
+
+    /**
+            assess the important quartets around a virtual root of the tree.
+            This function will assign bonus points to branches by updating the variable 'bonus_values'
+            @param cur_root the current virtual root
+            @param del_leaf a leaf that was deleted (not in the existing sub-tree)
+     */
+    void assessQuartets(vector<RepresentLeafSet*> &leaves_vec, PhyloNode *cur_root, PhyloNode *del_leaf);
+
+    /**
+            initialize the bonus points to ZERO
+            @param node the root of the sub-tree
+            @param dad dad of 'node', used to direct the recursion
+     */
+    void initializeBonus(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            raise the bonus points for all branches in the subtree rooted at a node
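+            @param nei neighbor at which the sub-tree starts (NOTE(review): confirm against the implementation)
+            @param dad dad of the sub-tree root, used to direct the recursion
+            @param bonus amount by which the bonus points are raised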
+     */
+    void raiseBonus(Neighbor *nei, Node *dad, double bonus);
+
+    /**
+            Bonuses are stored in a partial fashion. This function will propagate the bonus at every branch
+            into the subtree at this branch.
+            @param node the root of the sub-tree
+            @param dad dad of 'node', used to direct the recursion
+            @return the partial bonus of the branch (node -> dad)
+     */
+    double computePartialBonus(Node *node, Node* dad);
+
+    /**
+            determine the list of branches with the same best bonus point
+            @param best_score the best bonus determined by findBestBonus()
+            @param best_nodes (OUT) vector of one ends of the branches with highest bonus point
+            @param best_dads (OUT) vector of the other ends of the branches with highest bonus point
+            @param node the root of the sub-tree
+            @param dad dad of 'node', used to direct the recursion
+     */
+    void findBestBonus(double &best_score, NodeVector &best_nodes, NodeVector &best_dads, Node *node = NULL, Node *dad = NULL);
+
+    void estDeltaMin();
+
+};
+
+void estimateNNICutoff(Params &params);
+
+
+#endif
diff --git a/iqtree_config.h.in b/iqtree_config.h.in
new file mode 100644
index 0000000..f19b326
--- /dev/null
+++ b/iqtree_config.h.in
@@ -0,0 +1,14 @@
+#define iqtree_VERSION_MAJOR @iqtree_VERSION_MAJOR@
+#define iqtree_VERSION_MINOR @iqtree_VERSION_MINOR@
+#define iqtree_VERSION_PATCH @iqtree_VERSION_PATCH@
+
+/* does the platform provide gettimeofday functions? */
+#cmakedefine HAVE_GETTIMEOFDAY
+/* does the platform provide getrusage functions? */
+#cmakedefine HAVE_GETRUSAGE
+/* does the platform provide popen functions? */
+/*#cmakedefine HAVE_POPEN*/
+/* does the platform provide pclose functions? */
+/*#cmakedefine HAVE_PCLOSE*/
+/* does the platform provide GlobalMemoryStatusEx functions? */
+#cmakedefine HAVE_GLOBALMEMORYSTATUSEX
\ No newline at end of file
diff --git a/lbfgsb/CMakeLists.txt b/lbfgsb/CMakeLists.txt
new file mode 100644
index 0000000..f6ac993
--- /dev/null
+++ b/lbfgsb/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_library(lbfgsb
+lbfgsb_new.cpp
+)
diff --git a/lbfgsb/lbfgsb_new.cpp b/lbfgsb/lbfgsb_new.cpp
new file mode 100644
index 0000000..3e829dc
--- /dev/null
+++ b/lbfgsb/lbfgsb_new.cpp
@@ -0,0 +1,4605 @@
+/*
+ *
+ * lbfgsb_new.cpp
+ * HAL_HAS
+ *
+ * CSIRO Open Source Software License Agreement (GPLv3)
+ * Copyright (c) 2014, Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.
+ * All rights reserved. CSIRO is willing to grant you a license to HAL-HAS on the terms of the GNU General Public
+ * License version 3 as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except
+ * where otherwise indicated for third party material.
+ * The following additional terms apply under clause 7 of that license:
+ * EXCEPT AS EXPRESSLY STATED IN THIS AGREEMENT AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE
+ * IS PROVIDED "AS-IS". CSIRO MAKES NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY
+ * OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, THE ABSENCE
+ * OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE.
+ * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO BE LIABLE ON ANY LEGAL THEORY (INCLUDING,
+ * WITHOUT LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES
+ * OR OTHER LIABILITY HOWSOEVER INCURRED.  WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF
+ * LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS;
+ * OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; OR ANY SPECIAL,
+ * INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS
+ * AGREEMENT, ACCESS OF THE SOFTWARE OR ANY OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO HAS BEEN ADVISED OF
+ * THE POSSIBILITY OF SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.
+ * APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY APPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS,
+ * OR IMPOSES OBLIGATIONS OR LIABILITY ON CSIRO THAT CANNOT BE EXCLUDED, RESTRICTED OR MODIFIED TO THE FULL EXTENT
+ * SET OUT IN THE EXPRESS TERMS OF THIS CLAUSE ABOVE "CONSUMER GUARANTEES".  TO THE EXTENT THAT SUCH CONSUMER
+ * GUARANTEES CONTINUE TO APPLY, THEN TO THE FULL EXTENT PERMITTED BY THE APPLICABLE LEGISLATION, THE LIABILITY
+ * OF CSIRO UNDER THE RELEVANT CONSUMER GUARANTEE IS LIMITED (WHERE PERMITTED AT CSIRO’S OPTION) TO ONE OF FOLLOWING
+ * REMEDIES OR SUBSTANTIALLY EQUIVALENT REMEDIES:
+ * (a)               THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT
+ *                   SERVICES AGAIN;
+ * (b)               THE REPAIR OF THE SOFTWARE;
+ * (c)               THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE
+ *                   RELEVANT SERVICES SUPPLIED AGAIN, OR HAVING THE SOFTWARE REPAIRED.
+ * IN THIS CLAUSE, CSIRO INCLUDES ANY THIRD PARTY AUTHOR OR OWNER OF ANY PART OF THE SOFTWARE OR MATERIAL DISTRIBUTED
+ * WITH IT.  CSIRO MAY ENFORCE ANY RIGHTS ON BEHALF OF THE RELEVANT THIRD PARTY.
+ * Third Party Components
+ * The following third party components are distributed with the Software.  You agree to comply with the license
+ * terms for these components as part of accessing the Software.  Other third party software may also be identified
+ * in separate files distributed with the Software.
+ * ___________________________________________________________________
+ * 
+ * R : A Computer Language for Statistical Data Analysis version 3.0.1 (http://cran.r-project.org/src/base/R-3/R-3.0.1.tar.gz)
+ * Copyright (C) 2000-2004 The R Core Team
+ * This software is licensed under GNU GPL
+ * 
+ * JACOBI_EIGENVALUE.C (http://people.sc.fsu.edu/~jburkardt/c_src/jacobi_eigenvalue/jacobi_eigenvalue.c)
+ * Copyright (C) 2003-2013 John Burkardt
+ * This software is licensed under GNU LGPL (http://www.gnu.org/licenses/lgpl.html)
+ * ___________________________________________________________________
+ */
+
+
+#include "lbfgsb_new.h"
+#include <algorithm>
+//using namespace std;
+
+static int c__1 = 1;   /* integer constant 1 kept in a variable; presumably passed by address to f2c-style routines - TODO confirm at the call sites */
+static int c__11 = 11; /* integer constant 11, same presumed purpose - TODO confirm at the call sites */
+
+#if 0
+
+// Convenience wrapper around lbfgsb() for problems that only have lower bounds.
+//   n     : the number of variables
+//   x     : initial values of the variables; holds the updated values afterwards
+//   l     : lower bounds of the variables
+//   maxit : max # of iterations
+//   ex    : the wrapped variables for the objective function
+void lbfgsb_R(int n, double* x, double* l, int maxit, void* ex) {
+	// --- fixed L-BFGS-B settings ---
+	// number of BFGS updates retained in the "L-BFGS-B" method (default 5)
+	const int m = 5;
+	// Convergence occurs when the reduction in the objective is within this
+	// factor of the machine tolerance; 1e7 is a tolerance of about 1e-8.
+	const double factr = 1e7;
+	// Tolerance on the projected gradient in the current search direction;
+	// zero suppresses the check.
+	const double pgtol = 0;
+	// Tracing level (non-negative); positive values produce progress output.
+	const int trace = 0;
+	// Reporting frequency in iterations, only relevant when trace is positive.
+	const int nREPORT = 10;
+
+	// --- bounds ---
+	// Bound codes: 0 unbounded; 1 lower bounded; 2 lower & upper; 3 upper bounded.
+	// Every variable is lower bounded only, hence no upper-bound array is passed.
+	double *u = NULL;
+	int *nbd = new int[n];
+	std::fill_n(nbd, n, 1);
+
+	// --- outputs of lbfgsb(); none of them is inspected here ---
+	double Fmin;
+	int fail, fncount, grcount;
+	char msg[100];
+
+	lbfgsb(n, m, x, l, u, nbd, &Fmin, fn, gr2, &fail, ex,
+			factr, pgtol, &fncount, &grcount, maxit, msg, trace, nREPORT);
+
+	delete[] nbd;
+}
+
+// Convenience wrapper around lbfgsb() for problems with lower and upper bounds.
+//   n     : the number of variables
+//   x     : initial values of the variables; holds the updated values afterwards
+//   l     : lower bounds of the variables
+//   u     : upper bounds of the variables
+//   maxit : max # of iterations
+//   ex    : the wrapped variables for the objective function
+void lbfgsb_R2(int n, double* x, double* l, double* u, int maxit, void* ex) {
+	// --- fixed L-BFGS-B settings ---
+	// number of BFGS updates retained in the "L-BFGS-B" method (default 5)
+	const int m = 5;
+	// Convergence occurs when the reduction in the objective is within this
+	// factor of the machine tolerance; 1e7 is a tolerance of about 1e-8.
+	const double factr = 1e7;
+	// Tolerance on the projected gradient in the current search direction;
+	// zero suppresses the check.
+	const double pgtol = 0;
+	// Tracing level (non-negative); positive values produce progress output.
+	const int trace = 0;
+	// Reporting frequency in iterations, only relevant when trace is positive.
+	const int nREPORT = 10;
+
+	// --- bounds ---
+	// Bound codes: 0 unbounded; 1 lower bounded; 2 lower & upper; 3 upper bounded.
+	// Every variable is bounded on both sides.
+	int *nbd = new int[n];
+	std::fill_n(nbd, n, 2);
+
+	// --- outputs of lbfgsb(); none of them is inspected here ---
+	double Fmin;
+	int fail, fncount, grcount;
+	char msg[100];
+
+	lbfgsb(n, m, x, l, u, nbd, &Fmin, fn, gr2, &fail, ex,
+			factr, pgtol, &fncount, &grcount, maxit, msg, trace, nREPORT);
+
+	delete[] nbd;
+}
+
+#endif
+
+// ========================================================= //
+// FUNCTIONS converted from R v.3.0.1
+// ========================================================= //
+
+/**
+ * Minimise a function under box constraints with the L-BFGS-B method.
+ *
+ * Drives setulb() in its reverse-communication loop: whenever the returned
+ * task starts with "FG" the objective and gradient callbacks are evaluated at
+ * x, "NEW_X" counts one iteration, and any other task ends the loop.
+ *
+ * @param n        number of variables; for n == 0 only fminfn is evaluated once
+ * @param m        number of variable metric corrections kept (see setulb)
+ * @param x        (IN/OUT) starting point, overwritten with the current iterate
+ * @param l        lower bounds
+ * @param u        upper bounds
+ * @param nbd      bound type per variable: 0 none, 1 lower, 2 both, 3 upper
+ * @param Fmin     (OUT) last objective value computed (0 if none was computed)
+ * @param fminfn   objective callback; a non-finite result aborts the program
+ * @param fmingr   gradient callback, called as fmingr(n, x, g, ex)
+ * @param fail     (OUT) 0 = converged, 1 = more than maxit iterations,
+ *                 51 = task "WARN...", 52 = task "ERROR...", unexpected task,
+ *                 or the work arrays could not be allocated
+ * @param ex       opaque pointer handed to both callbacks
+ * @param factr    function-reduction tolerance factor (see setulb)
+ * @param pgtol    projected-gradient tolerance (see setulb)
+ * @param fncount  (OUT) number of function evaluations
+ * @param grcount  (OUT) number of gradient evaluations (same counter as fncount)
+ * @param maxit    maximum number of iterations
+ * @param msg      (OUT) final task string; needs room for at least 60 bytes
+ * @param trace    0 = silent; 1 = progress line every nREPORT iterations;
+ *                 2..6 = setulb iprint levels 0, nREPORT, 99, 100, 101;
+ *                 any non-zero value prints the final summary
+ * @param nREPORT  reporting period, must be > 0 (otherwise the program exits)
+ */
+void lbfgsb(int n, int m, double *x, double *l, double *u, int *nbd,
+		double *Fmin, optimfn fminfn, optimgr fmingr, int *fail,
+		void *ex, double factr, double pgtol,
+		int *fncount, int *grcount, int maxit, char *msg,
+		int trace, int nREPORT)
+{
+	char task[60];
+	/* f and isave are read after the loop even when setulb stops before the
+	   first evaluation, so they get defined values up front */
+	double f = 0.0, *g, dsave[29], *wa;
+	int tr = -1, iter = 0, *iwa, isave[44] = {0}, lsave[4] = {0, 0, 0, 0};
+
+	if(n == 0) { /* not handled in setulb */
+		*fncount = 1;
+		*grcount = 0;
+		*Fmin = fminfn(n, u, ex);
+		strcpy(msg, "NOTHING TO DO");
+		*fail = 0;
+		return;
+	}
+	if (nREPORT <= 0) {
+		cerr << "REPORT must be > 0 (method = \"L-BFGS-B\")" << endl;
+		exit(1);
+	}
+	switch(trace) {
+	case 2: tr = 0; break;
+	case 3: tr = nREPORT; break;
+	case 4: tr = 99; break;
+	case 5: tr = 100; break;
+	case 6: tr = 101; break;
+	default: tr = -1; break;
+	}
+
+	*fail = 0;
+	/* element counts are computed in size_t rather than int */
+	const size_t nvar = (size_t) n, ncor = (size_t) m;
+	const size_t wa_len = 2*ncor*nvar + 4*nvar + 11*ncor*ncor + 8*ncor;
+	g = (double *) calloc(nvar, sizeof(double));
+	/* this needs to be zeroed for snd in mainlb to be zeroed, hence calloc */
+	wa = (double *) calloc(wa_len, sizeof(double));
+	iwa = (int *) calloc(nvar, 3 * sizeof(int));
+	if (g == NULL || wa == NULL || iwa == NULL) {
+		/* report through *fail like any other error instead of handing
+		   NULL work arrays to setulb */
+		free(g);
+		free(wa);
+		free(iwa);
+		*Fmin = f;
+		*fncount = *grcount = 0;
+		strcpy(msg, "ERROR: CANNOT ALLOCATE WORK ARRAYS");
+		*fail = 52;
+		return;
+	}
+	strcpy(task, "START");
+	while(1) {
+		/* Main workhorse setulb(): returns with its next request in task */
+		setulb(n, m, x, l, u, nbd, &f, g, factr, &pgtol, wa, iwa, task,
+				tr, lsave, isave, dsave);
+		if (strncmp(task, "FG", 2) == 0) {
+			f = fminfn(n, x, ex);
+			if (!isfinite(f)) {
+				cerr << "L-BFGS-B needs finite values of 'fn'" << endl;
+				exit(1);
+			}
+			fmingr(n, x, g, ex);
+		} else if (strncmp(task, "NEW_X", 5) == 0) {
+			iter++;
+			if(trace == 1 && (iter % nREPORT == 0)) {
+				cout << "iter " << iter << " value " << f << endl;
+			}
+			if (iter > maxit) {
+				*fail = 1;
+				break;
+			}
+		} else if (strncmp(task, "WARN", 4) == 0) {
+			*fail = 51;
+			break;
+		} else if (strncmp(task, "CONV", 4) == 0) {
+			break;
+		} else { /* "ERROR..." or some other condition that is not supposed to happen */
+			*fail = 52;
+			break;
+		}
+	}
+	*Fmin = f;
+	/* isave(34) in the setulb description: total function and gradient evaluations */
+	*fncount = *grcount = isave[33];
+	if (trace) {
+		cout << "final value " << *Fmin << endl;
+		if (iter < maxit && *fail == 0)
+			cout << "converged" << endl;
+		else
+			cout << "stopped after " << iter << " iterations\n";
+	}
+	strcpy(msg, task);
+	free(g);
+	free(wa);
+	free(iwa);
+}
+
+void setulb(int n, int m, double *x, double *l, double *u, int *nbd,
+		double *f, double *g, double factr, double *pgtol,
+		double *wa, int * iwa, char *task, int iprint,
+		int *lsave, int *isave, double *dsave)
+{
+	/*     ************
+
+	 Subroutine setulb
+
+	 This subroutine partitions the working arrays wa and iwa, and
+	 then uses the limited memory BFGS method to solve the bound
+	 constrained optimization problem by calling mainlb.
+	 (The direct method will be used in the subspace minimization.)
+
+	 Note on numbering: this description comes from the Fortran
+	 original, so isave(k), dsave(k) and lsave(k) below are 1-based.
+	 For a caller that indexes its arrays from zero the same element
+	 is [k-1] (e.g. lbfgsb() reads isave(34) as isave[33]).
+
+	 n is an integer variable.
+	 On entry n is the dimension of the problem.
+	 On exit n is unchanged.
+
+	 m is an integer variable.
+	 On entry m is the maximum number of variable metric corrections
+	 used to define the limited memory matrix.
+	 On exit m is unchanged.
+
+	 x is a double precision array of dimension n.
+	 On entry x is an approximation to the solution.
+	 On exit x is the current approximation.
+
+	 l is a double precision array of dimension n.
+	 On entry l is the lower bound on x.
+	 On exit l is unchanged.
+
+	 u is a double precision array of dimension n.
+	 On entry u is the upper bound on x.
+	 On exit u is unchanged.
+
+	 nbd is an integer array of dimension n.
+	 On entry nbd represents the type of bounds imposed on the
+	 variables, and must be specified as follows:
+	 nbd(i)=0 if x(i) is unbounded,
+	 1 if x(i) has only a lower bound,
+	 2 if x(i) has both lower and upper bounds, and
+	 3 if x(i) has only an upper bound.
+	 On exit nbd is unchanged.
+
+	 f is a double precision variable.
+	 On first entry f is unspecified.
+	 On final exit f is the value of the function at x.
+
+	 g is a double precision array of dimension n.
+	 On first entry g is unspecified.
+	 On final exit g is the value of the gradient at x.
+
+	 factr is a double precision variable.
+	 On entry factr >= 0 is specified by the user.    The iteration
+	 will stop when
+
+	 (f^k - f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= factr*epsmch
+
+	 where epsmch is the machine precision, which is automatically
+	 generated by the code. Typical values for factr: 1.d+12 for
+	 low accuracy; 1.d+7 for moderate accuracy; 1.d+1 for extremely
+	 high accuracy.
+	 On exit factr is unchanged.
+
+	 pgtol is a double precision variable.
+	 On entry pgtol >= 0 is specified by the user.    The iteration
+	 will stop when
+
+	 max{|proj g_i | i = 1, ..., n} <= pgtol
+
+	 where pg_i is the ith component of the projected gradient.
+	 On exit pgtol is unchanged.
+
+	 wa is a double precision working array of length
+	 (2mmax + 4)nmax + 11mmax^2 + 8mmax.
+
+	 iwa is an integer working array of length 3nmax.
+
+	 task is a working string of characters of length 60 indicating
+	 the current job when entering and quitting this subroutine.
+	 The caller sets task to "START" before the first call; only then
+	 is the partition of wa computed and stored in isave(1..16).
+	 Every later call reads the partition back from isave, so isave
+	 and wa must be handed back unchanged between calls.
+
+	 iprint is an integer variable that must be set by the user.
+	 It controls the frequency and type of output generated:
+	 iprint<0    no output is generated;
+	 iprint=0    print only one line at the last iteration;
+	 0<iprint<99 print also f and |proj g| every iprint iterations;
+	 iprint=99   print details of every iteration except n-vectors;
+	 iprint=100  print also the changes of active set and final x;
+	 iprint>100  print details of every iteration including x and g;
+	 When iprint > 0, the file iterate.dat will be created to
+	 summarize the iteration.
+
+	 csave is a working string of characters of length 60.
+	 (In this version csave is a local buffer of setulb, not a
+	 parameter; it is passed on to mainlb.)
+
+	 lsave is a logical working array of dimension 4.
+	 On exit with 'task' = NEW_X, the following information is
+	 available:
+	 If lsave(1) = .true. then  the initial X has been replaced by
+	 its projection in the feasible set;
+	 If lsave(2) = .true. then  the problem is constrained;
+	 If lsave(3) = .true. then  each variable has upper and lower
+	 bounds;
+
+	 isave is an integer working array of dimension 44.
+	 On exit with 'task' = NEW_X, the following information is
+	 available:
+	 isave(22) = the total number of intervals explored in the
+	 search of Cauchy points;
+	 isave(26) = the total number of skipped BFGS updates before
+	 the current iteration;
+	 isave(30) = the number of current iteration;
+	 isave(31) = the total number of BFGS updates prior the current
+	 iteration;
+	 isave(33) = the number of intervals explored in the search of
+	 Cauchy point in the current iteration;
+	 isave(34) = the total number of function and gradient
+	 evaluations;
+	 isave(36) = the number of function value or gradient
+	 evaluations in the current iteration;
+	 if isave(37) = 0  then the subspace argmin is within the box;
+	 if isave(37) = 1  then the subspace argmin is beyond the box;
+	 isave(38) = the number of free variables in the current
+	 iteration;
+	 isave(39) = the number of active constraints in the current
+	 iteration;
+	 n + 1 - isave(40) = the number of variables leaving the set of
+	 active constraints in the current iteration;
+	 isave(41) = the number of variables entering the set of active
+	 constraints in the current iteration.
+
+	 dsave is a double precision working array of dimension 29.
+	 On exit with 'task' = NEW_X, the following information is
+	 available:
+	 dsave(1) = current 'theta' in the BFGS matrix;
+	 dsave(2) = f(x) in the previous iteration;
+	 dsave(3) = factr*epsmch;
+	 dsave(4) = 2-norm of the line search direction vector;
+	 dsave(5) = the machine precision epsmch generated by the code;
+	 dsave(7) = the accumulated time spent on searching for
+	 Cauchy points;
+	 dsave(8) = the accumulated time spent on
+	 subspace minimization;
+	 dsave(9) = the accumulated time spent on line search;
+	 dsave(11) = the slope of the line search function at
+	 the current point of line search;
+	 dsave(12) = the maximum relative step length imposed in
+	 line search;
+	 dsave(13) = the infinity norm of the projected gradient;
+	 dsave(14) = the relative step length in the line search;
+	 dsave(15) = the slope of the line search function at
+	 the starting point of the line search;
+	 dsave(16) = the square of the 2-norm of the line search
+	 direction vector.
+
+	 Subprograms called:
+
+	 L-BFGS-B Library ... mainlb.
+
+
+	 References:
+
+	 [1] R. H. Byrd, P. Lu, J. Nocedal and C. Zhu, ``A limited
+	 memory algorithm for bound constrained optimization'',
+	 SIAM J. Scientific Computing 16 (1995), no. 5, pp. 1190--1208.
+
+	 [2] C. Zhu, R.H. Byrd, P. Lu, J. Nocedal, ``L-BFGS-B: a
+	 limited memory FORTRAN code for solving bound constrained
+	 optimization problems'', Tech. Report, NAM-11, EECS Department,
+	 Northwestern University, 1994.
+
+	 (Postscript files of these papers are available via anonymous
+	 ftp to ece.nwu.edu in the directory pub/lbfgs/lbfgs_bcm.)
+
+	 [Aug 2000: via http://www.ece.nwu.edu/~nocedal/lbfgsb.html]
+
+	 *    *  *
+
+	 NEOS, November 1994. (Latest revision April 1997.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+	char csave[60];
+
+	/* Local variables: 1-based start offsets of the work sub-arrays inside wa */
+	int lsnd, ld, lr, lt;
+	int lz, lwa, lwn, lss, lws, lwt, lsy, lwy;
+
+	/* make sure csave is initialized (start it out as the empty string) */
+	csave[0] = '\0';
+
+	/* Parameter adjustments: shift wa and isave down by one element so that
+	   the 1-based indices used below address the caller's arrays */
+	--wa;
+	--isave;
+
+	/* Function Body: on "START" store the block sizes in isave[1..3] and the
+	   1-based start offsets of the sub-arrays of wa in isave[4..16] */
+	if (strncmp(task, "START", 5) == 0) {
+		isave[1] = m * n; /* size of ws and of wy */
+		isave[2] = m * m; /* size of sy, ss and wt */
+		isave[3] = m * m << 2; /* (m * m) * 4: size of wn and of snd */
+		isave[4] = 1;
+		isave[5] = isave[4] + isave[1];
+		isave[6] = isave[5] + isave[1];
+		isave[7] = isave[6] + isave[2];
+		isave[8] = isave[7] + isave[2];
+		isave[9] = isave[8]; /* offset of wt; isave[8] itself is not read back in setulb */
+		isave[10] = isave[9] + isave[2];
+		isave[11] = isave[10] + isave[3];
+		isave[12] = isave[11] + isave[3];
+		isave[13] = isave[12] + n;
+		isave[14] = isave[13] + n;
+		isave[15] = isave[14] + n;
+		isave[16] = isave[15] + n;
+	}
+	/* read the partition back on every call, first or not */
+	lws = isave[4];
+	lwy = isave[5];
+	lsy = isave[6];
+	lss = isave[7];
+	lwt = isave[9];
+	lwn = isave[10];
+	lsnd = isave[11];
+	lz = isave[12];
+	lr = isave[13];
+	ld = isave[14];
+	lt = isave[15];
+	lwa = isave[16];
+	/* iwa is cut into three consecutive chunks of n ints; mainlb receives the
+	   part of isave that starts at isave(22) */
+	mainlb(n, m, x, l, u, nbd, f, g, factr, pgtol,
+			&wa[lws], &wa[lwy], &wa[lsy],&wa[lss], &wa[lwt],&wa[lwn],
+			&wa[lsnd], &wa[lz], &wa[lr], &wa[ld], &wa[lt], &wa[lwa],
+			iwa, &iwa[n], &iwa[n << 1], task, iprint,
+			csave, lsave, &isave[22], dsave);
+	return;
+} /* setulb */
+
+
+void mainlb(int n, int m, double *x,
+		double *l, double *u, int *nbd, double *f, double *g,
+		double factr, double *pgtol, double *ws, double * wy,
+		double *sy, double *ss, double *wt, double *wn,
+		double *snd, double *z, double *r, double *d,
+		double *t, double *wa, int *indx, int *iwhere,
+		int *indx2, char *task, int iprint,
+		char *csave, int *lsave, int *isave, double *dsave)
+{
+	/*     ************
+	 Subroutine mainlb
+
+	 This subroutine solves bound constrained optimization problems by
+	 using the compact formula of the limited memory BFGS updates.
+
+	 n is an integer variable.
+	 On entry n is the number of variables.
+	 On exit n is unchanged.
+
+	 m is an integer variable.
+	 On entry m is the maximum number of variable metric
+	 corrections allowed in the limited memory matrix.
+	 On exit m is unchanged.
+
+	 x is a double precision array of dimension n.
+	 On entry x is an approximation to the solution.
+	 On exit x is the current approximation.
+
+	 l is a double precision array of dimension n.
+	 On entry l is the lower bound of x.
+	 On exit l is unchanged.
+
+	 u is a double precision array of dimension n.
+	 On entry u is the upper bound of x.
+	 On exit u is unchanged.
+
+	 nbd is an integer array of dimension n.
+	 On entry nbd represents the type of bounds imposed on the
+	 variables, and must be specified as follows:
+	 nbd(i)=0 if x(i) is unbounded,
+	 1 if x(i) has only a lower bound,
+	 2 if x(i) has both lower and upper bounds,
+	 3 if x(i) has only an upper bound.
+	 On exit nbd is unchanged.
+
+	 f is a double precision variable.
+	 On first entry f is unspecified.
+	 On final exit f is the value of the function at x.
+
+	 g is a double precision array of dimension n.
+	 On first entry g is unspecified.
+	 On final exit g is the value of the gradient at x.
+
+	 factr is a double precision variable.
+	 On entry factr >= 0 is specified by the user.    The iteration
+	 will stop when
+
+	 (f^k - f^{k+1})/max{|f^k|,|f^{k+1}|,1} <= factr*epsmch
+
+	 where epsmch is the machine precision, which is automatically
+	 generated by the code.
+	 On exit factr is unchanged.
+
+	 pgtol is a double precision variable.
+	 On entry pgtol >= 0 is specified by the user.    The iteration
+	 will stop when
+
+	 max{|proj g_i | i = 1, ..., n} <= pgtol
+
+	 where pg_i is the ith component of the projected gradient.
+	 On exit pgtol is unchanged.
+
+	 ws, wy, sy, and wt are double precision working arrays used to
+	 store the following information defining the limited memory
+	 BFGS matrix:
+	 ws, of dimension n x m, stores S, the matrix of s-vectors;
+	 wy, of dimension n x m, stores Y, the matrix of y-vectors;
+	 sy, of dimension m x m, stores S'Y;
+	 ss, of dimension m x m, stores S'S;
+	 wt, of dimension m x m, stores the Cholesky factorization
+	 of (theta*S'S+LD^(-1)L'); see eq.
+	 (2.26) in [3].
+
+	 wn is a double precision working array of dimension 2m x 2m
+	 used to store the LEL^T factorization of the indefinite matrix
+	 K = [-D -Y'ZZ'Y/theta     L_a'-R_z'    ]
+	 [L_a -R_z       theta*S'AA'S ]
+
+	 where       E = [-I  0]
+	 [ 0  I]
+
+	 snd is a double precision working array of dimension 2m x 2m
+	 used to store the lower triangular part of
+	 N = [Y' ZZ'Y      L_a'+R_z']
+	 [L_a +R_z  S'AA'S   ]
+
+	 z(n),r(n),d(n),t(n),wa(8*m) are double precision working arrays.
+	 z is used at different times to store the Cauchy point and
+	 the Newton point.
+
+
+	 indx is an integer working array of dimension n.
+	 In subroutine freev, indx is used to store the free and fixed
+	 variables at the Generalized Cauchy Point (GCP).
+
+	 iwhere is an integer working array of dimension n used to record
+	 the status of the vector x for GCP computation.
+	 iwhere(i)=0 or -3 if x(i) is free and has bounds,
+	 1       if x(i) is fixed at l(i), and l(i) .ne. u(i)
+	 2       if x(i) is fixed at u(i), and u(i) .ne. l(i)
+	 3       if x(i) is always fixed, i.e.,  u(i)=x(i)=l(i)
+	 -1       if x(i) is always free, i.e., no bounds on it.
+
+	 indx2 is an integer working array of dimension n.
+	 Within subroutine cauchy, indx2 corresponds to the array iorder.
+	 In subroutine freev, a list of variables entering and leaving
+	 the free set is stored in indx2, and it is passed on to
+	 subroutine formk with this information.
+
+	 task is a working string of characters of length 60 indicating
+	 the current job when entering and leaving this subroutine.
+
+	 iprint is an INTEGER variable that must be set by the user.
+	 It controls the frequency and type of output generated:
+	 iprint<0    no output is generated;
+	 iprint=0    print only one line at the last iteration;
+	 0<iprint<99 print also f and |proj g| every iprint iterations;
+	 iprint=99   print details of every iteration except n-vectors;
+	 iprint=100  print also the changes of active set and final x;
+	 iprint>100  print details of every iteration including x and g;
+	 When iprint > 0, the file iterate.dat will be created to
+	 summarize the iteration.
+
+	 csave is a working string of characters of length 60.
+
+	 lsave is a logical working array of dimension 4.
+
+	 isave is an integer working array of dimension 23.
+
+	 dsave is a double precision working array of dimension 29.
+
+
+	 Subprograms called
+
+	 L-BFGS-B Library ... cauchy, subsm, lnsrlb, formk,
+
+	 errclb, prn1lb, prn2lb, prn3lb, active, projgr,
+
+	 freev, cmprlb, matupd, formt.
+
+	 Minpack2 Library ... timer, dpmeps.
+
+	 Linpack Library ... dcopy, ddot.
+
+
+	 References:
+
+	 [1] R. H. Byrd, P. Lu, J. Nocedal and C. Zhu, ``A limited
+	 memory algorithm for bound constrained optimization'',
+	 SIAM J. Scientific Computing 16 (1995), no. 5, pp. 1190--1208.
+
+	 [2] C. Zhu, R.H. Byrd, P. Lu, J. Nocedal, ``L-BFGS-B: FORTRAN
+	 Subroutines for Large Scale Bound Constrained Optimization''
+	 Tech. Report, NAM-11, EECS Department, Northwestern University,
+	 1994.
+
+	 [3] R. Byrd, J. Nocedal and R. Schnabel ``Representations of
+	 Quasi-Newton Matrices and their use in Limited Memory Methods'',
+	 Mathematical Programming 63 (1994), no. 4, pp. 129-156.
+
+	 (Postscript files of these papers are available via anonymous
+	 ftp to ece.nwu.edu in the directory pub/lbfgs/lbfgs_bcm.)
+
+	 *  *     *
+	 */
+
+	/*
+	 NEOS, November 1994. (Latest revision April 1997.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+
+	/* System generated locals */
+	int ws_offset=0, wy_offset=0, sy_offset=0, ss_offset=0, wt_offset=0,
+			wn_offset=0, snd_offset=0, i__1;
+	double d__1, d__2;
+
+	/* Local variables */
+	int head;
+	double fold;
+	int nact;
+	double ddum;
+	int info;
+	int nfgv, ifun, iter, nint;
+	char word[4]; /* allow for terminator */
+	int i, iback, k = 0; /* -Wall */
+	double gdold;
+	int nfree;
+	int boxed;
+	int itail;
+	double theta;
+	double dnorm;
+	int nskip, iword;
+	double xstep = 0.0, stpmx; /* xstep is printed before being used */
+	double gd, dr, rr;
+	int ileave;
+	int itfile;
+	double cachyt, epsmch;
+	int updatd;
+	double sbtime;
+	int prjctd;
+	int iupdat;
+	int cnstnd;
+	double sbgnrm;
+	int nenter;
+	double lnscht;
+	int nintol;
+	double dtd;
+	int col;
+	double tol;
+	int wrk;
+	double stp, cpu1, cpu2;
+
+	/* Parameter adjustments */
+	--indx2;
+	--iwhere;
+	--indx;
+	--t;
+	--d;
+	--r;
+	--z;
+	--g;
+	--nbd;
+	--u;
+	--l;
+	--x;
+	--wa;
+	--lsave;
+	--isave;
+	--dsave;
+
+	/* Function Body */
+	if (strncmp(task, "START", 5) == 0) {
+		/*      Generate the current machine precision. */
+		epsmch = DBL_EPSILON;
+		fold = 0.;
+		dnorm = 0.;
+		cpu1 = 0.;
+		gd = 0.;
+		sbgnrm = 0.;
+		stp = 0.;
+		xstep = 0.;
+		stpmx = 0.;
+		gdold = 0.;
+		dtd = 0.;
+		/*      Initialize counters and scalars when task='START'. */
+		/*         for the limited memory BFGS matrices: */
+		col = 0;
+		head = 1;
+		theta = 1.;
+		iupdat = 0;
+		updatd = 0;
+		iback = 0;
+		itail = 0;
+		ifun = 0;
+		iword = 0;
+		nact = 0;
+		ileave = 0;
+		nenter = 0;
+		/*         for operation counts: */
+		iter = 0;
+		nfgv = 0;
+		nint = 0;
+		nintol = 0;
+		nskip = 0;
+		nfree = n;
+		/*         for stopping tolerance: */
+		tol = factr * epsmch;
+		/*         for measuring running time: */
+		cachyt = 0.;
+		sbtime = 0.;
+		lnscht = 0.;
+		/*         'word' records the status of subspace solutions. */
+		strcpy(word, "---");
+		/*         'info' records the termination information. */
+		info = 0;
+		itfile = 0;
+		/*      Check the input arguments for errors. */
+		errclb(n, m, factr, &l[1], &u[1], &nbd[1], task, &info, &k);
+		if (strncmp(task, "ERROR", 5) == 0) {
+			prn3lb(n, x+1, f, task, iprint, info,
+					iter, nfgv, nintol, nskip, nact, sbgnrm,
+					nint, word, iback, stp, xstep, k);
+			return;
+		}
+
+		prn1lb(n, m, l+1, u+1, x+1, iprint, epsmch);
+
+		/*      Initialize iwhere & project x onto the feasible set. */
+		active(n, &l[1], &u[1], &nbd[1], &x[1], &iwhere[1], iprint, &prjctd,
+				&cnstnd, &boxed);
+		/*      The end of the initialization. */
+	} else {
+		/*        restore local variables. */
+		prjctd = lsave[1];
+		cnstnd = lsave[2];
+		boxed = lsave[3];
+		updatd = lsave[4];
+
+		nintol = isave[1];
+		itfile = isave[3];
+		iback = isave[4];
+		nskip = isave[5];
+		head = isave[6];
+		col = isave[7];
+		itail = isave[8];
+		iter = isave[9];
+		iupdat = isave[10];
+		nint = isave[12];
+		nfgv = isave[13];
+		info = isave[14];
+		ifun = isave[15];
+		iword = isave[16];
+		nfree = isave[17];
+		nact = isave[18];
+		ileave = isave[19];
+		nenter = isave[20];
+
+		theta = dsave[1];
+		fold = dsave[2];
+		tol = dsave[3];
+		dnorm = dsave[4];
+		epsmch = dsave[5];
+		cpu1 = dsave[6];
+		cachyt = dsave[7];
+		sbtime = dsave[8];
+		lnscht = dsave[9];
+		gd = dsave[11];
+		stpmx = dsave[12];
+		sbgnrm = dsave[13];
+		stp = dsave[14];
+		gdold = dsave[15];
+		dtd = dsave[16];
+		/*    After returning from the driver go to the point where execution */
+		/*    is to resume. */
+		if (strncmp(task, "FG_LN", 5) == 0)    goto L666;
+		if (strncmp(task, "NEW_X", 5) == 0)     goto L777;
+		if (strncmp(task, "FG_ST", 5) == 0)     goto L111;
+
+		if (strncmp(task, "STOP", 4) == 0) {
+			if (strncmp(task + 6, "CPU", 3) == 0) {
+				// restore the previous iterate.
+				dcopy(&n, &t[1], &c__1, &x[1], &c__1);
+				dcopy(&n, &r[1], &c__1, &g[1], &c__1);
+				*f = fold;
+			}
+			goto L999;
+		}
+	}
+	/*     Compute f0 and g0. */
+	strcpy(task, "FG_START");
+	/*        return to the driver to calculate f and g; reenter at 111. */
+	goto L1000;
+	L111:
+	nfgv = 1;
+	/*     Compute the infinity norm of the (-) projected gradient. */
+	projgr(n, &l[1], &u[1], &nbd[1], &x[1], &g[1], &sbgnrm);
+
+	if (iprint >= 1)
+		cout << "At iterate " << iter << " f= " << *f << " |proj g|= " << sbgnrm << endl;
+
+	if (sbgnrm <= *pgtol) {
+		/*                  terminate the algorithm. */
+		strcpy(task, "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL");
+		goto L999;
+	}
+	/* ----------------- the beginning of the loop -------------------------- */
+	L222:
+	if (iprint >= 99)
+		cout << "Iteration " << iter << endl;
+	iword = -1;
+
+	if (! cnstnd && col > 0) {
+		/*                          skip the search for GCP. */
+		dcopy(&n, &x[1], &c__1, &z[1], &c__1);
+		wrk = updatd;
+		nint = 0;
+		goto L333;
+	}
+	/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */
+
+	/*     Compute the Generalized Cauchy Point (GCP). */
+
+	/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */
+	timer(&cpu1);
+	cauchy(n, &x[1], &l[1], &u[1], &nbd[1], &g[1], &indx2[1], &iwhere[1], &t[
+																			 1], &d[1], &z[1], m, &wy[wy_offset], &ws[ws_offset], &sy[
+																																	  sy_offset], &wt[wt_offset], &theta, &col, &head, &wa[1], &wa[(m
+																																			  << 1) + 1], &wa[(m << 2) + 1], &wa[m * 6 + 1], &nint, iprint, &
+																																			  sbgnrm, &info, &epsmch);
+	if (info != 0) {
+		/*       singular triangular system detected; refresh the lbfgs memory. */
+		if (iprint >= 1) {
+			cout << "Singular triangular system detected;" << endl;
+			cout << "   refresh the lbfgs memory and restart the iteration." << endl;
+		}
+		info = 0;
+		col = 0;
+		head = 1;
+		theta = 1.;
+		iupdat = 0;
+		updatd = 0;
+		timer(&cpu2);
+		cachyt = cachyt + cpu2 - cpu1;
+		goto L222;
+	}
+	timer(&cpu2);
+	cachyt = cachyt + cpu2 - cpu1;
+	nintol += nint;
+	/*     Count the entering and leaving variables for iter > 0; */
+	/*     find the index set of free and active variables at the GCP. */
+	freev(n, &nfree, &indx[1], &nenter, &ileave, &indx2[1], &iwhere[1], &
+			wrk, &updatd, &cnstnd, iprint, &iter);
+	nact = n - nfree;
+	L333:
+	/*     If there are no free variables or B=theta*I, then */
+	/*                      skip the subspace minimization. */
+	if (nfree == 0 || col == 0) {
+		goto L555;
+	}
+	/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */
+
+	/*     Subspace minimization. */
+
+	/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */
+	timer(&cpu1);
+	/*     Form  the LEL^T factorization of the indefinite */
+	/*     matrix       K = [-D -Y'ZZ'Y/theta     L_a'-R_z'    ] */
+	/*               [L_a -R_z       theta*S'AA'S ] */
+	/*     where       E = [-I  0] */
+	/*               [ 0  I] */
+	if (wrk) {
+		formk(n, &nfree, &indx[1], &nenter, &ileave, &indx2[1], &iupdat, &
+				updatd, &wn[wn_offset], &snd[snd_offset], m, &ws[ws_offset], &
+				wy[wy_offset], &sy[sy_offset], &theta, &col, &head, &info);
+	}
+	if (info != 0) {
+		/*        nonpositive definiteness in Cholesky factorization; */
+		/*        refresh the lbfgs memory and restart the iteration. */
+		if (iprint >= 0) {
+			cout << "Nonpositive definiteness in Cholesky factorization in formk;" << endl;
+			cout << "   refresh the lbfgs memory and restart the iteration." << endl;
+		}
+		info = 0;
+		col = 0;
+		head = 1;
+		theta = 1.;
+		iupdat = 0;
+		updatd = 0;
+		timer(&cpu2);
+		sbtime = sbtime + cpu2 - cpu1;
+		goto L222;
+	}
+	/*      compute r=-Z'B(xcp-xk)-Z'g (using wa(2m+1)=W'(xcp-x) */
+	/*                             from 'cauchy'). */
+	cmprlb(n, m, &x[1], &g[1], &ws[ws_offset], &wy[wy_offset], &sy[sy_offset]
+																   , &wt[wt_offset], &z[1], &r[1], &wa[1], &indx[1], &theta, &
+																   col, &head, &nfree, &cnstnd, &info);
+	if (info != 0) {
+		goto L444;
+	}
+	/*     call the direct method. */
+	subsm(n, m, &nfree, &indx[1], &l[1], &u[1], &nbd[1], &z[1], &r[1], &
+			ws[ws_offset], &wy[wy_offset], &theta, &col, &head, &iword, &wa[1]
+																			, &wn[wn_offset], iprint, &info);
+	L444:
+	if (info != 0) {
+		/*        singular triangular system detected; */
+		/*        refresh the lbfgs memory and restart the iteration. */
+		if (iprint >= 1) {
+			cout << "Singular triangular system detected;" << endl;
+			cout << "   refresh the lbfgs memory and restart the iteration." << endl;
+		}
+		info = 0;
+		col = 0;
+		head = 1;
+		theta = 1.;
+		iupdat = 0;
+		updatd = 0;
+		timer(&cpu2);
+		sbtime = sbtime + cpu2 - cpu1;
+		goto L222;
+	}
+	timer(&cpu2);
+	sbtime = sbtime + cpu2 - cpu1;
+	L555:
+	/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */
+
+	/*     Line search and optimality tests. */
+
+	/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */
+	/*     Generate the search direction d:=z-x. */
+	i__1 = n;
+	for (i = 1; i <= i__1; ++i) {
+		d[i] = z[i] - x[i];
+		/* L40: */
+	}
+	timer(&cpu1);
+	L666:
+	lnsrlb(n, &l[1], &u[1], &nbd[1], &x[1], f, &fold, &gd, &gdold, &g[1], &
+			d[1], &r[1], &t[1], &z[1], &stp, &dnorm, &dtd, &xstep, &
+			stpmx, &iter, &ifun, &iback, &nfgv, &info, task, &boxed, &cnstnd,
+			csave, &isave[22], &dsave[17]);
+	if (info != 0 || iback >= 20) {
+		/*        restore the previous iterate. */
+		dcopy(&n, &t[1], &c__1, &x[1], &c__1);
+		dcopy(&n, &r[1], &c__1, &g[1], &c__1);
+		*f = fold;
+		if (col == 0) {
+			/*           abnormal termination. */
+			if (info == 0) {
+				info = -9;
+				/*          restore the actual number of f and g evaluations etc. */
+				--nfgv;
+				--ifun;
+				--iback;
+			}
+			strcpy(task, "ERROR: ABNORMAL_TERMINATION_IN_LNSRCH");
+			++iter;
+			goto L999;
+		} else {
+			/*           refresh the lbfgs memory and restart the iteration. */
+			if (iprint >= 1) {
+				cout << "Bad direction in the line search;" << endl;
+				cout << "   refresh the lbfgs memory and restart the iteration." << endl;
+			}
+			if (info == 0) {
+				--nfgv;
+			}
+			info = 0;
+			col = 0;
+			head = 1;
+			theta = 1.;
+			iupdat = 0;
+			updatd = 0;
+			strcpy(task, "RESTART_FROM_LNSRCH");
+			timer(&cpu2);
+			lnscht = lnscht + cpu2 - cpu1;
+			goto L222;
+		}
+	} else if (strncmp(task, "FG_LN", 5) == 0) {
+		/*        return to the driver for calculating f and g; reenter at 666. */
+		goto L1000;
+	} else {
+		/*        calculate and print out the quantities related to the new X. */
+		timer(&cpu2);
+		lnscht = lnscht + cpu2 - cpu1;
+		++iter;
+		/*      Compute the infinity norm of the projected (-)gradient. */
+		projgr(n, &l[1], &u[1], &nbd[1], &x[1], &g[1], &sbgnrm);
+		/*      Print iteration information. */
+		prn2lb(n, x+1, f, g+1, iprint, iter, nfgv, nact,
+				sbgnrm, nint, word, iword, iback, stp, xstep);
+		goto L1000;
+	}
+	L777:
+	/*     Test for termination. */
+	if (sbgnrm <= *pgtol) {
+		/*                  terminate the algorithm. */
+		strcpy(task, "CONVERGENCE: NORM OF PROJECTED GRADIENT <= PGTOL");
+		goto L999;
+	}
+	/* Computing MAX */
+	d__1 = fabs(fold), d__2 = fabs(*f), d__1 = max(d__1,d__2);
+	ddum = max((double)d__1,(double)1.);
+	if (fold - *f <= tol * ddum) {
+		/*                      terminate the algorithm. */
+		strcpy(task, "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH");
+		if (iback >= 10) info = -5;
+		/*         i.e., to issue a warning if iback>10 in the line search. */
+		goto L999;
+	}
+	/*     Compute d=newx-oldx, r=newg-oldg, rr=y'y and dr=y's. */
+	i__1 = n;
+	for (i = 1; i <= i__1; ++i) {
+		r[i] = g[i] - r[i];
+		/* L42: */
+	}
+	rr = ddot(&n, &r[1], &c__1, &r[1], &c__1);
+	if (stp == 1.) {
+		dr = gd - gdold;
+		ddum = -gdold;
+	} else {
+		dr = (gd - gdold) * stp;
+		dscal(&n, &stp, &d[1], &c__1);
+		ddum = -gdold * stp;
+	}
+	if (dr <= epsmch * ddum) {
+		/*                  skip the L-BFGS update. */
+		++nskip;
+		updatd = 0;
+		if (iprint >= 1) {
+			cout << "ys=" << dr << "   -gs=" << ddum << ", BFGS update SKIPPED" << endl;
+		}
+		goto L888;
+	}
+	/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */
+
+	/*     Update the L-BFGS matrix. */
+
+	/* ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc */
+	updatd = 1;
+	++iupdat;
+	/*     Update matrices WS and WY and form the middle matrix in B. */
+	matupd(n, m, &ws[ws_offset], &wy[wy_offset], &sy[sy_offset], &ss[
+																	 ss_offset], &d[1], &r[1], &itail, &iupdat, &col, &head, &
+																	 theta, &rr, &dr, &stp, &dtd);
+	/*     Form the upper half of the pds T = theta*SS + L*D^(-1)*L'; */
+	/*      Store T in the upper triangular of the array wt; */
+	/*      Cholesky factorize T to J*J' with */
+	/*         J' stored in the upper triangular of wt. */
+	formt(m, &wt[wt_offset], &sy[sy_offset], &ss[ss_offset], &col, &theta, &
+			info);
+	if (info != 0) {
+		/*        nonpositive definiteness in Cholesky factorization; */
+		/*        refresh the lbfgs memory and restart the iteration. */
+		if (iprint >= 0) {
+			cout << "Nonpositive definiteness in Cholesky factorization in formk;" << endl;
+			cout << "   refresh the lbfgs memory and restart the iteration." << endl;
+		}
+		info = 0;
+		col = 0;
+		head = 1;
+		theta = 1.;
+		iupdat = 0;
+		updatd = 0;
+		goto L222;
+	}
+	/*     Now the inverse of the middle matrix in B is */
+	/*     [  D^(1/2)     O ] [ -D^(1/2)     D^(-1/2)*L' ] */
+	/*     [ -L*D^(-1/2)     J ] [    0     J'         ] */
+	L888:
+	/* -------------------- the end of the loop ----------------------------- */
+	goto L222;
+	L999:
+	L1000:
+	/*     Save local variables. */
+	lsave[1] = prjctd;
+	lsave[2] = cnstnd;
+	lsave[3] = boxed;
+	lsave[4] = updatd;
+	isave[1] = nintol;
+	isave[3] = itfile;
+	isave[4] = iback;
+	isave[5] = nskip;
+	isave[6] = head;
+	isave[7] = col;
+	isave[8] = itail;
+	isave[9] = iter;
+	isave[10] = iupdat;
+	isave[12] = nint;
+	isave[13] = nfgv;
+	isave[14] = info;
+	isave[15] = ifun;
+	isave[16] = iword;
+	isave[17] = nfree;
+	isave[18] = nact;
+	isave[19] = ileave;
+	isave[20] = nenter;
+	dsave[1] = theta;
+	dsave[2] = fold;
+	dsave[3] = tol;
+	dsave[4] = dnorm;
+	dsave[5] = epsmch;
+	dsave[6] = cpu1;
+	dsave[7] = cachyt;
+	dsave[8] = sbtime;
+	dsave[9] = lnscht;
+	dsave[11] = gd;
+	dsave[12] = stpmx;
+	dsave[13] = sbgnrm;
+	dsave[14] = stp;
+	dsave[15] = gdold;
+	dsave[16] = dtd;
+	prn3lb(n, x+1, f, task, iprint, info,
+			iter, nfgv, nintol, nskip, nact, sbgnrm,
+			nint, word, iback, stp, xstep, k);
+	return;
+} /* mainlb */
+/* ======================= The end of mainlb ============================= */
+
+void errclb(int n, int m, double factr, double *l, double *u,
+		int *nbd, char *task, int *info, int *k)
+{
+	/*    ************
+	 Subroutine errclb
+
+	 This subroutine checks the validity of the input data.
+
+	 Arguments, as used by the code below:
+
+	 n      number of entries of nbd, l and u that are examined.
+	        n <= 0 is reported as "ERROR: N .LE. 0".
+	 m      m <= 0 is reported as "ERROR: M .LE. 0".
+	 factr  factr < 0 is reported as "ERROR: FACTR .LT. 0".
+	 l, u   arrays of lower and upper bounds; entry i is only read
+	        when nbd(i) == 2.
+	 nbd    array of bound codes; every entry must lie in 0..3.
+	 task   output string.  It is written with strcpy only when a
+	        check fails and is left untouched when every check passes.
+	        No length is passed in, so the caller must supply a buffer
+	        large enough for the longest message written here
+	        ("ERROR: NO FEASIBLE SOLUTION", 27 characters plus the
+	        terminator).
+	 info   output; set to -6 (invalid nbd entry) or -7 (l(i) > u(i)
+	        with nbd(i) == 2).  It is NOT modified for the scalar
+	        errors on n, m and factr, nor when all checks pass.
+	 k      output; the 1-based index i of the entry that triggered
+	        the info = -6 / -7 report.  Untouched otherwise.
+
+	 Reporting order: no check returns early.  All three scalar checks
+	 and the whole array scan always run, so a later failing check
+	 overwrites the message (and info/k) written by an earlier one; if
+	 several array entries are bad, the last one scanned is reported.
+
+	 *     *  *
+
+	 NEOS, November 1994. (Latest revision April 1997.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+	/* Local variables: i is the 1-based loop index over the n variables. */
+	int i;
+
+	/* Parameter adjustments: shift each array pointer back by one element
+	 so that the 1-based subscripts nbd[i], u[i], l[i] used below address
+	 the caller's elements 0..n-1.
+	 NOTE(review): forming a pointer one element before the start of an
+	 array is formally undefined in C/C++, although the idiom is common in
+	 f2c-style translations. */
+	--nbd;
+	--u;
+	--l;
+
+	/* Function Body */
+	/*     Check the scalar input arguments for errors.
+	 Only task is written here; info and k are left as the caller set them. */
+	if (n <= 0)
+		strcpy(task, "ERROR: N .LE. 0");
+	if (m <= 0)
+		strcpy(task, "ERROR: M .LE. 0");
+	if (factr < 0.)
+		strcpy(task, "ERROR: FACTR .LT. 0");
+
+	/*     Check the validity of the arrays nbd(i), u(i), and l(i).
+	 The loop body does not execute when n <= 0. */
+	for (i = 1; i <= n; ++i) {
+		if (nbd[i] < 0 || nbd[i] > 3) {
+			/* bound code outside 0..3.  (The original remark here read
+			 "return", but there is no return statement: the scan
+			 continues with the remaining entries.) */
+			strcpy(task, "ERROR: INVALID NBD");
+			*info = -6;
+			*k = i;
+		}
+		if (nbd[i] == 2) {
+			if (l[i] > u[i]) {
+				/* both bounds are in use and the lower bound exceeds the
+				 upper bound.  (Original remark: "return"; as above, the
+				 scan nevertheless continues.) */
+				strcpy(task, "ERROR: NO FEASIBLE SOLUTION");
+				*info = -7;
+				*k = i;
+			}
+		}
+	}
+	return;
+} /* errclb */
+/* ======================= The end of errclb ============================= */
+
+void active(int n, double *l, double *u,
+		int *nbd, double *x, int *iwhere, int iprint,
+		int *prjctd, int *cnstnd, int *boxed)
+{
+	/*    ************
+
+	 Subroutine active
+
+	 This subroutine initializes iwhere and projects the initial x to
+	 the feasible set if necessary.
+
+	 n is the number of variables; l, u, nbd, x and iwhere are arrays
+	 with n entries each.
+
+	 nbd holds the bound code of each variable.  The tests below treat
+	 nbd(i) > 0 as "has at least one bound", use l(i) when
+	 0 < nbd(i) <= 2 and use u(i) when nbd(i) >= 2.
+
+	 x is modified in place: an entry below its lower bound is set to
+	 l(i), an entry above its upper bound is set to u(i).
+
+	 iwhere is an integer array of dimension n.
+	 On entry iwhere is unspecified.
+	 On exit iwhere(i)=-1    if nbd(i) = 0 (x(i) has no bounds)
+	 3    if nbd(i) = 2 and u(i) - l(i) <= 0
+	 0    otherwise.
+	 In cauchy, iwhere is given finer gradations.
+
+	 prjctd is set to 1 if at least one x(i) had to be moved onto a
+	 bound, and to 0 otherwise.
+
+	 cnstnd is set to 1 if at least one nbd(i) is nonzero, and to 0
+	 otherwise.
+
+	 boxed is set to 0 if at least one nbd(i) differs from 2; otherwise
+	 it keeps its initial value 1 (this includes the case where the loop
+	 does not run because n <= 0).
+
+	 iprint controls the output written to cout:
+	 iprint >= 0  report an infeasible starting point and/or an
+	              unconstrained problem;
+	 iprint >  0  additionally report how many variables lie on a bound.
+
+
+	 *     *  *
+
+	 NEOS, November 1994. (Latest revision June 1996.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+	/* Local variables:
+	 nbdd counts the variables found on (or moved onto) a bound;
+	 i is the 1-based loop index. */
+	int nbdd, i;
+
+	/* Parameter adjustments: shift each array pointer back by one element
+	 so that the 1-based subscripts used below address the caller's
+	 elements 0..n-1. */
+	--iwhere;
+	--x;
+	--nbd;
+	--u;
+	--l;
+
+	/* Function Body */
+
+	/*Initialize nbdd, prjctd, cnstnd and boxed. */
+	nbdd = 0;
+	*prjctd = 0;
+	*cnstnd = 0;
+	*boxed = 1;
+	/*     Project the initial x to the feasible set if necessary.
+	 The lower bound is tested first; the upper bound is only tested when
+	 the lower-bound branch was not taken, so each variable is counted in
+	 nbdd at most once. */
+	for (i = 1; i <= n; ++i) {
+		if (nbd[i] > 0) {
+			if (nbd[i] <= 2 && x[i] <= l[i]) {
+				if (x[i] < l[i]) {
+					*prjctd = 1;
+					x[i] = l[i];
+				}
+				++nbdd;
+			} else if (nbd[i] >= 2 && x[i] >= u[i]) {
+				if (x[i] > u[i]) {
+					*prjctd = 1;
+					x[i] = u[i];
+				}
+				++nbdd;
+			}
+		}
+	}
+
+	/*     Initialize iwhere and assign values to cnstnd and boxed. */
+	for (i = 1; i <= n; ++i) {
+		if (nbd[i] != 2) {
+			*boxed = 0;
+		}
+		if (nbd[i] == 0) {
+			/*                  this variable is always free */
+			iwhere[i] = -1;
+			/*         otherwise (nbd(i) != 0) the variable counts as
+			 constrained.  x(i) is not assigned in this loop; the
+			 projection onto the bounds was done in the loop above. */
+		} else {
+			*cnstnd = 1;
+			if (nbd[i] == 2 && u[i] - l[i] <= 0.) {
+				/*             this variable is always fixed */
+				iwhere[i] = 3;
+			} else {
+				iwhere[i] = 0;
+			}
+		}
+	}
+	if (iprint >= 0) {
+		if (*prjctd)
+			cout << "The initial X is infeasible.  Restart with its projection." << endl;
+		if (!*cnstnd)
+			cout << "This problem is unconstrained." << endl;
+	}
+	if (iprint > 0)
+		cout << "At X0, " << nbdd << " variables are exactly at the bounds" << endl;
+
+	return;
+} /* active */
+/* ======================= The end of active ============================= */
+
+void cauchy(int n, double *x, double *l, double *u, int *nbd,
+		double *g, int *iorder, int * iwhere, double *t,
+		double *d, double *xcp, int m,
+		double *wy, double *ws, double *sy, double *wt,
+		double *theta, int *col, int *head, double *p,
+		double *c, double *wbp, double *v, int *nint,
+		int iprint, double *sbgnrm, int *info, double * epsmch)
+{
+	/*     ************
+	 Subroutine cauchy
+
+	 For given x, l, u, g (with sbgnrm > 0), and a limited memory
+	 BFGS matrix B defined in terms of matrices WY, WS, WT, and
+	 scalars head, col, and theta, this subroutine computes the
+	 generalized Cauchy point (GCP), defined as the first local
+	 minimizer of the quadratic
+
+	 Q(x + s) = g's + 1/2 s'Bs
+
+	 along the projected gradient direction P(x-tg,l,u).
+	 The routine returns the GCP in xcp.
+
+	 n is an integer variable.
+	 On entry n is the dimension of the problem.
+	 On exit n is unchanged.
+
+	 x is a double precision array of dimension n.
+	 On entry x is the starting point for the GCP computation.
+	 On exit x is unchanged.
+
+	 l is a double precision array of dimension n.
+	 On entry l is the lower bound of x.
+	 On exit l is unchanged.
+
+	 u is a double precision array of dimension n.
+	 On entry u is the upper bound of x.
+	 On exit u is unchanged.
+
+	 nbd is an integer array of dimension n.
+	 On entry nbd represents the type of bounds imposed on the
+	 variables, and must be specified as follows:
+	 nbd(i)=0 if x(i) is unbounded,
+	 1 if x(i) has only a lower bound,
+	 2 if x(i) has both lower and upper bounds, and
+	 3 if x(i) has only an upper bound.
+	 On exit nbd is unchanged.
+
+	 g is a double precision array of dimension n.
+	 On entry g is the gradient of f(x).  g must be a nonzero vector.
+	 On exit g is unchanged.
+
+	 iorder is an integer working array of dimension n.
+	 iorder will be used to store the breakpoints in the piecewise
+	 linear path and free variables encountered. On exit,
+	 iorder(1),...,iorder(nleft) are indices of breakpoints
+	 which have not been encountered;
+	 iorder(nleft+1),...,iorder(nbreak) are indices of
+	 encountered breakpoints; and
+	 iorder(nfree),...,iorder(n) are indices of variables which
+	 have no bound constraints along the search direction.
+
+	 iwhere is an integer array of dimension n.
+	 On entry iwhere indicates only the permanently fixed (iwhere=3)
+	 or free (iwhere= -1) components of x.
+	 On exit iwhere records the status of the current x variables.
+	 iwhere(i)=-3  if x(i) is free and has bounds, but is not moved
+	 0   if x(i) is free and has bounds, and is moved
+	 1   if x(i) is fixed at l(i), and l(i) .ne. u(i)
+	 2   if x(i) is fixed at u(i), and u(i) .ne. l(i)
+	 3   if x(i) is always fixed, i.e.,  u(i)=x(i)=l(i)
+	 -1  if x(i) is always free, i.e., it has no bounds.
+
+	 t is a double precision working array of dimension n.
+	 t will be used to store the break points.
+
+	 d is a double precision array of dimension n used to store
+	 the Cauchy direction P(x-tg)-x.
+
+	 xcp is a double precision array of dimension n used to return the
+	 GCP on exit.
+
+	 m is an integer variable.
+	 On entry m is the maximum number of variable metric corrections
+	 used to define the limited memory matrix.
+	 On exit m is unchanged.
+
+	 ws, wy, sy, and wt are double precision arrays.
+	 On entry they store information that defines the
+	 limited memory BFGS matrix:
+	 ws(n,m) stores S, a set of s-vectors;
+	 wy(n,m) stores Y, a set of y-vectors;
+	 sy(m,m) stores S'Y;
+	 wt(m,m) stores the
+	 Cholesky factorization of (theta*S'S+LD^(-1)L').
+	 On exit these arrays are unchanged.
+
+	 theta is a double precision variable.
+	 On entry theta is the scaling factor specifying B_0 = theta I.
+	 On exit theta is unchanged.
+
+	 col is an integer variable.
+	 On entry col is the actual number of variable metric
+	 corrections stored so far.
+	 On exit col is unchanged.
+
+	 head is an integer variable.
+	 On entry head is the location of the first s-vector
+	 (or y-vector) in S (or Y).
+	 On exit head is unchanged.
+
+	 p is a double precision working array of dimension 2m.
+	 p will be used to store the vector p = W^(T)d.
+
+	 c is a double precision working array of dimension 2m.
+	 c will be used to store the vector c = W^(T)(xcp-x).
+
+	 wbp is a double precision working array of dimension 2m.
+	 wbp will be used to store the row of W corresponding
+	 to a breakpoint.
+
+	 v is a double precision working array of dimension 2m.
+
+	 nint is an integer variable.
+	 On exit nint records the number of quadratic segments explored
+	 in searching for the GCP.
+
+	 iprint is an INTEGER variable that must be set by the user.
+	 It controls the frequency and type of output generated:
+	 iprint<0    no output is generated;
+	 iprint=0    print only one line at the last iteration;
+	 0<iprint<99 print also f and |proj g| every iprint iterations;
+	 iprint=99   print details of every iteration except n-vectors;
+	 iprint=100  print also the changes of active set and final x;
+	 iprint>100  print details of every iteration including x and g;
+	 When iprint > 0, the file iterate.dat will be created to
+	 summarize the iteration.
+
+	 sbgnrm is a double precision variable.
+	 On entry sbgnrm is the norm of the projected gradient at x.
+	 On exit sbgnrm is unchanged.
+
+	 info is an integer variable.
+	 On entry info is 0.
+	 On exit info = 0    for normal return,
+	 = nonzero for abnormal return when the system
+	 used in routine bmv is singular.
+
+	 Subprograms called:
+
+	 L-BFGS-B Library ... hpsolb, bmv.
+
+	 Linpack ... dscal dcopy, daxpy.
+
+
+	 References:
+
+	 [1] R. H. Byrd, P. Lu, J. Nocedal and C. Zhu, ``A limited
+	 memory algorithm for bound constrained optimization'',
+	 SIAM J. Scientific Computing 16 (1995), no. 5, pp. 1190--1208.
+
+	 [2] C. Zhu, R.H. Byrd, P. Lu, J. Nocedal, ``L-BFGS-B: FORTRAN
+	 Subroutines for Large Scale Bound Constrained Optimization''
+	 Tech. Report, NAM-11, EECS Department, Northwestern University, 1994.
+
+	 (Postscript files of these papers are available via anonymous
+	 ftp to ece.nwu.edu in the directory pub/lbfgs/lbfgs_bcm.)
+
+	 *    *  *
+
+	 NEOS, November 1994. (Latest revision April 1997.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+	/* System generated locals */
+	int wy_dim1, wy_offset, ws_dim1, ws_offset, sy_dim1, sy_offset,
+	wt_dim1, wt_offset, i__2;
+	double d__1;
+
+	/* Local variables */
+	double bkmin, dibp, dibp2, zibp, neggi, tsum;
+	double f1, f2, f2_org__, dt, tj, tj0, tl= 0.0, tu=0.0, dtm, wmc, wmp, wmw;
+
+	int i, j, ibp, iter, bnded, nfree, nleft, nbreak, ibkmin, pointr;
+	int xlower, xupper, col2;
+
+	/* Parameter adjustments */
+	--xcp;
+	--d;
+	--t;
+	--iwhere;
+	--iorder;
+	--g;
+	--nbd;
+	--u;
+	--l;
+	--x;
+	--v;
+	--wbp;
+	--c;
+	--p;
+	wt_dim1 = m;    wt_offset = 1 + wt_dim1 * 1;    wt -= wt_offset;
+	sy_dim1 = m;    sy_offset = 1 + sy_dim1 * 1;    sy -= sy_offset;
+	ws_dim1 = n;    ws_offset = 1 + ws_dim1 * 1;    ws -= ws_offset;
+	wy_dim1 = n;    wy_offset = 1 + wy_dim1 * 1;    wy -= wy_offset;
+
+	/* Function Body */
+
+	/*     Check the status of the variables, reset iwhere(i) if necessary;
+	 *     compute the Cauchy direction d and the breakpoints t; initialize
+	 *     the derivative f1 and the vector p = W'd (for theta = 1).
+	 */
+
+	if (*sbgnrm <= 0.) {
+		if (iprint >= 0) cout << "Subgnorm = 0.  GCP = X.\n";
+		dcopy(&n, &x[1], &c__1, &xcp[1], &c__1);
+		return;
+	}
+	bnded = 1;
+	nfree = n + 1;
+	nbreak = 0;
+	ibkmin = 0;
+	bkmin = 0.;
+	col2 = *col << 1;
+	f1 = 0.;
+	if (iprint >= 99)
+		cout << "\n---------------- CAUCHY entered-------------------\n\n";
+
+	/*     We set p to zero and build it up as we determine d. */
+	for (i = 1; i <= col2; ++i)
+		p[i] = 0.;
+
+	/*     In the following loop we determine for each variable its bound */
+	/*      status and its breakpoint, and update p accordingly. */
+	/*      Smallest breakpoint is identified. */
+
+	for (i = 1; i <= n; ++i) {
+		neggi = -g[i];
+		if (iwhere[i] != 3 && iwhere[i] != -1) {
+			/*           if x(i) is not a constant and has bounds, */
+			/*           compute the difference between x(i) and its bounds. */
+			if (nbd[i] <= 2) {
+				tl = x[i] - l[i];
+			}
+			if (nbd[i] >= 2) {
+				tu = u[i] - x[i];
+			}
+			/*         If a variable is close enough to a bound */
+			/*           we treat it as at bound. */
+			xlower = nbd[i] <= 2 && tl <= 0.;
+			xupper = nbd[i] >= 2 && tu <= 0.;
+			/*        reset iwhere(i). */
+			iwhere[i] = 0;
+			if (xlower) {
+				if (neggi <= 0.) {
+					iwhere[i] = 1;
+				}
+			} else if (xupper) {
+				if (neggi >= 0.) {
+					iwhere[i] = 2;
+				}
+			} else {
+				if (fabs(neggi) <= 0.) {
+					iwhere[i] = -3;
+				}
+			}
+		}
+		pointr = *head;
+		if (iwhere[i] != 0 && iwhere[i] != -1) {
+			d[i] = 0.;
+		} else {
+			d[i] = neggi;
+			f1 -= neggi * neggi;
+			/*           calculate p := p - W'e_i* (g_i). */
+			i__2 = *col;
+			for (j = 1; j <= i__2; ++j) {
+				p[j] += wy[i + pointr * wy_dim1] * neggi;
+				p[*col + j] += ws[i + pointr * ws_dim1] * neggi;
+				pointr = pointr % m + 1;
+			}
+			if (nbd[i] <= 2 && nbd[i] != 0 && neggi < 0.) {
+				/*                   x(i) + d(i) is bounded; compute t(i). */
+				++nbreak;
+				iorder[nbreak] = i;
+				t[nbreak] = tl / (-neggi);
+				if (nbreak == 1 || t[nbreak] < bkmin) {
+					bkmin = t[nbreak];
+					ibkmin = nbreak;
+				}
+			} else if (nbd[i] >= 2 && neggi > 0.) {
+				/*                   x(i) + d(i) is bounded; compute t(i). */
+				++nbreak;
+				iorder[nbreak] = i;
+				t[nbreak] = tu / neggi;
+				if (nbreak == 1 || t[nbreak] < bkmin) {
+					bkmin = t[nbreak];
+					ibkmin = nbreak;
+				}
+			} else {/*          x(i) + d(i) is not bounded. */
+				--nfree;
+				iorder[nfree] = i;
+				if (fabs(neggi) > 0.)
+					bnded = 0;
+			}
+		}
+		/* L50: */
+	} /* for(i = 1:n) */
+
+	/*     The indices of the nonzero components of d are now stored */
+	/*     in iorder(1),...,iorder(nbreak) and iorder(nfree),...,iorder(n). */
+	/*     The smallest of the nbreak breakpoints is in t(ibkmin)=bkmin. */
+	if (*theta != 1.) {
+		/*             complete the initialization of p for theta not= one. */
+		dscal(col, theta, &p[*col + 1], &c__1);
+	}
+	/*     Initialize GCP xcp = x. */
+	dcopy(&n, &x[1], &c__1, &xcp[1], &c__1);
+	if (nbreak == 0 && nfree == n + 1) {
+		/*            d is a zero vector, return with the initial xcp as GCP. */
+		if (iprint > 100) {
+			cout << "Cauchy X =  ";
+			for(i = 1; i <= n; i++) cout << xcp[i] << " ";
+			cout << "\n";
+		}
+		return;
+	}
+	/*     Initialize c = W'(xcp - x) = 0. */
+	for (j = 1; j <= col2; ++j)
+		c[j] = 0.;
+
+	/*     Initialize derivative f2. */
+	f2 = -(*theta) * f1;
+	f2_org__ = f2;
+	if (*col > 0) {
+		bmv(m, &sy[sy_offset], &wt[wt_offset], col, &p[1], &v[1], info);
+		if (*info != 0) {
+			return;
+		}
+		f2 -= ddot(&col2, &v[1], &c__1, &p[1], &c__1);
+	}
+	dtm = -f1 / f2;
+	tsum = 0.;
+	*nint = 1;
+	if (iprint >= 99) cout << "There are " << nbreak << " breakpoints\n";
+
+	/*     If there are no breakpoints, locate the GCP and return. */
+	if (nbreak == 0) {
+		goto L888;
+	}
+	nleft = nbreak;
+	iter = 1;
+	tj = 0.;
+	/* ------------------- the beginning of the loop ------------------------- */
+	L777:
+	/*     Find the next smallest breakpoint; */
+	/*     compute dt = t(nleft) - t(nleft + 1). */
+	tj0 = tj;
+	if (iter == 1) {
+		/*       Since we already have the smallest breakpoint we need not do */
+		/*       heapsort yet. Often only one breakpoint is used and the */
+		/*       cost of heapsort is avoided. */
+		tj = bkmin;
+		ibp = iorder[ibkmin];
+	} else {
+		if (iter == 2) {
+			/* Replace the already used smallest breakpoint with the */
+			/* breakpoint numbered nbreak > nlast, before heapsort call. */
+			if (ibkmin != nbreak) {
+				t[ibkmin] = t[nbreak];
+				iorder[ibkmin] = iorder[nbreak];
+			}
+		}
+		/* Update heap structure of breakpoints */
+		/* (if iter=2, initialize heap). */
+		hpsolb(nleft, &t[1], &iorder[1], iter - 2);
+		tj = t[nleft];
+		ibp = iorder[nleft];
+	}
+	dt = tj - tj0;
+
+	if (dt != 0 && iprint >=  100) {
+		cout << "\nPiece    " << *nint << " f1, f2 at start point " << f1 << " " << f2 << "\n",
+				cout << "Distance to the next break point =  " << dt << "\n";
+		cout << "Distance to the stationary point =  " << dtm << "\n";
+	}
+
+	/*     If a minimizer is within this interval, */
+	/*     locate the GCP and return. */
+	if (dtm < dt) {
+		goto L888;
+	}
+	/*     Otherwise fix one variable and */
+	/*     reset the corresponding component of d to zero. */
+	tsum += dt;
+	--nleft;
+	++iter;
+	dibp = d[ibp];
+	d[ibp] = 0.;
+	if (dibp > 0.) {
+		zibp = u[ibp] - x[ibp];
+		xcp[ibp] = u[ibp];
+		iwhere[ibp] = 2;
+	} else {
+		zibp = l[ibp] - x[ibp];
+		xcp[ibp] = l[ibp];
+		iwhere[ibp] = 1;
+	}
+	if (iprint >= 100) cout << "Variable  " << ibp << "  is fixed.\n";
+	if (nleft == 0 && nbreak == n) {
+		/*                           all n variables are fixed, */
+		/*                          return with xcp as GCP. */
+		dtm = dt;
+		goto L999;
+	}
+	/*     Update the derivative information. */
+	++(*nint);
+	dibp2 = dibp * dibp;
+	/*     Update f1 and f2. */
+	/*      temporarily set f1 and f2 for col=0. */
+	f1 += dt * f2 + dibp2 - *theta * dibp * zibp;
+	f2 -= *theta * dibp2;
+	if (*col > 0) {
+		/*                update c = c + dt*p. */
+		daxpy(&col2, &dt, &p[1], &c__1, &c[1], &c__1);
+		/*         choose wbp, */
+		/*         the row of W corresponding to the breakpoint encountered. */
+		pointr = *head;
+		for (j = 1; j <= *col; ++j) {
+			wbp[j] = wy[ibp + pointr * wy_dim1];
+			wbp[*col + j] = *theta * ws[ibp + pointr * ws_dim1];
+			pointr = pointr % m + 1;
+		}
+		/*         compute (wbp)Mc, (wbp)Mp, and (wbp)M(wbp)'. */
+		bmv(m, &sy[sy_offset], &wt[wt_offset], col, &wbp[1], &v[1], info);
+		if (*info != 0) {
+			return;
+		}
+		wmc = ddot(&col2,  &c[1], &c__1, &v[1], &c__1);
+		wmp = ddot(&col2,  &p[1], &c__1, &v[1], &c__1);
+		wmw = ddot(&col2,&wbp[1], &c__1, &v[1], &c__1);
+		/*         update p = p - dibp*wbp. */
+		d__1 = -dibp;
+		daxpy(&col2, &d__1, &wbp[1], &c__1, &p[1], &c__1);
+		/*         complete updating f1 and f2 while col > 0. */
+		f1 += dibp * wmc;
+		f2 += (2. * dibp * wmp - dibp2 * wmw);
+	}
+	if(f2 < (d__1 = *epsmch * f2_org__)) f2 = d__1;
+	if (nleft > 0) {
+		dtm = -f1 / f2;
+		goto L777;
+		/*           to repeat the loop for unsearched intervals. */
+	} else if (bnded) {
+		f1 = 0.;
+		f2 = 0.;
+		dtm = 0.;
+	} else {
+		dtm = -f1 / f2;
+	}
+	/* ------------------- the end of the loop ------------------------------- */
+	L888:
+	if (iprint >= 99) {
+		cout << "\nGCP found in this segment\n";
+		cout << "Piece    " << *nint << " f1, f2 at start point " << f1 << " " << f2 << "\n";
+		cout << "Distance to the stationary point =  " << dtm << "\n";
+	}
+
+	if (dtm <= 0.) {
+		dtm = 0.;
+	}
+	tsum += dtm;
+	/*     Move free variables (i.e., the ones w/o breakpoints) and */
+	/*     the variables whose breakpoints haven't been reached. */
+	daxpy(&n, &tsum, &d[1], &c__1, &xcp[1], &c__1);
+	L999:
+	/*     Update c = c + dtm*p = W'(x^c - x) */
+	/*     which will be used in computing r = Z'(B(x^c - x) + g). */
+	if (*col > 0) {
+		daxpy(&col2, &dtm, &p[1], &c__1, &c[1], &c__1);
+	}
+	if (iprint >= 100) {
+		cout << "Cauchy X =  ";
+		for(i = 1; i <= n; i++) cout << xcp[i] << " ";
+		cout << "\n";
+	}
+
+	if (iprint >= 99)
+		cout << "\n---------------- exit CAUCHY----------------------\n\n";
+	return;
+} /* cauchy */
+/* ====================== The end of cauchy ============================== */
+
+void freev(int n, int *nfree, int *indx,
+		int *nenter, int *ileave, int *indx2, int *iwhere,
+		int *wrk, int *updatd, int *cnstnd, int iprint,
+		int *iter)
+{
+	/*    ************
+
+	 Subroutine freev
+
+	 This subroutine counts the entering and leaving variables when
+	 iter > 0, and finds the index set of free and active variables
+	 at the GCP.
+
+	 n is the number of variables; indx, indx2 and iwhere are
+	 indexed 1..n below.
+
+	 nfree is an int variable.
+	 On entry nfree is the number of free variables listed in indx.
+	 On exit nfree is the number of variables i with iwhere(i) <= 0.
+
+	 nenter and ileave are int variables, set on exit:
+	 nenter is the number of variables entering the free set, and
+	 n + 1 - ileave is the number of variables leaving it.
+
+	 iwhere is an int array of dimension n, only read here.
+	 iwhere(i) <= 0 is treated as "variable i is free",
+	 iwhere(i) >  0 as "variable i is active".
+
+	 wrk is an int flag, set on exit: nonzero if any variable left or
+	 entered the free set, or if updatd is nonzero.
+
+	 updatd is an int flag, only read here (see wrk).
+
+	 cnstnd is an int variable indicating whether bounds are present;
+	 the entering/leaving count is skipped when it is zero.
+
+	 iprint controls what is written to cout:
+	 iprint >= 99   the number of free variables at the GCP;
+	 iprint >= 100  additionally every entering/leaving variable and
+	 one summary line with both totals.
+
+	 iter is the iteration counter, only read here.
+
+	 indx is an int array of dimension n
+	 for i=1,...,nfree, indx(i) are the indices of free variables
+	 for i=nfree+1,...,n, indx(i) are the indices of bound variables
+	 On entry after the first iteration, indx gives
+	 the free variables at the previous iteration.
+	 On exit it gives the free variables based on the determination
+	 in cauchy using the array iwhere.
+
+	 indx2 is an int array of dimension n
+	 On entry indx2 is unspecified.
+	 On exit with iter>0, indx2 indicates which variables
+	 have changed status since the previous iteration.
+	 For i= 1,...,nenter, indx2(i) have changed from bound to free.
+	 For i= ileave,...,n, indx2(i) have changed from free to bound.
+
+
+	 *     *  *
+
+	 NEOS, November 1994. (Latest revision June 1996.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+	/* Local variables */
+	int iact, i, k;
+	int nfree_prev;
+
+	/* Parameter adjustments: use 1-based subscripts below. */
+	--iwhere;
+	--indx2;
+	--indx;
+
+	/* Function Body */
+	*nenter = 0;
+	*ileave = n + 1;
+	if (*iter > 0 && *cnstnd) {/* count the entering and leaving variables. */
+		nfree_prev = *nfree;
+		/* Previously free variables that are now active leave; they are
+		   stored from the back of indx2 (positions ileave..n). */
+		for (i = 1; i <= nfree_prev; ++i) {
+			k = indx[i];
+			if (iwhere[k] > 0) {
+				--(*ileave);
+				indx2[*ileave] = k;
+				if (iprint >= 100)
+					cout << "Variable " << k << " leaves the set of free variables\n";
+			}
+			/* L20: */
+		}
+		/* Previously bound variables that are now free enter; they are
+		   stored from the front of indx2 (positions 1..nenter). */
+		for (i = nfree_prev + 1; i <= n; ++i) {
+			k = indx[i];
+			if (iwhere[k] <= 0) {
+				++(*nenter);
+				indx2[*nenter] = k;
+				if (iprint >= 100)
+					cout << "Variable " << k << " enters the set of free variables\n";
+			}
+			/* L22: */
+		}
+		/* Report the totals once, after both scans are complete (the
+		   summary used to sit inside the loop above and was repeated,
+		   with partial counts, for every previously bound variable). */
+		if (iprint >= 100)
+			cout << n + 1 - *ileave << " variables leave; " << *nenter << " variables enter\n";
+	}
+	*wrk = *ileave < n + 1 || *nenter > 0 || *updatd;
+	/*     Find the index set of free and active variables at the GCP. */
+	/*     Free indices fill indx from the front, active ones from the back. */
+	*nfree = 0;
+	iact = n + 1;
+	for (i = 1; i <= n; ++i) {
+		if (iwhere[i] <= 0) {
+			++(*nfree);
+			indx[*nfree] = i;
+		} else {
+			--iact;
+			indx[iact] = i;
+		}
+	}
+	if (iprint >= 99)
+		cout << *nfree << "  variables are free at GCP on iteration " << *iter + 1 << endl;
+	return;
+} /* freev */
+/* ======================= The end of freev ============================== */
+
+void formk(int n, int *nsub, int *ind, int * nenter, int *ileave,
+		int *indx2, int *iupdat, int * updatd, double *wn,
+		double *wn1, int m, double *ws, double *wy, double *sy,
+		double *theta, int *col, int *head, int *info)
+{
+	/*     ************
+
+	 Subroutine formk
+
+	 This subroutine forms  the LEL^T factorization of the indefinite
+
+	 matrix     K = [-D -Y'ZZ'Y/theta       L_a'-R_z'  ]
+	                [L_a -R_z         theta*S'AA'S ]
+	 where      E = [-I  0]
+	                [ 0  I]
+	 The matrix K can be shown to be equal to the matrix M^[-1]N
+	 occurring in section 5.1 of [1], as well as to the matrix
+	 Mbar^[-1] Nbar in section 5.3.
+
+	 Note on this C/C++ translation: n and m are passed by value, the
+	 other scalar "variables" below are passed through pointers, and
+	 all arrays are addressed with 1-based, column-major subscripts
+	 after the "Parameter adjustments" section.
+
+	 n is an integer variable.
+	 On entry n is the dimension of the problem.
+	 On exit n is unchanged.
+
+	 nsub is an integer variable
+	 On entry nsub is the number of subspace variables in free set.
+	 On exit nsub is not changed.
+
+	 ind is an integer array of dimension nsub.
+	 On entry ind specifies the indices of subspace variables.
+	 On exit ind is unchanged.
+	 (Note: when updatd is set the code below also reads
+	 ind(nsub+1),...,ind(n), so the array passed in must actually
+	 hold n entries.)
+
+	 nenter is an integer variable.
+	 On entry nenter is the number of variables entering the
+	 free set.
+	 On exit nenter is unchanged.
+
+	 ileave is an integer variable.
+	 On entry indx2(ileave),...,indx2(n) are the variables leaving
+	 the free set.
+	 On exit ileave is unchanged.
+
+	 indx2 is an integer array of dimension n.
+	 On entry indx2(1),...,indx2(nenter) are the variables entering
+	 the free set, while indx2(ileave),...,indx2(n) are the
+	 variables leaving the free set.
+	 On exit indx2 is unchanged.
+
+	 iupdat is an integer variable.
+	 On entry iupdat is the total number of BFGS updates made so far.
+	 On exit iupdat is unchanged.
+
+	 updatd is a logical variable (an int flag in this translation).
+	 On entry 'updatd' is true if the L-BFGS matrix is updatd.
+	 On exit 'updatd' is unchanged.
+
+	 wn is a double precision array of dimension 2m x 2m.
+	 On entry wn is unspecified.
+	 On exit the upper triangle of wn stores the LEL^T factorization
+	 of the 2*col x 2*col indefinite matrix
+	 [-D -Y'ZZ'Y/theta       L_a'-R_z'  ]
+	 [L_a -R_z         theta*S'AA'S ]
+
+	 wn1 is a double precision array of dimension 2m x 2m.
+	 On entry wn1 stores the lower triangular part of
+	 [Y' ZZ'Y    L_a'+R_z']
+	 [L_a+R_z    S'AA'S     ]
+	 in the previous iteration.
+	 On exit wn1 stores the corresponding updated matrices.
+	 The purpose of wn1 is just to store these inner products
+	 so they can be easily updated and inserted into wn.
+
+	 m is an integer variable.
+	 On entry m is the maximum number of variable metric corrections
+	 used to define the limited memory matrix.
+	 On exit m is unchanged.
+
+	 ws, wy, sy, and wtyy are double precision arrays;
+	 theta is a double precision variable;
+	 col is an integer variable;
+	 head is an integer variable.
+	 On entry they store the information defining the
+	 limited memory BFGS matrix:
+	 ws(n,m) stores S, a set of s-vectors;
+	 wy(n,m) stores Y, a set of y-vectors;
+	 sy(m,m) stores S'Y;
+	 wtyy(m,m) stores the Cholesky factorization
+	 of (theta*S'S+LD^(-1)L')
+	 theta is the scaling factor specifying B_0 = theta I;
+	 col is the number of variable metric corrections stored;
+	 head is the location of the 1st s- (or y-) vector in S (or Y).
+	 On exit they are unchanged.
+	 (wtyy is mentioned for completeness only; it is not a parameter
+	 of this routine.)
+
+	 info is an integer variable.
+	 On entry info is unspecified.
+	 On exit info =  0 for normal return;
+	 = -1 when the 1st Cholesky factorization failed;
+	 = -2 when the 2nd Cholesky factorization failed.
+
+	 Subprograms called:
+
+	 Linpack ... dcopy, dpofa, dtrsl.
+	 (ddot is called as well. All of these, and the constants c__1
+	 and c__11, are defined elsewhere in the file.)
+
+
+	 References:
+	 [1] R. H. Byrd, P. Lu, J. Nocedal and C. Zhu, ``A limited
+	 memory algorithm for bound constrained optimization'',
+	 SIAM J. Scientific Computing 16 (1995), no. 5, pp. 1190--1208.
+
+	 [2] C. Zhu, R.H. Byrd, P. Lu, J. Nocedal, ``L-BFGS-B: a
+	 limited memory FORTRAN code for solving bound constrained
+	 optimization problems'', Tech. Report, NAM-11, EECS Department,
+	 Northwestern University, 1994.
+
+	 (Postscript files of these papers are available via anonymous
+	 ftp to ece.nwu.edu in the directory pub/lbfgs/lbfgs_bcm.)
+
+	 *  *     *
+
+	 NEOS, November 1994. (Latest revision April 1997.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+	/* System generated locals */
+	int wn_dim1, wn_offset, wn1_dim1, wn1_offset, ws_dim1, ws_offset,
+	wy_dim1, wy_offset, sy_dim1, sy_offset, i__1, i__2;
+
+	/* Local variables */
+	int dend, pend;
+	int upcl;
+	double temp1, temp2, temp3, temp4;
+	int i, k;
+	int ipntr, jpntr, k1, m2, dbegin, is, js, iy, jy, pbegin, is1, js1,
+	col2;
+
+	/* Parameter adjustments: shift every array pointer so that the
+	 1-based, column-major subscripts used below, a[i + j * a_dim1],
+	 address the caller's storage. */
+	--indx2;
+	--ind;
+	sy_dim1 = m;	/* sy has leading dimension m */
+	sy_offset = 1 + sy_dim1 * 1;
+	sy -= sy_offset;
+	wy_dim1 = n;	/* wy has leading dimension n */
+	wy_offset = 1 + wy_dim1 * 1;
+	wy -= wy_offset;
+	ws_dim1 = n;	/* ws has leading dimension n */
+	ws_offset = 1 + ws_dim1 * 1;
+	ws -= ws_offset;
+	wn1_dim1 = 2 * m;	/* wn1 has leading dimension 2*m */
+	wn1_offset = 1 + wn1_dim1 * 1;
+	wn1 -= wn1_offset;
+	wn_dim1 = 2 * m;	/* wn has leading dimension 2*m */
+	wn_offset = 1 + wn_dim1 * 1;
+	wn -= wn_offset;
+
+	/* Function Body */
+
+	/*     Form the lower triangular part of */
+	/*         WN1 = [Y' ZZ'Y      L_a'+R_z'] */
+	/*               [L_a+R_z      S'AA'S   ] */
+	/*      where L_a is the strictly lower triangular part of S'AA'Y */
+	/*        R_z is the upper triangular part of S'ZZ'Y. */
+
+	if (*updatd) {
+		if (*iupdat > m) {/*        shift old part of WN1. */
+			i__1 = m - 1;
+			for (jy = 1; jy <= i__1; ++jy) {
+				js = m + jy;
+				i__2 = m - jy;
+				/* move column jy+1 of block (1,1), from its diagonal down,
+				 one position up and to the left */
+				dcopy(&i__2, &wn1[jy + 1 + (jy + 1)* wn1_dim1], &c__1,
+						&wn1[jy + jy * wn1_dim1], &c__1);
+				/* same shift for block (2,2) */
+				dcopy(&i__2, &wn1[js + 1 + (js + 1)* wn1_dim1], &c__1,
+						&wn1[js + js * wn1_dim1], &c__1);
+				i__2 = m - 1;
+				/* same shift for block (2,1), m-1 entries per column */
+				dcopy(&i__2, &wn1[m + 2 + (jy + 1) * wn1_dim1], &c__1,
+						&wn1[m + 1 + jy * wn1_dim1], &c__1);
+				/* L10: */
+			}
+		}
+		/*        put new rows in blocks (1,1), (2,1) and (2,2). */
+		pbegin = 1;	/* ind(pbegin..pend): the nsub subspace variables */
+		pend = *nsub;
+		dbegin = *nsub + 1;	/* ind(dbegin..dend): the remaining variables */
+		dend = n;
+		iy = *col;
+		is = m + *col;
+		ipntr = *head + *col - 1;	/* storage column of the newest correction pair */
+		if (ipntr > m) {
+			ipntr -= m;	/* wrap around: columns are used circularly */
+		}
+		jpntr = *head;
+		i__1 = *col;
+		for (jy = 1; jy <= i__1; ++jy) {
+			js = m + jy;
+			temp1 = 0.;
+			temp2 = 0.;
+			temp3 = 0.;
+			/*           compute element jy of row 'col' of Y'ZZ'Y */
+			for (k = pbegin; k <= pend; ++k) {
+				k1 = ind[k];
+				temp1 += wy[k1 + ipntr * wy_dim1] * wy[k1 + jpntr * wy_dim1];
+			}
+			/*           compute elements jy of row 'col' of L_a and S'AA'S */
+			for (k = dbegin; k <= dend; ++k) {
+				k1 = ind[k];
+				temp2 += ws[k1 + ipntr * ws_dim1] * ws[k1 + jpntr * ws_dim1];
+				temp3 += ws[k1 + ipntr * ws_dim1] * wy[k1 + jpntr * wy_dim1];
+			}
+			wn1[iy + jy * wn1_dim1] = temp1;
+			wn1[is + js * wn1_dim1] = temp2;
+			wn1[is + jy * wn1_dim1] = temp3;
+			jpntr = jpntr % m + 1;	/* next storage column, circularly in 1..m */
+			/* L20: */
+		}
+		/*        put new column in block (2,1). */
+		jy = *col;
+		jpntr = *head + *col - 1;
+		if (jpntr > m) {
+			jpntr -= m;
+		}
+		ipntr = *head;
+		i__1 = *col;
+		for (i = 1; i <= i__1; ++i) {
+			is = m + i;
+			temp3 = 0.;
+			/*           compute element i of column 'col' of R_z */
+			for (k = pbegin; k <= pend; ++k) {
+				k1 = ind[k];
+				temp3 += ws[k1 + ipntr * ws_dim1] * wy[k1 + jpntr * wy_dim1];
+			}
+			ipntr = ipntr % m + 1;
+			wn1[is + jy * wn1_dim1] = temp3;
+			/* L30: */
+		}
+		upcl = *col - 1;	/* row/column 'col' was just rebuilt; only older parts need correcting */
+	} else {
+		upcl = *col;
+	}
+	/*     modify the old parts in blocks (1,1) and (2,2) due to changes */
+	/*     in the set of free variables. */
+	ipntr = *head;
+	for (iy = 1; iy <= upcl; ++iy) {
+		is = m + iy;
+		jpntr = *head;
+		for (jy = 1; jy <= iy; ++jy) {
+			js = m + jy;
+			temp1 = 0.;
+			temp2 = 0.;
+			temp3 = 0.;
+			temp4 = 0.;
+			/* contributions of the variables entering the free set */
+			for (k = 1; k <= *nenter; ++k) {
+				k1 = indx2[k];
+				temp1 += wy[k1 + ipntr * wy_dim1] * wy[k1 + jpntr * wy_dim1];
+				temp2 += ws[k1 + ipntr * ws_dim1] * ws[k1 + jpntr * ws_dim1];
+			}
+			/* contributions of the variables leaving the free set */
+			for (k = *ileave; k <= n; ++k) {
+				k1 = indx2[k];
+				temp3 += wy[k1 + ipntr * wy_dim1] * wy[k1 + jpntr * wy_dim1];
+				temp4 += ws[k1 + ipntr * ws_dim1] * ws[k1 + jpntr * ws_dim1];
+			}
+			wn1[iy + jy * wn1_dim1] = wn1[iy + jy * wn1_dim1] + temp1 - temp3;
+			wn1[is + js * wn1_dim1] = wn1[is + js * wn1_dim1] - temp2 + temp4;
+			jpntr = jpntr % m + 1;
+			/* L40: */
+		}
+		ipntr = ipntr % m + 1;
+		/* L45: */
+	}
+	/*     modify the old parts in block (2,1). */
+	ipntr = *head;
+	for (is = m + 1; is <= m + upcl; ++is) {
+		jpntr = *head;
+		for (jy = 1; jy <= upcl; ++jy) {
+			temp1 = 0.;
+			temp3 = 0.;
+			for (k = 1; k <= *nenter; ++k) {
+				k1 = indx2[k];
+				temp1 += ws[k1 + ipntr * ws_dim1] * wy[k1 + jpntr * wy_dim1];
+			}
+			for (k = *ileave; k <= n; ++k) {
+				k1 = indx2[k];
+				temp3 += ws[k1 + ipntr * ws_dim1] * wy[k1 + jpntr * wy_dim1];
+			}
+			/* the sign of the correction depends on whether the entry lies
+			 on/above (is <= jy + m) or below the diagonal of block (2,1) */
+			if (is <= jy + m) {
+				wn1[is + jy * wn1_dim1] +=  temp1 - temp3;
+			} else {
+				wn1[is + jy * wn1_dim1] += -temp1 + temp3;
+			}
+			jpntr = jpntr % m + 1;
+			/* L55: */
+		}
+		ipntr = ipntr % m + 1;
+		/* L60: */
+	}
+	/*     Form the upper triangle of WN = [D+Y' ZZ'Y/theta      -L_a'+R_z' ] */
+	/*                                     [-L_a +R_z     S'AA'S*theta] */
+	m2 = m << 1;	/* 2*m, the same value as wn_dim1 */
+	i__1 = *col;
+	for (iy = 1; iy <= i__1; ++iy) {
+		is = *col + iy;	/* position inside the packed 2*col x 2*col matrix in wn */
+		is1 = m + iy;	/* position inside wn1, whose second block starts at m+1 */
+		i__2 = iy;
+		for (jy = 1; jy <= i__2; ++jy) {
+			js = *col + jy;
+			js1 = m + jy;
+			wn[jy + iy * wn_dim1] = wn1[iy + jy * wn1_dim1] / *theta;
+			wn[js + is * wn_dim1] = wn1[is1 + js1 * wn1_dim1] * *theta;
+			/* L65: */
+		}
+		i__2 = iy - 1;
+		for (jy = 1; jy <= i__2; ++jy) {
+			wn[jy + is * wn_dim1] = -wn1[is1 + jy * wn1_dim1];
+		}
+		i__2 = *col;
+		for (jy = iy; jy <= i__2; ++jy) {
+			wn[jy + is * wn_dim1] = wn1[is1 + jy * wn1_dim1];
+		}
+		wn[iy + iy * wn_dim1] += sy[iy + iy * sy_dim1];	/* add the diagonal of sy (the D term) */
+		/* L70: */
+	}
+	/*     Form the upper triangle of */
+	/*        WN= [  LL'          L^-1(-L_a'+R_z')] */
+	/*            [(-L_a +R_z)L'^-1   S'AA'S*theta  ] */
+	/*      first Cholesky factor (1,1) block of wn to get LL' */
+	/*                with L' stored in the upper triangle of wn. */
+	dpofa(&wn[wn_offset], &m2, col, info);
+	if (*info != 0) {
+		*info = -1;
+		return;
+	}
+	/*      then form L^-1(-L_a'+R_z') in the (1,2) block. */
+	/*      (one dtrsl call per column js of that block; c__11 is the */
+	/*      dtrsl job code, defined elsewhere in the file) */
+	col2 = *col << 1;
+	for (js = *col + 1; js <= col2; ++js) {
+		dtrsl(&wn[wn_offset], &m2, col,
+				&wn[js * wn_dim1 + 1], &c__11, info);
+	}
+	/*     Form S'AA'S*theta + (L^-1(-L_a'+R_z'))'L^-1(-L_a'+R_z') in the */
+	/*      upper triangle of (2,2) block of wn. */
+	for (is = *col + 1; is <= col2; ++is) {
+		for (js = is; js <= col2; ++js) {
+			wn[is + js * wn_dim1] +=
+					ddot(col, &wn[is * wn_dim1 + 1], &c__1,
+							&wn[js * wn_dim1 + 1], &c__1);
+		}
+		/* L72: */
+	}
+	/*     Cholesky factorization of (2,2) block of wn. */
+	dpofa(&wn[*col + 1 + (*col + 1) * wn_dim1], &m2, col, info);
+	if (*info != 0) {
+		*info = -2;
+		return;
+	}
+	return;
+} /* formk */
+/* ======================= The end of formk ============================== */
+
+void cmprlb(int n, int m, double *x,
+		double *g, double *ws, double *wy, double *sy,
+		double *wt, double *z, double *r, double *wa,
+		int *indx, double *theta, int *col, int *head,
+		int *nfree, int *cnstnd, int *info)
+{
+	/*    ************
+
+	 Subroutine cmprlb
+
+	 Computes r = -Z'B(xcp-xk) - Z'g, using wa(2m+1) = W'(xcp-x)
+	 as left behind by subroutine cauchy.
+
+	 Two cases are handled:
+	 - no bounds (cnstnd is zero) and col > 0:
+	 r(i) = -g(i) for i = 1,...,n, and nothing else is touched;
+	 - otherwise, for each of the nfree free variables k = indx(i):
+	 r(i) = -theta*(z(k) - x(k)) - g(k), followed by one
+	 correction term per stored pair, built from the columns of
+	 wy and ws and from the vector that bmv writes into
+	 wa(1),...,wa(2*col).
+
+	 ws and wy are n x m, column-major; head is the (1-based) column
+	 of the oldest stored pair and columns are visited circularly.
+
+	 info is passed to bmv; if bmv leaves it nonzero it is replaced
+	 by -8 and the routine returns immediately.
+
+	 Subprograms called:
+
+	 L-BFGS-B Library ... bmv.
+
+	 NEOS, November 1994. (Latest revision June 1996.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+	/* All arrays are indexed 0-based here; the values stored in indx */
+	/* and head remain 1-based and are converted where they are used. */
+	const int ncorr = *col;
+	int row, pair, storcol;
+
+	if (!(*cnstnd) && ncorr > 0) {
+		/* Unconstrained problem: the reduced gradient is simply -g. */
+		for (row = 0; row < n; ++row)
+			r[row] = -g[row];
+		return;
+	}
+
+	const int nfr = *nfree;
+
+	/* Contribution of the scaled identity part of B. */
+	for (row = 0; row < nfr; ++row) {
+		const int var = indx[row] - 1;
+		r[row] = -(*theta) * (z[var] - x[var]) - g[var];
+	}
+
+	/* Apply the middle matrix: input wa(2m+1..), output wa(1..). */
+	bmv(m, sy, wt, col, wa + 2 * m, wa, info);
+	if (*info != 0) {
+		*info = -8;
+		return;
+	}
+
+	/* Add the limited-memory correction, one stored pair at a time. */
+	storcol = *head;
+	for (pair = 0; pair < ncorr; ++pair) {
+		const double coef_y = wa[pair];
+		const double coef_s = *theta * wa[ncorr + pair];
+		const double *ycol = wy + (storcol - 1) * n;
+		const double *scol = ws + (storcol - 1) * n;
+		for (row = 0; row < nfr; ++row) {
+			const int var = indx[row] - 1;
+			r[row] += ycol[var] * coef_y + scol[var] * coef_s;
+		}
+		storcol = storcol % m + 1;
+	}
+	return;
+} /* cmprlb */
+/* ======================= The end of cmprlb ============================= */
+
+
+void subsm(int n, int m, int *nsub, int *ind,
+		double *l, double *u, int *nbd, double *x,
+		double *d, double *ws, double *wy, double *theta,
+		int *col, int *head, int *iword, double *wv,
+		double *wn, int iprint, int *info)
+{
+	/*    ************
+
+	 Subroutine subsm
+
+	 Given xcp, l, u, r, an index set that specifies
+	 the active set at xcp, and an l-BFGS matrix B
+	 (in terms of WY, WS, SY, WT, head, col, and theta),
+	 this subroutine computes an approximate solution
+	 of the subspace problem
+
+	 (P)   min Q(x) = r'(x-xcp) + 1/2 (x-xcp)' B (x-xcp)
+
+	 subject to l <= x <= u
+	 x_i = xcp_i   for all i in A(xcp)
+
+	 along the subspace unconstrained Newton direction
+
+	 d = -(Z'BZ)^(-1) r.
+
+	 The formula for the Newton direction, given the L-BFGS matrix
+	 and the Sherman-Morrison formula, is
+
+	 d = (1/theta)r + (1/theta**2) Z'WK^(-1)W'Z r.
+
+	 where
+	 K = [-D -Y'ZZ'Y/theta        L_a'-R_z'  ]
+	     [L_a -R_z          theta*S'AA'S ]
+
+	 Note that this procedure for computing d differs
+	 from that described in [1]. One can show that the matrix K is
+	 equal to the matrix M^[-1]N in that paper.
+
+	 n is an integer variable.
+	 On entry n is the dimension of the problem.
+	 On exit n is unchanged.
+
+	 m is an integer variable.
+	 On entry m is the maximum number of variable metric corrections
+	 used to define the limited memory matrix.
+	 On exit m is unchanged.
+
+	 nsub is an integer variable.
+	 On entry nsub is the number of free variables.
+	 On exit nsub is unchanged.
+	 If nsub <= 0 the routine returns immediately without touching
+	 any output argument (iword and info included).
+
+	 ind is an integer array of dimension nsub.
+	 On entry ind specifies the coordinate indices of free variables.
+	 On exit ind is unchanged.
+
+	 l is a double precision array of dimension n.
+	 On entry l is the lower bound of x.
+	 On exit l is unchanged.
+
+	 u is a double precision array of dimension n.
+	 On entry u is the upper bound of x.
+	 On exit u is unchanged.
+
+	 nbd is an integer array of dimension n.
+	 On entry nbd represents the type of bounds imposed on the
+	 variables, and must be specified as follows:
+	 nbd(i)=0 if x(i) is unbounded,
+	 1 if x(i) has only a lower bound,
+	 2 if x(i) has both lower and upper bounds, and
+	 3 if x(i) has only an upper bound.
+	 On exit nbd is unchanged.
+
+	 x is a double precision array of dimension n.
+	 On entry x specifies the Cauchy point xcp.
+	 On exit x(i) is the minimizer of Q over the subspace of
+	 free variables.
+
+	 d is a double precision array of dimension n.
+	 On entry d is the reduced gradient of Q at xcp.
+	 On exit d is the Newton direction of Q.
+	 (Only d(1),...,d(nsub) are used; d(i) belongs to variable ind(i).)
+
+	 ws and wy are double precision arrays;
+	 theta is a double precision variable;
+	 col is an integer variable;
+	 head is an integer variable.
+	 On entry they store the information defining the
+	 limited memory BFGS matrix:
+	 ws(n,m) stores S, a set of s-vectors;
+	 wy(n,m) stores Y, a set of y-vectors;
+	 theta is the scaling factor specifying B_0 = theta I;
+	 col is the number of variable metric corrections stored;
+	 head is the location of the 1st s- (or y-) vector in S (or Y).
+	 On exit they are unchanged.
+
+	 iword is an integer variable.
+	 On entry iword is unspecified.
+	 On exit iword specifies the status of the subspace solution.
+	 iword = 0 if the solution is in the box,
+	 1 if some bound is encountered.
+
+	 wv is a double precision working array of dimension 2m.
+
+	 wn is a double precision array of dimension 2m x 2m.
+	 On entry the upper triangle of wn stores the LEL^T factorization
+	 of the indefinite matrix
+
+	 K = [-D -Y'ZZ'Y/theta     L_a'-R_z'  ]
+	     [L_a -R_z           theta*S'AA'S ]
+	 where E = [-I  0]
+	           [ 0  I]
+	 On exit wn is unchanged.
+
+	 iprint is an INTEGER variable that must be set by the user.
+	 It controls the frequency and type of output generated:
+	 iprint<0    no output is generated;
+	 iprint=0    print only one line at the last iteration;
+	 0<iprint<99 print also f and |proj g| every iprint iterations;
+	 iprint=99   print details of every iteration except n-vectors;
+	 iprint=100  print also the changes of active set and final x;
+	 iprint>100  print details of every iteration including x and g;
+	 When iprint > 0, the file iterate.dat will be created to
+	 summarize the iteration.
+	 (iprint is not read anywhere in the body of this translation.)
+
+	 info is an integer variable.
+	 On entry info is unspecified.
+	 On exit info = 0       for normal return,
+	 = nonzero for abnormal return
+	 when the matrix K is ill-conditioned.
+	 (info is whatever dtrsl reports; on a nonzero value the routine
+	 returns before x and iword are updated.)
+
+	 Subprograms called:
+
+	 Linpack dtrsl.
+
+
+	 References:
+
+	 [1] R. H. Byrd, P. Lu, J. Nocedal and C. Zhu, ``A limited
+	 memory algorithm for bound constrained optimization'',
+	 SIAM J. Scientific Computing 16 (1995), no. 5, pp. 1190--1208.
+
+
+
+	 *  *  *
+
+	 NEOS, November 1994. (Latest revision June 1996.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+	/* System generated locals */
+	int ws_offset, wn_dim1, wn_offset;
+
+	/* Local variables */
+	double alpha, dk, temp1, temp2;
+	int i, j, k, m2, js, jy, pointr, ibd = 0, col2, ns;
+
+	/* Parameter adjustments: shift the pointers so that the 1-based
+	 subscripts used below address the caller's arrays. */
+	--d;
+	--u;
+	--l;
+	--x;
+	--ind;
+	--nbd;
+	--wv;
+	wn_dim1 = 2 * m;
+	wn_offset = 1 + wn_dim1 * 1;
+	wn -= wn_offset;
+	/* ws[] and wy[] are both  [n x m ] :*/
+	ws_offset = 1 + n * 1;
+	ws -= ws_offset;
+	wy -= ws_offset;	/* same shape as ws, so the same offset applies */
+
+	ns = *nsub;
+	if (ns <= 0)
+		return;	/* no free variables: nothing to do */
+
+	/*     Compute wv = W'Zd. */
+	pointr = *head;
+	for (i = 1; i <= *col; ++i) {
+		temp1 = 0.;
+		temp2 = 0.;
+		for (j = 1; j <= ns; ++j) {
+			k = ind[j];
+			temp1 += wy[k + pointr * n] * d[j];
+			temp2 += ws[k + pointr * n] * d[j];
+		}
+		wv[i] = temp1;
+		wv[*col + i] = *theta * temp2;
+		pointr = pointr % m + 1;	/* next storage column, circularly in 1..m */
+		/* L20: */
+	}
+	/*     Compute wv:=K^(-1)wv. */
+	/*     (two dtrsl solves with the factor stored in wn, using job codes */
+	/*     c__11 and c__1 defined elsewhere in the file; the first col */
+	/*     entries of wv change sign in between) */
+	m2 = m << 1;
+	col2 = *col << 1;
+	dtrsl(&wn[wn_offset], &m2, &col2, &wv[1], &c__11, info);
+	if (*info != 0) {
+		return;
+	}
+	for (i = 1; i <= *col; ++i)
+		wv[i] = -wv[i];
+
+	dtrsl(&wn[wn_offset], &m2, &col2, &wv[1], &c__1, info);
+	if (*info != 0) {
+		return;
+	}
+	/*     Compute d = (1/theta)d + (1/theta**2)Z'W wv. */
+	/*     (the common factor 1/theta is applied in the loop after this one) */
+	pointr = *head;
+	for (jy = 1; jy <= *col; ++jy) {
+		js = *col + jy;
+		for (i = 1; i <= ns; ++i) {
+			k = ind[i];
+			d[i] += (wy[k + pointr * n] * wv[jy] / *theta +
+					ws[k + pointr * n] * wv[js]);
+		}
+		pointr = pointr % m + 1;
+		/* L40: */
+	}
+
+	for (i = 1; i <= ns; ++i)
+		d[i] /= *theta;
+
+	/*     Backtrack to the feasible region. */
+	/*     alpha is the largest step in [0,1] found so far that keeps every */
+	/*     bounded free variable inside its bounds; ibd remembers the */
+	/*     position (in ind) of the variable that last reduced it. */
+	/*     temp1 is deliberately not reset inside the loop. */
+	alpha = 1.;
+	temp1 = alpha;
+	for (i = 1; i <= ns; ++i) {
+		k = ind[i];
+		dk = d[i];
+		if (nbd[k] != 0) {
+			if (dk < 0. && nbd[k] <= 2) {	/* moving down, lower bound present */
+				temp2 = l[k] - x[k];
+				if (temp2 >= 0.) {
+					temp1 = 0.;
+				} else if (dk * alpha < temp2) {
+					temp1 = temp2 / dk;
+				}
+			} else if (dk > 0. && nbd[k] >= 2) {	/* moving up, upper bound present */
+				temp2 = u[k] - x[k];
+				if (temp2 <= 0.) {
+					temp1 = 0.;
+				} else if (dk * alpha > temp2) {
+					temp1 = temp2 / dk;
+				}
+			}
+			if (temp1 < alpha) {
+				alpha = temp1;
+				ibd = i;
+			}
+		}
+		/* L60: */
+	}
+	if (alpha < 1.) {
+		/* Put the blocking variable exactly on its bound and zero its */
+		/* direction component, so the update below leaves it there. */
+		dk = d[ibd];
+		k = ind[ibd];
+		if (dk > 0.) {
+			x[k] = u[k];
+			d[ibd] = 0.;
+		} else if (dk < 0.) {
+			x[k] = l[k];
+			d[ibd] = 0.;
+		}
+	}
+	for (i = 1; i <= ns; ++i)
+		x[ind[i]] += alpha * d[i];
+
+	*iword = (alpha < 1.) ? 1 : 0;
+
+	return;
+} /* subsm */
+/* ====================== The end of subsm =============================== */
+
+void lnsrlb(int n, double *l, double *u,
+		int *nbd, double *x, double *f, double *fold,
+		double *gd, double *gdold, double *g, double *d,
+		double *r, double *t, double *z, double *stp,
+		double *dnorm, double *dtd, double *xstep,
+		double *stpmx, int *iter, int *ifun, int *iback, int *nfgv,
+		int *info, char *task, int *boxed, int *cnstnd,
+		char *csave, int *isave, double *dsave)
+{
+	/*     **********
+
+	 Subroutine lnsrlb
+
+	 This subroutine calls subroutine dcsrch from the Minpack2 library
+	 to perform the line search.  Subroutine dcsrch is safeguarded so
+	 that all trial points lie within the feasible region.
+
+	 The routine is re-entrant through task:
+	 - if task does not start with "FG_LN" on entry, a new line search
+	 is set up: dtd = d'd, dnorm = sqrt(dtd), the maximum step stpmx
+	 and the initial step stp are chosen, x is saved in t, g is saved
+	 in r, fold = f, ifun = iback = 0 and csave = "START";
+	 - if task starts with "FG_LN", that set-up is skipped and the
+	 search is resumed with the current f and g.
+
+	 On exit:
+	 - info = -4 (and nothing else is done) if, at the first
+	 evaluation of a search (ifun == 0), the directional derivative
+	 gd = g'd is >= 0;
+	 - otherwise xstep = stp*dnorm, and
+	 task = "FG_LNSRCH" when csave starts with neither "CONV" nor
+	 "WARN": ifun and nfgv are incremented, iback = ifun - 1 and
+	 x is set to the next trial point (z if stp == 1, else
+	 stp*d + t);
+	 task = "NEW_X" when csave starts with "CONV" or "WARN".
+
+	 nbd(i) encodes the bounds of x(i) as tested below:
+	 0 none, 1 lower only, 2 both, 3 upper only.
+
+	 csave, isave and dsave are passed through to dcsrch unchanged
+	 (presumably its saved state between calls - confirm in dcsrch).
+
+	 Subprograms called:
+
+	 Minpack2 Library ... dcsrch.
+
+	 Linpack ... dtrsl, ddot.
+	 (dcopy is called as well; dtrsl is not called in this body.)
+
+
+	 *     *  *
+
+	 NEOS, November 1994. (Latest revision June 1996.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 **********
+	 */
+
+	/* For dcsrch(): fixed tolerances and minimum step handed to the
+	 line search on every call. */
+	const double stpmin = 0.;
+	const double ftol = .001;
+	const double gtol = .9;
+	const double xtol = .1;
+
+	/* System generated locals */
+	double d1;
+
+	/* Local variables */
+	int i;
+	double a1, a2;
+
+	/* Parameter adjustments: use 1-based subscripts below. */
+	--z;
+	--t;
+	--r;
+	--d;
+	--g;
+	--x;
+	--nbd;
+	--u;
+	--l;
+
+	/* Function Body */
+	if (strncmp(task, "FG_LN", 5) == 0) {
+		goto L556;	/* resuming a search already in progress */
+	}
+	*dtd = ddot(&n, &d[1], &c__1, &d[1], &c__1);
+	*dnorm = sqrt(*dtd);
+	/*     Determine the maximum step length. */
+	*stpmx = 1e10;
+	if (*cnstnd) {
+		if (*iter == 0) {
+			*stpmx = 1.;
+		} else {
+			/* shrink stpmx until x + stpmx*d violates no bound */
+			for (i = 1; i <= n; ++i) {
+				a1 = d[i];
+				if (nbd[i] != 0) {
+					if (a1 < 0. && nbd[i] <= 2) {	/* moving down, lower bound present */
+						a2 = l[i] - x[i];
+						if (a2 >= 0.) {
+							*stpmx = 0.;
+						} else if (a1 * *stpmx < a2) {
+							*stpmx = a2 / a1;
+						}
+					} else if (a1 > 0. && nbd[i] >= 2) {	/* moving up, upper bound present */
+						a2 = u[i] - x[i];
+						if (a2 <= 0.) {
+							*stpmx = 0.;
+						} else if (a1 * *stpmx > a2) {
+							*stpmx = a2 / a1;
+						}
+					}
+				}
+				/* L43: */
+			}
+		}
+	}
+	/* Initial trial step: 1/|d| (capped by stpmx) on the very first
+	 iteration when boxed is zero, otherwise the unit step. */
+	if (*iter == 0 && ! (*boxed)) {
+		d1 = 1. / *dnorm;
+		*stp = min(d1,*stpmx);
+	} else {
+		*stp = 1.;
+	}
+	/* save the starting point and gradient (x -> t, g -> r), assuming
+	 dcopy has BLAS semantics - it is defined elsewhere in the file */
+	dcopy(&n, &x[1], &c__1, &t[1], &c__1);
+	dcopy(&n, &g[1], &c__1, &r[1], &c__1);
+	*fold = *f;
+	*ifun = 0;
+	*iback = 0;
+	strcpy(csave, "START");
+	L556:
+	*gd = ddot(&n, &g[1], &c__1, &d[1], &c__1);
+	if (*ifun == 0) {
+		*gdold = *gd;
+		if (*gd >= 0.) {
+			/*                 the directional derivative >=0. */
+			/*                 Line search is impossible. */
+			*info = -4;
+			return;
+		}
+	}
+	dcsrch(f, gd, stp,
+			ftol, gtol, xtol,
+			stpmin, *stpmx,
+			csave, isave, dsave);
+	*xstep = *stp * *dnorm;
+	if (strncmp(csave, "CONV", 4) != 0 && strncmp(csave, "WARN", 4) != 0) {
+		/* search not finished: request f and g at the next trial point */
+		strcpy(task, "FG_LNSRCH");
+		++(*ifun);
+		++(*nfgv);
+		*iback = *ifun - 1;
+		if (*stp == 1.) {
+			dcopy(&n, &z[1], &c__1, &x[1], &c__1);
+		} else {
+			for (i = 1; i <= n; ++i) {
+				x[i] = *stp * d[i] + t[i];
+			}
+		}
+	} else {
+		strcpy(task, "NEW_X");
+	}
+	return;
+} /* lnsrlb */
+/* ======================= The end of lnsrlb ============================= */
+
+void matupd(int n, int m, double *ws,
+		double *wy, double *sy, double *ss, double *d,
+		double *r, int *itail, int *iupdat, int *col,
+		int *head, double *theta, double *rr, double *dr,
+		double *stp, double *dtd)
+{
+	/*    ************
+
+	 Subroutine matupd
+
+	 This subroutine updates matrices WS and WY, and forms the
+	 middle matrix in B.
+
+	 n is the length of d and r and the leading dimension of ws, wy.
+	 m is the number of columns of ws and wy and the leading
+	 dimension of sy and ss.
+
+	 d and r are passed to dcopy as the sources for column itail of
+	 ws and of wy respectively (the new correction pair).
+
+	 iupdat is read only. While iupdat <= m, col is set to iupdat and
+	 itail to the column following the last one used; once
+	 iupdat > m, itail and head both advance by one, circularly
+	 in 1..m, and the previously stored entries of ss and sy are
+	 shifted by one position.
+
+	 theta is set on exit to rr/dr (yy/ys).
+
+	 On exit row col of sy and column col of ss hold the products of
+	 d with the stored columns of wy and ws; sy(col,col) = dr and
+	 ss(col,col) = stp*stp*dtd (just dtd when stp == 1).
+
+	 Subprograms called:
+
+	 Linpack ... dcopy, ddot.
+
+
+	 *     *  *
+
+	 NEOS, November 1994. (Latest revision June 1996.)
+	 Optimization Technology Center.
+	 Argonne National Laboratory and Northwestern University.
+	 Written by
+	 Ciyou Zhu
+	 in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 ************
+	 */
+
+	/* System generated locals */
+	int ws_dim1, ws_offset, wy_dim1, wy_offset, sy_dim1, sy_offset,
+	ss_dim1, ss_offset, i__1, i__2;
+
+	/* Local variables */
+	int j;
+	int pointr;
+
+	/* Parameter adjustments: shift the pointers so that the 1-based,
+	 column-major subscripts used below address the caller's arrays. */
+	--r;
+	--d;
+	ss_dim1 = m;
+	ss_offset = 1 + ss_dim1 * 1;
+	ss -= ss_offset;
+	sy_dim1 = m;
+	sy_offset = 1 + sy_dim1 * 1;
+	sy -= sy_offset;
+	wy_dim1 = n;
+	wy_offset = 1 + wy_dim1 * 1;
+	wy -= wy_offset;
+	ws_dim1 = n;
+	ws_offset = 1 + ws_dim1 * 1;
+	ws -= ws_offset;
+
+	/* Function Body */
+
+	/*     Set pointers for matrices WS and WY. */
+	if (*iupdat <= m) {
+		/* storage not yet full: one more column is in use */
+		*col = *iupdat;
+		*itail = (*head + *iupdat - 2) % m + 1;
+	} else {
+		/* storage full: advance both ends of the circular window */
+		*itail = *itail % m + 1;
+		*head = *head % m + 1;
+	}
+	/*     Update matrices WS and WY. */
+	dcopy(&n, &d[1], &c__1, &ws[*itail * ws_dim1 + 1], &c__1);
+	dcopy(&n, &r[1], &c__1, &wy[*itail * wy_dim1 + 1], &c__1);
+	/*     Set theta=yy/ys. */
+	*theta = *rr / *dr;
+	/*     Form the middle matrix in B. */
+	/*      update the upper triangle of SS, */
+	/*                       and the lower triangle of SY: */
+	if (*iupdat > m) {
+		/*                move old information */
+		i__1 = *col - 1;
+		for (j = 1; j <= i__1; ++j) {
+			/* first j entries of column j+1 of ss, starting at row 2,
+			 go to column j, starting at row 1 */
+			dcopy(&j, &ss[(j + 1) * ss_dim1 + 2], &c__1,
+					&ss[j * ss_dim1 + 1], &c__1);
+			i__2 = *col - j;
+			/* column j+1 of sy, from its diagonal down, goes to
+			 column j, from its diagonal down */
+			dcopy(&i__2, &sy[j + 1 + (j + 1) * sy_dim1], &c__1,
+					&sy[j + j * sy_dim1], &c__1);
+			/* L50: */
+		}
+	}
+	/*      add new information: the last row of SY */
+	/*                           and the last column of SS: */
+	pointr = *head;
+	i__1 = *col - 1;
+	for (j = 1; j <= i__1; ++j) {
+		sy[*col + j * sy_dim1] =
+				ddot(&n, &d[1], &c__1, &wy[pointr * wy_dim1 + 1], &c__1);
+		ss[j + *col * ss_dim1] =
+				ddot(&n, &ws[pointr * ws_dim1 + 1], &c__1, &d[1], &c__1);
+		pointr = pointr % m + 1;	/* next storage column, circularly in 1..m */
+		/* L51: */
+	}
+	if (*stp == 1.) {
+		ss[*col + *col * ss_dim1] = *dtd;
+	} else {
+		ss[*col + *col * ss_dim1] = *stp * *stp * *dtd;
+	}
+	sy[*col + *col * sy_dim1] = *dr;
+	return;
+} /* matupd */
+/* ======================= The end of matupd ============================= */
+
+void formt(int m, double *wt, double *sy, double *ss,
+		int *col, double *theta, int *info)
+{
+	/*
+	 * formt
+	 *
+	 * Builds the upper triangle of T = theta*SS + L*D^(-1)*L' in wt and
+	 * then hands wt to dpofa so that it is replaced by the Cholesky
+	 * factor (T = J*J', with J' kept in the upper triangle of wt).
+	 *
+	 * m      row stride shared by wt, sy and ss (column-major storage).
+	 * wt     output; rows/columns 1..*col of its upper triangle are written.
+	 * sy     input; entries below the diagonal act as L, the diagonal as D.
+	 * ss     input; its upper triangle is read.
+	 * col    order of the leading block that is processed.
+	 * theta  scalar multiplying ss.
+	 * info   status written by dpofa; any nonzero status is replaced by -3.
+	 *
+	 * Origin: NEOS, November 1994 (latest revision June 1996),
+	 * Optimization Technology Center, Argonne National Laboratory and
+	 * Northwestern University.  Written by Ciyou Zhu in collaboration
+	 * with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+	 */
+	const int order = *col;
+	int row, cidx, k;
+	double acc;
+
+	/* Row 1 of T has no L*D^(-1)*L' contribution. */
+	for (cidx = 0; cidx < order; ++cidx) {
+		wt[cidx * m] = *theta * ss[cidx * m];
+	}
+
+	/* Remaining rows: entry (row, cidx) with cidx >= row, 0-based. */
+	for (row = 1; row < order; ++row) {
+		for (cidx = row; cidx < order; ++cidx) {
+			acc = 0.;
+			for (k = 0; k < row; ++k) {
+				acc += sy[row + k * m] * sy[cidx + k * m] / sy[k + k * m];
+			}
+			wt[row + cidx * m] = acc + *theta * ss[row + cidx * m];
+		}
+	}
+
+	/* Factorize in place; every failure is reported as -3. */
+	dpofa(wt, &m, col, info);
+	if (*info != 0) {
+		*info = -3;
+	}
+	return;
+} /* formt */
+
+/* ======================= The end of formt ============================== */
+
+void bmv(int m, double *sy, double *wt,
+		int *col, double *v, double *p, int *info)
+{
+	/*     ************
+
+	 *     Subroutine bmv
+
+	 *     This subroutine computes the product of the 2m x 2m middle matrix
+	 *     in the compact L-BFGS formula of B and a 2m vector v;
+	 *     it returns the product in p.
+
+	 *     m is an integer variable.
+	 *     On entry m is the maximum number of variable metric corrections
+	 *       used to define the limited memory matrix; it is also used as
+	 *       the row stride of sy and wt.
+	 *     On exit m is unchanged.
+
+	 *     sy is a double precision array of dimension m x m.
+	 *     On entry sy specifies the matrix S'Y.
+	 *     On exit sy is unchanged.
+
+	 *     wt is a double precision array of dimension m x m.
+	 *     On entry wt specifies the upper triangular matrix J' which is
+	 *       the Cholesky factor of (thetaS'S+LD^(-1)L').
+	 *     On exit wt is unchanged.
+
+	 *     col is an integer variable.
+	 *     On entry col specifies the number of s-vectors (or y-vectors)
+	 *       stored in the compact L-BFGS formula.
+	 *     On exit col is unchanged.
+
+	 *     v is a double precision array of dimension 2col.
+	 *     On entry v specifies vector v.
+	 *     On exit v is unchanged.
+
+	 *     p is a double precision array of dimension 2col.
+	 *     On entry p is unspecified.
+	 *     On exit p is the product Mv.
+	 *     p is not written at all when col is 0.
+
+	 *     info is an integer variable.
+	 *     On entry info is unspecified.
+	 *     On exit info = 0    for normal return,
+	 *              = nonzero for abnormal return when the system
+	 *                  to be solved by dtrsl is singular.
+	 *     info is only handed to dtrsl; bmv itself never assigns it, so
+	 *       it keeps its entry value when col is 0.
+
+	 *     Subprograms called:
+
+	 *     Linpack ... dtrsl.
+
+
+	 *                 *    *  *
+
+	 *     NEOS, November 1994. (Latest revision June 1996.)
+	 *     Optimization Technology Center.
+	 *     Argonne National Laboratory and Northwestern University.
+	 *     Written by
+	 *              Ciyou Zhu
+	 *     in collaboration with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+
+	 *     ************
+	 */
+
+	/* System generated locals */
+	int sy_dim1, sy_offset, wt_dim1, wt_offset, Col;
+
+
+	/* Local variables */
+	int i, k;
+	int i2;
+	double sum;
+
+	/* Parameter adjustments: shift the pointers so that the Fortran-style
+	 * 1-based subscripts used below address the caller's arrays. */
+	wt_dim1 = m;
+	wt_offset = 1 + wt_dim1 * 1;
+	wt -= wt_offset;
+	sy_dim1 = m;
+	sy_offset = 1 + sy_dim1 * 1;
+	sy -= sy_offset;
+	--p;
+	--v;
+
+	/* Function Body */
+	/* Nothing to do without stored corrections; p and info stay untouched. */
+	if (*col == 0) {
+		return;
+	}
+	/*    PART I: solve [     D^(1/2)      O ] [ p1 ] = [ v1 ]
+	 *              [ -L*D^(-1/2)   J ] [ p2 ]   [ v2 ].
+	 *    solve Jp2=v2+LD^(-1)v1.
+	 *    The right-hand side is first assembled in p(col+1 .. 2col).
+	 */
+	Col = *col;
+	p[*col + 1] = v[*col + 1];
+	for (i = 2; i <= Col; ++i) {
+		i2 = *col + i;
+		sum = 0.;
+		for (k = 1; k <= i - 1; ++k) {
+			sum += sy[i + k * sy_dim1] * v[k] / sy[k + k * sy_dim1];
+		}
+		p[i2] = v[i2] + sum;
+		/* L20: */
+	}
+	/*     Solve the triangular system in place on p(col+1 .. 2col). */
+	/*     NOTE(review): c__11 is defined outside this function; presumably */
+	/*     it is the dtrsl job code for the transposed solve - confirm. */
+	dtrsl(&wt[wt_offset], &m, col, &p[*col + 1], &c__11, info);
+	if (*info != 0) {
+		return;
+	}
+	/*     solve D^(1/2)p1=v1. */
+	for (i = 1; i <= Col; ++i) {
+		p[i] = v[i] / sqrt(sy[i + i * sy_dim1]);
+	}
+
+	/*    PART II: solve [ -D^(1/2)   D^(-1/2)*L'     ] [ p1 ] = [ p1 ]
+	 *               [  0        J'         ] [ p2 ]   [ p2 ].
+	 *    solve J^Tp2=p2.
+	 *    NOTE(review): c__1 is defined outside this function - confirm
+	 *    the job code it stands for.
+	 */
+	dtrsl(&wt[wt_offset], &m, col, &p[*col + 1], &c__1, info);
+	if (*info != 0) {
+		return;
+	}
+	/*     compute p1=-D^(-1/2)(p1-D^(-1/2)L'p2) */
+	/*           =-D^(-1/2)p1 + D^(-1)L'p2. */
+	for (i = 1; i <= Col; ++i) {
+		p[i] = -p[i] / sqrt(sy[i + i * sy_dim1]);
+	}
+	for (i = 1; i <= Col; ++i) {
+		sum = 0.;
+		for (k = i + 1; k <= Col; ++k) {
+			sum += sy[k + i * sy_dim1] * p[*col + k] / sy[i + i * sy_dim1];
+		}
+		p[i] += sum;
+		/* L60: */
+	}
+	return;
+} /* bmv */
+/* ======================== The end of bmv =============================== */
+
+void hpsolb(int n, double *t, int *iorder, int iheap)
+{
+	/*
+	 * hpsolb
+	 *
+	 * Moves the least element of t into the last slot and leaves the
+	 * other n-1 elements arranged as a min-heap in the leading slots.
+	 * iorder is permuted in step with t, so iorder[i] keeps naming the
+	 * original index of t[i].
+	 *
+	 * n       number of entries in t and iorder.
+	 * t       values to process (modified in place).
+	 * iorder  index tags travelling with the values (modified in place).
+	 * iheap   0 if t is not yet a heap and must be heapified first;
+	 *         nonzero if t already is a heap.
+	 *
+	 * Reference: Algorithm 232 of CACM (J. W. J. Williams): HEAPSORT.
+	 *
+	 * Origin: NEOS, November 1994 (latest revision June 1996),
+	 * Optimization Technology Center, Argonne National Laboratory and
+	 * Northwestern University.  Written by Ciyou Zhu in collaboration
+	 * with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+	 */
+	int hole, parent, child;
+
+	if (iheap == 0) {
+		/* Heapify: sift each element up from its own position. */
+		for (int pos = 1; pos < n; ++pos) {
+			const double key = t[pos];
+			const int tag = iorder[pos];
+			hole = pos;
+			while (hole > 0) {
+				parent = (hole - 1) / 2;
+				if (!(key < t[parent])) {
+					break;
+				}
+				t[hole] = t[parent];
+				iorder[hole] = iorder[parent];
+				hole = parent;
+			}
+			t[hole] = key;
+			iorder[hole] = tag;
+		}
+	}
+
+	/* A heap of fewer than two elements needs no extraction step. */
+	if (n <= 1) {
+		return;
+	}
+
+	/* Pull out the root and sift the former last element down from the
+	 * top, treating only the first n-1 slots as the heap. */
+	const int last = n - 1;
+	const double least = t[0];
+	const int leastTag = iorder[0];
+	const double key = t[last];
+	const int tag = iorder[last];
+
+	hole = 0;
+	for (;;) {
+		child = 2 * hole + 1;
+		if (child >= last) {
+			break;
+		}
+		if (t[child + 1] < t[child]) {
+			++child;
+		}
+		if (!(t[child] < key)) {
+			break;
+		}
+		t[hole] = t[child];
+		iorder[hole] = iorder[child];
+		hole = child;
+	}
+	t[hole] = key;
+	iorder[hole] = tag;
+
+	/* The least member ends up in the final slot. */
+	t[last] = least;
+	iorder[last] = leastTag;
+	return;
+} /* hpsolb */
+/* ====================== The end of hpsolb ============================== */
+
+void dcsrch(double *f, double *g, double *stp,
+		/*Chgd: the next five are no longer pointers:*/
+		double ftol, double gtol, double xtol,
+		double stpmin, double stpmax,
+		char *task, int *isave, double *dsave)
+{
+	/*    **********
+
+	 Subroutine dcsrch
+
+	 This subroutine finds a step that satisfies a sufficient
+	 decrease condition and a curvature condition.
+
+	 Each call of the subroutine updates an interval with
+	 endpoints stx and sty. The interval is initially chosen
+	 so that it contains a minimizer of the modified function
+
+	 psi(stp) = f(stp) - f(0) - ftol*stp*f'(0).
+
+	 If psi(stp) <= 0 and f'(stp) >= 0 for some step, then the
+	 interval is chosen so that it contains a minimizer of f.
+
+	 The algorithm is designed to find a step that satisfies
+	 the sufficient decrease condition
+
+	 f(stp) <= f(0) + ftol*stp*f'(0),
+
+	 and the curvature condition
+
+	 abs(f'(stp)) <= gtol*abs(f'(0)).
+
+	 If ftol is less than gtol and if, for example, the function
+	 is bounded below, then there is always a step which satisfies
+	 both conditions.
+
+	 If no step can be found that satisfies both conditions, then
+	 the algorithm stops with a warning. In this case stp only
+	 satisfies the sufficient decrease condition.
+
+	 A typical invocation of dcsrch has the following outline:
+
+	 task = 'START'
+	 10 continue
+	 call dcsrch( ... )
+	 if (task .eq. 'FG') then
+	 Evaluate the function and the gradient at stp
+	 goto 10
+	 end if
+
+	 NOTE: The user must not alter the work arrays between calls.
+
+	 The subroutine statement is
+
+	 subroutine dcsrch(f,g,stp,ftol,gtol,xtol,stpmin,stpmax,
+	 task,isave,dsave)
+	 where
+
+	 f is a double precision variable.
+	 On initial entry f is the value of the function at 0.
+	 On subsequent entries f is the value of the
+	 function at stp.
+	 On exit f is the value of the function at stp.
+
+	 g is a double precision variable.
+	 On initial entry g is the derivative of the function at 0.
+	 On subsequent entries g is the derivative of the
+	 function at stp.
+	 On exit g is the derivative of the function at stp.
+
+	 stp is a double precision variable.
+	 On entry stp is the current estimate of a satisfactory
+	 step. On initial entry, a positive initial estimate
+	 must be provided.
+	 On exit stp is the current estimate of a satisfactory step
+	 if task = 'FG'. If task = 'CONV' then stp satisfies
+	 the sufficient decrease and curvature condition.
+
+	 ftol is a double precision variable.
+	 On entry ftol specifies a nonnegative tolerance for the
+	 sufficient decrease condition.
+	 On exit ftol is unchanged.
+
+	 gtol is a double precision variable.
+	 On entry gtol specifies a nonnegative tolerance for the
+	 curvature condition.
+	 On exit gtol is unchanged.
+
+	 xtol is a double precision variable.
+	 On entry xtol specifies a nonnegative relative tolerance
+	 for an acceptable step. The subroutine exits with a
+	 warning if the relative difference between sty and stx
+	 is less than xtol.
+	 On exit xtol is unchanged.
+
+	 stpmin is a double precision variable.
+	 On entry stpmin is a nonnegative lower bound for the step.
+	 On exit stpmin is unchanged.
+
+	 stpmax is a double precision variable.
+	 On entry stpmax is a nonnegative upper bound for the step.
+	 On exit stpmax is unchanged.
+
+	 task is a character variable of length at least 60.
+	 On initial entry task must be set to 'START'.
+	 On exit task indicates the required action:
+
+	 If task(1:2) = 'FG' then evaluate the function and
+	 derivative at stp and call dcsrch again.
+
+	 If task(1:4) = 'CONV' then the search is successful.
+
+	 If task(1:4) = 'WARN' then the subroutine is not able
+	 to satisfy the convergence conditions. The exit value of
+	 stp contains the best point found during the search.
+
+	 If task(1:5) = 'ERROR' then there is an error in the
+	 input arguments.
+
+	 On exit with convergence, a warning or an error, the
+	 variable task contains additional information.
+
+	 isave is an integer work array of dimension 2.
+
+	 dsave is a double precision work array of dimension 13.
+
+	 isave and dsave carry the search state from one call to the
+	 next. They are rewritten on every return except the early
+	 return taken when a 'START' call fails the input checks.
+
+	 Subprograms called
+
+	 MINPACK-2 ... dcstep
+
+
+	 MINPACK-1 Project. June 1983.
+	 Argonne National Laboratory.
+	 Jorge J. More' and David J. Thuente.
+
+	 MINPACK-2 Project. October 1993.
+	 Argonne National Laboratory and University of Minnesota.
+	 Brett M. Averick, Richard G. Carter, and Jorge J. More'.
+
+	 **********
+	 */
+
+	/* Local variables */
+	int stage;
+	double finit, ginit, width, ftest, gtest, stmin, stmax, width1, fm,
+	gm, fx, fy, gx, gy;
+	int brackt;
+	double fxm, fym, gxm, gym, stx, sty;
+
+	/* Parameter adjustments: make isave and dsave 1-based. */
+	--dsave;
+	--isave;
+
+	/* Function Body */
+
+	/*     Initialization block. */
+	if (strncmp(task, "START", 5) == 0) {
+		/*      Check the input arguments for errors. Each failing check */
+		/*      overwrites task, so the last failing check wins. */
+		if (*stp < stpmin)    strcpy(task, "ERROR: STP .LT. STPMIN");
+		if (*stp > stpmax)    strcpy(task, "ERROR: STP .GT. STPMAX");
+		if (*g >= 0.)        strcpy(task, "ERROR: INITIAL G .GE. ZERO");
+		if (ftol < 0.)        strcpy(task, "ERROR: FTOL .LT. ZERO");
+		if (gtol < 0.)        strcpy(task, "ERROR: GTOL .LT. ZERO");
+		if (xtol < 0.)        strcpy(task, "ERROR: XTOL .LT. ZERO");
+		if (stpmin < 0.)    strcpy(task, "ERROR: STPMIN .LT. ZERO");
+		if (stpmax < stpmin)    strcpy(task, "ERROR: STPMAX .LT. STPMIN");
+
+		/*      Exit if there are errors on input (nothing is saved). */
+		if (strncmp(task, "ERROR", 5) == 0) {
+			return;
+		}
+		/*      Initialize local variables. */
+		brackt = 0;
+		stage = 1;
+		finit = *f;
+		ginit = *g;
+		gtest = ftol * ginit;
+		width = stpmax - stpmin;
+		width1 = width / .5;	/* i.e. twice the initial width */
+		/*      The variables stx, fx, gx contain the values of the step, */
+		/*      function, and derivative at the best step. */
+		/*      The variables sty, fy, gy contain the value of the step, */
+		/*      function, and derivative at sty. */
+		/*      The variables stp, f, g contain the values of the step, */
+		/*      function, and derivative at stp. */
+		stx = 0.;    fx = finit;    gx = ginit;
+		sty = 0.;    fy = finit;    gy = ginit;
+		stmin = 0.;
+		stmax = *stp + *stp * 4.;
+		strcpy(task, "FG");
+		goto L1000;
+	} else {
+		/*      Restore local variables from the work arrays. */
+		if (isave[1] == 1) {
+			brackt = 1;
+		} else {
+			brackt = 0;
+		}
+		stage = isave[2];
+		ginit = dsave[1];
+		gtest = dsave[2];
+		gx = dsave[3];
+		gy = dsave[4];
+		finit = dsave[5];
+		fx = dsave[6];
+		fy = dsave[7];
+		stx = dsave[8];
+		sty = dsave[9];
+		stmin = dsave[10];
+		stmax = dsave[11];
+		width = dsave[12];
+		width1 = dsave[13];
+	}
+	/*      If psi(stp) <= 0 and f'(stp) >= 0 for some step, then the */
+	/*      algorithm enters the second stage. */
+	ftest = finit + *stp * gtest;
+	if (stage == 1 && *f <= ftest && *g >= 0.) {
+		stage = 2;
+	}
+	/*    Test for warnings. A later test that also holds overwrites task. */
+	if (brackt && (*stp <= stmin || *stp >= stmax))
+		strcpy(task, "WARNING: ROUNDING ERRORS PREVENT PROGRESS");
+	if (brackt && stmax - stmin <= xtol * stmax)
+		strcpy(task, "WARNING: XTOL TEST SATISFIED");
+	if (*stp == stpmax && *f <= ftest && *g <= gtest)
+		strcpy(task, "WARNING: STP = STPMAX");
+	if (*stp == stpmin && (*f > ftest || *g >= gtest))
+		strcpy(task, "WARNING: STP = STPMIN");
+	/*    Test for convergence (takes precedence over any warning above). */
+	if (*f <= ftest && fabs(*g) <= gtol * (-ginit))
+		strcpy(task, "CONVERGENCE");
+	/*    Test for termination. */
+	if (strncmp(task, "WARN", 4) == 0 || strncmp(task, "CONV", 4) == 0)
+		goto L1000;
+
+	/*     A modified function is used to predict the step during the */
+	/*     first stage if a lower function value has been obtained but */
+	/*     the decrease is not sufficient. */
+	if (stage == 1 && *f <= fx && *f > ftest) {
+		/*      Define the modified function and derivative values. */
+		fm = *f - *stp * gtest;
+		fxm = fx - stx * gtest;
+		fym = fy - sty * gtest;
+		gm = *g - gtest;
+		gxm = gx - gtest;
+		gym = gy - gtest;
+		/*      Call dcstep to update stx, sty, and to compute the new step. */
+		dcstep(&stx, &fxm, &gxm, &sty, &fym, &gym, stp, &fm, &gm, &brackt, &
+				stmin, &stmax);
+		/*      Reset the function and derivative values for f. */
+		fx = fxm + stx * gtest;
+		fy = fym + sty * gtest;
+		gx = gxm + gtest;
+		gy = gym + gtest;
+	} else {
+		/*     Call dcstep to update stx, sty, and to compute the new step. */
+		dcstep(&stx, &fx, &gx, &sty, &fy, &gy, stp, f, g, &brackt, &stmin, &
+				stmax);
+	}
+	/*     Decide if a bisection step is needed. */
+	if (brackt) {
+		if (fabs(sty - stx) >= width1 * .66) {
+			*stp = stx + (sty - stx) * .5;
+		}
+		width1 = width;
+		width = fabs(sty - stx);
+	}
+	/*     Set the minimum and maximum steps allowed for stp. */
+	if (brackt) {
+		stmin = min(stx,sty);
+		stmax = max(stx,sty);
+	} else {
+		stmin = *stp + (*stp - stx) * 1.1;
+		stmax = *stp + (*stp - stx) * 4.;
+	}
+	/*     Force the step to be within the bounds stpmax and stpmin. */
+	if(*stp < stpmin) *stp = stpmin;
+	if(*stp > stpmax) *stp = stpmax;
+
+	/*     If further progress is not possible, let stp be the best */
+	/*     point obtained during the search. */
+	if ((brackt && (*stp <= stmin || *stp >= stmax)) ||
+			(brackt && (stmax - stmin <= xtol * stmax))) {
+		*stp = stx;
+	}
+	/*     Obtain another function and derivative. */
+	strcpy(task, "FG");
+	L1000:
+	/*     Save local variables so that the next call can resume. */
+	if (brackt) {
+		isave[1] = 1;
+	} else {
+		isave[1] = 0;
+	}
+	isave[2] = stage;
+	dsave[1] = ginit;
+	dsave[2] = gtest;
+	dsave[3] = gx;
+	dsave[4] = gy;
+	dsave[5] = finit;
+	dsave[6] = fx;
+	dsave[7] = fy;
+	dsave[8] = stx;
+	dsave[9] = sty;
+	dsave[10] = stmin;
+	dsave[11] = stmax;
+	dsave[12] = width;
+	dsave[13] = width1;
+	return;
+} /* dcsrch */
+/* ====================== The end of dcsrch ============================== */
+
+
+void dcstep(double *stx, double *fx, double *dx,
+		double *sty, double *fy, double *dy, double *stp,
+		double *fp, double *dp, int *brackt, double *stpmin,
+		double *stpmax)
+{
+	/*    **********
+
+	 Subroutine dcstep
+
+	 This subroutine computes a safeguarded step for a search
+	 procedure and updates an interval that contains a step that
+	 satisfies a sufficient decrease and a curvature condition.
+
+	 The parameter stx contains the step with the least function
+	 value. If brackt is set to .true. then a minimizer has
+	 been bracketed in an interval with endpoints stx and sty.
+	 The parameter stp contains the current step.
+	 The subroutine assumes that if brackt is set to .true. then
+
+	 min(stx,sty) < stp < max(stx,sty),
+
+	 and that the derivative at stx is negative in the direction
+	 of the step.
+
+	 The subroutine statement is
+
+	 subroutine dcstep(stx,fx,dx,sty,fy,dy,stp,fp,dp,brackt,
+	 stpmin,stpmax)
+
+	 where
+
+	 stx is a double precision variable.
+	 On entry stx is the best step obtained so far and is an
+	 endpoint of the interval that contains the minimizer.
+	 On exit stx is the updated best step.
+
+	 fx is a double precision variable.
+	 On entry fx is the function at stx.
+	 On exit fx is the function at stx.
+
+	 dx is a double precision variable.
+	 On entry dx is the derivative of the function at
+	 stx. The derivative must be negative in the direction of
+	 the step, that is, dx and stp - stx must have opposite
+	 signs.
+	 On exit dx is the derivative of the function at stx.
+
+	 sty is a double precision variable.
+	 On entry sty is the second endpoint of the interval that
+	 contains the minimizer.
+	 On exit sty is the updated endpoint of the interval that
+	 contains the minimizer.
+
+	 fy is a double precision variable.
+	 On entry fy is the function at sty.
+	 On exit fy is the function at sty.
+
+	 dy is a double precision variable.
+	 On entry dy is the derivative of the function at sty.
+	 On exit dy is the derivative of the function at the exit sty.
+
+	 stp is a double precision variable.
+	 On entry stp is the current step. If brackt is set to .true.
+	 then on input stp must be between stx and sty.
+	 On exit stp is a new trial step.
+
+	 fp is a double precision variable.
+	 On entry fp is the function at stp.
+	 On exit fp is unchanged.
+
+	 dp is a double precision variable.
+	 On entry dp is the derivative of the function at stp.
+	 On exit dp is unchanged.
+
+	 brackt is a logical variable (an int holding 0 or 1 here).
+	 On entry brackt specifies if a minimizer has been bracketed.
+	 Initially brackt must be set to .false.
+	 On exit brackt specifies if a minimizer has been bracketed.
+	 When a minimizer is bracketed brackt is set to .true.
+
+	 stpmin is a double precision variable.
+	 On entry stpmin is a lower bound for the step.
+	 On exit stpmin is unchanged.
+
+	 stpmax is a double precision variable.
+	 On entry stpmax is an upper bound for the step.
+	 On exit stpmax is unchanged.
+
+	 MINPACK-1 Project. June 1983
+	 Argonne National Laboratory.
+	 Jorge J. More' and David J. Thuente.
+
+	 MINPACK-2 Project. October 1993.
+	 Argonne National Laboratory and University of Minnesota.
+	 Brett M. Averick and Jorge J. More'.
+
+	 **********
+	 */
+
+	/* System generated locals */
+	double d__1, d__2;
+
+	/* Local variables */
+	double sgnd, stpc, stpf, stpq, p, q, gamm, r__, s, theta;
+
+	/*     sgnd is dp multiplied by the sign of dx; it is negative when the */
+	/*     two derivatives have opposite signs. */
+	/*     NOTE(review): for *dx == 0 this evaluates 0/0 - confirm that */
+	/*     callers never pass a zero derivative at stx. */
+	sgnd = *dp * (*dx / fabs(*dx));
+	/*     First case: A higher function value. The minimum is bracketed. */
+	/*     If the cubic step is closer to stx than the quadratic step, the */
+	/*     cubic step is taken, otherwise the average of the cubic and */
+	/*     quadratic steps is taken. */
+	if (*fp > *fx) {
+		theta = (*fx - *fp) * 3. / (*stp - *stx) + *dx + *dp;
+		/* Computing MAX: s = max(|theta|, |dx|, |dp|) */
+		d__1 = fabs(theta), d__2 = fabs(*dx),
+				d__1 = max(d__1,d__2), d__2 = fabs(*dp);
+		s = max(d__1,d__2);
+		/* Computing 2nd power */
+		d__1 = theta / s;
+		gamm = s * sqrt(d__1 * d__1 - *dx / s * (*dp / s));
+		if (*stp < *stx) {
+			gamm = -gamm;
+		}
+		p = gamm - *dx + theta;
+		q = gamm - *dx + gamm + *dp;
+		r__ = p / q;
+		stpc = *stx + r__ * (*stp - *stx);
+		stpq = *stx + *dx / ((*fx - *fp) / (*stp - *stx) + *dx) / 2. * (*stp
+				- *stx);
+		if (fabs(stpc - *stx) < fabs(stpq - *stx)) {
+			stpf = stpc;
+		} else {
+			stpf = stpc + (stpq - stpc) / 2.;
+		}
+		*brackt = 1;
+		/*     Second case: A lower function value and derivatives of opposite */
+		/*     sign. The minimum is bracketed. If the cubic step is farther from */
+		/*     stp than the secant step, the cubic step is taken, otherwise the */
+		/*     secant step is taken. */
+	} else if (sgnd < 0.) {
+		theta = (*fx - *fp) * 3. / (*stp - *stx) + *dx + *dp;
+		/* Computing MAX: s = max(|theta|, |dx|, |dp|) */
+		d__1 = fabs(theta), d__2 = fabs(*dx),
+				d__1 = max(d__1,d__2), d__2 = fabs(*dp);
+		s = max(d__1,d__2);
+		/* Computing 2nd power */
+		d__1 = theta / s;
+		gamm = s * sqrt(d__1 * d__1 - *dx / s * (*dp / s));
+		if (*stp > *stx) {
+			gamm = -gamm;
+		}
+		p = gamm - *dp + theta;
+		q = gamm - *dp + gamm + *dx;
+		r__ = p / q;
+		stpc = *stp + r__ * (*stx - *stp);
+		stpq = *stp + *dp / (*dp - *dx) * (*stx - *stp);
+		if (fabs(stpc - *stp) > fabs(stpq - *stp)) {
+			stpf = stpc;
+		} else {
+			stpf = stpq;
+		}
+		*brackt = 1;
+		/*     Third case: A lower function value, derivatives of the same sign, */
+		/*     and the magnitude of the derivative decreases. */
+	} else if (fabs(*dp) < fabs(*dx)) {
+		/*      The cubic step is computed only if the cubic tends to infinity */
+		/*      in the direction of the step or if the minimum of the cubic */
+		/*      is beyond stp. Otherwise the cubic step is defined to be the */
+		/*      secant step. */
+		theta = (*fx - *fp) * 3. / (*stp - *stx) + *dx + *dp;
+		/* Computing MAX: s = max(|theta|, |dx|, |dp|) */
+		d__1 = fabs(theta), d__2 = fabs(*dx),
+				d__1 = max(d__1,d__2), d__2 = fabs(*dp);
+		s = max(d__1,d__2);
+		/*      The case gamm = 0 only arises if the cubic does not tend */
+		/*      to infinity in the direction of the step. */
+		/* Computing MAX: a negative radicand is clamped, giving gamm = 0 */
+		/* Computing 2nd power */
+		d__1 = theta / s;
+		d__1 = d__1 * d__1 - *dx / s * (*dp / s);
+		gamm = d__1 < 0 ? 0. : s * sqrt(d__1);
+		if (*stp > *stx) {
+			gamm = -gamm;
+		}
+		p = gamm - *dp + theta;
+		q = gamm + (*dx - *dp) + gamm;
+		r__ = p / q;
+		if (r__ < 0. && gamm != 0.) {
+			stpc = *stp + r__ * (*stx - *stp);
+		} else if (*stp > *stx) {
+			stpc = *stpmax;
+		} else {
+			stpc = *stpmin;
+		}
+		stpq = *stp + *dp / (*dp - *dx) * (*stx - *stp);
+		if (*brackt) {
+			/*         A minimizer has been bracketed. If the cubic step is */
+			/*         closer to stp than the secant step, the cubic step is */
+			/*         taken, otherwise the secant step is taken. */
+			if (fabs(stpc - *stp) < fabs(stpq - *stp)) {
+				stpf = stpc;
+			} else {
+				stpf = stpq;
+			}
+			/*         Keep the step within 0.66 of the way from stp to sty. */
+			d__1 = *stp + (*sty - *stp) * .66;
+			if (*stp > *stx) {
+				stpf = min(d__1,stpf);
+			} else {
+				stpf = max(d__1,stpf);
+			}
+		} else {
+			/*         A minimizer has not been bracketed. If the cubic step is */
+			/*         farther from stp than the secant step, the cubic step is */
+			/*         taken, otherwise the secant step is taken. */
+			if (fabs(stpc - *stp) > fabs(stpq - *stp)) {
+				stpf = stpc;
+			} else {
+				stpf = stpq;
+			}
+			stpf = min(*stpmax,stpf);
+			stpf = max(*stpmin,stpf);
+		}
+		/*     Fourth case: A lower function value, derivatives of the */
+		/*     same sign, and the magnitude of the derivative does not */
+		/*     decrease. If the minimum is not bracketed, the step is either */
+		/*     stpmin or stpmax, otherwise the cubic step is taken. */
+	} else {
+		if (*brackt) {
+			theta = (*fp - *fy) * 3. / (*sty - *stp) + *dy + *dp;
+			/* Computing MAX: s = max(|theta|, |dy|, |dp|) */
+			d__1 = fabs(theta), d__2 = fabs(*dy), d__1 = max(d__1,d__2), d__2 =
+					fabs(*dp);
+			s = max(d__1,d__2);
+			/* Computing 2nd power */
+			d__1 = theta / s;
+			gamm = s * sqrt(d__1 * d__1 - *dy / s * (*dp / s));
+			if (*stp > *sty) {
+				gamm = -gamm;
+			}
+			p = gamm - *dp + theta;
+			q = gamm - *dp + gamm + *dy;
+			r__ = p / q;
+			stpc = *stp + r__ * (*sty - *stp);
+			stpf = stpc;
+		} else if (*stp > *stx) {
+			stpf = *stpmax;
+		} else {
+			stpf = *stpmin;
+		}
+	}
+	/*     Update the interval which contains a minimizer. */
+	if (*fp > *fx) {
+		*sty = *stp;
+		*fy = *fp;
+		*dy = *dp;
+	} else {
+		if (sgnd < 0.) {
+			*sty = *stx;
+			*fy = *fx;
+			*dy = *dx;
+		}
+		*stx = *stp;
+		*fx = *fp;
+		*dx = *dp;
+	}
+	/*     Compute the new step. */
+	*stp = stpf;
+	return;
+} /* dcstep */
+/* ====================== The end of dcstep ============================== */
+
+
+void prn3lb(int n, double *x, double *f, char *task, int iprint,
+		int info, int iter, int nfgv, int nintol, int nskip,
+		int nact, double sbgnrm, int nint,
+		char *word, int iback, double stp, double xstep,
+		int k)
+{
+	/*
+	 * prn3lb -- print the end-of-run report to cout.
+	 *
+	 * When task starts with "CONV":
+	 *   iprint >= 0    prints the run statistics (iter, nfgv, nintol,
+	 *                  nskip, nact, sbgnrm and the final value *f);
+	 *   iprint >= 100  additionally prints the n entries of x;
+	 *   iprint >= 1    additionally prints "F = " followed by *f.
+	 * Independently of task, when iprint >= 0 a diagnostic is printed
+	 * for info in -1 .. -9; any other info value prints nothing.
+	 * k is only used in the messages for info = -5 and info = -6.
+	 *
+	 * nint, word, iback, stp and xstep are accepted but not used here.
+	 */
+	if(strncmp(task, "CONV", 4) == 0) {
+		if (iprint >= 0) {
+			cout << endl;
+			cout << "iterations " << iter << endl;
+			cout << "function evaluations " << nfgv << endl;
+			cout << "segments explored during Cauchy searches " << nintol << endl;
+			cout << "BFGS updates skipped " << nskip << endl;
+			cout << "active bounds at final generalized Cauchy point " << nact << endl;
+			cout << "norm of the final projected gradient " << sbgnrm << endl;
+			/* Was "inal function value": the leading 'f' had been lost. */
+			cout << "final function value " << *f << endl;
+			cout << endl;
+		}
+		if (iprint >= 100) pvector((char*)"X =", x, n);
+		if (iprint >= 1)
+			cout << "F = " << *f << endl;
+	}
+	if (iprint >= 0) {
+		switch(info) {
+		case -1:
+			cout << "Matrix in 1st Cholesky factorization in formk is not Pos. Def.";
+			break;
+		case -2:
+			/* Ordinal corrected from "2st" to "2nd". */
+			cout << "Matrix in 2nd Cholesky factorization in formk is not Pos. Def.";
+			break;
+		case -3:
+			cout << "Matrix in the Cholesky factorization in formt is not Pos. Def.";
+			break;
+		case -4:
+			cout << "Derivative >= 0, backtracking line search impossible.";
+			break;
+		case -5:
+			cout << "l(" << k << ") > u(" << k << ").  No feasible solution";
+			break;
+		case -6:
+			cout << "Input nbd(" << k << ") is invalid";
+			break;
+		case -7:
+			cout << "Warning:  more than 10 function and gradient evaluations" << endl;
+			cout << "   in the last line search" << endl;
+			break;
+		case -8:
+			cout << "The triangular system is singular." << endl;
+			break;
+		case -9:
+			cout << "Line search cannot locate an adequate point after 20 function" << endl;
+			cout << "and gradient evaluations" << endl;
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+void prn1lb(int n, int m, double *l, double *u, double *x,
+		int iprint, double epsmch)
+{
+	/*
+	 * prn1lb -- print the problem header to cout.
+	 *
+	 * iprint < 0     prints nothing;
+	 * iprint >= 0    prints n, m and epsmch on one line;
+	 * iprint >= 100  additionally prints the vectors l, x and u
+	 *                (n entries each) through pvector.
+	 */
+	if (iprint < 0) {
+		return;
+	}
+	cout << "N = " << n << ", M = " << m << " machine precision = " << epsmch << endl;
+
+	if (iprint < 100) {
+		return;
+	}
+	pvector((char*)"L =", l, n);
+	pvector((char*)"X0 =",x, n);
+	pvector((char*)"U =", u, n);
+}
+
+void projgr(int n, double *l, double *u,
+		int *nbd, double *x, double *g, double *sbgnrm)
+{
+	/*
+	 * projgr
+	 *
+	 * Computes the infinity norm of the projected gradient and stores
+	 * it in *sbgnrm.
+	 *
+	 * For each of the n components the gradient entry g[i] is taken
+	 * as is when nbd[i] is 0.  Otherwise a negative entry is raised to
+	 * x[i] - u[i] if nbd[i] >= 2 and that difference is larger, and a
+	 * non-negative entry is lowered to x[i] - l[i] if nbd[i] <= 2 and
+	 * that difference is smaller.  The result is the largest absolute
+	 * value obtained this way (0 when n <= 0).
+	 *
+	 * Origin: NEOS, November 1994 (latest revision April 1997),
+	 * Optimization Technology Center, Argonne National Laboratory and
+	 * Northwestern University.  Written by Ciyou Zhu in collaboration
+	 * with R.H. Byrd, P. Lu-Chen and J. Nocedal.
+	 */
+	int idx = 0;
+
+	*sbgnrm = 0.;
+	while (idx < n) {
+		double pg = g[idx];
+		const int kind = nbd[idx];
+
+		if (kind != 0) {
+			if (pg < 0.) {
+				if (kind >= 2) {
+					const double gap = x[idx] - u[idx];
+					if (pg < gap) {
+						pg = gap;
+					}
+				}
+			} else if (kind <= 2) {
+				const double gap = x[idx] - l[idx];
+				if (pg > gap) {
+					pg = gap;
+				}
+			}
+		}
+
+		const double mag = fabs(pg);
+		if (*sbgnrm < mag) {
+			*sbgnrm = mag;
+		}
+		++idx;
+	}
+	return;
+} /* projgr */
+
+void prn2lb(int n, double *x, double *f, double *g, int iprint,
+		int iter, int nfgv, int nact, double sbgnrm,
+		int nint, char *word, int iword, int iback,
+		double stp, double xstep)
+{
+	/*
+	 * prn2lb -- per-iteration progress output on cout.
+	 *
+	 * iprint >= 99         prints iback and xstep; for iprint > 100 the
+	 *                      vectors x and g (n entries each) follow.
+	 * 0 < iprint < 99      prints iter, *f and sbgnrm, but only on
+	 *                      iterations where iter is a multiple of iprint.
+	 * iprint <= 0          prints nothing.
+	 *
+	 * nfgv, nact, nint, word, iword and stp are not used here.
+	 */
+	const bool verbose = iprint >= 99;
+
+	if (!verbose) {
+		if (iprint > 0 && iter%iprint == 0) {
+			cout << "At iterate " << iter << "  f = " << *f << "  |proj g|=  " << sbgnrm << "\n";
+		}
+		return;
+	}
+
+	cout << "LINE SEARCH " << iback << " times; norm of step = " << xstep << "\n";
+	if (iprint > 100) {
+		pvector((char*)"X =", x, n);
+		pvector((char*)"G =", g, n);
+	}
+}
+
+void pvector(char *title, double *x, int n)
+{
+	/* Writes title to cout, then the first n entries of x, each followed
+	 * by one space, and finally ends the line. */
+	const double *cur = x;
+
+	cout << title;
+	for (int left = n; left > 0; --left) {
+		cout << *cur << " ";
+		++cur;
+	}
+	cout << endl;
+}
+
+
+int dcopy(int *n, double *dx, int *incx,
+		double *dy, int *incy)
+{
+
+
+	/* System generated locals */
+	// int i__1;
+
+	/* Local variables */
+	int i, m, ix, iy, mp1;
+
+
+	/*     copies a vector, x, to a vector, y.
+	 uses unrolled loops for increments equal to one.
+	 jack dongarra, linpack, 3/11/78.
+	 modified 12/3/93, array(1) declarations changed to array(*)
+
+	 n     number of elements to copy; nothing is done when *n <= 0.
+	 dx    source vector, stepped through with increment *incx.
+	 dy    destination vector, stepped through with increment *incy.
+	 The return value is 0 on every path.
+
+	 NOTE(review): DX(i) and DY(i) are macros defined outside this
+	 function; they are presumably 1-based accessors for dx and dy -
+	 confirm against their definition.
+
+	 Parameter adjustments
+	 Function Body */
+
+
+	if (*n <= 0) {
+		return 0;
+	}
+	if (*incx == 1 && *incy == 1) {
+		goto L20;
+	}
+
+	/*        code for unequal increments or equal increments
+	 not equal to 1; with a negative increment the start index
+	 is moved to (n-1)*|inc| + 1 so that the walk runs backwards */
+
+	ix = 1;
+	iy = 1;
+	if (*incx < 0) {
+		ix = (-(*n) + 1) * *incx + 1;
+	}
+	if (*incy < 0) {
+		iy = (-(*n) + 1) * *incy + 1;
+	}
+	// i__1 = *n;
+	for (i = 1; i <= *n; ++i) {
+		DY(iy) = DX(ix);
+		ix += *incx;
+		iy += *incy;
+		/* L10: */
+	}
+	return 0;
+
+	/*        code for both increments equal to 1
+
+
+	 clean-up loop: copy the first n mod 7 elements one by one so
+	 that the remaining count is a multiple of 7 */
+
+	L20:
+	m = *n % 7;
+	if (m == 0) {
+		goto L40;
+	}
+	// i__1 = m;
+	for (i = 1; i <= m; ++i) {
+		DY(i) = DX(i);
+		/* L30: */
+	}
+	if (*n < 7) {
+		return 0;
+	}
+	L40:
+	/* main loop, unrolled seven elements per pass */
+	mp1 = m + 1;
+	// i__1 = *n;
+	for (i = mp1; i <= *n; i += 7) {
+		DY(i) = DX(i);
+		DY(i + 1) = DX(i + 1);
+		DY(i + 2) = DX(i + 2);
+		DY(i + 3) = DX(i + 3);
+		DY(i + 4) = DX(i + 4);
+		DY(i + 5) = DX(i + 5);
+		DY(i + 6) = DX(i + 6);
+		/* L50: */
+	}
+	return 0;
+} /* dcopy */
+/* Timing stub: unconditionally stores 0.0 in *ttime; no clock is consulted. */
+void timer(double * ttime)
+{
+	*ttime = 0.0;
+}
+
+int dscal(int *n, double *da, double *dx,
+		int *incx)
+{
+
+
+	/* System generated locals */
+	// int i__1, i__2;
+
+	/* Local variables */
+	int i, m, nincx, mp1;
+
+
+	/*     dscal: scales a vector in place, dx(i) = *da * dx(i), for *n
+	 elements taken with stride *incx; always returns 0.
+	 nothing is touched when *n <= 0 or *incx <= 0.
+	 because non-positive strides return early, the "*incx < 0" arm of the
+	 strided loop condition below is never selected.
+	 DX(i) is a 1-based accessor for dx.
+	 the unit-stride path handles *n % 5 leading elements one at a time and
+	 the remainder in groups of five.
+	 (jack dongarra, linpack, 3/11/78; modified 3/93 and 12/3/93.)
+	 Function Body */
+
+
+	if (*n <= 0 || *incx <= 0) {
+		return 0;
+	}
+	if (*incx == 1) {
+		goto L20;
+	}
+
+	/*        code for increment not equal to 1 */
+
+	nincx = *n * *incx;
+	// i__1 = nincx;
+	// i__2 = *incx;
+	for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
+		DX(i) = *da * DX(i);
+		/* L10: */
+	}
+	return 0;
+
+	/*        code for increment equal to 1
+
+
+	 clean-up loop */
+
+	L20:
+	m = *n % 5;
+	if (m == 0) {
+		goto L40;
+	}
+	// i__2 = m;
+	for (i = 1; i <= m; ++i) {
+		DX(i) = *da * DX(i);
+		/* L30: */
+	}
+	if (*n < 5) {
+		return 0;
+	}
+	L40:
+	mp1 = m + 1;
+	// i__2 = *n;
+	for (i = mp1; i <= *n; i += 5) {
+		DX(i) = *da * DX(i);
+		DX(i + 1) = *da * DX(i + 1);
+		DX(i + 2) = *da * DX(i + 2);
+		DX(i + 3) = *da * DX(i + 3);
+		DX(i + 4) = *da * DX(i + 4);
+		/* L50: */
+	}
+	return 0;
+} /* dscal */
+
+double ddot(int *n, double *dx, int *incx, double *dy,
+		int *incy)
+{
+
+
+	/* System generated locals */
+	// int i__1;
+	double ret_val;
+
+	/* Local variables */
+	int i, m;
+	double dtemp;
+	int ix, iy, mp1;
+
+
+	/*     ddot: returns the dot product of *n elements of dx and dy,
+	 stepping through them with strides *incx and *incy.
+	 returns 0.0 when *n <= 0.
+	 a negative stride makes the start index (1 - *n) * stride + 1, so the
+	 traversal of that vector begins at its far end.
+	 DX(i) and DY(i) are 1-based accessors for dx and dy.
+	 the unit-stride path sums *n % 5 leading products one at a time and
+	 the remainder in groups of five.
+	 (jack dongarra, linpack, 3/11/78; modified 12/3/93.) */
+
+
+	ret_val = 0.;
+	dtemp = 0.;
+	if (*n <= 0) {
+		return ret_val;
+	}
+	if (*incx == 1 && *incy == 1) {
+		goto L20;
+	}
+
+	/*        code for unequal increments or equal increments
+	 not equal to 1 */
+
+	ix = 1;
+	iy = 1;
+	if (*incx < 0) {
+		ix = (-(*n) + 1) * *incx + 1;
+	}
+	if (*incy < 0) {
+		iy = (-(*n) + 1) * *incy + 1;
+	}
+	// i__1 = *n;
+	for (i = 1; i <= *n; ++i) {
+		dtemp += DX(ix) * DY(iy);
+		ix += *incx;
+		iy += *incy;
+		/* L10: */
+	}
+	ret_val = dtemp;
+	return ret_val;
+
+	/*        code for both increments equal to 1
+
+
+	 clean-up loop */
+
+	L20:
+	m = *n % 5;
+	if (m == 0) {
+		goto L40;
+	}
+	// i__1 = m;
+	for (i = 1; i <= m; ++i) {
+		dtemp += DX(i) * DY(i);
+		/* L30: */
+	}
+	if (*n < 5) {
+		goto L60;
+	}
+	L40:
+	mp1 = m + 1;
+	// i__1 = *n;
+	for (i = mp1; i <= *n; i += 5) {
+		dtemp = dtemp + DX(i) * DY(i) + DX(i + 1) * DY(i + 1) + DX(i + 2) *
+				DY(i + 2) + DX(i + 3) * DY(i + 3) + DX(i + 4) * DY(i + 4);
+		/* L50: */
+	}
+	L60:
+	ret_val = dtemp;
+	return ret_val;
+} /* ddot */
+
+int daxpy(int *n, double *da, double *dx,
+		int *incx, double *dy, int *incy)
+{
+
+
+	/* System generated locals */
+	// int i__1;
+
+	/* Local variables */
+	int i, m, ix, iy, mp1;
+
+
+	/*     daxpy: dy(i) = dy(i) + *da * dx(i) for *n elements, stepping
+	 through dx and dy with strides *incx and *incy; always returns 0.
+	 dy is left untouched when *n <= 0 or *da == 0.
+	 a negative stride makes the start index (1 - *n) * stride + 1, so the
+	 traversal of that vector begins at its far end.
+	 DX(i) and DY(i) are 1-based accessors for dx and dy.
+	 the unit-stride path handles *n % 4 leading elements one at a time
+	 and the remainder in groups of four.
+	 (jack dongarra, linpack, 3/11/78; modified 12/3/93.) */    
+
+	if (*n <= 0) {
+		return 0;
+	}
+	if (*da == 0.) {
+		return 0;
+	}
+	if (*incx == 1 && *incy == 1) {
+		goto L20;
+	}
+
+	/*        code for unequal increments or equal increments
+	 not equal to 1 */
+
+	ix = 1;
+	iy = 1;
+	if (*incx < 0) {
+		ix = (-(*n) + 1) * *incx + 1;
+	}
+	if (*incy < 0) {
+		iy = (-(*n) + 1) * *incy + 1;
+	}
+	// i__1 = *n;
+	for (i = 1; i <= *n; ++i) {
+		DY(iy) += *da * DX(ix);
+		ix += *incx;
+		iy += *incy;
+		/* L10: */
+	}
+	return 0;
+
+	/*        code for both increments equal to 1
+
+
+	 clean-up loop */
+
+	L20:
+	m = *n % 4;
+	if (m == 0) {
+		goto L40;
+	}
+	// i__1 = m;
+	for (i = 1; i <= m; ++i) {
+		DY(i) += *da * DX(i);
+		/* L30: */
+	}
+	if (*n < 4) {
+		return 0;
+	}
+	L40:
+	mp1 = m + 1;
+	// i__1 = *n;
+	for (i = mp1; i <= *n; i += 4) {
+		DY(i) += *da * DX(i);
+		DY(i + 1) += *da * DX(i + 1);
+		DY(i + 2) += *da * DX(i + 2);
+		DY(i + 3) += *da * DX(i + 3);
+		/* L50: */
+	}
+	return 0;
+} /* daxpy */
+
+int dpofa(double *a, int *lda, int *n, int *info)
+{
+	/* System generated locals */
+	int a_dim1, a_offset, i__1, i__2, i__3;
+
+	/* Local variables */
+	int j, k;
+	double s, t;
+	int jm1;
+
+	/*     the C return value is always 0; success or failure is reported through *info. */
+	/*     NOTE(review): c__1 is defined outside this function -- presumably the int constant 1 (unit stride); confirm. */
+
+	/*     dpofa factors a double precision symmetric positive definite */
+	/*     matrix. */
+
+	/*     dpofa is usually called by dpoco, but it can be called */
+	/*     directly with a saving in time if  rcond  is not needed. */
+	/*     (time for dpoco) = (1 + 18/n)*(time for dpofa) . */
+
+	/*     on entry */
+
+	/*        a       double precision(lda, n) */
+	/*                the symmetric matrix to be factored.  only the */
+	/*                diagonal and upper triangle are used. */
+
+	/*        lda     integer */
+	/*                the leading dimension of the array  a . */
+
+	/*        n       integer */
+	/*                the order of the matrix  a . */
+
+	/*     on return */
+
+	/*        a       an upper triangular matrix  r  so that  a = trans(r)*r */
+	/*                where  trans(r)  is the transpose. */
+	/*                the strict lower triangle is unaltered. */
+	/*                if  info .ne. 0 , the factorization is not complete. */
+
+	/*        info    integer */
+	/*                = 0  for normal return. */
+	/*                = k  signals an error condition.  the leading minor */
+	/*                     of order  k  is not positive definite. */
+
+	/*     linpack.  this version dated 08/14/78 . */
+	/*     cleve moler, university of new mexico, argonne national lab. */
+
+	/*     subroutines and functions */
+
+	/*     blas ddot */
+	/*     fortran dsqrt */
+
+	/*     internal variables */
+
+	/*<       double precision ddot,t >*/
+	/*<       double precision s >*/
+	/*<       integer j,jm1,k >*/
+	/*     begin block with ...exits to 40 */
+
+
+	/*<          do 30 j = 1, n >*/
+	/* Parameter adjustments: shift a so that a[i + j * a_dim1] addresses the 1-based, column-major element a(i,j) */
+	a_dim1 = *lda;
+	a_offset = 1 + a_dim1;
+	a -= a_offset;
+
+	/* Function Body */
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+		/*<             info = j >*/
+		*info = j;
+		/*<             s = 0.0d0 >*/
+		s = 0.;
+		/*<             jm1 = j - 1 >*/
+		jm1 = j - 1;
+		/*<             if (jm1 .lt. 1) go to 20 >*/
+		if (jm1 < 1) {
+			goto L20;
+		}
+		/*<             do 10 k = 1, jm1 >*/
+		i__2 = jm1;
+		for (k = 1; k <= i__2; ++k) {
+			/*<                t = a(k,j) - ddot(k-1,a(1,k),1,a(1,j),1) >*/
+			i__3 = k - 1;
+			t = a[k + j * a_dim1] - ddot(&i__3, &a[k * a_dim1 + 1], &c__1, &
+					a[j * a_dim1 + 1], &c__1);
+			/*<                t = t/a(k,k) >*/
+			t /= a[k + k * a_dim1];
+			/*<                a(k,j) = t >*/
+			a[k + j * a_dim1] = t;
+			/*<                s = s + t*t >*/
+			s += t * t;
+			/*<    10       continue >*/
+			/* L10: */
+		}
+		/*<    20       continue >*/
+		L20:
+		/*<             s = a(j,j) - s >*/
+		s = a[j + j * a_dim1] - s;
+		/*     ......exit */
+		/*<             if (s .le. 0.0d0) go to 40 >*/
+		if (s <= 0.) {
+			goto L40;
+		}
+		/*<             a(j,j) = dsqrt(s) >*/
+		a[j + j * a_dim1] = sqrt(s);
+		/*<    30    continue >*/
+		/* L30: */
+	}
+	/*<          info = 0 >*/
+	*info = 0;
+	/*<    40 continue >*/
+	L40:
+	/*<       return >*/
+	return 0;
+	/*<       end >*/
+} /* dpofa */
+
+int dtrsl(double *t, int *ldt, int *n,
+		double *b, int *job, int *info)
+{
+	/* System generated locals */
+	int t_dim1, t_offset, i__1, i__2;
+
+	/* Local variables */
+	int j, jj, case__;
+	double temp;
+
+
+	/*     dtrsl solves systems of the form */
+
+	/*                   t * x = b */
+	/*     or */
+	/*                   trans(t) * x = b */
+
+	/*     where t is a triangular matrix of order n. here trans(t) */
+	/*     denotes the transpose of the matrix t. */
+
+	/*     on entry */
+
+	/*         t         double precision(ldt,n) */
+	/*                   t contains the matrix of the system. the zero */
+	/*                   elements of the matrix are not referenced, and */
+	/*                   the corresponding elements of the array can be */
+	/*                   used to store other information. */
+
+	/*         ldt       integer */
+	/*                   ldt is the leading dimension of the array t. */
+
+	/*         n         integer */
+	/*                   n is the order of the system. */
+
+	/*         b         double precision(n). */
+	/*                   b contains the right hand side of the system. */
+
+	/*         job       integer */
+	/*                   job specifies what kind of system is to be solved. */
+	/*                   if job is */
+
+	/*                        00   solve t*x=b, t lower triangular, */
+	/*                        01   solve t*x=b, t upper triangular, */
+	/*                        10   solve trans(t)*x=b, t lower triangular, */
+	/*                        11   solve trans(t)*x=b, t upper triangular. */
+
+	/*     on return */
+
+	/*         b         b contains the solution, if info .eq. 0. */
+	/*                   otherwise b is unaltered. */
+
+	/*         info      integer */
+	/*                   info contains zero if the system is nonsingular. */
+	/*                   otherwise info contains the index of */
+	/*                   the first zero diagonal element of t. */
+
+	/*     linpack. this version dated 08/14/78 . */
+	/*     g. w. stewart, university of maryland, argonne national lab. */
+
+	/*     subroutines and functions */
+
+	/*     blas daxpy,ddot */
+	/*     the C return value is always 0; the outcome is reported through *info. */
+
+	/*     NOTE(review): c__1 is defined outside this function -- presumably the int constant 1 (unit stride); confirm. */
+
+
+	/*     begin block permitting ...exits to 150 */
+
+	/*        check for zero diagonal elements. */
+
+	/* Parameter adjustments: shift t and b so that t[i + j * t_dim1] and b[i] use 1-based indices */
+	t_dim1 = *ldt;
+	t_offset = 1 + t_dim1;
+	t -= t_offset;
+	--b;
+
+	/* Function Body */
+	i__1 = *n;
+	for (*info = 1; *info <= i__1; ++(*info)) {
+		/*     ......exit */
+		if (t[*info + *info * t_dim1] == 0.) {
+			goto L150;
+		}
+		/* L10: */
+	}
+	*info = 0;
+
+	/*        determine the task and go to it: case__ 1, 2, 3, 4 correspond to job 00, 01, 10, 11. */
+
+	case__ = 1;
+	if (*job % 10 != 0) {
+		case__ = 2;
+	}
+	if (*job % 100 / 10 != 0) {
+		case__ += 2;
+	}
+	switch (case__) {
+	case 1:  goto L20;
+	case 2:  goto L50;
+	case 3:  goto L80;
+	case 4:  goto L110;
+	}
+
+	/*        solve t*x=b for t lower triangular */
+
+	L20:
+	b[1] /= t[t_dim1 + 1];
+	if (*n < 2) {
+		goto L40;
+	}
+	i__1 = *n;
+	for (j = 2; j <= i__1; ++j) {
+		temp = -b[j - 1];
+		i__2 = *n - j + 1;
+		daxpy(&i__2, &temp, &t[j + (j - 1) * t_dim1], &c__1, &b[j], &c__1);
+		b[j] /= t[j + j * t_dim1];
+		/* L30: */
+	}
+	L40:
+	goto L140;
+
+	/*        solve t*x=b for t upper triangular. */
+
+	L50:
+	b[*n] /= t[*n + *n * t_dim1];
+	if (*n < 2) {
+		goto L70;
+	}
+	i__1 = *n;
+	for (jj = 2; jj <= i__1; ++jj) {
+		j = *n - jj + 1;
+		temp = -b[j + 1];
+		daxpy(&j, &temp, &t[(j + 1) * t_dim1 + 1], &c__1, &b[1], &c__1);
+		b[j] /= t[j + j * t_dim1];
+		/* L60: */
+	}
+	L70:
+	goto L140;
+
+	/*        solve trans(t)*x=b for t lower triangular. */
+
+	L80:
+	b[*n] /= t[*n + *n * t_dim1];
+	if (*n < 2) {
+		goto L100;
+	}
+	i__1 = *n;
+	for (jj = 2; jj <= i__1; ++jj) {
+		j = *n - jj + 1;
+		i__2 = jj - 1;
+		b[j] -= ddot(&i__2, &t[j + 1 + j * t_dim1], &c__1, &b[j + 1], &c__1);
+		b[j] /= t[j + j * t_dim1];
+		/* L90: */
+	}
+	L100:
+	goto L140;
+
+	/*        solve trans(t)*x=b for t upper triangular. */
+
+	L110:
+	b[1] /= t[t_dim1 + 1];
+	if (*n < 2) {
+		goto L130;
+	}
+	i__1 = *n;
+	for (j = 2; j <= i__1; ++j) {
+		i__2 = j - 1;
+		b[j] -= ddot(&i__2, &t[j * t_dim1 + 1], &c__1, &b[1], &c__1);
+		b[j] /= t[j + j * t_dim1];
+		/* L120: */
+	}
+	L130:
+	L140:
+	L150:
+	return 0;
+} /* dtrsl */
diff --git a/lbfgsb/lbfgsb_new.h b/lbfgsb/lbfgsb_new.h
new file mode 100644
index 0000000..7d61a01
--- /dev/null
+++ b/lbfgsb/lbfgsb_new.h
@@ -0,0 +1,232 @@
+/*
+ *
+ * lbfgsb_new.h
+ * HAL_HAS
+ *
+ * CSIRO Open Source Software License Agreement (GPLv3)
+ * Copyright (c) 2014, Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.
+ * All rights reserved. CSIRO is willing to grant you a license to HAL-HAS on the terms of the GNU General Public
+ * License version 3 as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except
+ * where otherwise indicated for third party material.
+ * The following additional terms apply under clause 7 of that license:
+ * EXCEPT AS EXPRESSLY STATED IN THIS AGREEMENT AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE
+ * IS PROVIDED "AS-IS". CSIRO MAKES NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY
+ * OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, THE ABSENCE
+ * OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE.
+ * TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO BE LIABLE ON ANY LEGAL THEORY (INCLUDING,
+ * WITHOUT LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES
+ * OR OTHER LIABILITY HOWSOEVER INCURRED.  WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF
+ * LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS;
+ * OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS; OR ANY SPECIAL,
+ * INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS
+ * AGREEMENT, ACCESS OF THE SOFTWARE OR ANY OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO HAS BEEN ADVISED OF
+ * THE POSSIBILITY OF SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.
+ * APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY APPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS,
+ * OR IMPOSES OBLIGATIONS OR LIABILITY ON CSIRO THAT CANNOT BE EXCLUDED, RESTRICTED OR MODIFIED TO THE FULL EXTENT
+ * SET OUT IN THE EXPRESS TERMS OF THIS CLAUSE ABOVE "CONSUMER GUARANTEES".  TO THE EXTENT THAT SUCH CONSUMER
+ * GUARANTEES CONTINUE TO APPLY, THEN TO THE FULL EXTENT PERMITTED BY THE APPLICABLE LEGISLATION, THE LIABILITY
+ * OF CSIRO UNDER THE RELEVANT CONSUMER GUARANTEE IS LIMITED (WHERE PERMITTED AT CSIRO’S OPTION) TO ONE OF FOLLOWING
+ * REMEDIES OR SUBSTANTIALLY EQUIVALENT REMEDIES:
+ * (a)               THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT
+ *                   SERVICES AGAIN;
+ * (b)               THE REPAIR OF THE SOFTWARE;
+ * (c)               THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE
+ *                   RELEVANT SERVICES SUPPLIED AGAIN, OR HAVING THE SOFTWARE REPAIRED.
+ * IN THIS CLAUSE, CSIRO INCLUDES ANY THIRD PARTY AUTHOR OR OWNER OF ANY PART OF THE SOFTWARE OR MATERIAL DISTRIBUTED
+ * WITH IT.  CSIRO MAY ENFORCE ANY RIGHTS ON BEHALF OF THE RELEVANT THIRD PARTY.
+ * Third Party Components
+ * The following third party components are distributed with the Software.  You agree to comply with the license
+ * terms for these components as part of accessing the Software.  Other third party software may also be identified
+ * in separate files distributed with the Software.
+ * ___________________________________________________________________
+ * 
+ * R : A Computer Language for Statistical Data Analysis version 3.0.1 (http://cran.r-project.org/src/base/R-3/R-3.0.1.tar.gz)
+ * Copyright (C) 2000-2004 The R Core Team
+ * This software is licensed under GNU GPL
+ * 
+ * JACOBI_EIGENVALUE.C (http://people.sc.fsu.edu/~jburkardt/c_src/jacobi_eigenvalue/jacobi_eigenvalue.c)
+ * Copyright (C) 2003-2013 John Burkardt
+ * This software is licensed under GNU LGPL (http://www.gnu.org/licenses/lgpl.html)
+ * ___________________________________________________________________
+ */
+
+
+#ifndef __RAL_RAS__lbfgsb_new__
+#define __RAL_RAS__lbfgsb_new__
+
+#include <cstring>
+#include <stdlib.h>
+#include <iostream>
+#include <math.h>
+#include <float.h>
+//#include "gradient.h"
+
+using namespace std;
+
+// Function to access the L-BFGS-B function
+// 1. int n : The number of the variables
+// 2. double* x : initial values of the variables
+// 3. double* l : lower bounds of the variables
+// 4. int maxit : max # of iterations
+// 5. void* ex  : the wrapped variables for objective function
+// After the function is invoked, the values of x will be updated
+void lbfgsb_R(int n, double* x, double* l, int maxit, void* ex);
+
+// Function to access the L-BFGS-B function
+// 1. int n : The number of the variables
+// 2. double* x : initial values of the variables
+// 3. double* l : lower bounds of the variables
+// 4. double* u : upper bounds of the variables
+// 5. int maxit : max # of iterations
+// 6. void* ex  : the wrapped variables for objective function
+// After the function is invoked, the values of x will be updated
+void lbfgsb_R2(int n, double* x, double* l, double* u, int maxit, void* ex);
+
+
+
+// ========================================================= //
+// FUNCTIONS converted from R v 3.0.1
+// ========================================================= //
+
+typedef double optimfn(int, double *, void *);
+typedef void optimgr(int, double *, double *, void *);
+
+void lbfgsb(int n, int m, double *x, double *l, double *u, int *nbd,
+		double *Fmin, optimfn fminfn, optimgr fmingr, int *fail,
+		void *ex, double factr, double pgtol,
+		int *fncount, int *grcount, int maxit, char *msg,
+		int trace, int nREPORT);
+
+
+void setulb(int n, int m, double *x, double *l, double *u, int *nbd,
+		double *f, double *g, double factr, double *pgtol,
+		double *wa, int * iwa, char *task, int iprint,
+		int *lsave, int *isave, double *dsave);
+
+void mainlb(int n, int m, double *x,
+		double *l, double *u, int *nbd, double *f, double *g,
+		double factr, double *pgtol, double *ws, double * wy,
+		double *sy, double *ss, double *wt, double *wn,
+		double *snd, double *z, double *r, double *d,
+		double *t, double *wa, int *indx, int *iwhere,
+		int *indx2, char *task, int iprint,
+		char *csave, int *lsave, int *isave, double *dsave);
+
+void errclb(int n, int m, double factr, double *l, double *u,
+		int *nbd, char *task, int *info, int *k);
+
+void prn3lb(int n, double *x, double *f, char *task, int iprint,
+		int info, int iter, int nfgv, int nintol, int nskip,
+		int nact, double sbgnrm, int nint,
+		char *word, int iback, double stp, double xstep,
+		int k);
+
+void prn1lb(int n, int m, double *l, double *u, double *x,
+		int iprint, double epsmch);
+
+void active(int n, double *l, double *u,
+		int *nbd, double *x, int *iwhere, int iprint,
+		int *prjctd, int *cnstnd, int *boxed);
+
+void projgr(int n, double *l, double *u,
+		int *nbd, double *x, double *g, double *sbgnrm);
+
+void timer(double * ttime);
+
+void cauchy(int n, double *x, double *l, double *u, int *nbd,
+		double *g, int *iorder, int * iwhere, double *t,
+		double *d, double *xcp, int m,
+		double *wy, double *ws, double *sy, double *wt,
+		double *theta, int *col, int *head, double *p,
+		double *c, double *wbp, double *v, int *nint,
+		int iprint, double *sbgnrm, int *info, double * epsmch);
+
+void freev(int n, int *nfree, int *indx,
+		int *nenter, int *ileave, int *indx2, int *iwhere,
+		int *wrk, int *updatd, int *cnstnd, int iprint,
+		int *iter);
+
+void formk(int n, int *nsub, int *ind, int * nenter, int *ileave,
+		int *indx2, int *iupdat, int * updatd, double *wn,
+		double *wn1, int m, double *ws, double *wy, double *sy,
+		double *theta, int *col, int *head, int *info);
+
+void cmprlb(int n, int m, double *x,
+		double *g, double *ws, double *wy, double *sy,
+		double *wt, double *z, double *r, double *wa,
+		int *indx, double *theta, int *col, int *head,
+		int *nfree, int *cnstnd, int *info);
+
+void subsm(int n, int m, int *nsub, int *ind,
+		double *l, double *u, int *nbd, double *x,
+		double *d, double *ws, double *wy, double *theta,
+		int *col, int *head, int *iword, double *wv,
+		double *wn, int iprint, int *info);
+
+void lnsrlb(int n, double *l, double *u,
+		int *nbd, double *x, double *f, double *fold,
+		double *gd, double *gdold, double *g, double *d,
+		double *r, double *t, double *z, double *stp,
+		double *dnorm, double *dtd, double *xstep,
+		double *stpmx, int *iter, int *ifun, int *iback, int *nfgv,
+		int *info, char *task, int *boxed, int *cnstnd,
+		char *csave, int *isave, double *dsave);
+
+void matupd(int n, int m, double *ws,
+		double *wy, double *sy, double *ss, double *d,
+		double *r, int *itail, int *iupdat, int *col,
+		int *head, double *theta, double *rr, double *dr,
+		double *stp, double *dtd);
+
+void prn2lb(int n, double *x, double *f, double *g, int iprint,
+		int iter, int nfgv, int nact, double sbgnrm,
+		int nint, char *word, int iword, int iback,
+		double stp, double xstep);
+
+void pvector(char *title, double *x, int n);
+
+void formt(int m, double *wt, double *sy, double *ss,
+		int *col, double *theta, int *info);
+
+void bmv(int m, double *sy, double *wt,
+		int *col, double *v, double *p, int *info);
+
+void hpsolb(int n, double *t, int *iorder, int iheap);
+
+void dcsrch(double *f, double *g, double *stp,
+		/*Chgd: the next five are no longer pointers:*/
+		double ftol, double gtol, double xtol,
+		double stpmin, double stpmax,
+		char *task, int *isave, double *dsave);
+
+void dcstep(double *stx, double *fx, double *dx,
+		double *sty, double *fy, double *dy, double *stp,
+		double *fp, double *dp, int *brackt, double *stpmin,
+		double *stpmax);
+
+// ========================================================= //
+// Other Fortran functions
+// ========================================================= //
+/* DX(I) / DY(I): 1-based element access to whatever arrays are named dx / dy at the point of use, i.e. dx[(I)-1] and dy[(I)-1]. */
+#define DY(I) dy[(I)-1]
+#define DX(I) dx[(I)-1]
+
+int dcopy(int *n, double *dx, int *incx,
+		double *dy, int *incy);
+
+int dscal(int *n, double *da, double *dx,
+		int *incx);
+
+double ddot(int *n, double *dx, int *incx, double *dy,
+		int *incy);
+
+int daxpy(int *n, double *da, double *dx,
+		int *incx, double *dy, int *incy);
+
+int dpofa(double *a, int *lda, int *n, int *info);
+
+int dtrsl(double *t, int *ldt, int *n,
+		double *b, int *job, int *info);
+
+#endif /* defined(__RAL_RAS__lbfgsb_new__) */
diff --git a/likelihood.c b/likelihood.c
new file mode 100644
index 0000000..4de1e9b
--- /dev/null
+++ b/likelihood.c
@@ -0,0 +1,176 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define GLOBAL_VARIABLES_DEFINITION
+#include "axml.h"
+#include "globalVariables.h"
+
+void read_msa (tree * tr, char * filename);
+
+/* Saved copy of one node's likelihood vectors: for each of num_partitions partitions, lh_values[i] is a heap buffer of partition_sizes[i] bytes. */
+typedef struct
+{
+  int node_number;
+  int num_partitions;
+  size_t *partition_sizes;
+  double **lh_values;
+}likelihood_vector;
+/* Release v together with its per-partition buffers and both bookkeeping arrays; a NULL v is ignored. */
+void free_likelihood_vector(likelihood_vector *v)
+{
+  int part = 0;
+  if(!v)
+    return;
+  while(part < v->num_partitions)
+    free(v->lh_values[part++]);
+  free(v->lh_values);
+  free(v->partition_sizes);
+  free(v);
+}
+/* Deep-copy the likelihood vector of node p, one heap buffer per partition; release the result with free_likelihood_vector(). */
+likelihood_vector *copy_likelihood_vectors (tree *tr, nodeptr p)
+{
+  assert(tr->useRecom == FALSE);
+  likelihood_vector *v = (likelihood_vector *) malloc(sizeof(likelihood_vector));
+  assert (v != NULL);
+  v->node_number = p->number;
+  v->num_partitions = tr->NumberOfModels;
+  v->partition_sizes = (size_t *)malloc(tr->NumberOfModels * sizeof(size_t));
+  v->lh_values = (double **)malloc(tr->NumberOfModels * sizeof(double *));
+  assert (v->partition_sizes != NULL && v->lh_values != NULL);
+  size_t rateHet, states, width, vector_size;
+  rateHet = discreteRateCategories(tr->rateHetModel);
+  int model;
+  for(model = 0; model < tr->NumberOfModels; model++)
+  {
+    width  = (size_t)tr->partitionData[model].width;
+    states = (size_t)tr->partitionData[model].states;
+    /* vector_size is already a byte count, so malloc and memcpy both take it unscaled */
+    vector_size = virtual_width( width ) * rateHet * states * sizeof(double);
+    v->lh_values[model] = (double *)malloc(vector_size);
+    assert (v->lh_values[model] != NULL);
+    v->partition_sizes[model] = vector_size;
+    double *lh_vector_src = tr->partitionData[model].xVector[p->number - tr->mxtips - 1];
+    assert (lh_vector_src != NULL);
+    memcpy(v->lh_values[model], lh_vector_src, vector_size);
+  }
+  return v;
+}
+/* Overwrite node p's likelihood vector in every partition of tr with the bytes saved in v. */
+void restore_vector(tree *tr, nodeptr p, likelihood_vector *v)
+{
+  int part = 0;
+  while(part < tr->NumberOfModels)
+  {
+    memcpy(tr->partitionData[part].xVector[p->number - tr->mxtips - 1], v->lh_values[part], v->partition_sizes[part]);
+    part++;
+  }
+}
+/* TRUE when node p's likelihood vector in tr equals the copy in v entry for entry; otherwise prints the first mismatch and returns FALSE. */
+boolean same_vector(tree *tr, nodeptr p, likelihood_vector *v)
+{
+  int part, k;
+  for(part = 0; part < tr->NumberOfModels; part++)
+  {
+    const double *live = tr->partitionData[part].xVector[p->number - tr->mxtips - 1];
+    const int count = (int)v->partition_sizes[part]/sizeof(double);
+    for(k = 0; k < count; k++)
+    {
+      const double saved = v->lh_values[part][k];
+      if(saved == live[k])
+        continue;
+      printf("Diff entry in partition %d, site %d of %f\n", part, k, fabs(saved - live[k]));
+      return FALSE;
+    }
+  }
+  return TRUE;
+}
+/* Example driver: reads a binary alignment, builds a random tree, then demonstrates saving, perturbing and restoring one node's likelihood vector and a branch-length get/set round trip. */
+int main(int argc, char * argv[])
+{
+  tree        * tr;
+
+  if (argc != 2)
+   {
+     fprintf (stderr, "syntax: %s [binary-alignment-file]\n", argv[0]);
+     return (1);
+   }
+  tr = (tree *)malloc(sizeof(tree));
+  if (tr == NULL)
+   {
+     fprintf (stderr, "out of memory\n");
+     return (1);
+   }
+
+  /* read the binary input, setup tree, initialize model with alignment */
+  read_msa(tr,argv[1]);
+  tr->randomNumberSeed = 665;
+  makeRandomTree(tr);
+  printf("Number of taxa: %d\n", tr->mxtips);
+  printf("Number of partitions: %d\n", tr->NumberOfModels);
+
+  /* compute the LH of the full tree */
+  printf ("Virtual root: %d\n", tr->start->number);
+  evaluateGeneric(tr, tr->start, TRUE);
+  printf("Likelihood: %f\n", tr->likelihood);
+
+  /* branch length optimization; NOTE(review): this comment used to say "8 rounds" but smoothTree is passed 1 -- confirm the intended count */
+  smoothTree(tr, 1);
+  evaluateGeneric(tr, tr->start, TRUE);
+  printf("Likelihood after branch length optimization: %.20f\n", tr->likelihood);
+
+  /* Now we show how to find a particular LH vector for a node */
+  int i;
+  int node_number = tr->mxtips + 1;
+  nodeptr p = tr->nodep[node_number];
+  printf("Pointing to  node %d\n", p->number);
+
+  /* Fix as VR */
+  newviewGeneric(tr, p, FALSE);
+  newviewGeneric(tr, p->back, FALSE);
+  evaluateGeneric(tr, p, FALSE);
+  printf("Likelihood : %f\n", tr->likelihood);
+
+  printf("Make a copy of LH vector for node  %d\n", p->number);
+  likelihood_vector *vector = copy_likelihood_vectors(tr, p);
+  for(i=0; i<vector->num_partitions; i++)
+     printf("Partition %d requires %d bytes\n", i, (int)vector->partition_sizes[i]);
+
+  /* Check we have the same vector in both tree and copied one */
+  assert(same_vector(tr, p, vector));
+
+  /* Now force the p to get a new value (generally branch lengths are NOT updated like this) */
+  /* This is just an example to show usage (for fast NNI eval), manually updating vectors is not recommended! */
+  printf("bl : %.40f\n", p->next->z[0]);
+  p->next->z[0] = p->next->back->z[0] = zmin;
+  printf("bl : %.40f\n", p->next->z[0]);
+  newviewGeneric(tr, p, FALSE);
+  assert(!same_vector(tr, p, vector));
+  evaluateGeneric(tr, p, FALSE);
+  printf("Likelihood : %f\n", tr->likelihood);
+
+  restore_vector(tr, p, vector);
+  assert(same_vector(tr, p, vector));
+  evaluateGeneric(tr, p, FALSE);
+  printf("Likelihood after manually restoring the vector : %f\n", tr->likelihood);
+
+  free_likelihood_vector(vector);
+
+  /* Pick an inner branch */
+  printf("numBranches %d \n", tr->numBranches);
+  //tr->numBranches = 1;
+  p = tr->nodep[tr->mxtips + 1];
+  int partition_id = 0; /* single partition */
+  double bl = get_branch_length(tr, p, partition_id);
+  printf("z value: %f , bl value %f\n", p->z[partition_id], bl);
+  /* set the bl to 2.5 */
+  double new_bl = 2.5;
+  set_branch_length(tr, p, partition_id, new_bl);
+  printf("Changed BL to %f\n", new_bl);
+  printf("new z value: %f , new bl value %f\n", p->z[partition_id], get_branch_length(tr, p, partition_id));
+  /* set back to original */
+  printf("Changed to previous BL\n");
+  set_branch_length(tr, p, partition_id, bl);
+  printf("new z value: %f , new bl value %f\n", p->z[partition_id], get_branch_length(tr, p, partition_id));
+  return (0);
+}
diff --git a/lpwrapper.c b/lpwrapper.c
new file mode 100644
index 0000000..2b9ad4e
--- /dev/null
+++ b/lpwrapper.c
@@ -0,0 +1,104 @@
+
+/*#include "lp/lp_lib.h"*/
+#include "lpwrapper.h"
+
+/*
+void __WINAPI msgfunction(lprec *lp, void *userhandle, int msg)
+{
+	switch(msg) {
+	case MSG_LPFEASIBLE:
+		printf("Feasible solution found\n");
+		break;
+	case MSG_MILPFEASIBLE:
+		printf("Integer feasible solution found\n");
+		break;
+	case MSG_MILPBETTER:
+		printf("Better integer feasible solution found\n");
+		break;
+	case MSG_MILPEQUAL:
+		printf("Equal MILP solution found\n");
+		break;
+	}
+}
+*/
+/* Stub: the real lp_solve_version() call is disabled, so report version 0.0.0.0 rather than leaving the four outputs unset. */
+void lp_solve_version_info(int *majorversion, int *minorversion, int *release, int *build) {
+	*majorversion = *minorversion = *release = *build = 0; /* was: lp_solve_version(majorversion, minorversion, release, build); */
+}
+/* Stub: always returns 5 and leaves score/variables untouched; the LP_SOLVE-based implementation is kept below, commented out. */
+int lp_solve(char *filename, int ntaxa, double *score, double *variables, int verbose_mode) {
+	return 5;
+/*	lprec *lp = NULL;
+	int ret;
+	int Ncol;
+	int index, j;
+	double *row = NULL;
+	char *name;
+	char name2[200];
+
+	lp = read_LP(filename, IMPORTANT + ((verbose_mode < 2) ? 0 : verbose_mode-1), "pd");
+	//lp = read_LP(filename, NORMAL, "pd");
+	//strcpy(name2, filename);
+	//strcat(name2,".cnv");
+	//write_lp(lp, name2);
+
+	if (lp == NULL) {
+		printf("Could not create an LP_SOLVE instance!\n");
+		return 1;
+	}
+
+	set_mip_gap(lp, TRUE, 0.0);
+	
+	ret = solve(lp);
+	
+
+    if(ret == OPTIMAL || ret == PRESOLVED) {
+		ret = 0;
+	} else {
+		ret = 5;
+		printf("LP_SOLVE ERROR: %s\n", get_statustext(lp, ret));	
+		exit (1);
+	}
+
+	if(ret == 0) {
+	// a solution is calculated, now lets get some results
+	
+	// objective value
+	*score = get_objective(lp);
+	// variable values 
+	Ncol = get_Ncolumns(lp);
+	row = (double*) malloc(Ncol * sizeof(*row)); 
+	get_variables(lp, row);
+
+	
+	for(j = 0; j < Ncol; j++) {
+		name = get_col_name(lp, j+1);
+		if (name[0] == 'x') { // this is for taxa set
+			index = -1;
+			index = atoi(name+1);
+			//printf(name);
+			if (index < 0 || index >= ntaxa) {
+				printf("Index x_%d is not in the range!\n", index);
+				ret = 6;
+				break;
+			}
+			if (row[j] > tolerance && (1.0 - row[j]) > tolerance) {
+				if (verbose_mode >= 3) printf("\n%s = %10.8f", name, row[j]);
+				ret = 7;
+				if (verbose_mode < 3) break;
+			}
+			variables[index] = row[j];
+		}
+		//printf("%s: %f\n", get_col_name(lp, j + 1), row[j]);
+	}
+	free(row);
+	
+	// we are done now 
+	}
+	if(lp != NULL) {
+		// clean up such that all used memory by lpsolve is freed
+		delete_lp(lp);
+	}
+
+	return ret;*/
+}
diff --git a/lpwrapper.h b/lpwrapper.h
new file mode 100644
index 0000000..14f6c22
--- /dev/null
+++ b/lpwrapper.h
@@ -0,0 +1,56 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef _LP_WRAPPER
+#define _LP_WRAPPER
+
+#define tolerance 0.000001
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+	interface to call LP_SOLVE
+	@param filename name of input lp file
+	@param ntaxa number of taxa
+	@param score (OUT) returned optimal score
+	@param variables (OUT) array of returned solution
+	@param verbose_mode verbose mode
+	@return 
+		0 if everything works fine, 
+		5 if the solution is not optimal, 
+		6 if some variable has a wrong name, 
+		7 if the returned solution is not binary. In this case, one should run the solver 
+		again with strict binary variable constraint.
+*/
+int lp_solve(char *filename, int ntaxa, double *score, double *variables, int verbose_mode);
+
+/*int lp_demo();*/
+
+void lp_solve_version_info(int *majorversion, int *minorversion, int *release, int *build);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/maalignment.cpp b/maalignment.cpp
new file mode 100644
index 0000000..34d5408
--- /dev/null
+++ b/maalignment.cpp
@@ -0,0 +1,253 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "maalignment.h"
+
+void MaAlignment::readLogLL(char *fileName)
+{	// Reads the per-site log-likelihoods from fileName and stores one value per pattern in logLL.
+	//First read the values from inFile to a DoubleVector
+	DoubleVector _logllVec;
+	int siteNum = -1;
+	string currentString;
+	cout << "\nReading file containing site's loglikelihood: " << fileName << "...." << endl;
+    ifstream inFile;
+	try{
+		inFile.exceptions (ios::failbit | ios::badbit);
+		inFile.open(fileName);
+		/**really start reading*/
+		//read number of sites
+		inFile >> currentString;
+		siteNum = convert_int(currentString.c_str());
+		//ignore "Site_Lh"		
+		inFile >> currentString;		
+		while (!inFile.eof())
+		{
+			//reading each value of the file
+			//stop throwing on failbit, so that running out of values just ends the loop
+			inFile.exceptions (ios::badbit);
+			if ( !(inFile >> currentString) ) break;
+			//throw on failbit again
+			inFile.exceptions (ios::failbit | ios::badbit);
+			_logllVec.push_back(convert_double(currentString.c_str()));
+		}/**finish reading*/
+		inFile.clear();
+		inFile.exceptions (ios::failbit | ios::badbit);
+		inFile.close();
+	} catch(bad_alloc){
+			outError(ERR_NO_MEMORY);
+	} catch (const char *str){
+			outError(str);
+	} catch (string str){
+			outError(str);
+	} catch (ios::failure){
+			outError(ERR_READ_INPUT);
+	} catch (...){
+			outError(ERR_READ_ANY);
+	}
+	if (siteNum != (int)_logllVec.size()) // explicit cast: compare as int instead of relying on the implicit signed->unsigned conversion
+		outError("Actual number of site's likelihoods is not consistent with the announced number in the first line.");
+	cout << "Finish reading, now assign the logLL to the pattern:" << endl;
+
+	logLL.resize(getNPattern(),0.0);
+	for (int i = 0; i < siteNum; i++)
+	{
+		int patIndex = getPatternID(i);
+		if ( logLL[patIndex] == 0 ) // 0 is the fill value used by resize above, i.e. no value stored for this pattern yet
+			logLL[patIndex] = _logllVec[i];
+		else
+			if ( logLL[patIndex] != _logllVec[i] )
+				outError("Conflicting between the likelihoods reported for pattern", (*this)[patIndex]); // report the pattern of site i; i itself is a site index, not a pattern index
+	}
+//	int npat = getNPattern();
+//	cout << "Number of patterns: " << npat << endl;
+//	for ( int j = 0; j < npat; j++ )
+//		cout << j << "\t" << at(j) << "\t" << logLL[j] << endl;
+	cout << "Finish assigning logLL to the patterns!" << endl;	 
+}
+
+IntVector MaAlignment::computeExpectedNorFre()
+{	// Turns logLL into one integer expected frequency per pattern (returned in pattern order).
+	IntVector expectedNorFre;
+	if ( logLL.empty()) 
+		outError("Error: log likelihood of patterns are not given!");
+
+	int patNum = getNPattern();
+	int alignLen = getNSite();		
+	//resize the expectedNorFre vector
+	expectedNorFre.resize(patNum,-1);
+
+	//Vector containing the likelihood of the pattern p_i
+	DoubleVector LL(patNum,-1.0);
+	double sumLL = 0; //sum of the likelihood of the patterns in the alignment
+
+	//Compute the likelihood from the logLL
+	for ( int i = 0; i < patNum; i++ )
+	{
+		LL[i] = exp(logLL[i]);
+		sumLL += LL[i];
+	}
+
+	//Vector containing l_i = p_i*ell/sum_i(p_i)
+	DoubleVector ell(patNum, -1.0);
+	//Compute l_i
+	for ( int i = 0; i < patNum; i++ )
+	{
+		ell[i] = (double)alignLen * LL[i] / sumLL;
+	}
+
+
+	//Vector containing r_i where r_0 = ell_0; r_{i+1} = ell_{i+1} + r_i - ordinaryRounding(r_i)
+	DoubleVector r(patNum, -1.0);
+	//Compute r_i and the expected normalized frequencies
+	r[0] = ell[0]; // NOTE(review): element 0 is accessed unconditionally, so this assumes patNum >= 1
+	expectedNorFre[0] = (int)floor(ell[0]+0.5); //note that floor(_number+0.5) returns the ordinary rounding of _number
+	int sum = expectedNorFre[0]; // running total; only consumed by the disabled debug output below
+	for (int j = 1; j < patNum; j++ )
+	{
+		r[j] = ell[j] + r[j-1] - floor(r[j-1]+0.5); // carry the rounding error of the previous pattern forward
+		expectedNorFre[j] = (int)floor(r[j]+0.5);
+		sum += expectedNorFre[j];
+	}
+	
+	//cout << "Number of patterns: " << patNum << ", sum of expected sites: " << sum << endl;
+	return expectedNorFre;
+}
+
+void MaAlignment::printPatObsExpFre(const char *fileName)
+{	// Convenience overload: computes the expected frequencies itself, then delegates to the two-argument version.
+	IntVector expectedNorFre = computeExpectedNorFre();
+	printPatObsExpFre(fileName, expectedNorFre);
+}
+
+void MaAlignment::printPatObsExpFre(const char *fileName, const IntVector expectedNorFre)
+{	// Writes a tab-separated table to fileName: one header line, then one row per pattern.
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit); // stream errors surface as ios::failure, handled below
+		out.open(fileName);
+		out << "Pattern\tLogLL\tObservedFre\tExpectedFre" << endl;
+
+		int patNum = getNPattern();
+		int seqNum = getNSeq();
+		int seqID;
+
+		for ( int i = 0; i < patNum; i++ )
+		{
+			for ( seqID = 0; seqID < seqNum; seqID++ ){ // first column: the state of every sequence at pattern i
+				out << convertStateBackStr(at(i)[seqID]);
+			}
+			out << "\t" << logLL[i] << "\t" << (*this)[i].frequency << "\t" << expectedNorFre[i] << endl;
+		}
+		out.close();
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, fileName);
+	}
+}
+
+void MaAlignment::generateExpectedAlignment(MaAlignment *aln, double &prob)
+{	// Rebuilds this alignment so that pattern i of aln is added expectedNorFre[i] times; prob receives the log-probability printed at the end.
+	//cout << "In function: generating expected alignment!" << endl;
+	IntVector expectedNorFre = aln->computeExpectedNorFre();
+	
+	int nsite = aln->getNSite();
+	seq_names.insert(seq_names.begin(), aln->seq_names.begin(), aln->seq_names.end());
+	num_states = aln->num_states;
+	site_pattern.resize(nsite, -1);
+	clear(); // presumably drops the patterns currently held by this alignment -- confirm against Alignment
+	pattern_index.clear();
+	VerboseMode save_mode = verbose_mode; 
+	verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
+
+	int patID;
+	int site = 0;
+	int npat = aln->getNPattern();
+
+	double sumFac = 0;
+	double sumProb = 0;
+	double fac = logFac(nsite); // presumably log(nsite!); logFac is defined elsewhere
+
+	double sumFacMax = 0;
+	double sumProbMax = 0;
+
+	for (patID = 0; patID < npat; patID++) {
+		int patFre = expectedNorFre[patID];
+		for ( int patSite = 0; patSite < patFre; patSite++)
+		{			
+			Pattern pat = aln->at(patID);
+			addPattern(pat,site);
+			site++;	
+		}
+
+		//to compute the probability of the new alignment given the multinomial distribution
+		sumFac += logFac(patFre);
+		sumProb += (double)patFre*log((double)aln->at(patID).frequency/(double)nsite);
+
+		//for the unconstraint maximum log likelihood
+		sumFacMax += logFac(aln->at(patID).frequency);
+		sumProbMax += (double)aln->at(patID).frequency*log((double)aln->at(patID).frequency/(double)nsite);
+	}
+	prob = fac - sumFac + sumProb;
+
+	double probMax = fac - sumFacMax + sumProbMax; // same formula, with the observed frequencies in place of the expected ones
+//	cout << "total number of sites: " << site << endl;
+	verbose_mode = save_mode; // restore the verbosity lowered above
+	countConstSite();
+	//cout << "Finish generating expected alignment!" << endl;
+	cout << "Logarithm of the probability of the new alignment given the multinomial distribution of the input alignment is: " << prob << endl;
+	cout << "Maximum unconstraint (log) likelihood of the input alignment: " << probMax << endl;
+// 	cout << "Maximum unconstraint likelihood: " << exp(probMax) << endl;
+}
+
+/*void MaAlignment::multinomialProb(Alignment objectAlign, double &prob)
+{
+	cout << "Computing the multinomial probability of an object alignment given a reference alignment ..." << endl;
+	//should we check for compatibility of sequence's names and sequence's order in THIS alignment and in the objectAlign??
+	//check alignment length
+	int nsite = getNSite();
+	assert(nsite == objectAlign.getNSite());
+	double sumFac = 0;
+	double sumProb = 0;
+	double fac = logFac(nsite);
+	int index;
+	for ( Alignment::iterator objectIt = objectAlign.begin(); objectIt != objectAlign.end() ; objectIt++)
+	{
+		PatternIntMap::iterator pat_it = pattern_index.find((*objectIt));
+		if ( pat_it == pattern_index.end() ) //not found ==> error
+			outError("Pattern in the object alignment is not found in the reference alignment!");
+		sumFac += logFac((*objectIt).frequency);
+		index = pat_it->second;
+		sumProb += (double)(*objectIt).frequency*log((double)at(index).frequency/(double)nsite);
+	}
+	prob = fac - sumFac + sumProb;
+}*/
+
+/*void MaAlignment::multinomialProb(AlignmentVector objectAligns, DoubleVector &probs)
+{
+	int num = objectAligns.size();
+	double curProb;
+	if (num > 0)
+	{
+		probs.resize(num,0);
+		for ( int i = 0; i < num; i++ )
+		{
+			(*this).multinomialProb(objectAligns[i], curProb);
+			probs[i] = curProb;
+		}
+	}
+}*/
diff --git a/maalignment.h b/maalignment.h
new file mode 100644
index 0000000..d70072c
--- /dev/null
+++ b/maalignment.h
@@ -0,0 +1,116 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MAALIGNMENT_H
+#define MAALIGNMENT_H
+
+#include "alignment.h"
+
+typedef vector< Alignment > AlignmentVector;
+
+/**
+Extended Alignment class to serve some analysis, created by MA
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class MaAlignment : public Alignment
+{
+public:
+    MaAlignment() : Alignment() {};
+
+    MaAlignment(char *filename,  char *sequence_type, InputType &intype) : Alignment(filename, sequence_type, intype){};
+	
+	MaAlignment(Alignment &align) : Alignment(align){};
+
+	/**
+		To generate a new alignment from a given alignment (with the Expected Normalized Frequency)
+		@param inputAlign the input alignment from which the expected normalized frequency is derived (CONSTANT)
+		@param prop (OUT) the logarithm of the probability of the new alignment given the observed frequency of patterns in the input alignment (inputAlign).
+		THEN THIS ALIGNMENT IS UPDATED!
+		prop is computed as follows:
+		- We have pattern 1 ... k in the inputAlign with observed freq. d_1 ... d_k (d_1+..+d_k = ell)
+		==> The observed (relative) frequencies are p_1 ... p_k, p_i = d_i/ell
+		- From some tree T we know the likelihood of each pattern given the tree and we derive the expected frequency (the expected alignment) d1(T) ... dk(T) where sum d_i(T) = ell
+		===> prop = log( [ell!/product(d_i(T)!)] * product(p_i^d_i(T)) ).
+		
+	*/
+	void generateExpectedAlignment(MaAlignment *inputAlign, double &prop);
+
+	/**
+		To generate a new alignment with the Expected Normalized Frequency (declaration currently disabled)
+	*/
+	//void generateExpectedAlignment(Alignment &returnAlign, const IntVector expectedNorFre);
+
+	/**
+		To print a list containing: patterns and the corresponding observed, expected frequencies
+		@param fileName a file to store the information
+	*/
+	void printPatObsExpFre(const char *fileName);
+
+	/**
+		To print a list containing: patterns and the corresponding observed, expected frequencies
+		@param fileName a file to store the information
+		@param expectedNorFre a vector containing the expected frequencies
+	*/
+	void printPatObsExpFre(const char *fileName, const IntVector expectedNorFre);
+	/**
+		To compute the Expected Normalized Frequencies of the patterns in the alignment.
+		The values in this vector should be in the same order as the patterns in the pattern vector.
+		These values are computed based on
+			+ the length of the alignment (ell)
+			+ the logLL vector
+		How?
+			(not needed, but to be clear: observed frequencies d1, d2, ..., dk)
+			logLL --> likelihood: p1, p2 ... pk
+			Because we have in total 4^n patterns but may observe k < 4^n patterns ==> p1 + p2 + ... + pk <= 1.
+			This also means (p1 + p2 + ... + pk)*ell <= ell
+			Now we want to derive expected frequencies ^d1, ^d2, ..., ^dk such that ^d1 + ^d2 + ... + ^dk = ell based on p1, p2, ..., pk.
+			We do the following:
+			+ Compute li = ell*pi / sum_i(pi)  ==> sum_i (li) = ell
+			+ Because li is usually not an integer, we now have to round li as below:
+				* r1 = l1
+				* r_{i+1} = l_{i+1} + r_{i} - [r_i]
+				* Finally set: ^d_i = [r_i]
+				* where [.] denotes ordinary rounding
+	*/
+	IntVector computeExpectedNorFre();
+	/**
+		To read the log likelihood of the patterns from a file
+		@param filename file containing the number of sites, one header token, then one log likelihood per site
+		@result: the vector logLL will be changed by this function
+	*/
+	void readLogLL(char *filename);
+
+	/**
+		Compute the multinomial probabilities for a vector of object alignments according to the parameters determined by THIS alignment (declaration currently disabled)
+		@param objectAligns vector containing the object alignments
+		@param probs (OUT) returned vector containing the probabilities (double)
+	*/
+	//void multinomialProb (AlignmentVector objectAligns, DoubleVector &probs);
+private:
+	/*
+		Log likelihood of the patterns.
+		The values in this vector should be in the same order as the patterns in the pattern vector.
+		This must be ensured while reading the file containing these log likelihood values.
+	*/
+	DoubleVector logLL;	
+	//IntVector expectedNorFre;
+};
+
+#endif
diff --git a/matree.cpp b/matree.cpp
new file mode 100644
index 0000000..3d70578
--- /dev/null
+++ b/matree.cpp
@@ -0,0 +1,192 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "matree.h"
+
+void MaTree::printBrInfo(ostream& out) { // writes one header line and one line of branch-length statistics to out
+    //to store internal branch lengths
+    DoubleVector inner;
+    //to store external branch lengths
+    DoubleVector outer;
+    //to store all branch lengths
+    DoubleVector all;
+    //convert the tree into split graph (vector of split*)
+    SplitGraph mySg;
+    convertSplits(mySg);
+    //get information about the branch length based on this SplitGraph
+    for ( SplitGraph::iterator it = mySg.begin(); it != mySg.end(); it++)
+    {
+        (*it)->report(cout); // NOTE(review): reports to cout, not to the out parameter -- confirm this is intended
+        //the split is an external branch
+        if ( (*it)->countTaxa() == 1 )
+            outer.push_back((*it)->getWeight());
+        else //the split is an internal branch
+            inner.push_back((*it)->getWeight());
+        //a branch
+        all.push_back((*it)->getWeight());
+    }
+    //sort the three vectors of branch lengths (ascending, so element 0 is the minimum and the last one the maximum)
+    sort(inner.begin(),inner.end());
+    sort(outer.begin(),outer.end());
+    sort(all.begin(),all.end());
+    //for the statistics
+    int noInner = inner.size();
+    int noOuter = outer.size();
+    int noBr = all.size();
+    double aveInner = 0;
+    double aveOuter = 0;
+    double treeLen = 0;
+
+    for ( int i = 0; i < noInner; i++ )
+        aveInner += inner[i];
+    for ( int i = 0; i < noOuter; i++ )
+        aveOuter += outer[i];
+    for ( int i = 0; i < noBr; i++ )
+        treeLen += all[i];
+    aveInner /= (double)noInner; // NOTE(review): no guard for noInner == 0 here, nor for inner[0] below
+    aveOuter /= (double)noOuter; // NOTE(review): same assumption for the external branches
+    out << "minInter maxInter aveInter minExter maxExter aveExter minBr maxBr treeLen noBr" << endl;
+    out << inner[0] << " " << inner[noInner-1] << " " << aveInner << " " << outer[0] << " " << outer[noOuter-1] << " " << aveOuter << " " << all[0] << " " << all[noBr-1] << " " << treeLen << " " << noBr << endl;
+}
+
+void MaTree::comparedTo (MTreeSet &trees, DoubleMatrix &brLenMatrix, IntVector &RFs, DoubleVector &BSDs) { // one RFs/BSDs entry and one brLenMatrix row per tree in trees
+    //for consistency: renumber the taxa ids in the order given by nodenamecmp
+    NodeVector taxa;
+    getTaxa(taxa);
+    sort(taxa.begin(), taxa.end(), nodenamecmp);
+    int i;
+    NodeVector::iterator it;
+    for (it = taxa.begin(), i = 0; it != taxa.end(); it++, i++)
+        (*it)->id = i;
+
+    //convert the tree into SplitIntMap
+    SplitIntMap sim;
+    Split sp(leafNum); // scratch accumulator only, never stored in sim; on the stack so it is released on every return path
+    convertSplitIntMap(sim, &sp, 0);
+    //output to test
+    /*	for ( SplitIntMap::iterator it = sim.begin(); it != sim.end(); it++ ){
+    		cout << (*it).second << "\t";
+    		(*it).first->report(cout);
+    	}*/
+
+    // get the taxa name
+    vector<string> taxname;
+    taxname.resize(leafNum);
+    getTaxaName(taxname);
+
+    int noTree = trees.size();
+    if (noTree == 0 ) return;
+    RFs.resize(noTree);
+    BSDs.resize(noTree);
+
+    //now check if it is consistent (rooting, same leaf set) with the input trees; only the first tree is checked
+    MTree *tree = trees.front();
+//	if (tree->rooted != rooted)
+//		outError("Rooted and unrooted trees are mixed up");
+    if (tree->leafNum != leafNum)
+        outError("Tree has different number of taxa!");
+    vector<string> taxname1;
+    taxname1.resize(leafNum);
+    tree->getTaxaName(taxname1);
+
+    vector<string>::iterator strit;
+    for (strit = taxname1.begin(), i = 0; strit != taxname1.end(); strit++, i++) {
+        if ((*strit) != taxname[i])
+            outError("Tree has different taxa names!");
+    }
+
+    MTreeSet::iterator tit;
+    for ( tit = trees.begin(), i=0; tit != trees.end(); tit++, i++ )
+    {
+        DoubleVector brVec(nodeNum,-2); // -2 = entry not written by any split of this tree
+        SplitGraph *sg = new SplitGraph;
+        SplitIntMap *hs = new SplitIntMap;
+        (*tit)->convertSplits(taxname,*sg);
+        // make sure that taxon 0 is included
+        for (SplitGraph::iterator sit = sg->begin(); sit != sg->end(); sit++) {
+            if (!(*sit)->containTaxon(0)) (*sit)->invert();
+            //	(*sit)->report(cout);
+            hs->insertSplit((*sit), 1);
+        }
+
+        int rf = 0;
+        double bsd = 0;
+        //go through each split in this tree (not the compared tree)
+        for ( SplitIntMap::iterator tsit = sim.begin(); tsit != sim.end(); tsit++ )
+        {
+            Split* fSplit = hs->findSplit(tsit->first); // check whether the compared tree contains this split
+            if (fSplit) { //yes
+                brVec[tsit->second] = fSplit->getWeight(); //update brVec
+                bsd += (fSplit->getWeight() - tsit->first->getWeight()) *  (fSplit->getWeight() - tsit->first->getWeight());       //update bsd
+            }
+            else {
+                brVec[tsit->second] = -1; // split of this tree is absent from the compared tree
+                rf++;
+                bsd += tsit->first->getWeight() * tsit->first->getWeight();
+            }
+        }
+        //go through each split in the compared tree
+        for ( SplitIntMap::iterator fsit = hs->begin(); fsit != hs->end(); fsit++ )
+        {
+            Split* fSplit = sim.findSplit(fsit->first);
+            if (!fSplit) { // split only present in the compared tree
+                rf++;
+                bsd += fsit->first->getWeight() * fsit->first->getWeight();
+            }
+        }
+        //insert the result
+        RFs[i] = rf;
+        BSDs[i] = bsd; // sum of squared branch-length differences; no square root is taken here
+        brLenMatrix.push_back(brVec);
+        delete sg;
+        delete hs;
+    }
+}
+
+//void MaTree::convertSplitIntMap(SplitIntMap &sim){}
+
+void MaTree::convertSplitIntMap(SplitIntMap &sim, Split *resp, const int taxonID, Node *node, Node *dad) { // recursive: one split per branch below node goes into sim, the taxa below node into resp
+    if (!node) node = root;
+    assert(resp->getNTaxa() == leafNum);
+    assert (taxonID >= 0 && taxonID < leafNum);
+    bool has_child = false;
+    FOR_NEIGHBOR_IT(node, dad, it) { // presumably visits every neighbour of node except dad -- macro defined elsewhere
+        //vector<int> taxa;
+        //getTaxaID((*it)->node, node, taxa);
+
+        Split *sp = new Split(leafNum, (*it)->length); // split for this branch, created with the branch length
+        convertSplitIntMap(sim, sp, taxonID,(*it)->node, node);
+        *resp += *sp; // merge the child's taxa into the caller's set before sp may be inverted
+        if (!sp->containTaxon(taxonID))
+            sp->invert(); // store every split on the side that contains taxonID
+        //sg.push_back(sp);
+        if ( node == root)
+            sim.insertSplit(sp,node->id);
+        else
+            sim.insertSplit(sp,(*it)->node->id);
+        has_child = true;
+    }
+    if (!has_child) // no neighbour other than dad was visited: record this node's own id
+        resp->addTaxon(node->id);
+
+}
+
+
+
+
diff --git a/matree.h b/matree.h
new file mode 100644
index 0000000..73c12fd
--- /dev/null
+++ b/matree.h
@@ -0,0 +1,97 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MATREE_H
+#define MATREE_H
+
+//#include "tools.h"
+#include "mtree.h"
+#include "mtreeset.h"
+
+/**
+Minh Anh: extended tree to serve some statistics
+*/
+class MaTree : public MTree
+{
+public:
+/********************************************************
+	CONSTRUCTORs, INITIALIZATION AND DESTRUCTORs
+********************************************************/
+
+	/**
+		constructor, read tree from user file
+		@param userTreeFile the name of the user tree
+		@param is_rooted (IN/OUT) true if tree is rooted
+	*/
+	MaTree(const char *userTreeFile, bool &is_rooted) : MTree(userTreeFile, is_rooted) {};
+
+	/**
+		constructor, get from another tree
+		@param tree another MTree
+	*/
+	MaTree(MTree &tree) : MTree(tree) {};
+
+	/**
+		constructor
+	*/
+    MaTree() : MTree() {};
+
+/***************************************************************
+	OUTPUT INFORMATION ABOUT THE BRANCHES
+***************************************************************/
+	/**
+		Output information about branches on the tree into an output stream
+		The information is a header line followed by one line of values:
+		- minimum/maximum/average of the internal branch lengths
+		- minimum/maximum/average of the external branch lengths
+		- minimum/maximum of all branch lengths, the total tree length and the number of branches
+		@param out the output stream
+	*/
+	void printBrInfo(ostream& out);
+	
+	/**
+		Compare this tree with each tree in a given set of trees
+		@param trees (IN) the trees to compare
+		@param brLenMatrix (OUT) a matrix of double, one row (a vector of double) is appended per compared tree. Each row has one element per node of this tree (nodeNum), indexed by the node id stored with each split of this tree.
+		If branch i is contained in the other tree (atree), element i is the length of this branch 
+		on the other tree. If not, element i is set to -1. Elements that belong to no split of this tree stay at -2.
+		@param RFs (OUT) the Robinson-Foulds distance between this tree and each of the given trees.
+		@param BSDs (OUT) the branch score distance between this tree and each of the given trees.
+	*/
+	void comparedTo(MTreeSet &trees, DoubleMatrix &brLenMatrix, IntVector &RFs, DoubleVector &BSDs);
+
+	/**
+		convert the tree into SplitIntMap, the integer number is the nodeID of the corresponding branch (declaration currently disabled)
+		@param sim (OUT) resulting splitIntMap
+		@param taxonID (IN) the ID of an external node (taxon) to be presented in all splits
+	*/
+//	void convertSplitIntMap(SplitIntMap &sim, const int taxonID);
+
+	/**
+		convert the tree into SplitIntMap, recursive procedure
+		@param sim (OUT) resulting splitIntMap
+		@param resp (internal) set of taxa below node
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+		@param taxonID (IN) the ID of an external node (taxon) to be presented in all splits
+	*/
+	void convertSplitIntMap(SplitIntMap &sg, Split *resp, const int taxonID, Node *node = NULL, Node *dad = NULL);
+};
+
+#endif
diff --git a/mexttree.cpp b/mexttree.cpp
new file mode 100644
index 0000000..e4df396
--- /dev/null
+++ b/mexttree.cpp
@@ -0,0 +1,498 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "mexttree.h"
+#include "alignment.h"
+
+/**
+	generate a random tree of the given type; when params.aln_file is set,
+	the number of taxa and the leaf names are taken from that alignment
+	@param tree_type YULE_HARDING, UNIFORM, CATERPILLAR, BALANCED or STAR_TREE;
+		any other value builds nothing
+	@param params program parameters; sub_size is overwritten with the number of
+		sequences of the alignment if aln_file is given
+	@param binary forwarded to the Yule-Harding and uniform generators
+*/
+void MExtTree::generateRandomTree(TreeGenType tree_type, Params &params, bool binary) {
+	Alignment *alignment = NULL;
+	if (params.aln_file) {
+		// generate random tree with leaf sets taken from an alignment
+		alignment = new Alignment(params.aln_file, params.sequence_type, params.intype);
+		params.sub_size = alignment->getNSeq();
+	}
+	if (params.sub_size < 3) {
+		outError(ERR_FEW_TAXA);
+	}
+	switch (tree_type) {
+	case YULE_HARDING:
+		generateYuleHarding(params, binary);
+		break;
+	case UNIFORM:
+		generateUniform(params.sub_size, binary);
+		break;
+	case CATERPILLAR:
+		generateCaterpillar(params.sub_size);
+		break;
+	case BALANCED:
+		generateBalanced(params.sub_size);
+		break;
+	case STAR_TREE:
+		generateStarTree(params);
+		break;
+	default:
+		break;
+	}
+	if (!alignment) return;
+	// rename the generated leaves after the sequences of the alignment
+	NodeVector taxa;
+	getTaxa(taxa);
+	assert(taxa.size() == params.sub_size);
+	for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++)
+		(*it)->name = alignment->getSeqName((*it)->id);
+	// the names were copied into the nodes, so the alignment can be released
+	delete alignment;
+}
+
+/**
+	Set the length of randomly chosen branches, taken from getAllInnerBranches(), to zero.
+	Each branch is picked at most once; both directions of the branch are updated.
+	@param num_zero_len number of branches to set to zero length
+*/
+void MExtTree::setZeroInternalBranches(int num_zero_len) {
+	NodeVector ends_a, ends_b;
+	getAllInnerBranches(ends_a, ends_b);
+	if (num_zero_len > ends_a.size()) outError("The specified number of zero branches is too much");
+
+	int zeroed = 0;
+	while (zeroed < num_zero_len) {
+		int pick = random_int(ends_a.size());
+		Node *a = ends_a[pick];
+		// a NULL slot marks a branch that was already zeroed: draw again
+		if (a == NULL)
+			continue;
+		Node *b = ends_b[pick];
+		zeroed++;
+		a->findNeighbor(b)->length = 0.0;
+		b->findNeighbor(a)->length = 0.0;
+		ends_a[pick] = NULL;
+		ends_b[pick] = NULL;
+	}
+}
+
+/**
+	Remove every child node that is attached by a branch of length exactly 0.0,
+	re-attaching the children of the removed node directly to its parent.
+	The subtrees are processed first, then the branches leaving `node`.
+	@param node the starting node, NULL to start from the root
+	@param dad dad of the node, used to direct the search
+*/
+void MExtTree::collapseZeroBranches(Node *node, Node *dad) {
+	if (!node) node = root;
+	// recurse first, so zero branches deeper in the tree are collapsed before this level
+	FOR_NEIGHBOR_DECLARE(node, dad, it) {
+		collapseZeroBranches((*it)->node, node);
+	}
+	// iterate over a copy of the neighbor list: updateNeighbor/addNeighbor are called
+	// on `node` inside the loop (presumably changing node->neighbors)
+	NeighborVec nei_vec;
+	nei_vec.insert(nei_vec.begin(), node->neighbors.begin(), node->neighbors.end());
+	// `it` is reused from the FOR_NEIGHBOR_DECLARE above
+	for (it = nei_vec.begin(); it != nei_vec.end(); it++) 
+	if ((*it)->node != dad) {
+		if ((*it)->length == 0.0) { // delete the child node
+			Node *child = (*it)->node;
+			bool first = true;
+			FOR_NEIGHBOR_IT(child, node, it2) {
+				// the first grandchild takes over the slot that pointed to `child`,
+				// every further grandchild is added as an extra neighbor of `node`
+				if (first)
+					node->updateNeighbor(child, (*it2)->node, (*it2)->length);
+				else
+					node->addNeighbor((*it2)->node, (*it2)->length);
+				// make the grandchild point back to `node` instead of `child`
+				(*it2)->node->updateNeighbor(child, node);
+				first = false;
+			}
+			delete child;
+		}
+	}
+}
+
+/**
+	generate a caterpillar tree: starting from a star with 3 leaves, the last leaf
+	of the leaf list is repeatedly turned into an internal node carrying two new leaves.
+	Branch lengths are drawn with random_double().
+	@param size number of taxa, fewer than 3 is reported via outError
+*/
+void MExtTree::generateCaterpillar(int size) {
+	if (size < 3)
+		outError(ERR_FEW_TAXA);
+	root = newNode();
+	int i;
+	NodeVector myleaves;
+	Node *node;
+	double len;
+
+	// create initial tree with 3 leaves
+	for (i = 0; i < 3; i++)
+	{
+		node = newNode();
+		len = random_double();
+		root->addNeighbor(node, len);
+		node->addNeighbor(root, len);
+		myleaves.push_back(node);
+	}
+
+	// each step splits the last leaf of the list, adding one leaf overall
+	for (i = 3; i < size; i++)
+	{
+		int index = i-1;
+
+		node = myleaves[index];
+		// the first new leaf takes over the slot of the split leaf
+		Node *newleaf = newNode();
+		len = random_double();
+		node->addNeighbor(newleaf, len);
+		newleaf->addNeighbor(node, len);
+		myleaves[index] = newleaf;
+
+		// the second new leaf is appended and will be split in the next step
+		newleaf = newNode();
+		len = random_double();
+		node->addNeighbor(newleaf, len);
+		newleaf->addNeighbor(node, len);
+		myleaves.push_back(newleaf);
+
+	}
+
+	// the tree is rooted at the first leaf
+	root = myleaves[0];
+	// indexing the leaves
+	setLeavesName(myleaves);
+
+	leafNum = myleaves.size();
+	nodeNum = leafNum;
+	initializeTree();
+
+}
+
+
+/**
+	Build a tree by splitting the current leaves in list order, pass after pass,
+	until the requested number of leaves exists. Branch lengths come from random_double().
+	@param size number of taxa, fewer than 3 is reported via outError
+*/
+void MExtTree::generateBalanced(int size) {
+	if (size < 3)
+		outError(ERR_FEW_TAXA);
+
+	root = newNode();
+	NodeVector tips;
+	tips.push_back(root);
+
+	// start from a single branch: the root and one other node
+	Node *second = newNode();
+	double brlen = random_double();
+	root->addNeighbor(second, brlen);
+	second->addNeighbor(root, brlen);
+	tips.push_back(second);
+
+	while (tips.size() < size) {
+		// only the tips present at the start of this pass are split
+		int pass_end = tips.size();
+		for (int pos = 0; pos < pass_end && tips.size() < size; pos++) {
+			Node *parent = tips[pos];
+
+			// first child takes over the slot of its parent in the list
+			Node *kid = newNode();
+			brlen = random_double();
+			parent->addNeighbor(kid, brlen);
+			kid->addNeighbor(parent, brlen);
+			tips[pos] = kid;
+
+			// second child goes to the end of the list
+			kid = newNode();
+			brlen = random_double();
+			parent->addNeighbor(kid, brlen);
+			kid->addNeighbor(parent, brlen);
+			tips.push_back(kid);
+		}
+	}
+
+	// root the tree at the first leaf, then number and name all leaves
+	root = tips[0];
+	setLeavesName(tips);
+
+	leafNum = tips.size();
+	nodeNum = leafNum;
+	initializeTree();
+}
+
+/**
+	generate a random tree following uniform model: starting from a single branch,
+	each new leaf is attached to the middle of a randomly drawn existing branch.
+	Branch lengths are drawn with random_double().
+	@param size number of taxa, fewer than 3 is reported via outError
+	@param binary not used by this function
+*/
+void MExtTree::generateUniform(int size, bool binary)
+{
+	if (size < 3)
+		outError(ERR_FEW_TAXA);
+	int i;
+
+	// list of left- and right-end of branches: branch k connects leftend[k] and rightend[k]
+	NodeVector leftend, rightend, myleaves;
+	Node *node;
+	double len;
+
+	root = newNode(0, "0");
+	// create initial tree with 2 leaves
+	node = newNode(1, "1");
+	len = random_double();
+	root->addNeighbor(node, len);
+	node->addNeighbor(root, len);
+
+	leftend.push_back(root);
+	rightend.push_back(node);
+
+	myleaves.push_back(root);
+	myleaves.push_back(node);
+
+	// additionally add a leaf
+	for (i = 2; i < size; i++)
+	{
+		int index;
+		// 2*i-3 is the number of branches stored so far (1 initially, +2 per added leaf)
+		index = random_int(2*i-3);
+		//cout << "step " << i << " left = " << leftend[index]->id << " right = " << rightend[index]->id << endl;
+
+		// add an internal node
+		Node *newnode = newNode(size+i-2);
+		// reconnect the left end: its link to the right end now points to newnode
+		node = leftend[index];
+		for (NeighborVec::iterator it = node->neighbors.begin(); it != node->neighbors.end(); it++) 
+			if ((*it)->node == rightend[index]) {
+				len = random_double();
+				(*it)->node = newnode;
+				(*it)->length = len;
+				newnode->addNeighbor(node, len);
+				//cout << "  left " << leftend[index]->id << " " << newnode->id << endl;
+				break;
+			}
+		// reconnect the right end: its link to the left end now points to newnode
+		node = rightend[index];
+		for (NeighborVec::iterator it = node->neighbors.begin(); it != node->neighbors.end(); it++) 
+			if ((*it)->node == leftend[index]) {
+				len = random_double();
+				(*it)->node = newnode;
+				(*it)->length = len;
+				newnode->addNeighbor(node, len);
+				//cout << "  right " << rightend[index]->id  << " " << newnode->id  << endl;
+				break;
+			}
+
+		// add a new leaf
+		Node *newleaf = newNode(i, i);
+		len = random_double();
+		newnode->addNeighbor(newleaf, len);
+		newleaf->addNeighbor(newnode, len);
+
+		// update the leftend and rightend list
+		// new branch: newnode -- old right end
+		leftend.push_back(newnode);
+		rightend.push_back(rightend[index]);
+
+		// new branch: newnode -- new leaf
+		leftend.push_back(newnode);
+		rightend.push_back(newleaf);
+
+		// the drawn branch is shortened to: left end -- newnode
+		rightend[index] = newnode;
+
+		myleaves.push_back(newleaf);
+
+	}
+
+	// indexing the leaves (overwrites the ids and names given to the leaves above)
+	setLeavesName(myleaves);
+
+	leafNum = size;
+	nodeNum = leafNum;
+	initializeTree();
+
+}
+
+/**
+	Grow a random tree following the Yule-Harding model: new leaves are attached
+	at randomly drawn positions, branch lengths are drawn with randomLen(params).
+	@param params program parameters; sub_size gives the number of taxa
+	@param binary if true only leaves are drawn and split; otherwise an existing
+		internal node may be drawn too and then receives a single extra leaf
+*/
+void MExtTree::generateYuleHarding(Params &params, bool binary) {
+	int size = params.sub_size;
+	if (size < 3)
+		outError(ERR_FEW_TAXA);
+
+	root = newNode();
+	NodeVector tips;
+	NodeVector inner;
+	inner.push_back(root);
+
+	// seed: three leaves around the root
+	for (int k = 0; k < 3; k++) {
+		Node *tip = newNode();
+		double brlen = randomLen(params);
+		root->addNeighbor(tip, brlen);
+		tip->addNeighbor(root, brlen);
+		tips.push_back(tip);
+	}
+
+	// every round increases the number of leaves by exactly one
+	for (int step = 3; step < size; step++) {
+		int pick = binary ? random_int(step) : random_int(step + inner.size());
+
+		Node *anchor;
+		if (pick < step) {
+			// a leaf was drawn: it becomes internal and a new leaf takes over its slot
+			anchor = tips[pick];
+			inner.push_back(anchor);
+			Node *replacement = newNode();
+			double brlen = randomLen(params);
+			anchor->addNeighbor(replacement, brlen);
+			replacement->addNeighbor(anchor, brlen);
+			tips[pick] = replacement;
+		} else {
+			// an internal node was drawn: it only receives the leaf appended below
+			anchor = inner[pick - step];
+		}
+
+		// in both cases one new leaf is attached and appended to the list
+		Node *extra = newNode();
+		double brlen = randomLen(params);
+		anchor->addNeighbor(extra, brlen);
+		extra->addNeighbor(anchor, brlen);
+		tips.push_back(extra);
+	}
+
+	// root the tree at the first leaf, then number and name all leaves
+	root = tips[0];
+	setLeavesName(tips);
+
+	leafNum = tips.size();
+	nodeNum = leafNum;
+	initializeTree();
+}
+
+/**
+	Generate a Yule-Harding tree, then set the length of every branch
+	returned by getAllInnerBranches() to zero (in both directions).
+	@param params program parameters, passed on to generateYuleHarding()
+*/
+void MExtTree::generateStarTree(Params &params) {
+	generateYuleHarding(params);
+
+	NodeVector ends_a, ends_b;
+	getAllInnerBranches(ends_a, ends_b);
+
+	NodeVector::iterator a = ends_a.begin();
+	NodeVector::iterator b = ends_b.begin();
+	for (; a != ends_a.end(); ++a, ++b) {
+		(*a)->findNeighbor(*b)->length = 0.0;
+		(*b)->findNeighbor(*a)->length = 0.0;
+	}
+}
+
+/**
+	Assign a new length drawn with randomLen(params) to every branch of the
+	subtree; the same value is stored on both directions of a branch.
+	@param params program parameters
+	@param node the starting node, NULL to start from the root
+	@param dad dad of the node, used to direct the search
+*/
+void MExtTree::generateRandomBranchLengths(Params &params, Node *node, Node *dad) {
+	if (node == NULL)
+		node = root;
+	FOR_NEIGHBOR_IT(node, dad, nei) {
+		const double brlen = randomLen(params);
+		Node *child = (*nei)->node;
+		(*nei)->length = brlen;
+		child->findNeighbor(node)->length = brlen;
+		generateRandomBranchLengths(params, child, node);
+	}
+}
+
+
+/**
+	Number the given leaves 0, 1, 2, ... in list order and name each one
+	'T' followed by its number.
+	@param myleaves vector of leaves
+*/
+void MExtTree::setLeavesName(NodeVector &myleaves) {
+	int next_id = 0;
+	for (NodeVector::iterator leaf = myleaves.begin(); leaf != myleaves.end(); ++leaf, ++next_id) {
+		(*leaf)->id = next_id;
+		stringstream label;
+		label << 'T' << (*leaf)->id;
+		(*leaf)->name = label.str();
+	}
+}
+
+
+/**
+	Print to cout, one per line and without branch lengths, every tree of the
+	set whose splits do not contain the given split.
+	@param taxname vector of taxa names, used to convert each tree into splits
+	@param trees set of trees to check
+	@param mysplit the split to look for
+*/
+void MExtTree::reportDisagreedTrees(vector<string> &taxname, MTreeSet &trees, Split &mysplit) {
+	for (MTreeSet::iterator cur = trees.begin(); cur != trees.end(); ++cur) {
+		SplitGraph tree_splits;
+		(*cur)->convertSplits(taxname, tree_splits);
+		if (tree_splits.containSplit(mysplit))
+			continue;
+		(*cur)->printTree(cout, 0); // don't print branch lengths
+		cout << endl;
+	}
+}
+
+
+/**
+	Annotate the name of the child node of every branch between two non-leaf nodes
+	with the weight of the corresponding split found in hash_ss, or with "0" if the
+	split is not found. The annotation is appended to an existing name after a "/".
+	If the resulting name starts with "INFO", the trees not containing the split
+	are printed via reportDisagreedTrees().
+	@param taxname vector of taxa names
+	@param trees set of trees
+	@param sg split graph, only passed on to the recursive calls here
+	@param hash_ss hash split set that is searched for each split
+	@param tag if not NULL: when it is "ALL" or equals the current name of the child node,
+		the name of the found split is appended after the weight
+	@param node the starting node, NULL to start from the root
+	@param dad dad of the node, used to direct the search
+*/
+void MExtTree::createBootstrapSupport(vector<string> &taxname, MTreeSet &trees, SplitGraph &sg, SplitIntMap &hash_ss, 
+    char *tag, Node *node, Node *dad) {
+	if (!node) node = root;	
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		// only branches between two non-leaf nodes are annotated
+		if (!node->isLeaf() && !(*it)->node->isLeaf()) {
+			// build the split from the taxa found below the child node
+			vector<int> taxa;
+			getTaxaID(taxa, (*it)->node, node);
+			Split mysplit(leafNum, 0.0, taxa);
+			if (mysplit.shouldInvert())
+				mysplit.invert();
+			//mysplit.report(cout);
+			//SplitIntMap::iterator ass_it = hash_ss.find(&mysplit);
+			Split *sp = hash_ss.findSplit(&mysplit);
+			// if found smt
+			if (sp != NULL) {
+				//Split *sp = ass_it->first;
+				/*char tmp[100];
+				if ((*it)->node->name.empty()) {
+					sprintf(tmp, "%d", round(sp->getWeight()));
+				} else
+					sprintf(tmp, "/%d", round(sp->getWeight()));*/
+				// the weight is separated by "/" from an already existing name
+				stringstream tmp;
+				if ((*it)->node->name.empty())
+				  tmp << sp->getWeight();
+				else
+				  tmp << "/" << sp->getWeight();
+                  
+                // assign tag (the node name is compared before the weight is appended)
+                if (tag && (strcmp(tag, "ALL")==0 || (*it)->node->name == tag))
+                    tmp << sp->getName();                
+				(*it)->node->name.append(tmp.str());
+			} else {
+				// split not found: annotate with "0"
+				if (!(*it)->node->name.empty()) (*it)->node->name.append("/");
+				(*it)->node->name.append("0");
+				if (verbose_mode >= VB_MED) {
+					cout << "split not found:" << endl;
+					mysplit.report(cout);
+				}
+			} 
+			/* new stuff: report trees that do not contain the split */
+			if (strncmp((*it)->node->name.c_str(), "INFO", 4) == 0) {
+				cout << "Reporting trees not containing the split " << (*it)->node->name << endl;
+				reportDisagreedTrees(taxname, trees, mysplit);
+			}
+		}
+		createBootstrapSupport(taxname, trees, sg, hash_ss, tag, (*it)->node, node);
+	}	
+}
+
+/**
+	create CLUSTER for each branch whose two end-nodes both have degree 3.
+	For such a branch every leaf is labelled (in Node::height) with the number of
+	the subtree around the branch it belongs to, one row holding these labels in
+	the order of `taxa` is appended to `clusters`, and the child node of the branch
+	is named with the resulting number of rows as decimal text.
+	The names of all visited non-leaf child nodes are cleared.
+	@param taxa an order of taxa
+	@param clusters (OUT) list of all clusters
+	@param node the starting node, NULL to start from the root
+	@param dad dad of the node, used to direct the search
+*/
+void MExtTree::createCluster(NodeVector &taxa, matrix(int) &clusters, Node *node, Node *dad) {
+	if (node == NULL) node = root;
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		Node *child = (*it)->node;
+		if (!child->isLeaf()) child->name = "";
+		// if both end-nodes are bifurcating
+		if (node->degree() == 3 && child->degree() == 3) {
+			int count = 0;
+			// label the leaves of each subtree hanging below the child
+			FOR_NEIGHBOR_DECLARE(child, node, it2)
+				createCluster(count++, (*it2)->node, child);
+			// label the leaves on the other side of the branch
+			if (!rooted) {
+				FOR_NEIGHBOR(node, child, it2)
+					createCluster(count++, (*it2)->node, node);
+			} else createCluster(count++, node, child);
+
+
+			clusters.resize(clusters.size()+1);
+			for (NodeVector::iterator nit = taxa.begin(); nit != taxa.end(); nit++) {
+				clusters.back().push_back((int)((*nit)->height));
+			}
+			// write the cluster number as decimal text; `name += clusters.size()`
+			// would convert the number to one raw char and append that instead
+			stringstream cluster_id;
+			cluster_id << clusters.size();
+			child->name = cluster_id.str();
+		}
+		createCluster(taxa, clusters, child, node);
+	}
+}
+
+/**
+	Store the cluster number in Node::height of every leaf of the subtree.
+	@param clu_num cluster number
+	@param node the starting node
+	@param dad dad of the node, used to direct the search
+*/
+void MExtTree::createCluster(int clu_num, Node *node, Node *dad) {
+	if (node->isLeaf())
+		node->height = clu_num;
+	FOR_NEIGHBOR_IT(node, dad, nei)
+		createCluster(clu_num, (*nei)->node, node);
+}
+
diff --git a/mexttree.h b/mexttree.h
new file mode 100644
index 0000000..c45caed
--- /dev/null
+++ b/mexttree.h
@@ -0,0 +1,161 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MEXTTREE_H
+#define MEXTTREE_H
+
+#include "mtree.h"
+#include "mtreeset.h"
+
+/**
+extended tree, for random tree generation, bootstrap support, clusters, etc.
+(not related to the PDA main topic)
+
+	@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class MExtTree : public MTree
+{
+public:
+
+/********************************************************
+	CONSTRUCTORs, INITIALIZATION AND DESTRUCTORs
+********************************************************/
+
+	/**
+		constructor, read tree from user file
+		@param userTreeFile the name of the user tree
+		@param is_rooted (IN/OUT) true if tree is rooted
+	*/
+	MExtTree(const char *userTreeFile, bool &is_rooted) : MTree(userTreeFile, is_rooted) {};
+
+	/**
+		constructor, get from another tree
+		@param tree another MTree
+	*/
+	MExtTree(MTree &tree) : MTree(tree) {};
+
+	/**
+		default constructor, only calls the MTree default constructor
+	*/
+    MExtTree() : MTree() {};
+
+
+/********************************************************
+	GENERATE RANDOM TREE PROCEDURES
+********************************************************/
+
+	/**
+		generate a random tree with given tree type
+		@param tree_type can be YULE_HARDING, UNIFORM, BALANCED, or CATERPILLAR, or STAR_TREE
+		@param params program parameters
+		@param binary TRUE if you want to generate a binary tree
+	*/
+	void generateRandomTree(TreeGenType tree_type, Params &params, bool binary = true);
+
+	/**
+		generate a random tree following Yule-Harding model
+		@param params program parameters
+		@param binary TRUE if you want to generate a binary tree
+	*/
+	void generateYuleHarding(Params &params, bool binary = true);
+
+	/**
+		generate a random tree following uniform model
+		@param size number of taxa
+		@param binary TRUE if you want to generate a binary tree
+	*/
+	void generateUniform(int size, bool binary = true);
+
+	/**
+		generate a caterpillar tree
+		@param size number of taxa
+	*/
+	void generateCaterpillar(int size);
+
+	/**
+		generate a balanced tree
+		@param size number of taxa
+	*/
+	void generateBalanced(int size);
+
+	/**
+		generate a star tree
+		@param params program parameters
+	*/
+	void generateStarTree(Params &params);
+
+	/**
+	 * generate random branch lengths on the given topology
+	 * 		@param params program parameters
+	 * 		@param node the starting node, NULL to start from the root
+	 * 		@param dad dad of the node, used to direct the search
+	 */
+	void generateRandomBranchLengths(Params &params, Node* node = NULL, Node *dad = NULL);
+
+	/**
+		set the leaf ID and names when generating random tree
+		@param myleaves vector of leaves
+	*/
+	void setLeavesName(NodeVector &myleaves);
+
+
+	/**
+		set the length of randomly chosen internal branches to zero
+		@param num_zero_len number of branches to set to zero length
+	*/
+	void setZeroInternalBranches(int num_zero_len);
+
+	/**
+		remove the nodes attached by zero-length branches, re-attaching their
+		children to the parent node
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+	*/
+	void collapseZeroBranches(Node *node = NULL, Node *dad = NULL);
+
+/********************************************************
+	BOOTSTRAP
+********************************************************/
+
+	/**
+		create support value for each internal node to the weight of split in the split graph
+		@param taxname vector of taxa names
+		@param trees set of trees
+		@param sg split graph
+		@param hash_ss hash split set
+		@param tag node name (or "ALL") selecting the nodes that also get the split name appended, may be NULL
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+	*/
+	void createBootstrapSupport(vector<string> &taxname, MTreeSet &trees, SplitGraph &sg, SplitIntMap &hash_ss, char *tag,
+		Node *node = NULL, Node *dad = NULL);
+
+	/**
+		print every tree of the set that does not contain the given split
+		@param taxname vector of taxa names
+		@param trees set of trees
+		@param mysplit the split to look for
+	*/
+	void reportDisagreedTrees(vector<string> &taxname, MTreeSet &trees, Split &mysplit);
+
+/********************************************************
+	CLUSTER for each branch, useful for likelihood mapping analysis
+********************************************************/
+
+	/**
+		create CLUSTER for each branch, useful for likelihood mapping analysis
+		@param taxa an order of taxa
+		@param clusters (OUT) list of all clusters
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+	*/
+	void createCluster(NodeVector &taxa, matrix(int) &clusters, Node *node = NULL, Node *dad = NULL);
+
+	/**
+		assign a cluster number to all leaves of a subtree, helper of the createCluster above
+		@param clu_num cluster number
+		@param node the starting node, must not be NULL
+		@param dad dad of the node, used to direct the search
+	*/
+	void createCluster(int clu_num, Node *node, Node *dad);
+
+
+};
+
+#endif
diff --git a/model/CMakeLists.txt b/model/CMakeLists.txt
new file mode 100644
index 0000000..a05e424
--- /dev/null
+++ b/model/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Build the `model` library from the model and rate sources listed below.
+add_library(model
+modelgtr.cpp
+modelbin.cpp
+modeldna.cpp
+modelfactory.cpp
+modelnonrev.cpp
+modelprotein.cpp
+modelset.cpp
+modelsubst.cpp
+partitionmodel.cpp
+rategamma.cpp
+rategammainvar.cpp
+rateheterogeneity.cpp
+rateinvar.cpp
+ratemeyerdiscrete.cpp
+ratemeyerhaeseler.cpp
+ratekategory.cpp
+ratefree.cpp
+ratefreeinvar.cpp
+modelcodon.cpp
+modelmorphology.cpp
+modelmixture.cpp
+)
\ No newline at end of file
diff --git a/model/modelbin.cpp b/model/modelbin.cpp
new file mode 100644
index 0000000..d94dcca
--- /dev/null
+++ b/model/modelbin.cpp
@@ -0,0 +1,44 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "modelbin.h"
+
+/**
+	Construct a model for binary data: tree and count_rates go to the ModelGTR
+	constructor, all other arguments are handed to init().
+*/
+ModelBIN::ModelBIN(const char *model_name, string model_params, StateFreqType freq, string freq_params,
+		PhyloTree *tree, bool count_rates)
+	: ModelGTR(tree, count_rates)
+{
+	init(model_name, model_params, freq, freq_params);
+}
+
+/**
+	initialization of the binary model, called by the constructor.
+	The model name selects a default state frequency type: "JC2" always uses
+	equal frequencies, "GTR2" estimates them unless the caller specified a type;
+	any other name is handed to readParameters().
+	@param model_name model name, e.g. JC2 or GTR2
+	@param model_params not used by this function
+	@param freq state frequency type requested by the caller, FREQ_UNKNOWN if unspecified
+	@param freq_params not used by this function
+*/
+void ModelBIN::init(const char *model_name, string model_params, StateFreqType freq, string freq_params)
+{
+	assert(num_states == 2); // make sure that you create model for Binary data
+	// default frequency type implied by the model name, FREQ_UNKNOWN means no default
+	StateFreqType def_freq = FREQ_UNKNOWN;
+	name = model_name;
+	full_name = model_name;
+	if (name == "JC2") {
+		def_freq = FREQ_EQUAL;
+	} else if (name == "GTR2") {
+		def_freq = FREQ_ESTIMATE;
+	} else {
+		readParameters(model_name);
+	}
+	// equal frequencies are enforced; any other default only fills in an unspecified type
+	if (freq == FREQ_UNKNOWN || def_freq == FREQ_EQUAL) freq = def_freq;
+	ModelGTR::init(freq);
+}
+
diff --git a/model/modelbin.h b/model/modelbin.h
new file mode 100644
index 0000000..2b01af8
--- /dev/null
+++ b/model/modelbin.h
@@ -0,0 +1,55 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MODELBIN_H
+#define MODELBIN_H
+
+#include "modelgtr.h"
+
+/**
+Model for Binary data
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class ModelBIN : public ModelGTR
+{
+public:
+	/**
+		constructor
+		@param model_name model name, e.g., JC2, GTR2.
+		@param model_params passed on to init()
+		@param freq state frequency type
+		@param freq_params passed on to init()
+		@param tree associated phylogenetic tree
+		@param count_rates passed on to the ModelGTR constructor
+	*/
+    ModelBIN(const char *model_name, string model_params, StateFreqType freq, string freq_params, PhyloTree *tree, bool count_rates = true);
+
+	/**
+		initialization, called automatically by the constructor, no need to call it
+		@param model_name model name, e.g., JC2, GTR2.
+		@param model_params model parameter string (not read by ModelBIN::init in modelbin.cpp)
+		@param freq state frequency type
+		@param freq_params frequency parameter string (not read by ModelBIN::init in modelbin.cpp)
+	*/
+	virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+
+	/**
+	 * @return the model name; no parameters are appended for binary models
+	 */
+	virtual string getNameParams() { return name; }
+
+};
+
+#endif
diff --git a/model/modelcodon.cpp b/model/modelcodon.cpp
new file mode 100644
index 0000000..0da735b
--- /dev/null
+++ b/model/modelcodon.cpp
@@ -0,0 +1,851 @@
+/*
+ * modelcodon.cpp
+ *
+ *  Created on: May 24, 2013
+ *      Author: minh
+ */
+
+#include "modelcodon.h"
+#include <string>
+
+
+/* Restricted empirical codon model, ECMrest (Kosiol et al. 2007), source: http://www.ebi.ac.uk/goldman/ECM/ */
+string model_ECMrest1 =
+"11.192024 \
+1.315610 0.010896 \
+5.427076 4.756288 24.748755 \
+1.658051 0.000000 0.000000 0.000000 \
+0.000000 1.913571 0.000000 0.000000 13.889102 \
+0.000000 0.000000 2.952332 0.000000 44.407955 13.681751 \
+0.000000 0.000000 0.000000 8.126914 17.057443 65.097021 12.991861 \
+6.610894 0.000000 0.000000 0.000000 2.206054 0.000000 0.000000 0.000000 \
+0.000000 5.177930 0.000000 0.000000 0.000000 5.615472 0.000000 0.000000 19.942818 \
+3.347364 0.000000 0.000000 0.000000 6.191481 0.000000 0.000000 0.000000 0.582084 0.000000 \
+0.000000 1.558523 0.000000 0.000000 0.000000 9.339206 0.000000 0.000000 0.000000 0.144278 44.777964 \
+0.000000 0.000000 0.000000 5.369644 0.000000 0.000000 0.000000 4.662001 0.000000 0.000000 0.677177 0.073268 \
+2.090751 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \
+0.000000 2.266373 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.905484 \
+0.000000 0.000000 75.752638 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 56.803876 7.811205 \
+0.000000 0.000000 0.000000 20.877218 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.432339 22.078564 5.650116 \
+0.000000 0.000000 0.000000 0.000000 1.769355 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.263838 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 2.704601 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.389735 0.000000 0.000000 17.461627 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.312811 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.393680 0.000000 35.480963 12.053827 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.303480 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.477616 8.407091 28.557939 11.295213 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.444964 0.000000 0.000000 0.000000 0.000000 1.583116 0.000000 0.000000 0.000000 1.021682 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 7.087801 0.000000 0.000000 0.000000 0.000000 3.230751 0.000000 0.000000 0.000000 3.774544 0.000000 0.000000 28.086160 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 7.419058 0.000000 0.000000 0.000000 5.381868 0.000000 3.440380 1.918904 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.812540 0.000000 0.000000 0.000000 1.794388 1.086327 5.369463 14.959151 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.617091 0.000000 0.000000 0.779565 0.000000 0.000000 0.000000 0.334165 0.000000 0.000000 0.000000 3.019726 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.632945 0.000000 0.000000 2.250770 0.000000 0.000000 0.000000 1.699302 0.000000 0.000000 0.000000 7.016899 0.000000 0.000000 14.603857 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.023939 0.000000 0.000000 0.000000 1.693662 0.000000 0.000000 0.000000 6.415757 0.000000 99.459951 14.930266 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.026086 0.000000 0.000000 0.000000 1.462945 0.000000 0.000000 0.000000 3.144296 0.000000 0.000000 0.000000 19.920977 30.804750 79.483730 13.919752 \
+1.682029 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.301225 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \
+0.000000 0.786043 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.381841 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 10.140728 \
+0.000000 0.000000 10.116588 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.134459 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 18.298900 4.623936 \
+0.000000 0.000000 0.000000 7.911096 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.570123 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.281784 1.303951 2.082128 \
+0.000000 0.000000 0.000000 0.000000 38.229100 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.578976 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.801564 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 15.793595 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.434550 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.231468 0.000000 0.000000 6.035740 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.033932 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.925575 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.962350 0.000000 28.307876 6.967655 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 17.103904 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.238450 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.155285 19.578982 38.414969 12.678802 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.245405 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.004762 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.501054 0.000000 0.000000 0.000000 11.715476 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.228361 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.105602 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.292691 0.000000 0.000000 0.000000 2.134740 0.000000 0.000000 13.863648 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.404436 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.647620 0.000000 0.000000 0.000000 3.919360 0.000000 4.929483 0.366267 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.715692 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.975074 0.000000 0.000000 0.000000 5.869857 1.010212 0.982893 10.762877 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.719489 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.834666 0.000000 0.000000 0.000000 0.578118 0.000000 0.000000 0.000000 39.399322 0.000000 0.000000 0.000000 16.623529 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.047654 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.033630 0.000000 0.000000 0.000000 0.437779 0.000000 0.000000 0.000000 21.337943 0.000000 0.000000 0.000000 7.784768 0.000000 0.000000 26.637668 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 92.372238 0.000000 0.000000 0.000000 1.903175 0.000000 0.000000 0.000000 0.754055 0.000000 0.000000 0.000000 8.423762 0.000000 1.792245 0.120900 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.825082 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 133.296291 0.000000 0.000000 0.000000 2.231662 0.000000 0.000000 0.000000 22.577271 0.000000 0.000000 0.000000 21.000358 3.324581 6.011970 36.292705 \
+2.261813 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.473623 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 7.096281 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \
+0.000000 1.923392 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.914972 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 10.137337 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.669955 \
+0.000000 0.000000 2.362720 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.737489 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 25.294298 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 26.045078 3.531461 \
+0.000000 0.000000 0.000000 2.022101 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.164805 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.078444 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.901167 21.657664 11.898141 \
+0.000000 0.000000 0.000000 0.000000 5.540052 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.159185 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.107629 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.682092 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 7.675838 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.120189 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.312255 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.308415 0.000000 0.000000 6.516319 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 9.880382 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.923972 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.064069 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.291148 0.000000 21.910225 5.090423 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 21.863158 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.034856 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 25.461549 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.166554 5.512586 20.715347 9.529141 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.367553 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.383706 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.091654 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.352915 0.000000 0.000000 0.000000 0.693026 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.294702 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.006827 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.686074 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.208522 0.000000 0.000000 0.000000 1.866565 0.000000 0.000000 10.605899 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.485369 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.811398 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.277861 0.000000 0.000000 0.000000 2.774445 0.000000 2.710610 0.650088 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 7.686782 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.090641 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.476105 0.000000 0.000000 0.000000 9.441919 1.296294 3.7790 [...]
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.104727 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.041150 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 10.590780 0.000000 0.000000 0.000000 0.503385 0.000000 0.000000 0.000000 1.541379 0.000000 0.000000 0.000000 1.042624 0.000 [...]
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.552851 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.252470 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.285543 0.000000 0.000000 0.000000 0.542717 0.000000 0.000000 0.000000 2.303487 0.000000 0.000000 0.000000 1.5616 [...]
+string model_ECMrest = model_ECMrest1 + 
+"0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.091041 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.432410 0.000000 0.000000 0.000000 0.702411 0.000000 0.000000 0.000000 2.985093 0.000000 0.000000 0.000 [...]
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.810856 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.803738 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.388514 0.000000 0.000000 0.000000 0.302501 0.000000 0.000000 0.000000 6.644971 0.000000 0.0000 [...]
+\
+0.022103  0.021383  0.016387  0.015425  0.011880  0.011131  0.009750  0.008956  0.015965  0.015782  0.006025  0.007029  0.011880  0.014467  0.017386  0.007600  0.028839  0.010007  0.010100  0.010642  0.011843  0.011097  0.011703  0.016076  0.020211  0.008311  0.014148  0.004800  0.007837  0.025576  0.023441  0.013551  0.020102  0.013424  0.020201  0.015528  0.012142  0.023006  0.020171  0.030001  0.026344  0.010142  0.011679  0.010372  0.008195  0.019047  0.018938  0.010901  0.022747  0. [...]
+\
+TTT TTC TTA TTG TCT TCC TCA TCG TAT TAC TGT TGC TGG CTT CTC CTA CTG CCT CCC CCA \
+CCG CAT CAC CAA CAG CGT CGC CGA CGG ATT ATC ATA ATG ACT ACC ACA ACG AAT AAC AAA \
+AAG AGT AGC AGA AGG GTT GTC GTA GTG GCT GCC GCA GCG GAT GAC GAA GAG GGT GGC GGA \
+GGG";
+
+/* Unrestricted empirical codon model, ECMunrest (Kosiol et al. 2007), source: http://www.ebi.ac.uk/goldman/ECM/ */
+string model_ECMunrest1 =
+"16.011531 \
+2.395822 0.151858 \
+1.204356 0.675537 18.541946 \
+0.773935 0.052602 0.249707 0.274990 \
+0.030074 0.656004 0.011609 0.158873 23.655090 \
+0.278090 0.056677 1.184813 0.611887 35.921779 15.982573 \
+0.034137 0.198277 0.010188 0.694091 11.510965 35.359077 17.424222 \
+4.317981 0.503397 0.798582 0.337279 0.688169 0.047115 0.341791 0.058136 \
+0.481042 4.483501 0.033529 0.177833 0.069588 0.524116 0.070809 0.213967 24.177765 \
+0.733587 0.076912 0.645571 0.395942 1.811753 0.343463 0.751980 0.143447 0.822999 0.054860 \
+0.045951 0.561620 0.040012 0.240632 0.138244 1.323765 0.121937 0.493179 0.068342 0.628438 56.838378 \
+0.786871 1.183337 0.271072 0.632947 0.069758 0.081312 0.195833 0.410046 1.140051 1.421996 0.264556 0.210115 \
+2.016257 0.207692 12.035723 11.161511 0.277929 0.000186 0.000289 0.000000 0.485469 0.000299 0.543240 0.000674 0.010122 \
+0.083684 2.306110 1.373823 5.651603 0.000085 0.342813 0.000096 0.000344 0.000116 0.622089 0.000466 0.674176 0.113701 15.874441 \
+1.036474 0.198558 27.219895 16.560966 0.000678 0.000186 0.496046 0.000115 0.016650 0.011978 0.020649 0.021578 0.017106 21.437257 8.808275 \
+0.073550 1.341144 1.045943 12.455337 0.000000 0.001022 0.000000 0.266943 0.004815 0.308859 0.002639 0.265948 0.504866 4.802017 15.484088 8.319767 \
+0.324368 0.000141 0.001358 0.003499 2.846677 0.196358 0.544474 0.078776 0.337879 0.000479 0.239715 0.000270 0.061833 0.822643 0.036254 0.181411 0.014388 \
+0.000140 0.285635 0.000000 0.000382 0.101204 2.487136 0.072352 0.432520 0.000116 0.310416 0.000000 0.215779 0.032564 0.026571 0.648769 0.040087 0.149771 23.496083 \
+0.025217 0.006558 0.261069 0.005535 0.487542 0.138742 3.121656 0.151589 0.032140 0.025873 0.002795 0.010250 0.070308 0.065669 0.016609 1.073790 0.040917 40.922701 15.426733 \
+0.004063 0.079161 0.000000 0.112999 0.021444 0.371063 0.064924 2.075226 0.004177 0.037372 0.000155 0.004585 0.215788 0.007978 0.118229 0.016442 0.495176 10.291826 33.453780 15.127582 \
+0.638696 0.001312 0.026551 0.040275 1.253945 0.002137 0.128111 0.073730 3.088481 0.340541 0.634065 0.001483 0.195073 0.664866 0.057328 0.438648 0.044742 0.775254 0.091276 0.286252 0.054021 \
+0.000467 0.761771 0.000123 0.002163 0.000593 1.144692 0.014470 0.114551 0.265766 3.193996 0.000155 0.483076 0.369273 0.058614 0.617694 0.059927 0.330036 0.061583 0.730306 0.089835 0.364129 38.685701 \
+0.126320 0.016628 0.576476 0.007508 0.508308 0.080383 2.066955 0.002179 0.486281 0.079236 0.163174 0.032232 0.055163 0.529045 0.071794 1.205738 0.033372 0.435109 0.074846 1.052040 0.063366 2.473439 0.751904 \
+0.009760 0.107218 0.000000 0.250748 0.049246 0.423382 0.002122 1.519211 0.092070 0.332396 0.057910 0.105597 0.247490 0.079119 0.422671 0.105449 0.703795 0.107434 0.529594 0.184327 0.715716 1.106179 2.503268 17.923045 \
+0.143832 0.000094 0.000741 0.003054 0.660622 0.001208 0.000579 0.001720 0.534375 0.001377 0.726908 0.077815 0.019696 0.663877 0.068758 0.134394 0.015019 0.500433 0.124232 0.063413 0.044676 2.460976 0.277265 1.164262 0.340811 \
+0.000000 0.200806 0.000000 0.000064 0.000000 0.685812 0.000000 0.032106 0.000116 0.604541 0.012886 0.516927 0.176476 0.016022 0.544828 0.005436 0.563956 0.002398 0.563799 0.001702 0.798346 0.170088 2.478358 0.148940 2.029914 27.244097 \
+0.030121 0.016020 0.136647 0.001527 0.006103 0.004089 0.557015 0.003211 0.043917 0.051686 0.232728 0.166150 0.146501 0.424607 0.112395 0.918198 0.041969 0.352807 0.154017 0.626603 0.091073 1.353860 0.526904 4.725840 0.617320 39.595443 12.677657 \
+0.000934 0.027355 0.000000 0.127696 0.000085 0.004832 0.000000 1.903571 0.003713 0.081931 0.023909 0.183143 1.135910 0.039428 0.640495 0.040902 0.794366 0.009880 0.897101 0.010300 1.164525 0.316372 2.208430 0.299978 4.718199 12.868484 35.563093 30.574631 \
+1.119411 0.059956 2.130663 1.292935 0.172403 0.000000 0.000386 0.000000 0.352731 0.000180 0.431456 0.000405 0.078312 3.330793 0.184010 1.328581 0.089308 0.292855 0.000096 0.002597 0.000246 0.193328 0.000000 0.078926 0.003859 0.076434 0.000000 0.000416 0.000000 \
+0.056038 1.006045 0.042112 0.478019 0.000000 0.115975 0.000096 0.000344 0.000116 0.255975 0.000311 0.309643 0.136849 0.390190 3.765697 0.203017 2.469249 0.000096 0.270274 0.000448 0.021723 0.000469 0.127899 0.010543 0.105885 0.000118 0.238839 0.001248 0.003064 13.609310 \
+1.075187 0.064968 5.159075 1.065537 0.000424 0.000000 0.403435 0.000000 0.573013 0.025454 0.069555 0.012138 0.170041 1.260239 0.136148 4.400610 0.048882 0.002014 0.000000 0.480521 0.000000 0.040109 0.000272 0.390087 0.000048 0.000000 0.000000 0.121855 0.000000 16.415611 5.784672 \
+0.679370 0.800602 1.418466 3.062807 0.093491 0.042282 0.246094 0.527005 0.294368 0.300354 0.298091 0.324613 0.321642 1.220020 1.434579 1.635281 2.236557 0.081631 0.008455 0.042006 0.193459 0.323588 0.163406 0.443617 0.834976 0.028736 0.029786 0.015596 0.408680 1.155098 1.428293 2.230691 \
+0.497293 0.000141 0.012473 0.015652 4.693944 0.487317 2.297807 0.199748 0.599932 0.000599 1.089585 0.001483 0.035939 0.831215 0.000060 0.004348 0.000070 1.050363 0.053805 0.345545 0.011476 0.898794 0.000272 0.374419 0.029088 0.344601 0.000000 0.001040 0.000000 1.266654 0.075878 0.351882 0.419831 \
+0.000093 0.371541 0.000062 0.002990 0.196983 3.829580 0.150107 2.395833 0.000116 0.393545 0.000776 0.806071 0.037822 0.000396 0.897490 0.000679 0.073657 0.044125 0.870967 0.027138 0.527094 0.001125 0.969387 0.066519 0.617464 0.001060 1.481721 0.002079 0.017774 0.034573 1.066285 0.016701 0.433759 13.991583 \
+0.079948 0.010539 0.871568 0.011134 2.018483 0.409721 5.709933 0.349846 0.208041 0.053363 0.415775 0.061227 0.060421 0.000857 0.000119 0.944560 0.000070 0.368538 0.026230 1.018005 0.015001 0.082373 0.021920 1.660885 0.001302 0.000589 0.000000 0.360575 0.000000 0.296014 0.048902 2.260424 0.853779 31.915858 8.373639 \
+0.008032 0.036489 0.000062 0.586818 0.361164 1.562591 0.516594 4.919174 0.042119 0.177757 0.053874 0.173298 0.362681 0.000264 0.000595 0.000408 0.733202 0.079712 0.435243 0.083475 0.962295 0.076938 0.103080 0.002854 1.766961 0.004593 0.014243 0.002911 2.985421 0.090674 0.311759 0.154441 1.376727 12.116657 28.470047 19.459275 \
+0.263567 0.000094 0.271628 0.077878 1.773102 0.000929 0.872084 0.040706 0.747870 0.042762 0.360038 0.000135 0.074859 0.259380 0.000000 0.019568 0.000140 0.787340 0.000192 0.096104 0.002705 2.691226 0.188587 1.759732 0.206851 0.682254 0.000068 0.009981 0.000981 0.239388 0.014351 0.256283 0.208924 2.057449 0.067554 1.243753 0.224397 \
+0.000093 0.143614 0.002840 0.060699 0.003390 1.118208 0.187826 0.725836 0.046586 0.487814 0.000932 0.325422 0.053908 0.000330 0.187880 0.014540 0.034916 0.000959 0.532092 0.074161 0.095418 0.460970 2.203539 0.377447 0.985145 0.003180 0.863191 0.016636 0.065212 0.025405 0.175491 0.020837 0.219170 0.296066 1.346385 0.259909 0.822133 17.634677 \
+0.148268 0.005996 0.612660 0.004963 0.340991 0.020909 1.496628 0.000459 0.467136 0.042942 0.099519 0.004316 0.051005 0.060922 0.002143 0.433620 0.000035 0.247195 0.012394 1.042457 0.000410 0.899262 0.143479 3.215817 0.285384 1.769879 0.296358 3.065510 0.011032 0.221536 0.023020 0.667397 0.275355 0.878163 0.089476 1.523251 0.199589 2.075154 0.413957 \
+0.013122 0.043609 0.000062 0.333780 0.156468 0.251650 0.004438 0.768607 0.072867 0.140864 0.011489 0.034794 0.105226 0.039362 0.022384 0.000679 0.133032 0.220816 0.303613 0.012002 0.561523 0.324525 0.571469 0.461383 2.285052 0.560831 2.721043 0.034519 3.832200 0.041440 0.116405 0.056267 0.497593 0.291009 0.623366 0.256174 1.144639 0.524647 1.038682 12.931524 \
+0.225554 0.000047 0.010929 0.009289 12.169045 0.083636 5.964323 0.681575 0.506470 0.000180 2.007768 0.181794 0.139046 0.322807 0.000060 0.009512 0.000105 1.035015 0.000288 0.021048 0.001066 1.293228 0.000453 1.177718 0.083261 0.772349 0.018146 0.530881 0.025006 0.359183 0.018727 0.269862 0.306375 4.943439 0.275865 2.397415 0.563566 4.971507 0.586685 1.293860 0.389004 \
+0.000093 0.166706 0.000432 0.005790 0.094762 10.892976 1.049877 9.818281 0.000116 0.346890 0.248099 1.372357 0.167138 0.000330 0.300513 0.002718 0.017300 0.001247 1.337629 0.005195 0.104681 0.005060 1.282522 0.232701 1.418383 0.387941 1.320875 0.354545 1.360141 0.031140 0.238533 0.021539 0.304581 0.622868 4.699375 0.441084 2.871848 0.643789 4.127466 0.334224 0.928876 28.579806 \
+0.140516 0.012600 0.423774 0.001718 0.047890 0.002044 1.094736 0.000115 0.424321 0.060909 0.144388 0.030883 0.145245 0.004747 0.000060 0.409432 0.000000 0.011511 0.000384 0.493508 0.000000 0.411115 0.025544 1.242140 0.000289 17.450524 1.113671 31.949764 2.418859 0.116039 0.012331 0.780008 0.305714 0.432918 0.021698 1.316696 0.087905 0.936840 0.273855 5.815294 1.197614 1.644621 0.403913 \
+0.083310 0.056771 0.000247 0.506841 0.063994 0.028715 0.009261 0.514392 0.200499 0.269510 0.122186 0.070533 0.496706 0.009560 0.000952 0.000951 0.040320 0.060432 0.033244 0.009225 0.273876 0.140943 0.169022 0.003786 1.148387 6.798629 4.087042 15.287419 18.531553 0.093542 0.062369 0.317466 0.905810 0.309656 0.140701 0.511968 0.765495 0.454347 0.600415 1.868194 7.316623 1.477696 1.286990 43.916187 \
+0.863970 0.065905 0.748196 0.529619 0.563995 0.000186 0.002219 0.000115 0.571505 0.000359 1.598824 0.004316 0.038763 1.897150 0.072866 0.555104 0.011580 0.708395 0.000192 0.009225 0.000246 0.338207 0.000091 0.150804 0.009600 0.336828 0.000000 0.003743 0.000000 6.546163 0.575921 2.577578 0.430124 2.898791 0.003020 0.056250 0.004868 0.318595 0.000295 0.201480 0.072138 0.501456 0.000219 0.032116 0.051709 \
+0.026338 0.946136 0.005990 0.140867 0.000085 0.396897 0.000096 0.001261 0.000116 0.582801 0.001087 1.273908 0.092044 0.105361 2.720153 0.043756 1.085170 0.000192 0.760090 0.000537 0.031642 0.000375 0.491034 0.006757 0.069320 0.000589 0.648523 0.001871 0.005026 0.458622 7.487816 0.154207 0.350473 0.001027 2.862216 0.002515 0.010298 0.000000 0.215492 0.001930 0.056145 0.000097 0.381287 0.000205 0.002062 10.956917 \
+0.565566 0.047403 2.299543 0.762425 0.001356 0.000279 0.813720 0.000115 0.236179 0.060430 0.754233 0.086986 0.058616 0.509595 0.031730 2.047159 0.020529 0.001151 0.000192 0.732111 0.000082 0.019586 0.002808 0.606945 0.000145 0.000353 0.000000 0.255147 0.000000 2.581654 0.313779 11.271062 0.569076 0.009008 0.001957 3.879355 0.002528 0.008844 0.002362 0.708204 0.000524 0.006305 0.000657 0.560642 0.002062 25.313949 5.637509 \
+0.068927 0.371072 0.024699 1.108802 0.000254 0.001394 0.000772 0.451899 0.009630 0.116309 0.126844 0.530278 0.277072 0.091844 0.477439 0.173122 2.262490 0.000384 0.001537 0.000717 0.602428 0.032237 0.044112 0.000466 0.475496 0.001413 0.003287 0.000832 0.701399 0.953353 3.487342 0.911193 1.399673 0.003635 0.089643 0.011433 3.092500 0.000628 0.013582 0.000193 0.311885 0.001358 0.004160 0.000103 0.314379 8.832621 18.744445 13.945647 \
+0.483563 0.000234 0.008892 0.035630 3.994417 0.771771 1.825878 0.250545 0.370309 0.000419 1.899865 0.011733 0.035860 0.879741 0.000417 0.004077 0.000772 1.272426 0.073021 0.346978 0.013526 0.420580 0.000091 0.272950 0.040907 0.324227 0.000000 0.001871 0.000000 0.536332 0.000253 0.001795 0.451134 2.359362 0.120121 0.324162 0.108501 0.643160 0.001329 0.216244 0.268662 2.323091 0.005328 0.013852 0.045374 2.600428 0.131929 0.662578 0.152215 \
+0.000093 0.512203 0.000000 0.001654 0.111968 3.452570 0.084218 1.978448 0.000058 0.380788 0.000466 1.514097 0.044178 0.000132 1.191514 0.000136 0.103766 0.022062 1.394892 0.013077 0.971148 0.000562 0.696016 0.018814 0.634927 0.000236 1.610248 0.000416 0.020471 0.000040 0.536362 0.000000 0.169264 0.038559 2.159328 0.019665 0.498504 0.000045 0.565968 0.005500 0.373948 0.000485 2.214735 0.000000 0.001768 0.046583 2.620833 0.028569 0.682579 9.612709 \
+0.109975 0.016535 1.041312 0.019406 1.931350 0.558500 4.380679 0.505677 0.176829 0.034737 0.806554 0.297371 0.031466 0.002374 0.000357 0.741135 0.000351 0.335924 0.069754 1.236457 0.087384 0.039265 0.004982 1.274118 0.002219 0.000589 0.000068 0.286547 0.000123 0.002262 0.000589 0.983926 0.517896 0.381796 0.077341 2.735831 0.318574 0.084620 0.041435 0.923644 0.004382 0.126382 0.063353 0.461729 0.004420 0.718719 0.092405 3.415722 0.415718 24.400553 6.746560 \
+0.005884 0.074851 0.000000 0.220908 0.103323 1.262618 0.150589 4.658653 0.027035 0.106187 0.028567 0.586111 0.446015 0.000066 0.000893 0.000000 1.524024 0.014101 0.417565 0.017824 1.950083 0.080124 0.190037 0.001165 1.544626 0.001531 0.083744 0.000624 3.409178 0.000081 0.004629 0.000078 0.837302 0.023862 0.728891 0.049848 2.866325 0.003771 0.068501 0.000482 0.759132 0.006402 0.200205 0.000000 0.187832 0.054049 0.968351 0.081861 2.211488 5.140068 19.373137 11.561124 \
+0.064397 0.000000 0.042112 0.038557 1.120532 0.003717 0.348448 0.117533 0.223763 0.015452 0.099985 0.000135 0.028249 0.129492 0.000000 0.012366 0.000491 0.661776 0.000769 0.147873 0.031560 0.746792 0.046739 0.706782 0.130873 0.162525 0.000000 0.007070 0.000368 0.066966 0.000042 0.001171 0.059065 0.928969 0.000559 0.092988 0.042595 3.529593 0.371685 0.604859 0.188097 1.702817 0.012481 0.030474 0.015763 0.153418 0.007112 0.078381 0.011491 0.396521 0.015140 0.189090 0.043198 \
+0.000000 0.055366 0.000062 0.006808 0.000254 1.023142 0.007428 0.670108 0.010037 0.184704 0.000000 0.071612 0.066384 0.000066 0.135255 0.001359 0.015686 0.000096 0.976175 0.003672 0.644235 0.100928 0.975727 0.121389 0.928319 0.000236 0.915505 0.009981 0.150527 0.000000 0.032447 0.000000 0.011379 0.000158 1.013424 0.003354 0.095207 0.167041 2.729647 0.053168 0.426684 0.000388 2.005334 0.000718 0.008986 0.004101 0.119062 0.006776 0.041280 0.018617 0.802516 0.027912 0.702594 14.214694 \
+0.084945 0.006464 0.287373 0.005472 0.330481 0.085680 1.265487 0.002179 0.257122 0.043721 0.028878 0.003641 0.009966 0.039560 0.002679 0.313495 0.000140 0.184749 0.105112 0.890822 0.005410 0.452442 0.106069 3.081614 0.536567 0.034978 0.025678 0.440217 0.000858 0.038612 0.009174 0.361403 0.033994 0.251423 0.109664 1.164866 0.003464 0.975582 0.193544 2.258321 0.308851 0.832592 0.308372 0.668173 0.004420 0.276499 0.042565 0.469281 0.055025 0.502355 0.140546 0.905488 0.227527 2.738552 0.892903 \
+0.010974 0.034428 0.000000 0.159955 0.042380 0.283432 0.001061 1.029128 0.042815 0.136432 0.014439 0.013216 0.137634 0.004220 0.010061 0.000136 0.176300 0.034437 0.294294 0.001791 0.990330 0.159217 0.566034 0.343314 3.036767 0.007891 0.528692 0.001040 2.171984 0.003312 0.031984 0.000078 0.262465 0.033581 0.360196 0.000838 1.447392 0.149578 0.372719 0.159248 1.563846 0.129098 0.822643 0.000410 1.195790 0.049842 0.245019 0.053017 0.362328 0.106257 0.938586 0.157605 1.251589 1.091224 3.1956 [...]
+0.164659 0.000141 0.000741 0.003881 0.976185 0.001951 0.011673 0.007109 0.130940 0.000120 0.420899 0.045044 0.039313 0.169777 0.000060 0.000272 0.000175 0.418802 0.000288 0.002508 0.001312 0.388156 0.000091 0.042812 0.003377 0.241197 0.004656 0.042005 0.011768 0.069995 0.000000 0.000156 0.027479 0.380374 0.000112 0.000534 0.000374 1.322234 0.005905 0.048730 0.021649 2.382451 0.326035 0.037657 0.047437 0.164143 0.016776 0.072521 0.024883 1.572808 0.086923 0.585071 0.083552 0.629243 0.0351 [...]
+0.000000 0.172889 0.000000 0.000191 0.000085 0.880032 0.000289 0.356038 0.000058 0.127388 0.007608 0.309374 0.105305 0.000000 0.240505 0.000000 0.047268 0.000096 0.636916 0.000090 0.395771 0.000843 0.566759 0.016193 0.336277 0.021435 0.676049 0.008942 0.703728 0.000283 0.055425 0.000000 0.018603 0.000000 0.518903 0.000000 0.006459 0.001122 1.110726 0.002863 0.176224 0.054025 2.392606 0.000821 0.012227 0.002050 0.201477 0.001557 0.051048 0.022214 1.797671 0.027973 1.398079 0.037461 1.2280 [...]
+string model_ECMunrest = model_ECMunrest1 + 
+"0.113991 0.018315 0.201112 0.001082 0.012121 0.001951 1.720919 0.001720 0.082323 0.029826 0.197641 0.061497 0.073682 0.000330 0.000060 0.165784 0.000070 0.003549 0.000384 0.556204 0.000164 0.097554 0.004982 0.551493 0.000289 0.015310 0.000753 0.247245 0.010419 0.000283 0.000084 0.194319 0.037724 0.002449 0.000112 0.466770 0.000187 0.909861 0.280400 0.713961 0.001760 1.179053 0.298738 0.938439 0.165587 0.080337 0.009773 0.324696 0.016839 0.658541 0.036022 1.693998 0.046588 0.375097 0.067 [...]
+0.018773 0.032039 0.000000 0.175861 0.002797 0.002974 0.003376 2.163175 0.007948 0.014314 0.105884 0.183952 0.381671 0.000066 0.000119 0.000000 0.185038 0.001918 0.001441 0.001254 0.703092 0.084060 0.053714 0.003029 0.634203 0.043222 0.097165 0.143481 0.590833 0.000081 0.000295 0.000078 0.410199 0.000553 0.000447 0.000610 0.716441 0.194964 0.293884 0.001158 0.744000 0.684968 1.149846 0.069567 1.558784 0.032177 0.064227 0.074536 0.276276 0.238907 0.496552 0.672077 1.526141 0.235747 0.4035 [...]
+\
+0.021414 0.021349 0.016195 0.015717 0.011798 0.010761 0.010366 0.008721 0.017237 0.016697 0.006441 0.007415 0.012744 0.015167 0.016798 0.007359 0.028497 0.010425 0.010408 0.011165 0.012199 0.010671 0.011040 0.017168 0.020730 0.008491 0.014604 0.004809 0.008158 0.024759 0.023762 0.012814 0.021180 0.012656 0.017882 0.013120 0.010682 0.022276 0.020321 0.031090 0.026699 0.010310 0.013701 0.009746 0.006788 0.019020 0.018419 0.010921 0.022626 0.018907 0.026817 0.016516 0.018288 0.028590 0.0252 [...]
+\
+TTT TTC TTA TTG TCT TCC TCA TCG TAT TAC TGT TGC TGG CTT CTC CTA CTG CCT CCC CCA \
+CCG CAT CAC CAA CAG CGT CGC CGA CGG ATT ATC ATA ATG ACT ACC ACA ACG AAT AAC AAA \
+AAG AGT AGC AGA AGG GTT GTC GTA GTG GCT GCC GCA GCG GAT GAC GAA GAG GGT GGC GGA \
+GGG";
+
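+/* Empirical codon model of Schneider, Cannarozzi & Gonnet (2005).
+   Layout, as parsed by ModelCodon::readCodonModel(): the strictly lower
+   triangle of the symmetric rate matrix over the non-stop codons (row by row),
+   followed by one frequency per non-stop codon, followed by the codon triplets
+   that give the order of the rows/columns used above.
+*/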
+
+string model_ECM_Schneider05 = 
+  "15594\
+    787    609\
+    717    391  22864\
+    476      0     89    133\
+      0    444     39     51  15656\
+     77     11   2230    662  15154  11009\
+     99      0      0   2501  21330  19335  36566\
+   2355    360    127     41    400     18     28      0\
+    300   1911     76     49      0    260     23     70  25720\
+    378     30     81     96   1004      0    238     90   1025      0\
+     60    275     46     53     37    766     83    157      0    623  29318\
+    102     74     24    357     16     23     83    277    110     80    186    164\
+   1304      0   4065   4641    278      0      0      0    138     18    147     21      9\
+      0   1293   3595   2691      0    214      0      0      1    128      9    152      9  15530\
+    112    106  26307   6745      0      0    154    340     68      0     59      0     36  10903   8429\
+    186    136   4180  13314     20     34      0      0     39     14     17      9     51   7205   7138  16270\
+     97      0      7     72   2744      0      0      0     43      3    101     12      0    937     34      0      0\
+      0     82     18      3     65   2535      0      0      0     47      0     49      0      0    706      0      0  16327\
+      0     12     99     87      0      0   2829    131      7      0     11      0     14      0      0   1474    137  15604  10490\
+     26      3     11    109      0      0    244   5851      4     22     44     31     68      0      0      0    935  15588  21512  27394\
+    172     22     80     27    315     22     76      0   2184      0    475      6     63    676     55      0     43    756     38     56    173\
+     36    166      5     20     43    211     53    112    137   1624     21    276     20      0    390     96     40      0    617     97    246  30148\
+     35     18     93     63    143     90    403      0    122     56     70     60     38    168     38   1166     51    126     99   1405    296   1995   1176\
+     11     13     50     95     55     80    109    288     42     58     43     35     97     77     92     46    290     88    161    235   1234   1086   1057  15317\
+     70      7     49     98    326     11      1      0    362      0   1302      0     14    685      0      0      0    712      0      0      0   4390      0    646    232\
+      0     56     40      0      0    177      0    105      4    139      0    908     40      1    486      0     34      0    404      8    121      0   2689    217    355  42123\
+      8      0      0     25      0      0    163     53     68      0     51     53    130      0     45    966      0      0     23    393      0    133    261   3956      0  40836  19396\
+     10      8      2     45      0     50     31    385      0     17     50    109    576     21      0      7    340      0      0     57   1276    191    317      0   2408  25507  19595  39593\
+    437     29    201    399     72     30      0     18     64      1     90      1     10   1848    131    185    283     68     10     24      0     84      4      0     20     27      0     64      0\
+     23    356    111     27     15     35      6     15      7     42     10     62      7    133   1745      0    351      0     55      0     29      0     36     24      7     17     20      0     15  16920\
+    157     33   3243    833      0     54    209     88     29      1     14     16     14    134    176   3786    421     49     13     97     79     51      4     93     20     43      3      5      0  15113  13126\
+    126     89    536   1607     75     39    153    229     32     21     29     22     45    346    369    726   1104     34     27     55     92     61     20     96    116     29     23      0     50    781    495   2443\
+     82     23     67     79   2679    170    489    362     61     17    194     41      0    233      0      0     25    990     52      0     15    212      6    166     30    318     42      0      0   1552      0      0    262\
+      7     74      3     39    279   2315    113      0     21     33     10    153      9      0    186      0     44     76    817      0      0     15    171     37     61     14    118     71     33      0   1028      0    176  16316\
+     23     11    295     92    567     81   3129    578     12      0     75     22     19      4      0    288     46     64      0   1015     88     93     27    341     96     39      0      0     26      0      0   2434    714  16829  10306\
+     51      5     49    306      0    682    232   6944     52      9    134      0     44     55     28      0    181      0    170      0   1953      0     35      0    307      0    116    322    172    166    134      0   1616  20814  19942  34110\
+     41      0     48     36    289     32    139     78    316      0    191      0     11    110      0     22      0    115     34     55     46   1623     55    311    168    429      0     45     27    202      0     50     68   1425     16    237    182\
+      8     34      4     22     87    277     79     40      0    196      4    116      7      0     64     19      7     14     98     17     47      9   1196    148    142      0    264      0     58      0    145     21     42    122    984    165     67  17900\
+      4      6     59     28     54     24    133     53     16      8     20     15      9     30     17    100      0     26     18     91     39    169     94   1364    137    334    337    908    410     16      0    292     80    114     54    650      0    580    305\
+      6      0      4     39     26     24     52     70     11     11     13     16     24      7      3      0     36     14     14     28    105     84    110     33    727    353    569    299   1249     20     21     22    189     27     85     94    795    298    269  12057\
+     49      0      0     47    938     51    528    366    157      0    981    105     27     71      0     26      4    178      0     76     37    605      0    198     84   1145      0      0     44    345      0     91     78   2941      0    402    387   4830      0    245    146\
+     10     34     33      0    312    983    214    697     10     79    169    792     19     23     68      0     21     15    213     25     88     41    454    133    123      0    740     72     22      0    228     27     67    421   2248    343    618    178   3430    158    131  21641\
+     10      2     87     13     54     26     79     12      0     20     52      7     23     32      0      9     13      5     17      0     45    118     55    623     32   6203   2803  22415   6405      0      0    415     93     11     31    743      0    178    142   4144    390    638    339\
+     12      0     51     36     33     12      4     72     31      0     36     20    264      0     20      0     22     17      0     39      5     62     95     60    271   2885   5913   6064  18118     29     15    106    258     40     59     49    927    145    139    572   3644    544    444  25118\
+    446      0    175    236    218     61     96      0     80      0    108     60     20   1661     28      0     11    258      0      0     65     68      1     82     28     96     15      0      3   6623     53    389    259   1227     72     65    141    100     20     37      3    233     69     31      0\
+      6    460     73     35     15    199     49      0      7     69     28    105      4     18   1603    120     42      0    199     36     69     15     75     29     25     18     52      0     36     79   6084    158    190    148   1080      1      0     21     86     13     11     16    170      5     38  17102\
+     83     13   3202    251     70      2    396      0     18      2     52     24      6    131      5   2164      7      0     63    394      0     23     36    276     31      0      0     86      0    125      0  11512    626      0     20   1694     36     55     26    153     16     31     87    249      0  15228  10722\
+     60     42      0   1017     46     19     85    271      9     17     54     16     27      0     24     16    827     27     21     58    154     23     18     10     62     12     20      0     58    743    928   1567   1219    155     66    206    995     12     12     10     54     28     10      0    106  10663   9288  21679\
+     89     10     43     68   2454    180    301      0     54     11    203     32      0    183      8     14     31   1111     63     28      0    111      5     60     52    121      0      0     62    233     32     42    108   3490      0      0      0    252     38     43     31    628    101     47      0   2567      0     14     37\
+      1     78     42     24    250   2338    201    324     11     28     22    197     15     30    160      0     31     79    965      7    145      0    119     22     88     29    133     26      0      0    217      0    115      0   2888      0     36     25    224     13     45      0    608     24     51      0   2241      0     80  12825\
+     39      0    293     82    303    127   3185    250     19      1     87     27     17     29      0    300     46     91     19   1186    101     70     20    307     61     38      0     72     23     73      0    484    282      0      0   3718      0    149     28    216     48    268    106    108     88      0      0   3240    591  14816   8169\
+     19     59      0    480    450    354      0   9766      0     68     96    134     32     16     40      0    261      0    206      0   4882    109     45    182    246      0     94      0    379      0     77    315    392      0    193      0   7718     81    161     62    171    178    258     49    171      0      0      0   2849  13866  14245  22755\
+     14      0      0     16    138      0     52     20    122      0     41      0      3     21      0      5      0     44      7     28     37    378      0    132     54     66      8      0     13     36      0     19      8    149      2     83      0   1636      4     56     44    488     30     27     19    187      0     24      5    384      0     59     20\
+      0     13     16      0      1    115     35     76      0     76      9     24      1     10     20      0      3      8     72      6     40     22    311     76     72      1     57      0     11      0     20      0      9     54    124      0     77     33   1249     58     35     38    395     22     11      0    141     19      0      0    322     39    182  13482\
+      8      2     32     46     42     14    118     53     16      9     13      0      5     13      6     52      0     29     17     71     35    123     28   1246    112     69      0     24      0      0      5     95     22     38     36    196      0    187     74    626      0    114     82    213      0     62     24    367     16     64     41    636      0   1758   1114\
+      3      4     20     18     36     42     43     84     16      6      3     11     13      7     12      0     27     19     40     51     81     69     71     66    739      0     44      0    104     21      1     17     63     52     35     76    199    106     80      0    429    101     86      0    162      4     26     38    161     70     91    118    916   1216   1466  11181\
+     32      0     44      0    293     18     60     85     62      0    515      0      0     50      0      9      9    161     16     29      0    266      0      0     43    688      0    117      0     46      0      0     23    349     91     46      0    829     98     44     32   3432      6      0      0    557      0      0     21   1264      0    171      0   1058      0     28     30\
+      9     23      0     15     52    224     28    240      0     38     38    372      7     14     38      0      6     14    114     31    129     12    165     15     40      1    452      0     78     10     42     18     16     50    250      6     62     91    589     13     28      0   2627     25      0     15    383      0      0     72   1050      0    357      0    795      0     50  17792\
+     15      1     23      0     80     59    198      0      9      0     34      0      4     13     12     70      0     16     11     78     26     26     16    225      8      0      0    176      0     16      0     85     31      3     12    214     51    121     31    137     11    151    134   1071      0      0      0    629      0    133     44   1071      0     50     86    798      0  13656   9267\
+      2     11      9    122     10     49    162    532     14      3     42     39    214      0      1      0     39      9     55     36    251     29     31     53    148      0      0      0    515      0     22     14    116     77      0     84    405     67    171     13    144    398    328      0   1567      0      0     86    380     35    151    360   2466     88     32      0    749  11092  11188  21307\
+ 0.019065 0.019572 0.008521 0.014125 0.017115 0.015966 0.013340 0.004325 0.013196 0.015988 0.010982 0.011900 0.012462 0.015017 0.017340 0.008031 0.037328 0.017599 0.014989 0.017755 0.005879 0.011592 0.014372 0.013669 0.033679 0.005419 0.008712 0.006185 0.008488 0.017469 0.019956 0.009364 0.021871 0.014391 0.015964 0.016870 0.005890 0.018236 0.020614 0.028227 0.031883 0.013622 0.019058 0.013502 0.011845 0.013592 0.013760 0.008372 0.026506 0.019952 0.021260 0.017896 0.005964 0.025167 0.024 [...]
+TTT TTC TTA TTG TCT TCC TCA TCG TAT TAC TGT TGC TGG CTT CTC CTA CTG CCT CCC CCA \
+CCG CAT CAC CAA CAG CGT CGC CGA CGG ATT ATC ATA ATG ACT ACC ACA ACG AAT AAC AAA \
+AAG AGT AGC AGA AGG GTT GTC GTA GTG GCT GCC GCA GCG GAT GAC GAA GAG GGT GGC GGA \
+GGG";
+
+
+/**
+ * Build a codon model on top of ModelGTR.
+ * Sets default values for omega/kappa/kappa2 and their "fixed" flags, allocates
+ * the nucleotide-frequency, rate and empirical-rate arrays, computes the
+ * per-entry rate attributes and finally delegates to init().
+ * @param model_name   model name, forwarded to init()
+ * @param model_params rate parameters, forwarded to init()
+ * @param freq         state frequency type, forwarded to init()
+ * @param freq_params  frequency parameters, forwarded to init()
+ * @param tree         tree handed to the ModelGTR constructor
+ * @param count_rates  flag handed to the ModelGTR constructor
+ */
+ModelCodon::ModelCodon(const char *model_name, string model_params, StateFreqType freq, string freq_params,
+		PhyloTree *tree, bool count_rates) : ModelGTR(tree, count_rates)
+{
+	// must be set before getNumRateEntries() is queried below
+	half_matrix = false;
+
+	// default parameter values and constraints; init() may override them
+	kappa2 = 1.0;
+	kappa = 1.0;
+	omega = 1.0;
+	fix_kappa = false;
+	fix_omega = false;
+	fix_kappa2 = true;
+	codon_freq_style = CF_TARGET_CODON;
+	codon_kappa_style = CK_ONE_KAPPA;
+
+	ntfreq = new double[12];
+	empirical_rates = NULL;
+
+	// replace the rate array inherited from the base class by one of our own size
+	const int num_entries = getNumRateEntries();
+	delete [] rates;
+	rates = new double[num_entries];
+	empirical_rates = new double[num_entries];
+
+	// rate_attr is allocated lazily inside computeRateAttributes()
+	rate_attr = NULL;
+	computeRateAttributes();
+
+	init(model_name, model_params, freq, freq_params);
+}
+
+/**
+ * Release the arrays allocated by the constructor (rate_attr,
+ * empirical_rates, ntfreq) and reset the pointers to NULL.
+ */
+ModelCodon::~ModelCodon() {
+	// delete[] on a null pointer is a no-op, so no guards are required
+	delete [] rate_attr;
+	rate_attr = NULL;
+
+	delete [] empirical_rates;
+	empirical_rates = NULL;
+
+	delete [] ntfreq;
+	ntfreq = NULL;
+}
+
+/**
+ * Initialise the model from a single (non-combined) model name.
+ * The name is matched case-insensitively against the built-in MG*, GY* and
+ * empirical (ECM/ECMREST/SCHN05) names; any other name is handed to
+ * readParameters() as a user-specified model.
+ * @param model_name   name of the model component
+ * @param freq         requested state frequency type (only used by the MG models)
+ * @param reset_params forwarded to readCodonModel() for the empirical models
+ * @return the frequency type returned by initMG94()/initGY94(), or
+ *         FREQ_USER_DEFINED for empirical and user-specified models
+ */
+StateFreqType ModelCodon::initCodon(const char *model_name, StateFreqType freq, bool reset_params) {
+	string name_upper = model_name;
+	// toupper() has undefined behaviour for negative char values, so go through
+	// unsigned char (matters for non-ASCII bytes in a user-supplied name)
+	for (string::iterator it = name_upper.begin(); it != name_upper.end(); it++)
+		(*it) = (char)toupper((unsigned char)(*it));
+
+	// Muse-Gaut type models
+	if (name_upper == "MG")
+		return initMG94(true, freq, CK_ONE_KAPPA);
+	if (name_upper == "MGK")
+		return initMG94(false, freq, CK_ONE_KAPPA);
+	if (name_upper == "MG1KTS" || name_upper == "MGKAP2")
+		return initMG94(false, freq, CK_ONE_KAPPA_TS);
+	if (name_upper == "MG1KTV" || name_upper == "MGKAP3")
+		return initMG94(false, freq, CK_ONE_KAPPA_TV);
+	if (name_upper == "MG2K" || name_upper == "MGKAP4")
+		return initMG94(false, freq, CK_TWO_KAPPA);
+
+	// Goldman-Yang type models
+	if (name_upper == "GY")
+		return initGY94(false, CK_ONE_KAPPA);
+	if (name_upper == "GY0K" || name_upper == "GYKAP1")
+		return initGY94(true, CK_ONE_KAPPA);
+	if (name_upper == "GY1KTS" || name_upper == "GYKAP2")
+		return initGY94(false, CK_ONE_KAPPA_TS);
+	if (name_upper == "GY1KTV" || name_upper == "GYKAP3")
+		return initGY94(false, CK_ONE_KAPPA_TV);
+	if (name_upper == "GY2K" || name_upper == "GYKAP4")
+		return initGY94(false, CK_TWO_KAPPA);
+
+	// built-in empirical models: only defined for the standard genetic code
+	if (name_upper == "ECM" || name_upper == "KOSI07" || name_upper == "ECMK07") {
+		if (!phylo_tree->aln->isStandardGeneticCode())
+			outError("For ECMK07 a standard genetic code must be used");
+		readCodonModel(model_ECMunrest, reset_params);
+		return FREQ_USER_DEFINED;
+	}
+	if (name_upper == "ECMREST") {
+		if (!phylo_tree->aln->isStandardGeneticCode())
+			outError("For ECMREST a standard genetic code must be used");
+		readCodonModel(model_ECMrest, reset_params);
+		return FREQ_USER_DEFINED;
+	}
+	if (name_upper == "SCHN05" || name_upper == "ECMS05") {
+		if (!phylo_tree->aln->isStandardGeneticCode())
+			outError("For ECMS05 a standard genetic code must be used");
+		readCodonModel(model_ECM_Schneider05, reset_params);
+		return FREQ_USER_DEFINED;
+	}
+
+	// anything else is treated as a user-specified model (original spelling is passed on)
+	readParameters(model_name);
+	return FREQ_USER_DEFINED;
+}
+
+/**
+ * Initialise the model from its name and optional parameter strings.
+ * A name without '_' is a single model; a name of the form A_B combines an
+ * empirical model A with a mechanistic model B.
+ * @param model_name   model name, possibly of the combined form A_B
+ * @param model_params if non-empty, passed to readRates()
+ * @param freq         requested state frequency type; FREQ_UNKNOWN selects the
+ *                     default reported by initCodon()
+ * @param freq_params  if non-empty, passed to readStateFreq()
+ */
+void ModelCodon::init(const char *model_name, string model_params, StateFreqType freq, string freq_params)
+{
+	// all 12 nucleotide frequency entries start at 0.25
+	for (int k = 0; k < 12; k++)
+		ntfreq[k] = 0.25;
+
+	// default empirical rates: 1 for admissible single-nucleotide changes,
+	// 0 for entries flagged as stop codon or multiple-nucleotide substitution
+	for (int row = 0; row < num_states; row++) {
+		double *emp_row = empirical_rates + row*num_states;
+		if (phylo_tree->aln->isStopCodon(row)) {
+			memset(emp_row, 0, num_states*sizeof(double));
+			continue;
+		}
+		const int *attr_row = rate_attr + row*num_states;
+		for (int col = 0; col < num_states; col++) {
+			const bool forbidden = (attr_row[col] & (CA_STOP_CODON+CA_MULTI_NT)) != 0;
+			emp_row[col] = forbidden ? 0.0 : 1.0;
+		}
+	}
+
+	ignore_state_freq = false;
+
+	name = full_name = model_name;
+	StateFreqType def_freq = FREQ_UNKNOWN;
+	const size_t sep = name.find('_');
+	if (sep != string::npos) {
+		// combined model: <empirical>_<mechanistic>
+		const string first_part = name.substr(0, sep);
+		const string second_part = name.substr(sep+1);
+		def_freq = initCodon(first_part.c_str(), freq, false);
+		if (def_freq != FREQ_USER_DEFINED)
+			outError("Invalid model " + name + ": first component must be an empirical model"); // first model must be empirical
+		def_freq = initCodon(second_part.c_str(), freq, false);
+		if (def_freq == FREQ_USER_DEFINED) // second model must be parametric
+			outError("Invalid model " + name + ": second component must be a mechanistic model");
+		// adjust the constraint
+		if (codon_freq_style == CF_TARGET_CODON)
+			def_freq = FREQ_USER_DEFINED;
+	} else {
+		def_freq = initCodon(model_name, freq, true);
+	}
+
+	// one free parameter for each of omega, kappa, kappa2 that is not fixed
+	num_params = 0;
+	if (!fix_omega) num_params++;
+	if (!fix_kappa) num_params++;
+	if (!fix_kappa2) num_params++;
+
+	if (!freq_params.empty())
+		readStateFreq(freq_params);
+	if (!model_params.empty())
+		readRates(model_params);
+
+	if (freq == FREQ_UNKNOWN)
+		freq = def_freq;
+	const bool nt_based_freq =
+		(freq == FREQ_CODON_1x4 || freq == FREQ_CODON_3x4 || freq == FREQ_CODON_3x4C);
+	if (nt_based_freq)
+		phylo_tree->aln->computeCodonFreq(freq, state_freq, ntfreq);
+
+	ModelGTR::init(freq);
+}
+
+/**
+ * Set up a Muse-Gaut 1994 style model: omega is free, kappa is fixed to 1 when
+ * requested, kappa2 is free only for the two-kappa style, and the empirical
+ * rates are multiplied by target nucleotide frequencies.
+ * @param fix_kappa   true to fix kappa at 1.0
+ * @param freq        requested frequency type; FREQ_UNKNOWN and
+ *                    FREQ_USER_DEFINED are replaced by FREQ_CODON_3x4
+ * @param kappa_style how kappa enters the rate matrix
+ * @return always FREQ_CODON_3x4
+ */
+StateFreqType ModelCodon::initMG94(bool fix_kappa, StateFreqType freq, CodonKappaStyle kappa_style) {
+	fix_omega = false;
+	this->fix_kappa = fix_kappa;
+	if (fix_kappa)
+		kappa = 1.0;
+	codon_freq_style = CF_TARGET_NT;
+	this->codon_kappa_style = kappa_style;
+	// the second kappa is only a free parameter in the two-kappa style
+	fix_kappa2 = (kappa_style != CK_TWO_KAPPA);
+
+	if (freq == FREQ_UNKNOWN || freq == FREQ_USER_DEFINED)
+		freq = FREQ_CODON_3x4;
+
+	if (freq == FREQ_CODON_1x4 || freq == FREQ_CODON_3x4 || freq == FREQ_CODON_3x4C) {
+		phylo_tree->aln->computeCodonFreq(freq, state_freq, ntfreq);
+	} else if (freq == FREQ_EMPIRICAL || freq == FREQ_ESTIMATE || freq == FREQ_USER_DEFINED) {
+		outError("Invalid state frequency type for MG model, please use +F1X4 or +F3X4 or +F3X4C");
+	}
+
+	// ignore state_freq because ntfreq is already folded into the rates
+	ignore_state_freq = true;
+	combineRateNTFreq();
+
+	return FREQ_CODON_3x4;
+}
+
+/**
+ * Set up a Goldman-Yang 1994 style model: omega is free, kappa is fixed to 1
+ * when requested and kappa2 is free only for the two-kappa style.
+ * @param fix_kappa   true to fix kappa at 1.0
+ * @param kappa_style how kappa enters the rate matrix
+ * @return always FREQ_EMPIRICAL
+ */
+StateFreqType ModelCodon::initGY94(bool fix_kappa, CodonKappaStyle kappa_style) {
+	fix_omega = false;
+	this->fix_kappa = fix_kappa;
+	if (fix_kappa)
+		kappa = 1.0;
+	this->codon_kappa_style = kappa_style;
+	// the second kappa is only a free parameter in the two-kappa style
+	fix_kappa2 = (kappa_style != CK_TWO_KAPPA);
+
+	return FREQ_EMPIRICAL;
+}
+
+
+/**
+ * Fill rate_attr (allocated on first use) with one bit mask per ordered codon
+ * pair (i, j), stored at rate_attr[i*num_states + j].
+ * Entries whose row or column is a stop codon, and the diagonal, are set to
+ * CA_STOP_CODON. Every other entry records whether the change is synonymous,
+ * which codon positions differ by a transition or transversion, and whether
+ * more than one position differs (CA_MULTI_NT).
+ */
+void ModelCodon::computeRateAttributes() {
+    const int num_entries = getNumRateEntries();
+    if (rate_attr == NULL) {
+        rate_attr = new int[num_entries];
+        memset(rate_attr, 0, sizeof(int)*num_entries);
+    }
+
+    // flags for a transition / transversion at codon position 1, 2 and 3
+    const int ts_flag[3] = {CA_TRANSITION_1NT, CA_TRANSITION_2NT, CA_TRANSITION_3NT};
+    const int tv_flag[3] = {CA_TRANSVERSION_1NT, CA_TRANSVERSION_2NT, CA_TRANSVERSION_3NT};
+
+    for (int from = 0; from < num_states; from++) {
+        int *attr_row = rate_attr + from*num_states;
+        const bool from_is_stop = phylo_tree->aln->isStopCodon(from);
+        // nucleotide codes of the source codon at positions 1, 2, 3
+        const int from_nt[3] = {from/16, (from%16)/4, from%4};
+
+        for (int to = 0; to < num_states; to++) {
+            if (from_is_stop || to == from || phylo_tree->aln->isStopCodon(to)) {
+                attr_row[to] = CA_STOP_CODON;
+                continue;
+            }
+
+            int attr = 0;
+            if (phylo_tree->aln->genetic_code[from] == phylo_tree->aln->genetic_code[to])
+                attr |= CA_SYNONYMOUS;
+            else
+                attr |= CA_NONSYNONYMOUS;
+
+            const int to_nt[3] = {to/16, (to%16)/4, to%4};
+            int num_ts = 0, num_tv = 0;
+            for (int pos = 0; pos < 3; pos++) {
+                if (from_nt[pos] == to_nt[pos])
+                    continue;
+                // nucleotide codes differing by exactly 2 are treated as a transition
+                if (abs(from_nt[pos]-to_nt[pos]) == 2) {
+                    attr |= ts_flag[pos];
+                    num_ts++;
+                } else {
+                    attr |= tv_flag[pos];
+                    num_tv++;
+                }
+            }
+
+            if (num_ts + num_tv > 1)
+                attr |= CA_MULTI_NT;
+            else if (num_ts == 1)
+                attr |= CA_TRANSITION;
+            else if (num_tv == 1)
+                attr |= CA_TRANSVERSION;
+
+            attr_row[to] = attr;
+        }
+    }
+}
+
+/**
+ * Multiply every non-zero empirical rate i -> j by the frequency of the target
+ * nucleotide at each codon position where i and j differ. ntfreq holds the
+ * frequencies for position 1 at [0..3], position 2 at [4..7] and position 3
+ * at [8..11]. Rows of stop codons are left untouched.
+ */
+void ModelCodon::combineRateNTFreq() {
+    for (int from = 0; from < num_states; from++) {
+        if (phylo_tree->aln->isStopCodon(from))
+            continue;
+        double *rate_row = empirical_rates + from*num_states;
+        const int from_nt[3] = {from/16, (from%16)/4, from%4};
+
+        for (int to = 0; to < num_states; to++) {
+            if (rate_row[to] == 0.0)
+                continue;
+            const int to_nt[3] = {to/16, (to%16)/4, to%4};
+            for (int pos = 0; pos < 3; pos++) {
+                if (from_nt[pos] != to_nt[pos])
+                    rate_row[to] *= ntfreq[to_nt[pos] + 4*pos];
+            }
+        }
+    }
+}
+
+
+/**
+ * Read an empirical codon model from a stream.
+ * Expected input, whitespace separated: the strictly lower triangle of the
+ * rate matrix over the non-stop codons (row by row), then one frequency per
+ * non-stop codon, then the codon triplets giving the row/column order.
+ * The rates are stored symmetrically into both empirical_rates and rates, and
+ * state_freq is filled from the frequencies (states not listed keep
+ * MIN_FREQUENCY).
+ * @param in           stream to read the model from
+ * @param reset_params if true, fix omega, kappa and kappa2 and reset them to 1.0
+ */
+void ModelCodon::readCodonModel(istream &in, bool reset_params) {
+	int nrates = getNumRateEntries();
+
+	int i, j;
+	int nscodons = phylo_tree->aln->getNumNonstopCodons();
+
+	// vectors instead of new[]/delete[] so the buffers are released on every
+	// exit path, including an exception raised while parsing
+	vector<double> q(nscodons*nscodons, 0.0);
+	vector<double> f(nscodons, 0.0);
+	for (i = 1; i < nscodons; i++) {
+		for (j = 0; j < i; j++) {
+			in >> q[i*nscodons+j];
+			if (verbose_mode >= VB_MAX) cout << " " << q[i*nscodons+j];
+		}
+		if (verbose_mode >= VB_MAX) cout << endl;
+	}
+	for (i = 0; i < nscodons; i++)
+		in >> f[i];
+	// report a failed numeric read here instead of letting it surface below
+	// as a misleading "wrong codon format" error on an empty codon
+	if (in.fail())
+		outError("Cannot read rate matrix or frequencies of the codon model");
+
+	StrVector codons;
+	codons.resize(nscodons);
+	// state_map[i] = state index (nt1*16+nt2*4+nt3) of the i-th codon of the input
+	IntVector state_map;
+	state_map.resize(nscodons);
+	for (i = 0; i < nscodons; i++) {
+		in >> codons[i];
+		if (codons[i].length() != 3)
+			outError("Input model has wrong codon format ", codons[i]);
+		int nt1 = phylo_tree->aln->convertState(codons[i][0], SEQ_DNA);
+		int nt2 = phylo_tree->aln->convertState(codons[i][1], SEQ_DNA);
+		int nt3 = phylo_tree->aln->convertState(codons[i][2], SEQ_DNA);
+		if (nt1 > 3 || nt2 > 3 || nt3 > 3)
+			outError("Wrong codon triplet ", codons[i]);
+		state_map[i] = nt1*16+nt2*4+nt3;
+		if (phylo_tree->aln->isStopCodon(state_map[i]))
+			outError("Stop codon encountered");
+		if (verbose_mode >= VB_MAX)
+			cout << " " << codons[i] << " " << state_map[i];
+	}
+	if (verbose_mode >= VB_MAX) cout << endl;
+
+	// the input holds only the lower triangle in its own codon order, so every
+	// entry is mapped to state indices and written to both (row,col) and (col,row)
+	memset(empirical_rates, 0, nrates*sizeof(double));
+	memset(rates, 0, nrates*sizeof(double));
+	for (i = 1; i < nscodons; i++) {
+		for (j = 0; j < i; j++) {
+			int row = state_map[i], col = state_map[j];
+			if (row < col) {
+				int tmp = row;
+				row = col;
+				col = tmp;
+			}
+			double qentry = q[i*nscodons+j];
+			int id = row*num_states+col;
+			assert(id < nrates && id >= 0);
+			empirical_rates[id] = rates[id] = qentry;
+			id = col*num_states+row;
+			assert(id < nrates && id >= 0);
+			empirical_rates[id] = rates[id] = qentry;
+		}
+	}
+
+	// every state starts at MIN_FREQUENCY; the listed codons then receive their
+	// input frequency minus an equal share of the mass given to the other states
+	for (i = 0; i < num_states; i++)
+		state_freq[i] = MIN_FREQUENCY;
+	for (i = 0; i < nscodons; i++)
+		state_freq[state_map[i]] = f[i]-(num_states-nscodons)*MIN_FREQUENCY/nscodons;
+
+	if (reset_params) {
+		fix_omega = fix_kappa = fix_kappa2 = true;
+		omega = kappa = kappa2 = 1.0;
+	}
+}
+
+/**
+ * Read an empirical codon model from a string by wrapping it in a string
+ * stream and delegating to the stream overload.
+ * @param str          text of the model
+ * @param reset_params forwarded to the stream overload
+ */
+void ModelCodon::readCodonModel(string &str, bool reset_params) {
+	try {
+		istringstream model_stream(str);
+		readCodonModel(model_stream, reset_params);
+	} catch (const char *message) {
+		// a C-string thrown while parsing is reported through outError()
+		outError(message);
+	}
+}
+
+/**
+ * Recompute the codon rate matrix from the current parameters, then delegate
+ * to ModelGTR::decomposeRateMatrix().
+ */
+void ModelCodon::decomposeRateMatrix() {
+    // rates must reflect the current omega/kappa before the base class uses them
+    this->computeCodonRateMatrix();
+    ModelGTR::decomposeRateMatrix();
+}
+
+/**
+ * Dispatch to the rate-matrix routine that matches codon_kappa_style.
+ * Nothing is done for a style other than the four handled here.
+ */
+void ModelCodon::computeCodonRateMatrix() {
+    if (codon_kappa_style == CK_ONE_KAPPA) {
+        computeCodonRateMatrix_1KAPPA();
+    } else if (codon_kappa_style == CK_ONE_KAPPA_TS) {
+        computeCodonRateMatrix_1KAPPATS();
+    } else if (codon_kappa_style == CK_ONE_KAPPA_TV) {
+        computeCodonRateMatrix_1KAPPATV();
+    } else if (codon_kappa_style == CK_TWO_KAPPA) {
+        computeCodonRateMatrix_2KAPPA();
+    }
+}
+
+/**
+ * Rebuild rates from empirical_rates for the one-kappa style.
+ * Non-zero entries in rows of non-stop codons are scaled as follows:
+ * synonymous transition by kappa, non-synonymous transition by omega*kappa,
+ * any other non-synonymous change by omega. Other synonymous changes keep
+ * their empirical rate.
+ */
+void ModelCodon::computeCodonRateMatrix_1KAPPA() {
+    memcpy(rates, empirical_rates, getNumRateEntries()*sizeof(double));
+    // with both parameters at 1 the copy above is already the result
+    if (omega == 1.0 && kappa == 1.0)
+        return;
+
+    const double omega_times_kappa = omega*kappa;
+
+    for (int from = 0; from < num_states; from++) {
+        if (phylo_tree->aln->isStopCodon(from))
+            continue;
+        double *rate_row = rates + from*num_states;
+        const int *attr_row = rate_attr + from*num_states;
+
+        for (int to = 0; to < num_states; to++) {
+            if (rate_row[to] == 0.0)
+                continue;
+            const int attr = attr_row[to];
+            const bool is_transition = (attr & CA_TRANSITION) != 0;
+            if (attr & CA_SYNONYMOUS) {
+                if (is_transition)
+                    rate_row[to] *= kappa;
+            } else if (attr & CA_NONSYNONYMOUS) {
+                if (is_transition)
+                    rate_row[to] *= omega_times_kappa;
+                else
+                    rate_row[to] *= omega;
+            }
+        }
+    }
+}
+
+/**
+ * Rebuild rates from empirical_rates for the style in which kappa is applied
+ * once per transition. With n = number of codon positions (0..3) that differ
+ * by a transition, non-zero entries in rows of non-stop codons are multiplied
+ * by kappa^n if synonymous and by omega*kappa^n if non-synonymous.
+ */
+void ModelCodon::computeCodonRateMatrix_1KAPPATS() {
+    memcpy(rates, empirical_rates, getNumRateEntries()*sizeof(double));
+
+    // scaling factors indexed by the number of transitions
+    const double syn_factor[] = {1.0, kappa, kappa*kappa, kappa*kappa*kappa};
+    const double nonsyn_factor[] = {omega, omega*kappa, omega*kappa*kappa, omega*kappa*kappa*kappa};
+
+    for (int from = 0; from < num_states; from++) {
+        if (phylo_tree->aln->isStopCodon(from))
+            continue;
+        double *rate_row = rates + from*num_states;
+        const int *attr_row = rate_attr + from*num_states;
+
+        for (int to = 0; to < num_states; to++) {
+            if (rate_row[to] == 0.0)
+                continue;
+            const int attr = attr_row[to];
+            const bool synonymous = (attr & CA_SYNONYMOUS) != 0;
+            if (!synonymous && !(attr & CA_NONSYNONYMOUS))
+                continue;
+
+            int num_ts = 0;
+            if (attr & CA_TRANSITION_1NT) num_ts++;
+            if (attr & CA_TRANSITION_2NT) num_ts++;
+            if (attr & CA_TRANSITION_3NT) num_ts++;
+
+            rate_row[to] *= synonymous ? syn_factor[num_ts] : nonsyn_factor[num_ts];
+        }
+    }
+}
+
+void ModelCodon::computeCodonRateMatrix_1KAPPATV() {
+    // Start from the empirical rates, then scale each non-zero entry by kappa^(number of
+    // transversions), with an additional factor omega for non-synonymous substitutions.
+    memcpy(rates, empirical_rates, getNumRateEntries()*sizeof(double));
+
+    const double tv_factor[] = {1.0, kappa, kappa*kappa, kappa*kappa*kappa};
+    const double omega_tv_factor[] = {omega, omega*kappa, omega*kappa*kappa, omega*kappa*kappa*kappa};
+
+    for (int from = 0; from < num_states; from++) {
+        if (phylo_tree->aln->isStopCodon(from))
+            continue; // rows of stop codons keep the copied empirical rates
+        double *row = rates + from*num_states;
+        const int *row_attr = rate_attr + from*num_states;
+        for (int to = 0; to < num_states; to++) {
+            if (row[to] == 0.0)
+                continue;
+            const int attr = row_attr[to];
+            // count the codon positions (1st, 2nd, 3rd) flagged as changing by a transversion
+            const int num_tv = ((attr & CA_TRANSVERSION_1NT) != 0) + ((attr & CA_TRANSVERSION_2NT) != 0)
+                             + ((attr & CA_TRANSVERSION_3NT) != 0);
+            if (attr & CA_SYNONYMOUS)
+                row[to] *= tv_factor[num_tv];
+            else if (attr & CA_NONSYNONYMOUS)
+                row[to] *= omega_tv_factor[num_tv];
+        }
+    }
+}
+
+void ModelCodon::computeCodonRateMatrix_2KAPPA() {
+    // Two-kappa variant: kappa is raised to the number of transitions, kappa2 to the number of
+    // transversions; non-synonymous substitutions are additionally multiplied by omega.
+    memcpy(rates, empirical_rates, getNumRateEntries()*sizeof(double));
+
+    const double ts_factor[] = {1.0, kappa, kappa*kappa, kappa*kappa*kappa};
+    const double omega_ts_factor[] = {omega, omega*kappa, omega*kappa*kappa, omega*kappa*kappa*kappa};
+    const double tv_factor[] = {1.0, kappa2, kappa2*kappa2, kappa2*kappa2*kappa2};
+
+    for (int from = 0; from < num_states; from++) {
+        if (phylo_tree->aln->isStopCodon(from))
+            continue; // rows of stop codons keep the copied empirical rates
+        double *row = rates + from*num_states;
+        const int *row_attr = rate_attr + from*num_states;
+        for (int to = 0; to < num_states; to++) {
+            if (row[to] == 0.0)
+                continue;
+            const int attr = row_attr[to];
+            // per-position transition / transversion counts taken from the attribute flags
+            const int num_ts = ((attr & CA_TRANSITION_1NT) != 0) + ((attr & CA_TRANSITION_2NT) != 0)
+                             + ((attr & CA_TRANSITION_3NT) != 0);
+            const int num_tv = ((attr & CA_TRANSVERSION_1NT) != 0) + ((attr & CA_TRANSVERSION_2NT) != 0)
+                             + ((attr & CA_TRANSVERSION_3NT) != 0);
+            if (attr & CA_SYNONYMOUS)
+                row[to] *= ts_factor[num_ts]*tv_factor[num_tv];
+            else if (attr & CA_NONSYNONYMOUS)
+                row[to] *= omega_ts_factor[num_ts]*tv_factor[num_tv];
+        }
+    }
+}
+
+double ModelCodon::computeEmpiricalOmega() {
+    // Ratio of the frequency-weighted sum of rates flagged CA_NONSYNONYMOUS (dn) to the
+    // frequency-weighted sum of all remaining rate entries (ds), rescaled by 0.21/0.79.
+    // Rows that belong to stop codons do not contribute.
+    // NOTE(review): nothing guards against ds == 0 here.
+    double dn = 0.0;
+    double ds = 0.0;
+
+    for (int from = 0; from < num_states; from++) {
+        if (phylo_tree->aln->isStopCodon(from))
+            continue;
+        const double *row = rates + from*num_states;
+        const int *row_attr = rate_attr + from*num_states;
+        for (int to = 0; to < num_states; to++) {
+            // weight: source frequency alone when ignore_state_freq is set,
+            // otherwise the product of source and target frequencies
+            double weight = state_freq[from];
+            if (!ignore_state_freq)
+                weight *= state_freq[to];
+
+            if (row_attr[to] & CA_NONSYNONYMOUS)
+                dn += weight*row[to];
+            else
+                ds += weight*row[to];
+        }
+    }
+
+    // NOTE(review): 0.21/0.79 is presumably the correction constant of the "corrected" omega -- TODO confirm
+    return (dn/ds)*(0.21/0.79);
+}
+    
+
+
+void ModelCodon::getVariables(double *variables) {
+    // Unpack the optimizer vector (indexed from 1) into the model parameters:
+    // omega, kappa and kappa2 are read in that order, each only if it is not fixed.
+    if (num_params > 0) {
+        int pos = 1;
+        if (!fix_omega)
+            omega = variables[pos++];
+        if (!fix_kappa)
+            kappa = variables[pos++];
+        if (!fix_kappa2)
+            kappa2 = variables[pos++];
+        // every free parameter must have consumed exactly one slot
+        assert(pos == num_params+1);
+    }
+
+    if (freq_type != FREQ_ESTIMATE)
+        return;
+
+    // Estimated frequencies occupy variables[offset+1 .. offset+num_states-1] and are
+    // stored as ratios to the frequency of highest_freq_state, whose own ratio (1.0)
+    // is implicit. Normalizing by the sum of all ratios recovers the frequencies.
+    const int offset = getNDim() - (num_states-1);
+
+    double sum = 1.0;
+    for (int k = 1; k < num_states; k++)
+        sum += variables[offset+k];
+
+    for (int state = 0, k = 1; state < num_states; state++) {
+        if (state == highest_freq_state)
+            continue;
+        state_freq[state] = variables[offset+k] / sum;
+        k++;
+    }
+    state_freq[highest_freq_state] = 1.0/sum;
+}
+
+void ModelCodon::setVariables(double *variables) {
+    // Pack the model parameters into the optimizer vector (indexed from 1):
+    // omega, kappa and kappa2 are written in that order, each only if it is not fixed.
+    if (num_params > 0) {
+        int pos = 1;
+        if (!fix_omega)
+            variables[pos++] = omega;
+        if (!fix_kappa)
+            variables[pos++] = kappa;
+        if (!fix_kappa2)
+            variables[pos++] = kappa2;
+        // every free parameter must have filled exactly one slot
+        assert(pos == num_params+1);
+    }
+
+    if (freq_type != FREQ_ESTIMATE)
+        return;
+
+    // Estimated frequencies go to variables[offset+1 .. offset+num_states-1] as ratios
+    // to the frequency of highest_freq_state; that reference state itself is not stored.
+    const int offset = getNDim() - (num_states-1);
+    int slot = offset + 1;
+    for (int state = 0; state < num_states; state++) {
+        if (state == highest_freq_state)
+            continue;
+        variables[slot++] = state_freq[state] / state_freq[highest_freq_state];
+    }
+}
+
+void ModelCodon::writeInfo(ostream &out) {
+    if (name.find('_') != string::npos) // '_' in the model name: report the empirical omega
+        out << "Empirical nonsynonymous/synonymous ratio (omega_E): " << computeEmpiricalOmega() << endl;
+    else
+        out << "Nonsynonymous/synonymous ratio (omega): " << omega << endl;
+    out << "Transition/transversion ratio (kappa): " << kappa << endl;
+    if (codon_kappa_style == CK_TWO_KAPPA)
+        out << "Transition/transversion ratio 2 (kappa2): " << kappa2 << endl;
+}
+
diff --git a/model/modelcodon.h b/model/modelcodon.h
new file mode 100644
index 0000000..c6f064e
--- /dev/null
+++ b/model/modelcodon.h
@@ -0,0 +1,179 @@
+/*
+ * modelcodon.h
+ *
+ *  Created on: May 24, 2013
+ *      Author: minh
+ */
+
+#ifndef MODELCODON_H_
+#define MODELCODON_H_
+
+#include "modelgtr.h"
+
+/** CF_TARGET_NT: frequency of target nucleotide is multiplied with the rate entry (Muse and Gaut 1994)
+    CF_TARGET_CODON: frequency of target codon is multiplied with the rate entry (Goldman and Yang 1994)
+    */
+enum CodonFreqStyle {CF_TARGET_NT, CF_TARGET_CODON};
+
+enum CodonKappaStyle {CK_ONE_KAPPA, CK_ONE_KAPPA_TS, CK_ONE_KAPPA_TV, CK_TWO_KAPPA}; // how kappa enters the rate matrix; presumably one style per computeCodonRateMatrix_* variant -- TODO confirm
+
+const int CA_STOP_CODON   = 1; // stop codon substitution
+const int CA_MULTI_NT     = 2; // codon substitution involves > 1 NT
+const int CA_SYNONYMOUS   = 4; // synonymous codon substitution
+const int CA_NONSYNONYMOUS= 8; // nonsynonymous codon substitution
+const int CA_TRANSVERSION = 16; // codon substitution involves 1 NT transversion
+const int CA_TRANSITION   = 32; // codon substitution involves 1 NT transition
+const int CA_TRANSVERSION_1NT = 64; // codon substitution involves the 1st NT, which is a transversion
+const int CA_TRANSVERSION_2NT = 128; // codon substitution involves the 2nd NT, which is a transversion
+const int CA_TRANSVERSION_3NT = 256; // codon substitution involves the 3rd NT, which is a transversion
+const int CA_TRANSITION_1NT   = 512; // codon substitution involves the 1st NT, which is a transition
+const int CA_TRANSITION_2NT   = 1024; // codon substitution involves the 2nd NT, which is a transition
+const int CA_TRANSITION_3NT   = 2048; // codon substitution involves the 3rd NT, which is a transition
+
+/**
+ * Codon substitution models
+ */
+class ModelCodon: public ModelGTR {
+public:
+	/**
+		constructor
+		@param model_name model name, e.g., GY,YN
+		@param freq state frequency type
+		@param tree associated phylogenetic tree
+	*/
+	ModelCodon(const char *model_name, string model_params, StateFreqType freq, string freq_params,
+    		PhyloTree *tree, bool count_rates = true);
+
+	/**
+	 * destructor
+	 */
+	virtual ~ModelCodon();
+
+	/**
+		@return the number of rate entries, i.e. num_states*num_states (diagonal included),
+        since the full matrix is stored here
+	*/
+	virtual int getNumRateEntries() { return num_states*(num_states); }
+
+	/**
+		initialization, called automatically by the constructor, no need to call it
+		@param model_name model name, e.g., JC, HKY.
+		@param freq state frequency type
+	*/
+	virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+
+	StateFreqType initCodon(const char *model_name, StateFreqType freq, bool reset_params);
+
+
+	/**
+	 * @return the model name; this implementation returns name unchanged, without a parameter list
+	 */
+	virtual string getNameParams() { return name; }
+
+    /** main function to compute rate matrix */
+    void computeCodonRateMatrix();
+
+	/**
+		decompose the rate matrix into eigenvalues and eigenvectors
+	*/
+	virtual void decomposeRateMatrix();
+
+	/**
+	 * read codon model from a stream, modifying rates and state_freq accordingly
+	 * @param in input stream containing lower triangular matrix of rates, frequencies and list of codons
+	 */
+	void readCodonModel(istream &in, bool reset_params);
+
+	/**
+	 * read codon model from a string, modifying rates and state_freq accordingly
+	 * @param str input string containing lower triangular matrix of rates, frequencies and list of codons
+	 */
+	void readCodonModel(string &str, bool reset_params);
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+    /** compute rate_attr for all codon-to-codon substitutions */
+    void computeRateAttributes();
+    
+    /** combine rates with target nucleotide frequency (ntfreq) for MG-style model */
+    void combineRateNTFreq();
+
+    /** compute the corrected empirical omega (Kosiol et al 2007) */
+    double computeEmpiricalOmega();
+    
+
+	/** 3x4 matrix of nucleotide frequencies at 1st,2nd,3rd codon position */
+	double *ntfreq;
+
+    /** dn/ds rate ratio */
+    double omega;
+    
+    /** TRUE to fix omega, default: FALSE */
+    bool fix_omega; 
+
+    /** style for kappa */
+    CodonKappaStyle codon_kappa_style;
+
+    /** ts/tv rate ratio */
+    double kappa;
+    
+    /** TRUE to fix kappa, default: FALSE */
+    bool fix_kappa;
+
+    /** ts/tv rate ratio for 2-kappa model (Kosiol et al 2007) */
+    double kappa2;
+    
+    /** TRUE to fix kappa2, default: FALSE */
+    bool fix_kappa2;
+    
+    /** GY- or MG-style codon frequencies */
+    CodonFreqStyle codon_freq_style;
+    
+    /** rate attributes: bitwise combination of CA_* flags, one entry per rate matrix cell */
+    int *rate_attr;
+    
+	/** empirical rates for empirical codon model or parametric+empirical codon model */
+	double *empirical_rates;
+    
+protected:
+
+    void computeCodonRateMatrix_1KAPPA();
+    void computeCodonRateMatrix_1KAPPATS();
+    void computeCodonRateMatrix_1KAPPATV();
+    void computeCodonRateMatrix_2KAPPA();
+
+	/** initialize Muse-Gaut 1994 model 
+        @param fix_kappa whether or not to fix kappa
+        @param freq input frequency
+        @return default frequency type
+    */
+	StateFreqType initMG94(bool fix_kappa, StateFreqType freq, CodonKappaStyle kappa_style);
+
+	/** initialize Goldman-Yang 1994 model (simplified version with 2 parameters omega and kappa)
+        @param fix_kappa whether or not to fix kappa
+        @param kappa_style: CK_ONE_KAPPA for traditional GY model, others follow Kosiol et al 2007
+        @return default frequency type
+    */
+	StateFreqType initGY94(bool fix_kappa, CodonKappaStyle kappa_style);
+
+	/**
+		this function serves the multi-dimensional optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function serves the multi-dimensional optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+};
+
+#endif /* MODELCODON_H_ */
diff --git a/model/modelcodonempirical.cpp b/model/modelcodonempirical.cpp
new file mode 100644
index 0000000..9b80cf3
--- /dev/null
+++ b/model/modelcodonempirical.cpp
@@ -0,0 +1,270 @@
+/*
+ * modelcodonempirical.cpp
+ *
+ *  Created on: May 29, 2013
+ *      Author: minh
+ */
+
+#include "modelcodonempirical.h"
+
+/* Empirical codon model restricted (Kosiol et al. 2007), source: http://www.ebi.ac.uk/goldman/ECM/ */
+string model_ECMrest =
+"11.192024 \
+1.315610 0.010896 \
+5.427076 4.756288 24.748755 \
+1.658051 0.000000 0.000000 0.000000 \
+0.000000 1.913571 0.000000 0.000000 13.889102 \
+0.000000 0.000000 2.952332 0.000000 44.407955 13.681751 \
+0.000000 0.000000 0.000000 8.126914 17.057443 65.097021 12.991861 \
+6.610894 0.000000 0.000000 0.000000 2.206054 0.000000 0.000000 0.000000 \
+0.000000 5.177930 0.000000 0.000000 0.000000 5.615472 0.000000 0.000000 19.942818 \
+3.347364 0.000000 0.000000 0.000000 6.191481 0.000000 0.000000 0.000000 0.582084 0.000000 \
+0.000000 1.558523 0.000000 0.000000 0.000000 9.339206 0.000000 0.000000 0.000000 0.144278 44.777964 \
+0.000000 0.000000 0.000000 5.369644 0.000000 0.000000 0.000000 4.662001 0.000000 0.000000 0.677177 0.073268 \
+2.090751 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \
+0.000000 2.266373 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.905484 \
+0.000000 0.000000 75.752638 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 56.803876 7.811205 \
+0.000000 0.000000 0.000000 20.877218 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.432339 22.078564 5.650116 \
+0.000000 0.000000 0.000000 0.000000 1.769355 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.263838 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 2.704601 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.389735 0.000000 0.000000 17.461627 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.312811 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.393680 0.000000 35.480963 12.053827 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.303480 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.477616 8.407091 28.557939 11.295213 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.444964 0.000000 0.000000 0.000000 0.000000 1.583116 0.000000 0.000000 0.000000 1.021682 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 7.087801 0.000000 0.000000 0.000000 0.000000 3.230751 0.000000 0.000000 0.000000 3.774544 0.000000 0.000000 28.086160 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 7.419058 0.000000 0.000000 0.000000 5.381868 0.000000 3.440380 1.918904 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.812540 0.000000 0.000000 0.000000 1.794388 1.086327 5.369463 14.959151 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.617091 0.000000 0.000000 0.779565 0.000000 0.000000 0.000000 0.334165 0.000000 0.000000 0.000000 3.019726 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.632945 0.000000 0.000000 2.250770 0.000000 0.000000 0.000000 1.699302 0.000000 0.000000 0.000000 7.016899 0.000000 0.000000 14.603857 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.023939 0.000000 0.000000 0.000000 1.693662 0.000000 0.000000 0.000000 6.415757 0.000000 99.459951 14.930266 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.026086 0.000000 0.000000 0.000000 1.462945 0.000000 0.000000 0.000000 3.144296 0.000000 0.000000 0.000000 19.920977 30.804750 79.483730 13.919752 \
+1.682029 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.301225 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \
+0.000000 0.786043 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.381841 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 10.140728 \
+0.000000 0.000000 10.116588 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.134459 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 18.298900 4.623936 \
+0.000000 0.000000 0.000000 7.911096 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.570123 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.281784 1.303951 2.082128 \
+0.000000 0.000000 0.000000 0.000000 38.229100 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.578976 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.801564 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 15.793595 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.434550 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.231468 0.000000 0.000000 6.035740 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.033932 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.925575 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.962350 0.000000 28.307876 6.967655 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 17.103904 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.238450 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.155285 19.578982 38.414969 12.678802 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.245405 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.004762 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.501054 0.000000 0.000000 0.000000 11.715476 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.228361 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.105602 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.292691 0.000000 0.000000 0.000000 2.134740 0.000000 0.000000 13.863648 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.404436 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.647620 0.000000 0.000000 0.000000 3.919360 0.000000 4.929483 0.366267 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.715692 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.975074 0.000000 0.000000 0.000000 5.869857 1.010212 0.982893 10.762877 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.719489 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.834666 0.000000 0.000000 0.000000 0.578118 0.000000 0.000000 0.000000 39.399322 0.000000 0.000000 0.000000 16.623529 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.047654 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.033630 0.000000 0.000000 0.000000 0.437779 0.000000 0.000000 0.000000 21.337943 0.000000 0.000000 0.000000 7.784768 0.000000 0.000000 26.637668 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 92.372238 0.000000 0.000000 0.000000 1.903175 0.000000 0.000000 0.000000 0.754055 0.000000 0.000000 0.000000 8.423762 0.000000 1.792245 0.120900 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.825082 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 133.296291 0.000000 0.000000 0.000000 2.231662 0.000000 0.000000 0.000000 22.577271 0.000000 0.000000 0.000000 21.000358 3.324581 6.011970 36.292705 \
+2.261813 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.473623 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 7.096281 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \
+0.000000 1.923392 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.914972 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 10.137337 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.669955 \
+0.000000 0.000000 2.362720 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.737489 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 25.294298 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 26.045078 3.531461 \
+0.000000 0.000000 0.000000 2.022101 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.164805 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.078444 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.901167 21.657664 11.898141 \
+0.000000 0.000000 0.000000 0.000000 5.540052 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.159185 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.107629 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.682092 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 7.675838 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.120189 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.312255 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.308415 0.000000 0.000000 6.516319 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 9.880382 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.923972 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.064069 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.291148 0.000000 21.910225 5.090423 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 21.863158 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.034856 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 25.461549 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.166554 5.512586 20.715347 9.529141 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.367553 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.383706 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 6.091654 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.352915 0.000000 0.000000 0.000000 0.693026 0.000000 0.000000 0.000000 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.294702 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.006827 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.686074 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.208522 0.000000 0.000000 0.000000 1.866565 0.000000 0.000000 10.605899 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.485369 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.811398 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.277861 0.000000 0.000000 0.000000 2.774445 0.000000 2.710610 0.650088 \
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 7.686782 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.090641 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.476105 0.000000 0.000000 0.000000 9.441919 1.296294 3.7790 [...]
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.104727 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.041150 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 10.590780 0.000000 0.000000 0.000000 0.503385 0.000000 0.000000 0.000000 1.541379 0.000000 0.000000 0.000000 1.042624 0.000 [...]
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.552851 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.252470 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.285543 0.000000 0.000000 0.000000 0.542717 0.000000 0.000000 0.000000 2.303487 0.000000 0.000000 0.000000 1.5616 [...]
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.091041 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.432410 0.000000 0.000000 0.000000 0.702411 0.000000 0.000000 0.000000 2.985093 0.000000 0.000000 0.0000 [...]
+0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.810856 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.803738 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.388514 0.000000 0.000000 0.000000 0.302501 0.000000 0.000000 0.000000 6.644971 0.000000 0.0000 [...]
+\
+0.022103  0.021383  0.016387  0.015425  0.011880  0.011131  0.009750  0.008956  0.015965  0.015782  0.006025  0.007029  0.011880  0.014467  0.017386  0.007600  0.028839  0.010007  0.010100  0.010642  0.011843  0.011097  0.011703  0.016076  0.020211  0.008311  0.014148  0.004800  0.007837  0.025576  0.023441  0.013551  0.020102  0.013424  0.020201  0.015528  0.012142  0.023006  0.020171  0.030001  0.026344  0.010142  0.011679  0.010372  0.008195  0.019047  0.018938  0.010901  0.022747  0. [...]
+\
+TTT TTC TTA TTG TCT TCC TCA TCG TAT TAC TGT TGC TGG CTT CTC CTA CTG CCT CCC CCA \
+CCG CAT CAC CAA CAG CGT CGC CGA CGG ATT ATC ATA ATG ACT ACC ACA ACG AAT AAC AAA \
+AAG AGT AGC AGA AGG GTT GTC GTA GTG GCT GCC GCA GCG GAT GAC GAA GAG GGT GGC GGA \
+GGG";
+
+/* Empirical codon model unrestricted (Kosiol et al. 2007), source: http://www.ebi.ac.uk/goldman/ECM/ */
+string model_ECMunrest =
+"16.011531 \
+2.395822 0.151858 \
+1.204356 0.675537 18.541946 \
+0.773935 0.052602 0.249707 0.274990 \
+0.030074 0.656004 0.011609 0.158873 23.655090 \
+0.278090 0.056677 1.184813 0.611887 35.921779 15.982573 \
+0.034137 0.198277 0.010188 0.694091 11.510965 35.359077 17.424222 \
+4.317981 0.503397 0.798582 0.337279 0.688169 0.047115 0.341791 0.058136 \
+0.481042 4.483501 0.033529 0.177833 0.069588 0.524116 0.070809 0.213967 24.177765 \
+0.733587 0.076912 0.645571 0.395942 1.811753 0.343463 0.751980 0.143447 0.822999 0.054860 \
+0.045951 0.561620 0.040012 0.240632 0.138244 1.323765 0.121937 0.493179 0.068342 0.628438 56.838378 \
+0.786871 1.183337 0.271072 0.632947 0.069758 0.081312 0.195833 0.410046 1.140051 1.421996 0.264556 0.210115 \
+2.016257 0.207692 12.035723 11.161511 0.277929 0.000186 0.000289 0.000000 0.485469 0.000299 0.543240 0.000674 0.010122 \
+0.083684 2.306110 1.373823 5.651603 0.000085 0.342813 0.000096 0.000344 0.000116 0.622089 0.000466 0.674176 0.113701 15.874441 \
+1.036474 0.198558 27.219895 16.560966 0.000678 0.000186 0.496046 0.000115 0.016650 0.011978 0.020649 0.021578 0.017106 21.437257 8.808275 \
+0.073550 1.341144 1.045943 12.455337 0.000000 0.001022 0.000000 0.266943 0.004815 0.308859 0.002639 0.265948 0.504866 4.802017 15.484088 8.319767 \
+0.324368 0.000141 0.001358 0.003499 2.846677 0.196358 0.544474 0.078776 0.337879 0.000479 0.239715 0.000270 0.061833 0.822643 0.036254 0.181411 0.014388 \
+0.000140 0.285635 0.000000 0.000382 0.101204 2.487136 0.072352 0.432520 0.000116 0.310416 0.000000 0.215779 0.032564 0.026571 0.648769 0.040087 0.149771 23.496083 \
+0.025217 0.006558 0.261069 0.005535 0.487542 0.138742 3.121656 0.151589 0.032140 0.025873 0.002795 0.010250 0.070308 0.065669 0.016609 1.073790 0.040917 40.922701 15.426733 \
+0.004063 0.079161 0.000000 0.112999 0.021444 0.371063 0.064924 2.075226 0.004177 0.037372 0.000155 0.004585 0.215788 0.007978 0.118229 0.016442 0.495176 10.291826 33.453780 15.127582 \
+0.638696 0.001312 0.026551 0.040275 1.253945 0.002137 0.128111 0.073730 3.088481 0.340541 0.634065 0.001483 0.195073 0.664866 0.057328 0.438648 0.044742 0.775254 0.091276 0.286252 0.054021 \
+0.000467 0.761771 0.000123 0.002163 0.000593 1.144692 0.014470 0.114551 0.265766 3.193996 0.000155 0.483076 0.369273 0.058614 0.617694 0.059927 0.330036 0.061583 0.730306 0.089835 0.364129 38.685701 \
+0.126320 0.016628 0.576476 0.007508 0.508308 0.080383 2.066955 0.002179 0.486281 0.079236 0.163174 0.032232 0.055163 0.529045 0.071794 1.205738 0.033372 0.435109 0.074846 1.052040 0.063366 2.473439 0.751904 \
+0.009760 0.107218 0.000000 0.250748 0.049246 0.423382 0.002122 1.519211 0.092070 0.332396 0.057910 0.105597 0.247490 0.079119 0.422671 0.105449 0.703795 0.107434 0.529594 0.184327 0.715716 1.106179 2.503268 17.923045 \
+0.143832 0.000094 0.000741 0.003054 0.660622 0.001208 0.000579 0.001720 0.534375 0.001377 0.726908 0.077815 0.019696 0.663877 0.068758 0.134394 0.015019 0.500433 0.124232 0.063413 0.044676 2.460976 0.277265 1.164262 0.340811 \
+0.000000 0.200806 0.000000 0.000064 0.000000 0.685812 0.000000 0.032106 0.000116 0.604541 0.012886 0.516927 0.176476 0.016022 0.544828 0.005436 0.563956 0.002398 0.563799 0.001702 0.798346 0.170088 2.478358 0.148940 2.029914 27.244097 \
+0.030121 0.016020 0.136647 0.001527 0.006103 0.004089 0.557015 0.003211 0.043917 0.051686 0.232728 0.166150 0.146501 0.424607 0.112395 0.918198 0.041969 0.352807 0.154017 0.626603 0.091073 1.353860 0.526904 4.725840 0.617320 39.595443 12.677657 \
+0.000934 0.027355 0.000000 0.127696 0.000085 0.004832 0.000000 1.903571 0.003713 0.081931 0.023909 0.183143 1.135910 0.039428 0.640495 0.040902 0.794366 0.009880 0.897101 0.010300 1.164525 0.316372 2.208430 0.299978 4.718199 12.868484 35.563093 30.574631 \
+1.119411 0.059956 2.130663 1.292935 0.172403 0.000000 0.000386 0.000000 0.352731 0.000180 0.431456 0.000405 0.078312 3.330793 0.184010 1.328581 0.089308 0.292855 0.000096 0.002597 0.000246 0.193328 0.000000 0.078926 0.003859 0.076434 0.000000 0.000416 0.000000 \
+0.056038 1.006045 0.042112 0.478019 0.000000 0.115975 0.000096 0.000344 0.000116 0.255975 0.000311 0.309643 0.136849 0.390190 3.765697 0.203017 2.469249 0.000096 0.270274 0.000448 0.021723 0.000469 0.127899 0.010543 0.105885 0.000118 0.238839 0.001248 0.003064 13.609310 \
+1.075187 0.064968 5.159075 1.065537 0.000424 0.000000 0.403435 0.000000 0.573013 0.025454 0.069555 0.012138 0.170041 1.260239 0.136148 4.400610 0.048882 0.002014 0.000000 0.480521 0.000000 0.040109 0.000272 0.390087 0.000048 0.000000 0.000000 0.121855 0.000000 16.415611 5.784672 \
+0.679370 0.800602 1.418466 3.062807 0.093491 0.042282 0.246094 0.527005 0.294368 0.300354 0.298091 0.324613 0.321642 1.220020 1.434579 1.635281 2.236557 0.081631 0.008455 0.042006 0.193459 0.323588 0.163406 0.443617 0.834976 0.028736 0.029786 0.015596 0.408680 1.155098 1.428293 2.230691 \
+0.497293 0.000141 0.012473 0.015652 4.693944 0.487317 2.297807 0.199748 0.599932 0.000599 1.089585 0.001483 0.035939 0.831215 0.000060 0.004348 0.000070 1.050363 0.053805 0.345545 0.011476 0.898794 0.000272 0.374419 0.029088 0.344601 0.000000 0.001040 0.000000 1.266654 0.075878 0.351882 0.419831 \
+0.000093 0.371541 0.000062 0.002990 0.196983 3.829580 0.150107 2.395833 0.000116 0.393545 0.000776 0.806071 0.037822 0.000396 0.897490 0.000679 0.073657 0.044125 0.870967 0.027138 0.527094 0.001125 0.969387 0.066519 0.617464 0.001060 1.481721 0.002079 0.017774 0.034573 1.066285 0.016701 0.433759 13.991583 \
+0.079948 0.010539 0.871568 0.011134 2.018483 0.409721 5.709933 0.349846 0.208041 0.053363 0.415775 0.061227 0.060421 0.000857 0.000119 0.944560 0.000070 0.368538 0.026230 1.018005 0.015001 0.082373 0.021920 1.660885 0.001302 0.000589 0.000000 0.360575 0.000000 0.296014 0.048902 2.260424 0.853779 31.915858 8.373639 \
+0.008032 0.036489 0.000062 0.586818 0.361164 1.562591 0.516594 4.919174 0.042119 0.177757 0.053874 0.173298 0.362681 0.000264 0.000595 0.000408 0.733202 0.079712 0.435243 0.083475 0.962295 0.076938 0.103080 0.002854 1.766961 0.004593 0.014243 0.002911 2.985421 0.090674 0.311759 0.154441 1.376727 12.116657 28.470047 19.459275 \
+0.263567 0.000094 0.271628 0.077878 1.773102 0.000929 0.872084 0.040706 0.747870 0.042762 0.360038 0.000135 0.074859 0.259380 0.000000 0.019568 0.000140 0.787340 0.000192 0.096104 0.002705 2.691226 0.188587 1.759732 0.206851 0.682254 0.000068 0.009981 0.000981 0.239388 0.014351 0.256283 0.208924 2.057449 0.067554 1.243753 0.224397 \
+0.000093 0.143614 0.002840 0.060699 0.003390 1.118208 0.187826 0.725836 0.046586 0.487814 0.000932 0.325422 0.053908 0.000330 0.187880 0.014540 0.034916 0.000959 0.532092 0.074161 0.095418 0.460970 2.203539 0.377447 0.985145 0.003180 0.863191 0.016636 0.065212 0.025405 0.175491 0.020837 0.219170 0.296066 1.346385 0.259909 0.822133 17.634677 \
+0.148268 0.005996 0.612660 0.004963 0.340991 0.020909 1.496628 0.000459 0.467136 0.042942 0.099519 0.004316 0.051005 0.060922 0.002143 0.433620 0.000035 0.247195 0.012394 1.042457 0.000410 0.899262 0.143479 3.215817 0.285384 1.769879 0.296358 3.065510 0.011032 0.221536 0.023020 0.667397 0.275355 0.878163 0.089476 1.523251 0.199589 2.075154 0.413957 \
+0.013122 0.043609 0.000062 0.333780 0.156468 0.251650 0.004438 0.768607 0.072867 0.140864 0.011489 0.034794 0.105226 0.039362 0.022384 0.000679 0.133032 0.220816 0.303613 0.012002 0.561523 0.324525 0.571469 0.461383 2.285052 0.560831 2.721043 0.034519 3.832200 0.041440 0.116405 0.056267 0.497593 0.291009 0.623366 0.256174 1.144639 0.524647 1.038682 12.931524 \
+0.225554 0.000047 0.010929 0.009289 12.169045 0.083636 5.964323 0.681575 0.506470 0.000180 2.007768 0.181794 0.139046 0.322807 0.000060 0.009512 0.000105 1.035015 0.000288 0.021048 0.001066 1.293228 0.000453 1.177718 0.083261 0.772349 0.018146 0.530881 0.025006 0.359183 0.018727 0.269862 0.306375 4.943439 0.275865 2.397415 0.563566 4.971507 0.586685 1.293860 0.389004 \
+0.000093 0.166706 0.000432 0.005790 0.094762 10.892976 1.049877 9.818281 0.000116 0.346890 0.248099 1.372357 0.167138 0.000330 0.300513 0.002718 0.017300 0.001247 1.337629 0.005195 0.104681 0.005060 1.282522 0.232701 1.418383 0.387941 1.320875 0.354545 1.360141 0.031140 0.238533 0.021539 0.304581 0.622868 4.699375 0.441084 2.871848 0.643789 4.127466 0.334224 0.928876 28.579806 \
+0.140516 0.012600 0.423774 0.001718 0.047890 0.002044 1.094736 0.000115 0.424321 0.060909 0.144388 0.030883 0.145245 0.004747 0.000060 0.409432 0.000000 0.011511 0.000384 0.493508 0.000000 0.411115 0.025544 1.242140 0.000289 17.450524 1.113671 31.949764 2.418859 0.116039 0.012331 0.780008 0.305714 0.432918 0.021698 1.316696 0.087905 0.936840 0.273855 5.815294 1.197614 1.644621 0.403913 \
+0.083310 0.056771 0.000247 0.506841 0.063994 0.028715 0.009261 0.514392 0.200499 0.269510 0.122186 0.070533 0.496706 0.009560 0.000952 0.000951 0.040320 0.060432 0.033244 0.009225 0.273876 0.140943 0.169022 0.003786 1.148387 6.798629 4.087042 15.287419 18.531553 0.093542 0.062369 0.317466 0.905810 0.309656 0.140701 0.511968 0.765495 0.454347 0.600415 1.868194 7.316623 1.477696 1.286990 43.916187 \
+0.863970 0.065905 0.748196 0.529619 0.563995 0.000186 0.002219 0.000115 0.571505 0.000359 1.598824 0.004316 0.038763 1.897150 0.072866 0.555104 0.011580 0.708395 0.000192 0.009225 0.000246 0.338207 0.000091 0.150804 0.009600 0.336828 0.000000 0.003743 0.000000 6.546163 0.575921 2.577578 0.430124 2.898791 0.003020 0.056250 0.004868 0.318595 0.000295 0.201480 0.072138 0.501456 0.000219 0.032116 0.051709 \
+0.026338 0.946136 0.005990 0.140867 0.000085 0.396897 0.000096 0.001261 0.000116 0.582801 0.001087 1.273908 0.092044 0.105361 2.720153 0.043756 1.085170 0.000192 0.760090 0.000537 0.031642 0.000375 0.491034 0.006757 0.069320 0.000589 0.648523 0.001871 0.005026 0.458622 7.487816 0.154207 0.350473 0.001027 2.862216 0.002515 0.010298 0.000000 0.215492 0.001930 0.056145 0.000097 0.381287 0.000205 0.002062 10.956917 \
+0.565566 0.047403 2.299543 0.762425 0.001356 0.000279 0.813720 0.000115 0.236179 0.060430 0.754233 0.086986 0.058616 0.509595 0.031730 2.047159 0.020529 0.001151 0.000192 0.732111 0.000082 0.019586 0.002808 0.606945 0.000145 0.000353 0.000000 0.255147 0.000000 2.581654 0.313779 11.271062 0.569076 0.009008 0.001957 3.879355 0.002528 0.008844 0.002362 0.708204 0.000524 0.006305 0.000657 0.560642 0.002062 25.313949 5.637509 \
+0.068927 0.371072 0.024699 1.108802 0.000254 0.001394 0.000772 0.451899 0.009630 0.116309 0.126844 0.530278 0.277072 0.091844 0.477439 0.173122 2.262490 0.000384 0.001537 0.000717 0.602428 0.032237 0.044112 0.000466 0.475496 0.001413 0.003287 0.000832 0.701399 0.953353 3.487342 0.911193 1.399673 0.003635 0.089643 0.011433 3.092500 0.000628 0.013582 0.000193 0.311885 0.001358 0.004160 0.000103 0.314379 8.832621 18.744445 13.945647 \
+0.483563 0.000234 0.008892 0.035630 3.994417 0.771771 1.825878 0.250545 0.370309 0.000419 1.899865 0.011733 0.035860 0.879741 0.000417 0.004077 0.000772 1.272426 0.073021 0.346978 0.013526 0.420580 0.000091 0.272950 0.040907 0.324227 0.000000 0.001871 0.000000 0.536332 0.000253 0.001795 0.451134 2.359362 0.120121 0.324162 0.108501 0.643160 0.001329 0.216244 0.268662 2.323091 0.005328 0.013852 0.045374 2.600428 0.131929 0.662578 0.152215 \
+0.000093 0.512203 0.000000 0.001654 0.111968 3.452570 0.084218 1.978448 0.000058 0.380788 0.000466 1.514097 0.044178 0.000132 1.191514 0.000136 0.103766 0.022062 1.394892 0.013077 0.971148 0.000562 0.696016 0.018814 0.634927 0.000236 1.610248 0.000416 0.020471 0.000040 0.536362 0.000000 0.169264 0.038559 2.159328 0.019665 0.498504 0.000045 0.565968 0.005500 0.373948 0.000485 2.214735 0.000000 0.001768 0.046583 2.620833 0.028569 0.682579 9.612709 \
+0.109975 0.016535 1.041312 0.019406 1.931350 0.558500 4.380679 0.505677 0.176829 0.034737 0.806554 0.297371 0.031466 0.002374 0.000357 0.741135 0.000351 0.335924 0.069754 1.236457 0.087384 0.039265 0.004982 1.274118 0.002219 0.000589 0.000068 0.286547 0.000123 0.002262 0.000589 0.983926 0.517896 0.381796 0.077341 2.735831 0.318574 0.084620 0.041435 0.923644 0.004382 0.126382 0.063353 0.461729 0.004420 0.718719 0.092405 3.415722 0.415718 24.400553 6.746560 \
+0.005884 0.074851 0.000000 0.220908 0.103323 1.262618 0.150589 4.658653 0.027035 0.106187 0.028567 0.586111 0.446015 0.000066 0.000893 0.000000 1.524024 0.014101 0.417565 0.017824 1.950083 0.080124 0.190037 0.001165 1.544626 0.001531 0.083744 0.000624 3.409178 0.000081 0.004629 0.000078 0.837302 0.023862 0.728891 0.049848 2.866325 0.003771 0.068501 0.000482 0.759132 0.006402 0.200205 0.000000 0.187832 0.054049 0.968351 0.081861 2.211488 5.140068 19.373137 11.561124 \
+0.064397 0.000000 0.042112 0.038557 1.120532 0.003717 0.348448 0.117533 0.223763 0.015452 0.099985 0.000135 0.028249 0.129492 0.000000 0.012366 0.000491 0.661776 0.000769 0.147873 0.031560 0.746792 0.046739 0.706782 0.130873 0.162525 0.000000 0.007070 0.000368 0.066966 0.000042 0.001171 0.059065 0.928969 0.000559 0.092988 0.042595 3.529593 0.371685 0.604859 0.188097 1.702817 0.012481 0.030474 0.015763 0.153418 0.007112 0.078381 0.011491 0.396521 0.015140 0.189090 0.043198 \
+0.000000 0.055366 0.000062 0.006808 0.000254 1.023142 0.007428 0.670108 0.010037 0.184704 0.000000 0.071612 0.066384 0.000066 0.135255 0.001359 0.015686 0.000096 0.976175 0.003672 0.644235 0.100928 0.975727 0.121389 0.928319 0.000236 0.915505 0.009981 0.150527 0.000000 0.032447 0.000000 0.011379 0.000158 1.013424 0.003354 0.095207 0.167041 2.729647 0.053168 0.426684 0.000388 2.005334 0.000718 0.008986 0.004101 0.119062 0.006776 0.041280 0.018617 0.802516 0.027912 0.702594 14.214694 \
+0.084945 0.006464 0.287373 0.005472 0.330481 0.085680 1.265487 0.002179 0.257122 0.043721 0.028878 0.003641 0.009966 0.039560 0.002679 0.313495 0.000140 0.184749 0.105112 0.890822 0.005410 0.452442 0.106069 3.081614 0.536567 0.034978 0.025678 0.440217 0.000858 0.038612 0.009174 0.361403 0.033994 0.251423 0.109664 1.164866 0.003464 0.975582 0.193544 2.258321 0.308851 0.832592 0.308372 0.668173 0.004420 0.276499 0.042565 0.469281 0.055025 0.502355 0.140546 0.905488 0.227527 2.738552 0.892903 \
+0.010974 0.034428 0.000000 0.159955 0.042380 0.283432 0.001061 1.029128 0.042815 0.136432 0.014439 0.013216 0.137634 0.004220 0.010061 0.000136 0.176300 0.034437 0.294294 0.001791 0.990330 0.159217 0.566034 0.343314 3.036767 0.007891 0.528692 0.001040 2.171984 0.003312 0.031984 0.000078 0.262465 0.033581 0.360196 0.000838 1.447392 0.149578 0.372719 0.159248 1.563846 0.129098 0.822643 0.000410 1.195790 0.049842 0.245019 0.053017 0.362328 0.106257 0.938586 0.157605 1.251589 1.091224 3.1956 [...]
+0.164659 0.000141 0.000741 0.003881 0.976185 0.001951 0.011673 0.007109 0.130940 0.000120 0.420899 0.045044 0.039313 0.169777 0.000060 0.000272 0.000175 0.418802 0.000288 0.002508 0.001312 0.388156 0.000091 0.042812 0.003377 0.241197 0.004656 0.042005 0.011768 0.069995 0.000000 0.000156 0.027479 0.380374 0.000112 0.000534 0.000374 1.322234 0.005905 0.048730 0.021649 2.382451 0.326035 0.037657 0.047437 0.164143 0.016776 0.072521 0.024883 1.572808 0.086923 0.585071 0.083552 0.629243 0.0351 [...]
+0.000000 0.172889 0.000000 0.000191 0.000085 0.880032 0.000289 0.356038 0.000058 0.127388 0.007608 0.309374 0.105305 0.000000 0.240505 0.000000 0.047268 0.000096 0.636916 0.000090 0.395771 0.000843 0.566759 0.016193 0.336277 0.021435 0.676049 0.008942 0.703728 0.000283 0.055425 0.000000 0.018603 0.000000 0.518903 0.000000 0.006459 0.001122 1.110726 0.002863 0.176224 0.054025 2.392606 0.000821 0.012227 0.002050 0.201477 0.001557 0.051048 0.022214 1.797671 0.027973 1.398079 0.037461 1.2280 [...]
+0.113991 0.018315 0.201112 0.001082 0.012121 0.001951 1.720919 0.001720 0.082323 0.029826 0.197641 0.061497 0.073682 0.000330 0.000060 0.165784 0.000070 0.003549 0.000384 0.556204 0.000164 0.097554 0.004982 0.551493 0.000289 0.015310 0.000753 0.247245 0.010419 0.000283 0.000084 0.194319 0.037724 0.002449 0.000112 0.466770 0.000187 0.909861 0.280400 0.713961 0.001760 1.179053 0.298738 0.938439 0.165587 0.080337 0.009773 0.324696 0.016839 0.658541 0.036022 1.693998 0.046588 0.375097 0.0674 [...]
+0.018773 0.032039 0.000000 0.175861 0.002797 0.002974 0.003376 2.163175 0.007948 0.014314 0.105884 0.183952 0.381671 0.000066 0.000119 0.000000 0.185038 0.001918 0.001441 0.001254 0.703092 0.084060 0.053714 0.003029 0.634203 0.043222 0.097165 0.143481 0.590833 0.000081 0.000295 0.000078 0.410199 0.000553 0.000447 0.000610 0.716441 0.194964 0.293884 0.001158 0.744000 0.684968 1.149846 0.069567 1.558784 0.032177 0.064227 0.074536 0.276276 0.238907 0.496552 0.672077 1.526141 0.235747 0.4035 [...]
+\
+0.021414 0.021349 0.016195 0.015717 0.011798 0.010761 0.010366 0.008721 0.017237 0.016697 0.006441 0.007415 0.012744 0.015167 0.016798 0.007359 0.028497 0.010425 0.010408 0.011165 0.012199 0.010671 0.011040 0.017168 0.020730 0.008491 0.014604 0.004809 0.008158 0.024759 0.023762 0.012814 0.021180 0.012656 0.017882 0.013120 0.010682 0.022276 0.020321 0.031090 0.026699 0.010310 0.013701 0.009746 0.006788 0.019020 0.018419 0.010921 0.022626 0.018907 0.026817 0.016516 0.018288 0.028590 0.0252 [...]
+\
+TTT TTC TTA TTG TCT TCC TCA TCG TAT TAC TGT TGC TGG CTT CTC CTA CTG CCT CCC CCA \
+CCG CAT CAC CAA CAG CGT CGC CGA CGG ATT ATC ATA ATG ACT ACC ACA ACG AAT AAC AAA \
+AAG AGT AGC AGA AGG GTT GTC GTA GTG GCT GCC GCA GCG GAT GAC GAA GAG GGT GGC GGA \
+GGG";
+
+
+/**
+ * Construct the ModelCodon base for the given tree and immediately
+ * configure this model through init().
+ */
+ModelCodonEmpirical::ModelCodonEmpirical(const char *model_name,
+		string model_params, StateFreqType freq, string freq_params,
+		PhyloTree *tree, bool count_rates)
+	: ModelCodon(tree, count_rates)
+{
+	this->init(model_name, model_params, freq, freq_params);
+}
+
+/** Destructor; this class's own body performs no cleanup. */
+ModelCodonEmpirical::~ModelCodonEmpirical()
+{
+}
+
+/**
+ * Set up the empirical codon model named by model_name.
+ * The names "ECM" and "ECMREST" (compared case-insensitively) load the
+ * built-in tables model_ECMunrest and model_ECMrest through
+ * readCodonModel(); every other name is passed to readParameters().
+ * Non-empty freq_params / model_params are then applied with
+ * readStateFreq() / readRates() before ModelCodon::init() is called.
+ */
+void ModelCodonEmpirical::init(const char *model_name, string model_params, StateFreqType freq, string freq_params)
+{
+	name = full_name = model_name;
+
+	// upper-cased copy used only for matching the built-in model names
+	string upper_name = model_name;
+	for (size_t k = 0; k < upper_name.length(); k++)
+		upper_name[k] = toupper(upper_name[k]);
+
+	const bool use_unrest = (upper_name == "ECM");
+	const bool use_rest = !use_unrest && (upper_name == "ECMREST");
+	StateFreqType fallback_freq = FREQ_UNKNOWN;
+
+	if (!use_unrest && !use_rest) {
+		readParameters(model_name);
+	} else {
+		fallback_freq = FREQ_USER_DEFINED;
+		if (!phylo_tree->aln->isStandardGeneticCode())
+			outError("For ECM a standard genetic code must be used");
+		try {
+			istringstream in(use_unrest ? model_ECMunrest : model_ECMrest);
+			readCodonModel(in);
+		}
+		catch (const char *str) {
+			outError(str);
+		}
+	}
+
+	if (!freq_params.empty())
+		readStateFreq(freq_params);
+	if (!model_params.empty())
+		readRates(model_params);
+
+	// use the model's default when the caller passed FREQ_UNKNOWN
+	// (or when that default is FREQ_EQUAL)
+	if (freq == FREQ_UNKNOWN || fallback_freq == FREQ_EQUAL)
+		freq = fallback_freq;
+	ModelCodon::init(freq);
+}
+
+/**
+ * Read a codon model from a stream and store it in rates / state_freq.
+ * Layout consumed from the stream, in order: the strictly lower triangle
+ * of a num_states x num_states rate matrix (row by row), num_states
+ * frequencies, then num_states three-letter codons giving the codon
+ * order used by the stream. Codons are mapped to internal states through
+ * phylo_tree->aln->non_stop_codon.
+ * @param in input stream containing lower triangular matrix of rates, frequencies and list of codons
+ */
+void ModelCodonEmpirical::readCodonModel(istream &in) {
+	int i, j;
+	// Array of row pointers: allocate it as double*[] so that the element
+	// type (and the matching delete[] below) is a pointer, not a double.
+	double **q = new double*[num_states];
+	for (i = 0; i < num_states; i++)
+		q[i] = new double[num_states];
+	double *f = new double[num_states];
+	for (i = 1; i < num_states; i++) {
+		for (j = 0; j < i; j++) {
+			in >> q[i][j];
+			q[j][i] = q[i][j]; // mirror into the upper triangle
+			if (verbose_mode >= VB_MAX) cout << " " << q[i][j];
+		}
+		if (verbose_mode >= VB_MAX) cout << endl;
+	}
+	for (i = 0; i < num_states; i++)
+		in >> f[i];
+	// a failed numeric extraction would otherwise leave garbage in q / f
+	if (in.fail())
+		outError("Codon model input is truncated or contains a non-numeric rate/frequency");
+	StrVector codons;
+	codons.resize(num_states);
+	IntVector state_map;
+	state_map.resize(num_states);
+	for (i = 0; i < num_states; i++) {
+		in >> codons[i];
+		if (codons[i].length() != 3)
+			outError("Input model has wrong codon format ", codons[i]);
+		int nt1 = phylo_tree->aln->convertState(codons[i][0], SEQ_DNA);
+		int nt2 = phylo_tree->aln->convertState(codons[i][1], SEQ_DNA);
+		int nt3 = phylo_tree->aln->convertState(codons[i][2], SEQ_DNA);
+		if (nt1 > 3 || nt2 > 3 || nt3 > 3)
+			outError("Wrong codon triplet ", codons[i]);
+		state_map[i] = phylo_tree->aln->non_stop_codon[nt1*16+nt2*4+nt3];
+		if (verbose_mode >= VB_MAX)
+			cout << " " << codons[i] << " " << state_map[i];
+	}
+	if (verbose_mode >= VB_MAX) cout << endl;
+
+	// rates[] is a packed triangle, so each (row, col) pair is first
+	// ordered with row > col and then converted to a linear index
+	for (i = 1; i < num_states; i++) {
+		for (j = 0; j < i; j++) {
+			int row = state_map[i], col = state_map[j];
+			if (row < col) {
+				int tmp = row;
+				row = col;
+				col = tmp;
+			}
+			int id = col*(2*num_states-col-1)/2 + (row-col-1);
+			assert(id < getNumRateEntries() && id >= 0);
+			rates[id] = q[i][j];
+		}
+	}
+	// every state starts at MIN_FREQUENCY; the states listed in the
+	// stream then receive their frequency minus a uniform correction
+	for (i = 0; i < num_states; i++)
+		state_freq[i] = MIN_FREQUENCY;
+	for (i = 0; i < nscodons; i++)
+		state_freq[state_map[i]] = f[i]-(num_states-nscodons)*MIN_FREQUENCY/nscodons;
+
+	num_params = 0;
+
+	delete [] f;
+	for (i = num_states-1; i >= 0; i--)
+		delete [] q[i];
+	delete [] q;
+}
+
diff --git a/model/modelcodonempirical.h b/model/modelcodonempirical.h
new file mode 100644
index 0000000..14a8078
--- /dev/null
+++ b/model/modelcodonempirical.h
@@ -0,0 +1,48 @@
+/*
+ * modelcodonempirical.h
+ *
+ *  Created on: May 29, 2013
+ *      Author: minh
+ */
+
+#ifndef MODELCODONEMPIRICAL_H_
+#define MODELCODONEMPIRICAL_H_
+
+#include "modelcodon.h"
+
+/**
+ * Empirical codon model (e.g., Kosiol et al. 2007).
+ * Inherits virtually from ModelCodon.
+ */
+class ModelCodonEmpirical: virtual public ModelCodon {
+public:
+	/**
+		constructor; forwards tree and count_rates to ModelCodon and then calls init()
+		@param model_name model name, e.g., ECM, ECMREST
+		@param model_params rate parameter string, forwarded to init()
+		@param freq state frequency type
+		@param freq_params state frequency string, forwarded to init()
+		@param tree associated phylogenetic tree
+		@param count_rates forwarded unchanged to the ModelCodon constructor
+	*/
+	ModelCodonEmpirical(const char *model_name, string model_params, StateFreqType freq, string freq_params,
+    		PhyloTree *tree, bool count_rates = true);
+
+	/**
+	 * destructor
+	 */
+	virtual ~ModelCodonEmpirical();
+
+	/**
+		initialization, called automatically by the constructor, no need to call it
+		@param model_name model name, e.g., ECM, ECMREST; other names are treated as user-specified
+		@param model_params rate parameter string; ignored when empty
+		@param freq state frequency type
+		@param freq_params state frequency string; ignored when empty
+	*/
+	virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+
+
+	/**
+	 * read codon model from a stream, modifying rates and state_freq accordingly
+	 * @param in input stream containing lower triangular matrix of rates, frequencies and list of codons
+	 */
+	void readCodonModel(istream &in);
+
+};
+
+#endif /* MODELCODONEMPIRICAL_H_ */
diff --git a/model/modelcodonparametric.cpp b/model/modelcodonparametric.cpp
new file mode 100644
index 0000000..85db803
--- /dev/null
+++ b/model/modelcodonparametric.cpp
@@ -0,0 +1,113 @@
+/*
+ * modelcodonparametric.cpp
+ *
+ *  Created on: May 29, 2013
+ *      Author: minh
+ */
+
+#include "modelcodonparametric.h"
+
+/**
+ * Construct the ModelCodon base for the given tree and immediately
+ * configure this model through init().
+ */
+ModelCodonParametric::ModelCodonParametric(const char *model_name,
+		string model_params, StateFreqType freq, string freq_params,
+		PhyloTree *tree, bool count_rates)
+	: ModelCodon(tree, count_rates)
+{
+	this->init(model_name, model_params, freq, freq_params);
+}
+
+
+/** Destructor; this class's own body performs no cleanup. */
+ModelCodonParametric::~ModelCodonParametric()
+{
+}
+
+/**
+ * Set up the parametric codon model named by model_name.
+ * "JCC", "MG" and "GY" are matched case-insensitively; any other name is
+ * passed to readParameters(). Non-empty freq_params / model_params are
+ * then applied with readStateFreq() / readRates() before
+ * ModelCodon::init() is called.
+ * @param model_name model name, e.g., JCC, MG, GY
+ * @param model_params rate parameter string; ignored when empty
+ * @param freq state frequency type
+ * @param freq_params state frequency string; ignored when empty
+ */
+void ModelCodonParametric::init(const char *model_name, string model_params, StateFreqType freq, string freq_params)
+{
+	StateFreqType def_freq = FREQ_UNKNOWN;
+	name = full_name = model_name;
+	string name_upper = model_name;
+	for (string::iterator it = name_upper.begin(); it != name_upper.end(); it++)
+		(*it) = toupper(*it);
+	if (name_upper == "JCC") {
+		name = "JCC";
+		def_freq = FREQ_EQUAL;
+		full_name = "JC-like codon model";
+	} else if (name_upper == "MG") {
+		// store the canonical spelling: writeInfo() compares name
+		// against "MG" exactly, so "mg" would otherwise print nothing
+		name = "MG";
+		initMG94();
+	} else if (name_upper == "GY") {
+		// canonical spelling, for the same reason as "MG" above
+		name = "GY";
+		initGY94();
+	} else {
+		// user-specified model
+		readParameters(model_name);
+	}
+
+	if (freq_params != "") {
+		readStateFreq(freq_params);
+	}
+	if (model_params != "") {
+		readRates(model_params);
+	}
+
+	// use the model's default when the caller passed FREQ_UNKNOWN
+	// (or when that default is FREQ_EQUAL)
+	if (freq == FREQ_UNKNOWN ||  def_freq == FREQ_EQUAL) freq = def_freq;
+	ModelCodon::init(freq);
+}
+
+
+/**
+ * Muse-Gaut 1994 model with one parameter (omega).
+ * Every state pair (from < to) is put into rate group 0 (multiple
+ * substitution), 1 (synonymous) or 2 (non-synonymous).
+ */
+void ModelCodonParametric::initMG94() {
+	IntVector category;
+	for (int from = 0; from < num_states-1; from++) {
+		for (int to = from+1; to < num_states; to++) {
+			int cls;
+			if (isMultipleSubst(from, to))
+				cls = 0; // multiple substitution
+			else
+				cls = isSynonymous(from, to) ? 1 : 2; // synonymous : non-synonymous
+			category.push_back(cls);
+		}
+	}
+	setRateGroup(category);
+	// group 0 (multiple substitution) is fixed to 0, group 1 (synonymous) to 1
+	setRateGroupConstraint("x0=0,x1=1");
+}
+
+
+/**
+ * Yang-Nielsen 1998 model (also known as Goldman-Yang 1994) with two
+ * parameters, omega and kappa.
+ * Rate groups: 0 multiple substitution, 1 synonymous transversion,
+ * 2 synonymous transition, 3 non-synonymous transversion,
+ * 4 non-synonymous transition.
+ */
+void ModelCodonParametric::initGY94() {
+	IntVector category;
+	for (int from = 0; from < num_states-1; from++) {
+		for (int to = from+1; to < num_states; to++) {
+			int cls = 0; // multiple substitution
+			if (!isMultipleSubst(from, to)) {
+				const bool synonymous = isSynonymous(from, to);
+				const bool transversion = isTransversion(from, to);
+				if (synonymous)
+					cls = transversion ? 1 : 2;
+				else
+					cls = transversion ? 3 : 4;
+			}
+			category.push_back(cls);
+		}
+	}
+	setRateGroup(category);
+	// group 0 is fixed to 0, group 1 to 1, and group 4 is tied to the
+	// product of groups 2 and 3 (kappa*omega)
+	setRateGroupConstraint("x0=0,x1=1,x4=x2*x3");
+}
+
+/**
+ * Print the model parameters for the "MG" and "GY" models; nothing is
+ * printed for any other model name.
+ * @param out output stream
+ */
+void ModelCodonParametric::writeInfo(ostream &out) {
+	// one extra slot because the entries read below start at index 1
+	double *packed = new double[getNDim()+1];
+	setVariables(packed);
+	const bool is_mg = (name == "MG");
+	if (is_mg || name == "GY") {
+		if (!is_mg)
+			out << "Transition/transversion ratio (kappa): " << packed[1] << endl;
+		out << "Nonsynonymous/synonymous ratio (omega): " << packed[is_mg ? 1 : 2] << endl;
+	}
+	delete [] packed;
+}
+
diff --git a/model/modelcodonparametric.h b/model/modelcodonparametric.h
new file mode 100644
index 0000000..fe92ae7
--- /dev/null
+++ b/model/modelcodonparametric.h
@@ -0,0 +1,56 @@
+/*
+ * modelcodonparametric.h
+ *
+ *  Created on: May 29, 2013
+ *      Author: minh
+ */
+
+#ifndef MODELCODONPARAMETRIC_H_
+#define MODELCODONPARAMETRIC_H_
+
+#include "modelcodon.h"
+
+/**
+ * Parametric codon model (e.g., Goldman-Yang, Muse-Gaut).
+ * Inherits virtually from ModelCodon.
+ */
+class ModelCodonParametric: virtual public ModelCodon {
+public:
+	/**
+		constructor; forwards tree and count_rates to ModelCodon and then calls init()
+		@param model_name model name, e.g., JCC, MG, GY
+		@param model_params rate parameter string, forwarded to init()
+		@param freq state frequency type
+		@param freq_params state frequency string, forwarded to init()
+		@param tree associated phylogenetic tree
+		@param count_rates forwarded unchanged to the ModelCodon constructor
+	*/
+	ModelCodonParametric(const char *model_name, string model_params, StateFreqType freq, string freq_params,
+    		PhyloTree *tree, bool count_rates = true);
+
+    /**
+     * destructor
+     */
+	virtual ~ModelCodonParametric();
+
+	/**
+		initialization, called automatically by the constructor, no need to call it
+		@param model_name model name, e.g., JCC, MG, GY; other names are treated as user-specified
+		@param model_params rate parameter string; ignored when empty
+		@param freq state frequency type
+		@param freq_params state frequency string; ignored when empty
+	*/
+	virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+
+	/**
+		write information: kappa and/or omega for the MG and GY models
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+protected:
+
+	/** initialize Muse-Gaut 1994 model (one parameter: omega) */
+	void initMG94();
+
+	/** initialize Goldman-Yang 1994 model (simplified version with 2 parameters, omega and kappa) */
+	void initGY94();
+
+
+};
+
+#endif /* MODELCODONPARAMETRIC_H_ */
diff --git a/model/modelcodonsemiempirical.cpp b/model/modelcodonsemiempirical.cpp
new file mode 100644
index 0000000..be7c99e
--- /dev/null
+++ b/model/modelcodonsemiempirical.cpp
@@ -0,0 +1,31 @@
+/*
+ * modelcodonsemiempirical.cpp
+ *
+ *  Created on: May 29, 2013
+ *      Author: minh
+ */
+
+#include "modelcodonsemiempirical.h"
+
+/**
+ * Construct the shared virtual ModelCodon base and configure the model
+ * through init().
+ * NOTE(review): only ModelCodon appears in the initializer list, while the
+ * ModelCodonEmpirical and ModelCodonParametric headers declare no default
+ * constructor -- confirm that this translation unit is actually built.
+ */
+ModelCodonSemiEmpirical::ModelCodonSemiEmpirical(const char *model_name,
+		string model_params, StateFreqType freq, string freq_params,
+		PhyloTree *tree, bool count_rates)
+	: ModelCodon(tree, count_rates)
+{
+	this->init(model_name, model_params, freq, freq_params);
+}
+
+
+/** Destructor; this class's own body performs no cleanup. */
+ModelCodonSemiEmpirical::~ModelCodonSemiEmpirical()
+{
+}
+
+
+/**
+ * Initialize a semi-empirical model written as "<empirical>+<parametric>",
+ * e.g. "ECM+GY". Nothing is initialized unless the name starts with "ECM".
+ * @param model_name combined model name; must contain a '+'
+ * @param model_params rate parameter string, passed to the parametric part
+ * @param freq state frequency type, passed to the parametric part
+ * @param freq_params state frequency string, passed to the parametric part
+ */
+void ModelCodonSemiEmpirical::init(const char *model_name, string model_params, StateFreqType freq, string freq_params) {
+	// Split a private copy of the name up front: both base-class init()
+	// calls below overwrite the `name` member, so it cannot be re-read
+	// between them.
+	const string combined = model_name;
+	size_t pos = combined.find('+');
+	assert(pos != string::npos);
+	const string empirical_part = combined.substr(0, pos);
+	// skip the '+' itself, otherwise the parametric init() can never
+	// match its built-in names ("GY", "MG", ...)
+	const string parametric_part = combined.substr(pos + 1);
+
+	name = full_name = combined;
+	if (combined.substr(0,3) == "ECM") {
+		// the base init() functions take const char*, hence c_str()
+		ModelCodonEmpirical::init(empirical_part.c_str(), "", FREQ_USER_DEFINED, "");
+		ModelCodonParametric::init(parametric_part.c_str(), model_params, freq, freq_params);
+	}
+
+}
diff --git a/model/modelcodonsemiempirical.h b/model/modelcodonsemiempirical.h
new file mode 100644
index 0000000..8cfed8d
--- /dev/null
+++ b/model/modelcodonsemiempirical.h
@@ -0,0 +1,40 @@
+/*
+ * modelcodonsemiempirical.h
+ *
+ *  Created on: May 29, 2013
+ *      Author: minh
+ */
+
+#ifndef MODELCODONSEMIEMPIRICAL_H_
+#define MODELCODONSEMIEMPIRICAL_H_
+
+#include "modelcodonempirical.h"
+#include "modelcodonparametric.h"
+
+/**
+ * Semi-empirical codon model: derives from both ModelCodonEmpirical and
+ * ModelCodonParametric.
+ */
+class ModelCodonSemiEmpirical: public ModelCodonEmpirical, public ModelCodonParametric {
+public:
+	/**
+		constructor
+		@param model_name combined model name; init() expects it to contain a '+'
+		@param model_params rate parameter string, forwarded to init()
+		@param freq state frequency type
+		@param freq_params state frequency string, forwarded to init()
+		@param tree associated phylogenetic tree
+		@param count_rates forwarded unchanged to the ModelCodon constructor
+	*/
+	ModelCodonSemiEmpirical(const char *model_name, string model_params, StateFreqType freq, string freq_params,
+    		PhyloTree *tree, bool count_rates = true);
+
+
+	/**
+	 * destructor
+	 */
+	virtual ~ModelCodonSemiEmpirical();
+
+	/**
+		initialization, called automatically by the constructor, no need to call it
+		@param model_name combined model name split at the first '+'
+		@param model_params rate parameter string
+		@param freq state frequency type
+		@param freq_params state frequency string
+	*/
+	virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+
+};
+
+#endif /* MODELCODONSEMIEMPIRICAL_H_ */
diff --git a/model/modeldna.cpp b/model/modeldna.cpp
new file mode 100644
index 0000000..7053119
--- /dev/null
+++ b/model/modeldna.cpp
@@ -0,0 +1,409 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "modeldna.h"
+
+/**
+ * Construct the ModelGTR base only; init() is not called by this
+ * overload.
+ */
+ModelDNA::ModelDNA(PhyloTree *tree, bool count_rates)
+	: ModelGTR(tree, count_rates)
+{
+}
+
+/**
+ * Construct the ModelGTR base and immediately configure the model
+ * through init().
+ */
+ModelDNA::ModelDNA(const char *model_name, string model_params,
+		StateFreqType freq, string freq_params,
+		PhyloTree *tree, bool count_rates)
+	: ModelGTR(tree, count_rates)
+{
+	this->init(model_name, model_params, freq, freq_params);
+}
+
+/**
+ * Look up a DNA model by name (case-insensitive, aliases accepted).
+ * @param model_name DNA model name or alias
+ * @param full_name (OUT) full model name with citation, "" if unrecognized
+ * @param rate_type (OUT) 6-character rate constraint string, "" if unrecognized
+ * @param def_freq (OUT) default base frequency type, FREQ_UNKNOWN if unrecognized
+ * @return canonical model name, or "" if model_name is unrecognized
+ */
+string getDNAModelInfo(string model_name, string &full_name, string &rate_type, StateFreqType &def_freq) {
+	// One row per model. `aliases` holds the upper-case spellings that
+	// select the row; unused slots are zero-initialized and terminate the
+	// list (the longest list has 6 entries, so slot 7 is always null).
+	struct ModelEntry {
+		const char *aliases[7];
+		const char *name;
+		const char *rate_type;
+		StateFreqType def_freq;
+		const char *full_name;
+	};
+	static const ModelEntry known_models[] = {
+		{ {"JC", "JC69"}, "JC", "000000", FREQ_EQUAL, "JC (Juke and Cantor, 1969)" },
+		{ {"F81"}, "F81", "000000", FREQ_ESTIMATE, "F81 (Felsenstein, 1981)" },
+		{ {"K2P", "K80"}, "K2P", "010010", FREQ_EQUAL, "K2P (Kimura, 1980)" },
+		{ {"HKY", "HKY85"}, "HKY", "010010", FREQ_ESTIMATE, "HKY (Hasegawa, Kishino and Yano, 1985)" },
+		{ {"K3P", "K81", "TPM1"}, "K3P", "012210", FREQ_EQUAL, "K3P (Kimura, 1981)" },
+		{ {"K81UF", "K81U", "K3PU", "K3PUF", "TPM1UF", "TPM1U"}, "K3Pu", "012210", FREQ_ESTIMATE, "K3P unequal frequencies (Kimura, 1981)" },
+		{ {"TN", "TRN", "TN93"}, "TN", "010020", FREQ_ESTIMATE, "TN (Tamura and Nei, 1993)" },
+		{ {"TNEF", "TRNEF", "TNE", "TRNE"}, "TNe", "010020", FREQ_EQUAL, "TN equal frequencies (Tamura and Nei, 1993)" },
+		// TPM2/TPM3 are the equal-frequency models; the "u"/"uf" variants
+		// below are the unequal-frequency ones (same pairing as K3P/K3Pu)
+		{ {"TPM2"}, "TPM2", "121020", FREQ_EQUAL, "TPM2 ()" },
+		{ {"TPM2U", "TPM2UF"}, "TPM2u", "121020", FREQ_ESTIMATE, "TPM2 unequal frequencies ()" },
+		{ {"TPM3"}, "TPM3", "120120", FREQ_EQUAL, "TPM3 ()" },
+		{ {"TPM3U", "TPM3UF"}, "TPM3u", "120120", FREQ_ESTIMATE, "TPM3 unequal frequencies ()" },
+		{ {"TIM", "TIM1"}, "TIM", "012230", FREQ_ESTIMATE, "TIM ()" },
+		{ {"TIMEF", "TIME", "TIM1EF", "TIM1E"}, "TIMe", "012230", FREQ_EQUAL, "TIM equal frequencies" },
+		{ {"TIM2"}, "TIM2", "121030", FREQ_ESTIMATE, "TIM2 ()" },
+		{ {"TIM2EF", "TIM2E"}, "TIM2e", "121030", FREQ_EQUAL, "TIM2 equal frequencies" },
+		{ {"TIM3"}, "TIM3", "120130", FREQ_ESTIMATE, "TIM3 ()" },
+		{ {"TIM3EF", "TIM3E"}, "TIM3e", "120130", FREQ_EQUAL, "TIM3 equal frequencies" },
+		{ {"TVM"}, "TVM", "412310", FREQ_ESTIMATE, "TVM" },
+		{ {"TVMEF", "TVME"}, "TVMe", "412310", FREQ_EQUAL, "TVM equal frequencies" },
+		{ {"SYM"}, "SYM", "123450", FREQ_EQUAL, "SYM (Zharkihk, 1994)" },
+		{ {"GTR", "REV"}, "GTR", "123450", FREQ_ESTIMATE, "GTR (Tavare, 1986)" }
+	};
+	const size_t num_models = sizeof(known_models) / sizeof(known_models[0]);
+
+	string name_upper = model_name;
+	for (string::iterator it = name_upper.begin(); it != name_upper.end(); it++)
+		(*it) = toupper(*it);
+
+	for (size_t m = 0; m < num_models; m++) {
+		const ModelEntry &entry = known_models[m];
+		for (const char *const *alias = entry.aliases; *alias; alias++) {
+			if (name_upper == *alias) {
+				full_name = entry.full_name;
+				rate_type = entry.rate_type;
+				def_freq = entry.def_freq;
+				return entry.name;
+			}
+		}
+	}
+
+	// unrecognized model name
+	full_name = "";
+	rate_type = "";
+	def_freq = FREQ_UNKNOWN;
+	return "";
+}
+
+
+/**
+ * Configure the DNA model from its name.
+ * A name known to getDNAModelInfo() selects that model's rate pattern.
+ * Otherwise model_name itself is tried as a rate pattern via
+ * setRateType(), and if that is rejected it is passed to readParameters().
+ * Non-empty freq_params / model_params are then applied with
+ * readStateFreq() / readRates() before ModelGTR::init() is called.
+ */
+void ModelDNA::init(const char *model_name, string model_params, StateFreqType freq, string freq_params)
+{
+	assert(num_states == 4); // make sure that you create model for DNA
+	string rate_pattern;
+	StateFreqType model_default_freq = FREQ_UNKNOWN;
+	name = getDNAModelInfo((string)model_name, full_name, rate_pattern, model_default_freq);
+
+	if (!name.empty()) {
+		setRateType(rate_pattern.c_str());
+	} else if (setRateType(model_name)) {
+		// user-specified rate pattern
+		model_default_freq = FREQ_ESTIMATE;
+	} else {
+		// user-specified model
+		readParameters(model_name);
+	}
+
+	if (!freq_params.empty())
+		readStateFreq(freq_params);
+	if (!model_params.empty())
+		readRates(model_params);
+
+	// use the model's default when the caller passed FREQ_UNKNOWN
+	// (or when that default is FREQ_EQUAL)
+	if (freq == FREQ_UNKNOWN || model_default_freq == FREQ_EQUAL)
+		freq = model_default_freq;
+	ModelGTR::init(freq);
+}
+
+
+/**
+ * Read the rate parameters from a comma-separated string.
+ * Each field is either '?' (rate left free, counted in num_params) or a
+ * number (rate fixed to that value). Field k (1-based) is written to every
+ * rates[] entry whose param_spec id equals k.
+ * Problems are reported through outError().
+ * @param str comma-separated rate string
+ */
+void ModelDNA::readRates(string str) throw(const char*) {
+	// number of fields expected = largest parameter id in param_spec
+	// NOTE(review): dereferences end() if param_spec is empty -- confirm
+	// callers always run setRateType() first
+	int nrates = *max_element(param_spec.begin(), param_spec.end());
+	int end_pos = 0; // current parse position in str
+	int i, j;
+	// every rate starts at 1.0
+	for (j = 0; j < param_spec.length(); j++)
+		rates[j] = 1.0;
+	num_params = 0;
+	for (i = 0; i < nrates && end_pos < str.length(); i++) {
+		int new_end_pos;
+		double rate = 0;
+		if (str[end_pos] == '?') {
+			// '?': parameter is not fixed; i + 0.4 is its starting value
+			param_fixed[i+1] = false;
+			end_pos++;
+			rate = i + 0.4;
+			num_params++;
+		} else {
+			// a number: parameter is fixed to the parsed value
+			param_fixed[i+1] = true;
+			try {
+				rate = convert_double(str.substr(end_pos).c_str(), new_end_pos);
+			} catch (string str) {
+				// this `str` is the caught message; it shadows the parameter
+				outError(str);
+			}
+			end_pos += new_end_pos;
+		}
+		if (rate < 0.0)
+			outError("Negative rates found");
+		// last expected field must end the string
+		if (i == nrates-1 && end_pos < str.length())
+			outError("String too long ", str);
+		// earlier fields must be followed by more input
+		if (i < nrates-1 && end_pos >= str.length())
+			outError("Unexpected end of string ", str);
+		if (end_pos < str.length() && str[end_pos] != ',')
+			outError("Comma to separate rates not found in ", str);
+		end_pos++; // step over the comma
+		// assign the value to all rate entries sharing parameter id i+1
+		for (j = 0; j < param_spec.length(); j++)
+			if (param_spec[j] == i+1)
+				rates[j] = rate;
+	}
+}
+
+
+/**
+ * @return the model name, followed by "{r1,r2,...}" when the model has
+ * parameters; one rate is listed the first time each new parameter id
+ * appears in param_spec
+ */
+string ModelDNA::getNameParams() {
+	if (num_params == 0)
+		return name;
+	ostringstream out;
+	out << name << '{';
+	const int total = getNumRateEntries();
+	int printed = 0;
+	for (int pos = 0; pos < total; pos++) {
+		if (param_spec[pos] <= printed)
+			continue; // this parameter id was already listed
+		if (printed > 0)
+			out << ',';
+		out << rates[pos];
+		printed++;
+	}
+	out << '}';
+	return out.str();
+}
+
+/**
+ * Set the substitution rate parameters from a specification string.
+ * Entries with the same digit share one rate parameter. The digit in the
+ * last position gets parameter id 0, which is marked as fixed; the other
+ * digits get ids 1..num_params in order of first appearance. Existing
+ * rates[] values are averaged per parameter and rescaled by the average
+ * of group 0.
+ * @param rate_str string of digits, one per rate entry
+ * @return TRUE if successful; FALSE if the string is empty, has the wrong
+ *         length, or contains a non-digit
+ */
+bool ModelDNA::setRateType(const char *rate_str) {
+	int num_ch = strlen(rate_str);
+	int i;
+
+	// an empty string would make rate_str[num_ch-1] below read out of bounds
+	if (num_ch != getNumRateEntries() || num_ch == 0)
+		return false;
+	// only accept string of digits; isdigit() requires a value
+	// representable as unsigned char, so convert before calling it
+	for (i = 0; i < num_ch; i++)
+		if (!isdigit((unsigned char)rate_str[i])) return false;
+
+	map<char,char> param_k;
+	num_params = 0;
+	param_spec = "";
+	// last entry get ID of 0 for easy management
+	param_k[rate_str[num_ch-1]] = 0;
+	for (i = 0; i < num_ch; i++) {
+		if (param_k.find(rate_str[i]) == param_k.end()) {
+			num_params++;
+			param_k[rate_str[i]] = (char)num_params;
+			param_spec.push_back(num_params);
+		} else {
+			param_spec.push_back(param_k[rate_str[i]]);
+		}
+	}
+
+	assert((int)param_spec.length() == num_ch);
+	// per-parameter sum (then average) of the current rates, and counts
+	vector<double> avg_rates(num_params+1, 0.0);
+	vector<int> num_rates(num_params+1, 0);
+	for (i = 0; i < (int)param_spec.size(); i++) {
+		avg_rates[(int)param_spec[i]] += rates[i];
+		num_rates[(int)param_spec[i]]++;
+	}
+	for (i = 0; i <= num_params; i++)
+		avg_rates[i] /= num_rates[i];
+	// rescale so that the entries of group 0 become 1
+	for (i = 0; i < (int)param_spec.size(); i++) {
+		rates[i] = avg_rates[(int)param_spec[i]] / avg_rates[0];
+	}
+	if (verbose_mode >= VB_DEBUG) {
+		cout << "Initialized rates: ";
+		for (i = 0; i < (int)param_spec.size(); i++)
+			cout << rates[i] << " ";
+		cout << endl;
+	}
+	param_fixed.resize(num_params+1, false);
+	param_fixed[0] = true; // fix the last entry
+	return true;
+}
+
+
+/**
+ * @return num_params, plus num_states-1 when the state frequencies are
+ * estimated (FREQ_ESTIMATE)
+ */
+int ModelDNA::getNDim() {
+	assert(freq_type != FREQ_UNKNOWN);
+	const int freq_dims = (freq_type == FREQ_ESTIMATE) ? num_states-1 : 0;
+	return num_params + freq_dims;
+}
+
+/**
+ * Write tab-prefixed parameters: the state frequencies when they are
+ * estimated, then rates[1] for a one-parameter model or all rate entries
+ * but the last otherwise. No rates are written when num_params is 0.
+ * @param out output stream
+ */
+void ModelDNA::writeParameters(ostream &out) {
+	if (freq_type == FREQ_ESTIMATE) {
+		for (int s = 0; s < num_states; s++)
+			out << "\t" << state_freq[s];
+	}
+	if (num_params == 0)
+		return;
+	if (num_params > 1) {
+		const int last = getNumRateEntries() - 1;
+		for (int r = 0; r < last; r++)
+			out << "\t" << rates[r];
+		return;
+	}
+	out << "\t" << rates[1];
+}
+
+
+/**
+ * Copy optimizer variables back into the model.
+ * Rates whose parameter is not fixed are read from variables[param id].
+ * With FREQ_ESTIMATE, the num_states-1 entries after the rate entries are
+ * read as frequency ratios relative to highest_freq_state and normalized.
+ * @param variables vector of variables, read starting at index 1
+ */
+void ModelDNA::getVariables(double *variables) {
+	if (num_params > 0) {
+		if (verbose_mode >= VB_MAX) {
+			for (int v = 1; v <= num_params; v++)
+				cout << "  estimated variables[" << v << "] = " << variables[v] << endl;
+		}
+		const int spec_len = param_spec.length();
+		for (int r = 0; r < spec_len; r++) {
+			if (param_fixed[param_spec[r]])
+				continue;
+			rates[r] = variables[(int)param_spec[r]];
+		}
+	}
+	if (freq_type != FREQ_ESTIMATE)
+		return;
+
+	// BUG FIX 2015.08.28
+	// frequency entries follow the rate entries: drop the num_states-1
+	// frequency slots from getNDim() to find where they start
+	const int freq_offset = getNDim() - (num_states-1);
+	double total = 1.0; // the reference state contributes ratio 1
+	for (int s = 1; s < num_states; s++)
+		total += variables[freq_offset+s];
+	int slot = 1;
+	for (int s = 0; s < num_states; s++) {
+		if (s == highest_freq_state)
+			continue;
+		state_freq[s] = variables[freq_offset+slot] / total;
+		slot++;
+	}
+	state_freq[highest_freq_state] = 1.0/total;
+}
+
+/**
+ * Pack the model parameters into the optimizer vector.
+ * Rates whose parameter is not fixed are written to variables[param id].
+ * With FREQ_ESTIMATE, each state frequency except highest_freq_state is
+ * written after the rate entries as a ratio to that state's frequency.
+ * @param variables (OUT) vector of variables, written starting at index 1
+ */
+void ModelDNA::setVariables(double *variables) {
+	if (num_params > 0) {
+		const int spec_len = param_spec.length();
+		for (int r = 0; r < spec_len; r++) {
+			if (param_fixed[param_spec[r]])
+				continue;
+			variables[(int)param_spec[r]] = rates[r];
+		}
+	}
+	if (freq_type != FREQ_ESTIMATE)
+		return;
+
+	// BUG FIX 2015.08.28
+	// frequency entries follow the rate entries: drop the num_states-1
+	// frequency slots from getNDim() to find where they start
+	const int freq_offset = getNDim() - (num_states-1);
+	int slot = 1;
+	for (int s = 0; s < num_states; s++) {
+		if (s == highest_freq_state)
+			continue;
+		variables[freq_offset+slot] = state_freq[s] / state_freq[highest_freq_state];
+		slot++;
+	}
+}
diff --git a/model/modeldna.h b/model/modeldna.h
new file mode 100644
index 0000000..3cf3664
--- /dev/null
+++ b/model/modeldna.h
@@ -0,0 +1,121 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MODELDNA_H
+#define MODELDNA_H
+
+#include "modelgtr.h"
+#include <string>
+
+/**
+ * return name and other information of an input model_name
+ * @param model_name DNA model name
+ * @param full_name (OUT) full model name with citation
+ * @param rate_type (OUT) a 6-digit string showing 6 rate constraints
+ * @param def_freq (OUT) base frequency type, either FREQ_UNKNOWN, FREQ_EQUAL, or FREQ_ESTIMATE
+ * @return unique model name or "" (empty string) if model_name is unrecognized
+ */
+string getDNAModelInfo(string model_name, string &full_name, string &rate_type, StateFreqType &def_freq);
+
+/**
+All DNA models are managed here
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class ModelDNA : public ModelGTR
+{
+public:
+	/**
+		constructor; does not call init()
+		@param tree associated tree for the model
+		@param count_rates forwarded unchanged to the ModelGTR constructor
+	*/
+    ModelDNA(PhyloTree *tree, bool count_rates = true);
+
+	/**
+		constructor; calls init() with the given name and parameters
+		@param model_name model name, e.g., JC, HKY.
+		@param model_params rate parameter string, forwarded to init()
+		@param freq state frequency type
+		@param freq_params state frequency string, forwarded to init()
+		@param tree associated phylogenetic tree
+		@param count_rates forwarded unchanged to the ModelGTR constructor
+	*/
+    ModelDNA(const char *model_name, string model_params, StateFreqType freq, string freq_params, PhyloTree *tree, bool count_rates = true);
+
+	/**
+		initialization, called automatically by the constructor, no need to call it
+		@param model_name model name, e.g., JC, HKY.
+		@param model_params rate parameter string; ignored when empty
+		@param freq state frequency type
+		@param freq_params state frequency string; ignored when empty
+	*/
+	virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+
+	/**
+	 * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
+	 */
+	virtual string getNameParams();
+
+	/**
+		Read the rate parameters from a comma-separated string
+		It will throw error messages if failed
+		@param str comma-separated rate string; a '?' field marks a rate that is not fixed
+	*/
+	virtual void readRates(string str) throw(const char*);
+
+	/**
+		set the substitution rate parameters by a specification
+		@param rate_spec a string of six digits describing how rates are related
+		@return TRUE if successful, FALSE otherwise
+	*/
+	bool setRateType(const char *rate_spec);
+
+	/**
+		return the number of dimensions
+	*/
+	virtual int getNDim();
+
+	/**
+		write parameters, used with modeltest
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out);
+
+protected:
+
+	/**
+		this function serves the multi-dimensional optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function serves the multi-dimensional optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+	/**
+		rate parameter specification, a string of 6 characters
+	*/
+	string param_spec;
+	
+	/** vector of boolean, TRUE if corresponding parameter is fixed and FALSE otherwise */
+	vector<bool> param_fixed;
+
+};
+
+#endif
diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp
new file mode 100644
index 0000000..1220d26
--- /dev/null
+++ b/model/modelfactory.cpp
@@ -0,0 +1,1025 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "rateinvar.h"
+#include "modelfactory.h"
+#include "rategamma.h"
+#include "rategammainvar.h"
+#include "modelgtr.h"
+#include "modelnonrev.h"
+#include "modeldna.h"
+#include "modelprotein.h"
+#include "modelbin.h"
+#include "modelcodon.h"
+#include "modelmorphology.h"
+#include "modelset.h"
+#include "modelmixture.h"
+#include "ratemeyerhaeseler.h"
+#include "ratemeyerdiscrete.h"
+#include "ratekategory.h"
+#include "ratefree.h"
+#include "ratefreeinvar.h"
+#include "ngs.h"
+#include <string>
+#include "timeutil.h"
+#include "myreader.h"
+
+/**
+	Build the block of model definitions: the built-in mixture-model definitions are
+	always loaded first; definitions from params.model_def_file are added on top of
+	them when that file name is set.
+	@param params program parameters (only model_def_file is read here)
+	@return newly allocated models block
+*/
+ModelsBlock *readModelsDefinition(Params &params) {
+	ModelsBlock *blocks = new ModelsBlock;
+
+	{
+		// loading internal model definitions
+		istringstream builtin_stream(builtin_mixmodels_definition);
+		NxsReader builtin_reader;
+		builtin_reader.Add(blocks);
+		MyToken builtin_token(builtin_stream);
+		builtin_reader.Execute(builtin_token);
+	}
+
+	// nothing more to do without a user-supplied definition file
+	if (!params.model_def_file)
+		return blocks;
+
+	cout << "Reading model definition file " << params.model_def_file << " ... ";
+	MyReader user_reader(params.model_def_file);
+	user_reader.Add(blocks);
+	MyToken user_token(user_reader.inf);
+	user_reader.Execute(user_token);
+
+	// entries flagged NM_FREQ are frequency vectors, everything else counts as a model
+	int model_count = 0;
+	int freq_count = 0;
+	for (ModelsBlock::iterator entry = blocks->begin(); entry != blocks->end(); entry++) {
+		if ((*entry).flag & NM_FREQ)
+			freq_count++;
+		else
+			model_count++;
+	}
+	cout << model_count << " models and " << freq_count << " frequency vectors loaded" << endl;
+	return blocks;
+}
+
+/**
+	Default constructor: creates a factory with no model and no site-rate object
+	attached and with every option switched off.
+*/
+ModelFactory::ModelFactory() {
+	// nothing attached yet
+	site_rate = NULL;
+	model = NULL;
+
+	// all options are off by default
+	fused_mix_rate = false;
+	joint_optimize = false;
+	is_storing = false;
+	store_trans_matrix = false;
+
+	unobserved_ptns = "";
+}
+
+/**
+	Find the '}' that closes the bracket opened at start_pos, skipping over
+	nested '{...}' pairs.
+	@param str string to scan
+	@param start_pos position of the opening bracket; scanning starts right after it
+	@return position of the matching '}', or string::npos if there is none
+*/
+size_t findCloseBracket(string &str, size_t start_pos) {
+	int depth = 0; // nested '{' seen after start_pos that are still open
+	size_t idx = start_pos + 1;
+	while (idx < str.length()) {
+		const char ch = str[idx];
+		if (ch == '{') {
+			depth++;
+		} else if (ch == '}') {
+			if (depth == 0)
+				return idx;
+			depth--;
+		}
+		idx++;
+	}
+	return string::npos;
+}
+
+/**
+	Build the substitution model and the site-rate model described by params.model_name.
+	The model string is processed in stages:
+	  1. aliases found in models_block are expanded,
+	  2. the string is split into the model part and the rate/frequency suffix,
+	  3. the state-frequency options (+FMIX{...}, +F...) are parsed and removed from the suffix,
+	  4. the substitution model is created (mixture, ordinary, or site-specific),
+	  5. +ASC is handled,
+	  6. the rate heterogeneity model (+I, +G/*G, +R/*R, +M, +D, +NGS, +K) is created.
+	Errors are reported via outError(); a const char* thrown inside is caught and reported the same way.
+	@param params program parameters
+	@param tree associated phylogenetic tree
+	@param models_block model definitions used to resolve aliases and to build models
+*/
+ModelFactory::ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_block) {
+	store_trans_matrix = params.store_trans_matrix;
+	is_storing = false;
+	joint_optimize = params.optimize_model_rate_joint;
+	fused_mix_rate = false;
+
+	string model_str = params.model_name;
+	string rate_str;
+
+	try {
+
+
+	// no model given: pick a default according to the sequence type
+	if (model_str == "") {
+		if (tree->aln->seq_type == SEQ_DNA) model_str = "HKY";
+		else if (tree->aln->seq_type == SEQ_PROTEIN) model_str = "WAG";
+		else if (tree->aln->seq_type == SEQ_BINARY) model_str = "GTR2";
+		else if (tree->aln->seq_type == SEQ_CODON) model_str = "GY";
+		else if (tree->aln->seq_type == SEQ_MORPH) model_str = "MK";
+		else model_str = "JC";
+		outWarning("Default model may be under-fitting. Use option '-m TEST' to select best-fit model.");
+	}
+
+	/********* preprocessing model string ****************/
+	NxsModel *nxsmodel  = NULL;
+
+    // walk over the '+' / '*' separated components and replace every component
+    // that is a known alias by its description
+    string new_model_str = "";
+    size_t mix_pos;
+    for (mix_pos = 0; mix_pos < model_str.length(); mix_pos++) {
+        size_t next_mix_pos = model_str.find_first_of("+*", mix_pos);
+        string sub_model_str = model_str.substr(mix_pos, next_mix_pos-mix_pos);
+        nxsmodel = models_block->findMixModel(sub_model_str);
+        if (nxsmodel) sub_model_str = nxsmodel->description;
+        new_model_str += sub_model_str;
+        if (next_mix_pos != string::npos)
+            new_model_str += model_str[next_mix_pos];
+        else 
+            break;
+        // the loop increment then steps over the separator itself
+        mix_pos = next_mix_pos;
+    }
+    if (new_model_str != model_str)
+        cout << "Model " << model_str << " is alias for " << new_model_str << endl;
+    model_str = new_model_str;
+    
+//	nxsmodel = models_block->findModel(model_str);
+//	if (nxsmodel && nxsmodel->description.find_first_of("+*") != string::npos) {
+//		cout << "Model " << model_str << " is alias for " << nxsmodel->description << endl;
+//		model_str = nxsmodel->description;
+//	}
+
+	// decompose model string into model_str and rate_str string
+	size_t spec_pos = model_str.find_first_of("{+*");
+	if (spec_pos != string::npos) {
+		if (model_str[spec_pos] == '{') {
+			// scan for the corresponding '}'
+			size_t pos = findCloseBracket(model_str, spec_pos);
+			if (pos == string::npos)
+				outError("Model name has wrong bracket notation '{...}'");
+			rate_str = model_str.substr(pos+1);
+			model_str = model_str.substr(0, pos+1);
+		} else {
+			rate_str = model_str.substr(spec_pos);
+			model_str = model_str.substr(0, spec_pos);
+		}
+	}
+
+//	nxsmodel = models_block->findModel(model_str);
+//	if (nxsmodel && nxsmodel->description.find("MIX") != string::npos) {
+//		cout << "Model " << model_str << " is alias for " << nxsmodel->description << endl;
+//		model_str = nxsmodel->description;
+//	}
+
+	/******************** initialize state frequency ****************************/
+
+	StateFreqType freq_type = params.freq_type;
+
+	if (freq_type == FREQ_UNKNOWN) {
+		switch (tree->aln->seq_type) {
+		case SEQ_BINARY: freq_type = FREQ_ESTIMATE; break; // default for binary: optimized frequencies
+		case SEQ_PROTEIN: freq_type = FREQ_USER_DEFINED; break; // default for protein: frequencies of the empirical AA matrix
+		case SEQ_MORPH: freq_type = FREQ_EQUAL; break;
+		case SEQ_CODON: freq_type = FREQ_UNKNOWN; break;
+		default: freq_type = FREQ_EMPIRICAL; break; // default for DNA and others: counted frequencies from alignment
+		}
+	}
+
+    // first handle mixture frequency
+    string::size_type posfreq = rate_str.find("+FMIX");
+	string freq_params;
+    size_t close_bracket;
+
+    if (posfreq != string::npos) {
+		string freq_str;
+		size_t last_pos = rate_str.find_first_of("+*", posfreq+1);
+        
+		// cut the "+FMIX..." component out of rate_str
+		if (last_pos == string::npos) {
+			freq_str = rate_str.substr(posfreq);
+			rate_str = rate_str.substr(0, posfreq);
+		} else {
+			freq_str = rate_str.substr(posfreq, last_pos-posfreq);
+			rate_str = rate_str.substr(0, posfreq) + rate_str.substr(last_pos);
+		}
+        
+        // check the length first: a bare "+FMIX" has no character at index 5
+        if (freq_str.length() <= 5 || freq_str[5] != OPEN_BRACKET)
+            outError("Mixture-frequency must start with +FMIX{");
+        close_bracket = freq_str.find(CLOSE_BRACKET);
+        if (close_bracket == string::npos)
+            outError("Close bracket not found in ", freq_str);
+        if (close_bracket != freq_str.length()-1)
+            outError("Wrong close bracket position ", freq_str);
+        freq_type = FREQ_MIXTURE;
+        freq_params = freq_str.substr(6, close_bracket-6);
+    }
+
+    // then normal frequency
+	posfreq = rate_str.find("+F");
+    bool optimize_mixmodel_weight = params.optimize_mixmodel_weight;
+
+	if (posfreq != string::npos) {
+		string freq_str;
+		size_t last_pos = rate_str.find_first_of("+*", posfreq+1);
+		// cut the "+F..." component out of rate_str
+		if (last_pos == string::npos) {
+			freq_str = rate_str.substr(posfreq);
+			rate_str = rate_str.substr(0, posfreq);
+		} else {
+			freq_str = rate_str.substr(posfreq, last_pos-posfreq);
+			rate_str = rate_str.substr(0, posfreq) + rate_str.substr(last_pos);
+		}
+
+        if (freq_str.length() > 2 && freq_str[2] == OPEN_BRACKET) {
+            // +F{...}: user-defined frequencies given inside the brackets
+            if (freq_type == FREQ_MIXTURE)
+                outError("Mixture frequency with user-defined frequency is not allowed");
+			close_bracket = freq_str.find(CLOSE_BRACKET);
+			if (close_bracket == string::npos)
+				outError("Close bracket not found in ", freq_str);
+			if (close_bracket != freq_str.length()-1)
+				outError("Wrong close bracket position ", freq_str);
+			freq_type = FREQ_USER_DEFINED;
+			freq_params = freq_str.substr(3, close_bracket-3);
+		} else if (freq_str == "+FC" || freq_str == "+Fc" || freq_str == "+F") {
+            // combined with +FMIX, the empirical frequencies become one more mixture component
+            if (freq_type == FREQ_MIXTURE) {
+                freq_params = "empirical," + freq_params;
+                optimize_mixmodel_weight = true;
+            } else
+                freq_type = FREQ_EMPIRICAL;
+		} else if (freq_str == "+FU" || freq_str == "+Fu") {
+            if (freq_type == FREQ_MIXTURE)
+                outError("Mixture frequency with user-defined frequency is not allowed");
+            else
+                freq_type = FREQ_USER_DEFINED;
+		} else if (freq_str == "+FQ" || freq_str == "+Fq") {
+            if (freq_type == FREQ_MIXTURE)
+                outError("Mixture frequency with equal frequency is not allowed");
+            else
+                freq_type = FREQ_EQUAL;
+		} else if (freq_str == "+FO" || freq_str == "+Fo") {
+            if (freq_type == FREQ_MIXTURE)
+                outError("Mixture frequency with optimized frequency is not allowed");
+            else
+                freq_type = FREQ_ESTIMATE;
+		} else if (freq_str == "+F1x4" || freq_str == "+F1X4") {
+            if (freq_type == FREQ_MIXTURE)
+                outError("Mixture frequency with " + freq_str + " is not allowed");
+            else
+                freq_type = FREQ_CODON_1x4;
+		} else if (freq_str == "+F3x4" || freq_str == "+F3X4") {
+            if (freq_type == FREQ_MIXTURE)
+                outError("Mixture frequency with " + freq_str + " is not allowed");
+            else
+                freq_type = FREQ_CODON_3x4;
+		} else if (freq_str == "+F3x4C" || freq_str == "+F3x4c" || freq_str == "+F3X4C" || freq_str == "+F3X4c") {
+            if (freq_type == FREQ_MIXTURE)
+                outError("Mixture frequency with " + freq_str + " is not allowed");
+            else
+                freq_type = FREQ_CODON_3x4C;
+		} else outError("Unknown state frequency type ",freq_str);
+//		model_str = model_str.substr(0, posfreq);
+	}
+
+	/******************** initialize model ****************************/
+
+	if (!params.site_freq_file) {
+		if (model_str.substr(0, 3) == "MIX" || freq_type == FREQ_MIXTURE) {
+			string model_list;
+			if (model_str.substr(0, 3) == "MIX") {
+				// check the length first: a bare "MIX" has no character at index 3
+				if (model_str.length() <= 3 || model_str[3] != OPEN_BRACKET)
+					outError("Mixture model name must start with 'MIX{'");
+				if (model_str.rfind(CLOSE_BRACKET) != model_str.length()-1)
+					outError("Close bracket not found at the end of ", model_str);
+				// keep only the text between "MIX{" and the final '}'
+				model_list = model_str.substr(4, model_str.length()-5);
+				model_str = model_str.substr(0, 3);
+			}
+			model = new ModelMixture(params.model_name, model_str, model_list, models_block, freq_type, freq_params, tree, optimize_mixmodel_weight);
+		} else {
+//			string model_desc;
+//			NxsModel *nxsmodel = models_block->findModel(model_str);
+//			if (nxsmodel) model_desc = nxsmodel->description;
+			model = createModel(model_str, models_block, freq_type, freq_params, tree);
+		}
+
+//		fused_mix_rate &= model->isMixture() && site_rate->getNRate() > 1;
+	} else {
+		// site-specific model
+		if (model_str == "JC" || model_str == "POISSON")
+			outError("JC is not suitable for site-specific model");
+		model = new ModelSet(model_str.c_str(), tree);
+		ModelSet *models = (ModelSet*)model; // assign pointer for convenience
+		models->init(params.freq_type);
+		IntVector site_model;
+		vector<double*> freq_vec;
+		readSiteFreq(tree->aln, params.site_freq_file, site_model, freq_vec);
+		tree->aln->regroupSitePattern(freq_vec.size(), site_model);
+		//tree->aln->ungroupSitePattern();
+		tree->setAlignment(tree->aln);
+		int i;
+		// map every pattern to the model of the sites it contains
+		models->pattern_model_map.resize(tree->aln->getNPattern(), -1);
+		for (i = 0; i < tree->aln->getNSite(); i++) {
+			models->pattern_model_map[tree->aln->getPatternID(i)] = site_model[i];
+			//cout << "site " << i << " ptn " << tree->aln->getPatternID(i) << " -> model " << site_model[i] << endl;
+		}
+		double *state_freq = new double[model->num_states];
+		double *rates = new double[model->getNumRateEntries()];
+		for (i = 0; i < freq_vec.size(); i++) {
+			ModelGTR *modeli;
+			if (i == 0) {
+				// the first model is created with the requested frequency type; its
+				// frequencies and rates are remembered and copied into all later models
+				modeli = (ModelGTR*)createModel(model_str, models_block, params.freq_type, "", tree, true);
+				modeli->getStateFrequency(state_freq);
+				modeli->getRateMatrix(rates);
+			} else {
+				modeli = (ModelGTR*)createModel(model_str, models_block, FREQ_EQUAL, "", tree, false);
+				modeli->setStateFrequency(state_freq);
+				modeli->setRateMatrix(rates);
+			}
+			// a NULL entry marks sites without frequencies of their own
+			if (freq_vec[i])
+				modeli->setStateFrequency (freq_vec[i]);
+
+			modeli->init(FREQ_USER_DEFINED);
+			models->push_back(modeli);
+		}
+		delete [] rates;
+		delete [] state_freq;
+		cout << "Alignment is divided into " << models->size() << " partitions with " << tree->aln->getNPattern() << " patterns" << endl;
+		for (vector<double*>::reverse_iterator it = freq_vec.rbegin(); it != freq_vec.rend(); it++)
+			if (*it) delete [] (*it);
+	}
+
+//	if (model->isMixture())
+//		cout << "Mixture model with " << model->getNMixtures() << " components!" << endl;
+
+	/******************** initialize ascertainment bias correction model ****************************/
+
+	string::size_type posasc;
+
+	if ((posasc = rate_str.find("+ASC")) != string::npos) {
+		// ascertainment bias correction
+		unobserved_ptns = tree->aln->getUnobservedConstPatterns();
+		// rebuild the seq_states to contain states of unobserved constant patterns
+		tree->aln->buildSeqStates(true);
+//		if (unobserved_ptns.size() <= 0)
+//			outError("Invalid use of +ASC because all constant patterns are observed in the alignment");
+		if (unobserved_ptns.size() < tree->aln->getNumNonstopCodons())
+			outError("Invalid use of +ASC because constant patterns are observed in the alignment");
+		cout << "Ascertainment bias correction: " << unobserved_ptns.size() << " unobservable constant patterns"<< endl;
+		// remove "+ASC" (4 characters) from rate_str
+		rate_str = rate_str.substr(0, posasc) + rate_str.substr(posasc+4);
+	}
+
+
+	/******************** initialize site rate heterogeneity ****************************/
+
+	string::size_type posI = rate_str.find("+I");
+	string::size_type posG = rate_str.find("+G");
+	string::size_type posG2 = rate_str.find("*G");
+    if (posG != string::npos && posG2 != string::npos) {
+        cout << "NOTE: both +G and *G were specified, continue with " 
+            << ((posG < posG2)? rate_str.substr(posG,2) : rate_str.substr(posG2,2)) << endl;
+    }
+    // "*G" wins when it is the only one given or appears first; it fuses rates with mixture classes
+    if (posG2 != string::npos && posG2 < posG) {
+        posG = posG2;
+        fused_mix_rate = true;
+    }
+//	if (posG == string::npos) {
+//		posG = rate_str.find("*G");
+//		if (posG != string::npos)
+//			fused_mix_rate = true;
+//	}
+	string::size_type posR = rate_str.find("+R"); // FreeRate model
+	string::size_type posR2 = rate_str.find("*R"); // FreeRate model
+    if (posR != string::npos && posR2 != string::npos) {
+        cout << "NOTE: both +R and *R were specified, continue with " 
+            << ((posR < posR2)? rate_str.substr(posR,2) : rate_str.substr(posR2,2)) << endl;
+    }
+    if (posR2 != string::npos && posR2 < posR) {
+        posR = posR2;
+        fused_mix_rate = true;
+    }
+    
+//	if (posR == string::npos) {
+//		posR = rate_str.find("*R");
+//		if (posR != string::npos)
+//			fused_mix_rate = true;
+//	}
+	if (posG != string::npos && posR != string::npos) {
+        if (posG == posG2 && posR != posR2) {
+            outWarning("Both Gamma and FreeRate models were specified, continue with Gamma model because *G has higher priority than +R");
+            posR = string::npos;
+        } else {
+            outWarning("Both Gamma and FreeRate models were specified, continue with FreeRate model");
+            posG = string::npos;
+        }
+    }
+	string::size_type posX;
+	/* create site-rate heterogeneity */
+	int num_rate_cats = params.num_rate_cats;
+	if (fused_mix_rate) num_rate_cats = model->getNMixtures();
+	double gamma_shape = params.gamma_shape;
+	double p_invar_sites = params.p_invar_sites;
+	string freerate_params = "";
+	if (posI != string::npos) {
+		// invariable site model
+		if (rate_str.length() > posI+2 && rate_str[posI+2] == OPEN_BRACKET) {
+			close_bracket = rate_str.find(CLOSE_BRACKET, posI);
+			if (close_bracket == string::npos)
+				outError("Close bracket not found in ", rate_str);
+			p_invar_sites = convert_double(rate_str.substr(posI+3, close_bracket-posI-3).c_str());
+			if (p_invar_sites < 0 || p_invar_sites >= 1)
+				outError("p_invar must be in [0,1)");
+		} else if (rate_str.length() > posI+2 && rate_str[posI+2] != '+')
+			outError("Wrong model name ", rate_str);
+	}
+	if (posG != string::npos) {
+		// Gamma rate model
+		int end_pos = 0; // number of characters taken by the category count, if any
+		if (rate_str.length() > posG+2 && isdigit(rate_str[posG+2])) {
+			num_rate_cats = convert_int(rate_str.substr(posG+2).c_str(), end_pos);
+			if (num_rate_cats < 1) outError("Wrong number of rate categories");
+		}
+		if (rate_str.length() > posG+2+end_pos && rate_str[posG+2+end_pos] == OPEN_BRACKET) {
+			close_bracket = rate_str.find(CLOSE_BRACKET, posG);
+			if (close_bracket == string::npos)
+				outError("Close bracket not found in ", rate_str);
+			gamma_shape = convert_double(rate_str.substr(posG+3+end_pos, close_bracket-posG-3-end_pos).c_str());
+//			if (gamma_shape < MIN_GAMMA_SHAPE || gamma_shape > MAX_GAMMA_SHAPE) {
+//				stringstream str;
+//				str << "Gamma shape parameter " << gamma_shape << "out of range ["
+//						<< MIN_GAMMA_SHAPE << ',' << MAX_GAMMA_SHAPE << "]" << endl;
+//				outError(str.str());
+//			}
+		} else if (rate_str.length() > posG+2+end_pos && rate_str[posG+2+end_pos] != '+')
+			outError("Wrong model name ", rate_str);
+	}
+	if (posR != string::npos) {
+		// FreeRate model
+		int end_pos = 0; // number of characters taken by the category count, if any
+		if (rate_str.length() > posR+2 && isdigit(rate_str[posR+2])) {
+			num_rate_cats = convert_int(rate_str.substr(posR+2).c_str(), end_pos);
+			if (num_rate_cats < 1) outError("Wrong number of rate categories");
+		}
+		if (rate_str.length() > posR+2+end_pos && rate_str[posR+2+end_pos] == OPEN_BRACKET) {
+			close_bracket = rate_str.find(CLOSE_BRACKET, posR);
+			if (close_bracket == string::npos)
+				outError("Close bracket not found in ", rate_str);
+			freerate_params = rate_str.substr(posR+3+end_pos, close_bracket-posR-3-end_pos).c_str();
+		} else if (rate_str.length() > posR+2+end_pos && rate_str[posR+2+end_pos] != '+')
+			outError("Wrong model name ", rate_str);
+	}
+	// anything left in rate_str that contains '+' or '*' must name a rate model
+	if (rate_str.find('+') != string::npos || rate_str.find('*') != string::npos) {
+		//string rate_str = model_str.substr(pos);
+		if (posI != string::npos && posG != string::npos) {
+			site_rate = new RateGammaInvar(num_rate_cats, gamma_shape, params.gamma_median,
+					p_invar_sites, params.optimize_model_rate_joint, tree);
+		} else if (posI != string::npos && posR != string::npos) {
+			site_rate = new RateFreeInvar(num_rate_cats, gamma_shape, freerate_params, p_invar_sites, !fused_mix_rate, params.optimize_alg, tree);
+		} else if (posI != string::npos) {
+			site_rate = new RateInvar(p_invar_sites, tree);
+		} else if (posG != string::npos) {
+			site_rate = new RateGamma(num_rate_cats, gamma_shape, params.gamma_median, tree);
+		} else if (posR != string::npos) {
+			site_rate = new RateFree(num_rate_cats, gamma_shape, freerate_params, !fused_mix_rate, params.optimize_alg, tree);
+		} else if ((posX = rate_str.find("+M")) != string::npos) {
+			tree->setLikelihoodKernel(LK_NORMAL);
+			params.rate_mh_type = true;
+			// an optional category count selects the discrete variant
+			if (rate_str.length() > posX+2 && isdigit(rate_str[posX+2])) {
+				num_rate_cats = convert_int(rate_str.substr(posX+2).c_str());
+				if (num_rate_cats < 0) outError("Wrong number of rate categories");
+			} else num_rate_cats = -1;
+			if (num_rate_cats >= 0)
+				site_rate = new RateMeyerDiscrete(num_rate_cats, params.mcat_type, 
+					params.rate_file, tree, params.rate_mh_type);
+			else
+				site_rate = new RateMeyerHaeseler(params.rate_file, tree, params.rate_mh_type);
+			site_rate->setTree(tree);
+		} else if ((posX = rate_str.find("+D")) != string::npos) {
+			tree->setLikelihoodKernel(LK_NORMAL);
+			params.rate_mh_type = false;
+			if (rate_str.length() > posX+2 && isdigit(rate_str[posX+2])) {
+				num_rate_cats = convert_int(rate_str.substr(posX+2).c_str());
+				if (num_rate_cats < 0) outError("Wrong number of rate categories");
+			} else num_rate_cats = -1;
+			if (num_rate_cats >= 0)
+				site_rate = new RateMeyerDiscrete(num_rate_cats, params.mcat_type, 
+					params.rate_file, tree, params.rate_mh_type);
+			else
+				site_rate = new RateMeyerHaeseler(params.rate_file, tree, params.rate_mh_type);
+			site_rate->setTree(tree);
+		} else if ((posX = rate_str.find("+NGS")) != string::npos) {
+			tree->setLikelihoodKernel(LK_NORMAL);
+			if (rate_str.length() > posX+4 && isdigit(rate_str[posX+4])) {
+				num_rate_cats = convert_int(rate_str.substr(posX+4).c_str());
+				if (num_rate_cats < 0) outError("Wrong number of rate categories");
+			} else num_rate_cats = -1;
+			site_rate = new NGSRateCat(tree, num_rate_cats);
+			site_rate->setTree(tree);
+		} else if ((posX = rate_str.find("+NGS")) != string::npos) {
+			// NOTE(review): this condition repeats the one of the previous branch, so this
+			// branch can never be taken and NGSRate is never constructed here. Left
+			// unchanged because the intended keyword is not visible in this file.
+			tree->setLikelihoodKernel(LK_NORMAL);
+			if (rate_str.length() > posX+4 && isdigit(rate_str[posX+4])) {
+				num_rate_cats = convert_int(rate_str.substr(posX+4).c_str());
+				if (num_rate_cats < 0) outError("Wrong number of rate categories");
+			} else num_rate_cats = -1;
+			site_rate = new NGSRate(tree);
+			site_rate->setTree(tree);
+		} else if ((posX = rate_str.find("+K")) != string::npos) {
+			if (rate_str.length() > posX+2 && isdigit(rate_str[posX+2])) {
+				num_rate_cats = convert_int(rate_str.substr(posX+2).c_str());
+				if (num_rate_cats < 1) outError("Wrong number of rate categories");
+			}
+			site_rate = new RateKategory(num_rate_cats, tree);
+		} else
+			outError("Invalid rate heterogeneity type");
+//		if (model_str.find('+') != string::npos)
+//			model_str = model_str.substr(0, model_str.find('+'));
+//		else
+//			model_str = model_str.substr(0, model_str.find('*'));
+	} else {
+		// no rate component at all: plain RateHeterogeneity object
+		site_rate = new RateHeterogeneity();
+		site_rate->setTree(tree);
+	} 	
+
+	if (fused_mix_rate) {
+		if (!model->isMixture())
+			outError("Model is not a mixture model");
+		if (model->getNMixtures() != site_rate->getNRate())
+			outError("Mixture model and site rate model do not have the same number of categories");
+		ModelMixture *mmodel = (ModelMixture*)model;
+		// reset mixture model
+		mmodel->fix_prop = true;
+		for (ModelMixture::iterator it = mmodel->begin(); it != mmodel->end(); it++) {
+			(*it)->total_num_subst = 1.0;
+			mmodel->prop[it-mmodel->begin()] = 1.0;
+		}
+		mmodel->decomposeRateMatrix();
+	}
+
+	tree->discardSaturatedSite(params.discard_saturated_site);
+
+	} catch (const char* str) {
+		outError(str);
+	}
+
+}
+
+/**
+	Count the parameters: dimensions of the substitution model and of the site-rate
+	model, plus the tree's branchNum, plus an extra term that depends on the
+	state-frequency type of the model.
+	@return total number of parameters
+*/
+int ModelFactory::getNParameters() {
+	int total = model->getNDim() + site_rate->getNDim() + site_rate->phylo_tree->branchNum;
+
+	// frequency types that contribute additional parameters
+	switch (model->freq_type) {
+	case FREQ_EMPIRICAL:
+		total += model->num_states-1;
+		break;
+	case FREQ_CODON_1x4:
+		total += 3;
+		break;
+	case FREQ_CODON_3x4:
+	case FREQ_CODON_3x4C:
+		total += 9;
+		break;
+	default:
+		break;
+	}
+	return total;
+}
+/**
+	Read site-specific state frequencies from a text file.
+	Each record consists of a site specification (parsed by extractSiteID) followed by
+	aln->num_states frequencies, each strictly between 0 and 1, summing up to 1
+	(tolerance 1e-4). Sites that are not listed in any record are all assigned to one
+	extra model whose entry in freq_vec is NULL.
+	Every error is reported via outError().
+	@param aln alignment providing the number of sites and states
+	@param site_freq_file name of the input file
+	@param site_model (OUT) for each site, the index into freq_vec of its frequency vector
+	@param freq_vec (OUT) frequency vectors allocated with new[]; the last entry is NULL
+		when some sites were left unspecified
+*/
+void ModelFactory::readSiteFreq(Alignment *aln, char* site_freq_file, IntVector &site_model, vector<double*> &freq_vec)
+{
+	cout << "Reading site-specific state frequency file " << site_freq_file << " ..." << endl;
+	site_model.resize(aln->getNSite(), -1);
+	try {
+		ifstream in;
+		// failbit raises an exception while opening the file
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(site_freq_file);
+		double freq;
+		string site_spec;
+		int specified_sites = 0;
+		// remove the failbit: running into end-of-file while reading records is expected
+		in.exceptions(ios::badbit);
+		for (int model_id = 0; !in.eof(); model_id++) {
+			in >> site_spec;
+			if (in.eof()) break;
+			IntVector site_id;
+			extractSiteID(aln, site_spec.c_str(), site_id);
+			specified_sites += site_id.size();
+			if (site_id.size() == 0) throw "No site ID specified";
+			for (IntVector::iterator it = site_id.begin(); it != site_id.end(); it++) {
+				if (site_model[*it] != -1) throw "Duplicated site ID";
+				site_model[*it] = model_id;
+			}
+			double *site_freq_entry = new double[aln->num_states];
+			double sum = 0;
+			for (int i = 0; i < aln->num_states; i++) {
+				// with failbit exceptions switched off a failed extraction (truncated or
+				// non-numeric record) would otherwise go unnoticed and a stale value be used
+				if (!(in >> freq) || freq <= 0.0 || freq >= 1.0) throw "Invalid frequency entry";
+				site_freq_entry[i] = freq;
+				sum += freq;
+			}
+			if (fabs(sum-1.0) > 1e-4) throw "Frequencies do not sum up to 1";
+			aln->convfreq(site_freq_entry); // regularize frequencies (eg if some freq = 0)
+			freq_vec.push_back(site_freq_entry);
+		}
+		if (specified_sites < site_model.size()) {
+			// there are some unspecified sites
+			cout << site_model.size() - specified_sites << " unspecified sites will get default frequencies" << endl;
+			for (int i = 0; i < site_model.size(); i++)
+				if (site_model[i] == -1) 
+					site_model[i] = freq_vec.size();
+			freq_vec.push_back(NULL);
+		}
+		in.clear();
+		// set the failbit again
+		in.exceptions(ios::failbit | ios::badbit);
+		in.close();
+	} catch (const char* str) {
+		outError(str);
+	} catch (string str) {
+		outError(str);
+	} catch (ios::failure &) {
+		outError(ERR_READ_INPUT);
+	}
+}
+
+/**
+	Load the given rates, state frequencies, Gamma shape and proportion of invariable
+	sites into the model / rate objects, then recompute the likelihood on the tree
+	of site_rate.
+	@param rate rate model; it is cast to RateGammaInvar without a NULL check
+	@param model substitution model; it is cast to ModelGTR without a NULL check
+	@param initAlpha Gamma shape to set
+	@param initPInvar proportion of invariable sites to set
+	@param initRates rate matrix entries to set
+	@param initStateFreqs state frequencies to set
+	@return value of computeLikelihood() after all partial likelihoods were cleared
+*/
+double ModelFactory::initGTRGammaIParameters(RateHeterogeneity *rate, ModelSubst *model, double initAlpha,
+                                           double initPInvar, double *initRates, double *initStateFreqs)  {
+
+    ModelGTR* gtr = dynamic_cast<ModelGTR*>(model);
+    RateGammaInvar* gammaInvar = dynamic_cast<RateGammaInvar*>(rate);
+
+    // push the values into the two objects
+    gtr->setRateMatrix(initRates);
+    gtr->setStateFrequency(initStateFreqs);
+    gammaInvar->setGammaShape(initAlpha);
+    gammaInvar->setPInvar(initPInvar);
+
+    // refresh the quantities derived from them
+    gtr->decomposeRateMatrix();
+    gammaInvar->computeRates();
+
+    // cached partial likelihoods are stale now
+    site_rate->phylo_tree->clearAllPartialLH();
+    const double lh = site_rate->phylo_tree->computeLikelihood();
+    return lh;
+}
+
+/**
+	Optimize the parameters of the substitution model and of the site-rate model.
+	When the 'fai' option is on and the factory holds a RateGammaInvar together with a
+	ModelGTR, the optimization is restarted from a series of initial Gamma shape
+	values and the best result is installed at the end. Otherwise both models are
+	optimized once, either jointly or one after the other depending on joint_optimize.
+	@param gradient_epsilon tolerance passed on to the optimizers
+	@return log-likelihood reported by the optimization
+*/
+double ModelFactory::optimizeParametersOnly(double gradient_epsilon) {
+    const bool with_restart = Params::getInstance().fai
+        && dynamic_cast<RateGammaInvar*>(site_rate) != NULL
+        && dynamic_cast<ModelGTR*>(model) != NULL;
+
+    if (!with_restart) {
+        if (joint_optimize) {
+            /* Optimize substitutional and heterogeneity rates jointly using BFGS */
+            return optimizeAllParameters(gradient_epsilon);
+        }
+        /* Optimize substitutional and heterogeneity rates independently */
+        double model_lh = model->optimizeParameters(gradient_epsilon);
+        double rate_lh = site_rate->optimizeParameters(gradient_epsilon);
+        // a rate likelihood of exactly 0.0 means: fall back to the model likelihood
+        return (rate_lh == 0.0) ? model_lh : rate_lh;
+    }
+
+    cout << "Optimize substitutional and site rates with restart ..." << endl;
+    PhyloTree* tree = site_rate->phylo_tree;
+
+    // grid of starting values for the Gamma shape
+    double maxInitAlpha = 1.0;
+    double alphaStep = 0.1;
+
+    // best solution found so far
+    double bestLogl = -DBL_MAX;
+    double bestAlpha = 0.0;
+    double bestPInvar = 0.0;
+
+    // remember the starting point so that every restart begins from the same state
+    double initPInvar = site_rate->getPInvar();
+    int numRateEntries = model->getNumRateEntries();
+    double *initRates = new double[numRateEntries];
+    double *bestRates = new double[numRateEntries];
+    model->getRateMatrix(initRates);
+    int numStates = model->num_states;
+    double *initStateFreqs = new double[numStates];
+    model->getStateFrequency(initStateFreqs);
+    double *bestStateFreqs =  new double[numStates];
+    DoubleVector initBranchLengths;
+    DoubleVector bestBranchLengths;
+    tree->saveBranchLengths(initBranchLengths);
+
+    for (double initAlpha = 0.1; initAlpha <= maxInitAlpha; initAlpha = initAlpha + alphaStep) {
+        tree->restoreBranchLengths(initBranchLengths);
+        double initLogl = initGTRGammaIParameters(site_rate, model, initAlpha, initPInvar, initRates, initStateFreqs);
+
+        double curLogl;
+        if (joint_optimize) {
+            curLogl = optimizeAllParameters(gradient_epsilon);
+        } else {
+            model->optimizeParameters(gradient_epsilon);
+            site_rate->optimizeParameters(gradient_epsilon);
+            curLogl = tree->optimizeAllBranches(1);
+        }
+
+        RateGammaInvar* rateGammaInvar = dynamic_cast<RateGammaInvar*>(site_rate);
+        ModelGTR* modelGTR = dynamic_cast<ModelGTR*>(model);
+        double curAlpha = rateGammaInvar->getGammaShape();
+        double curPInvar = rateGammaInvar->getPInvar();
+
+        if (curLogl > bestLogl) {
+            // new best: keep a copy of everything needed to reinstall it later
+            bestLogl = curLogl;
+            bestAlpha = curAlpha;
+            bestPInvar = curPInvar;
+            modelGTR->getRateMatrix(bestRates);
+            modelGTR->getStateFrequency(bestStateFreqs);
+            tree->saveBranchLengths(bestBranchLengths);
+        }
+
+        if (verbose_mode >= VB_MED) {
+            cout << "Init. alpha = " << initAlpha << " / Init. PInvar = " << initPInvar << " / Init. Logl = " <<
+            initLogl << " / Est. alpha = " << curAlpha
+            << "/ Est. pinv = " << curPInvar << " / Final Logl = " << curLogl << endl;
+        }
+    }
+
+    cout << "Best alpha = " << bestAlpha << " / best p_invar = " << bestPInvar << endl;
+
+    // reinstall the best solution and report its likelihood
+    tree->restoreBranchLengths(bestBranchLengths);
+    double finalLogl = initGTRGammaIParameters(site_rate, model, bestAlpha, bestPInvar, bestRates, bestStateFreqs);
+
+    delete [] initRates;
+    delete [] bestRates;
+    delete [] initStateFreqs;
+    delete [] bestStateFreqs;
+    return finalLogl;
+}
+
+/**
+	Optimize the parameters of the substitution model and of the site-rate model
+	jointly with minimizeMultiDimen(). The work arrays are addressed from index 1,
+	hence they hold one element more than the number of dimensions.
+	@param gradient_epsilon tolerance; the larger of this value and TOL_RATE is used
+	@return negated result of minimizeMultiDimen(), or 0.0 if there is nothing to optimize
+*/
+double ModelFactory::optimizeAllParameters(double gradient_epsilon) {
+    int num_dims = getNDim();
+
+    // return if nothing to be optimized
+    if (num_dims == 0)
+        return 0.0;
+
+    double *vars = new double[num_dims+1];
+    double *upper = new double[num_dims+1];
+    double *lower = new double[num_dims+1];
+    bool *check = new bool[num_dims+1];
+
+    // setup the bounds for model
+    setVariables(vars);
+    int model_dims = model->getNDim();
+    for (int d = 1; d <= model_dims; d++) {
+        lower[d] = MIN_RATE;
+        upper[d] = MAX_RATE;
+        check[d] = false;
+    }
+
+    // with estimated frequencies the trailing num_states-1 model slots are capped at 1.0
+    if (model->freq_type == FREQ_ESTIMATE) {
+        int d = model_dims- model->num_states+2;
+        while (d <= model_dims) {
+            upper[d] = 1.0;
+            d++;
+        }
+    }
+
+    // setup the bounds for site_rate, whose slots follow those of the model
+    site_rate->setBounds(lower+model_dims, upper+model_dims, check+model_dims);
+
+    double score = -minimizeMultiDimen(vars, num_dims, lower, upper, check, max(gradient_epsilon, TOL_RATE));
+
+    // copy the optimized values back and refresh everything derived from them
+    getVariables(vars);
+    model->decomposeRateMatrix();
+    site_rate->phylo_tree->clearAllPartialLH();
+
+    delete [] check;
+    delete [] lower;
+    delete [] upper;
+    delete [] vars;
+
+    return score;
+}
+
+/**
+ * Alternate between optimizing model/rate parameters and branch lengths
+ * until the log-likelihood stops improving by more than logl_epsilon or
+ * tree->params->num_param_iterations is reached.
+ * Transition-matrix caching is switched off for the duration of the call and
+ * switched back on before returning.
+ * @param fixed_len TRUE to keep branch lengths fixed
+ * @param write_info TRUE to print progress and the final parameters to cout
+ * @param logl_epsilon minimum log-likelihood improvement to continue
+ * @param gradient_epsilon tolerance passed to optimizeParametersOnly()
+ * @return the log-likelihood after optimization
+ */
+double ModelFactory::optimizeParameters(bool fixed_len, bool write_info,
+                                        double logl_epsilon, double gradient_epsilon) {
+	assert(model);
+	assert(site_rate);
+
+	double defaultEpsilon = logl_epsilon;
+
+	double begin_time = getRealTime();
+	double cur_lh;
+	PhyloTree *tree = site_rate->getTree();
+	assert(tree);
+
+	stopStoringTransMatrix();
+	if (fixed_len || tree->params->num_param_iterations == 0)
+		cur_lh = tree->computeLikelihood();
+	else {
+		if (!Params::getInstance().testAlpha && !Params::getInstance().fai)
+			cur_lh = tree->optimizeAllBranches(1);
+		else
+			cur_lh = tree->computeLikelihood();
+	}
+	tree->setCurScore(cur_lh);
+	if (verbose_mode >= VB_MED || write_info)
+		cout << "1. Initial log-likelihood: " << cur_lh << endl;
+
+	// For UpperBounds: mlCheck is set to 1 at the end of this function, so
+	// the initial likelihood is recorded only while it is still 0
+	if (tree->mlCheck == 0) {
+		tree->mlInitial = cur_lh;
+	}
+
+	int i;
+	for (i = 2; i < tree->params->num_param_iterations; i++) {
+		// the fai flag is cleared once the first round (i == 2) is over
+		if (Params::getInstance().fai && i > 2) {
+			Params::getInstance().fai = false;
+		}
+
+		double new_lh = optimizeParametersOnly(gradient_epsilon);
+
+		if (new_lh == 0.0) {
+			// 0.0 is what the parameter optimizers return when there is
+			// nothing to optimize: finish with branch lengths only
+			if (!fixed_len)
+				cur_lh = tree->optimizeAllBranches(100, logl_epsilon);
+			break;
+		}
+		if (verbose_mode >= VB_MED) {
+			model->writeInfo(cout);
+			site_rate->writeInfo(cout);
+		}
+		if (!fixed_len)
+			new_lh = tree->optimizeAllBranches(min(i,3), logl_epsilon);  // loop only 3 times in total (previously in v0.9.6 5 times)
+		if (new_lh > cur_lh + logl_epsilon) {
+			if (Params::getInstance().testAlpha && i == 3) {
+				// only ever loosen the threshold relative to the caller's value
+				double newEpsilon = (new_lh - cur_lh) * 0.01;
+				if (newEpsilon > defaultEpsilon) {
+					logl_epsilon = newEpsilon;
+					cout << "Estimate model parameters with new epsilon = " << logl_epsilon << endl;
+				}
+			}
+			cur_lh = new_lh;
+			if (verbose_mode >= VB_MED || write_info)
+				cout << i << ". Current log-likelihood: " << cur_lh << endl;
+		} else {
+			// no sufficient improvement: do a final branch-length pass
+			// (unless lengths are fixed) and always leave the loop
+			site_rate->classifyRates(new_lh);
+			if (!fixed_len) {
+				cur_lh = tree->optimizeAllBranches(100, logl_epsilon);
+			}
+			break;
+		}
+	}
+
+	// normalize rates s.t. branch lengths are #subst per site
+	double mean_rate = site_rate->rescaleRates();
+	if (mean_rate != 1.0) {
+		tree->scaleLength(mean_rate);
+		tree->clearAllPartialLH();
+	}
+
+	if (verbose_mode >= VB_MED || write_info)
+		cout << "Optimal log-likelihood: " << cur_lh << endl;
+
+	// For UpperBounds: likelihood after the first optimization
+	if (tree->mlCheck == 0)
+		tree->mlFirstOpt = cur_lh;
+
+	if (verbose_mode <= VB_MIN && write_info) {
+		model->writeInfo(cout);
+		site_rate->writeInfo(cout);
+	}
+	double elapsed_secs = getRealTime() - begin_time;
+	if (write_info)
+		cout << "Parameters optimization took " << i-1 << " rounds (" << elapsed_secs << " sec)" << endl << endl;
+	startStoringTransMatrix();
+
+	// For UpperBounds: mark that the first optimization has been done
+	tree->mlCheck = 1;
+
+	return cur_lh;
+}
+
+/**
+ * Forward the stability check to the substitution model.
+ * @return TRUE if the model reports parameters at a boundary that may cause
+ *         numerical instability
+ */
+bool ModelFactory::isUnstableParameters() {
+	const bool unstable = model->isUnstableParameters();
+	return unstable;
+}
+
+/** Switch transition-matrix caching on, but only when store_trans_matrix is set. */
+void ModelFactory::startStoringTransMatrix() {
+	if (store_trans_matrix) {
+		is_storing = true;
+	}
+}
+
+/**
+ * Switch transition-matrix caching off and release every cached entry.
+ * Does nothing when store_trans_matrix is FALSE.
+ */
+void ModelFactory::stopStoringTransMatrix() {
+	if (!store_trans_matrix) return;
+	is_storing = false;
+	// cache entries are allocated with new double[] (see computeTransMatrix
+	// and computeTransDerv), so they must be released with delete []
+	for (iterator it = begin(); it != end(); ++it)
+		delete [] it->second;
+	clear();
+}
+
+
+/** Forward to the model: transition probability from state1 to state2 for the given time. */
+double ModelFactory::computeTrans(double time, int state1, int state2) {
+	const double trans_prob = model->computeTrans(time, state1, state2);
+	return trans_prob;
+}
+
+/**
+ * Forward to the model: transition probability from state1 to state2 for the
+ * given time; derv1 and derv2 are output arguments filled in by the model.
+ */
+double ModelFactory::computeTrans(double time, int state1, int state2, double &derv1, double &derv2) {
+	const double trans_prob = model->computeTrans(time, state1, state2, derv1, derv2);
+	return trans_prob;
+}
+
+/**
+ * Compute the transition matrix for the given time. When caching is active
+ * (store_trans_matrix and is_storing set, model not site-specific) the matrix
+ * is looked up under the key round(time * 1e6) and computed only on a miss.
+ * @param time time between two events
+ * @param trans_matrix (OUT) num_states * num_states entries
+ */
+void ModelFactory::computeTransMatrix(double time, double *trans_matrix) {
+	const bool use_cache = store_trans_matrix && is_storing && !model->isSiteSpecificModel();
+	if (!use_cache) {
+		model->computeTransMatrix(time, trans_matrix);
+		return;
+	}
+
+	const int mat_size = model->num_states * model->num_states;
+	const int key = round(time * 1e6);
+	iterator cached = find(key);
+	if (cached == end()) {
+		// one entry holds 3 consecutive matrices: P, 1st and 2nd derivative
+		double *entry = new double[mat_size * 3];
+		// two leading zeros in the derivative slot mark "derivatives not computed yet"
+		entry[mat_size] = entry[mat_size+1] = 0.0;
+		model->computeTransMatrix(time, entry);
+		cached = insert(value_type(key, entry)).first;
+	}
+
+	memcpy(trans_matrix, cached->second, mat_size * sizeof(double));
+}
+
+/**
+ * Compute the transition matrix and multiply each row by the frequency of
+ * its source state. Site-specific models are handled by the model itself.
+ * @param time time between two events
+ * @param state_freq state frequency vector
+ * @param trans_matrix (OUT) num_states * num_states entries
+ */
+void ModelFactory::computeTransMatrixFreq(double time, double *state_freq, double *trans_matrix) {
+	if (model->isSiteSpecificModel()) {
+		model->computeTransMatrixFreq(time, trans_matrix);
+		return;
+	}
+	const int nstates = model->num_states;
+	computeTransMatrix(time, trans_matrix);
+	double *row = trans_matrix;
+	for (int from = 0; from < nstates; from++, row += nstates) {
+		for (int to = 0; to < nstates; to++)
+			row[to] *= state_freq[from];
+	}
+}
+
+/**
+ * Compute the transition matrix and its 1st and 2nd derivatives for the
+ * given time. When caching is active the three matrices are stored together
+ * under the key round(time * 1e6); an entry whose derivative slot still
+ * starts with two zeros is treated as "derivatives missing" and recomputed.
+ * @param time time between two events
+ * @param trans_matrix (OUT) transition matrix
+ * @param trans_derv1 (OUT) 1st derivative matrix
+ * @param trans_derv2 (OUT) 2nd derivative matrix
+ */
+void ModelFactory::computeTransDerv(double time, double *trans_matrix,
+	double *trans_derv1, double *trans_derv2) {
+	const bool use_cache = store_trans_matrix && is_storing && !model->isSiteSpecificModel();
+	if (!use_cache) {
+		model->computeTransDerv(time, trans_matrix, trans_derv1, trans_derv2);
+		return;
+	}
+
+	const int mat_size = model->num_states * model->num_states;
+	const int key = round(time * 1e6);
+	iterator cached = find(key);
+	if (cached == end()) {
+		// allocate memory for 3 matrices
+		double *entry = new double[mat_size * 3];
+		entry[mat_size] = entry[mat_size+1] = 0.0;
+		model->computeTransDerv(time, entry, entry+mat_size, entry+(mat_size*2));
+		cached = insert(value_type(key, entry)).first;
+	} else {
+		double *entry = cached->second;
+		if (entry[mat_size] == 0.0 && entry[mat_size+1] == 0.0)
+			model->computeTransDerv(time, entry, entry+mat_size, entry+(mat_size*2));
+	}
+
+	const double *stored = cached->second;
+	const size_t nbytes = mat_size * sizeof(double);
+	memcpy(trans_matrix, stored, nbytes);
+	memcpy(trans_derv1, stored + mat_size, nbytes);
+	memcpy(trans_derv2, stored + (mat_size*2), nbytes);
+}
+
+/**
+ * Compute the transition matrix and its derivatives at time * rate_val, then
+ * scale each row by the frequency of its source state; the 1st derivative is
+ * additionally scaled by rate_val and the 2nd by rate_val squared.
+ * Site-specific models are handled by the model itself.
+ */
+void ModelFactory::computeTransDervFreq(double time, double rate_val, double *state_freq, double *trans_matrix,
+		double *trans_derv1, double *trans_derv2)
+{
+	if (model->isSiteSpecificModel()) {
+		model->computeTransDervFreq(time, rate_val, trans_matrix, trans_derv1, trans_derv2);
+		return;
+	}
+	const int nstates = model->num_states;
+	const double rate_sqr = rate_val*rate_val;
+	computeTransDerv(time * rate_val, trans_matrix, trans_derv1, trans_derv2);
+	for (int from = 0; from < nstates; from++) {
+		const int row = from * nstates;
+		for (int to = 0; to < nstates; to++) {
+			trans_matrix[row + to] *= state_freq[from];
+			trans_derv1[row + to] *= (state_freq[from] * rate_val);
+			trans_derv2[row + to] *= (state_freq[from] * rate_sqr);
+		}
+	}
+}
+
+/**
+ * Release all cached transition matrices. The model and site_rate objects
+ * are not deleted here.
+ */
+ModelFactory::~ModelFactory()
+{
+	// cache entries are allocated with new double[], so use delete []
+	for (iterator it = begin(); it != end(); ++it)
+		delete [] it->second;
+	clear();
+}
+
+/************* FOLLOWING SERVE FOR JOINT OPTIMIZATION OF MODEL AND RATE PARAMETERS *******/
+/** @return number of model dimensions plus number of site-rate dimensions */
+int ModelFactory::getNDim()
+{
+	const int model_ndim = model->getNDim();
+	const int rate_ndim = site_rate->getNDim();
+	return model_ndim + rate_ndim;
+}
+
+/**
+ * Target function of the joint optimization.
+ * @param x variable vector, model part first, site-rate part after it
+ * @return 1.0e+12 when the last state frequency drops below MIN_RATE,
+ *         otherwise the value of site_rate->targetFunk() on the rate part
+ */
+double ModelFactory::targetFunk(double x[]) {
+	model->getVariables(x);
+	const int last_state = model->num_states - 1;
+	if (model->state_freq[last_state] < MIN_RATE)
+		return 1.0e+12;
+	// need to compute rates again if p_inv or Gamma shape changes!
+	model->decomposeRateMatrix();
+	site_rate->phylo_tree->clearAllPartialLH();
+	double *rate_vars = x + model->getNDim();
+	return site_rate->targetFunk(rate_vars);
+}
+
+/**
+ * Pack model parameters, then site-rate parameters, into the variable vector.
+ * The site-rate part starts model->getNDim() slots after the model part.
+ */
+void ModelFactory::setVariables(double *variables) {
+	model->setVariables(variables);
+	double *rate_vars = variables + model->getNDim();
+	site_rate->setVariables(rate_vars);
+}
+
+/**
+ * Unpack the variable vector into the model, then into the site-rate object.
+ * The site-rate part starts model->getNDim() slots after the model part.
+ */
+void ModelFactory::getVariables(double *variables) {
+	model->getVariables(variables);
+	double *rate_vars = variables + model->getNDim();
+	site_rate->getVariables(rate_vars);
+}
+
diff --git a/model/modelfactory.h b/model/modelfactory.h
new file mode 100644
index 0000000..47b8bb3
--- /dev/null
+++ b/model/modelfactory.h
@@ -0,0 +1,237 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MODELFACTORY_H
+#define MODELFACTORY_H
+
+#include "tools.h"
+#include "modelsubst.h"
+#include "rateheterogeneity.h"
+#include "modelsblock.h"
+
+
+/**
+ * Read the model definitions.
+ * NOTE(review): the definition is not in this header; which fields of params
+ * are consulted and who owns the returned object should be confirmed there.
+ * @param params program parameters
+ * @return pointer to a ModelsBlock
+ */
+ModelsBlock *readModelsDefinition(Params &params);
+
+/**
+Store the transition matrix corresponding to evolutionary time so that it need not be computed again.
+For efficiency purposes, esp. for protein (20x20) or codon (61x61) models.
+The values of the map contain 3 matrices consecutively: transition matrix, 1st, and 2nd derivative.
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class ModelFactory : public unordered_map<int, double*>, public Optimization
+{
+public:
+
+	/**
+		constructor
+		create substitution model with possible rate heterogeneity. Create proper class objects
+		for two variables: model and site_rate. It takes the following fields of params into account:
+			model_name, num_rate_cats, freq_type, store_trans_matrix
+		@param params program parameters
+		@param tree associated phylogenetic tree
+		@param models_block model definitions (presumably as returned by readModelsDefinition() -- confirm)
+	*/
+	ModelFactory(Params &params, PhyloTree *tree, ModelsBlock *models_block);
+
+	/**
+		blank constructor
+	*/
+	
+	ModelFactory();
+
+	/**
+	 * read site specific state frequency vectors from a file to create corresponding model (Ingo's idea)
+	 * @param aln input alignment
+	 * @param site_freq_file file name
+	 * @param site_model (OUT) site to model ID map
+	 * @param freq_vec (OUT) vector of frequency vectors
+	 */
+	void readSiteFreq(Alignment *aln, char* site_freq_file, IntVector &site_model, vector<double*> &freq_vec);
+
+	/**
+		get the name of the model (declaration currently disabled)
+	*/
+	//string getModelName();
+
+	/**
+		write information to an output stream
+		NOTE(review): exact content is defined in the implementation file -- not visible from this header
+		@param out output stream
+	*/
+	void writeInfo(ostream &out);
+
+	/**
+		Start to store transition matrix for efficiency
+	*/
+	void startStoringTransMatrix();
+
+	/**
+		Stop storing transition matrix, e.g., when optimizing model parameters
+	*/
+	void stopStoringTransMatrix();
+
+	/**
+		Wrapper for computing the transition probability matrix from the model. It uses ModelFactory
+		that stores matrices computed before for efficiency purposes.
+		@param time time between two events
+		@param trans_matrix (OUT) the transition matrix between all pairs of states. 
+			Assume trans_matrix has size of num_states * num_states.
+	*/
+	void computeTransMatrix(double time, double *trans_matrix);
+
+	/**
+	 * wrapper for computing transition matrix times state frequency vector
+	 * @param time time between two events
+	 * @param state_freq state frequency vector
+	 * @param trans_matrix (OUT) the transition matrix between all pairs of states.
+	 * 	Assume trans_matrix has size of num_states * num_states.
+	 */
+	void computeTransMatrixFreq(double time, double *state_freq, double *trans_matrix);
+
+	/**
+		Wrapper for computing the transition probability between two states.
+		@param time time between two events
+		@param state1 first state
+		@param state2 second state
+	*/
+	double computeTrans(double time, int state1, int state2);
+
+	/**
+		Wrapper for computing the transition probability between two states
+		@param time time between two events
+		@param state1 first state
+		@param state2 second state
+		@param derv1 (OUT) 1st derivative
+		@param derv2 (OUT) 2nd derivative
+	*/
+	virtual double computeTrans(double time, int state1, int state2, double &derv1, double &derv2);
+
+	/**
+		Wrapper for computing the transition probability matrix and the derivative 1 and 2 from the model.
+		It uses ModelFactory that stores matrices computed before for efficiency purposes.
+		@param time time between two events
+		@param trans_matrix (OUT) the transition matrix between all pairs of states. 
+			Assume trans_matrix has size of num_states * num_states.
+		@param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states. 
+		@param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states. 
+	*/
+	void computeTransDerv(double time, double *trans_matrix, 
+		double *trans_derv1, double *trans_derv2);
+
+	/**
+		Wrapper for computing the transition matrix and its derivatives at time * rate_val,
+		with every row multiplied by the frequency of its source state.
+		@param time time between two events
+		@param rate_val rate multiplier applied to time
+		@param state_freq state frequency vector
+		@param trans_matrix (OUT) the transition matrix between all pairs of states.
+		@param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states.
+		@param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states.
+	*/
+	void computeTransDervFreq(double time, double rate_val, double *state_freq, double *trans_matrix, 
+		double *trans_derv1, double *trans_derv2);
+
+	/**
+		 destructor
+	*/
+    virtual ~ModelFactory();
+
+    /**
+     * @return #parameters of the model + # branches
+     */
+    virtual int getNParameters();
+
+	/**
+		optimize model parameters and tree branch lengths
+		@param fixed_len TRUE to fix branch lengths, default is false
+		@param write_info TRUE to print progress information, default is true
+		@param logl_epsilon log-likelihood improvement threshold, default is 0.1
+		@param gradient_epsilon tolerance for the parameter optimization, default is 0.001
+		@return the best likelihood 
+	*/
+	virtual double optimizeParameters(bool fixed_len = false, bool write_info = true,
+                                      double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+
+	/**
+	 * @return TRUE if parameters are at the boundary that may cause numerical instability
+	 */
+	virtual bool isUnstableParameters();
+
+	/**
+		pointer to the model, will not be deleted when deleting ModelFactory object
+	*/
+	ModelSubst *model;
+
+
+	/**
+		pointer to the site-rate heterogeneity, will not be deleted when deleting ModelFactory object
+	*/
+	RateHeterogeneity *site_rate;
+
+	/* TRUE if a fused mixture and rate model, e.g. LG4M and LG4X */
+	bool fused_mix_rate;
+
+	/**
+		TRUE to store transition matrix into this hash table for computation efficiency
+	*/
+	bool store_trans_matrix;
+
+	/**
+		TRUE while matrices are actually being stored
+	*/
+	bool is_storing;
+
+	/**
+	 * encoded constant sites that are unobservable and added in the alignment
+	 * this involves likelihood function for ascertainment bias correction for morphological or SNP data (Lewis 2001)
+	 */
+	string unobserved_ptns;
+
+	/**
+	 * optimize model and site_rate parameters
+	 * @param gradient_epsilon tolerance for the parameter optimization
+	 * @return the resulting log-likelihood
+	 */
+	double optimizeParametersOnly(double gradient_epsilon);
+
+	/************* FOLLOWING FUNCTIONS SERVE FOR JOINT OPTIMIZATION OF MODEL AND RATE PARAMETERS *******/
+
+	/**
+	 * TRUE to optimize all parameters simultaneously, default: FALSE
+	 */
+	bool joint_optimize;
+
+	/**
+		return the number of dimensions
+	*/
+	virtual int getNDim();
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]);
+
+	/**
+		NOTE(review): implementation not visible from this header; judging by the name and arguments
+		it initializes the given rate and model objects from the supplied starting values -- confirm
+		the meaning of the return value there.
+	*/
+	double initGTRGammaIParameters(RateHeterogeneity *rate, ModelSubst *model, double initAlpha,
+								 double initPInvar, double *initRates, double *initStateFreqs);
+
+    /**
+     * optimize model and site_rate parameters jointly in one multi-dimensional optimization
+     * @param gradient_epsilon tolerance for the optimization
+     */
+    double optimizeAllParameters(double gradient_epsilon);
+
+
+protected:
+
+	/**
+		this function serves the multi-dimension optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function serves the multi-dimension optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+};
+
+#endif
diff --git a/model/modelgtr.cpp b/model/modelgtr.cpp
new file mode 100644
index 0000000..7417f1b
--- /dev/null
+++ b/model/modelgtr.cpp
@@ -0,0 +1,789 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "modelgtr.h"
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+//const double MIN_FREQ_RATIO = MIN_FREQUENCY;
+//const double MAX_FREQ_RATIO = 1.0/MIN_FREQUENCY;
+
+/**
+ * Construct a GTR model for the alignment attached to the tree.
+ * @param tree associated phylogenetic tree
+ * @param count_rates TRUE to initialize the rates from the alignment via
+ *        computeEmpiricalRate(), FALSE to set every rate to 1.0
+ */
+ModelGTR::ModelGTR(PhyloTree *tree, bool count_rates)
+ : ModelSubst(tree->aln->num_states), EigenDecomposition()
+{
+	// must be set before the number of rate entries is queried
+	half_matrix = true;
+	const int nrate = getNumRateEntries();
+	const int nstates_sqr = num_states*num_states;
+	const int ncoeff = nstates_sqr*num_states;
+
+	highest_freq_state = num_states-1;
+	name = "GTR";
+	full_name = "GTR (Tavare, 1986)";
+	phylo_tree = tree;
+
+	rates = new double[nrate];
+	memset(rates, 0, sizeof(double) * nrate);
+
+	freq_type = FREQ_UNKNOWN;
+
+	// buffers for the eigen-decomposition of the rate matrix
+	eigenvalues = aligned_alloc<double>(num_states);
+	eigenvectors = aligned_alloc<double>(nstates_sqr);
+	inv_eigenvectors = aligned_alloc<double>(nstates_sqr);
+	eigen_coeff = aligned_alloc<double>(ncoeff);
+
+	if (count_rates) {
+		phylo_tree->aln->computeEmpiricalRate(rates);
+	} else {
+		for (int r = 0; r < nrate; r++)
+			rates[r] = 1.0;
+	}
+
+	num_params = getNumRateEntries() - 1;
+}
+
+/** Associate this model with another tree. */
+void ModelGTR::setTree(PhyloTree *tree) {
+	this->phylo_tree = tree;
+}
+
+/**
+ * @return "GTR" (followed by the number of states when it is not 4) and the
+ *         comma-separated rate entries enclosed in braces
+ */
+string ModelGTR::getNameParams() {
+	ostringstream out;
+	out << "GTR";
+	if (num_states != 4)
+		out << num_states;
+	out << '{';
+	const int nrates = getNumRateEntries();
+	if (nrates > 0)
+		out << rates[0];
+	for (int r = 1; r < nrates; r++)
+		out << ',' << rates[r];
+	out << '}';
+	return out.str();
+}
+
+/**
+ * Initialize the state frequencies according to the frequency type, then
+ * decompose the rate matrix.
+ * @param type state frequency type; must not be FREQ_UNKNOWN
+ */
+void ModelGTR::init(StateFreqType type) {
+	freq_type = type;
+	assert(freq_type != FREQ_UNKNOWN);
+
+	if (freq_type == FREQ_EQUAL) {
+		if (phylo_tree->aln->seq_type == SEQ_CODON) {
+			// stop codons get MIN_FREQUENCY, the remainder is shared equally
+			int nscodon = phylo_tree->aln->getNumNonstopCodons();
+			double freq_codon = (1.0-(num_states-nscodon)*MIN_FREQUENCY)/(nscodon);
+			for (int s = 0; s < num_states; s++) {
+				if (phylo_tree->aln->isStopCodon(s))
+					state_freq[s] = MIN_FREQUENCY;
+				else
+					state_freq[s] = freq_codon;
+			}
+		} else {
+			double freq_state = 1.0/num_states;
+			for (int s = 0; s < num_states; s++)
+				state_freq[s] = freq_state;
+		}
+	} else if (freq_type == FREQ_ESTIMATE || freq_type == FREQ_EMPIRICAL) {
+		if (phylo_tree->aln->seq_type == SEQ_CODON) {
+			double ntfreq[12];
+			phylo_tree->aln->computeCodonFreq(freq_type, state_freq, ntfreq);
+		} else {
+			phylo_tree->aln->computeStateFreq(state_freq);
+		}
+		// remember the state with the largest frequency
+		for (int s = 0; s < num_states; s++) {
+			if (state_freq[s] > state_freq[highest_freq_state])
+				highest_freq_state = s;
+		}
+	} else if (freq_type == FREQ_USER_DEFINED) {
+		if (state_freq[0] == 0.0)
+			outError("State frequencies not specified");
+	}
+
+	decomposeRateMatrix();
+	if (verbose_mode >= VB_MAX)
+		writeInfo(cout);
+
+}
+
+/**
+ * Print the six rate parameters and the four base frequencies.
+ * Nothing is written unless the model has exactly 4 states.
+ * @param out output stream
+ */
+void ModelGTR::writeInfo(ostream &out) {
+	if (num_states != 4)
+		return;
+
+	static const char *const rate_labels[6] = {
+		"  A-C: ", "  A-G: ", "  A-T: ", "  C-G: ", "  C-T: ", "  G-T: "
+	};
+	static const char *const freq_labels[4] = {
+		"  A: ", "  C: ", "  G: ", "  T: "
+	};
+
+	out << "Rate parameters:";
+	for (int r = 0; r < 6; r++)
+		out << rate_labels[r] << rates[r];
+	out << endl;
+
+	out << "Base frequencies: ";
+	for (int s = 0; s < 4; s++)
+		out << freq_labels[s] << state_freq[s];
+	out << endl;
+}
+
+/**
+ * Compute the transition probability matrix P(t) from the eigen
+ * decomposition. Only the upper triangle is evaluated from eigen_coeff; the
+ * lower triangle is derived from it using the state frequency ratio, and
+ * each diagonal entry is set so that its row sums to 1.
+ * @param time time between two events (divided by total_num_subst internally)
+ * @param trans_matrix (OUT) num_states * num_states entries
+ */
+void ModelGTR::computeTransMatrix(double time, double *trans_matrix) {
+	const double evol_time = time / total_num_subst;
+	double *exptime = new double[num_states];
+
+	for (int k = 0; k < num_states; k++)
+		exptime[k] = exp(evol_time * eigenvalues[k]);
+
+	for (int i = 0; i < num_states; i++) {
+		const int row_offset = i * num_states;
+		double *trans_row = trans_matrix + row_offset;
+		for (int j = i+1; j < num_states; j++) {
+			// upper triangle entry (i,j)
+			const double *coeff_entry = eigen_coeff + ((row_offset+j)*num_states);
+			double &entry = trans_row[j];
+			entry = 0.0;
+			for (int k = 0; k < num_states; k++)
+				entry += coeff_entry[k] * exptime[k];
+			if (entry < 0.0)
+				entry = 0.0;
+			// mirrored lower triangle entry (j,i)
+			trans_matrix[j*num_states+i] = (state_freq[i]/state_freq[j]) * entry;
+		}
+		// diagonal entry: 1 minus the sum of the other entries in the row
+		trans_row[i] = 0.0;
+		double sum = 0.0;
+		for (int j = 0; j < num_states; j++)
+			sum += trans_row[j];
+		trans_row[i] = 1.0 - sum;
+	}
+	delete [] exptime;
+}
+
+/**
+ * Compute the transition matrix and multiply each row by the frequency of
+ * its source state.
+ * @param time time between two events
+ * @param trans_matrix (OUT) num_states * num_states entries
+ */
+void ModelGTR::computeTransMatrixFreq(double time, double* trans_matrix)
+{
+	computeTransMatrix(time, trans_matrix);
+	double *row = trans_matrix;
+	for (int from = 0; from < num_states; from++, row += num_states) {
+		for (int to = 0; to < num_states; to++)
+			row[to] *= state_freq[from];
+	}
+}
+
+/**
+ * Transition probability between two states from the eigen decomposition.
+ * @param time time between two events (divided by total_num_subst internally)
+ * @param state1 first state
+ * @param state2 second state
+ * @return the transition probability
+ */
+double ModelGTR::computeTrans(double time, int state1, int state2) {
+	const double evol_time = time / total_num_subst;
+	const double *coeff_entry = eigen_coeff + ((state1*num_states+state2)*num_states);
+
+	double trans_prob = 0.0;
+	for (int k = 0; k < num_states; k++)
+		trans_prob += coeff_entry[k] * exp(evol_time * eigenvalues[k]);
+	return trans_prob;
+}
+
+/**
+ * Transition probability between two states together with the sums of its
+ * eigen terms weighted by the eigenvalue (derv1) and by the squared
+ * eigenvalue (derv2).
+ * @param time time between two events (divided by total_num_subst internally)
+ * @param state1 first state
+ * @param state2 second state
+ * @param derv1 (OUT) 1st derivative
+ * @param derv2 (OUT) 2nd derivative
+ * @return the transition probability
+ */
+double ModelGTR::computeTrans(double time, int state1, int state2, double &derv1, double &derv2) {
+	const double evol_time = time / total_num_subst;
+	const double *coeff_entry = eigen_coeff + ((state1*num_states+state2)*num_states);
+
+	double trans_prob = 0.0;
+	derv1 = derv2 = 0.0;
+	for (int k = 0; k < num_states; k++) {
+		const double term = coeff_entry[k] * exp(evol_time * eigenvalues[k]);
+		const double term_d1 = term * eigenvalues[k];
+		trans_prob += term;
+		derv1 += term_d1;
+		derv2 += term_d1 * eigenvalues[k];
+	}
+	return trans_prob;
+}
+
+
+/**
+ * Compute the full transition matrix P(t) and its 1st and 2nd derivative
+ * matrices from the eigen decomposition. Negative transition entries are
+ * clamped to 0; derivative entries are left as computed.
+ * @param time time between two events (divided by total_num_subst internally)
+ * @param trans_matrix (OUT) transition matrix
+ * @param trans_derv1 (OUT) 1st derivative matrix
+ * @param trans_derv2 (OUT) 2nd derivative matrix
+ */
+void ModelGTR::computeTransDerv(double time, double *trans_matrix,
+	double *trans_derv1, double *trans_derv2)
+{
+	const double evol_time = time / total_num_subst;
+	double *exptime = new double[num_states];
+
+	for (int k = 0; k < num_states; k++)
+		exptime[k] = exp(evol_time * eigenvalues[k]);
+
+	// walk the matrices in row-major order; cell == row*num_states + column
+	const int ncells = num_states * num_states;
+	for (int cell = 0; cell < ncells; cell++) {
+		const double *coeff_entry = eigen_coeff + cell*num_states;
+		double &p = trans_matrix[cell];
+		double &d1 = trans_derv1[cell];
+		double &d2 = trans_derv2[cell];
+		p = 0.0;
+		d1 = 0.0;
+		d2 = 0.0;
+		for (int k = 0; k < num_states; k++) {
+			const double term = coeff_entry[k] * exptime[k];
+			const double term_d1 = term * eigenvalues[k];
+			p += term;
+			d1 += term_d1;
+			d2 += term_d1 * eigenvalues[k];
+		}
+		if (p < 0.0)
+			p = 0.0;
+	}
+	delete [] exptime;
+}
+
+/**
+ * Compute the transition matrix and its derivatives at time * rate_val, then
+ * scale each row by the frequency of its source state; the 1st derivative is
+ * additionally scaled by rate_val and the 2nd by rate_val squared.
+ */
+void ModelGTR::computeTransDervFreq(double time, double rate_val, double* trans_matrix, double* trans_derv1, double* trans_derv2)
+{
+	const double rate_sqr = rate_val*rate_val;
+	computeTransDerv(time * rate_val, trans_matrix, trans_derv1, trans_derv2);
+	for (int from = 0; from < num_states; from++) {
+		const int row = from * num_states;
+		for (int to = 0; to < num_states; to++) {
+			trans_matrix[row + to] *= state_freq[from];
+			trans_derv1[row + to] *= (state_freq[from] * rate_val);
+			trans_derv2[row + to] *= (state_freq[from] * rate_sqr);
+		}
+	}
+}
+
+
+/**
+ * Copy all rate entries into the caller's buffer.
+ * @param rate_mat (OUT) buffer with room for getNumRateEntries() doubles
+ */
+void ModelGTR::getRateMatrix(double *rate_mat) {
+	const size_t nbytes = getNumRateEntries() * sizeof(double);
+	memcpy(rate_mat, rates, nbytes);
+}
+
+/**
+ * Overwrite all rate entries from the caller's buffer.
+ * @param rate_mat buffer holding getNumRateEntries() doubles
+ */
+void ModelGTR::setRateMatrix(double* rate_mat)
+{
+	const size_t nbytes = getNumRateEntries() * sizeof(double);
+	memcpy(rates, rate_mat, nbytes);
+}
+
+/**
+ * Copy the state frequencies into the caller's buffer.
+ * @param freq (OUT) buffer with room for num_states doubles
+ */
+void ModelGTR::getStateFrequency(double *freq) {
+	assert(state_freq);
+	assert(freq_type != FREQ_UNKNOWN);
+	const size_t nbytes = sizeof(double) * num_states;
+	memcpy(freq, state_freq, nbytes);
+}
+
+/**
+ * Overwrite the state frequencies from the caller's buffer.
+ * @param freq buffer holding num_states doubles
+ */
+void ModelGTR::setStateFrequency(double* freq)
+{
+	assert(state_freq);
+	const size_t nbytes = sizeof(double) * num_states;
+	memcpy(state_freq, freq, nbytes);
+}
+
+/**
+ * Build the Q matrix from the rate entries and the state frequencies.
+ * The rates are expanded into a symmetric matrix with a zero diagonal,
+ * passed through computeRateMatrix() and then copied row by row.
+ * @param q_mat (OUT) buffer with room for num_states * num_states doubles
+ */
+void ModelGTR::getQMatrix(double *q_mat) {
+	// table of row pointers: allocate as double* so that the matching
+	// delete [] below frees an array of the type that was allocated
+	double **rate_matrix = new double*[num_states];
+	int i, j, k = 0;
+
+	for (i = 0; i < num_states; i++)
+		rate_matrix[i] = new double[num_states];
+
+	// rates[] stores the upper triangle row by row
+	for (i = 0, k = 0; i < num_states; i++) {
+		rate_matrix[i][i] = 0.0;
+		for (j = i+1; j < num_states; j++, k++) {
+			rate_matrix[i][j] = rates[k];
+			rate_matrix[j][i] = rates[k];
+		}
+	}
+
+	computeRateMatrix(rate_matrix, state_freq, num_states);
+	for (i = 0; i < num_states; i++)
+		memmove(q_mat + (i*num_states), rate_matrix[i], num_states * sizeof(double));
+
+	for (i = num_states-1; i >= 0; i--)
+		delete [] rate_matrix[i];
+	delete [] rate_matrix;
+
+}
+
+/**
+ * @return number of free parameters: num_params, plus num_states-1 when the
+ *         state frequencies are estimated
+ */
+int ModelGTR::getNDim() {
+	assert(freq_type != FREQ_UNKNOWN);
+	if (freq_type == FREQ_ESTIMATE)
+		return num_params + (num_states-1);
+	return num_params;
+}
+
+
+/**
+ * Rescale the state frequencies.
+ * @param sum_one TRUE to make the frequencies sum to 1; FALSE to rescale
+ *        them so that the last frequency becomes 0.1
+ */
+void ModelGTR::scaleStateFreq(bool sum_one) {
+	if (sum_one) {
+		// make the frequencies sum to 1
+		double total = 0.0;
+		for (int s = 0; s < num_states; s++)
+			total += state_freq[s];
+		for (int s = 0; s < num_states; s++)
+			state_freq[s] /= total;
+		return;
+	}
+
+	// make the last frequency equal to 0.1
+	const int last = num_states-1;
+	if (state_freq[last] == 0.1)
+		return;
+	assert(state_freq[last] > 1.1e-6);
+	// the last entry is updated last, so the divisor is the same for every entry
+	const double divisor = state_freq[last]*10.0;
+	for (int s = 0; s < num_states; s++)
+		state_freq[s] /= divisor;
+}
+
+/**
+ * Pack the model parameters into the 1-based variable vector: the rates
+ * first, then (when frequencies are estimated) every state frequency except
+ * the highest one, each expressed as a ratio to the highest frequency.
+ * @param variables (OUT) vector of variables, indexed from 1
+ */
+void ModelGTR::setVariables(double *variables) {
+	const bool estimate_freq = (freq_type == FREQ_ESTIMATE);
+	int nrate = getNDim();
+	if (estimate_freq)
+		nrate -= (num_states-1);
+	if (nrate > 0)
+		memcpy(variables+1, rates, nrate*sizeof(double));
+	if (!estimate_freq)
+		return;
+
+	int slot = nrate + 1;
+	for (int s = 0; s < num_states; s++) {
+		if (s == highest_freq_state)
+			continue;
+		variables[slot++] = state_freq[s] / state_freq[highest_freq_state];
+	}
+}
+
+/**
+ * Unpack the 1-based variable vector into the model: the rates first, then
+ * (when frequencies are estimated) the frequency ratios, which are
+ * normalized so that all state frequencies sum to 1 with the highest
+ * frequency state acting as the reference with ratio 1.
+ * @param variables vector of variables, indexed from 1
+ */
+void ModelGTR::getVariables(double *variables) {
+	const bool estimate_freq = (freq_type == FREQ_ESTIMATE);
+	int nrate = getNDim();
+	if (estimate_freq)
+		nrate -= (num_states-1);
+	if (nrate > 0)
+		memcpy(rates, variables+1, nrate * sizeof(double));
+	if (!estimate_freq)
+		return;
+
+	// normalizing constant: 1 for the reference state plus all ratios
+	double sum = 1.0;
+	for (int r = 1; r < num_states; r++)
+		sum += variables[nrate+r];
+
+	int slot = nrate + 1;
+	for (int s = 0; s < num_states; s++) {
+		if (s == highest_freq_state)
+			continue;
+		state_freq[s] = variables[slot++] / sum;
+	}
+	state_freq[highest_freq_state] = 1.0/sum;
+}
+
+/**
+ * Target function for the optimization of the model parameters.
+ * @param x the input vector x
+ * @return 1.0e+12 when the last state frequency falls below 1e-4, otherwise
+ *         the negative log-likelihood of the tree
+ */
+double ModelGTR::targetFunk(double x[]) {
+	getVariables(x);
+	const double last_freq = state_freq[num_states-1];
+	if (last_freq < 1e-4)
+		return 1.0e+12;
+	decomposeRateMatrix();
+	assert(phylo_tree);
+	phylo_tree->clearAllPartialLH();
+	const double logl = phylo_tree->computeLikelihood();
+	return -logl;
+}
+
+/**
+ * @return TRUE if any rate lies within TOL_RATE of MAX_RATE or any state
+ *         frequency lies within TOL_RATE of MIN_RATE
+ */
+bool ModelGTR::isUnstableParameters() {
+	// NOTE: zero rates are not considered unstable anymore
+	const int nrates = getNumRateEntries();
+	for (int r = 0; r < nrates; r++) {
+		if (rates[r] > MAX_RATE-TOL_RATE)
+			return true;
+	}
+	for (int s = 0; s < num_states; s++) {
+		if (state_freq[s] < MIN_RATE+TOL_RATE)
+			return true;
+	}
+	return false;
+}
+
+/**
+ * Fill in the optimization bounds for all model dimensions (1-based).
+ * Every slot gets [MIN_RATE, MAX_RATE]; when frequencies are estimated the
+ * trailing num_states-1 slots get
+ * [MIN_FREQUENCY/state_freq[highest_freq_state], 100.0] instead.
+ * @param lower_bound (OUT) lower bounds, indexed from 1
+ * @param upper_bound (OUT) upper bounds, indexed from 1
+ * @param bound_check (OUT) set to false for every slot
+ */
+void ModelGTR::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
+	const int ndim = getNDim();
+
+	for (int k = 1; k <= ndim; k++) {
+		lower_bound[k] = MIN_RATE;
+		upper_bound[k] = MAX_RATE;
+		bound_check[k] = false;
+	}
+
+	if (freq_type != FREQ_ESTIMATE)
+		return;
+
+	// frequency ratios occupy the last num_states-1 slots
+	const int first_freq = ndim-num_states+2;
+	for (int k = first_freq; k <= ndim; k++) {
+		lower_bound[k] = MIN_FREQUENCY/state_freq[highest_freq_state];
+		upper_bound[k] = 100.0;
+		bound_check[k] = false;
+	}
+}
+
+/**
+ * Optimize the free model parameters numerically.
+ * Uses L_BFGS_B() when the tree's optimize_alg option contains "BFGS-B",
+ * minimizeMultiDimen() otherwise. Afterwards the optimized values are
+ * written back, the rate matrix is decomposed again and the tree's partial
+ * likelihoods are cleared.
+ * @param gradient_epsilon requested tolerance; the optimizer receives
+ *        max(gradient_epsilon, TOL_RATE)
+ * @return the negated optimizer result, or 0.0 if there is nothing to optimize
+ */
+double ModelGTR::optimizeParameters(double gradient_epsilon) {
+	int ndim = getNDim();
+
+	// return if nothing to be optimized
+	if (ndim == 0)
+		return 0.0;
+
+	if (verbose_mode >= VB_MAX)
+		cout << "Optimizing " << name << " model parameters..." << endl;
+
+	// all work vectors are indexed from 1, hence the extra slot
+	double *vars = new double[ndim+1];
+	double *upper = new double[ndim+1];
+	double *lower = new double[ndim+1];
+	bool *check = new bool[ndim+1];
+
+	// frequencies are expressed relative to the currently most frequent state
+	for (int state = 0; state < num_states; state++) {
+		if (state_freq[state] > state_freq[highest_freq_state])
+			highest_freq_state = state;
+	}
+
+	setVariables(vars);
+	setBounds(lower, upper, check);
+
+	const double tolerance = max(gradient_epsilon, TOL_RATE);
+	double score;
+	if (phylo_tree->params->optimize_alg.find("BFGS-B") != string::npos)
+		score = -L_BFGS_B(ndim, vars+1, lower+1, upper+1, tolerance);
+	else
+		score = -minimizeMultiDimen(vars, ndim, lower, upper, check, tolerance);
+
+	getVariables(vars);
+	decomposeRateMatrix();
+	phylo_tree->clearAllPartialLH();
+
+	delete [] check;
+	delete [] lower;
+	delete [] upper;
+	delete [] vars;
+
+	return score;
+}
+
+
+
+/**
+ * Decompose the current rate matrix into eigenvalues and eigenvectors and
+ * rebuild the eigen_coeff cache from them.
+ *
+ * When num_params == -1 the eigen-system is written down directly
+ * (F81-style model) and checked against the matrix from getQMatrix().
+ * Otherwise the rate matrix is assembled from rates[] (upper triangle if
+ * half_matrix is set, full row-major matrix otherwise) and passed to
+ * eigensystem_sym().
+ *
+ * Aborts if a hand-built eigenvector violates the eigenvalue equation and
+ * asserts that eigenvectors * inv_eigenvectors is the identity matrix.
+ */
+void ModelGTR::decomposeRateMatrix(){
+	int i, j, k = 0;
+
+	if (num_params == -1) {
+		// manual compute eigenvalues/vectors for F81-style model
+		eigenvalues[0] = 0.0;
+		double mu = 0.0;
+		for (i = 0; i < num_states; i++)
+			mu += state_freq[i]*state_freq[i];
+		mu = total_num_subst/(1.0 - mu);
+
+		// compute eigenvalues
+		for (i = 1; i < num_states; i++)
+			eigenvalues[i] = -mu;
+
+		// compute eigenvectors
+		// (no sqrt(state_freq) scratch array here: it was never read and never freed)
+		memset(eigenvectors, 0, num_states*num_states*sizeof(double));
+		memset(inv_eigenvectors, 0, num_states*num_states*sizeof(double));
+		eigenvectors[0] = 1.0;
+		for (i = 1; i < num_states; i++)
+			eigenvectors[i] = -1.0;
+		for (i = 1; i < num_states; i++) {
+			eigenvectors[i*num_states] = 1.0;
+			eigenvectors[i*num_states+i] = state_freq[0]/state_freq[i];
+		}
+
+		for (i = 0; i < num_states; i++)
+			for (j = 0; j < num_states; j++)
+				inv_eigenvectors[i*num_states+j] = state_freq[j]*eigenvectors[j*num_states+i];
+		writeInfo(cout);
+		// sanity check: every entry of Q*v_j - lambda_j*v_j must vanish
+		double *q = new double[num_states*num_states];
+		getQMatrix(q);
+		for (j = 0; j < num_states; j++) {
+			for (i = 0; i < num_states; i++) {
+				// start from zero for every row so that residuals of
+				// earlier rows can neither mask nor inflate this one
+				double zero = 0.0;
+				for (k = 0; k < num_states; k++) zero += q[i*num_states+k] * eigenvectors[k*num_states+j];
+				zero -= eigenvalues[j] * eigenvectors[i*num_states+j];
+				if (fabs(zero) > 1.0e-5) {
+					cout << "\nERROR: Eigenvector doesn't satisfy eigenvalue equation! (gap=" << fabs(zero) << ")" << endl;
+					abort();
+				}
+			}
+		}
+		delete [] q;
+	} else {
+		double **rate_matrix = new double*[num_states];
+
+		for (i = 0; i < num_states; i++)
+			rate_matrix[i] = new double[num_states];
+
+		if (half_matrix) {
+			// rates[] holds the upper triangle row by row; mirror it
+			for (i = 0, k = 0; i < num_states; i++) {
+				rate_matrix[i][i] = 0.0;
+				for (j = i+1; j < num_states; j++, k++) {
+					rate_matrix[i][j] = rates[k];
+					rate_matrix[j][i] = rates[k];
+				}
+			}
+		} else {
+			// full matrix
+			for (i = 0; i < num_states; i++) {
+				memcpy(rate_matrix[i], &rates[i*num_states], num_states*sizeof(double));
+				rate_matrix[i][i] = 0.0;
+			}
+		}
+		/* eigensystem of 1 PAM rate matrix */
+		eigensystem_sym(rate_matrix, state_freq, eigenvalues, eigenvectors, inv_eigenvectors, num_states);
+		for (i = num_states-1; i >= 0; i--)
+			delete [] rate_matrix[i];
+		delete [] rate_matrix;
+	}
+
+	// eigen_coeff[(i*num_states+j)*num_states+k] = eigenvectors[i][k] * inv_eigenvectors[k][j]
+	for (i = 0; i < num_states; i++)
+		for (j = 0; j < num_states; j++) {
+			int offset = (i*num_states+j)*num_states;
+			double sum = 0.0;
+			for (k = 0; k < num_states; k++) {
+				eigen_coeff[offset+k] = eigenvectors[i*num_states+k] * inv_eigenvectors[k*num_states+j];
+				sum += eigen_coeff[offset+k];
+			}
+			// summing over k gives entry (i,j) of eigenvectors*inv_eigenvectors,
+			// which has to be the identity matrix
+			if (i == j) {
+				if (fabs(sum-1.0) > 1e-6) {
+					cout << "sum = " << sum << endl;
+					assert(0);
+				}
+			}
+			else assert(fabs(sum) < 1e-6);
+		}
+
+
+} 
+
+/**
+ * Read getNumRateEntries() rate values from an input stream.
+ * The single word "equalrate" sets every rate to 1.0 instead.
+ * @param in input stream
+ * @throw const char* if an entry cannot be read or is negative
+ */
+void ModelGTR::readRates(istream &in) throw(const char*, string) {
+	int num_entries = getNumRateEntries();
+	string token;
+	in >> token;
+
+	if (token == "equalrate") {
+		for (int k = 0; k < num_entries; k++)
+			rates[k] = 1.0;
+		return;
+	}
+
+	// the first entry was already consumed as a word, so convert it by hand
+	try {
+		rates[0] = convert_double(token.c_str());
+	} catch (string &err) {
+		outError(err);
+	}
+	if (rates[0] < 0.0)
+		throw "Negative rates not allowed";
+
+	for (int k = 1; k < num_entries; k++) {
+		if (!(in >> rates[k]))
+			throw "Rate entries could not be read";
+		if (rates[k] < 0.0)
+			throw "Negative rates not allowed";
+	}
+}
+
+/**
+ * Read getNumRateEntries() rate values from a comma-separated string.
+ * If the string contains "equalrate" anywhere, every rate is set to 1.0.
+ * Malformed input is reported through outError(). On return num_params is
+ * set to 0.
+ * @param str comma-separated list of rates
+ */
+void ModelGTR::readRates(string str) throw(const char*) {
+	int nrates = getNumRateEntries();
+	// offset in str at which the next number starts
+	int end_pos = 0;
+	// echoes the function name and the raw input on standard output
+	cout << __func__ << " " << str << endl;
+	if (str.find("equalrate") != string::npos) {
+		for (int i = 0; i < nrates; i++)
+			rates[i] = 1.0;
+	} else for (int i = 0; i < nrates; i++) {
+		// filled in by convert_double; added to end_pos below, i.e. used as
+		// an offset relative to the substring that was parsed
+		int new_end_pos;
+		try {
+			rates[i] = convert_double(str.substr(end_pos).c_str(), new_end_pos);
+		} catch (string &str) {
+			outError(str);
+		}
+		end_pos += new_end_pos;
+		if (rates[i] <= 0.0)
+			outError("Non-positive rates found");
+		// the last number must end the string ...
+		if (i == nrates-1 && end_pos < str.length())
+			outError("String too long ", str);
+		// ... and every earlier number must be followed by something
+		if (i < nrates-1 && end_pos >= str.length())
+			outError("Unexpected end of string ", str);
+		if (end_pos < str.length() && str[end_pos] != ',')
+			outError("Comma to separate rates not found in ", str);
+		// step over the comma
+		end_pos++;
+	}
+	num_params = 0;
+
+}
+
+/**
+ * Read num_states state frequencies from an input stream.
+ * @param in input stream
+ * @throw const char* if a value cannot be read, is negative, or the values
+ *        do not sum to 1.0 within a tolerance of 1e-2
+ */
+void ModelGTR::readStateFreq(istream &in) throw(const char*) {
+	double total = 0.0;
+	for (int state = 0; state < num_states; state++) {
+		if (!(in >> state_freq[state]))
+			throw "State frequencies could not be read";
+		if (state_freq[state] < 0.0)
+			throw "Negative state frequencies found";
+		total += state_freq[state];
+	}
+	if (fabs(total-1.0) > 1e-2)
+		throw "State frequencies do not sum up to 1.0";
+}
+
+/**
+ * Read num_states state frequencies from a comma-separated string.
+ * Every value must lie in [0,1] and the values must sum to 1.0 within a
+ * tolerance of 1e-2. All problems are reported through outError(), with
+ * the same checks and wording as readRates(string):
+ *  - a value that cannot be converted,
+ *  - too few values ("Unexpected end of string"),
+ *  - text after the last value ("String too long"),
+ *  - a separator other than a comma.
+ * @param str comma-separated list of state frequencies
+ */
+void ModelGTR::readStateFreq(string str) throw(const char*) {
+	int i;
+	// offset in str at which the next number starts
+	int end_pos = 0;
+	for (i = 0; i < num_states; i++) {
+		int new_end_pos = 0;
+		// convert_double reports bad numbers by throwing a string, which
+		// must not escape the throw(const char*) specification
+		try {
+			state_freq[i] = convert_double(str.substr(end_pos).c_str(), new_end_pos);
+		} catch (string &err) {
+			outError(err);
+		}
+		end_pos += new_end_pos;
+		if (state_freq[i] < 0.0 || state_freq[i] > 1)
+			outError("State frequency must be in [0,1] in ", str);
+		// text left over after the last frequency
+		if (i == num_states-1 && end_pos < str.length())
+			outError("String too long ", str);
+		// string exhausted before all frequencies were read; without this
+		// check the next str.substr(end_pos) would be called past the end
+		if (i < num_states-1 && end_pos >= str.length())
+			outError("Unexpected end of string ", str);
+		if (end_pos < str.length() && str[end_pos] != ',')
+			outError("Comma to separate state frequencies not found in ", str);
+		// step over the comma
+		end_pos++;
+	}
+	double sum = 0.0;
+	for (i = 0; i < num_states; i++) sum += state_freq[i];
+	if (fabs(sum-1.0) > 1e-2)
+		outError("State frequencies do not sum up to 1.0 in ", str);
+}
+
+/**
+ * Read the rates followed by the state frequencies from a file, then set
+ * num_params to 0 and print the model with writeInfo().
+ * Messages thrown as const char* by the readers are passed to outError().
+ * @param file_name file containing the rates and the state frequencies
+ */
+void ModelGTR::readParameters(const char *file_name) {
+	try {
+		ifstream model_file(file_name);
+		if (model_file.fail()) {
+			outError("Invalid model name ", file_name);
+		}
+		cout << "Reading model parameters from file " << file_name << endl;
+		readRates(model_file);
+		readStateFreq(model_file);
+		model_file.close();
+	} catch (const char *msg) {
+		outError(msg);
+	}
+
+	num_params = 0;
+	writeInfo(cout);
+}
+
+
+/**
+ * Destructor: releases the model's buffers through freeMem().
+ * freeMem() is virtual, but a call made from a destructor always resolves
+ * to this class's own version.
+ */
+ModelGTR::~ModelGTR() {
+	freeMem();
+}
+
+/**
+ * Release the eigen-decomposition buffers (via aligned_free) and the rate
+ * array (via delete[]). The pointers are not reset afterwards.
+ */
+void ModelGTR::freeMem() {
+	aligned_free(eigen_coeff);
+	aligned_free(inv_eigenvectors);
+	aligned_free(eigenvectors);
+	aligned_free(eigenvalues);
+
+	// delete[] on a null pointer is a no-op, so no guard is needed
+	delete [] rates;
+}
+
+/** @return the eigen_coeff buffer filled by decomposeRateMatrix() (not copied) */
+double *ModelGTR::getEigenCoeff() const {
+	return eigen_coeff;
+}
+
+/** @return the eigenvalues buffer (not copied) */
+double *ModelGTR::getEigenvalues() const {
+	return eigenvalues;
+}
+
+/** @return the eigenvectors buffer (not copied) */
+double *ModelGTR::getEigenvectors() const {
+	return eigenvectors;
+}
+
+/** @return the inv_eigenvectors buffer (not copied) */
+double *ModelGTR::getInverseEigenvectors() const {
+	return inv_eigenvectors;
+}
+
+/**
+ * Replace the eigen_coeff pointer; the previous buffer is not freed here.
+ * @param eigenCoeff new coefficient buffer
+ */
+void ModelGTR::setEigenCoeff(double *eigenCoeff) {
+	this->eigen_coeff = eigenCoeff;
+}
+
+/**
+ * Replace the eigenvalues pointer; the previous buffer is not freed here.
+ * @param eigenvalues new eigenvalue buffer
+ */
+void ModelGTR::setEigenvalues(double *eigenvalues) {
+	this->eigenvalues = eigenvalues;
+}
+
+/**
+ * Replace the eigenvectors pointer; the previous buffer is not freed here.
+ * @param eigenvectors new eigenvector buffer
+ */
+void ModelGTR::setEigenvectors(double *eigenvectors) {
+	this->eigenvectors = eigenvectors;
+}
+
diff --git a/model/modelgtr.h b/model/modelgtr.h
new file mode 100644
index 0000000..1be91ba
--- /dev/null
+++ b/model/modelgtr.h
@@ -0,0 +1,341 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef GTRMODEL_H
+#define GTRMODEL_H
+
+#define EIGEN
+#include "phylotree.h"
+#include "modelsubst.h"
+#include "optimization.h"
+#include "alignment.h"
+#include "eigendecomposition.h"
+
+const double MIN_RATE = 1e-4;
+const double TOL_RATE = 1e-4;
+const double MAX_RATE = 100;
+
+
+/**
+General Time Reversible (GTR) model of substitution.
+This works for all kinds of data, not only DNA.
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class ModelGTR : public ModelSubst, public EigenDecomposition
+{
+	
+	friend class ModelSet;
+	friend class ModelMixture;
+	
+public:
+	/**
+		constructor
+		@param tree associated tree for the model
+		@param count_rates defaults to true; NOTE(review): meaning not visible in this header, see the constructor definition
+	*/
+    ModelGTR(PhyloTree *tree, bool count_rates = true);
+	
+	/**
+		init the model and decompose the rate matrix. This function should always be called
+		after creating the class. Otherwise it will not work properly.
+		@param freq_type state frequency type, can be FREQ_USER_DEFINED, FREQ_EQUAL, FREQ_EMPIRICAL, or FREQ_ESTIMATE
+	*/
+	void init(StateFreqType freq_type);
+
+	/**
+		this function serves ModelDNA and ModelProtein; the implementation here is empty
+		@param model_name name of the model
+		@param model_params model parameter string (presumably in the format accepted by readRates(string) -- confirm in the overriding classes)
+		@param freq state frequency type, can be FREQ_USER_DEFINED, FREQ_EQUAL, FREQ_EMPIRICAL, or FREQ_ESTIMATE
+		@param freq_params state frequency string (presumably in the format accepted by readStateFreq(string) -- confirm in the overriding classes)
+	*/
+	virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params) {}
+
+	/**
+		destructor, releases all buffers through freeMem()
+	*/
+    virtual ~ModelGTR();
+
+	/**
+	 * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
+	 */
+	virtual string getNameParams();
+
+	/**
+		set the associated tree
+		@param tree the associated tree
+	*/
+    void setTree(PhyloTree *tree);
+
+	/**
+		Read the rate entries from an input stream; the single word
+		"equalrate" sets all rates to 1.0.
+		It will throw error messages if failed
+		@param in input stream
+	*/
+	virtual void readRates(istream &in) throw(const char*, string);
+
+	/**
+		Read the rate parameters from a comma-separated string.
+		Errors are reported through outError(). Sets num_params to 0.
+		@param str comma-separated rates, or a string containing "equalrate"
+	*/
+	virtual void readRates(string str) throw(const char*);
+
+	/**
+		Read state frequencies from an input stream. 
+		It will throw error messages if failed
+		@param in input stream
+	*/
+	virtual void readStateFreq(istream &in) throw(const char*);
+
+	/**
+		Read state frequencies from a comma-separated string.
+		Errors are reported through outError().
+		@param str input string
+	*/
+	virtual void readStateFreq(string str) throw(const char*);
+
+	/**
+		read model parameters from a file
+		@param file_name file containing the rate entries followed by the state frequencies
+	*/
+	void readParameters(const char *file_name);
+
+
+	/**
+		compute the transition probability matrix.
+		@param time time between two events
+		@param trans_matrix (OUT) the transition matrix between all pairs of states. 
+			Assume trans_matrix has size of num_states * num_states.
+	*/
+	virtual void computeTransMatrix(double time, double *trans_matrix);
+
+	
+	/**
+	 * wrapper for computing transition matrix times state frequency vector
+	 * @param time time between two events
+	 * @param trans_matrix (OUT) the transition matrix between all pairs of states.
+	 * 	Assume trans_matrix has size of num_states * num_states.
+	 */
+	virtual void computeTransMatrixFreq(double time, double *trans_matrix);
+
+	/**
+		compute the transition probability between two states
+		@param time time between two events
+		@param state1 first state
+		@param state2 second state
+		@return the transition probability
+	*/
+	virtual double computeTrans(double time, int state1, int state2);
+
+	/**
+		compute the transition probability between two states
+		@param time time between two events
+		@param state1 first state
+		@param state2 second state
+		@param derv1 (OUT) 1st derivative
+		@param derv2 (OUT) 2nd derivative
+		@return the transition probability
+	*/
+	virtual double computeTrans(double time, int state1, int state2, double &derv1, double &derv2);
+
+	/**
+		Get the rate matrix.
+		@param rate_mat (OUT) upper-triangle rate matrix. Assume rate_mat has size of num_states*(num_states-1)/2
+	*/
+	virtual void getRateMatrix(double *rate_mat);
+
+	/**
+		Set the rate matrix.
+		@param rate_mat upper-triangle rate matrix. Assume rate_mat has size of num_states*(num_states-1)/2
+	*/
+	virtual void setRateMatrix(double *rate_mat);
+
+	/**
+		compute the state frequency vector
+		@param state_freq (OUT) state frequency vector. Assume state_freq has size of num_states
+	*/
+	virtual void getStateFrequency(double *state_freq);
+
+	/**
+		set the state frequency vector
+		@param state_freq (IN) state frequency vector. Assume state_freq has size of num_states
+	*/
+	virtual void setStateFrequency(double *state_freq);
+
+	/**
+	 * compute Q matrix 
+	 * @param q_mat (OUT) Q matrix, assuming of size num_states * num_states
+	 */
+	virtual void getQMatrix(double *q_mat);
+
+	/**
+		rescale the state frequencies
+		@param sum_one TRUE to make frequencies sum to 1, FALSE to make last entry equal to 1
+	*/
+	void scaleStateFreq(bool sum_one);
+
+	/**
+		get frequency type
+		@return frequency type
+	*/
+	virtual StateFreqType getFreqType() { return freq_type; }
+
+
+	/**
+		compute the transition probability matrix and its 1st and 2nd derivatives
+		@param time time between two events
+		@param trans_matrix (OUT) the transition matrix between all pairs of states. 
+			Assume trans_matrix has size of num_states * num_states.
+		@param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states. 
+		@param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states. 
+	*/
+	virtual void computeTransDerv(double time, double *trans_matrix, 
+		double *trans_derv1, double *trans_derv2);
+
+	/**
+		compute the transition probability matrix and its 1st and 2nd derivatives, times the state frequency vector
+		@param time time between two events
+		@param rate_val NOTE(review): presumably a rate multiplier applied to time -- confirm in the definition
+		@param trans_matrix (OUT) the transition matrix between all pairs of states. 
+			Assume trans_matrix has size of num_states * num_states.
+		@param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states. 
+		@param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states. 
+	*/
+	virtual void computeTransDervFreq(double time, double rate_val, double *trans_matrix, 
+		double *trans_derv1, double *trans_derv2);
+
+	/**
+		@return the number of dimensions
+	*/
+	virtual int getNDim();
+	
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x, indexed from 1
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]);
+
+	/**
+	 * setup the bounds for joint optimization with BFGS
+	 * @param lower_bound (OUT) lower bounds, indexed from 1
+	 * @param upper_bound (OUT) upper bounds, indexed from 1
+	 * @param bound_check (OUT) per-variable flags, indexed from 1
+	 */
+	virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check);
+
+	/**
+		optimize model parameters
+		@param gradient_epsilon tolerance for the optimizer; values below TOL_RATE are raised to TOL_RATE
+		@return the best likelihood 
+	*/
+	virtual double optimizeParameters(double gradient_epsilon);
+
+	/**
+	 * @return TRUE if parameters are at the boundary that may cause numerical instability
+	 */
+	virtual bool isUnstableParameters();
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		write parameters, used with modeltest; the implementation here is empty
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out){}
+
+
+	/**
+		decompose the rate matrix into eigenvalues and eigenvectors
+	*/
+	virtual void decomposeRateMatrix();
+
+	/** @return the eigen_coeff buffer (not copied) */
+	double *getEigenCoeff() const;
+
+	/** @return the eigenvalues buffer (not copied) */
+	virtual double *getEigenvalues() const;
+
+	/** @return the eigenvectors buffer (not copied) */
+	virtual double *getEigenvectors() const;
+	/** @return the inverse eigenvectors buffer (not copied) */
+	virtual double *getInverseEigenvectors() const;
+
+	/** replace the eigen_coeff pointer; the old buffer is not freed */
+	void setEigenCoeff(double *eigenCoeff);
+
+	/** replace the eigenvalues pointer; the old buffer is not freed */
+	void setEigenvalues(double *eigenvalues);
+
+	/** replace the eigenvectors pointer; the old buffer is not freed */
+	void setEigenvectors(double *eigenvectors);
+
+    /** default TRUE: store only upper half of the rate matrix */
+    bool half_matrix;
+
+protected:
+
+	/**
+		this function serves the multi-dimensional optimization. It should pack the model parameters 
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function serves the multi-dimensional optimization. It should assign the model parameters 
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+	/**
+		release the eigen-decomposition buffers and the rate array
+	*/
+	virtual void freeMem();
+
+	/**
+		phylogenetic tree associated
+	*/
+	PhyloTree *phylo_tree;
+
+    /**
+		rates between pairs of states of the unit rate matrix Q.
+		For DNA in order A-C, A-G, A-T, C-G, C-T (rate G-T = 1 always)
+	*/
+	double *rates;
+
+	/**
+		the number of free rate parameters
+	*/
+	int num_params;
+
+	/**
+		eigenvalues of the rate matrix Q
+	*/
+	double *eigenvalues;
+
+	/**
+		eigenvectors of the rate matrix Q 
+	*/
+	double *eigenvectors;
+
+	/**
+		inverse eigenvectors of the rate matrix Q 
+	*/
+	double *inv_eigenvectors;
+
+	/**
+		coefficient cache, served for fast computation of the P(t) matrix
+	*/
+	double *eigen_coeff;
+
+	/** state with highest frequency, used when optimizing state frequencies +FO */
+	int highest_freq_state;
+
+};
+
+#endif
diff --git a/model/modelmixture.cpp b/model/modelmixture.cpp
new file mode 100644
index 0000000..d84cf9a
--- /dev/null
+++ b/model/modelmixture.cpp
@@ -0,0 +1,1481 @@
+/*
+ * modelmixture.cpp
+ *
+ *  Created on: Nov 29, 2014
+ *      Author: minh
+ */
+
+#include "modelgtr.h"
+#include "modelnonrev.h"
+#include "modeldna.h"
+#include "modelprotein.h"
+#include "modelbin.h"
+#include "modelcodon.h"
+#include "modelmorphology.h"
+#include "modelset.h"
+#include "modelmixture.h"
+#include "phylokernelmixture.h"
+
+const string builtin_mixmodels_definition =
+"#nexus\n\
+\n\
+begin models;\n\
+\n\
+[ ---------------------------------------------------------\n\
+    EX2 mixture model of Le, Lartillot & Gascuel (2008) \n\
+ --------------------------------------------------------- ]\n\
+\n\
+[ Exposed component ]\n\
+model ExpEX2 =\n\
+0.526738 \n\
+0.483150 0.505837 \n\
+0.658902 0.051052 3.902456 \n\
+2.051872 2.214326 0.961103 0.129989 \n\
+1.280002 2.039552 1.301786 0.399061 0.456521 \n\
+1.306565 0.137928 0.285806 3.100403 0.033946 2.514377 \n\
+1.370782 0.363365 1.820100 0.885317 0.886564 0.320746 0.303966 \n\
+0.540809 2.288922 4.949307 0.700890 2.172284 3.755421 0.270957 0.401311 \n\
+0.171986 0.237023 0.337226 0.018315 1.037046 0.212032 0.084442 0.012279 0.317239 \n\
+0.430511 0.670514 0.158937 0.021949 1.702066 1.261113 0.110508 0.052946 0.869247 8.675343 \n\
+0.697731 3.881079 1.677194 0.105450 0.146263 2.570254 0.730337 0.279865 0.598289 0.338782 0.313102 \n\
+1.043937 0.656943 0.539827 0.066925 1.846562 1.973592 0.188160 0.158136 0.519993 9.483497 14.176858 1.013268 \n\
+0.265209 0.097443 0.182522 0.026918 3.002586 0.080193 0.023999 0.084663 2.047163 2.193062 4.802817 0.044792 3.261401 \n\
+1.270693 0.166534 0.068692 0.228829 0.156216 0.362501 0.214847 0.148900 0.323141 0.071992 0.343919 0.195470 0.099252 0.087020 \n\
+4.826665 0.751947 4.412265 0.975564 5.294149 1.033459 0.382235 1.970857 0.993310 0.190509 0.389101 0.592156 0.557254 0.668834 1.223981 \n\
+2.131819 0.584329 2.133604 0.368887 2.067387 1.013613 0.511390 0.174527 0.580960 2.563630 0.522334 1.147459 2.960091 0.244420 0.413148 7.384701 \n\
+0.143081 0.475590 0.061094 0.042618 1.603125 0.210329 0.048276 0.186382 0.961546 0.208313 1.130724 0.052858 1.328785 5.210001 0.045945 0.316078 0.144393 \n\
+0.208643 0.196271 0.599369 0.121313 3.842632 0.158470 0.064648 0.039280 8.230282 0.517123 0.713426 0.084962 0.812142 23.228875 0.043249 0.405310 0.234217 4.903887 \n\
+2.544463 0.313443 0.172264 0.073705 4.207648 0.497398 0.484620 0.132496 0.329895 23.711178 3.466991 0.348362 4.136445 1.199764 0.368231 0.266531 3.184874 0.252132 0.459187 \n\
+\n\
+0.088367 0.078147 0.047163 0.087976 0.004517 0.058526 0.128039 0.056993 0.024856 0.025277 0.045202 0.094639 0.012338 0.016158 0.060124 0.055346 0.051290 0.006771 0.021554 0.036718;\n\
+\n\
+[ Buried component ]\n\
+model BurEX2 =\n\
+0.338649 \n\
+0.201335 0.981635 \n\
+0.283859 0.247537 6.505182 \n\
+2.640244 0.904730 1.353325 0.312005 \n\
+0.543136 4.570308 2.439639 0.682052 0.216787 \n\
+0.748479 0.917979 0.804756 10.030310 0.024055 8.670112 \n\
+2.700465 0.539246 0.810739 0.810727 0.701320 0.330139 0.636675 \n\
+0.237686 3.175221 6.308043 1.540002 0.469875 8.675492 0.750683 0.183743 \n\
+0.044209 0.099241 0.162644 0.020816 0.166986 0.082745 0.030581 0.005017 0.075820 \n\
+0.124047 0.314159 0.088243 0.017526 0.449241 0.641784 0.073392 0.017752 0.277023 2.383760 \n\
+0.433721 17.781822 2.851914 0.459939 0.117548 6.815411 3.482941 0.484653 1.247888 0.161658 0.219757 \n\
+0.497479 0.448773 0.380964 0.057176 0.815999 2.089412 0.291379 0.054491 0.307450 2.817174 4.759683 1.082403 \n\
+0.093991 0.055530 0.098936 0.026160 0.662517 0.091948 0.022760 0.034431 0.675645 0.521416 1.672365 0.077917 1.296869 \n\
+0.986621 0.356417 0.214521 0.246129 0.164228 0.654039 0.295079 0.179095 0.428213 0.037671 0.170780 0.347219 0.074086 0.057233 \n\
+5.925588 0.979993 4.725421 1.158990 5.111992 1.120931 0.737456 2.279470 0.886126 0.051057 0.089611 0.925355 0.275366 0.274582 1.151114 \n\
+1.958501 0.630713 2.007592 0.289641 2.284140 0.787821 0.539892 0.097432 0.467489 0.644041 0.202812 1.401676 1.340732 0.103118 0.601281 8.190534 \n\
+0.068357 0.784449 0.109073 0.085810 0.457880 0.297731 0.155877 0.157418 0.708743 0.054134 0.374568 0.115777 0.477495 2.362999 0.047127 0.209085 0.097054 \n\
+0.084768 0.312038 0.615093 0.202611 0.788164 0.293543 0.137306 0.035497 4.938330 0.101803 0.180086 0.280737 0.264540 8.142914 0.059308 0.264401 0.133054 2.905674 \n\
+1.387752 0.140091 0.112176 0.058637 1.575057 0.203946 0.239406 0.044011 0.085226 6.427279 1.035942 0.244336 1.033583 0.278010 0.213475 0.079878 1.592560 0.081135 0.108383 \n\
+\n\
+0.123119 0.019475 0.019852 0.018583 0.018711 0.017275 0.018723 0.050388 0.016402 0.119697 0.161399 0.012776 0.035838 0.057019 0.030913 0.043472 0.049935 0.012600 0.039929 0.133894;\n\
+\n\
+[ main definition of EX2 with fixed component rates ]\n\
+model EX2 =MIX{BurEX2:0.672020808818762,ExpEX2:1.6413466609931};\n\
+\n\
+\n\
+[ ---------------------------------------------------------\n\
+    EX3 mixture model of Le, Lartillot & Gascuel (2008) \n\
+ --------------------------------------------------------- ]\n\
+\n\
+[ Buried component ]\n\
+model BurEX3 =\n\
+0.352598 \n\
+0.216996 1.087422 \n\
+0.292440 0.323465 7.797086 \n\
+2.610812 0.913640 1.460331 0.344397 \n\
+0.510610 5.128748 2.811070 0.773241 0.220223 \n\
+0.753729 1.090823 0.956820 12.012282 0.021022 10.123412 \n\
+2.838061 0.595013 0.884971 0.922298 0.707214 0.351856 0.713974 \n\
+0.239679 3.625577 7.108377 1.826237 0.481109 10.246488 0.839852 0.219310 \n\
+0.051496 0.102940 0.168735 0.024207 0.162795 0.087881 0.036973 0.004515 0.079975 \n\
+0.119849 0.316151 0.091984 0.018800 0.422679 0.648064 0.075035 0.016317 0.282195 2.225363 \n\
+0.443183 20.766910 3.194817 0.568138 0.132784 7.478955 4.176123 0.551523 1.415394 0.163276 0.207613 \n\
+0.460570 0.458210 0.398615 0.059146 0.765112 2.134261 0.313124 0.053192 0.340474 2.609469 4.476961 1.014674 \n\
+0.089411 0.056698 0.104720 0.027913 0.630095 0.094857 0.023275 0.034031 0.691151 0.491179 1.606618 0.077868 1.226530 \n\
+0.993370 0.419898 0.217106 0.273526 0.181230 0.729534 0.311152 0.192454 0.483200 0.040002 0.170402 0.376998 0.075002 0.057218 \n\
+6.108406 1.066008 5.182562 1.216396 5.236005 1.159086 0.763810 2.404073 0.924395 0.048875 0.084247 0.923997 0.260340 0.260617 1.208454 \n\
+1.992855 0.687262 2.181095 0.312299 2.276505 0.829879 0.551397 0.101409 0.480998 0.610331 0.198919 1.407257 1.292634 0.096955 0.648250 8.527249 \n\
+0.063159 0.855332 0.134012 0.099769 0.468450 0.329372 0.136731 0.169991 0.745868 0.056715 0.377293 0.137955 0.463394 2.343596 0.058650 0.211406 0.085948 \n\
+0.078057 0.341493 0.655744 0.241264 0.762740 0.302096 0.142491 0.040257 5.226086 0.092084 0.180292 0.311130 0.249838 8.141649 0.062812 0.267992 0.128044 3.047417 \n\
+1.339724 0.144916 0.125078 0.062854 1.481083 0.194081 0.225389 0.043663 0.090575 5.973306 0.993888 0.222252 0.964622 0.262045 0.207448 0.083450 1.544911 0.078358 0.105286 \n\
+\n\
+0.123992 0.016529 0.017595 0.015784 0.019325 0.015552 0.015939 0.049573 0.014540 0.126555 0.167605 0.011083 0.037438 0.058363 0.028849 0.042324 0.049207 0.011962 0.037833 0.139953;\n\
+\n\
+[ Intermediate component ]\n\
+model IntEX3 =\n\
+0.489239 \n\
+0.466919 0.536794 \n\
+0.601908 0.069474 4.603441 \n\
+2.430552 1.807414 0.997223 0.166431 \n\
+1.101971 2.081359 1.299123 0.508086 0.393348 \n\
+1.227777 0.215899 0.345545 3.579383 0.046861 3.113235 \n\
+1.873072 0.390054 1.528288 0.941969 0.867139 0.349219 0.406414 \n\
+0.519003 1.930915 5.003737 0.781887 1.630085 3.567804 0.324903 0.315383 \n\
+0.158722 0.180317 0.295816 0.013254 0.642786 0.179498 0.090830 0.013181 0.209208 \n\
+0.345026 0.503290 0.138767 0.024393 1.107569 1.027755 0.123806 0.048549 0.592981 5.439892 \n\
+0.610178 4.322929 1.524318 0.121994 0.181609 2.674484 0.792405 0.276766 0.591509 0.301836 0.294950 \n\
+0.949957 0.472702 0.502710 0.091008 1.283305 1.905885 0.242081 0.140301 0.378459 6.259505 9.391081 1.074513 \n\
+0.247271 0.069820 0.161809 0.028611 2.065479 0.077874 0.025753 0.065388 1.541097 1.306479 3.015722 0.048689 2.243101 \n\
+1.334722 0.170174 0.099375 0.211869 0.163190 0.349495 0.155436 0.186099 0.300496 0.065625 0.265961 0.162529 0.088677 0.083754 \n\
+5.316955 0.699036 4.526191 1.143652 5.249370 0.970695 0.438792 2.366185 0.939629 0.138819 0.275119 0.532771 0.521510 0.547761 1.187779 \n\
+1.963809 0.535034 2.034583 0.383040 2.012437 0.891145 0.531018 0.180104 0.467342 1.861944 0.395319 1.071879 2.340268 0.183984 0.400373 7.243848 \n\
+0.145693 0.378596 0.046601 0.048388 1.074147 0.174525 0.063777 0.168836 0.822524 0.110645 0.677913 0.062047 0.796395 3.502387 0.046950 0.290501 0.107097 \n\
+0.195764 0.149382 0.534652 0.105996 2.446201 0.150150 0.071967 0.031908 6.198893 0.299207 0.413150 0.090874 0.492692 15.039152 0.044765 0.328289 0.175204 3.125850 \n\
+2.227504 0.220361 0.150316 0.066496 3.112801 0.393451 0.444469 0.108811 0.224352 15.532696 2.152640 0.302279 2.658339 0.738053 0.322254 0.197018 2.507055 0.175763 0.276642 \n\
+\n\
+0.086346 0.080808 0.041727 0.064440 0.006654 0.052795 0.092110 0.048527 0.028831 0.040497 0.071679 0.079687 0.018007 0.025901 0.052632 0.052778 0.056138 0.010733 0.034744 0.054964;\n\
+\n\
+[ Highly exposed component ]\n\
+model HExEX3 =\n\
+0.557500 \n\
+0.467024 0.508965 \n\
+0.660464 0.044039 3.386724 \n\
+1.332582 3.667491 1.440486 0.185886 \n\
+1.402485 2.156104 1.297398 0.333117 0.789370 \n\
+1.259192 0.111162 0.245837 2.707953 0.058650 2.098300 \n\
+0.934526 0.393780 2.196372 0.868249 1.336358 0.322363 0.252359 \n\
+0.518929 3.157422 5.392488 0.748008 3.827563 4.517669 0.284167 0.634601 \n\
+0.279723 0.407537 0.535113 0.054030 3.345087 0.427624 0.148200 0.015686 0.658979 \n\
+0.715094 1.182387 0.270883 0.035162 3.520931 2.366650 0.172395 0.100089 1.779380 18.830270 \n\
+0.694526 3.728628 1.747648 0.083685 0.100399 2.477205 0.623294 0.280977 0.694965 0.569776 0.493141 \n\
+1.338414 1.261833 0.818216 0.054313 3.918703 2.383718 0.219943 0.228757 0.867786 19.605444 31.431195 1.089056 \n\
+0.295523 0.190129 0.263800 0.044853 5.266468 0.120909 0.042178 0.194665 3.494314 5.825792 11.527190 0.044361 6.237844 \n\
+1.085021 0.168461 0.041147 0.203765 0.185173 0.353420 0.218194 0.120292 0.375260 0.116875 0.705493 0.190747 0.139085 0.108823 \n\
+4.090024 0.852803 4.335615 0.829194 6.499129 1.095446 0.336922 1.733724 1.144100 0.413986 0.878828 0.631498 0.730416 1.167593 1.195720 \n\
+2.318400 0.650016 2.351068 0.385247 1.883085 1.167877 0.532167 0.187062 0.796107 4.825759 0.838744 1.268311 4.445757 0.381760 0.419944 7.677284 \n\
+0.134371 1.021826 0.151293 0.065183 3.716538 0.530580 0.077516 0.396559 1.324147 0.443432 3.290145 0.064651 4.411035 13.056874 0.056705 0.534908 0.408415 \n\
+0.212989 0.424870 1.115762 0.268883 8.874037 0.255572 0.125866 0.107717 14.436023 1.292209 1.491799 0.104026 2.063744 49.760746 0.057618 0.756357 0.396791 12.032322 \n\
+3.112666 0.544010 0.214411 0.125541 5.301703 0.868794 0.839508 0.215758 0.533676 46.074660 7.301056 0.557248 9.151909 2.634769 0.523205 0.564572 4.519860 0.456880 0.670812 \n\
+\n\
+0.094155 0.070537 0.052200 0.112406 0.002213 0.062733 0.165272 0.062302 0.019853 0.011154 0.019829 0.108860 0.006503 0.006873 0.070091 0.057931 0.046183 0.002449 0.008629 0.019827;\n\
+\n\
+[ main definition of EX3 with fixed component rates ]\n\
+model EX3 = MIX{BurEX3:0.427672756793791,IntEX3:0.837595938019774,HExEX3:1.51863631431518};\n\
+\n\
+[ ---------------------------------------------------------\n\
+    EHO mixture model of Le, Lartillot & Gascuel (2008)\n\
+ --------------------------------------------------------- ]\n\
+\n\
+[ extended component ]\n\
+model ExtEHO = \n\
+0.221750 \n\
+0.256487 0.595368 \n\
+0.447755 0.112310 7.769815 \n\
+4.893140 0.929131 1.061884 0.164472 \n\
+0.542660 2.886791 1.927072 0.497273 0.133291 \n\
+0.549459 0.290798 0.518264 5.393249 0.003776 4.326528 \n\
+5.411319 0.302948 0.907713 0.961651 1.249183 0.173873 0.316780 \n\
+0.283752 2.760038 5.159285 0.978418 0.737799 5.086066 0.421812 0.209276 \n\
+0.026683 0.053027 0.166715 0.016491 0.151942 0.055934 0.026726 0.001780 0.098605 \n\
+0.226816 0.251641 0.062256 0.015837 0.763554 0.537705 0.042909 0.032938 0.321607 3.217159 \n\
+0.235513 6.017300 2.543177 0.223507 0.023575 3.432847 1.211039 0.160545 0.671045 0.082221 0.106179 \n\
+0.992834 0.351969 0.415447 0.041511 1.271632 1.700679 0.111984 0.117596 0.326393 3.329162 7.496635 0.519821 \n\
+0.191967 0.041219 0.090517 0.014810 1.004694 0.042779 0.011177 0.040989 0.641267 0.813011 2.233318 0.023173 1.863238 \n\
+1.876507 0.395175 0.362650 0.550534 0.174031 0.731229 0.412907 0.205341 0.381717 0.011597 0.315127 0.393303 0.135360 0.043846 \n\
+6.066032 1.083228 5.612711 1.035540 4.263932 1.429211 0.766802 2.266299 1.074108 0.047896 0.147065 0.683291 0.352118 0.382422 1.462674 \n\
+1.827471 0.645132 1.883173 0.287521 1.395928 1.013709 0.781080 0.055140 0.512000 0.588357 0.142327 1.256445 1.435179 0.079647 0.417388 6.092548 \n\
+0.101419 0.452274 0.065206 0.034173 0.592031 0.164037 0.049674 0.183473 0.741383 0.069289 0.429275 0.050856 0.545447 2.178510 0.022770 0.304839 0.111242 \n\
+0.091914 0.112094 0.451176 0.108762 1.183567 0.132194 0.042952 0.030418 4.373360 0.122828 0.186938 0.096667 0.344096 8.276255 0.053251 0.325231 0.135310 2.597897 \n\
+1.970427 0.119016 0.091863 0.041044 1.750822 0.222903 0.225961 0.053387 0.123318 6.815243 1.427658 0.124284 1.427074 0.341263 0.127045 0.076658 1.052442 0.073165 0.101733 \n\
+\n\
+0.062087 0.053435 0.023743 0.032063 0.013132 0.034151 0.061042 0.030664 0.022696 0.104732 0.099541 0.054991 0.022312 0.045996 0.025392 0.045673 0.072789 0.012691 0.043790 0.139079;\n\
+\n\
+[ Helix component ]\n\
+model HelEHO = \n\
+0.346476 \n\
+0.374362 0.664870 \n\
+0.557349 0.079157 3.710526 \n\
+3.192474 1.027228 0.891196 0.006722 \n\
+0.776545 1.902860 1.561002 0.517360 0.112028 \n\
+0.841893 0.158406 0.443065 3.792847 0.000006 2.320685 \n\
+4.037113 0.661209 1.866962 1.144918 1.465540 0.511489 0.573208 \n\
+0.394225 2.123760 5.845902 0.737868 1.084909 3.960964 0.270146 0.380762 \n\
+0.111350 0.099645 0.233216 0.005627 0.839533 0.089484 0.019520 0.021251 0.132153 \n\
+0.193017 0.307622 0.115495 0.009651 1.136538 0.584189 0.039838 0.048105 0.485901 4.915707 \n\
+0.481682 3.827872 1.926308 0.163314 0.021755 2.487895 0.768919 0.327002 0.534206 0.147053 0.136159 \n\
+0.610432 0.344033 0.452639 0.035659 1.624032 1.146169 0.103241 0.171164 0.364836 6.260678 7.738615 0.549401 \n\
+0.147278 0.035167 0.106276 0.018468 1.864906 0.047207 0.010268 0.086543 1.244539 0.927331 3.243633 0.016265 2.326533 \n\
+1.090575 0.181605 0.093658 0.386490 0.097655 0.462559 0.290152 0.568098 0.458437 0.043237 0.207460 0.198291 0.061027 0.067592 \n\
+6.243684 0.836138 5.633664 0.952131 6.398291 1.267404 0.430602 5.463144 1.088326 0.102127 0.193860 0.707365 0.438507 0.470620 1.534272 \n\
+2.847158 0.566364 2.984732 0.347047 3.711971 1.083181 0.495700 0.500029 0.642773 1.698955 0.402699 1.111399 2.483456 0.231119 0.685164 8.832473 \n\
+0.090983 0.369015 0.085583 0.046821 0.950521 0.183299 0.040785 0.391093 0.950288 0.075780 0.624335 0.041505 0.980672 3.915972 0.053806 0.299723 0.100663 \n\
+0.152848 0.170981 0.594708 0.106099 2.051641 0.121416 0.047614 0.064377 8.167042 0.195540 0.352598 0.069186 0.465779 15.178886 0.058255 0.405459 0.201603 4.035822 \n\
+2.140511 0.136453 0.145376 0.046174 4.011687 0.191618 0.192292 0.202844 0.174981 14.460840 2.175028 0.136317 2.393838 0.659302 0.418505 0.180248 3.585329 0.175143 0.281722 \n\
+\n\
+0.121953 0.076798 0.032215 0.066765 0.006842 0.061304 0.131841 0.026596 0.020392 0.047287 0.087919 0.084679 0.020970 0.024145 0.025871 0.042103 0.038715 0.008346 0.023841 0.051421;\n\
+\n\
+[ Other component ]\n\
+model OthEHO =\n\
+0.529263 \n\
+0.379476 0.612335 \n\
+0.516691 0.067732 4.012914 \n\
+3.774890 1.615176 0.888663 0.165810 \n\
+1.312262 2.913667 1.533683 0.442262 0.337571 \n\
+1.403437 0.154460 0.333334 3.815893 0.015567 3.743866 \n\
+1.272402 0.389317 1.243222 0.661976 0.554904 0.332656 0.319770 \n\
+0.558733 2.816641 4.803000 0.761339 1.223662 4.889028 0.323617 0.300981 \n\
+0.124057 0.155080 0.219635 0.019097 0.560959 0.100743 0.038076 0.005599 0.184752 \n\
+0.340362 0.580087 0.119838 0.015948 1.192857 1.156516 0.083154 0.031031 0.646292 7.873544 \n\
+0.706732 5.734632 1.847806 0.128114 0.050896 3.616626 1.131071 0.283950 0.643558 0.179831 0.224320 \n\
+1.056749 0.665355 0.399943 0.053900 1.893946 2.299714 0.168079 0.085094 0.556024 8.136055 14.213193 0.931689 \n\
+0.233961 0.079465 0.130295 0.016768 1.902244 0.077611 0.012655 0.048906 1.403178 1.581816 4.275863 0.036062 2.888633 \n\
+1.518830 0.252482 0.049484 0.171011 0.108909 0.501196 0.346600 0.058913 0.299924 0.073007 0.297573 0.249478 0.091619 0.068920 \n\
+5.595735 0.861017 3.749627 0.987083 4.952776 1.045071 0.463265 1.190738 0.897478 0.131753 0.265701 0.607097 0.399537 0.408758 0.993614 \n\
+2.157458 0.613623 1.733380 0.361861 2.145775 1.011592 0.523086 0.091023 0.450662 1.492403 0.408418 1.143233 2.378569 0.131777 0.381007 7.574340 \n\
+0.151895 0.544292 0.060182 0.043433 1.259614 0.228038 0.045082 0.134804 0.748147 0.134416 0.979277 0.038787 0.908253 4.850762 0.052415 0.249753 0.114232 \n\
+0.219509 0.243507 0.580103 0.130214 2.325021 0.196580 0.079660 0.037482 6.907609 0.299245 0.552917 0.067894 0.685250 19.404995 0.047839 0.323207 0.183044 4.704884 \n\
+3.049976 0.278740 0.134120 0.055382 4.149385 0.500946 0.435957 0.067170 0.214393 22.435652 2.883298 0.323886 3.369448 0.722571 0.315978 0.152899 2.423398 0.186495 0.303833 \n\
+\n\
+0.076458 0.052393 0.055429 0.088634 0.007473 0.040671 0.080952 0.100192 0.025439 0.031730 0.053100 0.070835 0.014039 0.023159 0.087111 0.063636 0.055346 0.007033 0.023779 0.042590;\n\
+\n\
+[ main definition of EHO with fixed component rates ]\n\
+model EHO = MIX{ExtEHO:0.720274356,HelEHO:0.976798797,OthEHO:0.783109376};\n\
+\n\
+\n\
+[ ---------------------------------------------------------\n\
+    UL2 mixture model of Le, Lartillot & Gascuel (2008)\n\
+ --------------------------------------------------------- ]\n\
+\n\
+model M1_UL2 =\n\
+0.267149 \n\
+0.211944 0.816250 \n\
+0.156648 0.336150 3.110967 \n\
+2.402535 1.001114 1.287205 0.467161 \n\
+0.301870 3.168646 1.844180 0.571540 0.394361 \n\
+0.503678 1.529332 0.788530 3.920399 0.234553 8.502278 \n\
+3.124853 0.171548 0.220006 0.250690 0.766651 0.174653 0.399019 \n\
+0.139279 1.597241 5.622886 2.146897 0.349557 8.097306 1.211287 0.044878 \n\
+0.037158 0.139068 0.189483 0.049336 0.147864 0.122799 0.153664 0.006928 0.085276 \n\
+0.108752 0.387538 0.092568 0.035815 0.399254 0.617370 0.225586 0.018972 0.202328 2.343778 \n\
+0.255267 15.176345 1.030178 0.196011 0.396427 3.731061 2.642525 0.142626 0.878376 0.319044 0.422741 \n\
+0.430988 0.522887 0.351960 0.102916 0.683070 2.247889 0.621957 0.070803 0.228871 2.780325 4.767336 1.450453 \n\
+0.088392 0.116382 0.114044 0.066251 0.668683 0.133418 0.075116 0.039034 0.780377 0.488538 1.586897 0.143427 1.211385 \n\
+1.303487 0.178064 0.192016 0.065259 0.315140 0.406966 0.144065 0.135536 0.273070 0.087171 0.298010 0.087701 0.165232 0.104423 \n\
+7.472990 0.579607 3.004054 0.854304 5.789930 0.930019 0.709540 2.018826 0.527351 0.051443 0.070322 0.432286 0.281917 0.286341 0.473611 \n\
+2.276542 0.392852 1.332166 0.193248 2.577504 0.541748 0.690939 0.052900 0.272814 0.634227 0.224553 0.795413 1.360016 0.120449 0.745729 6.088861 \n\
+0.048841 0.673695 0.076107 0.073261 0.377566 0.284556 0.284138 0.130136 0.649073 0.047797 0.324911 0.148403 0.390301 2.189403 0.122493 0.131225 0.080727 \n\
+0.073190 0.425791 0.503951 0.250485 0.577049 0.306036 0.198368 0.024991 3.987606 0.083215 0.127898 0.372637 0.179514 7.784255 0.089874 0.175724 0.117177 2.629196 \n\
+1.351002 0.175990 0.120675 0.105544 1.491339 0.203270 0.463186 0.055506 0.065132 6.411609 1.020423 0.337618 1.047308 0.272790 0.407545 0.079844 1.634833 0.077263 0.083195 \n\
+\n\
+0.122413 0.017757 0.020209 0.012086 0.018894 0.014525 0.009897 0.045663 0.020120 0.124002 0.168915 0.011684 0.037631 0.063612 0.023347 0.039268 0.046707 0.015603 0.050968 0.136701;\n\
+\n\
+\n\
+model M2_UL2 =\n\
+0.557363 \n\
+0.539068 0.465628 \n\
+0.696831 0.032997 3.879799 \n\
+1.480953 4.566841 1.777582 0.310752 \n\
+1.402193 1.920868 1.276554 0.327085 0.972350 \n\
+1.335667 0.096752 0.255510 2.685052 0.088385 2.281328 \n\
+1.056193 0.423348 2.171283 0.933450 1.398738 0.369406 0.334900 \n\
+0.729300 2.712485 5.461073 0.679965 5.202985 4.012284 0.282038 0.585359 \n\
+0.267035 0.493033 0.523699 0.023230 2.563394 0.459103 0.176281 0.010013 0.551901 \n\
+0.700687 0.932999 0.206875 0.025161 3.939537 1.918986 0.154733 0.085684 1.446302 8.189198 \n\
+0.736759 3.603558 1.676442 0.070721 0.292188 2.403019 0.611829 0.307607 0.675279 0.627044 0.410941 \n\
+1.505101 0.819561 0.736222 0.089302 4.462071 2.539203 0.250970 0.204790 0.654198 11.105816 15.171688 1.258549 \n\
+0.541573 0.185468 0.343735 0.042217 5.958046 0.156533 0.064557 0.188906 3.891682 3.152154 5.098336 0.088022 4.518197 \n\
+1.155460 0.142408 0.044854 0.175385 0.123605 0.316005 0.157783 0.157894 0.347393 0.047328 0.344717 0.153954 0.054635 0.108793 \n\
+3.823040 0.733964 4.846938 0.890611 7.416660 0.987912 0.343107 2.296896 1.193558 0.368432 0.667347 0.535051 0.754875 1.469714 1.242760 \n\
+1.897039 0.590040 2.371940 0.347041 1.619173 1.025240 0.479587 0.210934 0.728868 5.106169 0.726618 1.152768 3.985684 0.433442 0.358997 9.007029 \n\
+0.296375 0.833840 0.091310 0.080326 5.217767 0.363445 0.078944 0.378088 1.571919 0.351013 2.139511 0.098671 2.796573 6.102504 0.023698 0.665667 0.292919 \n\
+0.297444 0.206563 0.871576 0.173621 11.803422 0.181973 0.110832 0.073892 12.757344 1.161331 1.646025 0.101481 1.732368 29.335598 0.037045 0.706902 0.346859 5.666524 \n\
+2.765737 0.415803 0.194725 0.093474 5.264577 0.734884 0.683342 0.156374 0.517626 26.038986 3.741256 0.457775 5.253478 1.999427 0.297563 0.344932 4.012753 0.385172 0.870088 \n\
+\n\
+0.087622 0.083588 0.048847 0.098882 0.002815 0.062809 0.143166 0.055391 0.023310 0.015495 0.032465 0.102135 0.009511 0.008409 0.069323 0.057733 0.051876 0.003945 0.014462 0.028216;\n\
+\n\
+model UL2 = MIX{M1_UL2:0.581348617,M2_UL2:1.465482789};\n\
+\n\
+\n\
+[ ---------------------------------------------------------\n\
+    UL3 mixture model of Le, Lartillot & Gascuel (2008)\n\
+ --------------------------------------------------------- ]\n\
+\n\
+model Q1_UL3 =\n\
+0.514865 \n\
+0.774348 0.583403 \n\
+0.854291 0.046141 2.011233 \n\
+1.019817 5.652322 2.260587 0.057603 \n\
+1.095968 1.696154 1.296536 0.417322 0.967032 \n\
+1.054599 0.084924 0.368384 3.592374 0.063073 1.885301 \n\
+3.510012 0.797055 1.759631 1.421695 2.627911 0.743770 0.772359 \n\
+0.694799 2.596186 4.214186 0.654590 6.673533 3.664595 0.294967 0.608220 \n\
+0.344837 0.543739 0.965435 0.062495 2.500862 0.452448 0.155720 0.083334 0.905291 \n\
+0.593987 0.857922 0.351903 0.045358 3.290242 1.421539 0.109100 0.230693 1.595696 5.042430 \n\
+0.708843 2.012940 1.662582 0.106190 0.329149 2.268825 0.579185 0.365374 0.696286 0.701896 0.398546 \n\
+0.990080 0.754111 0.910436 0.143464 3.570847 1.708803 0.181804 0.706982 0.789517 8.138995 13.390024 1.137779 \n\
+0.085639 0.012721 0.098898 0.018361 2.148695 0.012425 0.009316 0.135782 0.921964 1.006572 2.479349 0.014715 1.418875 \n\
+0.655013 0.150052 0.120388 0.698261 0.254951 0.353826 0.250818 0.715043 0.329691 0.170251 0.827093 0.187804 0.178490 0.048299 \n\
+2.863328 0.657706 3.761619 0.619692 9.817007 0.810603 0.344050 6.758412 0.997214 0.414623 0.625678 0.555290 0.647617 0.392859 0.929152 \n\
+1.373936 0.392433 2.711122 0.237865 2.460302 0.701472 0.319136 0.607889 0.728133 3.705396 0.412346 0.953939 2.446017 0.054119 0.279699 9.934970 \n\
+0.247598 0.514750 0.144529 0.157484 5.383077 0.199950 0.045688 0.790171 1.116595 0.243053 1.738186 0.070214 3.427855 3.275850 0.007577 0.583988 0.205721 \n\
+0.090644 0.046952 0.326197 0.089450 7.475195 0.018555 0.020706 0.016617 3.728614 0.404819 0.617948 0.029889 0.956437 47.933104 0.050416 0.181180 0.070113 5.242459 \n\
+2.093798 0.323334 0.307076 0.101486 8.553531 0.473023 0.410909 0.459941 0.568017 13.906640 1.778101 0.426825 2.763369 0.570421 0.311278 0.389524 2.915452 0.252168 0.268516 \n\
+\n\
+0.104307 0.092553 0.043722 0.085643 0.003218 0.074342 0.163928 0.024243 0.022216 0.016012 0.038591 0.105577 0.011434 0.016126 0.018057 0.061232 0.061373 0.004086 0.020876 0.032465;\n\
+\n\
+model Q2_UL3 =\n\
+1.709484 \n\
+0.184309 0.860448 \n\
+0.660851 0.182073 4.471383 \n\
+4.554487 2.843438 1.801073 1.068728 \n\
+3.425703 6.092362 2.868388 0.790473 0.794773 \n\
+4.278840 0.359055 0.585031 4.176143 0.121031 6.860012 \n\
+0.625715 1.054231 1.222442 0.492366 1.418419 0.796035 0.643251 \n\
+1.089116 6.396197 8.965630 1.915247 2.033352 11.058341 0.768162 0.523196 \n\
+0.024545 0.023433 0.014686 0.002204 0.628823 0.008720 0.008363 0.002485 0.046726 \n\
+0.150945 0.140520 0.002514 0.000212 1.903535 0.384413 0.015127 0.010251 0.210723 5.066207 \n\
+1.751314 12.981698 3.641808 0.278298 0.036599 7.677610 2.744099 0.612733 1.686490 0.042380 0.023858 \n\
+0.475876 0.364580 0.063143 0.001486 3.890832 0.754732 0.041044 0.024222 0.236955 5.752463 12.019762 0.229898 \n\
+0.142125 0.051255 0.006503 0.000593 5.397699 0.064190 0.006871 0.015588 0.424840 1.005341 5.458275 0.021422 1.779060 \n\
+5.433246 1.051312 0.012611 0.027267 0.635181 1.765792 0.849429 0.023324 0.610884 0.000184 0.037705 0.604166 0.001415 0.003197 \n\
+6.267113 1.750009 5.986041 1.411952 5.482009 1.923966 0.595886 0.943724 1.786620 0.043381 0.066093 0.813893 0.053557 0.199095 1.723045 \n\
+6.389458 1.828974 2.044599 1.561907 2.083626 2.070125 1.210529 0.217976 1.192222 0.515450 0.199809 2.020941 1.238100 0.150760 1.727569 9.882473 \n\
+0.281689 1.180712 0.000006 0.017218 3.696424 0.146508 0.068518 0.222418 0.497727 0.199828 1.849405 0.001429 1.394852 2.473491 0.016401 0.288550 0.190290 \n\
+0.302638 0.475135 0.196905 0.067615 6.355457 0.576342 0.232832 0.059485 4.525509 0.571811 1.194578 0.006674 0.467694 8.107893 0.024556 0.394389 0.441794 4.067825 \n\
+11.333027 0.298555 0.053673 0.009846 5.743238 0.296166 0.413471 0.120393 0.105418 9.130937 2.674960 0.165290 4.417978 1.811161 0.492985 0.042803 3.284174 0.844277 2.327679 \n\
+\n\
+0.044015 0.021591 0.056258 0.102405 0.003260 0.018409 0.041364 0.168549 0.015843 0.064277 0.118096 0.036061 0.027358 0.036695 0.124914 0.047807 0.022696 0.005866 0.018644 0.025892;\n\
+\n\
+\n\
+model Q3_UL3 =\n\
+0.063622 \n\
+0.118948 0.528684 \n\
+0.065502 0.142677 12.092355 \n\
+2.010382 0.302352 1.127688 0.014546 \n\
+0.169022 2.026184 1.256016 0.417582 0.170493 \n\
+0.172876 0.453837 0.454428 1.882165 0.045799 4.705997 \n\
+5.254550 0.174422 0.364886 0.192790 0.891120 0.148450 0.195211 \n\
+0.090586 1.258840 5.523808 0.313487 0.211550 4.734918 0.466811 0.096529 \n\
+0.034911 0.065167 0.222440 0.023060 0.132230 0.122571 0.075521 0.003942 0.065261 \n\
+0.146425 0.219004 0.136129 0.028165 0.448432 0.795591 0.146014 0.016718 0.240024 2.387089 \n\
+0.041117 11.082325 0.783756 0.049843 0.039616 1.828161 0.649991 0.069199 0.271006 0.094864 0.140698 \n\
+0.599734 0.230551 0.595641 0.059404 0.699534 2.765355 0.391569 0.089210 0.206188 3.020110 2.806927 0.516762 \n\
+0.148889 0.145244 0.408811 0.089283 0.724422 0.260910 0.101509 0.101882 1.508086 0.693424 0.709933 0.061880 1.887041 \n\
+0.378131 0.225548 0.181924 0.038283 0.146379 0.511097 0.151769 0.166424 0.386101 0.186116 0.753595 0.182723 0.420131 0.341199 \n\
+8.340091 0.495564 3.010756 0.463573 5.601734 0.985082 0.415256 2.532014 0.720035 0.067860 0.121784 0.279698 0.626080 0.788724 0.890779 \n\
+1.932342 0.358779 1.069003 0.093122 2.028674 0.637861 0.597230 0.019551 0.211054 0.870341 0.381670 0.461083 2.131453 0.255120 0.604567 3.450887 \n\
+0.031419 0.378744 0.085588 0.008303 0.277628 0.277906 0.122038 0.055246 0.420382 0.021973 0.127345 0.027776 0.139098 3.077241 0.163299 0.177114 0.062650 \n\
+0.106283 0.366786 1.453278 0.404793 0.753053 0.475024 0.273992 0.055936 7.367304 0.094629 0.126746 0.148716 0.369726 5.251757 0.232549 0.676796 0.247828 2.910495 \n\
+1.104848 0.069287 0.110149 0.084224 1.280832 0.175361 0.342585 0.047507 0.038768 8.415916 1.577573 0.054532 1.537983 0.409619 0.309028 0.094413 1.483411 0.060147 0.076595 \n\
+\n\
+0.134055 0.044392 0.020730 0.020801 0.021008 0.024509 0.028587 0.029069 0.028138 0.105826 0.117458 0.049732 0.026120 0.038984 0.016306 0.038369 0.055334 0.017276 0.039905 0.143398;\n\
+\n\
+model UL3 = MIX{Q1_UL3:0.484340397,Q2_UL3:0.492780514,Q3_UL3:1.15597274};\n\
+\n\
+\n\
+[ ---------------------------------------------------------\n\
+    EX_EHO mixture model of Le & Gascuel (2010)\n\
+ --------------------------------------------------------- ]\n\
+\n\
+\n\
+model BUR_EXT =\n\
+0.228492 \n\
+0.165543 0.916344 \n\
+0.238509 0.258514 8.498064 \n\
+3.374029 1.037434 1.667702 0.332072 \n\
+0.344742 4.971495 2.471912 0.654950 0.130301 \n\
+0.417921 1.039226 0.875808 13.073209 0.040759 9.834742 \n\
+4.248714 0.411876 0.585570 0.748848 0.908311 0.221633 0.593504 \n\
+0.182762 3.872065 6.999812 1.719470 0.493863 8.695395 0.749303 0.137367 \n\
+0.011705 0.090751 0.149898 0.021996 0.077693 0.043664 0.013820 0.001527 0.073342 \n\
+0.133793 0.286232 0.065118 0.015540 0.456304 0.546974 0.052641 0.024196 0.226460 2.160734 \n\
+0.249141 17.756919 3.385483 0.343780 0.093875 6.677050 2.745017 0.295602 1.481997 0.100576 0.167406 \n\
+0.641194 0.342577 0.427146 0.059345 0.867233 2.306480 0.218260 0.058613 0.358032 2.187901 5.151337 0.750049 \n\
+0.118366 0.068606 0.102572 0.009357 0.633943 0.033356 0.012944 0.024474 0.497973 0.534407 1.581972 0.063281 1.329239 \n\
+1.561052 0.483968 0.385170 0.261437 0.310131 0.913924 0.355871 0.175520 0.512823 0.019789 0.295416 0.348527 0.104569 0.059641 \n\
+5.891807 1.320618 5.737159 1.074011 4.702782 1.389531 0.878480 2.178078 1.111068 0.033343 0.094349 1.035903 0.327901 0.292022 1.344678 \n\
+2.059884 0.976165 2.166428 0.369522 1.951862 0.815145 0.575774 0.060834 0.558388 0.422299 0.153549 1.793263 1.268126 0.085468 0.780914 9.031309 \n\
+0.081683 0.814216 0.057557 0.055146 0.450959 0.191881 0.109420 0.144367 0.651978 0.068649 0.345622 0.169527 0.387902 1.883741 0.023466 0.309129 0.111568 \n\
+0.052650 0.248907 0.570101 0.180267 0.701260 0.253975 0.061388 0.025465 4.206114 0.083799 0.147600 0.226848 0.254720 6.549427 0.027521 0.283138 0.141408 2.561108 \n\
+1.355342 0.137437 0.104597 0.051387 1.203830 0.218892 0.194527 0.031054 0.088935 4.577473 1.003647 0.153722 0.883283 0.242657 0.191295 0.068785 0.990922 0.056276 0.078264 \n\
+\n\
+0.087158 0.015906 0.012970 0.012566 0.020325 0.013301 0.013777 0.039603 0.014597 0.161107 0.147775 0.011033 0.031334 0.064281 0.013322 0.035417 0.048583 0.012672 0.045210 0.199064;\n\
+\n\
+\n\
+model BUR_HEL =\n\
+0.317211 \n\
+0.209784 1.120865 \n\
+0.315205 0.301050 7.439896 \n\
+2.214446 0.884449 1.356293 0.110768 \n\
+0.465495 4.319791 2.843187 1.082540 0.215988 \n\
+0.668735 0.901135 0.986572 11.245156 0.009874 7.561773 \n\
+3.614157 0.568883 0.972660 1.036117 0.894733 0.409083 0.780808 \n\
+0.249929 3.138701 7.344935 1.747672 0.379845 9.559763 0.842239 0.146008 \n\
+0.059633 0.103290 0.206475 0.017492 0.286194 0.123433 0.037593 0.010910 0.071273 \n\
+0.096230 0.285199 0.113728 0.015874 0.439724 0.547078 0.063675 0.021607 0.303531 2.097349 \n\
+0.380075 15.783354 2.780107 0.569108 0.093004 6.179905 3.209588 0.413960 1.002075 0.185911 0.185249 \n\
+0.371379 0.411553 0.398602 0.076761 0.727245 1.665645 0.249045 0.068128 0.256194 2.940308 3.649539 0.972247 \n\
+0.075616 0.043519 0.096446 0.041118 0.636688 0.102460 0.039991 0.041269 0.839126 0.376556 1.551814 0.064774 1.173962 \n\
+1.100574 0.385197 0.319458 0.353000 0.112549 0.805706 0.369483 0.482895 0.520098 0.058167 0.144341 0.361488 0.074069 0.057968 \n\
+6.832958 0.955160 5.296628 1.265211 6.144756 1.315182 0.902504 3.903795 0.862633 0.072343 0.080478 0.979654 0.330305 0.328917 1.924898 \n\
+2.223205 0.445571 2.461831 0.299635 2.943208 0.830637 0.621903 0.184055 0.468356 0.911139 0.208091 1.343261 1.515339 0.158763 0.915879 9.298787 \n\
+0.062541 0.806724 0.110928 0.132125 0.414525 0.388313 0.191952 0.271274 0.909529 0.025790 0.343842 0.099137 0.543577 2.467147 0.044938 0.215329 0.087955 \n\
+0.082948 0.329591 0.693402 0.286594 0.866329 0.259566 0.167425 0.049038 6.332054 0.093136 0.177755 0.275998 0.261754 8.344684 0.088981 0.335859 0.137177 3.125017 \n\
+1.390479 0.142986 0.175068 0.106294 1.687293 0.159520 0.297915 0.080925 0.085103 6.414688 0.953785 0.240157 1.097345 0.264988 0.373870 0.144230 2.572837 0.089110 0.115941 \n\
+\n\
+0.158060 0.021566 0.016487 0.014079 0.016937 0.020232 0.023096 0.032822 0.014618 0.114447 0.198900 0.014668 0.042840 0.053434 0.015640 0.037275 0.043095 0.012211 0.036330 0.113263;\n\
+\n\
+model BUR_OTH =\n\
+0.406682 \n\
+0.246649 0.848592 \n\
+0.364260 0.198690 4.535840 \n\
+3.292044 0.837291 1.295138 0.420726 \n\
+0.735862 4.205085 2.062501 0.427451 0.259335 \n\
+0.954795 0.673046 0.671062 8.395674 0.048284 8.922739 \n\
+1.958847 0.573207 0.632317 0.572264 0.486274 0.345345 0.650009 \n\
+0.312042 2.699661 4.969855 1.181781 0.551188 7.620453 0.701108 0.195346 \n\
+0.071000 0.127041 0.184028 0.030240 0.180591 0.065984 0.039235 0.005033 0.098525 \n\
+0.142298 0.338853 0.086876 0.026095 0.484427 0.867777 0.087780 0.017129 0.309774 3.477136 \n\
+0.624622 18.390649 2.748646 0.442886 0.238266 6.993941 3.906971 0.652336 1.365814 0.219252 0.288480 \n\
+0.610604 0.581287 0.382156 0.048508 0.963147 2.672887 0.384585 0.051334 0.386066 3.752286 6.858529 1.524446 \n\
+0.124670 0.047666 0.102656 0.031532 0.699124 0.129867 0.004923 0.039185 0.701690 0.643782 2.019473 0.104308 1.568249 \n\
+1.126387 0.321347 0.107738 0.137858 0.150346 0.601413 0.310374 0.073794 0.332910 0.056230 0.208204 0.368816 0.078902 0.062410 \n\
+5.908551 0.834735 3.611589 0.969189 4.765870 0.881934 0.528944 1.439305 0.746876 0.060111 0.114374 0.784754 0.235963 0.219009 0.710100 \n\
+1.856381 0.574277 1.573584 0.223054 2.038789 0.763848 0.461329 0.076195 0.396095 0.701247 0.249302 1.091322 1.282643 0.070553 0.419070 6.616977 \n\
+0.069294 0.654056 0.127255 0.078896 0.517561 0.188732 0.125541 0.104279 0.547504 0.066927 0.454998 0.056498 0.425274 2.668838 0.050943 0.151483 0.062698 \n\
+0.128158 0.354167 0.640140 0.182565 0.793990 0.368725 0.157796 0.037084 4.307140 0.140691 0.241076 0.323966 0.293629 9.711414 0.060323 0.207489 0.111492 2.857446 \n\
+1.982761 0.158227 0.115545 0.051117 2.065903 0.338262 0.258245 0.045770 0.089942 10.113118 1.382024 0.431385 1.456614 0.295718 0.273919 0.066465 1.668063 0.113899 0.144981 \n\
+\n\
+0.102123 0.021199 0.032404 0.032350 0.018985 0.017469 0.017625 0.089270 0.021090 0.083642 0.123866 0.012720 0.029789 0.055399 0.072705 0.061298 0.061705 0.013496 0.039682 0.093184;\n\
+\n\
+model EXP_EXT=\n\
+0.464716 \n\
+0.597009 0.420578 \n\
+1.010693 0.048553 5.944290 \n\
+3.915828 2.088244 0.878468 0.236108 \n\
+1.156023 1.882317 1.435926 0.338823 0.482742 \n\
+1.131098 0.127150 0.346338 3.317186 0.061060 2.724696 \n\
+4.638659 0.351041 1.379174 1.216518 1.396050 0.199361 0.353970 \n\
+0.657615 2.215990 4.150252 0.717363 1.853969 3.768864 0.347165 0.313421 \n\
+0.078558 0.127092 0.347281 0.032361 0.605448 0.171553 0.104678 0.010608 0.309418 \n\
+0.516672 0.510585 0.105529 0.039188 1.808273 1.017577 0.112010 0.044661 0.772131 5.693102 \n\
+0.519389 3.571104 1.844049 0.109305 0.103105 2.232749 0.653339 0.195325 0.547017 0.219311 0.253086 \n\
+1.658261 0.640712 0.558751 0.063591 1.694880 2.088441 0.194697 0.291701 0.321392 6.220456 12.392618 0.862547 \n\
+0.426071 0.064894 0.132019 0.034872 2.076573 0.085745 0.026972 0.099963 1.388250 1.765294 3.859637 0.032198 3.134107 \n\
+3.082729 0.250470 0.232578 0.376163 0.290522 0.502379 0.240501 0.302007 0.283950 0.013574 0.606936 0.248475 0.226716 0.058246 \n\
+7.012884 0.866957 5.008997 0.814153 4.758346 1.192080 0.595351 2.514269 0.993487 0.135167 0.349525 0.542021 0.512591 0.744682 1.258172 \n\
+2.037755 0.446367 1.618299 0.203392 1.177421 0.840646 0.583757 0.071515 0.466886 1.503883 0.260405 0.934230 2.245607 0.123552 0.258896 4.504833 \n\
+0.171334 0.385971 0.087717 0.019596 1.015512 0.127027 0.037725 0.217844 0.822780 0.095756 0.777332 0.039952 0.977419 3.217291 0.015240 0.301259 0.102153 \n\
+0.194998 0.091803 0.433021 0.086495 3.074882 0.111578 0.041481 0.048438 4.904785 0.336528 0.411742 0.087476 0.640594 14.126821 0.061656 0.338111 0.129249 2.902137 \n\
+2.811391 0.216605 0.127240 0.061503 2.320268 0.390874 0.450783 0.132513 0.234279 12.181354 2.539512 0.233848 3.363159 0.717467 0.138035 0.159602 1.615372 0.132268 0.186175 \n\
+\n\
+0.043140 0.090761 0.034408 0.052848 0.006370 0.053817 0.107749 0.024812 0.029498 0.049134 0.050167 0.098127 0.013722 0.025841 0.037395 0.056505 0.094326 0.012045 0.039238 0.080099;\n\
+\n\
+model EXP_HEL =\n\
+0.434227 \n\
+0.551823 0.569806 \n\
+0.698268 0.056291 3.064314 \n\
+2.026002 2.379205 1.077282 0.016649 \n\
+0.986617 1.606282 1.331570 0.426399 0.409724 \n\
+1.005936 0.120122 0.390888 2.999742 0.021217 1.881156 \n\
+3.221202 0.736168 2.269617 1.272893 1.771711 0.622430 0.656603 \n\
+0.515574 2.032567 5.484997 0.666491 2.985549 3.380526 0.265244 0.557878 \n\
+0.200810 0.241566 0.441585 0.009830 1.541200 0.198621 0.069562 0.043838 0.339616 \n\
+0.328669 0.583849 0.178015 0.022077 2.045404 1.046125 0.089148 0.104708 0.875298 8.628242 \n\
+0.598864 3.090263 1.682415 0.113637 0.207957 2.085253 0.582536 0.376534 0.554395 0.371883 0.290692 \n\
+0.799278 0.528354 0.704087 0.062290 2.303849 1.507620 0.173293 0.356580 0.492228 10.028453 12.162732 0.867109 \n\
+0.256227 0.083117 0.192262 0.030759 4.328951 0.078062 0.022890 0.181917 2.406824 2.014776 4.856941 0.041675 3.521229 \n\
+1.118844 0.147481 0.061969 0.323498 0.171678 0.387521 0.237715 0.641036 0.433529 0.069102 0.359935 0.164055 0.063832 0.126592 \n\
+5.069051 0.749554 5.245486 0.840686 7.114530 1.177802 0.382956 6.139836 1.086779 0.194824 0.424579 0.655759 0.682174 0.753148 1.355810 \n\
+2.949741 0.623328 3.248881 0.406219 3.345739 1.214278 0.538553 0.867954 0.747654 3.316346 0.754081 1.193593 3.516479 0.366653 0.622665 7.975653 \n\
+0.115446 0.394156 0.090971 0.055309 1.947845 0.185912 0.046886 0.451084 1.173014 0.277029 1.078778 0.054622 1.516237 5.813526 0.071865 0.359167 0.106921 \n\
+0.205680 0.197878 0.678775 0.118188 4.183184 0.139485 0.059999 0.051336 10.200670 0.507328 0.721921 0.086974 0.741023 24.191458 0.046460 0.489820 0.247367 4.904042 \n\
+2.494211 0.280293 0.235248 0.083648 5.509932 0.429196 0.409105 0.447130 0.351675 23.404006 3.840750 0.300727 4.126659 1.483049 0.675560 0.336101 4.426709 0.309940 0.588217 \n\
+\n\
+0.115826 0.094038 0.037357 0.085821 0.003363 0.073078 0.167709 0.025416 0.021634 0.024147 0.050238 0.106612 0.013318 0.013330 0.029895 0.044902 0.037901 0.006460 0.018548 0.030407;\n\
+\n\
+model EXP_OTH =\n\
+0.603175 \n\
+0.478745 0.562615 \n\
+0.608325 0.056553 3.755571 \n\
+2.371839 2.480665 0.889513 0.170707 \n\
+1.551117 2.685995 1.462350 0.424139 0.669728 \n\
+1.624084 0.129505 0.314826 3.404205 0.049823 3.375473 \n\
+0.987777 0.356744 1.294077 0.640234 0.583980 0.331879 0.304731 \n\
+0.667236 2.788429 4.719171 0.731257 1.872668 4.612209 0.316233 0.320454 \n\
+0.186911 0.269245 0.318538 0.028464 0.987958 0.242926 0.090427 0.007312 0.327205 \n\
+0.527992 0.844027 0.167295 0.021423 1.623589 1.636879 0.135662 0.044560 0.939347 10.338048 \n\
+0.842575 5.076266 1.736167 0.106076 0.132985 3.365869 0.969736 0.270931 0.669196 0.356829 0.352830 \n\
+1.296147 0.863599 0.469732 0.075018 1.832599 2.642602 0.217378 0.107935 0.624941 10.670411 17.593544 1.247987 \n\
+0.325034 0.135328 0.192352 0.021631 2.731423 0.103263 0.027708 0.060740 2.148472 2.344767 5.497995 0.057563 3.278627 \n\
+1.670091 0.235642 0.042844 0.164518 0.112539 0.479958 0.326780 0.057540 0.291899 0.110067 0.380466 0.240061 0.109541 0.083760 \n\
+5.098150 0.831455 3.661924 0.978777 4.500240 1.064732 0.455496 1.095629 0.915898 0.226713 0.405000 0.608323 0.525496 0.593321 1.035726 \n\
+2.174502 0.630453 1.791747 0.396219 1.681712 1.083797 0.556968 0.100584 0.457070 2.361119 0.543612 1.211816 2.987220 0.198957 0.368383 7.505908 \n\
+0.203719 0.615713 0.044203 0.046952 1.745090 0.303876 0.050920 0.155176 0.920001 0.165182 1.385828 0.055323 1.274920 5.896599 0.059081 0.303111 0.156402 \n\
+0.271220 0.253084 0.643377 0.142691 3.763228 0.209729 0.093004 0.035856 8.167503 0.490579 0.894778 0.077103 1.029700 26.210400 0.045876 0.373529 0.218567 5.726440 \n\
+3.470639 0.410713 0.180011 0.081584 4.323431 0.751254 0.686467 0.086874 0.318032 29.800262 3.856040 0.482930 4.862267 1.182403 0.390522 0.268937 2.836818 0.229423 0.453335 \n\
+\n\
+0.071716 0.058979 0.060316 0.101089 0.005039 0.044673 0.093349 0.105394 0.026228 0.020220 0.037831 0.081647 0.010677 0.015875 0.090566 0.065046 0.054453 0.005546 0.019924 0.031432;\n\
+\n\
+\n\
+model EX_EHO = MIX{BUR_EXT:0.761816796788931,BUR_HEL:0.744425646802117,BUR_OTH:0.532457759429489,EXP_EXT:1.5639387472863,EXP_HEL:2.06403411829438,EXP_OTH:1.43336795177594};\n\
+\n\
+\n\
+[ ---------------------------------------------------------\n\
+    LG4M mixture model of Le, Dang & Gascuel (2012)\n\
+ --------------------------------------------------------- ]\n\
+\n\
+model LG4M1 =\n\
+ 0.269343\n\
+ 0.254612 0.150988\n\
+ 0.236821 0.031863 0.659648\n\
+ 2.506547 0.938594 0.975736 0.175533\n\
+ 0.359080 0.348288 0.697708 0.086573 0.095967\n\
+ 0.304674 0.156000 0.377704 0.449140 0.064706 4.342595\n\
+ 1.692015 0.286638 0.565095 0.380358 0.617945 0.202058 0.264342\n\
+ 0.251974 0.921633 1.267609 0.309692 0.390429 2.344059 0.217750 0.104842\n\
+ 1.085220 0.325624 0.818658 0.037814 1.144150 0.534567 0.222793 0.062682 0.567431\n\
+ 0.676353 0.602366 0.217027 0.007533 1.595775 0.671143 0.158424 0.070463 0.764255 8.226528\n\
+ 0.179155 0.971338 1.343718 0.133744 0.122468 0.983857 0.994128 0.220916 0.410581 0.387487 0.181110\n\
+ 1.636817 0.515217 0.670461 0.071252 1.534848 5.288642 0.255628 0.094198 0.257229 25.667158 6.819689 1.591212\n\
+ 0.235498 0.123932 0.099793 0.030425 0.897279 0.112229 0.022529 0.047488 0.762914 1.344259 0.865691 0.038921 2.030833\n\
+ 1.265605 0.040163 0.173354 0.027579 0.259961 0.580374 0.088041 0.145595 0.143676 0.298859 1.020117 0.000714 0.190019 0.093964\n\
+ 5.368405 0.470952 5.267140 0.780505 4.986071 0.890554 0.377949 1.755515 0.786352 0.527246 0.667783 0.659948 0.731921 0.837669 1.355630\n\
+ 1.539394 0.326789 1.688169 0.283738 1.389282 0.329821 0.231770 0.117017 0.449977 3.531600 0.721586 0.497588 2.691697 0.152088 0.698040 16.321298\n\
+ 0.140944 0.375611 0.025163 0.002757 0.801456 0.257253 0.103678 0.132995 0.345834 0.377156 0.839647 0.176970 0.505682 1.670170 0.091298 0.210096 0.013165\n\
+ 0.199836 0.146857 0.806275 0.234246 1.436970 0.319669 0.010076 0.036859 3.503317 0.598632 0.738969 0.154436 0.579000 4.245524 0.074524 0.454195 0.232913 1.178490\n\
+ 9.435529 0.285934 0.395670 0.130890 6.097263 0.516259 0.503665 0.222960 0.149143 13.666175 2.988174 0.162725 5.973826 0.843416 0.597394 0.701149 4.680002 0.300085 0.416262\n\
+\n\
+0.082276 0.055172 0.043853 0.053484 0.018957 0.028152 0.046679 0.157817 0.033297 0.028284 0.054284 0.025275 0.023665 0.041874 0.063071 0.066501 0.065424 0.023837 0.038633 0.049465;\n\
+\n\
+model LG4M2 =\n\
+0.133720\n\
+ 0.337212 0.749052\n\
+ 0.110918 0.105087 4.773487\n\
+ 3.993460 0.188305 1.590332 0.304942\n\
+ 0.412075 2.585774 1.906884 0.438367 0.242076\n\
+ 0.435295 0.198278 0.296366 7.470333 0.008443 3.295515\n\
+ 7.837540 0.164607 0.431724 0.153850 1.799716 0.269744 0.242866\n\
+ 0.203872 2.130334 9.374479 1.080878 0.152458 12.299133 0.279589 0.089714\n\
+ 0.039718 0.024553 0.135254 0.014979 0.147498 0.033964 0.005585 0.007248 0.022746\n\
+ 0.075784 0.080091 0.084971 0.014128 0.308347 0.500836 0.022833 0.022999 0.161270 1.511682\n\
+ 0.177662 10.373708 1.036721 0.038303 0.043030 2.181033 0.321165 0.103050 0.459502 0.021215 0.078395\n\
+ 0.420784 0.192765 0.329545 0.008331 0.883142 1.403324 0.168673 0.160728 0.612573 1.520889 7.763266 0.307903\n\
+ 0.071268 0.019652 0.088753 0.013547 0.566609 0.071878 0.020050 0.041022 0.625361 0.382806 1.763059 0.044644 1.551911\n\
+ 0.959127 1.496585 0.377794 0.332010 0.318192 1.386970 0.915904 0.224255 2.611479 0.029351 0.068250 1.542356 0.047525 0.182715\n\
+ 11.721512 0.359408 2.399158 0.219464 9.104192 0.767563 0.235229 3.621219 0.971955 0.033780 0.043035 0.236929 0.319964 0.124977 0.840651\n\
+ 2.847068 0.218463 1.855386 0.109808 4.347048 0.765848 0.164569 0.312024 0.231569 0.356327 0.159597 0.403210 1.135162 0.106903 0.269190 9.816481\n\
+ 0.030203 0.387292 0.118878 0.067287 0.190240 0.122113 0.007023 0.137411 0.585141 0.020634 0.228824 0.000122 0.474862 3.135128 0.030313 0.093830 0.119152\n\
+ 0.067183 0.130101 0.348730 0.061798 0.301198 0.095382 0.095764 0.044628 2.107384 0.046105 0.100117 0.017073 0.192383 8.367641 0.000937 0.137416 0.044722 4.179782\n\
+ 0.679398 0.041567 0.092408 0.023701 1.271187 0.115566 0.055277 0.086988 0.060779 8.235167 0.609420 0.061764 0.581962 0.184187 0.080246 0.098033 1.438350 0.023439 0.039124\n\
+\n\
+0.120900 0.036460 0.026510 0.040410 0.015980 0.021132 0.025191 0.036369 0.015884 0.111029 0.162852 0.024820 0.028023 0.074058 0.012065 0.041963 0.039072 0.012666 0.040478 0.114137;\n\
+\n\
+model LG4M3 =\n\
+0.421017\n\
+ 0.316236 0.693340\n\
+ 0.285984 0.059926 6.158219\n\
+ 4.034031 1.357707 0.708088 0.063669\n\
+ 0.886972 2.791622 1.701830 0.484347 0.414286\n\
+ 0.760525 0.233051 0.378723 4.032667 0.081977 4.940411\n\
+ 0.754103 0.402894 2.227443 1.102689 0.416576 0.459376 0.508409\n\
+ 0.571422 2.319453 5.579973 0.885376 1.439275 4.101979 0.576745 0.428799\n\
+ 0.162152 0.085229 0.095692 0.006129 0.490937 0.104843 0.045514 0.004705 0.098934\n\
+ 0.308006 0.287051 0.056994 0.007102 0.958988 0.578990 0.067119 0.024403 0.342983 3.805528\n\
+ 0.390161 7.663209 1.663641 0.105129 0.135029 3.364474 0.652618 0.457702 0.823674 0.129858 0.145630\n\
+ 1.042298 0.364551 0.293222 0.037983 1.486520 1.681752 0.192414 0.070498 0.222626 4.529623 4.781730 0.665308\n\
+ 0.362476 0.073439 0.129245 0.020078 1.992483 0.114549 0.023272 0.064490 1.491794 1.113437 2.132006 0.041677 1.928654\n\
+ 1.755491 0.087050 0.099325 0.163817 0.242851 0.322939 0.062943 0.198698 0.192904 0.062948 0.180283 0.059655 0.129323 0.065778\n\
+ 3.975060 0.893398 5.496314 1.397313 3.575120 1.385297 0.576191 1.733288 1.021255 0.065131 0.129115 0.600308 0.387276 0.446001 1.298493\n\
+ 2.565079 0.534056 2.143993 0.411388 2.279084 0.893006 0.528209 0.135731 0.518741 0.972662 0.280700 0.890086 1.828755 0.189028 0.563778 7.788147\n\
+ 0.283631 0.497926 0.075454 0.043794 1.335322 0.308605 0.140137 0.150797 1.409726 0.119868 0.818331 0.080591 1.066017 3.754687 0.073415 0.435046 0.197272\n\
+ 0.242513 0.199157 0.472207 0.085937 2.039787 0.262751 0.084578 0.032247 7.762326 0.153966 0.299828 0.117255 0.438215 14.506235 0.089180 0.352766 0.215417 5.054245\n\
+ 2.795818 0.107130 0.060909 0.029724 2.986426 0.197267 0.196977 0.044327 0.116751 7.144311 1.848622 0.118020 1.999696 0.705747 0.272763 0.096935 1.820982 0.217007 0.172975\n\
+\n\
+0.072639 0.051691 0.038642 0.055580 0.009829 0.031374 0.048731 0.065283 0.023791 0.086640 0.120847 0.052177 0.026728 0.032589 0.039238 0.046748 0.053361 0.008024 0.037426 0.098662;\n\
+\n\
+model LG4M4 =\n\
+0.576160\n\
+ 0.567606 0.498643\n\
+ 0.824359 0.050698 3.301401\n\
+ 0.822724 4.529235 1.291808 0.101930\n\
+ 1.254238 2.169809 1.427980 0.449474 0.868679\n\
+ 1.218615 0.154502 0.411471 3.172277 0.050239 2.138661\n\
+ 1.803443 0.604673 2.125496 1.276384 1.598679 0.502653 0.479490\n\
+ 0.516862 2.874265 4.845769 0.719673 3.825677 4.040275 0.292773 0.596643\n\
+ 0.180898 0.444586 0.550969 0.023542 2.349573 0.370160 0.142187 0.016618 0.500788\n\
+ 0.452099 0.866322 0.201033 0.026731 2.813990 1.645178 0.135556 0.072152 1.168817 5.696116\n\
+ 0.664186 2.902886 2.101971 0.127988 0.200218 2.505933 0.759509 0.333569 0.623100 0.547454 0.363656\n\
+ 0.864415 0.835049 0.632649 0.079201 2.105931 1.633544 0.216462 0.252419 0.665406 7.994105 11.751178 1.096842\n\
+ 0.324478 0.208947 0.280339 0.041683 4.788477 0.107022 0.067711 0.171320 3.324779 2.965328 5.133843 0.084856 4.042591\n\
+ 1.073043 0.173826 0.041985 0.270336 0.121299 0.351384 0.228565 0.225318 0.376089 0.058027 0.390354 0.214230 0.058954 0.126299\n\
+ 3.837562 0.884342 4.571911 0.942751 6.592827 1.080063 0.465397 3.137614 1.119667 0.362516 0.602355 0.716940 0.506796 1.444484 1.432558\n\
+ 2.106026 0.750016 2.323325 0.335915 1.654673 1.194017 0.617231 0.318671 0.801030 4.455842 0.580191 1.384210 3.522468 0.473128 0.432718 5.716300\n\
+ 0.163720 0.818102 0.072322 0.068275 3.305436 0.373790 0.054323 0.476587 1.100360 0.392946 1.703323 0.085720 1.725516 5.436253 0.053108 0.498594 0.231832\n\
+ 0.241167 0.302440 1.055095 0.246940 9.741942 0.249895 0.129973 0.052363 11.542498 1.047449 1.319667 0.139770 1.330225 26.562270 0.046986 0.737653 0.313460 5.165098\n\
+ 1.824586 0.435795 0.179086 0.091739 3.609570 0.649507 0.656681 0.225234 0.473437 19.897252 3.001995 0.452926 3.929598 1.692159 0.370204 0.373501 3.329822 0.326593 0.860743\n\
+\n\
+0.104843 0.078835 0.043513 0.090498 0.002924 0.066163 0.151640 0.038843 0.022556 0.018383 0.038687 0.104462 0.010166 0.009089 0.066950 0.053667 0.049486 0.004409 0.012924 0.031963;\n\
+\n\
+model LG4M = MIX{LG4M1,LG4M2,LG4M3,LG4M4}*G4;\n\
+model LG4  = MIX{LG4M1,LG4M2,LG4M3,LG4M4}*G4;\n\
+\n\
+\n\
+[ ---------------------------------------------------------\n\
+    LG4X mixture model of Le, Dang & Gascuel (2012)\n\
+ --------------------------------------------------------- ]\n\
+\n\
+model LG4X1 =\n\
+0.295719\n\
+0.067388 0.448317\n\
+0.253712 0.457483 2.358429\n\
+1.029289 0.576016 0.251987 0.189008\n\
+0.107964 1.741924 0.216561 0.599450 0.029955\n\
+0.514644 0.736017 0.503084 109.901504 0.084794 4.117654\n\
+10.868848 0.704334 0.435271 1.070052 1.862626 0.246260 1.202023\n\
+0.380498 5.658311 4.873453 5.229858 0.553477 6.508329 1.634845 0.404968\n\
+0.084223 0.123387 0.090748 0.052764 0.151733 0.054187 0.060194 0.048984 0.204296\n\
+0.086976 0.221777 0.033310 0.021407 0.230320 0.195703 0.069359 0.069963 0.504221 1.495537\n\
+0.188789 93.433377 0.746537 0.621146 0.096955 1.669092 2.448827 0.256662 1.991533 0.091940 0.122332\n\
+0.286389 0.382175 0.128905 0.081091 0.352526 0.810168 0.232297 0.228519 0.655465 1.994320 3.256485 0.457430\n\
+0.155567 0.235965 0.127321 0.205164 0.590018 0.066081 0.064822 0.241077 6.799829 0.754940 2.261319 0.163849 1.559944\n\
+1.671061 6.535048 0.904011 5.164456 0.386853 2.437439 3.537387 4.320442 11.291065 0.170343 0.848067 5.260446 0.426508 0.438856\n\
+2.132922 0.525521 0.939733 0.747330 1.559564 0.165666 0.435384 3.656545 0.961142 0.050315 0.064441 0.360946 0.132547 0.306683 4.586081\n\
+0.529591 0.303537 0.435450 0.308078 0.606648 0.106333 0.290413 0.290216 0.448965 0.372166 0.102493 0.389413 0.498634 0.109129 2.099355 3.634276\n\
+0.115551 0.641259 0.046646 0.260889 0.587531 0.093417 0.280695 0.307466 6.227274 0.206332 0.459041 0.033291 0.559069 18.392863 0.411347 0.101797 0.034710\n\
+0.102453 0.289466 0.262076 0.185083 0.592318 0.035149 0.105999 0.096556 20.304886 0.097050 0.133091 0.115301 0.264728 66.647302 0.476350 0.148995 0.063603 20.561407\n\
+0.916683 0.102065 0.043986 0.080708 0.885230 0.072549 0.206603 0.306067 0.205944 5.381403 0.561215 0.112593 0.693307 0.400021 0.584622 0.089177 0.755865 0.133790 0.154902\n\
+\n\
+0.147383 0.017579 0.058208 0.017707 0.026331 0.041582 0.017494 0.027859 0.011849 0.076971 0.147823 0.019535 0.037132 0.029940 0.008059 0.088179 0.089653 0.006477 0.032308 0.097931;\n\
+\n\
+model LG4X2 =\n\
+ 0.066142\n\
+ 0.590377 0.468325\n\
+ 0.069930 0.013688 2.851667\n\
+ 9.850951 0.302287 3.932151 0.146882\n\
+ 1.101363 1.353957 8.159169 0.249672 0.582670\n\
+ 0.150375 0.028386 0.219934 0.560142 0.005035 3.054085\n\
+ 0.568586 0.037750 0.421974 0.046719 0.275844 0.129551 0.037250\n\
+ 0.051668 0.262130 2.468752 0.106259 0.098208 4.210126 0.029788 0.013513\n\
+ 0.127170 0.016923 0.344765 0.003656 0.445038 0.165753 0.008541 0.002533 0.031779\n\
+ 0.292429 0.064289 0.210724 0.004200 1.217010 1.088704 0.014768 0.005848 0.064558 7.278994\n\
+ 0.071458 0.855973 1.172204 0.014189 0.033969 1.889645 0.125869 0.031390 0.065585 0.029917 0.042762\n\
+ 1.218562 0.079621 0.763553 0.009876 1.988516 3.344809 0.056702 0.021612 0.079927 7.918203 14.799537 0.259400\n\
+ 0.075144 0.011169 0.082464 0.002656 0.681161 0.111063 0.004186 0.004854 0.095591 0.450964 1.506485 0.009457 1.375871\n\
+ 7.169085 0.161937 0.726566 0.040244 0.825960 2.067758 0.110993 0.129497 0.196886 0.169797 0.637893 0.090576 0.457399 0.143327\n\
+ 30.139501 0.276530 11.149790 0.267322 18.762977 3.547017 0.201148 0.976631 0.408834 0.104288 0.123793 0.292108 0.598048 0.328689 3.478333\n\
+ 13.461692 0.161053 4.782635 0.053740 11.949233 2.466507 0.139705 0.053397 0.126088 1.578530 0.641351 0.297913 4.418398 0.125011 2.984862 13.974326\n\
+ 0.021372 0.081472 0.058046 0.006597 0.286794 0.188236 0.009201 0.019475 0.037226 0.015909 0.154810 0.017172 0.239749 0.562720 0.061299 0.154326 0.060703\n\
+ 0.045779 0.036742 0.498072 0.027639 0.534219 0.203493 0.012095 0.004964 0.452302 0.094365 0.140750 0.021976 0.168432 1.414883 0.077470 0.224675 0.123480 0.447011\n\
+ 4.270235 0.030342 0.258487 0.012745 4.336817 0.281953 0.043812 0.015539 0.016212 16.179952 3.416059 0.032578 2.950318 0.227807 1.050562 0.112000 5.294490 0.033381 0.045528\n\
+\n\
+0.063139 0.066357 0.011586 0.066571 0.010800 0.009276 0.053984 0.146986 0.034214 0.088822 0.098196 0.032390 0.021263 0.072697 0.016761 0.020711 0.020797 0.025463 0.045615 0.094372;\n\
+\n\
+model LG4X3 =\n\
+ 0.733336\n\
+ 0.558955 0.597671\n\
+ 0.503360 0.058964 5.581680\n\
+ 4.149599 2.863355 1.279881 0.225860\n\
+ 1.415369 2.872594 1.335650 0.434096 1.043232\n\
+ 1.367574 0.258365 0.397108 2.292917 0.209978 4.534772\n\
+ 1.263002 0.366868 1.840061 1.024707 0.823594 0.377181 0.496780\n\
+ 0.994098 2.578946 5.739035 0.821921 3.039380 4.877840 0.532488 0.398817\n\
+ 0.517204 0.358350 0.284730 0.027824 1.463390 0.370939 0.232460 0.008940 0.349195\n\
+ 0.775054 0.672023 0.109781 0.021443 1.983693 1.298542 0.169219 0.043707 0.838324 5.102837\n\
+ 0.763094 5.349861 1.612642 0.088850 0.397640 3.509873 0.755219 0.436013 0.888693 0.561690 0.401070\n\
+ 1.890137 0.691594 0.466979 0.060820 2.831098 2.646440 0.379926 0.087640 0.488389 7.010411 8.929538 1.357738\n\
+ 0.540460 0.063347 0.141582 0.018288 4.102068 0.087872 0.020447 0.064863 1.385133 3.054968 5.525874 0.043394 3.135353\n\
+ 0.200122 0.032875 0.019509 0.042687 0.059723 0.072299 0.023282 0.036426 0.050226 0.039318 0.067505 0.023126 0.012695 0.015631\n\
+ 4.972745 0.821562 4.670980 1.199607 5.901348 1.139018 0.503875 1.673207 0.962470 0.204155 0.273372 0.567639 0.570771 0.458799 0.233109\n\
+ 1.825593 0.580847 1.967383 0.420710 2.034980 0.864479 0.577513 0.124068 0.502294 2.653232 0.437116 1.048288 2.319555 0.151684 0.077004 8.113282\n\
+ 0.450842 0.661866 0.088064 0.037642 2.600668 0.390688 0.109318 0.218118 1.065585 0.564368 1.927515 0.120994 1.856122 4.154750 0.011074 0.377578 0.222293\n\
+ 0.526135 0.265730 0.581928 0.141233 5.413080 0.322761 0.153776 0.039217 8.351808 0.854294 0.940458 0.180650 0.975427 11.429924 0.026268 0.429221 0.273138 4.731579\n\
+ 3.839269 0.395134 0.145401 0.090101 4.193725 0.625409 0.696533 0.104335 0.377304 15.559906 2.508169 0.449074 3.404087 1.457957 0.052132 0.260296 2.903836 0.564762 0.681215\n\
+\n\
+ 0.062457 0.066826 0.049332 0.065270 0.006513 0.041231 0.058965 0.080852 0.028024 0.037024 0.075925 0.064131 0.019620 0.028710 0.104579 0.056388 0.062027 0.008241 0.033124 0.050760;\n\
+\n\
+model LG4X4 =\n\
+ 0.658412\n\
+ 0.566269 0.540749\n\
+ 0.854111 0.058015 3.060574\n\
+ 0.884454 5.851132 1.279257 0.160296\n\
+ 1.309554 2.294145 1.438430 0.482619 0.992259\n\
+ 1.272639 0.182966 0.431464 2.992763 0.086318 2.130054\n\
+ 1.874713 0.684164 2.075952 1.296206 2.149634 0.571406 0.507160\n\
+ 0.552007 3.192521 4.840271 0.841829 5.103188 4.137385 0.351381 0.679853\n\
+ 0.227683 0.528161 0.644656 0.031467 3.775817 0.437589 0.189152 0.025780 0.665865\n\
+ 0.581512 1.128882 0.266076 0.048542 3.954021 2.071689 0.217780 0.082005 1.266791 8.904999\n\
+ 0.695190 3.010922 2.084975 0.132774 0.190734 2.498630 0.767361 0.326441 0.680174 0.652629 0.440178\n\
+ 0.967985 1.012866 0.720060 0.133055 1.776095 1.763546 0.278392 0.343977 0.717301 10.091413 14.013035 1.082703\n\
+ 0.344015 0.227296 0.291854 0.056045 4.495841 0.116381 0.092075 0.195877 4.001286 2.671718 5.069337 0.091278 4.643214\n\
+ 0.978992 0.156635 0.028961 0.209188 0.264277 0.296578 0.177263 0.217424 0.362942 0.086367 0.539010 0.172734 0.121821 0.161015\n\
+ 3.427163 0.878405 4.071574 0.925172 7.063879 1.033710 0.451893 3.057583 1.189259 0.359932 0.742569 0.693405 0.584083 1.531223 1.287474\n\
+ 2.333253 0.802754 2.258357 0.360522 2.221150 1.283423 0.653836 0.377558 0.964545 4.797423 0.780580 1.422571 4.216178 0.599244 0.444362 5.231362\n\
+ 0.154701 0.830884 0.073037 0.094591 3.017954 0.312579 0.074620 0.401252 1.350568 0.336801 1.331875 0.068958 1.677263 5.832025 0.076328 0.548763 0.208791\n\
+ 0.221089 0.431617 1.238426 0.313945 8.558815 0.305772 0.181992 0.072258 12.869737 1.021885 1.531589 0.163829 1.575754 33.873091 0.079916 0.831890 0.307846 5.910440\n\
+ 2.088785 0.456530 0.199728 0.118104 4.310199 0.681277 0.752277 0.241015 0.531100 23.029406 4.414850 0.481711 5.046403 1.914768 0.466823 0.382271 3.717971 0.282540 0.964421\n\
+\n\
+0.106471 0.074171 0.044513 0.096390 0.002148 0.066733 0.158908 0.037625 0.020691 0.014608 0.028797 0.105352 0.007864 0.007477 0.083595 0.055726 0.047711 0.003975 0.010088 0.027159;\n\
+\n\
+model LG4X = MIX{LG4X1,LG4X2,LG4X3,LG4X4}*R4;\n\
+\n\
+[ ---------------------------------------------------------\n\
+    +cF class frequency mixture model of Wang et al. (2008)\n\
+ --------------------------------------------------------- ]\n\
+\n\
+frequency Fclass1 = 0.02549352 0.01296012 0.005545202 0.006005566 0.01002193 0.01112289 0.008811948 0.001796161 0.004312188 0.2108274 0.2730413 0.01335451 0.07862202 0.03859909 0.005058205 0.008209453 0.03210019 0.002668138 0.01379098 0.2376598;\n\
+frequency Fclass2 = 0.09596966 0.008786096 0.02805857 0.01880183 0.005026264 0.006454635 0.01582725 0.7215719 0.003379354 0.002257725 0.003013483 0.01343441 0.001511657 0.002107865 0.006751404 0.04798539 0.01141559 0.000523736 0.002188483 0.004934972;\n\
+frequency Fclass3 = 0.01726065 0.005467988 0.01092937 0.3627871 0.001046402 0.01984758 0.5149206 0.004145081 0.002563289 0.002955213 0.005286931 0.01558693 0.002693098 0.002075771 0.003006167 0.01263069 0.01082144 0.000253451 0.001144787 0.004573568;\n\
+frequency Fclass4 = 0.1263139 0.09564027 0.07050061 0.03316681 0.02095119 0.05473468 0.02790523 0.009007538 0.03441334 0.005855319 0.008061884 0.1078084 0.009019514 0.05018693 0.07948 0.09447839 0.09258897 0.01390669 0.05367769 0.01230413;\n\
+frequency CF4 = FMIX{empirical,Fclass1,Fclass2,Fclass3,Fclass4};\n\
+model JTTCF4G = JTT+FMIX{empirical,Fclass1,Fclass2,Fclass3,Fclass4}+G;\n\
+\n\
+[ ---------------------------------------------------------\n\
+    CAT-C10 profile mixture model of Le, Gascuel & Lartillot (2008)\n\
+ --------------------------------------------------------- ]\n\
+\n\
+frequency C10pi1 = 0.4082573125 0.0081783015 0.0096285438 0.0069870889 0.0349388179 0.0075279735 0.0097846653 0.1221613215 0.0039151830 0.0125784287 0.0158338663 0.0059670150 0.0081313216 0.0061604332 0.0394155867 0.1682450664 0.0658132542 0.0018751587 0.0041579747 0.0604426865;\n\
+frequency C10pi2 = 0.1027763487 0.0418664491 0.0213272051 0.0155943616 0.0149663448 0.0440685478 0.0419667447 0.0138805792 0.0158864807 0.1066076641 0.1131944125 0.0436343681 0.0437800327 0.0180729309 0.0223250701 0.0529608087 0.1081741005 0.0045147205 0.0137373857 0.1606654446;\n\
+frequency C10pi3 = 0.0351766018 0.0019678632 0.0016591476 0.0006768741 0.0078706538 0.0016559557 0.0019686768 0.0022420602 0.0012878339 0.3515819591 0.1278183107 0.0018856550 0.0242631753 0.0126221329 0.0029771559 0.0049998099 0.0255378034 0.0011907778 0.0037539283 0.3888636245;\n\
+frequency C10pi4 = 0.0408513927 0.0269887074 0.2185648186 0.2333814790 0.0037602852 0.0380451418 0.0901238869 0.1158332065 0.0373197176 0.0025523644 0.0052164616 0.0485017266 0.0022571778 0.0025108218 0.0108333610 0.0804527209 0.0302879995 0.0010815260 0.0069890931 0.0044481118;\n\
+frequency C10pi5 = 0.0185492661 0.0062362395 0.0024895723 0.0009775062 0.0070416514 0.0083539447 0.0024891617 0.0028952913 0.0040103982 0.1632422345 0.4443079409 0.0043570878 0.1202815687 0.0733329781 0.0048827648 0.0051642443 0.0131806647 0.0068759784 0.0144734420 0.0968580644;\n\
+frequency C10pi6 = 0.1106750119 0.0352190043 0.0405186210 0.1636437899 0.0014834855 0.0877962201 0.2638456592 0.0325228293 0.0163803600 0.0068334902 0.0140679579 0.0677158208 0.0048988133 0.0023256777 0.0298982139 0.0562887953 0.0426922497 0.0010338979 0.0040522304 0.0181078719;\n\
+frequency C10pi7 = 0.0522657662 0.0668294648 0.0714836849 0.0297745257 0.0143324928 0.0736540298 0.0388386669 0.0228101108 0.1551638111 0.0187406149 0.0653779932 0.0439469345 0.0207189121 0.0624033021 0.0145475497 0.0549017631 0.0370140058 0.0193756900 0.1110694548 0.0267512268;\n\
+frequency C10pi8 = 0.0116587342 0.0050990142 0.0064011054 0.0021742457 0.0105340743 0.0040203734 0.0024251112 0.0034709143 0.0366787049 0.0187185330 0.0676489746 0.0026694717 0.0143534813 0.3650985596 0.0031159927 0.0094848536 0.0073713920 0.0509564551 0.3574858593 0.0206341497;\n\
+frequency C10pi9 = 0.0627195947 0.2038782162 0.0428629162 0.0236193294 0.0052662886 0.1098111767 0.0686284994 0.0256174957 0.0332612124 0.0128968249 0.0305627740 0.2270839355 0.0124036991 0.0039181841 0.0140440613 0.0483152469 0.0463378087 0.0025143473 0.0065521118 0.0197062770;\n\
+frequency C10pi10 = 0.1145518598 0.0324008908 0.0750614981 0.0416192189 0.0098549497 0.0339624663 0.0364907910 0.0503817581 0.0165233329 0.0092949460 0.0139153707 0.0423026886 0.0082240805 0.0046605982 0.0379221548 0.2610647896 0.1845829279 0.0017548981 0.0058538316 0.0195769483;\n\
+model C10 = POISSON+G+FMIX{C10pi1:1:0.1191344178,C10pi2:1:0.0874372456,C10pi3:1:0.1037105070,C10pi4:1:0.0922584809,C10pi5:1:0.1070492801,C10pi6:1:0.1329945166,C10pi7:1:0.0538028458,C10pi8:1:0.0691986212,C10pi9:1:0.1319937434,C10pi10:1:0.1024203429};\n\
+model C10Opt = POISSON+G+FMIX{C10pi1,C10pi2,C10pi3,C10pi4,C10pi5,C10pi6,C10pi7,C10pi8,C10pi9,C10pi10};\n\
+\n\
+[ ---------------------------------------------------------\n\
+    CAT-C20 profile mixture model of Le, Gascuel & Lartillot (2008)\n\
+ --------------------------------------------------------- ]\n\
+frequency C20pi1 = 0.0862412505 0.0171943793 0.0791293376 0.0329908619 0.0130504558 0.0169046938 0.0184526503 0.0366905299 0.0108013340 0.0097907148 0.0112826424 0.0220195221 0.0087821483 0.0044155335 0.0189273201 0.3178152357 0.2711700523 0.0015317305 0.0048342853 0.0179753220 ;\n\
+frequency C20pi2 = 0.2035582865 0.0050980810 0.0077052407 0.0031656079 0.0348667285 0.0064044073 0.0070859400 0.0195235515 0.0024392035 0.1152573291 0.0789777393 0.0042380850 0.0309187017 0.0112429356 0.0164189221 0.0496777139 0.1118946615 0.0017762569 0.0048448213 0.2849057867 ;\n\
+frequency C20pi3 = 0.0211547413 0.0014946177 0.0012755030 0.0005492865 0.0048188557 0.0012328812 0.0014539632 0.0011430874 0.0011346394 0.3928460626 0.1250644210 0.0013579946 0.0209788805 0.0128251737 0.0020247248 0.0026240726 0.0171914121 0.0011591071 0.0036027969 0.3860677787 ;\n\
+frequency C20pi4 = 0.0376903543 0.2885196153 0.0365411474 0.0109469400 0.0064073829 0.0893564381 0.0358365464 0.0191106776 0.0329513951 0.0101711878 0.0237495504 0.2897626974 0.0096528870 0.0036349802 0.0105337370 0.0356313768 0.0355926500 0.0027925238 0.0066557222 0.0144621902 ;\n\
+frequency C20pi5 = 0.0084597802 0.0053589922 0.0072525884 0.0024487852 0.0084909000 0.0042781483 0.0025055486 0.0024277107 0.0433214027 0.0097713028 0.0380507037 0.0026741007 0.0080724771 0.3420463838 0.0021418673 0.0080418935 0.0055322116 0.0494840193 0.4375001561 0.0121410277 ;\n\
+frequency C20pi6 = 0.1759898886 0.0290429175 0.0332845569 0.1301263816 0.0017558693 0.0707183953 0.2182166681 0.0409535143 0.0130708195 0.0085622087 0.0159530702 0.0542946169 0.0054045759 0.0025276980 0.0371020404 0.0793480500 0.0540083424 0.0010592104 0.0036259116 0.0249552645 ;\n\
+frequency C20pi7 = 0.1634397322 0.0195541184 0.0438701833 0.0374272612 0.0088659891 0.0137554758 0.0220611924 0.5296717726 0.0090006141 0.0017569353 0.0061156267 0.0167117975 0.0029390787 0.0030641349 0.0126457766 0.0829342776 0.0142835614 0.0028640685 0.0032398299 0.0057985736 ;\n\
+frequency C20pi8 = 0.0917468761 0.0265853306 0.0290699087 0.0133818895 0.0284015012 0.0255084506 0.0196875685 0.0249898794 0.0449766405 0.0583555688 0.1155009222 0.0164915955 0.0395994595 0.0998479096 0.0209916159 0.0736482742 0.0661518462 0.0246463919 0.0972327226 0.0831856483 ;\n\
+frequency C20pi9 = 0.0646700714 0.0988015996 0.0228907308 0.0168733856 0.0077117603 0.0996414875 0.0544977962 0.0148893975 0.0313851988 0.0505983315 0.1844282999 0.0907931290 0.0774839960 0.0219148172 0.0105004469 0.0321196170 0.0411766062 0.0084303030 0.0206106035 0.0505824221 ;\n\
+frequency C20pi10 = 0.0135993865 0.0043408375 0.0018469375 0.0007951703 0.0100090240 0.0046420778 0.0018011758 0.0026794645 0.0072401918 0.0814026713 0.3661422246 0.0025158135 0.0734965132 0.2640965246 0.0038994134 0.0043668760 0.0075248451 0.0261564898 0.0660970801 0.0573472826 ;\n\
+frequency C20pi11 = 0.1478036236 0.0842845089 0.0726630217 0.0534743238 0.0048825808 0.0757166156 0.0727246460 0.0907725939 0.0262288856 0.0035781075 0.0126777221 0.1051660098 0.0059621792 0.0029903868 0.0156558198 0.1459903343 0.0634877444 0.0015928454 0.0050760739 0.0092719768 ;\n\
+frequency C20pi12 = 0.0186377412 0.0042055165 0.0019865236 0.0008329696 0.0054968852 0.0065890091 0.0020248504 0.0021713483 0.0023665991 0.2020809776 0.4370381920 0.0029120653 0.1241860384 0.0385383157 0.0040672279 0.0046177381 0.0149904396 0.0026871667 0.0056324117 0.1189379840 ;\n\
+frequency C20pi13 = 0.0477624336 0.0505742667 0.0209574273 0.0141349161 0.0075791708 0.0429296799 0.0462688073 0.0052327914 0.0165351815 0.1741496627 0.1121253570 0.0577575020 0.0330288046 0.0130691347 0.0124374733 0.0264988925 0.0951754678 0.0031660482 0.0112465746 0.2093704079 ;\n\
+frequency C20pi14 = 0.4164189845 0.0056100821 0.0091701381 0.0045131748 0.0406937949 0.0061320495 0.0063229801 0.0946185184 0.0031057404 0.0076443223 0.0099885414 0.0038941773 0.0069323155 0.0048438356 0.0187840756 0.2360774301 0.0746274607 0.0012172579 0.0034825786 0.0459225422 ;\n\
+frequency C20pi15 = 0.0402295888 0.0735203003 0.1036647193 0.0365523994 0.0124782975 0.0826558132 0.0372197283 0.0233618081 0.2108307125 0.0093478727 0.0360561493 0.0482410586 0.0100289536 0.0459094917 0.0098503973 0.0533383445 0.0310209005 0.0140076639 0.1064377821 0.0152480184 ;\n\
+frequency C20pi16 = 0.0323453034 0.0236282995 0.2520448083 0.2431495959 0.0035976296 0.0330831153 0.0710274499 0.1016074562 0.0366225082 0.0031410809 0.0051980542 0.0470129351 0.0024028744 0.0024429276 0.0094837826 0.0848355278 0.0359083275 0.0008730928 0.0067247672 0.0048704638 ;\n\
+frequency C20pi17 = 0.1476256642 0.0334506604 0.0211972524 0.0403051550 0.0032327194 0.0371554480 0.0576893391 0.0330850942 0.0146392559 0.0108267008 0.0256200793 0.0451350877 0.0058651400 0.0047177179 0.3473710507 0.0892065279 0.0485899446 0.0016358749 0.0044177191 0.0282335685 ;\n\
+frequency C20pi18 = 0.1031448143 0.0717747663 0.0435172139 0.0386401502 0.0061762467 0.0786603123 0.0923369140 0.0202338419 0.0246761899 0.0376904275 0.0376283678 0.0921698920 0.0161883318 0.0067666433 0.0128302120 0.0951450188 0.1378566702 0.0022144738 0.0083041573 0.0740453560 ;\n\
+frequency C20pi19 = 0.0837542823 0.0899383244 0.0518811417 0.0804870571 0.0020735078 0.1456497470 0.1947759184 0.0229030361 0.0268458796 0.0074079756 0.0190249576 0.1459287407 0.0067395241 0.0023063393 0.0085616014 0.0455739585 0.0451080843 0.0010771349 0.0049325333 0.0150302559 ;\n\
+frequency C20pi20 = 0.0578735570 0.0138313604 0.0491421636 0.2946738942 0.0011130839 0.0598250358 0.3402102668 0.0293911435 0.0139817004 0.0030525663 0.0062611922 0.0363365043 0.0027295976 0.0017034884 0.0156106390 0.0358044639 0.0249941878 0.0008664342 0.0038312977 0.0087674229 ;\n\
+\n\
+[ C20 with fixed weights ]\n\
+model C20 = POISSON+G+FMIX{C20pi1:1:0.0559910600,C20pi2:1:0.0514824870,C20pi3:1:0.0812922124,C20pi4:1:0.0721976867,C20pi5:1:0.0556718858,C20pi6:1:0.0331003080,C20pi7:1:0.0589501763,C20pi8:1:0.0263756889,C20pi9:1:0.0307584220,C20pi10:1:0.0376701125,C20pi11:1:0.0303058290,C20pi12:1:0.0808775576,C20pi13:1:0.0263349134,C20pi14:1:0.0579101455,C20pi15:1:0.0371248064,C20pi16:1:0.0586867766,C20pi17:1:0.0561479138,C20pi18:1:0.0349810886,C20pi19:1:0.0544937394,C20pi20:1:0.0596471901};\n\
+[ C20 with weights to be optimized ]\n\
+model C20Opt = POISSON+G+FMIX{C20pi1,C20pi2,C20pi3,C20pi4,C20pi5,C20pi6,C20pi7,C20pi8,C20pi9,C20pi10,C20pi11,C20pi12,C20pi13,C20pi14,C20pi15,C20pi16,C20pi17,C20pi18,C20pi19,C20pi20};\n\
+\n\
+model C20Test = POISSON+G+FMIX{C20pi1:1:0.089485,C20pi2:1:0.021281,C20pi3:1:0.119676,C20pi4:1:0.080933,C20pi5:1:0.064054,C20pi6:1:0.021848,C20pi7:1:0.063392,C20pi8:1:0.003629,C20pi9:1:0.007174,C20pi10:1:0.006256,C20pi11:1:0.023424,C20pi12:1:0.086825,C20pi13:1:0.038495,C20pi14:1:0.090028,C20pi15:1:0.020025,C20pi16:1:0.043484,C20pi17:1:0.076864,C20pi18:1:0.031347,C20pi19:1:0.047749,C20pi20:1:0.064031};\n\
+\n\
+[ ---------------------------------------------------------\n\
+    CAT-C30 profile mixture model of Le, Gascuel & Lartillot (2008)\n\
+ --------------------------------------------------------- ]\n\
+frequency C30pi1 = 0.1100453954 0.0171294861 0.0640338464 0.1595411459 0.0019047235 0.0310187088 0.1098958823 0.0684301540 0.0137950707 0.0026283074 0.0073396531 0.0358553674 0.0024706414 0.0016629473 0.1669356820 0.1381790473 0.0568342547 0.0004661120 0.0035970152 0.0082365591;\n\
+frequency C30pi2 = 0.0874125465 0.0806320385 0.0382152368 0.0326119879 0.0049826376 0.0798168854 0.0951700809 0.0144042708 0.0210626652 0.0399884450 0.0301585074 0.1147200015 0.0126488911 0.0048996596 0.0137397028 0.0873769666 0.1558616621 0.0015122843 0.0053974463 0.0793880836;\n\
+frequency C30pi3 = 0.0225477414 0.0014900535 0.0013034594 0.0005959279 0.0050018158 0.0011436556 0.0015030529 0.0011570953 0.0009374322 0.3944689167 0.0889573138 0.0013600872 0.0189102669 0.0089216031 0.0018312028 0.0028336408 0.0189813395 0.0006693746 0.0023303726 0.4250556480;\n\
+frequency C30pi4 = 0.0602158209 0.0136833299 0.0414987935 0.2900084105 0.0009525462 0.0621611083 0.3610869026 0.0281925621 0.0130500799 0.0030516237 0.0060401889 0.0352704692 0.0027460635 0.0014625624 0.0127175499 0.0318109377 0.0225279521 0.0007948027 0.0034024563 0.0093258397;\n\
+frequency C30pi5 = 0.0101223637 0.0028344920 0.0012928910 0.0006379191 0.0085989355 0.0035028551 0.0011249625 0.0024085229 0.0047753376 0.0701153131 0.4135913903 0.0016748492 0.0744862631 0.2785384406 0.0040466582 0.0037087155 0.0052379329 0.0200222636 0.0523938808 0.0408860135;\n\
+frequency C30pi6 = 0.1335831781 0.0284789590 0.0213891629 0.1125775537 0.0010514541 0.0565844323 0.2099572968 0.0207551870 0.0121330488 0.0073526522 0.0133278240 0.0771772013 0.0030571689 0.0016793592 0.1890195131 0.0484054108 0.0373318180 0.0009266995 0.0026946425 0.0225174379;\n\
+frequency C30pi7 = 0.0408277374 0.0124491768 0.0080464869 0.0030634898 0.0153918410 0.0102922098 0.0066010880 0.0058113137 0.0245211764 0.1487514547 0.1637802160 0.0075923232 0.0385527359 0.1575049888 0.0058352224 0.0151578617 0.0332220362 0.0264937109 0.1213342989 0.1547706314;\n\
+frequency C30pi8 = 0.2469059247 0.0106278945 0.0168929681 0.0027418266 0.1039406309 0.0103988197 0.0054944756 0.0373263209 0.0085752319 0.0292403793 0.0535091180 0.0056123053 0.0302246485 0.0251775640 0.0078098946 0.1642352274 0.1239889705 0.0053155877 0.0163953993 0.0955868125;\n\
+frequency C30pi9 = 0.0549428629 0.1305426495 0.0202957532 0.0092915274 0.0099280995 0.0906036344 0.0417085054 0.0105563869 0.0363512470 0.0569584863 0.1681833183 0.1152521806 0.0592328363 0.0243860149 0.0083055411 0.0283778833 0.0412594019 0.0096355359 0.0249780472 0.0592100878;\n\
+frequency C30pi10 = 0.0462773303 0.0362984274 0.0412365193 0.0182504174 0.0172727117 0.0348990852 0.0224266258 0.0160971397 0.1357852215 0.0164966886 0.0598936127 0.0239396241 0.0164507129 0.1336320854 0.0117413009 0.0454156401 0.0304387749 0.0330338410 0.2350163763 0.0253978649;\n\
+frequency C30pi11 = 0.0474379955 0.0410179935 0.0222453982 0.0112116958 0.0082332447 0.0374051414 0.0388100853 0.0055998598 0.0149156570 0.1832173840 0.1100691114 0.0467850545 0.0356443791 0.0116643783 0.0100244663 0.0317171100 0.1114352326 0.0026685586 0.0099660086 0.2199312452;\n\
+frequency C30pi12 = 0.0213607696 0.0069976154 0.0039878996 0.0012941246 0.0061024858 0.0139566033 0.0036297282 0.0030017014 0.0038425894 0.1309465785 0.4566988203 0.0054567760 0.1947837355 0.0371808169 0.0040747282 0.0076991487 0.0198018718 0.0034086391 0.0064545692 0.0693207986;\n\
+frequency C30pi13 = 0.0919632044 0.0160004872 0.0764682386 0.0306717360 0.0117031014 0.0160060006 0.0171907654 0.0370684649 0.0100792697 0.0093123713 0.0097240970 0.0205385908 0.0075767282 0.0041589440 0.0179686194 0.3254471625 0.2744377258 0.0013887442 0.0044739725 0.0178217761;\n\
+frequency C30pi14 = 0.4649246103 0.0043013249 0.0075304815 0.0050731691 0.0233328752 0.0043571322 0.0057994247 0.1495242047 0.0023298425 0.0043361190 0.0055995530 0.0028525398 0.0039313170 0.0025588185 0.0186467246 0.2150194771 0.0477030158 0.0009038096 0.0020087184 0.0292668421;\n\
+frequency C30pi15 = 0.2051329382 0.0439661329 0.0339418395 0.1070980865 0.0020915940 0.0822742346 0.1989733497 0.0487574293 0.0127143076 0.0058124693 0.0133471767 0.0667787412 0.0043783406 0.0018235059 0.0110997761 0.0873961609 0.0519781961 0.0007361603 0.0023821404 0.0193174204;\n\
+frequency C30pi16 = 0.0263689890 0.0133613622 0.2727158135 0.3117715371 0.0039462429 0.0218978778 0.0694354212 0.0799842408 0.0309615130 0.0027521242 0.0038579661 0.0288630708 0.0018363656 0.0023351927 0.0062457560 0.0798729385 0.0324143174 0.0007229656 0.0063857732 0.0042705326;\n\
+frequency C30pi17 = 0.1526502637 0.0332784464 0.0168229991 0.0237392180 0.0040215287 0.0341733672 0.0377949108 0.0306214335 0.0141929803 0.0123317972 0.0290062362 0.0375543022 0.0064473224 0.0058584416 0.3864504800 0.0880336410 0.0489543188 0.0018252558 0.0048877798 0.0313552773;\n\
+frequency C30pi18 = 0.0080247558 0.0017408595 0.0006327403 0.0003385965 0.0023412143 0.0015507896 0.0007818945 0.0005403825 0.0010026402 0.3177056649 0.3737894172 0.0012598254 0.0488212345 0.0311968471 0.0020687549 0.0012095129 0.0065696791 0.0016309208 0.0043343553 0.1944599147;\n\
+frequency C30pi19 = 0.0599950319 0.1000540567 0.1334918892 0.0889730776 0.0016884984 0.0864856169 0.0962700957 0.0588796388 0.0327277145 0.0021467269 0.0070876372 0.1825860579 0.0033979446 0.0011800742 0.0141408084 0.0779002375 0.0448817374 0.0006249028 0.0032641120 0.0042241415;\n\
+frequency C30pi20 = 0.0393520657 0.0838170642 0.1425481600 0.0431197671 0.0099071945 0.1019786610 0.0394639510 0.0282866471 0.2095718357 0.0076101442 0.0258339558 0.0596434088 0.0084586675 0.0188680789 0.0096840517 0.0624998643 0.0347087967 0.0054645779 0.0564145251 0.0127685828;\n\
+frequency C30pi21 = 0.0072715487 0.0140998918 0.0019756795 0.0027603830 0.0067852535 0.0043339290 0.0025069369 0.0080834718 0.0113217919 0.0056609640 0.0394199644 0.0017735096 0.0079866080 0.1271475634 0.0041098092 0.0052244365 0.0043022271 0.6273570153 0.1084563767 0.0094226397;\n\
+frequency C30pi22 = 0.0907070068 0.0290062335 0.0860677696 0.0745872716 0.0063699858 0.0259377035 0.0386802115 0.4750046194 0.0168090013 0.0014721054 0.0055149849 0.0343855535 0.0024692074 0.0028859215 0.0112150781 0.0731110371 0.0153705714 0.0022914775 0.0041860660 0.0039281943;\n\
+frequency C30pi23 = 0.0055291882 0.0024626303 0.0046086594 0.0011413426 0.0072105915 0.0022692184 0.0009683043 0.0016070950 0.0325831191 0.0082918400 0.0353677882 0.0013849437 0.0074486804 0.3744093753 0.0013374573 0.0057402692 0.0037279636 0.0330334445 0.4609978298 0.0098802591;\n\
+frequency C30pi24 = 0.2443263138 0.0045386562 0.0062422652 0.0031590902 0.0273880205 0.0053593950 0.0076715636 0.0196089609 0.0020189401 0.1017435067 0.0468424225 0.0045492259 0.0201286022 0.0060619450 0.0185219126 0.0497753825 0.1170795523 0.0009577255 0.0035333687 0.3104931504;\n\
+frequency C30pi25 = 0.0863111274 0.0984811895 0.0313963115 0.0600902926 0.0024419845 0.1672351286 0.2036096150 0.0175221435 0.0245245046 0.0105994220 0.0271209781 0.1485789590 0.0095824358 0.0029393105 0.0068276769 0.0347800318 0.0408210979 0.0014001253 0.0055105388 0.0202271268;\n\
+frequency C30pi26 = 0.0643926114 0.0369048739 0.1031213278 0.1628208462 0.0023165895 0.0752534859 0.1762701353 0.0297139006 0.0303503732 0.0088163033 0.0148016812 0.0727140107 0.0056748403 0.0043066715 0.0099270322 0.0926433867 0.0833129915 0.0011237109 0.0093801464 0.0161550816;\n\
+frequency C30pi27 = 0.1736682858 0.0943628709 0.0520404980 0.0285984935 0.0083596568 0.0722446698 0.0483894060 0.0781901497 0.0266134684 0.0068641911 0.0219499324 0.0964011794 0.0112303313 0.0058273974 0.0169661076 0.1547802460 0.0751701930 0.0028774511 0.0082130397 0.0172524320;\n\
+frequency C30pi28 = 0.0347856579 0.3075984538 0.0314157384 0.0092355245 0.0062754891 0.0861073155 0.0323568406 0.0170288127 0.0306438905 0.0091932292 0.0224428556 0.3020845818 0.0093720833 0.0034303536 0.0104447169 0.0326882932 0.0328713449 0.0025244855 0.0064171317 0.0130832013;\n\
+frequency C30pi29 = 0.1087737102 0.0051781020 0.0032679768 0.0015823203 0.0247877480 0.0057932006 0.0041769888 0.0134703172 0.0024765788 0.1643462917 0.2337152707 0.0027000391 0.0539213396 0.0316523420 0.0154886946 0.0188187787 0.0474912345 0.0037656478 0.0073106362 0.2512827825;\n\
+frequency C30pi30 = 0.1101008748 0.0324324597 0.0435098681 0.0579268520 0.0072699765 0.0615196630 0.0828181488 0.0314463068 0.0308557019 0.0530865813 0.1096787834 0.0293860426 0.0458728977 0.0269153699 0.0296430687 0.0715887866 0.0685882454 0.0062324120 0.0257237601 0.0754042006;\n\
+model C30 = POISSON+G+FMIX{C30pi1:1:0.0095783264,C30pi2:1:0.0248476365,C30pi3:1:0.0636309366,C30pi4:1:0.0537939225,C30pi5:1:0.0295885587,C30pi6:1:0.0117587936,C30pi7:1:0.0132013428,C30pi8:1:0.0236868805,C30pi9:1:0.0261687659,C30pi10:1:0.0239821974,C30pi11:1:0.0257100906,C30pi12:1:0.0465072425,C30pi13:1:0.0546794546,C30pi14:1:0.0536085131,C30pi15:1:0.0270622670,C30pi16:1:0.0403913593,C30pi17:1:0.0474212700,C30pi18:1:0.0458816478,C30pi19:1:0.0214036510,C30pi20:1:0.0290385981,C30pi21:1:0.01 [...]
+\n\
+[ ---------------------------------------------------------\n\
+    CAT-C40 profile mixture model of Le, Gascuel & Lartillot (2008)\n\
+ --------------------------------------------------------- ]\n\
+frequency C40pi1 = 0.0660259814 0.0231861755 0.1599815873 0.1054473175 0.0056586745 0.0273928499 0.0440360794 0.0711238664 0.0168194755 0.0039088727 0.0055316013 0.0366689617 0.0037412416 0.0013104807 0.0176359169 0.2497687201 0.1507079582 0.0006723214 0.0038290224 0.0065528958;\n\
+frequency C40pi2 = 0.0232377444 0.0122683027 0.2759650991 0.3532087982 0.0037987468 0.0197339134 0.0739378219 0.0576668030 0.0315866952 0.0031092806 0.0038711609 0.0259363304 0.0017355634 0.0024032103 0.0063116881 0.0657067704 0.0270483653 0.0007602894 0.0069602476 0.0047531689;\n\
+frequency C40pi3 = 0.0166486809 0.0012594763 0.0012622242 0.0005651446 0.0036665719 0.0010669784 0.0013356251 0.0008894749 0.0008231853 0.4129367561 0.0884689295 0.0011904105 0.0186054583 0.0082775676 0.0014029981 0.0021339439 0.0162167380 0.0006082049 0.0019553200 0.4206863114;\n\
+frequency C40pi4 = 0.2394741986 0.0072901253 0.0120536943 0.0044741726 0.0283811727 0.0086558850 0.0105529632 0.0135109628 0.0038929844 0.0765957115 0.0358494908 0.0071093014 0.0199496319 0.0055991131 0.0114265585 0.0847798773 0.1797284519 0.0009838000 0.0042240671 0.2454678377;\n\
+frequency C40pi5 = 0.1194613086 0.0233255669 0.0294552140 0.0134272792 0.0150526644 0.0301537796 0.0192173037 0.0337675998 0.0214746045 0.0579001821 0.1446308373 0.0147261337 0.0561242940 0.0550467421 0.0631355418 0.0925266727 0.0831230185 0.0131636136 0.0331118002 0.0811758434;\n\
+frequency C40pi6 = 0.0567043710 0.0117359330 0.0364734454 0.2955500969 0.0008924801 0.0609516515 0.3795154126 0.0230469606 0.0118360971 0.0031182036 0.0060137466 0.0314205689 0.0028584065 0.0012972333 0.0124745819 0.0300334889 0.0227051137 0.0007738758 0.0031343761 0.0094639563;\n\
+frequency C40pi7 = 0.0179027412 0.0040967133 0.0035697688 0.0008870412 0.0160760340 0.0045395474 0.0023182113 0.0039829808 0.0127292680 0.0404650518 0.1676143477 0.0027994718 0.0424172255 0.3344862590 0.0020115128 0.0075841581 0.0068227293 0.0518381385 0.2452542553 0.0326045442;\n\
+frequency C40pi8 = 0.2712170094 0.0056480837 0.0141045260 0.0021017036 0.2003830179 0.0048264059 0.0023229984 0.0502501222 0.0053727960 0.0150684657 0.0330003443 0.0020646283 0.0154811217 0.0202990358 0.0045351023 0.1764198412 0.0839578061 0.0046265242 0.0141271048 0.0741933626;\n\
+frequency C40pi9 = 0.0894736584 0.1040026384 0.0190192153 0.0272183085 0.0045538316 0.1168091917 0.1275076663 0.0115685734 0.0215746293 0.0469424171 0.0512035100 0.1382047308 0.0147656854 0.0056590176 0.0095546504 0.0383953611 0.0836652641 0.0017079427 0.0062181292 0.0819555787;\n\
+frequency C40pi10 = 0.0495441385 0.0375345822 0.0315863530 0.0143641284 0.0182505609 0.0316504100 0.0215379122 0.0140199913 0.1108543799 0.0247065801 0.0700287927 0.0258142032 0.0188271760 0.1418048822 0.0112101202 0.0456094427 0.0361427973 0.0371985427 0.2223972375 0.0369177689;\n\
+frequency C40pi11 = 0.1704314254 0.0415784004 0.0271109259 0.1098556600 0.0009747331 0.0917299929 0.2536458944 0.0249846466 0.0101389736 0.0058749399 0.0116526350 0.0903324267 0.0036512738 0.0013321301 0.0293613681 0.0561765645 0.0479045729 0.0006696817 0.0022637316 0.0203300232;\n\
+frequency C40pi12 = 0.0162725399 0.0054826071 0.0021876158 0.0010182101 0.0050614097 0.0104414465 0.0025141347 0.0021935389 0.0029914328 0.1328173512 0.4904441779 0.0040120394 0.1929931280 0.0376245580 0.0034333187 0.0040122105 0.0127074428 0.0032107554 0.0058100621 0.0647720205;\n\
+frequency C40pi13 = 0.0823765743 0.0734226431 0.0598389731 0.0311745159 0.0065694304 0.0686451074 0.0675530778 0.0178961594 0.0251143622 0.0291161743 0.0287904106 0.0982301674 0.0168022878 0.0064717899 0.0114044922 0.1302995288 0.1820374273 0.0022724618 0.0079573279 0.0540270885;\n\
+frequency C40pi14 = 0.3594965940 0.0072407229 0.0033421456 0.0031484357 0.0251417178 0.0049014279 0.0064962700 0.1194682267 0.0022970448 0.0458766662 0.0468053893 0.0050168849 0.0215568816 0.0092020461 0.0443915884 0.0465270945 0.0477755293 0.0024540215 0.0046450361 0.1942162766;\n\
+frequency C40pi15 = 0.2015583874 0.0430161610 0.0425386444 0.0954149893 0.0032365302 0.0772010857 0.1534908791 0.0667291678 0.0155218808 0.0067740832 0.0165114429 0.0547322644 0.0060162992 0.0025643300 0.0091970560 0.1185981804 0.0625472744 0.0009565508 0.0031150007 0.0202797924;\n\
+frequency C40pi16 = 0.1042731047 0.0147062345 0.0621645800 0.2424069523 0.0022450116 0.0356498946 0.1774821588 0.1697819523 0.0132648834 0.0018929517 0.0042542620 0.0220651981 0.0016441234 0.0012570256 0.0317041583 0.0778636230 0.0288515782 0.0006930898 0.0017741945 0.0060250231;\n\
+frequency C40pi17 = 0.0781183281 0.0111498472 0.0159270309 0.0041541669 0.0194448667 0.0240151620 0.0116633921 0.0111524105 0.0063589385 0.1354530457 0.2457574952 0.0093729846 0.1087781166 0.0262793949 0.0055294038 0.0408518858 0.0860514305 0.0031547586 0.0085108496 0.1482764918;\n\
+frequency C40pi18 = 0.0856592432 0.0101233167 0.0441923073 0.0135061568 0.0136072878 0.0092590642 0.0078602552 0.0245400880 0.0055379075 0.0100591561 0.0103343559 0.0127318506 0.0080675803 0.0047153035 0.0175273997 0.3406479487 0.3573294650 0.0014243098 0.0035099810 0.0193670227;\n\
+frequency C40pi19 = 0.0674594695 0.1161734658 0.1163107783 0.0662588409 0.0021634231 0.0939360452 0.0865501280 0.0368556575 0.0381149118 0.0033238825 0.0093839985 0.1899736999 0.0039487389 0.0018212730 0.0151207830 0.0842204423 0.0565953680 0.0007187305 0.0046189437 0.0064514195;\n\
+frequency C40pi20 = 0.0572262322 0.0494723554 0.1083882793 0.1793932771 0.0015301521 0.0903668522 0.1992261265 0.0316472274 0.0291392067 0.0045804559 0.0100739563 0.1015624916 0.0040204606 0.0013701849 0.0063674130 0.0621142922 0.0496102162 0.0006669285 0.0046497641 0.0085941279;\n\
+frequency C40pi21 = 0.0036020163 0.0102712927 0.0013455508 0.0020871647 0.0045484804 0.0032718114 0.0017857730 0.0056391633 0.0064968790 0.0029292916 0.0232635081 0.0010419846 0.0044592278 0.0855714596 0.0024991984 0.0030671803 0.0025900250 0.7617821954 0.0678809532 0.0058668443;\n\
+frequency C40pi22 = 0.2032018418 0.0083895722 0.0143743754 0.0135011707 0.0098131618 0.0044514580 0.0083818173 0.6184886075 0.0027747899 0.0011828492 0.0039826789 0.0044598895 0.0020631785 0.0019619615 0.0085870399 0.0739919851 0.0108922273 0.0018606145 0.0015638674 0.0060769136;\n\
+frequency C40pi23 = 0.0050898779 0.0028740788 0.0057092962 0.0016126151 0.0061776450 0.0024693148 0.0012040415 0.0016334183 0.0393460780 0.0059088776 0.0249343597 0.0013713662 0.0049795162 0.3563126947 0.0014136424 0.0059527667 0.0036536770 0.0357987380 0.4853645852 0.0081934106;\n\
+frequency C40pi24 = 0.0403335679 0.0540186397 0.0216052457 0.0098218598 0.0081549541 0.0383639077 0.0375406578 0.0047934404 0.0176735565 0.1893424159 0.1051859862 0.0607377395 0.0305599836 0.0119140782 0.0077550551 0.0257110173 0.1009913165 0.0028780020 0.0115276935 0.2210908828;\n\
+frequency C40pi25 = 0.0790086293 0.1065441152 0.0309384274 0.0546012394 0.0024947877 0.1843375981 0.1997882784 0.0192655847 0.0270700474 0.0075667489 0.0254542392 0.1553108816 0.0098024439 0.0023773444 0.0056640684 0.0332370813 0.0359574739 0.0011682801 0.0048820809 0.0145306498;\n\
+frequency C40pi26 = 0.0722240672 0.0489728405 0.0678929607 0.1194883992 0.0064755348 0.0708969573 0.1345886574 0.0287815397 0.0699011334 0.0173588702 0.0519870084 0.0490341790 0.0154411043 0.0348233029 0.0145597486 0.0589579876 0.0425972780 0.0087913770 0.0554386705 0.0317883834;\n\
+frequency C40pi27 = 0.1085842431 0.0206450023 0.0441956285 0.1529666596 0.0012502570 0.0405398136 0.1664851192 0.0336098469 0.0134902179 0.0038821795 0.0089861440 0.0576227094 0.0024339036 0.0014553522 0.1990095021 0.0846749753 0.0454715217 0.0005902831 0.0027650162 0.0113416246;\n\
+frequency C40pi28 = 0.0309526387 0.3195887318 0.0301336637 0.0082352132 0.0065593963 0.0832608108 0.0291974083 0.0154206187 0.0310385092 0.0098251607 0.0237900204 0.3062634996 0.0097071728 0.0036891639 0.0095029109 0.0295285439 0.0303052301 0.0028125285 0.0068850639 0.0133037148;\n\
+frequency C40pi29 = 0.0098953741 0.0019604525 0.0007307935 0.0003748228 0.0028276741 0.0017337004 0.0009182100 0.0006997068 0.0010419482 0.3115040359 0.3750387796 0.0013960508 0.0474451070 0.0298607430 0.0025296256 0.0014628019 0.0075738968 0.0016799771 0.0040259930 0.1973003069;\n\
+frequency C40pi30 = 0.1163213921 0.0273321006 0.0250163656 0.0731917718 0.0034792282 0.0586677248 0.1380880502 0.0193193469 0.0160240740 0.0712243431 0.0771473538 0.0355120487 0.0242841072 0.0094117688 0.0508926833 0.0475560280 0.0726552233 0.0026892716 0.0076166020 0.1235705162;\n\
+frequency C40pi31 = 0.1285218235 0.0373073487 0.1179844215 0.0402749992 0.0172928883 0.0439706110 0.0250692272 0.1127033137 0.0606981059 0.0109350265 0.0258415767 0.0288749652 0.0167592956 0.0199118302 0.0180674983 0.1741489481 0.0648967655 0.0063574951 0.0321771650 0.0182066946;\n\
+frequency C40pi32 = 0.0372286941 0.0094528028 0.0053377315 0.0023703173 0.0144940088 0.0079097138 0.0048585146 0.0046433943 0.0186795102 0.1820459527 0.1780099317 0.0058198481 0.0371334296 0.1463772419 0.0048538601 0.0103570678 0.0284161577 0.0211293603 0.0958905187 0.1849919442;\n\
+frequency C40pi33 = 0.0535643726 0.1159797757 0.0239172676 0.0113537364 0.0096256227 0.0928585070 0.0391699080 0.0120279334 0.0384887950 0.0522748270 0.1892392595 0.0996037748 0.0712219098 0.0264213736 0.0083720574 0.0299114019 0.0389484845 0.0104232046 0.0265030050 0.0500947835;\n\
+frequency C40pi34 = 0.1332424803 0.0033147683 0.0022704992 0.0012739239 0.0246514263 0.0030843469 0.0040461524 0.0089139209 0.0015864680 0.1971284995 0.1251288442 0.0023713225 0.0286947200 0.0156995251 0.0118845743 0.0171461828 0.0563298009 0.0017341820 0.0048778410 0.3566205216;\n\
+frequency C40pi35 = 0.1498658185 0.0326607222 0.0176452820 0.0280354786 0.0035437399 0.0348151308 0.0435380704 0.0311112643 0.0140625707 0.0101953314 0.0251433928 0.0393124980 0.0051548319 0.0047533945 0.3923800449 0.0874496981 0.0473306717 0.0015215239 0.0043208299 0.0271597054;\n\
+frequency C40pi36 = 0.4214366359 0.0061425967 0.0121590498 0.0073305074 0.0187609694 0.0072748556 0.0086837775 0.0902333103 0.0030262044 0.0039362777 0.0047193320 0.0051508681 0.0038306586 0.0027156136 0.0208940236 0.2901188793 0.0651922314 0.0008108235 0.0023622848 0.0252211004;\n\
+frequency C40pi37 = 0.1770713890 0.1332782050 0.0311656783 0.0226500225 0.0078348946 0.0752471493 0.0509767242 0.0897389513 0.0220667143 0.0059519850 0.0205369728 0.1257689326 0.0092982479 0.0040514178 0.0264087912 0.1169591448 0.0565566955 0.0029947127 0.0049346701 0.0165087010;\n\
+frequency C40pi38 = 0.0293984032 0.0370901720 0.1483622633 0.1099709900 0.0031729093 0.0388688450 0.0464270335 0.4222420155 0.0272494642 0.0007997326 0.0037634298 0.0622314461 0.0016657052 0.0015039626 0.0056481827 0.0472252404 0.0086568982 0.0009176022 0.0027693124 0.0020363920;\n\
+frequency C40pi39 = 0.0265779317 0.0791104753 0.1318603134 0.0280314140 0.0101369144 0.0989710810 0.0269057233 0.0173376629 0.2815133703 0.0064646977 0.0268210053 0.0474749135 0.0072375268 0.0276960902 0.0083014995 0.0426276702 0.0259042511 0.0078528946 0.0891598394 0.0100147256;\n\
+frequency C40pi40 = 0.0096096503 0.0027136180 0.0013104432 0.0006331856 0.0077301682 0.0033899420 0.0010471898 0.0020227436 0.0039001415 0.0733098005 0.4451691588 0.0014931484 0.0732575295 0.2630171690 0.0042768091 0.0036117358 0.0057928403 0.0181275729 0.0370698053 0.0425173480;\n\
+model C40 = POISSON+G+FMIX{C40pi1:1:0.0223853788,C40pi2:1:0.0338891820,C40pi3:1:0.0577169375,C40pi4:1:0.0252416233,C40pi5:1:0.0108607921,C40pi6:1:0.0462373793,C40pi7:1:0.0102293175,C40pi8:1:0.0147523625,C40pi9:1:0.0143161352,C40pi10:1:0.0182302541,C40pi11:1:0.0204025079,C40pi12:1:0.0425505156,C40pi13:1:0.0248627269,C40pi14:1:0.0105892988,C40pi15:1:0.0188238725,C40pi16:1:0.0086663445,C40pi17:1:0.0148496147,C40pi18:1:0.0343037402,C40pi19:1:0.0225335203,C40pi20:1:0.0174068578,C40pi21:1:0.01 [...]
+\n\
+[ ---------------------------------------------------------\n\
+    CAT-C50 profile mixture model of Le, Gascuel & Lartillot (2008)\n\
+ --------------------------------------------------------- ]\n\
+frequency C50pi1 = 0.1357566757 0.0328511938 0.0937692919 0.0757182069 0.0041887049 0.0448010470 0.0572805366 0.1210866186 0.0167465028 0.0049719235 0.0113823284 0.0458096069 0.0064563157 0.0029292810 0.0228705187 0.2060115780 0.1011347978 0.0012443033 0.0056104605 0.0093801079;\n\
+frequency C50pi2 = 0.0530862751 0.1905936010 0.0595772279 0.0320970468 0.0026608079 0.1152605895 0.0840617877 0.0196495178 0.0274729775 0.0064919200 0.0158709120 0.2635539775 0.0078171228 0.0017231166 0.0121639300 0.0449347664 0.0472425608 0.0008407188 0.0037608716 0.0111402722;\n\
+frequency C50pi3 = 0.0083279799 0.0007172026 0.0006359642 0.0003134388 0.0020547407 0.0007351595 0.0005373710 0.0005576905 0.0004858721 0.4370910601 0.1208722220 0.0006394909 0.0195499664 0.0090175268 0.0007265254 0.0007876194 0.0057076665 0.0006453449 0.0016797264 0.3889174318;\n\
+frequency C50pi4 = 0.2072868350 0.0166858699 0.0129177658 0.0020625574 0.0849982226 0.0151757635 0.0065903656 0.0472047575 0.0130289256 0.0345690755 0.1042722764 0.0075861385 0.0498042308 0.0572909747 0.0064928361 0.1183618036 0.0780339514 0.0128352368 0.0323576924 0.0924447209;\n\
+frequency C50pi5 = 0.0364181183 0.0076427099 0.0052725527 0.0020389950 0.0171009943 0.0064088232 0.0042399368 0.0053824238 0.0198596156 0.1361523026 0.1651892915 0.0045481616 0.0387479055 0.2025922657 0.0055053348 0.0121111950 0.0254621828 0.0327580458 0.1368025306 0.1357666147;\n\
+frequency C50pi6 = 0.0535489196 0.0099543365 0.0269073208 0.3076150732 0.0007101021 0.0574988641 0.4066173371 0.0204537673 0.0096286483 0.0025879708 0.0049721459 0.0280989086 0.0025143457 0.0010618006 0.0124317994 0.0247246015 0.0191107367 0.0006385967 0.0024132214 0.0085115039;\n\
+frequency C50pi7 = 0.0074733729 0.0025226602 0.0033967505 0.0005574007 0.0081158286 0.0037658904 0.0013610444 0.0022017759 0.0115142679 0.0195730439 0.1268878488 0.0018497296 0.0269141680 0.3821985941 0.0019970421 0.0057127939 0.0039692337 0.0553575998 0.3184099394 0.0162210153;\n\
+frequency C50pi8 = 0.2615592974 0.0027098854 0.0124908261 0.0020153852 0.2740228527 0.0017043893 0.0007667803 0.0463498030 0.0019474361 0.0082858275 0.0147048711 0.0010787235 0.0063051368 0.0062080862 0.0039442437 0.1940042648 0.0963699489 0.0016185483 0.0048431386 0.0590705550;\n\
+frequency C50pi9 = 0.1190557043 0.0956320251 0.0215995297 0.0378323341 0.0041536088 0.1151348174 0.1337084452 0.0179375220 0.0216767047 0.0336228770 0.0557402194 0.1132452331 0.0178407325 0.0063405927 0.0147606946 0.0478666925 0.0712091035 0.0022867238 0.0075728630 0.0627835766;\n\
+frequency C50pi10 = 0.0505010344 0.0281381134 0.0341872191 0.0178157543 0.0183140005 0.0271729546 0.0212018661 0.0176052654 0.1190104107 0.0161645217 0.0561232531 0.0203908848 0.0146521042 0.1553484132 0.0135251600 0.0478959652 0.0292963208 0.0376058633 0.2477283800 0.0273225153;\n\
+frequency C50pi11 = 0.1239446910 0.0355525870 0.0409769096 0.1479953346 0.0011563976 0.0908869312 0.2700270273 0.0283589709 0.0126760201 0.0064825033 0.0122101302 0.0787433823 0.0042467440 0.0016540857 0.0205717500 0.0552940245 0.0474239965 0.0008596621 0.0027823209 0.0181565313;\n\
+frequency C50pi12 = 0.0160542063 0.0027359185 0.0014708079 0.0007004900 0.0034820152 0.0061470051 0.0016359686 0.0022137927 0.0013207229 0.1640035117 0.4616043506 0.0021342205 0.2174099502 0.0143751693 0.0013694259 0.0037614383 0.0172651408 0.0011454338 0.0019438536 0.0792265779;\n\
+frequency C50pi13 = 0.1548192401 0.0131324559 0.0280584102 0.0095301620 0.0166267416 0.0175228950 0.0170969133 0.0179616718 0.0078385586 0.0865181208 0.0523369910 0.0132802182 0.0326348210 0.0083511229 0.0145594414 0.1096327081 0.2218108602 0.0015829972 0.0062173360 0.1704883347;\n\
+frequency C50pi14 = 0.2950313592 0.0027580697 0.0021616268 0.0015364190 0.0375439186 0.0028808733 0.0042976283 0.0261726702 0.0008294969 0.0834938143 0.0553606311 0.0022642314 0.0181259911 0.0074433078 0.0126794048 0.0382913338 0.0783205173 0.0010015148 0.0034016419 0.3264055498;\n\
+frequency C50pi15 = 0.1683177099 0.0820396152 0.0526048706 0.0822517150 0.0023029997 0.0969341246 0.1488943001 0.0535291188 0.0179803231 0.0032503636 0.0114941086 0.1156402642 0.0039439899 0.0015002945 0.0066854154 0.0924511658 0.0480769504 0.0006152103 0.0025022919 0.0089851683;\n\
+frequency C50pi16 = 0.0334088176 0.0134485791 0.1590918150 0.3657542471 0.0025127086 0.0327665151 0.1820739351 0.0740807194 0.0202010901 0.0016650025 0.0036700956 0.0295517886 0.0017087810 0.0011422805 0.0073155123 0.0426788071 0.0211162106 0.0005931485 0.0034724580 0.0037474882;\n\
+frequency C50pi17 = 0.0777586977 0.0174438357 0.0053423343 0.0043431532 0.0062523949 0.0220851281 0.0161769285 0.0053903202 0.0080675581 0.1052945216 0.1617365895 0.0148319919 0.0288253912 0.0168985297 0.2565426868 0.0202089662 0.0542929694 0.0060146095 0.0078109966 0.1646823969;\n\
+frequency C50pi18 = 0.0727013979 0.0048977192 0.0026095383 0.0011420120 0.0198747408 0.0066949336 0.0030401434 0.0079074845 0.0026492900 0.1685788878 0.3185489163 0.0026024909 0.0735597038 0.0490419983 0.0051699104 0.0128630830 0.0305356924 0.0050857840 0.0095279173 0.2029683559;\n\
+frequency C50pi19 = 0.0658153836 0.0833432992 0.0224582275 0.0107735824 0.0092974677 0.0745951987 0.0299754097 0.0146336557 0.0148026634 0.0671888719 0.2198675990 0.0868172087 0.1084156835 0.0155812696 0.0071132147 0.0381451947 0.0562948237 0.0056421684 0.0102813038 0.0589577740;\n\
+frequency C50pi20 = 0.0525278351 0.0364897390 0.0903013988 0.1854660991 0.0037795400 0.0776857292 0.1789287290 0.0232011648 0.0687702011 0.0135825419 0.0337350646 0.0458143770 0.0108457797 0.0191020037 0.0088729983 0.0495289201 0.0389358438 0.0046292762 0.0354195947 0.0223831639;\n\
+frequency C50pi21 = 0.0026515970 0.0080885204 0.0010572021 0.0016052142 0.0036540307 0.0022979498 0.0014681767 0.0046230912 0.0043887616 0.0020669456 0.0172444871 0.0006593575 0.0034691503 0.0658351447 0.0019185467 0.0022498420 0.0021278866 0.8183345006 0.0515918357 0.0046677595;\n\
+frequency C50pi22 = 0.0548133174 0.0692044159 0.0211265710 0.0207779125 0.0072646572 0.0567865657 0.0738456579 0.0051797705 0.0168408457 0.1386104888 0.0713795154 0.0896393340 0.0201205491 0.0082150393 0.0104049016 0.0282344422 0.0995597110 0.0019722093 0.0074054035 0.1986186919;\n\
+frequency C50pi23 = 0.0047955268 0.0028033787 0.0050506238 0.0014080516 0.0061671241 0.0019350126 0.0009861551 0.0014396818 0.0389623239 0.0048950388 0.0151748150 0.0012306644 0.0032520404 0.3601993060 0.0011266316 0.0054509935 0.0034763921 0.0362899931 0.4980200998 0.0073361467;\n\
+frequency C50pi24 = 0.0365462996 0.0280070630 0.0183606115 0.0070525803 0.0093251684 0.0300239431 0.0221812842 0.0047778642 0.0178840316 0.2025947306 0.1973012130 0.0250209750 0.0557862640 0.0258067541 0.0042772210 0.0209374223 0.0731398943 0.0049738166 0.0200601168 0.1959427463;\n\
+frequency C50pi25 = 0.0684197684 0.0111619750 0.0544764241 0.0224313301 0.0106958312 0.0091799953 0.0097436799 0.0255871619 0.0055558006 0.0059416697 0.0076746853 0.0144198991 0.0056892166 0.0037356845 0.0172554137 0.3527301149 0.3586913194 0.0012501907 0.0028636710 0.0124961682;\n\
+frequency C50pi26 = 0.0495330775 0.1060064564 0.1511923969 0.0483471288 0.0080946362 0.0886108407 0.0449556763 0.0331436148 0.1447288287 0.0061850770 0.0190407203 0.0948075276 0.0063418871 0.0126162987 0.0100869563 0.0799801169 0.0445418973 0.0044765096 0.0363930724 0.0109172804;\n\
+frequency C50pi27 = 0.0702411901 0.0642050323 0.0779553908 0.0510328304 0.0042438849 0.0723300485 0.0883747710 0.0177347101 0.0233800891 0.0198779320 0.0183537117 0.1051267065 0.0107865869 0.0037987118 0.0112811107 0.1345081583 0.1805543234 0.0014252764 0.0055089381 0.0392805971;\n\
+frequency C50pi28 = 0.1207399152 0.1741788075 0.0385528120 0.0162689581 0.0118494185 0.0760068404 0.0337935391 0.0653431008 0.0342783806 0.0085426053 0.0256788075 0.1434443984 0.0112347894 0.0061270793 0.0294493558 0.1091415488 0.0634181251 0.0046156419 0.0085374279 0.0187984481;\n\
+frequency C50pi29 = 0.0064521696 0.0021817337 0.0005939658 0.0003904032 0.0021538307 0.0019099968 0.0008007758 0.0005208471 0.0011374294 0.2850758996 0.4278536740 0.0013920239 0.0561988528 0.0449501501 0.0026289702 0.0011053664 0.0055157148 0.0022753671 0.0059612583 0.1509015707;\n\
+frequency C50pi30 = 0.0969092741 0.0359723370 0.0633194168 0.0411020773 0.0145578946 0.0466661704 0.0469223767 0.0374614202 0.0537149580 0.0394603009 0.0856256544 0.0283577862 0.0346435320 0.0507298072 0.0167177549 0.0990945318 0.0806503833 0.0128373826 0.0598972198 0.0553597218;\n\
+frequency C50pi31 = 0.0840212010 0.0214242172 0.2240668646 0.0354684798 0.0265031681 0.0235675678 0.0076026464 0.1173325117 0.0516019781 0.0048917455 0.0067211727 0.0173653354 0.0079342101 0.0087501486 0.0093276105 0.2637097946 0.0630157977 0.0022314593 0.0170994247 0.0073646661;\n\
+frequency C50pi32 = 0.0055061507 0.0012508737 0.0004824961 0.0004530173 0.0054435931 0.0011315076 0.0004150379 0.0012285001 0.0019884532 0.0617431901 0.4342418135 0.0008161868 0.0554628445 0.3289659386 0.0025814794 0.0021197505 0.0029510440 0.0172981374 0.0412097497 0.0347102358;\n\
+frequency C50pi33 = 0.0442014612 0.1295816316 0.0258622052 0.0148900471 0.0076165815 0.1301765579 0.0636708052 0.0105339122 0.0662542863 0.0423977240 0.1434197528 0.1040381429 0.0403363621 0.0260540342 0.0089335090 0.0242573966 0.0317938092 0.0077831996 0.0309973779 0.0472012033;\n\
+frequency C50pi34 = 0.0571984155 0.0034929878 0.0031324721 0.0012472712 0.0113230439 0.0025279922 0.0040737817 0.0030647398 0.0020494153 0.3131200932 0.0901750144 0.0034699557 0.0242565205 0.0112345295 0.0048197020 0.0095675953 0.0529842025 0.0010645104 0.0041851135 0.3970126433;\n\
+frequency C50pi35 = 0.1141963934 0.0102229903 0.0178644126 0.0172307307 0.0056978908 0.0039055039 0.0085974326 0.7425714921 0.0026414175 0.0005602022 0.0019872568 0.0055400059 0.0004739977 0.0010663175 0.0054302447 0.0508318204 0.0055408544 0.0018890811 0.0012409205 0.0025110348;\n\
+frequency C50pi36 = 0.3531758625 0.0043402857 0.0031812423 0.0030024877 0.0165711581 0.0029126214 0.0042077690 0.4520896100 0.0021366362 0.0063692579 0.0120143269 0.0022586970 0.0080260130 0.0043865828 0.0111462027 0.0658344033 0.0182952730 0.0010872878 0.0023330172 0.0266312657;\n\
+frequency C50pi37 = 0.0310798708 0.0234519814 0.1273669012 0.1197925100 0.0031216960 0.0295858842 0.0470763446 0.4883046368 0.0193412101 0.0008855622 0.0032808220 0.0408430573 0.0014984226 0.0016298596 0.0063229464 0.0423452622 0.0082797260 0.0007718998 0.0024996877 0.0025217188;\n\
+frequency C50pi38 = 0.0370340667 0.0689410214 0.1704407181 0.1041817082 0.0018108784 0.0715495095 0.0659866718 0.2159298358 0.0443591808 0.0008668888 0.0064679416 0.1275300877 0.0027248464 0.0014178323 0.0060253154 0.0534574556 0.0147073432 0.0007999410 0.0037708147 0.0019979426;\n\
+frequency C50pi39 = 0.0160398536 0.0526622999 0.1051167149 0.0187352256 0.0085330116 0.0922616498 0.0154450839 0.0076235155 0.3848449137 0.0057129406 0.0277195224 0.0219347380 0.0071078308 0.0376358992 0.0072201969 0.0209969653 0.0142198783 0.0096946226 0.1384243143 0.0080708232;\n\
+frequency C50pi40 = 0.0165549167 0.0085856833 0.0049441851 0.0016567380 0.0086529073 0.0184087838 0.0033759867 0.0033844413 0.0084695063 0.0483923758 0.4963073963 0.0056997331 0.1949377866 0.0999527140 0.0060271256 0.0084289585 0.0122619536 0.0114013282 0.0192314834 0.0233259964;\n\
+frequency C50pi41 = 0.0227379959 0.0137060298 0.3162561805 0.2932103363 0.0037073869 0.0169119273 0.0380984220 0.0550224760 0.0319886436 0.0039219190 0.0041582288 0.0312539900 0.0019467591 0.0022276545 0.0059660826 0.0998736999 0.0462336456 0.0007310446 0.0069012376 0.0051463400;\n\
+frequency C50pi42 = 0.2406936002 0.0197081082 0.0462578641 0.0206379264 0.0186726798 0.0189843646 0.0129785315 0.1749109142 0.0118714342 0.0049349532 0.0126237761 0.0127876711 0.0095642661 0.0083606873 0.0326283314 0.2101300187 0.1130042042 0.0041951500 0.0069210515 0.0201344675;\n\
+frequency C50pi43 = 0.0214325714 0.3730744306 0.0220674626 0.0037495290 0.0069038342 0.0670391950 0.0159298773 0.0126211348 0.0284477629 0.0102051798 0.0242954287 0.3272456489 0.0093147452 0.0036403029 0.0070138928 0.0216860624 0.0232259733 0.0030422478 0.0065368590 0.0125278613;\n\
+frequency C50pi44 = 0.1567707052 0.0258059606 0.0161658338 0.0223946414 0.0074382689 0.0274455582 0.0410010574 0.0360501033 0.0159972680 0.0640941463 0.0944756654 0.0192586366 0.0312789234 0.0227728534 0.1653169011 0.0640177954 0.0549103568 0.0050980224 0.0138248643 0.1158824381;\n\
+frequency C50pi45 = 0.4345912387 0.0061142999 0.0097660767 0.0060102195 0.0197377879 0.0069062805 0.0082800652 0.0829075516 0.0029125126 0.0047747098 0.0054182241 0.0049974525 0.0039676868 0.0029052002 0.0193588692 0.2795854727 0.0677816788 0.0008196092 0.0025196339 0.0306454302;\n\
+frequency C50pi46 = 0.0296734965 0.1443250343 0.0128668160 0.0059561454 0.0129805897 0.0492311054 0.0262726056 0.0069437743 0.0676183913 0.0452364160 0.1374511139 0.0907089722 0.0308070846 0.0816441785 0.0060701025 0.0197130339 0.0299715868 0.0461468661 0.1119414237 0.0444412635;\n\
+frequency C50pi47 = 0.1089911217 0.0159187676 0.0643054232 0.2086425054 0.0016540963 0.0375565797 0.1791004993 0.0610564917 0.0144660242 0.0038322948 0.0067778708 0.0372270242 0.0022817918 0.0012634818 0.0851792013 0.1065821239 0.0524401536 0.0005901255 0.0027836060 0.0093508169;\n\
+frequency C50pi48 = 0.1429463629 0.0304191716 0.0191145368 0.0351867799 0.0031493079 0.0341248336 0.0508492526 0.0305914291 0.0134276644 0.0070227247 0.0197257013 0.0421442438 0.0038904796 0.0040697467 0.4052202085 0.0874406009 0.0445304918 0.0012842531 0.0039485525 0.0209136585;\n\
+frequency C50pi49 = 0.0580116857 0.0903213669 0.0369245281 0.0613603988 0.0022829951 0.2073851382 0.2225853236 0.0159476910 0.0311816018 0.0068543753 0.0217092509 0.1504781849 0.0084841006 0.0020581132 0.0046206107 0.0276754451 0.0321477211 0.0011651089 0.0051889637 0.0136173964;\n\
+frequency C50pi50 = 0.2153540940 0.0359173007 0.0219927944 0.0735128474 0.0037017294 0.0566408566 0.1350375818 0.0662986417 0.0157121780 0.0138456188 0.0266922211 0.0474338339 0.0088042600 0.0035035311 0.0739583083 0.0921989198 0.0575687235 0.0019306896 0.0044520833 0.0454437865;\n\
+model C50 = POISSON+G+FMIX{C50pi1:1:0.0164297003,C50pi2:1:0.0273175755,C50pi3:1:0.0460247610,C50pi4:1:0.0084864734,C50pi5:1:0.0125389252,C50pi6:1:0.0343549036,C50pi7:1:0.0130241102,C50pi8:1:0.0094755681,C50pi9:1:0.0190040551,C50pi10:1:0.0151902354,C50pi11:1:0.0320534760,C50pi12:1:0.0210059850,C50pi13:1:0.0237408547,C50pi14:1:0.0239841203,C50pi15:1:0.0213748021,C50pi16:1:0.0210717705,C50pi17:1:0.0050241805,C50pi18:1:0.0166262276,C50pi19:1:0.0143945956,C50pi20:1:0.0104391130,C50pi21:1:0.01 [...]
+\n\
+[ ---------------------------------------------------------\n\
+    CAT-C60 profile mixture model of Le, Gascuel & Lartillot (2008)\n\
+ --------------------------------------------------------- ]\n\
+frequency C60pi1 = 0.1534363248 0.0444389067 0.0796726990 0.0546757288 0.0047306596 0.0514333025 0.0529324359 0.1103775749 0.0174480218 0.0050343887 0.0130294160 0.0603928711 0.0075550589 0.0035554315 0.0249523704 0.2029625968 0.0957668473 0.0014444483 0.0059800307 0.0101808864;\n\
+frequency C60pi2 = 0.0281984692 0.3031055487 0.0312954609 0.0091549350 0.0019503463 0.0939884393 0.0388530140 0.0084028325 0.0155384715 0.0107872879 0.0217786594 0.3476042929 0.0109904917 0.0015919288 0.0071539896 0.0197479052 0.0328352333 0.0009209994 0.0025714024 0.0135302919;\n\
+frequency C60pi3 = 0.0083680740 0.0007319768 0.0006123446 0.0002228366 0.0020433870 0.0009498685 0.0004731544 0.0004825748 0.0005189995 0.3768453098 0.2608334606 0.0006296168 0.0315700586 0.0123984358 0.0009595916 0.0009746383 0.0049990761 0.0008657759 0.0017132332 0.2938075872;\n\
+frequency C60pi4 = 0.2227229348 0.0064846074 0.0061206496 0.0007997588 0.1640285908 0.0051051888 0.0027280806 0.0202702520 0.0037183875 0.0455406072 0.0883350071 0.0022832871 0.0348094559 0.0228667054 0.0035471579 0.0850040072 0.1012848285 0.0048424833 0.0096500033 0.1698580069;\n\
+frequency C60pi5 = 0.0412139519 0.0067627055 0.0051067690 0.0017434391 0.0204715649 0.0057538477 0.0037263409 0.0069107492 0.0180293946 0.1154281623 0.1693562458 0.0042900270 0.0414066566 0.2239001858 0.0058416410 0.0149106129 0.0239548406 0.0332237129 0.1379349474 0.1200342049;\n\
+frequency C60pi6 = 0.0480550249 0.0308438053 0.0940628721 0.2084606133 0.0037801787 0.0747676701 0.1855184661 0.0191402239 0.0872162350 0.0094685435 0.0277340828 0.0375741243 0.0088308358 0.0196000958 0.0081267777 0.0439680761 0.0324588883 0.0034665720 0.0387499964 0.0181769181;\n\
+frequency C60pi7 = 0.0062848745 0.0026246919 0.0030342510 0.0005324147 0.0073027627 0.0034409089 0.0009741492 0.0019578159 0.0102225186 0.0180592309 0.1179064681 0.0016205916 0.0234721825 0.3974552519 0.0020165583 0.0056903327 0.0037091821 0.0598639097 0.3185565304 0.0152753744;\n\
+frequency C60pi8 = 0.1815005560 0.0026845411 0.0148484537 0.0025145485 0.4205633920 0.0014097001 0.0007088144 0.0461854175 0.0014374605 0.0041745536 0.0098310464 0.0006474254 0.0041611385 0.0068976432 0.0038767247 0.1864537050 0.0687189855 0.0027083549 0.0061033012 0.0345742379;\n\
+frequency C60pi9 = 0.0600740822 0.0367642654 0.0134869242 0.0170572285 0.0070719770 0.0142469806 0.0127486975 0.0343564471 0.0305859029 0.0204571345 0.0994551128 0.0212367087 0.0318165939 0.1140907926 0.0297628218 0.0505792699 0.0339368402 0.2312808862 0.1192491702 0.0217421638;\n\
+frequency C60pi10 = 0.0708394513 0.0474098489 0.0416822304 0.0324482918 0.0131641265 0.0494874703 0.0508264389 0.0183309196 0.0567272697 0.0650369079 0.1282255556 0.0343618389 0.0390362930 0.0594359563 0.0135608209 0.0551343199 0.0642260358 0.0137118382 0.0673934289 0.0789609573;\n\
+frequency C60pi11 = 0.0617689371 0.0076332888 0.0303081645 0.3430234188 0.0007199837 0.0307856241 0.3792509407 0.0284658686 0.0079592120 0.0016999627 0.0039945339 0.0216076877 0.0019734329 0.0009814186 0.0174791407 0.0337831940 0.0203426591 0.0006130268 0.0017102752 0.0058992300;\n\
+frequency C60pi12 = 0.0421559537 0.1042068314 0.0286980872 0.0164385240 0.0044450330 0.1393690851 0.0531949072 0.0134711207 0.0177764997 0.0267727728 0.1967237776 0.1323735242 0.1182827521 0.0086728324 0.0051837880 0.0255852718 0.0333292020 0.0045852327 0.0070281498 0.0217066546;\n\
+frequency C60pi13 = 0.2814809927 0.0100367066 0.0172867775 0.0064385734 0.0258337508 0.0133101925 0.0115046410 0.0270054934 0.0054629657 0.0188216093 0.0190993462 0.0098712843 0.0158719589 0.0050481705 0.0129510033 0.1886808600 0.2427104979 0.0012274627 0.0036052922 0.0837524211;\n\
+frequency C60pi14 = 0.2769188320 0.0017226995 0.0021315271 0.0011672545 0.0318292645 0.0018216251 0.0024752467 0.0199646887 0.0005170863 0.0983109006 0.0489264326 0.0016232163 0.0173414948 0.0070843906 0.0070179705 0.0336348952 0.0814141404 0.0007118144 0.0032942319 0.3620922883;\n\
+frequency C60pi15 = 0.1577797792 0.1112140270 0.0570403237 0.0648290471 0.0053318076 0.1065373681 0.0913586945 0.0906209718 0.0533809635 0.0029171632 0.0156225571 0.0782148712 0.0045758969 0.0025047816 0.0067077844 0.0929310045 0.0393122597 0.0028575821 0.0077590269 0.0085040899;\n\
+frequency C60pi16 = 0.0593735135 0.0354740772 0.1151175314 0.2189482708 0.0015332173 0.0688752402 0.1819422913 0.0813707101 0.0220478285 0.0020993577 0.0056191259 0.0750172075 0.0021871739 0.0010838321 0.0109737422 0.0726449461 0.0380238271 0.0007346460 0.0026664883 0.0042669729;\n\
+frequency C60pi17 = 0.0978066326 0.0265576438 0.0101843505 0.0120781428 0.0064138404 0.0307876446 0.0291282947 0.0128912798 0.0128036716 0.0723904209 0.1279438950 0.0245630658 0.0303267312 0.0198963719 0.2723524069 0.0350549441 0.0484557340 0.0046842467 0.0104773833 0.1152032995;\n\
+frequency C60pi18 = 0.0124023388 0.0030680354 0.0009239105 0.0006037316 0.0041885695 0.0032957441 0.0012524000 0.0011306791 0.0013542104 0.2344167852 0.4550557697 0.0016718177 0.0667307666 0.0610615367 0.0037076169 0.0019420934 0.0067612939 0.0038937184 0.0074911765 0.1290478057;\n\
+frequency C60pi19 = 0.0794230623 0.1294739355 0.0662792725 0.0587236242 0.0019919499 0.1143880588 0.1246900644 0.0325432311 0.0238605372 0.0036277150 0.0097987961 0.2147597316 0.0041846209 0.0012869951 0.0142410239 0.0615807386 0.0477333594 0.0006525371 0.0029420233 0.0078187231;\n\
+frequency C60pi20 = 0.0248148778 0.0083552910 0.1888915388 0.4278832998 0.0027839717 0.0210777725 0.1432386297 0.0643968435 0.0185736870 0.0022506941 0.0034558626 0.0179274104 0.0015714503 0.0014680353 0.0073768035 0.0377003132 0.0187767966 0.0005891859 0.0042602708 0.0046072655;\n\
+frequency C60pi21 = 0.0017003427 0.0060674330 0.0004222900 0.0010711490 0.0029059420 0.0016424179 0.0011731741 0.0035579609 0.0027630465 0.0012291190 0.0127420810 0.0004273804 0.0025671348 0.0513377024 0.0013536738 0.0011871674 0.0014033068 0.8640436936 0.0390912582 0.0033137266;\n\
+frequency C60pi22 = 0.0468360682 0.0639796924 0.0205603686 0.0185615516 0.0059954138 0.0557030821 0.0705436036 0.0045435329 0.0152062773 0.1550613356 0.0824253382 0.0866248354 0.0245854443 0.0080177192 0.0081485616 0.0237025617 0.0962054496 0.0018368673 0.0067131723 0.2047491243;\n\
+frequency C60pi23 = 0.0258764792 0.0201097124 0.0298384107 0.0107037437 0.0142503909 0.0158529432 0.0105649532 0.0073064999 0.1411078834 0.0114777629 0.0407992414 0.0119179202 0.0098798997 0.1876429961 0.0051228805 0.0275699644 0.0170764901 0.0405124999 0.3536390834 0.0187502449;\n\
+frequency C60pi24 = 0.0296285022 0.0046400334 0.0034944393 0.0008851024 0.0090046468 0.0055481111 0.0033046518 0.0027969482 0.0050701500 0.2583397750 0.2668085481 0.0046690936 0.0770825277 0.0408798247 0.0026918193 0.0068538089 0.0322265673 0.0035506055 0.0153353414 0.2271895033;\n\
+frequency C60pi25 = 0.0555725806 0.0098447861 0.0409064430 0.0140389597 0.0097418602 0.0068727710 0.0069443190 0.0157956555 0.0041631258 0.0069826497 0.0075271247 0.0139224817 0.0058762687 0.0034496730 0.0119733364 0.3482466393 0.4213655981 0.0010061491 0.0026576772 0.0131119012;\n\
+frequency C60pi26 = 0.0682671212 0.0615207091 0.0530661192 0.0360278709 0.0141433148 0.0612274332 0.0497415394 0.0268696520 0.1127674983 0.0132646615 0.0544493838 0.0482609047 0.0170033964 0.0803375967 0.0191949850 0.0671839752 0.0443995774 0.0199957919 0.1255070748 0.0267713947;\n\
+frequency C60pi27 = 0.0792618808 0.0638377192 0.0635289371 0.0436646174 0.0049503302 0.0666365188 0.0829639117 0.0183428565 0.0233169239 0.0249427251 0.0221483402 0.0932577596 0.0120893380 0.0049131149 0.0126360122 0.1334848656 0.1916745928 0.0018040086 0.0062353115 0.0503102360;\n\
+frequency C60pi28 = 0.0731759112 0.2105335985 0.0324200854 0.0110007149 0.0123458504 0.0858951989 0.0349942684 0.0224509173 0.0386903280 0.0246226304 0.0508307349 0.1783344831 0.0185740720 0.0093148787 0.0148722772 0.0603181436 0.0649574934 0.0051046395 0.0130597421 0.0385040321;\n\
+frequency C60pi29 = 0.0878402710 0.0110331750 0.0060801213 0.0032803903 0.0171147088 0.0109831614 0.0101465790 0.0087090941 0.0054902234 0.1987761871 0.1756460821 0.0082096925 0.0417232903 0.0191954435 0.0111283542 0.0209862621 0.0697718709 0.0031744014 0.0081905473 0.2825201446;\n\
+frequency C60pi30 = 0.0990215820 0.0349351987 0.0211149501 0.0118797946 0.0108995677 0.0557710676 0.0278999992 0.0240250097 0.0123445071 0.0776564721 0.2354511299 0.0322817789 0.1207665429 0.0214442058 0.0075655541 0.0524170141 0.0649785115 0.0047075806 0.0077328724 0.0771066610;\n\
+frequency C60pi31 = 0.0601641168 0.0161995226 0.2783522747 0.0337188808 0.0315066987 0.0210645987 0.0059839451 0.0543080710 0.0531523512 0.0070650825 0.0070698142 0.0139598368 0.0088298653 0.0069525877 0.0075834331 0.2829802556 0.0860317092 0.0014966551 0.0134849454 0.0100953553;\n\
+frequency C60pi32 = 0.0049781737 0.0018412331 0.0007012207 0.0005315368 0.0052978737 0.0024089907 0.0007630546 0.0015051317 0.0041575221 0.0443828633 0.4417417476 0.0011615060 0.0602807417 0.3351117140 0.0027847686 0.0025795769 0.0030288544 0.0171302592 0.0458455751 0.0237676560;\n\
+frequency C60pi33 = 0.0251996593 0.1114468110 0.0142031925 0.0041012288 0.0097099500 0.0620070749 0.0262571641 0.0038067269 0.0431938935 0.0974043253 0.2447197423 0.0824312856 0.0539323021 0.0429091639 0.0052658505 0.0096093107 0.0251183002 0.0146571900 0.0456965140 0.0783303143;\n\
+frequency C60pi34 = 0.0230361648 0.0014748749 0.0013534390 0.0006264439 0.0048580122 0.0009870046 0.0015762583 0.0011565336 0.0008899238 0.3952895890 0.0576537208 0.0014663528 0.0140986541 0.0072127040 0.0020177885 0.0028770237 0.0205580852 0.0005477695 0.0019539080 0.4603657493;\n\
+frequency C60pi35 = 0.1408776963 0.0297808449 0.0171297613 0.0285076933 0.0032213718 0.0320632225 0.0423838922 0.0299558472 0.0131321477 0.0066914481 0.0195120028 0.0383781635 0.0036276863 0.0041231064 0.4383466229 0.0851400095 0.0422765692 0.0013236871 0.0037087638 0.0198194632;\n\
+frequency C60pi36 = 0.4442491220 0.0050216551 0.0102305117 0.0057193038 0.0235405374 0.0055997640 0.0064889886 0.0822687710 0.0025505743 0.0033615104 0.0040990063 0.0038097073 0.0028683069 0.0024413211 0.0162890960 0.2999969708 0.0559664935 0.0007735426 0.0020639824 0.0226608347;\n\
+frequency C60pi37 = 0.0898717958 0.0070958305 0.0130067619 0.0129166888 0.0044131479 0.0023806547 0.0058957027 0.8087563021 0.0016517855 0.0004339282 0.0015564455 0.0033939025 0.0004253422 0.0008073572 0.0034128140 0.0362876891 0.0032887534 0.0015223902 0.0008537454 0.0020289624;\n\
+frequency C60pi38 = 0.0550840246 0.0472254260 0.1877829604 0.1273796123 0.0035824944 0.0527969268 0.0655884730 0.0637607521 0.0404883483 0.0075574152 0.0136304510 0.0867682792 0.0081684229 0.0040375032 0.0110681809 0.1263380956 0.0752544318 0.0013563681 0.0118590434 0.0102727908;\n\
+frequency C60pi39 = 0.0117681394 0.0442558806 0.0844144627 0.0144712108 0.0070388254 0.1038342049 0.0110901161 0.0049626578 0.4337194047 0.0061337038 0.0298794939 0.0137928558 0.0076237551 0.0338266335 0.0081346096 0.0140571089 0.0108276801 0.0080683065 0.1437251732 0.0083757773;\n\
+frequency C60pi40 = 0.0159285638 0.0048098656 0.0032692643 0.0010966937 0.0080519916 0.0134552459 0.0021324215 0.0025086365 0.0049192147 0.0501543893 0.5307634291 0.0035599431 0.2160085187 0.0743650717 0.0045247350 0.0066922196 0.0119092283 0.0070928134 0.0106565111 0.0281012433;\n\
+frequency C60pi41 = 0.0195973253 0.0105142992 0.3289103336 0.3099848991 0.0034539049 0.0116196758 0.0250777800 0.0627528956 0.0295961112 0.0032650434 0.0028246884 0.0240963907 0.0008425062 0.0019706550 0.0049062781 0.1064984500 0.0438053705 0.0006333959 0.0056197958 0.0040302013;\n\
+frequency C60pi42 = 0.0833804360 0.0125871438 0.0969824220 0.0686820704 0.0081981143 0.0121520930 0.0227415415 0.0982291876 0.0073954898 0.0017471177 0.0039653113 0.0129342146 0.0019557975 0.0024132583 0.0355924232 0.3115606483 0.2113368612 0.0016329034 0.0017991083 0.0047138579;\n\
+frequency C60pi43 = 0.0181409133 0.4129662563 0.0233205154 0.0033333547 0.0085143598 0.0526694251 0.0096531879 0.0224552642 0.0375238929 0.0035090482 0.0149146621 0.3208065790 0.0046098856 0.0035426859 0.0087197469 0.0262309419 0.0131791136 0.0034766995 0.0079588201 0.0044746474;\n\
+frequency C60pi44 = 0.2494227404 0.0185481724 0.0164119567 0.0169234299 0.0122862654 0.0228501981 0.0370491083 0.0347467705 0.0087069587 0.0595718359 0.0451065029 0.0177064733 0.0204556127 0.0077360919 0.0686403544 0.0889295672 0.0986017356 0.0028603862 0.0061938477 0.1672519917;\n\
+frequency C60pi45 = 0.1419737638 0.0373945961 0.0576296888 0.0537452477 0.0068856658 0.0286239972 0.0407540287 0.3988107872 0.0152895617 0.0016627616 0.0092348297 0.0314273807 0.0055425500 0.0040286132 0.0180328866 0.1123731997 0.0242478202 0.0025909098 0.0049054208 0.0048462908;\n\
+frequency C60pi46 = 0.0178903305 0.1958843646 0.0155853897 0.0031054277 0.0290304227 0.1051819261 0.0040503389 0.0100480293 0.1252696215 0.0016708003 0.0722356645 0.0233340169 0.0116142354 0.0238913260 0.0009938415 0.0181675536 0.0186260222 0.2260554691 0.0859787232 0.0113864962;\n\
+frequency C60pi47 = 0.1454758367 0.0420979067 0.0400419720 0.1294249748 0.0014186329 0.0906469055 0.2471353458 0.0319650773 0.0130426183 0.0058525371 0.0123593139 0.0818154090 0.0044178939 0.0017552077 0.0151135525 0.0656688174 0.0511289472 0.0007731441 0.0029258438 0.0169400635;\n\
+frequency C60pi48 = 0.0169799462 0.0242346701 0.1318047919 0.1043655101 0.0022087215 0.0269349684 0.0376379591 0.5404470183 0.0181137053 0.0007459679 0.0021146994 0.0508617611 0.0009473769 0.0006780593 0.0038754401 0.0297030159 0.0045836180 0.0006031889 0.0015704090 0.0015891728;\n\
+frequency C60pi49 = 0.0402646249 0.1152022601 0.0323829165 0.0293968352 0.0039388655 0.2497008043 0.1603524245 0.0129260411 0.0617967839 0.0098491259 0.0354918823 0.1448804422 0.0124818865 0.0041153375 0.0043374229 0.0243246958 0.0305645368 0.0026676598 0.0097227847 0.0156026694;\n\
+frequency C60pi50 = 0.2256914610 0.0523417493 0.0244308734 0.0637125217 0.0043390149 0.0578159236 0.1154830640 0.0867335173 0.0131066949 0.0085086217 0.0193314218 0.0660468804 0.0064877206 0.0027440054 0.0611149102 0.1070877179 0.0507677144 0.0013695913 0.0028982948 0.0299883012;\n\
+frequency C60pi51 = 0.0033164209 0.0015310773 0.0030830171 0.0008266472 0.0051890730 0.0011024889 0.0005134130 0.0010432830 0.0278451262 0.0041895268 0.0111212494 0.0007149922 0.0023621780 0.3801761447 0.0008365077 0.0035876698 0.0023608948 0.0333346985 0.5107889643 0.0060766272;\n\
+frequency C60pi52 = 0.1995014012 0.0236078675 0.0392254543 0.0094955104 0.0584590451 0.0254265363 0.0125535371 0.0939787338 0.0341857201 0.0140209879 0.0449387571 0.0118723304 0.0246990633 0.0634433944 0.0145385320 0.1663920640 0.0533159207 0.0129802666 0.0606346163 0.0367302614;\n\
+frequency C60pi53 = 0.0319448994 0.1011667268 0.2084709220 0.0378074649 0.0066040348 0.0766372935 0.0279488190 0.0365541130 0.2088643258 0.0047542347 0.0156545731 0.0868664783 0.0043253317 0.0108915768 0.0060899575 0.0577656939 0.0302051160 0.0026001883 0.0387897304 0.0060585202;\n\
+frequency C60pi54 = 0.0776799515 0.0142518583 0.0403216692 0.0080651725 0.0140092962 0.0179995517 0.0112622427 0.0136868237 0.0133729897 0.1239635380 0.0724670993 0.0129144967 0.0420745442 0.0173584908 0.0117084432 0.0922723571 0.2316899445 0.0028153633 0.0141726542 0.1679135132;\n\
+frequency C60pi55 = 0.1183662657 0.0805192606 0.0259524932 0.0495595439 0.0035624835 0.1204924917 0.1537589210 0.0194993426 0.0229373171 0.0302661211 0.0571250629 0.0982304112 0.0171727472 0.0068665705 0.0175153030 0.0486588400 0.0635796210 0.0023008307 0.0083027431 0.0553336300;\n\
+frequency C60pi56 = 0.0528559899 0.0193569043 0.0264743774 0.2092761515 0.0008625883 0.1212409715 0.4024189781 0.0155838458 0.0124148798 0.0054864832 0.0090256472 0.0497017031 0.0042357114 0.0012650715 0.0063185636 0.0197262901 0.0235463735 0.0008381610 0.0033948741 0.0159764347;\n\
+frequency C60pi57 = 0.0344366215 0.0426221820 0.1636716191 0.1139007491 0.0020985982 0.0605413987 0.0541780220 0.3361639671 0.0461776737 0.0003463416 0.0048355678 0.0667552967 0.0019704509 0.0031557619 0.0040369775 0.0481173332 0.0089148085 0.0006510101 0.0054145649 0.0020110555;\n\
+frequency C60pi58 = 0.1153088951 0.0151278638 0.0458476603 0.1755516676 0.0014962362 0.0366731222 0.1749410045 0.0394181311 0.0132401530 0.0056912974 0.0101409559 0.0433118387 0.0030332064 0.0015700232 0.1665802563 0.0871536033 0.0468260603 0.0007515702 0.0031432715 0.0141931831;\n\
+frequency C60pi59 = 0.3865149348 0.0037579334 0.0030420497 0.0022366810 0.0218928357 0.0021464743 0.0031387843 0.3694353983 0.0014672902 0.0085376076 0.0127257242 0.0018840458 0.0080581695 0.0039281367 0.0158688291 0.0808877279 0.0305195935 0.0009922880 0.0019020345 0.0410634615;\n\
+frequency C60pi60 = 0.0146570745 0.0028841333 0.0012998335 0.0005210575 0.0024317913 0.0049362750 0.0014874369 0.0020953252 0.0010181940 0.1913901476 0.4432797758 0.0022898369 0.2217427062 0.0091637503 0.0007685153 0.0027251487 0.0170997497 0.0008779380 0.0014756028 0.0778557075;\n\
+model C60 = POISSON+G+FMIX{C60pi1:1:0.0169698865,C60pi2:1:0.0211683374,C60pi3:1:0.0276589079,C60pi4:1:0.0065675964,C60pi5:1:0.0141221416,C60pi6:1:0.0068774834,C60pi7:1:0.0146909701,C60pi8:1:0.0067225777,C60pi9:1:0.0018396660,C60pi10:1:0.0102547197,C60pi11:1:0.0230896163,C60pi12:1:0.0057941033,C60pi13:1:0.0125394534,C60pi14:1:0.0204526478,C60pi15:1:0.0070629602,C60pi16:1:0.0117982741,C60pi17:1:0.0068334668,C60pi18:1:0.0433775839,C60pi19:1:0.0318278731,C60pi20:1:0.0222546108,C60pi21:1:0.01 [...]
+\n\
+end;\n";
+
+const double MIN_MIXTURE_PROP = 0.001; // lower bound applied (via max) to explicitly given frequency-mixture component weights in the ModelMixture constructor
+//const double MAX_MIXTURE_PROP = 1000.0;
+//const double MIN_MIXTURE_RATE = 0.01;
+//const double MAX_MIXTURE_RATE = 100.0;
+
+ModelSubst* createModel(string model_str, ModelsBlock *models_block, StateFreqType freq_type, string freq_params,
+		PhyloTree* tree, bool count_rates)
+{ // Factory: builds the substitution model selected by model_str and tree->aln->seq_type and returns it as a ModelSubst*.
+	ModelSubst *model = NULL; // stays NULL if no branch below assigns it
+	//cout << "Numstates: " << tree->aln->num_states << endl;
+	string model_params; // parameter text passed on to the concrete model; empty means "none given"
+	NxsModel *nxsmodel = models_block->findModel(model_str); // NOTE(review): lookup uses model_str before any bracketed suffix is stripped below
+	if (nxsmodel) model_params = nxsmodel->description; // a definition found in models_block supplies the default parameters
+	size_t pos = model_str.find(OPEN_BRACKET);
+	if (pos != string::npos) { // name carries inline parameters: NAME<OPEN_BRACKET>params<CLOSE_BRACKET>
+		if (model_str.rfind(CLOSE_BRACKET) != model_str.length()-1) // the closing bracket must be the very last character
+			outError("Close bracket not found at the end of ", model_str);
+		model_params = model_str.substr(pos+1, model_str.length()-pos-2); // text between the brackets overrides the models_block description
+		model_str = model_str.substr(0, pos); // keep only the bare model name for the dispatch below
+	}
+	/*
+	if ((model_str == "JC" && tree->aln->seq_type == SEQ_DNA) ||
+		(model_str == "POISSON" && tree->aln->seq_type == SEQ_PROTEIN) ||
+		(model_str == "JC2" && tree->aln->seq_type == SEQ_BINARY) ||
+		(model_str == "JCC" && tree->aln->seq_type == SEQ_CODON) ||
+		(model_str == "MK" && tree->aln->seq_type == SEQ_MORPH))
+	{
+		model = new ModelSubst(tree->aln->num_states);
+	} else */
+	if ((model_str == "GTR" && tree->aln->seq_type == SEQ_DNA) || // GTR-type names are accepted only with the matching sequence type
+		(model_str == "GTR2" && tree->aln->seq_type == SEQ_BINARY) ||
+		(model_str == "GTR20" && tree->aln->seq_type == SEQ_PROTEIN)) {
+		model = new ModelGTR(tree, count_rates);
+		if (freq_params != "") // user-supplied state frequencies are read before init()
+			((ModelGTR*)model)->readStateFreq(freq_params);
+		if (model_params != "") // likewise for user-supplied rates
+			((ModelGTR*)model)->readRates(model_params);
+		((ModelGTR*)model)->init(freq_type);
+	} else if (model_str == "UNREST") { // checked by name only, regardless of sequence type
+		freq_type = FREQ_EQUAL; // the caller's freq_type is overridden for this model
+		//params.optimize_by_newton = false;
+		tree->optimize_by_newton = false; // side effect: changes a setting on the caller's tree
+		model = new ModelNonRev(tree, count_rates);
+		((ModelNonRev*)model)->init(freq_type);
+	} else if (tree->aln->seq_type == SEQ_BINARY) { // remaining names: the model class is chosen by sequence type and interprets model_str itself
+		model = new ModelBIN(model_str.c_str(), model_params, freq_type, freq_params, tree, count_rates);
+	} else if (tree->aln->seq_type == SEQ_DNA) {
+		model = new ModelDNA(model_str.c_str(), model_params, freq_type, freq_params, tree, count_rates);
+	} else if (tree->aln->seq_type == SEQ_PROTEIN) {
+		model = new ModelProtein(model_str.c_str(), model_params, freq_type, freq_params, tree, count_rates);
+	} else if (tree->aln->seq_type == SEQ_CODON) {
+		model = new ModelCodon(model_str.c_str(), model_params, freq_type, freq_params, tree, count_rates);
+	} else if (tree->aln->seq_type == SEQ_MORPH) {
+		model = new ModelMorphology(model_str.c_str(), model_params, freq_type, freq_params, tree); // count_rates is not passed to this constructor
+	} else {
+		outError("Unsupported model type"); // NOTE(review): presumably does not return; if it does, NULL is returned below -- confirm
+	}
+
+	return model; // heap-allocated with new; no delete in this function, so ownership presumably passes to the caller -- confirm
+}
+
+/**
+ * Construct a mixture of substitution models.
+ *
+ * model_list (or model_name when model_list is empty) is a comma-separated list of
+ * components, each written as NAME[:RATE[:WEIGHT]]. RATE and WEIGHT default to 1.0;
+ * giving any WEIGHT marks the weights as user-specified (fix_prop).
+ *
+ * When freq is FREQ_MIXTURE, freq_params is parsed the same way as a list of frequency
+ * profiles that are looked up in models_block (the special name "empirical" selects
+ * empirical frequencies), and every model component is combined with every profile.
+ *
+ * On return the weights in prop sum to 1, total_num_subst of the components is rescaled
+ * so that the weighted sum over components is 1, and all components use slices of the
+ * eigen buffers owned by this object.
+ *
+ * @param orig_model_name value stored in name
+ * @param model_name component list used when model_list is empty
+ * @param model_list comma-separated component list
+ * @param models_block model definitions used to resolve component and frequency names
+ * @param freq state frequency type
+ * @param freq_params frequency parameters, or the frequency-profile list for FREQ_MIXTURE
+ * @param tree associated phylogenetic tree
+ * @param optimize_weights TRUE to estimate the weights even if they were specified
+ * @param count_rates forwarded to ModelGTR and to createModel() for every component
+ */
+ModelMixture::ModelMixture(string orig_model_name, string model_name, string model_list, ModelsBlock *models_block,
+		StateFreqType freq, string freq_params, PhyloTree *tree, bool optimize_weights, bool count_rates)
+	: ModelGTR(tree, count_rates)
+{
+	size_t cur_pos;
+	int m;
+
+	// frequency profiles (a NULL entry stands for empirical frequencies) with rate and weight
+	vector<NxsModel*> freq_vec;
+	DoubleVector freq_rates;
+	DoubleVector freq_weights;
+	fix_prop = false;
+	optimizing_submodels = false;
+
+	if (freq == FREQ_MIXTURE) {
+		// parse the list of frequency profiles, each NAME[:RATE[:WEIGHT]]
+		for (m = 0, cur_pos = 0; cur_pos < freq_params.length(); m++) {
+			size_t pos = freq_params.find(',', cur_pos);
+			if (pos == string::npos)
+				pos = freq_params.length();
+			if (pos <= cur_pos)
+				outError("One frequency name in the mixture is empty.");
+			string this_name = freq_params.substr(cur_pos, pos-cur_pos);
+			double rate = 1.0, weight = 1.0;
+			size_t pos_rate = this_name.find(':');
+			if (pos_rate != string::npos) {
+				size_t pos_weight = this_name.find(':', pos_rate+1);
+				if (pos_weight == string::npos) {
+					rate = convert_double(this_name.substr(pos_rate+1).c_str());
+				} else {
+					rate = convert_double(this_name.substr(pos_rate+1, pos_weight-pos_rate-1).c_str());
+					weight = convert_double(this_name.substr(pos_weight+1).c_str());
+					fix_prop = true;
+					if (weight <= 0.0)
+						outError("Mixture component weight is negative!");
+					weight = max(weight, MIN_MIXTURE_PROP);
+				}
+				this_name = this_name.substr(0, pos_rate);
+			}
+			freq_rates.push_back(rate);
+			freq_weights.push_back(weight);
+			cur_pos = pos+1;
+			if (this_name == "empirical") {
+				freq_vec.push_back(NULL);
+			} else {
+				NxsModel *freq_mod = models_block->findModel(this_name);
+				if (!freq_mod)
+					outError("Frequency mixture name not found ", this_name);
+				if (!(freq_mod->flag & NM_FREQ))
+					outError("Frequency mixture name does not corresponding to frequency model ", this_name);
+				freq_vec.push_back(freq_mod);
+			}
+		}
+		// the empirical profile receives the summed weight of the named profiles
+		// divided by the total number of profiles
+		const int nfreq = (int)freq_weights.size();
+		double sum_weights = 0.0;
+		for (m = 0; m < nfreq; m++)
+			if (freq_vec[m])
+				sum_weights += freq_weights[m];
+		for (m = 0; m < nfreq; m++)
+			if (!freq_vec[m])
+				freq_weights[m] = sum_weights/freq_weights.size();
+		init(FREQ_USER_DEFINED);
+	} else {
+		if (freq_params != "")
+			readStateFreq(freq_params);
+		init(freq);
+	}
+
+	DoubleVector weights;
+	name = orig_model_name;
+	full_name = (string)"MIX" + OPEN_BRACKET;
+	if (model_list == "") model_list = model_name;
+	// parse the list of model components, each NAME[:RATE[:WEIGHT]]
+	for (m = 0, cur_pos = 0; cur_pos < model_list.length(); m++) {
+		size_t pos = model_list.find(',', cur_pos);
+		if (pos == string::npos)
+			pos = model_list.length();
+		if (pos <= cur_pos)
+			outError("One model name in the mixture is empty.");
+		string this_name = model_list.substr(cur_pos, pos-cur_pos);
+		double rate = 1.0, weight = 1.0;
+		size_t pos_rate = this_name.find(':');
+		if (pos_rate != string::npos) {
+			size_t pos_weight = this_name.find(':', pos_rate+1);
+			if (pos_weight == string::npos) {
+				rate = convert_double(this_name.substr(pos_rate+1).c_str());
+			} else {
+				rate = convert_double(this_name.substr(pos_rate+1, pos_weight-pos_rate-1).c_str());
+				weight = convert_double(this_name.substr(pos_weight+1).c_str());
+				fix_prop = true;
+				if (weight <= 0.0)
+					outError("Mixture component weight is negative!");
+			}
+			this_name = this_name.substr(0, pos_rate);
+		}
+		cur_pos = pos+1;
+		ModelGTR* model = NULL;
+		if (freq == FREQ_MIXTURE) {
+			// one component per (model, frequency profile) pair
+			for (size_t f = 0; f != freq_vec.size(); f++) {
+				if (freq_vec[f])
+					model = (ModelGTR*)createModel(this_name, models_block, FREQ_USER_DEFINED, freq_vec[f]->description, tree, count_rates);
+				else
+					model = (ModelGTR*)createModel(this_name, models_block, FREQ_EMPIRICAL, "", tree, count_rates);
+				model->total_num_subst = rate * freq_rates[f];
+				push_back(model);
+				weights.push_back(weight * freq_weights[f]);
+				if (m > 0 || f > 0)
+					full_name += ',';
+				if (freq_vec[f]) {
+					model->name += "+F" +freq_vec[f]->name + "";
+					model->full_name += "+F" +freq_vec[f]->name + "";
+				} else {
+					model->name += "+F";
+					model->full_name += "+F";
+				}
+				full_name += model->name;
+			}
+		} else {
+			model = (ModelGTR*)createModel(this_name, models_block, freq, freq_params, tree, count_rates);
+			model->total_num_subst = rate;
+			push_back(model);
+			weights.push_back(weight);
+			if (m > 0)
+				full_name += ',';
+			full_name += model->name;
+		}
+	}
+
+	full_name += CLOSE_BRACKET;
+
+	int nmixtures = size();
+	prop = aligned_alloc<double>(nmixtures);
+
+	double sum = 0.0;
+	int i;
+	if (fix_prop) {
+		// user-specified weights
+		for (i = 0, sum = 0.0; i < nmixtures; i++) {
+			prop[i] = weights[i];
+			sum += prop[i];
+		}
+	} else {
+		// no weights given: start from equal weights
+		for (i = 0, sum = 0.0; i < nmixtures; i++) {
+			prop[i] = 1.0/nmixtures;
+			sum += prop[i];
+		}
+	}
+	// normalize weights to 1.0
+	if (sum != 1.0) {
+		sum = 1.0/sum;
+		for (i = 0; i < nmixtures; i++)
+			prop[i] *= sum;
+	}
+
+	// rescale total_num_subst such that the global rate is 1
+	for (i = 0, sum = 0.0; i < nmixtures; i++)
+		sum += prop[i]*at(i)->total_num_subst;
+	for (i = 0; i < nmixtures; i++)
+		at(i)->total_num_subst /= sum;
+
+	if (optimize_weights) fix_prop = false;
+	// a single component has nothing to estimate
+	fix_prop |= (nmixtures == 1);
+
+	// replace the eigen buffers of this object by arrays large enough for all components
+	if (eigenvalues) aligned_free(eigenvalues);
+	if (eigenvectors) aligned_free(eigenvectors);
+	if (inv_eigenvectors) aligned_free(inv_eigenvectors);
+	if (eigen_coeff) aligned_free(eigen_coeff);
+
+	eigenvalues = aligned_alloc<double>(num_states*nmixtures);
+	eigenvectors = aligned_alloc<double>(num_states*num_states*nmixtures);
+	inv_eigenvectors = aligned_alloc<double>(num_states*num_states*nmixtures);
+	int ncoeff = num_states*num_states*num_states;
+	eigen_coeff = aligned_alloc<double>(ncoeff*nmixtures);
+
+	// give every component its slice: copy the current content, free the component's
+	// own buffer and redirect its pointer (the destructor resets these pointers to NULL
+	// before deleting the component)
+	m = 0;
+	for (iterator it = begin(); it != end(); it++, m++) {
+		double *evals = &eigenvalues[m*num_states];
+		double *evecs = &eigenvectors[m*num_states*num_states];
+		double *inv_evecs = &inv_eigenvectors[m*num_states*num_states];
+		double *coeff = &eigen_coeff[m*ncoeff];
+		if ((*it)->eigenvalues) {
+			memcpy(evals, (*it)->eigenvalues, num_states*sizeof(double));
+			aligned_free((*it)->eigenvalues);
+		}
+		if ((*it)->eigenvectors) {
+			memcpy(evecs, (*it)->eigenvectors, num_states*num_states*sizeof(double));
+			aligned_free((*it)->eigenvectors);
+		}
+		if ((*it)->inv_eigenvectors) {
+			memcpy(inv_evecs, (*it)->inv_eigenvectors, num_states*num_states*sizeof(double));
+			aligned_free((*it)->inv_eigenvectors);
+		}
+		if ((*it)->eigen_coeff) {
+			memcpy(coeff, (*it)->eigen_coeff, ncoeff*sizeof(double));
+			aligned_free((*it)->eigen_coeff);
+		}
+		(*it)->eigenvalues = evals;
+		(*it)->eigenvectors = evecs;
+		(*it)->inv_eigenvectors = inv_evecs;
+		(*it)->eigen_coeff = coeff;
+	}
+	decomposeRateMatrix();
+}
+
+/**
+ * Release the weight array and delete all components, last to first.
+ * The eigen pointers of each component are reset to NULL before it is deleted,
+ * presumably so that the component does not free the slices of the shared
+ * buffers it was pointed at by the constructor.
+ */
+ModelMixture::~ModelMixture() {
+	if (prop != NULL) {
+		aligned_free(prop);
+	}
+	reverse_iterator comp = rbegin();
+	while (comp != rend()) {
+		ModelGTR *sub = *comp;
+		sub->eigen_coeff = NULL;
+		sub->eigenvalues = NULL;
+		sub->eigenvectors = NULL;
+		sub->inv_eigenvectors = NULL;
+		delete sub;
+		++comp;
+	}
+}
+
+/**
+ * @return number of free parameters: the sum over all components, plus size()-1
+ *         for the weights unless they are fixed or only sub-models are being optimized
+ */
+int ModelMixture::getNDim() {
+	const bool weights_free = !(optimizing_submodels || fix_prop);
+	int total = 0;
+	if (weights_free)
+		total = size()-1;
+	for (size_t k = 0; k < size(); k++)
+		total += at(k)->getNDim();
+	return total;
+}
+
+/**
+ * Objective function for the numerical optimizer.
+ * Unpacks x into the components, refreshes the rate-matrix decomposition of every
+ * component that has free parameters and evaluates the tree likelihood.
+ * @param x the input vector x
+ * @return the negated value of phylo_tree->computeLikelihood()
+ */
+double ModelMixture::targetFunk(double x[]) {
+	getVariables(x);
+	int free_params = 0;
+	for (size_t k = 0; k < size(); k++) {
+		ModelGTR *sub = at(k);
+		if (sub->getNDim() > 0)
+			sub->decomposeRateMatrix();
+		free_params += sub->getNDim();
+	}
+	assert(phylo_tree);
+	// partial likelihoods only become stale when at least one rate matrix may have changed
+	if (free_params > 0)
+		phylo_tree->clearAllPartialLH();
+	return -phylo_tree->computeLikelihood();
+}
+
+/**
+ * Estimate the mixture weights (prop) with the EM algorithm described in
+ * Wang, Li, Susko, and Roger (2008), keeping all other parameters fixed.
+ * At most getNMixtures() EM rounds are run; iteration stops early once no weight
+ * changes by 1e-4 or more.
+ * @return the value of phylo_tree->computeLikelihood() after the update
+ */
+double ModelMixture::optimizeWeights() {
+    // fill phylo_tree->_pattern_lh_cat with the per-pattern, per-component likelihoods
+    if (phylo_tree->getModelFactory()->fused_mix_rate)
+        phylo_tree->computeMixrateLikelihoodBranchEigen((PhyloNeighbor*)phylo_tree->root->neighbors[0], (PhyloNode*)phylo_tree->root);
+    else
+        phylo_tree->computeMixtureLikelihoodBranchEigen((PhyloNeighbor*)phylo_tree->root->neighbors[0], (PhyloNode*)phylo_tree->root);
+
+    const size_t npattern = phylo_tree->aln->getNPattern();
+    const size_t ncomp = getNMixtures();
+    size_t p, k;
+
+    double *pattern_lk = aligned_alloc<double>(npattern);
+    double *next_prop = aligned_alloc<double>(ncomp);
+
+    for (int round = 0; round < ncomp; round++) {
+        // E-step: pattern likelihood L_i under the current weights
+        memset(pattern_lk, 0, npattern*sizeof(double));
+        if (round != 0) {
+            // _pattern_lh_cat already holds the unweighted L_ci
+            for (p = 0; p < npattern; p++) {
+                double *lk_cat = phylo_tree->_pattern_lh_cat + p*ncomp;
+                for (k = 0; k < ncomp; k++)
+                    pattern_lk[p] += lk_cat[k] * prop[k];
+            }
+        } else {
+            // first round: _pattern_lh_cat still includes the weights, so sum it up
+            // and then divide the weights out to obtain L_ci
+            for (k = 0; k < ncomp; k++)
+                next_prop[k] = 1.0 / prop[k];
+            for (p = 0; p < npattern; p++) {
+                double *lk_cat = phylo_tree->_pattern_lh_cat + p*ncomp;
+                for (k = 0; k < ncomp; k++) {
+                    pattern_lk[p] += lk_cat[k];
+                    lk_cat[k] *= next_prop[k];
+                }
+            }
+        }
+
+        // M-step: accumulate ptn_freq * L_ci / L_i for every component
+        memset(next_prop, 0, ncomp*sizeof(double));
+        for (p = 0; p < npattern; p++) {
+            double freq_over_lk = phylo_tree->ptn_freq[p] / pattern_lk[p];
+            double *lk_cat = phylo_tree->_pattern_lh_cat + p*ncomp;
+            for (k = 0; k < ncomp; k++)
+                next_prop[k] += lk_cat[k] * freq_over_lk;
+        }
+
+        bool all_stable = true;
+        for (k = 0; k < ncomp; k++) {
+            next_prop[k] = prop[k] * (next_prop[k] / phylo_tree->getAlnNSite());
+            if (!(fabs(prop[k]-next_prop[k]) < 1e-4))
+                all_stable = false;
+            prop[k] = next_prop[k];
+        }
+        if (all_stable)
+            break;
+    }
+
+    aligned_free(next_prop);
+    aligned_free(pattern_lk);
+    return phylo_tree->computeLikelihood();
+}
+
+/**
+ * Optimize the mixture in two phases: first the component parameters through
+ * ModelGTR::optimizeParameters (weights excluded via optimizing_submodels), then,
+ * unless the weights are fixed, the weights through optimizeWeights().
+ * Afterwards total_num_subst of the components is rescaled so that the weighted
+ * sum is 1 and the rate matrices are decomposed again.
+ * @param gradient_epsilon forwarded to ModelGTR::optimizeParameters
+ * @return the likelihood reported by the last optimization phase that ran
+ */
+double ModelMixture::optimizeParameters(double gradient_epsilon) {
+	optimizing_submodels = true;
+	double score = ModelGTR::optimizeParameters(gradient_epsilon);
+	optimizing_submodels = false;
+	if (!fix_prop)
+		score = optimizeWeights();
+	if (getNDim() != 0) {
+		// now rescale Q matrices to have proper interpretation of branch lengths
+		const int ncomp = size();
+		double mean_rate = 0.0;
+		int k;
+		for (k = 0; k < ncomp; k++)
+			mean_rate += prop[k]*at(k)->total_num_subst;
+		for (k = 0; k < ncomp; k++)
+			at(k)->total_num_subst /= mean_rate;
+		decomposeRateMatrix();
+	}
+	return score;
+}
+
+/**
+ * Decompose the rate matrix of every component, in component order.
+ */
+void ModelMixture::decomposeRateMatrix() {
+	for (size_t k = 0; k < size(); k++) {
+		at(k)->decomposeRateMatrix();
+	}
+}
+
+/**
+ * Pack the parameters of all components into the optimizer vector.
+ * Each component writes its own consecutive segment; the segment of a component
+ * starts getNDim() entries after that of the previous one.
+ * Mixture weights are not packed here (optimizeParameters() estimates them
+ * separately through optimizeWeights()).
+ * @param variables (OUT) vector of variables
+ */
+void ModelMixture::setVariables(double *variables) {
+	int offset = 0;
+	for (size_t k = 0; k < size(); k++) {
+		ModelGTR *sub = at(k);
+		sub->setVariables(variables + offset);
+		offset += sub->getNDim();
+	}
+}
+
+/**
+ * Unpack the optimizer vector into the parameters of all components.
+ * Each component reads its own consecutive segment; the segment of a component
+ * starts getNDim() entries after that of the previous one.
+ * Mixture weights are not read here (optimizeParameters() estimates them
+ * separately through optimizeWeights()).
+ * @param variables vector of variables
+ */
+void ModelMixture::getVariables(double *variables) {
+	int offset = 0;
+	for (size_t k = 0; k < size(); k++) {
+		ModelGTR *sub = at(k);
+		sub->getVariables(variables + offset);
+		offset += sub->getNDim();
+	}
+}
+
+/**
+ * Set the optimizer bounds by letting every component fill in the segment of the
+ * three arrays that belongs to its own parameters. No bounds are set for the
+ * mixture weights.
+ * @param lower_bound (OUT) lower bounds
+ * @param upper_bound (OUT) upper bounds
+ * @param bound_check (OUT) bound-check flags
+ */
+void ModelMixture::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
+	int offset = 0;
+	for (size_t k = 0; k < size(); k++) {
+		ModelGTR *sub = at(k);
+		sub->setBounds(lower_bound + offset, upper_bound + offset, bound_check + offset);
+		offset += sub->getNDim();
+	}
+}
+
+/**
+ * Write the information of every component followed by one line with the
+ * mixture weights.
+ * The weights line is written to the same stream as the component information
+ * (it previously always went to cout, regardless of the stream passed in).
+ * @param out output stream
+ */
+void ModelMixture::writeInfo(ostream &out) {
+	const size_t ncomp = size();
+	size_t i;
+	for (i = 0; i < ncomp; i++) {
+		at(i)->writeInfo(out);
+	}
+	out << "Mixture weights:";
+	for (i = 0; i < ncomp; i++)
+		out << " " << prop[i];
+	out << endl;
+}
+
+/**
+ * Write the parameters of every component, in component order.
+ * @param out output stream
+ */
+void ModelMixture::writeParameters(ostream &out) {
+	for (size_t k = 0; k < size(); k++)
+		at(k)->writeParameters(out);
+}
+
+/**
+ * @return full_name, which the constructor assembles as "MIX" followed by the
+ *         comma-separated component names enclosed in OPEN_BRACKET/CLOSE_BRACKET
+ */
+string ModelMixture::getNameParams() {
+	return full_name;
+}
diff --git a/model/modelmixture.h b/model/modelmixture.h
new file mode 100644
index 0000000..dd53df4
--- /dev/null
+++ b/model/modelmixture.h
@@ -0,0 +1,145 @@
+/*
+ * modelmixture.h
+ *
+ *  Created on: Nov 29, 2014
+ *      Author: minh
+ */
+
+#ifndef MODELMIXTURE_H_
+#define MODELMIXTURE_H_
+
+#include "phylotree.h"
+#include "modelsubst.h"
+#include "modelgtr.h"
+#include "modelsblock.h"
+
+
+const char OPEN_BRACKET = '{'; // opens the component list in a mixture name, as in MIX{...}
+const char CLOSE_BRACKET = '}'; // closes the component list in a mixture name
+
+extern const string builtin_mixmodels_definition; // NOTE(review): defined elsewhere; presumably the text of the built-in mixture model definitions -- confirm
+
+/**
+ * create a substitution model
+ * @param model_str model name
+ * @param models_block model definitions (NOTE(review): presumably used to resolve
+ *        user-defined model names -- confirm against the definition)
+ * @param freq_type state frequency type
+ * @param freq_params frequency parameters
+ * @param tree associated phylo tree
+ * @param count_rates TRUE to assign rates counted from alignment, FALSE to not initialize rates
+ * @return substitution model created
+ */
+ModelSubst *createModel(string model_str, ModelsBlock *models_block, StateFreqType freq_type, string freq_params,
+		PhyloTree *tree, bool count_rates = true);
+
+
+/**
+ * Mixture model: a weighted collection of ModelGTR components.
+ * The object is itself the vector of its components (it derives from
+ * vector<ModelGTR*>) and deletes them in its destructor.
+ */
+class ModelMixture: public ModelGTR, public vector<ModelGTR*> {
+public:
+	/**
+		constructor
+		@param orig_model_name value stored as the name of this model
+		@param model_name component list used when model_list is empty
+		@param model_list comma-separated list of components, each NAME[:RATE[:WEIGHT]]
+		@param models_block model definitions used to resolve names
+		@param freq state frequency type
+		@param freq_params frequency parameters; for FREQ_MIXTURE a comma-separated
+			list of frequency profiles, each NAME[:RATE[:WEIGHT]]
+		@param tree associated phylogenetic tree
+		@param optimize_weights TRUE to estimate the weights even if they were specified
+		@param count_rates forwarded to ModelGTR and to createModel for every component
+	*/
+    ModelMixture(string orig_model_name, string model_name, string model_list, ModelsBlock *models_block,
+    		StateFreqType freq, string freq_params, PhyloTree *tree, bool optimize_weights, bool count_rates = true);
+
+    virtual ~ModelMixture(); // frees prop and deletes all components
+
+
+	/**
+	 * @return TRUE if this is a mixture model, FALSE otherwise (always TRUE here)
+	 */
+	virtual bool isMixture() { return true; }
+
+
+	/**
+	 * @return the number of mixture model components
+	 */
+	virtual int getNMixtures() {return size(); }
+
+	/**
+		@return the number of dimensions: the sum over all components, plus
+			size()-1 for the weights unless they are fixed or only the
+			sub-models are currently being optimized
+	*/
+	virtual int getNDim();
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x (the negated tree likelihood)
+	*/
+	virtual double targetFunk(double x[]);
+
+    double optimizeWeights(); // EM update of prop; returns the tree likelihood computed afterwards
+
+	/**
+		optimize model parameters: first the components, then (unless fixed)
+		the weights via optimizeWeights()
+		@param gradient_epsilon forwarded to ModelGTR::optimizeParameters
+		@return the best likelihood
+	*/
+	virtual double optimizeParameters(double gradient_epsilon);
+
+	/**
+		decompose the rate matrix of every component into eigenvalues and eigenvectors
+	*/
+	virtual void decomposeRateMatrix();
+
+	/**
+	 * setup the bounds for joint optimization with BFGS
+	 * (delegated to the components; no bounds are set for the weights)
+	 */
+	virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check);
+
+	/**
+		write information of all components and the mixture weights
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		write parameters of all components, used with modeltest
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out);
+
+	/**
+	 * @return model name with parameters; for a mixture this is full_name,
+	 *         i.e. MIX{...} with the comma-separated component names
+	 */
+	virtual string getNameParams();
+
+	/**
+		rates of mixture components (currently disabled)
+	*/
+//	double *mix_rates;
+
+	/**
+	 * weight of each sub-model (must sum to 1); one entry per component
+	 */
+	double *prop;
+
+	/**
+	 * TRUE to fix model weights
+	 */
+	bool fix_prop;
+
+protected:
+
+	bool optimizing_submodels; // TRUE while optimizeParameters() optimizes the components only; getNDim() then excludes the weights
+
+	/**
+		this function is served for the multi-dimension optimization. It should pack the model parameters
+		into a vector that is index from 1 (NOTE: not from 0).
+		Only the parameters of the components are packed; the weights are not.
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function is served for the multi-dimension optimization. It should assign the model parameters
+		from a vector of variables that is index from 1 (NOTE: not from 0).
+		Only the parameters of the components are read; the weights are not.
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+};
+
+#endif /* MODELMIXTURE_H_ */
diff --git a/model/modelmorphology.cpp b/model/modelmorphology.cpp
new file mode 100644
index 0000000..77a422e
--- /dev/null
+++ b/model/modelmorphology.cpp
@@ -0,0 +1,39 @@
+/*
+ * modelmorphology.cpp
+ *
+ *  Created on: Apr 15, 2014
+ *      Author: minh
+ */
+
+#include "modelmorphology.h"
+
+/**
+ * Construct a morphological model. The base class is created without counting
+ * rates from the alignment; everything else is delegated to init().
+ */
+ModelMorphology::ModelMorphology(const char *model_name, string model_params, StateFreqType freq, string freq_params, PhyloTree *tree)
+	: ModelGTR(tree, false)
+{
+	this->init(model_name, model_params, freq, freq_params);
+}
+
+/**
+ * Initialize the model named model_name ("MK" or "ORDERED"; anything else is
+ * reported through outError). The frequency type passed in is replaced by
+ * FREQ_EQUAL; model_params and freq_params are not used.
+ */
+void ModelMorphology::init(const char *model_name, string model_params, StateFreqType freq, string freq_params)
+{
+	name = model_name;
+	full_name = model_name;
+	freq = FREQ_EQUAL;
+	if (name == "ORDERED") {
+		// walk the rate entries pair by pair (from < to): only neighbouring
+		// states (from, from+1) keep rate 1, all other pairs get rate 0
+		int idx = 0;
+		for (int from = 0; from < num_states-1; from++) {
+			rates[idx++] = 1.0;
+			for (int to = from+2; to < num_states; to++)
+				rates[idx++] = 0.0;
+		}
+	} else if (name != "MK") {
+		outError("Unknown morphological model name");
+	}
+	// for "MK" the rates are left as initialized by the base class
+	ModelGTR::init(freq);
+}
+
+/** Nothing to release beyond what the base class handles. */
+ModelMorphology::~ModelMorphology()
+{
+}
+
diff --git a/model/modelmorphology.h b/model/modelmorphology.h
new file mode 100644
index 0000000..427c5c5
--- /dev/null
+++ b/model/modelmorphology.h
@@ -0,0 +1,45 @@
+/*
+ * modelmorphology.h
+ *
+ *  Created on: Apr 15, 2014
+ *      Author: minh
+ */
+
+#ifndef MODELMORPHOLOGY_H_
+#define MODELMORPHOLOGY_H_
+
+#include "modelgtr.h"
+
+/**
+ * This class implements ML models for morphological data. Such models are:
+ * - Mk (Lewis 2001) a JC-type model
+ * - ORDERED: allowing only transition from state i to i-1 and i+1
+ * TODO: Mkv to account for absence of constant sites
+ */
+class ModelMorphology: public ModelGTR {
+public:
+	/**
+		constructor
+		@param model_name model name, "MK" or "ORDERED"
+		@param model_params model parameters (passed on to init)
+		@param freq state frequency type (passed on to init)
+		@param freq_params frequency parameters (passed on to init)
+		@param tree associated phylogenetic tree
+	*/
+	ModelMorphology(const char *model_name, string model_params, StateFreqType freq, string freq_params, PhyloTree *tree);
+
+	/**
+		initialization, called automatically by the constructor, no need to call it
+		@param model_name model name, "MK" or "ORDERED"
+		@param model_params not used by this implementation
+		@param freq state frequency type; this implementation replaces it by FREQ_EQUAL
+		@param freq_params not used by this implementation
+	*/
+	virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+
+
+	/**
+		return the number of dimensions (always 0: no free parameters)
+	*/
+	virtual int getNDim() { return 0; }
+
+    virtual ~ModelMorphology();
+};
+
+#endif /* MODELMORPHOLOGY_H_ */
diff --git a/model/modelnonrev.cpp b/model/modelnonrev.cpp
new file mode 100644
index 0000000..315c43b
--- /dev/null
+++ b/model/modelnonrev.cpp
@@ -0,0 +1,258 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "modelnonrev.h"
+//#include "whtest/eigen.h"
+
+/**
+ * Construct the unrestricted (non-reversible) model.
+ * The rate array created by the base class is replaced by one with
+ * getNumRateEntries() entries, filled either with rates counted from the
+ * alignment (count_rates) or with 1.0.
+ */
+ModelNonRev::ModelNonRev(PhyloTree *tree, bool count_rates)
+        : ModelGTR(tree, false)
+{
+    num_params = getNumRateEntries() - 1;
+
+    // swap the base-class rate array for one of the non-reversible size
+    delete [] rates;
+    rates = new double [num_params+1];
+    memset(rates, 0, sizeof(double) * (num_params+1));
+    if (!count_rates) {
+        for (int r = 0; r <= num_params; r++)
+            rates[r] = 1.0;
+    } else {
+        phylo_tree->aln->computeEmpiricalRateNonRev(rates);
+    }
+
+    name = "UNREST";
+    full_name = "Unrestricted model (non-reversible)";
+
+    rate_matrix = new double[num_states*num_states];
+    temp_space =  new double[num_states*num_states];
+}
+
+/**
+ * Release the base-class memory first, then the two matrices owned by this class.
+ */
+void ModelNonRev::freeMem()
+{
+    ModelGTR::freeMem();
+    delete [] temp_space;
+    delete [] rate_matrix;
+}
+
+/**
+ * Copy the current rate matrix (num_states x num_states entries) into rate_mat.
+ * @param rate_mat (OUT) destination with room for num_states*num_states doubles
+ */
+void ModelNonRev::getQMatrix(double *rate_mat)
+{
+    const size_t nbytes = num_states*num_states*sizeof(double);
+    memmove(rate_mat, rate_matrix, nbytes);
+}
+
+/* BQM: Ziheng Yang code which fixed old matinv function */
+/**
+ * Invert a matrix in place by Gauss-Jordan elimination with row pivoting.
+ *
+ * x holds n rows of m columns each (row-major, m >= n). On return the leading
+ * n*n part holds the inverse of the original leading n*n part; columns n..m-1
+ * go through the same row operations, so for an augmented system they end up
+ * holding the solution (this is how QtoPi uses the routine).
+ *
+ * space is scratch memory: it is first used as an int array of n pivot rows and
+ * finally space[0] receives the determinant (its sign may be wrong).
+ *
+ * The process is terminated with exit(-1) when a pivot is smaller than 1e-100.
+ *
+ * @param x (IN/OUT) matrix of n*m entries
+ * @param n number of rows, and order of the part that is inverted
+ * @param m number of columns per row (row stride)
+ * @param space scratch memory as described above
+ * @return always 0
+ */
+int matinv (double x[], int n, int m, double space[])
+{
+    int i,j,k;
+    int *irow=(int*) space;
+    double ee=1e-100, t,t1,xmax, det=1;
+
+    for (i=0; i<n; i++) irow[i]=i;
+
+    for (i=0; i<n; i++)  {
+        /* choose the row with the largest entry in column i as pivot row */
+        xmax = fabs(x[i*m+i]);
+        for (j=i+1; j<n; j++)
+            if (xmax<fabs(x[j*m+i]))
+            {
+                xmax = fabs(x[j*m+i]);
+                irow[i]=j;
+            }
+        det *= x[irow[i]*m+i];
+        if (xmax < ee)   {
+            cout << endl << "xmax = " << xmax << " close to zero at " << i+1 << "!\t" << endl;
+            exit(-1);
+        }
+        if (irow[i] != i) {
+            for (j=0; j < m; j++) {
+                t = x[i*m+j];
+                x[i*m+j] = x[irow[i]*m+j];
+                x[irow[i]*m+j] = t;
+            }
+        }
+        t = 1./x[i*m+i];
+        for (j=0; j < n; j++) {
+            if (j == i) continue;
+            t1 = t*x[j*m+i];
+            /* a row has exactly m columns: stop at m-1, index m belongs to the next row */
+            for (k=0; k<m; k++)  x[j*m+k] -= t1*x[i*m+k];
+            x[j*m+i] = -t1;
+        }
+        for (j=0; j < m; j++)   x[i*m+j] *= t;
+        x[i*m+i] = t;
+    }                            /* for(i) */
+    /* undo the row exchanges by exchanging the matching columns of the inverse */
+    for (i=n-1; i>=0; i--) {
+        if (irow[i] == i) continue;
+        for (j=0; j < n; j++)  {
+            t = x[j*m+i];
+            x[j*m+i] = x[j*m + irow[i]];
+            x[j*m + irow[i]] = t;
+        }
+    }
+    space[0]=det;
+    return(0);
+}
+
+/**
+ * Compute the stationary frequencies pi of the rate matrix Q by solving
+ *     Q' * pi = 0,   pi * 1 = 1
+ * as an augmented linear system handed to matinv.
+ *
+ * @param Q rate matrix, n*n entries, row-major
+ * @param pi (OUT) stationary frequencies, n entries; also lent to matinv as scratch
+ * @param n number of states
+ * @param space working memory of size n*(n+1)
+ * @return always 0
+ */
+int QtoPi (double Q[], double pi[], int n, double space[])
+{
+    const int stride = n+1;      /* n coefficients plus the right-hand side */
+    double *aug = space;
+    int row, col;
+
+    /* row 0: the normalisation constraint, all ones including the right-hand side */
+    for (col = 0; col < stride; col++)
+        aug[col] = 1;
+    /* rows 1..n-1: rows of the transposed Q with right-hand side 0 */
+    for (row = 1; row < n; row++) {
+        double *dst = aug + row*stride;
+        for (col = 0; col < n; col++)
+            dst[col] = Q[col*n+row];
+        dst[n] = 0.;
+    }
+    matinv(aug, n, stride, pi);
+    /* the last column now holds the solution */
+    for (row = 0; row < n; row++)
+        pi[row] = aug[row*stride+n];
+    return (0);
+}
+/* End of Ziheng Yang code */
+
+/**
+ * Rebuild rate_matrix from the rate parameters.
+ *
+ * The off-diagonal entries are taken from rates in row order and every diagonal
+ * entry is set so that its row sums to 0. The stationary frequencies are then
+ * derived from the matrix with QtoPi and stored in state_freq, and the matrix is
+ * rescaled so that the expected rate (minus the sum of diagonal entries weighted
+ * by the stationary frequencies) equals total_num_subst.
+ *
+ * No eigen-decomposition is performed here.
+ *
+ * @throws const char* "Empty Q matrix" when the expected rate is exactly 0
+ */
+void ModelNonRev::decomposeRateMatrix() {
+    int i, j, k;
+    double sum;
+
+    for (i = 0; i < num_states; i++)
+        state_freq[i] = 1.0/num_states;
+
+    for (i = 0, k = 0; i < num_states; i++) {
+        rate_matrix[i*num_states+i] = 0.0;
+        double row_sum = 0.0;
+        for (j = 0; j < num_states; j++)
+            if (j != i) {
+                row_sum += (rate_matrix[i*num_states+j] = rates[k++]);
+            }
+        rate_matrix[i*num_states+i] = -row_sum;
+    }
+
+    // the workspace is needed by QtoPi only; release it right away so that the
+    // throw below cannot leak it
+    double *space = new double[num_states*(num_states+1)];
+    QtoPi(rate_matrix, state_freq, num_states, space);
+    delete [] space;
+
+    for (i = 0, sum = 0.0; i < num_states; i++) {
+        sum -= rate_matrix[i*num_states+i] * state_freq[i]; /* exp. rate */
+    }
+
+    if (sum == 0.0) throw "Empty Q matrix";
+
+    double delta = total_num_subst / sum; /* scaling factor to reach total_num_subst */
+
+    for (i = 0; i < num_states; i++) {
+        for (j = 0; j < num_states; j++) {
+            rate_matrix[i*num_states+j] *= delta;
+        }
+    }
+}
+
+
+/**
+ * Print the rate parameters as a 4x4 table with one row per nucleotide and "-"
+ * on the diagonal. Nothing is written unless the model has exactly 4 states.
+ * @param out output stream
+ */
+void ModelNonRev::writeInfo(ostream &out) {
+    if (num_states != 4)
+        return;
+    const char *row_label[] = { "A", "C", "G", "T" };
+    out << "Rate parameters:" << endl;
+    int next_rate = 0;
+    for (int row = 0; row < num_states; row++) {
+        out << row_label[row];
+        for (int col = 0; col < num_states; col++) {
+            if (col == row)
+                out << '\t' << "-";
+            else
+                out << '\t' << rates[next_rate++];
+        }
+        out << endl;
+    }
+}
+
+
+/**
+ * Matrix product c = a*b for row-major matrices.
+ * @param a left factor, n*m entries
+ * @param b right factor, m*k entries
+ * @param c (OUT) product, n*k entries
+ * @param n number of rows of a and c
+ * @param m number of columns of a and rows of b
+ * @param k number of columns of b and c
+ * @return always 0
+ */
+int matby (double a[], double b[], double c[], int n,int m,int k)
+{
+    for (int row = 0; row < n; row++) {
+        for (int col = 0; col < k; col++) {
+            double acc = 0;
+            for (int inner = 0; inner < m; inner++)
+                acc += a[row*m+inner]*b[inner*k+col];
+            c[row*k+col] = acc;
+        }
+    }
+    return (0);
+}
+
+/**
+ * Matrix exponential P(t) = exp(t*Q) by scaling and repeated squaring:
+ *
+ *     P(t) = (I + Qt/m + (Qt/m)^2/2)^m,   m = 2^TimeSquare
+ *
+ * A message is printed when TimeSquare lies outside 2..31; the computation is
+ * carried out regardless.
+ *
+ * @param Q (IN/OUT) rate matrix on entry, transition probability matrix on return; n*n entries
+ * @param t time or branch length
+ * @param n number of states
+ * @param TimeSquare number of squarings; an even value avoids the final copy
+ * @param space working memory of n*n entries
+ * @return always 0
+ */
+int matexp (double Q[], double t, int n, int TimeSquare, double space[])
+{
+    const int nelem = n*n;
+    double *buf[2] = { Q, space };   /* squaring alternates between the two buffers */
+    int e;
+
+    if (TimeSquare<2 || TimeSquare>31) cout << "TimeSquare not good" << endl;
+
+    /* Q <- Qt/m */
+    for (e = 0; e < nelem; e++)
+        Q[e] = ldexp( Q[e]*t, -TimeSquare );
+
+    /* Q <- I + Qt/m + (Qt/m)^2/2 */
+    matby (Q, Q, space, n, n, n);
+    for (e = 0; e < nelem; e++)
+        Q[e] += space[e]/2;
+    for (e = 0; e < n; e++)
+        Q[e*n+e] ++;
+
+    /* square TimeSquare times; cur is the buffer holding the latest result */
+    int cur = 0;
+    for (int round = 0; round < TimeSquare; round++) {
+        matby (buf[cur], buf[cur], buf[1-cur], n, n, n);
+        cur = 1-cur;
+    }
+    if (cur == 1) {
+        for (e = 0; e < nelem; e++)
+            Q[e] = space[e];
+    }
+    return(0);
+}
+
+/** number of squarings handed to matexp when computing transition probabilities */
+const int TimeSquare = 10;
+
+/**
+ * Compute the transition probability matrix for the given time by exponentiating
+ * a copy of rate_matrix with matexp (temp_space serves as working memory).
+ * @param time time between two events
+ * @param trans_matrix (OUT) num_states*num_states transition probabilities
+ */
+void ModelNonRev::computeTransMatrix(double time, double *trans_matrix)
+{
+    const size_t nbytes = num_states*num_states*sizeof(double);
+    memcpy(trans_matrix, rate_matrix, nbytes);
+    matexp(trans_matrix, time, num_states, TimeSquare, temp_space);
+}
+
+/**
+ * Compute the transition probability between two states. The whole transition
+ * matrix is computed in a temporary buffer and a single entry is returned.
+ * @param time time between two events
+ * @param state1 first state (row)
+ * @param state2 second state (column)
+ * @return entry (state1, state2) of the transition probability matrix
+ */
+double ModelNonRev::computeTrans(double time, int state1, int state2)
+{
+    const int nelem = num_states*num_states;
+    double *pmat = new double[nelem];
+    memcpy(pmat, rate_matrix, nelem*sizeof(double));
+    matexp(pmat, time, num_states, TimeSquare, temp_space);
+    const double prob = pmat[state1*num_states+state2];
+    delete [] pmat;
+    return prob;
+}
diff --git a/model/modelnonrev.h b/model/modelnonrev.h
new file mode 100644
index 0000000..2eb0f01
--- /dev/null
+++ b/model/modelnonrev.h
@@ -0,0 +1,92 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MODELNONREV_H
+#define MODELNONREV_H
+
+#include "phylotree.h"
+#include "modelgtr.h"
+
+/**
+The general non-reversible model (named "UNREST" by the constructor).
+Transition probabilities are obtained by exponentiating the rate matrix directly.
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class ModelNonRev : public ModelGTR
+{
+public:
+    ModelNonRev(PhyloTree *tree, bool count_rates = true); // count_rates: TRUE to take initial rates from the alignment, FALSE to set all rates to 1
+
+	/**
+		@return TRUE if model is time-reversible, FALSE otherwise (always FALSE here)
+	*/
+	virtual bool isReversible() { return false; };
+
+	/**
+		@return the number of rate entries, equal to the number of non-diagonal elements
+			of the rate matrix (since model is NOT reversible)
+	*/
+	virtual int getNumRateEntries() { return num_states*(num_states-1); }
+
+	virtual void getQMatrix(double *rate_mat); // copies rate_matrix (num_states*num_states entries) into rate_mat
+	
+	/**
+		rebuild rate_matrix from the rate parameters, derive the stationary
+		frequencies from it and rescale it; despite the inherited name no
+		eigenvalues or eigenvectors are computed by this class
+	*/
+	virtual void decomposeRateMatrix();
+
+	/**
+		write information (the rate table; only written when num_states is 4)
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		compute the transition probability matrix.
+		@param time time between two events
+		@param trans_matrix (OUT) the transition matrix between all pairs of states.
+			Assume trans_matrix has size of num_states * num_states.
+	*/
+	virtual void computeTransMatrix(double time, double *trans_matrix);
+
+	/**
+		compute the transition probability between two states
+		@param time time between two events
+		@param state1 first state
+		@param state2 second state
+		@return the transition probability from state1 to state2
+	*/
+	virtual double computeTrans(double time, int state1, int state2);
+
+protected:
+
+	virtual void freeMem(); // frees the base-class memory, then temp_space and rate_matrix
+
+	/**
+		unrestricted Q matrix. Note that Q is normalized to 1 and has row sums of 0.
+		no state frequencies are involved here since Q is a general matrix.
+		Stored row-major with num_states*num_states entries.
+	*/
+	double *rate_matrix;
+	
+	/**
+		temporary working space (num_states*num_states entries, used by the
+		matrix exponential)
+	*/
+	double *temp_space;
+};
+
+#endif
diff --git a/model/modelpomo.cpp b/model/modelpomo.cpp
new file mode 100644
index 0000000..c9c08e2
--- /dev/null
+++ b/model/modelpomo.cpp
@@ -0,0 +1,18 @@
+/*
+ * modelpomo.cpp
+ *
+ *  Created on: Mar 25, 2014
+ *      Author: minh
+ */
+
+#include "modelpomo.h"
+
+ModelPoMo::ModelPoMo()
+{
+	// Stub constructor: no PoMo-specific initialisation is performed yet.
+}
+
+ModelPoMo::~ModelPoMo()
+{	// Stub destructor: nothing is released here.
+}
+
diff --git a/model/modelpomo.h b/model/modelpomo.h
new file mode 100644
index 0000000..e84baa8
--- /dev/null
+++ b/model/modelpomo.h
@@ -0,0 +1,19 @@
+/*
+ * modelpomo.h
+ *
+ *  Created on: Mar 25, 2014
+ *      Author: minh
+ */
+
+#ifndef MODELPOMO_H_
+#define MODELPOMO_H_
+
+#include "modelgtr.h"
+
+class ModelPoMo: public ModelGTR {	// derives from ModelGTR and declares no members beyond its constructor and destructor
+public:
+	ModelPoMo();	// default constructor, takes no arguments
+	virtual ~ModelPoMo();	// virtual destructor
+};
+
+#endif /* MODELPOMO_H_ */
diff --git a/model/modelprotein.cpp b/model/modelprotein.cpp
new file mode 100644
index 0000000..762fe23
--- /dev/null
+++ b/model/modelprotein.cpp
@@ -0,0 +1,3242 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "modelprotein.h"
+#include <string>
+
+/*
+	The following are definitions of various protein models, each encoded in a string.
+	Each string contains the lower triangle of the rate matrix, followed by the state frequencies at the end.
+	The values should follow this amino acid order:
+	A   R   N   D   C   Q   E   G   H   I   L   K   M   F   P   S   T   W   Y   V
+	Ala Arg Asn Asp Cys Gln Glu Gly His Ile Leu Lys Met Phe Pro Ser Thr Trp Tyr Val
+*/
+
+/* this function is taken from RAxML source code */
+static bool initProtMat(double f[20], double daa[400], string prot_model)
+{ 
+  double max, temp;
+  int i, j;
+  double scaler;
+
+  if (prot_model == "POISSON") {
+	  for (i = 0; i < 400; i++) daa[i] = 1.0;
+	  for (i = 0; i < 20; i++) f[i] = 0.05;
+  } else if (prot_model == "DAYHOFF")
+	  {	
+	    daa[ 1*20+ 0] =   27.00; daa[ 2*20+ 0] =   98.00; daa[ 2*20+ 1] =   32.00; daa[ 3*20+ 0] =  120.00;
+	    daa[ 3*20+ 1] =    0.00; daa[ 3*20+ 2] =  905.00; daa[ 4*20+ 0] =   36.00; daa[ 4*20+ 1] =   23.00;
+	    daa[ 4*20+ 2] =    0.00; daa[ 4*20+ 3] =    0.00; daa[ 5*20+ 0] =   89.00; daa[ 5*20+ 1] =  246.00;
+	    daa[ 5*20+ 2] =  103.00; daa[ 5*20+ 3] =  134.00; daa[ 5*20+ 4] =    0.00; daa[ 6*20+ 0] =  198.00;
+	    daa[ 6*20+ 1] =    1.00; daa[ 6*20+ 2] =  148.00; daa[ 6*20+ 3] = 1153.00; daa[ 6*20+ 4] =    0.00;
+	    daa[ 6*20+ 5] =  716.00; daa[ 7*20+ 0] =  240.00; daa[ 7*20+ 1] =    9.00; daa[ 7*20+ 2] =  139.00;
+	    daa[ 7*20+ 3] =  125.00; daa[ 7*20+ 4] =   11.00; daa[ 7*20+ 5] =   28.00; daa[ 7*20+ 6] =   81.00;
+	    daa[ 8*20+ 0] =   23.00; daa[ 8*20+ 1] =  240.00; daa[ 8*20+ 2] =  535.00; daa[ 8*20+ 3] =   86.00;
+	    daa[ 8*20+ 4] =   28.00; daa[ 8*20+ 5] =  606.00; daa[ 8*20+ 6] =   43.00; daa[ 8*20+ 7] =   10.00;
+	    daa[ 9*20+ 0] =   65.00; daa[ 9*20+ 1] =   64.00; daa[ 9*20+ 2] =   77.00; daa[ 9*20+ 3] =   24.00;
+	    daa[ 9*20+ 4] =   44.00; daa[ 9*20+ 5] =   18.00; daa[ 9*20+ 6] =   61.00; daa[ 9*20+ 7] =    0.00;
+	    daa[ 9*20+ 8] =    7.00; daa[10*20+ 0] =   41.00; daa[10*20+ 1] =   15.00; daa[10*20+ 2] =   34.00;
+	    daa[10*20+ 3] =    0.00; daa[10*20+ 4] =    0.00; daa[10*20+ 5] =   73.00; daa[10*20+ 6] =   11.00;
+	    daa[10*20+ 7] =    7.00; daa[10*20+ 8] =   44.00; daa[10*20+ 9] =  257.00; daa[11*20+ 0] =   26.00;
+	    daa[11*20+ 1] =  464.00; daa[11*20+ 2] =  318.00; daa[11*20+ 3] =   71.00; daa[11*20+ 4] =    0.00;
+	    daa[11*20+ 5] =  153.00; daa[11*20+ 6] =   83.00; daa[11*20+ 7] =   27.00; daa[11*20+ 8] =   26.00;
+	    daa[11*20+ 9] =   46.00; daa[11*20+10] =   18.00; daa[12*20+ 0] =   72.00; daa[12*20+ 1] =   90.00;
+	    daa[12*20+ 2] =    1.00; daa[12*20+ 3] =    0.00; daa[12*20+ 4] =    0.00; daa[12*20+ 5] =  114.00;
+	    daa[12*20+ 6] =   30.00; daa[12*20+ 7] =   17.00; daa[12*20+ 8] =    0.00; daa[12*20+ 9] =  336.00;
+	    daa[12*20+10] =  527.00; daa[12*20+11] =  243.00; daa[13*20+ 0] =   18.00; daa[13*20+ 1] =   14.00;
+	    daa[13*20+ 2] =   14.00; daa[13*20+ 3] =    0.00; daa[13*20+ 4] =    0.00; daa[13*20+ 5] =    0.00;
+	    daa[13*20+ 6] =    0.00; daa[13*20+ 7] =   15.00; daa[13*20+ 8] =   48.00; daa[13*20+ 9] =  196.00;
+	    daa[13*20+10] =  157.00; daa[13*20+11] =    0.00; daa[13*20+12] =   92.00; daa[14*20+ 0] =  250.00;
+	    daa[14*20+ 1] =  103.00; daa[14*20+ 2] =   42.00; daa[14*20+ 3] =   13.00; daa[14*20+ 4] =   19.00;
+	    daa[14*20+ 5] =  153.00; daa[14*20+ 6] =   51.00; daa[14*20+ 7] =   34.00; daa[14*20+ 8] =   94.00;
+	    daa[14*20+ 9] =   12.00; daa[14*20+10] =   32.00; daa[14*20+11] =   33.00; daa[14*20+12] =   17.00;
+	    daa[14*20+13] =   11.00; daa[15*20+ 0] =  409.00; daa[15*20+ 1] =  154.00; daa[15*20+ 2] =  495.00;
+	    daa[15*20+ 3] =   95.00; daa[15*20+ 4] =  161.00; daa[15*20+ 5] =   56.00; daa[15*20+ 6] =   79.00;
+	    daa[15*20+ 7] =  234.00; daa[15*20+ 8] =   35.00; daa[15*20+ 9] =   24.00; daa[15*20+10] =   17.00;
+	    daa[15*20+11] =   96.00; daa[15*20+12] =   62.00; daa[15*20+13] =   46.00; daa[15*20+14] =  245.00;
+	    daa[16*20+ 0] =  371.00; daa[16*20+ 1] =   26.00; daa[16*20+ 2] =  229.00; daa[16*20+ 3] =   66.00;
+	    daa[16*20+ 4] =   16.00; daa[16*20+ 5] =   53.00; daa[16*20+ 6] =   34.00; daa[16*20+ 7] =   30.00;
+	    daa[16*20+ 8] =   22.00; daa[16*20+ 9] =  192.00; daa[16*20+10] =   33.00; daa[16*20+11] =  136.00;
+	    daa[16*20+12] =  104.00; daa[16*20+13] =   13.00; daa[16*20+14] =   78.00; daa[16*20+15] =  550.00;
+	    daa[17*20+ 0] =    0.00; daa[17*20+ 1] =  201.00; daa[17*20+ 2] =   23.00; daa[17*20+ 3] =    0.00;
+	    daa[17*20+ 4] =    0.00; daa[17*20+ 5] =    0.00; daa[17*20+ 6] =    0.00; daa[17*20+ 7] =    0.00;
+	    daa[17*20+ 8] =   27.00; daa[17*20+ 9] =    0.00; daa[17*20+10] =   46.00; daa[17*20+11] =    0.00;
+	    daa[17*20+12] =    0.00; daa[17*20+13] =   76.00; daa[17*20+14] =    0.00; daa[17*20+15] =   75.00;
+	    daa[17*20+16] =    0.00; daa[18*20+ 0] =   24.00; daa[18*20+ 1] =    8.00; daa[18*20+ 2] =   95.00;
+	    daa[18*20+ 3] =    0.00; daa[18*20+ 4] =   96.00; daa[18*20+ 5] =    0.00; daa[18*20+ 6] =   22.00;
+	    daa[18*20+ 7] =    0.00; daa[18*20+ 8] =  127.00; daa[18*20+ 9] =   37.00; daa[18*20+10] =   28.00;
+	    daa[18*20+11] =   13.00; daa[18*20+12] =    0.00; daa[18*20+13] =  698.00; daa[18*20+14] =    0.00;
+	    daa[18*20+15] =   34.00; daa[18*20+16] =   42.00; daa[18*20+17] =   61.00; daa[19*20+ 0] =  208.00;
+	    daa[19*20+ 1] =   24.00; daa[19*20+ 2] =   15.00; daa[19*20+ 3] =   18.00; daa[19*20+ 4] =   49.00;
+	    daa[19*20+ 5] =   35.00; daa[19*20+ 6] =   37.00; daa[19*20+ 7] =   54.00; daa[19*20+ 8] =   44.00;
+	    daa[19*20+ 9] =  889.00; daa[19*20+10] =  175.00; daa[19*20+11] =   10.00; daa[19*20+12] =  258.00;
+	    daa[19*20+13] =   12.00; daa[19*20+14] =   48.00; daa[19*20+15] =   30.00; daa[19*20+16] =  157.00;
+	    daa[19*20+17] =    0.00; daa[19*20+18] =   28.00;	    	    
+
+/*
+ * ROUNDING ERROR again:
+	    f[ 0] = 0.087000; f[ 1] = 0.041000; f[ 2] = 0.040000; f[ 3] = 0.047000;
+	    f[ 4] = 0.034000; f[ 5] = 0.038000; f[ 6] = 0.050000; f[ 7] = 0.089000;
+	    f[ 8] = 0.034000; f[ 9] = 0.037000; f[10] = 0.085000; f[11] = 0.080000;
+	    f[12] = 0.014000; f[13] = 0.040000; f[14] = 0.051000; f[15] = 0.070000;
+	    f[16] = 0.058000; f[17] = 0.011000; f[18] = 0.030000; f[19] = 0.064000;
+*/
+	    //NOTE: Originally, f[19]=0.064718 but frequencies do not sum up to 1
+	    f[ 0] = 0.087127; f[ 1] = 0.040904; f[ 2] = 0.040432; f[ 3] = 0.046872;
+	    f[ 4] = 0.033474; f[ 5] = 0.038255; f[ 6] = 0.049530; f[ 7] = 0.088612;
+	    f[ 8] = 0.033618; f[ 9] = 0.036886; f[10] = 0.085357; f[11] = 0.080482;
+	    f[12] = 0.014753; f[13] = 0.039772; f[14] = 0.050680; f[15] = 0.069577;
+	    f[16] = 0.058542; f[17] = 0.010494; f[18] = 0.029916; f[19] = 0.064717;
+
+	  }
+	else if (prot_model == "DCMUT") 
+	  {	
+	    daa[ 1*20+ 0] =   26.78280; daa[ 2*20+ 0] =   98.44740; daa[ 2*20+ 1] =   32.70590; daa[ 3*20+ 0] =  119.98050; 
+	    daa[ 3*20+ 1] =    0.00000; daa[ 3*20+ 2] =  893.15150; daa[ 4*20+ 0] =   36.00160; daa[ 4*20+ 1] =   23.23740; 
+	    daa[ 4*20+ 2] =    0.00000; daa[ 4*20+ 3] =    0.00000; daa[ 5*20+ 0] =   88.77530; daa[ 5*20+ 1] =  243.99390; 
+	    daa[ 5*20+ 2] =  102.85090; daa[ 5*20+ 3] =  134.85510; daa[ 5*20+ 4] =    0.00000; daa[ 6*20+ 0] =  196.11670; 
+	    daa[ 6*20+ 1] =    0.00000; daa[ 6*20+ 2] =  149.34090; daa[ 6*20+ 3] = 1138.86590; daa[ 6*20+ 4] =    0.00000; 
+	    daa[ 6*20+ 5] =  708.60220; daa[ 7*20+ 0] =  238.61110; daa[ 7*20+ 1] =    8.77910; daa[ 7*20+ 2] =  138.53520; 
+	    daa[ 7*20+ 3] =  124.09810; daa[ 7*20+ 4] =   10.72780; daa[ 7*20+ 5] =   28.15810; daa[ 7*20+ 6] =   81.19070; 
+	    daa[ 8*20+ 0] =   22.81160; daa[ 8*20+ 1] =  238.31480; daa[ 8*20+ 2] =  529.00240; daa[ 8*20+ 3] =   86.82410; 
+	    daa[ 8*20+ 4] =   28.27290; daa[ 8*20+ 5] =  601.16130; daa[ 8*20+ 6] =   43.94690; daa[ 8*20+ 7] =   10.68020; 
+	    daa[ 9*20+ 0] =   65.34160; daa[ 9*20+ 1] =   63.26290; daa[ 9*20+ 2] =   76.80240; daa[ 9*20+ 3] =   23.92480; 
+	    daa[ 9*20+ 4] =   43.80740; daa[ 9*20+ 5] =   18.03930; daa[ 9*20+ 6] =   60.95260; daa[ 9*20+ 7] =    0.00000; 
+	    daa[ 9*20+ 8] =    7.69810; daa[10*20+ 0] =   40.64310; daa[10*20+ 1] =   15.49240; daa[10*20+ 2] =   34.11130; 
+	    daa[10*20+ 3] =    0.00000; daa[10*20+ 4] =    0.00000; daa[10*20+ 5] =   73.07720; daa[10*20+ 6] =   11.28800; 
+	    daa[10*20+ 7] =    7.15140; daa[10*20+ 8] =   44.35040; daa[10*20+ 9] =  255.66850; daa[11*20+ 0] =   25.86350; 
+	    daa[11*20+ 1] =  461.01240; daa[11*20+ 2] =  314.83710; daa[11*20+ 3] =   71.69130; daa[11*20+ 4] =    0.00000; 
+	    daa[11*20+ 5] =  151.90780; daa[11*20+ 6] =   83.00780; daa[11*20+ 7] =   26.76830; daa[11*20+ 8] =   27.04750; 
+	    daa[11*20+ 9] =   46.08570; daa[11*20+10] =   18.06290; daa[12*20+ 0] =   71.78400; daa[12*20+ 1] =   89.63210; 
+	    daa[12*20+ 2] =    0.00000; daa[12*20+ 3] =    0.00000; daa[12*20+ 4] =    0.00000; daa[12*20+ 5] =  112.74990; 
+	    daa[12*20+ 6] =   30.48030; daa[12*20+ 7] =   17.03720; daa[12*20+ 8] =    0.00000; daa[12*20+ 9] =  333.27320; 
+	    daa[12*20+10] =  523.01150; daa[12*20+11] =  241.17390; daa[13*20+ 0] =   18.36410; daa[13*20+ 1] =   13.69060; 
+	    daa[13*20+ 2] =   13.85030; daa[13*20+ 3] =    0.00000; daa[13*20+ 4] =    0.00000; daa[13*20+ 5] =    0.00000; 
+	    daa[13*20+ 6] =    0.00000; daa[13*20+ 7] =   15.34780; daa[13*20+ 8] =   47.59270; daa[13*20+ 9] =  195.19510; 
+	    daa[13*20+10] =  156.51600; daa[13*20+11] =    0.00000; daa[13*20+12] =   92.18600; daa[14*20+ 0] =  248.59200; 
+	    daa[14*20+ 1] =  102.83130; daa[14*20+ 2] =   41.92440; daa[14*20+ 3] =   13.39400; daa[14*20+ 4] =   18.75500; 
+	    daa[14*20+ 5] =  152.61880; daa[14*20+ 6] =   50.70030; daa[14*20+ 7] =   34.71530; daa[14*20+ 8] =   93.37090; 
+	    daa[14*20+ 9] =   11.91520; daa[14*20+10] =   31.62580; daa[14*20+11] =   33.54190; daa[14*20+12] =   17.02050; 
+	    daa[14*20+13] =   11.05060; daa[15*20+ 0] =  405.18700; daa[15*20+ 1] =  153.15900; daa[15*20+ 2] =  488.58920; 
+	    daa[15*20+ 3] =   95.60970; daa[15*20+ 4] =  159.83560; daa[15*20+ 5] =   56.18280; daa[15*20+ 6] =   79.39990; 
+	    daa[15*20+ 7] =  232.22430; daa[15*20+ 8] =   35.36430; daa[15*20+ 9] =   24.79550; daa[15*20+10] =   17.14320; 
+	    daa[15*20+11] =   95.45570; daa[15*20+12] =   61.99510; daa[15*20+13] =   45.99010; daa[15*20+14] =  242.72020; 
+	    daa[16*20+ 0] =  368.03650; daa[16*20+ 1] =   26.57450; daa[16*20+ 2] =  227.16970; daa[16*20+ 3] =   66.09300; 
+	    daa[16*20+ 4] =   16.23660; daa[16*20+ 5] =   52.56510; daa[16*20+ 6] =   34.01560; daa[16*20+ 7] =   30.66620; 
+	    daa[16*20+ 8] =   22.63330; daa[16*20+ 9] =  190.07390; daa[16*20+10] =   33.10900; daa[16*20+11] =  135.05990; 
+	    daa[16*20+12] =  103.15340; daa[16*20+13] =   13.66550; daa[16*20+14] =   78.28570; daa[16*20+15] =  543.66740; 
+	    daa[17*20+ 0] =    0.00000; daa[17*20+ 1] =  200.13750; daa[17*20+ 2] =   22.49680; daa[17*20+ 3] =    0.00000; 
+	    daa[17*20+ 4] =    0.00000; daa[17*20+ 5] =    0.00000; daa[17*20+ 6] =    0.00000; daa[17*20+ 7] =    0.00000; 
+	    daa[17*20+ 8] =   27.05640; daa[17*20+ 9] =    0.00000; daa[17*20+10] =   46.17760; daa[17*20+11] =    0.00000; 
+	    daa[17*20+12] =    0.00000; daa[17*20+13] =   76.23540; daa[17*20+14] =    0.00000; daa[17*20+15] =   74.08190; 
+	    daa[17*20+16] =    0.00000; daa[18*20+ 0] =   24.41390; daa[18*20+ 1] =    7.80120; daa[18*20+ 2] =   94.69400; 
+	    daa[18*20+ 3] =    0.00000; daa[18*20+ 4] =   95.31640; daa[18*20+ 5] =    0.00000; daa[18*20+ 6] =   21.47170; 
+	    daa[18*20+ 7] =    0.00000; daa[18*20+ 8] =  126.54000; daa[18*20+ 9] =   37.48340; daa[18*20+10] =   28.65720; 
+	    daa[18*20+11] =   13.21420; daa[18*20+12] =    0.00000; daa[18*20+13] =  695.26290; daa[18*20+14] =    0.00000; 
+	    daa[18*20+15] =   33.62890; daa[18*20+16] =   41.78390; daa[18*20+17] =   60.80700; daa[19*20+ 0] =  205.95640; 
+	    daa[19*20+ 1] =   24.03680; daa[19*20+ 2] =   15.80670; daa[19*20+ 3] =   17.83160; daa[19*20+ 4] =   48.46780; 
+	    daa[19*20+ 5] =   34.69830; daa[19*20+ 6] =   36.72500; daa[19*20+ 7] =   53.81650; daa[19*20+ 8] =   43.87150; 
+	    daa[19*20+ 9] =  881.00380; daa[19*20+10] =  174.51560; daa[19*20+11] =   10.38500; daa[19*20+12] =  256.59550; 
+	    daa[19*20+13] =   12.36060; daa[19*20+14] =   48.50260; daa[19*20+15] =   30.38360; daa[19*20+16] =  156.19970; 
+	    daa[19*20+17] =    0.00000; daa[19*20+18] =   27.93790;   	    	   
+
+	    /* ROUNDING ERROR:
+	    f[ 0] = 0.08700; f[ 1] = 0.04100; f[ 2] = 0.04000; f[ 3] = 0.04700;
+	    f[ 4] = 0.03300; f[ 5] = 0.03800; f[ 6] = 0.04900; f[ 7] = 0.08900;
+	    f[ 8] = 0.03400; f[ 9] = 0.03700; f[10] = 0.08500; f[11] = 0.08000;
+	    f[12] = 0.01500; f[13] = 0.04000; f[14] = 0.05200; f[15] = 0.06900;
+	    f[16] = 0.05900; f[17] = 0.01000; f[18] = 0.03000; f[19] = 0.06500;
+*/
+	    // NOTE: originally f[19]=0.064718 but frequencies do not sum up to 1
+	    f[ 0] = 0.087127; f[ 1] = 0.040904; f[ 2] = 0.040432; f[ 3] = 0.046872;
+	    f[ 4] = 0.033474; f[ 5] = 0.038255; f[ 6] = 0.049530; f[ 7] = 0.088612;
+	    f[ 8] = 0.033619; f[ 9] = 0.036886; f[10] = 0.085357; f[11] = 0.080481;
+	    f[12] = 0.014753; f[13] = 0.039772; f[14] = 0.050680; f[15] = 0.069577;
+	    f[16] = 0.058542; f[17] = 0.010494; f[18] = 0.029916; f[19] = 0.064717;
+
+	  }
+	else if (prot_model == "JTT") 
+	  {
+	    daa[ 1*20+ 0] =   58.00; daa[ 2*20+ 0] =   54.00; daa[ 2*20+ 1] =   45.00; daa[ 3*20+ 0] =   81.00;
+	    daa[ 3*20+ 1] =   16.00; daa[ 3*20+ 2] =  528.00; daa[ 4*20+ 0] =   56.00; daa[ 4*20+ 1] =  113.00;
+	    daa[ 4*20+ 2] =   34.00; daa[ 4*20+ 3] =   10.00; daa[ 5*20+ 0] =   57.00; daa[ 5*20+ 1] =  310.00;
+	    daa[ 5*20+ 2] =   86.00; daa[ 5*20+ 3] =   49.00; daa[ 5*20+ 4] =    9.00; daa[ 6*20+ 0] =  105.00;
+	    daa[ 6*20+ 1] =   29.00; daa[ 6*20+ 2] =   58.00; daa[ 6*20+ 3] =  767.00; daa[ 6*20+ 4] =    5.00;
+	    daa[ 6*20+ 5] =  323.00; daa[ 7*20+ 0] =  179.00; daa[ 7*20+ 1] =  137.00; daa[ 7*20+ 2] =   81.00;
+	    daa[ 7*20+ 3] =  130.00; daa[ 7*20+ 4] =   59.00; daa[ 7*20+ 5] =   26.00; daa[ 7*20+ 6] =  119.00;
+	    daa[ 8*20+ 0] =   27.00; daa[ 8*20+ 1] =  328.00; daa[ 8*20+ 2] =  391.00; daa[ 8*20+ 3] =  112.00;
+	    daa[ 8*20+ 4] =   69.00; daa[ 8*20+ 5] =  597.00; daa[ 8*20+ 6] =   26.00; daa[ 8*20+ 7] =   23.00;
+	    daa[ 9*20+ 0] =   36.00; daa[ 9*20+ 1] =   22.00; daa[ 9*20+ 2] =   47.00; daa[ 9*20+ 3] =   11.00;
+	    daa[ 9*20+ 4] =   17.00; daa[ 9*20+ 5] =    9.00; daa[ 9*20+ 6] =   12.00; daa[ 9*20+ 7] =    6.00;
+	    daa[ 9*20+ 8] =   16.00; daa[10*20+ 0] =   30.00; daa[10*20+ 1] =   38.00; daa[10*20+ 2] =   12.00;
+	    daa[10*20+ 3] =    7.00; daa[10*20+ 4] =   23.00; daa[10*20+ 5] =   72.00; daa[10*20+ 6] =    9.00;
+	    daa[10*20+ 7] =    6.00; daa[10*20+ 8] =   56.00; daa[10*20+ 9] =  229.00; daa[11*20+ 0] =   35.00;
+	    daa[11*20+ 1] =  646.00; daa[11*20+ 2] =  263.00; daa[11*20+ 3] =   26.00; daa[11*20+ 4] =    7.00;
+	    daa[11*20+ 5] =  292.00; daa[11*20+ 6] =  181.00; daa[11*20+ 7] =   27.00; daa[11*20+ 8] =   45.00;
+	    daa[11*20+ 9] =   21.00; daa[11*20+10] =   14.00; daa[12*20+ 0] =   54.00; daa[12*20+ 1] =   44.00;
+	    daa[12*20+ 2] =   30.00; daa[12*20+ 3] =   15.00; daa[12*20+ 4] =   31.00; daa[12*20+ 5] =   43.00;
+	    daa[12*20+ 6] =   18.00; daa[12*20+ 7] =   14.00; daa[12*20+ 8] =   33.00; daa[12*20+ 9] =  479.00;
+	    daa[12*20+10] =  388.00; daa[12*20+11] =   65.00; daa[13*20+ 0] =   15.00; daa[13*20+ 1] =    5.00;
+	    daa[13*20+ 2] =   10.00; daa[13*20+ 3] =    4.00; daa[13*20+ 4] =   78.00; daa[13*20+ 5] =    4.00;
+	    daa[13*20+ 6] =    5.00; daa[13*20+ 7] =    5.00; daa[13*20+ 8] =   40.00; daa[13*20+ 9] =   89.00;
+	    daa[13*20+10] =  248.00; daa[13*20+11] =    4.00; daa[13*20+12] =   43.00; daa[14*20+ 0] =  194.00;
+	    daa[14*20+ 1] =   74.00; daa[14*20+ 2] =   15.00; daa[14*20+ 3] =   15.00; daa[14*20+ 4] =   14.00;
+	    daa[14*20+ 5] =  164.00; daa[14*20+ 6] =   18.00; daa[14*20+ 7] =   24.00; daa[14*20+ 8] =  115.00;
+	    daa[14*20+ 9] =   10.00; daa[14*20+10] =  102.00; daa[14*20+11] =   21.00; daa[14*20+12] =   16.00;
+	    daa[14*20+13] =   17.00; daa[15*20+ 0] =  378.00; daa[15*20+ 1] =  101.00; daa[15*20+ 2] =  503.00;
+	    daa[15*20+ 3] =   59.00; daa[15*20+ 4] =  223.00; daa[15*20+ 5] =   53.00; daa[15*20+ 6] =   30.00;
+	    daa[15*20+ 7] =  201.00; daa[15*20+ 8] =   73.00; daa[15*20+ 9] =   40.00; daa[15*20+10] =   59.00;
+	    daa[15*20+11] =   47.00; daa[15*20+12] =   29.00; daa[15*20+13] =   92.00; daa[15*20+14] =  285.00;
+	    daa[16*20+ 0] =  475.00; daa[16*20+ 1] =   64.00; daa[16*20+ 2] =  232.00; daa[16*20+ 3] =   38.00;
+	    daa[16*20+ 4] =   42.00; daa[16*20+ 5] =   51.00; daa[16*20+ 6] =   32.00; daa[16*20+ 7] =   33.00;
+	    daa[16*20+ 8] =   46.00; daa[16*20+ 9] =  245.00; daa[16*20+10] =   25.00; daa[16*20+11] =  103.00;
+	    daa[16*20+12] =  226.00; daa[16*20+13] =   12.00; daa[16*20+14] =  118.00; daa[16*20+15] =  477.00;
+	    daa[17*20+ 0] =    9.00; daa[17*20+ 1] =  126.00; daa[17*20+ 2] =    8.00; daa[17*20+ 3] =    4.00;
+	    daa[17*20+ 4] =  115.00; daa[17*20+ 5] =   18.00; daa[17*20+ 6] =   10.00; daa[17*20+ 7] =   55.00;
+	    daa[17*20+ 8] =    8.00; daa[17*20+ 9] =    9.00; daa[17*20+10] =   52.00; daa[17*20+11] =   10.00;
+	    daa[17*20+12] =   24.00; daa[17*20+13] =   53.00; daa[17*20+14] =    6.00; daa[17*20+15] =   35.00;
+	    daa[17*20+16] =   12.00; daa[18*20+ 0] =   11.00; daa[18*20+ 1] =   20.00; daa[18*20+ 2] =   70.00;
+	    daa[18*20+ 3] =   46.00; daa[18*20+ 4] =  209.00; daa[18*20+ 5] =   24.00; daa[18*20+ 6] =    7.00;
+	    daa[18*20+ 7] =    8.00; daa[18*20+ 8] =  573.00; daa[18*20+ 9] =   32.00; daa[18*20+10] =   24.00;
+	    daa[18*20+11] =    8.00; daa[18*20+12] =   18.00; daa[18*20+13] =  536.00; daa[18*20+14] =   10.00;
+	    daa[18*20+15] =   63.00; daa[18*20+16] =   21.00; daa[18*20+17] =   71.00; daa[19*20+ 0] =  298.00;
+	    daa[19*20+ 1] =   17.00; daa[19*20+ 2] =   16.00; daa[19*20+ 3] =   31.00; daa[19*20+ 4] =   62.00;
+	    daa[19*20+ 5] =   20.00; daa[19*20+ 6] =   45.00; daa[19*20+ 7] =   47.00; daa[19*20+ 8] =   11.00;
+	    daa[19*20+ 9] =  961.00; daa[19*20+10] =  180.00; daa[19*20+11] =   14.00; daa[19*20+12] =  323.00;
+	    daa[19*20+13] =   62.00; daa[19*20+14] =   23.00; daa[19*20+15] =   38.00; daa[19*20+16] =  112.00;
+	    daa[19*20+17] =   25.00; daa[19*20+18] =   16.00;
+	    	    
+	    /* ROUNDING ERROR:
+	    f[ 0] = 0.07700; f[ 1] = 0.05200; f[ 2] = 0.04200; f[ 3] = 0.05100;
+	    f[ 4] = 0.02000; f[ 5] = 0.04100; f[ 6] = 0.06200; f[ 7] = 0.07300;
+	    f[ 8] = 0.02300; f[ 9] = 0.05400; f[10] = 0.09200; f[11] = 0.05900;
+	    f[12] = 0.02400; f[13] = 0.04000; f[14] = 0.05100; f[15] = 0.06900;
+	    f[16] = 0.05800; f[17] = 0.01400; f[18] = 0.03200; f[19] = 0.06600;
+	    */
+	    // NOTE: originally, f[19]=0.066005 but frequencies do not sum up to 1
+	    f[ 0] = 0.076748; f[ 1] = 0.051691; f[ 2] = 0.042645; f[ 3] = 0.051544;
+	    f[ 4] = 0.019803; f[ 5] = 0.040752; f[ 6] = 0.061830; f[ 7] = 0.073152;
+	    f[ 8] = 0.022944; f[ 9] = 0.053761; f[10] = 0.091904; f[11] = 0.058676;
+	    f[12] = 0.023826; f[13] = 0.040126; f[14] = 0.050901; f[15] = 0.068765;
+	    f[16] = 0.058565; f[17] = 0.014261; f[18] = 0.032102; f[19] = 0.066004;
+
+	  }
+	else if (prot_model == "MTREV") 
+	  {
+	    daa[ 1*20+ 0] =   23.18; daa[ 2*20+ 0] =   26.95; daa[ 2*20+ 1] =   13.24; daa[ 3*20+ 0] =   17.67;
+	    daa[ 3*20+ 1] =    1.90; daa[ 3*20+ 2] =  794.38; daa[ 4*20+ 0] =   59.93; daa[ 4*20+ 1] =  103.33;
+	    daa[ 4*20+ 2] =   58.94; daa[ 4*20+ 3] =    1.90; daa[ 5*20+ 0] =    1.90; daa[ 5*20+ 1] =  220.99;
+	    daa[ 5*20+ 2] =  173.56; daa[ 5*20+ 3] =   55.28; daa[ 5*20+ 4] =   75.24; daa[ 6*20+ 0] =    9.77;
+	    daa[ 6*20+ 1] =    1.90; daa[ 6*20+ 2] =   63.05; daa[ 6*20+ 3] =  583.55; daa[ 6*20+ 4] =    1.90;
+	    daa[ 6*20+ 5] =  313.56; daa[ 7*20+ 0] =  120.71; daa[ 7*20+ 1] =   23.03; daa[ 7*20+ 2] =   53.30;
+	    daa[ 7*20+ 3] =   56.77; daa[ 7*20+ 4] =   30.71; daa[ 7*20+ 5] =    6.75; daa[ 7*20+ 6] =   28.28;
+	    daa[ 8*20+ 0] =   13.90; daa[ 8*20+ 1] =  165.23; daa[ 8*20+ 2] =  496.13; daa[ 8*20+ 3] =  113.99;
+	    daa[ 8*20+ 4] =  141.49; daa[ 8*20+ 5] =  582.40; daa[ 8*20+ 6] =   49.12; daa[ 8*20+ 7] =    1.90;
+	    daa[ 9*20+ 0] =   96.49; daa[ 9*20+ 1] =    1.90; daa[ 9*20+ 2] =   27.10; daa[ 9*20+ 3] =    4.34;
+	    daa[ 9*20+ 4] =   62.73; daa[ 9*20+ 5] =    8.34; daa[ 9*20+ 6] =    3.31; daa[ 9*20+ 7] =    5.98;
+	    daa[ 9*20+ 8] =   12.26; daa[10*20+ 0] =   25.46; daa[10*20+ 1] =   15.58; daa[10*20+ 2] =   15.16;
+	    daa[10*20+ 3] =    1.90; daa[10*20+ 4] =   25.65; daa[10*20+ 5] =   39.70; daa[10*20+ 6] =    1.90;
+	    daa[10*20+ 7] =    2.41; daa[10*20+ 8] =   11.49; daa[10*20+ 9] =  329.09; daa[11*20+ 0] =    8.36;
+	    daa[11*20+ 1] =  141.40; daa[11*20+ 2] =  608.70; daa[11*20+ 3] =    2.31; daa[11*20+ 4] =    1.90;
+	    daa[11*20+ 5] =  465.58; daa[11*20+ 6] =  313.86; daa[11*20+ 7] =   22.73; daa[11*20+ 8] =  127.67;
+	    daa[11*20+ 9] =   19.57; daa[11*20+10] =   14.88; daa[12*20+ 0] =  141.88; daa[12*20+ 1] =    1.90;
+	    daa[12*20+ 2] =   65.41; daa[12*20+ 3] =    1.90; daa[12*20+ 4] =    6.18; daa[12*20+ 5] =   47.37;
+	    daa[12*20+ 6] =    1.90; daa[12*20+ 7] =    1.90; daa[12*20+ 8] =   11.97; daa[12*20+ 9] =  517.98;
+	    daa[12*20+10] =  537.53; daa[12*20+11] =   91.37; daa[13*20+ 0] =    6.37; daa[13*20+ 1] =    4.69;
+	    daa[13*20+ 2] =   15.20; daa[13*20+ 3] =    4.98; daa[13*20+ 4] =   70.80; daa[13*20+ 5] =   19.11;
+	    daa[13*20+ 6] =    2.67; daa[13*20+ 7] =    1.90; daa[13*20+ 8] =   48.16; daa[13*20+ 9] =   84.67;
+	    daa[13*20+10] =  216.06; daa[13*20+11] =    6.44; daa[13*20+12] =   90.82; daa[14*20+ 0] =   54.31;
+	    daa[14*20+ 1] =   23.64; daa[14*20+ 2] =   73.31; daa[14*20+ 3] =   13.43; daa[14*20+ 4] =   31.26;
+	    daa[14*20+ 5] =  137.29; daa[14*20+ 6] =   12.83; daa[14*20+ 7] =    1.90; daa[14*20+ 8] =   60.97;
+	    daa[14*20+ 9] =   20.63; daa[14*20+10] =   40.10; daa[14*20+11] =   50.10; daa[14*20+12] =   18.84;
+	    daa[14*20+13] =   17.31; daa[15*20+ 0] =  387.86; daa[15*20+ 1] =    6.04; daa[15*20+ 2] =  494.39;
+	    daa[15*20+ 3] =   69.02; daa[15*20+ 4] =  277.05; daa[15*20+ 5] =   54.11; daa[15*20+ 6] =   54.71;
+	    daa[15*20+ 7] =  125.93; daa[15*20+ 8] =   77.46; daa[15*20+ 9] =   47.70; daa[15*20+10] =   73.61;
+	    daa[15*20+11] =  105.79; daa[15*20+12] =  111.16; daa[15*20+13] =   64.29; daa[15*20+14] =  169.90;
+	    daa[16*20+ 0] =  480.72; daa[16*20+ 1] =    2.08; daa[16*20+ 2] =  238.46; daa[16*20+ 3] =   28.01;
+	    daa[16*20+ 4] =  179.97; daa[16*20+ 5] =   94.93; daa[16*20+ 6] =   14.82; daa[16*20+ 7] =   11.17;
+	    daa[16*20+ 8] =   44.78; daa[16*20+ 9] =  368.43; daa[16*20+10] =  126.40; daa[16*20+11] =  136.33;
+	    daa[16*20+12] =  528.17; daa[16*20+13] =   33.85; daa[16*20+14] =  128.22; daa[16*20+15] =  597.21;
+	    daa[17*20+ 0] =    1.90; daa[17*20+ 1] =   21.95; daa[17*20+ 2] =   10.68; daa[17*20+ 3] =   19.86;
+	    daa[17*20+ 4] =   33.60; daa[17*20+ 5] =    1.90; daa[17*20+ 6] =    1.90; daa[17*20+ 7] =   10.92;
+	    daa[17*20+ 8] =    7.08; daa[17*20+ 9] =    1.90; daa[17*20+10] =   32.44; daa[17*20+11] =   24.00;
+	    daa[17*20+12] =   21.71; daa[17*20+13] =    7.84; daa[17*20+14] =    4.21; daa[17*20+15] =   38.58;
+	    daa[17*20+16] =    9.99; daa[18*20+ 0] =    6.48; daa[18*20+ 1] =    1.90; daa[18*20+ 2] =  191.36;
+	    daa[18*20+ 3] =   21.21; daa[18*20+ 4] =  254.77; daa[18*20+ 5] =   38.82; daa[18*20+ 6] =   13.12;
+	    daa[18*20+ 7] =    3.21; daa[18*20+ 8] =  670.14; daa[18*20+ 9] =   25.01; daa[18*20+10] =   44.15;
+	    daa[18*20+11] =   51.17; daa[18*20+12] =   39.96; daa[18*20+13] =  465.58; daa[18*20+14] =   16.21;
+	    daa[18*20+15] =   64.92; daa[18*20+16] =   38.73; daa[18*20+17] =   26.25; daa[19*20+ 0] =  195.06;
+	    daa[19*20+ 1] =    7.64; daa[19*20+ 2] =    1.90; daa[19*20+ 3] =    1.90; daa[19*20+ 4] =    1.90;
+	    daa[19*20+ 5] =   19.00; daa[19*20+ 6] =   21.14; daa[19*20+ 7] =    2.53; daa[19*20+ 8] =    1.90;
+	    daa[19*20+ 9] = 1222.94; daa[19*20+10] =   91.67; daa[19*20+11] =    1.90; daa[19*20+12] =  387.54;
+	    daa[19*20+13] =    6.35; daa[19*20+14] =    8.23; daa[19*20+15] =    1.90; daa[19*20+16] =  204.54;
+	    daa[19*20+17] =    5.37; daa[19*20+18] =    1.90;
+	    
+	    
+	    f[ 0] = 0.072000; f[ 1] = 0.019000; f[ 2] = 0.039000; f[ 3] = 0.019000;
+	    f[ 4] = 0.006000; f[ 5] = 0.025000; f[ 6] = 0.024000; f[ 7] = 0.056000;
+	    f[ 8] = 0.028000; f[ 9] = 0.088000; f[10] = 0.169000; f[11] = 0.023000;
+	    f[12] = 0.054000; f[13] = 0.061000; f[14] = 0.054000; f[15] = 0.072000;
+	    f[16] = 0.086000; f[17] = 0.029000; f[18] = 0.033000; f[19] = 0.043000;
+/*
+	The original matrix from Adachi & Hasegawa (1996) is:
+	    f[ 0] = 0.072000; f[ 1] = 0.019000; f[ 2] = 0.039000; f[ 3] = 0.019000;
+	    f[ 4] = 0.006000; f[ 5] = 0.025000; f[ 6] = 0.024000; f[ 7] = 0.056000;
+	    f[ 8] = 0.028000; f[ 9] = 0.087000; f[10] = 0.168000; f[11] = 0.023000;
+	    f[12] = 0.053000; f[13] = 0.061000; f[14] = 0.055000; f[15] = 0.072000;
+	    f[16] = 0.088000; f[17] = 0.029000; f[18] = 0.033000; f[19] = 0.044000;
+	but they sum up to 1.001
+*/
+	  }
+	else if (prot_model == "WAG") 
+	  {
+	    daa[ 1*20+ 0] =  55.15710; daa[ 2*20+ 0] =  50.98480; daa[ 2*20+ 1] =  63.53460; 
+	    daa[ 3*20+ 0] =  73.89980; daa[ 3*20+ 1] =  14.73040; daa[ 3*20+ 2] = 542.94200; 
+	    daa[ 4*20+ 0] = 102.70400; daa[ 4*20+ 1] =  52.81910; daa[ 4*20+ 2] =  26.52560; 
+	    daa[ 4*20+ 3] =   3.02949; daa[ 5*20+ 0] =  90.85980; daa[ 5*20+ 1] = 303.55000; 
+	    daa[ 5*20+ 2] = 154.36400; daa[ 5*20+ 3] =  61.67830; daa[ 5*20+ 4] =   9.88179; 
+	    daa[ 6*20+ 0] = 158.28500; daa[ 6*20+ 1] =  43.91570; daa[ 6*20+ 2] =  94.71980; 
+	    daa[ 6*20+ 3] = 617.41600; daa[ 6*20+ 4] =   2.13520; daa[ 6*20+ 5] = 546.94700; 
+	    daa[ 7*20+ 0] = 141.67200; daa[ 7*20+ 1] =  58.46650; daa[ 7*20+ 2] = 112.55600; 
+	    daa[ 7*20+ 3] =  86.55840; daa[ 7*20+ 4] =  30.66740; daa[ 7*20+ 5] =  33.00520; 
+	    daa[ 7*20+ 6] =  56.77170; daa[ 8*20+ 0] =  31.69540; daa[ 8*20+ 1] = 213.71500; 
+	    daa[ 8*20+ 2] = 395.62900; daa[ 8*20+ 3] =  93.06760; daa[ 8*20+ 4] =  24.89720; 
+	    daa[ 8*20+ 5] = 429.41100; daa[ 8*20+ 6] =  57.00250; daa[ 8*20+ 7] =  24.94100; 
+	    daa[ 9*20+ 0] =  19.33350; daa[ 9*20+ 1] =  18.69790; daa[ 9*20+ 2] =  55.42360; 
+	    daa[ 9*20+ 3] =   3.94370; daa[ 9*20+ 4] =  17.01350; daa[ 9*20+ 5] =  11.39170; 
+	    daa[ 9*20+ 6] =  12.73950; daa[ 9*20+ 7] =   3.04501; daa[ 9*20+ 8] =  13.81900; 
+	    daa[10*20+ 0] =  39.79150; daa[10*20+ 1] =  49.76710; daa[10*20+ 2] =  13.15280; 
+	    daa[10*20+ 3] =   8.48047; daa[10*20+ 4] =  38.42870; daa[10*20+ 5] =  86.94890; 
+	    daa[10*20+ 6] =  15.42630; daa[10*20+ 7] =   6.13037; daa[10*20+ 8] =  49.94620; 
+	    daa[10*20+ 9] = 317.09700; daa[11*20+ 0] =  90.62650; daa[11*20+ 1] = 535.14200; 
+	    daa[11*20+ 2] = 301.20100; daa[11*20+ 3] =  47.98550; daa[11*20+ 4] =   7.40339; 
+	    daa[11*20+ 5] = 389.49000; daa[11*20+ 6] = 258.44300; daa[11*20+ 7] =  37.35580; 
+	    daa[11*20+ 8] =  89.04320; daa[11*20+ 9] =  32.38320; daa[11*20+10] =  25.75550; 
+	    daa[12*20+ 0] =  89.34960; daa[12*20+ 1] =  68.31620; daa[12*20+ 2] =  19.82210; 
+	    daa[12*20+ 3] =  10.37540; daa[12*20+ 4] =  39.04820; daa[12*20+ 5] = 154.52600; 
+	    daa[12*20+ 6] =  31.51240; daa[12*20+ 7] =  17.41000; daa[12*20+ 8] =  40.41410; 
+	    daa[12*20+ 9] = 425.74600; daa[12*20+10] = 485.40200; daa[12*20+11] =  93.42760; 
+	    daa[13*20+ 0] =  21.04940; daa[13*20+ 1] =  10.27110; daa[13*20+ 2] =   9.61621; 
+	    daa[13*20+ 3] =   4.67304; daa[13*20+ 4] =  39.80200; daa[13*20+ 5] =   9.99208; 
+	    daa[13*20+ 6] =   8.11339; daa[13*20+ 7] =   4.99310; daa[13*20+ 8] =  67.93710; 
+	    daa[13*20+ 9] = 105.94700; daa[13*20+10] = 211.51700; daa[13*20+11] =   8.88360; 
+	    daa[13*20+12] = 119.06300; daa[14*20+ 0] = 143.85500; daa[14*20+ 1] =  67.94890; 
+	    daa[14*20+ 2] =  19.50810; daa[14*20+ 3] =  42.39840; daa[14*20+ 4] =  10.94040; 
+	    daa[14*20+ 5] =  93.33720; daa[14*20+ 6] =  68.23550; daa[14*20+ 7] =  24.35700; 
+	    daa[14*20+ 8] =  69.61980; daa[14*20+ 9] =   9.99288; daa[14*20+10] =  41.58440; 
+	    daa[14*20+11] =  55.68960; daa[14*20+12] =  17.13290; daa[14*20+13] =  16.14440; 
+	    daa[15*20+ 0] = 337.07900; daa[15*20+ 1] = 122.41900; daa[15*20+ 2] = 397.42300; 
+	    daa[15*20+ 3] = 107.17600; daa[15*20+ 4] = 140.76600; daa[15*20+ 5] = 102.88700; 
+	    daa[15*20+ 6] =  70.49390; daa[15*20+ 7] = 134.18200; daa[15*20+ 8] =  74.01690; 
+	    daa[15*20+ 9] =  31.94400; daa[15*20+10] =  34.47390; daa[15*20+11] =  96.71300; 
+	    daa[15*20+12] =  49.39050; daa[15*20+13] =  54.59310; daa[15*20+14] = 161.32800; 
+	    daa[16*20+ 0] = 212.11100; daa[16*20+ 1] =  55.44130; daa[16*20+ 2] = 203.00600; 
+	    daa[16*20+ 3] =  37.48660; daa[16*20+ 4] =  51.29840; daa[16*20+ 5] =  85.79280; 
+	    daa[16*20+ 6] =  82.27650; daa[16*20+ 7] =  22.58330; daa[16*20+ 8] =  47.33070; 
+	    daa[16*20+ 9] = 145.81600; daa[16*20+10] =  32.66220; daa[16*20+11] = 138.69800; 
+	    daa[16*20+12] = 151.61200; daa[16*20+13] =  17.19030; daa[16*20+14] =  79.53840; 
+	    daa[16*20+15] = 437.80200; daa[17*20+ 0] =  11.31330; daa[17*20+ 1] = 116.39200; 
+	    daa[17*20+ 2] =   7.19167; daa[17*20+ 3] =  12.97670; daa[17*20+ 4] =  71.70700; 
+	    daa[17*20+ 5] =  21.57370; daa[17*20+ 6] =  15.65570; daa[17*20+ 7] =  33.69830; 
+	    daa[17*20+ 8] =  26.25690; daa[17*20+ 9] =  21.24830; daa[17*20+10] =  66.53090; 
+	    daa[17*20+11] =  13.75050; daa[17*20+12] =  51.57060; daa[17*20+13] = 152.96400; 
+	    daa[17*20+14] =  13.94050; daa[17*20+15] =  52.37420; daa[17*20+16] =  11.08640; 
+	    daa[18*20+ 0] =  24.07350; daa[18*20+ 1] =  38.15330; daa[18*20+ 2] = 108.60000; 
+	    daa[18*20+ 3] =  32.57110; daa[18*20+ 4] =  54.38330; daa[18*20+ 5] =  22.77100; 
+	    daa[18*20+ 6] =  19.63030; daa[18*20+ 7] =  10.36040; daa[18*20+ 8] = 387.34400; 
+	    daa[18*20+ 9] =  42.01700; daa[18*20+10] =  39.86180; daa[18*20+11] =  13.32640; 
+	    daa[18*20+12] =  42.84370; daa[18*20+13] = 645.42800; daa[18*20+14] =  21.60460; 
+	    daa[18*20+15] =  78.69930; daa[18*20+16] =  29.11480; daa[18*20+17] = 248.53900; 
+	    daa[19*20+ 0] = 200.60100; daa[19*20+ 1] =  25.18490; daa[19*20+ 2] =  19.62460; 
+	    daa[19*20+ 3] =  15.23350; daa[19*20+ 4] = 100.21400; daa[19*20+ 5] =  30.12810; 
+	    daa[19*20+ 6] =  58.87310; daa[19*20+ 7] =  18.72470; daa[19*20+ 8] =  11.83580; 
+	    daa[19*20+ 9] = 782.13000; daa[19*20+10] = 180.03400; daa[19*20+11] =  30.54340; 
+	    daa[19*20+12] = 205.84500; daa[19*20+13] =  64.98920; daa[19*20+14] =  31.48870; 
+	    daa[19*20+15] =  23.27390; daa[19*20+16] = 138.82300; daa[19*20+17] =  36.53690; 
+	    daa[19*20+18] =  31.47300; 
+	    	   
+/*		THESE WRONG FREQUENCIES ARE ROUNDED to 3 digits, same for RAxML
+  	    f[0]  = 0.08700; f[1]  = 0.04400; f[2]  = 0.03900; f[3]  = 0.05700;
+	    f[4]  = 0.01900; f[5]  = 0.03700; f[6]  = 0.05800; f[7]  = 0.08300;
+	    f[8]  = 0.02400; f[9]  = 0.04900; f[10] = 0.08600; f[11] = 0.06200;
+	    f[12] = 0.02000; f[13] = 0.03800; f[14] = 0.04600; f[15] = 0.07000;
+	    f[16] = 0.06100; f[17] = 0.01400; f[18] = 0.03500; f[19] = 0.07100;   
+*/
+	    // NOTE: originally, f[19]= 0.0708956 but frequencies do not sum up to 1
+	    f[0] = 0.0866279; f[1] =  0.043972; f[2] =  0.0390894; f[3] =  0.0570451;
+	    f[4] =  0.0193078; f[5] =  0.0367281; f[6] =  0.0580589; f[7] =  0.0832518;
+	    f[8] =  0.0244313; f[9] =  0.048466; f[10] =  0.086209; f[11] = 0.0620286;
+	    f[12] = 0.0195027; f[13] =  0.0384319; f[14] =  0.0457631; f[15] = 0.0695179;
+	    f[16] =  0.0610127; f[17] =  0.0143859; f[18] =  0.0352742; f[19] =  0.0708957;
+	  }
+	else if (prot_model == "RTREV") 
+	  {
+	    daa[1*20+0]= 34;         daa[2*20+0]= 51;         daa[2*20+1]= 35;         daa[3*20+0]= 10;         
+	    daa[3*20+1]= 30;         daa[3*20+2]= 384;        daa[4*20+0]= 439;        daa[4*20+1]= 92;         
+	    daa[4*20+2]= 128;        daa[4*20+3]= 1;          daa[5*20+0]= 32;         daa[5*20+1]= 221;        
+	    daa[5*20+2]= 236;        daa[5*20+3]= 78;         daa[5*20+4]= 70;         daa[6*20+0]= 81;         
+	    daa[6*20+1]= 10;         daa[6*20+2]= 79;         daa[6*20+3]= 542;        daa[6*20+4]= 1;          
+	    daa[6*20+5]= 372;        daa[7*20+0]= 135;        daa[7*20+1]= 41;         daa[7*20+2]= 94;         
+	    daa[7*20+3]= 61;         daa[7*20+4]= 48;         daa[7*20+5]= 18;         daa[7*20+6]= 70;         
+	    daa[8*20+0]= 30;         daa[8*20+1]= 90;         daa[8*20+2]= 320;        daa[8*20+3]= 91;         
+	    daa[8*20+4]= 124;        daa[8*20+5]= 387;        daa[8*20+6]= 34;         daa[8*20+7]= 68;         
+	    daa[9*20+0]= 1;          daa[9*20+1]= 24;         daa[9*20+2]= 35;         daa[9*20+3]= 1;          
+	    daa[9*20+4]= 104;        daa[9*20+5]= 33;         daa[9*20+6]= 1;          daa[9*20+7]= 1;          
+	    daa[9*20+8]= 34;         daa[10*20+0]= 45;        daa[10*20+1]= 18;        daa[10*20+2]= 15;        
+	    daa[10*20+3]= 5;         daa[10*20+4]= 110;       daa[10*20+5]= 54;        daa[10*20+6]= 21;        
+	    daa[10*20+7]= 3;         daa[10*20+8]= 51;        daa[10*20+9]= 385;       daa[11*20+0]= 38;        
+	    daa[11*20+1]= 593;       daa[11*20+2]= 123;       daa[11*20+3]= 20;        daa[11*20+4]= 16;        
+	    daa[11*20+5]= 309;       daa[11*20+6]= 141;       daa[11*20+7]= 30;        daa[11*20+8]= 76;        
+	    daa[11*20+9]= 34;        daa[11*20+10]= 23;       daa[12*20+0]= 235;       daa[12*20+1]= 57;        
+	    daa[12*20+2]= 1;         daa[12*20+3]= 1;         daa[12*20+4]= 156;       daa[12*20+5]= 158;       
+	    daa[12*20+6]= 1;         daa[12*20+7]= 37;        daa[12*20+8]= 116;       daa[12*20+9]= 375;       
+	    daa[12*20+10]= 581;      daa[12*20+11]= 134;      daa[13*20+0]= 1;         daa[13*20+1]= 7;         
+	    daa[13*20+2]= 49;        daa[13*20+3]= 1;         daa[13*20+4]= 70;        daa[13*20+5]= 1;         
+	    daa[13*20+6]= 1;         daa[13*20+7]= 7;         daa[13*20+8]= 141;       daa[13*20+9]= 64;        
+	    daa[13*20+10]= 179;      daa[13*20+11]= 14;       daa[13*20+12]= 247;      daa[14*20+0]= 97;        
+	    daa[14*20+1]= 24;        daa[14*20+2]= 33;        daa[14*20+3]= 55;        daa[14*20+4]= 1;         
+	    daa[14*20+5]= 68;        daa[14*20+6]= 52;        daa[14*20+7]= 17;        daa[14*20+8]= 44;        
+	    daa[14*20+9]= 10;        daa[14*20+10]= 22;       daa[14*20+11]= 43;       daa[14*20+12]= 1;        
+	    daa[14*20+13]= 11;       daa[15*20+0]= 460;       daa[15*20+1]= 102;       daa[15*20+2]= 294;       
+	    daa[15*20+3]= 136;       daa[15*20+4]= 75;        daa[15*20+5]= 225;       daa[15*20+6]= 95;        
+	    daa[15*20+7]= 152;       daa[15*20+8]= 183;       daa[15*20+9]= 4;         daa[15*20+10]= 24;       
+	    daa[15*20+11]= 77;       daa[15*20+12]= 1;        daa[15*20+13]= 20;       daa[15*20+14]= 134;      
+	    daa[16*20+0]= 258;       daa[16*20+1]= 64;        daa[16*20+2]= 148;       daa[16*20+3]= 55;        
+	    daa[16*20+4]= 117;       daa[16*20+5]= 146;       daa[16*20+6]= 82;        daa[16*20+7]= 7;         
+	    daa[16*20+8]= 49;        daa[16*20+9]= 72;        daa[16*20+10]= 25;       daa[16*20+11]= 110;      
+	    daa[16*20+12]= 131;      daa[16*20+13]= 69;       daa[16*20+14]= 62;       daa[16*20+15]= 671;      
+	    daa[17*20+0]= 5;         daa[17*20+1]= 13;        daa[17*20+2]= 16;        daa[17*20+3]= 1;         
+	    daa[17*20+4]= 55;        daa[17*20+5]= 10;        daa[17*20+6]= 17;        daa[17*20+7]= 23;        
+	    daa[17*20+8]= 48;        daa[17*20+9]= 39;        daa[17*20+10]= 47;       daa[17*20+11]= 6;        
+	    daa[17*20+12]= 111;      daa[17*20+13]= 182;      daa[17*20+14]= 9;        daa[17*20+15]= 14;       
+	    daa[17*20+16]= 1;        daa[18*20+0]= 55;        daa[18*20+1]= 47;        daa[18*20+2]= 28;        
+	    daa[18*20+3]= 1;         daa[18*20+4]= 131;       daa[18*20+5]= 45;        daa[18*20+6]= 1;         
+	    daa[18*20+7]= 21;        daa[18*20+8]= 307;       daa[18*20+9]= 26;        daa[18*20+10]= 64;       
+	    daa[18*20+11]= 1;        daa[18*20+12]= 74;       daa[18*20+13]= 1017;     daa[18*20+14]= 14;       
+	    daa[18*20+15]= 31;       daa[18*20+16]= 34;       daa[18*20+17]= 176;      daa[19*20+0]= 197;       
+	    daa[19*20+1]= 29;        daa[19*20+2]= 21;        daa[19*20+3]= 6;         daa[19*20+4]= 295;       
+	    daa[19*20+5]= 36;        daa[19*20+6]= 35;        daa[19*20+7]= 3;         daa[19*20+8]= 1;         
+	    daa[19*20+9]= 1048;      daa[19*20+10]= 112;      daa[19*20+11]= 19;       daa[19*20+12]= 236;      
+	    daa[19*20+13]= 92;       daa[19*20+14]= 25;       daa[19*20+15]= 39;       daa[19*20+16]= 196;      
+	    daa[19*20+17]= 26;       daa[19*20+18]= 59;       
+	    
+	    f[0]= 0.0646;           f[1]= 0.0453;           f[2]= 0.0376;           f[3]= 0.0422;           
+	    f[4]= 0.0114;           f[5]= 0.0606;           f[6]= 0.0607;           f[7]= 0.0639;           
+	    f[8]= 0.0273;           f[9]= 0.0679;           f[10]= 0.1018;          f[11]= 0.0751;          
+	    f[12]= 0.015;           f[13]= 0.0287;          f[14]= 0.0681;          f[15]= 0.0488;          
+	    f[16]= 0.0622;          f[17]= 0.0251;          f[18]= 0.0318;          f[19]= 0.0619;	    	    
+	  }
+	else if (prot_model == "CPREV") 
+	  {
+	    daa[1*20+0]= 105;        daa[2*20+0]= 227;        daa[2*20+1]= 357;        daa[3*20+0]= 175;        
+	    daa[3*20+1]= 43;         daa[3*20+2]= 4435;       daa[4*20+0]= 669;        daa[4*20+1]= 823;        
+	    daa[4*20+2]= 538;        daa[4*20+3]= 10;         daa[5*20+0]= 157;        daa[5*20+1]= 1745;       
+	    daa[5*20+2]= 768;        daa[5*20+3]= 400;        daa[5*20+4]= 10;         daa[6*20+0]= 499;        
+	    daa[6*20+1]= 152;        daa[6*20+2]= 1055;       daa[6*20+3]= 3691;       daa[6*20+4]= 10;         
+	    daa[6*20+5]= 3122;       daa[7*20+0]= 665;        daa[7*20+1]= 243;        daa[7*20+2]= 653;        
+	    daa[7*20+3]= 431;        daa[7*20+4]= 303;        daa[7*20+5]= 133;        daa[7*20+6]= 379;        
+	    daa[8*20+0]= 66;         daa[8*20+1]= 715;        daa[8*20+2]= 1405;       daa[8*20+3]= 331;        
+	    daa[8*20+4]= 441;        daa[8*20+5]= 1269;       daa[8*20+6]= 162;        daa[8*20+7]= 19;         
+	    daa[9*20+0]= 145;        daa[9*20+1]= 136;        daa[9*20+2]= 168;        daa[9*20+3]= 10;         
+	    daa[9*20+4]= 280;        daa[9*20+5]= 92;         daa[9*20+6]= 148;        daa[9*20+7]= 40;         
+	    daa[9*20+8]= 29;         daa[10*20+0]= 197;       daa[10*20+1]= 203;       daa[10*20+2]= 113;       
+	    daa[10*20+3]= 10;        daa[10*20+4]= 396;       daa[10*20+5]= 286;       daa[10*20+6]= 82;        
+	    daa[10*20+7]= 20;        daa[10*20+8]= 66;        daa[10*20+9]= 1745;      daa[11*20+0]= 236;       
+	    daa[11*20+1]= 4482;      daa[11*20+2]= 2430;      daa[11*20+3]= 412;       daa[11*20+4]= 48;        
+	    daa[11*20+5]= 3313;      daa[11*20+6]= 2629;      daa[11*20+7]= 263;       daa[11*20+8]= 305;       
+	    daa[11*20+9]= 345;       daa[11*20+10]= 218;      daa[12*20+0]= 185;       daa[12*20+1]= 125;       
+	    daa[12*20+2]= 61;        daa[12*20+3]= 47;        daa[12*20+4]= 159;       daa[12*20+5]= 202;       
+	    daa[12*20+6]= 113;       daa[12*20+7]= 21;        daa[12*20+8]= 10;        daa[12*20+9]= 1772;      
+	    daa[12*20+10]= 1351;     daa[12*20+11]= 193;      daa[13*20+0]= 68;        daa[13*20+1]= 53;        
+	    daa[13*20+2]= 97;        daa[13*20+3]= 22;        daa[13*20+4]= 726;       daa[13*20+5]= 10;        
+	    daa[13*20+6]= 145;       daa[13*20+7]= 25;        daa[13*20+8]= 127;       daa[13*20+9]= 454;       
+	    daa[13*20+10]= 1268;     daa[13*20+11]= 72;       daa[13*20+12]= 327;      daa[14*20+0]= 490;       
+	    daa[14*20+1]= 87;        daa[14*20+2]= 173;       daa[14*20+3]= 170;       daa[14*20+4]= 285;       
+	    daa[14*20+5]= 323;       daa[14*20+6]= 185;       daa[14*20+7]= 28;        daa[14*20+8]= 152;       
+	    daa[14*20+9]= 117;       daa[14*20+10]= 219;      daa[14*20+11]= 302;      daa[14*20+12]= 100;      
+	    daa[14*20+13]= 43;       daa[15*20+0]= 2440;      daa[15*20+1]= 385;       daa[15*20+2]= 2085;      
+	    daa[15*20+3]= 590;       daa[15*20+4]= 2331;      daa[15*20+5]= 396;       daa[15*20+6]= 568;       
+	    daa[15*20+7]= 691;       daa[15*20+8]= 303;       daa[15*20+9]= 216;       daa[15*20+10]= 516;      
+	    daa[15*20+11]= 868;      daa[15*20+12]= 93;       daa[15*20+13]= 487;      daa[15*20+14]= 1202;     
+	    daa[16*20+0]= 1340;      daa[16*20+1]= 314;       daa[16*20+2]= 1393;      daa[16*20+3]= 266;       
+	    daa[16*20+4]= 576;       daa[16*20+5]= 241;       daa[16*20+6]= 369;       daa[16*20+7]= 92;        
+	    daa[16*20+8]= 32;        daa[16*20+9]= 1040;      daa[16*20+10]= 156;      daa[16*20+11]= 918;      
+	    daa[16*20+12]= 645;      daa[16*20+13]= 148;      daa[16*20+14]= 260;      daa[16*20+15]= 2151;     
+	    daa[17*20+0]= 14;        daa[17*20+1]= 230;       daa[17*20+2]= 40;        daa[17*20+3]= 18;        
+	    daa[17*20+4]= 435;       daa[17*20+5]= 53;        daa[17*20+6]= 63;        daa[17*20+7]= 82;        
+	    daa[17*20+8]= 69;        daa[17*20+9]= 42;        daa[17*20+10]= 159;      daa[17*20+11]= 10;       
+	    daa[17*20+12]= 86;       daa[17*20+13]= 468;      daa[17*20+14]= 49;       daa[17*20+15]= 73;       
+	    daa[17*20+16]= 29;       daa[18*20+0]= 56;        daa[18*20+1]= 323;       daa[18*20+2]= 754;       
+	    daa[18*20+3]= 281;       daa[18*20+4]= 1466;      daa[18*20+5]= 391;       daa[18*20+6]= 142;       
+	    daa[18*20+7]= 10;        daa[18*20+8]= 1971;      daa[18*20+9]= 89;        daa[18*20+10]= 189;      
+	    daa[18*20+11]= 247;      daa[18*20+12]= 215;      daa[18*20+13]= 2370;     daa[18*20+14]= 97;       
+	    daa[18*20+15]= 522;      daa[18*20+16]= 71;       daa[18*20+17]= 346;      daa[19*20+0]= 968;       
+	    daa[19*20+1]= 92;        daa[19*20+2]= 83;        daa[19*20+3]= 75;        daa[19*20+4]= 592;       
+	    daa[19*20+5]= 54;        daa[19*20+6]= 200;       daa[19*20+7]= 91;        daa[19*20+8]= 25;        
+	    daa[19*20+9]= 4797;      daa[19*20+10]= 865;      daa[19*20+11]= 249;      daa[19*20+12]= 475;      
+	    daa[19*20+13]= 317;      daa[19*20+14]= 122;      daa[19*20+15]= 167;      daa[19*20+16]= 760;      
+	    daa[19*20+17]= 10;       daa[19*20+18]= 119;      
+	    
+	    f[0]= 0.076;            f[1]= 0.062;            f[2]= 0.041;            f[3]= 0.037;            
+	    f[4]= 0.009;            f[5]= 0.038;            f[6]= 0.049;            f[7]= 0.084;            
+	    f[8]= 0.025;            f[9]= 0.081;            f[10]= 0.101;           f[11]= 0.05;            
+	    f[12]= 0.022;           f[13]= 0.051;           f[14]= 0.043;           f[15]= 0.062;           
+	    f[16]= 0.054;           f[17]= 0.018;           f[18]= 0.031;           f[19]= 0.066; 
+	  }
+	else if (prot_model == "VT") 
+	  {
+
+	    daa[1*20+0]=   1.2412691067876198;
+	    daa[2*20+0]=   1.2184237953498958;
+	    daa[2*20+1]=   1.5720770753326880;
+	    daa[3*20+0]=   1.3759368509441177;
+	    daa[3*20+1]=   0.7550654439001206;
+	    daa[3*20+2]=   7.8584219153689405;
+	    daa[4*20+0]=   2.4731223087544874;
+	    daa[4*20+1]=   1.4414262567428417;
+	    daa[4*20+2]=   0.9784679122774127;
+	    daa[4*20+3]=   0.2272488448121475;
+	    daa[5*20+0]=   2.2155167805137470;
+	    daa[5*20+1]=   5.5120819705248678;
+	    daa[5*20+2]=   3.0143201670924822;
+	    daa[5*20+3]=   1.6562495638176040;
+	    daa[5*20+4]=   0.4587469126746136;
+	    daa[6*20+0]=   2.3379911207495061;
+	    daa[6*20+1]=   1.3542404860613146;
+	    daa[6*20+2]=   2.0093434778398112;
+	    daa[6*20+3]=   9.6883451875685065;
+	    daa[6*20+4]=   0.4519167943192672;
+	    daa[6*20+5]=   6.8124601839937675;
+	    daa[7*20+0]=   3.3386555146457697;
+	    daa[7*20+1]=   1.3121700301622004;
+	    daa[7*20+2]=   2.4117632898861809;
+	    daa[7*20+3]=   1.9142079025990228;
+	    daa[7*20+4]=   1.1034605684472507;
+	    daa[7*20+5]=   0.8776110594765502;
+	    daa[7*20+6]=   1.3860121390169038;
+	    daa[8*20+0]=   0.9615841926910841;
+	    daa[8*20+1]=   4.9238668283945266;
+	    daa[8*20+2]=   6.1974384977884114;
+	    daa[8*20+3]=   2.1459640610133781;
+	    daa[8*20+4]=   1.5196756759380692;
+	    daa[8*20+5]=   7.9943228564946525;
+	    daa[8*20+6]=   1.6360079688522375;
+	    daa[8*20+7]=   0.8561248973045037;
+	    daa[9*20+0]=   0.8908203061925510;
+	    daa[9*20+1]=   0.4323005487925516;
+	    daa[9*20+2]=   0.9179291175331520;
+	    daa[9*20+3]=   0.2161660372725585;
+	    daa[9*20+4]=   0.9126668032539315;
+	    daa[9*20+5]=   0.4882733432879921;
+	    daa[9*20+6]=   0.4035497929633328;
+	    daa[9*20+7]=   0.2888075033037488;
+	    daa[9*20+8]=   0.5787937115407940;
+	    daa[10*20+0]=  1.0778497408764076;
+	    daa[10*20+1]=  0.8386701149158265;
+	    daa[10*20+2]=  0.4098311270816011;
+	    daa[10*20+3]=  0.3574207468998517;
+	    daa[10*20+4]=  1.4081315998413697;
+	    daa[10*20+5]=  1.3318097154194044;
+	    daa[10*20+6]=  0.5610717242294755;
+	    daa[10*20+7]=  0.3578662395745526;
+	    daa[10*20+8]=  1.0765007949562073;
+	    daa[10*20+9]=  6.0019110258426362;
+	    daa[11*20+0]=  1.4932055816372476;
+	    daa[11*20+1]=  10.017330817366002;
+	    daa[11*20+2]=  4.4034547578962568;
+	    daa[11*20+3]=  1.4521790561663968;
+	    daa[11*20+4]=  0.3371091785647479;
+	    daa[11*20+5]=  6.0519085243118811;
+	    daa[11*20+6]=  4.3290086529582830;
+	    daa[11*20+7]=  0.8945563662345198;
+	    daa[11*20+8]=  1.8085136096039203;
+	    daa[11*20+9]=  0.6244297525127139;
+	    daa[11*20+10]= 0.5642322882556321;
+	    daa[12*20+0]=  1.9006455961717605;
+	    daa[12*20+1]=  1.2488638689609959;
+	    daa[12*20+2]=  0.9378803706165143;
+	    daa[12*20+3]=  0.4075239926000898;
+	    daa[12*20+4]=  1.2213054800811556;
+	    daa[12*20+5]=  1.9106190827629084;
+	    daa[12*20+6]=  0.7471936218068498;
+	    daa[12*20+7]=  0.5954812791740037;
+	    daa[12*20+8]=  1.3808291710019667;
+	    daa[12*20+9]=  6.7597899772045418;
+	    daa[12*20+10]= 8.0327792947421148;
+	    daa[12*20+11]= 1.7129670976916258;
+	    daa[13*20+0]=  0.6883439026872615;
+	    daa[13*20+1]=  0.4224945197276290;
+	    daa[13*20+2]=  0.5044944273324311;
+	    daa[13*20+3]=  0.1675129724559251;
+	    daa[13*20+4]=  1.6953951980808002;
+	    daa[13*20+5]=  0.3573432522499545;
+	    daa[13*20+6]=  0.2317194387691585;
+	    daa[13*20+7]=  0.3693722640980460;
+	    daa[13*20+8]=  1.3629765501081097;
+	    daa[13*20+9]=  2.2864286949316077;
+	    daa[13*20+10]= 4.3611548063555778;
+	    daa[13*20+11]= 0.3910559903834828;
+	    daa[13*20+12]= 2.3201373546296349;
+	    daa[14*20+0]=  2.7355620089953550;
+	    daa[14*20+1]=  1.3091837782420783;
+	    daa[14*20+2]=  0.7103720531974738;
+	    daa[14*20+3]=  1.0714605979577547;
+	    daa[14*20+4]=  0.4326227078645523;
+	    daa[14*20+5]=  2.3019177728300728;
+	    daa[14*20+6]=  1.5132807416252063;
+	    daa[14*20+7]=  0.7744933618134962;
+	    daa[14*20+8]=  1.8370555852070649;
+	    daa[14*20+9]=  0.4811402387911145;
+	    daa[14*20+10]= 1.0084320519837335;
+	    daa[14*20+11]= 1.3918935593582853;
+	    daa[14*20+12]= 0.4953193808676289;
+	    daa[14*20+13]= 0.3746821107962129;
+	    daa[15*20+0]=  6.4208961859142883;
+	    daa[15*20+1]=  1.9202994262316166;
+	    daa[15*20+2]=  6.1234512396801764;
+	    daa[15*20+3]=  2.2161944596741829;
+	    daa[15*20+4]=  3.6366815408744255;
+	    daa[15*20+5]=  2.3193703643237220;
+	    daa[15*20+6]=  1.8273535587773553;
+	    daa[15*20+7]=  3.0637776193717610;
+	    daa[15*20+8]=  1.9699895187387506;
+	    daa[15*20+9]=  0.6047491507504744;
+	    daa[15*20+10]= 0.8953754669269811;
+	    daa[15*20+11]= 1.9776630140912268;
+	    daa[15*20+12]= 1.0657482318076852;
+	    daa[15*20+13]= 1.1079144700606407;
+	    daa[15*20+14]= 3.5465914843628927;
+	    daa[16*20+0]=  5.2892514169776437;
+	    daa[16*20+1]=  1.3363401740560601;
+	    daa[16*20+2]=  3.8852506105922231;
+	    daa[16*20+3]=  1.5066839872944762;
+	    daa[16*20+4]=  1.7557065205837685;
+	    daa[16*20+5]=  2.1576510103471440;
+	    daa[16*20+6]=  1.5839981708584689;
+	    daa[16*20+7]=  0.7147489676267383;
+	    daa[16*20+8]=  1.6136654573285647;
+	    daa[16*20+9]=  2.6344778384442731;
+	    daa[16*20+10]= 1.0192004372506540;
+	    daa[16*20+11]= 2.5513781312660280;
+	    daa[16*20+12]= 3.3628488360462363;
+	    daa[16*20+13]= 0.6882725908872254;
+	    daa[16*20+14]= 1.9485376673137556;
+	    daa[16*20+15]= 8.8479984061248178;
+	    daa[17*20+0]=  0.5488578478106930;
+	    daa[17*20+1]=  1.5170142153962840;
+	    daa[17*20+2]=  0.1808525752605976;
+	    daa[17*20+3]=  0.2496584188151770;
+	    daa[17*20+4]=  1.6275179891253113;
+	    daa[17*20+5]=  0.8959082681546182;
+	    daa[17*20+6]=  0.4198391148111098;
+	    daa[17*20+7]=  0.9349753595598769;
+	    daa[17*20+8]=  0.6301954684360302;
+	    daa[17*20+9]=  0.5604648274060783;
+	    daa[17*20+10]= 1.5183114434679339;
+	    daa[17*20+11]= 0.5851920879490173;
+	    daa[17*20+12]= 1.4680478689711018;
+	    daa[17*20+13]= 3.3448437239772266;
+	    daa[17*20+14]= 0.4326058001438786;
+	    daa[17*20+15]= 0.6791126595939816;
+	    daa[17*20+16]= 0.4514203099376473;
+	    daa[18*20+0]=  0.5411769916657778;
+	    daa[18*20+1]=  0.8912614404565405;
+	    daa[18*20+2]=  1.0894926581511342;
+	    daa[18*20+3]=  0.7447620891784513;
+	    daa[18*20+4]=  2.1579775140421025;
+	    daa[18*20+5]=  0.9183596801412757;
+	    daa[18*20+6]=  0.5818111331782764;
+	    daa[18*20+7]=  0.3374467649724478;
+	    daa[18*20+8]=  7.7587442309146040;
+	    daa[18*20+9]=  0.8626796044156272;
+	    daa[18*20+10]= 1.2452243224541324;
+	    daa[18*20+11]= 0.7835447533710449;
+	    daa[18*20+12]= 1.0899165770956820;
+	    daa[18*20+13]= 10.384852333133459;
+	    daa[18*20+14]= 0.4819109019647465;
+	    daa[18*20+15]= 0.9547229305958682;
+	    daa[18*20+16]= 0.8564314184691215;
+	    daa[18*20+17]= 4.5377235790405388;
+	    daa[19*20+0]=  4.6501894691803214;
+	    daa[19*20+1]=  0.7807017855806767;
+	    daa[19*20+2]=  0.4586061981719967;
+	    daa[19*20+3]=  0.4594535241660911;
+	    daa[19*20+4]=  2.2627456996290891;
+	    daa[19*20+5]=  0.6366932501396869;
+	    daa[19*20+6]=  0.8940572875547330;
+	    daa[19*20+7]=  0.6193321034173915;
+	    daa[19*20+8]=  0.5333220944030346;
+	    daa[19*20+9]=  14.872933461519061;
+	    daa[19*20+10]= 3.5458093276667237;
+	    daa[19*20+11]= 0.7801080335991272;
+	    daa[19*20+12]= 4.0584577156753401;
+	    daa[19*20+13]= 1.7039730522675411;
+	    daa[19*20+14]= 0.5985498912985666;
+	    daa[19*20+15]= 0.9305232113028208;
+	    daa[19*20+16]= 3.4242218450865543;
+	    daa[19*20+17]= 0.5658969249032649;
+	    daa[19*20+18]= 1.0000000000000000;
+	    
+	    f[0]=  0.0770764620135024;
+	    f[1]=  0.0500819370772208;
+	    f[2]=  0.0462377395993731;
+	    f[3]=  0.0537929860758246;
+	    f[4]=  0.0144533387583345;
+	    f[5]=  0.0408923608974345;
+	    f[6]=  0.0633579339160905;
+	    f[7]=  0.0655672355884439;
+	    f[8]=  0.0218802687005936;
+	    f[9]=  0.0591969699027449;
+	    f[10]= 0.0976461276528445;
+	    f[11]= 0.0592079410822730;
+	    f[12]= 0.0220695876653368;
+	    f[13]= 0.0413508521834260;
+	    f[14]= 0.0476871596856874;
+	    f[15]= 0.0707295165111524;
+	    f[16]= 0.0567759161524817;
+	    f[17]= 0.0127019797647213;
+	    f[18]= 0.0323746050281867;
+	    f[19]= 0.0669190817443274;
+	  }
+	else if (prot_model == "BLOSUM62") 
+	  {
+	    daa[1*20+0]= 0.735790389698;  daa[2*20+0]= 0.485391055466;  daa[2*20+1]= 1.297446705134;  
+	    daa[3*20+0]= 0.543161820899;  
+	    daa[3*20+1]= 0.500964408555;  daa[3*20+2]= 3.180100048216;  daa[4*20+0]= 1.45999531047;   
+	    daa[4*20+1]= 0.227826574209;  
+	    daa[4*20+2]= 0.397358949897;  daa[4*20+3]= 0.240836614802;  daa[5*20+0]= 1.199705704602;  
+	    daa[5*20+1]= 3.020833610064;  
+	    daa[5*20+2]= 1.839216146992;  daa[5*20+3]= 1.190945703396;  daa[5*20+4]= 0.32980150463;   
+	    daa[6*20+0]= 1.1709490428;    
+	    daa[6*20+1]= 1.36057419042;   daa[6*20+2]= 1.24048850864;   daa[6*20+3]= 3.761625208368;  
+	    daa[6*20+4]= 0.140748891814;  
+	    daa[6*20+5]= 5.528919177928;  daa[7*20+0]= 1.95588357496;   daa[7*20+1]= 0.418763308518;  
+	    daa[7*20+2]= 1.355872344485;  
+	    daa[7*20+3]= 0.798473248968;  daa[7*20+4]= 0.418203192284;  daa[7*20+5]= 0.609846305383;  
+	    daa[7*20+6]= 0.423579992176;  
+	    daa[8*20+0]= 0.716241444998;  daa[8*20+1]= 1.456141166336;  daa[8*20+2]= 2.414501434208;  
+	    daa[8*20+3]= 0.778142664022;  
+	    daa[8*20+4]= 0.354058109831;  daa[8*20+5]= 2.43534113114;   daa[8*20+6]= 1.626891056982;  
+	    daa[8*20+7]= 0.539859124954;  
+	    daa[9*20+0]= 0.605899003687;  daa[9*20+1]= 0.232036445142;  daa[9*20+2]= 0.283017326278;  
+	    daa[9*20+3]= 0.418555732462;  
+	    daa[9*20+4]= 0.774894022794;  daa[9*20+5]= 0.236202451204;  daa[9*20+6]= 0.186848046932;  
+	    daa[9*20+7]= 0.189296292376;  
+	    daa[9*20+8]= 0.252718447885;  daa[10*20+0]= 0.800016530518; daa[10*20+1]= 0.622711669692; 
+	    daa[10*20+2]= 0.211888159615; 
+	    daa[10*20+3]= 0.218131577594; daa[10*20+4]= 0.831842640142; daa[10*20+5]= 0.580737093181; 
+	    daa[10*20+6]= 0.372625175087; 
+	    daa[10*20+7]= 0.217721159236; daa[10*20+8]= 0.348072209797; daa[10*20+9]= 3.890963773304; 
+	    daa[11*20+0]= 1.295201266783; 
+	    daa[11*20+1]= 5.411115141489; daa[11*20+2]= 1.593137043457; daa[11*20+3]= 1.032447924952; 
+	    daa[11*20+4]= 0.285078800906; 
+	    daa[11*20+5]= 3.945277674515; daa[11*20+6]= 2.802427151679; daa[11*20+7]= 0.752042440303; 
+	    daa[11*20+8]= 1.022507035889; 
+	    daa[11*20+9]= 0.406193586642; daa[11*20+10]= 0.445570274261;daa[12*20+0]= 1.253758266664; 
+	    daa[12*20+1]= 0.983692987457; 
+	    daa[12*20+2]= 0.648441278787; daa[12*20+3]= 0.222621897958; daa[12*20+4]= 0.76768882348;  
+	    daa[12*20+5]= 2.494896077113; 
+	    daa[12*20+6]= 0.55541539747;  daa[12*20+7]= 0.459436173579; daa[12*20+8]= 0.984311525359; 
+	    daa[12*20+9]= 3.364797763104; 
+	    daa[12*20+10]= 6.030559379572;daa[12*20+11]= 1.073061184332;daa[13*20+0]= 0.492964679748; 
+	    daa[13*20+1]= 0.371644693209; 
+	    daa[13*20+2]= 0.354861249223; daa[13*20+3]= 0.281730694207; daa[13*20+4]= 0.441337471187; 
+	    daa[13*20+5]= 0.14435695975;  
+	    daa[13*20+6]= 0.291409084165; daa[13*20+7]= 0.368166464453; daa[13*20+8]= 0.714533703928; 
+	    daa[13*20+9]= 1.517359325954; 
+	    daa[13*20+10]= 2.064839703237;daa[13*20+11]= 0.266924750511;daa[13*20+12]= 1.77385516883; 
+	    daa[14*20+0]= 1.173275900924; 
+	    daa[14*20+1]= 0.448133661718; daa[14*20+2]= 0.494887043702; daa[14*20+3]= 0.730628272998; 
+	    daa[14*20+4]= 0.356008498769; 
+	    daa[14*20+5]= 0.858570575674; daa[14*20+6]= 0.926563934846; daa[14*20+7]= 0.504086599527; daa[14*20+8]= 0.527007339151; 
+	    daa[14*20+9]= 0.388355409206; daa[14*20+10]= 0.374555687471;daa[14*20+11]= 1.047383450722;daa[14*20+12]= 0.454123625103;
+	    daa[14*20+13]= 0.233597909629;daa[15*20+0]= 4.325092687057; daa[15*20+1]= 1.12278310421;  daa[15*20+2]= 2.904101656456; 
+	    daa[15*20+3]= 1.582754142065; daa[15*20+4]= 1.197188415094; daa[15*20+5]= 1.934870924596; daa[15*20+6]= 1.769893238937; 
+	    daa[15*20+7]= 1.509326253224; daa[15*20+8]= 1.11702976291;  daa[15*20+9]= 0.35754441246;  daa[15*20+10]= 0.352969184527;
+	    daa[15*20+11]= 1.752165917819;daa[15*20+12]= 0.918723415746;daa[15*20+13]= 0.540027644824;daa[15*20+14]= 1.169129577716;
+	    daa[16*20+0]= 1.729178019485; daa[16*20+1]= 0.914665954563; daa[16*20+2]= 1.898173634533; daa[16*20+3]= 0.934187509431; 
+	    daa[16*20+4]= 1.119831358516; daa[16*20+5]= 1.277480294596; daa[16*20+6]= 1.071097236007; daa[16*20+7]= 0.641436011405; 
+	    daa[16*20+8]= 0.585407090225; daa[16*20+9]= 1.17909119726;  daa[16*20+10]= 0.915259857694;daa[16*20+11]= 1.303875200799;
+	    daa[16*20+12]= 1.488548053722;daa[16*20+13]= 0.488206118793;daa[16*20+14]= 1.005451683149;daa[16*20+15]= 5.15155629227; 
+	    daa[17*20+0]= 0.465839367725; daa[17*20+1]= 0.426382310122; daa[17*20+2]= 0.191482046247; daa[17*20+3]= 0.145345046279; 
+	    daa[17*20+4]= 0.527664418872; daa[17*20+5]= 0.758653808642; daa[17*20+6]= 0.407635648938; daa[17*20+7]= 0.508358924638; 
+	    daa[17*20+8]= 0.30124860078;  daa[17*20+9]= 0.34198578754;  daa[17*20+10]= 0.6914746346;  daa[17*20+11]= 0.332243040634;
+	    daa[17*20+12]= 0.888101098152;daa[17*20+13]= 2.074324893497;daa[17*20+14]= 0.252214830027;daa[17*20+15]= 0.387925622098;
+	    daa[17*20+16]= 0.513128126891;daa[18*20+0]= 0.718206697586; daa[18*20+1]= 0.720517441216; daa[18*20+2]= 0.538222519037; 
+	    daa[18*20+3]= 0.261422208965; daa[18*20+4]= 0.470237733696; daa[18*20+5]= 0.95898974285;  daa[18*20+6]= 0.596719300346; 
+	    daa[18*20+7]= 0.308055737035; daa[18*20+8]= 4.218953969389; daa[18*20+9]= 0.674617093228; daa[18*20+10]= 0.811245856323;
+	    daa[18*20+11]= 0.7179934869;  daa[18*20+12]= 0.951682162246;daa[18*20+13]= 6.747260430801;daa[18*20+14]= 0.369405319355;
+	    daa[18*20+15]= 0.796751520761;daa[18*20+16]= 0.801010243199;daa[18*20+17]= 4.054419006558;daa[19*20+0]= 2.187774522005; 
+	    daa[19*20+1]= 0.438388343772; daa[19*20+2]= 0.312858797993; daa[19*20+3]= 0.258129289418; daa[19*20+4]= 1.116352478606; 
+	    daa[19*20+5]= 0.530785790125; daa[19*20+6]= 0.524253846338; daa[19*20+7]= 0.25334079019;  daa[19*20+8]= 0.20155597175;  
+	    daa[19*20+9]= 8.311839405458; daa[19*20+10]= 2.231405688913;daa[19*20+11]= 0.498138475304;daa[19*20+12]= 2.575850755315;
+	    daa[19*20+13]= 0.838119610178;daa[19*20+14]= 0.496908410676;daa[19*20+15]= 0.561925457442;daa[19*20+16]= 2.253074051176;
+	    daa[19*20+17]= 0.266508731426;daa[19*20+18]= 1;             
+	    
+	    f[0]= 0.074;                 f[1]= 0.052;                 f[2]= 0.045;                 f[3]= 0.054;                 
+	    f[4]= 0.025;                 f[5]= 0.034;                 f[6]= 0.054;                 f[7]= 0.074;                 
+	    f[8]= 0.026;                 f[9]= 0.068;                 f[10]= 0.099;                f[11]= 0.058;                
+	    f[12]= 0.025;                f[13]= 0.047;                f[14]= 0.039;                f[15]= 0.057;                
+	    f[16]= 0.051;                f[17]= 0.013;                f[18]= 0.032;                f[19]= 0.073;
+	  }
+	else if (prot_model == "MTMAM") 
+	  {
+	    daa[1*20+0]= 32;              daa[2*20+0]= 2;    daa[2*20+1]= 4;               daa[3*20+0]= 11;
+	    daa[3*20+1]= 1e-6;            daa[3*20+2]= 864;  daa[4*20+0]= 1e-6;            daa[4*20+1]= 186;
+	    daa[4*20+2]= 1e-6;            daa[4*20+3]= 1e-6; daa[5*20+0]= 1e-6;            daa[5*20+1]= 246;
+	    daa[5*20+2]= 8;               daa[5*20+3]= 49;   daa[5*20+4]= 1e-6;            daa[6*20+0]= 1e-6;
+	    daa[6*20+1]= 1e-6;            daa[6*20+2]= 1e-6; daa[6*20+3]= 569;             daa[6*20+4]= 1e-6;
+	    daa[6*20+5]= 274;             daa[7*20+0]= 78;   daa[7*20+1]= 18;              daa[7*20+2]= 47;
+	    daa[7*20+3]= 79;              daa[7*20+4]= 1e-6; daa[7*20+5]= 1e-6;            daa[7*20+6]= 22;
+	    daa[8*20+0]= 8;               daa[8*20+1]= 232;  daa[8*20+2]= 458;             daa[8*20+3]= 11;
+	    daa[8*20+4]= 305;             daa[8*20+5]= 550;  daa[8*20+6]= 22;              daa[8*20+7]= 1e-6;
+	    daa[9*20+0]= 75;              daa[9*20+1]= 1e-6; daa[9*20+2]= 19;              daa[9*20+3]= 1e-6;
+	    daa[9*20+4]= 41;              daa[9*20+5]= 1e-6; daa[9*20+6]= 1e-6;            daa[9*20+7]= 1e-6;
+	    daa[9*20+8]= 1e-6;            daa[10*20+0]= 21;  daa[10*20+1]= 6;              daa[10*20+2]= 1e-6;
+	    daa[10*20+3]= 1e-6;           daa[10*20+4]= 27;  daa[10*20+5]= 20;             daa[10*20+6]= 1e-6;
+	    daa[10*20+7]= 1e-6;           daa[10*20+8]= 26;  daa[10*20+9]= 232;            daa[11*20+0]= 1e-6;
+	    daa[11*20+1]= 50;             daa[11*20+2]= 408; daa[11*20+3]= 1e-6;           daa[11*20+4]= 1e-6;
+	    daa[11*20+5]= 242;            daa[11*20+6]= 215; daa[11*20+7]= 1e-6;           daa[11*20+8]= 1e-6;
+	    daa[11*20+9]= 6;              daa[11*20+10]= 4;  daa[12*20+0]= 76;             daa[12*20+1]= 1e-6;
+	    daa[12*20+2]= 21;             daa[12*20+3]= 1e-6;daa[12*20+4]= 1e-6;           daa[12*20+5]= 22;
+	    daa[12*20+6]= 1e-6;           daa[12*20+7]= 1e-6;daa[12*20+8]= 1e-6;           daa[12*20+9]= 378;
+	    daa[12*20+10]= 609;           daa[12*20+11]= 59; daa[13*20+0]= 1e-6;           daa[13*20+1]= 1e-6;
+	    daa[13*20+2]= 6;              daa[13*20+3]= 5;   daa[13*20+4]= 7;              daa[13*20+5]= 1e-6;
+	    daa[13*20+6]= 1e-6;           daa[13*20+7]= 1e-6;daa[13*20+8]= 1e-6;           daa[13*20+9]= 57;
+	    daa[13*20+10]= 246;           daa[13*20+11]= 1e-6;  daa[13*20+12]= 11;            daa[14*20+0]= 53;
+	    daa[14*20+1]= 9;              daa[14*20+2]= 33;  daa[14*20+3]= 2;              daa[14*20+4]= 1e-6;
+	    daa[14*20+5]= 51;             daa[14*20+6]= 1e-6;daa[14*20+7]= 1e-6;           daa[14*20+8]= 53;
+	    daa[14*20+9]= 5;              daa[14*20+10]= 43; daa[14*20+11]= 18;            daa[14*20+12]= 1e-6;
+	    daa[14*20+13]= 17;            daa[15*20+0]= 342; daa[15*20+1]= 3;              daa[15*20+2]= 446;
+	    daa[15*20+3]= 16;             daa[15*20+4]= 347; daa[15*20+5]= 30;             daa[15*20+6]= 21;
+	    daa[15*20+7]= 112;            daa[15*20+8]= 20;  daa[15*20+9]= 1e-6;           daa[15*20+10]= 74;
+	    daa[15*20+11]= 65;            daa[15*20+12]= 47; daa[15*20+13]= 90;            daa[15*20+14]= 202;
+	    daa[16*20+0]= 681;            daa[16*20+1]= 1e-6;daa[16*20+2]= 110;            daa[16*20+3]= 1e-6;
+	    daa[16*20+4]= 114;            daa[16*20+5]= 1e-6;daa[16*20+6]= 4;              daa[16*20+7]= 1e-6;
+	    daa[16*20+8]= 1;              daa[16*20+9]= 360; daa[16*20+10]= 34;            daa[16*20+11]= 50;
+	    daa[16*20+12]= 691;           daa[16*20+13]= 8;  daa[16*20+14]= 78;            daa[16*20+15]= 614;
+	    daa[17*20+0]= 5;              daa[17*20+1]= 16;  daa[17*20+2]= 6;              daa[17*20+3]= 1e-6;
+	    daa[17*20+4]= 65;             daa[17*20+5]= 1e-6;daa[17*20+6]= 1e-6;           daa[17*20+7]= 1e-6;
+	    daa[17*20+8]= 1e-6;           daa[17*20+9]= 1e-6;daa[17*20+10]= 12;            daa[17*20+11]= 1e-6;
+	    daa[17*20+12]= 13;            daa[17*20+13]= 1e-6;  daa[17*20+14]= 7;             daa[17*20+15]= 17;
+	    daa[17*20+16]= 1e-6;          daa[18*20+0]= 1e-6;daa[18*20+1]= 1e-6;           daa[18*20+2]= 156;
+	    daa[18*20+3]= 1e-6;           daa[18*20+4]= 530; daa[18*20+5]= 54;             daa[18*20+6]= 1e-6;
+	    daa[18*20+7]= 1;              daa[18*20+8]= 1525;daa[18*20+9]= 16;             daa[18*20+10]= 25;
+	    daa[18*20+11]= 67;            daa[18*20+12]= 1e-6;  daa[18*20+13]= 682;           daa[18*20+14]= 8;
+	    daa[18*20+15]= 107;           daa[18*20+16]= 1e-6;  daa[18*20+17]= 14;            daa[19*20+0]= 398;
+	    daa[19*20+1]= 1e-6;           daa[19*20+2]= 1e-6;daa[19*20+3]= 10;             daa[19*20+4]= 1e-6;
+	    daa[19*20+5]= 33;             daa[19*20+6]= 20;  daa[19*20+7]= 5;              daa[19*20+8]= 1e-6;
+	    daa[19*20+9]= 2220;           daa[19*20+10]= 100;daa[19*20+11]= 1e-6;          daa[19*20+12]= 832;
+	    daa[19*20+13]= 6;             daa[19*20+14]= 1e-6;  daa[19*20+15]= 1e-6;          daa[19*20+16]= 237;
+	    daa[19*20+17]= 1e-6;          daa[19*20+18]= 1e-6;
+	    
+	    f[0]= 0.06920;  f[1]=  0.01840;  f[2]= 0.04000;  f[3]= 0.018600;
+	    f[4]= 0.00650;  f[5]=  0.02380;  f[6]= 0.02360;  f[7]= 0.055700;
+	    f[8]= 0.02770;  f[9]=  0.09050;  f[10]=0.16750;  f[11]= 0.02210;
+	    f[12]=0.05610;  f[13]= 0.06110;  f[14]=0.05360;  f[15]= 0.07250;
+	    f[16]=0.08700;  f[17]= 0.02930;  f[18]=0.03400;  f[19]= 0.04280;
+	  }
+	else if (prot_model == "LG") 
+	  {
+	    daa[1*20+0] = 0.425093;
+
+	    daa[2*20+0] = 0.276818; daa[2*20+1] = 0.751878;
+
+	    daa[3*20+0] = 0.395144; daa[3*20+1] = 0.123954; daa[3*20+2] = 5.076149;
+	    
+	    daa[4*20+0] = 2.489084; daa[4*20+1] = 0.534551; daa[4*20+2] = 0.528768; daa[4*20+3] = 0.062556;
+								 
+	    daa[5*20+0] = 0.969894; daa[5*20+1] = 2.807908; daa[5*20+2] = 1.695752; daa[5*20+3] = 0.523386; daa[5*20+4] = 0.084808;
+
+	    daa[6*20+0] = 1.038545; daa[6*20+1] = 0.363970; daa[6*20+2] = 0.541712; daa[6*20+3] = 5.243870; daa[6*20+4] = 0.003499; daa[6*20+5] = 4.128591;
+
+	    daa[7*20+0] = 2.066040; daa[7*20+1] = 0.390192; daa[7*20+2] = 1.437645; daa[7*20+3] = 0.844926; daa[7*20+4] = 0.569265; daa[7*20+5] = 0.267959; daa[7*20+6] = 0.348847;
+ 
+	    daa[8*20+0] = 0.358858; daa[8*20+1] = 2.426601; daa[8*20+2] = 4.509238; daa[8*20+3] = 0.927114; daa[8*20+4] = 0.640543; daa[8*20+5] = 4.813505; daa[8*20+6] = 0.423881; 
+	    daa[8*20+7] = 0.311484;
+
+	    daa[9*20+0] = 0.149830; daa[9*20+1] = 0.126991; daa[9*20+2] = 0.191503; daa[9*20+3] = 0.010690; daa[9*20+4] = 0.320627; daa[9*20+5] = 0.072854; daa[9*20+6] = 0.044265; 
+	    daa[9*20+7] = 0.008705; daa[9*20+8] = 0.108882; 
+
+	    daa[10*20+0] = 0.395337; daa[10*20+1] = 0.301848; daa[10*20+2] = 0.068427; daa[10*20+3] = 0.015076; daa[10*20+4] = 0.594007; daa[10*20+5] = 0.582457; daa[10*20+6] = 0.069673; 
+	    daa[10*20+7] = 0.044261; daa[10*20+8] = 0.366317; daa[10*20+9] = 4.145067 ;
+
+	    daa[11*20+0] = 0.536518; daa[11*20+1] = 6.326067; daa[11*20+2] = 2.145078; daa[11*20+3] = 0.282959; daa[11*20+4] = 0.013266; daa[11*20+5] = 3.234294; daa[11*20+6] = 1.807177; 
+	    daa[11*20+7] = 0.296636; daa[11*20+8] = 0.697264; daa[11*20+9] = 0.159069; daa[11*20+10] = 0.137500;
+
+
+	    daa[12*20+0] = 1.124035; daa[12*20+1] = 0.484133; daa[12*20+2] = 0.371004; daa[12*20+3] = 0.025548; daa[12*20+4] = 0.893680; daa[12*20+5] = 1.672569; daa[12*20+6] = 0.173735; 
+	    daa[12*20+7] = 0.139538; daa[12*20+8] = 0.442472; daa[12*20+9] = 4.273607; daa[12*20+10] = 6.312358; daa[12*20+11] = 0.656604;
+
+	    daa[13*20+0] = 0.253701; daa[13*20+1] = 0.052722;daa[13*20+2] = 0.089525; daa[13*20+3] = 0.017416; daa[13*20+4] = 1.105251; daa[13*20+5] = 0.035855; daa[13*20+6] = 0.018811; 
+	    daa[13*20+7] = 0.089586; daa[13*20+8] = 0.682139; daa[13*20+9] = 1.112727; daa[13*20+10] = 2.592692; daa[13*20+11] = 0.023918; daa[13*20+12] = 1.798853;
+
+	    daa[14*20+0] = 1.177651; daa[14*20+1] = 0.332533;daa[14*20+2] = 0.161787; daa[14*20+3] = 0.394456; daa[14*20+4] = 0.075382; daa[14*20+5] = 0.624294; daa[14*20+6] = 0.419409; 
+	    daa[14*20+7] = 0.196961; daa[14*20+8] = 0.508851; daa[14*20+9] = 0.078281; daa[14*20+10] = 0.249060; daa[14*20+11] = 0.390322; daa[14*20+12] = 0.099849; 
+	    daa[14*20+13] = 0.094464;
+ 
+	    daa[15*20+0] = 4.727182; daa[15*20+1] = 0.858151;daa[15*20+2] = 4.008358; daa[15*20+3] = 1.240275; daa[15*20+4] = 2.784478; daa[15*20+5] = 1.223828; daa[15*20+6] = 0.611973; 
+	    daa[15*20+7] = 1.739990; daa[15*20+8] = 0.990012; daa[15*20+9] = 0.064105; daa[15*20+10] = 0.182287; daa[15*20+11] = 0.748683; daa[15*20+12] = 0.346960; 
+	    daa[15*20+13] = 0.361819; daa[15*20+14] = 1.338132;
+ 
+	    daa[16*20+0] = 2.139501; daa[16*20+1] = 0.578987;daa[16*20+2] = 2.000679; daa[16*20+3] = 0.425860; daa[16*20+4] = 1.143480; daa[16*20+5] = 1.080136; daa[16*20+6] = 0.604545; 
+	    daa[16*20+7] = 0.129836; daa[16*20+8] = 0.584262; daa[16*20+9] = 1.033739; daa[16*20+10] = 0.302936; daa[16*20+11] = 1.136863; daa[16*20+12] = 2.020366; 
+	    daa[16*20+13] = 0.165001; daa[16*20+14] = 0.571468; daa[16*20+15] = 6.472279;
+
+	    daa[17*20+0] = 0.180717; daa[17*20+1] = 0.593607;daa[17*20+2] = 0.045376; daa[17*20+3] = 0.029890; daa[17*20+4] = 0.670128; daa[17*20+5] = 0.236199; daa[17*20+6] = 0.077852; 
+	    daa[17*20+7] = 0.268491; daa[17*20+8] = 0.597054; daa[17*20+9] = 0.111660; daa[17*20+10] = 0.619632; daa[17*20+11] = 0.049906; daa[17*20+12] = 0.696175; 
+	    daa[17*20+13] = 2.457121; daa[17*20+14] = 0.095131; daa[17*20+15] = 0.248862; daa[17*20+16] = 0.140825;
+
+	    daa[18*20+0] = 0.218959; daa[18*20+1] = 0.314440;daa[18*20+2] = 0.612025; daa[18*20+3] = 0.135107; daa[18*20+4] = 1.165532; daa[18*20+5] = 0.257336; daa[18*20+6] = 0.120037; 
+	    daa[18*20+7] = 0.054679; daa[18*20+8] = 5.306834; daa[18*20+9] = 0.232523; daa[18*20+10] = 0.299648; daa[18*20+11] = 0.131932; daa[18*20+12] = 0.481306; 
+	    daa[18*20+13] = 7.803902; daa[18*20+14] = 0.089613; daa[18*20+15] = 0.400547; daa[18*20+16] = 0.245841; daa[18*20+17] = 3.151815;
+
+	    daa[19*20+0] = 2.547870; daa[19*20+1] = 0.170887;daa[19*20+2] = 0.083688; daa[19*20+3] = 0.037967; daa[19*20+4] = 1.959291; daa[19*20+5] = 0.210332; daa[19*20+6] = 0.245034; 
+	    daa[19*20+7] = 0.076701; daa[19*20+8] = 0.119013; daa[19*20+9] = 10.649107; daa[19*20+10] = 1.702745; daa[19*20+11] = 0.185202; daa[19*20+12] = 1.898718; 
+	    daa[19*20+13] = 0.654683; daa[19*20+14] = 0.296501; daa[19*20+15] = 0.098369; daa[19*20+16] = 2.188158; daa[19*20+17] = 0.189510; daa[19*20+18] = 0.249313;
+	    
+/*	same problem here
+ * 	    f[0] = 0.07906;
+	    f[1] = 0.05594; 
+	    f[2] = 0.04198; 
+	    f[3] = 0.05305; 
+	    f[4] = 0.01294; 
+	    f[5] = 0.04077; 
+	    f[6] = 0.07158; 
+	    f[7] = 0.05734; 
+	    f[8] = 0.02235; 
+	    f[9] = 0.06216; 
+	    f[10] = 0.09908; 
+	    f[11] = 0.06460; 
+	    f[12] = 0.02295; 
+	    f[13] = 0.04230; 
+	    f[14] = 0.04404; 
+	    f[15] = 0.06120; 
+	    f[16] = 0.05329; 
+	    f[17] = 0.01207; 
+	    f[18] = 0.03415; 
+	    f[19] = 0.06915;*/
+	    // NOTE: originally f[19]=0.069147 but frequencies do not sum up to 1
+	    f[0] = 0.079066; f[1] = 0.055941; f[2] = 0.041977; f[3] = 0.053052;
+	    f[4] = 0.012937; f[5] = 0.040767; f[6] = 0.071586; f[7] = 0.057337;
+	    f[8] = 0.022355; f[9] = 0.062157; f[10] = 0.099081; f[11] = 0.064600;
+	    f[12] = 0.022951; f[13] = 0.042302; f[14] = 0.044040; f[15] = 0.061197;
+	    f[16] = 0.053287; f[17] = 0.012066; f[18] = 0.034155; f[19] = 0.069146;
+
+	  }	  
+	else if (prot_model == "MTART") 
+	  {
+	   
+
+	    daa[1*20+0]=   0.2;
+	    daa[2*20+0]=   0.2;
+           daa[2*20+1]=   0.2;
+           daa[3*20+0]=   1;
+           daa[3*20+1]=   4;
+           daa[3*20+2]=   500;
+           daa[4*20+0]=   254;
+           daa[4*20+1]=   36;
+           daa[4*20+2]=   98;
+           daa[4*20+3]=   11;
+           daa[5*20+0]=   0.2;
+           daa[5*20+1]=   154;
+           daa[5*20+2]=   262;
+           daa[5*20+3]=   0.2;
+           daa[5*20+4]=   0.2;
+           daa[6*20+0]=   0.2;
+           daa[6*20+1]=   0.2;
+           daa[6*20+2]=   183;
+           daa[6*20+3]=   862;
+           daa[6*20+4]=   0.2;
+           daa[6*20+5]=   262;
+           daa[7*20+0]=   200;
+           daa[7*20+1]=   0.2;
+           daa[7*20+2]=   121;
+           daa[7*20+3]=   12;
+           daa[7*20+4]=   81;
+           daa[7*20+5]=   3;
+           daa[7*20+6]=   44;
+           daa[8*20+0]=   0.2;
+           daa[8*20+1]=   41;
+           daa[8*20+2]=   180;
+           daa[8*20+3]=   0.2;
+           daa[8*20+4]=   12;
+           daa[8*20+5]=   314;
+           daa[8*20+6]=   15;
+           daa[8*20+7]=   0.2;
+           daa[9*20+0]=   26;
+           daa[9*20+1]=   2;
+           daa[9*20+2]=   21;
+           daa[9*20+3]=   7;
+           daa[9*20+4]=   63;
+           daa[9*20+5]=   11;
+           daa[9*20+6]=   7;
+           daa[9*20+7]=   3;
+           daa[9*20+8]=   0.2;
+           daa[10*20+0]=  4;
+           daa[10*20+1]=  2;
+           daa[10*20+2]=  13;
+           daa[10*20+3]=  1;
+           daa[10*20+4]=  79;
+           daa[10*20+5]=  16;
+           daa[10*20+6]=  2;
+           daa[10*20+7]=  1;
+           daa[10*20+8]=  6;
+           daa[10*20+9]=  515;
+           daa[11*20+0]=  0.2;
+           daa[11*20+1]=  209;
+           daa[11*20+2]=  467;
+           daa[11*20+3]=  2;
+           daa[11*20+4]=  0.2;
+           daa[11*20+5]=  349;
+           daa[11*20+6]=  106;
+           daa[11*20+7]=  0.2;
+           daa[11*20+8]=  0.2;
+           daa[11*20+9]=  3;
+           daa[11*20+10]= 4;
+           daa[12*20+0]=  121;
+           daa[12*20+1]=  5;
+           daa[12*20+2]=  79;
+           daa[12*20+3]=  0.2;
+           daa[12*20+4]=  312;
+           daa[12*20+5]=  67;
+           daa[12*20+6]=  0.2;
+           daa[12*20+7]=  56;
+           daa[12*20+8]=  0.2;
+           daa[12*20+9]=  515;
+           daa[12*20+10]= 885;
+           daa[12*20+11]= 106;
+           daa[13*20+0]=  13;
+           daa[13*20+1]=  5;
+           daa[13*20+2]=  20;
+           daa[13*20+3]=  0.2;
+           daa[13*20+4]=  184;
+           daa[13*20+5]=  0.2;
+           daa[13*20+6]=  0.2;
+           daa[13*20+7]=  1;
+           daa[13*20+8]=  14;
+           daa[13*20+9]=  118;
+           daa[13*20+10]= 263;
+           daa[13*20+11]= 11;
+           daa[13*20+12]= 322;
+           daa[14*20+0]=  49;
+           daa[14*20+1]=  0.2;
+           daa[14*20+2]=  17;
+           daa[14*20+3]=  0.2;
+           daa[14*20+4]=  0.2;
+           daa[14*20+5]=  39;
+           daa[14*20+6]=  8;
+           daa[14*20+7]=  0.2;
+           daa[14*20+8]=  1;
+           daa[14*20+9]=  0.2;
+           daa[14*20+10]= 12;
+           daa[14*20+11]= 17;
+           daa[14*20+12]= 5;
+           daa[14*20+13]= 15;
+           daa[15*20+0]=  673;
+           daa[15*20+1]=  3;
+           daa[15*20+2]=  398;
+           daa[15*20+3]=  44;
+           daa[15*20+4]=  664;
+           daa[15*20+5]=  52;
+           daa[15*20+6]=  31;
+           daa[15*20+7]=  226;
+           daa[15*20+8]=  11;
+           daa[15*20+9]=  7;
+           daa[15*20+10]= 8;
+           daa[15*20+11]= 144;
+           daa[15*20+12]= 112;
+           daa[15*20+13]= 36;
+           daa[15*20+14]= 87;
+           daa[16*20+0]=  244;
+           daa[16*20+1]=  0.2;
+           daa[16*20+2]=  166;
+           daa[16*20+3]=  0.2;
+           daa[16*20+4]=  183;
+           daa[16*20+5]=  44;
+           daa[16*20+6]=  43;
+           daa[16*20+7]=  0.2;
+           daa[16*20+8]=  19;
+           daa[16*20+9]=  204;
+           daa[16*20+10]= 48;
+           daa[16*20+11]= 70;
+           daa[16*20+12]= 289;
+           daa[16*20+13]= 14;
+           daa[16*20+14]= 47;
+           daa[16*20+15]= 660;
+           daa[17*20+0]=  0.2;
+           daa[17*20+1]=  0.2;
+           daa[17*20+2]=  8;
+           daa[17*20+3]=  0.2;
+           daa[17*20+4]=  22;
+           daa[17*20+5]=  7;
+           daa[17*20+6]=  11;
+           daa[17*20+7]=  2;
+           daa[17*20+8]=  0.2;
+           daa[17*20+9]=  0.2;
+           daa[17*20+10]= 21;
+           daa[17*20+11]= 16;
+           daa[17*20+12]= 71;
+           daa[17*20+13]= 54;
+           daa[17*20+14]= 0.2;
+           daa[17*20+15]= 2;
+           daa[17*20+16]= 0.2;
+           daa[18*20+0]=  1;
+           daa[18*20+1]=  4;
+           daa[18*20+2]=  251;
+           daa[18*20+3]=  0.2;
+           daa[18*20+4]=  72;
+           daa[18*20+5]=  87;
+           daa[18*20+6]=  8;
+           daa[18*20+7]=  9;
+           daa[18*20+8]=  191;
+           daa[18*20+9]=  12;
+           daa[18*20+10]= 20;
+           daa[18*20+11]= 117;
+           daa[18*20+12]= 71;
+           daa[18*20+13]= 792;
+           daa[18*20+14]= 18;
+           daa[18*20+15]= 30;
+           daa[18*20+16]= 46;
+           daa[18*20+17]= 38;
+           daa[19*20+0]=  340;
+           daa[19*20+1]=  0.2;
+           daa[19*20+2]=  23;
+           daa[19*20+3]=  0.2;
+           daa[19*20+4]=  350;
+           daa[19*20+5]=  0.2;
+           daa[19*20+6]=  14;
+           daa[19*20+7]=  3;
+           daa[19*20+8]=  0.2;
+           daa[19*20+9]=  1855;
+           daa[19*20+10]= 85;
+           daa[19*20+11]= 26;
+           daa[19*20+12]= 281;
+           daa[19*20+13]= 52;
+           daa[19*20+14]= 32;
+           daa[19*20+15]= 61;
+           daa[19*20+16]= 544;
+           daa[19*20+17]= 0.2;
+           daa[19*20+18]= 2;
+           
+           f[0]=  0.054116;
+           f[1]=  0.018227;
+           f[2]=  0.039903;
+           f[3]=  0.020160;
+           f[4]=  0.009709;
+           f[5]=  0.018781;
+           f[6]=  0.024289;
+           f[7]=  0.068183;
+           f[8]=  0.024518;
+           f[9]=  0.092638;
+           f[10]= 0.148658;
+           f[11]= 0.021718;
+           f[12]= 0.061453;
+           f[13]= 0.088668;
+           f[14]= 0.041826;
+           f[15]= 0.091030;
+           f[16]= 0.049194;
+           f[17]= 0.029786;
+           f[18]= 0.039443;
+           f[19]= 0.057700;
+	  }
+	else if (prot_model == "MTZOA") 
+	  {
+           daa[1*20+0]=   3.3;
+           daa[2*20+0]=   1.7;
+           daa[2*20+1]=   33.6;
+           daa[3*20+0]=   16.1;
+           daa[3*20+1]=   3.2;
+           daa[3*20+2]=   617.0;
+           daa[4*20+0]=   272.5;
+           daa[4*20+1]=   61.1;
+           daa[4*20+2]=   94.6;
+           daa[4*20+3]=   9.5;
+           daa[5*20+0]=   7.3;
+           daa[5*20+1]=   231.0;
+           daa[5*20+2]=   190.3;
+           daa[5*20+3]=   19.3;
+           daa[5*20+4]=   49.1;
+           daa[6*20+0]=   17.1;
+           daa[6*20+1]=   6.4;
+           daa[6*20+2]=   174.0;
+           daa[6*20+3]=   883.6;
+           daa[6*20+4]=   3.4;
+           daa[6*20+5]=   349.4;
+           daa[7*20+0]=   289.3;
+           daa[7*20+1]=   7.2;
+           daa[7*20+2]=   99.3;
+           daa[7*20+3]=   26.0;
+           daa[7*20+4]=   82.4;
+           daa[7*20+5]=   8.9;
+           daa[7*20+6]=   43.1;
+           daa[8*20+0]=   2.3;
+           daa[8*20+1]=   61.7;
+           daa[8*20+2]=   228.9;
+           daa[8*20+3]=   55.6;
+           daa[8*20+4]=   37.5;
+           daa[8*20+5]=   421.8;
+           daa[8*20+6]=   14.9;
+           daa[8*20+7]=   7.4;
+           daa[9*20+0]=   33.2;
+           daa[9*20+1]=   0.2;
+           daa[9*20+2]=   24.3;
+           daa[9*20+3]=   1.5;
+           daa[9*20+4]=   48.8;
+           daa[9*20+5]=   0.2;
+           daa[9*20+6]=   7.3;
+           daa[9*20+7]=   3.4;
+           daa[9*20+8]=   1.6;
+           daa[10*20+0]=  15.6;
+           daa[10*20+1]=  4.1;
+           daa[10*20+2]=  7.9;
+           daa[10*20+3]=  0.5;
+           daa[10*20+4]=  59.7;
+           daa[10*20+5]=  23.0;
+           daa[10*20+6]=  1.0;
+           daa[10*20+7]=  3.5;
+           daa[10*20+8]=  6.6;
+           daa[10*20+9]=  425.2;
+           daa[11*20+0]=  0.2;
+           daa[11*20+1]=  292.3;
+           daa[11*20+2]=  413.4;
+           daa[11*20+3]=  0.2;
+           daa[11*20+4]=  0.2;
+           daa[11*20+5]=  334.0;
+           daa[11*20+6]=  163.2;
+           daa[11*20+7]=  10.1;
+           daa[11*20+8]=  23.9;
+           daa[11*20+9]=  8.4;
+           daa[11*20+10]= 6.7;
+           daa[12*20+0]=  136.5;
+           daa[12*20+1]=  3.8;
+           daa[12*20+2]=  73.7;
+           daa[12*20+3]=  0.2;
+           daa[12*20+4]=  264.8;
+           daa[12*20+5]=  83.9;
+           daa[12*20+6]=  0.2;
+           daa[12*20+7]=  52.2;
+           daa[12*20+8]=  7.1;
+           daa[12*20+9]=  449.7;
+           daa[12*20+10]= 636.3;
+           daa[12*20+11]= 83.0;
+           daa[13*20+0]=  26.5;
+           daa[13*20+1]=  0.2;
+           daa[13*20+2]=  12.9;
+           daa[13*20+3]=  2.0;
+           daa[13*20+4]=  167.8;
+           daa[13*20+5]=  9.5;
+           daa[13*20+6]=  0.2;
+           daa[13*20+7]=  5.8;
+           daa[13*20+8]=  13.1;
+           daa[13*20+9]=  90.3;
+           daa[13*20+10]= 234.2;
+           daa[13*20+11]= 16.3;
+           daa[13*20+12]= 215.6;
+           daa[14*20+0]=  61.8;
+           daa[14*20+1]=  7.5;
+           daa[14*20+2]=  22.6;
+           daa[14*20+3]=  0.2;
+           daa[14*20+4]=  8.1;
+           daa[14*20+5]=  52.2;
+           daa[14*20+6]=  20.6;
+           daa[14*20+7]=  1.3;
+           daa[14*20+8]=  15.6;
+           daa[14*20+9]=  2.6;
+           daa[14*20+10]= 11.4;
+           daa[14*20+11]= 24.3;
+           daa[14*20+12]= 5.4;
+           daa[14*20+13]= 10.5;
+           daa[15*20+0]=  644.9;
+           daa[15*20+1]=  11.8;
+           daa[15*20+2]=  420.2;
+           daa[15*20+3]=  51.4;
+           daa[15*20+4]=  656.3;
+           daa[15*20+5]=  96.4;
+           daa[15*20+6]=  38.4;
+           daa[15*20+7]=  257.1;
+           daa[15*20+8]=  23.1;
+           daa[15*20+9]=  7.2;
+           daa[15*20+10]= 15.2;
+           daa[15*20+11]= 144.9;
+           daa[15*20+12]= 95.3;
+           daa[15*20+13]= 32.2;
+           daa[15*20+14]= 79.7;
+           daa[16*20+0]=  378.1;
+           daa[16*20+1]=  3.2;
+           daa[16*20+2]=  184.6;
+           daa[16*20+3]=  2.3;
+           daa[16*20+4]=  199.0;
+           daa[16*20+5]=  39.4;
+           daa[16*20+6]=  34.5;
+           daa[16*20+7]=  5.2;
+           daa[16*20+8]=  19.4;
+           daa[16*20+9]=  222.3;
+           daa[16*20+10]= 50.0;
+           daa[16*20+11]= 75.5;
+           daa[16*20+12]= 305.1;
+           daa[16*20+13]= 19.3;
+           daa[16*20+14]= 56.9;
+           daa[16*20+15]= 666.3;
+           daa[17*20+0]=  3.1;
+           daa[17*20+1]=  16.9;
+           daa[17*20+2]=  6.4;
+           daa[17*20+3]=  0.2;
+           daa[17*20+4]=  36.1;
+           daa[17*20+5]=  6.1;
+           daa[17*20+6]=  3.5;
+           daa[17*20+7]=  12.3;
+           daa[17*20+8]=  4.5;
+           daa[17*20+9]=  9.7;
+           daa[17*20+10]= 27.2;
+           daa[17*20+11]= 6.6;
+           daa[17*20+12]= 48.7;
+           daa[17*20+13]= 58.2;
+           daa[17*20+14]= 1.3;
+           daa[17*20+15]= 10.3;
+           daa[17*20+16]= 3.6;
+           daa[18*20+0]=  2.1;
+           daa[18*20+1]=  13.8;
+           daa[18*20+2]=  141.6;
+           daa[18*20+3]=  13.9;
+           daa[18*20+4]=  76.7;
+           daa[18*20+5]=  52.3;
+           daa[18*20+6]=  10.0;
+           daa[18*20+7]=  4.3;
+           daa[18*20+8]=  266.5;
+           daa[18*20+9]=  13.1;
+           daa[18*20+10]= 5.7;
+           daa[18*20+11]= 45.0;
+           daa[18*20+12]= 41.4;
+           daa[18*20+13]= 590.5;
+           daa[18*20+14]= 4.2;
+           daa[18*20+15]= 29.7;
+           daa[18*20+16]= 29.0;
+           daa[18*20+17]= 79.8;
+           daa[19*20+0]=  321.9;
+           daa[19*20+1]=  5.1;
+           daa[19*20+2]=  7.1;
+           daa[19*20+3]=  3.7;
+           daa[19*20+4]=  243.8;
+           daa[19*20+5]=  9.0;
+           daa[19*20+6]=  16.3;
+           daa[19*20+7]=  23.7;
+           daa[19*20+8]=  0.3;
+           daa[19*20+9]=  1710.6;
+           daa[19*20+10]= 126.1;
+           daa[19*20+11]= 11.1;
+           daa[19*20+12]= 279.6;
+           daa[19*20+13]= 59.6;
+           daa[19*20+14]= 17.9;
+           daa[19*20+15]= 49.5;
+           daa[19*20+16]= 396.4;
+           daa[19*20+17]= 13.7;
+           daa[19*20+18]= 15.6;
+           
+           f[0]=  0.069;
+           f[1]=  0.021;
+           f[2]=  0.030;
+           f[3]=  0.020;
+           f[4]=  0.010;
+           f[5]=  0.019;
+           f[6]=  0.025;
+           f[7]=  0.072;
+           f[8]=  0.027;
+           f[9]=  0.085;
+           f[10]= 0.157;
+           f[11]= 0.019;
+           f[12]= 0.051;
+           f[13]= 0.082;
+           f[14]= 0.045;
+           f[15]= 0.081;
+           f[16]= 0.056;
+           f[17]= 0.028;
+           f[18]= 0.037;
+           f[19]= 0.066;
+	  }
+	else if (prot_model == "PMB") 
+	  {
+           daa[1*20+0]=   0.674995699;
+           daa[2*20+0]=   0.589645178;
+           daa[2*20+1]=   1.189067034;
+           daa[3*20+0]=   0.462499504;
+           daa[3*20+1]=   0.605460903;
+           daa[3*20+2]=   3.573373315;
+           daa[4*20+0]=   1.065445546;
+           daa[4*20+1]=   0.31444833;
+           daa[4*20+2]=   0.589852457;
+           daa[4*20+3]=   0.246951424;
+           daa[5*20+0]=   1.111766964;
+           daa[5*20+1]=   2.967840934;
+           daa[5*20+2]=   2.299755865;
+           daa[5*20+3]=   1.686058219;
+           daa[5*20+4]=   0.245163782;
+           daa[6*20+0]=   1.046334652;
+           daa[6*20+1]=   1.201770702;
+           daa[6*20+2]=   1.277836748;
+           daa[6*20+3]=   4.399995525;
+           daa[6*20+4]=   0.091071867;
+           daa[6*20+5]=   4.15967899;
+           daa[7*20+0]=   1.587964372;
+           daa[7*20+1]=   0.523770553;
+           daa[7*20+2]=   1.374854049;
+           daa[7*20+3]=   0.734992057;
+           daa[7*20+4]=   0.31706632;
+           daa[7*20+5]=   0.596789898;
+           daa[7*20+6]=   0.463812837;
+           daa[8*20+0]=   0.580830874;
+           daa[8*20+1]=   1.457127446;
+           daa[8*20+2]=   2.283037894;
+           daa[8*20+3]=   0.839348444;
+           daa[8*20+4]=   0.411543728;
+           daa[8*20+5]=   1.812173605;
+           daa[8*20+6]=   0.877842609;
+           daa[8*20+7]=   0.476331437;
+           daa[9*20+0]=   0.464590585;
+           daa[9*20+1]=   0.35964586;
+           daa[9*20+2]=   0.426069419;
+           daa[9*20+3]=   0.266775558;
+           daa[9*20+4]=   0.417547309;
+           daa[9*20+5]=   0.315256838;
+           daa[9*20+6]=   0.30421529;
+           daa[9*20+7]=   0.180198883;
+           daa[9*20+8]=   0.285186418;
+           daa[10*20+0]=  0.804404505;
+           daa[10*20+1]=  0.520701585;
+           daa[10*20+2]=  0.41009447;
+           daa[10*20+3]=  0.269124919;
+           daa[10*20+4]=  0.450795211;
+           daa[10*20+5]=  0.625792937;
+           daa[10*20+6]=  0.32078471;
+           daa[10*20+7]=  0.259854426;
+           daa[10*20+8]=  0.363981358;
+           daa[10*20+9]=  4.162454693;
+           daa[11*20+0]=  0.831998835;
+           daa[11*20+1]=  4.956476453;
+           daa[11*20+2]=  2.037575629;
+           daa[11*20+3]=  1.114178954;
+           daa[11*20+4]=  0.274163536;
+           daa[11*20+5]=  3.521346591;
+           daa[11*20+6]=  2.415974716;
+           daa[11*20+7]=  0.581001076;
+           daa[11*20+8]=  0.985885486;
+           daa[11*20+9]=  0.374784947;
+           daa[11*20+10]= 0.498011337;
+           daa[12*20+0]=  1.546725076;
+           daa[12*20+1]=  0.81346254;
+           daa[12*20+2]=  0.737846301;
+           daa[12*20+3]=  0.341932741;
+           daa[12*20+4]=  0.618614612;
+           daa[12*20+5]=  2.067388546;
+           daa[12*20+6]=  0.531773639;
+           daa[12*20+7]=  0.465349326;
+           daa[12*20+8]=  0.380925433;
+           daa[12*20+9]=  3.65807012;
+           daa[12*20+10]= 5.002338375;
+           daa[12*20+11]= 0.661095832;
+           daa[13*20+0]=  0.546169219;
+           daa[13*20+1]=  0.303437244;
+           daa[13*20+2]=  0.425193716;
+           daa[13*20+3]=  0.219005213;
+           daa[13*20+4]=  0.669206193;
+           daa[13*20+5]=  0.406042546;
+           daa[13*20+6]=  0.224154698;
+           daa[13*20+7]=  0.35402891;
+           daa[13*20+8]=  0.576231691;
+           daa[13*20+9]=  1.495264661;
+           daa[13*20+10]= 2.392638293;
+           daa[13*20+11]= 0.269496317;
+           daa[13*20+12]= 2.306919847;
+           daa[14*20+0]=  1.241586045;
+           daa[14*20+1]=  0.65577338;
+           daa[14*20+2]=  0.711495595;
+           daa[14*20+3]=  0.775624818;
+           daa[14*20+4]=  0.198679914;
+           daa[14*20+5]=  0.850116543;
+           daa[14*20+6]=  0.794584081;
+           daa[14*20+7]=  0.588254139;
+           daa[14*20+8]=  0.456058589;
+           daa[14*20+9]=  0.366232942;
+           daa[14*20+10]= 0.430073179;
+           daa[14*20+11]= 1.036079005;
+           daa[14*20+12]= 0.337502282;
+           daa[14*20+13]= 0.481144863;
+           daa[15*20+0]=  3.452308792;
+           daa[15*20+1]=  0.910144334;
+           daa[15*20+2]=  2.572577221;
+           daa[15*20+3]=  1.440896785;
+           daa[15*20+4]=  0.99870098;
+           daa[15*20+5]=  1.348272505;
+           daa[15*20+6]=  1.205509425;
+           daa[15*20+7]=  1.402122097;
+           daa[15*20+8]=  0.799966711;
+           daa[15*20+9]=  0.530641901;
+           daa[15*20+10]= 0.402471997;
+           daa[15*20+11]= 1.234648153;
+           daa[15*20+12]= 0.945453716;
+           daa[15*20+13]= 0.613230817;
+           daa[15*20+14]= 1.217683028;
+           daa[16*20+0]=  1.751412803;
+           daa[16*20+1]=  0.89517149;
+           daa[16*20+2]=  1.823161023;
+           daa[16*20+3]=  0.994227284;
+           daa[16*20+4]=  0.847312432;
+           daa[16*20+5]=  1.320626678;
+           daa[16*20+6]=  0.949599791;
+           daa[16*20+7]=  0.542185658;
+           daa[16*20+8]=  0.83039281;
+           daa[16*20+9]=  1.114132523;
+           daa[16*20+10]= 0.779827336;
+           daa[16*20+11]= 1.290709079;
+           daa[16*20+12]= 1.551488041;
+           daa[16*20+13]= 0.718895136;
+           daa[16*20+14]= 0.780913179;
+           daa[16*20+15]= 4.448982584;
+           daa[17*20+0]=  0.35011051;
+           daa[17*20+1]=  0.618778365;
+           daa[17*20+2]=  0.422407388;
+           daa[17*20+3]=  0.362495245;
+           daa[17*20+4]=  0.445669347;
+           daa[17*20+5]=  0.72038474;
+           daa[17*20+6]=  0.261258229;
+           daa[17*20+7]=  0.37874827;
+           daa[17*20+8]=  0.72436751;
+           daa[17*20+9]=  0.516260502;
+           daa[17*20+10]= 0.794797115;
+           daa[17*20+11]= 0.43340962;
+           daa[17*20+12]= 0.768395107;
+           daa[17*20+13]= 3.29519344;
+           daa[17*20+14]= 0.499869138;
+           daa[17*20+15]= 0.496334956;
+           daa[17*20+16]= 0.38372361;
+           daa[18*20+0]=  0.573154753;
+           daa[18*20+1]=  0.628599063;
+           daa[18*20+2]=  0.720013799;
+           daa[18*20+3]=  0.436220437;
+           daa[18*20+4]=  0.55626163;
+           daa[18*20+5]=  0.728970584;
+           daa[18*20+6]=  0.50720003;
+           daa[18*20+7]=  0.284727562;
+           daa[18*20+8]=  2.210952064;
+           daa[18*20+9]=  0.570562395;
+           daa[18*20+10]= 0.811019594;
+           daa[18*20+11]= 0.664884513;
+           daa[18*20+12]= 0.93253606;
+           daa[18*20+13]= 5.894735673;
+           daa[18*20+14]= 0.433748126;
+           daa[18*20+15]= 0.593795813;
+           daa[18*20+16]= 0.523549536;
+           daa[18*20+17]= 2.996248013;
+           daa[19*20+0]=  2.063050067;
+           daa[19*20+1]=  0.388680158;
+           daa[19*20+2]=  0.474418852;
+           daa[19*20+3]=  0.275658381;
+           daa[19*20+4]=  0.998911631;
+           daa[19*20+5]=  0.634408285;
+           daa[19*20+6]=  0.527640634;
+           daa[19*20+7]=  0.314700907;
+           daa[19*20+8]=  0.305792277;
+           daa[19*20+9]=  8.002789424;
+           daa[19*20+10]= 2.113077156;
+           daa[19*20+11]= 0.526184203;
+           daa[19*20+12]= 1.737356217;
+           daa[19*20+13]= 0.983844803;
+           daa[19*20+14]= 0.551333603;
+           daa[19*20+15]= 0.507506011;
+           daa[19*20+16]= 1.89965079;
+           daa[19*20+17]= 0.429570747;
+           daa[19*20+18]= 0.716795463;
+           
+           f[0]=  0.076;
+           f[1]=  0.054;
+           f[2]=  0.038;
+           f[3]=  0.045;
+           f[4]=  0.028;
+           f[5]=  0.034;
+           f[6]=  0.053;
+           f[7]=  0.078;
+           f[8]=  0.030;
+           f[9]=  0.060;
+           f[10]= 0.096;
+           f[11]= 0.052;
+           f[12]= 0.022;
+           f[13]= 0.045;
+           f[14]= 0.042;
+           f[15]= 0.068;
+           f[16]= 0.056;
+           f[17]= 0.016;
+           f[18]= 0.036;
+           f[19]= 0.071;
+	  }
+	else if (prot_model == "HIVB") 
+	  {
+           daa[1*20+0]=   0.30750700;
+           daa[2*20+0]=   0.00500000;
+           daa[2*20+1]=   0.29554300;
+           daa[3*20+0]=   1.45504000;
+           daa[3*20+1]=   0.00500000;
+           daa[3*20+2]=   17.66120000;
+           daa[4*20+0]=   0.12375800;
+           daa[4*20+1]=   0.35172100;
+           daa[4*20+2]=   0.08606420;
+           daa[4*20+3]=   0.00500000;
+           daa[5*20+0]=   0.05511280;
+           daa[5*20+1]=   3.42150000;
+           daa[5*20+2]=   0.67205200;
+           daa[5*20+3]=   0.00500000;
+           daa[5*20+4]=   0.00500000;
+           daa[6*20+0]=   1.48135000;
+           daa[6*20+1]=   0.07492180;
+           daa[6*20+2]=   0.07926330;
+           daa[6*20+3]=   10.58720000;
+           daa[6*20+4]=   0.00500000;
+           daa[6*20+5]=   2.56020000;
+           daa[7*20+0]=   2.13536000;
+           daa[7*20+1]=   3.65345000;
+           daa[7*20+2]=   0.32340100;
+           daa[7*20+3]=   2.83806000;
+           daa[7*20+4]=   0.89787100;
+           daa[7*20+5]=   0.06191370;
+           daa[7*20+6]=   3.92775000;
+           daa[8*20+0]=   0.08476130;
+           daa[8*20+1]=   9.04044000;
+           daa[8*20+2]=   7.64585000;
+           daa[8*20+3]=   1.91690000;
+           daa[8*20+4]=   0.24007300;
+           daa[8*20+5]=   7.05545000;
+           daa[8*20+6]=   0.11974000;
+           daa[8*20+7]=   0.00500000;
+           daa[9*20+0]=   0.00500000;
+           daa[9*20+1]=   0.67728900;
+           daa[9*20+2]=   0.68056500;
+           daa[9*20+3]=   0.01767920;
+           daa[9*20+4]=   0.00500000;
+           daa[9*20+5]=   0.00500000;
+           daa[9*20+6]=   0.00609079;
+           daa[9*20+7]=   0.00500000;
+           daa[9*20+8]=   0.10311100;
+           daa[10*20+0]=  0.21525600;
+           daa[10*20+1]=  0.70142700;
+           daa[10*20+2]=  0.00500000;
+           daa[10*20+3]=  0.00876048;
+           daa[10*20+4]=  0.12977700;
+           daa[10*20+5]=  1.49456000;
+           daa[10*20+6]=  0.00500000;
+           daa[10*20+7]=  0.00500000;
+           daa[10*20+8]=  1.74171000;
+           daa[10*20+9]=  5.95879000;
+           daa[11*20+0]=  0.00500000;
+           daa[11*20+1]=  20.45000000;
+           daa[11*20+2]=  7.90443000;
+           daa[11*20+3]=  0.00500000;
+           daa[11*20+4]=  0.00500000;
+           daa[11*20+5]=  6.54737000;
+           daa[11*20+6]=  4.61482000;
+           daa[11*20+7]=  0.52170500;
+           daa[11*20+8]=  0.00500000;
+           daa[11*20+9]=  0.32231900;
+           daa[11*20+10]= 0.08149950;
+           daa[12*20+0]=  0.01866430;
+           daa[12*20+1]=  2.51394000;
+           daa[12*20+2]=  0.00500000;
+           daa[12*20+3]=  0.00500000;
+           daa[12*20+4]=  0.00500000;
+           daa[12*20+5]=  0.30367600;
+           daa[12*20+6]=  0.17578900;
+           daa[12*20+7]=  0.00500000;
+           daa[12*20+8]=  0.00500000;
+           daa[12*20+9]=  11.20650000;
+           daa[12*20+10]= 5.31961000;
+           daa[12*20+11]= 1.28246000;
+           daa[13*20+0]=  0.01412690;
+           daa[13*20+1]=  0.00500000;
+           daa[13*20+2]=  0.00500000;
+           daa[13*20+3]=  0.00500000;
+           daa[13*20+4]=  9.29815000;
+           daa[13*20+5]=  0.00500000;
+           daa[13*20+6]=  0.00500000;
+           daa[13*20+7]=  0.29156100;
+           daa[13*20+8]=  0.14555800;
+           daa[13*20+9]=  3.39836000;
+           daa[13*20+10]= 8.52484000;
+           daa[13*20+11]= 0.03426580;
+           daa[13*20+12]= 0.18802500;
+           daa[14*20+0]=  2.12217000;
+           daa[14*20+1]=  1.28355000;
+           daa[14*20+2]=  0.00739578;
+           daa[14*20+3]=  0.03426580;
+           daa[14*20+4]=  0.00500000;
+           daa[14*20+5]=  4.47211000;
+           daa[14*20+6]=  0.01202260;
+           daa[14*20+7]=  0.00500000;
+           daa[14*20+8]=  2.45318000;
+           daa[14*20+9]=  0.04105930;
+           daa[14*20+10]= 2.07757000;
+           daa[14*20+11]= 0.03138620;
+           daa[14*20+12]= 0.00500000;
+           daa[14*20+13]= 0.00500000;
+           daa[15*20+0]=  2.46633000;
+           daa[15*20+1]=  3.47910000;
+           daa[15*20+2]=  13.14470000;
+           daa[15*20+3]=  0.52823000;
+           daa[15*20+4]=  4.69314000;
+           daa[15*20+5]=  0.11631100;
+           daa[15*20+6]=  0.00500000;
+           daa[15*20+7]=  4.38041000;
+           daa[15*20+8]=  0.38274700;
+           daa[15*20+9]=  1.21803000;
+           daa[15*20+10]= 0.92765600;
+           daa[15*20+11]= 0.50411100;
+           daa[15*20+12]= 0.00500000;
+           daa[15*20+13]= 0.95647200;
+           daa[15*20+14]= 5.37762000;
+           daa[16*20+0]=  15.91830000;
+           daa[16*20+1]=  2.86868000;
+           daa[16*20+2]=  6.88667000;
+           daa[16*20+3]=  0.27472400;
+           daa[16*20+4]=  0.73996900;
+           daa[16*20+5]=  0.24358900;
+           daa[16*20+6]=  0.28977400;
+           daa[16*20+7]=  0.36961500;
+           daa[16*20+8]=  0.71159400;
+           daa[16*20+9]=  8.61217000;
+           daa[16*20+10]= 0.04376730;
+           daa[16*20+11]= 4.67142000;
+           daa[16*20+12]= 4.94026000;
+           daa[16*20+13]= 0.01412690;
+           daa[16*20+14]= 2.01417000;
+           daa[16*20+15]= 8.93107000;
+           daa[17*20+0]=  0.00500000;
+           daa[17*20+1]=  0.99133800;
+           daa[17*20+2]=  0.00500000;
+           daa[17*20+3]=  0.00500000;
+           daa[17*20+4]=  2.63277000;
+           daa[17*20+5]=  0.02665600;
+           daa[17*20+6]=  0.00500000;
+           daa[17*20+7]=  1.21674000;
+           daa[17*20+8]=  0.06951790;
+           daa[17*20+9]=  0.00500000;
+           daa[17*20+10]= 0.74884300;
+           daa[17*20+11]= 0.00500000;
+           daa[17*20+12]= 0.08907800;
+           daa[17*20+13]= 0.82934300;
+           daa[17*20+14]= 0.04445060;
+           daa[17*20+15]= 0.02487280;
+           daa[17*20+16]= 0.00500000;
+           daa[18*20+0]=  0.00500000;
+           daa[18*20+1]=  0.00991826;
+           daa[18*20+2]=  1.76417000;
+           daa[18*20+3]=  0.67465300;
+           daa[18*20+4]=  7.57932000;
+           daa[18*20+5]=  0.11303300;
+           daa[18*20+6]=  0.07926330;
+           daa[18*20+7]=  0.00500000;
+           daa[18*20+8]=  18.69430000;
+           daa[18*20+9]=  0.14816800;
+           daa[18*20+10]= 0.11198600;
+           daa[18*20+11]= 0.00500000;
+           daa[18*20+12]= 0.00500000;
+           daa[18*20+13]= 15.34000000;
+           daa[18*20+14]= 0.03043810;
+           daa[18*20+15]= 0.64802400;
+           daa[18*20+16]= 0.10565200;
+           daa[18*20+17]= 1.28022000;
+           daa[19*20+0]=  7.61428000;
+           daa[19*20+1]=  0.08124540;
+           daa[19*20+2]=  0.02665600;
+           daa[19*20+3]=  1.04793000;
+           daa[19*20+4]=  0.42002700;
+           daa[19*20+5]=  0.02091530;
+           daa[19*20+6]=  1.02847000;
+           daa[19*20+7]=  0.95315500;
+           daa[19*20+8]=  0.00500000;
+           daa[19*20+9]=  17.73890000;
+           daa[19*20+10]= 1.41036000;
+           daa[19*20+11]= 0.26582900;
+           daa[19*20+12]= 6.85320000;
+           daa[19*20+13]= 0.72327400;
+           daa[19*20+14]= 0.00500000;
+           daa[19*20+15]= 0.07492180;
+           daa[19*20+16]= 0.70922600;
+           daa[19*20+17]= 0.00500000;
+           daa[19*20+18]= 0.04105930;
+           /* ROUNDING ERROR:
+           f[0]=  0.060;
+           f[1]=  0.066;
+           f[2]=  0.044;
+           f[3]=  0.042;
+           f[4]=  0.020;
+           f[5]=  0.054;
+           f[6]=  0.071;
+           f[7]=  0.072;
+           f[8]=  0.022;
+           f[9]=  0.070;
+           f[10]= 0.099;
+           f[11]= 0.057;
+           f[12]= 0.020;
+           f[13]= 0.029;
+           f[14]= 0.046;
+           f[15]= 0.051;
+           f[16]= 0.054;
+           f[17]= 0.033;
+           f[18]= 0.028;
+           f[19]= 0.062;
+           */
+           f[0]= 0.060490222;           f[1]= 0.066039665;           f[2]= 0.044127815;           f[3]= 0.042109048;
+           f[4]= 0.020075899;           f[5]= 0.053606488;           f[6]= 0.071567447;           f[7]= 0.072308239;
+           f[8]= 0.022293943;           f[9]= 0.069730629;           f[10]= 0.098851122;          f[11]= 0.056968211;
+           f[12]= 0.019768318;          f[13]= 0.028809447;          f[14]= 0.046025282;          f[15]= 0.05060433;
+           f[16]= 0.053636813;          f[17]= 0.033011601;          f[18]= 0.028350243;          f[19]= 0.061625237;
+
+	  }
+	else if (prot_model == "HIVW") 
+	  {
+           daa[1*20+0]=   0.0744808;
+           daa[2*20+0]=   0.6175090;
+           daa[2*20+1]=   0.1602400;
+           daa[3*20+0]=   4.4352100;
+           daa[3*20+1]=   0.0674539;
+           daa[3*20+2]=   29.4087000;
+           daa[4*20+0]=   0.1676530;
+           daa[4*20+1]=   2.8636400;
+           daa[4*20+2]=   0.0604932;
+           daa[4*20+3]=   0.0050000;
+           daa[5*20+0]=   0.0050000;
+           daa[5*20+1]=   10.6746000;
+           daa[5*20+2]=   0.3420680;
+           daa[5*20+3]=   0.0050000;
+           daa[5*20+4]=   0.0050000;
+           daa[6*20+0]=   5.5632500;
+           daa[6*20+1]=   0.0251632;
+           daa[6*20+2]=   0.2015260;
+           daa[6*20+3]=   12.1233000;
+           daa[6*20+4]=   0.0050000;
+           daa[6*20+5]=   3.2065600;
+           daa[7*20+0]=   1.8685000;
+           daa[7*20+1]=   13.4379000;
+           daa[7*20+2]=   0.0604932;
+           daa[7*20+3]=   10.3969000;
+           daa[7*20+4]=   0.0489798;
+           daa[7*20+5]=   0.0604932;
+           daa[7*20+6]=   14.7801000;
+           daa[8*20+0]=   0.0050000;
+           daa[8*20+1]=   6.8440500;
+           daa[8*20+2]=   8.5987600;
+           daa[8*20+3]=   2.3177900;
+           daa[8*20+4]=   0.0050000;
+           daa[8*20+5]=   18.5465000;
+           daa[8*20+6]=   0.0050000;
+           daa[8*20+7]=   0.0050000;
+           daa[9*20+0]=   0.0050000;
+           daa[9*20+1]=   1.3406900;
+           daa[9*20+2]=   0.9870280;
+           daa[9*20+3]=   0.1451240;
+           daa[9*20+4]=   0.0050000;
+           daa[9*20+5]=   0.0342252;
+           daa[9*20+6]=   0.0390512;
+           daa[9*20+7]=   0.0050000;
+           daa[9*20+8]=   0.0050000;
+           daa[10*20+0]=  0.1602400;
+           daa[10*20+1]=  0.5867570;
+           daa[10*20+2]=  0.0050000;
+           daa[10*20+3]=  0.0050000;
+           daa[10*20+4]=  0.0050000;
+           daa[10*20+5]=  2.8904800;
+           daa[10*20+6]=  0.1298390;
+           daa[10*20+7]=  0.0489798;
+           daa[10*20+8]=  1.7638200;
+           daa[10*20+9]=  9.1024600;
+           daa[11*20+0]=  0.5927840;
+           daa[11*20+1]=  39.8897000;
+           daa[11*20+2]=  10.6655000;
+           daa[11*20+3]=  0.8943130;
+           daa[11*20+4]=  0.0050000;
+           daa[11*20+5]=  13.0705000;
+           daa[11*20+6]=  23.9626000;
+           daa[11*20+7]=  0.2794250;
+           daa[11*20+8]=  0.2240600;
+           daa[11*20+9]=  0.8174810;
+           daa[11*20+10]= 0.0050000;
+           daa[12*20+0]=  0.0050000;
+           daa[12*20+1]=  3.2865200;
+           daa[12*20+2]=  0.2015260;
+           daa[12*20+3]=  0.0050000;
+           daa[12*20+4]=  0.0050000;
+           daa[12*20+5]=  0.0050000;
+           daa[12*20+6]=  0.0050000;
+           daa[12*20+7]=  0.0489798;
+           daa[12*20+8]=  0.0050000;
+           daa[12*20+9]=  17.3064000;
+           daa[12*20+10]= 11.3839000;
+           daa[12*20+11]= 4.0956400;
+           daa[13*20+0]=  0.5979230;
+           daa[13*20+1]=  0.0050000;
+           daa[13*20+2]=  0.0050000;
+           daa[13*20+3]=  0.0050000;
+           daa[13*20+4]=  0.3629590;
+           daa[13*20+5]=  0.0050000;
+           daa[13*20+6]=  0.0050000;
+           daa[13*20+7]=  0.0050000;
+           daa[13*20+8]=  0.0050000;
+           daa[13*20+9]=  1.4828800;
+           daa[13*20+10]= 7.4878100;
+           daa[13*20+11]= 0.0050000;
+           daa[13*20+12]= 0.0050000;
+           daa[14*20+0]=  1.0098100;
+           daa[14*20+1]=  0.4047230;
+           daa[14*20+2]=  0.3448480;
+           daa[14*20+3]=  0.0050000;
+           daa[14*20+4]=  0.0050000;
+           daa[14*20+5]=  3.0450200;
+           daa[14*20+6]=  0.0050000;
+           daa[14*20+7]=  0.0050000;
+           daa[14*20+8]=  13.9444000;
+           daa[14*20+9]=  0.0050000;
+           daa[14*20+10]= 9.8309500;
+           daa[14*20+11]= 0.1119280;
+           daa[14*20+12]= 0.0050000;
+           daa[14*20+13]= 0.0342252;
+           daa[15*20+0]=  8.5942000;
+           daa[15*20+1]=  8.3502400;
+           daa[15*20+2]=  14.5699000;
+           daa[15*20+3]=  0.4278810;
+           daa[15*20+4]=  1.1219500;
+           daa[15*20+5]=  0.1602400;
+           daa[15*20+6]=  0.0050000;
+           daa[15*20+7]=  6.2796600;
+           daa[15*20+8]=  0.7251570;
+           daa[15*20+9]=  0.7400910;
+           daa[15*20+10]= 6.1439600;
+           daa[15*20+11]= 0.0050000;
+           daa[15*20+12]= 0.3925750;
+           daa[15*20+13]= 4.2793900;
+           daa[15*20+14]= 14.2490000;
+           daa[16*20+0]=  24.1422000;
+           daa[16*20+1]=  0.9282030;
+           daa[16*20+2]=  4.5420600;
+           daa[16*20+3]=  0.6303950;
+           daa[16*20+4]=  0.0050000;
+           daa[16*20+5]=  0.2030910;
+           daa[16*20+6]=  0.4587430;
+           daa[16*20+7]=  0.0489798;
+           daa[16*20+8]=  0.9595600;
+           daa[16*20+9]=  9.3634500;
+           daa[16*20+10]= 0.0050000;
+           daa[16*20+11]= 4.0480200;
+           daa[16*20+12]= 7.4131300;
+           daa[16*20+13]= 0.1145120;
+           daa[16*20+14]= 4.3370100;
+           daa[16*20+15]= 6.3407900;
+           daa[17*20+0]=  0.0050000;
+           daa[17*20+1]=  5.9656400;
+           daa[17*20+2]=  0.0050000;
+           daa[17*20+3]=  0.0050000;
+           daa[17*20+4]=  5.4989400;
+           daa[17*20+5]=  0.0443298;
+           daa[17*20+6]=  0.0050000;
+           daa[17*20+7]=  2.8258000;
+           daa[17*20+8]=  0.0050000;
+           daa[17*20+9]=  0.0050000;
+           daa[17*20+10]= 1.3703100;
+           daa[17*20+11]= 0.0050000;
+           daa[17*20+12]= 0.0050000;
+           daa[17*20+13]= 0.0050000;
+           daa[17*20+14]= 0.0050000;
+           daa[17*20+15]= 1.1015600;
+           daa[17*20+16]= 0.0050000;
+           daa[18*20+0]=  0.0050000;
+           daa[18*20+1]=  0.0050000;
+           daa[18*20+2]=  5.0647500;
+           daa[18*20+3]=  2.2815400;
+           daa[18*20+4]=  8.3483500;
+           daa[18*20+5]=  0.0050000;
+           daa[18*20+6]=  0.0050000;
+           daa[18*20+7]=  0.0050000;
+           daa[18*20+8]=  47.4889000;
+           daa[18*20+9]=  0.1145120;
+           daa[18*20+10]= 0.0050000;
+           daa[18*20+11]= 0.0050000;
+           daa[18*20+12]= 0.5791980;
+           daa[18*20+13]= 4.1272800;
+           daa[18*20+14]= 0.0050000;
+           daa[18*20+15]= 0.9331420;
+           daa[18*20+16]= 0.4906080;
+           daa[18*20+17]= 0.0050000;
+           daa[19*20+0]=  24.8094000;
+           daa[19*20+1]=  0.2794250;
+           daa[19*20+2]=  0.0744808;
+           daa[19*20+3]=  2.9178600;
+           daa[19*20+4]=  0.0050000;
+           daa[19*20+5]=  0.0050000;
+           daa[19*20+6]=  2.1995200;
+           daa[19*20+7]=  2.7962200;
+           daa[19*20+8]=  0.8274790;
+           daa[19*20+9]=  24.8231000;
+           daa[19*20+10]= 2.9534400;
+           daa[19*20+11]= 0.1280650;
+           daa[19*20+12]= 14.7683000;
+           daa[19*20+13]= 2.2800000;
+           daa[19*20+14]= 0.0050000;
+           daa[19*20+15]= 0.8626370;
+           daa[19*20+16]= 0.0050000;
+           daa[19*20+17]= 0.0050000;
+           daa[19*20+18]= 1.3548200;
+           /*
+           f[0]=  0.038;
+           f[1]=  0.057;
+           f[2]=  0.089;
+           f[3]=  0.034;
+           f[4]=  0.024;
+           f[5]=  0.044;
+           f[6]=  0.062;
+           f[7]=  0.084;
+           f[8]=  0.016;
+           f[9]=  0.098;
+           f[10]= 0.058;
+           f[11]= 0.064;
+           f[12]= 0.016;
+           f[13]= 0.042;
+           f[14]= 0.046;
+           f[15]= 0.055;
+           f[16]= 0.081;
+           f[17]= 0.020;
+           f[18]= 0.021;
+           f[19]= 0.051;
+           */
+           // NOTE: f[19] was originally 0.0515639, but with that value the frequencies did not sum to 1; 0.0515638 is used instead
+           f[0]= 0.0377494;             f[1]= 0.057321;              f[2]= 0.0891129;             f[3]= 0.0342034;
+           f[4]= 0.0240105;             f[5]= 0.0437824;             f[6]= 0.0618606;             f[7]= 0.0838496;
+           f[8]= 0.0156076;             f[9]= 0.0983641;             f[10]= 0.0577867;            f[11]= 0.0641682;
+           f[12]= 0.0158419;            f[13]= 0.0422741;            f[14]= 0.0458601;            f[15]= 0.0550846;
+           f[16]= 0.0813774;            f[17]= 0.019597;             f[18]= 0.0205847;            f[19]= 0.0515638;
+
+	  }
+	else if (prot_model == "JTTDCMUT") 
+	  {
+           daa[1*20+0]=   0.531678;
+           daa[2*20+0]=   0.557967;
+           daa[2*20+1]=   0.451095;
+           daa[3*20+0]=   0.827445;
+           daa[3*20+1]=   0.154899;
+           daa[3*20+2]=   5.549530;
+           daa[4*20+0]=   0.574478;
+           daa[4*20+1]=   1.019843;
+           daa[4*20+2]=   0.313311;
+           daa[4*20+3]=   0.105625;
+           daa[5*20+0]=   0.556725;
+           daa[5*20+1]=   3.021995;
+           daa[5*20+2]=   0.768834;
+           daa[5*20+3]=   0.521646;
+           daa[5*20+4]=   0.091304;
+           daa[6*20+0]=   1.066681;
+           daa[6*20+1]=   0.318483;
+           daa[6*20+2]=   0.578115;
+           daa[6*20+3]=   7.766557;
+           daa[6*20+4]=   0.053907;
+           daa[6*20+5]=   3.417706;
+           daa[7*20+0]=   1.740159;
+           daa[7*20+1]=   1.359652;
+           daa[7*20+2]=   0.773313;
+           daa[7*20+3]=   1.272434;
+           daa[7*20+4]=   0.546389;
+           daa[7*20+5]=   0.231294;
+           daa[7*20+6]=   1.115632;
+           daa[8*20+0]=   0.219970;
+           daa[8*20+1]=   3.210671;
+           daa[8*20+2]=   4.025778;
+           daa[8*20+3]=   1.032342;
+           daa[8*20+4]=   0.724998;
+           daa[8*20+5]=   5.684080;
+           daa[8*20+6]=   0.243768;
+           daa[8*20+7]=   0.201696;
+           daa[9*20+0]=   0.361684;
+           daa[9*20+1]=   0.239195;
+           daa[9*20+2]=   0.491003;
+           daa[9*20+3]=   0.115968;
+           daa[9*20+4]=   0.150559;
+           daa[9*20+5]=   0.078270;
+           daa[9*20+6]=   0.111773;
+           daa[9*20+7]=   0.053769;
+           daa[9*20+8]=   0.181788;
+           daa[10*20+0]=  0.310007;
+           daa[10*20+1]=  0.372261;
+           daa[10*20+2]=  0.137289;
+           daa[10*20+3]=  0.061486;
+           daa[10*20+4]=  0.164593;
+           daa[10*20+5]=  0.709004;
+           daa[10*20+6]=  0.097485;
+           daa[10*20+7]=  0.069492;
+           daa[10*20+8]=  0.540571;
+           daa[10*20+9]=  2.335139;
+           daa[11*20+0]=  0.369437;
+           daa[11*20+1]=  6.529255;
+           daa[11*20+2]=  2.529517;
+           daa[11*20+3]=  0.282466;
+           daa[11*20+4]=  0.049009;
+           daa[11*20+5]=  2.966732;
+           daa[11*20+6]=  1.731684;
+           daa[11*20+7]=  0.269840;
+           daa[11*20+8]=  0.525096;
+           daa[11*20+9]=  0.202562;
+           daa[11*20+10]= 0.146481;
+           daa[12*20+0]=  0.469395;
+           daa[12*20+1]=  0.431045;
+           daa[12*20+2]=  0.330720;
+           daa[12*20+3]=  0.190001;
+           daa[12*20+4]=  0.409202;
+           daa[12*20+5]=  0.456901;
+           daa[12*20+6]=  0.175084;
+           daa[12*20+7]=  0.130379;
+           daa[12*20+8]=  0.329660;
+           daa[12*20+9]=  4.831666;
+           daa[12*20+10]= 3.856906;
+           daa[12*20+11]= 0.624581;
+           daa[13*20+0]=  0.138293;
+           daa[13*20+1]=  0.065314;
+           daa[13*20+2]=  0.073481;
+           daa[13*20+3]=  0.032522;
+           daa[13*20+4]=  0.678335;
+           daa[13*20+5]=  0.045683;
+           daa[13*20+6]=  0.043829;
+           daa[13*20+7]=  0.050212;
+           daa[13*20+8]=  0.453428;
+           daa[13*20+9]=  0.777090;
+           daa[13*20+10]= 2.500294;
+           daa[13*20+11]= 0.024521;
+           daa[13*20+12]= 0.436181;
+           daa[14*20+0]=  1.959599;
+           daa[14*20+1]=  0.710489;
+           daa[14*20+2]=  0.121804;
+           daa[14*20+3]=  0.127164;
+           daa[14*20+4]=  0.123653;
+           daa[14*20+5]=  1.608126;
+           daa[14*20+6]=  0.191994;
+           daa[14*20+7]=  0.208081;
+           daa[14*20+8]=  1.141961;
+           daa[14*20+9]=  0.098580;
+           daa[14*20+10]= 1.060504;
+           daa[14*20+11]= 0.216345;
+           daa[14*20+12]= 0.164215;
+           daa[14*20+13]= 0.148483;
+           daa[15*20+0]=  3.887095;
+           daa[15*20+1]=  1.001551;
+           daa[15*20+2]=  5.057964;
+           daa[15*20+3]=  0.589268;
+           daa[15*20+4]=  2.155331;
+           daa[15*20+5]=  0.548807;
+           daa[15*20+6]=  0.312449;
+           daa[15*20+7]=  1.874296;
+           daa[15*20+8]=  0.743458;
+           daa[15*20+9]=  0.405119;
+           daa[15*20+10]= 0.592511;
+           daa[15*20+11]= 0.474478;
+           daa[15*20+12]= 0.285564;
+           daa[15*20+13]= 0.943971;
+           daa[15*20+14]= 2.788406;
+           daa[16*20+0]=  4.582565;
+           daa[16*20+1]=  0.650282;
+           daa[16*20+2]=  2.351311;
+           daa[16*20+3]=  0.425159;
+           daa[16*20+4]=  0.469823;
+           daa[16*20+5]=  0.523825;
+           daa[16*20+6]=  0.331584;
+           daa[16*20+7]=  0.316862;
+           daa[16*20+8]=  0.477355;
+           daa[16*20+9]=  2.553806;
+           daa[16*20+10]= 0.272514;
+           daa[16*20+11]= 0.965641;
+           daa[16*20+12]= 2.114728;
+           daa[16*20+13]= 0.138904;
+           daa[16*20+14]= 1.176961;
+           daa[16*20+15]= 4.777647;
+           daa[17*20+0]=  0.084329;
+           daa[17*20+1]=  1.257961;
+           daa[17*20+2]=  0.027700;
+           daa[17*20+3]=  0.057466;
+           daa[17*20+4]=  1.104181;
+           daa[17*20+5]=  0.172206;
+           daa[17*20+6]=  0.114381;
+           daa[17*20+7]=  0.544180;
+           daa[17*20+8]=  0.128193;
+           daa[17*20+9]=  0.134510;
+           daa[17*20+10]= 0.530324;
+           daa[17*20+11]= 0.089134;
+           daa[17*20+12]= 0.201334;
+           daa[17*20+13]= 0.537922;
+           daa[17*20+14]= 0.069965;
+           daa[17*20+15]= 0.310927;
+           daa[17*20+16]= 0.080556;
+           daa[18*20+0]=  0.139492;
+           daa[18*20+1]=  0.235601;
+           daa[18*20+2]=  0.700693;
+           daa[18*20+3]=  0.453952;
+           daa[18*20+4]=  2.114852;
+           daa[18*20+5]=  0.254745;
+           daa[18*20+6]=  0.063452;
+           daa[18*20+7]=  0.052500;
+           daa[18*20+8]=  5.848400;
+           daa[18*20+9]=  0.303445;
+           daa[18*20+10]= 0.241094;
+           daa[18*20+11]= 0.087904;
+           daa[18*20+12]= 0.189870;
+           daa[18*20+13]= 5.484236;
+           daa[18*20+14]= 0.113850;
+           daa[18*20+15]= 0.628608;
+           daa[18*20+16]= 0.201094;
+           daa[18*20+17]= 0.747889;
+           daa[19*20+0]=  2.924161;
+           daa[19*20+1]=  0.171995;
+           daa[19*20+2]=  0.164525;
+           daa[19*20+3]=  0.315261;
+           daa[19*20+4]=  0.621323;
+           daa[19*20+5]=  0.179771;
+           daa[19*20+6]=  0.465271;
+           daa[19*20+7]=  0.470140;
+           daa[19*20+8]=  0.121827;
+           daa[19*20+9]=  9.533943;
+           daa[19*20+10]= 1.761439;
+           daa[19*20+11]= 0.124066;
+           daa[19*20+12]= 3.038533;
+           daa[19*20+13]= 0.593478;
+           daa[19*20+14]= 0.211561;
+           daa[19*20+15]= 0.408532;
+           daa[19*20+16]= 1.143980;
+           daa[19*20+17]= 0.239697;
+           daa[19*20+18]= 0.165473;
+           
+           f[0]=  0.077;
+           f[1]=  0.051;
+           f[2]=  0.043;
+           f[3]=  0.051;
+           f[4]=  0.020;
+           f[5]=  0.041;
+           f[6]=  0.062;
+           f[7]=  0.075;
+           f[8]=  0.023;
+           f[9]=  0.053;
+           f[10]= 0.091;
+           f[11]= 0.059;
+           f[12]= 0.024;
+           f[13]= 0.040;
+           f[14]= 0.051;
+           f[15]= 0.068;
+           f[16]= 0.059;
+           f[17]= 0.014;
+           f[18]= 0.032;
+           f[19]= 0.066;
+	  }
+	else if (prot_model == "FLU") 
+	  {
+	    daa[ 1*20+ 0] 	=	0.138658765	;
+	    daa[ 2*20+ 0] 	=	0.053366579	;
+	    daa[ 2*20+ 1] 	=	0.161000889	;
+	    daa[ 3*20+ 0] 	=	0.584852306	;
+	    daa[ 3*20+ 1] 	=	0.006771843	;
+	    daa[ 3*20+ 2] 	=	7.737392871	;
+	    daa[ 4*20+ 0] 	=	0.026447095	;
+	    daa[ 4*20+ 1] 	=	0.167207008	;
+	    daa[ 4*20+ 2] 	=	1.30E-05	;
+	    daa[ 4*20+ 3] 	=	1.41E-02	;
+	    daa[ 5*20+ 0] 	=	0.353753982	;
+	    daa[ 5*20+ 1] 	=	3.292716942	;
+	    daa[ 5*20+ 2] 	=	0.530642655	;
+	    daa[ 5*20+ 3] 	=	0.145469388	;
+	    daa[ 5*20+ 4] 	=	0.002547334	;
+	    daa[ 6*20+ 0] 	=	1.484234503	;
+	    daa[ 6*20+ 1] 	=	0.124897617	;
+	    daa[ 6*20+ 2] 	=	0.061652192	;
+	    daa[ 6*20+ 3] 	=	5.370511279	;
+	    daa[ 6*20+ 4] 	=	3.91E-11	;
+	    daa[ 6*20+ 5] 	=	1.195629122	;
+	    daa[ 7*20+ 0] 	=	1.132313122	;
+	    daa[ 7*20+ 1] 	=	1.190624465	;
+	    daa[ 7*20+ 2] 	=	0.322524648	;
+	    daa[ 7*20+ 3] 	=	1.934832784	;
+	    daa[ 7*20+ 4] 	=	0.116941459	;
+	    daa[ 7*20+ 5] 	=	0.108051341	;
+	    daa[ 7*20+ 6] 	=	1.593098825	;
+	    daa[ 8*20+ 0] 	=	0.214757862	;
+	    daa[ 8*20+ 1] 	=	1.879569938	;
+	    daa[ 8*20+ 2] 	=	1.387096032	;
+	    daa[ 8*20+ 3] 	=	0.887570549	;
+	    daa[ 8*20+ 4] 	=	2.18E-02	;
+	    daa[ 8*20+ 5] 	=	5.330313412	;
+	    daa[ 8*20+ 6] 	=	0.256491863	;
+	    daa[ 8*20+ 7] 	=	0.058774527	;
+	    daa[ 9*20+ 0] 	=	0.149926734	;
+	    daa[ 9*20+ 1] 	=	0.246117172	;
+	    daa[ 9*20+ 2] 	=	0.218571975	;
+	    daa[ 9*20+ 3] 	=	0.014085917	;
+	    daa[ 9*20+ 4] 	=	0.001112158	;
+	    daa[ 9*20+ 5] 	=	0.02883995	;
+	    daa[ 9*20+ 6] 	=	1.42E-02	;
+	    daa[ 9*20+ 7] 	=	1.63E-05	;
+	    daa[ 9*20+ 8] 	=	0.243190142	;
+	    daa[10*20+ 0] 	=	0.023116952	;
+	    daa[10*20+ 1] 	=	0.296045557	;
+	    daa[10*20+ 2] 	=	8.36E-04	;
+	    daa[10*20+ 3] 	=	0.005730682	;
+	    daa[10*20+ 4] 	=	0.005613627	;
+	    daa[10*20+ 5] 	=	1.020366955	;
+	    daa[10*20+ 6] 	=	0.016499536	;
+	    daa[10*20+ 7] 	=	0.006516229	;
+	    daa[10*20+ 8] 	=	0.321611694	;
+	    daa[10*20+ 9] 	=	3.512072282	;
+	    daa[11*20+ 0] 	=	0.47433361	;
+	    daa[11*20+ 1] 	=	15.30009662	;
+	    daa[11*20+ 2] 	=	2.646847965	;
+	    daa[11*20+ 3] 	=	0.29004298	;
+	    daa[11*20+ 4] 	=	3.83E-06	;
+	    daa[11*20+ 5] 	=	2.559587177	;
+	    daa[11*20+ 6] 	=	3.881488809	;
+	    daa[11*20+ 7] 	=	0.264148929	;
+	    daa[11*20+ 8] 	=	0.347302791	;
+	    daa[11*20+ 9] 	=	0.227707997	;
+	    daa[11*20+10] 	=	0.129223639	;
+	    daa[12*20+ 0] 	=	0.058745423	;
+	    daa[12*20+ 1] 	=	0.890162346	;
+	    daa[12*20+ 2] 	=	0.005251688	;
+	    daa[12*20+ 3] 	=	0.041762964	;
+	    daa[12*20+ 4] 	=	0.11145731	;
+	    daa[12*20+ 5] 	=	0.190259181	;
+	    daa[12*20+ 6] 	=	0.313974351	;
+	    daa[12*20+ 7] 	=	0.001500467	;
+	    daa[12*20+ 8] 	=	0.001273509	;
+	    daa[12*20+ 9] 	=	9.017954203	;
+	    daa[12*20+10] 	=	6.746936485	;
+	    daa[12*20+11] 	=	1.331291619	;
+	    daa[13*20+ 0] 	=	0.080490909	;
+	    daa[13*20+ 1] 	=	1.61E-02	;
+	    daa[13*20+ 2] 	=	8.36E-04	;
+	    daa[13*20+ 3] 	=	1.06E-06	;
+	    daa[13*20+ 4] 	=	0.104053666	;
+	    daa[13*20+ 5] 	=	0.032680657	;
+	    daa[13*20+ 6] 	=	0.001003501	;
+	    daa[13*20+ 7] 	=	0.001236645	;
+	    daa[13*20+ 8] 	=	0.119028506	;
+	    daa[13*20+ 9] 	=	1.463357278	;
+	    daa[13*20+10] 	=	2.986800036	;
+	    daa[13*20+11] 	=	3.20E-01	;
+	    daa[13*20+12] 	=	0.279910509	;
+	    daa[14*20+ 0] 	=	0.659311478	;
+	    daa[14*20+ 1] 	=	0.15402718	;
+	    daa[14*20+ 2] 	=	3.64E-02	;
+	    daa[14*20+ 3] 	=	0.188539456	;
+	    daa[14*20+ 4] 	=	1.59E-13	;
+	    daa[14*20+ 5] 	=	0.712769599	;
+	    daa[14*20+ 6] 	=	0.319558828	;
+	    daa[14*20+ 7] 	=	0.038631761	;
+	    daa[14*20+ 8] 	=	0.924466914	;
+	    daa[14*20+ 9] 	=	0.080543327	;
+	    daa[14*20+10] 	=	0.634308521	;
+	    daa[14*20+11] 	=	0.195750632	;
+	    daa[14*20+12] 	=	5.69E-02	;
+	    daa[14*20+13] 	=	0.00713243	;
+	    daa[15*20+ 0] 	=	3.011344519	;
+	    daa[15*20+ 1] 	=	0.95013841	;
+	    daa[15*20+ 2] 	=	3.881310531	;
+	    daa[15*20+ 3] 	=	0.338372183	;
+	    daa[15*20+ 4] 	=	0.336263345	;
+	    daa[15*20+ 5] 	=	0.487822499	;
+	    daa[15*20+ 6] 	=	0.307140298	;
+	    daa[15*20+ 7] 	=	1.585646577	;
+	    daa[15*20+ 8] 	=	0.58070425	;
+	    daa[15*20+ 9] 	=	0.290381075	;
+	    daa[15*20+10] 	=	0.570766693	;
+	    daa[15*20+11] 	=	0.283807672	;
+	    daa[15*20+12] 	=	0.007026588	;
+	    daa[15*20+13] 	=	0.99668567	;
+	    daa[15*20+14] 	=	2.087385344	;
+	    daa[16*20+ 0] 	=	5.418298175	;
+	    daa[16*20+ 1] 	=	0.183076905	;
+	    daa[16*20+ 2] 	=	2.140332316	;
+	    daa[16*20+ 3] 	=	0.135481233	;
+	    daa[16*20+ 4] 	=	0.011975266	;
+	    daa[16*20+ 5] 	=	0.602340963	;
+	    daa[16*20+ 6] 	=	0.280124895	;
+	    daa[16*20+ 7] 	=	0.01880803	;
+	    daa[16*20+ 8] 	=	0.368713573	;
+	    daa[16*20+ 9] 	=	2.904052286	;
+	    daa[16*20+10] 	=	0.044926357	;
+	    daa[16*20+11] 	=	1.5269642	;
+	    daa[16*20+12] 	=	2.031511321	;
+	    daa[16*20+13] 	=	0.000134906	;
+	    daa[16*20+14] 	=	0.542251094	;
+	    daa[16*20+15] 	=	2.206859934	;
+	    daa[17*20+ 0] 	=	1.96E-01	;
+	    daa[17*20+ 1] 	=	1.369429408	;
+	    daa[17*20+ 2] 	=	5.36E-04	;
+	    daa[17*20+ 3] 	=	1.49E-05	;
+	    daa[17*20+ 4] 	=	0.09410668	;
+	    daa[17*20+ 5] 	=	4.40E-02	;
+	    daa[17*20+ 6] 	=	0.155245492	;
+	    daa[17*20+ 7] 	=	0.196486447	;
+	    daa[17*20+ 8] 	=	2.24E-02	;
+	    daa[17*20+ 9] 	=	0.03213215	;
+	    daa[17*20+10] 	=	0.431277663	;
+	    daa[17*20+11] 	=	4.98E-05	;
+	    daa[17*20+12] 	=	0.070460039	;
+	    daa[17*20+13] 	=	0.814753094	;
+	    daa[17*20+14] 	=	0.000431021	;
+	    daa[17*20+15] 	=	0.099835753	;
+	    daa[17*20+16] 	=	0.207066206	;
+	    daa[18*20+ 0] 	=	0.018289288	;
+	    daa[18*20+ 1] 	=	0.099855497	;
+	    daa[18*20+ 2] 	=	0.373101927	;
+	    daa[18*20+ 3] 	=	0.525398543	;
+	    daa[18*20+ 4] 	=	0.601692431	;
+	    daa[18*20+ 5] 	=	0.072205935	;
+	    daa[18*20+ 6] 	=	0.10409287	;
+	    daa[18*20+ 7] 	=	0.074814997	;
+	    daa[18*20+ 8] 	=	6.448954446	;
+	    daa[18*20+ 9] 	=	0.273934263	;
+	    daa[18*20+10] 	=	0.340058468	;
+	    daa[18*20+11] 	=	0.012416222	;
+	    daa[18*20+12] 	=	0.874272175	;
+	    daa[18*20+13] 	=	5.393924245	;
+	    daa[18*20+14] 	=	1.82E-04	;
+	    daa[18*20+15] 	=	0.39255224	;
+	    daa[18*20+16] 	=	0.12489802	;
+	    daa[18*20+17] 	=	0.42775543	;
+	    daa[19*20+ 0] 	=	3.53200527	;
+	    daa[19*20+ 1] 	=	0.103964386	;
+	    daa[19*20+ 2] 	=	0.010257517	;
+	    daa[19*20+ 3] 	=	0.297123975	;
+	    daa[19*20+ 4] 	=	0.054904564	;
+	    daa[19*20+ 5] 	=	0.406697814	;
+	    daa[19*20+ 6] 	=	0.285047948	;
+	    daa[19*20+ 7] 	=	0.337229619	;
+	    daa[19*20+ 8] 	=	0.098631355	;
+	    daa[19*20+ 9] 	=	14.39405219	;
+	    daa[19*20+10] 	=	0.890598579	;
+	    daa[19*20+11] 	=	0.07312793	;
+	    daa[19*20+12] 	=	4.904842235	;
+	    daa[19*20+13] 	=	0.592587985	;
+	    daa[19*20+14] 	=	0.058971975	;
+	    daa[19*20+15] 	=	0.088256423	;
+	    daa[19*20+16] 	=	0.654109108	;
+	    daa[19*20+17] 	=	0.256900461	;
+	    daa[19*20+18] 	=	0.167581647	;
+	    
+ 
+  
+	    f[0]	=	0.0471	;
+	    f[1]	=	0.0509	;
+	    f[2]	=	0.0742	;
+	    f[3]	=	0.0479	;
+	    f[4]	=	0.0250	;
+	    f[5]	=	0.0333	;
+	    f[6]	=	0.0546	;
+	    f[7]	=	0.0764	;
+	    f[8]	=	0.0200	;
+	    f[9]	=	0.0671	;
+	    f[10]	=	0.0715	;
+	    f[11]	=	0.0568	;
+	    f[12]	=	0.0181	;
+	    f[13]	=	0.0305	;
+	    f[14]	=	0.0507	;
+	    f[15]	=	0.0884	;
+	    f[16]	=	0.0743	;
+	    f[17]	=	0.0185	;
+	    f[18]	=	0.0315	;
+	    f[19]	=	0.0632	;
+	  }
+	else  return false;
+
+  for (i=0; i<20; i++)  
+	daa[i*20+i] = 0.0;
+  for (i=0; i<20; i++)
+    for (j=0; j<i; j++)               
+      daa[j*20+i] = daa[i*20+j];
+
+  max = 0;
+  
+  for(i = 0; i < 19; i++)
+    for(j = i + 1; j < 20; j++)
+      {
+		temp = daa[i * 20 + j];
+		if(temp > max) 
+			max = temp;
+      }
+ 
+  const double AA_SCALE = 10.0;
+  scaler = AA_SCALE / max;
+   
+  /* SCALING HAS BEEN RE-INTRODUCED TO RESOLVE NUMERICAL  PROBLEMS */   
+
+  for(i = 0; i < 20; i++)
+      for(j = 0; j < 20; j++)
+		{  
+		daa[i*20+j] *= scaler;
+		}
+	return true;
+}
+
+string model_WAG =  // one whitespace-separated table: 19 lower-triangular rows (row k holds k values), then a row of 20 values
+"0.551571 "
+"0.509848  0.635346 "
+"0.738998  0.147304  5.429420 "
+"1.027040  0.528191  0.265256  0.0302949 "
+"0.908598  3.035500  1.543640  0.616783  0.0988179 "
+"1.582850  0.439157  0.947198  6.174160  0.021352  5.469470 "
+"1.416720  0.584665  1.125560  0.865584  0.306674  0.330052  0.567717 "
+"0.316954  2.137150  3.956290  0.930676  0.248972  4.294110  0.570025  0.249410 "
+"0.193335  0.186979  0.554236  0.039437  0.170135  0.113917  0.127395  0.0304501 0.138190 "
+"0.397915  0.497671  0.131528  0.0848047 0.384287  0.869489  0.154263  0.0613037 0.499462  3.170970 "
+"0.906265  5.351420  3.012010  0.479855  0.0740339 3.894900  2.584430  0.373558  0.890432  0.323832  0.257555 "
+"0.893496  0.683162  0.198221  0.103754  0.390482  1.545260  0.315124  0.174100  0.404141  4.257460  4.854020  0.934276 "
+"0.210494  0.102711  0.0961621 0.0467304 0.398020  0.0999208 0.0811339 0.049931  0.679371  1.059470  2.115170  0.088836  1.190630 "
+"1.438550  0.679489  0.195081  0.423984  0.109404  0.933372  0.682355  0.243570  0.696198  0.0999288 0.415844  0.556896  0.171329  0.161444 "
+"3.370790  1.224190  3.974230  1.071760  1.407660  1.028870  0.704939  1.341820  0.740169  0.319440  0.344739  0.967130  0.493905  0.545931  1.613280 "
+"2.121110  0.554413  2.030060  0.374866  0.512984  0.857928  0.822765  0.225833  0.473307  1.458160  0.326622  1.386980  1.516120  0.171903  0.795384  4.378020 "
+"0.113133  1.163920  0.0719167 0.129767  0.717070  0.215737  0.156557  0.336983  0.262569  0.212483  0.665309  0.137505  0.515706  1.529640  0.139405  0.523742  0.110864 "
+"0.240735  0.381533  1.086000  0.325711  0.543833  0.227710  0.196303  0.103604  3.873440  0.420170  0.398618  0.133264  0.428437  6.454280  0.216046  0.786993  0.291148  2.485390 "
+"2.006010  0.251849  0.196246  0.152335  1.002140  0.301281  0.588731  0.187247  0.118358  7.821300  1.800340  0.305434  2.058450  0.649892  0.314887  0.232739  1.388230  0.365369  0.314730 "
+// Final row: 20 values (presumably the state frequencies -- TODO confirm against the code that parses this string).
+"0.0866279 0.043972  0.0390894 0.0570451 0.0193078 0.0367281 0.0580589 0.0832518 0.0244313 0.048466  0.086209  0.0620286 0.0195027 0.0384319 0.0457631 0.0695179 0.0610127 0.0143859 0.0352742 0.0708956";
+
+string model_cpREV = /* cpREV table stored as whitespace-separated text.
+ * Layout: 19 lines forming a lower triangle (one more value on each
+ * successive line, 190 numbers), one empty continuation line, then 20
+ * further values spread over two lines.
+ * NOTE(review): presumably pairwise amino-acid exchange rates followed by
+ * amino-acid frequencies; the parser is not visible here - confirm.
+ * The lines carry no blank before the trailing backslash; the numbers stay
+ * separated only because every continuation line begins with a blank. */
+"  105\
+  227  357\
+  175   43 4435\
+  669  823  538   10\
+  157 1745  768  400   10\
+  499  152 1055 3691   10 3122\
+  665  243  653  431  303  133  379\
+   66  715 1405  331  441 1269  162   19\
+  145  136  168   10  280   92  148   40   29\
+  197  203  113   10  396  286   82   20   66 1745\
+  236 4482 2430  412   48 3313 2629  263  305  345  218\
+  185  125   61   47  159  202  113   21   10 1772 1351  193\
+   68   53   97   22  726   10  145   25  127  454 1268   72  327\
+  490   87  173  170  285  323  185   28  152  117  219  302  100   43\
+ 2440  385 2085  590 2331  396  568  691  303  216  516  868   93  487 1202\
+ 1340  314 1393  266  576  241  369   92   32 1040  156  918  645  148  260 2151\
+   14  230   40   18  435   53   63   82   69   42  159   10   86  468   49   73   29\
+   56  323  754  281 1466  391  142   10 1971   89  189  247  215 2370   97  522   71  346\
+  968   92   83   75  592   54  200   91   25 4797  865  249  475  317  122  167  760   10  119\
+\
+ 0.0755 0.0621 0.0410 0.0371 0.0091 0.0382 0.0495 0.0838 0.0246 0.0806\
+ 0.1011 0.0504 0.0220 0.0506 0.0431 0.0622 0.0543 0.0181 0.0307 0.0660";
+
+/*
+ * mtREV table stored as whitespace-separated text.
+ * Layout: 19 lines forming a lower triangle (1, 2, ..., 19 values; 190
+ * numbers), one empty continuation line, then 20 further values on two lines.
+ * NOTE(review): presumably pairwise amino-acid exchange rates followed by
+ * amino-acid frequencies; the parser is not visible here - confirm.
+ *
+ * A backslash-newline inside a string literal is deleted outright, so a line
+ * that ends in a digit and is continued by a line starting with a digit at
+ * column 0 fuses two numbers into one token. The last matrix line and the
+ * first frequency line used to do exactly that (yielding 1.900.072 and
+ * 0.1690.023); both now keep a blank in front of the trailing backslash.
+ */
+string model_mtREV = 
+"  23.18\
+  26.95  13.24\
+  17.67   1.90 794.38\
+  59.93 103.33  58.94   1.90\
+   1.90 220.99 173.56  55.28  75.24\
+   9.77   1.90  63.05 583.55   1.90 313.56\
+ 120.71  23.03  53.30  56.77  30.71   6.75  28.28\
+  13.90 165.23 496.13 113.99 141.49 582.40  49.12   1.90\
+  96.49   1.90  27.10   4.34  62.73   8.34   3.31   5.98  12.26\
+  25.46  15.58  15.16   1.90  25.65  39.70   1.90   2.41  11.49 329.09\
+   8.36 141.40 608.70   2.31   1.90 465.58 313.86  22.73 127.67  19.57  14.88\
+ 141.88   1.90  65.41   1.90   6.18  47.37   1.90   1.90  11.97 517.98 537.53  91.37\
+   6.37   4.69  15.20   4.98  70.80  19.11   2.67   1.90  48.16  84.67 216.06   6.44  90.82\
+  54.31  23.64  73.31  13.43  31.26 137.29  12.83   1.90  60.97  20.63  40.10  50.10  18.84  17.31\
+ 387.86   6.04 494.39  69.02 277.05  54.11  54.71 125.93  77.46  47.70  73.61 105.79 111.16  64.29 169.90\
+ 480.72   2.08 238.46  28.01 179.97  94.93  14.82  11.17  44.78 368.43 126.40 136.33 528.17  33.85 128.22 597.21\
+   1.90  21.95  10.68  19.86  33.60   1.90   1.90  10.92   7.08   1.90  32.44  24.00  21.71   7.84   4.21  38.58   9.99\
+   6.48   1.90 191.36  21.21 254.77  38.82  13.12   3.21 670.14  25.01  44.15  51.17  39.96 465.58  16.21  64.92  38.73  26.25\
+ 195.06   7.64   1.90   1.90   1.90  19.00  21.14   2.53   1.90 1222.94  91.67   1.90 387.54   6.35   8.23   1.90 204.54   5.37   1.90 \
+\
+0.072 0.019 0.039 0.019 0.006 0.025 0.024 0.056 0.028 0.088 0.169 \
+0.023 0.054 0.061 0.054 0.072 0.086 0.029 0.033 0.043";
+
+/*
+string model_Dayhoff = 
+" 27									    \
+ 98  32									    \
+120   0 905								    \
+ 36  23   0   0								    \
+ 89 246 103 134   0							    \
+198   1 148 1153  0 716							    \
+240   9 139 125  11  28  81						    \
+ 23 240 535  86  28 606  43  10						    \
+ 65  64  77  24  44  18  61   0   7					    \
+ 41  15  34   0   0  73  11   7  44 257					    \
+ 26 464 318  71   0 153  83  27  26  46  18				    \
+ 72  90   1   0   0 114  30  17   0 336 527 243				    \
+ 18  14  14   0   0   0   0  15  48 196 157   0  92			    \
+250 103  42  13  19 153  51  34  94  12  32  33  17  11			    \
+409 154 495  95 161  56  79 234  35  24  17  96  62  46 245		    \
+371  26 229  66  16  53  34  30  22 192  33 136 104  13  78 550		    \
+  0 201  23   0   0   0   0   0  27   0  46   0   0  76   0  75   0	    \
+ 24   8  95   0  96   0  22   0 127  37  28  13   0 698   0  34  42  61	    \
+208  24  15  18  49  35  37  54  44 889 175  10 258  12  48  30 157   0  28 \
+\
+0.087127 0.040904 0.040432 0.046872 0.033474 0.038255 0.049530\
+0.088612 0.033618 0.036886 0.085357 0.080482 0.014753 0.039772\
+0.050680 0.069577 0.058542 0.010494 0.029916 0.064718";
+*/
+/*
+ * mtMAM table stored as whitespace-separated text.
+ * Layout: 19 lines forming a lower triangle (1, 2, ..., 19 values; 190
+ * numbers), one empty continuation line, then 20 further values on two lines.
+ * NOTE(review): presumably pairwise amino-acid exchange rates followed by
+ * amino-acid frequencies; the parser is not visible here - confirm.
+ *
+ * A backslash-newline inside a string literal is deleted outright, so a line
+ * that ends in a digit and is continued by a line starting with a digit at
+ * column 0 fuses two numbers into one token. The first frequency line used to
+ * do that (yielding 0.09050.1675); it now keeps a blank in front of the
+ * trailing backslash.
+ */
+string model_mtMAM = 
+" 32                                                                         \
+  2   4                                                                     \
+ 11   0 864                                                                 \
+  0 186   0   0                                                             \
+  0 246   8  49   0                                                         \
+  0   0   0 569   0 274                                                     \
+ 78  18  47  79   0   0  22                                                 \
+  8 232 458  11 305 550  22   0                                             \
+ 75   0  19   0  41   0   0   0   0                                         \
+ 21   6   0   0  27  20   0   0  26 232                                     \
+  0  50 408   0   0 242 215   0   0   6   4                                 \
+ 76   0  21   0   0  22   0   0   0 378 609  59                             \
+  0   0   6   5   7   0   0   0   0  57 246   0  11                         \
+ 53   9  33   2   0  51   0   0  53   5  43  18   0  17                     \
+342   3 446  16 347  30  21 112  20   0  74  65  47  90 202                 \
+681   0 110   0 114   0   4   0   1 360  34  50 691   8  78 614             \
+  5  16   6   0  65   0   0   0   0   0  12   0  13   0   7  17   0         \
+  0   0 156   0 530  54   0   1 1525 16  25  67   0 682   8 107   0  14    \
+398   0   0  10   0  33  20   5   0 2220 100  0 832   6   0   0 237   0   0 \
+\
+0.0692 0.0184 0.0400 0.0186 0.0065 0.0238 0.0236 0.0557 0.0277 0.0905 \
+0.1675 0.0221 0.0561 0.0611 0.0536 0.0725 0.0870 0.0293 0.0340 0.0428";
+
+
+/*
+ * JTT table stored as whitespace-separated text.
+ * Layout: 19 lines forming a lower triangle (1, 2, ..., 19 values; 190
+ * numbers), one empty continuation line, then 20 further values on three
+ * lines.
+ * NOTE(review): presumably pairwise amino-acid exchange rates followed by
+ * amino-acid frequencies; the parser is not visible here - confirm.
+ *
+ * A backslash-newline inside a string literal is deleted outright, so a line
+ * that ends in a digit and is continued by a line starting with a digit at
+ * column 0 fuses two numbers into one token. The first two frequency lines
+ * used to do that (yielding 0.0618300.073152 and 0.0401260.050901); both now
+ * keep a blank in front of the trailing backslash.
+ */
+string model_JTT=
+" 58                                                                        \
+ 54  45                                                                    \
+ 81  16 528                                                                \
+ 56 113  34  10                                                            \
+ 57 310  86  49   9                                                        \
+105  29  58 767   5 323                                                    \
+179 137  81 130  59  26 119                                                \
+ 27 328 391 112  69 597  26  23                                            \
+ 36  22  47  11  17   9  12   6  16                                        \
+ 30  38  12   7  23  72   9   6  56 229                                    \
+ 35 646 263  26   7 292 181  27  45  21  14                                \
+ 54  44  30  15  31  43  18  14  33 479 388  65                            \
+ 15   5  10   4  78   4   5   5  40  89 248   4  43                        \
+194  74  15  15  14 164  18  24 115  10 102  21  16  17                    \
+378 101 503  59 223  53  30 201  73  40  59  47  29  92 285                \
+475  64 232  38  42  51  32  33  46 245  25 103 226  12 118 477            \
+  9 126   8   4 115  18  10  55   8   9  52  10  24  53   6  35  12        \
+ 11  20  70  46 209  24   7   8 573  32  24   8  18 536  10  63  21  71    \
+298  17  16  31  62  20  45  47  11 961 180  14 323  62  23  38 112  25  16 \
+\
+0.076748 0.051691 0.042645 0.051544 0.019803 0.040752 0.061830 \
+0.073152 0.022944 0.053761 0.091904 0.058676 0.023826 0.040126 \
+0.050901 0.068765 0.058565 0.014261 0.032102 0.066005";
+
+string model_LG = /* LG table stored as whitespace-separated text.
+ * Layout: 19 lines forming a lower triangle (one more value on each
+ * successive line, 190 numbers), one empty continuation line, then a final
+ * line of 20 values.
+ * NOTE(review): presumably pairwise amino-acid exchange rates followed by
+ * amino-acid frequencies; the parser is not visible here - confirm.
+ * Every spliced line ends in space-backslash, so neighbouring numbers stay
+ * separated after the line continuation is removed. */
+"0.425093 \
+0.276818 0.751878 \
+0.395144 0.123954 5.076149 \
+2.489084 0.534551 0.528768 0.062556 \
+0.969894 2.807908 1.695752 0.523386 0.084808 \
+1.038545 0.363970 0.541712 5.243870 0.003499 4.128591 \
+2.066040 0.390192 1.437645 0.844926 0.569265 0.267959 0.348847 \
+0.358858 2.426601 4.509238 0.927114 0.640543 4.813505 0.423881 0.311484 \
+0.149830 0.126991 0.191503 0.010690 0.320627 0.072854 0.044265 0.008705 0.108882 \
+0.395337 0.301848 0.068427 0.015076 0.594007 0.582457 0.069673 0.044261 0.366317 4.145067 \
+0.536518 6.326067 2.145078 0.282959 0.013266 3.234294 1.807177 0.296636 0.697264 0.159069 0.137500 \
+1.124035 0.484133 0.371004 0.025548 0.893680 1.672569 0.173735 0.139538 0.442472 4.273607 6.312358 0.656604 \
+0.253701 0.052722 0.089525 0.017416 1.105251 0.035855 0.018811 0.089586 0.682139 1.112727 2.592692 0.023918 1.798853 \
+1.177651 0.332533 0.161787 0.394456 0.075382 0.624294 0.419409 0.196961 0.508851 0.078281 0.249060 0.390322 0.099849 0.094464 \
+4.727182 0.858151 4.008358 1.240275 2.784478 1.223828 0.611973 1.739990 0.990012 0.064105 0.182287 0.748683 0.346960 0.361819 1.338132 \
+2.139501 0.578987 2.000679 0.425860 1.143480 1.080136 0.604545 0.129836 0.584262 1.033739 0.302936 1.136863 2.020366 0.165001 0.571468 6.472279 \
+0.180717 0.593607 0.045376 0.029890 0.670128 0.236199 0.077852 0.268491 0.597054 0.111660 0.619632 0.049906 0.696175 2.457121 0.095131 0.248862 0.140825 \
+0.218959 0.314440 0.612025 0.135107 1.165532 0.257336 0.120037 0.054679 5.306834 0.232523 0.299648 0.131932 0.481306 7.803902 0.089613 0.400547 0.245841 3.151815 \
+2.547870 0.170887 0.083688 0.037967 1.959291 0.210332 0.245034 0.076701 0.119013 10.649107 1.702745 0.185202 1.898718 0.654683 0.296501 0.098369 2.188158 0.189510 0.249313 \
+\
+0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147";
+
+string model_mtART = /* mtART table stored as whitespace-separated text.
+ * Layout: 19 lines forming a lower triangle (one more value on each
+ * successive line, 190 numbers), one empty continuation line, then 20
+ * further values spread over two lines.
+ * NOTE(review): presumably pairwise amino-acid exchange rates followed by
+ * amino-acid frequencies; the parser is not visible here - confirm.
+ * Every spliced line ends in space-backslash, so neighbouring numbers stay
+ * separated after the line continuation is removed. */
+"0.2 \
+0.2 0.2 \
+1 4 500 \
+254 36 98 11 \
+0.2 154 262 0.2 0.2 \
+0.2 0.2 183 862 0.2 262 \
+200 0.2 121 12 81 3 44 \
+0.2 41 180 0.2 12 314 15 0.2 \
+26 2 21 7 63 11 7 3 0.2 \
+4 2 13 1 79 16 2 1 6 515 \
+0.2 209 467 2 0.2 349 106 0.2 0.2 3 4 \
+121 5 79 0.2 312 67 0.2 56 0.2 515 885 106 \
+13 5 20 0.2 184 0.2 0.2 1 14 118 263 11 322 \
+49 0.2 17 0.2 0.2 39 8 0.2 1 0.2 12 17 5 15 \
+673 3 398 44 664 52 31 226 11 7 8 144 112 36 87 \
+244 0.2 166 0.2 183 44 43 0.2 19 204 48 70 289 14 47 660 \
+0.2 0.2 8 0.2 22 7 11 2 0.2 0.2 21 16 71 54 0.2 2 0.2 \
+1 4 251 0.2 72 87 8 9 191 12 20 117 71 792 18 30 46 38 \
+340 0.2 23 0.2 350 0.2 14 3 0.2 1855 85 26 281 52 32 61 544 0.2 2 \
+\
+0.054116 0.018227 0.039903 0.020160 0.009709 0.018781 0.024289 0.068183 0.024518 0.092638 \
+0.148658 0.021718 0.061453 0.088668 0.041826 0.091030 0.049194 0.029786 0.039443 0.057700";
+
+
+string model_mtZOA = /* mtZOA table stored as whitespace-separated text.
+ * Layout: 19 lines forming a lower triangle (one more value on each
+ * successive line, 190 numbers), one empty continuation line, then a final
+ * line of 20 values.
+ * NOTE(review): presumably pairwise amino-acid exchange rates followed by
+ * amino-acid frequencies; the parser is not visible here - confirm.
+ * Most lines carry no blank before the trailing backslash; the numbers stay
+ * separated only because every continuation line begins with blanks. */
+"   3.3\
+   1.7  33.6\
+  16.1   3.2 617.0\
+ 272.5  61.1  94.6   9.5\
+   7.3 231.0 190.3  19.3  49.1\
+  17.1   6.4 174.0 883.6   3.4 349.4\
+ 289.3   7.2  99.3  26.0  82.4   8.9  43.1\
+   2.3  61.7 228.9  55.6  37.5 421.8  14.9   7.4\
+  33.2   0.2  24.3   1.5  48.8   0.2   7.3   3.4   1.6\
+  15.6   4.1   7.9   0.5  59.7  23.0   1.0   3.5   6.6 425.2\
+   0.2 292.3 413.4   0.2   0.2 334.0 163.2  10.1  23.9   8.4   6.7\
+ 136.5   3.8  73.7   0.2 264.8  83.9   0.2  52.2   7.1 449.7 636.3  83.0\
+  26.5   0.2  12.9   2.0 167.8   9.5   0.2   5.8  13.1  90.3 234.2  16.3 215.6\
+  61.8   7.5  22.6   0.2   8.1  52.2  20.6   1.3  15.6   2.6  11.4  24.3   5.4  10.5\
+ 644.9  11.8 420.2  51.4 656.3  96.4  38.4 257.1  23.1   7.2  15.2 144.9  95.3  32.2  79.7\
+ 378.1   3.2 184.6   2.3 199.0  39.4  34.5   5.2  19.4 222.3  50.0  75.5 305.1  19.3  56.9 666.3\
+   3.1  16.9   6.4   0.2  36.1   6.1   3.5  12.3   4.5   9.7  27.2   6.6  48.7  58.2   1.3  10.3   3.6\
+   2.1  13.8 141.6  13.9  76.7  52.3  10.0   4.3 266.5  13.1   5.7  45.0  41.4 590.5   4.2  29.7  29.0  79.8\
+ 321.9   5.1   7.1   3.7 243.8   9.0  16.3  23.7   0.3 1710.6 126.1  11.1 279.6  59.6  17.9  49.5 396.4  13.7  15.6 \
+\
+    0.068880    0.021037    0.030390    0.020696    0.009966    0.018623    0.024989    0.071968    0.026814    0.085072    0.156717    0.019276    0.050652    0.081712    0.044803    0.080535    0.056386    0.027998    0.037404    0.066083";
+
+void get_rtREV(double **q, double *f) {
+	/* rtRev
+	 * Writes hard-coded values into caller-provided storage:
+	 *   q - indexed as q[i][j] with i and j in 0..19; all 400 entries are
+	 *       assigned, the diagonal entries q[i][i] with 0.
+	 *   f - entries f[0]..f[19] are assigned.
+	 * The statements below are grouped by the second index of q (0 first,
+	 * 19 last), five assignments per line.
+	 * NOTE(review): presumably q holds the rtREV amino-acid exchange rates
+	 * and f the matching amino-acid frequencies; the table looks symmetric
+	 * (e.g. q[1][0] and q[0][1] are both 34) - confirm before relying on it.
+	 */
+	q[ 0][ 0] =   0; q[ 1][ 0] =  34; q[ 2][ 0] =  51; q[ 3][ 0] =  10; q[ 4][ 0] = 439;
+	q[ 5][ 0] =  32; q[ 6][ 0] =  81; q[ 7][ 0] = 135; q[ 8][ 0] =  30; q[ 9][ 0] =   1;
+	q[10][ 0] =  45; q[11][ 0] =  38; q[12][ 0] = 235; q[13][ 0] =   1; q[14][ 0] =  97;
+	q[15][ 0] = 460; q[16][ 0] = 258; q[17][ 0] =   5; q[18][ 0] =  55; q[19][ 0] = 197;
+
+	q[ 0][ 1] =  34; q[ 1][ 1] =   0; q[ 2][ 1] =  35; q[ 3][ 1] =  30; q[ 4][ 1] =  92;
+	q[ 5][ 1] = 221; q[ 6][ 1] =  10; q[ 7][ 1] =  41; q[ 8][ 1] =  90; q[ 9][ 1] =  24;
+	q[10][ 1] =  18; q[11][ 1] = 593; q[12][ 1] =  57; q[13][ 1] =   7; q[14][ 1] =  24;
+	q[15][ 1] = 102; q[16][ 1] =  64; q[17][ 1] =  13; q[18][ 1] =  47; q[19][ 1] =  29;
+
+	q[ 0][ 2] =  51; q[ 1][ 2] =  35; q[ 2][ 2] =   0; q[ 3][ 2] = 384; q[ 4][ 2] = 128;
+	q[ 5][ 2] = 236; q[ 6][ 2] =  79; q[ 7][ 2] =  94; q[ 8][ 2] = 320; q[ 9][ 2] =  35;
+	q[10][ 2] =  15; q[11][ 2] = 123; q[12][ 2] =   1; q[13][ 2] =  49; q[14][ 2] =  33;
+	q[15][ 2] = 294; q[16][ 2] = 148; q[17][ 2] =  16; q[18][ 2] =  28; q[19][ 2] =  21;
+
+	q[ 0][ 3] =  10; q[ 1][ 3] =  30; q[ 2][ 3] = 384; q[ 3][ 3] =   0; q[ 4][ 3] =   1;
+	q[ 5][ 3] =  78; q[ 6][ 3] = 542; q[ 7][ 3] =  61; q[ 8][ 3] =  91; q[ 9][ 3] =   1;
+	q[10][ 3] =   5; q[11][ 3] =  20; q[12][ 3] =   1; q[13][ 3] =   1; q[14][ 3] =  55;
+	q[15][ 3] = 136; q[16][ 3] =  55; q[17][ 3] =   1; q[18][ 3] =   1; q[19][ 3] =   6;
+
+	q[ 0][ 4] = 439; q[ 1][ 4] =  92; q[ 2][ 4] = 128; q[ 3][ 4] =   1; q[ 4][ 4] =   0;
+	q[ 5][ 4] =  70; q[ 6][ 4] =   1; q[ 7][ 4] =  48; q[ 8][ 4] = 124; q[ 9][ 4] = 104;
+	q[10][ 4] = 110; q[11][ 4] =  16; q[12][ 4] = 156; q[13][ 4] =  70; q[14][ 4] =   1;
+	q[15][ 4] =  75; q[16][ 4] = 117; q[17][ 4] =  55; q[18][ 4] = 131; q[19][ 4] = 295;
+
+	q[ 0][ 5] =  32; q[ 1][ 5] = 221; q[ 2][ 5] = 236; q[ 3][ 5] =  78; q[ 4][ 5] =  70;
+	q[ 5][ 5] =   0; q[ 6][ 5] = 372; q[ 7][ 5] =  18; q[ 8][ 5] = 387; q[ 9][ 5] =  33;
+	q[10][ 5] =  54; q[11][ 5] = 309; q[12][ 5] = 158; q[13][ 5] =   1; q[14][ 5] =  68;
+	q[15][ 5] = 225; q[16][ 5] = 146; q[17][ 5] =  10; q[18][ 5] =  45; q[19][ 5] =  36;
+
+	q[ 0][ 6] =  81; q[ 1][ 6] =  10; q[ 2][ 6] =  79; q[ 3][ 6] = 542; q[ 4][ 6] =   1;
+	q[ 5][ 6] = 372; q[ 6][ 6] =   0; q[ 7][ 6] =  70; q[ 8][ 6] =  34; q[ 9][ 6] =   1;
+	q[10][ 6] =  21; q[11][ 6] = 141; q[12][ 6] =   1; q[13][ 6] =   1; q[14][ 6] =  52;
+	q[15][ 6] =  95; q[16][ 6] =  82; q[17][ 6] =  17; q[18][ 6] =   1; q[19][ 6] =  35;
+
+	q[ 0][ 7] = 135; q[ 1][ 7] =  41; q[ 2][ 7] =  94; q[ 3][ 7] =  61; q[ 4][ 7] =  48;
+	q[ 5][ 7] =  18; q[ 6][ 7] =  70; q[ 7][ 7] =   0; q[ 8][ 7] =  68; q[ 9][ 7] =   1;
+	q[10][ 7] =   3; q[11][ 7] =  30; q[12][ 7] =  37; q[13][ 7] =   7; q[14][ 7] =  17;
+	q[15][ 7] = 152; q[16][ 7] =   7; q[17][ 7] =  23; q[18][ 7] =  21; q[19][ 7] =   3;
+
+	q[ 0][ 8] =  30; q[ 1][ 8] =  90; q[ 2][ 8] = 320; q[ 3][ 8] =  91; q[ 4][ 8] = 124;
+	q[ 5][ 8] = 387; q[ 6][ 8] =  34; q[ 7][ 8] =  68; q[ 8][ 8] =   0; q[ 9][ 8] =  34;
+	q[10][ 8] =  51; q[11][ 8] =  76; q[12][ 8] = 116; q[13][ 8] = 141; q[14][ 8] =  44;
+	q[15][ 8] = 183; q[16][ 8] =  49; q[17][ 8] =  48; q[18][ 8] = 307; q[19][ 8] =   1;
+
+	q[ 0][ 9] =   1; q[ 1][ 9] =  24; q[ 2][ 9] =  35; q[ 3][ 9] =   1; q[ 4][ 9] = 104;
+	q[ 5][ 9] =  33; q[ 6][ 9] =   1; q[ 7][ 9] =   1; q[ 8][ 9] =  34; q[ 9][ 9] =   0;
+	q[10][ 9] = 385; q[11][ 9] =  34; q[12][ 9] = 375; q[13][ 9] =  64; q[14][ 9] =  10;
+	q[15][ 9] =   4; q[16][ 9] =  72; q[17][ 9] =  39; q[18][ 9] =  26; q[19][ 9] =1048;
+
+	q[ 0][10] =  45; q[ 1][10] =  18; q[ 2][10] =  15; q[ 3][10] =   5; q[ 4][10] = 110;
+	q[ 5][10] =  54; q[ 6][10] =  21; q[ 7][10] =   3; q[ 8][10] =  51; q[ 9][10] = 385;
+	q[10][10] =   0; q[11][10] =  23; q[12][10] = 581; q[13][10] = 179; q[14][10] =  22;
+	q[15][10] =  24; q[16][10] =  25; q[17][10] =  47; q[18][10] =  64; q[19][10] = 112;
+
+	q[ 0][11] =  38; q[ 1][11] = 593; q[ 2][11] = 123; q[ 3][11] =  20; q[ 4][11] =  16;
+	q[ 5][11] = 309; q[ 6][11] = 141; q[ 7][11] =  30; q[ 8][11] =  76; q[ 9][11] =  34;
+	q[10][11] =  23; q[11][11] =   0; q[12][11] = 134; q[13][11] =  14; q[14][11] =  43;
+	q[15][11] =  77; q[16][11] = 110; q[17][11] =   6; q[18][11] =   1; q[19][11] =  19;
+
+	q[ 0][12] = 235; q[ 1][12] =  57; q[ 2][12] =   1; q[ 3][12] =   1; q[ 4][12] = 156;
+	q[ 5][12] = 158; q[ 6][12] =   1; q[ 7][12] =  37; q[ 8][12] = 116; q[ 9][12] = 375;
+	q[10][12] = 581; q[11][12] = 134; q[12][12] =   0; q[13][12] = 247; q[14][12] =   1;
+	q[15][12] =   1; q[16][12] = 131; q[17][12] = 111; q[18][12] =  74; q[19][12] = 236;
+
+	q[ 0][13] =   1; q[ 1][13] =   7; q[ 2][13] =  49; q[ 3][13] =   1; q[ 4][13] =  70;
+	q[ 5][13] =   1; q[ 6][13] =   1; q[ 7][13] =   7; q[ 8][13] = 141; q[ 9][13] =  64;
+	q[10][13] = 179; q[11][13] =  14; q[12][13] = 247; q[13][13] =   0; q[14][13] =  11;
+	q[15][13] =  20; q[16][13] =  69; q[17][13] = 182; q[18][13] =1017; q[19][13] =  92;
+
+	q[ 0][14] =  97; q[ 1][14] =  24; q[ 2][14] =  33; q[ 3][14] =  55; q[ 4][14] =   1;
+	q[ 5][14] =  68; q[ 6][14] =  52; q[ 7][14] =  17; q[ 8][14] =  44; q[ 9][14] =  10;
+	q[10][14] =  22; q[11][14] =  43; q[12][14] =   1; q[13][14] =  11; q[14][14] =   0;
+	q[15][14] = 134; q[16][14] =  62; q[17][14] =   9; q[18][14] =  14; q[19][14] =  25;
+
+	q[ 0][15] = 460; q[ 1][15] = 102; q[ 2][15] = 294; q[ 3][15] = 136; q[ 4][15] =  75;
+	q[ 5][15] = 225; q[ 6][15] =  95; q[ 7][15] = 152; q[ 8][15] = 183; q[ 9][15] =   4;
+	q[10][15] =  24; q[11][15] =  77; q[12][15] =   1; q[13][15] =  20; q[14][15] = 134;
+	q[15][15] =   0; q[16][15] =  671; q[17][15] =  14; q[18][15] =  31; q[19][15] =  39;
+
+	q[ 0][16] = 258; q[ 1][16] =  64; q[ 2][16] = 148; q[ 3][16] =  55; q[ 4][16] = 117;
+	q[ 5][16] = 146; q[ 6][16] =  82; q[ 7][16] =   7; q[ 8][16] =  49; q[ 9][16] =  72;
+	q[10][16] =  25; q[11][16] = 110; q[12][16] = 131; q[13][16] =  69; q[14][16] =  62;
+	q[15][16] = 671; q[16][16] =   0; q[17][16] =   1; q[18][16] =  34; q[19][16] = 196;
+
+	q[ 0][17] =   5; q[ 1][17] =  13; q[ 2][17] =  16; q[ 3][17] =   1; q[ 4][17] =  55;
+	q[ 5][17] =  10; q[ 6][17] =  17; q[ 7][17] =  23; q[ 8][17] =  48; q[ 9][17] =  39;
+	q[10][17] =  47; q[11][17] =   6; q[12][17] = 111; q[13][17] = 182; q[14][17] =   9;
+	q[15][17] =  14; q[16][17] =   1; q[17][17] =   0; q[18][17] = 176; q[19][17] =  26;
+
+	q[ 0][18] =  55; q[ 1][18] =  47; q[ 2][18] =  28; q[ 3][18] =   1; q[ 4][18] = 131;
+	q[ 5][18] =  45; q[ 6][18] =   1; q[ 7][18] =  21; q[ 8][18] = 307; q[ 9][18] =  26;
+	q[10][18] =  64; q[11][18] =   1; q[12][18] =  74; q[13][18] =1017; q[14][18] =  14;
+	q[15][18] =  31; q[16][18] =  34; q[17][18] = 176; q[18][18] =   0; q[19][18] =  59;
+
+	q[ 0][19] = 197; q[ 1][19] =  29; q[ 2][19] =  21; q[ 3][19] =   6; q[ 4][19] = 295;
+	q[ 5][19] =  36; q[ 6][19] =  35; q[ 7][19] =   3; q[ 8][19] =   1; q[ 9][19] =1048;
+	q[10][19] = 112; q[11][19] =  19; q[12][19] = 236; q[13][19] =  92; q[14][19] =  25;
+	q[15][19] =  39; q[16][19] = 196; q[17][19] =  26; q[18][19] =  59; q[19][19] =   0;
+
+	f[ 0] = 0.0646;
+	f[ 1] = 0.0453;
+	f[ 2] = 0.0376;
+	f[ 3] = 0.0422;
+	f[ 4] = 0.0114;
+	f[ 5] = 0.0606;
+	f[ 6] = 0.0607;
+	f[ 7] = 0.0639;
+	f[ 8] = 0.0273;
+	f[ 9] = 0.0679;
+	f[10] = 0.1018;
+	f[11] = 0.0751;
+	f[12] = 0.0150;
+	f[13] = 0.0287;
+	f[14] = 0.0681;
+	f[15] = 0.0488;
+	f[16] = 0.0622;
+	f[17] = 0.0251;
+	f[18] = 0.0318;
+	f[19] = 0.0619;
+}
+
+void get_VT(double **q, double *f) {
+	/*
+	* VT table, hard-coded, written into caller-provided storage.
+	*
+	* Mueller, T. and Vingron, M. 
+	* "Modeling Amino Acid Replacement" 
+	* Journal of Comp. Biology, 7(6):761-776,2000
+	*
+	* f - entries f[0]..f[19] are assigned (amino acid frequencies).
+	* q - indexed as q[i][j]; only the 190 entries with i < j
+	*     (0 <= i < j <= 19) are assigned (relative rates). The diagonal
+	*     q[i][i] and the mirrored entries q[j][i] are NOT written here.
+	* NOTE(review): the caller presumably mirrors the matrix or reads only
+	* the upper triangle - confirm before using q[j][i].
+	*/
+
+	/* amino acid frequencies */
+	f[ 0]=0.0770764620135024 ; f[ 1]=0.0500819370772208 ;
+	f[ 2]=0.0462377395993731 ; f[ 3]=0.0537929860758246 ;
+	f[ 4]=0.0144533387583345 ; f[ 5]=0.0408923608974345 ;
+	f[ 6]=0.0633579339160905 ; f[ 7]=0.0655672355884439 ;
+	f[ 8]=0.0218802687005936 ; f[ 9]=0.0591969699027449 ;
+	f[10]=0.0976461276528445 ; f[11]=0.0592079410822730 ;
+	f[12]=0.0220695876653368 ; f[13]=0.0413508521834260 ;
+	f[14]=0.0476871596856874 ; f[15]=0.0707295165111524 ;
+	f[16]=0.0567759161524817 ; f[17]=0.0127019797647213 ;
+	f[18]=0.0323746050281867 ; f[19]=0.0669190817443274 ;
+
+	/* relative rates */
+
+	q[ 0][ 1] = 1.2412691067876198;  q[ 0][ 2] = 1.2184237953498958;
+	q[ 0][ 3] = 1.3759368509441177;  q[ 0][ 4] = 2.4731223087544874;
+	q[ 0][ 5] = 2.2155167805137470;  q[ 0][ 6] = 2.3379911207495061;
+	q[ 0][ 7] = 3.3386555146457697;  q[ 0][ 8] = 0.9615841926910841;
+	q[ 0][ 9] = 0.8908203061925510;  q[ 0][10] = 1.0778497408764076;
+	q[ 0][11] = 1.4932055816372476;  q[ 0][12] = 1.9006455961717605;
+	q[ 0][13] = 0.6883439026872615;  q[ 0][14] = 2.7355620089953550;
+	q[ 0][15] = 6.4208961859142883;  q[ 0][16] = 5.2892514169776437;
+	q[ 0][17] = 0.5488578478106930;  q[ 0][18] = 0.5411769916657778;
+	q[ 0][19] = 4.6501894691803214;
+
+	q[ 1][ 2] = 1.5720770753326880;  q[ 1][ 3] = 0.7550654439001206;
+	q[ 1][ 4] = 1.4414262567428417;  q[ 1][ 5] = 5.5120819705248678;
+	q[ 1][ 6] = 1.3542404860613146;  q[ 1][ 7] = 1.3121700301622004;
+	q[ 1][ 8] = 4.9238668283945266;  q[ 1][ 9] = 0.4323005487925516;
+	q[ 1][10] = 0.8386701149158265;  q[ 1][11] = 10.0173308173660018;
+	q[ 1][12] = 1.2488638689609959;  q[ 1][13] = 0.4224945197276290;
+	q[ 1][14] = 1.3091837782420783;  q[ 1][15] = 1.9202994262316166;
+	q[ 1][16] = 1.3363401740560601;  q[ 1][17] = 1.5170142153962840;
+	q[ 1][18] = 0.8912614404565405;  q[ 1][19] = 0.7807017855806767;
+
+	q[ 2][ 3] = 7.8584219153689405;  q[ 2][ 4] = 0.9784679122774127;
+	q[ 2][ 5] = 3.0143201670924822;  q[ 2][ 6] = 2.0093434778398112;
+	q[ 2][ 7] = 2.4117632898861809;  q[ 2][ 8] = 6.1974384977884114;
+	q[ 2][ 9] = 0.9179291175331520;  q[ 2][10] = 0.4098311270816011;
+	q[ 2][11] = 4.4034547578962568;  q[ 2][12] = 0.9378803706165143;
+	q[ 2][13] = 0.5044944273324311;  q[ 2][14] = 0.7103720531974738;
+	q[ 2][15] = 6.1234512396801764;  q[ 2][16] = 3.8852506105922231;
+	q[ 2][17] = 0.1808525752605976;  q[ 2][18] = 1.0894926581511342;
+	q[ 2][19] = 0.4586061981719967;
+
+	q[ 3][ 4] = 0.2272488448121475;  q[ 3][ 5] = 1.6562495638176040;
+	q[ 3][ 6] = 9.6883451875685065;  q[ 3][ 7] = 1.9142079025990228;
+	q[ 3][ 8] = 2.1459640610133781;  q[ 3][ 9] = 0.2161660372725585;
+	q[ 3][10] = 0.3574207468998517;  q[ 3][11] = 1.4521790561663968;
+	q[ 3][12] = 0.4075239926000898;  q[ 3][13] = 0.1675129724559251;
+	q[ 3][14] = 1.0714605979577547;  q[ 3][15] = 2.2161944596741829;
+	q[ 3][16] = 1.5066839872944762;  q[ 3][17] = 0.2496584188151770;
+	q[ 3][18] = 0.7447620891784513;  q[ 3][19] = 0.4594535241660911;
+
+	q[ 4][ 5] = 0.4587469126746136;  q[ 4][ 6] = 0.4519167943192672;
+	q[ 4][ 7] = 1.1034605684472507;  q[ 4][ 8] = 1.5196756759380692;
+	q[ 4][ 9] = 0.9126668032539315;  q[ 4][10] = 1.4081315998413697;
+	q[ 4][11] = 0.3371091785647479;  q[ 4][12] = 1.2213054800811556;
+	q[ 4][13] = 1.6953951980808002;  q[ 4][14] = 0.4326227078645523;
+	q[ 4][15] = 3.6366815408744255;  q[ 4][16] = 1.7557065205837685;
+	q[ 4][17] = 1.6275179891253113;  q[ 4][18] = 2.1579775140421025;
+	q[ 4][19] = 2.2627456996290891;
+
+	q[ 5][ 6] = 6.8124601839937675;  q[ 5][ 7] = 0.8776110594765502;
+	q[ 5][ 8] = 7.9943228564946525;  q[ 5][ 9] = 0.4882733432879921;
+	q[ 5][10] = 1.3318097154194044;  q[ 5][11] = 6.0519085243118811;
+	q[ 5][12] = 1.9106190827629084;  q[ 5][13] = 0.3573432522499545;
+	q[ 5][14] = 2.3019177728300728;  q[ 5][15] = 2.3193703643237220;
+	q[ 5][16] = 2.1576510103471440;  q[ 5][17] = 0.8959082681546182;
+	q[ 5][18] = 0.9183596801412757;  q[ 5][19] = 0.6366932501396869;
+
+	q[ 6][ 7] = 1.3860121390169038;  q[ 6][ 8] = 1.6360079688522375;
+	q[ 6][ 9] = 0.4035497929633328;  q[ 6][10] = 0.5610717242294755;
+	q[ 6][11] = 4.3290086529582830;  q[ 6][12] = 0.7471936218068498;
+	q[ 6][13] = 0.2317194387691585;  q[ 6][14] = 1.5132807416252063;
+	q[ 6][15] = 1.8273535587773553;  q[ 6][16] = 1.5839981708584689;
+	q[ 6][17] = 0.4198391148111098;  q[ 6][18] = 0.5818111331782764;
+	q[ 6][19] = 0.8940572875547330;
+
+	q[ 7][ 8] = 0.8561248973045037;  q[ 7][ 9] = 0.2888075033037488;
+	q[ 7][10] = 0.3578662395745526;  q[ 7][11] = 0.8945563662345198;
+	q[ 7][12] = 0.5954812791740037;  q[ 7][13] = 0.3693722640980460;
+	q[ 7][14] = 0.7744933618134962;  q[ 7][15] = 3.0637776193717610;
+	q[ 7][16] = 0.7147489676267383;  q[ 7][17] = 0.9349753595598769;
+	q[ 7][18] = 0.3374467649724478;  q[ 7][19] = 0.6193321034173915;
+
+	q[ 8][ 9] = 0.5787937115407940;  q[ 8][10] = 1.0765007949562073;
+	q[ 8][11] = 1.8085136096039203;  q[ 8][12] = 1.3808291710019667;
+	q[ 8][13] = 1.3629765501081097;  q[ 8][14] = 1.8370555852070649;
+	q[ 8][15] = 1.9699895187387506;  q[ 8][16] = 1.6136654573285647;
+	q[ 8][17] = 0.6301954684360302;  q[ 8][18] = 7.7587442309146040;
+	q[ 8][19] = 0.5333220944030346;
+
+	q[ 9][10] = 6.0019110258426362;  q[ 9][11] = 0.6244297525127139;
+	q[ 9][12] = 6.7597899772045418;  q[ 9][13] = 2.2864286949316077;
+	q[ 9][14] = 0.4811402387911145;  q[ 9][15] = 0.6047491507504744;
+	q[ 9][16] = 2.6344778384442731;  q[ 9][17] = 0.5604648274060783;
+	q[ 9][18] = 0.8626796044156272;  q[ 9][19] = 14.8729334615190609;
+
+	q[10][11] = 0.5642322882556321;  q[10][12] = 8.0327792947421148;
+	q[10][13] = 4.3611548063555778;  q[10][14] = 1.0084320519837335;
+	q[10][15] = 0.8953754669269811;  q[10][16] = 1.0192004372506540;
+	q[10][17] = 1.5183114434679339;  q[10][18] = 1.2452243224541324;
+	q[10][19] = 3.5458093276667237;
+
+	q[11][12] = 1.7129670976916258;  q[11][13] = 0.3910559903834828;
+	q[11][14] = 1.3918935593582853;  q[11][15] = 1.9776630140912268;
+	q[11][16] = 2.5513781312660280;  q[11][17] = 0.5851920879490173;
+	q[11][18] = 0.7835447533710449;  q[11][19] = 0.7801080335991272;
+
+	q[12][13] = 2.3201373546296349;  q[12][14] = 0.4953193808676289;
+	q[12][15] = 1.0657482318076852;  q[12][16] = 3.3628488360462363;
+	q[12][17] = 1.4680478689711018;  q[12][18] = 1.0899165770956820;
+	q[12][19] = 4.0584577156753401;
+
+	q[13][14] = 0.3746821107962129;  q[13][15] = 1.1079144700606407;
+	q[13][16] = 0.6882725908872254;  q[13][17] = 3.3448437239772266;
+	q[13][18] = 10.3848523331334590;  q[13][19] = 1.7039730522675411;
+
+	q[14][15] = 3.5465914843628927;  q[14][16] = 1.9485376673137556;
+	q[14][17] = 0.4326058001438786;  q[14][18] = 0.4819109019647465;
+	q[14][19] = 0.5985498912985666;
+
+	q[15][16] = 8.8479984061248178;  q[15][17] = 0.6791126595939816;
+	q[15][18] = 0.9547229305958682;  q[15][19] = 0.9305232113028208;
+
+	q[16][17] = 0.4514203099376473;  q[16][18] = 0.8564314184691215;
+	q[16][19] = 3.4242218450865543;
+
+	q[17][18] = 4.5377235790405388;  q[17][19] = 0.5658969249032649;
+
+	q[18][19] = 1.0000000000000000;
+
+
+} /* vt data */
+
+// This part is taken from PUZZLE-TREE.
+void get_Dayhoff(double **q, double *f) {
+	/*
+	 * Dayhoff model for amino acid evolution
+	 * Dayhoff, M.O., Schwartz, R.M., Orcutt, B.C. (1978)
+	 * "A model of evolutionary change in proteins."
+	 * Dayhoff, M.O. (ed.) Atlas of Protein Sequence and Structure, Vol. 5,
+	 * Suppl. 3, National Biomedical Research Foundation, Washington DC,
+	 * pp. 345-352.
+	 *
+	 * Writes hard-coded values into caller-provided storage:
+	 *   q - indexed as q[i][j]; only the 190 entries with i < j
+	 *       (0 <= i < j <= 19) are assigned. The diagonal q[i][i] and the
+	 *       mirrored entries q[j][i] are NOT written here.
+	 *   f - entries f[0]..f[19] are assigned.
+	 * NOTE(review): presumably q holds relative exchange rates and f the
+	 * amino-acid frequencies, and the caller mirrors the matrix or reads
+	 * only the upper triangle - confirm before using q[j][i].
+	 */
+
+	q[0][1]=9.6472567159749e-01; 	q[0][2]=3.5927991886410e+00;
+	q[0][3]=4.3200552414656e+00; 	q[0][4]=1.3184584178499e+00;
+	q[0][5]=3.2267534963169e+00; 	q[0][6]=7.0141987829615e+00;
+	q[0][7]=8.5773867857875e+00; 	q[0][8]=8.1434196396611e-01;
+	q[0][9]=2.3518447453539e+00; 	q[0][10]=1.4735711728911e+00;
+	q[0][11]=9.3940162271805e-01; 	q[0][12]=2.5490196078431e+00;
+	q[0][13]=6.5922920892495e-01; 	q[0][14]=8.9189834148670e+00;
+	q[0][15]=1.4540712836859e+01; 	q[0][16]=1.3411904595370e+01;
+	q[0][17]=3.8517964118027e-02; 	q[0][18]=8.7897227856660e-01;
+	q[0][19]=7.4036511156187e+00;
+
+	q[1][2]=1.1890243902439e+00; 	q[1][3]=5.9525626545377e-02;
+	q[1][4]=8.4778922655537e-01; 	q[1][5]=8.8348561504191e+00;
+	q[1][6]=5.5954088952654e-02; 	q[1][7]=3.1434881434075e-01;
+	q[1][8]=8.4753987678285e+00; 	q[1][9]=2.2684090115941e+00;
+	q[1][10]=5.5954088952654e-01; 	q[1][11]=1.6681312769010e+01;
+	q[1][12]=3.1707317073171e+00; 	q[1][13]=4.8959827833572e-01;
+	q[1][14]=3.6754156468900e+00; 	q[1][15]=5.4755072760812e+00;
+	q[1][16]=9.6472567159749e-01; 	q[1][17]=7.5538020086083e+00;
+	q[1][18]=2.7977044476327e-01; 	q[1][19]=8.6083213773314e-01;
+
+	q[2][3]=3.2459324155194e+01; 	q[2][4]=7.3852625416383e-02;
+	q[2][5]=3.7732198142415e+00; 	q[2][6]=5.3911764705882e+00;
+	q[2][7]=5.0264375413087e+00; 	q[2][8]=1.9061418685121e+01;
+	q[2][9]=2.7901430842607e+00; 	q[2][10]=1.2482698961938e+00;
+	q[2][11]=1.1542279411765e+01; 	q[2][12]=1.9117647058824e-01;
+	q[2][13]=5.0183823529412e-01; 	q[2][14]=1.5181660899654e+00;
+	q[2][15]=1.7697478991597e+01; 	q[2][16]=8.3557302231237e+00;
+	q[2][17]=8.6029411764706e-01; 	q[2][18]=3.4411764705882e+00;
+	q[2][19]=5.7352941176471e-01;
+
+	q[3][4]=2.5534152404601e-02; 	q[3][5]=4.8811013767209e+00;
+	q[3][6]=4.0561952440551e+01; 	q[3][7]=4.4423506911730e+00;
+	q[3][8]=3.0865788117500e+00; 	q[3][9]=8.5749078239692e-01;
+	q[3][10]=2.5926985518518e-02; 	q[3][11]=2.5930851063830e+00;
+	q[3][12]=1.1667143483333e-01; 	q[3][13]=1.2963492759259e-02;
+	q[3][14]=4.7853935065891e-01; 	q[3][15]=3.4167709637046e+00;
+	q[3][16]=2.3984722282163e+00; 	q[3][17]=3.2408731898147e-02;
+	q[3][18]=8.1351689612015e-02; 	q[3][19]=6.3829787234043e-01;
+
+	q[4][5]=2.1864264103535e-02; 	q[4][6]=1.4770525083277e-02;
+	q[4][7]=3.9055458751427e-01; 	q[4][8]=1.0223340673168e+00;
+	q[4][9]=1.5970515970516e+00; 	q[4][10]=3.9098448749850e-02;
+	q[4][11]=8.0776309049169e-03; 	q[4][12]=1.4155086538140e-01;
+	q[4][13]=8.6898395721925e-02; 	q[4][14]=6.8155604487784e-01;
+	q[4][15]=5.8097784568373e+00; 	q[4][16]=5.9929928084086e-01;
+	q[4][17]=3.4759358288770e-01; 	q[4][18]=3.4759358288770e+00;
+	q[4][19]=1.7647058823529e+00;
+
+	q[5][6]=2.5476780185759e+01; 	q[5][7]=1.0174974779977e+00;
+	q[5][8]=2.1573939173192e+01; 	q[5][9]=6.5266504894988e-01;
+	q[5][10]=2.6634492806410e+00; 	q[5][11]=5.5466331269350e+00;
+	q[5][12]=4.0247678018576e+00; 	q[5][13]=1.8038017885416e-02;
+	q[5][14]=5.5044618466582e+00; 	q[5][15]=2.0267580716497e+00;
+	q[5][16]=1.9256432155439e+00; 	q[5][17]=9.6202762055552e-02;
+	q[5][18]=1.0061919504644e-01; 	q[5][19]=1.2538699690402e+00;
+
+	q[6][7]=2.8869795109055e+00; 	q[6][8]=1.5519031141869e+00;
+	q[6][9]=2.1701112877583e+00; 	q[6][10]=4.0484429065744e-01;
+	q[6][11]=2.9823529411765e+00; 	q[6][12]=1.0705882352941e+00;
+	q[6][13]=1.9801735189768e-02; 	q[6][14]=1.7993079584775e+00;
+	q[6][15]=2.8184873949580e+00; 	q[6][16]=1.2261663286004e+00;
+	q[6][17]=7.3114099162219e-02; 	q[6][18]=7.6470588235294e-01;
+	q[6][19]=1.3058823529412e+00;
+
+	q[7][8]=3.7906768788150e-01; 	q[7][9]=2.3128004846840e-02;
+	q[7][10]=2.5776602775942e-01; 	q[7][11]=9.6662260409782e-01;
+	q[7][12]=6.0145406477198e-01; 	q[7][13]=5.4775280898876e-01;
+	q[7][14]=1.2382877804129e+00; 	q[7][15]=8.2853366065527e+00;
+	q[7][16]=1.1110604644803e+00; 	q[7][17]=1.2888301387971e-01;
+	q[7][18]=1.7114723586662e-02; 	q[7][19]=1.9233311302049e+00;
+
+	q[8][9]=2.7354343963341e-01; 	q[8][10]=1.5876246692449e+00;
+	q[8][11]=9.6993944636678e-01; 	q[8][12]=1.2544085640577e-01;
+	q[8][13]=1.6868512110727e+00; 	q[8][14]=3.3075513942601e+00;
+	q[8][15]=1.2530894710826e+00; 	q[8][16]=8.1434196396611e-01;
+	q[8][17]=1.0121107266436e+00; 	q[8][18]=4.4982698961938e+00;
+	q[8][19]=1.5570934256055e+00;
+
+	q[9][10]=9.2275320303002e+00; 	q[9][11]=1.6663354531002e+00;
+	q[9][12]=1.1780604133545e+01; 	q[9][13]=6.9753577106518e+00;
+	q[9][14]=4.2551201720752e-01; 	q[9][15]=8.8575970928912e-01;
+	q[9][16]=6.8951811852420e+00; 	q[9][17]=9.8802836705702e-02;
+	q[9][18]=1.3434022257552e+00; 	q[9][19]=3.1526232114467e+01;
+
+	q[10][11]=6.5787197231834e-01; 	q[10][12]=1.8622837370242e+01;
+	q[10][13]=5.6340830449827e+00; 	q[10][14]=1.1377976796255e+00;
+	q[10][15]=6.1690558576372e-01; 	q[10][16]=1.2098794893211e+00;
+	q[10][17]=1.7543252595156e+00; 	q[10][18]=1.0346020761246e+00;
+	q[10][19]=6.2906574394464e+00;
+
+	q[11][12]=8.6029411764706e+00; 	q[11][13]=6.6640454965565e-03;
+	q[11][14]=1.2089100346021e+00; 	q[11][15]=3.4411764705882e+00;
+	q[11][16]=4.9442190669371e+00; 	q[11][17]=3.4272233982290e-02;
+	q[11][18]=4.7794117647059e-01; 	q[11][19]=3.7500000000000e-01;
+
+	q[12][13]=3.2500000000000e+00; 	q[12][14]=5.9976931949250e-01;
+	q[12][15]=2.1848739495798e+00; 	q[12][16]=3.6916835699797e+00;
+	q[12][17]=1.6247577591604e-01; 	q[12][18]=1.1508700794053e-01;
+	q[12][19]=9.0588235294118e+00;
+
+	q[13][14]=3.9359861591695e-01; 	q[13][15]=1.6386554621849e+00;
+	q[13][16]=4.9442190669371e-01; 	q[13][17]=2.8676470588235e+00;
+	q[13][18]=2.4852941176471e+01; 	q[13][19]=4.4117647058824e-01;
+
+	q[14][15]=8.6431043005437e+00; 	q[14][16]=2.8308077795013e+00;
+	q[14][17]=3.5840244687362e-02; 	q[14][18]=4.3804743506776e-02;
+	q[14][19]=1.7301038062284e+00;
+
+	q[15][16]=1.9663865546218e+01; 	q[15][17]=2.7857142857143e+00;
+	q[15][18]=1.2016806722689e+00; 	q[15][19]=1.0840336134454e+00;
+
+	q[16][17]=4.2019597219666e-02; 	q[16][18]=1.5162271805274e+00;
+	q[16][19]=5.6592292089249e+00;
+
+	q[17][18]=2.2941176470588e+00; 	q[17][19]=1.2654363316538e-01;
+
+	q[18][19]=1.0000000000000e+00;
+
+
+	f[0] = 0.087; f[1] = 0.041; f[2] = 0.040; f[3] = 0.047;
+	f[4] = 0.033; f[5] = 0.038; f[6] = 0.05; f[7] = 0.089;
+	f[8] = 0.034; f[9] = 0.037; f[10] = 0.085; f[11] = 0.08;
+	f[12] = 0.015; f[13] = 0.04; f[14] = 0.051; f[15] = 0.07;
+	f[16] = 0.058; f[17] = 0.01; f[18] = 0.03; f[19] = 0.065;
+
+} /* dayhoff data */
+
+
+/**
+	Build a protein model: the tree and the count_rates flag go to the ModelGTR
+	base constructor, everything else is passed on to init().
+*/
+ModelProtein::ModelProtein(const char *model_name, string model_params,
+		StateFreqType freq, string freq_params,
+		PhyloTree *tree, bool count_rates)
+	: ModelGTR(tree, count_rates)
+{
+	this->init(model_name, model_params, freq, freq_params);
+}
+
+
+/**
+	Set up exchange rates and state frequencies of the 20-state protein model,
+	then hand over to ModelGTR::init().
+
+	Rates and frequencies come from the first source that applies:
+	 1. initProtMat() succeeds for the upper-cased model name;
+	 2. model_params is non-empty: rates, then state frequencies, are read from it;
+	 3. otherwise readParameters(model_name) is called (user-defined model).
+	If freq_params is non-empty, state frequencies are afterwards read from it.
+
+	@param model_name model name, e.g., JTT, WAG.
+	@param model_params rates followed by state frequencies; only consulted in case 2
+	@param freq state frequency type; FREQ_UNKNOWN is replaced by FREQ_USER_DEFINED
+	@param freq_params state frequencies to read after the model has been set up
+*/
+void ModelProtein::init(const char *model_name, string model_params, StateFreqType freq, string freq_params) {
+	assert(num_states == 20);
+	name = model_name;
+	double daa[400];
+	string name_upper = model_name;
+	// toupper() is undefined for negative char values, so convert via unsigned char
+	for (string::iterator it = name_upper.begin(); it != name_upper.end(); it++)
+		(*it) = (char) toupper((unsigned char) *it);
+
+	if (initProtMat(state_freq, daa, name_upper)) {
+		int i, j, k;
+		double sum = 0.0;
+		for (i = 0; i < num_states; i++)
+			sum += (double) state_freq[i];
+		// compare at 8 decimal places so rounding noise does not trigger the warning
+		if (round(sum*1e8) != 1e8) {
+			cout.precision(7);
+			cout << "WARNING: " <<  name_upper << " state frequencies do not sum up to 1: " << sum << endl;
+		}
+		if (verbose_mode >= VB_DEBUG) {
+			cout.precision(6);
+			cout.unsetf(ios::fixed);
+			cout << name_upper << " rate matrix and state frequencies:" << endl;
+			for (i=0; i < num_states; i++) {
+				for (j=0; j < num_states; j++)
+					cout << ((j>0) ? "\t":"") << daa[i*20+j];
+				cout << endl;
+			}
+			for (i=0; i < num_states; i++)
+				cout << ((i>0)? "\t":"") << state_freq[i];
+			cout << endl;
+
+		}
+		// copy the strict upper triangle of the 20x20 matrix, row by row, into rates[]
+		for (i = 0, k = 0; i < num_states-1; i++)
+			for (j = i+1; j < num_states; j++)
+				rates[k++] = daa[i*20+j];
+	} else if (!model_params.empty()) {
+		stringstream ss(model_params);
+		readRates(ss);
+		readStateFreq(ss);
+	} else {
+		// if name does not match, read the user-defined model
+		readParameters(model_name);
+	}
+	if (!freq_params.empty()) {
+		stringstream ss(freq_params);
+		readStateFreq(ss);
+	}
+
+	num_params = 0;
+	if (freq == FREQ_UNKNOWN) freq = FREQ_USER_DEFINED;
+	ModelGTR::init(freq);
+}
+
+/**
+	Read getNumRateEntries() exchange rates from a stream.
+
+	The input lists the rates as a lower triangle, row by row
+	((1,0), (2,0), (2,1), (3,0), ...), whereas rates[] holds the strict upper
+	triangle row by row, so every entry is stored at the index of the mirrored pair.
+
+	@param in input stream
+	@throw string if an entry cannot be read from the stream
+	@throw const char* if a negative rate is encountered
+*/
+void ModelProtein::readRates(istream &in) throw(const char*, string) {
+	const int nrates = getNumRateEntries();
+	int remaining = nrates;
+	for (int row = 1; remaining > 0; row++) {
+		for (int col = 0; col < row && remaining > 0; col++, remaining--) {
+			// position of the pair (col,row), col < row, within the upper-triangle layout
+			const int id = col*(2*num_states-col-1)/2 + (row-col-1);
+			if (id >= nrates) {
+				cout << row << " " << col << endl;
+			}
+			assert(id < nrates && id >= 0); // make sure that the conversion is correct
+			if (!(in >> rates[id])) {
+				throw name+string(": Rate entries could not be read");
+			}
+			if (rates[id] < 0.0) {
+				throw "Negative rates found";
+			}
+		}
+	}
+}
+
+
diff --git a/model/modelprotein.h b/model/modelprotein.h
new file mode 100644
index 0000000..b363fff
--- /dev/null
+++ b/model/modelprotein.h
@@ -0,0 +1,62 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MODELPROTEIN_H
+#define MODELPROTEIN_H
+
+#include "modelgtr.h"
+
+/**
+Substitution models for protein sequences
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class ModelProtein : public ModelGTR
+{
+public:
+	/**
+		constructor
+		@param model_name model name, e.g., JTT, WAG.
+		@param model_params model parameter string, passed on to init()
+		@param freq state frequency type
+		@param freq_params state frequency parameter string, passed on to init()
+		@param tree associated phylogenetic tree
+		@param count_rates passed on to the ModelGTR constructor
+	*/
+    ModelProtein(const char *model_name, string model_params, StateFreqType freq, string freq_params, PhyloTree *tree, bool count_rates = true);
+
+	/**
+		initialization, called automatically by the constructor, no need to call it
+		@param model_name model name, e.g., JTT, WAG.
+		@param model_params rates followed by state frequencies; read only when
+			it is non-empty and initProtMat() does not succeed for model_name
+		@param freq state frequency type; FREQ_UNKNOWN is replaced by FREQ_USER_DEFINED
+		@param freq_params if non-empty, state frequencies are read from this string
+	*/
+	virtual void init(const char *model_name, string model_params, StateFreqType freq, string freq_params);
+
+	/**
+		read the rates from an input stream. it will throw error messages if failed
+		The entries are expected row by row as a lower triangle:
+		(1,0), (2,0), (2,1), (3,0), ...
+		@param in input stream
+		@throw string if an entry cannot be read
+		@throw const char* if a negative rate is found
+	*/
+	virtual void readRates(istream &in) throw(const char*, string);
+
+	/**
+	 * @return the model name as is; this implementation does not append a
+	 *         parameter list of the form GTR{a,b,c,d,e,f}
+	 */
+	virtual string getNameParams() { return name; }
+
+
+};
+
+#endif
diff --git a/model/models.nex b/model/models.nex
new file mode 100644
index 0000000..3800424
--- /dev/null
+++ b/model/models.nex
@@ -0,0 +1,955 @@
+#nexus
+
+begin models;
+
+[ ---------------------------------------------------------
+    EX2 mixture model of Le, Lartillot & Gascuel (2008) 
+ --------------------------------------------------------- ]
+
+[ Exposed component ]
+model ExpEX2 =
+0.526738 
+0.483150 0.505837 
+0.658902 0.051052 3.902456 
+2.051872 2.214326 0.961103 0.129989 
+1.280002 2.039552 1.301786 0.399061 0.456521 
+1.306565 0.137928 0.285806 3.100403 0.033946 2.514377 
+1.370782 0.363365 1.820100 0.885317 0.886564 0.320746 0.303966 
+0.540809 2.288922 4.949307 0.700890 2.172284 3.755421 0.270957 0.401311 
+0.171986 0.237023 0.337226 0.018315 1.037046 0.212032 0.084442 0.012279 0.317239 
+0.430511 0.670514 0.158937 0.021949 1.702066 1.261113 0.110508 0.052946 0.869247 8.675343 
+0.697731 3.881079 1.677194 0.105450 0.146263 2.570254 0.730337 0.279865 0.598289 0.338782 0.313102 
+1.043937 0.656943 0.539827 0.066925 1.846562 1.973592 0.188160 0.158136 0.519993 9.483497 14.176858 1.013268 
+0.265209 0.097443 0.182522 0.026918 3.002586 0.080193 0.023999 0.084663 2.047163 2.193062 4.802817 0.044792 3.261401 
+1.270693 0.166534 0.068692 0.228829 0.156216 0.362501 0.214847 0.148900 0.323141 0.071992 0.343919 0.195470 0.099252 0.087020 
+4.826665 0.751947 4.412265 0.975564 5.294149 1.033459 0.382235 1.970857 0.993310 0.190509 0.389101 0.592156 0.557254 0.668834 1.223981 
+2.131819 0.584329 2.133604 0.368887 2.067387 1.013613 0.511390 0.174527 0.580960 2.563630 0.522334 1.147459 2.960091 0.244420 0.413148 7.384701 
+0.143081 0.475590 0.061094 0.042618 1.603125 0.210329 0.048276 0.186382 0.961546 0.208313 1.130724 0.052858 1.328785 5.210001 0.045945 0.316078 0.144393 
+0.208643 0.196271 0.599369 0.121313 3.842632 0.158470 0.064648 0.039280 8.230282 0.517123 0.713426 0.084962 0.812142 23.228875 0.043249 0.405310 0.234217 4.903887 
+2.544463 0.313443 0.172264 0.073705 4.207648 0.497398 0.484620 0.132496 0.329895 23.711178 3.466991 0.348362 4.136445 1.199764 0.368231 0.266531 3.184874 0.252132 0.459187 
+
+0.088367 0.078147 0.047163 0.087976 0.004517 0.058526 0.128039 0.056993 0.024856 0.025277 0.045202 0.094639 0.012338 0.016158 0.060124 0.055346 0.051290 0.006771 0.021554 0.036718;
+
+[ Buried component ]
+model BurEX2 =
+0.338649 
+0.201335 0.981635 
+0.283859 0.247537 6.505182 
+2.640244 0.904730 1.353325 0.312005 
+0.543136 4.570308 2.439639 0.682052 0.216787 
+0.748479 0.917979 0.804756 10.030310 0.024055 8.670112 
+2.700465 0.539246 0.810739 0.810727 0.701320 0.330139 0.636675 
+0.237686 3.175221 6.308043 1.540002 0.469875 8.675492 0.750683 0.183743 
+0.044209 0.099241 0.162644 0.020816 0.166986 0.082745 0.030581 0.005017 0.075820 
+0.124047 0.314159 0.088243 0.017526 0.449241 0.641784 0.073392 0.017752 0.277023 2.383760 
+0.433721 17.781822 2.851914 0.459939 0.117548 6.815411 3.482941 0.484653 1.247888 0.161658 0.219757 
+0.497479 0.448773 0.380964 0.057176 0.815999 2.089412 0.291379 0.054491 0.307450 2.817174 4.759683 1.082403 
+0.093991 0.055530 0.098936 0.026160 0.662517 0.091948 0.022760 0.034431 0.675645 0.521416 1.672365 0.077917 1.296869 
+0.986621 0.356417 0.214521 0.246129 0.164228 0.654039 0.295079 0.179095 0.428213 0.037671 0.170780 0.347219 0.074086 0.057233 
+5.925588 0.979993 4.725421 1.158990 5.111992 1.120931 0.737456 2.279470 0.886126 0.051057 0.089611 0.925355 0.275366 0.274582 1.151114 
+1.958501 0.630713 2.007592 0.289641 2.284140 0.787821 0.539892 0.097432 0.467489 0.644041 0.202812 1.401676 1.340732 0.103118 0.601281 8.190534 
+0.068357 0.784449 0.109073 0.085810 0.457880 0.297731 0.155877 0.157418 0.708743 0.054134 0.374568 0.115777 0.477495 2.362999 0.047127 0.209085 0.097054 
+0.084768 0.312038 0.615093 0.202611 0.788164 0.293543 0.137306 0.035497 4.938330 0.101803 0.180086 0.280737 0.264540 8.142914 0.059308 0.264401 0.133054 2.905674 
+1.387752 0.140091 0.112176 0.058637 1.575057 0.203946 0.239406 0.044011 0.085226 6.427279 1.035942 0.244336 1.033583 0.278010 0.213475 0.079878 1.592560 0.081135 0.108383 
+
+0.123119 0.019475 0.019852 0.018583 0.018711 0.017275 0.018723 0.050388 0.016402 0.119697 0.161399 0.012776 0.035838 0.057019 0.030913 0.043472 0.049935 0.012600 0.039929 0.133894;
+
+[ main definition of EX2 with fixed component rates ]
+model EX2 =MIX{BurEX2:0.672020808818762,ExpEX2:1.6413466609931};
+
+
+[ ---------------------------------------------------------
+    EX3 mixture model of Le, Lartillot & Gascuel (2008) 
+ --------------------------------------------------------- ]
+
+[ Buried component ]
+model BurEX3 =
+0.352598 
+0.216996 1.087422 
+0.292440 0.323465 7.797086 
+2.610812 0.913640 1.460331 0.344397 
+0.510610 5.128748 2.811070 0.773241 0.220223 
+0.753729 1.090823 0.956820 12.012282 0.021022 10.123412 
+2.838061 0.595013 0.884971 0.922298 0.707214 0.351856 0.713974 
+0.239679 3.625577 7.108377 1.826237 0.481109 10.246488 0.839852 0.219310 
+0.051496 0.102940 0.168735 0.024207 0.162795 0.087881 0.036973 0.004515 0.079975 
+0.119849 0.316151 0.091984 0.018800 0.422679 0.648064 0.075035 0.016317 0.282195 2.225363 
+0.443183 20.766910 3.194817 0.568138 0.132784 7.478955 4.176123 0.551523 1.415394 0.163276 0.207613 
+0.460570 0.458210 0.398615 0.059146 0.765112 2.134261 0.313124 0.053192 0.340474 2.609469 4.476961 1.014674 
+0.089411 0.056698 0.104720 0.027913 0.630095 0.094857 0.023275 0.034031 0.691151 0.491179 1.606618 0.077868 1.226530 
+0.993370 0.419898 0.217106 0.273526 0.181230 0.729534 0.311152 0.192454 0.483200 0.040002 0.170402 0.376998 0.075002 0.057218 
+6.108406 1.066008 5.182562 1.216396 5.236005 1.159086 0.763810 2.404073 0.924395 0.048875 0.084247 0.923997 0.260340 0.260617 1.208454 
+1.992855 0.687262 2.181095 0.312299 2.276505 0.829879 0.551397 0.101409 0.480998 0.610331 0.198919 1.407257 1.292634 0.096955 0.648250 8.527249 
+0.063159 0.855332 0.134012 0.099769 0.468450 0.329372 0.136731 0.169991 0.745868 0.056715 0.377293 0.137955 0.463394 2.343596 0.058650 0.211406 0.085948 
+0.078057 0.341493 0.655744 0.241264 0.762740 0.302096 0.142491 0.040257 5.226086 0.092084 0.180292 0.311130 0.249838 8.141649 0.062812 0.267992 0.128044 3.047417 
+1.339724 0.144916 0.125078 0.062854 1.481083 0.194081 0.225389 0.043663 0.090575 5.973306 0.993888 0.222252 0.964622 0.262045 0.207448 0.083450 1.544911 0.078358 0.105286 
+
+0.123992 0.016529 0.017595 0.015784 0.019325 0.015552 0.015939 0.049573 0.014540 0.126555 0.167605 0.011083 0.037438 0.058363 0.028849 0.042324 0.049207 0.011962 0.037833 0.139953;
+
+[ Intermediate component ]
+model IntEX3 =
+0.489239 
+0.466919 0.536794 
+0.601908 0.069474 4.603441 
+2.430552 1.807414 0.997223 0.166431 
+1.101971 2.081359 1.299123 0.508086 0.393348 
+1.227777 0.215899 0.345545 3.579383 0.046861 3.113235 
+1.873072 0.390054 1.528288 0.941969 0.867139 0.349219 0.406414 
+0.519003 1.930915 5.003737 0.781887 1.630085 3.567804 0.324903 0.315383 
+0.158722 0.180317 0.295816 0.013254 0.642786 0.179498 0.090830 0.013181 0.209208 
+0.345026 0.503290 0.138767 0.024393 1.107569 1.027755 0.123806 0.048549 0.592981 5.439892 
+0.610178 4.322929 1.524318 0.121994 0.181609 2.674484 0.792405 0.276766 0.591509 0.301836 0.294950 
+0.949957 0.472702 0.502710 0.091008 1.283305 1.905885 0.242081 0.140301 0.378459 6.259505 9.391081 1.074513 
+0.247271 0.069820 0.161809 0.028611 2.065479 0.077874 0.025753 0.065388 1.541097 1.306479 3.015722 0.048689 2.243101 
+1.334722 0.170174 0.099375 0.211869 0.163190 0.349495 0.155436 0.186099 0.300496 0.065625 0.265961 0.162529 0.088677 0.083754 
+5.316955 0.699036 4.526191 1.143652 5.249370 0.970695 0.438792 2.366185 0.939629 0.138819 0.275119 0.532771 0.521510 0.547761 1.187779 
+1.963809 0.535034 2.034583 0.383040 2.012437 0.891145 0.531018 0.180104 0.467342 1.861944 0.395319 1.071879 2.340268 0.183984 0.400373 7.243848 
+0.145693 0.378596 0.046601 0.048388 1.074147 0.174525 0.063777 0.168836 0.822524 0.110645 0.677913 0.062047 0.796395 3.502387 0.046950 0.290501 0.107097 
+0.195764 0.149382 0.534652 0.105996 2.446201 0.150150 0.071967 0.031908 6.198893 0.299207 0.413150 0.090874 0.492692 15.039152 0.044765 0.328289 0.175204 3.125850 
+2.227504 0.220361 0.150316 0.066496 3.112801 0.393451 0.444469 0.108811 0.224352 15.532696 2.152640 0.302279 2.658339 0.738053 0.322254 0.197018 2.507055 0.175763 0.276642 
+
+0.086346 0.080808 0.041727 0.064440 0.006654 0.052795 0.092110 0.048527 0.028831 0.040497 0.071679 0.079687 0.018007 0.025901 0.052632 0.052778 0.056138 0.010733 0.034744 0.054964;
+
+[ Highly exposed component ]
+model HExEX3 =
+0.557500 
+0.467024 0.508965 
+0.660464 0.044039 3.386724 
+1.332582 3.667491 1.440486 0.185886 
+1.402485 2.156104 1.297398 0.333117 0.789370 
+1.259192 0.111162 0.245837 2.707953 0.058650 2.098300 
+0.934526 0.393780 2.196372 0.868249 1.336358 0.322363 0.252359 
+0.518929 3.157422 5.392488 0.748008 3.827563 4.517669 0.284167 0.634601 
+0.279723 0.407537 0.535113 0.054030 3.345087 0.427624 0.148200 0.015686 0.658979 
+0.715094 1.182387 0.270883 0.035162 3.520931 2.366650 0.172395 0.100089 1.779380 18.830270 
+0.694526 3.728628 1.747648 0.083685 0.100399 2.477205 0.623294 0.280977 0.694965 0.569776 0.493141 
+1.338414 1.261833 0.818216 0.054313 3.918703 2.383718 0.219943 0.228757 0.867786 19.605444 31.431195 1.089056 
+0.295523 0.190129 0.263800 0.044853 5.266468 0.120909 0.042178 0.194665 3.494314 5.825792 11.527190 0.044361 6.237844 
+1.085021 0.168461 0.041147 0.203765 0.185173 0.353420 0.218194 0.120292 0.375260 0.116875 0.705493 0.190747 0.139085 0.108823 
+4.090024 0.852803 4.335615 0.829194 6.499129 1.095446 0.336922 1.733724 1.144100 0.413986 0.878828 0.631498 0.730416 1.167593 1.195720 
+2.318400 0.650016 2.351068 0.385247 1.883085 1.167877 0.532167 0.187062 0.796107 4.825759 0.838744 1.268311 4.445757 0.381760 0.419944 7.677284 
+0.134371 1.021826 0.151293 0.065183 3.716538 0.530580 0.077516 0.396559 1.324147 0.443432 3.290145 0.064651 4.411035 13.056874 0.056705 0.534908 0.408415 
+0.212989 0.424870 1.115762 0.268883 8.874037 0.255572 0.125866 0.107717 14.436023 1.292209 1.491799 0.104026 2.063744 49.760746 0.057618 0.756357 0.396791 12.032322 
+3.112666 0.544010 0.214411 0.125541 5.301703 0.868794 0.839508 0.215758 0.533676 46.074660 7.301056 0.557248 9.151909 2.634769 0.523205 0.564572 4.519860 0.456880 0.670812 
+
+0.094155 0.070537 0.052200 0.112406 0.002213 0.062733 0.165272 0.062302 0.019853 0.011154 0.019829 0.108860 0.006503 0.006873 0.070091 0.057931 0.046183 0.002449 0.008629 0.019827;
+
+[ main definition of EX3 with fixed component rates ]
+model EX3 = MIX{BurEX3:0.427672756793791,IntEX3:0.837595938019774,HExEX3:1.51863631431518};
+
+[ ---------------------------------------------------------
+    EHO mixture model of Le, Lartillot & Gascuel (2008)
+ --------------------------------------------------------- ]
+
+[ Extended component ]
+model ExtEHO = 
+0.221750 
+0.256487 0.595368 
+0.447755 0.112310 7.769815 
+4.893140 0.929131 1.061884 0.164472 
+0.542660 2.886791 1.927072 0.497273 0.133291 
+0.549459 0.290798 0.518264 5.393249 0.003776 4.326528 
+5.411319 0.302948 0.907713 0.961651 1.249183 0.173873 0.316780 
+0.283752 2.760038 5.159285 0.978418 0.737799 5.086066 0.421812 0.209276 
+0.026683 0.053027 0.166715 0.016491 0.151942 0.055934 0.026726 0.001780 0.098605 
+0.226816 0.251641 0.062256 0.015837 0.763554 0.537705 0.042909 0.032938 0.321607 3.217159 
+0.235513 6.017300 2.543177 0.223507 0.023575 3.432847 1.211039 0.160545 0.671045 0.082221 0.106179 
+0.992834 0.351969 0.415447 0.041511 1.271632 1.700679 0.111984 0.117596 0.326393 3.329162 7.496635 0.519821 
+0.191967 0.041219 0.090517 0.014810 1.004694 0.042779 0.011177 0.040989 0.641267 0.813011 2.233318 0.023173 1.863238 
+1.876507 0.395175 0.362650 0.550534 0.174031 0.731229 0.412907 0.205341 0.381717 0.011597 0.315127 0.393303 0.135360 0.043846 
+6.066032 1.083228 5.612711 1.035540 4.263932 1.429211 0.766802 2.266299 1.074108 0.047896 0.147065 0.683291 0.352118 0.382422 1.462674 
+1.827471 0.645132 1.883173 0.287521 1.395928 1.013709 0.781080 0.055140 0.512000 0.588357 0.142327 1.256445 1.435179 0.079647 0.417388 6.092548 
+0.101419 0.452274 0.065206 0.034173 0.592031 0.164037 0.049674 0.183473 0.741383 0.069289 0.429275 0.050856 0.545447 2.178510 0.022770 0.304839 0.111242 
+0.091914 0.112094 0.451176 0.108762 1.183567 0.132194 0.042952 0.030418 4.373360 0.122828 0.186938 0.096667 0.344096 8.276255 0.053251 0.325231 0.135310 2.597897 
+1.970427 0.119016 0.091863 0.041044 1.750822 0.222903 0.225961 0.053387 0.123318 6.815243 1.427658 0.124284 1.427074 0.341263 0.127045 0.076658 1.052442 0.073165 0.101733 
+
+0.062087 0.053435 0.023743 0.032063 0.013132 0.034151 0.061042 0.030664 0.022696 0.104732 0.099541 0.054991 0.022312 0.045996 0.025392 0.045673 0.072789 0.012691 0.043790 0.139079;
+
+[ Helix component ]
+model HelEHO = 
+0.346476 
+0.374362 0.664870 
+0.557349 0.079157 3.710526 
+3.192474 1.027228 0.891196 0.006722 
+0.776545 1.902860 1.561002 0.517360 0.112028 
+0.841893 0.158406 0.443065 3.792847 0.000006 2.320685 
+4.037113 0.661209 1.866962 1.144918 1.465540 0.511489 0.573208 
+0.394225 2.123760 5.845902 0.737868 1.084909 3.960964 0.270146 0.380762 
+0.111350 0.099645 0.233216 0.005627 0.839533 0.089484 0.019520 0.021251 0.132153 
+0.193017 0.307622 0.115495 0.009651 1.136538 0.584189 0.039838 0.048105 0.485901 4.915707 
+0.481682 3.827872 1.926308 0.163314 0.021755 2.487895 0.768919 0.327002 0.534206 0.147053 0.136159 
+0.610432 0.344033 0.452639 0.035659 1.624032 1.146169 0.103241 0.171164 0.364836 6.260678 7.738615 0.549401 
+0.147278 0.035167 0.106276 0.018468 1.864906 0.047207 0.010268 0.086543 1.244539 0.927331 3.243633 0.016265 2.326533 
+1.090575 0.181605 0.093658 0.386490 0.097655 0.462559 0.290152 0.568098 0.458437 0.043237 0.207460 0.198291 0.061027 0.067592 
+6.243684 0.836138 5.633664 0.952131 6.398291 1.267404 0.430602 5.463144 1.088326 0.102127 0.193860 0.707365 0.438507 0.470620 1.534272 
+2.847158 0.566364 2.984732 0.347047 3.711971 1.083181 0.495700 0.500029 0.642773 1.698955 0.402699 1.111399 2.483456 0.231119 0.685164 8.832473 
+0.090983 0.369015 0.085583 0.046821 0.950521 0.183299 0.040785 0.391093 0.950288 0.075780 0.624335 0.041505 0.980672 3.915972 0.053806 0.299723 0.100663 
+0.152848 0.170981 0.594708 0.106099 2.051641 0.121416 0.047614 0.064377 8.167042 0.195540 0.352598 0.069186 0.465779 15.178886 0.058255 0.405459 0.201603 4.035822 
+2.140511 0.136453 0.145376 0.046174 4.011687 0.191618 0.192292 0.202844 0.174981 14.460840 2.175028 0.136317 2.393838 0.659302 0.418505 0.180248 3.585329 0.175143 0.281722 
+
+0.121953 0.076798 0.032215 0.066765 0.006842 0.061304 0.131841 0.026596 0.020392 0.047287 0.087919 0.084679 0.020970 0.024145 0.025871 0.042103 0.038715 0.008346 0.023841 0.051421;
+
+[ Other component ]
+model OthEHO =
+0.529263 
+0.379476 0.612335 
+0.516691 0.067732 4.012914 
+3.774890 1.615176 0.888663 0.165810 
+1.312262 2.913667 1.533683 0.442262 0.337571 
+1.403437 0.154460 0.333334 3.815893 0.015567 3.743866 
+1.272402 0.389317 1.243222 0.661976 0.554904 0.332656 0.319770 
+0.558733 2.816641 4.803000 0.761339 1.223662 4.889028 0.323617 0.300981 
+0.124057 0.155080 0.219635 0.019097 0.560959 0.100743 0.038076 0.005599 0.184752 
+0.340362 0.580087 0.119838 0.015948 1.192857 1.156516 0.083154 0.031031 0.646292 7.873544 
+0.706732 5.734632 1.847806 0.128114 0.050896 3.616626 1.131071 0.283950 0.643558 0.179831 0.224320 
+1.056749 0.665355 0.399943 0.053900 1.893946 2.299714 0.168079 0.085094 0.556024 8.136055 14.213193 0.931689 
+0.233961 0.079465 0.130295 0.016768 1.902244 0.077611 0.012655 0.048906 1.403178 1.581816 4.275863 0.036062 2.888633 
+1.518830 0.252482 0.049484 0.171011 0.108909 0.501196 0.346600 0.058913 0.299924 0.073007 0.297573 0.249478 0.091619 0.068920 
+5.595735 0.861017 3.749627 0.987083 4.952776 1.045071 0.463265 1.190738 0.897478 0.131753 0.265701 0.607097 0.399537 0.408758 0.993614 
+2.157458 0.613623 1.733380 0.361861 2.145775 1.011592 0.523086 0.091023 0.450662 1.492403 0.408418 1.143233 2.378569 0.131777 0.381007 7.574340 
+0.151895 0.544292 0.060182 0.043433 1.259614 0.228038 0.045082 0.134804 0.748147 0.134416 0.979277 0.038787 0.908253 4.850762 0.052415 0.249753 0.114232 
+0.219509 0.243507 0.580103 0.130214 2.325021 0.196580 0.079660 0.037482 6.907609 0.299245 0.552917 0.067894 0.685250 19.404995 0.047839 0.323207 0.183044 4.704884 
+3.049976 0.278740 0.134120 0.055382 4.149385 0.500946 0.435957 0.067170 0.214393 22.435652 2.883298 0.323886 3.369448 0.722571 0.315978 0.152899 2.423398 0.186495 0.303833 
+
+0.076458 0.052393 0.055429 0.088634 0.007473 0.040671 0.080952 0.100192 0.025439 0.031730 0.053100 0.070835 0.014039 0.023159 0.087111 0.063636 0.055346 0.007033 0.023779 0.042590;
+
+[ main definition of EHO with fixed component rates ]
+model EHO = MIX{ExtEHO:0.720274356,HelEHO:0.976798797,OthEHO:0.783109376};
+
+
+[ ---------------------------------------------------------
+    UL2 mixture model of Le, Lartillot & Gascuel (2008)
+ --------------------------------------------------------- ]
+
+model M1_UL2 =
+0.267149 
+0.211944 0.816250 
+0.156648 0.336150 3.110967 
+2.402535 1.001114 1.287205 0.467161 
+0.301870 3.168646 1.844180 0.571540 0.394361 
+0.503678 1.529332 0.788530 3.920399 0.234553 8.502278 
+3.124853 0.171548 0.220006 0.250690 0.766651 0.174653 0.399019 
+0.139279 1.597241 5.622886 2.146897 0.349557 8.097306 1.211287 0.044878 
+0.037158 0.139068 0.189483 0.049336 0.147864 0.122799 0.153664 0.006928 0.085276 
+0.108752 0.387538 0.092568 0.035815 0.399254 0.617370 0.225586 0.018972 0.202328 2.343778 
+0.255267 15.176345 1.030178 0.196011 0.396427 3.731061 2.642525 0.142626 0.878376 0.319044 0.422741 
+0.430988 0.522887 0.351960 0.102916 0.683070 2.247889 0.621957 0.070803 0.228871 2.780325 4.767336 1.450453 
+0.088392 0.116382 0.114044 0.066251 0.668683 0.133418 0.075116 0.039034 0.780377 0.488538 1.586897 0.143427 1.211385 
+1.303487 0.178064 0.192016 0.065259 0.315140 0.406966 0.144065 0.135536 0.273070 0.087171 0.298010 0.087701 0.165232 0.104423 
+7.472990 0.579607 3.004054 0.854304 5.789930 0.930019 0.709540 2.018826 0.527351 0.051443 0.070322 0.432286 0.281917 0.286341 0.473611 
+2.276542 0.392852 1.332166 0.193248 2.577504 0.541748 0.690939 0.052900 0.272814 0.634227 0.224553 0.795413 1.360016 0.120449 0.745729 6.088861 
+0.048841 0.673695 0.076107 0.073261 0.377566 0.284556 0.284138 0.130136 0.649073 0.047797 0.324911 0.148403 0.390301 2.189403 0.122493 0.131225 0.080727 
+0.073190 0.425791 0.503951 0.250485 0.577049 0.306036 0.198368 0.024991 3.987606 0.083215 0.127898 0.372637 0.179514 7.784255 0.089874 0.175724 0.117177 2.629196 
+1.351002 0.175990 0.120675 0.105544 1.491339 0.203270 0.463186 0.055506 0.065132 6.411609 1.020423 0.337618 1.047308 0.272790 0.407545 0.079844 1.634833 0.077263 0.083195 
+
+0.122413 0.017757 0.020209 0.012086 0.018894 0.014525 0.009897 0.045663 0.020120 0.124002 0.168915 0.011684 0.037631 0.063612 0.023347 0.039268 0.046707 0.015603 0.050968 0.136701;
+
+
+model M2_UL2 =
+0.557363 
+0.539068 0.465628 
+0.696831 0.032997 3.879799 
+1.480953 4.566841 1.777582 0.310752 
+1.402193 1.920868 1.276554 0.327085 0.972350 
+1.335667 0.096752 0.255510 2.685052 0.088385 2.281328 
+1.056193 0.423348 2.171283 0.933450 1.398738 0.369406 0.334900 
+0.729300 2.712485 5.461073 0.679965 5.202985 4.012284 0.282038 0.585359 
+0.267035 0.493033 0.523699 0.023230 2.563394 0.459103 0.176281 0.010013 0.551901 
+0.700687 0.932999 0.206875 0.025161 3.939537 1.918986 0.154733 0.085684 1.446302 8.189198 
+0.736759 3.603558 1.676442 0.070721 0.292188 2.403019 0.611829 0.307607 0.675279 0.627044 0.410941 
+1.505101 0.819561 0.736222 0.089302 4.462071 2.539203 0.250970 0.204790 0.654198 11.105816 15.171688 1.258549 
+0.541573 0.185468 0.343735 0.042217 5.958046 0.156533 0.064557 0.188906 3.891682 3.152154 5.098336 0.088022 4.518197 
+1.155460 0.142408 0.044854 0.175385 0.123605 0.316005 0.157783 0.157894 0.347393 0.047328 0.344717 0.153954 0.054635 0.108793 
+3.823040 0.733964 4.846938 0.890611 7.416660 0.987912 0.343107 2.296896 1.193558 0.368432 0.667347 0.535051 0.754875 1.469714 1.242760 
+1.897039 0.590040 2.371940 0.347041 1.619173 1.025240 0.479587 0.210934 0.728868 5.106169 0.726618 1.152768 3.985684 0.433442 0.358997 9.007029 
+0.296375 0.833840 0.091310 0.080326 5.217767 0.363445 0.078944 0.378088 1.571919 0.351013 2.139511 0.098671 2.796573 6.102504 0.023698 0.665667 0.292919 
+0.297444 0.206563 0.871576 0.173621 11.803422 0.181973 0.110832 0.073892 12.757344 1.161331 1.646025 0.101481 1.732368 29.335598 0.037045 0.706902 0.346859 5.666524 
+2.765737 0.415803 0.194725 0.093474 5.264577 0.734884 0.683342 0.156374 0.517626 26.038986 3.741256 0.457775 5.253478 1.999427 0.297563 0.344932 4.012753 0.385172 0.870088 
+
+0.087622 0.083588 0.048847 0.098882 0.002815 0.062809 0.143166 0.055391 0.023310 0.015495 0.032465 0.102135 0.009511 0.008409 0.069323 0.057733 0.051876 0.003945 0.014462 0.028216;
+
+model UL2 = MIX{M1_UL2:0.581348617,M2_UL2:1.465482789};
+
+
+[ ---------------------------------------------------------
+    UL3 mixture model of Le, Lartillot & Gascuel (2008)
+ --------------------------------------------------------- ]
+
+model Q1_UL3 =
+0.514865 
+0.774348 0.583403 
+0.854291 0.046141 2.011233 
+1.019817 5.652322 2.260587 0.057603 
+1.095968 1.696154 1.296536 0.417322 0.967032 
+1.054599 0.084924 0.368384 3.592374 0.063073 1.885301 
+3.510012 0.797055 1.759631 1.421695 2.627911 0.743770 0.772359 
+0.694799 2.596186 4.214186 0.654590 6.673533 3.664595 0.294967 0.608220 
+0.344837 0.543739 0.965435 0.062495 2.500862 0.452448 0.155720 0.083334 0.905291 
+0.593987 0.857922 0.351903 0.045358 3.290242 1.421539 0.109100 0.230693 1.595696 5.042430 
+0.708843 2.012940 1.662582 0.106190 0.329149 2.268825 0.579185 0.365374 0.696286 0.701896 0.398546 
+0.990080 0.754111 0.910436 0.143464 3.570847 1.708803 0.181804 0.706982 0.789517 8.138995 13.390024 1.137779 
+0.085639 0.012721 0.098898 0.018361 2.148695 0.012425 0.009316 0.135782 0.921964 1.006572 2.479349 0.014715 1.418875 
+0.655013 0.150052 0.120388 0.698261 0.254951 0.353826 0.250818 0.715043 0.329691 0.170251 0.827093 0.187804 0.178490 0.048299 
+2.863328 0.657706 3.761619 0.619692 9.817007 0.810603 0.344050 6.758412 0.997214 0.414623 0.625678 0.555290 0.647617 0.392859 0.929152 
+1.373936 0.392433 2.711122 0.237865 2.460302 0.701472 0.319136 0.607889 0.728133 3.705396 0.412346 0.953939 2.446017 0.054119 0.279699 9.934970 
+0.247598 0.514750 0.144529 0.157484 5.383077 0.199950 0.045688 0.790171 1.116595 0.243053 1.738186 0.070214 3.427855 3.275850 0.007577 0.583988 0.205721 
+0.090644 0.046952 0.326197 0.089450 7.475195 0.018555 0.020706 0.016617 3.728614 0.404819 0.617948 0.029889 0.956437 47.933104 0.050416 0.181180 0.070113 5.242459 
+2.093798 0.323334 0.307076 0.101486 8.553531 0.473023 0.410909 0.459941 0.568017 13.906640 1.778101 0.426825 2.763369 0.570421 0.311278 0.389524 2.915452 0.252168 0.268516 
+
+0.104307 0.092553 0.043722 0.085643 0.003218 0.074342 0.163928 0.024243 0.022216 0.016012 0.038591 0.105577 0.011434 0.016126 0.018057 0.061232 0.061373 0.004086 0.020876 0.032465;
+
+model Q2_UL3 =
+1.709484 
+0.184309 0.860448 
+0.660851 0.182073 4.471383 
+4.554487 2.843438 1.801073 1.068728 
+3.425703 6.092362 2.868388 0.790473 0.794773 
+4.278840 0.359055 0.585031 4.176143 0.121031 6.860012 
+0.625715 1.054231 1.222442 0.492366 1.418419 0.796035 0.643251 
+1.089116 6.396197 8.965630 1.915247 2.033352 11.058341 0.768162 0.523196 
+0.024545 0.023433 0.014686 0.002204 0.628823 0.008720 0.008363 0.002485 0.046726 
+0.150945 0.140520 0.002514 0.000212 1.903535 0.384413 0.015127 0.010251 0.210723 5.066207 
+1.751314 12.981698 3.641808 0.278298 0.036599 7.677610 2.744099 0.612733 1.686490 0.042380 0.023858 
+0.475876 0.364580 0.063143 0.001486 3.890832 0.754732 0.041044 0.024222 0.236955 5.752463 12.019762 0.229898 
+0.142125 0.051255 0.006503 0.000593 5.397699 0.064190 0.006871 0.015588 0.424840 1.005341 5.458275 0.021422 1.779060 
+5.433246 1.051312 0.012611 0.027267 0.635181 1.765792 0.849429 0.023324 0.610884 0.000184 0.037705 0.604166 0.001415 0.003197 
+6.267113 1.750009 5.986041 1.411952 5.482009 1.923966 0.595886 0.943724 1.786620 0.043381 0.066093 0.813893 0.053557 0.199095 1.723045 
+6.389458 1.828974 2.044599 1.561907 2.083626 2.070125 1.210529 0.217976 1.192222 0.515450 0.199809 2.020941 1.238100 0.150760 1.727569 9.882473 
+0.281689 1.180712 0.000006 0.017218 3.696424 0.146508 0.068518 0.222418 0.497727 0.199828 1.849405 0.001429 1.394852 2.473491 0.016401 0.288550 0.190290 
+0.302638 0.475135 0.196905 0.067615 6.355457 0.576342 0.232832 0.059485 4.525509 0.571811 1.194578 0.006674 0.467694 8.107893 0.024556 0.394389 0.441794 4.067825 
+11.333027 0.298555 0.053673 0.009846 5.743238 0.296166 0.413471 0.120393 0.105418 9.130937 2.674960 0.165290 4.417978 1.811161 0.492985 0.042803 3.284174 0.844277 2.327679 
+
+0.044015 0.021591 0.056258 0.102405 0.003260 0.018409 0.041364 0.168549 0.015843 0.064277 0.118096 0.036061 0.027358 0.036695 0.124914 0.047807 0.022696 0.005866 0.018644 0.025892;
+
+
+model Q3_UL3 =
+0.063622 
+0.118948 0.528684 
+0.065502 0.142677 12.092355 
+2.010382 0.302352 1.127688 0.014546 
+0.169022 2.026184 1.256016 0.417582 0.170493 
+0.172876 0.453837 0.454428 1.882165 0.045799 4.705997 
+5.254550 0.174422 0.364886 0.192790 0.891120 0.148450 0.195211 
+0.090586 1.258840 5.523808 0.313487 0.211550 4.734918 0.466811 0.096529 
+0.034911 0.065167 0.222440 0.023060 0.132230 0.122571 0.075521 0.003942 0.065261 
+0.146425 0.219004 0.136129 0.028165 0.448432 0.795591 0.146014 0.016718 0.240024 2.387089 
+0.041117 11.082325 0.783756 0.049843 0.039616 1.828161 0.649991 0.069199 0.271006 0.094864 0.140698 
+0.599734 0.230551 0.595641 0.059404 0.699534 2.765355 0.391569 0.089210 0.206188 3.020110 2.806927 0.516762 
+0.148889 0.145244 0.408811 0.089283 0.724422 0.260910 0.101509 0.101882 1.508086 0.693424 0.709933 0.061880 1.887041 
+0.378131 0.225548 0.181924 0.038283 0.146379 0.511097 0.151769 0.166424 0.386101 0.186116 0.753595 0.182723 0.420131 0.341199 
+8.340091 0.495564 3.010756 0.463573 5.601734 0.985082 0.415256 2.532014 0.720035 0.067860 0.121784 0.279698 0.626080 0.788724 0.890779 
+1.932342 0.358779 1.069003 0.093122 2.028674 0.637861 0.597230 0.019551 0.211054 0.870341 0.381670 0.461083 2.131453 0.255120 0.604567 3.450887 
+0.031419 0.378744 0.085588 0.008303 0.277628 0.277906 0.122038 0.055246 0.420382 0.021973 0.127345 0.027776 0.139098 3.077241 0.163299 0.177114 0.062650 
+0.106283 0.366786 1.453278 0.404793 0.753053 0.475024 0.273992 0.055936 7.367304 0.094629 0.126746 0.148716 0.369726 5.251757 0.232549 0.676796 0.247828 2.910495 
+1.104848 0.069287 0.110149 0.084224 1.280832 0.175361 0.342585 0.047507 0.038768 8.415916 1.577573 0.054532 1.537983 0.409619 0.309028 0.094413 1.483411 0.060147 0.076595 
+
+0.134055 0.044392 0.020730 0.020801 0.021008 0.024509 0.028587 0.029069 0.028138 0.105826 0.117458 0.049732 0.026120 0.038984 0.016306 0.038369 0.055334 0.017276 0.039905 0.143398;
+
+model UL3 = MIX{Q1_UL3:0.484340397,Q2_UL3:0.492780514,Q3_UL3:1.15597274};
+
+
+[ ---------------------------------------------------------
+    EX_EHO mixture model of Le & Gascuel (2010)
+ --------------------------------------------------------- ]
+
+
+model BUR_EXT =
+0.228492 
+0.165543 0.916344 
+0.238509 0.258514 8.498064 
+3.374029 1.037434 1.667702 0.332072 
+0.344742 4.971495 2.471912 0.654950 0.130301 
+0.417921 1.039226 0.875808 13.073209 0.040759 9.834742 
+4.248714 0.411876 0.585570 0.748848 0.908311 0.221633 0.593504 
+0.182762 3.872065 6.999812 1.719470 0.493863 8.695395 0.749303 0.137367 
+0.011705 0.090751 0.149898 0.021996 0.077693 0.043664 0.013820 0.001527 0.073342 
+0.133793 0.286232 0.065118 0.015540 0.456304 0.546974 0.052641 0.024196 0.226460 2.160734 
+0.249141 17.756919 3.385483 0.343780 0.093875 6.677050 2.745017 0.295602 1.481997 0.100576 0.167406 
+0.641194 0.342577 0.427146 0.059345 0.867233 2.306480 0.218260 0.058613 0.358032 2.187901 5.151337 0.750049 
+0.118366 0.068606 0.102572 0.009357 0.633943 0.033356 0.012944 0.024474 0.497973 0.534407 1.581972 0.063281 1.329239 
+1.561052 0.483968 0.385170 0.261437 0.310131 0.913924 0.355871 0.175520 0.512823 0.019789 0.295416 0.348527 0.104569 0.059641 
+5.891807 1.320618 5.737159 1.074011 4.702782 1.389531 0.878480 2.178078 1.111068 0.033343 0.094349 1.035903 0.327901 0.292022 1.344678 
+2.059884 0.976165 2.166428 0.369522 1.951862 0.815145 0.575774 0.060834 0.558388 0.422299 0.153549 1.793263 1.268126 0.085468 0.780914 9.031309 
+0.081683 0.814216 0.057557 0.055146 0.450959 0.191881 0.109420 0.144367 0.651978 0.068649 0.345622 0.169527 0.387902 1.883741 0.023466 0.309129 0.111568 
+0.052650 0.248907 0.570101 0.180267 0.701260 0.253975 0.061388 0.025465 4.206114 0.083799 0.147600 0.226848 0.254720 6.549427 0.027521 0.283138 0.141408 2.561108 
+1.355342 0.137437 0.104597 0.051387 1.203830 0.218892 0.194527 0.031054 0.088935 4.577473 1.003647 0.153722 0.883283 0.242657 0.191295 0.068785 0.990922 0.056276 0.078264 
+
+0.087158 0.015906 0.012970 0.012566 0.020325 0.013301 0.013777 0.039603 0.014597 0.161107 0.147775 0.011033 0.031334 0.064281 0.013322 0.035417 0.048583 0.012672 0.045210 0.199064;
+
+
+model BUR_HEL =
+0.317211 
+0.209784 1.120865 
+0.315205 0.301050 7.439896 
+2.214446 0.884449 1.356293 0.110768 
+0.465495 4.319791 2.843187 1.082540 0.215988 
+0.668735 0.901135 0.986572 11.245156 0.009874 7.561773 
+3.614157 0.568883 0.972660 1.036117 0.894733 0.409083 0.780808 
+0.249929 3.138701 7.344935 1.747672 0.379845 9.559763 0.842239 0.146008 
+0.059633 0.103290 0.206475 0.017492 0.286194 0.123433 0.037593 0.010910 0.071273 
+0.096230 0.285199 0.113728 0.015874 0.439724 0.547078 0.063675 0.021607 0.303531 2.097349 
+0.380075 15.783354 2.780107 0.569108 0.093004 6.179905 3.209588 0.413960 1.002075 0.185911 0.185249 
+0.371379 0.411553 0.398602 0.076761 0.727245 1.665645 0.249045 0.068128 0.256194 2.940308 3.649539 0.972247 
+0.075616 0.043519 0.096446 0.041118 0.636688 0.102460 0.039991 0.041269 0.839126 0.376556 1.551814 0.064774 1.173962 
+1.100574 0.385197 0.319458 0.353000 0.112549 0.805706 0.369483 0.482895 0.520098 0.058167 0.144341 0.361488 0.074069 0.057968 
+6.832958 0.955160 5.296628 1.265211 6.144756 1.315182 0.902504 3.903795 0.862633 0.072343 0.080478 0.979654 0.330305 0.328917 1.924898 
+2.223205 0.445571 2.461831 0.299635 2.943208 0.830637 0.621903 0.184055 0.468356 0.911139 0.208091 1.343261 1.515339 0.158763 0.915879 9.298787 
+0.062541 0.806724 0.110928 0.132125 0.414525 0.388313 0.191952 0.271274 0.909529 0.025790 0.343842 0.099137 0.543577 2.467147 0.044938 0.215329 0.087955 
+0.082948 0.329591 0.693402 0.286594 0.866329 0.259566 0.167425 0.049038 6.332054 0.093136 0.177755 0.275998 0.261754 8.344684 0.088981 0.335859 0.137177 3.125017 
+1.390479 0.142986 0.175068 0.106294 1.687293 0.159520 0.297915 0.080925 0.085103 6.414688 0.953785 0.240157 1.097345 0.264988 0.373870 0.144230 2.572837 0.089110 0.115941 
+
+0.158060 0.021566 0.016487 0.014079 0.016937 0.020232 0.023096 0.032822 0.014618 0.114447 0.198900 0.014668 0.042840 0.053434 0.015640 0.037275 0.043095 0.012211 0.036330 0.113263;
+
+model BUR_OTH =
+0.406682 
+0.246649 0.848592 
+0.364260 0.198690 4.535840 
+3.292044 0.837291 1.295138 0.420726 
+0.735862 4.205085 2.062501 0.427451 0.259335 
+0.954795 0.673046 0.671062 8.395674 0.048284 8.922739 
+1.958847 0.573207 0.632317 0.572264 0.486274 0.345345 0.650009 
+0.312042 2.699661 4.969855 1.181781 0.551188 7.620453 0.701108 0.195346 
+0.071000 0.127041 0.184028 0.030240 0.180591 0.065984 0.039235 0.005033 0.098525 
+0.142298 0.338853 0.086876 0.026095 0.484427 0.867777 0.087780 0.017129 0.309774 3.477136 
+0.624622 18.390649 2.748646 0.442886 0.238266 6.993941 3.906971 0.652336 1.365814 0.219252 0.288480 
+0.610604 0.581287 0.382156 0.048508 0.963147 2.672887 0.384585 0.051334 0.386066 3.752286 6.858529 1.524446 
+0.124670 0.047666 0.102656 0.031532 0.699124 0.129867 0.004923 0.039185 0.701690 0.643782 2.019473 0.104308 1.568249 
+1.126387 0.321347 0.107738 0.137858 0.150346 0.601413 0.310374 0.073794 0.332910 0.056230 0.208204 0.368816 0.078902 0.062410 
+5.908551 0.834735 3.611589 0.969189 4.765870 0.881934 0.528944 1.439305 0.746876 0.060111 0.114374 0.784754 0.235963 0.219009 0.710100 
+1.856381 0.574277 1.573584 0.223054 2.038789 0.763848 0.461329 0.076195 0.396095 0.701247 0.249302 1.091322 1.282643 0.070553 0.419070 6.616977 
+0.069294 0.654056 0.127255 0.078896 0.517561 0.188732 0.125541 0.104279 0.547504 0.066927 0.454998 0.056498 0.425274 2.668838 0.050943 0.151483 0.062698 
+0.128158 0.354167 0.640140 0.182565 0.793990 0.368725 0.157796 0.037084 4.307140 0.140691 0.241076 0.323966 0.293629 9.711414 0.060323 0.207489 0.111492 2.857446 
+1.982761 0.158227 0.115545 0.051117 2.065903 0.338262 0.258245 0.045770 0.089942 10.113118 1.382024 0.431385 1.456614 0.295718 0.273919 0.066465 1.668063 0.113899 0.144981 
+
+0.102123 0.021199 0.032404 0.032350 0.018985 0.017469 0.017625 0.089270 0.021090 0.083642 0.123866 0.012720 0.029789 0.055399 0.072705 0.061298 0.061705 0.013496 0.039682 0.093184;
+
+model EXP_EXT=
+0.464716 
+0.597009 0.420578 
+1.010693 0.048553 5.944290 
+3.915828 2.088244 0.878468 0.236108 
+1.156023 1.882317 1.435926 0.338823 0.482742 
+1.131098 0.127150 0.346338 3.317186 0.061060 2.724696 
+4.638659 0.351041 1.379174 1.216518 1.396050 0.199361 0.353970 
+0.657615 2.215990 4.150252 0.717363 1.853969 3.768864 0.347165 0.313421 
+0.078558 0.127092 0.347281 0.032361 0.605448 0.171553 0.104678 0.010608 0.309418 
+0.516672 0.510585 0.105529 0.039188 1.808273 1.017577 0.112010 0.044661 0.772131 5.693102 
+0.519389 3.571104 1.844049 0.109305 0.103105 2.232749 0.653339 0.195325 0.547017 0.219311 0.253086 
+1.658261 0.640712 0.558751 0.063591 1.694880 2.088441 0.194697 0.291701 0.321392 6.220456 12.392618 0.862547 
+0.426071 0.064894 0.132019 0.034872 2.076573 0.085745 0.026972 0.099963 1.388250 1.765294 3.859637 0.032198 3.134107 
+3.082729 0.250470 0.232578 0.376163 0.290522 0.502379 0.240501 0.302007 0.283950 0.013574 0.606936 0.248475 0.226716 0.058246 
+7.012884 0.866957 5.008997 0.814153 4.758346 1.192080 0.595351 2.514269 0.993487 0.135167 0.349525 0.542021 0.512591 0.744682 1.258172 
+2.037755 0.446367 1.618299 0.203392 1.177421 0.840646 0.583757 0.071515 0.466886 1.503883 0.260405 0.934230 2.245607 0.123552 0.258896 4.504833 
+0.171334 0.385971 0.087717 0.019596 1.015512 0.127027 0.037725 0.217844 0.822780 0.095756 0.777332 0.039952 0.977419 3.217291 0.015240 0.301259 0.102153 
+0.194998 0.091803 0.433021 0.086495 3.074882 0.111578 0.041481 0.048438 4.904785 0.336528 0.411742 0.087476 0.640594 14.126821 0.061656 0.338111 0.129249 2.902137 
+2.811391 0.216605 0.127240 0.061503 2.320268 0.390874 0.450783 0.132513 0.234279 12.181354 2.539512 0.233848 3.363159 0.717467 0.138035 0.159602 1.615372 0.132268 0.186175 
+
+0.043140 0.090761 0.034408 0.052848 0.006370 0.053817 0.107749 0.024812 0.029498 0.049134 0.050167 0.098127 0.013722 0.025841 0.037395 0.056505 0.094326 0.012045 0.039238 0.080099;
+
+model EXP_HEL =
+0.434227 
+0.551823 0.569806 
+0.698268 0.056291 3.064314 
+2.026002 2.379205 1.077282 0.016649 
+0.986617 1.606282 1.331570 0.426399 0.409724 
+1.005936 0.120122 0.390888 2.999742 0.021217 1.881156 
+3.221202 0.736168 2.269617 1.272893 1.771711 0.622430 0.656603 
+0.515574 2.032567 5.484997 0.666491 2.985549 3.380526 0.265244 0.557878 
+0.200810 0.241566 0.441585 0.009830 1.541200 0.198621 0.069562 0.043838 0.339616 
+0.328669 0.583849 0.178015 0.022077 2.045404 1.046125 0.089148 0.104708 0.875298 8.628242 
+0.598864 3.090263 1.682415 0.113637 0.207957 2.085253 0.582536 0.376534 0.554395 0.371883 0.290692 
+0.799278 0.528354 0.704087 0.062290 2.303849 1.507620 0.173293 0.356580 0.492228 10.028453 12.162732 0.867109 
+0.256227 0.083117 0.192262 0.030759 4.328951 0.078062 0.022890 0.181917 2.406824 2.014776 4.856941 0.041675 3.521229 
+1.118844 0.147481 0.061969 0.323498 0.171678 0.387521 0.237715 0.641036 0.433529 0.069102 0.359935 0.164055 0.063832 0.126592 
+5.069051 0.749554 5.245486 0.840686 7.114530 1.177802 0.382956 6.139836 1.086779 0.194824 0.424579 0.655759 0.682174 0.753148 1.355810 
+2.949741 0.623328 3.248881 0.406219 3.345739 1.214278 0.538553 0.867954 0.747654 3.316346 0.754081 1.193593 3.516479 0.366653 0.622665 7.975653 
+0.115446 0.394156 0.090971 0.055309 1.947845 0.185912 0.046886 0.451084 1.173014 0.277029 1.078778 0.054622 1.516237 5.813526 0.071865 0.359167 0.106921 
+0.205680 0.197878 0.678775 0.118188 4.183184 0.139485 0.059999 0.051336 10.200670 0.507328 0.721921 0.086974 0.741023 24.191458 0.046460 0.489820 0.247367 4.904042 
+2.494211 0.280293 0.235248 0.083648 5.509932 0.429196 0.409105 0.447130 0.351675 23.404006 3.840750 0.300727 4.126659 1.483049 0.675560 0.336101 4.426709 0.309940 0.588217 
+
+0.115826 0.094038 0.037357 0.085821 0.003363 0.073078 0.167709 0.025416 0.021634 0.024147 0.050238 0.106612 0.013318 0.013330 0.029895 0.044902 0.037901 0.006460 0.018548 0.030407;
+
+model EXP_OTH =
+0.603175 
+0.478745 0.562615 
+0.608325 0.056553 3.755571 
+2.371839 2.480665 0.889513 0.170707 
+1.551117 2.685995 1.462350 0.424139 0.669728 
+1.624084 0.129505 0.314826 3.404205 0.049823 3.375473 
+0.987777 0.356744 1.294077 0.640234 0.583980 0.331879 0.304731 
+0.667236 2.788429 4.719171 0.731257 1.872668 4.612209 0.316233 0.320454 
+0.186911 0.269245 0.318538 0.028464 0.987958 0.242926 0.090427 0.007312 0.327205 
+0.527992 0.844027 0.167295 0.021423 1.623589 1.636879 0.135662 0.044560 0.939347 10.338048 
+0.842575 5.076266 1.736167 0.106076 0.132985 3.365869 0.969736 0.270931 0.669196 0.356829 0.352830 
+1.296147 0.863599 0.469732 0.075018 1.832599 2.642602 0.217378 0.107935 0.624941 10.670411 17.593544 1.247987 
+0.325034 0.135328 0.192352 0.021631 2.731423 0.103263 0.027708 0.060740 2.148472 2.344767 5.497995 0.057563 3.278627 
+1.670091 0.235642 0.042844 0.164518 0.112539 0.479958 0.326780 0.057540 0.291899 0.110067 0.380466 0.240061 0.109541 0.083760 
+5.098150 0.831455 3.661924 0.978777 4.500240 1.064732 0.455496 1.095629 0.915898 0.226713 0.405000 0.608323 0.525496 0.593321 1.035726 
+2.174502 0.630453 1.791747 0.396219 1.681712 1.083797 0.556968 0.100584 0.457070 2.361119 0.543612 1.211816 2.987220 0.198957 0.368383 7.505908 
+0.203719 0.615713 0.044203 0.046952 1.745090 0.303876 0.050920 0.155176 0.920001 0.165182 1.385828 0.055323 1.274920 5.896599 0.059081 0.303111 0.156402 
+0.271220 0.253084 0.643377 0.142691 3.763228 0.209729 0.093004 0.035856 8.167503 0.490579 0.894778 0.077103 1.029700 26.210400 0.045876 0.373529 0.218567 5.726440 
+3.470639 0.410713 0.180011 0.081584 4.323431 0.751254 0.686467 0.086874 0.318032 29.800262 3.856040 0.482930 4.862267 1.182403 0.390522 0.268937 2.836818 0.229423 0.453335 
+
+0.071716 0.058979 0.060316 0.101089 0.005039 0.044673 0.093349 0.105394 0.026228 0.020220 0.037831 0.081647 0.010677 0.015875 0.090566 0.065046 0.054453 0.005546 0.019924 0.031432;
+
+
+model EX_EHO = MIX{BUR_EXT:0.761816796788931,BUR_HEL:0.744425646802117,BUR_OTH:0.532457759429489,EXP_EXT:1.5639387472863,EXP_HEL:2.06403411829438,EXP_OTH:1.43336795177594};
+
+
+[ ---------------------------------------------------------
+    LG4M mixture model of Le, Dang & Gascuel (2012)
+ --------------------------------------------------------- ]
+
+model LG4M1 =
+ 0.269343
+ 0.254612 0.150988
+ 0.236821 0.031863 0.659648
+ 2.506547 0.938594 0.975736 0.175533
+ 0.359080 0.348288 0.697708 0.086573 0.095967
+ 0.304674 0.156000 0.377704 0.449140 0.064706 4.342595
+ 1.692015 0.286638 0.565095 0.380358 0.617945 0.202058 0.264342
+ 0.251974 0.921633 1.267609 0.309692 0.390429 2.344059 0.217750 0.104842
+ 1.085220 0.325624 0.818658 0.037814 1.144150 0.534567 0.222793 0.062682 0.567431
+ 0.676353 0.602366 0.217027 0.007533 1.595775 0.671143 0.158424 0.070463 0.764255 8.226528
+ 0.179155 0.971338 1.343718 0.133744 0.122468 0.983857 0.994128 0.220916 0.410581 0.387487 0.181110
+ 1.636817 0.515217 0.670461 0.071252 1.534848 5.288642 0.255628 0.094198 0.257229 25.667158 6.819689 1.591212
+ 0.235498 0.123932 0.099793 0.030425 0.897279 0.112229 0.022529 0.047488 0.762914 1.344259 0.865691 0.038921 2.030833
+ 1.265605 0.040163 0.173354 0.027579 0.259961 0.580374 0.088041 0.145595 0.143676 0.298859 1.020117 0.000714 0.190019 0.093964
+ 5.368405 0.470952 5.267140 0.780505 4.986071 0.890554 0.377949 1.755515 0.786352 0.527246 0.667783 0.659948 0.731921 0.837669 1.355630
+ 1.539394 0.326789 1.688169 0.283738 1.389282 0.329821 0.231770 0.117017 0.449977 3.531600 0.721586 0.497588 2.691697 0.152088 0.698040 16.321298
+ 0.140944 0.375611 0.025163 0.002757 0.801456 0.257253 0.103678 0.132995 0.345834 0.377156 0.839647 0.176970 0.505682 1.670170 0.091298 0.210096 0.013165
+ 0.199836 0.146857 0.806275 0.234246 1.436970 0.319669 0.010076 0.036859 3.503317 0.598632 0.738969 0.154436 0.579000 4.245524 0.074524 0.454195 0.232913 1.178490
+ 9.435529 0.285934 0.395670 0.130890 6.097263 0.516259 0.503665 0.222960 0.149143 13.666175 2.988174 0.162725 5.973826 0.843416 0.597394 0.701149 4.680002 0.300085 0.416262
+
+0.082276 0.055172 0.043853 0.053484 0.018957 0.028152 0.046679 0.157817 0.033297 0.028284 0.054284 0.025275 0.023665 0.041874 0.063071 0.066501 0.065424 0.023837 0.038633 0.049465;
+
+model LG4M2 =
+0.133720
+ 0.337212 0.749052
+ 0.110918 0.105087 4.773487
+ 3.993460 0.188305 1.590332 0.304942
+ 0.412075 2.585774 1.906884 0.438367 0.242076
+ 0.435295 0.198278 0.296366 7.470333 0.008443 3.295515
+ 7.837540 0.164607 0.431724 0.153850 1.799716 0.269744 0.242866
+ 0.203872 2.130334 9.374479 1.080878 0.152458 12.299133 0.279589 0.089714
+ 0.039718 0.024553 0.135254 0.014979 0.147498 0.033964 0.005585 0.007248 0.022746
+ 0.075784 0.080091 0.084971 0.014128 0.308347 0.500836 0.022833 0.022999 0.161270 1.511682
+ 0.177662 10.373708 1.036721 0.038303 0.043030 2.181033 0.321165 0.103050 0.459502 0.021215 0.078395
+ 0.420784 0.192765 0.329545 0.008331 0.883142 1.403324 0.168673 0.160728 0.612573 1.520889 7.763266 0.307903
+ 0.071268 0.019652 0.088753 0.013547 0.566609 0.071878 0.020050 0.041022 0.625361 0.382806 1.763059 0.044644 1.551911
+ 0.959127 1.496585 0.377794 0.332010 0.318192 1.386970 0.915904 0.224255 2.611479 0.029351 0.068250 1.542356 0.047525 0.182715
+ 11.721512 0.359408 2.399158 0.219464 9.104192 0.767563 0.235229 3.621219 0.971955 0.033780 0.043035 0.236929 0.319964 0.124977 0.840651
+ 2.847068 0.218463 1.855386 0.109808 4.347048 0.765848 0.164569 0.312024 0.231569 0.356327 0.159597 0.403210 1.135162 0.106903 0.269190 9.816481
+ 0.030203 0.387292 0.118878 0.067287 0.190240 0.122113 0.007023 0.137411 0.585141 0.020634 0.228824 0.000122 0.474862 3.135128 0.030313 0.093830 0.119152
+ 0.067183 0.130101 0.348730 0.061798 0.301198 0.095382 0.095764 0.044628 2.107384 0.046105 0.100117 0.017073 0.192383 8.367641 0.000937 0.137416 0.044722 4.179782
+ 0.679398 0.041567 0.092408 0.023701 1.271187 0.115566 0.055277 0.086988 0.060779 8.235167 0.609420 0.061764 0.581962 0.184187 0.080246 0.098033 1.438350 0.023439 0.039124
+
+0.120900 0.036460 0.026510 0.040410 0.015980 0.021132 0.025191 0.036369 0.015884 0.111029 0.162852 0.024820 0.028023 0.074058 0.012065 0.041963 0.039072 0.012666 0.040478 0.114137;
+
+model LG4M3 =
+0.421017
+ 0.316236 0.693340
+ 0.285984 0.059926 6.158219
+ 4.034031 1.357707 0.708088 0.063669
+ 0.886972 2.791622 1.701830 0.484347 0.414286
+ 0.760525 0.233051 0.378723 4.032667 0.081977 4.940411
+ 0.754103 0.402894 2.227443 1.102689 0.416576 0.459376 0.508409
+ 0.571422 2.319453 5.579973 0.885376 1.439275 4.101979 0.576745 0.428799
+ 0.162152 0.085229 0.095692 0.006129 0.490937 0.104843 0.045514 0.004705 0.098934
+ 0.308006 0.287051 0.056994 0.007102 0.958988 0.578990 0.067119 0.024403 0.342983 3.805528
+ 0.390161 7.663209 1.663641 0.105129 0.135029 3.364474 0.652618 0.457702 0.823674 0.129858 0.145630
+ 1.042298 0.364551 0.293222 0.037983 1.486520 1.681752 0.192414 0.070498 0.222626 4.529623 4.781730 0.665308
+ 0.362476 0.073439 0.129245 0.020078 1.992483 0.114549 0.023272 0.064490 1.491794 1.113437 2.132006 0.041677 1.928654
+ 1.755491 0.087050 0.099325 0.163817 0.242851 0.322939 0.062943 0.198698 0.192904 0.062948 0.180283 0.059655 0.129323 0.065778
+ 3.975060 0.893398 5.496314 1.397313 3.575120 1.385297 0.576191 1.733288 1.021255 0.065131 0.129115 0.600308 0.387276 0.446001 1.298493
+ 2.565079 0.534056 2.143993 0.411388 2.279084 0.893006 0.528209 0.135731 0.518741 0.972662 0.280700 0.890086 1.828755 0.189028 0.563778 7.788147
+ 0.283631 0.497926 0.075454 0.043794 1.335322 0.308605 0.140137 0.150797 1.409726 0.119868 0.818331 0.080591 1.066017 3.754687 0.073415 0.435046 0.197272
+ 0.242513 0.199157 0.472207 0.085937 2.039787 0.262751 0.084578 0.032247 7.762326 0.153966 0.299828 0.117255 0.438215 14.506235 0.089180 0.352766 0.215417 5.054245
+ 2.795818 0.107130 0.060909 0.029724 2.986426 0.197267 0.196977 0.044327 0.116751 7.144311 1.848622 0.118020 1.999696 0.705747 0.272763 0.096935 1.820982 0.217007 0.172975
+
+0.072639 0.051691 0.038642 0.055580 0.009829 0.031374 0.048731 0.065283 0.023791 0.086640 0.120847 0.052177 0.026728 0.032589 0.039238 0.046748 0.053361 0.008024 0.037426 0.098662;
+
+model LG4M4 =
+0.576160
+ 0.567606 0.498643
+ 0.824359 0.050698 3.301401
+ 0.822724 4.529235 1.291808 0.101930
+ 1.254238 2.169809 1.427980 0.449474 0.868679
+ 1.218615 0.154502 0.411471 3.172277 0.050239 2.138661
+ 1.803443 0.604673 2.125496 1.276384 1.598679 0.502653 0.479490
+ 0.516862 2.874265 4.845769 0.719673 3.825677 4.040275 0.292773 0.596643
+ 0.180898 0.444586 0.550969 0.023542 2.349573 0.370160 0.142187 0.016618 0.500788
+ 0.452099 0.866322 0.201033 0.026731 2.813990 1.645178 0.135556 0.072152 1.168817 5.696116
+ 0.664186 2.902886 2.101971 0.127988 0.200218 2.505933 0.759509 0.333569 0.623100 0.547454 0.363656
+ 0.864415 0.835049 0.632649 0.079201 2.105931 1.633544 0.216462 0.252419 0.665406 7.994105 11.751178 1.096842
+ 0.324478 0.208947 0.280339 0.041683 4.788477 0.107022 0.067711 0.171320 3.324779 2.965328 5.133843 0.084856 4.042591
+ 1.073043 0.173826 0.041985 0.270336 0.121299 0.351384 0.228565 0.225318 0.376089 0.058027 0.390354 0.214230 0.058954 0.126299
+ 3.837562 0.884342 4.571911 0.942751 6.592827 1.080063 0.465397 3.137614 1.119667 0.362516 0.602355 0.716940 0.506796 1.444484 1.432558
+ 2.106026 0.750016 2.323325 0.335915 1.654673 1.194017 0.617231 0.318671 0.801030 4.455842 0.580191 1.384210 3.522468 0.473128 0.432718 5.716300
+ 0.163720 0.818102 0.072322 0.068275 3.305436 0.373790 0.054323 0.476587 1.100360 0.392946 1.703323 0.085720 1.725516 5.436253 0.053108 0.498594 0.231832
+ 0.241167 0.302440 1.055095 0.246940 9.741942 0.249895 0.129973 0.052363 11.542498 1.047449 1.319667 0.139770 1.330225 26.562270 0.046986 0.737653 0.313460 5.165098
+ 1.824586 0.435795 0.179086 0.091739 3.609570 0.649507 0.656681 0.225234 0.473437 19.897252 3.001995 0.452926 3.929598 1.692159 0.370204 0.373501 3.329822 0.326593 0.860743
+
+0.104843 0.078835 0.043513 0.090498 0.002924 0.066163 0.151640 0.038843 0.022556 0.018383 0.038687 0.104462 0.010166 0.009089 0.066950 0.053667 0.049486 0.004409 0.012924 0.031963;
+
+model LG4M = MIX{LG4M1,LG4M2,LG4M3,LG4M4}*G4;
+
+
+[ ---------------------------------------------------------
+    LG4X mixture model of Le, Dang & Gascuel (2012)
+ --------------------------------------------------------- ]
+
+model LG4X1 =
+0.295719
+0.067388 0.448317
+0.253712 0.457483 2.358429
+1.029289 0.576016 0.251987 0.189008
+0.107964 1.741924 0.216561 0.599450 0.029955
+0.514644 0.736017 0.503084 109.901504 0.084794 4.117654
+10.868848 0.704334 0.435271 1.070052 1.862626 0.246260 1.202023
+0.380498 5.658311 4.873453 5.229858 0.553477 6.508329 1.634845 0.404968
+0.084223 0.123387 0.090748 0.052764 0.151733 0.054187 0.060194 0.048984 0.204296
+0.086976 0.221777 0.033310 0.021407 0.230320 0.195703 0.069359 0.069963 0.504221 1.495537
+0.188789 93.433377 0.746537 0.621146 0.096955 1.669092 2.448827 0.256662 1.991533 0.091940 0.122332
+0.286389 0.382175 0.128905 0.081091 0.352526 0.810168 0.232297 0.228519 0.655465 1.994320 3.256485 0.457430
+0.155567 0.235965 0.127321 0.205164 0.590018 0.066081 0.064822 0.241077 6.799829 0.754940 2.261319 0.163849 1.559944
+1.671061 6.535048 0.904011 5.164456 0.386853 2.437439 3.537387 4.320442 11.291065 0.170343 0.848067 5.260446 0.426508 0.438856
+2.132922 0.525521 0.939733 0.747330 1.559564 0.165666 0.435384 3.656545 0.961142 0.050315 0.064441 0.360946 0.132547 0.306683 4.586081
+0.529591 0.303537 0.435450 0.308078 0.606648 0.106333 0.290413 0.290216 0.448965 0.372166 0.102493 0.389413 0.498634 0.109129 2.099355 3.634276
+0.115551 0.641259 0.046646 0.260889 0.587531 0.093417 0.280695 0.307466 6.227274 0.206332 0.459041 0.033291 0.559069 18.392863 0.411347 0.101797 0.034710
+0.102453 0.289466 0.262076 0.185083 0.592318 0.035149 0.105999 0.096556 20.304886 0.097050 0.133091 0.115301 0.264728 66.647302 0.476350 0.148995 0.063603 20.561407
+0.916683 0.102065 0.043986 0.080708 0.885230 0.072549 0.206603 0.306067 0.205944 5.381403 0.561215 0.112593 0.693307 0.400021 0.584622 0.089177 0.755865 0.133790 0.154902
+
+0.147383 0.017579 0.058208 0.017707 0.026331 0.041582 0.017494 0.027859 0.011849 0.076971 0.147823 0.019535 0.037132 0.029940 0.008059 0.088179 0.089653 0.006477 0.032308 0.097931;
+
+model LG4X2 =
+ 0.066142
+ 0.590377 0.468325
+ 0.069930 0.013688 2.851667
+ 9.850951 0.302287 3.932151 0.146882
+ 1.101363 1.353957 8.159169 0.249672 0.582670
+ 0.150375 0.028386 0.219934 0.560142 0.005035 3.054085
+ 0.568586 0.037750 0.421974 0.046719 0.275844 0.129551 0.037250
+ 0.051668 0.262130 2.468752 0.106259 0.098208 4.210126 0.029788 0.013513
+ 0.127170 0.016923 0.344765 0.003656 0.445038 0.165753 0.008541 0.002533 0.031779
+ 0.292429 0.064289 0.210724 0.004200 1.217010 1.088704 0.014768 0.005848 0.064558 7.278994
+ 0.071458 0.855973 1.172204 0.014189 0.033969 1.889645 0.125869 0.031390 0.065585 0.029917 0.042762
+ 1.218562 0.079621 0.763553 0.009876 1.988516 3.344809 0.056702 0.021612 0.079927 7.918203 14.799537 0.259400
+ 0.075144 0.011169 0.082464 0.002656 0.681161 0.111063 0.004186 0.004854 0.095591 0.450964 1.506485 0.009457 1.375871
+ 7.169085 0.161937 0.726566 0.040244 0.825960 2.067758 0.110993 0.129497 0.196886 0.169797 0.637893 0.090576 0.457399 0.143327
+ 30.139501 0.276530 11.149790 0.267322 18.762977 3.547017 0.201148 0.976631 0.408834 0.104288 0.123793 0.292108 0.598048 0.328689 3.478333
+ 13.461692 0.161053 4.782635 0.053740 11.949233 2.466507 0.139705 0.053397 0.126088 1.578530 0.641351 0.297913 4.418398 0.125011 2.984862 13.974326
+ 0.021372 0.081472 0.058046 0.006597 0.286794 0.188236 0.009201 0.019475 0.037226 0.015909 0.154810 0.017172 0.239749 0.562720 0.061299 0.154326 0.060703
+ 0.045779 0.036742 0.498072 0.027639 0.534219 0.203493 0.012095 0.004964 0.452302 0.094365 0.140750 0.021976 0.168432 1.414883 0.077470 0.224675 0.123480 0.447011
+ 4.270235 0.030342 0.258487 0.012745 4.336817 0.281953 0.043812 0.015539 0.016212 16.179952 3.416059 0.032578 2.950318 0.227807 1.050562 0.112000 5.294490 0.033381 0.045528
+
+0.063139 0.066357 0.011586 0.066571 0.010800 0.009276 0.053984 0.146986 0.034214 0.088822 0.098196 0.032390 0.021263 0.072697 0.016761 0.020711 0.020797 0.025463 0.045615 0.094372;
+
+model LG4X3 =
+ 0.733336
+ 0.558955 0.597671
+ 0.503360 0.058964 5.581680
+ 4.149599 2.863355 1.279881 0.225860
+ 1.415369 2.872594 1.335650 0.434096 1.043232
+ 1.367574 0.258365 0.397108 2.292917 0.209978 4.534772
+ 1.263002 0.366868 1.840061 1.024707 0.823594 0.377181 0.496780
+ 0.994098 2.578946 5.739035 0.821921 3.039380 4.877840 0.532488 0.398817
+ 0.517204 0.358350 0.284730 0.027824 1.463390 0.370939 0.232460 0.008940 0.349195
+ 0.775054 0.672023 0.109781 0.021443 1.983693 1.298542 0.169219 0.043707 0.838324 5.102837
+ 0.763094 5.349861 1.612642 0.088850 0.397640 3.509873 0.755219 0.436013 0.888693 0.561690 0.401070
+ 1.890137 0.691594 0.466979 0.060820 2.831098 2.646440 0.379926 0.087640 0.488389 7.010411 8.929538 1.357738
+ 0.540460 0.063347 0.141582 0.018288 4.102068 0.087872 0.020447 0.064863 1.385133 3.054968 5.525874 0.043394 3.135353
+ 0.200122 0.032875 0.019509 0.042687 0.059723 0.072299 0.023282 0.036426 0.050226 0.039318 0.067505 0.023126 0.012695 0.015631
+ 4.972745 0.821562 4.670980 1.199607 5.901348 1.139018 0.503875 1.673207 0.962470 0.204155 0.273372 0.567639 0.570771 0.458799 0.233109
+ 1.825593 0.580847 1.967383 0.420710 2.034980 0.864479 0.577513 0.124068 0.502294 2.653232 0.437116 1.048288 2.319555 0.151684 0.077004 8.113282
+ 0.450842 0.661866 0.088064 0.037642 2.600668 0.390688 0.109318 0.218118 1.065585 0.564368 1.927515 0.120994 1.856122 4.154750 0.011074 0.377578 0.222293
+ 0.526135 0.265730 0.581928 0.141233 5.413080 0.322761 0.153776 0.039217 8.351808 0.854294 0.940458 0.180650 0.975427 11.429924 0.026268 0.429221 0.273138 4.731579
+ 3.839269 0.395134 0.145401 0.090101 4.193725 0.625409 0.696533 0.104335 0.377304 15.559906 2.508169 0.449074 3.404087 1.457957 0.052132 0.260296 2.903836 0.564762 0.681215
+
+ 0.062457 0.066826 0.049332 0.065270 0.006513 0.041231 0.058965 0.080852 0.028024 0.037024 0.075925 0.064131 0.019620 0.028710 0.104579 0.056388 0.062027 0.008241 0.033124 0.050760;
+
+model LG4X4 =
+ 0.658412
+ 0.566269 0.540749
+ 0.854111 0.058015 3.060574
+ 0.884454 5.851132 1.279257 0.160296
+ 1.309554 2.294145 1.438430 0.482619 0.992259
+ 1.272639 0.182966 0.431464 2.992763 0.086318 2.130054
+ 1.874713 0.684164 2.075952 1.296206 2.149634 0.571406 0.507160
+ 0.552007 3.192521 4.840271 0.841829 5.103188 4.137385 0.351381 0.679853
+ 0.227683 0.528161 0.644656 0.031467 3.775817 0.437589 0.189152 0.025780 0.665865
+ 0.581512 1.128882 0.266076 0.048542 3.954021 2.071689 0.217780 0.082005 1.266791 8.904999
+ 0.695190 3.010922 2.084975 0.132774 0.190734 2.498630 0.767361 0.326441 0.680174 0.652629 0.440178
+ 0.967985 1.012866 0.720060 0.133055 1.776095 1.763546 0.278392 0.343977 0.717301 10.091413 14.013035 1.082703
+ 0.344015 0.227296 0.291854 0.056045 4.495841 0.116381 0.092075 0.195877 4.001286 2.671718 5.069337 0.091278 4.643214
+ 0.978992 0.156635 0.028961 0.209188 0.264277 0.296578 0.177263 0.217424 0.362942 0.086367 0.539010 0.172734 0.121821 0.161015
+ 3.427163 0.878405 4.071574 0.925172 7.063879 1.033710 0.451893 3.057583 1.189259 0.359932 0.742569 0.693405 0.584083 1.531223 1.287474
+ 2.333253 0.802754 2.258357 0.360522 2.221150 1.283423 0.653836 0.377558 0.964545 4.797423 0.780580 1.422571 4.216178 0.599244 0.444362 5.231362
+ 0.154701 0.830884 0.073037 0.094591 3.017954 0.312579 0.074620 0.401252 1.350568 0.336801 1.331875 0.068958 1.677263 5.832025 0.076328 0.548763 0.208791
+ 0.221089 0.431617 1.238426 0.313945 8.558815 0.305772 0.181992 0.072258 12.869737 1.021885 1.531589 0.163829 1.575754 33.873091 0.079916 0.831890 0.307846 5.910440
+ 2.088785 0.456530 0.199728 0.118104 4.310199 0.681277 0.752277 0.241015 0.531100 23.029406 4.414850 0.481711 5.046403 1.914768 0.466823 0.382271 3.717971 0.282540 0.964421
+
+0.106471 0.074171 0.044513 0.096390 0.002148 0.066733 0.158908 0.037625 0.020691 0.014608 0.028797 0.105352 0.007864 0.007477 0.083595 0.055726 0.047711 0.003975 0.010088 0.027159;
+
+model LG4X = MIX{LG4X1,LG4X2,LG4X3,LG4X4}*R4;
+
+[ ---------------------------------------------------------
+    +cF class frequency mixture model of Wang et al. (2008)
+ --------------------------------------------------------- ]
+
+frequency Fclass1 = 0.02549352 0.01296012 0.005545202 0.006005566 0.01002193 0.01112289 0.008811948 0.001796161 0.004312188 0.2108274 0.2730413 0.01335451 0.07862202 0.03859909 0.005058205 0.008209453 0.03210019 0.002668138 0.01379098 0.2376598;
+frequency Fclass2 = 0.09596966 0.008786096 0.02805857 0.01880183 0.005026264 0.006454635 0.01582725 0.7215719 0.003379354 0.002257725 0.003013483 0.01343441 0.001511657 0.002107865 0.006751404 0.04798539 0.01141559 0.000523736 0.002188483 0.004934972;
+frequency Fclass3 = 0.01726065 0.005467988 0.01092937 0.3627871 0.001046402 0.01984758 0.5149206 0.004145081 0.002563289 0.002955213 0.005286931 0.01558693 0.002693098 0.002075771 0.003006167 0.01263069 0.01082144 0.000253451 0.001144787 0.004573568;
+frequency Fclass4 = 0.1263139 0.09564027 0.07050061 0.03316681 0.02095119 0.05473468 0.02790523 0.009007538 0.03441334 0.005855319 0.008061884 0.1078084 0.009019514 0.05018693 0.07948 0.09447839 0.09258897 0.01390669 0.05367769 0.01230413;
+frequency CF4 = FMIX{empirical,Fclass1,Fclass2,Fclass3,Fclass4};
+model JTTCF4G = JTT+FMIX{empirical,Fclass1,Fclass2,Fclass3,Fclass4}+G;
+
+[ ---------------------------------------------------------
+    CAT-C10 profile mixture model of Le, Gascuel & Lartillot (2008)
+ --------------------------------------------------------- ]
+
+frequency C10pi1 = 0.4082573125 0.0081783015 0.0096285438 0.0069870889 0.0349388179 0.0075279735 0.0097846653 0.1221613215 0.0039151830 0.0125784287 0.0158338663 0.0059670150 0.0081313216 0.0061604332 0.0394155867 0.1682450664 0.0658132542 0.0018751587 0.0041579747 0.0604426865;
+frequency C10pi2 = 0.1027763487 0.0418664491 0.0213272051 0.0155943616 0.0149663448 0.0440685478 0.0419667447 0.0138805792 0.0158864807 0.1066076641 0.1131944125 0.0436343681 0.0437800327 0.0180729309 0.0223250701 0.0529608087 0.1081741005 0.0045147205 0.0137373857 0.1606654446;
+frequency C10pi3 = 0.0351766018 0.0019678632 0.0016591476 0.0006768741 0.0078706538 0.0016559557 0.0019686768 0.0022420602 0.0012878339 0.3515819591 0.1278183107 0.0018856550 0.0242631753 0.0126221329 0.0029771559 0.0049998099 0.0255378034 0.0011907778 0.0037539283 0.3888636245;
+frequency C10pi4 = 0.0408513927 0.0269887074 0.2185648186 0.2333814790 0.0037602852 0.0380451418 0.0901238869 0.1158332065 0.0373197176 0.0025523644 0.0052164616 0.0485017266 0.0022571778 0.0025108218 0.0108333610 0.0804527209 0.0302879995 0.0010815260 0.0069890931 0.0044481118;
+frequency C10pi5 = 0.0185492661 0.0062362395 0.0024895723 0.0009775062 0.0070416514 0.0083539447 0.0024891617 0.0028952913 0.0040103982 0.1632422345 0.4443079409 0.0043570878 0.1202815687 0.0733329781 0.0048827648 0.0051642443 0.0131806647 0.0068759784 0.0144734420 0.0968580644;
+frequency C10pi6 = 0.1106750119 0.0352190043 0.0405186210 0.1636437899 0.0014834855 0.0877962201 0.2638456592 0.0325228293 0.0163803600 0.0068334902 0.0140679579 0.0677158208 0.0048988133 0.0023256777 0.0298982139 0.0562887953 0.0426922497 0.0010338979 0.0040522304 0.0181078719;
+frequency C10pi7 = 0.0522657662 0.0668294648 0.0714836849 0.0297745257 0.0143324928 0.0736540298 0.0388386669 0.0228101108 0.1551638111 0.0187406149 0.0653779932 0.0439469345 0.0207189121 0.0624033021 0.0145475497 0.0549017631 0.0370140058 0.0193756900 0.1110694548 0.0267512268;
+frequency C10pi8 = 0.0116587342 0.0050990142 0.0064011054 0.0021742457 0.0105340743 0.0040203734 0.0024251112 0.0034709143 0.0366787049 0.0187185330 0.0676489746 0.0026694717 0.0143534813 0.3650985596 0.0031159927 0.0094848536 0.0073713920 0.0509564551 0.3574858593 0.0206341497;
+frequency C10pi9 = 0.0627195947 0.2038782162 0.0428629162 0.0236193294 0.0052662886 0.1098111767 0.0686284994 0.0256174957 0.0332612124 0.0128968249 0.0305627740 0.2270839355 0.0124036991 0.0039181841 0.0140440613 0.0483152469 0.0463378087 0.0025143473 0.0065521118 0.0197062770;
+frequency C10pi10 = 0.1145518598 0.0324008908 0.0750614981 0.0416192189 0.0098549497 0.0339624663 0.0364907910 0.0503817581 0.0165233329 0.0092949460 0.0139153707 0.0423026886 0.0082240805 0.0046605982 0.0379221548 0.2610647896 0.1845829279 0.0017548981 0.0058538316 0.0195769483;
+model C10 = POISSON+G4+FMIX{C10pi1:1:0.1191344178,C10pi2:1:0.0874372456,C10pi3:1:0.1037105070,C10pi4:1:0.0922584809,C10pi5:1:0.1070492801,C10pi6:1:0.1329945166,C10pi7:1:0.0538028458,C10pi8:1:0.0691986212,C10pi9:1:0.1319937434,C10pi10:1:0.1024203429};
+model C10Opt = POISSON+G4+FMIX{C10pi1,C10pi2,C10pi3,C10pi4,C10pi5,C10pi6,C10pi7,C10pi8,C10pi9,C10pi10};
+
+[ ---------------------------------------------------------
+    CAT-C20 profile mixture model of Le, Gascuel & Lartillot (2008)
+ --------------------------------------------------------- ]
+frequency C20pi1 = 0.0862412505 0.0171943793 0.0791293376 0.0329908619 0.0130504558 0.0169046938 0.0184526503 0.0366905299 0.0108013340 0.0097907148 0.0112826424 0.0220195221 0.0087821483 0.0044155335 0.0189273201 0.3178152357 0.2711700523 0.0015317305 0.0048342853 0.0179753220 ;
+frequency C20pi2 = 0.2035582865 0.0050980810 0.0077052407 0.0031656079 0.0348667285 0.0064044073 0.0070859400 0.0195235515 0.0024392035 0.1152573291 0.0789777393 0.0042380850 0.0309187017 0.0112429356 0.0164189221 0.0496777139 0.1118946615 0.0017762569 0.0048448213 0.2849057867 ;
+frequency C20pi3 = 0.0211547413 0.0014946177 0.0012755030 0.0005492865 0.0048188557 0.0012328812 0.0014539632 0.0011430874 0.0011346394 0.3928460626 0.1250644210 0.0013579946 0.0209788805 0.0128251737 0.0020247248 0.0026240726 0.0171914121 0.0011591071 0.0036027969 0.3860677787 ;
+frequency C20pi4 = 0.0376903543 0.2885196153 0.0365411474 0.0109469400 0.0064073829 0.0893564381 0.0358365464 0.0191106776 0.0329513951 0.0101711878 0.0237495504 0.2897626974 0.0096528870 0.0036349802 0.0105337370 0.0356313768 0.0355926500 0.0027925238 0.0066557222 0.0144621902 ;
+frequency C20pi5 = 0.0084597802 0.0053589922 0.0072525884 0.0024487852 0.0084909000 0.0042781483 0.0025055486 0.0024277107 0.0433214027 0.0097713028 0.0380507037 0.0026741007 0.0080724771 0.3420463838 0.0021418673 0.0080418935 0.0055322116 0.0494840193 0.4375001561 0.0121410277 ;
+frequency C20pi6 = 0.1759898886 0.0290429175 0.0332845569 0.1301263816 0.0017558693 0.0707183953 0.2182166681 0.0409535143 0.0130708195 0.0085622087 0.0159530702 0.0542946169 0.0054045759 0.0025276980 0.0371020404 0.0793480500 0.0540083424 0.0010592104 0.0036259116 0.0249552645 ;
+frequency C20pi7 = 0.1634397322 0.0195541184 0.0438701833 0.0374272612 0.0088659891 0.0137554758 0.0220611924 0.5296717726 0.0090006141 0.0017569353 0.0061156267 0.0167117975 0.0029390787 0.0030641349 0.0126457766 0.0829342776 0.0142835614 0.0028640685 0.0032398299 0.0057985736 ;
+frequency C20pi8 = 0.0917468761 0.0265853306 0.0290699087 0.0133818895 0.0284015012 0.0255084506 0.0196875685 0.0249898794 0.0449766405 0.0583555688 0.1155009222 0.0164915955 0.0395994595 0.0998479096 0.0209916159 0.0736482742 0.0661518462 0.0246463919 0.0972327226 0.0831856483 ;
+frequency C20pi9 = 0.0646700714 0.0988015996 0.0228907308 0.0168733856 0.0077117603 0.0996414875 0.0544977962 0.0148893975 0.0313851988 0.0505983315 0.1844282999 0.0907931290 0.0774839960 0.0219148172 0.0105004469 0.0321196170 0.0411766062 0.0084303030 0.0206106035 0.0505824221 ;
+frequency C20pi10 = 0.0135993865 0.0043408375 0.0018469375 0.0007951703 0.0100090240 0.0046420778 0.0018011758 0.0026794645 0.0072401918 0.0814026713 0.3661422246 0.0025158135 0.0734965132 0.2640965246 0.0038994134 0.0043668760 0.0075248451 0.0261564898 0.0660970801 0.0573472826 ;
+frequency C20pi11 = 0.1478036236 0.0842845089 0.0726630217 0.0534743238 0.0048825808 0.0757166156 0.0727246460 0.0907725939 0.0262288856 0.0035781075 0.0126777221 0.1051660098 0.0059621792 0.0029903868 0.0156558198 0.1459903343 0.0634877444 0.0015928454 0.0050760739 0.0092719768 ;
+frequency C20pi12 = 0.0186377412 0.0042055165 0.0019865236 0.0008329696 0.0054968852 0.0065890091 0.0020248504 0.0021713483 0.0023665991 0.2020809776 0.4370381920 0.0029120653 0.1241860384 0.0385383157 0.0040672279 0.0046177381 0.0149904396 0.0026871667 0.0056324117 0.1189379840 ;
+frequency C20pi13 = 0.0477624336 0.0505742667 0.0209574273 0.0141349161 0.0075791708 0.0429296799 0.0462688073 0.0052327914 0.0165351815 0.1741496627 0.1121253570 0.0577575020 0.0330288046 0.0130691347 0.0124374733 0.0264988925 0.0951754678 0.0031660482 0.0112465746 0.2093704079 ;
+frequency C20pi14 = 0.4164189845 0.0056100821 0.0091701381 0.0045131748 0.0406937949 0.0061320495 0.0063229801 0.0946185184 0.0031057404 0.0076443223 0.0099885414 0.0038941773 0.0069323155 0.0048438356 0.0187840756 0.2360774301 0.0746274607 0.0012172579 0.0034825786 0.0459225422 ;
+frequency C20pi15 = 0.0402295888 0.0735203003 0.1036647193 0.0365523994 0.0124782975 0.0826558132 0.0372197283 0.0233618081 0.2108307125 0.0093478727 0.0360561493 0.0482410586 0.0100289536 0.0459094917 0.0098503973 0.0533383445 0.0310209005 0.0140076639 0.1064377821 0.0152480184 ;
+frequency C20pi16 = 0.0323453034 0.0236282995 0.2520448083 0.2431495959 0.0035976296 0.0330831153 0.0710274499 0.1016074562 0.0366225082 0.0031410809 0.0051980542 0.0470129351 0.0024028744 0.0024429276 0.0094837826 0.0848355278 0.0359083275 0.0008730928 0.0067247672 0.0048704638 ;
+frequency C20pi17 = 0.1476256642 0.0334506604 0.0211972524 0.0403051550 0.0032327194 0.0371554480 0.0576893391 0.0330850942 0.0146392559 0.0108267008 0.0256200793 0.0451350877 0.0058651400 0.0047177179 0.3473710507 0.0892065279 0.0485899446 0.0016358749 0.0044177191 0.0282335685 ;
+frequency C20pi18 = 0.1031448143 0.0717747663 0.0435172139 0.0386401502 0.0061762467 0.0786603123 0.0923369140 0.0202338419 0.0246761899 0.0376904275 0.0376283678 0.0921698920 0.0161883318 0.0067666433 0.0128302120 0.0951450188 0.1378566702 0.0022144738 0.0083041573 0.0740453560 ;
+frequency C20pi19 = 0.0837542823 0.0899383244 0.0518811417 0.0804870571 0.0020735078 0.1456497470 0.1947759184 0.0229030361 0.0268458796 0.0074079756 0.0190249576 0.1459287407 0.0067395241 0.0023063393 0.0085616014 0.0455739585 0.0451080843 0.0010771349 0.0049325333 0.0150302559 ;
+frequency C20pi20 = 0.0578735570 0.0138313604 0.0491421636 0.2946738942 0.0011130839 0.0598250358 0.3402102668 0.0293911435 0.0139817004 0.0030525663 0.0062611922 0.0363365043 0.0027295976 0.0017034884 0.0156106390 0.0358044639 0.0249941878 0.0008664342 0.0038312977 0.0087674229 ;
+
+[ C20 with fixed weights ]
+model C20 = POISSON+G4+FMIX{C20pi1:1:0.0559910600,C20pi2:1:0.0514824870,C20pi3:1:0.0812922124,C20pi4:1:0.0721976867,C20pi5:1:0.0556718858,C20pi6:1:0.0331003080,C20pi7:1:0.0589501763,C20pi8:1:0.0263756889,C20pi9:1:0.0307584220,C20pi10:1:0.0376701125,C20pi11:1:0.0303058290,C20pi12:1:0.0808775576,C20pi13:1:0.0263349134,C20pi14:1:0.0579101455,C20pi15:1:0.0371248064,C20pi16:1:0.0586867766,C20pi17:1:0.0561479138,C20pi18:1:0.0349810886,C20pi19:1:0.0544937394,C20pi20:1:0.0596471901};
+[ C20 with weights to be optimized ]
+model C20Opt = POISSON+G4+FMIX{C20pi1,C20pi2,C20pi3,C20pi4,C20pi5,C20pi6,C20pi7,C20pi8,C20pi9,C20pi10,C20pi11,C20pi12,C20pi13,C20pi14,C20pi15,C20pi16,C20pi17,C20pi18,C20pi19,C20pi20};
+
+model C20Test = POISSON+G4+FMIX{C20pi1:1:0.089485,C20pi2:1:0.021281,C20pi3:1:0.119676,C20pi4:1:0.080933,C20pi5:1:0.064054,C20pi6:1:0.021848,C20pi7:1:0.063392,C20pi8:1:0.003629,C20pi9:1:0.007174,C20pi10:1:0.006256,C20pi11:1:0.023424,C20pi12:1:0.086825,C20pi13:1:0.038495,C20pi14:1:0.090028,C20pi15:1:0.020025,C20pi16:1:0.043484,C20pi17:1:0.076864,C20pi18:1:0.031347,C20pi19:1:0.047749,C20pi20:1:0.064031};
+
+[ ---------------------------------------------------------
+    CAT-C30 profile mixture model of Le, Gascuel & Lartillot (2008)
+ --------------------------------------------------------- ]
+frequency C30pi1 = 0.1100453954 0.0171294861 0.0640338464 0.1595411459 0.0019047235 0.0310187088 0.1098958823 0.0684301540 0.0137950707 0.0026283074 0.0073396531 0.0358553674 0.0024706414 0.0016629473 0.1669356820 0.1381790473 0.0568342547 0.0004661120 0.0035970152 0.0082365591;
+frequency C30pi2 = 0.0874125465 0.0806320385 0.0382152368 0.0326119879 0.0049826376 0.0798168854 0.0951700809 0.0144042708 0.0210626652 0.0399884450 0.0301585074 0.1147200015 0.0126488911 0.0048996596 0.0137397028 0.0873769666 0.1558616621 0.0015122843 0.0053974463 0.0793880836;
+frequency C30pi3 = 0.0225477414 0.0014900535 0.0013034594 0.0005959279 0.0050018158 0.0011436556 0.0015030529 0.0011570953 0.0009374322 0.3944689167 0.0889573138 0.0013600872 0.0189102669 0.0089216031 0.0018312028 0.0028336408 0.0189813395 0.0006693746 0.0023303726 0.4250556480;
+frequency C30pi4 = 0.0602158209 0.0136833299 0.0414987935 0.2900084105 0.0009525462 0.0621611083 0.3610869026 0.0281925621 0.0130500799 0.0030516237 0.0060401889 0.0352704692 0.0027460635 0.0014625624 0.0127175499 0.0318109377 0.0225279521 0.0007948027 0.0034024563 0.0093258397;
+frequency C30pi5 = 0.0101223637 0.0028344920 0.0012928910 0.0006379191 0.0085989355 0.0035028551 0.0011249625 0.0024085229 0.0047753376 0.0701153131 0.4135913903 0.0016748492 0.0744862631 0.2785384406 0.0040466582 0.0037087155 0.0052379329 0.0200222636 0.0523938808 0.0408860135;
+frequency C30pi6 = 0.1335831781 0.0284789590 0.0213891629 0.1125775537 0.0010514541 0.0565844323 0.2099572968 0.0207551870 0.0121330488 0.0073526522 0.0133278240 0.0771772013 0.0030571689 0.0016793592 0.1890195131 0.0484054108 0.0373318180 0.0009266995 0.0026946425 0.0225174379;
+frequency C30pi7 = 0.0408277374 0.0124491768 0.0080464869 0.0030634898 0.0153918410 0.0102922098 0.0066010880 0.0058113137 0.0245211764 0.1487514547 0.1637802160 0.0075923232 0.0385527359 0.1575049888 0.0058352224 0.0151578617 0.0332220362 0.0264937109 0.1213342989 0.1547706314;
+frequency C30pi8 = 0.2469059247 0.0106278945 0.0168929681 0.0027418266 0.1039406309 0.0103988197 0.0054944756 0.0373263209 0.0085752319 0.0292403793 0.0535091180 0.0056123053 0.0302246485 0.0251775640 0.0078098946 0.1642352274 0.1239889705 0.0053155877 0.0163953993 0.0955868125;
+frequency C30pi9 = 0.0549428629 0.1305426495 0.0202957532 0.0092915274 0.0099280995 0.0906036344 0.0417085054 0.0105563869 0.0363512470 0.0569584863 0.1681833183 0.1152521806 0.0592328363 0.0243860149 0.0083055411 0.0283778833 0.0412594019 0.0096355359 0.0249780472 0.0592100878;
+frequency C30pi10 = 0.0462773303 0.0362984274 0.0412365193 0.0182504174 0.0172727117 0.0348990852 0.0224266258 0.0160971397 0.1357852215 0.0164966886 0.0598936127 0.0239396241 0.0164507129 0.1336320854 0.0117413009 0.0454156401 0.0304387749 0.0330338410 0.2350163763 0.0253978649;
+frequency C30pi11 = 0.0474379955 0.0410179935 0.0222453982 0.0112116958 0.0082332447 0.0374051414 0.0388100853 0.0055998598 0.0149156570 0.1832173840 0.1100691114 0.0467850545 0.0356443791 0.0116643783 0.0100244663 0.0317171100 0.1114352326 0.0026685586 0.0099660086 0.2199312452;
+frequency C30pi12 = 0.0213607696 0.0069976154 0.0039878996 0.0012941246 0.0061024858 0.0139566033 0.0036297282 0.0030017014 0.0038425894 0.1309465785 0.4566988203 0.0054567760 0.1947837355 0.0371808169 0.0040747282 0.0076991487 0.0198018718 0.0034086391 0.0064545692 0.0693207986;
+frequency C30pi13 = 0.0919632044 0.0160004872 0.0764682386 0.0306717360 0.0117031014 0.0160060006 0.0171907654 0.0370684649 0.0100792697 0.0093123713 0.0097240970 0.0205385908 0.0075767282 0.0041589440 0.0179686194 0.3254471625 0.2744377258 0.0013887442 0.0044739725 0.0178217761;
+frequency C30pi14 = 0.4649246103 0.0043013249 0.0075304815 0.0050731691 0.0233328752 0.0043571322 0.0057994247 0.1495242047 0.0023298425 0.0043361190 0.0055995530 0.0028525398 0.0039313170 0.0025588185 0.0186467246 0.2150194771 0.0477030158 0.0009038096 0.0020087184 0.0292668421;
+frequency C30pi15 = 0.2051329382 0.0439661329 0.0339418395 0.1070980865 0.0020915940 0.0822742346 0.1989733497 0.0487574293 0.0127143076 0.0058124693 0.0133471767 0.0667787412 0.0043783406 0.0018235059 0.0110997761 0.0873961609 0.0519781961 0.0007361603 0.0023821404 0.0193174204;
+frequency C30pi16 = 0.0263689890 0.0133613622 0.2727158135 0.3117715371 0.0039462429 0.0218978778 0.0694354212 0.0799842408 0.0309615130 0.0027521242 0.0038579661 0.0288630708 0.0018363656 0.0023351927 0.0062457560 0.0798729385 0.0324143174 0.0007229656 0.0063857732 0.0042705326;
+frequency C30pi17 = 0.1526502637 0.0332784464 0.0168229991 0.0237392180 0.0040215287 0.0341733672 0.0377949108 0.0306214335 0.0141929803 0.0123317972 0.0290062362 0.0375543022 0.0064473224 0.0058584416 0.3864504800 0.0880336410 0.0489543188 0.0018252558 0.0048877798 0.0313552773;
+frequency C30pi18 = 0.0080247558 0.0017408595 0.0006327403 0.0003385965 0.0023412143 0.0015507896 0.0007818945 0.0005403825 0.0010026402 0.3177056649 0.3737894172 0.0012598254 0.0488212345 0.0311968471 0.0020687549 0.0012095129 0.0065696791 0.0016309208 0.0043343553 0.1944599147;
+frequency C30pi19 = 0.0599950319 0.1000540567 0.1334918892 0.0889730776 0.0016884984 0.0864856169 0.0962700957 0.0588796388 0.0327277145 0.0021467269 0.0070876372 0.1825860579 0.0033979446 0.0011800742 0.0141408084 0.0779002375 0.0448817374 0.0006249028 0.0032641120 0.0042241415;
+frequency C30pi20 = 0.0393520657 0.0838170642 0.1425481600 0.0431197671 0.0099071945 0.1019786610 0.0394639510 0.0282866471 0.2095718357 0.0076101442 0.0258339558 0.0596434088 0.0084586675 0.0188680789 0.0096840517 0.0624998643 0.0347087967 0.0054645779 0.0564145251 0.0127685828;
+frequency C30pi21 = 0.0072715487 0.0140998918 0.0019756795 0.0027603830 0.0067852535 0.0043339290 0.0025069369 0.0080834718 0.0113217919 0.0056609640 0.0394199644 0.0017735096 0.0079866080 0.1271475634 0.0041098092 0.0052244365 0.0043022271 0.6273570153 0.1084563767 0.0094226397;
+frequency C30pi22 = 0.0907070068 0.0290062335 0.0860677696 0.0745872716 0.0063699858 0.0259377035 0.0386802115 0.4750046194 0.0168090013 0.0014721054 0.0055149849 0.0343855535 0.0024692074 0.0028859215 0.0112150781 0.0731110371 0.0153705714 0.0022914775 0.0041860660 0.0039281943;
+frequency C30pi23 = 0.0055291882 0.0024626303 0.0046086594 0.0011413426 0.0072105915 0.0022692184 0.0009683043 0.0016070950 0.0325831191 0.0082918400 0.0353677882 0.0013849437 0.0074486804 0.3744093753 0.0013374573 0.0057402692 0.0037279636 0.0330334445 0.4609978298 0.0098802591;
+frequency C30pi24 = 0.2443263138 0.0045386562 0.0062422652 0.0031590902 0.0273880205 0.0053593950 0.0076715636 0.0196089609 0.0020189401 0.1017435067 0.0468424225 0.0045492259 0.0201286022 0.0060619450 0.0185219126 0.0497753825 0.1170795523 0.0009577255 0.0035333687 0.3104931504;
+frequency C30pi25 = 0.0863111274 0.0984811895 0.0313963115 0.0600902926 0.0024419845 0.1672351286 0.2036096150 0.0175221435 0.0245245046 0.0105994220 0.0271209781 0.1485789590 0.0095824358 0.0029393105 0.0068276769 0.0347800318 0.0408210979 0.0014001253 0.0055105388 0.0202271268;
+frequency C30pi26 = 0.0643926114 0.0369048739 0.1031213278 0.1628208462 0.0023165895 0.0752534859 0.1762701353 0.0297139006 0.0303503732 0.0088163033 0.0148016812 0.0727140107 0.0056748403 0.0043066715 0.0099270322 0.0926433867 0.0833129915 0.0011237109 0.0093801464 0.0161550816;
+frequency C30pi27 = 0.1736682858 0.0943628709 0.0520404980 0.0285984935 0.0083596568 0.0722446698 0.0483894060 0.0781901497 0.0266134684 0.0068641911 0.0219499324 0.0964011794 0.0112303313 0.0058273974 0.0169661076 0.1547802460 0.0751701930 0.0028774511 0.0082130397 0.0172524320;
+frequency C30pi28 = 0.0347856579 0.3075984538 0.0314157384 0.0092355245 0.0062754891 0.0861073155 0.0323568406 0.0170288127 0.0306438905 0.0091932292 0.0224428556 0.3020845818 0.0093720833 0.0034303536 0.0104447169 0.0326882932 0.0328713449 0.0025244855 0.0064171317 0.0130832013;
+frequency C30pi29 = 0.1087737102 0.0051781020 0.0032679768 0.0015823203 0.0247877480 0.0057932006 0.0041769888 0.0134703172 0.0024765788 0.1643462917 0.2337152707 0.0027000391 0.0539213396 0.0316523420 0.0154886946 0.0188187787 0.0474912345 0.0037656478 0.0073106362 0.2512827825;
+frequency C30pi30 = 0.1101008748 0.0324324597 0.0435098681 0.0579268520 0.0072699765 0.0615196630 0.0828181488 0.0314463068 0.0308557019 0.0530865813 0.1096787834 0.0293860426 0.0458728977 0.0269153699 0.0296430687 0.0715887866 0.0685882454 0.0062324120 0.0257237601 0.0754042006;
+model C30 = POISSON+G4+FMIX{C30pi1:1:0.0095783264,C30pi2:1:0.0248476365,C30pi3:1:0.0636309366,C30pi4:1:0.0537939225,C30pi5:1:0.0295885587,C30pi6:1:0.0117587936,C30pi7:1:0.0132013428,C30pi8:1:0.0236868805,C30pi9:1:0.0261687659,C30pi10:1:0.0239821974,C30pi11:1:0.0257100906,C30pi12:1:0.0465072425,C30pi13:1:0.0546794546,C30pi14:1:0.0536085131,C30pi15:1:0.0270622670,C30pi16:1:0.0403913593,C30pi17:1:0.0474212700,C30pi18:1:0.0458816478,C30pi19:1:0.0214036510,C30pi20:1:0.0290385981,C30pi21:1:0.0 [...]
+
+[ ---------------------------------------------------------
+    CAT-C40 profile mixture model of Le, Gascuel & Lartillot (2008)
+ --------------------------------------------------------- ]
+frequency C40pi1 = 0.0660259814 0.0231861755 0.1599815873 0.1054473175 0.0056586745 0.0273928499 0.0440360794 0.0711238664 0.0168194755 0.0039088727 0.0055316013 0.0366689617 0.0037412416 0.0013104807 0.0176359169 0.2497687201 0.1507079582 0.0006723214 0.0038290224 0.0065528958;
+frequency C40pi2 = 0.0232377444 0.0122683027 0.2759650991 0.3532087982 0.0037987468 0.0197339134 0.0739378219 0.0576668030 0.0315866952 0.0031092806 0.0038711609 0.0259363304 0.0017355634 0.0024032103 0.0063116881 0.0657067704 0.0270483653 0.0007602894 0.0069602476 0.0047531689;
+frequency C40pi3 = 0.0166486809 0.0012594763 0.0012622242 0.0005651446 0.0036665719 0.0010669784 0.0013356251 0.0008894749 0.0008231853 0.4129367561 0.0884689295 0.0011904105 0.0186054583 0.0082775676 0.0014029981 0.0021339439 0.0162167380 0.0006082049 0.0019553200 0.4206863114;
+frequency C40pi4 = 0.2394741986 0.0072901253 0.0120536943 0.0044741726 0.0283811727 0.0086558850 0.0105529632 0.0135109628 0.0038929844 0.0765957115 0.0358494908 0.0071093014 0.0199496319 0.0055991131 0.0114265585 0.0847798773 0.1797284519 0.0009838000 0.0042240671 0.2454678377;
+frequency C40pi5 = 0.1194613086 0.0233255669 0.0294552140 0.0134272792 0.0150526644 0.0301537796 0.0192173037 0.0337675998 0.0214746045 0.0579001821 0.1446308373 0.0147261337 0.0561242940 0.0550467421 0.0631355418 0.0925266727 0.0831230185 0.0131636136 0.0331118002 0.0811758434;
+frequency C40pi6 = 0.0567043710 0.0117359330 0.0364734454 0.2955500969 0.0008924801 0.0609516515 0.3795154126 0.0230469606 0.0118360971 0.0031182036 0.0060137466 0.0314205689 0.0028584065 0.0012972333 0.0124745819 0.0300334889 0.0227051137 0.0007738758 0.0031343761 0.0094639563;
+frequency C40pi7 = 0.0179027412 0.0040967133 0.0035697688 0.0008870412 0.0160760340 0.0045395474 0.0023182113 0.0039829808 0.0127292680 0.0404650518 0.1676143477 0.0027994718 0.0424172255 0.3344862590 0.0020115128 0.0075841581 0.0068227293 0.0518381385 0.2452542553 0.0326045442;
+frequency C40pi8 = 0.2712170094 0.0056480837 0.0141045260 0.0021017036 0.2003830179 0.0048264059 0.0023229984 0.0502501222 0.0053727960 0.0150684657 0.0330003443 0.0020646283 0.0154811217 0.0202990358 0.0045351023 0.1764198412 0.0839578061 0.0046265242 0.0141271048 0.0741933626;
+frequency C40pi9 = 0.0894736584 0.1040026384 0.0190192153 0.0272183085 0.0045538316 0.1168091917 0.1275076663 0.0115685734 0.0215746293 0.0469424171 0.0512035100 0.1382047308 0.0147656854 0.0056590176 0.0095546504 0.0383953611 0.0836652641 0.0017079427 0.0062181292 0.0819555787;
+frequency C40pi10 = 0.0495441385 0.0375345822 0.0315863530 0.0143641284 0.0182505609 0.0316504100 0.0215379122 0.0140199913 0.1108543799 0.0247065801 0.0700287927 0.0258142032 0.0188271760 0.1418048822 0.0112101202 0.0456094427 0.0361427973 0.0371985427 0.2223972375 0.0369177689;
+frequency C40pi11 = 0.1704314254 0.0415784004 0.0271109259 0.1098556600 0.0009747331 0.0917299929 0.2536458944 0.0249846466 0.0101389736 0.0058749399 0.0116526350 0.0903324267 0.0036512738 0.0013321301 0.0293613681 0.0561765645 0.0479045729 0.0006696817 0.0022637316 0.0203300232;
+frequency C40pi12 = 0.0162725399 0.0054826071 0.0021876158 0.0010182101 0.0050614097 0.0104414465 0.0025141347 0.0021935389 0.0029914328 0.1328173512 0.4904441779 0.0040120394 0.1929931280 0.0376245580 0.0034333187 0.0040122105 0.0127074428 0.0032107554 0.0058100621 0.0647720205;
+frequency C40pi13 = 0.0823765743 0.0734226431 0.0598389731 0.0311745159 0.0065694304 0.0686451074 0.0675530778 0.0178961594 0.0251143622 0.0291161743 0.0287904106 0.0982301674 0.0168022878 0.0064717899 0.0114044922 0.1302995288 0.1820374273 0.0022724618 0.0079573279 0.0540270885;
+frequency C40pi14 = 0.3594965940 0.0072407229 0.0033421456 0.0031484357 0.0251417178 0.0049014279 0.0064962700 0.1194682267 0.0022970448 0.0458766662 0.0468053893 0.0050168849 0.0215568816 0.0092020461 0.0443915884 0.0465270945 0.0477755293 0.0024540215 0.0046450361 0.1942162766;
+frequency C40pi15 = 0.2015583874 0.0430161610 0.0425386444 0.0954149893 0.0032365302 0.0772010857 0.1534908791 0.0667291678 0.0155218808 0.0067740832 0.0165114429 0.0547322644 0.0060162992 0.0025643300 0.0091970560 0.1185981804 0.0625472744 0.0009565508 0.0031150007 0.0202797924;
+frequency C40pi16 = 0.1042731047 0.0147062345 0.0621645800 0.2424069523 0.0022450116 0.0356498946 0.1774821588 0.1697819523 0.0132648834 0.0018929517 0.0042542620 0.0220651981 0.0016441234 0.0012570256 0.0317041583 0.0778636230 0.0288515782 0.0006930898 0.0017741945 0.0060250231;
+frequency C40pi17 = 0.0781183281 0.0111498472 0.0159270309 0.0041541669 0.0194448667 0.0240151620 0.0116633921 0.0111524105 0.0063589385 0.1354530457 0.2457574952 0.0093729846 0.1087781166 0.0262793949 0.0055294038 0.0408518858 0.0860514305 0.0031547586 0.0085108496 0.1482764918;
+frequency C40pi18 = 0.0856592432 0.0101233167 0.0441923073 0.0135061568 0.0136072878 0.0092590642 0.0078602552 0.0245400880 0.0055379075 0.0100591561 0.0103343559 0.0127318506 0.0080675803 0.0047153035 0.0175273997 0.3406479487 0.3573294650 0.0014243098 0.0035099810 0.0193670227;
+frequency C40pi19 = 0.0674594695 0.1161734658 0.1163107783 0.0662588409 0.0021634231 0.0939360452 0.0865501280 0.0368556575 0.0381149118 0.0033238825 0.0093839985 0.1899736999 0.0039487389 0.0018212730 0.0151207830 0.0842204423 0.0565953680 0.0007187305 0.0046189437 0.0064514195;
+frequency C40pi20 = 0.0572262322 0.0494723554 0.1083882793 0.1793932771 0.0015301521 0.0903668522 0.1992261265 0.0316472274 0.0291392067 0.0045804559 0.0100739563 0.1015624916 0.0040204606 0.0013701849 0.0063674130 0.0621142922 0.0496102162 0.0006669285 0.0046497641 0.0085941279;
+frequency C40pi21 = 0.0036020163 0.0102712927 0.0013455508 0.0020871647 0.0045484804 0.0032718114 0.0017857730 0.0056391633 0.0064968790 0.0029292916 0.0232635081 0.0010419846 0.0044592278 0.0855714596 0.0024991984 0.0030671803 0.0025900250 0.7617821954 0.0678809532 0.0058668443;
+frequency C40pi22 = 0.2032018418 0.0083895722 0.0143743754 0.0135011707 0.0098131618 0.0044514580 0.0083818173 0.6184886075 0.0027747899 0.0011828492 0.0039826789 0.0044598895 0.0020631785 0.0019619615 0.0085870399 0.0739919851 0.0108922273 0.0018606145 0.0015638674 0.0060769136;
+frequency C40pi23 = 0.0050898779 0.0028740788 0.0057092962 0.0016126151 0.0061776450 0.0024693148 0.0012040415 0.0016334183 0.0393460780 0.0059088776 0.0249343597 0.0013713662 0.0049795162 0.3563126947 0.0014136424 0.0059527667 0.0036536770 0.0357987380 0.4853645852 0.0081934106;
+frequency C40pi24 = 0.0403335679 0.0540186397 0.0216052457 0.0098218598 0.0081549541 0.0383639077 0.0375406578 0.0047934404 0.0176735565 0.1893424159 0.1051859862 0.0607377395 0.0305599836 0.0119140782 0.0077550551 0.0257110173 0.1009913165 0.0028780020 0.0115276935 0.2210908828;
+frequency C40pi25 = 0.0790086293 0.1065441152 0.0309384274 0.0546012394 0.0024947877 0.1843375981 0.1997882784 0.0192655847 0.0270700474 0.0075667489 0.0254542392 0.1553108816 0.0098024439 0.0023773444 0.0056640684 0.0332370813 0.0359574739 0.0011682801 0.0048820809 0.0145306498;
+frequency C40pi26 = 0.0722240672 0.0489728405 0.0678929607 0.1194883992 0.0064755348 0.0708969573 0.1345886574 0.0287815397 0.0699011334 0.0173588702 0.0519870084 0.0490341790 0.0154411043 0.0348233029 0.0145597486 0.0589579876 0.0425972780 0.0087913770 0.0554386705 0.0317883834;
+frequency C40pi27 = 0.1085842431 0.0206450023 0.0441956285 0.1529666596 0.0012502570 0.0405398136 0.1664851192 0.0336098469 0.0134902179 0.0038821795 0.0089861440 0.0576227094 0.0024339036 0.0014553522 0.1990095021 0.0846749753 0.0454715217 0.0005902831 0.0027650162 0.0113416246;
+frequency C40pi28 = 0.0309526387 0.3195887318 0.0301336637 0.0082352132 0.0065593963 0.0832608108 0.0291974083 0.0154206187 0.0310385092 0.0098251607 0.0237900204 0.3062634996 0.0097071728 0.0036891639 0.0095029109 0.0295285439 0.0303052301 0.0028125285 0.0068850639 0.0133037148;
+frequency C40pi29 = 0.0098953741 0.0019604525 0.0007307935 0.0003748228 0.0028276741 0.0017337004 0.0009182100 0.0006997068 0.0010419482 0.3115040359 0.3750387796 0.0013960508 0.0474451070 0.0298607430 0.0025296256 0.0014628019 0.0075738968 0.0016799771 0.0040259930 0.1973003069;
+frequency C40pi30 = 0.1163213921 0.0273321006 0.0250163656 0.0731917718 0.0034792282 0.0586677248 0.1380880502 0.0193193469 0.0160240740 0.0712243431 0.0771473538 0.0355120487 0.0242841072 0.0094117688 0.0508926833 0.0475560280 0.0726552233 0.0026892716 0.0076166020 0.1235705162;
+frequency C40pi31 = 0.1285218235 0.0373073487 0.1179844215 0.0402749992 0.0172928883 0.0439706110 0.0250692272 0.1127033137 0.0606981059 0.0109350265 0.0258415767 0.0288749652 0.0167592956 0.0199118302 0.0180674983 0.1741489481 0.0648967655 0.0063574951 0.0321771650 0.0182066946;
+frequency C40pi32 = 0.0372286941 0.0094528028 0.0053377315 0.0023703173 0.0144940088 0.0079097138 0.0048585146 0.0046433943 0.0186795102 0.1820459527 0.1780099317 0.0058198481 0.0371334296 0.1463772419 0.0048538601 0.0103570678 0.0284161577 0.0211293603 0.0958905187 0.1849919442;
+frequency C40pi33 = 0.0535643726 0.1159797757 0.0239172676 0.0113537364 0.0096256227 0.0928585070 0.0391699080 0.0120279334 0.0384887950 0.0522748270 0.1892392595 0.0996037748 0.0712219098 0.0264213736 0.0083720574 0.0299114019 0.0389484845 0.0104232046 0.0265030050 0.0500947835;
+frequency C40pi34 = 0.1332424803 0.0033147683 0.0022704992 0.0012739239 0.0246514263 0.0030843469 0.0040461524 0.0089139209 0.0015864680 0.1971284995 0.1251288442 0.0023713225 0.0286947200 0.0156995251 0.0118845743 0.0171461828 0.0563298009 0.0017341820 0.0048778410 0.3566205216;
+frequency C40pi35 = 0.1498658185 0.0326607222 0.0176452820 0.0280354786 0.0035437399 0.0348151308 0.0435380704 0.0311112643 0.0140625707 0.0101953314 0.0251433928 0.0393124980 0.0051548319 0.0047533945 0.3923800449 0.0874496981 0.0473306717 0.0015215239 0.0043208299 0.0271597054;
+frequency C40pi36 = 0.4214366359 0.0061425967 0.0121590498 0.0073305074 0.0187609694 0.0072748556 0.0086837775 0.0902333103 0.0030262044 0.0039362777 0.0047193320 0.0051508681 0.0038306586 0.0027156136 0.0208940236 0.2901188793 0.0651922314 0.0008108235 0.0023622848 0.0252211004;
+frequency C40pi37 = 0.1770713890 0.1332782050 0.0311656783 0.0226500225 0.0078348946 0.0752471493 0.0509767242 0.0897389513 0.0220667143 0.0059519850 0.0205369728 0.1257689326 0.0092982479 0.0040514178 0.0264087912 0.1169591448 0.0565566955 0.0029947127 0.0049346701 0.0165087010;
+frequency C40pi38 = 0.0293984032 0.0370901720 0.1483622633 0.1099709900 0.0031729093 0.0388688450 0.0464270335 0.4222420155 0.0272494642 0.0007997326 0.0037634298 0.0622314461 0.0016657052 0.0015039626 0.0056481827 0.0472252404 0.0086568982 0.0009176022 0.0027693124 0.0020363920;
+frequency C40pi39 = 0.0265779317 0.0791104753 0.1318603134 0.0280314140 0.0101369144 0.0989710810 0.0269057233 0.0173376629 0.2815133703 0.0064646977 0.0268210053 0.0474749135 0.0072375268 0.0276960902 0.0083014995 0.0426276702 0.0259042511 0.0078528946 0.0891598394 0.0100147256;
+frequency C40pi40 = 0.0096096503 0.0027136180 0.0013104432 0.0006331856 0.0077301682 0.0033899420 0.0010471898 0.0020227436 0.0039001415 0.0733098005 0.4451691588 0.0014931484 0.0732575295 0.2630171690 0.0042768091 0.0036117358 0.0057928403 0.0181275729 0.0370698053 0.0425173480;
+model C40 = POISSON+G4+FMIX{C40pi1:1:0.0223853788,C40pi2:1:0.0338891820,C40pi3:1:0.0577169375,C40pi4:1:0.0252416233,C40pi5:1:0.0108607921,C40pi6:1:0.0462373793,C40pi7:1:0.0102293175,C40pi8:1:0.0147523625,C40pi9:1:0.0143161352,C40pi10:1:0.0182302541,C40pi11:1:0.0204025079,C40pi12:1:0.0425505156,C40pi13:1:0.0248627269,C40pi14:1:0.0105892988,C40pi15:1:0.0188238725,C40pi16:1:0.0086663445,C40pi17:1:0.0148496147,C40pi18:1:0.0343037402,C40pi19:1:0.0225335203,C40pi20:1:0.0174068578,C40pi21:1:0.0 [...]
+
+[ ---------------------------------------------------------
+    CAT-C50 profile mixture model of Le, Gascuel & Lartillot (2008)
+ --------------------------------------------------------- ]
+frequency C50pi1 = 0.1357566757 0.0328511938 0.0937692919 0.0757182069 0.0041887049 0.0448010470 0.0572805366 0.1210866186 0.0167465028 0.0049719235 0.0113823284 0.0458096069 0.0064563157 0.0029292810 0.0228705187 0.2060115780 0.1011347978 0.0012443033 0.0056104605 0.0093801079;
+frequency C50pi2 = 0.0530862751 0.1905936010 0.0595772279 0.0320970468 0.0026608079 0.1152605895 0.0840617877 0.0196495178 0.0274729775 0.0064919200 0.0158709120 0.2635539775 0.0078171228 0.0017231166 0.0121639300 0.0449347664 0.0472425608 0.0008407188 0.0037608716 0.0111402722;
+frequency C50pi3 = 0.0083279799 0.0007172026 0.0006359642 0.0003134388 0.0020547407 0.0007351595 0.0005373710 0.0005576905 0.0004858721 0.4370910601 0.1208722220 0.0006394909 0.0195499664 0.0090175268 0.0007265254 0.0007876194 0.0057076665 0.0006453449 0.0016797264 0.3889174318;
+frequency C50pi4 = 0.2072868350 0.0166858699 0.0129177658 0.0020625574 0.0849982226 0.0151757635 0.0065903656 0.0472047575 0.0130289256 0.0345690755 0.1042722764 0.0075861385 0.0498042308 0.0572909747 0.0064928361 0.1183618036 0.0780339514 0.0128352368 0.0323576924 0.0924447209;
+frequency C50pi5 = 0.0364181183 0.0076427099 0.0052725527 0.0020389950 0.0171009943 0.0064088232 0.0042399368 0.0053824238 0.0198596156 0.1361523026 0.1651892915 0.0045481616 0.0387479055 0.2025922657 0.0055053348 0.0121111950 0.0254621828 0.0327580458 0.1368025306 0.1357666147;
+frequency C50pi6 = 0.0535489196 0.0099543365 0.0269073208 0.3076150732 0.0007101021 0.0574988641 0.4066173371 0.0204537673 0.0096286483 0.0025879708 0.0049721459 0.0280989086 0.0025143457 0.0010618006 0.0124317994 0.0247246015 0.0191107367 0.0006385967 0.0024132214 0.0085115039;
+frequency C50pi7 = 0.0074733729 0.0025226602 0.0033967505 0.0005574007 0.0081158286 0.0037658904 0.0013610444 0.0022017759 0.0115142679 0.0195730439 0.1268878488 0.0018497296 0.0269141680 0.3821985941 0.0019970421 0.0057127939 0.0039692337 0.0553575998 0.3184099394 0.0162210153;
+frequency C50pi8 = 0.2615592974 0.0027098854 0.0124908261 0.0020153852 0.2740228527 0.0017043893 0.0007667803 0.0463498030 0.0019474361 0.0082858275 0.0147048711 0.0010787235 0.0063051368 0.0062080862 0.0039442437 0.1940042648 0.0963699489 0.0016185483 0.0048431386 0.0590705550;
+frequency C50pi9 = 0.1190557043 0.0956320251 0.0215995297 0.0378323341 0.0041536088 0.1151348174 0.1337084452 0.0179375220 0.0216767047 0.0336228770 0.0557402194 0.1132452331 0.0178407325 0.0063405927 0.0147606946 0.0478666925 0.0712091035 0.0022867238 0.0075728630 0.0627835766;
+frequency C50pi10 = 0.0505010344 0.0281381134 0.0341872191 0.0178157543 0.0183140005 0.0271729546 0.0212018661 0.0176052654 0.1190104107 0.0161645217 0.0561232531 0.0203908848 0.0146521042 0.1553484132 0.0135251600 0.0478959652 0.0292963208 0.0376058633 0.2477283800 0.0273225153;
+frequency C50pi11 = 0.1239446910 0.0355525870 0.0409769096 0.1479953346 0.0011563976 0.0908869312 0.2700270273 0.0283589709 0.0126760201 0.0064825033 0.0122101302 0.0787433823 0.0042467440 0.0016540857 0.0205717500 0.0552940245 0.0474239965 0.0008596621 0.0027823209 0.0181565313;
+frequency C50pi12 = 0.0160542063 0.0027359185 0.0014708079 0.0007004900 0.0034820152 0.0061470051 0.0016359686 0.0022137927 0.0013207229 0.1640035117 0.4616043506 0.0021342205 0.2174099502 0.0143751693 0.0013694259 0.0037614383 0.0172651408 0.0011454338 0.0019438536 0.0792265779;
+frequency C50pi13 = 0.1548192401 0.0131324559 0.0280584102 0.0095301620 0.0166267416 0.0175228950 0.0170969133 0.0179616718 0.0078385586 0.0865181208 0.0523369910 0.0132802182 0.0326348210 0.0083511229 0.0145594414 0.1096327081 0.2218108602 0.0015829972 0.0062173360 0.1704883347;
+frequency C50pi14 = 0.2950313592 0.0027580697 0.0021616268 0.0015364190 0.0375439186 0.0028808733 0.0042976283 0.0261726702 0.0008294969 0.0834938143 0.0553606311 0.0022642314 0.0181259911 0.0074433078 0.0126794048 0.0382913338 0.0783205173 0.0010015148 0.0034016419 0.3264055498;
+frequency C50pi15 = 0.1683177099 0.0820396152 0.0526048706 0.0822517150 0.0023029997 0.0969341246 0.1488943001 0.0535291188 0.0179803231 0.0032503636 0.0114941086 0.1156402642 0.0039439899 0.0015002945 0.0066854154 0.0924511658 0.0480769504 0.0006152103 0.0025022919 0.0089851683;
+frequency C50pi16 = 0.0334088176 0.0134485791 0.1590918150 0.3657542471 0.0025127086 0.0327665151 0.1820739351 0.0740807194 0.0202010901 0.0016650025 0.0036700956 0.0295517886 0.0017087810 0.0011422805 0.0073155123 0.0426788071 0.0211162106 0.0005931485 0.0034724580 0.0037474882;
+frequency C50pi17 = 0.0777586977 0.0174438357 0.0053423343 0.0043431532 0.0062523949 0.0220851281 0.0161769285 0.0053903202 0.0080675581 0.1052945216 0.1617365895 0.0148319919 0.0288253912 0.0168985297 0.2565426868 0.0202089662 0.0542929694 0.0060146095 0.0078109966 0.1646823969;
+frequency C50pi18 = 0.0727013979 0.0048977192 0.0026095383 0.0011420120 0.0198747408 0.0066949336 0.0030401434 0.0079074845 0.0026492900 0.1685788878 0.3185489163 0.0026024909 0.0735597038 0.0490419983 0.0051699104 0.0128630830 0.0305356924 0.0050857840 0.0095279173 0.2029683559;
+frequency C50pi19 = 0.0658153836 0.0833432992 0.0224582275 0.0107735824 0.0092974677 0.0745951987 0.0299754097 0.0146336557 0.0148026634 0.0671888719 0.2198675990 0.0868172087 0.1084156835 0.0155812696 0.0071132147 0.0381451947 0.0562948237 0.0056421684 0.0102813038 0.0589577740;
+frequency C50pi20 = 0.0525278351 0.0364897390 0.0903013988 0.1854660991 0.0037795400 0.0776857292 0.1789287290 0.0232011648 0.0687702011 0.0135825419 0.0337350646 0.0458143770 0.0108457797 0.0191020037 0.0088729983 0.0495289201 0.0389358438 0.0046292762 0.0354195947 0.0223831639;
+frequency C50pi21 = 0.0026515970 0.0080885204 0.0010572021 0.0016052142 0.0036540307 0.0022979498 0.0014681767 0.0046230912 0.0043887616 0.0020669456 0.0172444871 0.0006593575 0.0034691503 0.0658351447 0.0019185467 0.0022498420 0.0021278866 0.8183345006 0.0515918357 0.0046677595;
+frequency C50pi22 = 0.0548133174 0.0692044159 0.0211265710 0.0207779125 0.0072646572 0.0567865657 0.0738456579 0.0051797705 0.0168408457 0.1386104888 0.0713795154 0.0896393340 0.0201205491 0.0082150393 0.0104049016 0.0282344422 0.0995597110 0.0019722093 0.0074054035 0.1986186919;
+frequency C50pi23 = 0.0047955268 0.0028033787 0.0050506238 0.0014080516 0.0061671241 0.0019350126 0.0009861551 0.0014396818 0.0389623239 0.0048950388 0.0151748150 0.0012306644 0.0032520404 0.3601993060 0.0011266316 0.0054509935 0.0034763921 0.0362899931 0.4980200998 0.0073361467;
+frequency C50pi24 = 0.0365462996 0.0280070630 0.0183606115 0.0070525803 0.0093251684 0.0300239431 0.0221812842 0.0047778642 0.0178840316 0.2025947306 0.1973012130 0.0250209750 0.0557862640 0.0258067541 0.0042772210 0.0209374223 0.0731398943 0.0049738166 0.0200601168 0.1959427463;
+frequency C50pi25 = 0.0684197684 0.0111619750 0.0544764241 0.0224313301 0.0106958312 0.0091799953 0.0097436799 0.0255871619 0.0055558006 0.0059416697 0.0076746853 0.0144198991 0.0056892166 0.0037356845 0.0172554137 0.3527301149 0.3586913194 0.0012501907 0.0028636710 0.0124961682;
+frequency C50pi26 = 0.0495330775 0.1060064564 0.1511923969 0.0483471288 0.0080946362 0.0886108407 0.0449556763 0.0331436148 0.1447288287 0.0061850770 0.0190407203 0.0948075276 0.0063418871 0.0126162987 0.0100869563 0.0799801169 0.0445418973 0.0044765096 0.0363930724 0.0109172804;
+frequency C50pi27 = 0.0702411901 0.0642050323 0.0779553908 0.0510328304 0.0042438849 0.0723300485 0.0883747710 0.0177347101 0.0233800891 0.0198779320 0.0183537117 0.1051267065 0.0107865869 0.0037987118 0.0112811107 0.1345081583 0.1805543234 0.0014252764 0.0055089381 0.0392805971;
+frequency C50pi28 = 0.1207399152 0.1741788075 0.0385528120 0.0162689581 0.0118494185 0.0760068404 0.0337935391 0.0653431008 0.0342783806 0.0085426053 0.0256788075 0.1434443984 0.0112347894 0.0061270793 0.0294493558 0.1091415488 0.0634181251 0.0046156419 0.0085374279 0.0187984481;
+frequency C50pi29 = 0.0064521696 0.0021817337 0.0005939658 0.0003904032 0.0021538307 0.0019099968 0.0008007758 0.0005208471 0.0011374294 0.2850758996 0.4278536740 0.0013920239 0.0561988528 0.0449501501 0.0026289702 0.0011053664 0.0055157148 0.0022753671 0.0059612583 0.1509015707;
+frequency C50pi30 = 0.0969092741 0.0359723370 0.0633194168 0.0411020773 0.0145578946 0.0466661704 0.0469223767 0.0374614202 0.0537149580 0.0394603009 0.0856256544 0.0283577862 0.0346435320 0.0507298072 0.0167177549 0.0990945318 0.0806503833 0.0128373826 0.0598972198 0.0553597218;
+frequency C50pi31 = 0.0840212010 0.0214242172 0.2240668646 0.0354684798 0.0265031681 0.0235675678 0.0076026464 0.1173325117 0.0516019781 0.0048917455 0.0067211727 0.0173653354 0.0079342101 0.0087501486 0.0093276105 0.2637097946 0.0630157977 0.0022314593 0.0170994247 0.0073646661;
+frequency C50pi32 = 0.0055061507 0.0012508737 0.0004824961 0.0004530173 0.0054435931 0.0011315076 0.0004150379 0.0012285001 0.0019884532 0.0617431901 0.4342418135 0.0008161868 0.0554628445 0.3289659386 0.0025814794 0.0021197505 0.0029510440 0.0172981374 0.0412097497 0.0347102358;
+frequency C50pi33 = 0.0442014612 0.1295816316 0.0258622052 0.0148900471 0.0076165815 0.1301765579 0.0636708052 0.0105339122 0.0662542863 0.0423977240 0.1434197528 0.1040381429 0.0403363621 0.0260540342 0.0089335090 0.0242573966 0.0317938092 0.0077831996 0.0309973779 0.0472012033;
+frequency C50pi34 = 0.0571984155 0.0034929878 0.0031324721 0.0012472712 0.0113230439 0.0025279922 0.0040737817 0.0030647398 0.0020494153 0.3131200932 0.0901750144 0.0034699557 0.0242565205 0.0112345295 0.0048197020 0.0095675953 0.0529842025 0.0010645104 0.0041851135 0.3970126433;
+frequency C50pi35 = 0.1141963934 0.0102229903 0.0178644126 0.0172307307 0.0056978908 0.0039055039 0.0085974326 0.7425714921 0.0026414175 0.0005602022 0.0019872568 0.0055400059 0.0004739977 0.0010663175 0.0054302447 0.0508318204 0.0055408544 0.0018890811 0.0012409205 0.0025110348;
+frequency C50pi36 = 0.3531758625 0.0043402857 0.0031812423 0.0030024877 0.0165711581 0.0029126214 0.0042077690 0.4520896100 0.0021366362 0.0063692579 0.0120143269 0.0022586970 0.0080260130 0.0043865828 0.0111462027 0.0658344033 0.0182952730 0.0010872878 0.0023330172 0.0266312657;
+frequency C50pi37 = 0.0310798708 0.0234519814 0.1273669012 0.1197925100 0.0031216960 0.0295858842 0.0470763446 0.4883046368 0.0193412101 0.0008855622 0.0032808220 0.0408430573 0.0014984226 0.0016298596 0.0063229464 0.0423452622 0.0082797260 0.0007718998 0.0024996877 0.0025217188;
+frequency C50pi38 = 0.0370340667 0.0689410214 0.1704407181 0.1041817082 0.0018108784 0.0715495095 0.0659866718 0.2159298358 0.0443591808 0.0008668888 0.0064679416 0.1275300877 0.0027248464 0.0014178323 0.0060253154 0.0534574556 0.0147073432 0.0007999410 0.0037708147 0.0019979426;
+frequency C50pi39 = 0.0160398536 0.0526622999 0.1051167149 0.0187352256 0.0085330116 0.0922616498 0.0154450839 0.0076235155 0.3848449137 0.0057129406 0.0277195224 0.0219347380 0.0071078308 0.0376358992 0.0072201969 0.0209969653 0.0142198783 0.0096946226 0.1384243143 0.0080708232;
+frequency C50pi40 = 0.0165549167 0.0085856833 0.0049441851 0.0016567380 0.0086529073 0.0184087838 0.0033759867 0.0033844413 0.0084695063 0.0483923758 0.4963073963 0.0056997331 0.1949377866 0.0999527140 0.0060271256 0.0084289585 0.0122619536 0.0114013282 0.0192314834 0.0233259964;
+frequency C50pi41 = 0.0227379959 0.0137060298 0.3162561805 0.2932103363 0.0037073869 0.0169119273 0.0380984220 0.0550224760 0.0319886436 0.0039219190 0.0041582288 0.0312539900 0.0019467591 0.0022276545 0.0059660826 0.0998736999 0.0462336456 0.0007310446 0.0069012376 0.0051463400;
+frequency C50pi42 = 0.2406936002 0.0197081082 0.0462578641 0.0206379264 0.0186726798 0.0189843646 0.0129785315 0.1749109142 0.0118714342 0.0049349532 0.0126237761 0.0127876711 0.0095642661 0.0083606873 0.0326283314 0.2101300187 0.1130042042 0.0041951500 0.0069210515 0.0201344675;
+frequency C50pi43 = 0.0214325714 0.3730744306 0.0220674626 0.0037495290 0.0069038342 0.0670391950 0.0159298773 0.0126211348 0.0284477629 0.0102051798 0.0242954287 0.3272456489 0.0093147452 0.0036403029 0.0070138928 0.0216860624 0.0232259733 0.0030422478 0.0065368590 0.0125278613;
+frequency C50pi44 = 0.1567707052 0.0258059606 0.0161658338 0.0223946414 0.0074382689 0.0274455582 0.0410010574 0.0360501033 0.0159972680 0.0640941463 0.0944756654 0.0192586366 0.0312789234 0.0227728534 0.1653169011 0.0640177954 0.0549103568 0.0050980224 0.0138248643 0.1158824381;
+frequency C50pi45 = 0.4345912387 0.0061142999 0.0097660767 0.0060102195 0.0197377879 0.0069062805 0.0082800652 0.0829075516 0.0029125126 0.0047747098 0.0054182241 0.0049974525 0.0039676868 0.0029052002 0.0193588692 0.2795854727 0.0677816788 0.0008196092 0.0025196339 0.0306454302;
+frequency C50pi46 = 0.0296734965 0.1443250343 0.0128668160 0.0059561454 0.0129805897 0.0492311054 0.0262726056 0.0069437743 0.0676183913 0.0452364160 0.1374511139 0.0907089722 0.0308070846 0.0816441785 0.0060701025 0.0197130339 0.0299715868 0.0461468661 0.1119414237 0.0444412635;
+frequency C50pi47 = 0.1089911217 0.0159187676 0.0643054232 0.2086425054 0.0016540963 0.0375565797 0.1791004993 0.0610564917 0.0144660242 0.0038322948 0.0067778708 0.0372270242 0.0022817918 0.0012634818 0.0851792013 0.1065821239 0.0524401536 0.0005901255 0.0027836060 0.0093508169;
+frequency C50pi48 = 0.1429463629 0.0304191716 0.0191145368 0.0351867799 0.0031493079 0.0341248336 0.0508492526 0.0305914291 0.0134276644 0.0070227247 0.0197257013 0.0421442438 0.0038904796 0.0040697467 0.4052202085 0.0874406009 0.0445304918 0.0012842531 0.0039485525 0.0209136585;
+frequency C50pi49 = 0.0580116857 0.0903213669 0.0369245281 0.0613603988 0.0022829951 0.2073851382 0.2225853236 0.0159476910 0.0311816018 0.0068543753 0.0217092509 0.1504781849 0.0084841006 0.0020581132 0.0046206107 0.0276754451 0.0321477211 0.0011651089 0.0051889637 0.0136173964;
+frequency C50pi50 = 0.2153540940 0.0359173007 0.0219927944 0.0735128474 0.0037017294 0.0566408566 0.1350375818 0.0662986417 0.0157121780 0.0138456188 0.0266922211 0.0474338339 0.0088042600 0.0035035311 0.0739583083 0.0921989198 0.0575687235 0.0019306896 0.0044520833 0.0454437865;
+model C50 = POISSON+G4+FMIX{C50pi1:1:0.0164297003,C50pi2:1:0.0273175755,C50pi3:1:0.0460247610,C50pi4:1:0.0084864734,C50pi5:1:0.0125389252,C50pi6:1:0.0343549036,C50pi7:1:0.0130241102,C50pi8:1:0.0094755681,C50pi9:1:0.0190040551,C50pi10:1:0.0151902354,C50pi11:1:0.0320534760,C50pi12:1:0.0210059850,C50pi13:1:0.0237408547,C50pi14:1:0.0239841203,C50pi15:1:0.0213748021,C50pi16:1:0.0210717705,C50pi17:1:0.0050241805,C50pi18:1:0.0166262276,C50pi19:1:0.0143945956,C50pi20:1:0.0104391130,C50pi21:1:0.0 [...]
+
+[ ---------------------------------------------------------
+    CAT-C60 profile mixture model of Le, Gascuel & Lartillot (2008)
+ --------------------------------------------------------- ]
+frequency C60pi1 = 0.1534363248 0.0444389067 0.0796726990 0.0546757288 0.0047306596 0.0514333025 0.0529324359 0.1103775749 0.0174480218 0.0050343887 0.0130294160 0.0603928711 0.0075550589 0.0035554315 0.0249523704 0.2029625968 0.0957668473 0.0014444483 0.0059800307 0.0101808864;
+frequency C60pi2 = 0.0281984692 0.3031055487 0.0312954609 0.0091549350 0.0019503463 0.0939884393 0.0388530140 0.0084028325 0.0155384715 0.0107872879 0.0217786594 0.3476042929 0.0109904917 0.0015919288 0.0071539896 0.0197479052 0.0328352333 0.0009209994 0.0025714024 0.0135302919;
+frequency C60pi3 = 0.0083680740 0.0007319768 0.0006123446 0.0002228366 0.0020433870 0.0009498685 0.0004731544 0.0004825748 0.0005189995 0.3768453098 0.2608334606 0.0006296168 0.0315700586 0.0123984358 0.0009595916 0.0009746383 0.0049990761 0.0008657759 0.0017132332 0.2938075872;
+frequency C60pi4 = 0.2227229348 0.0064846074 0.0061206496 0.0007997588 0.1640285908 0.0051051888 0.0027280806 0.0202702520 0.0037183875 0.0455406072 0.0883350071 0.0022832871 0.0348094559 0.0228667054 0.0035471579 0.0850040072 0.1012848285 0.0048424833 0.0096500033 0.1698580069;
+frequency C60pi5 = 0.0412139519 0.0067627055 0.0051067690 0.0017434391 0.0204715649 0.0057538477 0.0037263409 0.0069107492 0.0180293946 0.1154281623 0.1693562458 0.0042900270 0.0414066566 0.2239001858 0.0058416410 0.0149106129 0.0239548406 0.0332237129 0.1379349474 0.1200342049;
+frequency C60pi6 = 0.0480550249 0.0308438053 0.0940628721 0.2084606133 0.0037801787 0.0747676701 0.1855184661 0.0191402239 0.0872162350 0.0094685435 0.0277340828 0.0375741243 0.0088308358 0.0196000958 0.0081267777 0.0439680761 0.0324588883 0.0034665720 0.0387499964 0.0181769181;
+frequency C60pi7 = 0.0062848745 0.0026246919 0.0030342510 0.0005324147 0.0073027627 0.0034409089 0.0009741492 0.0019578159 0.0102225186 0.0180592309 0.1179064681 0.0016205916 0.0234721825 0.3974552519 0.0020165583 0.0056903327 0.0037091821 0.0598639097 0.3185565304 0.0152753744;
+frequency C60pi8 = 0.1815005560 0.0026845411 0.0148484537 0.0025145485 0.4205633920 0.0014097001 0.0007088144 0.0461854175 0.0014374605 0.0041745536 0.0098310464 0.0006474254 0.0041611385 0.0068976432 0.0038767247 0.1864537050 0.0687189855 0.0027083549 0.0061033012 0.0345742379;
+frequency C60pi9 = 0.0600740822 0.0367642654 0.0134869242 0.0170572285 0.0070719770 0.0142469806 0.0127486975 0.0343564471 0.0305859029 0.0204571345 0.0994551128 0.0212367087 0.0318165939 0.1140907926 0.0297628218 0.0505792699 0.0339368402 0.2312808862 0.1192491702 0.0217421638;
+frequency C60pi10 = 0.0708394513 0.0474098489 0.0416822304 0.0324482918 0.0131641265 0.0494874703 0.0508264389 0.0183309196 0.0567272697 0.0650369079 0.1282255556 0.0343618389 0.0390362930 0.0594359563 0.0135608209 0.0551343199 0.0642260358 0.0137118382 0.0673934289 0.0789609573;
+frequency C60pi11 = 0.0617689371 0.0076332888 0.0303081645 0.3430234188 0.0007199837 0.0307856241 0.3792509407 0.0284658686 0.0079592120 0.0016999627 0.0039945339 0.0216076877 0.0019734329 0.0009814186 0.0174791407 0.0337831940 0.0203426591 0.0006130268 0.0017102752 0.0058992300;
+frequency C60pi12 = 0.0421559537 0.1042068314 0.0286980872 0.0164385240 0.0044450330 0.1393690851 0.0531949072 0.0134711207 0.0177764997 0.0267727728 0.1967237776 0.1323735242 0.1182827521 0.0086728324 0.0051837880 0.0255852718 0.0333292020 0.0045852327 0.0070281498 0.0217066546;
+frequency C60pi13 = 0.2814809927 0.0100367066 0.0172867775 0.0064385734 0.0258337508 0.0133101925 0.0115046410 0.0270054934 0.0054629657 0.0188216093 0.0190993462 0.0098712843 0.0158719589 0.0050481705 0.0129510033 0.1886808600 0.2427104979 0.0012274627 0.0036052922 0.0837524211;
+frequency C60pi14 = 0.2769188320 0.0017226995 0.0021315271 0.0011672545 0.0318292645 0.0018216251 0.0024752467 0.0199646887 0.0005170863 0.0983109006 0.0489264326 0.0016232163 0.0173414948 0.0070843906 0.0070179705 0.0336348952 0.0814141404 0.0007118144 0.0032942319 0.3620922883;
+frequency C60pi15 = 0.1577797792 0.1112140270 0.0570403237 0.0648290471 0.0053318076 0.1065373681 0.0913586945 0.0906209718 0.0533809635 0.0029171632 0.0156225571 0.0782148712 0.0045758969 0.0025047816 0.0067077844 0.0929310045 0.0393122597 0.0028575821 0.0077590269 0.0085040899;
+frequency C60pi16 = 0.0593735135 0.0354740772 0.1151175314 0.2189482708 0.0015332173 0.0688752402 0.1819422913 0.0813707101 0.0220478285 0.0020993577 0.0056191259 0.0750172075 0.0021871739 0.0010838321 0.0109737422 0.0726449461 0.0380238271 0.0007346460 0.0026664883 0.0042669729;
+frequency C60pi17 = 0.0978066326 0.0265576438 0.0101843505 0.0120781428 0.0064138404 0.0307876446 0.0291282947 0.0128912798 0.0128036716 0.0723904209 0.1279438950 0.0245630658 0.0303267312 0.0198963719 0.2723524069 0.0350549441 0.0484557340 0.0046842467 0.0104773833 0.1152032995;
+frequency C60pi18 = 0.0124023388 0.0030680354 0.0009239105 0.0006037316 0.0041885695 0.0032957441 0.0012524000 0.0011306791 0.0013542104 0.2344167852 0.4550557697 0.0016718177 0.0667307666 0.0610615367 0.0037076169 0.0019420934 0.0067612939 0.0038937184 0.0074911765 0.1290478057;
+frequency C60pi19 = 0.0794230623 0.1294739355 0.0662792725 0.0587236242 0.0019919499 0.1143880588 0.1246900644 0.0325432311 0.0238605372 0.0036277150 0.0097987961 0.2147597316 0.0041846209 0.0012869951 0.0142410239 0.0615807386 0.0477333594 0.0006525371 0.0029420233 0.0078187231;
+frequency C60pi20 = 0.0248148778 0.0083552910 0.1888915388 0.4278832998 0.0027839717 0.0210777725 0.1432386297 0.0643968435 0.0185736870 0.0022506941 0.0034558626 0.0179274104 0.0015714503 0.0014680353 0.0073768035 0.0377003132 0.0187767966 0.0005891859 0.0042602708 0.0046072655;
+frequency C60pi21 = 0.0017003427 0.0060674330 0.0004222900 0.0010711490 0.0029059420 0.0016424179 0.0011731741 0.0035579609 0.0027630465 0.0012291190 0.0127420810 0.0004273804 0.0025671348 0.0513377024 0.0013536738 0.0011871674 0.0014033068 0.8640436936 0.0390912582 0.0033137266;
+frequency C60pi22 = 0.0468360682 0.0639796924 0.0205603686 0.0185615516 0.0059954138 0.0557030821 0.0705436036 0.0045435329 0.0152062773 0.1550613356 0.0824253382 0.0866248354 0.0245854443 0.0080177192 0.0081485616 0.0237025617 0.0962054496 0.0018368673 0.0067131723 0.2047491243;
+frequency C60pi23 = 0.0258764792 0.0201097124 0.0298384107 0.0107037437 0.0142503909 0.0158529432 0.0105649532 0.0073064999 0.1411078834 0.0114777629 0.0407992414 0.0119179202 0.0098798997 0.1876429961 0.0051228805 0.0275699644 0.0170764901 0.0405124999 0.3536390834 0.0187502449;
+frequency C60pi24 = 0.0296285022 0.0046400334 0.0034944393 0.0008851024 0.0090046468 0.0055481111 0.0033046518 0.0027969482 0.0050701500 0.2583397750 0.2668085481 0.0046690936 0.0770825277 0.0408798247 0.0026918193 0.0068538089 0.0322265673 0.0035506055 0.0153353414 0.2271895033;
+frequency C60pi25 = 0.0555725806 0.0098447861 0.0409064430 0.0140389597 0.0097418602 0.0068727710 0.0069443190 0.0157956555 0.0041631258 0.0069826497 0.0075271247 0.0139224817 0.0058762687 0.0034496730 0.0119733364 0.3482466393 0.4213655981 0.0010061491 0.0026576772 0.0131119012;
+frequency C60pi26 = 0.0682671212 0.0615207091 0.0530661192 0.0360278709 0.0141433148 0.0612274332 0.0497415394 0.0268696520 0.1127674983 0.0132646615 0.0544493838 0.0482609047 0.0170033964 0.0803375967 0.0191949850 0.0671839752 0.0443995774 0.0199957919 0.1255070748 0.0267713947;
+frequency C60pi27 = 0.0792618808 0.0638377192 0.0635289371 0.0436646174 0.0049503302 0.0666365188 0.0829639117 0.0183428565 0.0233169239 0.0249427251 0.0221483402 0.0932577596 0.0120893380 0.0049131149 0.0126360122 0.1334848656 0.1916745928 0.0018040086 0.0062353115 0.0503102360;
+frequency C60pi28 = 0.0731759112 0.2105335985 0.0324200854 0.0110007149 0.0123458504 0.0858951989 0.0349942684 0.0224509173 0.0386903280 0.0246226304 0.0508307349 0.1783344831 0.0185740720 0.0093148787 0.0148722772 0.0603181436 0.0649574934 0.0051046395 0.0130597421 0.0385040321;
+frequency C60pi29 = 0.0878402710 0.0110331750 0.0060801213 0.0032803903 0.0171147088 0.0109831614 0.0101465790 0.0087090941 0.0054902234 0.1987761871 0.1756460821 0.0082096925 0.0417232903 0.0191954435 0.0111283542 0.0209862621 0.0697718709 0.0031744014 0.0081905473 0.2825201446;
+frequency C60pi30 = 0.0990215820 0.0349351987 0.0211149501 0.0118797946 0.0108995677 0.0557710676 0.0278999992 0.0240250097 0.0123445071 0.0776564721 0.2354511299 0.0322817789 0.1207665429 0.0214442058 0.0075655541 0.0524170141 0.0649785115 0.0047075806 0.0077328724 0.0771066610;
+frequency C60pi31 = 0.0601641168 0.0161995226 0.2783522747 0.0337188808 0.0315066987 0.0210645987 0.0059839451 0.0543080710 0.0531523512 0.0070650825 0.0070698142 0.0139598368 0.0088298653 0.0069525877 0.0075834331 0.2829802556 0.0860317092 0.0014966551 0.0134849454 0.0100953553;
+frequency C60pi32 = 0.0049781737 0.0018412331 0.0007012207 0.0005315368 0.0052978737 0.0024089907 0.0007630546 0.0015051317 0.0041575221 0.0443828633 0.4417417476 0.0011615060 0.0602807417 0.3351117140 0.0027847686 0.0025795769 0.0030288544 0.0171302592 0.0458455751 0.0237676560;
+frequency C60pi33 = 0.0251996593 0.1114468110 0.0142031925 0.0041012288 0.0097099500 0.0620070749 0.0262571641 0.0038067269 0.0431938935 0.0974043253 0.2447197423 0.0824312856 0.0539323021 0.0429091639 0.0052658505 0.0096093107 0.0251183002 0.0146571900 0.0456965140 0.0783303143;
+frequency C60pi34 = 0.0230361648 0.0014748749 0.0013534390 0.0006264439 0.0048580122 0.0009870046 0.0015762583 0.0011565336 0.0008899238 0.3952895890 0.0576537208 0.0014663528 0.0140986541 0.0072127040 0.0020177885 0.0028770237 0.0205580852 0.0005477695 0.0019539080 0.4603657493;
+frequency C60pi35 = 0.1408776963 0.0297808449 0.0171297613 0.0285076933 0.0032213718 0.0320632225 0.0423838922 0.0299558472 0.0131321477 0.0066914481 0.0195120028 0.0383781635 0.0036276863 0.0041231064 0.4383466229 0.0851400095 0.0422765692 0.0013236871 0.0037087638 0.0198194632;
+frequency C60pi36 = 0.4442491220 0.0050216551 0.0102305117 0.0057193038 0.0235405374 0.0055997640 0.0064889886 0.0822687710 0.0025505743 0.0033615104 0.0040990063 0.0038097073 0.0028683069 0.0024413211 0.0162890960 0.2999969708 0.0559664935 0.0007735426 0.0020639824 0.0226608347;
+frequency C60pi37 = 0.0898717958 0.0070958305 0.0130067619 0.0129166888 0.0044131479 0.0023806547 0.0058957027 0.8087563021 0.0016517855 0.0004339282 0.0015564455 0.0033939025 0.0004253422 0.0008073572 0.0034128140 0.0362876891 0.0032887534 0.0015223902 0.0008537454 0.0020289624;
+frequency C60pi38 = 0.0550840246 0.0472254260 0.1877829604 0.1273796123 0.0035824944 0.0527969268 0.0655884730 0.0637607521 0.0404883483 0.0075574152 0.0136304510 0.0867682792 0.0081684229 0.0040375032 0.0110681809 0.1263380956 0.0752544318 0.0013563681 0.0118590434 0.0102727908;
+frequency C60pi39 = 0.0117681394 0.0442558806 0.0844144627 0.0144712108 0.0070388254 0.1038342049 0.0110901161 0.0049626578 0.4337194047 0.0061337038 0.0298794939 0.0137928558 0.0076237551 0.0338266335 0.0081346096 0.0140571089 0.0108276801 0.0080683065 0.1437251732 0.0083757773;
+frequency C60pi40 = 0.0159285638 0.0048098656 0.0032692643 0.0010966937 0.0080519916 0.0134552459 0.0021324215 0.0025086365 0.0049192147 0.0501543893 0.5307634291 0.0035599431 0.2160085187 0.0743650717 0.0045247350 0.0066922196 0.0119092283 0.0070928134 0.0106565111 0.0281012433;
+frequency C60pi41 = 0.0195973253 0.0105142992 0.3289103336 0.3099848991 0.0034539049 0.0116196758 0.0250777800 0.0627528956 0.0295961112 0.0032650434 0.0028246884 0.0240963907 0.0008425062 0.0019706550 0.0049062781 0.1064984500 0.0438053705 0.0006333959 0.0056197958 0.0040302013;
+frequency C60pi42 = 0.0833804360 0.0125871438 0.0969824220 0.0686820704 0.0081981143 0.0121520930 0.0227415415 0.0982291876 0.0073954898 0.0017471177 0.0039653113 0.0129342146 0.0019557975 0.0024132583 0.0355924232 0.3115606483 0.2113368612 0.0016329034 0.0017991083 0.0047138579;
+frequency C60pi43 = 0.0181409133 0.4129662563 0.0233205154 0.0033333547 0.0085143598 0.0526694251 0.0096531879 0.0224552642 0.0375238929 0.0035090482 0.0149146621 0.3208065790 0.0046098856 0.0035426859 0.0087197469 0.0262309419 0.0131791136 0.0034766995 0.0079588201 0.0044746474;
+frequency C60pi44 = 0.2494227404 0.0185481724 0.0164119567 0.0169234299 0.0122862654 0.0228501981 0.0370491083 0.0347467705 0.0087069587 0.0595718359 0.0451065029 0.0177064733 0.0204556127 0.0077360919 0.0686403544 0.0889295672 0.0986017356 0.0028603862 0.0061938477 0.1672519917;
+frequency C60pi45 = 0.1419737638 0.0373945961 0.0576296888 0.0537452477 0.0068856658 0.0286239972 0.0407540287 0.3988107872 0.0152895617 0.0016627616 0.0092348297 0.0314273807 0.0055425500 0.0040286132 0.0180328866 0.1123731997 0.0242478202 0.0025909098 0.0049054208 0.0048462908;
+frequency C60pi46 = 0.0178903305 0.1958843646 0.0155853897 0.0031054277 0.0290304227 0.1051819261 0.0040503389 0.0100480293 0.1252696215 0.0016708003 0.0722356645 0.0233340169 0.0116142354 0.0238913260 0.0009938415 0.0181675536 0.0186260222 0.2260554691 0.0859787232 0.0113864962;
+frequency C60pi47 = 0.1454758367 0.0420979067 0.0400419720 0.1294249748 0.0014186329 0.0906469055 0.2471353458 0.0319650773 0.0130426183 0.0058525371 0.0123593139 0.0818154090 0.0044178939 0.0017552077 0.0151135525 0.0656688174 0.0511289472 0.0007731441 0.0029258438 0.0169400635;
+frequency C60pi48 = 0.0169799462 0.0242346701 0.1318047919 0.1043655101 0.0022087215 0.0269349684 0.0376379591 0.5404470183 0.0181137053 0.0007459679 0.0021146994 0.0508617611 0.0009473769 0.0006780593 0.0038754401 0.0297030159 0.0045836180 0.0006031889 0.0015704090 0.0015891728;
+frequency C60pi49 = 0.0402646249 0.1152022601 0.0323829165 0.0293968352 0.0039388655 0.2497008043 0.1603524245 0.0129260411 0.0617967839 0.0098491259 0.0354918823 0.1448804422 0.0124818865 0.0041153375 0.0043374229 0.0243246958 0.0305645368 0.0026676598 0.0097227847 0.0156026694;
+frequency C60pi50 = 0.2256914610 0.0523417493 0.0244308734 0.0637125217 0.0043390149 0.0578159236 0.1154830640 0.0867335173 0.0131066949 0.0085086217 0.0193314218 0.0660468804 0.0064877206 0.0027440054 0.0611149102 0.1070877179 0.0507677144 0.0013695913 0.0028982948 0.0299883012;
+frequency C60pi51 = 0.0033164209 0.0015310773 0.0030830171 0.0008266472 0.0051890730 0.0011024889 0.0005134130 0.0010432830 0.0278451262 0.0041895268 0.0111212494 0.0007149922 0.0023621780 0.3801761447 0.0008365077 0.0035876698 0.0023608948 0.0333346985 0.5107889643 0.0060766272;
+frequency C60pi52 = 0.1995014012 0.0236078675 0.0392254543 0.0094955104 0.0584590451 0.0254265363 0.0125535371 0.0939787338 0.0341857201 0.0140209879 0.0449387571 0.0118723304 0.0246990633 0.0634433944 0.0145385320 0.1663920640 0.0533159207 0.0129802666 0.0606346163 0.0367302614;
+frequency C60pi53 = 0.0319448994 0.1011667268 0.2084709220 0.0378074649 0.0066040348 0.0766372935 0.0279488190 0.0365541130 0.2088643258 0.0047542347 0.0156545731 0.0868664783 0.0043253317 0.0108915768 0.0060899575 0.0577656939 0.0302051160 0.0026001883 0.0387897304 0.0060585202;
+frequency C60pi54 = 0.0776799515 0.0142518583 0.0403216692 0.0080651725 0.0140092962 0.0179995517 0.0112622427 0.0136868237 0.0133729897 0.1239635380 0.0724670993 0.0129144967 0.0420745442 0.0173584908 0.0117084432 0.0922723571 0.2316899445 0.0028153633 0.0141726542 0.1679135132;
+frequency C60pi55 = 0.1183662657 0.0805192606 0.0259524932 0.0495595439 0.0035624835 0.1204924917 0.1537589210 0.0194993426 0.0229373171 0.0302661211 0.0571250629 0.0982304112 0.0171727472 0.0068665705 0.0175153030 0.0486588400 0.0635796210 0.0023008307 0.0083027431 0.0553336300;
+frequency C60pi56 = 0.0528559899 0.0193569043 0.0264743774 0.2092761515 0.0008625883 0.1212409715 0.4024189781 0.0155838458 0.0124148798 0.0054864832 0.0090256472 0.0497017031 0.0042357114 0.0012650715 0.0063185636 0.0197262901 0.0235463735 0.0008381610 0.0033948741 0.0159764347;
+frequency C60pi57 = 0.0344366215 0.0426221820 0.1636716191 0.1139007491 0.0020985982 0.0605413987 0.0541780220 0.3361639671 0.0461776737 0.0003463416 0.0048355678 0.0667552967 0.0019704509 0.0031557619 0.0040369775 0.0481173332 0.0089148085 0.0006510101 0.0054145649 0.0020110555;
+frequency C60pi58 = 0.1153088951 0.0151278638 0.0458476603 0.1755516676 0.0014962362 0.0366731222 0.1749410045 0.0394181311 0.0132401530 0.0056912974 0.0101409559 0.0433118387 0.0030332064 0.0015700232 0.1665802563 0.0871536033 0.0468260603 0.0007515702 0.0031432715 0.0141931831;
+frequency C60pi59 = 0.3865149348 0.0037579334 0.0030420497 0.0022366810 0.0218928357 0.0021464743 0.0031387843 0.3694353983 0.0014672902 0.0085376076 0.0127257242 0.0018840458 0.0080581695 0.0039281367 0.0158688291 0.0808877279 0.0305195935 0.0009922880 0.0019020345 0.0410634615;
+frequency C60pi60 = 0.0146570745 0.0028841333 0.0012998335 0.0005210575 0.0024317913 0.0049362750 0.0014874369 0.0020953252 0.0010181940 0.1913901476 0.4432797758 0.0022898369 0.2217427062 0.0091637503 0.0007685153 0.0027251487 0.0170997497 0.0008779380 0.0014756028 0.0778557075;
+model C60 = POISSON+G4+FMIX{C60pi1:1:0.0169698865,C60pi2:1:0.0211683374,C60pi3:1:0.0276589079,C60pi4:1:0.0065675964,C60pi5:1:0.0141221416,C60pi6:1:0.0068774834,C60pi7:1:0.0146909701,C60pi8:1:0.0067225777,C60pi9:1:0.0018396660,C60pi10:1:0.0102547197,C60pi11:1:0.0230896163,C60pi12:1:0.0057941033,C60pi13:1:0.0125394534,C60pi14:1:0.0204526478,C60pi15:1:0.0070629602,C60pi16:1:0.0117982741,C60pi17:1:0.0068334668,C60pi18:1:0.0433775839,C60pi19:1:0.0318278731,C60pi20:1:0.0222546108,C60pi21:1:0.0 [...]
+
+end;
diff --git a/model/modelset.cpp b/model/modelset.cpp
new file mode 100644
index 0000000..bbba7ce
--- /dev/null
+++ b/model/modelset.cpp
@@ -0,0 +1,127 @@
+/*
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) 2012  BUI Quang Minh <email>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "modelset.h"
+
+/**
+ * Create a model set for the given tree. No sub-model is added here.
+ * @param model_name base model name; used as prefix of both the short and the full name
+ * @param tree tree passed on to the ModelGTR base class
+ */
+ModelSet::ModelSet(const char *model_name, PhyloTree *tree) : ModelGTR(tree)
+{
+	// short name: <model_name>+SSF
+	name = model_name;
+	name += "+SSF";
+
+	// descriptive name
+	full_name = model_name;
+	full_name += "+site-specific state-frequency model (unpublished)";
+}
+
+/**
+ * Compute the transition matrix of every sub-model, one after another.
+ * @param time time between two events
+ * @param trans_matrix (OUT) buffer receiving size() consecutive blocks of
+ *        num_states*num_states entries, one block per sub-model in set order
+ */
+void ModelSet::computeTransMatrix(double time, double* trans_matrix)
+{
+	const int block_size = num_states * num_states;
+	double *block = trans_matrix;
+	for (iterator submodel = begin(); submodel != end(); ++submodel, block += block_size)
+		(*submodel)->computeTransMatrix(time, block);
+}
+
+/**
+ * Call computeTransMatrixFreq() on every sub-model, one after another.
+ * @param time time between two events
+ * @param trans_matrix (OUT) buffer receiving size() consecutive blocks of
+ *        num_states*num_states entries, one block per sub-model in set order
+ */
+void ModelSet::computeTransMatrixFreq(double time, double* trans_matrix)
+{
+	const int block_size = num_states * num_states;
+	iterator submodel = begin();
+	while (submodel != end()) {
+		(*submodel)->computeTransMatrixFreq(time, trans_matrix);
+		// advance to the block of the next sub-model
+		trans_matrix += block_size;
+		++submodel;
+	}
+}
+
+/**
+ * Compute transition matrix, 1st and 2nd derivative matrices of every sub-model.
+ * Each of the three output buffers receives size() consecutive blocks of
+ * num_states*num_states entries, one block per sub-model in set order.
+ * @param time time between two events
+ * @param trans_matrix (OUT) transition matrices
+ * @param trans_derv1 (OUT) 1st derivative matrices
+ * @param trans_derv2 (OUT) 2nd derivative matrices
+ */
+void ModelSet::computeTransDerv(double time, double* trans_matrix, double* trans_derv1, double* trans_derv2)
+{
+	const int block_size = num_states * num_states;
+	size_t offset = 0;
+	for (iterator submodel = begin(); submodel != end(); ++submodel, offset += block_size)
+		(*submodel)->computeTransDerv(time, trans_matrix + offset, trans_derv1 + offset, trans_derv2 + offset);
+}
+
+/**
+ * Call computeTransDervFreq() on every sub-model.
+ * Each of the three output buffers receives size() consecutive blocks of
+ * num_states*num_states entries, one block per sub-model in set order.
+ * @param time time between two events
+ * @param rate_val rate value, passed unchanged to every sub-model
+ * @param trans_matrix (OUT) transition matrices
+ * @param trans_derv1 (OUT) 1st derivative matrices
+ * @param trans_derv2 (OUT) 2nd derivative matrices
+ */
+void ModelSet::computeTransDervFreq(double time, double rate_val, double* trans_matrix, double* trans_derv1, double* trans_derv2)
+{
+	const int block_size = num_states * num_states;
+	double *mat_block = trans_matrix;
+	double *derv1_block = trans_derv1;
+	double *derv2_block = trans_derv2;
+	for (iterator submodel = begin(); submodel != end(); ++submodel) {
+		(*submodel)->computeTransDervFreq(time, rate_val, mat_block, derv1_block, derv2_block);
+		mat_block += block_size;
+		derv1_block += block_size;
+		derv2_block += block_size;
+	}
+}
+
+/**
+ * Look up the sub-model assigned to an alignment pattern.
+ * In debug builds both the pattern index and the stored model ID are range-checked.
+ * @param ptn pattern ID of the alignment, index into pattern_model_map
+ * @return the entry of pattern_model_map for this pattern
+ */
+int ModelSet::getPtnModelID(int ptn)
+{
+	// the sign is tested first, so the casts to size_t below cannot wrap around
+	assert(ptn >= 0 && (size_t)ptn < pattern_model_map.size());
+	assert(pattern_model_map[ptn] >= 0 && (size_t)pattern_model_map[ptn] < size());
+	return pattern_model_map[ptn];
+}
+
+
+/**
+ * Transition probability between two states under one sub-model.
+ * @param time time between two events
+ * @param model_id index of the sub-model in this set (range-checked by vector::at)
+ * @param state1 first state
+ * @param state2 second state
+ * @return the value returned by the selected sub-model
+ */
+double ModelSet::computeTrans(double time, int model_id, int state1, int state2) {
+	ModelGTR *submodel = at(model_id);
+	return submodel->computeTrans(time, state1, state2);
+}
+
+/**
+ * Transition probability and its derivatives between two states under one sub-model.
+ * @param time time between two events
+ * @param model_id index of the sub-model in this set (range-checked by vector::at)
+ * @param state1 first state
+ * @param state2 second state
+ * @param derv1 (OUT) 1st derivative, filled in by the sub-model
+ * @param derv2 (OUT) 2nd derivative, filled in by the sub-model
+ * @return the value returned by the selected sub-model
+ */
+double ModelSet::computeTrans(double time, int model_id, int state1, int state2, double &derv1, double &derv2) {
+	ModelGTR *submodel = at(model_id);
+	return submodel->computeTrans(time, state1, state2, derv1, derv2);
+}
+
+/**
+ * @return the number of dimensions reported by the first sub-model
+ *         (the set must not be empty)
+ */
+int ModelSet::getNDim()
+{
+	assert(size());
+	ModelGTR *first_model = front();
+	return first_model->getNDim();
+}
+
+/**
+ * Write model information. Below verbosity VB_MED only the first sub-model is
+ * reported; otherwise every sub-model is reported under a "Partition <n>:" header
+ * (numbered from 1).
+ * @param out output stream
+ */
+void ModelSet::writeInfo(ostream& out)
+{
+	assert(size());
+
+	// terse output: the first sub-model stands for the whole set
+	if (!(verbose_mode >= VB_MED)) {
+		front()->writeInfo(out);
+		return;
+	}
+
+	int part_no = 1;
+	for (iterator submodel = begin(); submodel != end(); ++submodel) {
+		out << "Partition " << part_no << ":" << endl;
+		(*submodel)->writeInfo(out);
+		part_no++;
+	}
+}
+
+/**
+ * Decompose the rate matrix of every sub-model in the set.
+ */
+void ModelSet::decomposeRateMatrix()
+{
+	iterator submodel = begin();
+	while (submodel != end()) {
+		(*submodel)->decomposeRateMatrix();
+		++submodel;
+	}
+}
+
+
+/**
+ * Hand the same variable vector to getVariables() of every sub-model.
+ * The set must not be empty.
+ * @param variables vector of variables, passed unchanged to each sub-model
+ */
+void ModelSet::getVariables(double* variables)
+{
+	assert(size());
+	iterator submodel = begin();
+	while (submodel != end()) {
+		(*submodel)->getVariables(variables);
+		++submodel;
+	}
+}
+
+/**
+ * Forward to setVariables() of the first sub-model only.
+ * The set must not be empty.
+ * @param variables vector of variables, passed unchanged to the first sub-model
+ */
+void ModelSet::setVariables(double* variables)
+{
+	assert(size());
+	ModelGTR *first_model = front();
+	first_model->setVariables(variables);
+}
+
+
+/**
+ * Destructor. Nothing is released here; in particular the ModelGTR objects
+ * whose pointers are stored in the set are not deleted.
+ */
+ModelSet::~ModelSet()
+{
+}
+
diff --git a/model/modelset.h b/model/modelset.h
new file mode 100644
index 0000000..2a4829e
--- /dev/null
+++ b/model/modelset.h
@@ -0,0 +1,182 @@
+/*
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) 2012  BUI Quang Minh <email>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef MODELSET_H
+#define MODELSET_H
+
+#include "modelgtr.h"
+
+/**
+ * A set of substitution models, used e.g. for a site-specific state-frequency model or a
+ * partition model with joint branch lengths.
+ * The class is at the same time a ModelGTR and a vector of ModelGTR pointers (the sub-models);
+ * the matrix functions below fill one block per sub-model, in vector order.
+ */
+class ModelSet : public ModelGTR, public vector<ModelGTR*>
+{
+
+public:
+	/**
+	 * constructor
+	 * @param model_name base model name
+	 * @param tree associated tree, passed to the ModelGTR base class
+	 */
+    ModelSet(const char *model_name, PhyloTree *tree);
+	/**
+	 * @return always TRUE: a model set is a site-specific model
+	 */
+	virtual bool isSiteSpecificModel() { return true; }
+
+	/**
+	 * get the size of the transition matrix buffer: one num_states*num_states block
+	 * for each sub-model stored in the set
+	 */
+	virtual int getTransMatrixSize() { return num_states * num_states * size(); }
+
+
+	/**
+		compute the transition probability matrix of every sub-model.
+		@param time time between two events
+		@param trans_matrix (OUT) the transition matrices between all pairs of states,
+			one num_states * num_states block per sub-model.
+			Assume trans_matrix has size of getTransMatrixSize().
+	*/
+	virtual void computeTransMatrix(double time, double *trans_matrix);
+
+	
+	/**
+	 * wrapper for computing transition matrix times state frequency vector, for every sub-model
+	 * @param time time between two events
+	 * @param trans_matrix (OUT) one num_states * num_states block per sub-model.
+	 * 	Assume trans_matrix has size of getTransMatrixSize().
+	 */
+	virtual void computeTransMatrixFreq(double time, double *trans_matrix);
+
+	
+	/**
+		compute the transition probability matrix and the derivative 1 and 2, for every sub-model.
+		Each output buffer holds one num_states * num_states block per sub-model,
+		i.e. is assumed to have size of getTransMatrixSize().
+		@param time time between two events
+		@param trans_matrix (OUT) the transition matrices between all pairs of states. 
+		@param trans_derv1 (OUT) the 1st derivative matrices between all pairs of states. 
+		@param trans_derv2 (OUT) the 2nd derivative matrices between all pairs of states. 
+	*/
+	virtual void computeTransDerv(double time, double *trans_matrix, 
+		double *trans_derv1, double *trans_derv2);
+
+	/**
+		compute the transition probability matrix and the derivative 1 and 2 times state frequency vector,
+		for every sub-model. Each output buffer holds one num_states * num_states block per sub-model,
+		i.e. is assumed to have size of getTransMatrixSize().
+		@param time time between two events
+		@param rate_val rate value, passed unchanged to every sub-model
+		@param trans_matrix (OUT) the transition matrices between all pairs of states. 
+		@param trans_derv1 (OUT) the 1st derivative matrices between all pairs of states. 
+		@param trans_derv2 (OUT) the 2nd derivative matrices between all pairs of states. 
+	*/
+	virtual void computeTransDervFreq(double time, double rate_val, double *trans_matrix, 
+		double *trans_derv1, double *trans_derv2);
+
+
+	/**
+		Stub, declared to avoid the 'hides overloaded virtual functions' diagnostic.
+		It does not compute anything and always returns 0; use the overload
+		taking a model_id instead.
+		@param time time between two events (unused)
+		@param state1 first state (unused)
+		@param state2 second state (unused)
+	*/
+	virtual double computeTrans(double time, int state1, int state2) { return 0; }
+
+	/**
+		Stub, declared to avoid the 'hides overloaded virtual functions' diagnostic.
+		It does not compute anything, leaves derv1 and derv2 untouched and always
+		returns 0; use the overload taking a model_id instead.
+		@param time time between two events (unused)
+		@param state1 first state (unused)
+		@param state2 second state (unused)
+		@param derv1 1st derivative (not written)
+		@param derv2 2nd derivative (not written)
+	*/
+	virtual double computeTrans(double time, int state1, int state2, double &derv1, double &derv2) { return 0; }
+
+
+
+	/**
+		compute the transition probability between two states at a specific site,
+		by delegating to the sub-model with the given ID.
+		@param time time between two events
+		@param model_id model ID, index of the sub-model in this set
+		@param state1 first state
+		@param state2 second state
+	*/
+	virtual double computeTrans(double time, int model_id, int state1, int state2);
+
+	/**
+		compute the transition probability and its 1st and 2nd derivatives between two states at a specific site,
+		by delegating to the sub-model with the given ID.
+		@param time time between two events
+		@param model_id model ID, index of the sub-model in this set
+		@param state1 first state
+		@param state2 second state
+		@param derv1 (OUT) 1st derivative
+		@param derv2 (OUT) 2nd derivative
+	*/
+	virtual double computeTrans(double time, int model_id, int state1, int state2, double &derv1, double &derv2);
+
+	/**
+	 * @return the model ID stored in pattern_model_map for the given pattern, useful for e.g., partition model
+	 * @param ptn pattern ID of the alignment
+	 */
+	virtual int getPtnModelID(int ptn);
+	
+	/**
+		return the number of dimensions, taken from the first sub-model
+	*/
+	virtual int getNDim();
+	
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		decompose the rate matrix of every sub-model into eigenvalues and eigenvectors
+	*/
+	virtual void decomposeRateMatrix();
+
+	/** destructor; the implementation is empty and does not delete the stored sub-models */
+    ~ModelSet();
+
+	/** map from pattern ID to model ID */
+	IntVector pattern_model_map;
+	
+protected:
+	
+	
+
+	/**
+		this function is served for the multi-dimension optimization. It should pack the model parameters 
+		into a vector that is indexed from 1 (NOTE: not from 0).
+		The ModelSet implementation forwards the call to the first sub-model only.
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function is served for the multi-dimension optimization. It should assign the model parameters 
+		from a vector of variables that is indexed from 1 (NOTE: not from 0).
+		The ModelSet implementation forwards the same vector to every sub-model.
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+	
+};
+
+#endif // MODELSET_H
diff --git a/model/modelsubst.cpp b/model/modelsubst.cpp
new file mode 100644
index 0000000..1b79b65
--- /dev/null
+++ b/model/modelsubst.cpp
@@ -0,0 +1,183 @@
+//
+// C++ Implementation: substmodel
+//
+// Description: 
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#include "modelsubst.h"
+#include "tools.h"
+
+/**
+ * Set up the default equal-rates, equal-frequencies model ("JC").
+ * @param nstates number of states; state_freq is allocated with this many
+ *        entries, each initialised to 1/nstates
+ */
+ModelSubst::ModelSubst(int nstates) : Optimization()
+{
+	num_states = nstates;
+	name = "JC";
+	full_name = "JC (Juke and Cantor, 1969)";
+	freq_type = FREQ_EQUAL;
+
+	// uniform state frequencies
+	state_freq = new double[num_states];
+	const double equal_freq = 1.0 / num_states;
+	for (double *freq = state_freq; freq != state_freq + num_states; ++freq)
+		*freq = equal_freq;
+}
+
+// Default implementation: the equal-rates Jukes-Cantor type model, usable for any number of states (DNA, AA,...)
+// Entry (row, col) of the result is stored at trans_matrix[row*num_states + col].
+void ModelSubst::computeTransMatrix(double time, double *trans_matrix) {
+	// probability of ending in one given other state, resp. of ending in the same state
+	const double non_diagonal = (1.0 - exp(-time*num_states/(num_states - 1))) / num_states;
+	const double diagonal = 1.0 - non_diagonal * (num_states - 1);
+
+	double *cell = trans_matrix;
+	for (int row = 0; row < num_states; row++)
+		for (int col = 0; col < num_states; col++, cell++)
+			*cell = (row == col) ? diagonal : non_diagonal;
+}
+
+/**
+ * Compute the transition matrix and divide every entry by num_states,
+ * i.e. weight it by the equal state frequency 1/num_states.
+ * @param time time between two events
+ * @param trans_matrix (OUT) num_states*num_states entries
+ */
+void ModelSubst::computeTransMatrixFreq(double time, double* trans_matrix)
+{
+	computeTransMatrix(time, trans_matrix);
+
+	const int num_entries = num_states * num_states;
+	for (int entry = 0; entry < num_entries; entry++)
+		trans_matrix[entry] /= num_states;
+}
+
+
+/**
+ * Transition probability between two states under the equal-rates model.
+ * @param time time between two events
+ * @param state1 first state
+ * @param state2 second state
+ * @return (1 - e)/n if the states differ, (1 + (n-1)*e)/n otherwise,
+ *         where n = num_states and e = exp(-time*n/(n-1))
+ */
+double ModelSubst::computeTrans(double time, int state1, int state2) {
+	const double expt = exp(-time * num_states / (num_states-1));
+	const double numerator = (state1 != state2) ? (1.0 - expt) : (1.0 + (num_states-1)*expt);
+	return numerator / num_states;
+}
+
+/**
+ * Overload taking a model ID. This base class holds a single model, so
+ * model_id is ignored and the call is forwarded to the 3-argument overload.
+ */
+double ModelSubst::computeTrans(double time, int model_id, int state1, int state2) {
+	const double prob = computeTrans(time, state1, state2);
+	return prob;
+}
+
+/**
+ * Transition probability between two states under the equal-rates model,
+ * together with its 1st and 2nd derivatives with respect to time.
+ * @param time time between two events
+ * @param state1 first state
+ * @param state2 second state
+ * @param derv1 (OUT) 1st derivative
+ * @param derv2 (OUT) 2nd derivative
+ * @return the transition probability
+ */
+double ModelSubst::computeTrans(double time, int state1, int state2, double &derv1, double &derv2) {
+	const double coef = -double(num_states) / (num_states-1);
+	const double expt = exp(time * coef);
+	const bool same_state = (state1 == state2);
+
+	derv1 = same_state ? -expt : expt / (num_states-1);
+	// the only time dependence is exp(time*coef), so one more derivative multiplies by coef
+	derv2 = derv1 * coef;
+
+	if (same_state)
+		return (1.0 + (num_states-1)*expt) / num_states;
+	return (1.0 - expt) / num_states;
+}
+
+/**
+ * Overload taking a model ID. This base class holds a single model, so
+ * model_id is ignored and the call is forwarded to the overload without it.
+ */
+double ModelSubst::computeTrans(double time, int model_id, int state1, int state2, double &derv1, double &derv2) {
+	const double prob = computeTrans(time, state1, state2, derv1, derv2);
+	return prob;
+}
+
+/**
+ * Fill the rate parameters of the equal-rates model: every entry is 1.
+ * @param rate_mat (OUT) array of getNumRateEntries() values
+ */
+void ModelSubst::getRateMatrix(double *rate_mat) {
+	double *entry = rate_mat;
+	for (int remaining = getNumRateEntries(); remaining > 0; remaining--)
+		*entry++ = 1.0;
+}
+
+/**
+ * Fill the rate matrix Q of the equal-rates (Jukes-Cantor type) model.
+ * Diagonal entries are -1 and every off-diagonal entry is 1/(num_states-1), so that
+ * each row sums to zero for any number of states. This matches the exponent
+ * num_states/(num_states-1) used by computeTransMatrix() and computeTrans().
+ * (The off-diagonal value used to be hard-coded as 1.0/3, which is only right for 4 states.)
+ * @param q_mat (OUT) full matrix of num_states*num_states entries,
+ *        entry (i,j) is stored at q_mat[i*num_states + j]
+ */
+void ModelSubst::getQMatrix(double *q_mat) {
+	// with a single state there is no off-diagonal entry; avoid dividing by zero
+	const double off_diag = (num_states > 1) ? 1.0 / (num_states - 1) : 0.0;
+	int i, j, k;
+	for (i = 0, k = 0; i < num_states; i++)
+		for (j = 0; j < num_states; j++, k++)
+			q_mat[k] = (i == j) ? -1.0 : off_diag;
+}
+
+/**
+ * Write the equal state frequencies 1/num_states into the caller's array.
+ * Note that the parameter hides the member of the same name: the member
+ * state_freq is neither read nor written here.
+ * @param state_freq (OUT) array of num_states values
+ */
+void ModelSubst::getStateFrequency(double *state_freq) {
+	const double equal_freq = 1.0 / num_states;
+	int state = 0;
+	while (state < num_states) {
+		state_freq[state] = equal_freq;
+		state++;
+	}
+}
+
+/**
+ * Compute the transition matrix of the equal-rates model together with its
+ * 1st and 2nd derivative matrices with respect to time.
+ * Entry (row, col) of each matrix is stored at index row*num_states + col.
+ * @param time time between two events
+ * @param trans_matrix (OUT) transition matrix, num_states*num_states entries
+ * @param trans_derv1 (OUT) 1st derivative matrix, num_states*num_states entries
+ * @param trans_derv2 (OUT) 2nd derivative matrix, num_states*num_states entries
+ */
+void ModelSubst::computeTransDerv(double time, double *trans_matrix, 
+		double *trans_derv1, double *trans_derv2)
+{
+	const double expf = exp(-time*num_states/(num_states - 1));
+
+	// values shared by all off-diagonal entries
+	const double non_diag = (1.0 - expf) / num_states;
+	const double derv1_non_diag = expf / (num_states-1);
+	const double derv2_non_diag = -derv1_non_diag*num_states/(num_states-1);
+
+	// values shared by all diagonal entries
+	const double diag = 1.0 - non_diag * (num_states - 1);
+	const double derv1_diag = -expf;
+	const double derv2_diag = -derv1_diag*num_states/(num_states-1);
+
+	int idx = 0;
+	for (int row = 0; row < num_states; row++)
+		for (int col = 0; col < num_states; col++, idx++) {
+			const bool on_diagonal = (row == col);
+			trans_matrix[idx] = on_diagonal ? diag : non_diag;
+			trans_derv1[idx] = on_diagonal ? derv1_diag : derv1_non_diag;
+			trans_derv2[idx] = on_diagonal ? derv2_diag : derv2_non_diag;
+		}
+}
+
+/**
+ * Compute transition matrix and derivative matrices at time*rate_val, then
+ * divide every entry by num_states (the equal state frequency) and scale the
+ * 1st derivative by rate_val and the 2nd derivative by rate_val squared.
+ * @param time time between two events
+ * @param rate_val rate multiplier applied to time
+ * @param trans_matrix (OUT) num_states*num_states entries
+ * @param trans_derv1 (OUT) num_states*num_states entries
+ * @param trans_derv2 (OUT) num_states*num_states entries
+ */
+void ModelSubst::computeTransDervFreq(double time, double rate_val, double* trans_matrix, double* trans_derv1, double* trans_derv2)
+{
+	computeTransDerv(time * rate_val, trans_matrix, trans_derv1, trans_derv2);
+
+	const double derv1_scale = rate_val/num_states;
+	const double derv2_scale = (rate_val*rate_val)/num_states;
+	const int num_entries = num_states * num_states;
+	for (int entry = 0; entry < num_entries; entry++) {
+		trans_matrix[entry] /= num_states;
+		trans_derv1[entry] *= derv1_scale;
+		trans_derv2[entry] *= derv2_scale;
+	}
+}
+
+/**
+ * Allocate an uninitialised array of num_states*num_states doubles with new[].
+ * @return pointer to the new array
+ */
+double *ModelSubst::newTransMatrix() {
+	const int num_entries = num_states * num_states;
+	return new double[num_entries];
+}
+
+/**
+ * Destructor: releases the state frequency array allocated by the constructor.
+ */
+ModelSubst::~ModelSubst()
+{
+	// delete[] on a null pointer is a no-op, so no test is needed
+	delete [] state_freq;
+}
+
+
+
diff --git a/model/modelsubst.h b/model/modelsubst.h
new file mode 100644
index 0000000..8d3e8a2
--- /dev/null
+++ b/model/modelsubst.h
@@ -0,0 +1,299 @@
+//
+// C++ Interface: substmodel
+//
+// Description: 
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#ifndef SUBSTMODEL_H
+#define SUBSTMODEL_H
+
+#include <string>
+#include "tools.h"
+#include "optimization.h"
+
+using namespace std;
+
+/**
+Substitution model base class. The functions implemented here provide the
+equal-rates, equal-frequencies (Jukes-Cantor type) model for any number of states;
+derived classes override them to define other models.
+
+	@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
+*/
+class ModelSubst: public Optimization
+{
+	friend class ModelFactory;
+
+public:
+	/**
+		constructor
+		@param nstates number of states, e.g. 4 for DNA, 20 for proteins.
+	*/
+    ModelSubst(int nstates);
+
+
+	/**
+		@return the number of dimensions (0 for this parameter-free default model)
+	*/
+	virtual int getNDim() { return 0; }
+
+	/**
+	 * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f};
+	 *         the default simply returns the model name
+	 */
+	virtual string getNameParams() { return name; }
+
+	/**
+		@return TRUE if model is time-reversible, FALSE otherwise
+	*/
+	virtual bool isReversible() { return true; };
+	
+	/**
+	 * @return TRUE if this is a site-specific model, FALSE otherwise
+	 */
+	virtual bool isSiteSpecificModel() { return false; }
+
+	/**
+	 * @return TRUE if this is a mixture model, FALSE otherwise
+	 */
+	virtual bool isMixture() { return false; }
+
+	/**
+	 * @return the number of mixture model components
+	 */
+	virtual int getNMixtures() { return 1; }
+
+	/**
+		@return the number of rate entries, equal to the number of elements
+			in the upper-diagonal of the rate matrix (since model is reversible)
+	*/
+	virtual int getNumRateEntries() { return num_states*(num_states-1)/2; }
+
+	/**
+	 * get the size of transition matrix, default is num_states*num_states.
+	 * can be changed for e.g. site-specific model
+	 */
+	virtual int getTransMatrixSize() { return num_states * num_states; }
+
+	/**
+		compute the transition probability matrix. One should override this function when defining new model.
+		The default is the Jukes-Cantor model, valid for all kind of data (DNA, AA, Codon, etc)
+		@param time time between two events
+		@param trans_matrix (OUT) the transition matrix between all pairs of states. 
+			Assume trans_matrix has size of num_states * num_states.
+	*/
+	virtual void computeTransMatrix(double time, double *trans_matrix);
+
+	/**
+	 * wrapper for computing transition matrix times state frequency vector;
+	 * the default divides every entry of the transition matrix by num_states
+	 * @param time time between two events
+	 * @param trans_matrix (OUT) the transition matrix between all pairs of states.
+	 * 	Assume trans_matrix has size of num_states * num_states.
+	 */
+	virtual void computeTransMatrixFreq(double time, double *trans_matrix);
+
+	/**
+		compute the transition probability between two states. 
+		One should override this function when defining new model.
+		The default is the Jukes-Cantor model, valid for all kind of data (DNA, AA, Codon, etc)
+		@param time time between two events
+		@param state1 first state
+		@param state2 second state
+	*/
+	virtual double computeTrans(double time, int state1, int state2);
+
+	/**
+		compute the transition probability between two states at a specific model ID, useful for partition model 
+		One should override this function when defining new model.
+		The default ignores model_id and calls the overload without it.
+		@param time time between two events
+		@param model_id model ID
+		@param state1 first state
+		@param state2 second state
+	*/
+	virtual double computeTrans(double time, int model_id, int state1, int state2);
+
+	/**
+		compute the transition probability and its 1st and 2nd derivatives between two states. 
+		One should override this function when defining new model.
+		The default is the Jukes-Cantor model, valid for all kind of data (DNA, AA, Codon, etc)
+		@param time time between two events
+		@param state1 first state
+		@param state2 second state
+		@param derv1 (OUT) 1st derivative
+		@param derv2 (OUT) 2nd derivative
+	*/
+	virtual double computeTrans(double time, int state1, int state2, double &derv1, double &derv2);
+
+	/**
+		compute the transition probability and its 1st and 2nd derivatives between two states at a specific model ID
+		One should override this function when defining new model.
+		The default ignores model_id and calls the overload without it.
+		@param time time between two events
+		@param model_id model ID
+		@param state1 first state
+		@param state2 second state
+		@param derv1 (OUT) 1st derivative
+		@param derv2 (OUT) 2nd derivative
+	*/
+	virtual double computeTrans(double time, int model_id, int state1, int state2, double &derv1, double &derv2);
+
+	/**
+	 * @return model ID for the given pattern, useful for e.g., partition model;
+	 *         the default always returns 0
+	 * @param ptn pattern ID of the alignment
+	 */
+	virtual int getPtnModelID(int ptn) { return 0; }
+	
+
+	/**
+	 * Get the rate parameters like a,b,c,d,e,f for DNA model!!!
+		Get the above-diagonal entries of the rate matrix, assuming that the last element is 1.
+		ONE SHOULD OVERRIDE THIS FUNCTION WHEN DEFINING NEW MODEL!!!
+		The default is equal rate of 1 (JC Model), valid for all kind of data.
+		@param rate_mat (OUT) upper-triangle rate matrix. Assume rate_mat has size of num_states*(num_states-1)/2
+	*/
+	
+	virtual void getRateMatrix(double *rate_mat);
+
+	/**
+		Get the rate matrix Q. One should override this function when defining new model.
+		The default fills a full square matrix with -1 on the diagonal and one
+		common value in all off-diagonal entries.
+		@param q_mat (OUT) full rate matrix. Assume q_mat has size of num_states*num_states
+	*/
+	virtual void getQMatrix(double *q_mat);
+
+	/**
+		compute the state frequency vector. One should override this function when defining new model.
+		The default is equal state frequency, valid for all kind of data.
+		@param state_freq (OUT) state frequency vector. Assume state_freq has size of num_states
+	*/
+	virtual void getStateFrequency(double *state_freq);
+
+	/**
+		get frequency type
+		@return frequency type; the default returns FREQ_EQUAL regardless of the freq_type member
+	*/
+	virtual StateFreqType getFreqType() { return FREQ_EQUAL; }
+
+
+	/**
+		allocate memory for a transition matrix. One should override this function when defining new model
+		such as Gamma model. The default is to allocate a double vector of size num_states * num_states. This
+		is equivalent to the memory needed by a square matrix.
+		@return the pointer to the newly allocated transition matrix (allocated with new[])
+	*/
+	virtual double *newTransMatrix();
+
+
+	/**
+		compute the transition probability matrix and the derivative 1 and 2
+		@param time time between two events
+		@param trans_matrix (OUT) the transition matrix between all pairs of states. 
+			Assume trans_matrix has size of num_states * num_states.
+		@param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states. 
+		@param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states. 
+	*/
+	virtual void computeTransDerv(double time, double *trans_matrix, 
+		double *trans_derv1, double *trans_derv2);
+
+	/**
+		compute the transition probability matrix and the derivative 1 and 2 times state frequency vector
+		@param time time between two events
+		@param rate_val rate value; the matrices are evaluated at time * rate_val
+		@param trans_matrix (OUT) the transition matrix between all pairs of states. 
+			Assume trans_matrix has size of num_states * num_states.
+		@param trans_derv1 (OUT) the 1st derivative matrix between all pairs of states. 
+		@param trans_derv2 (OUT) the 2nd derivative matrix between all pairs of states. 
+	*/
+	virtual void computeTransDervFreq(double time, double rate_val, double *trans_matrix, 
+		double *trans_derv1, double *trans_derv2);
+
+
+	/**
+		decompose the rate matrix into eigenvalues and eigenvectors (the default does nothing)
+	*/
+	virtual void decomposeRateMatrix() {}
+
+	/**
+		optimize model parameters. One should override this function when defining new model.
+		The default does nothing since it is a Jukes-Cantor type model, hence no parameters involved.
+		@param gradient_epsilon accuracy of the parameters during optimization (unused by the default)
+		@return the best likelihood; the default returns 0.0
+	*/
+	virtual double optimizeParameters(double gradient_epsilon) { return 0.0; }
+
+	/**
+	 * @return TRUE if parameters are at the boundary that may cause numerical instability
+	 */
+	virtual bool isUnstableParameters() { return false; }
+
+	/**
+		write information (the default writes nothing)
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out) {}
+
+	/** @return the eigenvalues; NULL in this base class */
+	virtual double *getEigenvalues() const {
+		return NULL;
+	}
+
+	/** @return the eigenvectors; NULL in this base class */
+	virtual double *getEigenvectors() const {
+		return NULL;
+	}
+
+	/** @return the inverse eigenvectors; NULL in this base class */
+	virtual double *getInverseEigenvectors() const {
+		return NULL;
+	}
+
+	/**
+		number of states
+	*/
+	int num_states;
+
+	/**
+		name of the model
+	*/
+	string name;
+
+
+	/**
+		full name of the model
+	*/
+	string full_name;
+	
+	/**
+	 state frequencies; array of num_states entries allocated by the constructor
+	 and released by the destructor
+	 */
+	double *state_freq;
+	
+
+	/**
+		state frequency type
+	*/
+	StateFreqType freq_type;
+
+	/**
+		destructor
+	*/
+    virtual ~ModelSubst();
+
+protected:
+
+	/**
+		this function is served for the multi-dimension optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0). The default does nothing.
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables) {}
+
+	/**
+		this function is served for the multi-dimension optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0). The default does nothing.
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables) {}
+
+};
+
+#endif
diff --git a/model/partitionmodel.cpp b/model/partitionmodel.cpp
new file mode 100644
index 0000000..c4624f8
--- /dev/null
+++ b/model/partitionmodel.cpp
@@ -0,0 +1,113 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "partitionmodel.h"
+#include "superalignment.h"
+
+/**
+ * Default constructor: only runs the ModelFactory default constructor.
+ */
+PartitionModel::PartitionModel() : ModelFactory()
+{
+}
+
+/**
+ * Create a partition model: one ModelFactory per partition tree of the super tree,
+ * plus a dummy model and rate object for the super tree itself.
+ * @param params program parameters; params.model_name is temporarily overwritten
+ *        for each partition and restored afterwards
+ * @param tree the super tree whose partition trees receive the new model factories
+ * @param models_block passed unchanged to every ModelFactory constructor
+ */
+PartitionModel::PartitionModel(Params &params, PhyloSuperTree *tree, ModelsBlock *models_block)
+        : ModelFactory()
+{
+	store_trans_matrix = params.store_trans_matrix;
+	is_storing = false;
+	joint_optimize = params.optimize_model_rate_joint;
+	fused_mix_rate = false;
+
+	// create dummy model
+	model = new ModelSubst(tree->aln->num_states);
+	site_rate = new RateHeterogeneity();
+	site_rate->setTree(tree);
+
+	// remember the command-line model name, because params.model_name is overwritten per partition
+    string model_name = params.model_name;
+    PhyloSuperTree::iterator it;
+    int part;
+    for (it = tree->begin(), part = 0; it != tree->end(); it++, part++) {
+        // a partition tree must not have a model factory yet
+        assert(!((*it)->getModelFactory()));
+        params.model_name = tree->part_info[part].model_name;
+        if (params.model_name == "") // if empty, take model name from command option
+        	params.model_name = model_name;
+        (*it)->setModelFactory(new ModelFactory(params, (*it), models_block));
+        (*it)->setModel((*it)->getModelFactory()->model);
+        (*it)->setRate((*it)->getModelFactory()->site_rate);
+        // restore the original model name
+        params.model_name = model_name;
+        // partition with fewer sequences than the super alignment, empirical frequencies, non-codon data
+        if ((*it)->aln->getNSeq() < tree->aln->getNSeq() && (*it)->getModel()->freq_type == FREQ_EMPIRICAL && (*it)->aln->seq_type != SEQ_CODON) {
+        	// modify state_freq to account for empty sequences:
+        	// the 2nd argument is (number of sites) * (number of sequences missing from this partition)
+        	(*it)->aln->computeStateFreq((*it)->getModel()->state_freq, (*it)->aln->getNSite() * (tree->aln->getNSeq() - (*it)->aln->getNSeq()));
+        	// state_freq changed, so redo the decomposition
+        	(*it)->getModel()->decomposeRateMatrix();
+        }
+        //string taxa_set = ((SuperAlignment*)tree->aln)->getPattern(part);
+        //(*it)->copyTree(tree, taxa_set);
+        //(*it)->drawTree(cout);
+    }
+}
+
+/**
+ * @return the sum of getNParameters() over the model factories of all partition trees
+ */
+int PartitionModel::getNParameters() {
+	PhyloSuperTree *super_tree = (PhyloSuperTree*)site_rate->getTree();
+	int total_params = 0;
+	PhyloSuperTree::iterator part_tree = super_tree->begin();
+	while (part_tree != super_tree->end()) {
+		total_params += (*part_tree)->getModelFactory()->getNParameters();
+		++part_tree;
+	}
+	return total_params;
+}
+
+
+/**
+ * Optimize the model parameters of every partition and add up the results.
+ * Partitions are visited in the order given by tree->part_order; with OpenMP the
+ * loop iterations are distributed over threads and the sum is an OpenMP reduction.
+ * @param fixed_len passed unchanged to each partition's ModelFactory::optimizeParameters
+ * @param write_info TRUE to print one progress line per partition
+ * @param logl_epsilon tolerance, divided by min(number of partitions, 10) for each partition
+ * @param gradient_epsilon tolerance, divided by min(number of partitions, 10) for each partition
+ * @return sum of the values returned for the individual partitions
+ */
+double PartitionModel::optimizeParameters(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+    PhyloSuperTree *tree = (PhyloSuperTree*)site_rate->getTree();
+    double tree_lh = 0.0;
+    int ntrees = tree->size();
+
+    // compute the partition order once, if it has not been set yet
+    if (tree->part_order.empty()) tree->computePartitionOrder();
+	#ifdef _OPENMP
+	#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic)
+	#endif
+    for (int i = 0; i < ntrees; i++) {
+        int part = tree->part_order[i];
+        // the braced block below is the body of this if; with OpenMP it is
+        // additionally a critical section, so that output lines do not interleave
+    	if (write_info)
+        #ifdef _OPENMP
+        #pragma omp critical
+        #endif
+        {
+    		cout << "Optimizing " << tree->at(part)->getModelName() <<
+        		" parameters for partition " << tree->part_info[part].name <<
+        		" (" << tree->at(part)->getModelFactory()->getNParameters() << " free parameters)" << endl;
+        }
+        // per-partition info is only requested at verbosity VB_MED or higher
+        tree_lh += tree->at(part)->getModelFactory()->optimizeParameters(fixed_len, write_info && verbose_mode >= VB_MED, 
+            logl_epsilon/min(ntrees,10), gradient_epsilon/min(ntrees,10));
+    }
+    //return ModelFactory::optimizeParameters(fixed_len, write_info);
+    return tree_lh;
+}
+
+/**
+ * Destructor; the body is empty, nothing is released here.
+ */
+PartitionModel::~PartitionModel() {}
+
+/**
+ * @return TRUE as soon as the model factory of one partition tree reports
+ *         unstable parameters (later partitions are not queried), FALSE otherwise
+ */
+bool PartitionModel::isUnstableParameters() {
+	PhyloSuperTree *super_tree = (PhyloSuperTree*)site_rate->getTree();
+
+	bool unstable = false;
+	for (PhyloSuperTree::iterator part_tree = super_tree->begin();
+			!unstable && part_tree != super_tree->end(); ++part_tree)
+		unstable = (*part_tree)->getModelFactory()->isUnstableParameters();
+	return unstable;
+}
diff --git a/model/partitionmodel.h b/model/partitionmodel.h
new file mode 100644
index 0000000..ed36693
--- /dev/null
+++ b/model/partitionmodel.h
@@ -0,0 +1,66 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef PARTITIONMODEL_H
+#define PARTITIONMODEL_H
+
+#include "phylosupertree.h"
+#include "modelfactory.h"
+
+/**
+Partition model: a ModelFactory for a PhyloSuperTree that delegates parameter
+counting, optimization and stability checks to the model factories of the
+individual partition trees.
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class PartitionModel : public ModelFactory
+{
+public:
+    /** default constructor */
+    PartitionModel();
+	/**
+		constructor
+		create partition model with possible rate heterogeneity. Create proper class objects
+		for two variables: model and site_rate. It takes the following fields of params into account:
+			model_name, num_rate_cats, freq_type, store_trans_matrix
+		@param params program parameters
+		@param tree associated phylogenetic super-tree
+		@param models_block NOTE(review): not described in the original documentation and the
+			constructor body is not shown here - presumably user-defined model definitions, confirm
+	*/
+	PartitionModel(Params &params, PhyloSuperTree *tree, ModelsBlock *models_block);
+
+    ~PartitionModel();
+
+    /**
+     * @return number of parameters summed over the model factories of all partitions
+     *         (NOTE(review): whether branch lengths are counted depends on
+     *         ModelFactory::getNParameters() - confirm)
+     */
+    virtual int getNParameters();
+
+	/**
+		optimize model parameters and tree branch lengths, partition by partition
+		@param fixed_len TRUE to fix branch lengths, default is false
+		@param write_info TRUE to print a progress line for every partition
+		@param logl_epsilon log-likelihood tolerance, divided by min(#partitions, 10) per partition
+		@param gradient_epsilon gradient tolerance, divided by min(#partitions, 10) per partition
+		@return the sum of the best likelihoods of all partitions
+	*/
+	virtual double optimizeParameters(bool fixed_len = false, bool write_info = true, double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+
+	/**
+	 * @return TRUE if the parameters of any partition are at the boundary, which may cause numerical instability
+	 */
+	virtual bool isUnstableParameters();
+
+};
+
+#endif
diff --git a/model/ratefree.cpp b/model/ratefree.cpp
new file mode 100644
index 0000000..12a62c9
--- /dev/null
+++ b/model/ratefree.cpp
@@ -0,0 +1,567 @@
+/*
+ * ratefree.cpp
+ *
+ *  Created on: Nov 3, 2014
+ *      Author: minh
+ */
+
+#include "phylotree.h"
+#include "ratefree.h"
+
+#include "model/modelfactory.h"
+#include "model/modelmixture.h"
+
+
+// lower and upper bound of the rate variables (see RateFree::setBounds)
+const double MIN_FREE_RATE = 0.001;
+const double MAX_FREE_RATE = 1000.0;
+// smallest tolerance passed to the BFGS routines in RateFree::optimizeParameters
+const double TOL_FREE_RATE = 0.0001;
+
+// Modified by Thomas on 13 May 2015
+//const double MIN_FREE_RATE_PROP = 0.0001;
+//const double MAX_FREE_RATE_PROP = 0.9999;
+// lower and upper bound of the proportion variables; these variables are the
+// ratios prop[i]/prop[ncategory-1] (see RateFree::setVariables), hence the
+// upper bound is not limited to 1
+const double MIN_FREE_RATE_PROP = 0.001;
+const double MAX_FREE_RATE_PROP = 1000;
+
+/**
+	Construct a FreeRate model.
+	Without user values (empty params) the categories keep the defaults set by
+	setNCategory(). Otherwise params must hold 2*ncat numbers, the proportion
+	and the rate of each category in turn; the proportions must sum to 1, the
+	rates are rescaled so that the mean rate is 1, and the parameters are fixed.
+	@param ncat number of rate categories
+	@param start_alpha shape value handed to the RateGamma constructor
+	@param params user-defined proportions and rates, or empty
+	@param sorted_rates true to keep the rates sorted in increasing order
+	@param opt_alg name of the optimization algorithm
+	@param tree associated phylogenetic tree
+*/
+RateFree::RateFree(int ncat, double start_alpha, string params, bool sorted_rates, string opt_alg, PhyloTree *tree) : RateGamma(ncat, start_alpha, false, tree) {
+	fix_params = false;
+	prop = NULL;
+	optimizing_params = 0;
+	this->sorted_rates = sorted_rates;
+	this->optimize_alg = opt_alg;
+	setNCategory(ncat);
+
+	if (!params.empty()) {
+		DoubleVector user_values;
+		try {
+			convert_double_vec(params.c_str(), user_values);
+			if (user_values.size() != ncategory*2)
+				outError("Number of parameters for FreeRate model must be twice number of categories");
+
+			// read (proportion, rate) pairs and accumulate the totals needed below
+			double weighted_rate_sum = 0.0;
+			double prop_total = 0.0;
+			for (int cat = 0; cat < ncategory; cat++) {
+				prop[cat] = user_values[cat*2];
+				rates[cat] = user_values[cat*2+1];
+				weighted_rate_sum += prop[cat]*rates[cat];
+				prop_total += prop[cat];
+			}
+			if (fabs(prop_total-1.0) > 1e-5)
+				outError("Sum of category proportions not equal to 1");
+
+			// normalize to mean rate 1
+			for (int cat = 0; cat < ncategory; cat++)
+				rates[cat] /= weighted_rate_sum;
+			fix_params = true;
+		} catch (string &str) {
+			outError(str);
+		}
+	}
+}
+
+/**
+	Set the number of rate categories. The rates are initialized by
+	RateGamma::setNCategory(), every category receives the same proportion
+	1/ncategory, and the short and full model names are rebuilt.
+	@param ncat number of categories
+*/
+void RateFree::setNCategory(int ncat) {
+	// start from the gamma rates
+	RateGamma::setNCategory(ncat);
+
+	// fresh proportion array with equal weights
+	delete [] prop;
+	prop = new double[ncategory];
+	for (int cat = 0; cat < ncategory; cat++)
+		prop[cat] = 1.0/ncategory;
+
+	name = "+R" + convertIntToString(ncategory);
+	full_name = "FreeRate";
+	full_name += " with " + convertIntToString(ncategory) + " categories";
+}
+
+/**
+	Initialize rates and proportions from a model that has one category less.
+	The input category with the largest proportion is split into two adjacent
+	categories, each with half of its proportion and with 0.586 and 1.414
+	times its rate; all other categories are copied. The rates are sorted
+	afterwards if sorted_rates is set, and the partial likelihoods of the tree
+	are cleared.
+	@param input FreeRate model with ncategory-1 categories
+*/
+void RateFree::setRateAndProp(RateFree *input) {
+    assert(input->ncategory == ncategory-1);
+    const int ninput = ncategory-1;
+
+    // input category with the largest proportion (the first one in case of ties)
+    int largest = 0;
+    for (int cat = 1; cat < ninput; cat++)
+        if (input->prop[cat] > input->prop[largest]) largest = cat;
+
+    // categories in front of the split keep their index
+    for (int cat = 0; cat < largest; cat++) {
+        rates[cat] = input->rates[cat];
+        prop[cat] = input->prop[cat];
+    }
+
+    // the split category occupies two slots
+    const double split_rate = input->rates[largest];
+    const double half_prop = input->prop[largest]/2;
+    rates[largest] = 0.586*split_rate;
+    prop[largest] = half_prop;
+    rates[largest+1] = 1.414*split_rate; // sqrt(2)
+    prop[largest+1] = half_prop;
+
+    // categories behind the split move up by one index
+    for (int cat = largest+1; cat < ninput; cat++) {
+        rates[cat+1] = input->rates[cat];
+        prop[cat+1] = input->prop[cat];
+    }
+
+    // sort the rates in increasing order
+    if (sorted_rates)
+        quicksort(rates, 0, ncategory-1, prop);
+    phylo_tree->clearAllPartialLH();
+}
+
+
+/** Free the array of category proportions. */
+RateFree::~RateFree() {
+	// delete [] on a null pointer is a no-op, so no check is needed
+	delete [] prop;
+	prop = NULL;
+}
+
+/**
+	@return model name with parameters in the form +Rk{p1,r1,p2,r2,...}, where k is
+		the number of categories and pi, ri are the proportion and rate of category i
+*/
+string RateFree::getNameParams() {
+	stringstream out;
+	out << "+R" << ncategory << "{";
+	const char *separator = "";
+	for (int cat = 0; cat < ncategory; cat++) {
+		out << separator << prop[cat] << "," << rates[cat];
+		separator = ",";
+	}
+	out << "}";
+	return out.str();
+}
+
+/**
+	@return mean rate, i.e. the sum over all categories of proportion times rate
+*/
+double RateFree::meanRates() {
+	double mean = 0.0;
+	int cat = 0;
+	while (cat < ncategory) {
+		mean += prop[cat] * rates[cat];
+		cat++;
+	}
+	return mean;
+}
+
+/**
+ * Divide every category rate by the current mean rate, so that the mean rate
+ * becomes 1.
+ * @return the mean rate before rescaling, i.e. the factor the rates were divided by
+ */
+double RateFree::rescaleRates() {
+	const double mean_rate = meanRates();
+	for (double *rate = rates; rate < rates + ncategory; rate++)
+		*rate /= mean_rate;
+	return mean_rate;
+}
+
+/**
+	@return number of variables of the current optimization phase; 0 if the
+		parameters are fixed
+*/
+int RateFree::getNDim() {
+    if (fix_params) return 0;
+    switch (optimizing_params) {
+    case 0:  return 2*ncategory-2; // proportions and rates jointly
+    case 1:  return ncategory;     // rates
+    case 2:  return ncategory-1;   // proportions
+    default: return 0;
+    }
+}
+
+/**
+	Target function of the multi-dimensional optimization.
+	@param x vector of variables, indexed from 1
+	@return negative log-likelihood of the tree for the parameters encoded in x
+*/
+double RateFree::targetFunk(double x[]) {
+	getVariables(x);
+	// partial likelihoods are cleared unless only the proportions are being optimized
+	const bool proportions_only = (optimizing_params == 2);
+	if (!proportions_only)
+		phylo_tree->clearAllPartialLH();
+	return -phylo_tree->computeLikelihood();
+}
+
+
+
+/**
+	Optimize the proportions and rates of the FreeRate model.
+	Depending on optimize_alg this runs the EM algorithm ("EM"), one joint BFGS
+	optimization of proportions and rates (names starting with "1-BFGS"), or
+	two successive BFGS optimizations, first of the rates and then of the
+	proportions. A name ending in "-B" selects L_BFGS_B() instead of
+	minimizeMultiDimen().
+	@param gradient_epsilon gradient tolerance; values below TOL_FREE_RATE are raised to TOL_FREE_RATE
+	@return the best likelihood
+*/
+double RateFree::optimizeParameters(double gradient_epsilon) {
+
+	int ndim = getNDim();
+
+	// return if nothing to be optimized
+	if (ndim == 0)
+		return phylo_tree->computeLikelihood();
+
+	if (verbose_mode >= VB_MED)
+		cout << "Optimizing " << name << " model parameters by " << optimize_alg << " algorithm..." << endl;
+
+    if (optimize_alg == "EM")
+        return optimizeWithEM();
+
+    // phases to run: 0 = joint, 1 = rates, 2 = proportions
+    int left = 1, right = 2;
+    if (optimize_alg.substr(0, 6) == "1-BFGS") {
+        left = 0;
+        right = 0;
+    }
+
+    // Test the "-B" suffix only for names of at least 2 characters: for shorter
+    // names length()-2 wraps around and substr() throws std::out_of_range.
+    const size_t alg_len = optimize_alg.length();
+    const bool use_lbfgsb = alg_len >= 2 && optimize_alg.substr(alg_len-2, 2) == "-B";
+
+    // getNDim() depends on optimizing_params, so the phases have different
+    // dimensions; size the work arrays for the largest one instead of the
+    // joint dimension only, which can be smaller than that of the rate phase.
+    int max_dim = ndim;
+    for (optimizing_params = left; optimizing_params <= right; optimizing_params++)
+        max_dim = max(max_dim, getNDim());
+
+	// all arrays are indexed from 1
+	double *variables = new double[max_dim+1];
+	double *upper_bound = new double[max_dim+1];
+	double *lower_bound = new double[max_dim+1];
+	bool *bound_check = new bool[max_dim+1];
+	double score = 0.0;
+
+    for (optimizing_params = left; optimizing_params <= right; optimizing_params++) {
+
+        ndim = getNDim();
+        // by BFGS algorithm
+        setVariables(variables);
+        setBounds(lower_bound, upper_bound, bound_check);
+
+        if (use_lbfgsb)
+            score = -L_BFGS_B(ndim, variables+1, lower_bound+1, upper_bound+1, max(gradient_epsilon, TOL_FREE_RATE));
+        else
+            score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_FREE_RATE));
+
+        getVariables(variables);
+        // sort the rates in increasing order
+        if (sorted_rates)
+            quicksort(rates, 0, ncategory-1, prop);
+        phylo_tree->clearAllPartialLH();
+    }
+    optimizing_params = 0;
+
+	delete [] bound_check;
+	delete [] lower_bound;
+	delete [] upper_bound;
+	delete [] variables;
+
+	return score;
+}
+
+/**
+	Set the bounds of the variables of the current optimization phase. The
+	proportion variables (if any) come first and are followed by the rate
+	variables (if any); bound checking is switched off for all of them.
+	@param lower_bound (OUT) lower bounds, indexed from 1
+	@param upper_bound (OUT) upper bounds, indexed from 1
+	@param bound_check (OUT) bound-check flags, indexed from 1
+*/
+void RateFree::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
+	if (getNDim() == 0) return;
+
+	// number of proportion variables and of rate variables in this phase
+	int nprop = 0, nrate = 0;
+	switch (optimizing_params) {
+	case 2: // proportions
+		nprop = ncategory-1;
+		break;
+	case 1: // rates
+		nrate = ncategory;
+		break;
+	default: // both weights and rates
+		nprop = ncategory-1;
+		nrate = ncategory-1;
+		break;
+	}
+
+	int pos = 1;
+	for (int k = 0; k < nprop; k++, pos++) {
+		lower_bound[pos] = MIN_FREE_RATE_PROP;
+		upper_bound[pos] = MAX_FREE_RATE_PROP;
+		bound_check[pos] = false;
+	}
+	for (int k = 0; k < nrate; k++, pos++) {
+		lower_bound[pos] = MIN_FREE_RATE;
+		upper_bound[pos] = MAX_FREE_RATE;
+		bound_check[pos] = false;
+	}
+}
+
+
+/**
+	Pack the model parameters of the current optimization phase into the
+	variable vector (parameterization modified by Thomas on 13 May 2015).
+	Proportions are stored relative to the proportion of the last category.
+	Rates are stored as they are when only rates are optimized, and relative
+	to the rate of the last category in the joint optimization.
+	@param variables (OUT) vector of variables, indexed from 1
+*/
+void RateFree::setVariables(double *variables) {
+	if (getNDim() == 0) return;
+	const int last = ncategory-1;
+
+	if (optimizing_params == 1) {
+		// rates only
+		for (int cat = 0; cat <= last; cat++)
+			variables[cat+1] = rates[cat];
+		return;
+	}
+
+	// proportions, present in the proportion phase and in the joint phase
+	for (int cat = 0; cat < last; cat++)
+		variables[cat+1] = prop[cat] / prop[last];
+
+	if (optimizing_params != 2) {
+		// joint phase: the rate ratios follow the proportions
+		for (int cat = 0; cat < last; cat++)
+			variables[cat+ncategory] = rates[cat] / rates[last];
+	}
+}
+
+/**
+	Unpack the variable vector of the current optimization phase into the model
+	parameters (parameterization modified by Thomas on 13 May 2015).
+	The proportion variables are ratios to the last category and are
+	normalized so that all proportions sum to 1. In the joint phase the rate
+	variables are ratios to the last rate and are scaled so that the mean rate
+	is 1; in the rate phase they are taken over unchanged.
+	@param variables vector of variables, indexed from 1
+*/
+void RateFree::getVariables(double *variables) {
+	if (getNDim() == 0) return;
+	const int last = ncategory-1;
+
+	if (optimizing_params == 1) {
+		// rates only
+		for (int cat = 0; cat <= last; cat++)
+			rates[cat] = variables[cat+1];
+		return;
+	}
+
+	// proportions: the last category has the implicit ratio 1
+	double ratio_total = 1.0;
+	for (int cat = 0; cat < last; cat++)
+		ratio_total += variables[cat+1];
+	for (int cat = 0; cat < last; cat++)
+		prop[cat] = variables[cat+1] / ratio_total;
+	prop[last] = 1.0 / ratio_total;
+
+	if (optimizing_params == 2) return;
+
+	// joint phase: rates, with the last category again having the implicit ratio 1
+	double mean_ratio = prop[last];
+	for (int cat = 0; cat < last; cat++)
+		mean_ratio += prop[cat] * variables[cat+ncategory];
+	for (int cat = 0; cat < last; cat++)
+		rates[cat] = variables[cat+ncategory] / mean_ratio;
+	rates[last] = 1.0 / mean_ratio;
+}
+
+/**
+	write the proportion and rate of every category on one line
+	@param out output stream
+*/
+void RateFree::writeInfo(ostream &out) {
+	out << "Site proportion and rates: ";
+	int cat = 0;
+	while (cat < ncategory) {
+		out << " (" << prop[cat] << "," << rates[cat] << ")";
+		cat++;
+	}
+	out << endl;
+}
+
+/**
+	write parameters, used with modeltest: a tab-separated list of the
+	proportion and rate of every category, each value preceded by a tab
+	@param out output stream
+*/
+void RateFree::writeParameters(ostream &out) {
+	for (int cat = 0; cat < ncategory; cat++) {
+		out << "\t" << prop[cat];
+		out << "\t" << rates[cat];
+	}
+}
+
+/**
+	Optimize proportions and rates with an EM algorithm.
+	Each of at most ncategory steps (1) computes the per-category pattern
+	likelihoods, (2) turns them into posterior weights and updates the
+	proportions, and (3) re-estimates the rate of every category as a
+	tree-length scaling factor on a temporary copy of the tree whose pattern
+	frequencies are replaced by the posterior weights of that category.
+	The loop stops early when no proportion and no rate changed by 1e-4 or more.
+	@return the likelihood of phylo_tree computed after the last step
+*/
+double RateFree::optimizeWithEM() {
+    size_t ptn, c;
+    size_t nptn = phylo_tree->aln->getNPattern();
+    size_t nmix = ncategory;
+    
+//    double *lk_ptn = aligned_alloc<double>(nptn);
+    double *new_prop = aligned_alloc<double>(nmix);
+    // temporary tree used for the per-category rate optimization
+    PhyloTree *tree = new PhyloTree;
+    tree->copyPhyloTree(phylo_tree);
+    tree->optimize_by_newton = phylo_tree->optimize_by_newton;
+    tree->setLikelihoodKernel(phylo_tree->sse);
+    // initialize model
+    ModelFactory *model_fac = new ModelFactory();
+    model_fac->joint_optimize = phylo_tree->params->optimize_model_rate_joint;
+
+    // the temporary tree gets a plain RateHeterogeneity object as rate model
+    RateHeterogeneity *site_rate = new RateHeterogeneity; 
+    tree->setRate(site_rate);
+    site_rate->setTree(tree);
+            
+    model_fac->site_rate = site_rate;
+    tree->model_factory = model_fac;
+    tree->setParams(phylo_tree->params);
+    // NOTE(review): model_fac and site_rate are not deleted in this function;
+    // presumably the PhyloTree destructor releases them - confirm
+        
+    // EM algorithm loop described in Wang, Li, Susko, and Roger (2008)
+    for (int step = 0; step < ncategory; step++) {
+        // first compute _pattern_lh_cat
+        // NOTE(review): score is assigned but never read; the calls are kept
+        // because the code below reads phylo_tree->_pattern_lh_cat
+        double score;
+        if (!phylo_tree->getModel()->isMixture())
+            score = phylo_tree->computeLikelihoodBranchEigen((PhyloNeighbor*)phylo_tree->root->neighbors[0], (PhyloNode*)phylo_tree->root); 
+        else if (phylo_tree->getModelFactory()->fused_mix_rate) {
+            score = phylo_tree->computeMixrateLikelihoodBranchEigen((PhyloNeighbor*)phylo_tree->root->neighbors[0], (PhyloNode*)phylo_tree->root); 
+        } else {
+            // NOTE(review): the statement after outError() only runs if outError() returns - confirm
+            outError("Mixture model does not work with FreeRate model!");
+            score = phylo_tree->computeMixtureLikelihoodBranchEigen((PhyloNeighbor*)phylo_tree->root->neighbors[0], (PhyloNode*)phylo_tree->root); 
+        }
+        memset(new_prop, 0, nmix*sizeof(double));
+                
+        // E-step
+        // decoupled weights (prop) from _pattern_lh_cat to obtain L_ci and compute pattern likelihood L_i
+        for (ptn = 0; ptn < nptn; ptn++) {
+            double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
+            double lk_ptn = 0.0;
+            for (c = 0; c < nmix; c++) {
+                lk_ptn += this_lk_cat[c];
+            }
+            // from here on lk_ptn holds pattern frequency / pattern likelihood
+            lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn;
+            
+            // transform _pattern_lh_cat into posterior probabilities of each category
+            // (weighted by the pattern frequency) and sum them up per category
+            for (c = 0; c < nmix; c++) {
+                this_lk_cat[c] *= lk_ptn;
+                new_prop[c] += this_lk_cat[c];
+            }
+            
+        } 
+        
+        // M-step, update weights according to (*)        
+        
+        bool converged = true;
+        for (c = 0; c < nmix; c++) {
+            // new proportion = summed posterior weight / number of alignment sites
+            new_prop[c] = new_prop[c] / phylo_tree->getAlnNSite();
+            // check for convergence
+            converged = converged && (fabs(prop[c]-new_prop[c]) < 1e-4);
+            prop[c] = new_prop[c];
+        }
+        
+        // now optimize rates one by one
+        // NOTE(review): sum accumulates the mean rate but is never read afterwards
+        double sum = 0.0;
+        for (c = 0; c < nmix; c++) {
+            tree->copyPhyloTree(phylo_tree);
+            // substitution model of this category: the c-th mixture component or the single model
+            ModelGTR *subst_model;
+            if (phylo_tree->getModel()->isMixture())
+                subst_model = ((ModelMixture*)phylo_tree->getModel())->at(c);
+            else
+                subst_model = (ModelGTR*)phylo_tree->getModel();
+            // lend the substitution model to the temporary tree
+            tree->setModel(subst_model);
+            subst_model->setTree(tree);
+            model_fac->model = subst_model;
+                        
+            // initialize likelihood
+            tree->initializeAllPartialLh();
+            // copy posterior probability into ptn_freq
+            tree->computePtnFreq();
+            double *this_lk_cat = phylo_tree->_pattern_lh_cat+c;
+            for (ptn = 0; ptn < nptn; ptn++)
+                tree->ptn_freq[ptn] = this_lk_cat[ptn*nmix];
+            // start from the current rate and optimize it as a tree-length scaling factor
+            double scaling = rates[c];
+            tree->scaleLength(scaling);
+            tree->optimizeTreeLengthScaling(scaling, 0.001);
+            converged = converged && (fabs(rates[c] - scaling) < 1e-4);
+            rates[c] = scaling;
+            sum += prop[c] * rates[c];
+            // reset subst model
+            // (hand the model back to phylo_tree; presumably setModel(NULL) keeps the
+            // temporary tree from touching the shared model later - confirm)
+            tree->setModel(NULL);
+            subst_model->setTree(phylo_tree);
+            
+        }
+        
+        phylo_tree->clearAllPartialLH();
+        if (converged) break;
+    }
+    
+    delete tree;
+    aligned_free(new_prop);
+    return phylo_tree->computeLikelihood();
+}
+
+//double RateFree::optimizeWeights() {
+//    // first compute _pattern_lh_cat
+//    double score;
+//    if (!phylo_tree->getModel()->isMixture())
+//        score = phylo_tree->computeLikelihoodBranchEigen((PhyloNeighbor*)phylo_tree->root->neighbors[0], (PhyloNode*)phylo_tree->root); 
+//    else if (phylo_tree->getModelFactory()->fused_mix_rate) {
+//        score = phylo_tree->computeMixrateLikelihoodBranchEigen((PhyloNeighbor*)phylo_tree->root->neighbors[0], (PhyloNode*)phylo_tree->root); 
+//    } else {
+//        outError("Mixture model does not work with FreeRate model!");
+//        score = phylo_tree->computeMixtureLikelihoodBranchEigen((PhyloNeighbor*)phylo_tree->root->neighbors[0], (PhyloNode*)phylo_tree->root); 
+//    }
+//    size_t ptn, c;
+//    size_t nptn = phylo_tree->aln->getNPattern();
+//    size_t nmix = ncategory;
+//    
+//    double *lk_ptn = aligned_alloc<double>(nptn);
+//    double *new_prop = aligned_alloc<double>(nmix);
+//    
+//        
+//    // EM algorithm loop described in Wang, Li, Susko, and Roger (2008)
+//    for (int step = 0; step < 100; step++) {
+//        // E-step
+//        memset(lk_ptn, 0, nptn*sizeof(double));
+//        if (step == 0) {
+//            for (c = 0; c < nmix; c++) 
+//                new_prop[c] = 1.0 / prop[c];
+//            // decoupled weights (prop) from _pattern_lh_cat to obtain L_ci and compute pattern likelihood L_i
+//            for (ptn = 0; ptn < nptn; ptn++) {
+//                double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
+//                for (c = 0; c < nmix; c++) {
+//                    lk_ptn[ptn] += this_lk_cat[c];
+//                    this_lk_cat[c] *= new_prop[c];
+//                }
+//            } 
+//        } else {
+//            // update L_i according to (**)
+//            for (ptn = 0; ptn < nptn; ptn++) {
+//                double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
+//                for (c = 0; c < nmix; c++) {
+//                    lk_ptn[ptn] += this_lk_cat[c] * prop[c];
+//                }
+//            }        
+//        }
+//        
+//        // M-step, update weights according to (*)
+//        memset(new_prop, 0, nmix*sizeof(double));
+//        for (ptn = 0; ptn < nptn; ptn++) {
+//            double inv_lk_ptn = phylo_tree->ptn_freq[ptn] / lk_ptn[ptn];
+//            double *this_lk_cat = phylo_tree->_pattern_lh_cat + ptn*nmix;
+//            for (c = 0; c < nmix; c++)
+//                new_prop[c] += this_lk_cat[c] * inv_lk_ptn;
+//        }
+//        
+//        bool converged = true;
+//        for (c = 0; c < nmix; c++) {
+//            new_prop[c] = prop[c] * (new_prop[c] / phylo_tree->getAlnNSite());
+//            // check for convergence
+//            converged = converged && (fabs(prop[c]-new_prop[c]) < 1e-4);
+//            prop[c] = new_prop[c];
+//        }
+//        if (converged) break;
+//    }
+//    
+//    aligned_free(new_prop);
+//    aligned_free(lk_ptn);
+//    return phylo_tree->computeLikelihood();
+//}
diff --git a/model/ratefree.h b/model/ratefree.h
new file mode 100644
index 0000000..a445a2c
--- /dev/null
+++ b/model/ratefree.h
@@ -0,0 +1,139 @@
+/*
+ * ratefree.h
+ *
+ *  Created on: Nov 3, 2014
+ *      Author: minh
+ */
+
+#ifndef RATEFREE_H_
+#define RATEFREE_H_
+
+#include "rategamma.h"
+
+/**
+	FreeRate model of rate heterogeneity across sites ("+R"): every category has
+	its own rate and its own proportion of sites. The rates are initialized
+	through RateGamma, which is a virtual base class.
+*/
+class RateFree: virtual public RateGamma {
+public:
+	/**
+		constructor
+		@param ncat number of rate categories
+		@param start_alpha shape value passed to the RateGamma constructor
+		@param params user-defined parameters: proportion and rate of each category
+			in turn (2*ncat numbers), or empty to estimate them
+		@param sorted_rates true to sort the rates in increasing order
+		@param opt_alg optimization algorithm (1-BFGS, 2-BFGS, EM)
+		@param tree associated phylogenetic tree
+	*/
+    RateFree(int ncat, double start_alpha, string params, bool sorted_rates, string opt_alg, PhyloTree *tree);
+
+	virtual ~RateFree();
+
+	/**
+		@return true if this is a Gamma model; always false for the FreeRate model
+	*/	
+    virtual bool isGammaRate() { return false; }
+
+	/**
+	 * @return model name with parameters in the form +Rk{p1,r1,...,pk,rk}
+	 *         (k categories, proportion pi and rate ri of category i)
+	 */
+	virtual string getNameParams();
+
+	/**
+		get the proportion of sites under a specified category.
+		@param category category ID from 0 to #category-1
+		@return the proportion of the specified category
+	*/
+	virtual double getProp(int category) { return prop[category]; }
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x (negative log-likelihood of the tree)
+	*/
+	virtual double targetFunk(double x[]);
+
+	/**
+	 * setup the bounds for joint optimization with BFGS
+	 * @param lower_bound (OUT) lower bounds of the variables, indexed from 1
+	 * @param upper_bound (OUT) upper bounds of the variables, indexed from 1
+	 * @param bound_check (OUT) bound-check flags, indexed from 1
+	 */
+	virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check);
+
+	/**
+		optimize the proportions and rates with the algorithm selected by optimize_alg
+		@param gradient_epsilon gradient tolerance of the BFGS optimization
+		@return the best likelihood
+	*/
+	virtual double optimizeParameters(double gradient_epsilon);
+
+    /** optimize proportions (weights) and rates using the EM algorithm */
+    double optimizeWithEM();
+
+    /**
+        NOTE(review): only declared here; the implementation in ratefree.cpp is
+        commented out, so calling this function would fail at link time
+    */
+    double optimizeWeights();
+
+	/**
+		return the number of dimensions of the current optimization phase
+		(see optimizing_params); 0 if the parameters are fixed
+	*/
+	virtual int getNDim();
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		write parameters, used with modeltest
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out);
+
+    /**
+        set number of rate categories; resets the proportions to equal values
+        @param ncat #categories
+    */
+	virtual void setNCategory(int ncat);
+
+    /**
+        initialize rates and prop from rate model with #category less by 1
+        @param input FreeRate model with one category less than this model
+    */
+    void setRateAndProp(RateFree *input);
+
+	/**
+	 * used to normalize branch lengths if mean rate is not equal to 1 (e.g. FreeRate model)
+	 * @return mean rate, i.e. the sum over all categories of proportion times rate
+	 */
+	virtual double meanRates();
+
+	/**
+	 * rescale rates s.t. mean rate is equal to 1, useful for FreeRate model
+	 * @return rescaling factor (the mean rate before rescaling)
+	 */
+	virtual double rescaleRates();
+
+protected:
+
+	/**
+		this function serves the multi-dimension optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function serves the multi-dimension optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+	/**
+	 * proportion of sites for each rate category (array of ncategory entries)
+	 */
+	double *prop;
+
+	/** TRUE to fix parameters */
+	bool fix_params;
+    
+    /** true to sort rate in increasing order, false otherwise */
+    bool sorted_rates;
+
+    /** optimization phase. 0: weights and rates jointly, 1: rates, 2: weights */
+    int optimizing_params;
+
+    /** name of the optimization algorithm (1-BFGS, 2-BFGS, EM) */
+    string optimize_alg;
+
+};
+
+#endif /* RATEFREE_H_ */
diff --git a/model/ratefreeinvar.cpp b/model/ratefreeinvar.cpp
new file mode 100644
index 0000000..989de24
--- /dev/null
+++ b/model/ratefreeinvar.cpp
@@ -0,0 +1,82 @@
+/*
+ * ratefreeinvar.cpp
+ *
+ *  Created on: Nov 7, 2014
+ *      Author: minh
+ */
+
+#include "ratefreeinvar.h"
+
+/**
+	Construct a FreeRate model combined with a proportion of invariable sites.
+	RateGamma is a virtual base class and is therefore initialized here, with
+	shape 1.0; start_alpha is only forwarded to the RateFree constructor.
+	@param ncat number of rate categories
+	@param start_alpha forwarded to RateFree
+	@param params user-defined FreeRate parameters, forwarded to RateFree
+	@param sorted_rates true to sort the rates in increasing order
+	@param p_invar_sites proportion of invariable sites, forwarded to RateInvar
+	@param opt_alg optimization algorithm, forwarded to RateFree
+	@param tree associated phylogenetic tree
+*/
+RateFreeInvar::RateFreeInvar(int ncat, double start_alpha, string params, bool sorted_rates, double p_invar_sites, string opt_alg, PhyloTree *tree)
+: RateGamma(ncat, 1.0, false, tree), RateInvar(p_invar_sites, tree), RateFree(ncat, start_alpha, params, sorted_rates, opt_alg, tree)
+{
+	cur_optimize = 0;
+	// prepend the invariable-site part to the names built by RateFree
+	full_name.insert(0, "Invar+");
+	name.insert(0, "+I");
+}
+
+/**
+	Set the number of rate categories as RateFree does, then prepend the
+	invariable-site part to the short and the full model name.
+	@param ncat number of categories
+*/
+void RateFreeInvar::setNCategory(int ncat) {
+	RateFree::setNCategory(ncat);
+	full_name.insert(0, "Invar+");
+	name.insert(0, "+I");
+}
+
+/**
+	One-dimensional target function for the proportion of invariable sites.
+	@param value proportion of invariable sites to evaluate; stored in p_invar
+	@return negative log-likelihood of the tree for this proportion
+*/
+double RateFreeInvar::computeFunction(double value) {
+	p_invar = value;
+	phylo_tree->clearAllPartialLH();
+	const double logl = phylo_tree->computeLikelihood();
+	return -logl;
+}
+
+/**
+	Target function of the multi-dimensional optimization; delegates to RateFree.
+	@param x vector of variables, indexed from 1
+	@return the value of RateFree::targetFunk() at x
+*/
+double RateFreeInvar::targetFunk(double x[]) {
+	const double value = RateFree::targetFunk(x);
+	return value;
+}
+
+/**
+	write information: the output of RateInvar followed by that of RateFree
+	@param out output stream
+*/
+void RateFreeInvar::writeInfo(ostream &out) {
+	RateInvar::writeInfo(out);
+	RateFree::writeInfo(out);
+}
+
+/**
+	write parameters, used with modeltest: the parameters of RateInvar
+	followed by those of RateFree
+	@param out output stream
+*/
+void RateFreeInvar::writeParameters(ostream &out) {
+	RateInvar::writeParameters(out);
+	RateFree::writeParameters(out);
+}
+
+/**
+	Set the bounds of the FreeRate variables and, if the proportion of
+	invariable sites is optimized as well, of that additional variable.
+	@param lower_bound (OUT) lower bounds, indexed from 1
+	@param upper_bound (OUT) upper bounds, indexed from 1
+	@param bound_check (OUT) bound-check flags, indexed from 1
+*/
+void RateFreeInvar::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
+	RateFree::setBounds(lower_bound, upper_bound, bound_check);
+	if (RateInvar::getNDim() != 0) {
+		// shift the arrays by the total dimension minus 1; presumably RateInvar
+		// fills entry 1 of what it receives, i.e. the last variable - confirm
+		const int offset = getNDim()-1;
+		RateInvar::setBounds(lower_bound+offset, upper_bound+offset, bound_check+offset);
+	}
+}
+
+/**
+	optimize parameters; delegates to RateFree::optimizeParameters()
+	@param gradient_epsilon gradient tolerance
+	@return the best likelihood
+*/
+double RateFreeInvar::optimizeParameters(double gradient_epsilon) {
+	return RateFree::optimizeParameters(gradient_epsilon);
+}
+
+/**
+	this function serves the multi-dimension optimization. It packs the FreeRate
+	parameters and, if it is optimized, the proportion of invariable sites as
+	the last variable
+	@param variables (OUT) vector of variables, indexed from 1
+*/
+void RateFreeInvar::setVariables(double *variables) {
+	RateFree::setVariables(variables);
+	if (RateInvar::getNDim() != 0) {
+		const int last = getNDim();
+		variables[last] = p_invar;
+	}
+}
+
+/**
+	this function serves the multi-dimension optimization. It assigns the FreeRate
+	parameters and, if it is optimized, the proportion of invariable sites (the
+	last variable) from a vector that is indexed from 1 (NOTE: not from 0)
+	@param variables vector of variables, indexed from 1
+*/
+void RateFreeInvar::getVariables(double *variables) {
+	RateFree::getVariables(variables);
+	if (RateInvar::getNDim() != 0) {
+		const int last = getNDim();
+		p_invar = variables[last];
+	}
+}
+
diff --git a/model/ratefreeinvar.h b/model/ratefreeinvar.h
new file mode 100644
index 0000000..6f42e3e
--- /dev/null
+++ b/model/ratefreeinvar.h
@@ -0,0 +1,116 @@
+/*
+ * ratefreeinvar.h
+ *
+ *  Created on: Nov 7, 2014
+ *      Author: minh
+ */
+
+#ifndef RATEFREEINVAR_H_
+#define RATEFREEINVAR_H_
+
+#include "rateinvar.h"
+#include "ratefree.h"
+
+/**
+	FreeRate model combined with a proportion of invariable sites ("+I+R").
+*/
+class RateFreeInvar: public RateInvar, public RateFree {
+public:
+
+ 	/**
+		constructor
+		@param ncat number of rate categories
+		@param start_alpha forwarded to the RateFree constructor
+		@param params user-defined FreeRate parameters, forwarded to the RateFree constructor
+		@param sorted_rates true to sort the rates in increasing order
+		@param p_invar_sites proportion of invariable sites
+		@param opt_alg optimization algorithm, forwarded to the RateFree constructor
+		@param tree associated phylogenetic tree
+	*/
+    RateFreeInvar(int ncat, double start_alpha, string params, bool sorted_rates, double p_invar_sites, string opt_alg, PhyloTree *tree);
+
+
+	/**
+		return the number of dimensions: those of RateInvar plus those of RateFree
+	*/
+	virtual int getNDim() { return RateInvar::getNDim() + RateFree::getNDim(); }
+
+	/**
+		get the proportion of sites under a specified category.
+		@param category category ID from 0 to #category-1
+		@return the FreeRate proportion of the specified category scaled by (1 - p_invar)
+	*/
+	virtual double getProp(int category) { return (1.0-p_invar)*prop[category]; }
+
+	/**
+		get the rate of a specified category.
+		@param category category ID from 0 to #category-1
+		@return the rate of the specified category as returned by RateFree::getRate()
+	*/
+	virtual double getRate(int category) { return RateFree::getRate(category); }
+
+	/**
+	 * @return the name with parameters of RateInvar followed by that of RateFree
+	 */
+	virtual string getNameParams() {
+		return RateInvar::getNameParams() + RateFree::getNameParams();
+	}
+
+	/**
+		one-dimensional target function: sets p_invar to the given value and
+		evaluates the tree likelihood.
+		@param value value of p_invar
+		@return negative log-likelihood of the tree
+	*/
+	virtual double computeFunction(double value);
+
+	/**
+	 * setup the bounds for joint optimization with BFGS
+	 * @param lower_bound (OUT) lower bounds of the variables, indexed from 1
+	 * @param upper_bound (OUT) upper bounds of the variables, indexed from 1
+	 * @param bound_check (OUT) bound-check flags, indexed from 1
+	 */
+	virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check);
+
+	/**
+		optimize parameters
+		@param gradient_epsilon gradient tolerance, passed to RateFree::optimizeParameters()
+		@return the best likelihood
+	*/
+	virtual double optimizeParameters(double gradient_epsilon);
+
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]);
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		write parameters, used with modeltest
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out);
+
+	/**
+		set number of rate categories and rebuild the model names
+		@param ncat #categories
+	*/
+	virtual void setNCategory(int ncat);
+
+protected:
+
+	/**
+		this function serves the multi-dimension optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function serves the multi-dimension optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+private:
+
+	/**
+		current parameter to optimize; set to 0 by the constructor.
+		NOTE(review): not read anywhere in ratefreeinvar.cpp - presumably left over
+		from the gamma+invar model, confirm before relying on it
+	*/
+	int cur_optimize;
+
+};
+
+#endif /* RATEFREEINVAR_H_ */
diff --git a/model/rategamma.cpp b/model/rategamma.cpp
new file mode 100644
index 0000000..2bbd2bf
--- /dev/null
+++ b/model/rategamma.cpp
@@ -0,0 +1,435 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "phylotree.h"
+#include "rategamma.h"
+#include <math.h>
+
+
+
+RateGamma::RateGamma(int ncat, double shape, bool median, PhyloTree *tree) : RateHeterogeneity()
+{
+	// shape > 0: use it and keep it fixed; shape < 0: start from |shape| and leave it free;
+	// shape == 0: draw a random starting value and leave it free.
+	phylo_tree = tree;
+	cut_median = median;
+	ncategory = ncat;
+	rates = NULL; // allocated by setNCategory() below
+	fix_gamma_shape = (shape > 0.0);
+	if (shape == 0.0) {
+		gamma_shape = max(MIN_GAMMA_SHAPE, random_double() * 10.0);
+		cout << "Randomize initial gamma shape (alpha): " << gamma_shape << endl;
+	} else {
+		// in either remaining case the value is bounded from below by MIN_GAMMA_SHAPE
+		gamma_shape = max(MIN_GAMMA_SHAPE, fabs(shape));
+	}
+	setNCategory(ncat);
+}
+
+void RateGamma::setNCategory(int ncat) { // reallocate rates[], rebuild the names, recompute the rates
+	ncategory = ncat;
+	delete [] rates; // delete[] on a NULL pointer is a no-op, so no guard is needed
+	rates = new double[ncat];
+	full_name = "Gamma with " + convertIntToString(ncat) + " categories";
+	name = "+G" + convertIntToString(ncat);
+	computeRates();
+}
+
+
+string RateGamma::getNameParams() { // rendered as +G<ncategory>{<gamma_shape>}
+	ostringstream label;
+	label << "+G" << ncategory << '{' << gamma_shape << '}';
+	return label.str();
+}
+
+RateGamma::~RateGamma()
+{ // releases the rate array held by this object
+	delete [] rates; // no NULL check needed: delete[] on NULL does nothing
+	rates = NULL;
+}
+
+void RateGamma::computeRates() {
+	// Fills rates[] from the current ncategory, gamma_shape and cut_median settings.
+	// With a single category the rate is 1.0 and no further rescaling is applied.
+	if (ncategory == 1) {
+		rates[0] = 1.0;
+		return;
+	}
+
+	if (cut_median) {
+		// median variant: quantile at the midpoint probability of every category
+		const double df = 2.0 * gamma_shape; // degrees of freedom passed to cmpPointChi2
+		double total = 0.0;
+		for (int k = 0; k < ncategory; k++) {
+			double midpoint = ( 2.0 * k + 1 ) / (2.0 * ncategory);
+			double quantile = cmpPointChi2 (midpoint, df) / df;
+			// cmpPointChi2 signals failure with a negative value; the sign is dropped here
+			if (quantile < 0.0)
+				quantile = -quantile;
+			rates[k] = quantile;
+			total += quantile;
+		}
+
+		// rescale so that the rates average to 1.0
+		for (int k = 0; k < ncategory; k++)
+			rates[k] = rates[k] * ncategory / total;
+	} else {
+		computeRatesMean();
+	}
+
+	/* BQM 2015-02-25: Testing if RAxML forgot this rate rescaling step */
+	const bool skip_invar_rescaling =
+		phylo_tree && phylo_tree->params && phylo_tree->params->no_rescale_gamma_invar;
+	if (skip_invar_rescaling)
+		return;
+
+	/* if invariable sites are present: divide every rate by (1 - p_inv) */
+	const double p_inv = getPInvar();
+	for (int k = 0; k < ncategory; k++)
+		rates[k] = rates[k]/(1.0 - p_inv);
+
+	// Very small rates are left as they are: no lower limit (MIN_GAMMA_RATE)
+	// is enforced on rates[] at this point.
+}
+
+/*double RateGamma::cmpPerPointGamma (const double prob, const double shape) {
+}*/
+
+void RateGamma::computeRatesMean () { // mean-rate discretization: fills rates[] from gamma_shape and ncategory
+	// Guard: with fewer than two categories freqK[ncategory-2] below would be out of bounds.
+	if (ncategory < 2) { if (ncategory == 1) rates[0] = 1.0; return; }
+	double lnga1=cmpLnGamma(gamma_shape+1);
+	double *freqK = new double[ncategory];
+	for (int i=0; i<ncategory-1; i++) {
+		freqK[i]=cmpPointChi2((i+1.0)/ncategory, 2.0 * gamma_shape) / (2.0 * gamma_shape); /* cutting points, Eq. 9 */
+		freqK[i]=cmpIncompleteGamma(freqK[i]*gamma_shape, gamma_shape+1, lnga1); /* Eq. 10 */
+	}
+	rates[0] = freqK[0]*ncategory;
+	rates[ncategory-1] = (1-freqK[ncategory-2])*ncategory;
+	for (int i=1; i<ncategory-1; i++)  rates[i] = (freqK[i]-freqK[i-1])*ncategory;
+	delete [] freqK;
+}
+
+void RateGamma::setGammaShape(double gs) { // stores the value only; rates[] is not recomputed here
+	this->gamma_shape = gs;
+}
+
+double RateGamma::computeFunction(double shape) { // objective of the 1-D search over the gamma shape
+	this->gamma_shape = shape;
+	this->computeRates(); // the rates depend on the shape, so rebuild them before evaluating
+	phylo_tree->clearAllPartialLH();
+	return -(phylo_tree->computeLikelihood()); // negated: the value is handed to a minimizer
+}
+
+double RateGamma::targetFunk(double x[]) { // objective of the multi-dimensional search
+	this->getVariables(x); // take over the trial values from the 1-based vector x
+	this->computeRates();
+	phylo_tree->clearAllPartialLH();
+	return -(phylo_tree->computeLikelihood()); // negated likelihood value
+}
+
+void RateGamma::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
+	// Class-qualified on purpose: only the gamma shape counts here, not dimensions a subclass may add.
+	if (RateGamma::getNDim() == 0) return;
+	lower_bound[1] = MIN_GAMMA_SHAPE; // the arrays are indexed from 1
+	upper_bound[1] = MAX_GAMMA_SHAPE; bound_check[1] = false;
+}
+
+void RateGamma::setVariables(double *variables) { // writes the gamma shape to slot 1 (1-based)
+	if (RateGamma::getNDim() == 0) return; // class-qualified: a subclass may count extra dimensions
+	variables[1] = gamma_shape;
+}
+
+void RateGamma::getVariables(double *variables) { // reads the gamma shape from slot 1 (1-based)
+	if (RateGamma::getNDim() == 0) return; // class-qualified: with a fixed shape, slot 1 is not ours to read
+	gamma_shape = variables[1];
+}
+
+double RateGamma::optimizeParameters(double gradient_epsilon, double min_gamma, double max_gamma) {
+	if (fix_gamma_shape) // nothing to optimize: report the current likelihood
+		return phylo_tree->computeLikelihood();
+	if (verbose_mode >= VB_MAX)
+		cout << "Optimizing gamma shape..." << endl;
+	double start_shape = gamma_shape; // copied because computeFunction() overwrites gamma_shape
+	const double tolerance = max(gradient_epsilon, TOL_GAMMA_SHAPE);
+	double best_score, search_error;
+	// one-dimensional search between min_gamma and max_gamma
+	gamma_shape = minimizeOneDimen(min_gamma, start_shape, max_gamma, tolerance, &best_score, &search_error);
+	computeRates();
+	phylo_tree->clearAllPartialLH();
+	return -best_score;
+}
+
+double RateGamma::optimizeParameters(double gradient_epsilon) {
+	/*
+	 * Default-interval variant. The procedure is the one implemented by the
+	 * three-argument overload:
+	 *   - a fixed shape is left alone and the current likelihood is returned;
+	 *   - otherwise the shape is searched in one dimension, the rates are
+	 *     recomputed and the partial likelihoods are cleared.
+	 * Only the search interval differs: here it is
+	 * [MIN_GAMMA_SHAPE, MAX_GAMMA_SHAPE].
+	 * The call is class-qualified so that it always runs RateGamma's own
+	 * implementation, whatever a subclass may override.
+	 */
+	return RateGamma::optimizeParameters(gradient_epsilon, MIN_GAMMA_SHAPE, MAX_GAMMA_SHAPE);
+}
+
+void RateGamma::writeInfo(ostream &out) {
+	// Only the shape is reported; the median/mean setting and the category count are not printed.
+	out << "Gamma shape alpha: ";
+	out << gamma_shape << endl;
+}
+
+void RateGamma::writeParameters(ostream &out) { // a tab, then the gamma shape; no newline
+	(out << "\t") << gamma_shape;
+}
+
+int RateGamma::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) {
+	// Gamma site rates by empirical Bayes: for every alignment pattern, store the mean of the
+	// category rates weighted by the per-category likelihoods, and the category whose
+	// likelihood is largest.
+	PhyloNeighbor *root_nei = (PhyloNeighbor*)phylo_tree->root->neighbors[0];
+	PhyloNode *root_node = (PhyloNode*)phylo_tree->root;
+
+	// Run the likelihood kernel selected by the tree across the root branch. The loop below
+	// then reads _pattern_lh_cat, which this call presumably refreshes (TODO confirm).
+	if (phylo_tree->sse == LK_NORMAL || phylo_tree->sse == LK_SSE) {
+		phylo_tree->computeLikelihoodBranchNaive(root_nei, root_node);
+	} else {
+		// one call for every sequence type; no dispatch on the number of states
+		phylo_tree->computeLikelihoodBranchEigen(root_nei, root_node);
+	}
+
+	const int npattern = phylo_tree->aln->getNPattern();
+	pattern_rates.resize(npattern);
+	pattern_cat.resize(npattern);
+
+	// ptn_lh points at the ncategory likelihood entries of the current pattern
+	double *ptn_lh = phylo_tree->_pattern_lh_cat;
+	for (int ptn = 0; ptn < npattern; ptn++, ptn_lh += ncategory) {
+		double weighted_rate = 0.0;
+		double total_lh = 0.0;
+		int best_cat = 0;
+
+		for (int cat = 0; cat < ncategory; cat++) {
+			weighted_rate += rates[cat] * ptn_lh[cat];
+			total_lh += ptn_lh[cat];
+
+			// A strictly larger likelihood always wins; an exactly equal one wins with
+			// probability 0.5. random_double() is drawn only in the equality case
+			// (which includes cat == best_cat).
+			if (ptn_lh[cat] > ptn_lh[best_cat] || (ptn_lh[cat] == ptn_lh[best_cat] && random_double()<0.5))
+				best_cat = cat;
+		}
+
+		pattern_rates[ptn] = weighted_rate / total_lh;
+		pattern_cat[ptn] = best_cat;
+	}
+
+	// the category indices stored in pattern_cat range over 0 .. ncategory-1
+	return ncategory;
+}
+
+
+/*NUMERICAL SUBROUTINES
+**************************************************************************************
+
+**************************************************************************************
+**************************************************************************************
+**************************************************************************************
+**************************************************************************************/
+
+/* THE FOLLOWING CODE COMES FROM tools.c in Yang's PAML package */
+
+//----------------------------------------------------------------------------------------
+double RateGamma::cmpLnGamma (double alpha) {
+	/* returns ln(gamma(alpha)) for alpha>0, accurate to 10 decimal places.
+	   Stirling's formula is used for the central polynomial part of the procedure.
+	   Pike MC & Hill ID (1966) Algorithm 291: Logarithm of the gamma function.
+	   Communications of the Association for Computing Machinery, 9:684
+	*/
+	double x=alpha, f=0, z;
+	// f holds the correction for moving a small argument up to x >= 7 (0 if no move is needed)
+	if (x<7) {
+		f=1;  z=x-1;
+		while (++z<7)  f*=z; // f = alpha*(alpha+1)*... over all factors below 7
+		x=z;   f=-log(f); // x is now >= 7; ln(gamma(alpha)) = ln(gamma(x)) - ln(product)
+	}
+	z = 1/(x*x); // the correction polynomial below is evaluated in 1/x^2
+	return  f + (x-0.5)*log(x) - x + .918938533204673
+	        + (((-.000595238095238*z+.000793650793651)*z-.002777777777778)*z
+	           +.083333333333333)/x;
+} //end of function cmpLnGamma
+
+//----------------------------------------------------------------------------------------
+double RateGamma::cmpIncompleteGamma (double x, double alpha, double ln_gamma_alpha) {
+	/* returns the incomplete gamma ratio I(x,alpha) where x is the upper
+		   limit of the integration and alpha is the shape parameter.
+	   returns (-1) if in error (x<0 or alpha<=0); returns 0 for x==0
+	   (1) series expansion     if (alpha>x || x<=1)
+	   (2) continued fraction   otherwise
+	   ln_gamma_alpha is passed in by the caller; callers in this file pass cmpLnGamma(alpha)
+	   RATNEST FORTRAN by
+	   Bhattacharjee GP (1970) The incomplete gamma integral.  Applied Statistics,
+	   19: 285-287 (AS32)
+	*/
+
+	int i;
+	double p=alpha, g=ln_gamma_alpha;
+	double accurate=1e-8, overflow=1e30; // convergence threshold; rescaling threshold for pn[]
+	double factor, gin=0, rn=0, a=0,b=0,an=0,dif=0, term=0, pn[6];
+
+	if (x==0) return (0);
+	if (x<0 || p<=0) return (-1);
+
+	factor=exp(p*log(x)-x-g); // exp(p*ln(x) - x - g), the common factor of both branches
+	if (x>1 && x>=p) goto l30;
+	/* (1) series expansion */
+	gin=1;  term=1;  rn=p;
+l20:
+	rn++;
+	term*=x/rn;   gin+=term;
+
+	if (term > accurate) goto l20; // keep adding terms until one drops to 1e-8 or below
+
+	gin*=factor/p;
+	goto l50;
+l30:
+
+	/* (2) continued fraction */
+	a=1-p;   b=a+x+1;  term=0;
+	pn[0]=1;  pn[1]=x;  pn[2]=x+1;  pn[3]=x*b;
+	gin=pn[2]/pn[3];
+l32:
+	a++;  b+=2;  term++;   an=a*term;
+	for (i=0; i<2; i++) pn[i+4]=b*pn[i+2]-an*pn[i]; // next numerator (pn[4]) and denominator (pn[5])
+	if (pn[5] == 0) goto l35; // no ratio can be formed: skip the convergence test
+	rn=pn[4]/pn[5];   dif=fabs(gin-rn);
+	if (dif>accurate) goto l34; // absolute change still too large
+	if (dif<=accurate*rn) goto l42; // relative change small enough as well: converged
+l34:
+	gin=rn;
+l35:
+	for (i=0; i<4; i++) pn[i]=pn[i+2]; // shift the recurrence window
+	if (fabs(pn[4]) < overflow) goto l32;
+	for (i=0; i<4; i++) pn[i]/=overflow; // scale all terms down once pn[4] reaches 1e30
+	goto l32;
+l42:
+	gin=1-factor*gin; // result is one minus the scaled continued-fraction value
+
+l50:
+	return (gin);
+} //end of function cmpIncompleteGamma
+
+
+//----------------------------------------------------------------------------------------
+/* functions concerning the CDF and percentage points of the gamma and
+   Chi2 distribution
+*/
+double RateGamma::cmpPointNormal (double prob) {
+	/* returns z so that Prob{x<z}=prob where x ~ N(0,1); stated range: (1e-12)<prob<1-(1e-12)
+	   returns (-9999) if in error; the code reports an error only if min(prob, 1-prob) < 1e-20
+	   Odeh RE & Evans JO (1974) The percentage points of the normal distribution.
+	   Applied Statistics 22: 96-97 (AS70)
+
+	   Newer methods:
+	     Wichura MJ (1988) Algorithm AS 241: the percentage points of the
+	       normal distribution.  37: 477-484.
+	     Beasley JD & Springer SG  (1977).  Algorithm AS 111: the percentage
+	       points of the normal distribution.  26: 118-121.
+
+	*/
+	double a0=-.322232431088, a1=-1, a2=-.342242088547, a3=-.0204231210245;
+	double a4=-.453642210148e-4, b0=.0993484626060, b1=.588581570495;
+	double b2=.531103462366, b3=.103537752850, b4=.0038560700634;
+	double y, z=0, p=prob, p1;
+
+	p1 = (p<0.5 ? p : 1-p); // work with the smaller of prob and 1-prob
+
+	if (p1<1e-20) return (-9999);
+
+	y = sqrt (log(1/(p1*p1))); // equals sqrt(-2*ln(p1))
+	z = y + ((((y*a4+a3)*y+a2)*y+a1)*y+a0) / ((((y*b4+b3)*y+b2)*y+b1)*y+b0); // rational approximation in y
+	return (p<0.5 ? -z : z); // negative quantile for prob below 0.5
+} //end of function cmpPointNormal
+
+
+
+//----------------------------------------------------------------------------------------
+
+double RateGamma::cmpPointChi2 (double prob, double v) {
+	/* returns z so that Prob{x<z}=prob where x is Chi2 distributed with df=v
+	   returns -1 if in error.   0.000002<prob<0.999998
+	   RATNEST FORTRAN by
+	       Best DJ & Roberts DE (1975) The percentage points of the
+	       Chi2 distribution.  Applied Statistics 24: 385-388.  (AS91)
+	   Converted into C by Ziheng Yang, Oct. 1993.
+	*/
+	double e=.5e-6, aa=.6931471805, p=prob, g; // e: relative convergence tolerance; aa: ln(2)
+	double xx, c, ch, a=0,q=0,p1=0,p2=0,t=0,x=0,b=0,s1,s2,s3,s4,s5,s6;
+
+	if (p<.000002 || p>.999998 || v<=0) return (-1); // arguments outside the supported range
+
+	g = cmpLnGamma (v/2);
+	xx=v/2;   c=xx-1;
+	if (v >= -1.24*log(p)) goto l1;
+	// v < -1.24*ln(p): closed-form starting value
+	ch=pow((p*xx*exp(g+xx*aa)), 1/xx);
+	if (ch-e<0) return (ch); // starting value below the tolerance: returned without refinement
+	goto l4;
+l1:
+	if (v>.32) goto l3;
+	ch=0.4;   a=log(1-p); // v <= 0.32: iterate a starting value, beginning at 0.4
+l2:
+	q=ch;  p1=1+ch*(4.67+ch);  p2=ch*(6.73+ch*(6.66+ch));
+	t=-0.5+(4.67+2*ch)/p1 - (6.73+ch*(13.32+3*ch))/p2;
+	ch-=(1-exp(a+g+.5*ch+c*aa)*p2/p1)/t;
+	if (fabs(q/ch-1)-.01 <= 0) goto l4; // stop once the relative change is at most 1%
+	else                       goto l2;
+
+l3:
+	x=cmpPointNormal (p); // starting value derived from the normal quantile
+	p1=0.222222/v;   ch=v*pow((x*sqrt(p1)+1-p1), 3.0);
+	if (ch>2.2*v+6)  ch=-2*(log(1-p)-c*log(.5*ch)+g); // replaced by another formula when large
+l4:
+	// refinement: repeat the series correction until ch changes by at most e (relative)
+	do {
+		q=ch;   p1=.5*ch;
+		if ((t=cmpIncompleteGamma (p1, xx, g))<0) {
+			return (-1); // pass on the error reported by cmpIncompleteGamma
+		}
+		p2=p-t; // difference between the target probability and the current one
+		t=p2*exp(xx*aa+g+p1-c*log(ch));
+		b=t/ch;  a=0.5*t-b*c;
+
+		s1=(210+a*(140+a*(105+a*(84+a*(70+60*a))))) / 420;
+		s2=(420+a*(735+a*(966+a*(1141+1278*a))))/2520;
+		s3=(210+a*(462+a*(707+932*a)))/2520;
+		s4=(252+a*(672+1182*a)+c*(294+a*(889+1740*a)))/5040;
+		s5=(84+264*a+c*(175+606*a))/2520;
+		s6=(120+c*(346+127*c))/5040;
+		ch+=t*(1+0.5*t*s1-b*c*(s1-b*(s2-b*(s3-b*(s4-b*(s5-b*s6))))));
+	} while (fabs(q/ch-1) > e);
+
+	return (ch);
+} //end of function cmpPointChi2
+
+
+/* END OF THE CODE COMING FROM tools.c in Yang's PAML package */
diff --git a/model/rategamma.h b/model/rategamma.h
new file mode 100644
index 0000000..0cf4bd7
--- /dev/null
+++ b/model/rategamma.h
@@ -0,0 +1,284 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef RATEGAMMA_H
+#define RATEGAMMA_H
+
+#include "rateheterogeneity.h"
+
+const double MIN_GAMMA_RATE = 1e-6; // lower limit for a category rate; RateGamma::computeRates() does not enforce it
+// lower bound of the gamma shape; changed from 0.01 to 0.02 as 0.01 causes numerical problems
+const double MIN_GAMMA_SHAPE = 0.02;
+const double MAX_GAMMA_SHAPE = 1000.0; // upper bound of the gamma shape
+const double TOL_GAMMA_SHAPE = 0.001; // smallest tolerance handed to the gamma shape optimizers
+
+class PhyloTree;
+/**
+Discrete gamma distributed site-rate model from Yang 1994
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class RateGamma: virtual public RateHeterogeneity
+{
+
+	friend class RateGammaInvar;
+
+public:
+	/** constructor
+		@param ncat number of rate categories
+		@param shape Gamma shape parameter: > 0 keeps the shape fixed at that value, < 0 starts from |shape|, 0 starts from a random value
+		@param median TRUE to use the median rate per category, FALSE to use the mean rate
+		@param tree associated phylogenetic tree
+	*/
+    RateGamma(int ncat, double shape, bool median, PhyloTree *tree);
+
+	/**
+		destructor
+	*/
+    virtual ~RateGamma();
+
+	/**
+		@return true if this is a Gamma model (always true for this class)
+	*/
+    virtual bool isGammaRate() { return true; }
+
+	virtual double getGammaShape() { return gamma_shape; } // current gamma shape 'alpha'
+
+	virtual void setGammaShape(double gs); // stores gs as the gamma shape; the rates are not recomputed by this call
+
+	/**
+	 * @return rate model name with parameters in the form +G<ncategory>{<gamma_shape>}, e.g. +G4{0.5}
+	 */
+	virtual string getNameParams();
+
+	/**
+		@return TRUE to use median rate for discrete categories, FALSE to use mean rate instead
+	*/
+	bool isCutMedian() { return cut_median; }
+
+	/**
+		@return the number of rate categories
+	*/
+	virtual int getNRate() { return ncategory; }
+
+	/**
+		get the number of rate categories for site-specific category model
+		@return the number of rate categories
+	*/
+	virtual int getNDiscreteRate() { return ncategory; }
+
+	/**
+		@param category category ID from 0 to #category-1
+		@return the rate of the specified category
+	*/
+	virtual double getRate(int category) { return rates[category]; }
+
+	/**
+		get the proportion of sites under a specified category.
+		@param category category ID from 0 to #category-1
+		@return the proportion of the specified category
+	*/
+	virtual double getProp(int category) { return 1.0/ncategory; }
+
+	/**
+	 * 	return pointer to the rate array
+	 */
+	virtual double* getRates() { return rates; }
+
+	/** discrete Gamma according to Yang 1994 (JME 39:306-314), using the median (cut_median TRUE) or the mean (cut_median FALSE) of each category.
+		It takes the 'ncategory', 'gamma_shape' and 'cut_median' variables as input. On output, it writes to the 'rates' variable.
+	*/
+	void computeRates();
+
+	/** discrete Gamma according to Yang 1994 (JME 39:306-314) and using mean of the portion of gamma distribution
+		It takes 'ncategory' and 'gamma_shape' variables as input. On output, it writes to the 'rates' variable.
+	*/
+	void computeRatesMean ();
+
+	/**
+		Compute site-specific rates. Override this for Gamma model
+		@param pattern_rates (OUT) pattern rates. Resized if necessary
+		@param pattern_cat (OUT) category with the largest likelihood for each pattern. Resized if necessary
+        @return total number of categories
+	*/
+	virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat);
+
+	/**
+	 * setup the bounds for joint optimization with BFGS
+	 */
+	virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check);
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]);
+
+	/**
+		optimize parameters. Default is to optimize gamma shape
+		@return the best likelihood
+	*/
+	virtual double optimizeParameters(double gradient_epsilon);
+
+    /**
+     *  Same as above, with min_gamma and max_gamma as the bounds of the gamma shape search
+     */
+	virtual double optimizeParameters(double gradient_epsilon, double min_gamma, double max_gamma);
+
+	/**
+		override function from Optimization class, used by the minimizeOneDimen() to optimize
+		gamma shape parameter
+	*/
+	virtual double computeFunction(double shape);
+
+	/**
+		@return the number of dimensions: 1 if the gamma shape is optimized, 0 if it is fixed
+	*/
+	virtual int getNDim() { return !fix_gamma_shape; }
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		write parameters, used with modeltest
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out);
+
+	bool isFixGammaShape() const {
+		return fix_gamma_shape;
+	}
+
+	void setFixGammaShape(bool fixGammaShape) {
+		fix_gamma_shape = fixGammaShape;
+	}
+
+    /**
+        set number of rate categories
+        @param ncat #categories
+    */
+	virtual void setNCategory(int ncat);
+
+protected:
+
+	/**
+		This function serves the multi-dimensional optimization. It packs the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		This function serves the multi-dimensional optimization. It assigns the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+	/**
+		number of rate categories
+	*/
+	int ncategory;
+
+	/**
+		rates, containing ncategory elements
+	*/
+	double *rates;
+
+
+	/**
+		the gamma shape parameter 'alpha'
+	*/
+	double gamma_shape;
+
+	/**
+		TRUE to fix the gamma shape parameter
+	*/
+	bool fix_gamma_shape;
+
+	/**
+		TRUE to use median rate for discrete categories, FALSE to use mean rate instead
+	*/
+	bool cut_median;
+
+public:
+
+	//Normally, beta is assigned equal to alpha
+	//double cmpPerPointGamma (const double prob, const double shape);
+
+	/***********************************************************
+	NUMERICAL SUBROUTINES
+	THE FOLLOWING CODE COMES FROM tools.c in Yang's PAML package
+	***********************************************************/
+	/** returns ln(gamma(alpha)) for alpha>0, accurate to 10 decimal places.
+	   Stirling's formula is used for the central polynomial part of the procedure.
+	   Pike MC & Hill ID (1966) Algorithm 291: Logarithm of the gamma function.
+	   Communications of the Association for Computing Machinery, 9:684
+
+	*/
+	static double cmpLnGamma (double alpha);
+
+	/** returns the incomplete gamma ratio I(x,alpha) where x is the upper
+	   limit of the integration and alpha is the shape parameter.
+	   returns (-1) if in error
+	   (1) series expansion     if (alpha>x || x<=1)
+	   (2) continued fraction   otherwise
+	   RATNEST FORTRAN by
+	   Bhattacharjee GP (1970) The incomplete gamma integral.  Applied Statistics,
+	   19: 285-287 (AS32)
+	*/
+	static double cmpIncompleteGamma (double x, double alpha, double ln_gamma_alpha);
+
+	/** functions concerning the CDF and percentage points of the gamma and
+	   Chi2 distribution
+	   returns z so that Prob{x<z}=prob where x ~ N(0,1) and (1e-12)<prob<1-(1e-12)
+	   returns (-9999) if in error
+	   Odeh RE & Evans JO (1974) The percentage points of the normal distribution.
+	   Applied Statistics 22: 96-97 (AS70)
+
+	   Newer methods:
+	     Wichura MJ (1988) Algorithm AS 241: the percentage points of the
+	       normal distribution.  37: 477-484.
+	     Beasley JD & Springer SG  (1977).  Algorithm AS 111: the percentage
+	       points of the normal distribution.  26: 118-121.
+	*/
+	static double cmpPointNormal (double prob);
+
+
+	/** returns z so that Prob{x<z}=prob where x is Chi2 distributed with df=v
+	   returns -1 if in error.   0.000002<prob<0.999998
+	   RATNEST FORTRAN by
+	       Best DJ & Roberts DE (1975) The percentage points of the
+	       Chi2 distribution.  Applied Statistics 24: 385-388.  (AS91)
+	   Converted into C by Ziheng Yang, Oct. 1993.
+	*/
+	static double cmpPointChi2 (double prob, double v);
+
+	/* END OF THE CODE COMING FROM tools.c in Yang's PAML package */
+
+};
+
+
+
+
+#endif
diff --git a/model/rategammainvar.cpp b/model/rategammainvar.cpp
new file mode 100644
index 0000000..6995366
--- /dev/null
+++ b/model/rategammainvar.cpp
@@ -0,0 +1,186 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "rategammainvar.h"
+
+RateGammaInvar::RateGammaInvar(int ncat, double shape, bool median,
+		double p_invar_sites, bool simultaneous, PhyloTree *tree) :
+		RateInvar(p_invar_sites, tree), RateGamma(ncat, shape, median, tree) {
+	joint_optimize = simultaneous;
+	full_name = "Invar+" + full_name; // both names were set by RateGamma's constructor
+	name = "+I" + name;
+	computeRates(); // RateGamma's constructor already computed the rates; redo them for the complete object
+}
+
+void RateGammaInvar::setNCategory(int ncat) {
+	RateGamma::setNCategory(ncat); // resets name and full_name to the Gamma-only labels
+	full_name = "Invar+" + full_name;
+	name = "+I" + name; // so the invariable-sites prefix has to be put back
+	computeRates();
+}
+
+string RateGammaInvar::getNameParams() { // RateInvar's description followed by RateGamma's
+	return string(RateInvar::getNameParams()).append(RateGamma::getNameParams());
+}
+
+double RateGammaInvar::computeFunction(double value) {
+	// cur_optimize picks the parameter the 1-D optimizer is varying: 0 = gamma shape, else p_invar
+	if (cur_optimize != 0)
+		p_invar = value;
+	else
+		gamma_shape = value;
+	computeRates(); // the rates depend on both parameters, so rebuild them every time
+	phylo_tree->clearAllPartialLH();
+	return -(phylo_tree->computeLikelihood());
+}
+
+void RateGammaInvar::writeInfo(ostream &out) { // RateInvar's report first, RateGamma's second
+	this->RateInvar::writeInfo(out);
+	this->RateGamma::writeInfo(out);
+}
+
+void RateGammaInvar::writeParameters(ostream &out) { // RateInvar's columns first, RateGamma's second
+	this->RateInvar::writeParameters(out);
+	this->RateGamma::writeParameters(out);
+}
+
+void RateGammaInvar::setVariables(double *variables) { // RateGamma's entry first, RateInvar's after it
+	const int gamma_ndim = RateGamma::getNDim();
+	RateGamma::setVariables(variables);
+	RateInvar::setVariables(variables + gamma_ndim);
+}
+
+void RateGammaInvar::getVariables(double *variables) { // same layout as setVariables()
+	const int gamma_ndim = RateGamma::getNDim();
+	RateGamma::getVariables(variables);
+	RateInvar::getVariables(variables + gamma_ndim);
+}
+
+double RateGammaInvar::targetFunk(double x[]) { // objective of the joint optimization
+	assert(phylo_tree);
+	// take over the trial values, then bring the rates in line with them
+	this->getVariables(x);
+	RateGamma::computeRates();
+	phylo_tree->clearAllPartialLH();
+	return -(phylo_tree->computeLikelihood()); // negated: the value is handed to a minimizer
+}
+
+void RateGammaInvar::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {
+	const int gamma_ndim = RateGamma::getNDim(); // RateInvar's entries start after RateGamma's
+	RateGamma::setBounds(lower_bound, upper_bound, bound_check);
+	RateInvar::setBounds(lower_bound + gamma_ndim, upper_bound + gamma_ndim, bound_check + gamma_ndim);
+}
+
+double RateGammaInvar::optimizeParameters(double gradient_epsilon) {
+	int ndim = getNDim();
+
+	// return if nothing to be optimized
+	if (ndim == 0)
+		return phylo_tree->computeLikelihood();
+
+	if (!joint_optimize) {
+		// Alternating scheme: p_invar first, then the gamma shape. cur_optimize tells
+		// computeFunction() which of the two the 1-D optimizer is currently varying.
+		cur_optimize = 1;
+		double invar_lh = RateInvar::optimizeParameters(gradient_epsilon);
+		cur_optimize = 0;
+		double gamma_lh;
+		if (Params::getInstance().testAlpha) {
+			// with testAlpha the gamma shape is searched between 0.05 and 10 only
+			gamma_lh = RateGamma::optimizeParameters(gradient_epsilon, 0.05, 10);
+		} else {
+			gamma_lh = RateGamma::optimizeParameters(gradient_epsilon);
+		}
+		// the second step is expected not to lose likelihood (tolerance 0.1)
+		assert(gamma_lh >= invar_lh - 0.1);
+		phylo_tree->clearAllPartialLH();
+		return gamma_lh;
+	}
+
+	if (verbose_mode >= VB_MAX)
+		cout << "Optimizing " << name << " model parameters by BFGS..." << endl;
+
+	// the work arrays are indexed from 1, hence ndim+1 entries each
+	double *variables = new double[ndim+1];
+	double *upper_bound = new double[ndim+1];
+	double *lower_bound = new double[ndim+1];
+	bool *bound_check = new bool[ndim+1];
+	double score;
+
+	// by BFGS algorithm
+	setVariables(variables);
+	setBounds(lower_bound, upper_bound, bound_check);
+
+	score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(gradient_epsilon, TOL_GAMMA_SHAPE));
+
+	// Adopt the optimum and rebuild the rates for it: the rates stored so far stem from
+	// the optimizer's last targetFunk() call, which need not be the returned optimum.
+	getVariables(variables);
+	RateGamma::computeRates();
+	phylo_tree->clearAllPartialLH();
+
+	delete [] bound_check;
+	delete [] lower_bound;
+	delete [] upper_bound;
+	delete [] variables;
+
+	return score;
+}
+
+
+int RateGammaInvar::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) {
+	// Gamma site rates by empirical Bayes, with the invariable class as an additional
+	// category: index 0 stands for the invariable class, index c+1 for Gamma category c.
+	PhyloNeighbor *root_nei = (PhyloNeighbor*)phylo_tree->root->neighbors[0];
+	PhyloNode *root_node = (PhyloNode*)phylo_tree->root;
+
+	if (phylo_tree->sse == LK_NORMAL || phylo_tree->sse == LK_SSE) {
+		phylo_tree->computeLikelihoodBranchNaive(root_nei, root_node);
+	} else {
+		phylo_tree->computeLikelihoodBranchEigen(root_nei, root_node);
+	}
+
+	const int npattern = phylo_tree->aln->getNPattern();
+	pattern_rates.resize(npattern);
+	pattern_cat.resize(npattern);
+
+	double *ptn_lh = phylo_tree->_pattern_lh_cat;
+	for (int ptn = 0; ptn < npattern; ptn++, ptn_lh += ncategory) {
+		// the invariable class enters the likelihood total but adds nothing to the rate sum
+		const double invar_lh = phylo_tree->ptn_invar[ptn];
+		double weighted_rate = 0.0;
+		double total_lh = invar_lh;
+		double top_lh = invar_lh;
+		int top_cat = 0;
+		for (int cat = 0; cat < ncategory; cat++) {
+			const double lh = ptn_lh[cat];
+			weighted_rate += rates[cat] * lh;
+			total_lh += lh;
+			// a strictly larger likelihood always wins; an exact tie is broken at random
+			if (lh > top_lh || (lh == top_lh && random_double()<0.5)) {
+				top_cat = cat + 1;
+				top_lh = lh;
+			}
+		}
+		pattern_rates[ptn] = weighted_rate / total_lh;
+		pattern_cat[ptn] = top_cat;
+	}
+	return ncategory+1; // the Gamma categories plus the invariable class
+}
+
diff --git a/model/rategammainvar.h b/model/rategammainvar.h
new file mode 100644
index 0000000..ef6e53f
--- /dev/null
+++ b/model/rategammainvar.h
@@ -0,0 +1,145 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef RATEGAMMAINVAR_H
+#define RATEGAMMAINVAR_H
+
+#include "rateinvar.h"
+#include "rategamma.h"
+
+/**
+class for I+G rate heterogeneity
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class RateGammaInvar : public RateInvar, public RateGamma
+{
+public:
+ 	/**
+		constructor
+		@param ncat number of rate categories
+		@param shape, median, p_invar_sites, simultaneous settings of the Gamma and invariable-site parts (NOTE(review): presumably forwarded to RateGamma/RateInvar, with simultaneous selecting joint optimization -- confirm in rategammainvar.cpp)
+		@param tree associated phylogenetic tree
+	*/
+    RateGammaInvar(int ncat, double shape, bool median, double p_invar_sites, bool simultaneous, PhyloTree *tree);
+
+	/**
+		get the proportion of sites under a specified category: the variable fraction (1 - p_invar) split equally over all categories.
+		@param category category ID from 0 to #category-1 (not used by the computation)
+		@return the proportion of the specified category
+	*/
+	virtual double getProp(int category) { return (1.0-p_invar)/ncategory; }
+
+	/**
+		get the rate of a specified category, as returned by RateGamma::getRate()
+		@param category category ID from 0 to #category-1
+		@return the rate of the specified category
+	*/
+	virtual double getRate(int category) { return RateGamma::getRate(category); }
+
+	/**
+	 * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
+	 */
+	virtual string getNameParams();
+
+	/**
+		override function from Optimization class, used by the minimizeOneDimen() to optimize
+		p_invar or gamma shape parameter.
+		@param value value of p_invar (if cur_optimize == 1) or gamma shape (if cur_optimize == 0).
+	*/
+	virtual double computeFunction(double value);
+
+	/**
+	 * setup the bounds for joint optimization with BFGS
+	 */
+	virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check);
+
+	/**
+		optimize parameters
+		@return the best likelihood 
+	*/
+	virtual double optimizeParameters(double gradient_epsilon);
+
+
+	/**
+		@return the number of dimensions: RateInvar::getNDim() plus RateGamma::getNDim()
+	*/
+	virtual int getNDim() { return RateInvar::getNDim() + RateGamma::getNDim(); }
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]);
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		write parameters, used with modeltest
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out);
+
+	/** TRUE to jointly optimize gamma shape and p_invar using BFGS, default: FALSE */
+	bool joint_optimize;
+
+	virtual void setNCategory(int ncat); // set the number of rate categories
+
+	/**
+		Compute site-specific (per-pattern) rates for this I+G model
+		@param pattern_rates (OUT) pattern rates, resized if necessary; @param pattern_cat (OUT) category assigned to each pattern
+        @return total number of categories
+	*/
+	virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat);
+
+protected:
+
+	/**
+		this function serves the multi-dimension optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function serves the multi-dimension optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+private:
+
+	/**
+	 *  TRUE to turn on random restart optimization for estimating alpha and p_invar
+	 */
+//	bool rr_ai;
+
+	/**
+		current parameter to optimize. 0 if gamma shape or 1 if p_invar.
+	*/
+	int cur_optimize;
+};
+
+#endif
diff --git a/model/rateheterogeneity.cpp b/model/rateheterogeneity.cpp
new file mode 100644
index 0000000..3bc02d3
--- /dev/null
+++ b/model/rateheterogeneity.cpp
@@ -0,0 +1,96 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include "phylotree.h"
+#include "rateheterogeneity.h"
+
+
+RateHeterogeneity::RateHeterogeneity()
+ : Optimization(), name(""), full_name("Uniform"), phylo_tree(NULL)
+{
+	// Defaults describe the homogeneous model: empty short name, full name
+	// "Uniform", and no tree attached yet (one can be attached later with
+	// setTree()).
+}
+
+void RateHeterogeneity::setTree(PhyloTree *tree) {
+	phylo_tree = tree; // stores the pointer only; nothing is copied or recomputed here
+}
+
+RateHeterogeneity::~RateHeterogeneity()
+{ // intentionally empty: phylo_tree is not deleted here
+}
+
+void RateHeterogeneity::writeSiteRates(ostream &out, DoubleVector &pattern_rates, IntVector &pattern_cat, int ncategory) {
+	// One row per site: 1-based site index, rate of its pattern (printed as "100.0" from MAX_SITE_RATE upwards) and, when
+	// pattern_cat is non-empty, the 1-based category plus getRate(category). Per-category site fractions then go to cout.
+	const bool has_cat = !pattern_cat.empty();
+	int nsite = phylo_tree->aln->getNSite();
+	out.setf(ios::fixed,ios::floatfield);
+	out.precision(5);
+	out << "Site\tRate";
+	if (has_cat) out << "\tCategory\tCategorized_rate";
+	out << endl;
+	IntVector count;
+	count.resize((has_cat && ncategory > 0) ? ncategory : 0, 0);
+	for (int i = 0; i < nsite; i++) {
+		int ptn = phylo_tree->aln->getPatternID(i);
+		out << i+1 << "\t";
+		if (pattern_rates[ptn] >= MAX_SITE_RATE) out << "100.0"; else out << pattern_rates[ptn];
+		int cat = has_cat ? pattern_cat[ptn] : -1; // -1: this model supplies no category information
+		if (has_cat) out << "\t" << cat+1 << "\t" << getRate(cat);
+		out << endl;
+		if (cat >= 0 && cat < (int)count.size()) count[cat]++; // fix: never index count[] without a valid category
+	}
+	if (!has_cat) return; // fix: the old code read pattern_cat[ptn] for the tally even when pattern_cat was empty
+	cout << "Empirical proportions for each category:";
+	for (size_t c = 0; c < count.size(); c++) cout << " " << ((double)count[c])/nsite;
+	cout << endl;
+}
+
+void RateHeterogeneity::writeSiteRates(ostream &out) {
+	// Compute per-pattern rates/categories, then delegate; if no rates were produced nothing is written.
+	DoubleVector rates_by_pattern;
+	IntVector cat_by_pattern;
+	const int ncat = computePatternRates(rates_by_pattern, cat_by_pattern);
+	if (!rates_by_pattern.empty()) writeSiteRates(out, rates_by_pattern, cat_by_pattern, ncat);
+}
+
+void RateHeterogeneity::writeSiteRates(const char *file_name) {
+	// Compute the per-pattern rates first; if the model produces none, return without creating the file.
+	DoubleVector pattern_rates;
+	IntVector pattern_cat;
+	int ncategory = computePatternRates(pattern_rates, pattern_cat);
+	if (pattern_rates.empty()) return;
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit); // make open/write/close failures throw ios::failure
+		out.open(file_name);
+		writeSiteRates(out, pattern_rates, pattern_cat, ncategory);
+		out.close();
+		cout << "Site rates printed to " << file_name << endl;
+	} catch (const ios::failure &) { // fix: catch by const reference instead of by value (no copy, no slicing)
+		outError(ERR_WRITE_OUTPUT, file_name);
+	}
+}
+
+double RateHeterogeneity::targetFunk(double x[]) {
+	return -phylo_tree->computeLikelihood(); // x is ignored here; the negated value suits a minimizer
+}
diff --git a/model/rateheterogeneity.h b/model/rateheterogeneity.h
new file mode 100644
index 0000000..235c956
--- /dev/null
+++ b/model/rateheterogeneity.h
@@ -0,0 +1,247 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef RATEHETEROGENEITY_H
+#define RATEHETEROGENEITY_H
+
+
+#include "optimization.h"
+#include <string>
+#include "tools.h"
+
+using namespace std;
+
+class PhyloTree;
+
+const double MIN_SITE_RATE = 1e-6; // NOTE(review): presumably the smallest site rate allowed -- confirm at its use sites
+const double MAX_SITE_RATE = 100.0; // site rates at or above this value are printed as "100.0" by writeSiteRates()
+const double TOL_SITE_RATE = 1e-6; // NOTE(review): presumably an optimization tolerance for site rates -- confirm at its use sites
+
+
+/**
+class for among-site rate heterogeneity, the default is homogeneous (equal) rate model
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+
+class RateHeterogeneity : public Optimization
+{
+	friend class ModelFactory;
+
+public:
+	/**
+		constructor
+	*/
+    RateHeterogeneity();
+
+	/**
+		destructor
+	*/
+    virtual ~RateHeterogeneity();
+
+	/**
+		set phylogenetic tree
+		@param tree associated phylogenetic tree
+	*/
+	void setTree(PhyloTree *tree);
+
+	/**
+		get phylogenetic tree
+		@return associated phylogenetic tree
+	*/
+	PhyloTree *getTree() { return phylo_tree; }
+
+	/**
+	 * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}; this default returns just the short name
+	 */
+	virtual string getNameParams() { return name; }
+
+	/**
+		@return false by default. True if rates are site-specific (Meyer and von Haeseler (2003) model)
+	*/
+	virtual bool isSiteSpecificRate() { return false; }
+
+	/**
+		get the number of rate categories. The default returns 1 category since it is homogeneous model
+		@return the number of rate categories
+	*/
+	virtual int getNRate() { return 1; }
+
+	/**
+		set the number of rate categories. The default raises assertion since it is homogeneous model
+	*/
+	virtual void setNCategory(int ncat) { assert(0); }
+
+	/**
+		get the number of rate categories for site-specific category model
+		The default returns 1 category since it is homogeneous model
+		@return the number of rate categories
+	*/
+	virtual int getNDiscreteRate() { return 1; }
+
+	/**
+		get the rate of a specified category. Default returns 1.0 since it is homogeneous model
+		@param category category ID from 0 to #category-1
+		@return the rate of the specified category
+	*/
+	virtual double getRate(int category) { return 1.0; }
+
+	/**
+		get the proportion of a specified category. Default returns 1.0 since it is homogeneous model
+		@param category category ID from 0 to #category-1
+		@return the proportion of the specified category
+	*/
+	virtual double getProp(int category) { return 1.0; }
+
+	/**
+		get the rate of a specified site-pattern. Default returns 1.0 since it is homogeneous model
+		@param ptn pattern ID 
+		@return the rate of the specified site-pattern
+	*/
+	virtual double getPtnRate(int ptn) { return 1.0; }
+
+	/**
+		get rate category of a specified site-pattern. Default returns -1 since it is homogeneous model
+		@param ptn pattern ID 
+		@return the rate category of the specified site-pattern
+	*/
+	virtual int getPtnCat(int ptn) { return -1; }
+
+	/**
+		get the proportion of invariable sites. Default returns 0.0 since it is homogeneous model
+		@return the proportion of invariable sites
+	*/
+	virtual double getPInvar() { return 0.0; }
+
+	/**
+		get the Gamma shape. Default returns 0.0 since it is homogeneous model
+		@return Gamma shape
+	*/	
+	virtual double getGammaShape() { return 0.0; }
+
+	/**
+		@return true if this is a Gamma model (default: false)
+	*/	
+    virtual bool isGammaRate() { return false; }
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]);
+
+	/**
+	 * setup the bounds for joint optimization with BFGS. Default does nothing
+	 */
+	virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check) {}
+
+	/**
+		optimize parameters. Default does nothing
+		@return the best likelihood (0.0 in this default implementation)
+	*/
+	virtual double optimizeParameters(double gradient_epsilon) { return 0.0; }
+
+	/**
+		classify rates into categories, this is meant for the discrete MH model. 
+		The default just returns tree_lh
+		@param tree_lh the current tree log-likelihood
+	*/
+	virtual double classifyRates(double tree_lh) { return tree_lh; }
+
+	/**
+	 * used to normalize branch lengths if mean rate is not equal to 1 (e.g. FreeRate model)
+	 * @return mean rate, default = 1
+	 */
+	virtual double meanRates() { return 1.0; }
+
+	/**
+	 * rescale rates s.t. mean rate is equal to 1, useful for FreeRate model
+	 * @return rescaling factor
+	 */
+	virtual double rescaleRates() { return 1.0; }
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out) {}
+
+	/**
+		write parameters, used with modeltest
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out) {}
+
+	/**
+		Compute site-specific rates. Override this for Gamma model; this default leaves both vectors untouched
+		@param pattern_rates (OUT) pattern rates, resized if necessary; @param pattern_cat (OUT) category of each pattern
+        @return total number of categories (1 in this default)
+	*/
+	virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) { return 1; }
+
+	/**
+		write site-rates to a file in the following format:
+		1  rate_1
+		2  rate_2
+		....
+		This function will call computePatternRates()
+		@param file_name target file to write rates
+	*/
+	void writeSiteRates(const char *file_name);
+
+	void writeSiteRates(ostream &out); // same as above, but writes to an already open stream
+
+	void writeSiteRates(ostream &out, DoubleVector &pattern_rates, IntVector &pattern_cat, int ncategory); // writes precomputed rates/categories
+
+	/**
+		name of the rate heterogeneity type
+	*/
+	string name;
+
+
+	/**
+		full name of the rate heterogeneity type
+	*/
+	string full_name;
+
+	/**
+		phylogenetic tree associated
+	*/
+	PhyloTree *phylo_tree;
+
+protected:
+
+	/**
+		this function serves the multi-dimension optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0). Default does nothing
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables) {}
+
+	/**
+		this function serves the multi-dimension optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0). Default does nothing
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables) {}
+
+	
+};
+#endif
diff --git a/model/rateinvar.cpp b/model/rateinvar.cpp
new file mode 100644
index 0000000..67ab1b8
--- /dev/null
+++ b/model/rateinvar.cpp
@@ -0,0 +1,98 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "rateinvar.h"
+
+RateInvar::RateInvar(double p_invar_sites, PhyloTree *tree)
+ : RateHeterogeneity()
+{
+	// Starting value used when p_invar is to be estimated: half the observed
+	// fraction of constant sites, but never below MIN_PINVAR.
+	double initial_p_invar = MIN_PINVAR;
+	if (tree)
+		initial_p_invar = max(tree->aln->frac_const_sites/2.0, MIN_PINVAR);
+
+	// A non-negative p_invar_sites is taken as given and kept fixed.
+	fix_p_invar = (p_invar_sites >= 0);
+	p_invar = fix_p_invar ? p_invar_sites : initial_p_invar;
+
+	phylo_tree = tree;
+	name = "+I";
+	full_name = "Invar";
+}
+
+string RateInvar::getNameParams() {
+	ostringstream label; // yields "+I{<p_invar>}" with default stream formatting
+	label << "+I{" << p_invar << "}";
+	return label.str();
+}
+
+double RateInvar::computeFunction(double p_invar_value) {
+	p_invar = p_invar_value; // adopt the trial value handed in by the one-dimensional minimizer
+	// No explicit computePtnInvar() call here: per the original note, computeTip... updates the
+	// ptn_invar vector (NOTE(review): confirm in phylotree). Cached partial likelihoods are dropped instead.
+	phylo_tree->clearAllPartialLH();
+	return -phylo_tree->computeLikelihood();
+}
+
+double RateInvar::targetFunk(double x[]) {
+	getVariables(x); // unpack the optimizer's 1-based vector into p_invar
+	// refresh ptn_invar for the new p_invar before evaluating (original note: computeTip... will update ptn_invar vector)
+	phylo_tree->computePtnInvar();
+	return -phylo_tree->computeLikelihood();
+}
+
+void RateInvar::setBounds(double *lower_bound, double *upper_bound, bool *bound_check) { // bounds of p_invar, slot 1 of the 1-based arrays
+	if (RateInvar::getNDim() == 0) return; // fix: qualified like set/getVariables(); the virtual getNDim() also counts subclass parameters
+	lower_bound[1] = MIN_PINVAR;
+	upper_bound[1] = min(phylo_tree->aln->frac_const_sites, 1.0-MIN_PINVAR); // fix: same cap as optimizeParameters(); keeps 1/(1-p_invar) finite
+	bound_check[1] = false;
+}
+
+double RateInvar::optimizeParameters(double gradient_epsilon) {
+	if (fix_p_invar) return -computeFunction(p_invar); // fixed p_invar: only re-evaluate the likelihood
+	if (verbose_mode >= VB_MAX)
+		cout << "Optimizing proportion of invariable sites..." << endl;
+	// One-dimensional search in [MIN_PINVAR, min(frac_const_sites, 1-MIN_PINVAR)], starting from the current p_invar.
+	const double upper = min(phylo_tree->aln->frac_const_sites, 1.0-MIN_PINVAR);
+	const double tolerance = max(gradient_epsilon, TOL_PINVAR);
+	double neg_lh, ferror;
+	p_invar = minimizeOneDimen(MIN_PINVAR, p_invar, upper, tolerance, &neg_lh, &ferror);
+	phylo_tree->clearAllPartialLH();
+	phylo_tree->computePtnInvar();
+	return -neg_lh;
+}
+
+void RateInvar::writeInfo(ostream &out) {
+	out << "Proportion of invariable sites: " << p_invar << endl; // one labelled line with the current p_invar
+}
+
+void RateInvar::writeParameters(ostream &out) {
+	out << "\t" << p_invar; // tab followed by the value, no trailing newline
+}
+
+void RateInvar::setVariables(double *variables) {
+	if (RateInvar::getNDim() != 0) // slot 1 (1-based) carries p_invar only while it is being estimated
+		variables[1] = p_invar;
+}
+
+void RateInvar::getVariables(double *variables) {
+	if (RateInvar::getNDim() != 0) // a fixed p_invar is never overwritten from the optimizer vector
+		p_invar = variables[1];
+}
diff --git a/model/rateinvar.h b/model/rateinvar.h
new file mode 100644
index 0000000..54ff1bc
--- /dev/null
+++ b/model/rateinvar.h
@@ -0,0 +1,154 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef RATEINVAR_H
+#define RATEINVAR_H
+
+#include "phylotree.h"
+#include "rateheterogeneity.h"
+
+const double MIN_PINVAR = 1e-6; // lower bound (and fallback starting value) for the proportion of invariable sites
+const double TOL_PINVAR = 1e-6; // smallest tolerance used for the one-dimensional p_invar optimization
+
+/**
+class for rate heterogeneity with a fraction of invariable sites
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class RateInvar : virtual public RateHeterogeneity
+{
+	friend class RateGammaInvar;
+
+public:
+	/**
+		constructor
+		@param p_invar_sites proportion of invariable sites; a value >= 0 is used as given and kept fixed
+		@param tree associated phylogenetic tree
+	*/
+	RateInvar(double p_invar_sites, PhyloTree *tree);
+
+	/**
+	 * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}
+	 */
+	virtual string getNameParams();
+
+	/**
+		get the proportion of sites under a specified category, here the variable fraction 1 - p_invar.
+		@param category category ID from 0 to #category-1 (not used by the computation)
+		@return the proportion of the specified category
+	*/
+	virtual double getProp(int category) { return 1.0 - p_invar; }
+
+	/**
+		get the rate of a specified category, here 1/(1 - p_invar), so that rate times getProp() equals 1
+		@param category category ID from 0 to #category-1 (not used by the computation)
+		@return the rate of the specified category
+	*/
+	virtual double getRate(int category) { return 1.0 / (1.0 - p_invar); }
+
+	/**
+		get the proportion of invariable sites
+		@return the proportion of invariable sites
+	*/
+	virtual double getPInvar() { return p_invar; }
+
+	/**
+	 * setup the bounds for joint optimization with BFGS
+	 */
+	virtual void setBounds(double *lower_bound, double *upper_bound, bool *bound_check);
+
+	/**
+		optimize parameters
+		@return the best likelihood 
+	*/
+	virtual double optimizeParameters(double gradient_epsilon);
+
+	/**
+		override function from Optimization class, used by the minimizeOneDimen() to optimize
+		p_invar parameter
+	*/
+	virtual double computeFunction(double p_invar_value);
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]);
+
+
+	/**
+		@return the number of dimensions: 1 while p_invar is estimated, 0 when it is fixed
+	*/
+	virtual int getNDim() { return !fix_p_invar; }
+	
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		write parameters, used with modeltest
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out);
+
+	bool isFixPInvar() const { // @return TRUE if p_invar is kept fixed
+		return fix_p_invar;
+	}
+
+	void setFixPInvar(bool fixPInvar) { // choose whether p_invar is kept fixed
+		fix_p_invar = fixPInvar;
+	}
+
+	void setPInvar(double pInvar) { // plain assignment; the value is not range-checked here
+		p_invar = pInvar;
+	}
+
+	/**
+		proportion of invariable sites
+	*/
+	double p_invar;
+	
+	/**
+		TRUE to fix the proportion of invariable sites
+	*/
+	bool fix_p_invar;
+
+protected:
+
+	/**
+		this function serves the multi-dimension optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function serves the multi-dimension optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+};
+
+#endif
diff --git a/model/ratekategory.cpp b/model/ratekategory.cpp
new file mode 100644
index 0000000..b84c7ea
--- /dev/null
+++ b/model/ratekategory.cpp
@@ -0,0 +1,181 @@
+/*
+    IQ-TREE: RateKategory, an among-site rate model with K equal-proportion categories whose rates are optimized by ML.
+    Copyright (C) 2012  BUI Quang Minh <email>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "phylotree.h"
+#include "ratekategory.h"
+
+RateKategory::RateKategory(int ncat, PhyloTree *tree)
+{
+	ncategory = ncat;
+	phylo_tree = tree;
+	rates = new double[ncategory];
+	name = "+K" + convertIntToString(ncategory);
+	full_name = "KAT with " + convertIntToString(ncategory) + " categories";
+	if (ncategory == 1) { rates[0] = 1.0; return; }
+	// Random starting rates: redraw each one until it lies within [0.1, 0.9].
+	double total = 0.0;
+	for (int c = 0; c < ncategory; c++) {
+		do { rates[c] = random_double(); } while (rates[c] < 0.1 || rates[c] > 0.9);
+		total += rates[c];
+	}
+	// Rescale so that the rates sum to ncategory, i.e. their mean is 1.
+	for (int c = 0; c < ncategory; c++) rates[c] = rates[c]*ncategory/total;
+}
+
+RateKategory::~RateKategory()
+{
+	delete [] rates; // delete[] on a null pointer is a no-op, so no guard is needed
+	rates = NULL;
+}
+
+double RateKategory::targetFunk(double x[])
+{ // objective for the multi-dimensional minimizer: the negated result of computeLikelihood()
+	getVariables(x); // unpack the free rates; the last rate is derived from them
+	if (rates[ncategory-1] < 1e-4) return 1.0e+12; // large penalty when the derived last rate falls below 1e-4
+	assert(phylo_tree);
+	phylo_tree->clearAllPartialLH(); // rates changed, so cached partial likelihoods are dropped
+	return -phylo_tree->computeLikelihood();
+}
+
+double RateKategory::optimizeParameters(double gradient_epsilon)
+{
+	// Optimizes the free category rates with minimizeMultiDimen() ("by BFGS
+	// algorithm" per the original note), installs the result in rates[] and
+	// returns the negated value handed back by the minimizer; returns 0.0
+	// right away when getNDim() reports nothing to optimize.
+	const int ndim = getNDim();
+	if (ndim == 0)
+		return 0.0;
+
+	if (verbose_mode >= VB_MAX)
+		cout << "Optimizing " << name << " model parameters..." << endl;
+
+	// All optimizer arrays are 1-based, so each gets ndim+1 slots (slot 0 is unused).
+	const int nslot = ndim + 1;
+	double *variables = new double[nslot];
+	double *upper_bound = new double[nslot];
+	double *lower_bound = new double[nslot];
+	bool *bound_check = new bool[nslot];
+
+	// Start from the current rates; each free rate is bounded to [1e-4, ncategory].
+	setVariables(variables);
+	for (int dim = 1; dim <= ndim; dim++) {
+		lower_bound[dim] = 1e-4;
+		upper_bound[dim] = ncategory;
+		bound_check[dim] = false;
+	}
+
+	const double tolerance = max(gradient_epsilon, 1e-6);
+	const double min_value = minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, tolerance);
+
+	// Adopt the optimum and invalidate the cached partial likelihoods.
+	getVariables(variables);
+	phylo_tree->clearAllPartialLH();
+
+	delete [] variables;
+	delete [] lower_bound;
+	delete [] upper_bound;
+	delete [] bound_check;
+	return -min_value;
+}
+
+int RateKategory::computePatternRates(DoubleVector& pattern_rates, IntVector& pattern_cat)
+{
+	// Empirical-Bayes estimate of the rate and rate category of every site pattern.
+	//
+	// For pattern p with per-category likelihoods lh[0..ncategory-1]:
+	//   pattern_rates[p] = sum_c rates[c]*lh[c] / sum_c lh[c]
+	//   pattern_cat[p]   = category with the largest lh[c] (the lowest index wins ties)
+	//
+	// @param pattern_rates (OUT) resized to the number of patterns
+	// @param pattern_cat (OUT) resized to the number of patterns
+	// @return ncategory
+	cout << "Computing site rates by empirical Bayes..." << endl;
+
+	// Run one likelihood pass over the branch at the root with the kernel selected
+	// by phylo_tree->sse. NOTE(review): this is presumably what fills
+	// phylo_tree->_pattern_lh_cat read below -- confirm in phylotree.
+	PhyloNeighbor *root_nei = (PhyloNeighbor*)phylo_tree->root->neighbors[0];
+	PhyloNode *root_node = (PhyloNode*)phylo_tree->root;
+	const bool naive_kernel = (phylo_tree->sse == LK_NORMAL || phylo_tree->sse == LK_SSE);
+	if (naive_kernel)
+		phylo_tree->computeLikelihoodBranchNaive(root_nei, root_node);
+	else
+		phylo_tree->computeLikelihoodBranchEigen(root_nei, root_node);
+
+	const int npattern = phylo_tree->aln->getNPattern();
+	pattern_rates.resize(npattern);
+	pattern_cat.resize(npattern);
+
+	// _pattern_lh_cat is consumed as npattern consecutive runs of ncategory values.
+	const double *lh = phylo_tree->_pattern_lh_cat;
+	for (int ptn = 0; ptn < npattern; ptn++, lh += ncategory) {
+		double weighted_rate = 0.0;
+		double total_lh = 0.0;
+		int best_cat = 0;
+		for (int c = 0; c < ncategory; c++) {
+			const double lh_c = lh[c];
+			weighted_rate += rates[c] * lh_c;
+			total_lh += lh_c;
+			if (lh_c > lh[best_cat])
+				best_cat = c;
+		}
+
+		// likelihood-weighted mean rate and most likely category of this pattern
+		pattern_rates[ptn] = weighted_rate / total_lh;
+		pattern_cat[ptn] = best_cat;
+	}
+
+	return ncategory;
+}
+
+void RateKategory::getVariables(double* variables)
+{
+	// Unpacks the optimizer vector into rates[]. Only the first ncategory-1
+	// rates are free parameters, stored in variables[1..ncategory-1] (1-based).
+	// With a single category there is nothing to unpack.
+	if (ncategory == 1) return;
+	const int nfree = ncategory - 1;
+	double free_sum = 0.0;
+	for (int c = 0; c < nfree; c++) {
+		rates[c] = variables[c+1];
+		free_sum += rates[c];
+	}
+	rates[nfree] = ncategory - free_sum; // last rate is implied: all rates sum to ncategory
+}
+
+void RateKategory::setVariables(double* variables)
+{
+	// pack the first ncategory-1 rates into variables[1..ncategory-1]; the last rate is implied and not stored
+	for (int slot = 1; slot < ncategory; slot++) variables[slot] = rates[slot-1];
+}
+
+
+void RateKategory::writeInfo(ostream& out)
+{
+	// Prints all category rates on one line, then a BIC-style score -2*lh + k*log(n)
+	// with lh = computeLikelihood(), k = getNDim() and n = getAlnNSite().
+	out << "Rates: ";
+	for (int c = 0; c < ncategory; c++) { out << " " << rates[c]; }
+	out << endl << "BIC: " << -2 * phylo_tree->computeLikelihood() + getNDim() * log(phylo_tree->getAlnNSite()) << endl;
+}
+
+void RateKategory::writeParameters(ostream& out)
+{ // intentionally empty: this model writes nothing to out here
+}
diff --git a/model/ratekategory.h b/model/ratekategory.h
new file mode 100644
index 0000000..f841e38
--- /dev/null
+++ b/model/ratekategory.h
@@ -0,0 +1,132 @@
+/*
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) 2012  BUI Quang Minh <email>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef RATEKATEGORY_H
+#define RATEKATEGORY_H
+
+#include "rateheterogeneity.h"
+
+class PhyloTree;
+
+/**
+Among-site rate model in which sites are grouped into K categories of equal
+proportion. The K rates are free parameters optimized by maximum likelihood
+instead of being derived from a Gamma distribution.
+*/
+class RateKategory : virtual public RateHeterogeneity
+{
+public:
+	/**
+		constructor
+		@param ncat number of rate categories
+		@param tree associated phylogenetic tree
+	*/
+    RateKategory(int ncat, PhyloTree *tree);
+
+	/**
+		destructor
+	*/
+    virtual ~RateKategory();
+
+	
+	/**
+		@return the number of rate categories
+	*/
+	virtual int getNRate() { return ncategory; }
+
+	/**
+		get the number of rate categories for site-specific category model
+		@return the number of rate categories
+	*/
+	virtual int getNDiscreteRate() { return ncategory; }
+
+	/**
+		No bounds check is performed on the category index.
+		@param category category ID from 0 to #category-1
+		@return the rate of the specified category
+	*/
+	virtual double getRate(int category) { return rates[category]; }
+
+	/**
+		Compute pattern-specific rates and the category of each pattern.
+		@param pattern_rates (OUT) pattern rates. Resized if necessary
+		@param pattern_cat (OUT) rate category assigned to each pattern
+        @return total number of categories
+	*/
+	virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat);
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]);
+
+
+	/**
+		optimize model parameters
+		@param epsilon NOTE(review): presumably the convergence tolerance of the
+			optimization - confirm against the implementation
+		@return the best likelihood 
+	*/
+	virtual double optimizeParameters(double epsilon);
+
+	/**
+		@return the number of dimensions, i.e. the number of free rates
+			(ncategory-1: the last rate is derived from the others)
+	*/
+	virtual int getNDim() { return (ncategory-1); }
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		write parameters, used with modeltest
+		@param out output stream
+	*/
+	virtual void writeParameters(ostream &out);
+
+protected:
+
+	/**
+		number of rate categories
+	*/
+	int ncategory;
+
+	/**
+		rates, containing ncategory elements
+	*/
+	double *rates;
+
+	
+	/**
+		This function serves the multi-dimensional optimization. It packs the model
+		parameters into a vector that is indexed from 1 (NOTE: not from 0).
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		This function serves the multi-dimensional optimization. It assigns the model
+		parameters from a vector of variables that is indexed from 1 (NOTE: not from 0).
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+
+};
+
+#endif // RATEKATEGORY_H
diff --git a/model/ratemeyerdiscrete.cpp b/model/ratemeyerdiscrete.cpp
new file mode 100644
index 0000000..a7253ae
--- /dev/null
+++ b/model/ratemeyerdiscrete.cpp
@@ -0,0 +1,536 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include "phylotree.h"
+#include "ratemeyerdiscrete.h"
+//#include "kmeans/KMeans.h"
+//#include "modeltest_wrapper.h"
+
+/************************************************
+	Huy's k-means dynamic programming algorithm
+************************************************/
+
+/**
+	Sort arr[left..right] (both bounds inclusive) into ascending order, in place.
+	Every swap is mirrored in weight[] and index[], so the three arrays stay
+	aligned element by element. The sort is not stable.
+	@param arr keys to sort
+	@param weight values carried along with the keys
+	@param index values carried along with the keys (the caller passes the
+		original positions here)
+	@param left first position of the range
+	@param right last position of the range; a range with right <= left is
+		left untouched
+*/
+void quicksort(double arr[], int weight[], int index[], int left, int right) {
+      // A range of fewer than two elements is already sorted. Returning here
+      // also keeps the pivot read below inside the range when the range is
+      // empty (right < left), e.g. when the caller has no points at all.
+      if (left >= right)
+            return;
+
+      int i = left, j = right;
+      const double pivot = arr[(left + right) / 2];
+
+      /* partition: afterwards arr[left..j] <= pivot <= arr[i..right] */
+      while (i <= j) {
+            while (arr[i] < pivot)
+                  i++;
+            while (arr[j] > pivot)
+                  j--;
+            if (i <= j) {
+                  const double tmp_key = arr[i];
+                  arr[i] = arr[j];
+                  arr[j] = tmp_key;
+
+                  const int tmp_index = index[i];
+                  index[i] = index[j];
+                  index[j] = tmp_index;
+
+                  const int tmp_weight = weight[i];
+                  weight[i] = weight[j];
+                  weight[j] = tmp_weight;
+
+                  i++;
+                  j--;
+            }
+      }
+
+      /* recursion on the two halves */
+      if (left < j)
+            quicksort(arr, weight, index, left, j);
+      if (i < right)
+            quicksort(arr, weight, index, i, right);
+}
+
+/**
+	Weighted sum of squared deviations from the weighted mean for the sorted
+	points l..r (1-based, inclusive), evaluated from prefix sums as
+	sum(w*x^2) - (sum(w*x))^2 / sum(w).
+	@param l first point of the run (>= 1, since entry l-1 is read)
+	@param r last point of the run
+	@param sumA prefix sums of weight * value
+	@param sumAsquare prefix sums of weight * value^2
+	@param sumW prefix sums of the weights
+	@return the cost of putting points l..r into one cluster
+*/
+double mean_sum(int l, int r, double *sumA, double *sumAsquare, int *sumW) {
+	const double run_sum = sumA[r] - sumA[l-1];
+	const double run_sum_sq = sumAsquare[r] - sumAsquare[l-1];
+	const int run_weight = sumW[r] - sumW[l-1];
+
+	return run_sum_sq - run_sum*run_sum/run_weight;
+}
+
+
+
+// Weighted k-means clustering of one-dimensional points, solved by dynamic
+// programming over the sorted points (each cluster is a contiguous run of the
+// sorted sequence).
+//   - n: The number of points in the data set.
+//   - k: The number of clusters to look for. The tables below are only filled
+//        for cluster counts up to min(i, k) on the first i points, so callers
+//        must pass 1 <= k <= n; other values read uninitialised entries.
+//   - points: An array of size n. It is sorted into ascending order IN PLACE.
+//   - weights: An array of size n giving the integer weight of each point. It
+//              is reordered together with points.
+//   - centers: An array of size k (must not be null). centers[c] receives the
+//              weighted mean of cluster c; clusters are numbered in increasing
+//              order of their values.
+//   - assignments: An array of size n (must not be null), indexed by the
+//                  ORIGINAL position of a point. It is filled with the cluster
+//                  that each point is assigned to (an integer between 0 and
+//                  k-1 inclusive).
+// The final cost of the clustering (total weighted sum of squared deviations
+// from the cluster means) is returned.
+
+double RunKMeans1D(int n, int k, double *points, int *weights, double *centers, int *assignments) {
+	// Prefix sums over the sorted points; entry i covers the first i points
+	// and entry 0 is the empty prefix.
+	double *sumA = new double[n+1];         // sum of weight * value
+	double *sumAsquare = new double[n+1];   // sum of weight * value^2
+	int *sumW = new int[n+1];               // sum of weights
+
+	// Cost[i][j]: cheapest way found to split the first i points into j clusters.
+	// trace[i][j]: how many points precede the last cluster in that split.
+	double **Cost = new double*[n+1];
+	for (int i=0; i<=n; i++) Cost[i] = new double[k+1];
+	int **trace = new int*[n+1];
+	for (int i=0; i<=n; i++) trace[i] = new int[k+1];
+
+	// index[p] is the original position of the point now at sorted position p.
+	int *index = new int[n+1];
+	for (int i=0; i<n; i++) index[i] = i;
+
+	quicksort(points, weights, index, 0, n-1);
+
+	sumA[0] = 0; sumAsquare[0] =0; sumW[0] = 0;
+	for (int i=1; i<=n; i++) {
+		sumA[i] = sumA[i-1] + points[i-1] * weights[i-1];
+		sumAsquare[i] = sumAsquare[i-1] + points[i-1]*points[i-1] * weights[i-1];
+		sumW[i] = sumW[i-1] + weights[i-1];
+	}
+
+	Cost[0][0] = 0;
+	for (int i=1; i<=n; i++) {
+		// one cluster holding all of the first i points
+		Cost[i][1] = mean_sum(1, i, sumA, sumAsquare, sumW);
+		trace[i][1] = 0;
+		for (int j=2; j<=(i<=k?i:k); j++) {
+			// start from "j-1 singleton clusters, then one cluster j..i" ...
+			Cost[i][j] = Cost[j-1][j-1]+ mean_sum(j, i, sumA, sumAsquare, sumW);
+			trace[i][j] = j-1;
+			// ... and try every later start _k+1 of the last cluster; ties go
+			// to the later split point because of the >= comparison
+			for (int _k=j; _k<=i-1; _k++) {
+				double temp = mean_sum(_k+1, i, sumA, sumAsquare, sumW);
+				if (Cost[i][j] >= Cost[_k][j-1]+ temp) {
+					Cost[i][j] = Cost[_k][j-1]+ temp;
+					trace[i][j] = _k;
+				}
+			}
+		}
+	}
+
+	double min_cost = Cost[n][k];
+
+	// Walk the trace backwards from (n, k), peeling off one cluster per step.
+    int i = n; int j = k;
+    while (i>0) {
+		int t= trace[i][j];
+		centers[j-1] = (sumA[i]-sumA[t])/(sumW[i]-sumW[t]);
+		for (int _i=t+1; _i<=i; _i++) {
+			// sorted point _i (1-based) belongs to cluster j-1
+			assignments[index[_i-1]] = j-1;
+		}
+		i=t; j=j-1;
+	}
+
+	for (int i=n; i>=0; i--) delete [] trace[i];
+	delete [] trace;
+	for (int i=n; i>=0; i--) delete [] Cost[i];
+	delete [] Cost;
+
+	// index was previously never released and leaked on every call
+	delete [] index;
+	delete [] sumW;
+	delete [] sumAsquare;
+	delete [] sumA;
+
+	return min_cost;
+}
+     
+
+/************************************************
+	RateMeyerDiscrete
+************************************************/
+/**
+	Constructor.
+	@param ncat number of rate categories; a value <= 0 leaves the rate array
+		unallocated and marks the model as auto-detecting the number
+	@param cat_type category type flags, stored in mcat_type
+	@param file_name rate file name, forwarded to RateMeyerHaeseler
+	@param tree associated tree, forwarded to RateMeyerHaeseler
+	@param rate_type forwarded to RateMeyerHaeseler
+*/
+RateMeyerDiscrete::RateMeyerDiscrete(int ncat, int cat_type, char *file_name, PhyloTree *tree, bool rate_type)
+ : RateMeyerHaeseler(file_name, tree, rate_type)
+{
+	ncategory = ncat;
+	mcat_type = cat_type;
+	is_categorized = false;
+	ptn_cat = NULL;
+	rates = NULL;
+
+	// The rate array only exists when the number of categories is given.
+	if (ncat > 0) {
+		rates = new double[ncategory];
+		memset(rates, 0, sizeof(double) * ncategory);
+	}
+
+	// Extend the model names inherited from the base class.
+	name += convertIntToString(ncategory);
+	if (ncategory > 0) {
+		full_name += " with " + convertIntToString(ncategory) + " categories";
+	} else {
+		// NOTE(review): this suffix has no leading space, so it is glued
+		// directly onto the base class name.
+		full_name += "auto-detect #categories";
+	}
+}
+
+/**
+	Default constructor: no categories, no rate or category arrays, empty
+	model names, and rate_mh switched on.
+*/
+RateMeyerDiscrete::RateMeyerDiscrete() {
+	// no categories yet, hence no arrays
+	ncategory = 0;
+	rates = NULL;
+	ptn_cat = NULL;
+
+	is_categorized = false;
+	mcat_type = 0;
+	rate_mh = true;
+
+	full_name = "";
+	name = full_name;
+}
+
+
+/**
+	Destructor: releases the category rates and the per-pattern category
+	array. ptn_cat is allocated with new[] in classifyRatesKMeans() and was
+	previously never freed.
+*/
+RateMeyerDiscrete::~RateMeyerDiscrete()
+{
+	// delete [] on a NULL pointer is a no-op, so no guards are needed
+	delete [] rates;
+	delete [] ptn_cat;
+}
+
+/**
+	@return true as long as the rates have not been classified into
+		categories, false afterwards
+*/
+bool RateMeyerDiscrete::isSiteSpecificRate() {
+	if (is_categorized)
+		return false;
+	return true;
+}
+
+/**
+	@return the number of categories once rates are categorized; before that
+		the answer of RateMeyerHaeseler
+*/
+int RateMeyerDiscrete::getNDiscreteRate() {
+	if (is_categorized) {
+		assert(ncategory > 0);
+		return ncategory;
+	}
+	return RateMeyerHaeseler::getNDiscreteRate();
+}
+
+/**
+	@param category category ID; asserted to be below ncategory once rates
+		are categorized
+	@return the rate of that category once rates are categorized; before
+		that the answer of RateMeyerHaeseler
+*/
+double RateMeyerDiscrete::getRate(int category) {
+	if (is_categorized) {
+		assert(category < ncategory);
+		return rates[category];
+	}
+	return RateMeyerHaeseler::getRate(category);
+}
+
+/**
+	@param ptn pattern ID
+	@return the category stored for that pattern once rates are categorized;
+		before that the answer of RateMeyerHaeseler
+*/
+int RateMeyerDiscrete::getPtnCat(int ptn) {
+	if (is_categorized) {
+		assert(ptn_cat);
+		return ptn_cat[ptn];
+	}
+	return RateMeyerHaeseler::getPtnCat(ptn);
+}
+
+/**
+	@param ptn pattern ID
+	@return the rate of the category the pattern belongs to once rates are
+		categorized; before that the answer of RateMeyerHaeseler
+*/
+double RateMeyerDiscrete::getPtnRate(int ptn) {
+	if (is_categorized) {
+		assert(ptn_cat && rates);
+		const int cat = ptn_cat[ptn];
+		return rates[cat];
+	}
+	return RateMeyerHaeseler::getPtnRate(ptn);
+}
+
+/**
+	Copy the per-pattern rates held by this object and the per-pattern
+	categories into the output vectors.
+	The vectors are overwritten. The previous insert-at-begin form prepended
+	to whatever they already contained, so a non-empty vector or a second call
+	left stale entries behind the new ones.
+	NOTE(review): ptn_cat is read unconditionally, so it must already have
+	been allocated (it is set up in classifyRatesKMeans()).
+	@param pattern_rates (OUT) pattern rates, resized to the number of patterns
+	@param pattern_cat (OUT) pattern categories, resized likewise
+	@return total number of categories
+*/
+int RateMeyerDiscrete::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) {
+	pattern_rates.assign(begin(), end());
+	pattern_cat.assign(ptn_cat, ptn_cat + size());
+    return ncategory;
+}
+
+/*double RateMeyerDiscrete::optimizeParameters() {
+	if (is_categorized) {
+		is_categorized = false;
+		phylo_tree->clearAllPartialLh();
+		return phylo_tree->computeLikelihood();
+	}
+	double tree_lh = RateMeyerHaeseler::optimizeParameters();
+	return tree_lh;
+}*/
+
+/**
+	Optimize the rates. Before categorization this defers to
+	RateMeyerHaeseler; afterwards the pairwise distances are recomputed, the
+	rate of every category is optimized in turn, the rates are normalized and
+	the tree likelihood is recomputed from scratch.
+	@param epsilon only forwarded to RateMeyerHaeseler::optimizeParameters
+	@return the tree log-likelihood
+*/
+double RateMeyerDiscrete::optimizeParameters(double epsilon) {
+	if (is_categorized) {
+		phylo_tree->calcDist(dist_mat);
+
+		int cat = 0;
+		while (cat < ncategory) {
+			optimizeCatRate(cat);
+			++cat;
+		}
+		normalizeRates();
+
+		// rates changed: cached partial likelihoods are stale
+		phylo_tree->clearAllPartialLH();
+		return phylo_tree->computeLikelihood();
+	}
+	return RateMeyerHaeseler::optimizeParameters(epsilon);
+}
+
+
+/**
+	Objective function minimized when optimizing the rate of one category
+	(the category is taken from optimizing_cat).
+	Before categorization the call is forwarded to RateMeyerHaeseler.
+	When rate_mh is off, ptn_tree is rescaled to the candidate rate and its
+	negated likelihood is returned. Otherwise the value is a sum over all
+	pairs of sequences of -frequency * log(transition probability), with the
+	transition matrix evaluated at value * pairwise distance.
+	@param value candidate rate of the category
+	@return the function value to minimize
+*/
+double RateMeyerDiscrete::computeFunction(double value) {
+	if (!is_categorized) return RateMeyerHaeseler::computeFunction(value);
+	if (!rate_mh) {
+		// rescale only when the rate really changed; scaling is relative to the current scale
+		if (value != cur_scale) {
+			ptn_tree->scaleLength(value/cur_scale);
+			cur_scale = value;
+			ptn_tree->clearAllPartialLH();
+		}
+		return -ptn_tree->computeLikelihood();
+	}
+
+	double lh = 0.0;
+	int nseq = phylo_tree->leafNum;
+	int nstate = phylo_tree->getModel()->num_states;
+	int i, j, k, state1, state2;
+	ModelSubst *model = phylo_tree->getModel();
+    int trans_size = nstate * nstate;
+	// scratch buffers, nstate x nstate each, released before returning
+	double *trans_mat = new double[trans_size];
+	int *pair_freq = new int[trans_size];
+
+	for (i = 0; i < nseq-1; i++) 
+		for (j = i+1; j < nseq; j++) {
+			// count, for this pair of sequences, how often each (state1, state2)
+			// combination occurs among the patterns of the current category
+			memset(pair_freq, 0, trans_size * sizeof(int));
+			for (k = 0; k < size(); k++) {
+				if (ptn_cat[k] != optimizing_cat) continue;
+				Pattern *pat = & phylo_tree->aln->at(k);
+				// states >= nstate are skipped (presumably gaps/unknown characters - TODO confirm)
+				if ((state1 = pat->at(i)) < nstate && (state2 = pat->at(j)) < nstate)
+					pair_freq[state1*nstate + state2] += pat->frequency;
+			}
+			model->computeTransMatrix(value * dist_mat[i*nseq + j], trans_mat);
+			// only observed state pairs contribute
+			for (k = 0; k < trans_size; k++) if (pair_freq[k])
+				lh -= pair_freq[k] * log(trans_mat[k]);
+		}
+	delete [] pair_freq;
+	delete [] trans_mat;
+	return lh;
+}
+
+/**
+	First and second derivative, with respect to the rate, of the pairwise
+	objective used for the category in optimizing_cat.
+	Before categorization the call is forwarded to RateMeyerHaeseler.
+	For every pair of sequences and every observed state pair, with P, P' and
+	P'' as filled in by computeTransDerv at value * dist:
+		df  -= frequency * (P'/P) * dist
+		ddf -= frequency * (P''/P - (P'/P)^2) * dist^2
+	@param value candidate rate of the category
+	@param df (OUT) first derivative
+	@param ddf (OUT) second derivative
+*/
+void RateMeyerDiscrete::computeFuncDerv(double value, double &df, double &ddf) {
+	if (!is_categorized) {
+		RateMeyerHaeseler::computeFuncDerv(value, df, ddf);
+		return;
+	}
+
+//	double lh = 0.0;
+	int nseq = phylo_tree->leafNum;
+	int nstate = phylo_tree->getModel()->num_states;
+	int i, j, k, state1, state2;
+	ModelSubst *model = phylo_tree->getModel();
+    int trans_size = nstate * nstate;
+	// scratch buffers, nstate x nstate each, released before returning
+	double *trans_mat = new double[trans_size];
+	double *trans_derv1 = new double[trans_size];
+	double *trans_derv2 = new double[trans_size];
+	df = ddf = 0.0;
+
+	int *pair_freq = new int[trans_size];
+
+	for (i = 0; i < nseq-1; i++) 
+		for (j = i+1; j < nseq; j++) {
+			// count, for this pair of sequences, how often each (state1, state2)
+			// combination occurs among the patterns of the current category
+			memset(pair_freq, 0, trans_size * sizeof(int));
+			for (k = 0; k < size(); k++) {
+				if (ptn_cat[k] != optimizing_cat) continue;
+				Pattern *pat = & phylo_tree->aln->at(k);
+				// states >= nstate are skipped (presumably gaps/unknown characters - TODO confirm)
+				if ((state1 = pat->at(i)) < nstate && (state2 = pat->at(j)) < nstate)
+					pair_freq[state1*nstate + state2] += pat->frequency;
+			}
+			double dist = dist_mat[i*nseq + j];
+			double derv1 = 0.0, derv2 = 0.0;
+			model->computeTransDerv(value * dist, trans_mat, trans_derv1, trans_derv2);
+			for (k = 0; k < trans_size; k++) if (pair_freq[k]) {
+				// derivatives of log P with respect to the scaled distance
+				double t1 = trans_derv1[k] / trans_mat[k];
+				double t2 = trans_derv2[k] / trans_mat[k];
+				trans_derv1[k] = t1;
+				trans_derv2[k] = (t2 - t1*t1);
+//				lh -= log(trans_mat[k]) * pair_freq[k];
+				derv1 += trans_derv1[k] * pair_freq[k];
+				derv2 += trans_derv2[k] * pair_freq[k];
+			}
+			// chain rule: the matrices were evaluated at value * dist
+			df -= derv1 * dist;
+			ddf -= derv2 * dist * dist;
+		}
+	delete [] pair_freq;
+	delete [] trans_derv2;
+	delete [] trans_derv1;
+	delete [] trans_mat;
+//	return lh;
+
+/*	double lh = 0.0, derv1, derv2;
+	df = 0.0; ddf = 0.0;	
+	for (int i = 0; i < size(); i++)
+		if (ptn_cat[i] == optimizing_cat) {
+			optimizing_pattern = i;
+			int freq =  phylo_tree->aln->at(i).frequency;
+			lh += RateMeyerHaeseler::computeFuncDerv(value, derv1, derv2) * freq;
+			df += derv1 * freq;
+			ddf += derv2 * freq;
+		}
+	return lh;*/
+}
+
+
+/**
+	Optimize the rate of a single category and store it in rates[cat].
+	The search runs between MIN_SITE_RATE and MAX_SITE_RATE, starting from the
+	current rate. Sets optimizing_cat, which computeFunction() and
+	computeFuncDerv() read.
+	@param cat category ID
+	@return the optimized rate
+*/
+double RateMeyerDiscrete::optimizeCatRate(int cat) {
+	optimizing_cat = cat;
+	double negative_lh;
+	double current_rate = rates[cat];
+	double ferror, optx;
+
+	if (!rate_mh) {
+		// collect the patterns of this category and hand them to prepareRateML
+		IntVector ptn_id;
+		for (int i = 0; i < size(); i++)
+			if (ptn_cat[i] == optimizing_cat)
+				ptn_id.push_back(i);
+		prepareRateML(ptn_id);
+	}
+
+    if (phylo_tree->optimize_by_newton && rate_mh) // Newton-Raphson method 
+	{
+    	optx = minimizeNewtonSafeMode(MIN_SITE_RATE, current_rate, MAX_SITE_RATE, TOL_SITE_RATE, negative_lh);
+    }
+    else {
+		optx = minimizeOneDimen(MIN_SITE_RATE, current_rate, MAX_SITE_RATE, TOL_SITE_RATE, &negative_lh, &ferror);
+		double fnew;
+		// prefer a boundary whenever it is at least as good (within
+		// TOL_SITE_RATE) as the optimum found by the one-dimensional search
+		if ((optx < MAX_SITE_RATE) && (fnew = computeFunction(MAX_SITE_RATE)) <= negative_lh+TOL_SITE_RATE) {
+			optx = MAX_SITE_RATE;
+			negative_lh = fnew;
+		}
+		if ((optx > MIN_SITE_RATE) && (fnew = computeFunction(MIN_SITE_RATE)) <= negative_lh+TOL_SITE_RATE) {
+			optx = MIN_SITE_RATE;
+			negative_lh = fnew;
+		}
+	}
+	//negative_lh = brent(MIN_SITE_RATE, current_rate, max_rate, 1e-3, &optx);
+	// snap results that lie very close to a boundary onto that boundary
+	if (optx > MAX_SITE_RATE*0.99) optx = MAX_SITE_RATE;
+	if (optx < MIN_SITE_RATE*2) optx = MIN_SITE_RATE;
+	rates[cat] = optx;
+//#ifndef NDEBUG		
+//#endif
+
+	if (!rate_mh) completeRateML();
+	return optx;	
+}
+
+/**
+	Rescale the category rates so that the frequency-weighted mean rate over
+	the patterns is 1. Patterns whose rate is not below MAX_SITE_RATE are left
+	out of the mean, and only category rates strictly between 2*MIN_SITE_RATE
+	and MAX_SITE_RATE are rescaled. Nothing happens when the weighted rate sum
+	already matches the site count to within 1e-3.
+*/
+void RateMeyerDiscrete::normalizeRates() {
+	const int nptn = size();
+	double rate_total = 0.0;	// sum of rate * frequency over the counted patterns
+	double site_total = 0.0;	// sum of frequency over the counted patterns
+
+	for (int ptn = 0; ptn < nptn; ptn++) {
+		if (!(getPtnRate(ptn) < MAX_SITE_RATE))
+			continue;
+		rate_total += getPtnRate(ptn) * phylo_tree->aln->at(ptn).frequency;
+		site_total += phylo_tree->aln->at(ptn).frequency;
+	}
+
+	if (!(fabs(rate_total - site_total) > 1e-3))
+		return;
+
+	const double scale_f = site_total / rate_total;
+	for (int cat = 0; cat < ncategory; cat++) {
+		const bool above_min = rates[cat] > 2*MIN_SITE_RATE;
+		if (above_min && rates[cat] < MAX_SITE_RATE)
+			rates[cat] *= scale_f;
+	}
+}
+
+/**
+	Group the per-pattern rates into ncategory categories with RunKMeans1D,
+	store the resulting category rates and pattern assignments, normalize the
+	rates and recompute the tree likelihood.
+	mcat_type flags: MCAT_LOG clusters log-rates instead of rates;
+	MCAT_PATTERN weighs every pattern by 1 instead of its frequency;
+	MCAT_MEAN returns right after clustering instead of calling the model
+	factory's optimizeParameters().
+	@return tree log-likelihood
+*/
+double RateMeyerDiscrete::classifyRatesKMeans() {
+
+	assert(ncategory > 0);
+	const int nptn = size();
+
+	// input of the clustering: one point and one weight per pattern
+	double *points = new double[nptn];
+	int *weights = new int[nptn];
+	if (!ptn_cat) ptn_cat = new int[nptn];
+
+	for (int ptn = 0; ptn < nptn; ptn++) {
+		points[ptn] = at(ptn);
+		if (mcat_type & MCAT_LOG)
+			points[ptn] = log(points[ptn]);
+
+		if (mcat_type & MCAT_PATTERN)
+			weights[ptn] = 1;
+		else
+			weights[ptn] = phylo_tree->aln->at(ptn).frequency;
+	}
+	memset(rates, 0, sizeof(double) * ncategory);
+
+	const double cost = RunKMeans1D(nptn, ncategory, points, weights, rates, ptn_cat);
+
+	// cluster centers become the category rates (mapped back from log scale)
+	if (mcat_type & MCAT_LOG) {
+		for (int cat = 0; cat < ncategory; cat++)
+			rates[cat] = exp(rates[cat]);
+	}
+	// keep the first and the last category inside the allowed rate range
+	if (rates[0] < MIN_SITE_RATE)
+		rates[0] = MIN_SITE_RATE;
+	if (rates[ncategory-1] > MAX_SITE_RATE - 1e-6)
+		rates[ncategory-1] = MAX_SITE_RATE;
+
+	if (verbose_mode >= VB_MED) {
+		cout << "K-means cost: " << cost << endl;
+		for (int cat = 0; cat < ncategory; cat++)
+			cout << rates[cat] << " ";
+		cout << endl;
+	}
+
+	normalizeRates();
+	phylo_tree->clearAllPartialLH();
+	const double cur_lh = phylo_tree->computeLikelihood();
+
+	delete [] weights;
+	delete [] points;
+
+	if (mcat_type & MCAT_MEAN)
+		return cur_lh;
+
+	return phylo_tree->getModelFactory()->optimizeParameters(false,false, TOL_LIKELIHOOD);
+}
+
+
+/**
+	Classify the continuous rates into categories (done once; later calls
+	return tree_lh unchanged).
+	With a preset number of categories (ncategory > 0) the rates are clustered
+	directly. Otherwise the number of categories is increased from 2 until a
+	chi-square test of 2*(tree_lh - categorized log-likelihood) with
+	(#patterns - #categories) degrees of freedom gives a p-value above 0.05.
+	The search now also stops before the number of categories reaches the
+	number of patterns: beyond that point the rates array (sized to the number
+	of patterns) would be overrun and the test would have no degrees of
+	freedom left.
+	@param tree_lh the current tree log-likelihood
+	@return the tree log-likelihood after classification
+*/
+double RateMeyerDiscrete::classifyRates(double tree_lh) {
+	if (is_categorized) return tree_lh;
+
+	double new_tree_lh;
+	is_categorized = true;
+	if (ncategory > 0) {
+		cout << endl << "Classifying rates into " << ncategory << " categories..." << endl;
+		return classifyRatesKMeans();
+	}
+
+	// identifying proper number of categories
+	int nptn = phylo_tree->aln->getNPattern();
+	delete [] rates;	// normally NULL on this path; avoids a leak if it is not
+	rates = new double[nptn];
+
+	for (ncategory = 2; ; ncategory++) {
+		cout << endl << "Classifying rates into " << ncategory << " categories..." << endl;
+		// called for its effect on rates/ptn_cat; the likelihood reported is
+		// the one after re-optimizing all branches
+		classifyRatesKMeans();
+		new_tree_lh = phylo_tree->optimizeAllBranches();
+		cout << "For " << ncategory << " categories, LogL = " << new_tree_lh;
+		double lh_diff = 2*(tree_lh - new_tree_lh);
+		int df = (nptn - ncategory);
+		double pval = computePValueChiSquare(lh_diff, df);
+		cout << ", p-value = " << pval;
+		cout << endl;
+		//if (new_tree_lh > tree_lh - 3.0) break;
+		if (pval > 0.05) break;
+		// safety net: one more category would leave no degree of freedom
+		if (ncategory + 1 >= nptn) break;
+	}
+
+	cout << endl << "Number of categories is set to " << ncategory << endl;
+	return new_tree_lh;
+}
+
+
+
+
+/**
+	Write information about the model. Currently writes nothing: the only
+	statement (printing the number of categories) is commented out.
+	@param out output stream (unused)
+*/
+void RateMeyerDiscrete::writeInfo(ostream &out) {
+	//out << "Number of categories: " << ncategory << endl;
+}
+
diff --git a/model/ratemeyerdiscrete.h b/model/ratemeyerdiscrete.h
new file mode 100644
index 0000000..e5b8a4f
--- /dev/null
+++ b/model/ratemeyerdiscrete.h
@@ -0,0 +1,166 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef RATEMEYERDISCRETE_H
+#define RATEMEYERDISCRETE_H
+
+#include "ratemeyerhaeseler.h"
+
+/**
+The discrete version of the Meyer & von Haeseler rate class: the per-pattern
+rates are grouped into a number of rate categories.
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class RateMeyerDiscrete : public RateMeyerHaeseler
+{
+public:
+ 	/**
+		constructor
+		@param ncat number of rate categories; <= 0 to auto-detect the number
+		@param cat_type category type, bitwise, incl. MCAT_LOG, MCAT_MEAN, MCAT_PATTERN
+		@param file_name rate file name, NULL if not given
+		@param tree associated phylo tree
+		@param rate_type passed on to RateMeyerHaeseler
+   */
+   RateMeyerDiscrete(int ncat, int cat_type, char *file_name, PhyloTree *tree, bool rate_type);
+
+   RateMeyerDiscrete();
+
+	/**
+		destructor
+	*/
+    virtual ~RateMeyerDiscrete();
+
+	/**
+		get the number of rate categories for site-specific category model
+		@return the number of rate categories
+	*/
+	virtual int getNDiscreteRate();
+
+	/**
+		@param category category ID from 0 to #category-1
+		@return the rate of the specified category
+	*/
+	virtual double getRate(int category);
+
+	/**
+		get the rate of a specified site-pattern. After categorization this is
+		the rate of the category the pattern belongs to.
+		@param ptn pattern ID 
+		@return the rate of the specified site-pattern
+	*/
+	virtual double getPtnRate(int ptn);
+
+	/**
+		get rate category of a specified site-pattern. 
+		@param ptn pattern ID 
+		@return the rate category of the specified site-pattern
+	*/
+	virtual int getPtnCat(int ptn);
+
+	/**
+		Compute site-specific rates.
+		@param pattern_rates (OUT) pattern rates. Resized if necessary
+		@param pattern_cat (OUT) rate category of each pattern
+        @return total number of categories        
+	*/
+	virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat);
+
+	/**
+		@return true while the rates are still per-pattern, false once they
+			have been classified into categories
+	*/
+	virtual bool isSiteSpecificRate();
+
+	/**
+		return the number of dimensions
+	*/
+	virtual int getNDim() { return ncategory; }
+
+	/**
+		Optimize the rates: the per-pattern rates before categorization
+		(handled by RateMeyerHaeseler), the category rates afterwards.
+		@param epsilon passed on to RateMeyerHaeseler before categorization
+		@return the tree log-likelihood
+	*/
+	virtual double optimizeParameters(double epsilon);
+
+	/**
+		classify rates into categories.
+		@param tree_lh the current tree log-likelihood
+		@return the tree log-likelihood after classification
+	*/
+	virtual double classifyRates(double tree_lh);
+
+	/**
+		classify rates into ncategory categories using one-dimensional k-means.
+		@return tree likelihood
+	*/
+	double classifyRatesKMeans();
+
+
+	/**
+		This function is inherited from Optimization class for optimizing site rates 
+		@param value x-value of the function
+		@return f(value) of function f you want to minimize
+	*/
+	virtual double computeFunction(double value);
+
+	/**
+		This function calculates the first derivative f'(value) and the 2nd derivative f''(value).
+		@param value x-value of the function
+		@param df (OUT) first derivative
+		@param ddf (OUT) second derivative
+	*/
+	virtual void computeFuncDerv(double value, double &df, double &ddf);
+
+	/**
+		optimize the rate of one category
+		@param cat category ID
+		@return the optimized rate
+	*/
+	double optimizeCatRate(int cat);
+
+	/**
+		rescale the category rates so that the frequency-weighted mean rate is 1
+	*/
+	void normalizeRates();
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+protected:
+
+	/**
+		number of rate categories
+	*/
+	int ncategory;
+
+	/**
+		category index for every pattern
+	*/
+	int *ptn_cat;
+
+	/**
+		rates, containing ncategory elements
+	*/
+	double *rates;
+
+	/**
+		false at beginning, true once the rates have been classified into categories
+	*/
+	bool is_categorized;
+
+	/**
+		category type flags given to the constructor (see the MCAT_* constants)
+	*/
+	int mcat_type;
+	
+	/**
+		current category under optimization. Note that this is not thread-safe
+	*/
+	int optimizing_cat;
+
+};
+
+#endif
diff --git a/model/ratemeyerhaeseler.cpp b/model/ratemeyerhaeseler.cpp
new file mode 100644
index 0000000..9b61b92
--- /dev/null
+++ b/model/ratemeyerhaeseler.cpp
@@ -0,0 +1,496 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "phylotree.h"
+#include "ratemeyerhaeseler.h"
+
+
+
+/**
+	Construct a site-specific rate model bound to a tree.
+	@param file_name name of a site-rate file or NULL; only the pointer is stored (as rate_file), the string is not copied
+	@param tree tree handed to setTree()
+	@param rate_type true selects the "+M" model (Meyer & von Haeseler 2003),
+		false selects the experimental "+CAT" variant (Stamatakis 2007)
+*/
+RateMeyerHaeseler::RateMeyerHaeseler(char *file_name, PhyloTree *tree, bool rate_type)
+ : RateHeterogeneity()
+{
+	if (rate_type) {
+		name = "+M";
+		full_name = "Meyer & von Haeseler (2003)";
+	} else {
+		name = "+CAT";
+		full_name = "Stamatakis (2007) experimental";
+	}
+	dist_mat = NULL;
+	// give the per-pattern optimization state defined values so that no member is left indeterminate
+	ptn_tree = NULL;
+	cur_scale = 1.0;
+	optimizing_pattern = 0;
+	setTree(tree);
+	rate_file = file_name;
+	rate_mh = rate_type;
+}
+
+/**
+	Construct a "+M" (Meyer & von Haeseler 2003) rate model that is attached
+	to neither a tree nor a rate file.
+*/
+RateMeyerHaeseler::RateMeyerHaeseler()
+ : RateHeterogeneity()
+{
+	name = "+M";
+	full_name = "Meyer & von Haeseler (2003)";
+	dist_mat = NULL;
+	phylo_tree = NULL;
+	rate_file = NULL;
+	rate_mh = true;
+	// give the per-pattern optimization state defined values so that no member is left indeterminate
+	ptn_tree = NULL;
+	cur_scale = 1.0;
+	optimizing_pattern = 0;
+}
+
+/**
+	Read site-specific rates from a text file and store one rate per alignment pattern.
+	The first line of the file is skipped. Each of the following #sites lines must start
+	with a 1-based site number followed by the rate of that site. A rate of zero is raised
+	to MIN_SITE_RATE and rates of MAX_SITE_RATE or more are capped at MAX_SITE_RATE.
+	Any problem (unreadable file, bad site number, negative rate, pattern without a rate)
+	is reported through outError().
+	@param rate_file name of the file to read
+*/
+void RateMeyerHaeseler::readRateFile(char *rate_file) {
+	cout << "Reading site-specific rate file " << rate_file << " ..." << endl;
+	try {
+		ifstream in;
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(rate_file);
+		char line[256];
+		int site, i;
+		double rate;
+		int nsites = phylo_tree->aln->getNSite();
+		// every pattern starts as "no rate seen" (-1), even if this vector already held values
+		assign(phylo_tree->aln->getNPattern(), -1.0);
+
+		// skip the header line
+		in.getline(line, sizeof(line));
+
+		for (i = 0; i < nsites; i++) {
+			in.getline(line, sizeof(line));
+			stringstream ss(line);
+			string tmp;
+			ss >> tmp;
+			site = convert_int(tmp.c_str());
+			if (site <= 0 || site > nsites) throw "Wrong site number (must be between 1 and #sites)";
+			site--;
+			ss >> tmp;
+			rate = convert_double(tmp.c_str());
+			if (rate < 0.0) throw "Negative rate not allowed";
+			if (rate <= 0.0) rate = MIN_SITE_RATE;
+			if (rate >= MAX_SITE_RATE) rate = MAX_SITE_RATE;
+			at(phylo_tree->aln->getPatternID(site)) = rate;
+		}
+		in.clear();
+		// set the failbit again
+		in.exceptions(ios::failbit | ios::badbit);
+		in.close();
+
+		// Count saturation per pattern after reading: counting while reading added the
+		// pattern frequency once for every site of the pattern and counted a pattern
+		// once per site line.
+		int saturated_sites = 0, saturated_ptn = 0;
+		for (i = 0; i < (int)size(); i++) {
+			if (at(i) < 0.0) throw "Some site has no rate information";
+			if (at(i) >= MAX_SITE_RATE) {
+				saturated_sites += phylo_tree->aln->at(i).frequency;
+				saturated_ptn++;
+			}
+		}
+
+		if (saturated_sites) {
+			stringstream str;
+			str << saturated_sites << " sites (" << saturated_ptn << " patterns) show too high rates (>=" << MAX_SITE_RATE << ")";
+			outWarning(str.str());
+		}
+	} catch (const char *str) {
+		outError(str);
+	} catch (string str) {
+		outError(str);
+	} catch (const ios::failure &) {
+		outError(ERR_READ_INPUT);
+	}
+}
+
+/** Release the distance matrix if one was allocated. */
+RateMeyerHaeseler::~RateMeyerHaeseler()
+{
+	// delete[] on a NULL pointer does nothing, so no explicit test is required
+	delete [] dist_mat;
+}
+
+/**
+	@return number of patterns minus one; the pattern count comes from the tree's
+		alignment when a tree is set, otherwise from the stored rates (0 if none are stored)
+*/
+int RateMeyerHaeseler::getNDim() {
+	if (!phylo_tree)
+		return empty() ? 0 : size()-1;
+	return phylo_tree->aln->getNPattern()-1;
+}
+
+/*
+double RateMeyerHaeseler::getRate(int category) {
+	if (category < size())
+		return at(category);
+
+	return 1.0;
+}*/
+
+/**
+	@param ptn pattern ID
+	@return the stored rate of the pattern, or 1.0 when no rate is stored at that index
+*/
+double RateMeyerHaeseler::getPtnRate(int ptn) {
+	return (ptn < size()) ? at(ptn) : 1.0;
+}
+
+/**
+	Copy the stored pattern rates into pattern_rates.
+	@param pattern_rates (OUT) receives exactly the stored rates; previous content is discarded
+	@param pattern_cat not used by this class
+	@return number of stored rates
+*/
+int RateMeyerHaeseler::computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat) {
+	// This object is itself a DoubleVector: when the caller passes it as the output
+	// there is nothing to copy, and copying a vector into itself is not allowed.
+	if (&pattern_rates != static_cast<DoubleVector*>(this))
+		pattern_rates.assign(begin(), end());
+	return size();
+}
+
+/**
+	Copy the stored rates into 'rates'. When no rate is stored yet, 'rates' is
+	filled with 1.0, one entry per element of the tree's alignment.
+	@param rates (OUT) previous content is discarded
+*/
+void RateMeyerHaeseler::getRates(DoubleVector &rates) {
+	rates.clear();
+	if (!empty()) {
+		rates.insert(rates.end(), begin(), end());
+		return;
+	}
+	rates.resize(phylo_tree->aln->size(), 1.0);
+}
+
+/**
+	Replace the stored rates by a copy of 'rates'.
+	@param rates new rates
+*/
+void RateMeyerHaeseler::setRates(DoubleVector &rates) {
+	clear();
+	// the vector is empty at this point, so appending equals inserting at the front
+	insert(end(), rates.begin(), rates.end());
+}
+
+/**
+	Give every pattern a starting rate derived from the fraction of sequence pairs
+	that differ in that pattern. Only states below the model's number of states
+	take part in the pair count.
+*/
+void RateMeyerHaeseler::initializeRates() {
+	const int nseq = phylo_tree->leafNum;
+	const int nstate = phylo_tree->getModel()->num_states;
+
+	if (nseq < 25)
+		outWarning("Meyer & von Haeseler model is not recommended for < 25 sequences\n");
+
+	resize(phylo_tree->aln->getNPattern(), 1.0);
+
+	int rate_id = 0;
+	Alignment::iterator pat = phylo_tree->aln->begin();
+	while (pat != phylo_tree->aln->end()) {
+		// count the compared pairs and how many of them differ
+		int diff = 0, total = 0;
+		for (int a = 0; a < nseq-1; a++) {
+			int state_a = pat->at(a);
+			if (state_a >= nstate) continue;
+			for (int b = a+1; b < nseq; b++) {
+				int state_b = pat->at(b);
+				if (state_b >= nstate) continue;
+				total++;
+				if (state_a != state_b) diff++;
+			}
+		}
+		// both counts are kept at 1 or more, so the rate is never zero and the division is defined
+		if (diff == 0) diff = 1;
+		if (total == 0) total = 1;
+		double obs_diff = double(diff) / total;
+		double tolog = 1.0 - obs_diff*nstate/(nstate-1);
+		// fall back to the raw fraction when the logarithm is not defined
+		at(rate_id) = (tolog > 0.0) ? -log(tolog) * (nstate-1) / nstate : obs_diff;
+		pat++;
+		rate_id++;
+	}
+}
+
+void RateMeyerHaeseler::prepareRateML(IntVector &ptn_id) { // set up ptn_tree: a temporary tree over only the patterns listed in ptn_id
+	Alignment *aln = new Alignment(); // NOTE(review): completeRateML() deletes ptn_tree->aln, presumably this object - confirm
+	aln->extractPatterns(phylo_tree->aln, ptn_id);
+	ptn_tree = new PhyloTree(aln);
+	stringstream ss; // the tree is transferred from phylo_tree through its printed form
+	phylo_tree->printTree(ss);
+	ptn_tree->readTree(ss, phylo_tree->rooted);
+	ptn_tree->setAlignment(aln);
+	ptn_tree->setModelFactory(phylo_tree->getModelFactory()); // the factory and model pointers of phylo_tree are passed on, no copy is made here
+	ptn_tree->setModel(phylo_tree->getModelFactory()->model);
+	ptn_tree->setRate(new RateHeterogeneity()); // NOTE(review): not deleted in this file; presumably ptn_tree frees it - confirm
+	ptn_tree->computeLikelihood();
+	//cout << optimizing_pattern << " " << lh << endl;
+	cur_scale = 1.0; // computeFunction() rescales ptn_tree relative to this value
+}
+
+void RateMeyerHaeseler::completeRateML() { // tear down the temporary tree built by prepareRateML()
+	ptn_tree->setModelFactory(NULL); // reset the pointers taken from phylo_tree first; presumably so that deleting ptn_tree does not free them - confirm
+	ptn_tree->setModel(NULL);
+	//ptn_tree->setRate(NULL);
+	delete ptn_tree->aln;
+	delete ptn_tree;
+	ptn_tree = NULL;
+}
+
+double RateMeyerHaeseler::optimizeRate(int pattern) { // optimize the rate of one pattern, store it in at(pattern) and return it
+	optimizing_pattern = pattern; // read by computeFunction()/computeFuncDerv(), which only receive the rate value
+	// upper end of the search interval
+	double max_rate = MAX_SITE_RATE;
+	// smallest function value (minf) and its position (minx) found by the grid scan near the end of this function
+	double minf = INFINITY, minx = 0;
+	double negative_lh;
+	double current_rate = at(pattern);
+	double ferror, optx;
+	/* constant patterns are not optimized: they get MIN_SITE_RATE */
+	if (phylo_tree->aln->at(pattern).is_const) {
+		return (at(pattern) = MIN_SITE_RATE);
+	}
+	// tree-likelihood mode: build a temporary tree for this pattern only (torn down by completeRateML() below)
+	if (!rate_mh) {	
+		IntVector ptn_id;
+		ptn_id.push_back(optimizing_pattern);
+		prepareRateML(ptn_id);
+	}
+	// NOTE(review): minimizeNewtonSafeMode() and minimizeOneDimen() are defined elsewhere; presumably they minimize computeFunction() between MIN_SITE_RATE and max_rate, starting at current_rate - confirm
+    if (phylo_tree->optimize_by_newton && rate_mh) // Newton-Raphson method 
+	{
+    	optx = minimizeNewtonSafeMode(MIN_SITE_RATE, current_rate, max_rate, TOL_SITE_RATE, negative_lh);
+		if (optx > MAX_SITE_RATE*0.99 || (optx < MIN_SITE_RATE*2 && !phylo_tree->aln->at(pattern).is_const)) // result sits at a bound: compare against minimizeOneDimen()
+		{
+			double optx2, negative_lh2;
+			optx2 = minimizeOneDimen(MIN_SITE_RATE, current_rate, max_rate, TOL_SITE_RATE, &negative_lh2, &ferror);
+			if (negative_lh2 < negative_lh - 1e-4) { // messages only: optx keeps the Newton result in both cases
+				cout << "+++NEWTON IS WRONG for pattern " << pattern << ": " << optx2 << " " << 
+				negative_lh2 << " (Newton: " << optx << " " << negative_lh <<")" << endl;
+			}
+			if (negative_lh < negative_lh2 - 1e-4 && verbose_mode >= VB_MED) {
+				cout << "Brent is wrong for pattern " << pattern << ": " << optx2 << " " << 
+				negative_lh2 << " (Newton: " << optx << " " << negative_lh <<")" << endl;
+			}
+		}
+    }
+    else {
+		optx = minimizeOneDimen(MIN_SITE_RATE, current_rate, max_rate, TOL_SITE_RATE, &negative_lh, &ferror);
+		double fnew;
+		if ((optx < max_rate) && (fnew = computeFunction(max_rate)) <= negative_lh+TOL_SITE_RATE) { // take the upper bound if it is not worse by more than TOL_SITE_RATE
+			optx = max_rate;
+			negative_lh = fnew;
+		}
+		if ((optx > MIN_SITE_RATE) && (fnew = computeFunction(MIN_SITE_RATE)) <= negative_lh+TOL_SITE_RATE) { // same test for the lower bound
+			optx = MIN_SITE_RATE;
+			negative_lh = fnew;
+		}
+	}
+	//negative_lh = brent(MIN_SITE_RATE, current_rate, max_rate, 1e-3, &optx);
+	if (optx > max_rate*0.99) optx = MAX_SITE_RATE; // snap values within 1% of the upper bound to the bound
+	if (optx < MIN_SITE_RATE*2) optx = MIN_SITE_RATE; // snap values below twice the minimum to the minimum
+	at(pattern) = optx;
+
+	if (!rate_mh) { // tree-likelihood mode ends here
+		completeRateML(); 
+		return optx; 
+	}
+
+//#ifndef NDEBUG		
+	if (optx == MAX_SITE_RATE || (optx == MIN_SITE_RATE && !phylo_tree->aln->at(pattern).is_const)) { // MH mode, result on a bound: scan a grid of rates for a smaller function value
+		ofstream out;
+		// with verbose_mode >= VB_MED the scanned values are appended to a file named "x"
+		if (verbose_mode >= VB_MED)  {
+			cout << "Checking pattern " << pattern << " (" << current_rate << ", " << optx << ")" << endl;
+			out.open("x", ios::app);
+			out << pattern;
+		}
+		for (double val=0.1; val <= 100; val += 0.1) {
+			double f = computeFunction(val);
+			// remember the smallest value seen so far
+			if (verbose_mode >= VB_MED) out << " " << f;
+			if (f < minf) { minf = f; minx = val; }
+			if (verbose_mode < VB_MED && minf < negative_lh) break; // when not logging, stop once the grid beats the optimizer's value
+		}
+		if (verbose_mode >= VB_MED) { 
+			out << endl;
+			out.close();
+		}
+		//cout << "minx: " << minx << " " << minf << endl;
+		if (negative_lh > minf+1e-3) { // the grid found a clearly smaller value: search again starting from minx
+			optx = minimizeOneDimen(MIN_SITE_RATE, minx, max_rate, 1e-3, &negative_lh, &ferror);
+			at(pattern) = optx;
+			if (verbose_mode >= VB_MED)
+				cout << "FIX rate: " << minx << " , " << optx << endl;
+		}
+	}
+//#endif
+
+	return optx;
+}
+
+
+/**
+	Optimize the rate of every pattern with optimizeRate(), then rescale the rates
+	lying strictly between MIN_SITE_RATE and MAX_SITE_RATE. Patterns in which fewer
+	than two sequences carry an unambiguous character are set to MIN_SITE_RATE
+	without optimization. Warnings report ambiguous and saturated sites.
+*/
+void RateMeyerHaeseler::optimizeRates() {
+	if (!dist_mat) {
+		dist_mat = new double[phylo_tree->leafNum * phylo_tree->leafNum];
+	}
+	// distances are the path lengths between the taxa of the tree
+	phylo_tree->calcDist(dist_mat);
+
+	IntVector ok_ptn;
+	ok_ptn.resize(size(), 0);
+	const int nseq = phylo_tree->leafNum;
+	const int nstates = phylo_tree->aln->num_states;
+	double sum = 0.0;
+	int ok_sites = 0;
+	int ambiguous_sites = 0;
+	int saturated_sites = 0, saturated_ptn = 0;
+	int ptn;
+
+	for (ptn = 0; ptn < size(); ptn++) {
+		const int freq = phylo_tree->aln->at(ptn).frequency;
+		if (phylo_tree->aln->at(ptn).computeAmbiguousChar(nstates) > nseq-2) {
+			at(ptn) = MIN_SITE_RATE;
+			ambiguous_sites += freq;
+		} else {
+			optimizeRate(ptn);
+			if (at(ptn) == MAX_SITE_RATE) {
+				saturated_sites += freq;
+				saturated_ptn++;
+			}
+		}
+		// saturated patterns take no part in the rescaling
+		if (!(at(ptn) < MAX_SITE_RATE)) continue;
+		if (at(ptn) > MIN_SITE_RATE) sum += at(ptn) * freq;
+		ok_ptn[ptn] = 1;
+		ok_sites += freq;
+	}
+
+	// rescale so that the rates of the non-saturated sites average to 1
+	const double scale_f = ok_sites / sum;
+	for (ptn = 0; ptn < size(); ptn++) {
+		if (!ok_ptn[ptn]) continue;
+		if (at(ptn) > MIN_SITE_RATE) at(ptn) = at(ptn) * scale_f;
+	}
+
+	if (ambiguous_sites) {
+		stringstream msg;
+		msg << ambiguous_sites << " sites contain too many gaps or ambiguous characters";
+		outWarning(msg.str());
+	}
+	if (saturated_sites) {
+		stringstream msg;
+		msg << saturated_sites << " sites (" << saturated_ptn << " patterns) show too high rates (>=" << MAX_SITE_RATE << ")";
+		outWarning(msg.str());
+	}
+}
+
+double RateMeyerHaeseler::optimizeParameters(double epsilon) { // one round: re-estimate the site rates, then the branch lengths; returns a tree log-likelihood. epsilon is not used here.
+	assert(phylo_tree);
+	double tree_lh = phylo_tree->computeLikelihood();
+	// snapshot of the rates for the roll-back below (all 1.0 while no rate is stored, see getRates())
+	DoubleVector prev_rates;
+	getRates(prev_rates);
+	// first call: either load the rates from rate_file and return, or start from initializeRates()
+	if (empty()) {
+		if (rate_file) {
+			readRateFile(rate_file);
+			phylo_tree->clearAllPartialLH();
+			return phylo_tree->optimizeAllBranches();
+		}
+		initializeRates();
+	}
+
+	optimizeRates();
+
+	
+	phylo_tree->clearAllPartialLH();
+	// keep the tree as it is before the branch lengths are re-optimized
+	stringstream best_tree_string;
+	phylo_tree->printTree(best_tree_string, WT_BR_LEN + WT_TAXON_ID);
+	double new_tree_lh = phylo_tree->optimizeAllBranches(1);
+	//double new_tree_lh = phylo_tree->computeLikelihood();
+
+	if (new_tree_lh < tree_lh - 1e-5) { // the new rates made the likelihood worse: restore rates and tree
+		cout << "Worse likelihood (" << new_tree_lh << "), roll back site rates..." << endl;
+		setRates(prev_rates);
+		phylo_tree->rollBack(best_tree_string);
+		//phylo_tree->clearAllPartialLh();
+		new_tree_lh = phylo_tree->computeLikelihood(); // NOTE(review): overwritten two lines below; presumably kept for its effect on data cached in the tree - confirm
+		//cout << "Backup log-likelihood: " << new_tree_lh << endl;
+		new_tree_lh = tree_lh; // report the likelihood measured at the start of this call
+	}
+	
+	return new_tree_lh;
+}
+
+
+/**
+	Function minimized when the rate of pattern 'optimizing_pattern' is optimized.
+	MH mode (rate_mh): negative sum, over all sequence pairs whose states are both
+	below the number of states, of log computeTrans(value * tree distance, state1, state2).
+	Otherwise: negative likelihood of ptn_tree after its lengths were rescaled to 'value'.
+	@param value candidate rate
+	@return value of the function at 'value'
+*/
+double RateMeyerHaeseler::computeFunction(double value) {
+	if (rate_mh) {
+		const int nseq = phylo_tree->leafNum;
+		const int nstate = phylo_tree->getModel()->num_states;
+		ModelSubst *model = phylo_tree->getModel();
+		Pattern *pat = & phylo_tree->aln->at(optimizing_pattern);
+		double lh = 0.0;
+
+		for (int a = 0; a < nseq-1; a++) {
+			int state_a = pat->at(a);
+			if (state_a >= nstate) continue;
+			for (int b = a+1; b < nseq; b++) {
+				int state_b = pat->at(b);
+				if (state_b >= nstate) continue;
+				lh -= log(model->computeTrans(value * dist_mat[a*nseq + b], state_a, state_b));
+			}
+		}
+		return lh;
+	}
+
+	// tree-likelihood mode: rescale only when the requested rate differs from the applied one
+	if (value != cur_scale) {
+		ptn_tree->scaleLength(value/cur_scale);
+		cur_scale = value;
+		ptn_tree->clearAllPartialLH();
+	}
+	return -ptn_tree->computeLikelihood();
+}
+
+/**
+	First and second derivative, with respect to the rate, of the pairwise function
+	that computeFunction() evaluates in MH mode. The function value itself is not
+	computed here.
+	@param value rate at which the derivatives are taken
+	@param df (OUT) first derivative
+	@param ddf (OUT) second derivative
+*/
+void RateMeyerHaeseler::computeFuncDerv(double value, double &df, double &ddf) {
+	const int nseq = phylo_tree->leafNum;
+	const int nstate = phylo_tree->getModel()->num_states;
+	ModelSubst *model = phylo_tree->getModel();
+	Pattern *pat = & phylo_tree->aln->at(optimizing_pattern);
+
+	df = ddf = 0.0;
+	for (int a = 0; a < nseq-1; a++) {
+		int state_a = pat->at(a);
+		if (state_a >= nstate) continue;
+		for (int b = a+1; b < nseq; b++) {
+			int state_b = pat->at(b);
+			if (state_b >= nstate) continue;
+			double dist = dist_mat[a*nseq + b];
+			double derv1, derv2;
+			double trans = model->computeTrans(value * dist, state_a, state_b, derv1, derv2);
+			// derivatives of -log(trans) by the chain rule
+			double t1 = derv1 / trans;
+			double t2 = derv2 / trans;
+			df -= t1 * dist;
+			ddf -= dist * dist * (t2 - t1*t1);
+		}
+	}
+}
+
+
+void RateMeyerHaeseler::runIterativeProc(Params &params, IQTree &tree) { // install this object as the rate model of 'tree' and alternate rate and branch-length optimization
+	int i;
+	if (verbose_mode >= VB_MED) { // start with an empty file "x"; optimizeRate() appends to it
+		ofstream out("x");
+		out.close();
+	}
+	setTree(&tree);
+	RateHeterogeneity *backup_rate = tree.getRate();
+	if (backup_rate->getGammaShape() > 0 ) { // the tree's current model reports a positive Gamma shape: take its pattern rates as starting values
+		IntVector pattern_cat;
+		backup_rate->computePatternRates(*this, pattern_cat);
+		double sum = 0.0;
+		for (i = 0; i < size(); i++)
+			sum += at(i) * phylo_tree->aln->at(i).frequency;
+		sum /=  phylo_tree->aln->getNSite(); // frequency-weighted mean rate per site
+		if (fabs(sum - 1.0) > 0.0001) {
+			if (verbose_mode >= VB_MED)
+				cout << "Normalizing Gamma rates (" << sum << ")" << endl;
+			for (i = 0; i < size(); i++)
+				at(i) /= sum;
+		}
+	}
+	tree.getModelFactory()->site_rate = this; // from here on the tree uses this object; the previous rate model is not restored at the end (see the commented-out lines)
+	tree.setRate(this);
+
+	
+	//if  (empty()) initializeRates();
+
+	//setRates(prev_rates);
+	//string rate_file = params.out_prefix;
+	//rate_file += ".mhrate";
+	double prev_lh = tree.getCurScore();
+	string dist_file = params.out_prefix; // only referenced by the commented-out printDist call below
+	dist_file += ".tdist";
+	tree.getModelFactory()->stopStoringTransMatrix();
+
+	for (i = 2; i < 100; i++) { // i is also the argument of optimizeAllBranches(); fewer than 100 rounds
+		//DoubleVector prev_rates;
+		//getRates(prev_rates);
+		//writeSiteRates(prev_rates, rate_file.c_str());
+		tree.setCurScore(optimizeParameters(0.0));
+		//phylo_tree->aln->printDist(dist_file.c_str(), dist_mat);
+		tree.setCurScore(tree.optimizeAllBranches(i));
+		cout << "Current Log-likelihood: " << tree.getCurScore() << endl;
+		if (tree.getCurScore() <= prev_lh + 1e-4) { // stop when the score gained no more than 1e-4
+			break;
+		}
+		prev_lh = tree.getCurScore();
+	}
+	cout << "Optimization took " << i-1 << " rounds to finish" << endl;
+	tree.getModelFactory()->startStoringTransMatrix();
+	//tree.getModelFactory()->site_rate = backup_rate;
+	//tree.setRate(backup_rate);
+}
diff --git a/model/ratemeyerhaeseler.h b/model/ratemeyerhaeseler.h
new file mode 100644
index 0000000..018bd49
--- /dev/null
+++ b/model/ratemeyerhaeseler.h
@@ -0,0 +1,162 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef RATEMEYERHAESELER_H
+#define RATEMEYERHAESELER_H
+
+#include "rateheterogeneity.h"
+#include "tools.h"
+#include "iqtree.h"
+
+
+/**
+Implementation for site-specific rates of Meyer & von Haeseler (2003)
+Inherited from Optimization and the double vector for storing site-specific rates
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class RateMeyerHaeseler : public RateHeterogeneity, public DoubleVector
+{
+public:
+	/**
+		constructor: stores file_name and tree; rate_type is true for the MH model, false for the tree-likelihood variant
+	*/
+    RateMeyerHaeseler(char *file_name, PhyloTree *tree, bool rate_type);
+
+    RateMeyerHaeseler();
+
+	/**
+		destructor
+	*/
+    ~RateMeyerHaeseler();
+
+	void readRateFile(char *rate_file);
+
+	/**
+		@return true 
+	*/
+	virtual bool isSiteSpecificRate() { return true; }
+
+	/**
+		get the number of rate categories. 
+		@return the number of rate categories
+	*/
+	//virtual int getNRate() { return size(); }
+
+
+	/**
+		@return the number of dimensions
+	*/
+	virtual int getNDim();
+
+	/**
+		get the rate of a specified category
+		@param category category ID from 0 to #category-1
+		@return the rate of the specified category
+	*/
+	//virtual double getRate(int category);
+
+	/**
+		get the rate of a specified site-pattern
+		@param ptn pattern ID 
+		@return the rate of the specified site-pattern
+	*/
+	virtual double getPtnRate(int ptn);
+
+	/**
+		Copy the site-specific (per pattern) rates held by this object into pattern_rates.
+		@param pattern_rates (OUT) pattern rates. Resizing if necessary
+        @return number of rates held by this object
+	*/
+	virtual int computePatternRates(DoubleVector &pattern_rates, IntVector &pattern_cat);
+
+	void getRates(DoubleVector &rates);
+
+
+	void setRates(DoubleVector &rates);
+
+	void initializeRates();
+
+	/**
+		optimize parameters, the rates in this case
+		@return the tree log-likelihood after the optimization
+	*/
+	virtual double optimizeParameters(double epsilon);
+
+	/**
+		optimize rate of site
+		@param pattern target pattern
+		@return the optimized rate value, also update the corresponding element of the vector
+	*/
+	double optimizeRate(int pattern);
+
+	/**
+		optimize rates of all site-patterns
+	*/
+	virtual void optimizeRates();
+
+
+	/**
+		This function is inherited from Optimization class for optimizing site rates 
+		@param value x-value of the function
+		@return f(value) of function f you want to minimize
+	*/
+	virtual double computeFunction(double value);
+
+	/**
+		This function calculates the first derivative f'(value) and 2nd derivative f''(value).
+		@param value x-value of the function
+		@param df (OUT) first derivative
+		@param ddf (OUT) second derivative
+		Nothing is returned: the function has return type void.
+	*/
+	virtual void computeFuncDerv(double value, double &df, double &ddf);
+
+
+	void runIterativeProc(Params &params, IQTree &tree);
+
+	/**
+		distance matrix inferred from the path lengths of the tree (not from the sequences)
+	*/
+	double *dist_mat;
+
+
+protected:
+
+	char *rate_file; // name of the site-rate file given to the constructor, or NULL
+
+	/**
+		current pattern under optimization. Note that this is not thread-safe
+	*/
+	int optimizing_pattern;
+
+	/**
+		TRUE to use MH Model, FALSE for using tree-likelihood
+	*/
+	bool rate_mh;
+
+	double cur_scale; // scale currently applied to ptn_tree (tree-likelihood mode)
+
+	PhyloTree *ptn_tree; // temporary tree created by prepareRateML() and deleted by completeRateML()
+
+	void prepareRateML(IntVector &ptn_id);
+	void completeRateML();
+};
+
+#endif
diff --git a/modelsblock.cpp b/modelsblock.cpp
new file mode 100644
index 0000000..29265b3
--- /dev/null
+++ b/modelsblock.cpp
@@ -0,0 +1,90 @@
+/*
+ * modelsblock.cpp
+ *
+ *  Created on: Jan 9, 2015
+ *      Author: minh
+ */
+
+#include "modelsblock.h"
+
+/** Create an empty list of models and set the block id to "MODELS". */
+ModelsBlock::ModelsBlock()
+ : NxsBlock(), vector<NxsModel>()
+{
+	id = "MODELS";
+}
+
+/** Nothing to release here. */
+ModelsBlock::~ModelsBlock()
+{
+}
+
+/**
+	Parse the body of a MODELS block.
+	Recognized commands are "MODEL name = description;" and "FREQUENCY name = description;",
+	each of which appends one NxsModel to this vector, and END/ENDBLOCK, which finishes
+	the block. Any other command is skipped up to its ';'.
+	@param token token reader positioned right after the block name
+	@throw NxsException on a missing ';' or '=', a duplicated model name, or end of file
+		inside a skipped command
+*/
+void ModelsBlock::Read(NxsToken &token)
+{
+	// This should be the semicolon after the block name
+	token.GetNextToken();
+
+	if (!token.Equals(";"))
+		throw NxsException("Expecting ';' after MODELS block name", token);
+	for (;;) {
+		token.GetNextToken();
+		if (token.Equals("MODEL") || token.Equals("FREQUENCY")) {
+			NxsModel model;
+			model.flag = (NM_FREQ * (int)token.Equals("FREQUENCY"));
+			token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+			token.GetNextToken();
+			model.name = token.GetToken();
+
+			if (findModel(model.name)) {
+				errormsg = "Duplicated model name ";
+				errormsg += model.name.c_str();
+				throw NxsException(errormsg, token);
+			}
+
+			token.GetNextToken();
+			if (!token.Equals("="))
+				throw NxsException("Expecting '=' after model name", token);
+
+			// everything up to the ';' is the description
+			token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+			token.GetNextContiguousToken(';');
+			model.description = token.GetToken();
+
+			token.GetNextToken();
+			if (!token.Equals(";"))
+				throw NxsException("Expecting ';' to terminate MODEL command", token);
+
+			// atomic: the description contains neither '+' nor '*' nor "MIX"
+			model.flag |= (NM_ATOMIC*(model.description.find_first_of("+*") == string::npos && model.description.find("MIX") == string::npos));
+
+			push_back(model);
+
+		} else if (token.Equals("END") || token.Equals("ENDBLOCK")) {
+			// Get the semicolon following END
+			token.GetNextToken();
+
+			if (!token.Equals(";")) {
+				// name the offending token; the message used to stop at "but found "
+				errormsg = "Expecting ';' to terminate the ENDBLOCK command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token);
+			}
+			break;
+		}	// if (token.Equals("END") || token.Equals("ENDBLOCK"))
+		else {
+			SkippingCommand(token.GetToken());
+			do {
+				token.GetNextToken();
+			} while (!token.AtEOF() && !token.Equals(";"));
+
+			if (token.AtEOF())
+				throw NxsException("Unexpected end of file encountered", token);
+		}	// token is none of MODEL, FREQUENCY, END, ENDBLOCK
+	}
+}
+
+/**
+	@param name model name
+	@return pointer to the first stored model with that name, or NULL if there is none
+*/
+NxsModel *ModelsBlock::findModel(string &name) {
+	iterator it = begin();
+	while (it != end() && !(it->name == name))
+		it++;
+	if (it == end())
+		return NULL;
+	return &(*it);
+}
+
+/**
+	@param name model name
+	@return pointer to the first stored model with that name if its NM_ATOMIC flag
+		is not set; NULL if that model is atomic or no model has the name
+*/
+NxsModel *ModelsBlock::findMixModel(string &name) {
+	NxsModel *found = findModel(name);
+	if (found && (found->flag & NM_ATOMIC) != 0)
+		return NULL;
+	return found;
+}
diff --git a/modelsblock.h b/modelsblock.h
new file mode 100644
index 0000000..4cca2af
--- /dev/null
+++ b/modelsblock.h
@@ -0,0 +1,63 @@
+/*
+ * modelsblock.h
+ *
+ *  Created on: Jan 9, 2015
+ *      Author: minh
+ */
+
+#ifndef MODELSBLOCK_H_
+#define MODELSBLOCK_H_
+
+#include "ncl/ncl.h"
+
+const int NM_ATOMIC = 1; // NxsModel is not mixture or +G etc. model
+const int NM_FREQ = 2;   // NxsModel contains state frequency
+
+/** One named model definition. */
+class NxsModel {
+public:
+	/** start with no NM_* bit set, so that flag never holds an indeterminate value */
+	NxsModel() : flag(0) {}
+
+	/* model name */
+	string name;
+
+	/* model description */
+	string description;
+
+	/* bitwise OR of the NM_* constants (NM_ATOMIC, NM_FREQ) */
+	int flag;
+
+	virtual ~NxsModel() {}
+};
+
+/**
+ * Class to parse MODELS block in NEXUS file
+ */
+class ModelsBlock: public NxsBlock, public vector<NxsModel> {
+public:
+	/** constructor */
+	ModelsBlock();
+	/** destructor */
+	virtual ~ModelsBlock();
+
+    /**
+        @param name model name
+        @return pointer to model with the name or NULL if not found
+    */
+	NxsModel *findModel(string &name);
+
+    /**
+        @param name model name
+        @return pointer to the model with the name if it is not flagged NM_ATOMIC, otherwise NULL (also NULL if not found)
+    */
+	NxsModel *findMixModel(string &name);
+
+
+protected:
+
+	/**
+		main method to read block from file
+		@param token a token reader
+	*/
+	virtual void Read(NxsToken &token);
+
+};
+
+#endif /* MODELSBLOCK_H_ */
diff --git a/mpdablock.cpp b/mpdablock.cpp
new file mode 100644
index 0000000..7e9bca3
--- /dev/null
+++ b/mpdablock.cpp
@@ -0,0 +1,393 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "mpdablock.h"
+#include "split.h"
+#include "splitgraph.h"
+
+/**
+	Create a PDA block reader.
+	@param asgraph split graph stored in sgraph
+*/
+MPdaBlock::MPdaBlock(SplitGraph *asgraph)
+ : NxsBlock()
+{
+	id = "PDA";
+	sgraph = asgraph;
+	// -1 and 0 are the values before anything was read
+	budget = -1;
+	min_budget = -1;
+	sub_size = 0;
+	cost_constrained = false;
+}
+
+
+/**
+	Print the budget and the list of taxon costs.
+	@param out output stream
+*/
+void MPdaBlock::Report(ostream &out)
+{
+	out << "Budget = " << budget << endl;
+	out << "Taxa Costs = ";
+	DoubleVector::iterator cost = costs.begin();
+	while (cost != costs.end()) {
+		out << *cost << " ";
+		cost++;
+	}
+	out << endl;
+}
+
+void MPdaBlock::Reset()
+{
+	errormsg.clear();
+	isEmpty			= true;
+	isEnabled		= true;
+	isUserSupplied	= false;
+
+}
+
+/**
+	Parse the body of a PDA block.
+	Commands handled:
+	- PARAMETERS with the keywords BUDGET = x, MIN BUDGET = x, BUDGET CONSTRAINED and K = n
+	- TAXCOSTS followed by pairs of taxon name and cost; taxa without a cost get 0 and a warning
+	- END / ENDBLOCK, which finishes the block
+	Any other command is skipped up to its ';'.
+	@param token token reader positioned right after the block name
+	@throw NxsException when the split graph has no taxa, on unexpected tokens,
+		out-of-range values, unknown taxa, or end of file inside a skipped command
+*/
+void MPdaBlock::Read(NxsToken &token)
+{
+
+	int ntax = sgraph->getNTaxa();
+	if (ntax <= 0) {
+		errormsg = "PDA Block should be preceeded by Splits Block";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		
+	}
+
+	// This should be the semicolon after the block name
+	//
+	token.GetNextToken();
+
+	if (!token.Equals(";"))
+	{
+		errormsg = "Expecting ';' after PDA block name, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+	}
+
+	for (;;)
+	{
+		token.GetNextToken();
+
+		if (token.Equals("PARAMETERS"))
+		{
+			// This should be the first parameter keyword
+			//
+			token.GetNextToken();
+
+			do {
+
+				if (token.Equals("BUDGET")) {
+					// This should be the equals sign
+					//
+					token.GetNextToken();
+		
+					if (!token.Equals("="))
+					{
+						errormsg = "Expecting '=', but found ";
+						errormsg += token.GetToken();
+						errormsg += " instead";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+		
+					// This should be the budget value
+					//
+					token.GetNextToken();
+		
+					budget = convert_double(token.GetToken().c_str());
+					if (budget <= 0)
+					{
+						errormsg = "BUDGET should be greater than zero (";
+						errormsg += token.GetToken();
+						errormsg += " was specified)";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+				} else if (token.Equals("MIN BUDGET")) {
+					// This should be the equals sign
+					//
+					token.GetNextToken();
+		
+					if (!token.Equals("="))
+					{
+						errormsg = "Expecting '=', but found ";
+						errormsg += token.GetToken();
+						errormsg += " instead";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+		
+					// This should be the minimum budget value
+					//
+					token.GetNextToken();
+		
+					min_budget = convert_double(token.GetToken().c_str());
+					// validate the value just read; the test used to look at 'budget' instead
+					if (min_budget < 0)
+					{
+						errormsg = "MIN_BUDGET should be greater than or equal to zero (";
+						errormsg += token.GetToken();
+						errormsg += " was specified)";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+		
+				} else if (token.Equals("BUDGET CONSTRAINED")) {
+					cost_constrained = true;
+				} else if (token.Equals("K")) {
+					// This should be the equals sign
+					//
+					token.GetNextToken();
+		
+					if (!token.Equals("="))
+					{
+						errormsg = "Expecting '=', but found ";
+						errormsg += token.GetToken();
+						errormsg += " instead";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+		
+					// This should be the integer subset size
+					//
+					token.GetNextToken();
+		
+					sub_size = atoi(token.GetToken().c_str());
+					if (sub_size <= 1)
+					{
+						errormsg = "K should be greater than 1 (";
+						errormsg += token.GetToken();
+						errormsg += " was specified)";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+				} else
+				{
+					errormsg = "Invalid PARAMETERS command: ";
+					errormsg += token.GetToken();
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+				token.GetNextToken();
+
+			} while (!token.AtEOF() && !token.Equals(";"));
+
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate PARAMETERS command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		} // if (token.Equals("PARAMETERS"))
+
+		else if (token.Equals("TAXCOSTS")) {
+
+			// -1 marks "no cost given"; checked after the command was read
+			costs.resize(ntax, -1);
+			// This should be taxon name
+			//
+			token.GetNextToken();
+
+			do {				
+				int tax_id = -1;
+
+				try {
+					tax_id = sgraph->getTaxa()->FindTaxon(token.GetToken());
+				} catch (NxsTaxaBlock::NxsX_NoSuchTaxon) {
+					tax_id = -1;
+				}
+
+				if (tax_id < 0)
+				{
+					errormsg = "Taxon is not found (";
+					errormsg += token.GetToken();
+					errormsg += " was specified)";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+		
+				// This should be the cost of taxon
+				//
+				token.GetNextToken();
+
+				// keep the cost as a double: an int would cut off the fractional part
+				double taxcost = convert_double(token.GetToken().c_str());
+				if (taxcost < 0)
+				{
+					errormsg = "Taxon cost should be greater than or equal to zero (";
+					errormsg += token.GetToken();
+					errormsg += " was specified)";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+				costs[tax_id] = taxcost;
+
+				token.GetNextToken();
+			} while (!token.AtEOF() && !token.Equals(";"));
+
+			// The loop above stops on the terminating semicolon or at end of file
+			//
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate TAXCOSTS command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+			for (int i = 0; i < ntax; i++)
+				if (costs[i] < 0) {
+					costs[i] = 0;
+					cout << "WARNING: taxon " << sgraph->getTaxa()->GetTaxonLabel(i)
+						<< " has no cost! set to 0." << endl;
+				}
+		}	// if (token.Equals("TAXCOSTS"))
+
+
+		else if (token.Equals("END") || token.Equals("ENDBLOCK"))
+		{
+			// Get the semicolon following END
+			//
+			token.GetNextToken();
+
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate the ENDBLOCK command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+			break;
+		}	// if (token.Equals("END") || token.Equals("ENDBLOCK"))
+
+		else
+		{
+			SkippingCommand(token.GetToken());
+			do
+			{
+				token.GetNextToken();
+			}
+			while (!token.AtEOF() && !token.Equals(";"));
+
+			if (token.AtEOF())
+			{
+				errormsg = "Unexpected end of file encountered";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+		}	// token is none of PARAMETERS, TAXCOSTS, END, ENDBLOCK
+	}	// GetNextToken loop
+
+}
+
+/**
+	Read the total budget and the per-taxon preservation costs from
+	params.budget_file. The file starts with the total budget, followed by
+	"taxon_name cost" pairs. Every taxon that is not listed gets cost 0 and a
+	warning is printed. On return cost_constrained is set to true.
+	Read errors, unknown taxon names and negative values are reported through
+	outError().
+	@param params program parameters (budget_file and is_rooted are used)
+*/
+void MPdaBlock::readBudgetFile(Params &params) {
+	ifstream in;
+	in.exceptions(ios::failbit | ios::badbit);
+	cout << "Reading budget information file " << params.budget_file << "..." << endl;
+	NxsString taxname;
+	int ntaxa = sgraph->getNTaxa() - params.is_rooted;
+	int i;
+
+	try {
+		costs.resize(ntaxa, -1);
+		in.open(params.budget_file);
+		in >> budget;
+		if (budget < 0)
+			throw "Negative total budget.";
+		for (i = 0; i < ntaxa && !in.eof(); i++) {
+			double taxcost;
+			int tax_id = -1;
+			taxname = "";
+			// Reading the name may legitimately fail at end of file, so only
+			// badbit is allowed to throw here. The loop must be left BEFORE
+			// failbit is re-armed: exceptions() re-checks the current stream
+			// state and would throw on the failbit left by the failed read.
+			in.exceptions(ios::badbit);
+			in >> taxname;
+			if (in.fail() || taxname == "") break;
+			in.exceptions(ios::failbit | ios::badbit);
+			in >> taxcost;
+			if (taxcost < 0)
+				throw "Negative taxa preservation cost.";
+			// NOTE(review): tax_id is assumed to be < ntaxa; confirm for rooted
+			// trees, where ntaxa excludes one taxon of the graph.
+			tax_id = sgraph->getTaxa()->FindTaxon(taxname);
+			costs[tax_id] = taxcost;
+		}
+		in.close();
+	} catch (const ios::failure &) {
+		outError(ERR_READ_INPUT);
+	} catch (const NxsTaxaBlock::NxsX_NoSuchTaxon &) {
+		outError(ERR_NO_TAXON, taxname);
+	} catch (const char *str) {
+		outError(str);
+	} catch (...) {
+		// anything else
+		outError(ERR_READ_ANY);
+	}
+
+	// entries still at the -1 placeholder were not listed in the file
+	for (i = 0; i < ntaxa; i++)
+		if (costs[i] < 0) {
+			costs[i] = 0;
+			cout << "WARNING: taxon " << sgraph->getTaxa()->GetTaxonLabel(i)
+				<< " has no cost! set to 0." << endl;
+		}
+	cost_constrained = true;
+}
+
+/**
+	Read the total budget and the per-area preservation costs from
+	params.budget_file. The file starts with the total budget, followed by
+	"area_name cost" pairs; area names are looked up in the SETS block of the
+	splits graph. Every area that is not listed gets cost 0 and a warning is
+	printed. On return cost_constrained is set to true.
+	Read errors, unknown area names and negative values are reported through
+	outError().
+	@param params program parameters (budget_file is used)
+*/
+void MPdaBlock::readBudgetAreaFile(Params &params) {
+	ifstream in;
+	in.exceptions(ios::failbit | ios::badbit);
+	cout << "Reading budget for areas information file " << params.budget_file << "..." << endl;
+	string areaname;
+	int nareas = sgraph->getNAreas();
+	int i;
+
+	try {
+		costs.resize(nareas, -1);
+		in.open(params.budget_file);
+		in >> budget;
+		if (budget < 0)
+			throw "Negative total budget.";
+		for (i = 0; i < nareas && !in.eof(); i++) {
+			double areacost;
+			int area_id = -1;
+			areaname = "";
+			// Reading the name may legitimately fail at end of file, so only
+			// badbit is allowed to throw here. The loop must be left BEFORE
+			// failbit is re-armed: exceptions() re-checks the current stream
+			// state and would throw on the failbit left by the failed read.
+			in.exceptions(ios::badbit);
+			in >> areaname;
+			if (in.fail() || areaname == "") break;
+			in.exceptions(ios::failbit | ios::badbit);
+			in >> areacost;
+			if (areacost < 0)
+				throw "Negative area preservation cost.";
+			area_id = sgraph->getSetsBlock()->findArea(areaname);
+			if (area_id < 0)
+				outError(ERR_NO_AREA, areaname);
+			costs[area_id] = areacost;
+		}
+		in.close();
+	} catch (const ios::failure &) {
+		outError(ERR_READ_INPUT);
+	} catch (const char *str) {
+		outError(str);
+	} catch (...) {
+		// anything else
+		outError(ERR_READ_ANY);
+	}
+
+	// entries still at the -1 placeholder were not listed in the file
+	for (i = 0; i < nareas; i++)
+		if (costs[i] < 0) {
+			costs[i] = 0;
+			cout << "WARNING: area " << sgraph->getSetsBlock()->getSet(i)->name
+				<< " has no cost! set to 0." << endl;
+		}
+	cost_constrained = true;
+}
+
+
+/**
+	Tell the user on stdout that a command of this block is not recognised
+	and is being skipped.
+	@param commandName name of the skipped command
+*/
+void MPdaBlock::SkippingCommand(NxsString commandName) {
+	cout << "   Skipping unknown command (";
+	cout << commandName;
+	cout << ")..." << endl;
+}
+
+
+/**
+	destructor; the body is empty, nothing is released explicitly here
+	(in particular sgraph is not deleted)
+*/
+MPdaBlock::~MPdaBlock()
+{
+}
diff --git a/mpdablock.h b/mpdablock.h
new file mode 100644
index 0000000..f94d86e
--- /dev/null
+++ b/mpdablock.h
@@ -0,0 +1,153 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MPDABLOCK_H
+#define MPDABLOCK_H
+
+#include "ncl/ncl.h"
+#include "tools.h"
+
+class SplitGraph;
+
+/**
+PdaBlock to read from nexus file
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class MPdaBlock : public NxsBlock
+{
+public:
+	friend class SplitGraph;
+	friend class PDNetwork;
+	friend class CircularNetwork;
+
+	/**
+		constructor, assigning an associated splits graph
+		@param asgraph a splits graph
+	*/
+    MPdaBlock(SplitGraph *asgraph);
+
+	/**
+		destructor
+	*/
+    virtual ~MPdaBlock();
+
+	/**
+		print info to an output stream
+		@param out output stream, cout for output to screen
+	*/
+	virtual void Report(ostream &out);
+
+	/**
+		reset the block
+	*/
+	virtual void Reset();
+
+	/**
+		called when some commands are skipped
+		@param commandName command name
+	*/
+	virtual void SkippingCommand(NxsString commandName);
+
+
+	/**
+		read the file containing total budget and taxa costs informations
+		@param params program parameters
+	*/
+	void readBudgetFile(Params &params);
+
+	/**
+		read the file containing total budget and area costs informations
+		@param params program parameters
+	*/
+	void readBudgetAreaFile(Params &params);
+
+
+	/**
+		@return total budget
+	*/
+	double getBudget() {
+		return budget;
+	}
+
+	/**
+		@return min budget
+	*/
+	double getMinBudget() {
+		return min_budget;
+	}
+
+	/**
+		@return size of PD set
+	*/
+	int getSubSize() {
+		return sub_size;
+	}
+
+	/**
+		@param tax_id index into the cost vector, must be in [0, costs.size())
+		@return cost stored at that index
+	*/
+	double getCost(int tax_id) {
+		// check both bounds: a negative index is as invalid as one past the end
+		assert(tax_id >= 0 && tax_id < (int) costs.size());
+		return costs[tax_id];
+	}
+
+
+protected:
+
+	/**
+		the associated splits graph
+	*/
+	SplitGraph *sgraph;
+
+	/**
+		total budget
+	*/
+	double budget;
+
+	/**
+		min budget, to compute PD sets with preservation
+		costs from min_budget to budget
+	*/
+	double min_budget;
+
+	/**
+		size of PD set
+	*/
+	int sub_size;
+
+	/**
+		true if cost constrained PD problem
+	*/
+	bool cost_constrained;
+
+	/**
+		cost of each taxon; readBudgetAreaFile() stores one cost per area
+		here instead
+	*/
+	vector<double> costs;
+
+	/**
+		main method to read block from file
+		@param token a token reader
+	*/
+	virtual void Read(NxsToken &token);
+
+};
+
+#endif
diff --git a/msetsblock.cpp b/msetsblock.cpp
new file mode 100644
index 0000000..e8693e0
--- /dev/null
+++ b/msetsblock.cpp
@@ -0,0 +1,283 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "msetsblock.h"
+
+/**
+	constructor: sets the block id to SETS
+*/
+MSetsBlock::MSetsBlock() : NxsBlock()
+{
+	id = "SETS";
+}
+
+
+/**
+	destructor: frees every taxa set and every charset held by the block
+*/
+MSetsBlock::~MSetsBlock()
+{
+	for (TaxaSetNameVector::reverse_iterator it = sets.rbegin(); it != sets.rend(); it++) {
+		delete *it;
+	}
+	sets.clear();
+
+	// CharSet objects are allocated with new in Read(); without this loop
+	// they were never released.
+	for (vector<CharSet*>::reverse_iterator cit = charsets.rbegin(); cit != charsets.rend(); cit++) {
+		delete *cit;
+	}
+	charsets.clear();
+}
+
+
+/**
+	Print the number of taxa sets, then one line per set listing its name
+	and the names of its taxa.
+	@param out output stream, cout for output to screen
+*/
+void MSetsBlock::Report(ostream &out)
+{
+	out << "Number of sets: " << getNSets() << endl;
+	for (size_t set_id = 0; set_id < sets.size(); set_id++) {
+		TaxaSetName *cur = sets[set_id];
+		out << "Set " << cur->name << " contains: ";
+		for (size_t tax = 0; tax < cur->taxlist.size(); tax++)
+			out << cur->taxlist[tax] << "  ";
+		out << endl;
+	}
+}
+
+/**
+	Delete all taxa sets (last one first) and leave the list empty.
+	The charsets list is not touched.
+*/
+void MSetsBlock::Reset()
+{
+	while (!sets.empty()) {
+		delete sets.back();
+		sets.pop_back();
+	}
+}
+
+/**
+	Parse the body of a SETS block. Recognised commands:
+	  TAXSET name = taxon taxon ... ;
+	      creates a TaxaSetName and appends it to sets
+	  CHARSET name = [aln_file:][sequence_type,]positions ;
+	      creates a CharSet and appends it to charsets
+	  CHARPARTITION name = model:charset, model:charset ... ;
+	      stores the model and partition name in already defined charsets
+	  END; or ENDBLOCK;
+	      finishes the block
+	Any other command is reported through SkippingCommand() and skipped up
+	to its terminating ';'.
+	@param token a token reader
+	@throw NxsException when the input does not follow the syntax above
+*/
+void MSetsBlock::Read(NxsToken &token)
+{
+	// This should be the semicolon after the block name
+	//
+
+	token.GetNextToken();
+
+	if (!token.Equals(";"))
+	{
+		errormsg = "Expecting ';' after SETS block name, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+	}
+
+	for (;;)
+	{
+		token.GetNextToken();
+
+		if (token.Equals("TAXSET"))
+		{
+			// This should be the name of the taxa set
+			//
+			token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+			token.GetNextToken();
+
+			// the set is stored right away, so it is owned by the block even
+			// if parsing of the command fails below
+			TaxaSetName *myset = new TaxaSetName;
+			sets.push_back(myset);
+
+			myset->name = token.GetToken();
+
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+			{
+				errormsg = "Expecting '=', but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+			token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+			token.GetNextToken();
+			// test before storing: an empty set ("TAXSET x = ;") must not take
+			// the ';' as a taxon name and run into the next command
+			while (!token.AtEOF() && !token.Equals(";")) {
+				myset->taxlist.push_back(token.GetToken());
+				token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+				token.GetNextToken();
+			}
+
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate TAXSET command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		} // if (token.Equals("TAXSET"))
+
+		else if (token.Equals("CHARSET"))
+		{
+			// This should be the name of the charset
+			//
+			token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+			token.GetNextToken();
+
+
+			CharSet *myset = new CharSet;
+			charsets.push_back(myset);
+			myset->aln_file = "";
+			myset->model_name = "";
+			myset->position_spec = "";
+			myset->sequence_type = "";
+			myset->char_partition = "";
+
+			myset->name = token.GetToken();
+
+			token.GetNextToken();
+			if (!token.Equals("="))
+			{
+				errormsg = "Expecting '=', but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+			// everything up to the ';' is taken as one token
+			token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+			token.GetNextContiguousToken(';');
+			myset->position_spec = token.GetToken();
+
+			// separate position_spec into alignment name if ':' exists
+			size_t pos = myset->position_spec.find(':');
+			if (pos != string::npos) {
+				myset->aln_file = myset->position_spec.substr(0, pos);
+				myset->position_spec = myset->position_spec.substr(pos+1);
+			}
+			// a leading alphabetic word followed by ',' is the sequence type;
+			// cast to unsigned char because isalpha() is undefined for negative values
+			if ((pos=myset->position_spec.find(',')) != string::npos && isalpha((unsigned char) myset->position_spec[0])) {
+				myset->sequence_type = myset->position_spec.substr(0, pos);
+				myset->position_spec = myset->position_spec.substr(pos+1);
+			}
+
+			token.GetNextToken();
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate CHARSET command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		} // if (token.Equals("CHARSET"))
+		else if (token.Equals("CHARPARTITION"))
+		{
+			// This should be the name of the partition
+			//
+			token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+			token.GetNextToken();
+			string partition_name = token.GetToken();
+			token.GetNextToken();
+			if (!token.Equals("="))
+			{
+				errormsg = "Expecting '=', but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+			token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+			token.GetNextToken();
+			do {
+				// the model name is the concatenation of all tokens before ':'
+				string model_name = "";
+				while (!token.AtEOF() && !token.Equals(":")) {
+					model_name += token.GetToken();
+					token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+					token.GetNextToken();
+				}
+
+				if (!token.Equals(":"))
+				{
+					errormsg = "Expecting ':' or ',' but found ";
+					errormsg += token.GetToken();
+					errormsg += " instead";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+				string charset_name;
+				token.SetLabileFlagBit(NxsToken::preserveUnderscores);
+				token.GetNextToken();
+				charset_name = token.GetToken();
+				// the charset must have been defined by an earlier CHARSET command
+				CharSet *myset = findCharSet(charset_name);
+				if (!myset)
+				{
+					errormsg = "CharSet ";
+					errormsg += token.GetToken();
+					errormsg += " not found";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+				myset->model_name = model_name;
+				myset->char_partition = partition_name;
+				token.GetNextToken();
+				if (!token.Equals(",") && !token.Equals(";"))
+				{
+					errormsg = "Expecting ',' or ';', but found ";
+					errormsg += token.GetToken();
+					errormsg += " instead";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+				if (token.Equals(";"))
+					break;
+				else
+					token.GetNextToken();
+			} while (!token.AtEOF() && !token.Equals(";"));
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate CHARPARTITION command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+		} // if (token.Equals("CHARPARTITION"))
+		else if (token.Equals("END") || token.Equals("ENDBLOCK"))
+		{
+			// Get the semicolon following END
+			//
+			token.GetNextToken();
+
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate the ENDBLOCK command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+			break;
+		}	// if (token.Equals("END") || token.Equals("ENDBLOCK"))
+
+		else
+		{
+			SkippingCommand(token.GetToken());
+			do
+			{
+				token.GetNextToken();
+			}
+			while (!token.AtEOF() && !token.Equals(";"));
+
+			if (token.AtEOF())
+			{
+				errormsg = "Unexpected end of file encountered";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+		}	// token not TAXSET, CHARSET, CHARPARTITION, END or ENDBLOCK
+	}	// GetNextToken loop
+
+}
+
+/**
+	Look up a charset by name.
+	@param name charset name, compared exactly
+	@return the first CharSet in charsets with that name, NULL if there is none
+*/
+CharSet *MSetsBlock::findCharSet(string name) {
+	for (size_t idx = 0; idx < charsets.size(); idx++) {
+		CharSet *candidate = charsets[idx];
+		if (candidate->name == name)
+			return candidate;
+	}
+	return NULL;
+}
+
+/**
+	Look up a taxa set (area) by name.
+	@param name area name, compared exactly
+	@return index in sets of the first set with that name, -1 if there is none
+*/
+int MSetsBlock::findArea(string &name) {
+	int area_id = 0;
+	for (TaxaSetNameVector::iterator it = sets.begin(); it != sets.end(); ++it, ++area_id) {
+		if ((*it)->name == name)
+			return area_id;
+	}
+	return -1;
+}
diff --git a/msetsblock.h b/msetsblock.h
new file mode 100644
index 0000000..03d6c65
--- /dev/null
+++ b/msetsblock.h
@@ -0,0 +1,145 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MSETSBLOCK_H
+#define MSETSBLOCK_H
+
+#include "ncl/ncl.h"
+
+/**
+	a taxa set with name: a set name together with the list of
+	taxon names that belong to it
+*/
+class TaxaSetName {
+public:
+	/**
+		set name
+	*/
+	string name;
+	
+	/**
+		string vector of taxa names
+	*/
+	vector<string> taxlist;
+};
+
+typedef vector<TaxaSetName*> TaxaSetNameVector;
+
+/**
+ * a charset: a named set of alignment positions, optionally annotated with
+ * an alignment file, a sequence type, a model and the partition it belongs to
+ */
+class CharSet {
+public:
+	/** charset name */
+	string name;
+
+	/** positions specification, e.g., 1-500\3 501-502 */
+	string position_spec;
+
+	/** name of model associated with charset, e.g., GTR+G */
+	string model_name;
+
+	/** alignment name */
+	string aln_file;
+
+	/** sequence type */
+	string sequence_type;
+
+	/** name of CharPartition where this charset is included*/
+	string char_partition;
+};
+
+
+/**
+Sets Block of Nexus file parser
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class MSetsBlock : public NxsBlock
+{
+public:
+
+	/**
+		constructor
+	*/
+    MSetsBlock();
+
+	/**
+		destructor
+	*/
+    virtual ~MSetsBlock();
+
+	/**
+		print info to an output stream
+		@param out output stream, cout for output to screen
+	*/
+	virtual void Report(ostream &out);
+
+	/**
+		reset the block
+	*/
+	virtual void Reset();
+
+	/**
+		@return the number of sets
+	*/
+	int getNSets() const { return sets.size(); }
+
+	/**
+		@param id set id; not range-checked
+		@return pointer to the corresponding set
+	*/
+	inline TaxaSetName *getSet(int id) { return sets[id]; }
+
+	/**
+		@return pointer to the vector of all taxa sets
+	*/
+	inline TaxaSetNameVector *getSets() { return &sets; }
+
+	/**
+		@param name an area name
+		@return ID of the area with that name, -1 if not found
+	*/
+	int findArea(string &name);
+
+
+	/** list of charsets with (possible) models */
+	vector<CharSet* > charsets;
+
+	/**
+	 * return CharSet with a name from charsets, NULL if not found
+	 */
+	CharSet *findCharSet(string name);
+
+protected:
+
+	/**
+		main method to read block from file
+		@param token a token reader
+	*/
+	virtual void Read(NxsToken &token);
+
+
+	/**
+		list of taxa sets
+	*/
+	TaxaSetNameVector sets;
+
+};
+
+#endif
diff --git a/msplitsblock.cpp b/msplitsblock.cpp
new file mode 100644
index 0000000..4f1a6da
--- /dev/null
+++ b/msplitsblock.cpp
@@ -0,0 +1,318 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "msplitsblock.h"
+#include "split.h"
+#include "splitgraph.h"
+
+/**
+	constructor: sets the block id to SPLITS, stores the splits graph
+	pointer and starts with zero taxa and zero splits
+	@param asgraph a splits graph
+*/
+MSplitsBlock::MSplitsBlock(SplitGraph *asgraph) : NxsBlock()
+{
+	id = "SPLITS";
+	sgraph = asgraph;
+	ntaxa = 0;
+	nsplits = 0;
+}
+
+
+/**
+	print info to an output stream; the work is delegated to the
+	associated splits graph
+	@param out output stream, cout for output to screen
+*/
+void MSplitsBlock::Report(ostream &out)
+{
+	sgraph->report(out);
+}
+
+/**
+	Reset the block: clears the error message, restores the block flags,
+	zeroes the taxa and split counters and clears the associated splits
+	graph. The cycle vector is left as it is.
+*/
+void MSplitsBlock::Reset()
+{
+	// counters of this block
+	nsplits = 0;
+	ntaxa = 0;
+
+	// state inherited from the base block
+	isUserSupplied = false;
+	isEnabled = true;
+	isEmpty = true;
+	errormsg.clear();
+
+	sgraph->clear();
+}
+
+/**
+	Parse one split of the MATRIX command and append it to the splits graph.
+	A split is a weight followed by 1-based taxon indices, terminated by ','.
+	@param token a token reader
+	@throw NxsException if a taxon index is outside 1..ntaxa or the file ends
+		before the terminating ','
+*/
+void MSplitsBlock::AddSplit(NxsToken &token)
+{
+	token.SetLabileFlagBit(token.hyphenNotPunctuation);
+	// this should be the weight of split
+	token.GetNextToken();
+
+	NxsString str = token.GetToken();
+
+	// get the next token
+	token.GetNextToken();
+
+	// if continuing number: the weight arrived split around a '+', so
+	// glue "<first>+<next>" back together before converting it
+	if (token.GetToken() == "+")
+	{
+		str += token.GetToken();
+		token.GetNextToken();
+		str += token.GetToken();
+		token.GetNextToken();
+	}
+
+	double weight = atof(str.c_str());
+
+	vector<int> taxa_list;
+
+	//@pol should check to make sure this is not punctuation
+	while (!token.AtEOF() && !token.Equals(","))
+	{
+		// atoi yields 0 for a non-numeric token, which the range check rejects
+		int index = atoi(token.GetToken().c_str());
+		if (index < 1 || index > ntaxa)
+		{
+			errormsg = "Taxon index should be greater than zero and less than ";
+			errormsg += (ntaxa+1);
+			errormsg += " (";
+			errormsg += token.GetToken();
+			errormsg += " was specified)";
+			throw NxsException(errormsg, token);
+		}
+
+		// indices are stored 0-based
+		taxa_list.push_back(index-1);
+
+		token.GetNextToken();
+	}
+
+	if (token.AtEOF())
+	{
+		errormsg = "Unexpected end of file encountered";
+		throw NxsException(errormsg, token);
+	}
+
+	Split *split = new Split(ntaxa, weight, taxa_list);
+	sgraph->push_back(split);
+}
+
+/**
+	Parse the body of a SPLITS block. Recognised commands:
+	  DIMENSIONS NTAX=n NSPLITS=m;   sets ntaxa and nsplits (both must be > 0)
+	  CYCLE i1 i2 ... ;              1-based taxon indices, stored 0-based in cycle
+	  MATRIX split, split, ... ;     nsplits splits, each read by AddSplit()
+	  END; or ENDBLOCK;              finishes the block
+	Any other command is reported through SkippingCommand() and skipped up
+	to its terminating ';'.
+	@param token a token reader
+	@throw NxsException when the input does not follow the syntax above
+*/
+void MSplitsBlock::Read(NxsToken &token)
+{
+
+	// This should be the semicolon after the block name
+	//
+	token.GetNextToken();
+
+	if (!token.Equals(";"))
+	{
+		errormsg = "Expecting ';' after SPLITS block name, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token);
+	}
+
+	for (;;)
+	{
+		token.GetNextToken();
+
+		if (token.Equals("DIMENSIONS"))
+		{
+			// This should be the NTAX keyword
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("NTAX"))
+			{
+				errormsg = "Expecting NTAX keyword, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token);
+			}
+
+			// This should be the equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+			{
+				errormsg = "Expecting '=', but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token);
+			}
+
+			// This should be the number of taxa
+			//
+			token.GetNextToken();
+
+			ntaxa = atoi(token.GetToken().c_str());
+			if (ntaxa <= 0)
+			{
+				errormsg = "NTAX should be greater than zero (";
+				errormsg += token.GetToken();
+				errormsg += " was specified)";
+				throw NxsException(errormsg, token);
+			}
+
+			// This should be the NSPLITS keyword
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("NSPLITS"))
+			{
+				errormsg = "Expecting NSPLITS keyword, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token);
+			}
+
+			// This should be the equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+			{
+				errormsg = "Expecting '=', but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token);
+			}
+
+			// This should be the number of splits
+			//
+			token.GetNextToken();
+
+			nsplits = atoi(token.GetToken().c_str());
+			if (nsplits <= 0)
+			{
+				errormsg = "NSPLITS should be greater than zero (";
+				errormsg += token.GetToken();
+				errormsg += " was specified)";
+				throw NxsException(errormsg, token);
+			}
+
+
+			// This should be the terminating semicolon
+			//
+			token.GetNextToken();
+
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate DIMENSIONS command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token);
+			}
+		}	// if (token.Equals("DIMENSIONS"))
+
+		else if (token.Equals("CYCLE"))
+		{
+			if (ntaxa <= 0)
+			{
+				errormsg = "DIMENSIONS must be specified before CYCLE command";
+				throw NxsException(errormsg, token);
+			}
+			token.GetNextToken();
+			while (!token.AtEOF() && !token.Equals(";")) {
+				int tax = atoi(token.GetToken().c_str());
+				if (tax <= 0 || tax > ntaxa)
+				{
+					errormsg = "taxon index in CYCLE should be between 1 and ";
+					errormsg += ntaxa;
+					errormsg += " (";
+					errormsg += token.GetToken();
+					errormsg += " was specified)";
+					throw NxsException(errormsg, token);
+				}
+				// indices are stored 0-based
+				cycle.push_back(tax-1);
+				token.GetNextToken();
+			}
+			// ntaxa > 0 was checked above, so the cast cannot wrap around
+			if (cycle.size() != (size_t) ntaxa) {
+				errormsg = "Not all taxa in CYCLE are included";
+				throw NxsException(errormsg, token);
+			}
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate CYCLE command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token);
+			}
+		}	// if (token.Equals("CYCLE"))
+		else if (token.Equals("MATRIX"))
+		{
+			if (nsplits <= 0)
+			{
+				errormsg = "NSPLITS must be specified before MATRIX command";
+				throw NxsException(errormsg, token);
+			}
+
+			for (int i = 0; i < nsplits; i++)
+			{
+				AddSplit(token);
+			}
+
+			// This should be terminating semicolon
+			//
+			token.GetNextToken();
+
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate MATRIX command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token);
+			}
+		}	// if (token.Equals("MATRIX"))
+
+		else if (token.Equals("END") || token.Equals("ENDBLOCK"))
+		{
+			// Get the semicolon following END
+			//
+			token.GetNextToken();
+
+			if (!token.Equals(";"))
+			{
+				errormsg = "Expecting ';' to terminate the ENDBLOCK command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token);
+			}
+			break;
+		}	// if (token.Equals("END") || token.Equals("ENDBLOCK"))
+
+		else
+		{
+			SkippingCommand(token.GetToken());
+			do
+			{
+				token.GetNextToken();
+			}
+			while (!token.AtEOF() && !token.Equals(";"));
+
+			if (token.AtEOF())
+			{
+				errormsg = "Unexpected end of file encountered";
+				throw NxsException(errormsg, token);
+			}
+		}	// token not DIMENSIONS, CYCLE, MATRIX, END or ENDBLOCK
+	}	// GetNextToken loop
+
+}
+
+
+/**
+	destructor; the body is empty, nothing is released explicitly here
+	(in particular sgraph is not deleted)
+*/
+MSplitsBlock::~MSplitsBlock()
+{}
+
+
diff --git a/msplitsblock.h b/msplitsblock.h
new file mode 100644
index 0000000..4570900
--- /dev/null
+++ b/msplitsblock.h
@@ -0,0 +1,106 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MSPLITSBLOCK_H
+#define MSPLITSBLOCK_H
+
+#include "ncl/ncl.h"
+//#include "splitgraph.h"
+
+class SplitGraph;
+
+/**
+SplitsBlock to read from nexus file
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class MSplitsBlock : public NxsBlock
+{
+public:
+
+	friend class SplitGraph;
+	friend class MTree;
+
+	/**
+		constructor, assigning an associated splits graph
+		@param asgraph a splits graph
+	*/
+    MSplitsBlock(SplitGraph *asgraph);
+
+	/**
+		destructor
+	*/
+	virtual ~MSplitsBlock();
+
+
+	/**
+		print info to an output stream
+		@param out output stream, cout for output to screen
+	*/
+	virtual void Report(ostream &out);
+
+	/**
+		reset the block
+	*/
+	virtual void Reset();
+
+	/**
+		parse one split (weight followed by taxon indices) and add it
+		to the associated splits graph
+		@param token a token reader
+	*/
+	void AddSplit(NxsToken &token);
+
+	/**
+		@return reference to the cycle vector
+	*/
+	inline vector<int> &getCycle() {
+		return cycle;
+	}
+
+protected:
+
+	/**
+		number of taxa
+	*/
+	int ntaxa;
+
+	/**
+		number of splits
+	*/
+	int nsplits;
+
+	/**
+		the associated splits graph
+	*/
+	SplitGraph *sgraph;
+
+	/**
+		taxa index around circle, if it is a circular split graph
+	*/
+	vector<int> cycle;
+
+	/**
+		main method to read block from file
+		@param token a token reader
+	*/
+	virtual void Read(NxsToken &token);
+
+};
+
+#endif
diff --git a/mtree.cpp b/mtree.cpp
new file mode 100644
index 0000000..78813b1
--- /dev/null
+++ b/mtree.cpp
@@ -0,0 +1,2037 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "mtree.h"
+#include <iostream>
+//#include <fstream>
+#include <iterator>
+#include "splitgraph.h"
+using namespace std;
+
+/*********************************************
+	class MTree
+*********************************************/
+
+/**
+    default constructor: an empty tree without any node
+*/
+MTree::MTree() {
+    root = NULL;
+    leafNum = 0;
+    nodeNum = 0;
+    // give the branch counter a defined value as well; it is recomputed
+    // by initializeTree()/copyTree() once a tree is actually built
+    branchNum = 0;
+    rooted = false;
+    num_precision = 6;
+    len_scale = 1.0;
+    fig_char = "|-+++";
+}
+
+/**
+    constructor, read the tree from a user file
+    @param userTreeFile name of the tree file
+    @param is_rooted (IN/OUT) rooted flag, forwarded to init()
+*/
+MTree::MTree(const char *userTreeFile, bool &is_rooted) {
+    // all the work is delegated to init()
+    init(userTreeFile, is_rooted);
+}
+
+/**
+    set the output defaults and load the tree from a file
+    @param userTreeFile name of the tree file
+    @param is_rooted (IN/OUT) rooted flag, forwarded to readTree()
+*/
+void MTree::init(const char *userTreeFile, bool &is_rooted)
+{
+    // output settings used by the print routines
+    num_precision = 10;
+    len_scale = 1.0;
+
+    readTree(userTreeFile, is_rooted);
+
+    fig_char = "|-+++";
+}
+
+
+/**
+    constructor, take over the content of another tree
+    (see init(MTree&): the source tree loses its root)
+    @param tree the tree to take the nodes from
+*/
+MTree::MTree(MTree &tree)
+{
+    init(tree);
+}
+
+/**
+    take over the nodes and settings of another tree.
+    The node structure is moved, not duplicated: afterwards tree.root is
+    NULL and this object points to the former nodes of `tree`.
+    @param tree the tree to take the content from
+*/
+void MTree::init(MTree &tree) {
+    root = tree.root;
+    leafNum = tree.leafNum;
+    nodeNum = tree.nodeNum;
+    // the branch IDs travel with the nodes, so the counter has to follow too
+    branchNum = tree.branchNum;
+    rooted = tree.rooted;
+    //userFile = tree.userFile;
+    // detach the nodes from the source so that only one object refers to them
+    tree.root = NULL;
+    num_precision = tree.num_precision;
+    len_scale = tree.len_scale;
+    fig_char = tree.fig_char;
+}
+
+/**
+    replace this tree by a copy of another tree.
+    The copy goes through the textual form: the source is printed into a
+    string stream which is then parsed again by readTree().
+    @param tree the tree to copy from
+*/
+void MTree::copyTree(MTree *tree) {
+    // serialize BEFORE releasing our own nodes, so that copying a tree
+    // onto itself does not print already freed nodes
+    stringstream ss;
+    tree->printTree(ss);
+    if (root) freeNode();
+    readTree(ss, tree->rooted);
+    // readTree(istream&) does not store the flag itself
+    rooted = tree->rooted;
+}
+
+/**
+    replace this tree by the subtree of `tree` induced by a taxon set
+    @param tree the tree to copy from
+    @param taxa_set one character per leaf of `tree`; a non-zero character
+        marks a taxon that is kept
+*/
+void MTree::copyTree(MTree *tree, string &taxa_set)
+{
+    if (tree->leafNum != taxa_set.length())
+    	outError("#leaves and taxa_set do not match!");
+
+    leafNum = 0;
+    nodeNum = 0;
+    branchNum = 0;
+    // nodeNum starts as the sum of the characters of taxa_set
+    for (size_t pos = 0; pos < taxa_set.length(); pos++)
+        nodeNum += taxa_set[pos];
+
+    if (root) freeNode();
+    root = NULL;
+
+    double root_len; // length reported by the top-level call, not used
+    root = copyTree(tree, taxa_set, root_len);
+}
+
+/**
+    recursive worker of copyTree(tree, taxa_set): copies the part of `tree`
+    below `node` restricted to the selected taxa. Nodes left with a single
+    surviving child are not created; their branch lengths are added up.
+    @param tree the tree to copy from
+    @param taxa_set non-zero character at position i keeps the leaf with ID i
+    @param len (OUT) length to add to the branch above the returned node
+    @param node current node of `tree`; NULL to start (tree->root if it is
+        selected, otherwise the first selected leaf)
+    @param dad node of `tree` we came from, NULL at the start
+    @return the new node, or NULL if nothing below `node` is selected
+*/
+Node* MTree::copyTree(MTree *tree, string &taxa_set, double &len, Node *node, Node *dad) {
+    if (!node) {
+        if (taxa_set[tree->root->id]) {
+            node = tree->root;
+        } else {
+            for (int i = 0; i < tree->leafNum; i++)
+                if (taxa_set[i]) {
+                    node = tree->findNodeID(i);
+                    break;
+                }
+        }
+        if (!node) {
+            // no taxon is selected at all: nothing to copy
+            len = 0.0;
+            return NULL;
+        }
+    }
+    Node *new_node = NULL;
+    NodeVector new_nodes;
+    DoubleVector new_lens;
+    if (node->isLeaf()) {
+        len = 0.0;
+        if (taxa_set[node->id]) {
+            new_node = newNode(leafNum++, node->name.c_str());
+        }
+        // a leaf reached from above ends the recursion;
+        // a leaf used as starting point still has to visit its neighbor
+        if (dad) return new_node;
+    }
+    if (new_node) {
+        new_nodes.push_back(new_node);
+        new_lens.push_back(len);
+    }
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        double new_len;
+        new_node = copyTree(tree, taxa_set, new_len, (*it)->node, node);
+        if (new_node) {
+            new_nodes.push_back(new_node);
+            new_lens.push_back((*it)->length + new_len);
+        }
+    }
+    if (new_nodes.empty()) return NULL;
+    if (new_nodes.size() == 1) {
+        // only one surviving subtree: skip this node, pass the length upwards
+        len = new_lens[0];
+        return new_nodes[0];
+    }
+    if (!dad && new_nodes.size() == 2) {
+        // at the start node two subtrees are joined by a single branch
+        double sum_len = new_lens[0] + new_lens[1];
+        new_nodes[0]->addNeighbor(new_nodes[1], sum_len, branchNum);
+        new_nodes[1]->addNeighbor(new_nodes[0], sum_len, branchNum);
+        branchNum++;
+        return new_nodes[0];
+    }
+    Node* int_node = newNode(nodeNum++, node->name.c_str());
+    len = 0.0;
+    for (size_t i = 0; i < new_nodes.size(); i++) {
+        int_node->addNeighbor(new_nodes[i], new_lens[i], branchNum);
+        new_nodes[i]->addNeighbor(int_node, new_lens[i], branchNum);
+        branchNum++;
+    }
+    return int_node;
+}
+
+/**
+    allocate a new node
+    @param node_id ID of the node
+    @param node_name name of the node
+    @return the newly allocated node
+*/
+Node* MTree::newNode(int node_id, const char* node_name)
+{
+    Node *created = new Node(node_id, node_name);
+    return created;
+}
+
+/**
+    allocate a new node whose name is given as an integer
+    @param node_id ID of the node
+    @param node_name integer name of the node
+    @return the newly allocated node
+*/
+Node* MTree::newNode(int node_id, int node_name)
+{
+    Node *created = new Node(node_id, node_name);
+    return created;
+}
+
+/**
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+    @return true if every visited non-leaf node has degree 3
+*/
+bool MTree::isBifurcating(Node *node, Node *dad)
+{
+    if (!node)
+        node = root;
+    if (!node->isLeaf() && node->degree() != 3)
+        return false;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        Node *child = (*nei)->node;
+        bool child_ok = child->isLeaf() || child->degree() == 3;
+        if (!child_ok)
+            return false;
+        if (!isBifurcating(child, node))
+            return false;
+    }
+    return true;
+}
+
+/**
+    print one line "node<TAB>neighbor<TAB>length" per branch; a node is
+    shown by its name, or by its ID if it has no name
+    @param out output stream
+    @param node the starting node, NULL to sort the taxa and start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::printBranchLengths(ostream &out, Node *node, Node *dad) {
+    if (!node) {
+        node = root;
+        sortTaxa();
+    }
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        Node *child = (*nei)->node;
+        if (!node->name.empty())
+            out << node->name;
+        else
+            out << node->id;
+        out << "\t";
+        if (!child->name.empty())
+            out << child->name;
+        else
+            out << child->id;
+        out << "\t" << (*nei)->length << endl;
+        printBranchLengths(out, child, node);
+    }
+}
+
+/**
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+    @param epsilon branches with length <= epsilon are counted
+    @return number of such branches in the traversed part of the tree
+*/
+int MTree::countZeroBranches(Node *node, Node *dad, double epsilon)
+{
+    if (!node) node = root;
+    int num_zero = 0;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        if ((*nei)->length <= epsilon)
+            num_zero++;
+        num_zero += countZeroBranches((*nei)->node, node, epsilon);
+    }
+    return num_zero;
+}
+
+/**
+    like countZeroBranches(), but only branches between two non-leaf
+    nodes are counted
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+    @param epsilon branches with length <= epsilon are counted
+    @return number of such internal branches
+*/
+int MTree::countZeroInternalBranches(Node *node, Node *dad, double epsilon)
+{
+    if (!node) node = root;
+    int num_zero = 0;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        Node *child = (*nei)->node;
+        if ((*nei)->length <= epsilon && !child->isLeaf() && !node->isLeaf())
+            num_zero++;
+        num_zero += countZeroInternalBranches(child, node, epsilon);
+    }
+    return num_zero;
+}
+
+/**
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+    @param upper_limit branches with length >= upper_limit are counted
+    @return number of such branches in the traversed part of the tree
+*/
+int MTree::countLongBranches(Node *node, Node *dad, double upper_limit)
+{
+    if (!node) node = root;
+    int num_long = 0;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        if ((*nei)->length >= upper_limit)
+            num_long++;
+        num_long += countLongBranches((*nei)->node, node, upper_limit);
+    }
+    return num_long;
+}
+
+
+/**
+    print the tree to a file
+    @param ofile name of the output file
+    @param brtype bit mask of WT_* flags; with WT_APPEND the tree is appended
+        to the file, otherwise the file is overwritten. The mask is passed on
+        to printTree(ostream&, int).
+*/
+void MTree::printTree(const char *ofile, int brtype)
+{
+    try {
+        ofstream out;
+        // let failed open/write operations throw, so they end up in the handler
+        out.exceptions(ios::failbit | ios::badbit);
+        if (brtype & WT_APPEND)
+            out.open(ofile, ios_base::out | ios_base::app);
+        else
+            out.open(ofile);
+        printTree(out, brtype);
+        out.close();
+        if (verbose_mode >= VB_DEBUG)
+            cout << "Tree was printed to " << ofile << endl;
+    } catch (const ios::failure &) {
+        // caught by const reference: no copy and no slicing of the exception
+        outError(ERR_WRITE_OUTPUT, ofile);
+    }
+}
+
+/**
+    @return the text that printTree() writes for this tree
+*/
+string MTree::getTreeString()
+{
+    stringstream buffer;
+    printTree(buffer);
+    string tree_text = buffer.str();
+    return tree_text;
+}
+
+/**
+    print the whole tree followed by ';'
+    @param out output stream
+    @param brtype bit mask of WT_* flags (WT_TAXON_ID, WT_BR_LEN and
+        WT_NEWLINE are evaluated here, the rest by the recursive printTree)
+*/
+void MTree::printTree(ostream &out, int brtype)
+{
+    if (!root->isLeaf()) {
+        printTree(out, brtype, root);
+    } else {
+        Node *next = root->neighbors[0]->node;
+        if (!next->isLeaf()) {
+            // tree has more than 2 taxa
+            printTree(out, brtype, next);
+        } else {
+            // tree has only 2 taxa!
+            out << "(";
+            printTree(out, brtype, root);
+            out << ",";
+            if (brtype & WT_TAXON_ID)
+                out << next->id;
+            else
+                out << next->name;
+
+            if (brtype & WT_BR_LEN)
+                out << ":0";
+            out << ")";
+        }
+    }
+
+    out << ";";
+    if (brtype & WT_NEWLINE)
+        out << endl;
+}
+
+/**
+	pair of an integer key and a text; printTree() stores one entry per
+	child subtree when the output is sorted (WT_SORT_TAXA)
+*/
+struct IntString {
+    /** the value returned by printTree() for the subtree */
+    int id;
+    /** the printed text of the subtree */
+    string str;
+};
+
+/**
+	ordering of IntString pointers by their id; used as comparator
+	of IntStringSet
+*/
+struct IntStringCmp
+{
+    /**
+    	@param s1 first entry
+    	@param s2 second entry
+    	@return true if the id of s1 is smaller than the id of s2
+    */
+    bool operator()(const IntString* s1, const IntString* s2) const
+    {
+        return (s1->id) < (s2->id);
+    }
+};
+
+typedef set<IntString*, IntStringCmp> IntStringSet;
+
+/**
+    recursively print the subtree below a node in parenthesis notation
+    @param out output stream
+    @param brtype bit mask of WT_* flags selecting what is printed
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+    @return the smallest leaf ID found in the printed subtree
+        (leafNum if no leaf was visited)
+*/
+int MTree::printTree(ostream &out, int brtype, Node *node, Node *dad)
+{
+    int smallest_taxid = leafNum;
+    out.precision(num_precision);
+    if (!node) node = root;
+    if (node->isLeaf()) {
+        smallest_taxid = node->id;
+        if (brtype & WT_TAXON_ID)
+            out << node->id;
+        else
+            out << node->name;
+
+        if (brtype & WT_BR_LEN) {
+            // fixed notation is forced here, presumably because some software
+            // cannot read numbers like '1.234e-6'; note that this changes the
+            // format flags and the precision of `out` for later output too
+        	out.setf( std::ios::fixed, std:: ios::floatfield ); // some sofware does handle number format like '1.234e-6'
+            out.precision(10); // increase precision to avoid zero branch (like in RAxML)
+        	double len = node->neighbors[0]->length;
+            if (brtype & WT_BR_SCALE) len *= len_scale;
+            if (brtype & WT_BR_LEN_ROUNDING) len = round(len);
+            if (brtype & WT_BR_LEN_FIXED_WIDTH)
+                out << ":" << fixed << len;
+            else
+                out << ":" << len;
+        }
+    } else {
+        // internal node
+        out << "(";
+        bool first = true;
+        // length of the branch leading to this node, collected in the loops below
+        double length = 0.0;
+        //for (int i = 0; i < node->neighbors.size(); i++)
+        //if (node->neighbors[i]->node != dad)
+        if (! (brtype & WT_SORT_TAXA)) {
+            // NOTE(review): the `else` after the closing brace of the loop body
+            // can only belong to a condition hidden inside FOR_NEIGHBOR_IT
+            // (presumably "neighbor != dad"), i.e. it would handle the branch
+            // to dad -- confirm against the macro definition
+            FOR_NEIGHBOR_IT(node, dad, it) {
+                if ((*it)->node->name != ROOT_NAME) {
+                    if (!first)
+                        out << ",";
+                    int taxid = printTree(out, brtype, (*it)->node, node);
+                    if (taxid < smallest_taxid) smallest_taxid = taxid;
+                    first = false;
+                } else
+                    length = (*it)->length;
+            } else {
+                length = (*it)->length;
+            }
+        } else {
+            // sorted output: print every child into its own string first,
+            // the set orders the strings by the smallest leaf ID they contain
+            IntStringSet strout;
+            FOR_NEIGHBOR_IT(node, dad, it) {
+                if ((*it)->node->name != ROOT_NAME) {
+                    ostringstream ss;
+                    IntString *str = new IntString;
+                    str->id = printTree(ss, brtype, (*it)->node, node);
+                    //ss.flush();
+                    str->str = ss.str();
+                    strout.insert(str);
+                } else
+                    length = (*it)->length;
+            } else {
+                length = (*it)->length;
+            }
+            // NOTE(review): dereferences begin() without checking that at
+            // least one child was inserted
+            smallest_taxid = (*strout.begin())->id;
+            IntStringSet::iterator iss;
+            for (iss = strout.begin(); iss != strout.end(); iss++) {
+                if (!first) out << ",";
+                out << (*iss)->str;
+                first = false;
+            }
+            // the entries were allocated with new above
+            for (iss = strout.begin(); iss != strout.end(); iss++)
+                delete (*iss);
+        }
+        out << ")";
+        if (!node->name.empty())
+            out << node->name;
+        else if (brtype & WT_INT_NODE)
+            out << node->id;
+        // at the top of the recursion (dad == NULL) a length is only
+        // printed if a positive one was collected
+        if (dad != NULL || length > 0.0) {
+            if (brtype & WT_BR_SCALE) length *= len_scale;
+            if (brtype & WT_BR_LEN_ROUNDING) length = round(length);
+            if (brtype & WT_BR_LEN) {
+                if (brtype & WT_BR_LEN_FIXED_WIDTH)
+                    out << ":" << fixed << length;
+                else
+                    out << ":" << length;
+            } else if (brtype & WT_BR_CLADE) {
+                // without WT_BR_LEN the length is appended to the node label,
+                // separated by '/' from a non-empty name
+                if (! node->name.empty()) out << "/";
+                out << length;
+            }
+        }
+    }
+    return smallest_taxid;
+}
+
+
+/**
+    print the subtree described by `subtree`, followed by ';'
+    @param out output stream
+    @param subtree vector indexed by node ID, non-NULL for nodes of the subtree
+*/
+void MTree::printSubTree(ostream &out, NodeVector &subtree)
+{
+    // never start at a leaf root, use its only neighbor instead
+    Node *start = root;
+    if (root->isLeaf())
+        start = root->neighbors[0]->node;
+    printSubTree(out, subtree, start);
+    out << ";";
+}
+
+/**
+    recursively print the part of the tree whose nodes are marked in `subtree`
+    @param out output stream
+    @param subtree vector indexed by node ID; a non-NULL entry marks a node
+        that belongs to the subtree
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::printSubTree(ostream &out, NodeVector &subtree, Node *node, Node *dad) {
+    if (!node) node = root;
+
+    NeighborVec::iterator it;
+    // length: branch length accumulated for the node that gets printed
+    // dad_length: length of the branch towards dad at the current node
+    double length = 0.0, dad_length = 0.0;
+    // go down if only 1 child available
+    Node *child = NULL;
+    int degree;
+    do {
+        // count the marked neighbors (other than dad) and remember the last one
+        degree = 0;
+        // NOTE(review): the `else` after the loop body can only pair with a
+        // condition hidden inside FOR_NEIGHBOR (presumably "neighbor != dad"),
+        // so it would see the branch to dad -- confirm against the macro
+        FOR_NEIGHBOR(node, dad, it) {
+            if (subtree[(*it)->node->id] != NULL) {
+                degree++;
+                child = (*it)->node;
+            }
+        } else dad_length = (*it)->length;
+
+        if (degree == 1) {
+            // node would have a single child in the output: skip it
+            dad = node;
+            node = child;
+            length += dad_length;
+        }
+    } while (degree == 1 && !node->isLeaf());
+
+    if (node->isLeaf())
+        out << node->name << ":" << node->neighbors[0]->length + length;
+    else
+    {
+        // internal node
+        out << "(";
+        bool first = true;
+
+        FOR_NEIGHBOR(node, dad, it)	{
+            if (subtree[(*it)->node->id] != NULL) {
+                if ((*it)->node->name != ROOT_NAME) {
+                    if (!first)
+                        out << ",";
+                    printSubTree(out, subtree, (*it)->node, node);
+                    first = false;
+                } else
+                    length += (*it)->length;
+            }
+        } else {
+            length += (*it)->length;
+        }
+        out << ")";
+        if (!node->name.empty())
+            out << node->name;
+        // at the top of the recursion a length is printed only if it is not
+        // (practically) zero
+        if (dad != NULL || length > 1e-20)
+            out << ":" << length;
+    }
+}
+
+
+/**
+    print the names of all taxa to a file, one per line
+    @param ofile name of the output file
+*/
+void MTree::printTaxa(const char *ofile)
+{
+    try {
+        ofstream out;
+        // let failed open/write operations throw, so they end up in the handler
+        out.exceptions(ios::failbit | ios::badbit);
+        out.open(ofile);
+        // a leaf root is skipped: the traversal starts at its neighbor
+        if (root->isLeaf())
+            printTaxa(out, root->neighbors[0]->node);
+        else
+            printTaxa(out);
+        out.close();
+        cout << "Taxa list was printed to " << ofile << endl;
+    } catch (const ios::failure &) {
+        // caught by const reference: no copy and no slicing of the exception
+        outError(ERR_WRITE_OUTPUT, ofile);
+    }
+}
+
+/**
+    print the names of the leaves below a node, one per line
+    @param out output stream
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::printTaxa(ostream &out, Node *node, Node *dad) {
+    if (!node)
+        node = root;
+    if (node->isLeaf()) {
+        out << node->name << endl;
+        return;
+    }
+    // internal node: descend into every neighbor except dad
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        printTaxa(out, (*nei)->node, node);
+    }
+}
+
+/**
+    print the names of the taxa contained in a subtree, one per line
+    @param out output stream
+    @param subtree vector indexed by node ID; only the first leafNum entries
+        are examined, NULL entries are skipped
+*/
+void MTree::printTaxa(ostream &out, NodeVector &subtree)
+{
+    for (int taxon = 0; taxon < leafNum; taxon++) {
+        Node *leaf = subtree[taxon];
+        if (leaf == NULL)
+            continue;
+        out << leaf->name << endl;
+    }
+}
+
+/**
+    read the tree from a file
+    @param infile name of the tree file
+    @param is_rooted (IN/OUT) rooted flag, passed on to readTree(istream&, bool&);
+        its value after parsing is stored in the tree
+*/
+void MTree::readTree(const char *infile, bool &is_rooted) {
+    ifstream in;
+    try {
+        // let failed open/read operations throw, so they end up in the handler
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(infile);
+        readTree(in, is_rooted);
+        in.close();
+    } catch (const ios::failure &) {
+        // caught by const reference: no copy and no slicing of the exception
+        outError(ERR_READ_INPUT, infile);
+    }
+
+    rooted = is_rooted;
+
+    // for a rooted tree the extra root leaf is excluded from both numbers
+    if (verbose_mode >= VB_MED)
+        cout << "Tree contains " << leafNum - is_rooted <<
+             " taxa and " << nodeNum-1-is_rooted << " branches" << endl;
+}
+
+
+/**
+    read the tree from an input stream; the text has to start with '('
+    and end with ';'
+    @param in input stream
+    @param is_rooted (IN/OUT) if true, or if the outermost node carries a
+        positive branch length, an extra leaf named ROOT_NAME is attached
+        and the flag is set to true
+*/
+void MTree::readTree(istream &in, bool &is_rooted)
+{
+    // position counters used for error reports
+    in_line = 1;
+    in_column = 1;
+    try {
+        char ch;
+        ch = readNextChar(in);
+        if (ch != '(') {
+            // dump the rest of the input before reporting the error
+        	cout << in.rdbuf() << endl;
+            throw "Tree file does not start with an opening-bracket '('";
+        }
+
+        leafNum = 0;
+
+        double branch_len;
+        Node *node;
+        parseFile(in, ch, node, branch_len);
+        if (is_rooted || branch_len > 0.0) {
+            // -1.0 is the value parseFile reports when no length was given
+            if (branch_len == -1.0) branch_len = 0.0;
+            if (branch_len < 0.0)
+                throw ERR_NEG_BRANCH;
+            is_rooted = true;
+            // the root becomes an additional leaf with the next free leaf ID
+            root = newNode(leafNum, ROOT_NAME);
+            root->addNeighbor(node, branch_len);
+            node->addNeighbor(root, branch_len);
+            leafNum++;
+        } else { // assign root to one of the neighbor of node, if any
+            FOR_NEIGHBOR_IT(node, NULL, it)
+            if ((*it)->node->isLeaf()) {
+                root = (*it)->node;
+                break;
+            }
+        }
+        // make sure that root is a leaf
+        assert(root->isLeaf());
+
+        if (in.eof() || ch != ';')
+            throw "Tree file must be ended with a semi-colon ';'";
+    } catch (bad_alloc) {
+        outError(ERR_NO_MEMORY);
+    } catch (const char *str) {
+        // all handlers report through outError, adding the input position
+        outError(str, reportInputInfo());
+    } catch (string str) {
+        outError(str.c_str(), reportInputInfo());
+    } catch (ios::failure) {
+        outError(ERR_READ_INPUT, reportInputInfo());
+    } catch (...) {
+        // anything else
+        outError(ERR_READ_ANY, reportInputInfo());
+    }
+
+    // internal nodes and branches are numbered by initializeTree()
+    nodeNum = leafNum;
+    initializeTree();
+
+    //bool stop = false;
+    //checkValidTree(stop);
+}
+
+/**
+    assign IDs to the internal nodes (continuing after the leaf IDs) and
+    to the branches, in the order of the traversal
+    @param node the starting node, NULL to reset the counters and start
+        from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::initializeTree(Node *node, Node* dad) {
+    if (!node) {
+        node = root;
+        nodeNum = leafNum;
+        branchNum = 0;
+    }
+    if (!node->isLeaf())
+        node->id = nodeNum++;
+
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        // both directions of a branch carry the same ID
+        int branch_id = branchNum++;
+        (*nei)->id = branch_id;
+        (*nei)->node->findNeighbor(node)->id = branch_id;
+        initializeTree((*nei)->node, node);
+    }
+}
+
+
+/**
+    recursively parse one node (with everything below it) of a tree in
+    parenthesis notation
+    @param infile input stream
+    @param ch (IN/OUT) the current character; on return the first character
+        that was not consumed
+    @param root (OUT) the node created for the parsed subtree
+    @param branch_len (OUT) length given after ':', or -1.0 if there is none
+    Errors are reported by throwing a const char* or a string.
+*/
+void MTree::parseFile(istream &infile, char &ch, Node* &root, double &branch_len)
+{
+    Node *node;
+    // upper bound for the length of a name / number, the buffer has one
+    // more byte for the terminating zero
+    const int maxlen = 10000;
+    char seqname[maxlen + 1];
+    int seqlen;
+    double brlen;
+    branch_len = -1.0;
+
+    root = newNode();
+
+    if (ch == '(') {
+        // internal node
+        ch = readNextChar(infile);
+        while (ch != ')' && !infile.eof())
+        {
+            node = NULL;
+            parseFile(infile, ch, node, brlen);
+            //if (brlen == -1.0)
+            //throw "Found branch with no length.";
+            //if (brlen < 0.0)
+            //throw ERR_NEG_BRANCH;
+            root->addNeighbor(node, brlen);
+            node->addNeighbor(root, brlen);
+            if (infile.eof())
+                throw "Expecting ')', but end of file instead";
+            if (ch == ',')
+                ch = readNextChar(infile);
+            else if (ch != ')') {
+                string err = "Expecting ')', but found '";
+                err += ch;
+                err += "' instead";
+                throw err;
+            }
+        }
+        if (!infile.eof()) ch = readNextChar(infile);
+    }
+    // now read the node name
+    seqlen = 0;
+    // a name starting with a quote runs up to the matching quote
+    char end_ch = 0;
+    if (ch == '\'' || ch == '"') end_ch = ch;
+
+    while (!infile.eof() && seqlen < maxlen)
+    {
+        if (end_ch == 0) {
+            if (is_newick_token(ch) || controlchar(ch)) break;
+        }
+        seqname[seqlen++] = ch;
+        ch = infile.get();
+        in_column++;
+        if (end_ch != 0 && ch == end_ch) {
+            // keep the closing quote only while there is room for it; if the
+            // buffer is full, seqlen stays at maxlen and the check below
+            // rejects the name
+            if (seqlen < maxlen)
+                seqname[seqlen++] = ch;
+            break;
+        }
+    }
+    if ((controlchar(ch) || ch == '[' || ch == end_ch) && !infile.eof())
+        ch = readNextChar(infile, ch);
+    if (seqlen == maxlen)
+        throw "Too long name ( > 9999)";
+    seqname[seqlen] = 0;
+    if (seqlen == 0 && root->isLeaf())
+        throw "A taxon has no name.";
+    if (seqlen > 0)
+        root->name.append(seqname);
+    if (root->isLeaf()) {
+        // is a leaf, assign its ID
+        root->id = leafNum;
+        // the first leaf becomes the (preliminary) root of the tree
+        if (leafNum == 0)
+            MTree::root = root;
+        leafNum++;
+    }
+
+    if (ch == ';' || infile.eof())
+        return;
+    if (ch == ':')
+    {
+        // read the branch length as text, convert it afterwards
+        ch = readNextChar(infile);
+        seqlen = 0;
+        while (!is_newick_token(ch) && !controlchar(ch) && !infile.eof() && seqlen < maxlen)
+        {
+            seqname[seqlen] = ch;
+            seqlen++;
+            ch = infile.get();
+            in_column++;
+        }
+        if ((controlchar(ch) || ch == '[') && !infile.eof())
+            ch = readNextChar(infile, ch);
+        if (seqlen == maxlen || infile.eof())
+            throw "branch length format error.";
+        seqname[seqlen] = 0;
+        branch_len = convert_double(seqname);
+    }
+}
+
+/**
+	check that the tree is bifurcating (every node has degree 1 or 3);
+	prints a message and stops at the first node that violates this
+	@param stop (OUT) set to true if such a node is found
+	@param node the starting node, NULL to start from the root
+	@param dad dad of the node, used to direct the traversal
+*/
+void MTree::checkValidTree(bool &stop, Node *node, Node *dad) {
+    if (!node)
+        node = root;
+    bool degree_ok = (node->degree() == 1 || node->degree() == 3);
+    if (!degree_ok) {
+        cout << "Tree is not bifurcating." << endl;
+        stop = true;
+        return;
+    }
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        checkValidTree(stop, (*nei)->node, node);
+        if (stop) return;
+    }
+}
+
+/**
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+    @return sum of the lengths of all branches below the node
+*/
+double MTree::treeLength(Node *node, Node *dad) {
+    if (!node)
+        node = root;
+    double total = 0;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        total += (*nei)->length + treeLength((*nei)->node, node);
+    }
+    return total;
+}
+
+/**
+    sum of the lengths of branches between two non-leaf nodes; a neighbor
+    is only followed if the tree length below it exceeds epsilon
+    @param epsilon threshold for the tree length below a neighbor
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+    @return the accumulated length
+*/
+double MTree::treeLengthInternal( double epsilon, Node *node, Node *dad) {
+    if (!node)
+        node = root;
+    double total = 0;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        Node *child = (*nei)->node;
+        // skip neighbors whose whole subtree is not longer than epsilon
+        if (!(treeLength(child, node) > epsilon))
+            continue;
+        bool internal_branch = !child->isLeaf() && !node->isLeaf();
+        if (internal_branch)
+            total += (*nei)->length + treeLengthInternal(epsilon, child, node);
+        else
+            total += treeLengthInternal(epsilon, child, node);
+    }
+    return total;
+}
+
+/**
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+    @return largest sum of branch lengths on a path going down from the
+        node; negative lengths count as zero
+*/
+double MTree::treeDepth(Node *node, Node *dad) {
+    if (!node)
+        node = root;
+    double deepest = 0.0;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        double branch = (*nei)->length;
+        if (branch < 0.0)
+            branch = 0.0;
+        double path = branch + treeDepth((*nei)->node, node);
+        if (path > deepest)
+            deepest = path;
+    }
+    return deepest;
+}
+
+/**
+    distribute the leaves over two lists according to Node::isInCherry()
+    @param noncherry (OUT) leaves for which isInCherry() is false are appended
+    @param cherry (OUT) leaves for which isInCherry() is true are appended
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getNonCherryLeaves(NodeVector &noncherry, NodeVector &cherry, Node *node, Node *dad)
+{
+    if (!node)
+        node = root;
+    if (node->isLeaf()) {
+        if (node->isInCherry())
+            cherry.push_back(node);
+        else
+            noncherry.push_back(node);
+    }
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        getNonCherryLeaves(noncherry, cherry, (*nei)->node, node);
+    }
+}
+
+/**
+    collect the leaves in the order of the traversal
+    @param taxa (OUT) the leaves are appended to this vector
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getTaxa(NodeVector &taxa, Node *node, Node *dad)
+{
+    if (!node)
+        node = root;
+    if (node->isLeaf())
+        taxa.push_back(node);
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        getTaxa(taxa, (*nei)->node, node);
+    }
+}
+
+/**
+    collect the node and all nodes below it
+    @param node top node of the subtree, must not be NULL
+    @param dad dad of the node, must not be NULL
+    @param nodeList (OUT) the nodes are appended to this vector
+*/
+void MTree::getAllNodesInSubtree(Node *node, Node *dad, NodeVector &nodeList)
+{
+    assert(node && dad);
+    nodeList.push_back(node);
+    if (!node->isLeaf()) {
+        FOR_NEIGHBOR_IT(node, dad, nei) {
+            getAllNodesInSubtree((*nei)->node, node, nodeList);
+        }
+    }
+}
+
+/**
+    @param node the starting node; with NULL the count starts at the root
+        and the root itself is counted as one taxon
+    @param dad dad of the node, used to direct the traversal
+    @return number of taxa counted
+*/
+int MTree::getNumTaxa(Node *node, Node *dad)
+{
+    int total = 0;
+    if (node) {
+        if (node->isLeaf())
+            return 1;
+    } else {
+        node = root;
+        total = 1;
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        total += getNumTaxa((*nei)->node, node);
+    }
+    return total;
+}
+
+/**
+    collect the non-leaf nodes below a node; a node is appended after the
+    nodes below it
+    @param nodes (OUT) the nodes are appended to this vector
+    @param node the starting node (not added itself), NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getInternalNodes(NodeVector &nodes, Node *node, Node *dad)
+{
+    if (!node)
+        node = root;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        Node *child = (*nei)->node;
+        if (child->isLeaf())
+            continue;
+        getInternalNodes(nodes, child, node);
+        nodes.push_back(child);
+    }
+}
+
+/**
+    collect all branches between two non-leaf nodes; each branch is stored
+    with the node of smaller ID in nodes1
+    @param nodes1 (OUT) first end of every branch
+    @param nodes2 (OUT) second end of every branch
+    @param excludeSplits if not NULL and not empty, branches whose split is
+        contained in this set are left out
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getAllInnerBranches(NodeVector &nodes1, NodeVector &nodes2, SplitGraph* excludeSplits, Node *node, Node *dad)
+{
+    if (!node)
+        node = root;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        Node *child = (*nei)->node;
+        if (child->isLeaf())
+            continue;
+        getAllInnerBranches(nodes1, nodes2, excludeSplits, child, node);
+        if (node->isLeaf())
+            continue;
+        if (excludeSplits != NULL && excludeSplits->size() != 0) {
+            // the split is only needed for the lookup
+            Split *branch_split = getSplit(node, child);
+            bool excluded = excludeSplits->containSplit(*branch_split);
+            delete branch_split;
+            if (excluded)
+                continue;
+        }
+        bool node_first = node->id < child->id;
+        nodes1.push_back(node_first ? node : child);
+        nodes2.push_back(node_first ? child : node);
+    }
+}
+
+/**
+    @param node1 one end of the branch
+    @param node2 the other end of the branch
+    @param nodes1 first ends of the stored branches
+    @param nodes2 second ends of the stored branches, same size as nodes1
+    @return true if the branch is stored, in either orientation
+*/
+bool MTree::branchExist(Node* node1, Node* node2, NodeVector& nodes1, NodeVector& nodes2)
+{
+    assert(nodes1.size() == nodes2.size());
+    for (size_t pos = 0; pos < nodes1.size(); pos++) {
+        bool same_order = (nodes1[pos] == node1 && nodes2[pos] == node2);
+        bool swapped = (nodes1[pos] == node2 && nodes2[pos] == node1);
+        if (same_order || swapped)
+            return true;
+    }
+    return false;
+}
+
+/**
+    collect the branches to non-leaf neighbors that lie within a given
+    number of steps below a node; branches already stored are not followed
+    @param nodes1 (IN/OUT) first end of every branch
+    @param nodes2 (IN/OUT) second end of every branch
+    @param depth number of steps left, nothing happens for 0
+    @param node the starting node
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getInnerBranches(NodeVector &nodes1, NodeVector &nodes2, int depth, Node *node, Node *dad)
+{
+    if (depth == 0) return;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        Node *child = (*nei)->node;
+        if (child->isLeaf() || branchExist(node, child, nodes1, nodes2))
+            continue;
+        nodes1.push_back(node);
+        nodes2.push_back(child);
+        getInnerBranches(nodes1, nodes2, depth-1, child, node);
+    }
+}
+
+/**
+    both nodes are expected to have degree 3 (asserted)
+    @param node1 one end of the branch
+    @param node2 the other end of the branch
+    @return true if the nodes are connected and none of them is a leaf
+*/
+bool MTree::isInnerBranch(Node* node1, Node* node2)
+{
+    assert(node1->degree() == 3 && node2->degree() == 3);
+    if (!isABranch(node1, node2))
+        return false;
+    if (node1->isLeaf())
+        return false;
+    return !node2->isLeaf();
+}
+
+/**
+    @param node1 first node
+    @param node2 second node
+    @return true if node2 is among the neighbors of node1. An error is
+        reported via outError() if the two neighbor lists disagree.
+*/
+bool MTree::isABranch(Node* node1, Node* node2)
+{
+    NeighborVec::iterator nei;
+
+    // look for node2 among the neighbors of node1
+    nei = node1->neighbors.begin();
+    while (nei != node1->neighbors.end() && (*nei)->node != node2)
+        nei++;
+    bool forward = (nei != node1->neighbors.end());
+
+    // Sanity check: both nodes must have each other as neighbors or not at all
+    nei = node2->neighbors.begin();
+    while (nei != node2->neighbors.end() && (*nei)->node != node1)
+        nei++;
+    bool backward = (nei != node2->neighbors.end());
+
+    if (forward != backward) {
+        int node1ID = node1->id;
+        int node2ID = node2->id;
+        stringstream msg;
+        msg << "Tree data structure corrupted! Node " << node1ID << " and node " << node2ID << " are not constructed properly";
+        outError(msg.str());
+    }
+    return forward;
+}
+
+/**
+    collect all branches below a node; each branch is stored with the node
+    of smaller ID in `nodes`
+    @param nodes (OUT) first end of every branch
+    @param nodes2 (OUT) second end of every branch
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getBranches(NodeVector &nodes, NodeVector &nodes2, Node *node, Node *dad)
+{
+    if (!node)
+        node = root;
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        Node *child = (*nei)->node;
+        bool node_first = node->id < child->id;
+        nodes.push_back(node_first ? node : child);
+        nodes2.push_back(node_first ? child : node);
+        getBranches(nodes, nodes2, child, node);
+    }
+}
+
+/**
+    copy the branch lengths into a vector indexed by branch ID
+    @param len (OUT) vector of size branchNum (asserted at the start)
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getBranchLengths(DoubleVector &len, Node *node, Node *dad)
+{
+    if (!node) {
+        node = root;
+        assert(len.size() == branchNum);
+    }
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        int branch_id = (*nei)->id;
+        len[branch_id] = (*nei)->length;
+        getBranchLengths(len, (*nei)->node, node);
+    }
+}
+
+/**
+    set the branch lengths from a vector indexed by branch ID; both
+    directions of a branch receive the value
+    @param len vector of size branchNum (asserted at the start)
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::setBranchLengths(DoubleVector &len, Node *node, Node *dad)
+{
+    if (!node) {
+        node = root;
+        assert(len.size() == branchNum);
+    }
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        Node *child = (*nei)->node;
+        child->findNeighbor(node)->length = len[(*nei)->id];
+        (*nei)->length = child->findNeighbor(node)->length;
+        setBranchLengths(len, child, node);
+    }
+}
+
+/**
+    store every leaf at the position given by its ID
+    @param taxa (OUT) resized to leafNum if it is empty when the first leaf
+        is stored
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getOrderedTaxa(NodeVector &taxa, Node *node, Node *dad)
+{
+    if (!node)
+        node = root;
+    if (node->isLeaf()) {
+        if (taxa.empty())
+            taxa.resize(leafNum);
+        taxa[node->id] = node;
+    }
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        getOrderedTaxa(taxa, (*nei)->node, node);
+    }
+}
+
+/**
+    store the name of every leaf at the position given by its ID
+    @param taxname (OUT) resized to leafNum if it is empty when the first
+        name is stored
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getTaxaName(vector<string> &taxname, Node *node, Node *dad)
+{
+    if (!node)
+        node = root;
+    if (node->isLeaf()) {
+        if (taxname.empty())
+            taxname.resize(leafNum);
+        taxname[node->id] = node->name;
+    }
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        getTaxaName(taxname, (*nei)->node, node);
+    }
+}
+
+
+/**
+    collect the IDs of the leaves in the order of the traversal
+    @param taxa (OUT) the IDs are appended to this vector
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::getTaxaID(vector<int> &taxa, Node *node, Node *dad)
+{
+    if (!node)
+        node = root;
+    if (node->isLeaf())
+        taxa.push_back(node->id);
+    FOR_NEIGHBOR_IT(node, dad, nei) {
+        getTaxaID(taxa, (*nei)->node, node);
+    }
+}
+
+/**
+    @param splits the splits to look for
+    @return true if every split of `splits` is among the splits of this tree
+*/
+bool MTree::containsSplits(SplitGraph& splits)
+{
+    // splits of this tree
+    SplitGraph own_splits;
+    convertSplits(own_splits);
+
+    SplitGraph::iterator sp = splits.begin();
+    while (sp != splits.end()) {
+        if (!own_splits.containSplit(**sp))
+            return false;
+        sp++;
+    }
+    return true;
+}
+
+/**
+    build the split of the branch (node1, node2) from the taxa reached
+    from node1 when not going through node2; the split is inverted if
+    Split::shouldInvert() says so
+    @param node1 one end of the branch
+    @param node2 the other end of the branch
+    @return a newly allocated split, to be deleted by the caller
+*/
+Split* MTree::getSplit(Node* node1, Node* node2)
+{
+    Split *result = new Split(leafNum);
+    getTaxa(*result, node1, node2);
+    if (result->shouldInvert()) {
+        result->invert();
+    }
+    return result;
+}
+
+/**
+    recursive worker: create one split per branch below a node and append
+    it to a split graph
+    @param sg (OUT) receives the new splits; it stores the pointers
+    @param resp (OUT) accumulates the taxa found below `node`; must have
+        been created for leafNum taxa (asserted)
+    @param nodes if not NULL, receives for every stored split the node at
+        the far end of the branch
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::convertSplits(SplitGraph &sg, Split *resp, NodeVector *nodes, Node *node, Node *dad) {
+    if (!node) node = root;
+    assert(resp->getNTaxa() == leafNum);
+    bool has_child = false;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        //vector<int> taxa;
+        //getTaxaID((*it)->node, node, taxa);
+
+        Split *sp = new Split(leafNum, (*it)->length);
+        convertSplits(sg, sp, nodes, (*it)->node, node);
+        // the taxa below the child are also below this node
+        *resp += *sp;
+        if (sp->shouldInvert())
+            sp->invert();
+        /* ignore nodes with degree of 2 because such split will be added before */
+        if (node->degree() != 2) {
+            sg.push_back(sp);
+            if (nodes) nodes->push_back((*it)->node);
+        } else {
+            // the split is not handed over to sg, so it has to be released here
+            delete sp;
+        }
+        has_child = true;
+    }
+    // nothing below this node: it contributes its own ID as a taxon
+    if (!has_child)
+        resp->addTaxon(node->id);
+}
+
+/**
+    convert the tree into a split graph, using the given taxon names
+    @param taxname names of the taxa, only used if sg has no taxa block yet
+    @param sg (OUT) the split graph; missing taxa, splits and pda blocks
+        are created
+    @param nodes if not NULL, receives one node per stored split
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::convertSplits(vector<string> &taxname, SplitGraph &sg, NodeVector *nodes, Node *node, Node *dad)
+{
+    // create the blocks of the split graph that do not exist yet
+    if (!sg.taxa) {
+        sg.taxa = new NxsTaxaBlock();
+        vector<string>::iterator label;
+        for (label = taxname.begin(); label != taxname.end(); label++)
+            sg.taxa->AddTaxonLabel(NxsString(label->c_str()));
+    }
+    if (!sg.splits) sg.splits = new MSplitsBlock(&sg);
+    if (!sg.pda) sg.pda = new MPdaBlock(&sg);
+
+    // make the cycle
+    getTaxaID(sg.splits->cycle);
+
+    // make the splits
+    Split all_taxa(leafNum);
+    convertSplits(sg, &all_taxa, nodes, node, dad);
+}
+
+/**
+    convert the tree into a split graph, taking the taxon names from the
+    leaves of this tree
+    @param sg (OUT) the split graph
+    @param nodes if not NULL, receives one node per stored split
+    @param node the starting node, NULL to start from the root
+    @param dad dad of the node, used to direct the traversal
+*/
+void MTree::convertSplits(SplitGraph &sg, NodeVector *nodes, Node *node, Node *dad)
+{
+    // make the taxa name
+    vector<string> leaf_names;
+    leaf_names.resize(leafNum);
+    getTaxaName(leaf_names);
+
+    convertSplits(leaf_names, sg, nodes, node, dad);
+}
+
+/**
+ * Ordering predicate used when sorting splits: non-zero when split a
+ * contains fewer taxa than split b.
+ */
+inline int splitnumtaxacmp(const Split* a, const Split* b)
+{
+    if (a->countTaxa() < b->countTaxa())
+        return 1;
+    return 0;
+}
+
+void MTree::convertToTree(SplitGraph &sg) { /* Rebuild this tree from the splits in sg.
+ * Steps: add a single-taxon split to sg for every taxon that has no trivial
+ * split yet, sort sg with splitnumtaxacmp, create one leaf per trivial split,
+ * create one internal node per remaining split, and finally join the clades
+ * that are left under a new node which becomes the root.
+ * Side effects: sg is extended and reordered; rooted, leafNum, nodeNum and
+ * root are overwritten. */
+    SplitGraph::iterator it;
+    int taxid;
+    int count;
+	BoolVector has_tax;
+	has_tax.resize(sg.getNTaxa(), false);
+	// first add the trivial splits that do not exist yet
+	for (it = sg.begin(); it != sg.end(); it++) {
+		taxid = (*it)->trivial(); // a negative value is handled as "not a trivial split"
+		if (taxid >= 0) has_tax[taxid] = true;
+	}
+	for (count = 0; count < has_tax.size(); count++)
+		if (!has_tax[count]) {
+			Split *sp = new Split(sg.getNTaxa());
+			sp->addTaxon(count);
+			sg.push_back(sp);
+		}
+    // sort splits by the number of taxa they contain
+    sort(sg.begin(), sg.end(), splitnumtaxacmp);
+
+    // initialize the tree
+    rooted = false;
+    leafNum = sg.getNTaxa();
+    nodeNum = leafNum;
+
+    // create the ground nodes, first as the leaves
+    NodeVector leaves; // current top node of every clade that has no parent yet
+    vector<Split*> cladetaxa; // cladetaxa[i] is the split whose clade hangs below leaves[i]
+    leaves.resize(leafNum, NULL);
+    cladetaxa.resize(leafNum, NULL);
+    // first add all trivial splits into tree
+    for (it = sg.begin(), count = 0; it != sg.end(); it++, count++) {
+        //(*it)->report(cout);
+        taxid = (*it)->trivial();
+        if (taxid < 0) break; // stop at the first split that is not trivial; `it` is reused below
+        assert(leaves[taxid] == NULL);
+        leaves[taxid] = newNode(taxid, sg.getTaxa()->GetTaxonLabel(taxid).c_str());
+        leaves[taxid]->addNeighbor(NULL, (*it)->getWeight()); // NULL neighbor = parent not known yet
+        cladetaxa[taxid] = (*it);
+    }
+    // every taxon must have a leaf by now (this is only checked, nothing is filled in)
+    for (taxid = 0; taxid < leafNum; taxid++)
+        assert(leaves[taxid]);
+
+    // now add non-trivial splits, continue with the interrupted iterator
+    for (/*it = sg.begin()*/; it != sg.end(); it++) {
+        //(*it)->report(cout);
+        Split *mysp = *it;
+        Node *newnode = newNode(nodeNum);
+        int count = 0; // taxa absorbed into newnode; shadows the outer count
+
+        for (taxid = 0; taxid < leaves.size(); ) // taxid advances only in the else branch
+            if (cladetaxa[taxid]->subsetOf(*mysp)) // clade is a subset of current split
+            {
+                count += cladetaxa[taxid]->countTaxa();
+                double len = leaves[taxid]->updateNeighbor(NULL, newnode); // replace the NULL parent by newnode
+                newnode->addNeighbor(leaves[taxid], len);
+                leaves[taxid] = leaves.back(); // refill the freed slot from the back, then re-test the same index
+                leaves.pop_back();
+                cladetaxa[taxid] = cladetaxa.back();
+                cladetaxa.pop_back();
+            } else taxid++;
+        assert(count == mysp->countTaxa()); // the absorbed clades must add up to the split's taxon count
+        cladetaxa.push_back(mysp);
+        leaves.push_back(newnode);
+
+        newnode->addNeighbor(NULL, mysp->getWeight());
+        nodeNum++;
+    }
+    assert(leaves.size() >= 3);
+    Node *newnode = newNode(nodeNum); // joins all remaining clades and becomes the root
+    for (taxid = 0; taxid < leaves.size(); taxid++) {
+        double len = leaves[taxid]->updateNeighbor(NULL, newnode);
+        newnode->addNeighbor(leaves[taxid], len);
+    }
+    root = newnode;
+    nodeNum++;
+    cladetaxa.clear();
+}
+
+/**
+ * Depth-first search for a node (leaf or internal) whose name equals `name`.
+ * @param name name to look for
+ * @param node node to start from, NULL means the root
+ * @param dad  neighbor that is not descended into
+ * @return the first matching node, or NULL if there is none
+ */
+Node *MTree::findNodeName(string &name, Node *node, Node *dad) {
+    Node *current = node ? node : root;
+    if (current->name == name)
+        return current;
+    FOR_NEIGHBOR_IT(current, dad, it) {
+        Node *hit = findNodeName(name, (*it)->node, current);
+        if (hit != NULL)
+            return hit;
+    }
+    return NULL;
+}
+
+/**
+ * Depth-first search for a leaf whose name equals `name`; internal nodes
+ * with that name are not reported.
+ * @param name name to look for
+ * @param node node to start from, NULL means the root
+ * @param dad  neighbor that is not descended into
+ * @return the first matching leaf, or NULL if there is none
+ */
+Node *MTree::findLeafName(string &name, Node *node, Node *dad) {
+    Node *current = node ? node : root;
+    if (current->isLeaf() && current->name == name)
+        return current;
+    FOR_NEIGHBOR_IT(current, dad, it) {
+        Node *hit = findLeafName(name, (*it)->node, current);
+        if (hit != NULL)
+            return hit;
+    }
+    return NULL;
+}
+
+/**
+ * Depth-first search for a node whose id equals `id`.
+ * @param id   id to look for
+ * @param node node to start from, NULL means the root
+ * @param dad  neighbor that is not descended into
+ * @return the first matching node, or NULL if there is none
+ */
+Node *MTree::findNodeID(int id, Node *node, Node* dad) {
+    Node *current = node ? node : root;
+    if (current->id == id)
+        return current;
+    FOR_NEIGHBOR_IT(current, dad, it) {
+        Node *hit = findNodeID(id, (*it)->node, current);
+        if (hit != NULL)
+            return hit;
+    }
+    return NULL;
+}
+
+
+/**
+ * Multiply the branch lengths stored at every visited node by `norm`,
+ * optionally rounding each result with round().
+ * @param norm     scaling factor
+ * @param make_int true to round the scaled lengths
+ * @param node     node to start from, NULL means the root
+ * @param dad      neighbor that is not descended into
+ */
+void MTree::scaleLength(double norm, bool make_int, Node *node, Node *dad) {
+    if (node == NULL)
+        node = root;
+    // update the neighbor entries stored at this node; only entries pointing
+    // at NULL are skipped here, the one towards dad is included
+    FOR_NEIGHBOR_DECLARE(node, NULL, it) {
+        (*it)->length = (*it)->length * norm;
+        if (make_int) {
+            (*it)->length = round((*it)->length);
+        }
+    }
+
+    // then descend into every neighbor except dad
+    FOR_NEIGHBOR(node, dad, it)
+        scaleLength(norm, make_int, (*it)->node, node);
+}
+
+/**
+ * Replace every branch length len stored at the visited nodes by
+ * exp(-(len / factor)).
+ * @param factor divisor applied before the exponential
+ * @param node   node to start from, NULL means the root
+ * @param dad    neighbor that is not descended into
+ */
+void MTree::transformBranchLenRAX(double factor, Node *node, Node *dad) {
+    if (node == NULL)
+        node = root;
+    // transform the neighbor entries stored at this node (dad's included)
+    FOR_NEIGHBOR_DECLARE(node, NULL, it) {
+        (*it)->length = (*it)->length / factor;
+        (*it)->length = exp(-(*it)->length);
+    }
+
+    // then descend into every neighbor except dad
+    FOR_NEIGHBOR(node, dad, it)
+        transformBranchLenRAX(factor, (*it)->node, node);
+}
+
+/**
+ * Rescale the support values stored as names of internal nodes.
+ * Every non-empty internal node name is parsed with convert_double(),
+ * multiplied by `norm`, optionally rounded, and written back as text.
+ * A parse failure is reported through outError().
+ * @param norm     scaling factor
+ * @param make_int true to round the scaled value with round()
+ * @param node     node to start from, NULL means the root
+ * @param dad      neighbor that is not descended into
+ */
+void MTree::scaleCladeSupport(double norm, bool make_int, Node *node, Node *dad) {
+    if (!node) node = root;
+    if (!node->isLeaf() && !node->name.empty()) {
+        double supp = 0.0;
+        try {
+            supp = convert_double(node->name.c_str());
+        } catch (string str) {
+            outError(str);
+        }
+        supp *= norm;
+        if (make_int)
+            supp = round(supp);
+        // Format the number through a stream. Assuming Node::name is a
+        // std::string (it is used as one throughout this file), `name += supp`
+        // would pick operator+=(char) and store one narrowed character
+        // instead of the decimal text.
+        ostringstream label;
+        label << supp;
+        node->name = label.str();
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        scaleCladeSupport(norm, make_int, (*it)->node, node);
+    }
+}
+
+
+/**
+ * Destructor: releases all nodes reachable from the root.
+ */
+MTree::~MTree()
+{
+    // freeNode() returns immediately when root is NULL, so no guard is needed
+    freeNode();
+    root = NULL;
+}
+
+/**
+ * Delete `node` and everything reachable from it without passing `dad`.
+ * Neighbors are visited from the last to the first.
+ * @param node node to free, NULL means the root
+ * @param dad  neighbor that is not descended into
+ * @return number of nodes deleted (0 when the tree has no root)
+ */
+int MTree::freeNode(Node *node, Node *dad)
+{
+    if (root == NULL)
+        return 0;
+    if (node == NULL)
+        node = root;
+    int freed = 1; // counts `node` itself
+    NeighborVec::reverse_iterator rit = node->neighbors.rbegin();
+    while (rit != node->neighbors.rend()) {
+        Node *next = (*rit)->node;
+        if (next != dad)
+            freed += freeNode(next, node);
+        ++rit;
+    }
+    delete node;
+    return freed;
+}
+
+/**
+ * Return the next significant character of the input.
+ * Characters for which controlchar() is true are skipped, and so are
+ * comments enclosed in '[' ... ']'. in_line and in_column are updated while
+ * reading (character code 10 starts a new line).
+ * @param in         input stream
+ * @param current_ch if '[', it is taken as an already-read comment opener
+ *                   and no character is fetched before comment skipping
+ * @return the character found; 0 if nothing could be read and the stream
+ *         does not throw on failure
+ * @throw const char* when a comment is not closed before the input ends
+ */
+char MTree::readNextChar(istream &in, char current_ch) {
+    // initialised so that a failed first get() (end of input, stream
+    // exceptions disabled) does not leave an indeterminate value to be read
+    char ch = 0;
+    if (current_ch == '[')
+        ch = current_ch;
+    else {
+        in.get(ch);
+        in_column++;
+        if (ch == 10) {
+            in_line++;
+            in_column = 1;
+        }
+    }
+    // skip characters rejected by controlchar()
+    while (controlchar(ch) && !in.eof()) {
+        in.get(ch);
+        in_column++;
+        if (ch == 10) {
+            in_line++;
+            in_column = 1;
+        }
+    }
+    // ignore comment
+    while (ch=='[' && !in.eof()) {
+        // consume up to and including the closing bracket
+        while (ch!=']' && !in.eof()) {
+            in.get(ch);
+            in_column++;
+            if (ch == 10) {
+                in_line++;
+                in_column = 1;
+            }
+        }
+        if (ch != ']') throw "Comments not ended with ]";
+        in_column++;
+        in.get(ch);
+        if (ch == 10) {
+            in_line++;
+            in_column = 1;
+        }
+        // skip what follows the comment; another '[' restarts the outer loop
+        while (controlchar(ch) && !in.eof()) {
+            in_column++;
+            in.get(ch);
+            if (ch == 10) {
+                in_line++;
+                in_column = 1;
+            }
+        }
+    }
+    return ch;
+}
+
+/**
+ * Describe the current input position for error messages.
+ * @return text of the form " (line L column C)", with C = in_column-1
+ */
+string MTree::reportInputInfo() {
+    string info(" (line ");
+    info += convertIntToString(in_line);
+    info += " column ";
+    info += convertIntToString(in_column-1);
+    info += ")";
+    return info;
+}
+
+
+/** Neighbors of one node, keyed by the value sortTaxa() returned for each. */
+typedef map<int, Neighbor*> IntNeighborMap;
+
+/**
+ * Reorder the neighbors of every internal node below `node` by the smallest
+ * taxon id found in each neighbor's subtree. The entry pointing to `dad`
+ * keeps its position.
+ * @param node node to start from; NULL means the root, or the root's first
+ *             neighbor when the root is a leaf
+ * @param dad  neighbor that is not descended into
+ * @return the id of `node` if it is a leaf, otherwise the smallest value
+ *         returned by its subtrees
+ */
+int MTree::sortTaxa(Node *node, Node *dad) {
+    if (!node) {
+        node = root;
+        if (node->isLeaf())
+            node = node->neighbors[0]->node;
+    }
+    if (node->isLeaf())
+        return node->id;
+
+    // collect the subtrees; the map keeps them ordered by their key
+    IntNeighborMap by_min_taxon;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        int subtree_min = sortTaxa((*it)->node, node);
+        by_min_taxon.insert(IntNeighborMap::value_type(subtree_min, (*it)));
+    }
+
+    // write them back in key order, stepping over the slot that holds dad
+    int slot = 0;
+    IntNeighborMap::iterator entry = by_min_taxon.begin();
+    while (entry != by_min_taxon.end()) {
+        if (node->neighbors[slot]->node == dad)
+            slot++;
+        node->neighbors[slot] = entry->second;
+        ++entry;
+        ++slot;
+    }
+
+    return by_min_taxon.begin()->first;
+}
+
+/**
+ * Change two of the drawing characters used by drawTree2():
+ * fig_char[2] becomes '/' and fig_char[4] becomes '\\'.
+ * The other entries are left as they are.
+ */
+void MTree::setExtendedFigChar() {
+    //fig_char[0] = 179;
+    //fig_char[1] = 196;
+    //fig_char[3] = 195;
+    fig_char[4] = '\\';
+    fig_char[2] = '/';
+}
+
+/**
+ * Draw the tree as text on `out`.
+ * The horizontal scale is 60.0 divided by treeDepth() of the start node;
+ * the actual drawing is done by drawTree2(). In debug verbosity the tree is
+ * first printed to cout with printTree().
+ * @param out          destination stream
+ * @param brtype       WT_* flags, forwarded to drawTree2()
+ * @param zero_epsilon forwarded to drawTree2()
+ */
+void MTree::drawTree(ostream &out, int brtype, double zero_epsilon) {
+    if (verbose_mode >= VB_DEBUG) {
+        printTree(cout);
+        cout << endl;
+    }
+    // measure the depth from the root, or from its neighbor if the root is a leaf
+    Node *start = root->isLeaf() ? root->neighbors[0]->node : root;
+    const double scale = 60.0/treeDepth(start);
+    IntVector sub_tree_br;
+    drawTree2(out, brtype, scale, sub_tree_br, zero_epsilon);
+    out << endl;
+}
+
+/*
+void MTree::drawTree(ostream &out, int brtype, double brscale, IntVector &subtree_br, double zero_epsilon, Node *node, Node *dad) {
+    int i, br_len = 3;
+    if (!node) {
+        node = root;
+        if (node->isLeaf()) node = node->neighbors[0]->node;
+    } else {
+
+        if (brtype & WT_BR_SCALE) {
+            br_len = floor(node->findNeighbor(dad)->length * brscale)-1;
+            if (br_len < 3) br_len = 3;
+            //if (!node->isLeaf() && br_len < 4) br_len = 4;
+        }
+        out << '+';
+        if ((brtype & WT_INT_NODE) && !node->isLeaf()) {
+            string str = convertIntToString(node->id);
+            for (i = 0; i < br_len-str.length(); i++) out << '-';
+            out << node->id;
+        } else
+            for (i = 0; i < br_len; i++) out << '-';
+    }
+    if (node->isLeaf()) {
+        out << node->name;
+        if (brtype & WT_TAXON_ID)
+            out << " (" << node->id << ")";
+        out << endl;
+        return;
+    }
+    int descendant_cnt = node->degree();
+    if (dad) descendant_cnt--;
+    int cnt = 0;
+    subtree_br.push_back(br_len);
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        if (cnt == descendant_cnt-1)
+            subtree_br.back() = -subtree_br.back();
+
+        drawTree(out, brtype, brscale, subtree_br, zero_epsilon, (*it)->node, node);
+        cnt++;
+        if (cnt == descendant_cnt) break;
+        for (IntVector::iterator it = subtree_br.begin()+1; it != subtree_br.end(); it++)
+        {
+            if ((*(it-1)) > 0) out << '|';
+            else out << ' ';
+            for (i = 0; i < abs(*it); i++) out << ' ';
+        }
+    }
+    subtree_br.pop_back();
+}
+*/
+
+void MTree::drawTree2(ostream &out, int brtype, double brscale, IntVector &subtree_br, double zero_epsilon, Node *node, Node *dad) { /* Recursively print a text drawing of the subtree at node (entered from dad) to out.
+ * One line is written per leaf and per separator row between children.
+ * brtype is tested against WT_BR_SCALE, WT_TAXON_ID, WT_BR_ID, WT_BR_LEN and
+ * WT_INT_NODE to choose what is printed; brscale converts a branch length
+ * into a number of characters; a branch no longer than zero_epsilon is drawn
+ * with '*' instead of fig_char[1].
+ * subtree_br holds one entry per level of the current path and drives the
+ * indentation printed in front of each line: an entry whose magnitude is
+ * above 1000 makes the following level print a blank instead of fig_char[0],
+ * and 1000 is subtracted again before the entry is used as a width. */
+    int i, br_len = 3;
+    IntVector::iterator ii;
+    bool zero_length = false;
+
+    //cout << "DrawTree2!" << endl;
+    if (!node) {
+        node = root;
+        if (node->isLeaf()) node = node->neighbors[0]->node; // start drawing at the neighbor of a leaf root
+    } else {
+        if (brtype & WT_BR_SCALE) {
+            br_len = floor(node->findNeighbor(dad)->length * brscale)-1;
+            if (br_len < 2) br_len = 2; // minimum drawn width when scaling
+        }
+        if (node->findNeighbor(dad)->length <= zero_epsilon) zero_length = true;
+    }
+    if (node->isLeaf()) {
+        for (ii = subtree_br.begin()+1; ii != subtree_br.end(); ii++) { // indentation for the levels above this leaf
+            if (abs(*(ii-1)) > 1000) out << ' ';
+            else out << fig_char[0];
+            int num = abs(*ii);
+            if (num > 1000) num -= 1000;
+            for (i = 0; i < num; i++) out << ' ';
+        }
+        out << ((node==dad->neighbors.front()->node) ? fig_char[2] : ((node==dad->neighbors.back()->node) ? fig_char[4] : fig_char[3])); // corner depends on whether node is dad's first, last or another neighbor
+        for (i = 0; i < br_len; i++)
+            out << ((zero_length) ? '*' : fig_char[1]);
+        out << node->name;
+        if (brtype & WT_TAXON_ID)
+            out << " (" << node->id << ")";
+        if (brtype & WT_BR_ID)
+            out << " [" << node->neighbors[0]->id << "]";
+        if (brtype & WT_BR_LEN)
+            out << " " << node->neighbors[0]->length;
+        //out << " ";
+        //copy (subtree_br.begin(), subtree_br.end(), ostream_iterator<int> (out, " "));
+        out << endl;
+        return;
+    }
+    int descendant_cnt = node->degree();
+    if (dad) descendant_cnt--; // children = neighbors other than dad
+    int cnt = 0;
+    bool first = true;
+
+    br_len = br_len+1000; // marked value pushed for the first child; reduced again after that child
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        if (cnt == descendant_cnt-1)
+            br_len = -br_len; // the last child is pushed with a negative entry
+        subtree_br.push_back(br_len);
+
+        drawTree2(out, brtype, brscale, subtree_br, zero_epsilon, (*it)->node, node);
+        subtree_br.pop_back();
+        if (br_len > 1000) br_len -= 1000;
+        cnt++;
+        if (cnt == descendant_cnt) break; // no separator row after the last child
+        if (subtree_br.size() > 1)
+            for (ii = subtree_br.begin()+1; ii != subtree_br.end(); ii++) {
+                if (abs(*(ii-1)) > 1000) out << ' ';
+                else out << fig_char[0];
+                if (ii == subtree_br.begin()) continue; // never true: ii starts at begin()+1
+                int num = abs(*ii);
+                if (num > 1000) num -= 1000;
+                for (i = 0; i < num; i++) out << ' ';
+            }
+        if (first) { // the row after the first child carries this node's own branch and labels
+            if (dad) {
+				out << ((node==dad->neighbors.front()->node) ? fig_char[2] : ((node==dad->neighbors.back()->node) ? fig_char[4] : fig_char[3]));
+                for (i = 0; i < abs(br_len); i++)
+                    out << ((zero_length) ? '*' : fig_char[1]);
+            }
+            if (brtype & WT_INT_NODE)
+            	out << node->id;
+            else
+            	out << fig_char[0];
+            if (!node->name.empty())
+                out << " (" << node->name << ")";
+            if (brtype & WT_BR_LEN && dad)
+                out << " " << node->findNeighbor(dad)->length;
+            if (brtype & WT_BR_ID && dad)
+                out << " [" << node->findNeighbor(dad)->id << "]";
+            if (!subtree_br.empty()) { // adjust the caller's entry for this node now that its own row is printed
+                if (subtree_br.back() >1000)
+                    subtree_br.back() -= 1000;
+                else if (subtree_br.back() < 0)
+                    subtree_br.back() -= 1000;
+            }
+        } else {
+            if (dad) {
+                if (abs(subtree_br.back()) > 1000) out << ' ';
+                else out << fig_char[0];
+                for (i = 0; i < abs(br_len); i++)
+                    out << ' ';
+            }
+            out << fig_char[0];
+        }
+        //out << " ";
+        //copy (subtree_br.begin(), subtree_br.end(), ostream_iterator<int> (out, " "));
+        out << endl;
+        first = false;
+    }
+}
+
+/**
+ * Compare the topology of this tree with another one.
+ * Both trees are printed with WT_TAXON_ID | WT_SORT_TAXA, the other tree
+ * starting from its leaf that carries the name of this tree's root, and the
+ * two texts are compared. The root of this tree must be a leaf.
+ * @param tree tree to compare with
+ * @return true if the printed forms are identical; false if they differ or
+ *         the other tree has no leaf with the root's name
+ */
+bool MTree::equalTopology(MTree *tree) {
+    assert(root->isLeaf());
+    Node *other_root = tree->findLeafName(root->name);
+    if (other_root == NULL)
+        return false;
+    const int flags = WT_TAXON_ID | WT_SORT_TAXA;
+    ostringstream mine, theirs;
+    printTree(mine, flags);
+    tree->printTree(theirs, flags, other_root);
+    return mine.str() == theirs.str();
+}
+
+/**
+ * Compute all pairwise leaf distances and write them to a file:
+ * first the number of leaves, then one row per leaf with its name followed
+ * by the leafNum distances of that row.
+ * A stream failure is reported through outError(ERR_WRITE_OUTPUT, filename).
+ * @param filename output file name
+ */
+void MTree::calcDist(char *filename) {
+    vector<string> taxname(leafNum);
+    // row-major leafNum x leafNum matrix
+    double *dist = new double [leafNum * leafNum];
+    calcDist(dist);
+    getTaxaName(taxname);
+
+    try {
+        ofstream out;
+        out.exceptions(ios::failbit | ios::badbit);
+        out.open(filename);
+
+        // now write the distances in phylip .dist format
+        out << leafNum << endl;
+
+        for (int row = 0; row < leafNum; row++) {
+            const double *row_dist = dist + row*leafNum;
+            out << taxname[row] << "   ";
+            for (int col = 0; col < leafNum; col++)
+                out << row_dist[col] << "  ";
+            out << endl;
+        }
+        out.close();
+    } catch (ios::failure) {
+        outError(ERR_WRITE_OUTPUT, filename);
+    }
+    delete [] dist;
+}
+
+/**
+ * Fill the distance matrix: for every leaf met during the traversal, the
+ * distances from that leaf to all leaves are computed with the five-argument
+ * calcDist() overload.
+ * @param dist matrix indexed as dist[id1 * leafNum + id2]
+ * @param node node to start from, NULL means the root
+ * @param dad  neighbor that is not descended into
+ */
+void MTree::calcDist(double* &dist, Node *node, Node *dad) {
+    if (node == NULL)
+        node = root;
+    if (node->isLeaf())
+        calcDist(node, 0.0, dist, node, NULL);
+    FOR_NEIGHBOR_IT(node, dad, it)
+        calcDist(dist, (*it)->node, node);
+}
+
+/**
+ * Walk away from `aroot`, summing branch lengths, and store the path length
+ * to every leaf reached, symmetrically for (aroot, leaf) and (leaf, aroot).
+ * @param aroot   leaf the distances are measured from
+ * @param cur_len path length accumulated up to `node`
+ * @param dist    matrix indexed as dist[id1 * leafNum + id2]
+ * @param node    current node, NULL means the root
+ * @param dad     neighbor that is not descended into
+ */
+void MTree::calcDist(Node *aroot, double cur_len, double* &dist, Node *node, Node *dad) {
+    if (node == NULL)
+        node = root;
+    if (node->isLeaf()) {
+        dist[aroot->id * leafNum + node->id] = cur_len;
+        dist[node->id * leafNum + aroot->id] = cur_len;
+    }
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        double step = (*it)->length;
+        calcDist(aroot, cur_len + step, dist, (*it)->node, node);
+    }
+}
+
+
+/*********************************************
+	class PDTaxaSet
+*********************************************/
+
+/**
+ * Store in tree_str the text that MTree::printSubTree() produces for the
+ * given subtree.
+ * @param tree    tree the subtree belongs to
+ * @param subtree nodes passed on to printSubTree()
+ */
+void PDTaxaSet::setSubTree(MTree &tree, NodeVector &subtree) {
+    stringstream buffer;
+    tree.printSubTree(buffer, subtree);
+    tree_str = buffer.str();
+}
+
+/**
+ * Initialise this set from a tree: the taxa come from tree.getTaxa(), the
+ * score from tree.treeLength(), and tree_str from tree.printTree().
+ * @param tree source tree
+ */
+void PDTaxaSet::setTree(MTree &tree) {
+    tree.getTaxa(*this);
+    score = tree.treeLength();
+
+    stringstream buffer;
+    tree.printTree(buffer);
+    tree_str = buffer.str();
+}
+
+
+/**
+ * Write the names of the taxa in this set, one per line, leaving out any
+ * taxon named ROOT_NAME.
+ * @param out destination stream
+ */
+void PDTaxaSet::printTaxa(ostream &out) {
+    for (iterator it = begin(); it != end(); ++it) {
+        if ((*it)->name == ROOT_NAME)
+            continue;
+        out << (*it)->name << endl;
+    }
+}
+
+/**
+ * Write the taxon list to a file and announce it on cout.
+ * A stream failure is reported through outError(ERR_WRITE_OUTPUT, filename).
+ * @param filename output file name
+ */
+void PDTaxaSet::printTaxa(char *filename) {
+    try {
+        ofstream taxa_file;
+        // make open and write errors throw ios::failure
+        taxa_file.exceptions(ios::failbit | ios::badbit);
+        taxa_file.open(filename);
+        printTaxa(taxa_file);
+        taxa_file.close();
+        cout << "Taxa list was printed to " << filename << endl;
+    } catch (ios::failure) {
+        outError(ERR_WRITE_OUTPUT, filename);
+    }
+}
+
+/**
+ * Write the stored tree string followed by a newline; nothing is written
+ * when the string is empty.
+ * @param out destination stream
+ */
+void PDTaxaSet::printTree(ostream &out) {
+    if (tree_str.empty())
+        return;
+    out << tree_str << endl;
+}
+
+/**
+ * Write the stored tree string to a file and announce it on cout.
+ * A stream failure is reported through outError(ERR_WRITE_OUTPUT, filename).
+ * @param filename output file name
+ */
+void PDTaxaSet::printTree(char *filename) {
+    try {
+        ofstream tree_file;
+        // make open and write errors throw ios::failure
+        tree_file.exceptions(ios::failbit | ios::badbit);
+        tree_file.open(filename);
+        printTree(tree_file);
+        tree_file.close();
+        cout << "Tree was printed to " << filename << endl;
+    } catch (ios::failure) {
+        outError(ERR_WRITE_OUTPUT, filename);
+    }
+}
+
+
+/**
+ * Express this taxa set as a Split: id_set is resized to ntaxa, weighted
+ * with the score, and the id of every taxon in the set is added to it.
+ * @param ntaxa  total number of taxa
+ * @param id_set split to fill
+ */
+void PDTaxaSet::makeIDSet(int ntaxa, Split &id_set) {
+    id_set.setNTaxa(ntaxa);
+    id_set.setWeight(score);
+    iterator member = begin();
+    while (member != end()) {
+        id_set.addTaxon((*member)->id);
+        ++member;
+    }
+}
+
+/**
+ * Write the names of all internal nodes (as returned by getInternalNodes())
+ * on a single line, each preceded by a space.
+ * A stream failure is reported through outError(ERR_WRITE_OUTPUT, out_file).
+ * @param out_file output file name
+ */
+void MTree::writeInternalNodeNames(string &out_file) {
+    try {
+        ofstream out;
+        // enable exceptions before opening, like the other writers in this
+        // file; without this the catch below can never be reached and a
+        // failed open or write goes unnoticed
+        out.exceptions(ios::failbit | ios::badbit);
+        out.open(out_file.c_str());
+        NodeVector nodes;
+        getInternalNodes(nodes);
+        for (NodeVector::iterator nit = nodes.begin(); nit != nodes.end(); nit++) {
+            out  << " " << (*nit)->name;
+        }
+        out << endl;
+        out.close();
+    } catch (ios::failure) {
+        outError(ERR_WRITE_OUTPUT, out_file);
+    }
+}
+
+/**
+ * Set the id of every leaf to the integer parsed from its name with atoi().
+ * The resulting id is asserted to lie in [0, leafNum).
+ * @param node node to start from, NULL means the root
+ * @param dad  neighbor that is not descended into
+ */
+void MTree::assignLeafID(Node *node, Node *dad) {
+    if (node == NULL)
+        node = root;
+    if (node->isLeaf()) {
+        const int parsed = atoi(node->name.c_str());
+        node->id = parsed;
+        assert(node->id >= 0 && node->id < leafNum);
+    }
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        assignLeafID((*it)->node, node);
+    }
+}
+
+/**
+ * Add to `taxa` the id of every leaf reachable from `node` without passing
+ * through `dad`.
+ * @param taxa split that receives the leaf ids
+ * @param node node to start from, NULL means the root
+ * @param dad  neighbor that is not descended into
+ */
+void MTree::getTaxa(Split &taxa, Node *node, Node *dad) {
+    if (node == NULL)
+        node = root;
+    if (node->isLeaf())
+        taxa.addTaxon(node->id);
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        getTaxa(taxa, (*it)->node, node);
+    }
+}
+
+
+/**
+ * For every internal branch below `node`, append to `subtrees` the taxon
+ * sets of the subtrees attached to its two end nodes: first those hanging
+ * off the child end, then those hanging off the parent end. Both end nodes
+ * are asserted to have degree 3, and the collected sets are asserted to
+ * cover leafNum taxa in total. Branches of deeper levels are emitted first.
+ * @param subtrees output; the Split objects are newly allocated
+ * @param node     node to start from, NULL means the root
+ * @param dad      neighbor that is not descended into
+ */
+void MTree::extractQuadSubtrees(vector<Split*> &subtrees, Node *node, Node *dad) {
+    if (node == NULL)
+        node = root;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        Node *child = (*it)->node;
+        extractQuadSubtrees(subtrees, child, node);
+        if (child->isLeaf())
+            continue;
+        // internal branch node -- child
+        assert(node->degree() == 3 && child->degree() == 3);
+        int covered = 0;
+        // subtrees on the child side
+        FOR_NEIGHBOR_DECLARE(child, node, side) {
+            Split *part = new Split(leafNum);
+            getTaxa(*part, (*side)->node, child);
+            subtrees.push_back(part);
+            covered += part->countTaxa();
+        }
+        // subtrees on the parent side
+        FOR_NEIGHBOR(node, child, side) {
+            Split *part = new Split(leafNum);
+            getTaxa(*part, (*side)->node, node);
+            subtrees.push_back(part);
+            covered += part->countTaxa();
+        }
+        assert(covered == leafNum);
+    }
+}
+
+
+/**
+ * Open a file of trees and pass it to assignBranchSupport(istream&).
+ * A stream failure is reported through outError(ERR_READ_INPUT, trees_file).
+ * @param trees_file name of the file containing the input trees
+ */
+void MTree::assignBranchSupport(const char *trees_file) {
+    cout << "Reading input trees file " << trees_file << endl;
+    try {
+        ifstream tree_stream;
+        // make open and read errors throw ios::failure
+        tree_stream.exceptions(ios::failbit | ios::badbit);
+        tree_stream.open(trees_file);
+        assignBranchSupport(tree_stream);
+        tree_stream.close();
+    } catch (ios::failure) {
+        outError(ERR_READ_INPUT, trees_file);
+    }
+}
+
+void MTree::assignBranchSupport(istream &in) { /* Annotate the internal nodes of this tree with support values computed from the trees read from in.
+ * For every input tree, an internal split of this tree is "decisive" when
+ * each of its four surrounding subtrees overlaps the taxa of the input tree.
+ * For a decisive split the counter decisive_counts is incremented, and the
+ * split weight is incremented when its restriction to the input tree's taxa
+ * is found there as a non-trivial split. After all trees are read, the value
+ * round(weight/decisive_count*1000)/10 (or "0") is appended to the name of
+ * every non-leaf node, separated by "/" from an existing name.
+ * Input trees may contain a subset of this tree's taxa; an unknown taxon is
+ * reported through outError(). */
+	SplitGraph mysg;
+	NodeVector mynodes; // mynodes[i] is the node pushed together with split mysg[i]
+	convertSplits(mysg, &mynodes, root->neighbors[0]->node);
+	vector<Split*> subtrees; // consumed four entries at a time per internal split (see qid below)
+	extractQuadSubtrees(subtrees, root->neighbors[0]->node);
+	IntVector decisive_counts;
+	decisive_counts.resize(mynodes.size(), 0);
+	StrVector occurence_trees; // list of tree IDs where each split occurs
+	if (verbose_mode >= VB_MED)
+		occurence_trees.resize(mynodes.size());
+	SplitGraph::iterator sit;
+	for (sit = mysg.begin(); sit != mysg.end(); sit++)
+		(*sit)->setWeight(0.0); // the weight is reused as the occurrence counter
+	int ntrees, taxid;
+	for (ntrees = 1; !in.eof(); ntrees++) {
+		MTree tree;
+		bool is_rooted = false;
+
+		// read in the tree and convert into split system for indexing
+		tree.readTree(in, is_rooted);
+		if (verbose_mode >= VB_DEBUG)
+			cout << ntrees << " " << endl;
+		StrVector taxname;
+		tree.getTaxaName(taxname);
+		// create the map from taxa between 2 trees
+		Split taxa_mask(leafNum); // taxa of this tree that also occur in the input tree
+		for (StrVector::iterator it = taxname.begin(); it != taxname.end(); it++) {
+			taxid = mysg.findLeafName(*it);
+			if (taxid < 0)
+				outError("Taxon not found in full tree: ", *it);
+			taxa_mask.addTaxon(taxid);
+		}
+		// make the taxa ordering right before converting to split system
+		taxname.clear();
+		int smallid;
+		for (taxid = 0, smallid = 0; taxid < leafNum; taxid++) // renumber the input tree's leaves in the order of this tree's taxon ids
+			if (taxa_mask.containTaxon(taxid)) {
+				taxname.push_back(mysg.getTaxa()->GetTaxonLabel(taxid));
+				string name = (string)mysg.getTaxa()->GetTaxonLabel(taxid);
+				tree.findLeafName(name)->id = smallid++;
+			}
+		assert(taxname.size() == tree.leafNum);
+
+		SplitGraph sg;
+		//NodeVector nodes;
+		tree.convertSplits(sg);
+		SplitIntMap hash_ss; // lookup structure over the input tree's splits
+		for (sit = sg.begin(); sit != sg.end(); sit++)
+			hash_ss.insertSplit((*sit), 1);
+
+		// now scan through all splits in current tree
+		int id, qid; // id indexes mysg/mynodes, qid indexes subtrees
+		for (sit = mysg.begin(), id = 0, qid = 0; sit != mysg.end(); sit++, id++)
+		if ((*sit)->trivial() < 0) // it is an internal split
+		{
+
+			bool decisive = true;
+			for (int i = 0; i < 4; i++) {
+				if (!taxa_mask.overlap(*subtrees[qid+i])) {
+					decisive = false;
+					break;
+				}
+			}
+			qid += 4; // advance even when the split is skipped below
+			if (!decisive) continue;
+
+			decisive_counts[id]++;
+			Split *subsp = (*sit)->extractSubSplit(taxa_mask); // deleted at the end of this iteration
+			if (subsp->shouldInvert())
+				subsp->invert();
+			Split *sp = hash_ss.findSplit(subsp);
+			if (sp && sp->trivial() < 0) { // count only matches that are non-trivial in the input tree
+				(*sit)->setWeight((*sit)->getWeight()+1.0);
+				if (verbose_mode >= VB_MED)
+					occurence_trees[id] += convertIntToString(ntrees) + " ";
+				if (verbose_mode >= VB_MAX) {
+					for (taxid = 0; taxid < (*sit)->getNTaxa(); taxid++)
+						if ((*sit)->containTaxon(taxid))
+							cout << " " << mysg.getTaxa()->GetTaxonLabel(taxid);
+					cout << " --> ";
+					for (taxid = 0; taxid < sp->getNTaxa(); taxid++)
+						if (sp->containTaxon(taxid))
+							cout << " " << taxname[taxid];
+					cout << endl;
+				}
+			}
+			delete subsp;
+		}
+
+		char ch;
+		in.exceptions(ios::goodbit); // probe for more input without throwing at end of file
+		(in) >> ch;
+		if (in.eof()) break;
+		in.unget(); // put the probed character back for the next readTree()
+		in.exceptions(ios::failbit | ios::badbit);
+
+	}
+
+	cout << ntrees << " trees read" << endl;
+
+	for (int i = 0; i < mysg.size(); i++)
+	if (!mynodes[i]->isLeaf())
+	{
+		stringstream tmp;
+		if (!mynodes[i]->name.empty())
+			tmp << "/"; // separator from the name the node already has
+		if (mysg[i]->getWeight() == 0.0)
+			tmp << "0";
+		else
+			tmp << round((mysg[i]->getWeight()/decisive_counts[i])*1000)/10; // percentage with one decimal
+		if (verbose_mode >= VB_MED)
+			tmp << "%" << decisive_counts[i];
+		if (!mynodes[i]->isLeaf()) mynodes[i]->name.append(tmp.str()); // the isLeaf test repeats the enclosing condition
+		if (verbose_mode >= VB_MED) {
+			cout << mynodes[i]->name << " " << occurence_trees[i] << endl;
+		}
+	}
+	for (vector<Split*>::reverse_iterator it = subtrees.rbegin(); it != subtrees.rend(); it++)
+		delete (*it);
+}
+
+/**
+ * Open a file of trees and pass it to computeRFDist(istream&, IntVector&).
+ * A stream failure is reported through outError(ERR_READ_INPUT, trees_file).
+ * @param trees_file name of the file containing the input trees
+ * @param dist       receives one value per tree read
+ */
+void MTree::computeRFDist(const char *trees_file, IntVector &dist) {
+    cout << "Reading input trees file " << trees_file << endl;
+    try {
+        ifstream tree_stream;
+        // make open and read errors throw ios::failure
+        tree_stream.exceptions(ios::failbit | ios::badbit);
+        tree_stream.open(trees_file);
+        computeRFDist(tree_stream, dist);
+        tree_stream.close();
+    } catch (ios::failure) {
+        outError(ERR_READ_INPUT, trees_file);
+    }
+}
+
+void MTree::computeRFDist(istream &in, IntVector &dist) { /* Append to dist one distance value per tree read from in.
+ * For each input tree the internal splits of this tree are restricted to the
+ * taxa of the input tree and looked up among the input tree's splits; the
+ * value pushed is
+ *   (branchNum-leafNum) + (tree.branchNum-tree.leafNum) - 2*common_splits.
+ * An unknown taxon in an input tree is reported through outError(). */
+	SplitGraph mysg;
+	convertSplits(mysg, NULL, root->neighbors[0]->node);
+	SplitGraph::iterator sit;
+	for (sit = mysg.begin(); sit != mysg.end(); sit++)
+		(*sit)->setWeight(0.0);
+	int ntrees, taxid;
+	for (ntrees = 1; !in.eof(); ntrees++) {
+		MTree tree;
+		bool is_rooted = false;
+
+		// read in the tree and convert into split system for indexing
+		tree.readTree(in, is_rooted);
+		if (verbose_mode >= VB_DEBUG)
+			cout << ntrees << " " << endl;
+		StrVector taxname;
+		tree.getTaxaName(taxname);
+		// create the map from taxa between 2 trees
+		Split taxa_mask(leafNum); // taxa of this tree that also occur in the input tree
+		for (StrVector::iterator it = taxname.begin(); it != taxname.end(); it++) {
+			taxid = mysg.findLeafName(*it);
+			if (taxid < 0)
+				outError("Taxon not found in full tree: ", *it);
+			taxa_mask.addTaxon(taxid);
+		}
+		// make the taxa ordering right before converting to split system
+		taxname.clear();
+		int smallid;
+		for (taxid = 0, smallid = 0; taxid < leafNum; taxid++) // renumber the input tree's leaves in the order of this tree's taxon ids
+			if (taxa_mask.containTaxon(taxid)) {
+				taxname.push_back(mysg.getTaxa()->GetTaxonLabel(taxid));
+				string name = (string)mysg.getTaxa()->GetTaxonLabel(taxid);
+				tree.findLeafName(name)->id = smallid++;
+			}
+		assert(taxname.size() == tree.leafNum);
+
+		SplitGraph sg;
+		//NodeVector nodes;
+		tree.convertSplits(sg);
+		SplitIntMap hash_ss; // lookup structure over the input tree's splits
+		for (sit = sg.begin(); sit != sg.end(); sit++)
+			hash_ss.insertSplit((*sit), 1);
+
+		// now scan through all splits in current tree
+		int common_splits = 0;
+		for (sit = mysg.begin(); sit != mysg.end(); sit++)
+		if ((*sit)->trivial() < 0) // it is an internal split
+		{
+
+			Split *subsp = (*sit)->extractSubSplit(taxa_mask); // deleted at the end of this iteration
+			if (subsp->shouldInvert())
+				subsp->invert();
+			Split *sp = hash_ss.findSplit(subsp);
+			if (sp) { // NOTE(review): unlike assignBranchSupport, a match that is trivial in the input tree is not excluded here - confirm this is intended
+				common_splits++;
+				//(*sit)->setWeight((*sit)->getWeight()+1.0);
+				if (verbose_mode >= VB_MAX) {
+					for (taxid = 0; taxid < (*sit)->getNTaxa(); taxid++)
+						if ((*sit)->containTaxon(taxid))
+							cout << " " << mysg.getTaxa()->GetTaxonLabel(taxid);
+					cout << " --> ";
+					for (taxid = 0; taxid < sp->getNTaxa(); taxid++)
+						if (sp->containTaxon(taxid))
+							cout << " " << taxname[taxid];
+					cout << endl;
+				}
+			}
+			delete subsp;
+		}
+
+		//cout << "common_splits = " << common_splits << endl;
+        int rf_val = branchNum-leafNum + tree.branchNum-tree.leafNum - 2*common_splits; // presumably branchNum-leafNum is the number of internal branches - TODO confirm
+		dist.push_back(rf_val);
+		char ch;
+		in.exceptions(ios::goodbit); // probe for more input without throwing at end of file
+		(in) >> ch;
+		if (in.eof()) break;
+		in.unget(); // put the probed character back for the next readTree()
+		in.exceptions(ios::failbit | ios::badbit);
+
+	}
+
+//	cout << ntrees << " trees read" << endl;
+
+
+}
+
+/**
+ * Insert new leaves into the tree, each next to an existing leaf.
+ * The positions 0..new_taxa.size()-1 are shuffled with my_random_shuffle();
+ * for each position p taken in that order, new_taxa[p] is attached through a
+ * new internal node placed on the terminal branch of the leaf named
+ * existing_taxa[p]. The new internal node takes over the old branch length
+ * towards the rest of the tree; both leaf branches get length 0.
+ * Afterwards leafNum is increased and initializeTree() is called.
+ * @param new_taxa      names of the leaves to add
+ * @param existing_taxa names of the leaves to attach to, indexed like new_taxa
+ */
+void MTree::insertTaxa(StrVector &new_taxa, StrVector &existing_taxa) {
+    if (new_taxa.empty()) return;
+    IntVector order;
+    order.resize(new_taxa.size());
+    for (int k = 0; k < order.size(); k++)
+        order[k] = k;
+    // randomize order before reinsert back into tree
+    my_random_shuffle(order.begin(), order.end());
+
+    for (int k = 0; k < new_taxa.size(); k++) {
+        const int pick = order[k];
+        Node *old_taxon = findLeafName(existing_taxa[pick]);
+        assert(old_taxon);
+        double len = old_taxon->neighbors[0]->length;
+        Node *old_node = old_taxon->neighbors[0]->node;
+        Node *new_taxon = newNode(leafNum+k, new_taxa[pick].c_str());
+        Node *new_node = newNode();
+        // new_taxon <-> new_node
+        new_taxon->addNeighbor(new_node, 0.0);
+        new_node->addNeighbor(new_taxon, 0.0);
+        // old_taxon <-> new_node
+        new_node->addNeighbor(old_taxon, 0.0);
+        old_taxon->updateNeighbor(old_node, new_node, 0.0);
+        // old_node <-> new_node, keeping the original branch length
+        new_node->addNeighbor(old_node, len);
+        old_node->updateNeighbor(old_taxon, new_node, len);
+    }
+
+    leafNum = leafNum + new_taxa.size();
+    initializeTree();
+}
+
+/**
+ * Walk down from `node`, always following the first neighbor that is not
+ * `dad`, until a leaf is reached. At most nodeNum steps are taken.
+ * @param node node to start from, NULL means the root
+ * @param dad  neighbor to walk away from
+ * @return the leaf reached, or NULL if none was found within nodeNum steps
+ */
+Node *MTree::findFirstTaxon(Node *node, Node *dad) {
+    if (node == NULL)
+        node = root;
+    for (int step = 0; step < nodeNum; step++)
+        FOR_NEIGHBOR_IT(node, dad, it) {
+            Node *next = (*it)->node;
+            if (next->isLeaf())
+                return next;
+            // move one level down and restart the scan from there
+            dad = node;
+            node = next;
+            break;
+        }
+    return NULL;
+}
+
+void MTree::removeTaxa(StrVector &taxa_names) { /* Remove the leaves named in taxa_names from the tree.
+ * Names that match no leaf are skipped. When the removed leaf is the root,
+ * the leaf returned by findFirstTaxon() becomes the new root. If the node
+ * the leaf was attached to keeps at most two other neighbors, those two are
+ * connected directly with the summed branch length; otherwise only the
+ * entry for the leaf is erased from that node. If at least one leaf was
+ * removed, the remaining leaves get consecutive ids starting at 0, leafNum
+ * is updated and initializeTree() is called. */
+	if (taxa_names.empty()) return;
+	int count = 0; // number of leaves actually removed
+	for (StrVector::iterator sit = taxa_names.begin(); sit != taxa_names.end(); sit++) {
+		Node *node = findLeafName(*sit);
+		if (!node) continue;
+		count++;
+//		if (!node)
+//			outError((string)"Taxon " + (*sit) + " does not appear in the tree");
+		if (node == root)
+		{	// find another root
+			root = findFirstTaxon(root); // NOTE(review): findFirstTaxon can return NULL - confirm another leaf always exists here
+		}
+
+		Node *innode = node->neighbors[0]->node; // the node the leaf hangs on
+		Node *othernodes[2] = { NULL, NULL }; // first two neighbors of innode other than the leaf
+		int i;
+		double length = 0; // sum of the lengths of innode's other branches
+
+		bool should_merge = true; // NOTE(review): stays true when innode has fewer than two other neighbors, leaving othernodes[1] NULL - verify callers
+
+		FOR_NEIGHBOR_DECLARE(innode, node, it)	{
+			length += (*it)->length;
+			if (othernodes[0] == NULL)
+				othernodes[0] = (*it)->node;
+			else if (othernodes[1] == NULL)
+				othernodes[1] = (*it)->node;
+			else
+				should_merge = false; // a third other neighbor: innode stays in the tree
+		}
+
+		if (should_merge)
+		{
+			// merge two branches
+			for (i = 0; i < 2; i++)
+				for (it = othernodes[i]->neighbors.begin(); it != othernodes[i]->neighbors.end(); it++)
+					if ((*it)->node == innode)
+					{
+						(*it)->node = othernodes[1-i]; // bypass innode
+						(*it)->length = length;
+					}
+			// NOTE(review): innode is bypassed but not deleted in this function - confirm who frees it
+		} else {
+			// simple delete the neighbor of innode
+			for (it = innode->neighbors.begin(); it != innode->neighbors.end(); it++)
+				if ((*it)->node == node) {
+					innode->neighbors.erase(it); // NOTE(review): only the pointer is erased - confirm who frees the Neighbor object
+					break;
+				}
+		}
+		delete node;
+	}
+
+	if (!count) return;
+
+	NodeVector taxa;
+	getTaxa(taxa);
+	assert(taxa.size() > 0);
+	// reassign taxon IDs
+	int id = 0;
+	for (NodeVector::iterator nit = taxa.begin(); nit != taxa.end(); nit++, id++)
+		(*nit)->id = id;
+	leafNum = taxa.size();
+	initializeTree();
+}
diff --git a/mtree.h b/mtree.h
new file mode 100644
index 0000000..2a094c0
--- /dev/null
+++ b/mtree.h
@@ -0,0 +1,788 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MTREE_H
+#define MTREE_H
+
+#include "node.h"
+//#include "splitgraph.h"
+#include "split.h"
+#include <iostream>
+#include <sstream>
+#include "hashsplitset.h"
+#include "splitset.h"
+
+const char ROOT_NAME[] = "_root"; // NOTE(review): presumably the label given to the artificial root node of rooted trees -- confirm
+
+class SplitGraph; // forward declaration: this header only uses SplitGraph through references and pointers
+
+/**
+General-purpose tree: stores the root node plus node, leaf and branch counts, and declares the routines to read, print, query and edit the topology.
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+ */
+class MTree {
+public:
+
+    /********************************************************
+            CONSTRUCTORs, INITIALIZATION AND DESTRUCTORs
+     ********************************************************/
+
+    /**
+            constructor, read tree from user file
+            @param userTreeFile the name of the user tree
+            @param is_rooted (IN/OUT) true if tree is rooted
+     */
+    MTree(const char *userTreeFile, bool &is_rooted);
+
+    /**
+            constructor, get from another tree
+            @param tree another MTree
+     */
+    MTree(MTree &tree);
+
+    /**
+            constructor
+     */
+    MTree();
+
+    /**
+            copy the tree structure into this tree
+            @param tree the tree to copy
+     */
+    virtual void copyTree(MTree *tree);
+
+    /**
+            copy the sub-tree structure into this tree
+            @param tree the tree to copy
+            @param taxa_set 0-1 string of length leafNum (1 to keep the leaf)
+     */
+    virtual void copyTree(MTree *tree, string &taxa_set);
+    /** NOTE(review): presumably the recursive helper behind copyTree(tree, taxa_set); len looks like an in/out branch length -- confirm */
+    Node* copyTree(MTree *tree, string &taxa_set, double &len, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            initialize the tree from a NEWICK tree file
+            @param userTreeFile the name of the user tree
+            @param is_rooted (IN/OUT) true if tree is rooted
+     */
+    void init(const char *userTreeFile, bool &is_rooted);
+
+    /**
+            initialize tree, get from another tree
+            @param tree another MTree
+     */
+    void init(MTree &tree);
+
+
+    /**
+            destructor
+     */
+    virtual ~MTree();
+
+    /**
+            allocate a new node. Override this if you have an inherited Node class.
+            @param node_id node ID
+            @param node_name node name
+            @return a new node
+     */
+    virtual Node* newNode(int node_id = -1, const char* node_name = NULL);
+    /** overload of newNode() whose node name is given as an integer */
+    virtual Node* newNode(int node_id, int node_name);
+
+
+    /**
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @return the number of branches with zero length ( <= epsilon)
+     */
+    int countZeroBranches(Node *node = NULL, Node *dad = NULL, double epsilon = 0.000001);
+
+    /**
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @return the number of internal branches with zero length ( <= epsilon)
+     */
+    int countZeroInternalBranches(Node *node = NULL, Node *dad = NULL, double epsilon = 0.000001);
+
+	/**
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+		@return the number of long branches (NOTE(review): presumably those at least epsilon long -- confirm)
+	*/
+	int countLongBranches(Node *node = NULL, Node *dad = NULL, double epsilon = 8.8);
+    /********************************************************
+            PRINT INFORMATION
+     ********************************************************/
+
+	/** @return true if tree is bifurcating, false otherwise */
+	bool isBifurcating(Node *node = NULL, Node *dad = NULL);
+    /**
+            print branch-length information to the output stream \a out
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void printBranchLengths(ostream &out, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            print the tree to the output file in newick format
+            @param outfile the output file.
+            @param brtype type of branch to print
+     */
+    void printTree(const char *outfile, int brtype = WT_BR_LEN);
+
+    /**
+            print the tree to the output stream in newick format
+            @param out the output stream.
+            @param brtype type of branch to print
+     */
+    void printTree(ostream & out, int brtype = WT_BR_LEN);
+
+    /** NOTE(review): presumably the tree as a newick string, as printTree() would write it -- confirm */
+    string getTreeString();
+
+    /**
+            print the tree to the output stream in newick format
+            @param out the output stream.
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param brtype type of branch to print
+            @return ID of the taxon with smallest ID
+     */
+    int printTree(ostream &out, int brtype, Node *node, Node *dad = NULL);
+
+
+    /**
+            print the sub-tree to the output stream in newick format
+            @param out the output stream.
+            @param subtree list of nodes (internal & external) contained in the new tree
+     */
+    void printSubTree(ostream &out, NodeVector &subtree);
+
+    /**
+            print the sub-tree to the output stream in newick format
+            @param out the output stream.
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param subtree list of nodes (internal & external) contained in the new tree
+     */
+    void printSubTree(ostream &out, NodeVector &subtree, Node *node, Node *dad = NULL);
+
+
+    /**
+            print the taxa set to the output file
+            @param outfile the output file.
+     */
+    void printTaxa(const char *outfile);
+
+    /**
+            print the taxa set to the output stream
+            @param out the output stream.
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void printTaxa(ostream &out, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            print the taxa set of a given subtree
+            @param out the output stream.
+            @param subtree the subtree vector
+     */
+    void printTaxa(ostream &out, NodeVector &subtree);
+
+    void writeInternalNodeNames(string &out_file);
+
+    /********************************************************
+            DRAW TREE
+     ********************************************************/
+
+    /**
+            Sort the taxa by their IDs
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @return smallest taxon ID of the subtree
+     */
+    int sortTaxa(Node *node = NULL, Node *dad = NULL);
+
+	void drawTree(ostream &out, int brtype = WT_BR_SCALE + WT_INT_NODE, double zero_epsilon = 2e-6);
+
+	/** OBSOLETE:
+	void drawTree(ostream &out, int brtype, double brscale, IntVector &sub_tree_br, double zero_epsilon,
+            Node *node = NULL, Node *dad = NULL);
+    */
+
+	void drawTree2(ostream &out, int brtype, double brscale, IntVector &sub_tree_br, double zero_epsilon,
+            Node *node = NULL, Node *dad = NULL);
+
+    /**
+     * @param tree the other tree to compare with
+     * @return TRUE if this tree is topologically equal to tree
+     */
+    bool equalTopology(MTree *tree);
+
+    /********************************************************
+            READ TREE FROM FILE
+     ********************************************************/
+
+    /**
+            read the tree from the input file in newick format
+            @param infile the input file name.
+            @param is_rooted (IN/OUT) true if tree is rooted
+     */
+    virtual void readTree(const char *infile, bool &is_rooted);
+
+    /**
+            read the tree from the input stream in newick format
+            @param in the input stream.
+            @param is_rooted (IN/OUT) true if tree is rooted
+     */
+    virtual void readTree(istream &in, bool &is_rooted);
+
+    /**
+            parse the tree from the input stream in newick format
+            @param infile the input stream
+            @param ch (IN/OUT) current char
+            @param root (IN/OUT) the root of the (sub)tree
+            @param branch_len (OUT) branch length associated to the current root
+		
+     */
+    void parseFile(istream &infile, char &ch, Node* &root, double &branch_len);
+
+
+    /**
+            initialize tree, set node structure
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void initializeTree(Node *node = NULL, Node* dad = NULL);
+
+
+    /********************************************************
+            GET INFORMATION
+     ********************************************************/
+
+    /**
+            @return sum of all branch lengths
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    double treeLength(Node *node = NULL, Node *dad = NULL);
+
+    /**
+            @return sum length of all internal branches
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    double treeLengthInternal(double epsilon, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            @return maximum path length from root node to taxa
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    double treeDepth(Node *node = NULL, Node *dad = NULL);
+    /**
+        get the descending taxa ID list below the node
+        @param node the starting node, NULL to start from the root
+        @param dad dad of the node, used to direct the search
+        @param taxa (OUT) taxa ID
+     */
+    void getTaxaID(vector<int> &taxa, Node *node = NULL, Node *dad = NULL);
+
+    /**
+     * get all nodes within a subtree
+     * TODO: This is probably identical with getTaxa
+     * @param node root of the subtree
+     * @param dad node to define the subtree
+     * @param nodeList (OUT) vector containing all nodes of the subtree
+     */
+    void getAllNodesInSubtree(Node *node, Node *dad, NodeVector &nodeList);
+
+    /**
+     * get number of taxa below the node
+     * @param node the starting node, NULL to start from the root
+     * @param dad dad of the node, used to direct the search
+     * @return number of taxa
+     */
+    int getNumTaxa(Node *node = NULL, Node *dad = NULL);
+
+    /**
+            get the descending taxa below the node
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param taxa (OUT) vector of taxa
+     */
+    void getTaxa(NodeVector &taxa, Node *node = NULL, Node *dad = NULL);
+
+    /**
+     	get all descending taxa, separated into cherry and non-cherry positions
+  		@param node the starting node, NULL to start from the root
+        @param dad dad of the node, used to direct the search
+        @param noncherry (OUT) vector of non-cherry taxa
+        @param cherry (OUT) vector of cherry taxa
+     */
+    void getNonCherryLeaves(NodeVector &noncherry, NodeVector &cherry, Node *node = NULL, Node *dad = NULL);
+
+	/**
+		get the descending taxa below the node
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+		@param taxa (OUT) the taxa, recorded in a Split
+	*/
+	void getTaxa(Split &taxa, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            get the descending taxa below the node
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param taxa (OUT) vector of taxa
+     */
+    void getOrderedTaxa(NodeVector &taxa, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            get the descending taxa names below the node
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param taxname (OUT) taxa name
+     */
+    void getTaxaName(vector<string> &taxname, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            get the descending internal nodes below \a node
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param nodes (OUT) vector of internal nodes
+     */
+    void getInternalNodes(NodeVector &nodes, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            get the descending internal branches below \a node
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param nodes (OUT) vector of one end node of branch
+            @param nodes2 (OUT) vector of the other end node of branch
+            @param excludeSplits do not collect branches in here
+     */
+    void getAllInnerBranches(vector<Node*> &nodes, vector<Node*> &nodes2, SplitGraph* excludeSplits = NULL, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            get all descending branches below the node
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param nodes (OUT) vector of one end node of branch
+            @param nodes2 (OUT) vector of the other end node of branch
+     */
+    void getBranches(NodeVector &nodes, NodeVector &nodes2, Node *node = NULL, Node *dad = NULL);
+
+    /**
+     *      get all descending internal branches below \a node and \a dad up to depth \a depth
+     *      @param[in] depth collect all internal branches up to distance \a depth from the current branch
+     *      @param[in] node one of the 2 nodes of the current branch
+     *      @param[in] dad the other of the 2 nodes of the current branch
+     *      @param[out] nodes1 contains one end of each collected branch
+     *      @param[out] nodes2 contains the other end of each collected branch
+     */
+    void getInnerBranches(NodeVector& nodes1, NodeVector& nodes2, int depth, Node *node, Node *dad);
+
+    /**
+     *  @brief check whether branch (node1, node2) exists in the branch vectors (nodes1, nodes2)
+     */
+    bool branchExist(Node* node1, Node* node2, NodeVector& nodes1, NodeVector& nodes2);
+
+    /**
+     * @brief check if the branch is internal
+     * @param[in] node1 one end of the branch
+     * @param[in] node2 the other end of the branch
+     */
+    bool isInnerBranch(Node* node1, Node* node2);
+
+    /**
+     *  Check if the 2 nodes form a branch in the tree
+     *  @param node1 one of the 2 nodes
+     *  @param node2 the other of the 2 nodes
+     *  @return true if they are adjacent to each other
+     */
+    bool isABranch(Node* node1, Node* node2);
+
+    void getBranchLengths(DoubleVector &len, Node *node = NULL, Node *dad = NULL);
+
+    void setBranchLengths(DoubleVector &len, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            find a node with corresponding name
+            @param name node name
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @return node if found, otherwise NULL
+     */
+    Node *findNodeName(string &name, Node *node = NULL, Node* dad = NULL);
+
+    /**
+            find a leaf with corresponding name
+            @param name leaf name
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @return node if found, otherwise NULL
+     */
+    Node *findLeafName(string &name, Node *node = NULL, Node* dad = NULL);
+
+    /**
+            find a node with corresponding ID
+            @param id node ID
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @return node if found, otherwise NULL
+     */
+    Node *findNodeID(int id, Node *node = NULL, Node* dad = NULL);
+
+
+    /**
+            scale the length of all branches to a norm factor
+            @param norm normalized factor
+            @param make_int TRUE to round lengths to int, FALSE otherwise
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void scaleLength(double norm, bool make_int = false, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            scale the length of all branches for RAxML internal presentation
+            @param factor the transformation factor
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void transformBranchLenRAX(double factor, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            scale the clade supports of all internal nodes to a norm factor
+            @param norm normalized factor
+            @param make_int TRUE to round the scaled values to int, FALSE otherwise
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void scaleCladeSupport(double norm, bool make_int = false, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            assign the leaf IDs with their names
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+
+     */
+    void assignLeafID(Node *node = NULL, Node *dad = NULL);
+
+
+    /********************************************************
+            CONVERT TREE INTO SPLIT SYSTEM
+     ********************************************************/
+
+    /**
+            convert the tree into the split system
+            @param sg (OUT) resulting split graph
+     */
+	void convertSplits(SplitGraph &sg, NodeVector *nodes = NULL, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            convert the tree into the split system
+            @param taxname certain taxa name
+            @param sg (OUT) resulting split graph
+     */
+	void convertSplits(vector<string> &taxname, SplitGraph &sg, NodeVector *nodes = NULL, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            convert the tree into the split system, iterative procedure
+            @param sg (OUT) resulting split graph
+            @param resp (internal) set of taxa below node
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void convertSplits(SplitGraph &sg, Split *resp, NodeVector *nodes = NULL, Node *node = NULL, Node *dad = NULL);
+
+    /**
+     * 		Generate a split defined by branch node1-node2
+     * 		@param node1 one end of the branch
+     * 		@param node2 the other end of the branch
+     * 		@return a pointer to the split (the new split is allocated dynamically)
+     */
+    Split* getSplit(Node* node1, Node* node2);
+
+    /**
+     *  Check whether the tree contains all splits in \a splits
+     *  @param splits list of splits to check
+     *  @return true or false
+     */
+    bool containsSplits(SplitGraph& splits);
+
+    /********************************************************
+            CONVERT SPLIT SYSTEM INTO TREE
+     ********************************************************/
+    /**
+            convert compatible split set into tree
+            @param sg source split graph
+     */
+    void convertToTree(SplitGraph &sg);
+
+
+    /********************************************************
+            calculate distance matrix
+     ********************************************************/
+
+
+    /**
+            calculate the pairwise distances on the tree, print the matrix to file (in phylip format)
+            @param filename file name
+     */
+    void calcDist(char *filename);
+
+    /**
+            calculate the pairwise distances on the tree
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param dist (OUT) distance matrix
+     */
+    void calcDist(double* &dist, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            calculate the pairwise distances on the tree
+            @param aroot the starting root
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param cur_len current length from aroot to node
+            @param dist (OUT) distance matrix
+     */
+    void calcDist(Node *aroot, double cur_len, double* &dist, Node *node, Node *dad);
+/********************************************************
+	STATISTICS
+********************************************************/
+
+	void extractQuadSubtrees(vector<Split*> &subtrees, Node *node = NULL, Node *dad = NULL);
+
+	/**
+	 * for each branch, assign how many times this branch appears in the input set of trees.
+	 * Works fine also when the trees do not have the same taxon set.
+	 * @param trees_file set of trees in NEWICK
+	 */
+	void assignBranchSupport(const char *trees_file);
+
+	void assignBranchSupport(istream &in);
+
+	/**
+	 * compute Robinson-Foulds distance between this tree and a set of trees.
+	 * Works fine also when the trees do not have the same taxon set.
+	 * @param trees_file set of trees in NEWICK
+	 * @param dist (OUT) distance vector
+	 */
+	void computeRFDist(const char *trees_file, IntVector &dist);
+
+	void computeRFDist(istream &in, IntVector &dist);
+
+	/**
+	 * insert new taxa next to the existing taxa in the tree
+	 * @param new_taxa name of new taxa to be inserted
+	 * @param existing_taxa names of existing taxa in the tree
+	 */
+	void insertTaxa(StrVector &new_taxa, StrVector &existing_taxa);
+
+	/** remove some taxa from the tree
+	 * @param taxa_names names of taxa that will be removed
+	 */
+	void removeTaxa(StrVector &taxa_names);
+
+	/** find a first taxon below a subtree */
+	Node *findFirstTaxon(Node *node = NULL, Node *dad = NULL);
+
+	/********************************************************
+            PROPERTIES OF TREE
+     ********************************************************/
+    /**
+            root node.
+     */
+    Node *root;
+
+    /**
+            number of leaves
+     */
+    int leafNum;
+
+    /**
+            total number of nodes in the tree
+     */
+    int nodeNum;
+
+    /**
+            total number of branches in the tree
+     */
+    int branchNum;
+
+    /**
+            user tree file name
+     */
+    //char *userFile;
+
+    /**
+            TRUE if the tree is rooted
+     */
+    bool rooted;
+
+    /**
+            precision to print branch lengths, default: 6
+     */
+    int num_precision;
+
+    /** if WT_BR_SCALE turned on, printTree will scale branch length with this factor */
+    double len_scale;
+
+    /**
+            release the memory. NOTE(review): the meaning of the int return value is not documented here
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    int freeNode(Node *node = NULL, Node *dad = NULL);
+
+    void setExtendedFigChar();
+
+protected:
+
+    /**
+            line number of the input file, used to output errors in input file
+     */
+    int in_line;
+    /**
+            column number of the input file, used to output errors in input file
+     */
+    int in_column;
+
+
+    /**
+     * special character for drawing tree figure
+     * 0: vertical line
+     *  1: horizontal line
+     *  2: top corner
+     *  3: middle corner
+     *  4: bottom corner
+     */
+    string fig_char;
+
+    /**
+            check that the tree is bifurcating (original note: every leaf with level 1 or 3; presumably node degree is meant)
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+            @param stop (IN/OUT) set = true to stop the search
+     */
+    void checkValidTree(bool& stop, Node *node = NULL, Node *dad = NULL);
+
+    /**
+            read the next character from a NEWICK file. Ignore comments [...]
+            @param in input stream
+            @param current_ch current character in the stream
+            @return next character read from input stream
+     */
+    char readNextChar(istream &in, char current_ch = 0);
+
+    string reportInputInfo();
+
+    /**
+     * Convert node IDs of a pair of nodes to a string in form "id1-id2"
+     * where id1 is the smaller of the two IDs. This is done to create a key for the map data structure
+     * @param node1 one node of the pair
+     * @param node2 the other node of the pair
+     * @return the string key for the node pair
+     */
+    inline string getBranchID(Node* node1, Node* node2) {
+        string key("");
+        if (node1->id < node2->id) {
+            key += convertIntToString(node1->id) + "-"
+                    + convertIntToString(node2->id);
+        } else {
+            key += convertIntToString(node2->id) + "-"
+                    + convertIntToString(node1->id);
+        }
+        return key;
+    }
+};
+
+/**
+PD set: a vector of taxon nodes kept together with its PD score, its subtree string and a name
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+ */
+class PDTaxaSet : public vector<Node*> {
+public:
+    /**
+            PD score
+     */
+    double score;
+
+    /**
+            string representing subtree connecting taxa in the PD set
+     */
+    string tree_str;
+
+    /**
+            name of this taxa set
+     */
+    string name;
+
+    /**
+            assign subtree string
+            @param tree a MTree class
+            @param subtree list of nodes (internal & external) contained in the tree
+            Returns nothing; per the original note the result goes into the score and tree_str members
+     */
+    void setSubTree(MTree &tree, NodeVector &subtree);
+
+
+    /**
+            assign the taxa, score and subtree string
+            @param tree a MTree class
+     */
+    void setTree(MTree &tree);
+
+    /**
+            print taxa to stream
+            @param out output stream
+     */
+    void printTaxa(ostream &out);
+
+    /**
+            print taxa to file
+            @param filename output file name
+     */
+    void printTaxa(char *filename);
+
+    /**
+            print tree to stream
+            @param out output stream
+     */
+    void printTree(ostream &out);
+
+    /**
+            print tree to file
+            @param filename output file name
+     */
+    void printTree(char *filename);
+
+    /**
+            convert from the taxa node vector to set of their IDs
+            @param ntaxa total number of taxa
+            @param id_set (OUT) set of their IDs
+     */
+    void makeIDSet(int ntaxa, Split &id_set);
+
+};
+
+
+#endif
diff --git a/mtreeset.cpp b/mtreeset.cpp
new file mode 100644
index 0000000..307ebff
--- /dev/null
+++ b/mtreeset.cpp
@@ -0,0 +1,760 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "mtreeset.h"
+#include "alignment.h"
+#include "gzstream.h"
+
+MTreeSet::MTreeSet()
+{	// nothing to set up here: the base vector and the members are default-constructed
+}
+
+MTreeSet::MTreeSet(const char *userTreeFile, bool &is_rooted, 
+	int burnin, int max_count, const char *tree_weight_file) {
+	init(userTreeFile, is_rooted, burnin, max_count, tree_weight_file);	// all loading is delegated to init()
+}
+
+// Read whitespace-separated integers from file_name into vec (cleared first): the first
+// burnin values are skipped and at most max_count of the following ones are kept.
+void readIntVector(const char *file_name, int burnin, int max_count, IntVector &vec) {
+	cout << "Reading integer vector file " << file_name << " ..." << endl;
+	vec.clear();
+	try {
+		ifstream in;
+		in.exceptions(ios::failbit | ios::badbit);	// a failed open() must throw
+		in.open(file_name);
+		// keep only badbit: running out of integers is the normal way to stop reading
+		in.exceptions(ios::badbit);
+
+		int value;
+		while (!in.eof() && (in >> value)) {
+			if (burnin > 0) burnin--;
+			else if (max_count > 0) {
+				vec.push_back(value);
+				max_count--;
+			}
+		}
+		in.clear();
+		// re-arm failbit so that a failing close() is reported as well
+		in.exceptions(ios::failbit | ios::badbit);
+		in.close();
+	} catch (const ios::failure &) {	// by reference: no copy, no slicing of the exception
+		outError(ERR_READ_INPUT);
+	}
+}
+
+// Load trees from userTreeFile, check that they are mutually consistent and, when
+// tree_weight_file is given, take the per-tree weights from the integers in that file.
+void MTreeSet::init(const char *userTreeFile, bool &is_rooted, int burnin, int max_count,
+	const char *tree_weight_file, IntVector *weights, bool compressed)
+{
+	readTrees(userTreeFile, is_rooted, burnin, max_count, weights, compressed);
+	checkConsistency();
+
+	// same burnin/max_count window as for the trees themselves
+	if (tree_weight_file != NULL) {
+		readIntVector(tree_weight_file, burnin, max_count, tree_weights);
+	}
+	if (tree_weights.size() != size())
+		outError("Tree file and tree weight file have different number of entries");
+}
+
+// Build the set from a map of tree strings: every entry whose weight (looked up in
+// weights through the entry's mapped index) is non-zero is parsed and appended with that weight.
+void MTreeSet::init(StringIntMap &treels, bool &is_rooted, IntVector &weights) {
+	int converted = 0;
+	StringIntMap::iterator entry;
+	for (entry = treels.begin(); entry != treels.end(); ++entry) {
+		const int weight = weights[entry->second];
+		if (weight == 0)
+			continue;	// zero-weight trees are left out of the set
+		converted++;
+		MTree *tree = newTree();
+		stringstream tree_stream(entry->first);
+		// parse with a copy of the flag so that the caller's is_rooted is not modified here
+		bool rooted_flag = is_rooted;
+		tree->readTree(tree_stream, rooted_flag);
+
+		// leaf names are run through atoi() and the resulting number becomes the leaf id
+		NodeVector leaves;
+		tree->getTaxa(leaves);
+		for (NodeVector::iterator leaf = leaves.begin(); leaf != leaves.end(); ++leaf)
+			(*leaf)->id = atoi((*leaf)->name.c_str());
+
+		push_back(tree);
+		tree_weights.push_back(weight);
+	}
+	if (verbose_mode >= VB_MED)
+		cout << converted << " tree(s) converted" << endl;
+}
+
+// Append one weight-1 tree per NEWICK string; a leaf's id is its name's index in taxonNames.
+void MTreeSet::init(vector<string> &trees, vector<string> &taxonNames, bool &is_rooted) {
+	const int nseq = taxonNames.size();
+	int converted = 0;
+	for (size_t t = 0; t < trees.size(); t++, converted++) {
+		MTree *tree = newTree();
+		stringstream tree_stream(trees[t]);
+		tree->readTree(tree_stream, is_rooted);
+		assert(tree->getNumTaxa() == nseq);
+		for (int seq = 0; seq < nseq; seq++) {
+			string leaf_name = taxonNames[seq];
+			Node *leaf = tree->findLeafName(leaf_name);
+			assert(leaf);	// every listed taxon has to be present in the tree ...
+			assert(leaf->isLeaf());	// ... and has to be a leaf
+			leaf->id = seq;
+		}
+		push_back(tree);
+		tree_weights.push_back(1);
+	}
+	cout << converted << " tree(s) converted" << endl;
+}
+
+// Read NEWICK trees from infile (through igzstream when `compressed` is set) and append
+// them, with their weights, to this set:
+//  - the first `burnin` trees are skipped by scanning for their terminating ';'
+//  - at most `max_count` of the following trees are then considered
+//  - with `weights`, the k-th considered tree (0-based) is loaded with weight
+//    weights->at(k), or skipped and counted as omitted when that weight is 0
+//  - without `weights`, every tree is loaded with weight 1
+void MTreeSet::readTrees(const char *infile, bool &is_rooted, int burnin, int max_count,
+	IntVector *weights, bool compressed)
+{
+	cout << "Reading tree(s) file " << infile << " ..." << endl;
+	int count, omitted;
+	// declared outside the try block so that the handlers below can release it
+	istream *in = NULL;
+	try {
+		if (compressed) in = new igzstream; else in = new ifstream;
+		in->exceptions(ios::failbit | ios::badbit);
+
+		if (compressed) ((igzstream*)in)->open(infile); else ((ifstream*)in)->open(infile);
+		if (burnin > 0) {
+			int cnt = 0;
+			while (cnt < burnin && !in->eof()) {
+				char ch;
+				(*in) >> ch;
+				if (ch == ';') cnt++;
+			}
+			cout << cnt << " beginning tree(s) discarded" << endl;
+			if (in->eof())
+				throw "Burnin value is too large.";
+		}
+		for (count = 1, omitted = 0; !in->eof() && count <= max_count; count++) {
+			if (!weights || weights->at(count-1)) {
+				MTree *tree = newTree();
+				// parse with a copy so that the caller's is_rooted is left untouched
+				bool myrooted = is_rooted;
+				tree->readTree(*in, myrooted);
+				push_back(tree);
+				if (weights)
+					tree_weights.push_back(weights->at(count-1));
+				else tree_weights.push_back(1);
+			} else {
+				// omit the tree: consume the input up to and including its ';'
+				while (!in->eof()) {
+					char ch;
+					if (!((*in) >> ch)) break;
+					if (ch == ';') break;
+				}
+				omitted++;
+			}
+			// look for one more non-blank character with exceptions switched off, so that
+			// a plain end-of-file after the last tree ends the loop instead of throwing
+			char ch;
+			in->exceptions(ios::goodbit);
+			(*in) >> ch;
+			if (in->eof()) break;
+			in->unget();
+			in->exceptions(ios::failbit | ios::badbit);
+		}
+		// front() is undefined on an empty set (max_count <= 0, or every tree omitted):
+		// report such a set as un-rooted instead of dereferencing it
+		bool set_rooted = !empty() && front()->rooted;
+		cout << size() << (set_rooted ? " rooted" : " un-rooted") << " tree(s) loaded" << endl;
+		if (omitted) cout << omitted << " tree(s) omitted" << endl;
+		if (compressed) ((igzstream*)in)->close(); else ((ifstream*)in)->close();
+		delete in;
+		in = NULL;
+	} catch (const ios::failure &) {
+		delete in;	// release the stream on the error path as well
+		outError(ERR_READ_INPUT, infile);
+	} catch (const char* str) {
+		delete in;
+		outError(str);
+	}
+}
+
+// Check that all trees agree on rootedness and taxon names (errors go to outError()).
+// Side effect: every tree's leaves get id = their position after sorting with nodenamecmp.
+void MTreeSet::checkConsistency() {
+	if (empty())
+		return;
+	iterator it;
+	bool rooted = false;
+	int i;
+	bool first = true;
+	for (it = begin(), i = 0; it != end(); it++, i++)
+	if ((*it)) {
+		// the first tree fixes the reference; before, `rooted` stayed false for good
+		if (first) rooted = (*it)->rooted;
+		else if ((*it)->rooted != rooted) {
+			cout << i+1 << " " << (*it)->rooted << " " << rooted << endl;
+			outError("Rooted and unrooted trees are mixed up");
+		}
+		first = false;
+	}
+
+	NodeVector taxa1;	// sorted leaves of the first tree, the reference for all others
+	NodeVector::iterator it2;
+	first = true;
+	for (it = begin(); it != end(); it++) if (*it) {
+		MTree *tree = *it;
+		NodeVector taxa;
+		tree->getTaxa(taxa);
+		sort(taxa.begin(), taxa.end(), nodenamecmp);
+		for (it2 = taxa.begin(), i = 0; it2 != taxa.end(); it2++, i++)
+			(*it2)->id = i;
+		if (first ) {
+			taxa1 = taxa;
+			first = false;
+		} else {
+			// now check this tree with the first tree
+			if (tree->leafNum != taxa1.size())
+				outError("Tree has different number of taxa!");
+			for (it2 = taxa.begin(), i = 0; it2 != taxa.end(); it2++, i++) {
+				if ((*it2)->name != taxa1[i]->name)
+					outError("Tree has different taxa names!");
+			}
+		}
+	}
+}
+
+bool MTreeSet::isRooted() {
+	// an empty set counts as un-rooted; otherwise the first tree decides
+	return !empty() && front()->rooted;
+}
+
+void MTreeSet::assignLeafID() {
+	for (size_t t = 0; t < size(); t++)
+		at(t)->assignLeafID();	// forwarded to each tree in stored order
+}
+
+// Write every tree of the set to the file ofile; any stream failure ends in outError().
+void MTreeSet::printTrees(const char *ofile, int brtype) {
+	try {
+		ofstream tree_file;
+		tree_file.exceptions(ios::failbit | ios::badbit);
+		tree_file.open(ofile);
+		printTrees(tree_file, brtype);
+		tree_file.close();
+		cout << "Tree(s) were printed to " << ofile << endl;
+	} catch (const ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, ofile);
+	}
+}
+
+void MTreeSet::printTrees(ostream & out, int brtype) {
+	for (size_t t = 0; t < size(); t++) {
+		at(t)->printTree(out, brtype);	// trees are written in stored order ...
+		out << endl;	// ... one per line
+	}
+}
+
+void MTreeSet::convertSplits(SplitGraph &sg, double split_threshold, int weighting_type, 
+	double weight_threshold) 
+{
+	SplitIntMap hash_ss;
+/*
+	Convert all trees into the split system sg (through the hash_ss overload), then
+	discard every split whose recorded frequency value is <= split_threshold * size(),
+	i.e. the threshold is given as a fraction of the number of trees in this set.
+	The number of discarded splits is reported on cout. */
+	// hash_ss associates each split of sg with the frequency value used below
+	convertSplits(sg, hash_ss, weighting_type, weight_threshold);
+	int nsplits = sg.getNSplits();
+
+	double threshold = split_threshold * size();
+	// removal below is swap-with-last: the last split of sg is copied over the rejected one
+	int count=0;	// incremented per iteration but not otherwise used in this function
+	for (SplitGraph::iterator it = sg.begin(); it != sg.end(); ) {
+		count++;
+		// look up the frequency value recorded for the current split
+		int freq_value;
+		Split *sp = hash_ss.findSplit(*it, freq_value);
+		assert(sp != NULL);
+		assert(*sp == *(*it));
+		// sp is the map's entry for the current split (checked by the asserts above)
+		if (freq_value <= threshold) {
+			if (verbose_mode == VB_DEBUG) {
+				sp->report(cout);
+			}
+			int num = hash_ss.getValue(sg.back());	// remember the value of the last split
+			hash_ss.eraseSplit(sp);
+			if (it != sg.end()-1) {
+				hash_ss.eraseSplit(sg.back());
+				*(*it) = (*sg.back());	// overwrite the rejected split's content in place
+			}
+			delete sg.back();
+			sg.pop_back();
+			if (it == sg.end()) break;	// the rejected split was the last one
+			hash_ss.insertSplit(*it, num);	// register the moved split again under its old value
+		} else {
+			// frequent enough: keep the split and move on
+			it++;
+		}
+	}
+	/*
+	sg.taxa = temp.taxa;
+	sg.splits = temp.splits;
+	sg.pda = temp.pda;
+	sg.sets = temp.sets;
+	sg.trees = temp.trees;
+	temp.taxa = NULL;
+	temp.splits = NULL;
+	temp.pda = NULL;
+	temp.sets = NULL;
+	temp.trees = NULL;
+	*/
+	cout << nsplits - sg.getNSplits() << " split(s) discarded because frequency <= " << split_threshold << endl;
+}
+
+
+// Convert all trees into a split system, using the taxon names of the first tree.
+void MTreeSet::convertSplits(SplitGraph &sg, SplitIntMap &hash_ss, int weighting_type, double weight_threshold) {
+	// an empty set has no first tree: return before front() is touched (it used to be
+	// dereferenced ahead of this check)
+	if (empty())
+		return;
+	vector<string> taxname(front()->leafNum);
+	front()->getTaxaName(taxname);
+	convertSplits(taxname, sg, hash_ss, weighting_type, weight_threshold, NULL);
+}
+
+void MTreeSet::convertSplits(vector<string> &taxname, SplitGraph &sg, SplitIntMap &hash_ss, 
+	int weighting_type, double weight_threshold, char *tag_str, bool sort_taxa) {
+	// Merge the splits of all trees with non-zero weight into sg, tracking them in hash_ss.
+	if (verbose_mode >= VB_MED) {
+	#ifdef USE_HASH_MAP
+		cout << "Using hash_map" << endl;
+	#else
+		cout << "Using map" << endl;
+	#endif
+
+		cout << "Converting collection of tree(s) into split system..." << endl;
+	}
+	SplitGraph::iterator itg;
+	vector<string>::iterator its;
+/*
+	taxname           taxon names; sorted in place when sort_taxa is set, then added to sg as labels
+	hash_ss           receives, per split, the summed weight of the trees that contain it
+	weighting_type    SW_COUNT, SW_AVG_PRESENT, SW_AVG_ALL or another value; see the code below
+	weight_threshold  splits whose final weight is <= this value are discarded at the end
+	tag_str           only tested against NULL: when set, "@<tree number>" is appended to split names */
+	if (sort_taxa) sort(taxname.begin(), taxname.end());
+	sg.createBlocks();
+	for (its = taxname.begin(); its != taxname.end(); its++)
+		sg.getTaxa()->AddTaxonLabel(NxsString(its->c_str()));
+/*
+	Each tree is first converted into a split graph of its own (isg); its splits are then
+	merged into sg: a split already known to hash_ss only has its weight and value increased,
+	a new split is copied into sg and registered in hash_ss.
+	tree_weights[tree_id] is the weight of the tree being processed. */
+
+
+	SplitGraph *isg;
+	int tree_id = 0;
+	// tree_id runs in parallel with the iterator and indexes tree_weights
+	// trees with weight 0 contribute nothing and are skipped before any check
+	for (iterator it = begin(); it != end(); it++, tree_id++) {
+		if (tree_weights[tree_id] == 0) continue;
+		MTree *tree = *it;
+
+		if (tree->leafNum != taxname.size())
+			outError("Tree has different number of taxa!");
+		if (sort_taxa) {	// check the leaf names against taxname and renumber the leaves
+			NodeVector taxa;
+			tree->getTaxa(taxa);
+			sort(taxa.begin(), taxa.end(), nodenamecmp);
+			int i = 0;
+			for (NodeVector::iterator it2 = taxa.begin(); it2 != taxa.end(); it2++) {
+				if ((*it2)->name != taxname[i]) {
+					cout << "Name 1: " <<  (*it2)->name << endl;
+					cout << "Name 2: " <<  taxname[i] << endl;
+					outError("Tree has different taxa names!");
+				}
+				(*it2)->id = i++;
+			}
+		}
+		isg = new SplitGraph();
+		tree->convertSplits(taxname, *isg);
+		// isg now holds the splits of this single tree and is deleted after merging
+		// value receives the number currently stored in hash_ss for a known split
+		for (itg = isg->begin(); itg != isg->end(); itg++) {
+			// sp stays NULL below when the split has not been seen in an earlier tree
+			int value;
+			// known split: accumulate; unknown split: copy and register
+			Split *sp = hash_ss.findSplit(*itg, value);
+			if (sp != NULL) {
+				// SW_COUNT adds the tree weight only, any other type adds branch weight * tree weight
+				if (weighting_type != SW_COUNT)
+					sp->setWeight(sp->getWeight() + (*itg)->getWeight() * tree_weights[tree_id]);
+				else
+					sp->setWeight(sp->getWeight() + tree_weights[tree_id]);
+				hash_ss.setValue(sp, value + tree_weights[tree_id]);
+			}
+			else {
+				sp = new Split(*(*itg));
+				if (weighting_type != SW_COUNT)
+					sp->setWeight((*itg)->getWeight() * tree_weights[tree_id]);
+				else				
+					sp->setWeight(tree_weights[tree_id]);
+				sg.push_back(sp);
+				// the copy is stored in sg by pointer
+				// and registered with this tree's weight as its initial value
+				
+				hash_ss.insertSplit(sp, tree_weights[tree_id]);
+ 			}
+            if (tag_str)
+                sp->name += "@" + convertIntToString(tree_id+1);
+		}
+		delete isg;
+	}
+
+	if (weighting_type == SW_AVG_PRESENT) {	// average over the summed weight of the trees containing the split
+		for (itg = sg.begin(); itg != sg.end(); itg++) {
+			int value = 0;
+			if (!hash_ss.findSplit(*itg, value))
+				outError("Internal error ", __func__);
+			(*itg)->setWeight((*itg)->getWeight() / value);
+		}
+	} else if (weighting_type == SW_AVG_ALL) {	// average over the number of entries in tree_weights
+		for (itg = sg.begin(); itg != sg.end(); itg++) {
+			(*itg)->setWeight((*itg)->getWeight() / tree_weights.size());
+		}
+	}
+
+	int discarded = 0;	
+	for (itg = sg.begin(); itg != sg.end(); )  {	// swap-with-last removal; itg is not advanced after a removal
+		if ((*itg)->getWeight() <= weight_threshold) {
+			discarded++;
+			delete (*itg);
+			(*itg) = sg.back();
+			sg.pop_back(); 
+		} else itg++;
+	}
+	if (discarded)
+		cout << discarded << " split(s) discarded because weight <= " << weight_threshold << endl;
+	// NOTE(review): hash_ss is not updated for discarded splits; if it keys on the Split pointers, those entries dangle - confirm
+}
+
+
+MTreeSet::~MTreeSet()
+{
+	// the trees are deleted here, from the last one to the first, leaving the vector empty
+	while (!empty()) {
+		delete back();
+		pop_back();
+	}
+}
+
+
+void MTreeSet::computeRFDist(int *rfdist, int mode, double weight_threshold) {
+	// pairwise distances need at least two trees; otherwise rfdist is left untouched
+	if (size() < 2)
+		return;
+#ifdef USE_HASH_MAP
+	cout << "Using hash_map" << endl;
+#else
+	cout << "Using map" << endl;
+#endif
+	cout << "Computing Robinson-Foulds distance..." << endl;
+
+	vector<string> taxname(front()->leafNum);
+	vector<SplitIntMap*> hs_vec;
+	vector<SplitGraph*> sg_vec;
+
+	front()->getTaxaName(taxname);
+	// rfdist layout: RF_ADJACENT_PAIR -> rfdist[i] is the distance between trees i and i+1;
+	// any other mode -> symmetric size() x size() matrix, row-major, diagonal not written
+	// converting trees into split system then stored in SplitIntMap for efficiency
+	for (iterator it = begin(); it != end(); it++) {
+		SplitGraph *sg = new SplitGraph();
+		SplitIntMap *hs = new SplitIntMap();
+
+		(*it)->convertSplits(taxname, *sg);
+		// invert every split that lacks taxon 0, so that all splits are stored on the same side
+		for (SplitGraph::iterator sit = sg->begin(); sit != sg->end(); sit++) {
+			if (!(*sit)->containTaxon(0)) (*sit)->invert();
+			hs->insertSplit((*sit), 1);
+		}
+		hs_vec.push_back(hs);
+		sg_vec.push_back(sg);
+	}
+
+	// now start the RF computation
+	int id = 0;
+	for (vector<SplitIntMap*>::iterator hsit = hs_vec.begin(); hsit+1 != hs_vec.end(); hsit++, id++) {
+		vector<SplitIntMap*>::iterator end_it = hs_vec.end();
+		if (mode == RF_ADJACENT_PAIR) end_it = hsit+2;	// compare with the next tree only
+		int id2 = id+1;
+		for (vector<SplitIntMap*>::iterator hsit2 = hsit+1; hsit2 != end_it; hsit2++, id2++) {
+			int diff_splits = 0;
+			SplitIntMap::iterator spit;
+			for (spit = (*hsit2)->begin(); spit != (*hsit2)->end(); spit++) {
+				if (spit->first->getWeight() >= weight_threshold && !(*hsit)->findSplit(spit->first)) diff_splits++;
+			}
+			for (spit = (*hsit)->begin(); spit != (*hsit)->end(); spit++) {
+				if (spit->first->getWeight() >= weight_threshold && !(*hsit2)->findSplit(spit->first)) diff_splits++;
+			}
+			// distance = number of splits (weight >= weight_threshold) found in only one of the two trees
+			int rf_val = diff_splits;
+			if (mode == RF_ADJACENT_PAIR) 
+				rfdist[id] = rf_val;
+			else {
+				rfdist[id*size() + id2] = rfdist[id2*size() + id] = rf_val;
+			}
+		}
+	}
+	// delete memory 
+	for (id = size()-1; id >= 0; id--) {
+		delete hs_vec[id];
+		delete sg_vec[id];
+	}
+}
+
+
+void MTreeSet::computeRFDist(int *rfdist, MTreeSet *treeset2, 
+	const char *info_file, const char *tree_file, int *incomp_splits) 
+{
+	// Robinson-Foulds distance between every tree of this set and every tree of treeset2
+#ifdef USE_HASH_MAP
+	cout << "Using hash_map" << endl;
+#else
+	cout << "Using map" << endl;
+#endif
+	// rfdist and incomp_splits (if given) are size() x treeset2->size() arrays, row-major
+	ofstream oinfo;
+	ofstream otree;
+	if (info_file) oinfo.open(info_file);
+	if (tree_file) otree.open(tree_file);
+	if (incomp_splits) memset(incomp_splits, 0, size()*treeset2->size()*sizeof(int));
+
+	vector<string> taxname(front()->leafNum);	// taxon names are taken from this set's first tree
+	vector<SplitIntMap*> hs_vec;
+	vector<SplitGraph*> sg_vec;
+	vector<NodeVector> nodes_vec;
+
+	front()->getTaxaName(taxname);
+	// hs_vec/sg_vec/nodes_vec hold the trees of this set first (indices 0..size()-1),
+	// followed by the trees of treeset2
+	iterator it;
+	// converting trees into split system then stored in SplitIntMap for efficiency
+	for (iterator it = begin(); it != end(); it++) {	// note: declares its own `it`, shadowing the one above
+		SplitGraph *sg = new SplitGraph();
+		SplitIntMap *hs = new SplitIntMap();
+		NodeVector nodes;
+
+		(*it)->convertSplits(taxname, *sg, &nodes);
+		// invert every split that lacks taxon 0; the value stored in hs is the split's index i
+		int i = 0;
+		for (SplitGraph::iterator sit = sg->begin(); sit != sg->end(); sit++, i++) {
+			if (!(*sit)->containTaxon(0)) (*sit)->invert();
+			hs->insertSplit((*sit), i);
+		}
+		hs_vec.push_back(hs);
+		sg_vec.push_back(sg);
+		nodes_vec.push_back(nodes);
+	}
+
+	// same conversion for the trees of treeset2, using the same taxon names
+	for (it = treeset2->begin(); it != treeset2->end(); it++) {
+		SplitGraph *sg = new SplitGraph();
+		SplitIntMap *hs = new SplitIntMap();
+		NodeVector nodes;
+
+		(*it)->convertSplits(taxname, *sg, &nodes);
+		// invert every split that lacks taxon 0; the value stored in hs is the split's index i
+		int i = 0;
+		for (SplitGraph::iterator sit = sg->begin(); sit != sg->end(); sit++, i++) {
+			if (!(*sit)->containTaxon(0)) (*sit)->invert();
+			hs->insertSplit((*sit), i);
+		}
+		hs_vec.push_back(hs);
+		sg_vec.push_back(sg);
+		nodes_vec.push_back(nodes);
+	}
+
+	// now start the RF computation
+	int id = 0;
+	int col_size = hs_vec.size() - size();	// number of trees in treeset2
+	for (vector<SplitGraph*>::iterator hsit = sg_vec.begin(); id < size(); hsit++, id++) {
+		int id2 = 0;
+		for (vector<SplitIntMap*>::iterator hsit2 = (hs_vec.begin() + size()); hsit2 != hs_vec.end(); hsit2++, id2++) {
+			int common_splits = 0;
+			int i = 0;
+			for (SplitGraph::iterator spit = (*hsit)->begin(); spit != (*hsit)->end(); spit++, i++) {
+				if ((*hsit2)->findSplit(*spit)) {
+					common_splits++;
+					if (info_file && (*spit)->trivial()<0) oinfo << " " << nodes_vec[id][i]->name;
+				} else {
+					if (info_file && (*spit)->trivial()<0) oinfo << " -" << nodes_vec[id][i]->name;
+					nodes_vec[id][i]->name = "-" + nodes_vec[id][i]->name;	// temporary mark, removed again below
+					/*if (incomp_splits && !sg_vec[id2+size()]->compatible(*spit))
+						nodes_vec[id][i]->name = "-" + nodes_vec[id][i]->name;*/
+				} 
+			}
+			int rf_val = (*hsit)->size() + (*hsit2)->size() - 2*common_splits;
+			rfdist[id*col_size + id2] = rf_val;
+			if (info_file) oinfo << endl;
+			if (tree_file) { at(id)->printTree(otree); otree << endl; }	// printed while the "-" marks are in place
+			for (i = 0; i < nodes_vec[id].size(); i++)
+				if (nodes_vec[id][i]->name[0] == '-') nodes_vec[id][i]->name.erase(0,1);
+		}
+		if (!incomp_splits) continue;
+		id2 = 0;
+		// count incompatible splits
+		for (vector<SplitGraph*>::iterator hsit3 = sg_vec.begin()+size(); hsit3 != sg_vec.end(); hsit3++, id2++) {
+			int num_incomp = 0;
+			SplitGraph::iterator spit;
+			for (spit = (*hsit)->begin(); spit != (*hsit)->end(); spit++) 
+				if (!(*hsit3)->compatible(*spit)) num_incomp++;
+			for (spit = (*hsit3)->begin(); spit != (*hsit3)->end(); spit++) 
+				if (!(*hsit)->compatible(*spit)) num_incomp++;
+					
+			incomp_splits[id*col_size + id2] = num_incomp;
+		}
+	}
+	// delete memory 
+	for (id = hs_vec.size()-1; id >= 0; id--) {
+		delete hs_vec[id];
+		delete sg_vec[id];
+	}
+
+	if (info_file) {
+		oinfo.close();
+		cout << "Detailed split occurences printed to " << info_file << endl;
+	}
+	if (tree_file) {
+		otree.close();
+		cout << "Detailed split occurences on tree printed to " << tree_file << endl;
+	}
+}
+
+int MTreeSet::sumTreeWeights() {
+	int total = 0;	// stays 0 when there are no weights
+	for (size_t t = 0; t < tree_weights.size(); t++)
+		total += tree_weights[t];
+	return total;
+}
+
+// Group trees that print identically: category[i] gets the group index of tree i (groups
+// numbered by first appearance) and the number of groups is returned. Side effect: every
+// tree's root is set to the node carrying the name of the first tree's root.
+int MTreeSet::categorizeDistinctTrees(IntVector &category) {
+	if (empty()) return 0;
+	if (size() == 1) {
+		category.resize(1,0);
+		return 1;
+	}
+	StringIntMap seen;	// printed tree -> group index
+	string root_name = front()->root->name;
+	int ngroups = 0;
+	category.resize(size(),-1);
+	for (size_t t = 0; t < size(); t++) {
+		MTree *tree = at(t);
+		tree->root = tree->findNodeName(root_name);
+		if (!tree->root || !tree->root->isLeaf())
+			outError("Internal error ", __func__);
+		stringstream printed;
+		tree->printTree(printed, WT_TAXON_ID | WT_SORT_TAXA);
+		string key = printed.str();
+		StringIntMap::iterator hit = seen.find(key);
+		if (hit != seen.end()) {
+			category[t] = hit->second;
+			continue;
+		}
+		category[t] = ngroups;	// first occurrence opens a new group
+		seen[key] = ngroups++;
+	}
+	return ngroups;
+}
+
+
+/*int MTreeSet::categorizeDistinctTrees(IntVector &category) {
+	// exit if less than 2 trees
+	if (empty()) return 0;
+	if (size() == 1) {
+		category.resize(1,0);
+		return 1;
+	}
+#ifdef USE_HASH_MAP
+	cout << "Using hash_map" << endl;
+#else
+	cout << "Using map" << endl;
+#endif
+	cout << "Checking duplicated trees..." << endl;
+
+	vector<string> taxname(front()->leafNum);
+	vector<SplitIntMap*> hs_vec;
+	vector<SplitGraph*> sg_vec;
+
+	front()->getTaxaName(taxname);
+
+
+	// converting trees into split system then stored in SplitIntMap for efficiency
+	for (iterator it = begin(); it != end(); it++) {
+		SplitGraph *sg = new SplitGraph();
+		SplitIntMap *hs = new SplitIntMap();
+
+		(*it)->convertSplits(taxname, *sg);
+		// make sure that taxon 0 is included
+		for (SplitGraph::iterator sit = sg->begin(); sit != sg->end(); sit++) {
+			if (!(*sit)->containTaxon(0)) (*sit)->invert();
+			hs->insertSplit((*sit), 1);
+		}
+		hs_vec.push_back(hs);
+		sg_vec.push_back(sg);
+	}
+
+	// now start the RF computation
+	int id = 0, ncat = 0;
+	category.resize(size(),-1);
+	for (vector<SplitIntMap*>::iterator hsit = hs_vec.begin(); hsit != hs_vec.end(); hsit++, id++) 
+	if (category[id] < 0) {
+		category[id] = ncat;
+		int id2 = id+1;
+		for (vector<SplitIntMap*>::iterator hsit2 = hsit+1; hsit2 != hs_vec.end(); hsit2++, id2++) 
+		if (category[id2] < 0) {
+			bool equal = true;
+			for (SplitIntMap::iterator spit = (*hsit2)->begin(); spit != (*hsit2)->end(); spit++) {
+				if (!(*hsit)->findSplit(spit->first)) { equal = false; break; }
+			}
+			if (equal) category[id2] = ncat;
+		}
+		ncat++;
+	}
+	// delete memory 
+	for (id = size()-1; id >= 0; id--) {
+		delete hs_vec[id];
+		delete sg_vec[id];
+	}
+	return ncat;
+}
+
+*/
diff --git a/mtreeset.h b/mtreeset.h
new file mode 100644
index 0000000..0969720
--- /dev/null
+++ b/mtreeset.h
@@ -0,0 +1,181 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef MTREESET_H
+#define MTREESET_H
+
+#include "mtree.h"
+#include "splitgraph.h"
+#include "alignment.h"
+
+void readIntVector(const char *file_name, int burnin, int max_count, IntVector &vec);	// reads integers from a file into vec; defined in mtreeset.cpp
+
+/**
+Set of trees: a vector of MTree pointers plus one integer weight per tree (tree_weights)
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class MTreeSet : public vector<MTree*> {
+public:
+
+    MTreeSet();
+
+	/**
+		constructor, read trees from user file
+		@param userTreeFile the name of the user trees
+		@param is_rooted true if tree is rooted (by reference; mtreeset.cpp hands a copy to the parser)
+		@param burnin the number of beginning trees to be discarded
+		@param max_count max number of trees to load
+	*/
+	MTreeSet(const char *userTreeFile, bool &is_rooted, int burnin, int max_count, 
+		const char *tree_weight_file = NULL);	// optional file of integer tree weights
+
+	/**
+		initialize the tree set from a NEWICK tree file
+		@param userTreeFile the name of the user tree
+		@param is_rooted true if tree is rooted (by reference; mtreeset.cpp hands a copy to the parser)
+		@param burnin the number of beginning trees to be discarded
+		@param max_count max number of trees to load
+	*/
+	void init(const char *userTreeFile, bool &is_rooted, int burnin, int max_count, 
+		const char *tree_weight_file = NULL, IntVector *weights = NULL, bool compressed = false);
+	// the overload below builds the set from tree strings mapped to indices into weights
+	void init(StringIntMap &treels, bool &is_rooted, IntVector &weights);
+
+	/**
+	 *  Add trees from \a trees to the tree set
+	 *
+	 *  @param trees reference to a vector of NEWICK tree string
+	 *  @param taxonNames vector containing taxon names. The order of each taxon is used to assign its ID
+	 *  @param is_rooted specify whether the tree is rooted or not
+	 */
+	void init(vector<string> &trees, vector<string> &taxonNames, bool &is_rooted);
+
+
+	/**
+		read the trees from the input file in newick format
+		@param userTreeFile the name of the user trees
+		@param is_rooted true if tree is rooted (by reference; mtreeset.cpp hands a copy to the parser)
+		@param burnin the number of beginning trees to be discarded
+		@param max_count max number of trees to load
+	*/
+	void readTrees(const char *userTreeFile, bool &is_rooted, int burnin, int max_count,
+		IntVector *weights = NULL, bool compressed = false);	// weights: optional per-tree weights, 0 omits a tree; compressed: read through igzstream
+
+	/**
+		assign the leaf IDs with their names for all trees
+
+	*/
+	void assignLeafID();
+
+	
+	/**
+		check the consistency of trees: taxa names between trees are matched, same rooted or unrooted
+	*/
+	void checkConsistency();
+
+	/**
+		@return true if trees are rooted
+	*/
+	bool isRooted();
+
+	/**
+		print the tree to the output file in newick format
+		@param outfile the output file.
+		@param brtype type of branch to print
+	*/
+	void printTrees(const char *outfile, int brtype = WT_BR_LEN);
+
+	/**
+		print the tree to the output file in newick format
+		@param out the output stream.
+		@param brtype type of branch to print
+	*/
+	void printTrees(ostream & out, int brtype = WT_BR_LEN);
+
+	/**
+		convert all trees into the split system
+		@param taxname certain taxa name
+		@param sg (OUT) resulting split graph
+		@param hash_ss (OUT) hash split set
+		@param weighting_type how split weights are combined (SW_COUNT, SW_AVG_PRESENT, SW_AVG_ALL, ...)
+		@param weight_threshold weight cutoff: splits with weight <= this value are discarded
+        @param tag_str non-NULL to tag each split with the trees it appears in
+		@param sort_taxa TRUE to sort taxa alphabetically
+	*/
+	void convertSplits(vector<string> &taxname, SplitGraph &sg, SplitIntMap &hash_ss, 
+		int weighting_type, double weight_threshold, char *tag_str, bool sort_taxa = true);
+
+	/**
+		convert all trees into the split system
+		@param sg (OUT) resulting split graph
+		@param hash_ss (OUT) hash split set
+		@param weighting_type SW_COUNT to sum tree weights only; any other value sums
+			branch weight times tree weight (SW_AVG_PRESENT / SW_AVG_ALL then average).
+		@param weight_threshold weight cutoff: splits with weight <= this value are discarded
+	*/
+	void convertSplits(SplitGraph &sg, SplitIntMap &hash_ss, 
+		int weighting_type, double weight_threshold);
+
+	/**
+		convert all trees into the split system
+		@param sg (OUT) resulting split graph
+		@param split_threshold only keep those splits which appear more than this threshold 
+		@param weighting_type SW_COUNT to sum tree weights only; any other value sums
+			branch weight times tree weight (SW_AVG_PRESENT / SW_AVG_ALL then average).
+		@param weight_threshold weight cutoff: splits with weight <= this value are discarded
+	*/
+	void convertSplits(SplitGraph &sg, double split_threshold, 
+		int weighting_type, double weight_threshold);
+
+	/**
+		compute the Robinson-Foulds distance between trees
+		@param rfdist (OUT) RF distance
+		@param mode RF_ALL_PAIR or RF_ADJACENT_PAIR
+		@param weight_threshold splits with weight below this value are ignored
+	*/
+	void computeRFDist(int *rfdist, int mode = RF_ALL_PAIR, double weight_threshold = -1000);
+
+	/**
+		compute the Robinson-Foulds distance between the trees of this set and those of treeset2
+		@param rfdist (OUT) RF distance, one entry per (tree of this set, tree of treeset2) pair
+	*/
+	void computeRFDist(int *rfdist, MTreeSet *treeset2, 
+		const char* info_file = NULL, const char *tree_file = NULL, int *incomp_splits = NULL);
+	// category (OUT): group index per tree; returns the number of distinct groups
+	int categorizeDistinctTrees(IntVector &category);
+	// returns the sum of all entries of tree_weights
+	int sumTreeWeights();
+
+	/**
+		destructor, deletes every tree stored in the set
+	*/
+    virtual ~MTreeSet();
+
+	/**
+		new tree allocator
+		@return a new tree
+	*/
+	virtual MTree *newTree() { return new MTree(); }
+	// weight of each tree, in the same order as the trees themselves
+	IntVector tree_weights;
+
+};
+
+#endif
diff --git a/myreader.h b/myreader.h
new file mode 100644
index 0000000..fe77a83
--- /dev/null
+++ b/myreader.h
@@ -0,0 +1,144 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <iostream>
+#include <fstream>
+//#include "node.h"
+#include "ncl/ncl.h"
+
+/**
+	NxsReader subclass that owns the input file stream and reports progress and
+	errors on the console.
+*/
+class MyReader : public NxsReader
+{
+public:
+	
+	/**
+		input stream
+	*/
+	ifstream inf;
+
+	/**
+		constructor: opens the input file in binary mode and reports
+		ERR_READ_INPUT through outError() if it cannot be opened
+		@param infname input file name (not modified, hence const)
+	*/
+	MyReader(const char *infname) : NxsReader()
+	{
+		inf.open(infname, ios::binary);
+		if (!inf.is_open())
+			outError(ERR_READ_INPUT);
+	}
+
+	/**
+		destructor: closes the input stream
+	*/
+	virtual ~MyReader()
+	{
+		inf.close();
+	}
+
+	/**
+		start hook; does nothing
+	*/
+	virtual void ExecuteStarting() {}
+	/**
+		stop hook; does nothing
+	*/
+	virtual void ExecuteStopping() {}
+
+	/**
+		enter a block; prints the block name when verbose_mode >= VB_MED
+		@param blockName block name
+		@return true always
+	*/
+	virtual bool EnteringBlock(NxsString blockName)
+	{
+		if (verbose_mode >= VB_MED)
+			cout << "Reading \"" << blockName << "\" block..." << endl;
+
+		// Returning true means it is ok to delete any data associated with
+		// blocks of this type read in previously
+		//
+		return true;
+	}
+
+	/**
+		skip a block; always prints a notice naming the skipped block
+		@param blockName block name
+	*/
+	virtual void SkippingBlock(NxsString blockName)
+	{
+		cout << "Skipping unknown block (" << blockName << ")..." << endl;
+	}
+
+	//virtual void SkippingDisabledBlock(NxsString blockName) {}
+
+	/**
+		comment hook; comments are intentionally not printed
+		@param comment comment string (ignored)
+	*/
+	virtual void	OutputComment(const NxsString &comment)
+	{
+		//cout << comment;
+	}
+
+	/**
+		called when error occurs; prints the location and message to cerr and
+		terminates the program with exit status 1
+		@param msg additional message
+		@param pos file position
+		@param line line number
+		@param col column number
+	*/
+	virtual void	NexusError(NxsString msg, file_pos pos, long line, long col)
+	{
+		cerr << endl;
+		cerr << "Error found at line " << line;
+		cerr << ", column " << col;
+		cerr << " (file position " << pos << "):" << endl;
+		cerr << msg << endl;
+
+		exit(1);
+	}
+};
+
+/**
+	NxsToken subclass whose comment hook does nothing, so comments met while
+	tokenizing are not printed.
+*/
+class MyToken : public NxsToken
+{
+public:
+
+	/**
+		constructor
+		@param is input stream handed on to NxsToken
+	*/
+	MyToken(istream &is)
+		: NxsToken(is)
+	{
+	}
+
+	/**
+		comment hook; intentionally empty
+		@param msg comment string (ignored)
+	*/
+	virtual void OutputComment(const NxsString &msg)
+	{
+	}
+};
diff --git a/ncbitree.cpp b/ncbitree.cpp
new file mode 100644
index 0000000..91dff4f
--- /dev/null
+++ b/ncbitree.cpp
@@ -0,0 +1,255 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "ncbitree.h"
+
+// Default constructor: only runs the MTree base constructor.
+NCBITree::NCBITree() : MTree() {}
+
+
+// Destructor: performs no explicit cleanup of its own.
+NCBITree::~NCBITree() {}
+
+/**
+    Open an NCBI names file and parse it with the stream overload.
+    Every failure (unopenable file, I/O error, malformed record) is reported
+    through outError().
+    @param infile input file name
+    @param name_type passed on unchanged to the stream overload
+*/
+void NCBITree::readNCBINames(const char* infile, const char *name_type) {
+    ifstream in;
+    cout << "Reading NCBI names file " << infile << endl;
+    try {
+        // failbit is armed only while opening, so an unopenable file throws
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(infile);
+        // during parsing only badbit throws; the parser tests failed extractions itself
+        in.exceptions(ios::badbit);
+        readNCBINames(in, name_type);
+        in.close();
+    } catch (const char* str) {
+        // format errors are thrown by the parser as string literals
+        outError(str);
+    } catch (const ios::failure &) {
+        // caught by const reference: no copy, no slicing of derived exception types
+        outError(ERR_READ_INPUT, infile);
+    }
+}
+
+/**
+    Parse name records from an open stream and store a name on every node that
+    exists in 'nodes'. Each record is read as: node ID, '|', name, '|', unique
+    name; the rest of the line is discarded. A non-empty unique name replaces
+    the name. Characters other than letters, digits, '_', '-' and '.' are
+    replaced by '_'. Records for the same node overwrite each other, so the
+    last one read wins.
+    The node table must already be filled (asserted).
+    @param in input stream
+    @param name_type NOTE(review): currently not consulted; records of every
+        name type are applied
+    @throw const char* message on a malformed record or an out-of-range node ID
+*/
+void NCBITree::readNCBINames(ifstream &in, const char *name_type) {
+    assert(!nodes.empty());
+    char ch;
+    int node_id;
+    string node_name, unique_name;
+
+    in_line = in_column = 0;
+
+    while (!in.eof()) {
+        node_id = 0;
+        if (!(in >> node_id)) break;
+        in_line++;
+        if (node_id <= 0) throw "Wrong node ID";
+        // valid indices are 0 .. size()-1, so an ID equal to size() must be rejected too
+        if ((size_t)node_id >= nodes.size()) throw "Too large node ID";
+        if (nodes[node_id]) {
+            // reset before each extraction: a failed read leaves ch untouched, and a
+            // stale '|' must not pass the separator check
+            ch = 0;
+            in >> ch;
+            if (ch != '|') throw "No | between node ID and name";
+            in.get(ch); // skip the single character following '|'
+            getline(in, node_name, '\t');
+            if (node_name == "") throw "Empty node name";
+            ch = 0;
+            in >> ch;
+            if (ch != '|') throw "No | between name and unique name";
+            in.get(ch); // skip the single character following '|'
+            getline(in, unique_name, '\t');
+            if (unique_name != "") node_name = unique_name;
+
+            for (string::iterator i = node_name.begin(); i != node_name.end(); i++) {
+                // <cctype> functions require a value representable as unsigned char;
+                // a negative plain char would be undefined behaviour
+                if (!isalnum((unsigned char)(*i)) && (*i) != '_' && (*i) != '-' && (*i) != '.') {
+                    (*i) = '_';
+                }
+            }
+            nodes[node_id]->name = node_name;
+        }
+        // get the rest of the line
+        string str;
+        getline(in, str);
+    }
+
+}
+
+/**
+    Open an NCBI nodes file and build the tree with the stream overload.
+    Every failure (unopenable file, I/O error, malformed record) is reported
+    through outError().
+    @param infile input file name
+    @param root_id taxon ID of the root
+    @param taxon_level passed on to the stream overload
+    @param ignore_level passed on to the stream overload
+    @return the node returned by the stream overload; NULL if that call did not
+        complete and outError() returned
+*/
+Node *NCBITree::readNCBITree(const char *infile, int root_id, const char* taxon_level, const char *ignore_level) {
+    ifstream in;
+    cout << "Reading NCBI nodes file " << infile << endl;
+    Node *parent = NULL;
+    try {
+        // failbit is armed only while opening, so an unopenable file throws
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(infile);
+        // during parsing only badbit throws; the parser tests failed extractions itself
+        in.exceptions(ios::badbit);
+        parent = readNCBITree(in, root_id, taxon_level, ignore_level);
+        in.close();
+    } catch (const char* str) {
+        // format errors are thrown by the parser as string literals
+        outError(str);
+    } catch (const ios::failure &) {
+        // caught by const reference: no copy, no slicing of derived exception types
+        outError(ERR_READ_INPUT, infile);
+    }
+
+    return parent;
+}
+
+/**
+    Build the taxonomy tree from node records read from a stream.
+    Each record is read as: node ID, '|', parent ID, '|', rank; the rest of the
+    line is discarded. Nodes are stored in 'nodes' indexed by their ID, ranks in
+    'node_levels'. Afterwards every node is linked to its parent, the tree is
+    rooted at root_id, optionally pruned, and the node/leaf/branch counters are
+    recomputed.
+    @param in input stream
+    @param root_id taxon ID of the root
+    @param taxon_level if non-NULL, subtrees below nodes of this rank are pruned
+    @param ignore_level if non-NULL, the branch above every node of this rank
+        gets length 0 instead of 1
+    @return the node whose ID is the parent ID recorded for the root
+    @throw const char* message on a malformed record, an out-of-range or
+        duplicated ID, a missing parent, or a missing root
+*/
+Node* NCBITree::readNCBITree(istream &in, int root_id, const char* taxon_level, const char *ignore_level) {
+    nodes.resize(MAX_TAXONOMY_ID, NULL);
+    node_levels.resize(MAX_TAXONOMY_ID);
+    string node_level;
+    int node_id, parent_id, max_node_id = 0, num_nodes = 0;
+    char ch;
+    in_line = in_column = 0;
+
+    while (!in.eof()) {
+        node_id = parent_id = 0;
+        if (!(in >> node_id)) break;
+        in_line++;
+        num_nodes ++;
+        if (node_id <= 0) throw "Wrong node ID";
+        if ((size_t)node_id >= nodes.size()) throw "Too large node ID";
+        // reset before each extraction: a failed read leaves ch untouched, and a
+        // stale or indeterminate value must not pass the separator check
+        ch = 0;
+        in >> ch;
+        if (ch != '|') throw "No | between node ID and parent ID";
+        in >> parent_id;
+        if (parent_id <= 0) throw "Wrong parent ID";
+        if ((size_t)parent_id >= nodes.size()) throw "Too large parent ID";
+        ch = 0;
+        in >> ch;
+        if (ch != '|') throw "No | between parent ID and node rank";
+        in.get(ch); // skip the single character following '|'
+        getline(in,node_level,'\t');
+
+        // discard the rest of the line
+        string str;
+        getline(in, str);
+        if (node_id > max_node_id) max_node_id = node_id;
+        if (nodes[node_id]) throw "Duplicated node ID";
+        nodes[node_id] = newNode(node_id, node_id);
+        nodes[node_id]->height = parent_id; // use height temporarily for parent_id
+        node_levels[node_id] = node_level;
+    }
+
+    // shrink both tables to the range of IDs actually seen
+    nodes.resize(max_node_id+1);
+    node_levels.resize(max_node_id+1);
+    int ignored = 0;
+
+    for (node_id = 0; node_id <= max_node_id; node_id++) {
+        if (!nodes[node_id]) continue;
+        parent_id = nodes[node_id]->height;
+        // a parent ID may exceed the largest node ID seen; after the shrink above
+        // that index would be out of range, so test it before looking it up
+        if (parent_id > max_node_id || !nodes[parent_id]) throw "Parent ID not found";
+        if (parent_id == node_id) {
+            cout << "Ignore " << node_id << " | " << parent_id << endl;
+            continue;
+        }
+        double len = 1.0;
+        if (ignore_level && node_levels[node_id] == ignore_level) {
+            len = 0.0;
+            ignored++;
+        }
+        nodes[node_id]->addNeighbor(nodes[parent_id], len);
+        nodes[parent_id]->addNeighbor(nodes[node_id], len);
+    }
+
+    if (ignore_level)
+        cout << ignored << " branches are set to zero because of " << ignore_level << endl;
+
+    rooted = true;
+    // root_id comes from the caller and was never validated against the table
+    if (root_id < 0 || root_id > max_node_id || !nodes[root_id]) throw "Root node not available";
+    root = nodes[root_id];
+
+    // 'height' still holds the root's parent ID; that parent was verified to exist
+    // in the linking loop and is never freed by the pruning below
+    Node *root_dad = nodes[root->height];
+
+    if (taxon_level) {
+        int pruned = pruneTaxa(node_levels, taxon_level, root, root_dad);
+        cout << pruned << " nodes below " << taxon_level << " are pruned" << endl;
+    }
+
+    // pruning of degree-2 nodes is disabled:
+//	int pruned = pruneBridgeNodes(root, root_dad);
+//	cout << pruned << " nodes of degree 2 are pruned" << endl;
+
+    leafNum = nodeNum = branchNum = 0;
+    countNodeNum(root, root_dad);
+
+    cout << num_nodes << " NCBI nodes, " << nodeNum << " tree nodes, " << leafNum << " leaves, " << branchNum << " branches" << endl;
+    return root_dad;
+}
+
+int NCBITree::pruneTaxa(StrVector &node_levels, const char* taxon_level, Node *node, Node *dad) {
+    int pruned = 0;
+
+    if (!(node_levels[node->id] == taxon_level)) {
+        // not at the requested rank yet: keep descending away from dad
+        FOR_NEIGHBOR_IT(node, dad, it) {
+            pruned += pruneTaxa(node_levels, taxon_level, (*it)->node, node);
+        }
+        return pruned;
+    }
+
+    // node has the requested rank: free every subtree hanging below it and
+    // keep only the neighbor entry that leads back to dad
+    Neighbor *link_to_dad = node->findNeighbor(dad);
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        pruned += freeNode((*it)->node, node);
+        delete (*it);
+    }
+    node->neighbors.resize(1);
+    node->neighbors[0] = link_to_dad;
+    return pruned;
+}
+
+
+void NCBITree::countNodeNum(Node *node, Node *dad) {
+    // count this node, and count it as a leaf where applicable
+    ++nodeNum;
+    if (node->isLeaf()) {
+        ++leafNum;
+    }
+    // one branch per neighbor leading away from dad, then recurse into it
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        ++branchNum;
+        countNodeNum((*it)->node, node);
+    }
+}
+
+int NCBITree::pruneBridgeNodes(Node *node, Node *dad) {
+    int removed = 0;
+
+    // handle the subtrees first, so this node's degree is final when tested
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        removed += pruneBridgeNodes((*it)->node, node);
+    }
+
+    if (node->neighbors.size() != 2)
+        return removed;
+
+    // degree 2: connect dad and the single child directly, summing both branch lengths
+    Node *child = (node->neighbors[0]->node == dad)
+                  ? node->neighbors[1]->node
+                  : node->neighbors[0]->node;
+    double merged_len = node->neighbors[0]->length + node->neighbors[1]->length;
+    dad->updateNeighbor(node, child, merged_len);
+    child->updateNeighbor(node, dad, merged_len);
+    nodes[node->id] = NULL;
+    delete node;
+    return removed + 1;
+}
+
+int NCBITree::freeNode(Node *node, Node *dad)
+{
+    // a NULL start node means "start at the root"
+    if (node == NULL)
+        node = root;
+
+    int freed = 1; // this node itself
+    // walk the neighbors back to front, releasing every subtree except the one towards dad
+    for (NeighborVec::reverse_iterator nei = node->neighbors.rbegin();
+            nei != node->neighbors.rend(); nei++) {
+        if ((*nei)->node == dad)
+            continue;
+        freed += freeNode((*nei)->node, node);
+    }
+
+    // clear the table slot before the node is destroyed
+    nodes[node->id] = NULL;
+    delete node;
+    return freed;
+}
diff --git a/ncbitree.h b/ncbitree.h
new file mode 100644
index 0000000..ffa725f
--- /dev/null
+++ b/ncbitree.h
@@ -0,0 +1,97 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef NCBITREE_H
+#define NCBITREE_H
+
+#include "mtree.h"
+
+/**
+	initial size of the node tables while reading the NCBI nodes file; node and
+	parent IDs greater than or equal to this value are rejected as too large.
+	NOTE(review): check this bound against the largest ID in the taxonomy dump in use
+*/
+const int MAX_TAXONOMY_ID = 2000000;
+
+/**
+Class for processing the NCBI Taxonomy tree: reads the nodes file into a tree
+indexed by taxon ID and attaches names read from the names file.
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class NCBITree : public MTree
+{
+public:
+    NCBITree();
+	
+    ~NCBITree();
+
+
+/********************************************************
+	READ TREE FROM FILE
+********************************************************/
+
+	/**
+		read the tree in nodes.dmp file from NCBI taxonomy
+		@param infile the input file name (first overload) / in the input stream (second overload)
+		@param root_id taxon ID of the root
+		@param taxon_level e.g. "species", "genus"; NULL to take all taxa (incl. subspecies)
+		@param ignore_level rank whose branches get length 0 instead of 1; NULL to ignore none
+		@return the node whose ID is the parent ID recorded for the root
+	*/
+	Node* readNCBITree(const char *infile, int root_id, const char* taxon_level, const char *ignore_level); 
+	Node* readNCBITree(istream &in, int root_id, const char* taxon_level, const char *ignore_level);
+
+	/**
+		read names.dmp file. You must call readNCBITree() before calling this function
+		@param infile input file name (typically names.dmp from NCBI) / in the input stream
+		@param name_type type of the name.
+			NOTE(review): the implementation in ncbitree.cpp does not consult this argument
+	*/
+	void readNCBINames(const char* infile, const char *name_type = "scientific name");
+	void readNCBINames(ifstream &in, const char *name_type = "scientific name");
+
+
+protected:
+
+	/**
+		taxonomy rank of the nodes, indexed by node ID
+	*/
+	StrVector node_levels;
+
+	/**
+		vector of all taxonomical nodes, indexed by node ID; NULL where no node exists
+	*/
+	NodeVector nodes;
+
+	/**
+		prune subtree below the taxon_level
+		@param node_levels rank of each node, indexed by node ID
+		@param taxon_level rank at which to cut
+		@param node the current node
+		@param dad dad of the node, used to direct the search
+		@return number of nodes pruned
+	*/
+	int pruneTaxa(StrVector &node_levels, const char* taxon_level, Node *node, Node *dad);
+
+	/**
+		prune all nodes that have degree of 2
+		@param node the current node
+		@param dad dad of the node, used to direct the search
+		@return number of nodes pruned
+	*/
+	int pruneBridgeNodes(Node *node, Node *dad);
+
+	/**
+		add the subtree at node to the counters nodeNum, leafNum and branchNum
+		@param node the current node
+		@param dad dad of the node, used to direct the search
+	*/
+	void countNodeNum(Node *node, Node *dad);
+
+	/**
+		release the memory.
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+		@return number of nodes released
+	*/
+	int freeNode(Node *node = NULL, Node *dad = NULL);
+
+};
+
+#endif
diff --git a/ncl/CMakeLists.txt b/ncl/CMakeLists.txt
new file mode 100644
index 0000000..ee601ab
--- /dev/null
+++ b/ncl/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Library target 'ncl' built from the NCL (Nexus Class Library) sources in this
+# directory. No STATIC/SHARED keyword is given, so CMake's default
+# (BUILD_SHARED_LIBS) decides the library type.
+add_library(ncl 
+nxsassumptionsblock.cpp
+nxsblock.cpp
+nxscharactersblock.cpp
+nxsdatablock.cpp
+nxsdiscretedatum.cpp
+nxsdiscretematrix.cpp
+nxsdistancedatum.cpp
+nxsdistancesblock.cpp
+nxsemptyblock.cpp
+nxsexception.cpp
+nxsreader.cpp
+nxssetreader.cpp
+nxsstring.cpp
+nxstaxablock.cpp
+nxstoken.cpp
+nxstreesblock.cpp)
diff --git a/ncl/ncl.h b/ncl/ncl.h
new file mode 100644
index 0000000..9c28a4f
--- /dev/null
+++ b/ncl/ncl.h
@@ -0,0 +1,108 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NCL_H
+#define NCL_NCL_H
+
+#if defined(_MSC_VER)
+#	pragma warning(disable:4786)
+#	pragma warning(disable:4291)
+#	define vsnprintf _vsnprintf
+#endif
+
+#if !defined(__DECCXX)
+#	include <cassert>
+#	include <cctype>
+#	include <cmath>
+#	include <cstdarg>
+#	include <cstdio>
+#	include <cstdarg>
+#	include <cstdlib>
+#	include <ctime>
+#	include <cfloat>
+#else
+#	include <assert.h>
+#	include <ctype.h>
+#	include <stdarg.h>
+#	include <math.h>
+#	include <stdarg.h>
+#	include <stdio.h>
+#	include <stdlib.h>
+#	include <time.h>
+#	include <float.h>
+#endif
+
+#include <algorithm>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <map>
+#include <set>
+#include <stdexcept>
+#include <string>
+#if defined(__GNUC__)
+#	if __GNUC__ < 3
+#		include <strstream>
+#	else
+#		include <sstream>
+#	endif
+#endif
+#include <vector>
+using namespace std;
+
+#if defined(__MWERKS__)
+#	if __ide_target("Simple-Win Release") || __ide_target("Phorest-Mac-Release")
+#		define NDEBUG
+#	else
+#		undef NDEBUG
+#	endif
+#endif
+
+#if defined( __BORLANDC__ )
+#	include <dos.h>
+#endif
+
+#if defined(__MWERKS__)
+#	define HAVE_PRAGMA_UNUSED
+		// mwerks (and may be other compilers) want return values even if the function throws an exception
+		//
+#	define DEMANDS_UNREACHABLE_RETURN
+
+#endif
+
+#include "nxsdefs.h"
+#include "nxsstring.h"
+#include "nxsexception.h"
+#include "nxstoken.h"
+#include "nxsblock.h"
+#include "nxsreader.h"
+#include "nxssetreader.h"
+#include "nxstaxablock.h"
+#include "nxstreesblock.h"
+#include "nxsdistancedatum.h"
+#include "nxsdistancesblock.h"
+#include "nxsdiscretedatum.h"
+#include "nxsdiscretematrix.h"
+#include "nxscharactersblock.h"
+#include "nxsassumptionsblock.h"
+#include "nxsdatablock.h"
+#include "nxsemptyblock.h"
+
+#endif
diff --git a/ncl/nxsassumptionsblock.cpp b/ncl/nxsassumptionsblock.cpp
new file mode 100644
index 0000000..a0dee2a
--- /dev/null
+++ b/ncl/nxsassumptionsblock.cpp
@@ -0,0 +1,535 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets id = "ASSUMPTIONS", charBlockPtr = NULL, and taxa = t. Assumes taxa is non-NULL.
+*/
+NxsAssumptionsBlock::NxsAssumptionsBlock(
+  NxsTaxaBlock *t)	/* pointer to the taxa block */
+{
+	// a taxa block is mandatory
+	assert(t);
+
+	id = "ASSUMPTIONS";
+	charBlockPtr = NULL;
+	taxa = t;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Nothing needs to be done in the destructor.
+*/
+NxsAssumptionsBlock::~NxsAssumptionsBlock()
+	{
+	// no explicit cleanup
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Makes data member taxa point to 'tb'. Assumes tb is non-NULL.
+*/
+void NxsAssumptionsBlock::ReplaceTaxaBlockPtr(
+  NxsTaxaBlock *tb)	/* pointer to new NxsTaxaBlock object */
+{
+	// the replacement must be a real taxa block
+	assert(tb);
+	this->taxa = tb;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the number of character sets stored.
+*/
+int NxsAssumptionsBlock::GetNumCharSets()
+{
+	// the container size is narrowed to int for the caller
+	return static_cast<int>(charsets.size());
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Erases 'names' vector, then fills 'names' with the names of all stored character sets.
+*/
+void NxsAssumptionsBlock::GetCharSetNames(
+  NxsStringVector &names)	/* the vector in which to store the names */
+{
+	// start from an empty vector, then append every key of the charset map
+	names.clear();
+	for (NxsUnsignedSetMap::const_iterator entry = charsets.begin(); entry != charsets.end(); ++entry)
+	{
+		names.push_back(entry->first);
+	}
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns reference to character set having name 'nm'.
+*/
+NxsUnsignedSet &NxsAssumptionsBlock::GetCharSet(
+  NxsString nm)	/* the name of the character set to return */
+{
+	// subscript lookup by name on the charset map
+	NxsUnsignedSet &found = charsets[nm];
+	return found;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns name of default character set. If returned string has zero length, then no default character set was defined
+|	in the data set.
+*/
+NxsString NxsAssumptionsBlock::GetDefCharSetName()
+{
+	// returned by value
+	return this->def_charset;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the number of taxon sets stored.
+*/
+int NxsAssumptionsBlock::GetNumTaxSets()
+{
+	// the container size is narrowed to int for the caller
+	return static_cast<int>(taxsets.size());
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Erases 'names' vector, then fills 'names' with the names of all stored taxon sets.
+*/
+void NxsAssumptionsBlock::GetTaxSetNames(
+  NxsStringVector &names)	/* the vector in which to store the names */
+{
+	// start from an empty vector, then append every key of the taxset map
+	names.clear();
+	for (NxsUnsignedSetMap::const_iterator entry = taxsets.begin(); entry != taxsets.end(); ++entry)
+	{
+		names.push_back(entry->first);
+	}
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns reference to taxon set having name 'nm'.
+*/
+NxsUnsignedSet &NxsAssumptionsBlock::GetTaxSet(
+  NxsString nm)	/* the name of the taxon set to return */
+{
+	// subscript lookup by name on the taxset map
+	NxsUnsignedSet &found = taxsets[nm];
+	return found;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns name of default taxon set. If returned string has zero length, then no default taxon set was defined in the
+|	data set.
+*/
+NxsString NxsAssumptionsBlock::GetDefTaxSetName()
+{
+	// returned by value
+	return this->def_taxset;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the number of exclusion sets stored.
+*/
+int NxsAssumptionsBlock::GetNumExSets()
+{
+	// the container size is narrowed to int for the caller
+	return static_cast<int>(exsets.size());
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Erases names, then fills names with the names of all stored exclusion sets.
+*/
+void NxsAssumptionsBlock::GetExSetNames(
+  NxsStringVector &names)	/* the vector in which to store the names */
+{
+	// start from an empty vector, then append every key of the exset map
+	names.clear();
+	for (NxsUnsignedSetMap::const_iterator entry = exsets.begin(); entry != exsets.end(); ++entry)
+	{
+		names.push_back(entry->first);
+	}
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns reference to exclusion set having name 'nm'.
+*/
+NxsUnsignedSet &NxsAssumptionsBlock::GetExSet(
+  NxsString nm)	/* the name of the exclusion set to return */
+{
+	// subscript lookup by name on the exset map
+	NxsUnsignedSet &found = exsets[nm];
+	return found;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns name of default exclusion set. If returned string has zero length, then no default exclusion set was defined
+|	in the data set.
+*/
+NxsString NxsAssumptionsBlock::GetDefExSetName()
+{
+	// returned by value
+	return this->def_exset;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Applies exclusion set having name 'nm' by calling the ApplyExset method of the NxsCharactersBlock or 
+|	NxsCharactersBlock-derived object stored in the charBlockPtr pointer (which will be whichever block last called the 
+|	NxsAssumptionsBlock::SetCallback method).
+*/
+void NxsAssumptionsBlock::ApplyExSet(
+  NxsString nm)	/* the name of the exclusion set to apply */
+{
+	// a characters block must have been registered beforehand
+	assert(charBlockPtr);
+
+	// look the set up by name and hand it to the characters block
+	NxsUnsignedSet &selected = exsets[nm];
+	charBlockPtr->ApplyExset(selected);
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads and stores information contained in the command CHARSET within an ASSUMPTIONS block.
+*/
+void NxsAssumptionsBlock::HandleCharset(
+  NxsToken &token)	/* the token used to read from in */
+{
+	// An optional '*' in front of the name marks the set as the default charset.
+	token.GetNextToken();
+	const bool is_default = token.Equals("*");
+	if (is_default)
+		token.GetNextToken();
+
+	// The current token is the charset name.
+	NxsString charset_name = token.GetToken();
+
+	// The name must be followed by '='.
+	token.GetNextToken();
+	if (!token.Equals("="))
+	{
+		errormsg = "Expecting '=' in CHARSET definition but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+	}
+
+	// Read the set members; the characters block supplies the total character count.
+	assert(charBlockPtr);
+	NxsCharactersBlock &charBlock = *charBlockPtr;
+	NxsUnsignedSet members;
+	int totalChars = charBlock.GetNCharTotal();
+	NxsSetReader(token, totalChars, members, charBlock, NxsSetReader::charset).Run();
+
+	charsets[charset_name] = members;
+
+	if (is_default)
+		def_charset = charset_name;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when the END or ENDBLOCK command needs to be parsed from within the ASSUMPTIONS block. Basically just checks
+|	to make sure the next token in the data file is a semicolon.
+*/
+void NxsAssumptionsBlock::HandleEndblock(
+  NxsToken &token)	/* the token used to read from in */
+{
+	// END / ENDBLOCK must be followed by a semicolon.
+	token.GetNextToken();
+	if (token.Equals(";"))
+		return;
+
+	errormsg = "Expecting ';' to terminate the END or ENDBLOCK command, but found ";
+	errormsg += token.GetToken();
+	errormsg += " instead";
+	throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads and stores information contained in the command EXSET within an ASSUMPTIONS block. If EXSET keyword is 
+|	followed by an asterisk, last read NxsCharactersBlock or NxsCharactersBlock-derived object is notified of the 
+|	characters to be excluded (its ApplyExset function is called).
+*/
+void NxsAssumptionsBlock::HandleExset(
+  NxsToken &token)	/* the token used to read from in */
+{
+	// An optional '*' in front of the name marks the set as the default exset.
+	token.GetNextToken();
+	const bool is_default = token.Equals("*");
+	if (is_default)
+		token.GetNextToken();
+
+	// The current token is the exset name.
+	NxsString exset_name = token.GetToken();
+
+	// The name must be followed by '='.
+	token.GetNextToken();
+	if (!token.Equals("="))
+	{
+		errormsg = "Expecting '=' in EXSET definition but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+	}
+
+	// Read the set members; the characters block supplies the total character count.
+	assert(charBlockPtr);
+	NxsCharactersBlock &charBlock = *charBlockPtr;
+	NxsUnsignedSet members;
+	int totalChars = charBlock.GetNCharTotal();
+	NxsSetReader(token, totalChars, members, charBlock, NxsSetReader::charset).Run();
+
+	exsets[exset_name] = members;
+
+	// A default exset is also applied to the characters block right away.
+	if (is_default)
+	{
+		def_exset = exset_name;
+		charBlock.ApplyExset(members);
+	}
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads and stores information contained in the command TAXSET within an ASSUMPTIONS block.
+*/
+void NxsAssumptionsBlock::HandleTaxset(
+  NxsToken &token)	/* the token used to read from in */
+{
+	// An optional '*' in front of the name marks the set as the default taxset.
+	token.GetNextToken();
+	const bool is_default = token.Equals("*");
+	if (is_default)
+		token.GetNextToken();
+
+	// The current token is the taxset name.
+	NxsString taxset_name = token.GetToken();
+
+	// The name must be followed by '='.
+	token.GetNextToken();
+	if (!token.Equals("="))
+	{
+		errormsg = "Expecting '=' in TAXSET definition but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+	}
+
+	// Read the set members; the taxa block supplies the number of taxon labels.
+	NxsUnsignedSet members;
+	int totalTaxa = taxa->GetNumTaxonLabels();
+	NxsSetReader(token, totalTaxa, members, *this, NxsSetReader::taxset).Run();
+
+	taxsets[taxset_name] = members;
+
+	if (is_default)
+		def_taxset = taxset_name;
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function provides the ability to read everything following the block name (which is read by the NxsReader 
+|	object) to the end or ENDBLOCK statement. Characters are read from the input stream in. Overrides the pure virtual
+|	function in the base class.
+*/
+void NxsAssumptionsBlock::Read(
+  NxsToken &token)	/* the token used to read from in */
+{
+	isEmpty = false;
+	isUserSupplied = true;
+
+	// The block name must be followed directly by a semicolon.
+	token.GetNextToken();
+	if (!token.Equals(";"))
+	{
+		errormsg = "Expecting ';' after ";
+		errormsg += id;
+		errormsg += " block name, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+	}
+
+	// One command per iteration, until END or ENDBLOCK closes the block.
+	bool block_closed = false;
+	while (!block_closed)
+	{
+		token.GetNextToken();
+
+		if (token.Equals("EXSET"))
+			HandleExset(token);
+		else if (token.Equals("TAXSET"))
+			HandleTaxset(token);
+		else if (token.Equals("CHARSET"))
+			HandleCharset(token);
+		else if (token.Equals("END") || token.Equals("ENDBLOCK"))
+		{
+			HandleEndblock(token);
+			block_closed = true;
+		}
+		else
+		{
+			// Unknown command: report it, then discard tokens up to its semicolon.
+			SkippingCommand(token.GetToken());
+			do
+			{
+				token.GetNextToken();
+			} while (!token.AtEOF() && !token.Equals(";"));
+
+			if (token.AtEOF())
+			{
+				errormsg = "Unexpected end of file encountered";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+		}
+	}
+}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns this object to an empty state in preparation for reading a new ASSUMPTIONS block: all stored exsets,
+|	taxsets and charsets are discarded, the three default-set names and `errormsg' are emptied, and the status flags
+|	are set to enabled, empty and not user-supplied. The `taxa' and `charBlockPtr' pointers are not touched here.
+*/
+void NxsAssumptionsBlock::Reset()
+	{
+	// Discard every stored set
+	//
+	exsets.clear();
+	taxsets.clear();
+	charsets.clear();
+
+	// Forget which sets were flagged as defaults
+	//
+	def_taxset.clear();
+	def_charset.clear();
+	def_exset.clear();
+
+	errormsg.clear();
+
+	isEnabled		= true;
+	isEmpty			= true;
+	isUserSupplied	= false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	File-local helper used by NxsAssumptionsBlock::Report. Writes the summary for one family of named sets to `out'.
+|	If `sets' is empty a single "No ... sets were defined" line is written. If it holds exactly one set, a heading and
+|	that set's name are written (no default marker is added in this case). Otherwise a heading with the number of sets
+|	is written followed by one line per set name, with " (default)" appended to the name equal to `defaultName'.
+*/
+static void ReportNamedSets(
+  ostream &out,						/* the output stream to which to write */
+  const NxsUnsignedSetMap &sets,	/* the sets to summarize, keyed by set name */
+  const NxsString &defaultName,		/* name of the default set of this family (may be empty) */
+  const char *kind)					/* word describing the family, e.g. "character", "taxon" or "exclusion" */
+	{
+	if (sets.empty())
+		{
+		out << "  No " << kind << " sets were defined" << endl;
+		return;
+		}
+
+	if (sets.size() == 1)
+		{
+		out << "  1 " << kind << " set defined:" << endl;
+		out << "    " << (*sets.begin()).first << endl;
+		return;
+		}
+
+	out << "  " << sets.size() << " " << kind << " sets defined:" << endl;
+	for (NxsUnsignedSetMap::const_iterator it = sets.begin(); it != sets.end(); ++it)
+		{
+		NxsString nm = (*it).first;
+		out << "    " << nm;
+		if (nm == defaultName)
+			out << " (default)";
+		out << endl;
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function outputs a brief report of the contents of this ASSUMPTIONS block: a heading naming the block,
+|	followed by summaries of the character sets, taxon sets and exclusion sets (in that order), each produced by
+|	ReportNamedSets. The report is preceded and followed by a blank line. Overrides the virtual function of the same
+|	name in the base class.
+*/
+void NxsAssumptionsBlock::Report(
+  ostream &out)	/* the output stream to which to write the report */
+	{
+	out << endl;
+	out << id << " block contains the following:" << endl;
+
+	ReportNamedSets(out, charsets, def_charset, "character");
+	ReportNamedSets(out, taxsets, def_taxset, "taxon");
+	ReportNamedSets(out, exsets, def_exset, "exclusion");
+
+	out << endl;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	A CHARACTERS, DATA, or ALLELES block can call this function to specify that it is to receive notification when the 
+|	current taxon or character set changes (e.g., an "EXSET *" command is read or a program requests that one of the 
+|	predefined taxon sets, character sets, or exsets be applied). Normally, a NxsCharactersBlock-derived object calls 
+|	this function upon entering its MATRIX command, since when that happens it becomes the primary data-containing block.
+|	This function itself only stores `p' in `charBlockPtr', replacing whatever pointer was stored there before; it does
+|	not delete the previous pointer and does not check `p' for NULL.
+*/
+void NxsAssumptionsBlock::SetCallback(
+  NxsCharactersBlock* p)	/* the object to be called in the event of a change in character status */
+	{
+	charBlockPtr = p;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Converts a taxon label to a number corresponding to the taxon's position within the list maintained by the 
+|	NxsTaxaBlock object. The value returned is one more than the value returned by the taxa block's FindTaxon function.
+|	This method overrides the virtual function of the same name in the NxsBlock base class. If FindTaxon reports that
+|	`s' is not a valid taxon label (by throwing NxsTaxaBlock::NxsX_NoSuchTaxon), returns the value 0.
+*/
+unsigned NxsAssumptionsBlock::TaxonLabelToNumber(
+  NxsString s)	/* the taxon label to convert */
+	{
+	try
+		{
+		return 1 + taxa->FindTaxon(s);
+		}
+	catch (const NxsTaxaBlock::NxsX_NoSuchTaxon &)
+		{
+		// Caught by reference so the exception object is not copied; an unknown label maps to 0
+		//
+		return 0;
+		}
+	}
diff --git a/ncl/nxsassumptionsblock.h b/ncl/nxsassumptionsblock.h
new file mode 100644
index 0000000..3351559
--- /dev/null
+++ b/ncl/nxsassumptionsblock.h
@@ -0,0 +1,89 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_ASSUMPTIONSBLOCK_H
+#define NCL_ASSUMPTIONSBLOCK_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This class handles reading and storage for the NxsReader block ASSUMPTIONS. It overrides the member functions Read, 
+|	Reset and Report, which are virtual functions in the base class NxsBlock, as well as TaxonLabelToNumber. The sets
+|	read from the block are kept in three maps keyed by set name (`charsets', `taxsets' and `exsets'); for each kind of
+|	set the name of the default set, if any, is kept in `def_charset', `def_taxset' or `def_exset'. Adding a new data
+|	member? Don't forget to:
+|~
+|	o Describe it in the class declaration using a C-style comment.
+|	o Initialize it (unless it is self-initializing) in the constructor and re-initialize it in the Reset function.
+|	o Describe the initial state in the constructor documentation.
+|	o Delete memory allocated to it in both the destructor and Reset function.
+|	o Report it in some way in the Report function.
+|~
+*/
+class NxsAssumptionsBlock
+  : public NxsBlock
+	{
+	public:
+							NxsAssumptionsBlock(NxsTaxaBlock *t);
+		virtual				~NxsAssumptionsBlock();
+
+		void				ReplaceTaxaBlockPtr(NxsTaxaBlock *tb);
+		void				SetCallback(NxsCharactersBlock *p);
+
+		int					GetNumCharSets();
+		void				GetCharSetNames(NxsStringVector &names);
+		NxsUnsignedSet		&GetCharSet(NxsString nm);
+		NxsString			GetDefCharSetName();
+
+		int					GetNumTaxSets();
+		void				GetTaxSetNames(NxsStringVector &names);
+		NxsUnsignedSet		&GetTaxSet(NxsString nm);
+		NxsString			GetDefTaxSetName();
+
+		int					GetNumExSets();
+		void				GetExSetNames(NxsStringVector &names);
+		NxsUnsignedSet		&GetExSet(NxsString nm);
+		NxsString			GetDefExSetName();
+		void				ApplyExSet(NxsString nm);
+
+		virtual void		Report(std::ostream& out);
+		virtual void		Reset();
+
+	private:
+		NxsTaxaBlock		*taxa;				/* pointer to the NxsTaxaBlock object consulted for taxon labels and the number of taxa */
+		NxsCharactersBlock	*charBlockPtr;		/* pointer to the NxsCharactersBlock-derived object registered through SetCallback, to be notified in the event of exset changes */
+
+	protected:
+		NxsUnsignedSetMap	charsets;			/* the charsets stored so far, keyed by charset name */
+		NxsUnsignedSetMap	taxsets;			/* the taxsets stored so far, keyed by taxset name */
+		NxsUnsignedSetMap	exsets;				/* the exsets stored so far, keyed by exset name */
+
+		NxsString			def_charset;		/* the name of the default charset (empty if none) */
+		NxsString			def_taxset;			/* the name of the default taxset, i.e. the one whose name was preceded by '*' in its TAXSET command (empty if none) */
+		NxsString			def_exset;			/* the name of the default exset (empty if none) */
+
+	protected:
+		void				HandleCharset(NxsToken& token);
+		void				HandleEndblock(NxsToken& token);
+		void				HandleExset(NxsToken& token);
+		void				HandleTaxset(NxsToken& token);
+		virtual void		Read(NxsToken& token);
+		virtual unsigned	TaxonLabelToNumber(NxsString s);
+	};
+
+typedef NxsAssumptionsBlock AssumptionsBlock;	// alias kept so code using the older class name AssumptionsBlock still compiles
+
+#endif
+
diff --git a/ncl/nxsblock.cpp b/ncl/nxsblock.cpp
new file mode 100644
index 0000000..304d32b
--- /dev/null
+++ b/ncl/nxsblock.cpp
@@ -0,0 +1,193 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Default constructor. Leaves the object in its empty state: the `nexus' and `next' pointers are NULL, `isEmpty' and
+|	`isEnabled' are true, `isUserSupplied' is false, and the `id' and `errormsg' strings are empty. Members are listed
+|	in the order in which they are declared in the class.
+*/
+NxsBlock::NxsBlock()
+  : errormsg(),
+	isEmpty(true),
+	isEnabled(true),
+	isUserSupplied(false),
+	nexus(NULL),
+	next(NULL),
+	id()
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Destructor. The body is empty: nothing is released here, and in particular the objects pointed to by `next' and
+|	`nexus' are not deleted by this function.
+*/
+NxsBlock::~NxsBlock()
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This base class version ignores its argument and simply returns 0, but a derived class should override this
+|	function if it needs to construct and run a NxsSetReader object to read a set involving characters. The
+|	NxsSetReader object may need to use this function to look up a character label encountered in the set. A class that
+|	overrides this method should return the character index in the range [1..nchar]. The pragma, compiled only when
+|	HAVE_PRAGMA_UNUSED is defined, marks the parameter as intentionally unused.
+*/
+unsigned NxsBlock::CharLabelToNumber(
+  NxsString s)	/* the character label to be translated to the character's number (not used by this version) */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(s)
+#	endif
+	return 0;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets the value of isEnabled to false; no other data member is changed. A NxsBlock can be disabled (by calling this
+|	method) if blocks of that type are to be skipped during execution of the NEXUS file. If a disabled block is
+|	encountered, the virtual NxsReader::SkippingDisabledBlock function is called, giving your application the
+|	opportunity to inform the user that a block was skipped. Use Enable to reverse the effect.
+*/
+void NxsBlock::Disable()
+	{
+	isEnabled = false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets the value of isEnabled to true; no other data member is changed. This reverses the effect of Disable. A
+|	NxsBlock can be disabled (by calling Disable) if blocks of that type are to be skipped during execution of the
+|	NEXUS file. If a disabled block is encountered, the virtual NxsReader::SkippingDisabledBlock function is called,
+|	giving your application the opportunity to inform the user that a block was skipped.
+*/
+void NxsBlock::Enable()
+	{
+	isEnabled = true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the current value of isEnabled (true after construction), which can be controlled through use of the Enable
+|	and Disable member functions. A NxsBlock should be disabled if blocks of that type are to be skipped during
+|	execution of the NEXUS file. If a disabled block is encountered, the virtual NxsReader::SkippingDisabledBlock
+|	function is called, giving your application the opportunity to inform the user that a block was skipped.
+*/
+bool NxsBlock::IsEnabled()
+	{
+	return isEnabled;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the current value of isUserSupplied (false after construction). The flag is meant to be true if and only if
+|	this block's Read function has been called to process a block of this type appearing in a data file; it is up to the
+|	derived class's Read and Reset functions to maintain it. This is useful because in some cases, a block object may be
+|	created internally (e.g. a NxsTaxaBlock may be populated using taxon names provided in a DATA block), and such
+|	blocks do not require permission from the user to delete data stored therein.
+*/
+bool NxsBlock::IsUserSupplied()
+	{
+	return isUserSupplied;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if the block currently holds no data, i.e. the Read function has not been called since the last Reset.
+|	This base class version simply returns the value of the data member isEmpty (true after construction). If you
+|	derive a new block class from NxsBlock, be sure to set isEmpty to true in your Reset function and isEmpty to false
+|	in your Read function.
+*/
+bool NxsBlock::IsEmpty()
+	{
+	return isEmpty;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns a copy of the `id' NxsString, which holds the name of the block (e.g. "DATA", "TREES"). The string is empty
+|	until a derived class assigns a name to `id'.
+*/
+NxsString NxsBlock::GetID()
+	{
+	return id;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This virtual function should be overridden by each derived class to provide the ability to read everything
+|	following the block name (which is read by the NxsReader object) to the end or endblock statement. Input is obtained
+|	through the NxsToken object `token'. This base class version does nothing and ignores `token'. Note that to get
+|	output comments displayed, you must derive a class from NxsToken, override the member function OutputComment to
+|	display a supplied comment, and then pass a reference to an object of the derived class to this function. The
+|	pragma, compiled only when HAVE_PRAGMA_UNUSED is defined, marks the parameter as intentionally unused.
+*/
+void NxsBlock::Read(
+  NxsToken &token)	/* the NxsToken to use for reading block (not used by this version) */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(token)
+#	endif
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This virtual function should be overridden for each derived class to completely reset the block object in 
+|	preparation for reading in another block of this type. This function is called by the NxsReader object just prior to
+|	calling the block object's Read function. This base class version does nothing; in particular it does not change
+|	isEmpty, isEnabled or isUserSupplied.
+*/
+void NxsBlock::Reset()
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This virtual function is meant to provide a brief report of the contents of the block. This base class version
+|	writes nothing to `out'; derived classes override it to describe what they store. The pragma, compiled only when
+|	HAVE_PRAGMA_UNUSED is defined, marks the parameter as intentionally unused.
+*/
+void NxsBlock::Report(
+  ostream &out)	/* the output stream to which the report is sent (not used by this version) */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(out)
+#	endif
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets the nexus data member of the NxsBlock object to 'nxsptr', replacing whatever pointer was stored before. The
+|	pointer is stored as given: it is not checked for NULL and the previously stored pointer is not deleted.
+*/
+void NxsBlock::SetNexus(
+  NxsReader *nxsptr)	/* pointer to a NxsReader object */
+	{
+	nexus = nxsptr;
+	}
+ 
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function is called when an unknown command named commandName is about to be skipped. This version of the 
+|	function does nothing (i.e., no warning is issued that a command was unrecognized) and ignores commandName.
+|	Override this virtual function in a derived class to provide such warnings to the user. The pragma, compiled only
+|	when HAVE_PRAGMA_UNUSED is defined, marks the parameter as intentionally unused.
+*/
+void NxsBlock::SkippingCommand(
+  NxsString commandName)	/* the name of the command being skipped (not used by this version) */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(commandName)
+#	endif
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This base class version ignores its argument and simply returns 0, but a derived class should override this
+|	function if it needs to construct and run a NxsSetReader object to read a set involving taxa. The NxsSetReader
+|	object may need to use this function to look up a taxon label encountered in the set. A class that overrides this
+|	method should return the taxon index in the range [1..ntax]. The pragma, compiled only when HAVE_PRAGMA_UNUSED is
+|	defined, marks the parameter as intentionally unused.
+*/
+unsigned NxsBlock::TaxonLabelToNumber(
+  NxsString s)	/* the taxon label to be translated to a taxon number (not used by this version) */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(s)
+#	endif
+	return 0;
+	}
+
diff --git a/ncl/nxsblock.h b/ncl/nxsblock.h
new file mode 100644
index 0000000..ff05a85
--- /dev/null
+++ b/ncl/nxsblock.h
@@ -0,0 +1,74 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+#ifndef NCL_NXSBLOCK_H
+#define NCL_NXSBLOCK_H
+
+class NxsReader;
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This is the base class from which all block classes are derived. A NxsBlock-derived class encapsulates a Nexus block
+|	(e.g. DATA block, TREES block, etc.). The virtual function Read should be overridden for each derived class 
+|	to provide the ability to read everything following the block name (which is read by the NxsReader object) to the 
+|	end or endblock statement. Derived classes must provide their own data storage and access functions. The virtual
+|	function Report should be overridden to provide some feedback to user on contents of block. The virtual function
+|	Reset should be overridden to empty the block of all its contents, restoring it to its just-constructed state.
+|	Note that Read, Report and Reset are declared below as ordinary virtual functions rather than pure virtual ones, so
+|	the compiler does not force a derived class to override them. Read is protected; NxsReader is declared a friend of
+|	this class.
+*/
+class NxsBlock
+	{
+	friend class NxsReader;
+
+	public:
+							NxsBlock();
+		virtual				~NxsBlock();
+
+		void				SetNexus(NxsReader *nxsptr);
+
+		NxsString			GetID();
+		bool				IsEmpty();
+
+		void				Enable();
+		void				Disable();
+		bool				IsEnabled();
+		bool				IsUserSupplied();
+
+		virtual unsigned	CharLabelToNumber(NxsString s);
+		virtual unsigned	TaxonLabelToNumber(NxsString s);
+
+		virtual void		SkippingCommand(NxsString commandName);
+
+		virtual void		Report(std::ostream &out);
+		virtual void		Reset();
+
+		NxsString			errormsg;			/* workspace for creating error messages */
+
+	protected:
+		bool				isEmpty;			/* true if this object is currently NOT storing data (i.e. Read has not been called since the last Reset) */
+		bool				isEnabled;			/* true if this block is currently enabled */
+		bool				isUserSupplied;		/* true if this object has been read from a file; false otherwise */
+		NxsReader			*nexus;				/* pointer to the Nexus file reader object */
+		NxsBlock			*next;				/* pointer to next block in list */
+		NxsString			id;					/* holds name of block (e.g., "DATA", "TREES", etc.) */
+				
+		virtual void		Read(NxsToken &token);
+	};
+
+#endif
+
+
diff --git a/ncl/nxscharactersblock.cpp b/ncl/nxscharactersblock.cpp
new file mode 100644
index 0000000..180d5c3
--- /dev/null
+++ b/ncl/nxscharactersblock.cpp
@@ -0,0 +1,2951 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+#include <functional>
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Constructs a CHARACTERS block object. Stores `tb' in `taxa' and `ab' in `assumptionsBlock', sets `id' to
+|	"CHARACTERS", sets the array pointers `matrix', `charPos', `taxonPos', `activeTaxon', `activeChar' and `symbols' to
+|	NULL, and then calls Reset to establish the initial state of every remaining data member. Asserts that `tb' and
+|	`ab' are both non-NULL.
+*/
+NxsCharactersBlock::NxsCharactersBlock(
+  NxsTaxaBlock *tb,			/* the taxa block object to consult for taxon labels */
+  NxsAssumptionsBlock *ab)	/* the assumptions block object to consult for exclusion sets */
+  : NxsBlock()
+	{
+	assert(tb != NULL);
+	assert(ab != NULL);
+
+	// Null out every owned pointer first, so that the Reset call below has nothing to delete
+	//
+	symbols				= NULL;
+	matrix				= NULL;
+	charPos				= NULL;
+	taxonPos			= NULL;
+	activeChar			= NULL;
+	activeTaxon			= NULL;
+
+	id					= "CHARACTERS";
+	taxa				= tb;
+	assumptionsBlock	= ab;
+
+	Reset();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Destructor. Calls Reset and afterwards releases whatever array `symbols' points to at that moment.
+*/
+NxsCharactersBlock::~NxsCharactersBlock()
+	{
+	Reset();
+
+	// Applying delete [] to a NULL pointer has no effect, so no explicit NULL test is needed
+	//
+	delete [] symbols;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Deletes (i.e., excludes from further analyses) taxa whose indices are contained in the set `delset'. The taxon 
+|	indices refer to original taxon indices, not current indices (originals will equal current ones if number of taxa 
+|	in TAXA block equals number of taxa in MATRIX command). Indices that are not less than `ntaxTotal', and indices of
+|	taxa for which no data row exists, are ignored. Returns the number of taxa actually deleted (some may have already
+|	been deleted).
+*/
+unsigned NxsCharactersBlock::ApplyDelset(
+  NxsUnsignedSet &delset)	/* set of taxon indices to delete in range [0..`ntaxTotal') */
+	{
+	assert(activeTaxon != NULL);
+	assert(taxonPos != NULL);
+
+	unsigned num_deleted = 0;
+
+	for (NxsUnsignedSet::const_iterator i = delset.begin(); i != delset.end(); ++i)
+		{
+		// taxonPos is indexed by original taxon index in [0..ntaxTotal); skip anything outside
+		// that range rather than reading past the end of the array
+		//
+		if (*i >= ntaxTotal)
+			continue;
+
+		// k equal to UINT_MAX means no data was supplied for this taxon, so there is nothing to delete
+		//
+		const unsigned k = taxonPos[*i];
+		if (k == UINT_MAX)
+			continue;
+
+		if (activeTaxon[k])
+			num_deleted++;
+		activeTaxon[k] = false;
+		}
+
+	return num_deleted;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Excludes characters whose indices are contained in the set `exset'. The indices supplied should refer to the 
+|	original character indices, not current character indices. Indices that are not less than `ncharTotal', and
+|	indices of characters that were eliminated, are ignored. Returns number of characters actually excluded (some may
+|	have already been excluded).
+*/
+unsigned NxsCharactersBlock::ApplyExset(
+  NxsUnsignedSet &exset)	/* set of character indices to exclude in range [0..`ncharTotal') */
+	{
+	assert(activeChar != NULL);
+	assert(charPos != NULL);
+
+	unsigned num_excluded = 0;
+
+	for (NxsUnsignedSet::const_iterator i = exset.begin(); i != exset.end(); ++i)
+		{
+		// charPos holds ncharTotal entries; skip anything outside that range rather than
+		// reading past the end of the array
+		//
+		if (*i >= ncharTotal)
+			continue;
+
+		// k equal to UINT_MAX means the character was eliminated, so there is nothing to exclude
+		//
+		const unsigned k = charPos[*i];
+		if (k == UINT_MAX)
+			continue;
+
+		if (activeChar[k])
+			num_excluded++;
+		activeChar[k] = false;
+		}
+
+	return num_excluded;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Includes characters whose indices are contained in the set `inset'. The indices supplied should refer to the 
+|	original character indices, not current character indices. Indices that are not less than `ncharTotal', and
+|	indices of characters that were eliminated, are ignored. Returns the number of characters actually included (some
+|	may have already been included).
+*/
+unsigned NxsCharactersBlock::ApplyIncludeset(
+  NxsUnsignedSet &inset)	/* set of character indices to include in range [0..`ncharTotal') */
+	{
+	assert(activeChar != NULL);
+	assert(charPos != NULL);
+
+	unsigned num_included = 0;
+
+	for (NxsUnsignedSet::const_iterator i = inset.begin(); i != inset.end(); ++i)
+		{
+		// charPos holds ncharTotal entries; skip anything outside that range rather than
+		// reading past the end of the array
+		//
+		if (*i >= ncharTotal)
+			continue;
+
+		// k equal to UINT_MAX means the character was eliminated, so it cannot be included
+		//
+		const unsigned k = charPos[*i];
+		if (k == UINT_MAX)
+			continue;
+
+		if (!activeChar[k])
+			num_included++;
+		activeChar[k] = true;
+		}
+
+	return num_included;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Restores (i.e., includes in further analyses) taxa whose indices are contained in the set `restoreset'. The taxon 
+|	indices refer to original taxon indices, not current indices (originals will equal current ones if number of taxa 
+|	in TAXA block equals number of taxa in MATRIX command). Indices that are not less than `ntaxTotal', and indices of
+|	taxa for which no data row exists, are ignored. Returns the number of taxa actually restored (some may have already
+|	been active).
+*/
+unsigned NxsCharactersBlock::ApplyRestoreset(
+  NxsUnsignedSet &restoreset)	/* set of taxon indices to restore in range [0..`ntaxTotal') */
+	{
+	assert(activeTaxon != NULL);
+	assert(taxonPos != NULL);
+
+	unsigned num_restored = 0;
+
+	for (NxsUnsignedSet::const_iterator i = restoreset.begin(); i != restoreset.end(); ++i)
+		{
+		// taxonPos is indexed by original taxon index in [0..ntaxTotal); skip anything outside
+		// that range rather than reading past the end of the array
+		//
+		if (*i >= ntaxTotal)
+			continue;
+
+		// k equal to UINT_MAX means no data was supplied for this taxon, so there is nothing to restore
+		//
+		const unsigned k = taxonPos[*i];
+		if (k == UINT_MAX)
+			continue;
+
+		if (!activeTaxon[k])
+			num_restored++;
+		activeTaxon[k] = true;
+		}
+
+	return num_restored;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Allocates the `charPos' array (one entry per original character, `ncharTotal' in all) and fills it in. Entry j holds
+|	the position assigned to original character j, positions being handed out consecutively starting at 0. When
+|	`check_eliminated' is true, characters for which IsEliminated returns true receive UINT_MAX instead and do not use
+|	up a position. Asserts that `charPos' is NULL on entry. This function is called by HandleEliminate in response to
+|	encountering an ELIMINATE command in the data file, and this is probably the only place where BuildCharPosArray
+|	should be called with `check_eliminated' true. BuildCharPosArray is also called in HandleMatrix,
+|	HandleCharstatelabels, HandleStatelabels, and HandleCharlabels.
+*/
+void NxsCharactersBlock::BuildCharPosArray(
+  bool check_eliminated)	/* if true, eliminated set has something in it and should be consulted (default is false) */
+	{
+	assert(charPos == NULL);
+
+	charPos = new unsigned[ncharTotal];
+
+	unsigned nextPos = 0;
+	for (unsigned origIndex = 0; origIndex < ncharTotal; ++origIndex)
+		{
+		// IsEliminated is consulted only when the caller asked for it
+		//
+		const bool skipped = check_eliminated && IsEliminated(origIndex);
+		charPos[origIndex] = skipped ? UINT_MAX : nextPos++;
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Looks up `s' in `charLabels' and returns the 1-offset position of the first label equal to it. Returns 0 if no
+|	label in `charLabels' equals `s'. This method overrides the virtual function of the same name in the NxsBlock base
+|	class.
+*/
+unsigned NxsCharactersBlock::CharLabelToNumber(
+  NxsString s)	/* the character label to convert */
+	{
+	NxsStringVector::const_iterator pos = find(charLabels.begin(), charLabels.end(), s);
+
+	// Not found: 0 is never a valid 1-offset character number
+	//
+	if (pos == charLabels.end())
+		return 0;
+
+	return 1 + static_cast<unsigned>(pos - charLabels.begin());
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Transfers all data from `other' to this object, leaving `other' completely empty. Used to convert a NxsDataBlock 
+|	object to a NxsCharactersBlock object in programs where it is desirable to just have a NxsCharactersBlock for 
+|	storage but also allow users to enter the information in the form of the deprecated NxsDataBlock. This function 
+|	does not make a copy of such things as the data matrix, instead just transferring the pointer to that object from 
+|	other to this. This is why it was named Consume rather than CopyFrom. The arrays and matrix previously owned by this
+|	object are deleted before the pointers from `other' are adopted. The containers `equates', `eliminated',
+|	`charLabels' and `charStates' are copied from `other' and then emptied in `other'. After the transfer, this object
+|	is flagged as non-empty and as formerly a data block, `isUserSupplied' is taken from `other', and `other' is Reset.
+*/
+void NxsCharactersBlock::Consume(
+  NxsCharactersBlock &other)	/* NxsCharactersBlock object from which to copy */
+	{
+	ntax				= other.ntax;
+	ntaxTotal			= other.ntaxTotal;
+	nchar				= other.nchar;
+	ncharTotal			= other.ncharTotal;
+
+	newtaxa				= other.newtaxa;
+	newchar				= other.newchar;
+
+	formerly_datablock	= true;
+	respectingCase		= other.respectingCase;
+	transposing			= other.transposing;
+	interleaving		= other.interleaving;
+	tokens				= other.tokens;
+	labels				= other.labels;
+
+	missing				= other.missing;
+	gap					= other.gap;
+	matchchar			= other.matchchar;
+
+	datatype			= other.datatype;
+
+	// Take over the dynamically allocated storage. Each pointer in `other' is set to NULL after being
+	// adopted so that the other.Reset() call below cannot free memory this object now owns. Deleting a
+	// NULL pointer has no effect, so no NULL tests are needed.
+	//
+	delete [] symbols;
+	symbols				= other.symbols;
+	other.symbols		= NULL;
+
+	delete [] charPos;
+	charPos				= other.charPos;
+	other.charPos		= NULL;
+
+	delete [] taxonPos;
+	taxonPos			= other.taxonPos;
+	other.taxonPos		= NULL;
+
+	delete [] activeChar;
+	activeChar			= other.activeChar;
+	other.activeChar	= NULL;
+
+	delete [] activeTaxon;
+	activeTaxon			= other.activeTaxon;
+	other.activeTaxon	= NULL;
+
+	delete matrix;
+	matrix				= other.matrix;
+	other.matrix		= NULL;
+
+	// Copy the containers from `other'. The previous code emptied this object's `eliminated',
+	// `charLabels' and `charStates' and then tested their own (now zero) size, so the contents of
+	// `other' were never copied for those three containers.
+	//
+	equates				= other.equates;
+	other.equates.clear();
+
+	eliminated			= other.eliminated;
+	other.eliminated.clear();
+
+	charLabels			= other.charLabels;
+	other.charLabels.clear();
+
+	charStates			= other.charStates;
+	other.charStates.clear();
+
+	isEmpty = false;
+	isUserSupplied = other.isUserSupplied;
+	other.Reset();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Dumps the contents of the `matrix' variable to `out', one line per taxon, so that one can check whether data were 
+|	read as expected. Each line consists of `marginText' (skipped if NULL, giving flush-left output; pass "\t" to 
+|	indent with a tab), the taxon label padded with spaces to the longest label length plus 5, and then the output of 
+|	ShowStateLabels for every character whose `charPos' entry is not UINT_MAX. Taxa whose `taxonPos' entry is UINT_MAX 
+|	produce no line at all. Asserts that `charPos' and `taxonPos' are non-NULL.
+*/
+void NxsCharactersBlock::DebugShowMatrix(
+  ostream &out,			/* output stream on which to print matrix */
+  bool use_matchchar,	/* if true, matchchar symbol used; otherwise, states shown for all taxa */
+  const char *marginText)		/* for printing first on each line */
+	{
+	assert(charPos != NULL);
+	assert(taxonPos != NULL);
+
+	const unsigned labelColumnWidth = taxa->GetMaxTaxonLabelLength();
+
+	// Index of the first taxon actually printed; handed to ShowStateLabels when use_matchchar is true
+	//
+	unsigned firstShownTaxon = UINT_MAX;
+
+	for (unsigned taxInd = 0; taxInd < ntaxTotal; ++taxInd)
+		{
+		// A taxonPos cell equal to UINT_MAX means there is no row of the data matrix for this taxon
+		//
+		if (taxonPos[taxInd] == UINT_MAX)
+			continue;
+
+		if (firstShownTaxon == UINT_MAX)
+			firstShownTaxon = taxInd;
+
+		if (marginText != NULL)
+			out << marginText;
+
+		NxsString currTaxonLabel = taxa->GetTaxonLabel(taxonPos[taxInd]);
+		out << currTaxonLabel;
+
+		// Pad with spaces so that the left edge of the matrix lines up for every taxon
+		//
+		const unsigned labelLen = currTaxonLabel.size();
+		const unsigned numSpaces = (labelColumnWidth - labelLen) + 5;
+		for (unsigned pad = 0; pad < numSpaces; ++pad)
+			out << ' ';
+
+		const unsigned matchRow = (use_matchchar ? firstShownTaxon : UINT_MAX);
+		for (unsigned origChar = 0; origChar < ncharTotal; ++origChar)
+			{
+			const unsigned col = charPos[origChar];
+			if (col != UINT_MAX)
+				ShowStateLabels(out, taxInd, col, matchRow);
+			}
+
+		out << endl;
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the largest value reported by GetObsNumStates over the characters [0..`nchar'), but never less than 2 
+|	(the running maximum starts at 2). GetObsNumStates is called once per character on every call, so if this value 
+|	is needed often it is advisable to compute it once and keep the result.
+*/
+unsigned NxsCharactersBlock::GetMaxObsNumStates()
+	{
+	unsigned largest = 2;
+	for (unsigned c = 0; c < nchar; ++c)
+		{
+		const unsigned observed = GetObsNumStates(c);
+		if (observed > largest)
+			largest = observed;
+		}
+
+	return largest;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Counts the entries in the first `nchar' cells of the `activeChar' array that are true and returns that count.
+*/
+unsigned NxsCharactersBlock::GetNumActiveChar()
+	{
+	unsigned activeCount = 0;
+	for (unsigned c = 0; c < nchar; ++c)
+		{
+		if (activeChar[c])
+			++activeCount;
+		}
+
+	return activeCount;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Counts the entries in the first `ntax' cells of the `activeTaxon' array that are true and returns that count.
+*/
+unsigned NxsCharactersBlock::GetNumActiveTaxa()
+	{
+	unsigned activeCount = 0;
+	for (unsigned t = 0; t < ntax; ++t)
+		{
+		if (activeTaxon[t])
+			++activeCount;
+		}
+
+	return activeCount;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the original character index in the range [0..`ncharTotal') of the character whose current index is `j'. 
+|	Will be equal to `j' unless some characters were eliminated. Asserts that `charPos' has been built and that a 
+|	matching original character exists.
+*/
+unsigned NxsCharactersBlock::GetOrigCharIndex(
+  unsigned j)	/* the character in range [0..`nchar') */
+	{
+	assert(charPos != NULL);
+
+	// charPos[k] holds the current index of original character k, or UINT_MAX if character k was eliminated. 
+	// Since UINT_MAX is never less than j, a `<' comparison would stop on an eliminated character and return 
+	// its index; searching for the entry that equals j skips eliminated characters correctly. The search can 
+	// start at j because elimination can only make the current index smaller than the original one.
+	//
+	unsigned k = j;
+	while (k < ncharTotal && charPos[k] != j)
+		k++;
+
+	assert(k < ncharTotal);
+	return k;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the original taxon index in the range [0..`ntaxTotal'). Starting at `i', scans `taxonPos' forward and 
+|	returns the first index whose entry is not less than `i'; this equals `i' unless data was not provided for some 
+|	taxa listed in a preceding TAXA block. Asserts that `taxonPos' is non-NULL and that such an index was found.
+*/
+unsigned NxsCharactersBlock::GetOrigTaxonIndex(
+  unsigned i)	/* the taxon in range [0..`ntax') */
+	{
+	assert(taxonPos != NULL);
+
+	unsigned orig = i;
+	for (; orig < ntaxTotal; ++orig)
+		{
+		if (taxonPos[orig] >= i)
+			break;
+		}
+
+	assert(orig < ntaxTotal);
+	return orig;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Looks up the label stored in `charStates' for state `j' of character `i'. If `charStates' has no entry for 
+|	character `i', or that entry holds `j' or fewer labels, a string containing a single blank (i.e., " ") is 
+|	returned instead.
+*/
+NxsString NxsCharactersBlock::GetStateLabel(
+  unsigned i,	/* the locus in range [0..`nchar') */
+  unsigned j)	/* the 0-offset index of the state of interest */
+	{
+	NxsString label = " ";
+
+	NxsStringVectorMap::const_iterator entry = charStates.find(i);
+	if (entry == charStates.end())
+		return label;
+
+	if (j < entry->second.size())
+		label = entry->second[j];
+
+	return label;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if `origCharIndex' is a member of the `eliminated' set, false otherwise (including when the set is 
+|	empty).
+*/
+bool NxsCharactersBlock::IsEliminated(
+  unsigned origCharIndex)	/* the character in question */
+	{
+	// Note: do not be tempted to answer this question by consulting charPos instead of the eliminated set; 
+	// this function is used in setting up charPos in the first place!
+
+	if (eliminated.empty())
+		return false;
+
+	return eliminated.find(origCharIndex) != eliminated.end();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if `ch' can be found in the `symbols' array. The value of `respectingCase' is used to determine 
+|	whether or not the search should be case sensitive: if `respectingCase' is false, both `ch' and each symbol are 
+|	converted with toupper before being compared. Assumes `symbols' is a non-NULL, NUL-terminated string.
+*/
+bool NxsCharactersBlock::IsInSymbols(
+  char ch)	/* the symbol character to search for */
+	{
+	assert(symbols != NULL);
+
+	// toupper has undefined behavior when handed a negative value other than EOF, which a plain char can be, 
+	// so every character is converted to unsigned char first. The converted form of ch does not depend on the 
+	// loop, so it is computed once here.
+	//
+	const char char_in_question = (respectingCase ? ch : (char)toupper((unsigned char)ch));
+
+	for (const char *p = symbols; *p != '\0'; p++)
+		{
+		const char char_in_symbols = (respectingCase ? *p : (char)toupper((unsigned char)*p));
+		if (char_in_symbols == char_in_question)
+			return true;
+		}
+
+	return false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when CHARLABELS command needs to be parsed from within the CHARACTERS block. Deals with everything after 
+|	the token CHARLABELS up to and including the semicolon that terminates the CHARLABELS command. Any labels stored 
+|	previously are discarded. Labels belonging to characters for which IsEliminated returns true are read but not 
+|	stored. Throws NxsException if more than `ncharTotal' labels are supplied. Sets `newchar' to false on success.
+*/
+void NxsCharactersBlock::HandleCharlabels(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	unsigned labelCount = 0;
+	charLabels.clear();
+
+	if (charPos == NULL)
+		BuildCharPosArray();
+
+	// Every token up to the terminating ';' is taken to be a character label (an isolated '_' character is 
+	// converted automatically by token.GetNextToken() into a space, which is then stored as the label)
+	//
+	for (token.GetNextToken(); !token.Equals(";"); token.GetNextToken())
+		{
+		++labelCount;
+
+		// Refuse to read in more character labels than there are characters
+		//
+		if (labelCount > ncharTotal)
+			{
+			errormsg = "Number of character labels exceeds NCHAR specified in DIMENSIONS command";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		// labelCount is 1-offset, whereas IsEliminated expects a 0-offset original character index
+		//
+		const unsigned origIndex = labelCount - 1;
+		if (!IsEliminated(origIndex))
+			charLabels.push_back(token.GetToken());
+		}
+
+	newchar = false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when CHARSTATELABELS command needs to be parsed from within the CHARACTERS block. Deals with everything 
+|	after the token CHARSTATELABELS up to and including the semicolon that terminates the CHARSTATELABELS command. 
+|	Resulting `charLabels' vector will store labels only for characters that have not been eliminated, and likewise for 
+|	`charStates'. Specifically, `charStates[0]' refers to the vector of character state labels for the first 
+|	non-eliminated character.
+|
+|	The token sequence accepted for each character is: the 1-offset character number, the character label, and 
+|	optionally a slash followed by any number of state labels. Entries for different characters are separated by 
+|	commas. Character numbers must be strictly increasing; for every number that is skipped, a label consisting of 
+|	a single blank is stored (unless that character was eliminated). Any previously stored character labels and 
+|	state labels are discarded first. Throws NxsException if a character number is out of range, not increasing, 
+|	or not interpretable as an integer, or if something other than a slash, comma or semicolon follows a character 
+|	label. Sets `newchar' to false on success.
+*/
+void NxsCharactersBlock::HandleCharstatelabels(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	unsigned currChar = 0;	// 1-offset number of the last character handled so far (0 means none yet)
+	bool semicolonFoundInInnerLoop = false;	// set when ';' ends a list of state labels, so the outer loop must stop too
+	bool tokenAlreadyRead = false;	// true if the token to examine at the top of the outer loop was already fetched
+	bool save = true;	// false while reading an eliminated character, whose labels are read but discarded
+
+	charStates.clear();
+	charLabels.clear();
+
+	if (charPos == NULL)
+		BuildCharPosArray();
+
+	for (;;)
+		{
+		save = true;
+
+		if (semicolonFoundInInnerLoop)
+			break;
+
+		if (tokenAlreadyRead)
+			tokenAlreadyRead = false;
+		else
+			token.GetNextToken();
+
+		if (token.Equals(";"))
+			break;
+
+		// Token should be the character number; create a new association
+		// (atoi yields 0 for a token that is not a number, which the range check below rejects)
+		int n = atoi(token.GetToken().c_str());
+
+		if (n < 1 || n > (int)ncharTotal || n <= (int)currChar)
+			{
+			errormsg = "Invalid character number (";
+			errormsg += token.GetToken();
+			errormsg += ") found in CHARSTATELABELS command (either out of range or not interpretable as an integer)";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		// If n is not the next character after currChar, need to add some dummy
+		// labels to charLabels list (n > currChar is guaranteed by the check above)
+		//
+		while (n - currChar > 1) 
+			{
+			currChar++;
+			if (!IsEliminated(currChar - 1))
+				charLabels.push_back(" ");
+			}
+
+		// If n refers to a character that has been eliminated, go through the motions of
+		// reading in the information but don't actually save any of it
+		//
+		currChar++;
+		assert(n == (int)currChar);
+		if (IsEliminated(currChar-1))
+			save = false;
+
+		token.GetNextToken();
+
+		// Token should be the character label
+		//
+		if (save) 
+			charLabels.push_back(token.GetToken());
+
+		token.GetNextToken();
+
+		// Token should be a slash character if state labels were provided for this character; otherwise, 
+		// token should be one of the following:
+		// 1) the comma separating information for different characters, in which case we read in the 
+		//    next token (which should be the next character number)
+		// 2) the semicolon indicating the end of the command
+		// In both cases the token left in `token' is examined at the top of the outer loop.
+		if (!token.Equals("/"))
+			{
+			if (!token.Equals(",") && !token.Equals(";"))
+				{
+				errormsg = "Expecting a comma or semicolon here, but found (";
+				errormsg += token.GetToken();
+				errormsg += ") instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			if (token.Equals(","))
+				token.GetNextToken();
+			tokenAlreadyRead = true;
+			continue;
+			}
+
+		// Now create a new association for the character states list
+
+		for (;;)
+			{
+			token.GetNextToken();
+
+			if (token.Equals(";"))
+				{
+				semicolonFoundInInnerLoop = true;
+				break;
+				}
+
+			if (token.Equals(","))
+				{
+				break;
+				}
+
+			if (save)
+				{
+				// Token should be a character state label; add it to the list kept for this character,
+				// which is keyed by the value GetCharPos returns for the 0-offset original index
+				NxsString cslabel = token.GetToken();
+				unsigned k = GetCharPos(n - 1);
+				charStates[k].push_back(cslabel);
+				}
+
+			} // inner for (;;) loop (grabbing state labels for character n)
+		} // outer for (;;) loop
+
+	newchar = false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when DIMENSIONS command needs to be parsed from within the CHARACTERS block. Deals with everything after 
+|	the token DIMENSIONS up to and including the semicolon that terminates the DIMENSIONS command. `newtaxaLabel', 
+|	`ntaxLabel' and `ncharLabel' are simply "NEWTAXA", "NTAX" and "NCHAR" for this class, but may be different for 
+|	derived classes that use `newtaxa', `ntax' and `nchar' for other things (e.g., ntax is number of populations in 
+|	an ALLELES block).
+|
+|	Throws NxsException if the '=' is missing after `ntaxLabel' or `ncharLabel', if either count is not an integer 
+|	greater than 0, or if `newtaxa' is false and the number of taxa given exceeds the number of labels stored in the 
+|	taxa block. When an invalid count is rejected, `ntax' and `nchar' are left untouched. Tokens other than the three 
+|	labels and the semicolon are skipped. If `newtaxa' is true once the command has been read, taxa->Reset() is 
+|	called.
+*/
+void NxsCharactersBlock::HandleDimensions(
+  NxsToken &token,			/* the token used to read from `in' */
+  NxsString newtaxaLabel,	/* the label used in data file for `newtaxa' */
+  NxsString ntaxLabel,		/* the label used in data file for `ntax' */
+  NxsString ncharLabel)		/* the label used in data file for `nchar' */
+	{
+	for (;;)
+		{
+		token.GetNextToken();
+
+		if (token.Equals(newtaxaLabel))
+			{
+			newtaxa = true;
+			}
+		else if (token.Equals(ntaxLabel))
+			{
+			// This should be the equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' after ";
+				errormsg += ntaxLabel;
+				errormsg += " in DIMENSIONS command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the number of taxa
+			//
+			token.GetNextToken();
+
+			// Validate the value as a signed int before storing it. NOTE(review): `ntax' appears to be an 
+			// unsigned member (confirm in the class declaration); if so, assigning a negative atoi result 
+			// directly would wrap to a huge positive number and slip past a `<= 0' test.
+			//
+			int ntaxRead = atoi(token.GetToken().c_str());
+			if (ntaxRead <= 0)
+				{
+				errormsg = ntaxLabel;
+				errormsg += " must be a number greater than 0";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			ntax = ntaxRead;
+
+			if (newtaxa)
+				ntaxTotal = ntax;
+			else
+				{
+				ntaxTotal = taxa->GetNumTaxonLabels();
+				if (ntaxTotal < ntax)
+					{
+					errormsg = ntaxLabel;
+					errormsg += " in ";
+					errormsg += id;
+					errormsg += " block must be less than or equal to NTAX in TAXA block";
+					errormsg += "\nNote: one circumstance that can cause this error is ";
+					errormsg += "\nforgetting to specify ";
+					errormsg += ntaxLabel;
+					errormsg += " in DIMENSIONS command when ";
+					errormsg += "\na TAXA block has not been provided";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+				}
+			}
+		else if (token.Equals(ncharLabel))
+			{
+			// This should be the equals sign
+			//
+			token.GetNextToken();
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' after ";
+				errormsg += ncharLabel;
+				errormsg += " in DIMENSIONS command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the number of characters
+			//
+			token.GetNextToken();
+
+			// Same signed validation as for the number of taxa above, for the same reason
+			//
+			int ncharRead = atoi(token.GetToken().c_str());
+			if (ncharRead <= 0)
+				{
+				errormsg = ncharLabel;
+				errormsg += " must be a number greater than 0";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			nchar = ncharRead;
+
+			ncharTotal = nchar;
+			}
+		else if (token.Equals(";"))
+			{
+			break;
+			}
+		}
+
+	if (newtaxa)
+		taxa->Reset();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when ELIMINATE command needs to be parsed from within the CHARACTERS block. Deals with everything after the 
+|	token ELIMINATE up to and including the semicolon that terminates the ELIMINATE command. The character numbers 
+|	or ranges of character numbers specified are read by an NxsSetReader into the NxsUnsignedSet `eliminated', which 
+|	remains empty until an ELIMINATE command is processed. Note that like all sets the character ranges are adjusted 
+|	so that their offset is 0. For example, given "eliminate 4-7;" in the data file, the eliminate array would contain 
+|	the values 3, 4, 5 and 6 (not 4, 5, 6 and 7). Afterwards `nchar' is set to `ncharTotal' minus the size of the set 
+|	and `charPos' is rebuilt. The ELIMINATE command must come before character labels and/or character state labels 
+|	have been specified, and may be given only once: an NxsException is thrown if characters were eliminated while 
+|	`charLabels' or `charStates' is non-empty, or if `charPos' has already been built.
+*/
+void NxsCharactersBlock::HandleEliminate(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	// Let a temporary NxsSetReader parse the set description directly into the eliminated set
+	//
+	NxsSetReader(token, ncharTotal, eliminated, *this, NxsSetReader::charset).Run();
+
+	nchar = ncharTotal - eliminated.size();
+
+	const bool someEliminated = (nchar != ncharTotal);
+	if (someEliminated && (!charLabels.empty() || !charStates.empty()))
+		{
+		errormsg = "The ELIMINATE command must appear before character\n(or character state) labels are specified";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	const bool charPosAlreadyBuilt = (charPos != NULL);
+	if (charPosAlreadyBuilt)
+		{
+		errormsg = "Only one ELIMINATE command is allowed, and it must appear before the MATRIX command";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	BuildCharPosArray(true);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when the END or ENDBLOCK command needs to be parsed from within the CHARACTERS block. Does two things: 
+|~
+|	o checks to make sure the next token in the data file is a semicolon (throws NxsException otherwise)
+|	o if character state labels were supplied but no character labels were, makes up a character label for every 
+|	  character that has not been eliminated; each made-up label is `charToken', a space, and the 1-offset original 
+|	  character number
+|~
+*/
+void NxsCharactersBlock::HandleEndblock(
+  NxsToken &token,		/* the token used to read from `in' */
+  NxsString charToken)	/* the prefix used for made-up character labels */
+	{
+	// Get the semicolon following END or ENDBLOCK token
+	//
+	token.GetNextToken();
+
+	if (!token.Equals(";"))
+		{
+		errormsg = "Expecting ';' to terminate the END or ENDBLOCK command, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	if (charLabels.empty() && !charStates.empty())
+		{
+		// Make up labels for characters since user has provided labels
+		// for character states; that way, we know that charLabels
+		// and charStates are either both empty or both full
+		//
+		for (unsigned k = 0; k < ncharTotal; k++)
+			{
+			// charLabels holds entries for non-eliminated characters only (HandleCharlabels and 
+			// HandleCharstatelabels both skip eliminated ones), so skip them here as well; otherwise 
+			// every label after an eliminated character would end up attached to the wrong character
+			//
+			if (IsEliminated(k))
+				continue;
+
+			NxsString nm = charToken;
+			nm += " ";
+			nm += (k+1);
+			charLabels.push_back(nm.c_str());
+			}
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when FORMAT command needs to be parsed from within the CHARACTERS block. Deals with everything after the 
+|	token FORMAT up to and including the semicolon that terminates the FORMAT command.
+*/
+void NxsCharactersBlock::HandleFormat(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	bool standardDataTypeAssumed = false;
+	bool ignoreCaseAssumed = false;
+
+	for (;;)
+		{
+		token.GetNextToken();
+
+		if (token.Equals("DATATYPE"))
+			{
+			// This should be an equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' after keyword DATATYPE but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be one of the following: STANDARD, DNA, RNA, NUCLEOTIDE, PROTEIN, or CONTINUOUS
+			//
+			token.GetNextToken();
+
+			if (token.Equals("STANDARD"))
+				datatype = standard;
+			else if (token.Equals("DNA"))
+				datatype = dna;
+			else if (token.Equals("RNA"))
+				datatype = rna;
+			else if (token.Equals("NUCLEOTIDE"))
+				datatype = nucleotide;
+			else if (token.Equals("PROTEIN"))
+				datatype = protein;
+			else if (token.Equals("CONTINUOUS"))
+				datatype = continuous;
+			else
+				{
+				errormsg = token.GetToken();
+				errormsg += " is not a valid DATATYPE within a ";
+				errormsg += id;
+				errormsg += " block";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// BQM commented out
+//			if (standardDataTypeAssumed && datatype != standard)
+//				{
+//				errormsg = "DATATYPE must be specified first in FORMAT command";
+//				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+//				}
+
+			ResetSymbols();
+
+			if (datatype == continuous)
+				tokens = true;
+			}
+
+		else if (token.Equals("RESPECTCASE"))
+			{
+			if (ignoreCaseAssumed)
+				{
+				errormsg = "RESPECTCASE must be specified before MISSING, GAP, SYMBOLS, and MATCHCHAR in FORMAT command";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			standardDataTypeAssumed = true;
+			respectingCase = true;
+			}
+
+		else if (token.Equals("MISSING"))
+			{
+			// This should be an equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' after keyword MISSING but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the missing data symbol (single character)
+			//
+			token.GetNextToken();
+
+			if (token.GetTokenLength() != 1)
+				{
+				errormsg = "MISSING symbol should be a single character, but ";
+				errormsg += token.GetToken();
+				errormsg += " was specified";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			else if (token.IsPunctuationToken() && !token.IsPlusMinusToken())
+				{
+				errormsg = "MISSING symbol specified cannot be a punctuation token (";
+				errormsg += token.GetToken();
+				errormsg += " was specified)";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			else if (token.IsWhitespaceToken())
+				{
+				errormsg = "MISSING symbol specified cannot be a whitespace character (";
+				errormsg += token.GetToken();
+				errormsg += " was specified)";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			missing = token.GetToken()[0];
+
+			ignoreCaseAssumed = true;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("GAP"))
+			{
+			// This should be an equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' after keyword GAP but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the gap symbol (single character)
+			//
+			token.GetNextToken();
+
+			if (token.GetTokenLength() != 1)
+				{
+				errormsg = "GAP symbol should be a single character, but ";
+				errormsg += token.GetToken();
+				errormsg += " was specified";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			else if (token.IsPunctuationToken() && !token.IsPlusMinusToken())
+				{
+				errormsg = "GAP symbol specified cannot be a punctuation token (";
+				errormsg += token.GetToken();
+				errormsg += " was specified)";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			else if (token.IsWhitespaceToken())
+				{
+				errormsg = "GAP symbol specified cannot be a whitespace character (";
+				errormsg += token.GetToken();
+				errormsg += " was specified)";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			gap = token.GetToken()[0];
+
+			ignoreCaseAssumed = true;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("SYMBOLS"))
+			{
+			if (datatype == NxsCharactersBlock::continuous)
+				{
+				errormsg = "SYMBOLS subcommand not allowed for DATATYPE=CONTINUOUS";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			int numDefStates;
+			int maxNewStates;
+			switch(datatype)
+				{
+				case NxsCharactersBlock::dna:
+				case NxsCharactersBlock::rna:
+				case NxsCharactersBlock::nucleotide:
+					numDefStates = 4;
+					maxNewStates = NCL_MAX_STATES-4;
+					break;
+
+				case NxsCharactersBlock::protein:
+					numDefStates = 21;
+					maxNewStates = NCL_MAX_STATES-21;
+					break;
+
+				default:
+					numDefStates = 0; // replace symbols list for standard datatype
+					symbols[0] = '\0';
+					maxNewStates = NCL_MAX_STATES;
+				}
+
+			// this should be an equals sign
+			//
+			token.GetNextToken();
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' after keyword SYMBOLS but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the symbols list
+			//
+			token.SetLabileFlagBit(NxsToken::doubleQuotedToken);
+			token.GetNextToken();
+
+			token.StripWhitespace();
+			unsigned numNewSymbols = token.GetTokenLength();
+
+			if ((int)numNewSymbols > maxNewStates)
+				{
+				errormsg = "SYMBOLS defines ";
+				errormsg += numNewSymbols;
+				errormsg += " new states but only ";
+				errormsg += maxNewStates;
+				errormsg += " new states allowed for this DATATYPE";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			NxsString t = token.GetToken();
+			unsigned tlen = t.size();
+
+			// Check to make sure user has not used any symbols already in the
+			// default symbols list for this data type
+			//
+			/* BQM: erase used symbols */
+			NxsString told = t;
+			t="";
+			for (unsigned i = 0; i < tlen; i++)
+				{
+				if (!IsInSymbols(told[i]) && told[i] > 32)
+					{
+						t += told[i];
+					}
+				}
+/*			for (int i = 0; i < tlen; i++)
+				{
+				if (IsInSymbols(t[i]))
+					{
+					errormsg = "The character ";
+					errormsg += t[i];
+					errormsg += " defined in SYMBOLS has already been predefined for this DATATYPE";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+				}*/
+
+			// If we've made it this far, go ahead and add the user-defined
+			// symbols to the end of the list of predefined symbols
+			//
+			strcpy(symbols+numDefStates, t.c_str());
+
+			ignoreCaseAssumed = true;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("EQUATE"))
+			{
+			// This should be an equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' after keyword EQUATE but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be a double-quote character
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("\""))
+				{
+				errormsg = "Expecting '\"' after keyword EQUATE but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// Loop until second double-quote character is encountered
+			//
+			for (;;)
+				{
+				token.GetNextToken();
+				if (token.Equals("\""))
+					break;
+
+				// If token is not a double-quote character, then it must be the equate symbol (i.e., the 
+				// character to be replaced in the data matrix)
+				//
+				if (token.GetTokenLength() != 1)
+					{
+					errormsg = "Expecting single-character EQUATE symbol but found ";
+					errormsg += token.GetToken();
+					errormsg += " instead";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+
+				// Check for bad choice of equate symbol
+				//
+				NxsString t = token.GetToken();
+				char ch = t[0];
+				bool badEquateSymbol = false;
+
+				// The character '^' cannot be an equate symbol
+				//
+				if (ch == '^')
+					badEquateSymbol = true;
+
+				// Equate symbols cannot be punctuation (except for + and -)
+				//
+				if (token.IsPunctuationToken() && !token.IsPlusMinusToken())
+					badEquateSymbol = true;
+
+				// Equate symbols cannot be same as matchchar, missing, or gap
+				//
+				if (ch == missing || ch == matchchar || ch == gap)
+					badEquateSymbol = true;
+
+				// Equate symbols cannot be one of the state symbols currently defined
+				//
+				if (IsInSymbols(ch))
+					badEquateSymbol = true;
+
+				if (badEquateSymbol)
+					{
+					errormsg = "EQUATE symbol specified (";
+					errormsg += token.GetToken();
+					errormsg += ") is not valid; must not be same as missing, \nmatchchar, gap, state symbols, or any of the following: ()[]{}/\\,;:=*'\"`<>^";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+
+				NxsString k = token.GetToken();
+
+				// This should be an equals sign
+				//
+				token.GetNextToken();
+
+				if (!token.Equals("="))
+					{
+					errormsg = "Expecting '=' in EQUATE definition but found ";
+					errormsg += token.GetToken();
+					errormsg += " instead";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+
+				// This should be the token to be substituted in for the equate symbol
+				//
+				token.SetLabileFlagBit(NxsToken::parentheticalToken);
+				token.SetLabileFlagBit(NxsToken::curlyBracketedToken);
+				token.GetNextToken();
+				NxsString v = token.GetToken();
+
+				// Add the new equate association to the equates list
+				//
+				equates[k] = v;
+				}
+
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("MATCHCHAR"))
+			{
+			// This should be an equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' after keyword MATCHCHAR but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the matchchar symbol (single character)
+			//
+			token.GetNextToken();
+
+			if (token.GetTokenLength() != 1)
+				{
+				errormsg = "MATCHCHAR symbol should be a single character, but ";
+				errormsg += token.GetToken();
+				errormsg += " was specified";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			else if (token.IsPunctuationToken() && !token.IsPlusMinusToken())
+				{
+				errormsg = "MATCHCHAR symbol specified cannot be a punctuation token (";
+				errormsg += token.GetToken();
+				errormsg += " was specified) ";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			else if (token.IsWhitespaceToken())
+				{
+				errormsg = "MATCHCHAR symbol specified cannot be a whitespace character (";
+				errormsg += token.GetToken();
+				errormsg += " was specified)";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			matchchar = token.GetToken()[0];
+
+			ignoreCaseAssumed = true;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("LABELS"))
+			{
+			labels = true;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("NOLABELS"))
+			{
+			labels = false;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("TRANSPOSE"))
+			{
+			transposing = true;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("INTERLEAVE"))
+			{
+			interleaving = true;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("ITEMS"))
+			{
+			// This should be an equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+				{
+				errormsg += "Expecting '=' after keyword ITEMS but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be STATES (no other item is supported at this time)
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("STATES"))
+				{
+				errormsg = "Sorry, only ITEMS=STATES supported at this time";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("STATESFORMAT"))
+			{
+			// This should be an equals sign
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' after keyword STATESFORMAT but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be STATESPRESENT (no other statesformat is supported at this time)
+			//
+			token.GetNextToken();
+
+			if (!token.Equals("STATESPRESENT"))
+				{
+				errormsg = "Sorry, only STATESFORMAT=STATESPRESENT supported at this time";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("TOKENS"))
+			{
+			tokens = true;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals("NOTOKENS"))
+			{
+			tokens = false;
+			standardDataTypeAssumed = true;
+			}
+
+		else if (token.Equals(";"))
+			{
+			break;
+			}
+		}
+
+	// Perform some last checks before leaving the FORMAT command
+	//
+	if (!tokens && datatype == continuous)
+		{
+		errormsg = "TOKENS must be defined for DATATYPE=CONTINUOUS";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	if (tokens && (datatype == dna || datatype == rna || datatype == nucleotide))
+		{
+		errormsg = "TOKENS not allowed for the DATATYPEs DNA, RNA, or NUCLEOTIDE";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called from HandleStdMatrix or HandleTransposedMatrix function to read in the next state for taxon `i' and
+|	character `j' and record it in `matrix'. Always returns true except in the special case of an interleaved matrix,
+|	in which case it returns false if a newline character is encountered before the next token. If `j' equals
+|	UINT_MAX (the value the callers pass for a character that has been eliminated) the state is read from the file
+|	but nothing is stored. Throws NxsException if the end of the file is reached, if a state is not among the valid
+|	symbols or state labels, or if a range of states (specified with '~') is malformed.
+*/
+bool NxsCharactersBlock::HandleNextState(
+  NxsToken &token,	/* the token used to read from `in' */
+  unsigned i,		/* the taxon index, in range [0..`ntax') */
+  unsigned j)		/* the character index, in range [0..`nchar'), or UINT_MAX if the character was eliminated */
+	{
+	// This should be the state for taxon i and character j. When TOKENS is not in effect, each state is a single
+	// character, or a whole parenthesized/curly-bracketed set returned as one token
+	//
+	if (!tokens)
+		{
+		token.SetLabileFlagBit(NxsToken::parentheticalToken);
+		token.SetLabileFlagBit(NxsToken::curlyBracketedToken);
+		token.SetLabileFlagBit(NxsToken::singleCharacterToken);
+		}
+
+	if (interleaving)
+		token.SetLabileFlagBit(NxsToken::newlineIsToken);
+
+	token.GetNextToken();
+
+	if (interleaving && token.AtEOL())
+		return false;
+
+	// Make sure we didn't run out of file
+	//
+	if (token.AtEOF())
+		{
+		errormsg = "Unexpected end of file encountered";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	// If we didn't run out of file, there is no reason why we should have a zero-length token on our hands
+	//
+	assert(token.GetTokenLength() > 0);
+
+	// We've read in the state now, so if this character has been eliminated, we don't want to go any further with it.
+	// Note that j is unsigned, so eliminated characters are flagged with UINT_MAX (a test for j < 0 could never
+	// succeed and would let the code below write outside the bounds of matrix)
+	//
+	if (j == UINT_MAX)
+		return true;
+
+	// See if any equate macros apply
+	//
+	NxsString skey = NxsString(token.GetToken(true)); // equates should always respect case
+
+	NxsStringMap::iterator p = equates.find(skey);
+	if (p != equates.end())
+		{
+		NxsString sval = (*p).second;
+		token.ReplaceToken(sval.c_str());
+		}
+
+	// Handle case of single-character state symbol
+	//
+	if (!tokens && token.GetTokenLength() == 1)
+		{
+		char ch = token.GetToken()[0];
+
+		// Check for missing data symbol
+		//
+		if (ch == missing)
+			{
+			matrix->SetMissing(i, j);
+			}
+
+		// Check for matchchar symbol
+		//
+		else if (matchchar != '\0' && ch == matchchar)
+			{
+			matrix->CopyStatesFromFirstTaxon(i, j);
+			}
+
+		// Check for gap symbol
+		//
+		else if (gap != '\0' && ch == gap)
+			{
+			matrix->SetGap(i, j);
+			}
+
+		// Look up the position of this state in the symbols array
+		//
+		else
+			{
+			int p = PositionInSymbols(ch);
+			if (p < 0)
+				{
+				errormsg = "State specified (";
+				errormsg += token.GetToken();
+				errormsg += ") for taxon ";
+				errormsg += (i+1);
+				errormsg += ", character ";
+				errormsg += (j+1);
+				errormsg += ", not found in list of valid symbols";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			matrix->AddState(i, j, p);
+			matrix->SetPolymorphic(i, j, 0);
+			}
+		}	// if (!tokens && token.GetTokenLength() == 1)
+
+	// Handle case of state sets when tokens is not in effect
+	//
+	else if (!tokens && token.GetTokenLength() > 1)
+		{
+		// Token should be in one of the following forms: {acg} {a~g} {a c g} (acg) (a~g) (a c g)
+		//
+		NxsString t = token.GetToken();
+		unsigned tlen = t.size();
+		unsigned poly = (t[0] == '(');
+		assert(poly || t[0] == '{');
+		assert((poly && t[tlen-1] == ')') || (!poly && t[tlen-1] == '}'));
+
+		// A tilde must have a state on either side of it, so it can be neither the first nor the last
+		// non-blank character inside the brackets
+		//
+		unsigned first_nonblank = 1;
+		while (t[first_nonblank] == ' ' || t[first_nonblank] == '\t')
+			first_nonblank++;
+
+		unsigned last_nonblank = tlen - 2;
+		while (t[last_nonblank] == ' ' || t[last_nonblank] == '\t')
+			last_nonblank--;
+
+		if (t[first_nonblank] == '~' || t[last_nonblank] == '~')
+			{
+			errormsg = token.GetToken();
+			errormsg += " does not represent a valid range of states";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		unsigned k = 1;
+		char *pFirst = symbols;	// points to the most recently added state within symbols
+		bool tildeFound = false;
+		for (;;)
+			{
+			if (t[k] == ')' || t[k] == '}')
+				break;
+
+			if (t[k] == ' ' || t[k] == '\t')
+				{
+				k++;
+				continue;
+				}
+
+			// t[k] should be either '~' or one of the state symbols
+			//
+			if (t[k] == '~')
+				{
+				tildeFound = true;
+				}
+			else
+				{
+				int p = PositionInSymbols(t[k]);
+				if (p < 0)
+					{
+					errormsg = "State specified (";
+					errormsg += t[k];
+					errormsg += ") for taxon ";
+					errormsg += (i+1);
+					errormsg += ", character ";
+					errormsg += (j+1);
+					errormsg += ", not found in list of valid symbols";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+
+				if (tildeFound)
+					{
+					// t[k] ends a range whose first state (already added) is *pFirst. Add every state after
+					// the first one up to AND INCLUDING t[k], mirroring what is done for ranges when TOKENS
+					// is in effect. The end of the range must follow its beginning in the symbols list
+					//
+					int rangeBegin = (int)(pFirst - symbols);
+					if (p <= rangeBegin)
+						{
+						errormsg = token.GetToken();
+						errormsg += " does not represent a valid range of states";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+
+					for (int s = rangeBegin + 1; s <= p; s++)
+						matrix->AddState(i, j, s);
+
+					tildeFound = false;
+					}
+				else
+					{
+					matrix->AddState(i, j, p);
+					}
+
+				// Remember this state in case it turns out to be the beginning of a range
+				//
+				pFirst = (symbols + p);
+				} // if (t[k] == '~') ... else
+
+			k++;
+			} // for (;;) loop
+
+		matrix->SetPolymorphic(i, j, poly);
+		}	// if (!tokens && token.GetTokenLength() == 1) ... else if (!tokens && token.GetTokenLength() > 1)
+
+	// Handle case in which TOKENS was specified in the FORMAT command
+	//
+	else
+		{
+		// Token should be in one of the following forms: "("  "{"  "a"  "bb"
+		// (brackets arrive as separate punctuation tokens here because the parentheticalToken and
+		// curlyBracketedToken flags are only set when TOKENS is not in effect)
+		//
+		int polymorphism = token.Equals("(");
+		int uncertainty  = token.Equals("{");
+
+		if (!uncertainty && !polymorphism)
+			{
+			int k = HandleTokenState(token, j);
+			matrix->AddState(i, j, k);
+			}
+
+		else	// either uncertainty or polymorphism
+			{
+			bool tildeFound = false;
+			unsigned first = UINT_MAX;	// UINT_MAX means no state is available to begin a range
+			unsigned last;
+			for (;;)
+				{
+				// OPEN ISSUE: What about newlines if interleaving? I'm assuming
+				// that the newline must come between characters to count.
+
+				token.SetLabileFlagBit(NxsToken::tildeIsPunctuation);
+				token.GetNextToken();
+
+				if (polymorphism && token.Equals(")"))
+					{
+					if (tildeFound)
+						{
+						errormsg = "Range of states still being specified when ')' encountered";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+					break;
+					}
+
+				else if (uncertainty && token.Equals("}"))
+					{
+					if (tildeFound)
+						{
+						errormsg = "Range of states still being specified when '}' encountered";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+					break;
+					}
+
+				else if (token.Equals("~"))
+					{
+					if (first == UINT_MAX)
+						{
+						errormsg = "Tilde character ('~') cannot precede token indicating beginning of range";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+					tildeFound = true;
+					}
+
+				else if (tildeFound)
+					{
+					// Add all states from first+1 to last, then reset tildeFound to false
+					//
+					last = HandleTokenState(token, j);
+
+					if (last <= first)
+						{
+						errormsg = "Last state in specified range (";
+						errormsg += token.GetToken();
+						errormsg += ") must be greater than the first";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+
+					for (unsigned k = first+1; k <= last; k++)
+						matrix->AddState(i, j, k);
+
+					tildeFound = false;
+					first = UINT_MAX;
+					}
+
+				else
+					{
+					// Add current state, then set first to that state's value
+					// State's value is its position within the list of states
+					// for that character
+					//
+					first = HandleTokenState(token, j);
+					matrix->AddState(i, j, first);
+					}
+				}	// for (;;)
+
+			if (polymorphism)
+				matrix->SetPolymorphic(i, j, 1);
+			}	// if (!uncertainty && !polymorphism) ... else
+
+		}	// if (!tokens && token.GetTokenLength() == 1) ... else if (!tokens && token.GetTokenLength() > 1) ... else 
+
+	return true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Helper for HandleNextState used when TOKENS was specified. Interprets the current token as one of the state
+|	labels stored in `charStates' for character `j' and returns that label's 0-offset position in the list. The 
+|	matrix itself is NOT modified here; the caller is expected to store the returned value (call it k) with 
+|	matrix->AddState(i, j, k). Throws NxsException if character `j' has no state labels at all or if the token does
+|	not match any of them.
+*/
+unsigned NxsCharactersBlock::HandleTokenState(
+  NxsToken &token,	/* the token used to read from `in' */
+  unsigned j)		/* the character index, in range [0..`nchar') */
+	{
+	// Locate the list of state labels that was stored for character j
+	//
+	NxsStringVectorMap::const_iterator statesForChar = charStates.find(j);
+	if (statesForChar == charStates.end())
+		{
+		errormsg = "No states were defined for character ";
+		errormsg += (1 + GetOrigCharIndex(j));
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	const NxsStringVector &labelList = (*statesForChar).second;
+	NxsStringVector::const_iterator labelsBegin	= labelList.begin();
+	NxsStringVector::const_iterator labelsEnd	= labelList.end();
+
+	// Search the list for the token: a plain comparison if case matters, otherwise the NxsStringEqual predicate
+	//
+	NxsString t = token.GetToken(respectingCase);
+	NxsStringVector::const_iterator match = respectingCase
+		? find(labelsBegin, labelsEnd, t)
+		: find_if (labelsBegin, labelsEnd, bind2nd(NxsStringEqual(), t));
+
+	if (match == labelsEnd)
+		{
+		errormsg = "Character state ";
+		errormsg += t;
+		errormsg += " not defined for character ";
+		errormsg += (1 + GetOrigCharIndex(j));
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	// The state's internal representation is simply its offset within the label list: with the labels
+	// "small medium large", the token "small" yields 0, "medium" yields 1 and "large" yields 2
+	//
+	return (unsigned)(match - labelsBegin);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called from HandleMatrix function to read in a standard (i.e., non-transposed) matrix: one row per taxon, 
+|	optionally preceded by the taxon label. Interleaving, if applicable, is dealt with herein; each pass through the 
+|	outer loop reads one interleave page (a non-interleaved matrix is read as a single page). Fills in `taxonPos' 
+|	while reading the first page and stores each state through HandleNextState. Throws NxsException for duplicated, 
+|	unknown or misordered taxon labels and for interleave lines of unequal length.
+*/
+void NxsCharactersBlock::HandleStdMatrix(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	assert(charPos != NULL);
+	assert(taxonPos != NULL);
+
+	// firstChar and lastChar delimit the range of original character indices [firstChar, lastChar) read on the
+	// current page; nextFirst records where the following page should begin; page counts interleave pages
+	//
+	unsigned i = 0, j, currChar = 0;
+	unsigned firstChar = 0;
+	unsigned lastChar = ncharTotal;
+	unsigned nextFirst = 0;
+	int page = 0;
+
+	for (;;)
+		{
+		//************************************************
+		//******** Beginning of loop through taxa ********
+		//************************************************
+
+		for (i = 0; i < ntax; i++)
+			{
+			if (labels)
+				{
+				// This should be the taxon label
+				//
+				token.SetLabileFlagBit(token.hyphenNotPunctuation);
+				token.GetNextToken();
+
+				if (page == 0 && newtaxa)
+					{
+					// This section:
+					// - labels provided
+					// - on first (or only) interleave page
+					// - no previous TAXA block
+
+					// Check for duplicate taxon names
+					//
+					if (taxa->IsAlreadyDefined(token.GetToken()))
+						{
+						errormsg = "Data for this taxon (";
+						errormsg += token.GetToken();
+						errormsg += ") has already been saved";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+
+					// Labels provided and not already stored in the taxa block with
+					// the TAXLABELS command; taxa->Reset() and taxa->SetNTax() were
+					// already called, however, when the NTAX subcommand was
+					// processed.
+					//
+					taxa->AddTaxonLabel(token.GetToken());
+
+					// Order of occurrence in TAXA block same as row in matrix
+					//
+					taxonPos[i] = i;
+
+					}	// if (page == 0 && newtaxa)
+
+				else
+					{
+					// This section:
+					// - labels provided
+					// - TAXA block provided or has been created already
+					// - may be on any (interleave) page
+
+					// Cannot assume taxon in same position in
+					// taxa block. Set up taxonPos array so that we can look up
+					// the correct row in matrix for any given taxon
+					//
+					unsigned positionInTaxaBlock;
+					try
+						{
+						positionInTaxaBlock = taxa->FindTaxon(token.GetToken());
+						}
+					catch(NxsTaxaBlock::NxsX_NoSuchTaxon)
+						{
+						// A semicolon where the first label of a page was expected means the MATRIX command
+						// ended before all characters had been read; anything else is an unknown label
+						//
+						if (token.Equals(";") && i == 0)
+							{
+							errormsg = "Unexpected ; (after only ";
+							errormsg += currChar;
+							errormsg += " characters were read)";
+							}
+						else
+							{
+							errormsg = "Could not find taxon named ";
+							errormsg += token.GetToken();
+							errormsg += " among stored taxon labels";
+							}
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+
+					if (page == 0)
+						{
+						// Make sure user has not duplicated data for a single taxon
+						// (HandleMatrix initializes every element of taxonPos to UINT_MAX)
+						//
+						if (taxonPos[positionInTaxaBlock] != UINT_MAX)
+							{
+							errormsg = "Data for this taxon (";
+							errormsg += token.GetToken();
+							errormsg += ") has already been saved";
+							throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+							}
+
+						// Make sure user has kept same relative ordering of taxa in both the TAXA
+						// block and the CHARACTERS block
+						//
+						if (positionInTaxaBlock != i)
+							{
+							errormsg = "Relative order of taxa must be the same in both the TAXA and CHARACTERS blocks";
+							throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+							}
+
+						taxonPos[i] = positionInTaxaBlock;
+						}	// if (page == 0)
+
+					else
+						{
+						// Make sure user has kept the ordering of taxa the same from one interleave page to the next
+						//
+						if (taxonPos[positionInTaxaBlock] != i)
+							{
+							errormsg = "Ordering of taxa must be identical to that in first interleave page";
+							throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+							}
+						}	// if (page == 0) ... else
+					}	// if (page == 0 && newtaxa) ... else
+				}	// if (labels)
+
+			else
+				{
+				// No labels provided, assume taxon position same as in taxa block
+				//
+				if (page == 0)
+					taxonPos[i] = i;
+				}	// if (labels) ... else
+
+			//******************************************************
+			//******** Beginning of loop through characters ********
+			//******************************************************
+
+			for (currChar = firstChar; currChar < lastChar; currChar++)
+				{
+				// It is possible that character currChar has been eliminated, in which case we need to
+				// go through the motions of reading in the data but we don't store it. The variable j
+				// will be our guide when it comes time to store data since j will be UINT_MAX for
+				// characters that were eliminated and will be set to the correct column for characters
+				// that have not been eliminated.
+				//
+				j = charPos[currChar];
+
+				// ok will be false only if a newline character is encountered before character j processed
+				//
+				bool ok = HandleNextState(token, i, j);
+				if (interleaving && !ok)
+					{
+					// lastChar < ncharTotal means an earlier line of this page already fixed where the page
+					// ends, so this line must end at the same place.
+					// NOTE(review): the test compares j (the value taken from charPos) with lastChar (an
+					// original character index); presumably currChar was intended -- confirm.
+					//
+					if (lastChar < ncharTotal && j != lastChar)
+						{
+						errormsg = "Each line within an interleave page must comprise the same number of characters";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+
+					// currChar should be firstChar in next go around
+					//
+					nextFirst = currChar;
+
+					// Set lastChar to currChar so that we can check to make sure the remaining lines
+					// in this interleave page end at the same place
+					//
+					lastChar = currChar;
+
+					// Since currChar is now equal to lastChar, the increment at the end of this iteration
+					// makes the loop condition fail and we are done with this innermost loop
+					}
+				} // for (currChar = firstChar; currChar < lastChar; currChar++)
+
+			} // for (i = 0; i < ntax; i++)
+
+		// Set up the character range for the next interleave page
+		//
+		firstChar = nextFirst;
+		lastChar = ncharTotal;
+
+		// If currChar equals ncharTotal, then we've just finished reading the last interleave page
+		// and thus should break from the outer loop. Note that if we are not interleaving, this will
+		// still work since lastChar is initialized to ncharTotal and never changed
+		//
+		if (currChar == ncharTotal)
+			break;
+
+		page++;
+		} // for (;;)
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called from HandleMatrix function to read in a transposed matrix: one row per character, optionally preceded by
+|	the character label, with one state per taxon on each row. Interleaving, if applicable, is dealt with herein;
+|	each pass through the outer loop reads one interleave page. Stores each state through HandleNextState. Throws
+|	NxsException for duplicated, unknown or misordered character labels and for interleave lines of unequal length.
+*/
+void NxsCharactersBlock::HandleTransposedMatrix(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	assert(charPos != NULL);
+	assert(taxonPos != NULL);
+
+	// firstTaxon and lastTaxon delimit the range of taxa [firstTaxon, lastTaxon) read on the current page;
+	// nextFirst records where the following page should begin; page counts interleave pages
+	//
+	unsigned i = 0, j, currChar;
+	unsigned firstTaxon = 0;
+	unsigned lastTaxon = ntaxTotal;
+	unsigned nextFirst = 0;
+	int page = 0;
+
+	for (;;)
+		{
+		//******************************************************
+		//******** Beginning of loop through characters ********
+		//******************************************************
+
+		for (currChar = 0; currChar < ncharTotal; currChar++)
+			{
+			// It is possible that character currChar has been eliminated, in
+			// which case we need to go through the motions of reading in the
+			// data but we don't store it.  The variable j will be our guide
+			// when it comes time to store data since j will be UINT_MAX for
+			// characters that were eliminated and will be set to the correct
+			// column for characters that have not been eliminated.
+			//
+			j = charPos[currChar];
+
+			if (labels)
+				{
+				// This should be the character label
+				//
+				token.GetNextToken();
+
+				if (page == 0 && newchar)
+					{
+					// Check for duplicate character names
+					//
+					NxsString s = token.GetToken();
+					NxsStringVector::const_iterator iter = find(charLabels.begin(), charLabels.end(), s);
+
+					bool charLabelFound = (iter != charLabels.end());
+					if (charLabelFound)
+						{
+						errormsg = "Data for this character (";
+						errormsg += token.GetToken();
+						errormsg += ") has already been saved";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+
+					// Labels provided, need to add them to charLabels list. We're not supposed to save
+					// anything for this character if it has been eliminated, but the labels must be
+					// saved for purposes of numbering. Otherwise a more complicated system would be needed
+					// wherein an association is set up between character number and character label. Since
+					// this is not done in the case of taxa that are effectively eliminated when they are
+					// included in the TAXA block but not in the CHARACTERS MATRIX command, I see no reason
+					// to not save the full character labels here even for those that have been eliminated.
+					// Also, for interleaved matrices, it is necessary to have the full labels saved somewhere
+					// so that it is possible to detect characters out of order or duplicated.
+					//
+					charLabels.push_back(token.GetToken());
+					}	// if (page == 0 && newchar)
+
+				else // either not first interleaved page or character labels were previously defined
+					{
+					NxsString s = token.GetToken();
+					NxsStringVector::const_iterator iter = find(charLabels.begin(), charLabels.end(), s);
+
+					if (iter == charLabels.end())
+						{
+						errormsg = "Could not find character named ";
+						errormsg += token.GetToken();
+						errormsg += " among stored character labels";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+
+					unsigned positionInCharLabelsList = (iter - charLabels.begin());
+
+					// Make sure user has not duplicated data for a single character or changed the order
+					// in which characters appear in different interleave pages
+					//
+					if (positionInCharLabelsList != currChar)
+						{
+						if (page == 0)
+							{
+							errormsg = "Data for this character (";
+							errormsg += token.GetToken();
+							errormsg += ") has already been saved";
+							throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+							}
+						else
+							{
+							errormsg = "Ordering of characters must be identical to that in first interleave page";
+							throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+							}
+						}
+					}	// if (page == 0 && newchar) ... else
+				} // if (labels)
+
+			//************************************************
+			//******** Beginning of loop through taxa ********
+			//************************************************
+
+			for (i = firstTaxon; i < lastTaxon; i++)
+				{
+				if (page == 0)
+					{
+					// We are forced to assume that the user did not leave out any
+					// taxa, since without taxon labels in the matrix we would
+					// have no way of detecting which were left out; thus,
+					// ntax == ntaxTotal in this case.  Order of occurrence in
+					// TAXA block is the same as the row in matrix.
+					//
+					taxonPos[i] = i;
+					}
+
+				// ok will be false only if a newline character is encountered before
+				// taxon i processed
+				//
+				bool ok = HandleNextState(token, i, j);
+				if (interleaving && !ok)
+					{
+					// lastTaxon < ntaxTotal means an earlier line of this page already fixed where the page
+					// ends, so this line must end at the same place
+					//
+					if (lastTaxon < ntaxTotal && i != lastTaxon)
+						{
+						errormsg = "Each line within an interleave page must comprise the same number of taxa";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+
+					// i should be firstTaxon in next go around
+					//
+					nextFirst = i;
+
+					// Set lastTaxon to i so that we can check to make sure the
+					// remaining lines in this interleave page end at the same place
+					lastTaxon = i;
+
+					// Since i is now equal to lastTaxon, the increment at the end of this iteration makes
+					// the loop condition fail and we are done with this innermost loop
+					}	// if (interleaving && !ok)
+				} // for (i = firstTaxon; i < lastTaxon; i++)
+
+			} // for (currChar = 0; currChar < ncharTotal; currChar++)
+
+		// Set up the taxon range for the next interleave page
+		//
+		firstTaxon = nextFirst;
+		lastTaxon = ntaxTotal;
+
+		// If i equals ntaxTotal, then we've just finished reading the last
+		// interleave page and thus should break from the outer loop
+		// Note that if we are not interleaving, this will still work since
+		// lastTaxon is initialized to ntaxTotal and never changed
+		//
+		if (i == ntaxTotal)
+			break;
+
+		page++;
+		} // for (;;)
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Parses the MATRIX command of the CHARACTERS block, i.e. everything following the token MATRIX up to and including
+|	the semicolon that ends the command. Allocates `matrix', `activeTaxon', `activeChar' and `taxonPos', delegates the
+|	reading of the data to HandleTransposedMatrix or HandleStdMatrix, and finally registers this block with the 
+|	ASSUMPTIONS block. Throws NxsException if no taxa are known, if the datatype is continuous, or if the command is
+|	not terminated by a semicolon.
+*/
+void NxsCharactersBlock::HandleMatrix(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	// The matrix cannot be dimensioned unless the number of taxa is known
+	//
+	if (ntax == 0)
+		{
+		errormsg = "Must precede ";
+		errormsg += id;
+		errormsg += " block with a TAXA block or specify NEWTAXA and NTAX in the DIMENSIONS command";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	if (ntaxTotal == 0)
+		ntaxTotal = taxa->GetNumTaxonLabels();
+
+	// Zero is acceptable for nchar: someone might have eliminated every
+	// character, and that is allowed (even though it is absurd)
+	//
+	assert(nchar >= 0);
+
+	if (datatype == NxsCharactersBlock::continuous)
+		{
+		errormsg = "Sorry, continuous character matrices have not yet been implemented";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	// Throw away any previously stored matrix (deleting a NULL pointer does nothing) and start afresh
+	//
+	delete matrix;
+	matrix = new NxsDiscreteMatrix(ntax, nchar);
+
+	// Create the activeTaxon and activeChar arrays; every taxon and every
+	// character starts out active.
+	//
+	activeTaxon = new bool[ntax];
+	for (unsigned taxInd = 0; taxInd < ntax; taxInd++)
+		activeTaxon[taxInd] = true;
+
+	activeChar = new bool[nchar];
+	for (unsigned charInd = 0; charInd < nchar; charInd++)
+		activeChar[charInd] = true;
+
+	// ncharTotal normally equals the nchar given in the DIMENSIONS command of the
+	// CHARACTERS block; only when an ELIMINATE command was processed is
+	// nchar < ncharTotal.  Any ELIMINATE command has been read by this point and
+	// has already created charPos, so charPos needs to be built here only if
+	// no ELIMINATE command was given.
+	//
+	if (charPos == NULL)
+		BuildCharPosArray();
+
+	// ntaxTotal is the number of taxa in the TAXA block, whereas ntax is the
+	// number given in the DIMENSIONS command of the CHARACTERS block.  The two
+	// differ (ntax < ntaxTotal) only if some taxa were left out of the MATRIX
+	// command.  Every slot of taxonPos starts out as UINT_MAX ("not yet seen").
+	//
+	delete [] taxonPos;
+	taxonPos = new unsigned[ntaxTotal];
+	for (unsigned slot = 0; slot < ntaxTotal; slot++)
+		taxonPos[slot] = UINT_MAX;
+
+	// Read the data themselves
+	//
+	if (transposing)
+		HandleTransposedMatrix(token);
+	else
+		HandleStdMatrix(token);
+
+	// Having come this far without an exception, let the ASSUMPTIONS block know
+	// that this block is now the current character-containing block (the one
+	// to consult when characters are excluded or included or when taxa are
+	// deleted or restored)
+	//
+	assumptionsBlock->SetCallback(this);
+
+	// All that may remain is the semicolon that terminates the MATRIX command
+	//
+	token.GetNextToken();
+	if (token.Equals(";"))
+		return;
+
+	errormsg = "Expecting ';' at the end of the MATRIX command; found ";
+	errormsg += token.GetToken();
+	errormsg += " instead";
+	throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Parses a STATELABELS command: everything following the token STATELABELS up to and including the terminating 
+|	semicolon. The command is a comma-separated sequence of entries, each consisting of a character number followed
+|	by that character's state labels. Character numbers are 1-offset in the data file but are shifted back by one 
+|	here, so that the keys of `charStates' are 0-offset. Labels given for eliminated characters are read but not 
+|	stored. Throws NxsException if a character number is out of range or cannot be interpreted as an integer.
+*/
+void NxsCharactersBlock::HandleStatelabels(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	charStates.clear();
+
+	if (charPos == NULL)
+		BuildCharPosArray();
+
+	// Each pass through this loop handles one "number label label ..." entry; commandDone is raised when the
+	// terminating semicolon turns up in the middle of an entry's label list
+	//
+	bool commandDone = false;
+	while (!commandDone)
+		{
+		token.GetNextToken();
+
+		if (token.Equals(";"))
+			break;
+
+		// Token should be the (1-offset) character number. atoi yields 0 for text that is not a number, and a
+		// negative result wraps to a huge unsigned value, so both cases are rejected by the range check
+		//
+		unsigned n = atoi(token.GetToken().c_str());
+
+		if (n == 0 || n > ncharTotal)
+			{
+			errormsg = "Invalid character number (";
+			errormsg += token.GetToken();
+			errormsg += ") found in STATELABELS command (either out of range or not interpretable as an integer)";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		const unsigned origIndex = n - 1;
+
+		// Read this character's state labels until a comma (next entry) or a semicolon (end of command)
+		//
+		for (;;)
+			{
+			token.GetNextToken();
+
+			if (token.Equals(";"))
+				{
+				commandDone = true;
+				break;
+				}
+
+			if (token.Equals(","))
+				break;
+
+			// Token should be a character state label; nothing is stored for eliminated characters
+			//
+			if (IsEliminated(origIndex))
+				continue;
+
+			unsigned column = GetCharPos(origIndex);
+			charStates[column].push_back(token.GetToken());
+			} // for (;;)
+		} // while (!commandDone)
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when TAXLABELS command needs to be parsed from within the CHARACTERS block. Deals with everything after the 
+|	token TAXLABELS up to and including the semicolon that terminates the TAXLABELS command. Each token read is added 
+|	to the taxa block as a new taxon label. Throws NxsException if NEWTAXA was not specified, if more labels are 
+|	supplied than `ntaxTotal' allows, or if the end of the file is reached before the terminating semicolon. Sets 
+|	`newtaxa' to false when done.
+*/
+void NxsCharactersBlock::HandleTaxlabels(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	if (!newtaxa)
+		{
+		errormsg = "NEWTAXA must have been specified in DIMENSIONS command to use the TAXLABELS command in a ";
+		errormsg += id;
+		errormsg += " block";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	for (;;)
+		{
+		token.GetNextToken();
+
+		// Token should either be ';' or the name of a taxon
+		//
+		if (token.Equals(";"))
+			break;
+
+		// Running out of file before the ';' would otherwise be reported as too many taxon labels
+		//
+		if (token.AtEOF())
+			{
+			errormsg = "Unexpected end of file encountered";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		// Check to make sure user is not trying to read in more taxon labels than there are taxa. The 
+		// check is made before the label is added, so the list is already full when it holds `ntaxTotal' 
+		// labels (hence >= rather than >)
+		//
+		if (taxa->GetNumTaxonLabels() >= ntaxTotal)
+			{
+			errormsg = "Number of taxon labels exceeds NTAX specified in DIMENSIONS command";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		taxa->AddTaxonLabel(token.GetToken());
+		}
+
+	// OPEN ISSUE: Some may object to setting newtaxa to false here, because then the fact that new taxa were 
+	// specified in this CHARACTERS block rather than in a preceding TAXA block is lost. This will only be 
+	// important if we wish to recreate the original data file, which I don't anticipate anyone doing with
+	// this code (too difficult to remember all comments, the order of blocks in the file, etc.)
+
+	newtaxa = false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns position of `ch' in `symbols' array. The value of `respectingCase' is used to determine whether the search 
+|	should be case sensitive or not (if it is false, both `ch' and each symbol are converted to upper case before 
+|	being compared). Assumes `symbols' is non-NULL. Returns UINT_MAX if `ch' is not found in `symbols'.
+*/
+unsigned NxsCharactersBlock::PositionInSymbols(
+  char ch)	/* the symbol character to search for */
+	{
+	assert(symbols != NULL);
+
+	// toupper is only defined for arguments representable as unsigned char (or EOF), so plain char values are 
+	// converted to unsigned char first. The character sought does not change, so fold its case just once.
+	//
+	const char char_in_question = (respectingCase ? ch : (char)toupper((unsigned char)ch));
+
+	for (unsigned i = 0; symbols[i] != '\0'; i++)
+		{
+		const char char_in_symbols = (respectingCase ? symbols[i] : (char)toupper((unsigned char)symbols[i]));
+		if (char_in_symbols == char_in_question)
+			return i;
+		}
+
+	return UINT_MAX;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads everything that follows the block name (which has already been read by the NxsReader object) up to and 
+|	including the END or ENDBLOCK statement, dispatching each recognized command to its handler. Commands that are 
+|	not recognized are reported through SkippingCommand and skipped up to their terminating semicolon. Characters are 
+|	read from the input stream `in'. Overrides the abstract virtual function in the base class.
+*/
+void NxsCharactersBlock::Read(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	isEmpty = false;
+	isUserSupplied = true;
+
+	// The block name must be followed immediately by a semicolon
+	//
+	token.GetNextToken(); 
+	if (!token.Equals(";"))
+		{
+		errormsg = "Expecting ';' after ";
+		errormsg += id;
+		errormsg += " block name, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	ntax = taxa->GetNumTaxonLabels();
+
+	bool blockEnded = false;
+	while (!blockEnded)
+		{
+		token.GetNextToken();
+
+		if (token.Equals("DIMENSIONS"))
+			HandleDimensions(token, "NEWTAXA", "NTAX", "NCHAR");
+		else if (token.Equals("FORMAT"))
+			HandleFormat(token);
+		else if (token.Equals("ELIMINATE"))
+			HandleEliminate(token);
+		else if (token.Equals("TAXLABELS"))
+			HandleTaxlabels(token);
+		else if (token.Equals("CHARSTATELABELS"))
+			HandleCharstatelabels(token);
+		else if (token.Equals("CHARLABELS"))
+			HandleCharlabels(token);
+		else if (token.Equals("STATELABELS"))
+			HandleStatelabels(token);
+		else if (token.Equals("MATRIX"))
+			HandleMatrix(token);
+		else if (token.Equals("END") || token.Equals("ENDBLOCK"))
+			{
+			// Either keyword closes the block
+			//
+			HandleEndblock(token, "Character");
+			blockEnded = true;
+			}
+		else
+			{
+			// Unknown command: notify, then discard tokens through the semicolon that ends it
+			//
+			SkippingCommand(token.GetToken());
+
+			for (;;)
+				{
+				token.GetNextToken();
+				if (token.AtEOF() || token.Equals(";"))
+					break;
+				}
+
+			if (token.AtEOF())
+				{
+				errormsg = "Unexpected end of file encountered";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			}
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function outputs a brief report of the contents of this CHARACTERS block to `out': the numbers of taxa and 
+|	characters, the FORMAT settings, character and state labels, equate macros, eliminated and excluded characters, 
+|	deleted taxa and finally the data matrix itself (via DebugShowMatrix). A character for which `charLabels' holds 
+|	no entry is reported as having no label. Overrides the abstract virtual function in the base class.
+*/
+void NxsCharactersBlock::Report(
+  ostream &out)	/* the output stream to which to write the report */
+	{
+	// Summary line
+	//
+	out << endl;
+	out << id << " block contains ";
+	if (ntax == 0)
+		out << "no taxa";
+	else if (ntax == 1)
+		out << "one taxon";
+	else
+		out << ntax << " taxa";
+	out << " and ";
+	if (nchar == 0)
+		out << "no characters";
+	else if (nchar == 1)
+		out << "one character";
+	else
+		out << nchar << " characters";
+	out << endl;
+
+	if (formerly_datablock)
+		{
+		out << "Originally read in as a DATA block" << endl;
+		out << endl;
+		}
+
+	// FORMAT settings
+	//
+	const char *datatypeName = "standard";
+	switch(datatype)
+		{
+		case NxsCharactersBlock::dna:
+			datatypeName = "DNA";
+			break;
+
+		case NxsCharactersBlock::rna:
+			datatypeName = "RNA";
+			break;
+
+		case NxsCharactersBlock::nucleotide:
+			datatypeName = "nucleotide";
+			break;
+
+		case NxsCharactersBlock::protein:
+			datatypeName = "protein";
+			break;
+
+		case NxsCharactersBlock::continuous:
+			datatypeName = "continuous";
+			break;
+
+		default:
+			break;
+		}
+	out << "  Data type is \"" << datatypeName << "\"" << endl;
+
+	if (respectingCase)
+		out << "  Respecting case" << endl;
+	else
+		out << "  Ignoring case" << endl;
+
+	if (tokens)
+		out << "  Multicharacter tokens allowed in data matrix" << endl;
+	else
+		out << "  Data matrix entries are expected to be single symbols" << endl;
+
+	if (!labels)
+		out << "  No labels are expected on left side of matrix" << endl;
+	else if (transposing)
+		out << "  Character labels are expected on left side of matrix" << endl;
+	else
+		out << "  Taxon labels are expected on left side of matrix" << endl;
+
+	// Character labels and, where defined, the state labels of each character
+	//
+	if (charLabels.size() > 0)
+		{
+		out << "  Character and character state labels:" << endl;
+		for (unsigned k = 0; k < nchar; k++) 
+			{
+			out << '\t' << (1 + GetOrigCharIndex(k)) << '\t';
+
+			// Fewer than `nchar' labels may be stored, so never index past the end of `charLabels'
+			//
+			if (k >= charLabels.size() || charLabels[k].length() == 0)
+				out << "(no label provided for this character)" << endl;
+			else
+				out << charLabels[k] << endl;
+
+			// Output state labels if any are defined for this character
+			//
+			NxsStringVectorMap::const_iterator cib = charStates.find(k);
+			if (cib != charStates.end())
+				{
+				const unsigned ns = (unsigned)(*cib).second.size();
+				for (unsigned m = 0; m < ns; m++)
+					out << "\t\t" << (*cib).second[m] << endl;
+				}
+			}
+		}
+
+	if (transposing && interleaving)
+		out << "  Matrix transposed and interleaved" << endl;
+	else if (transposing && !interleaving)
+		out << "  Matrix transposed but not interleaved" << endl;
+	else if (!transposing && interleaving)
+		out << "  Matrix interleaved but not transposed" << endl;
+	else
+		out << "  Matrix neither transposed nor interleaved" << endl;
+
+	// Special symbols
+	//
+	out << "  Missing data symbol is '" << missing << '\'' << endl;
+
+	if (matchchar != '\0')
+		out << "  Match character is '" << matchchar << '\'' << endl;
+	else
+		out << "  No match character specified" << endl;
+
+	if (gap != '\0')
+		out << "  Gap character specified is '" << gap << '\'' << endl;
+	else
+		out << "  No gap character specified" << endl;
+
+	out << "  Valid symbols are: " << symbols << endl;
+
+	if (!equates.empty())
+		{
+		out << "  Equate macros in effect:" << endl;
+		typedef NxsStringMap::const_iterator CI;
+		for (CI i = equates.begin(); i != equates.end(); ++i)
+			out << "    " << (*i).first << " = " << (*i).second << endl;
+		}
+	else
+		out << "  No equate macros have been defined" << endl;
+
+	// Eliminated characters are listed by their original (1-offset) character numbers
+	//
+	if (ncharTotal == nchar)
+		out << "  No characters were eliminated" << endl;
+	else
+		{
+		out << "  The following characters were eliminated:" << endl;
+		NxsUnsignedSet::const_iterator e;
+		for (e = eliminated.begin(); e != eliminated.end(); ++e)
+			out << "    " << ((*e)+1) << endl;
+		}
+
+	out << "  The following characters have been excluded:" << endl;
+	unsigned nx = 0;
+	for (unsigned c = 0; c < nchar; c++)
+		{
+		if (activeChar[c])
+			continue;
+		out << "    " << (c+1) << endl;
+		nx++;
+		}
+
+	if (nx == 0)
+		out << "    (no characters excluded)" << endl;
+
+	out << "  The following taxa have been deleted:" << endl;
+	nx = 0;
+	for (unsigned t = 0; t < ntax; t++)
+		{
+		if (activeTaxon[t])
+			continue;
+		out << "    " << (t+1) << endl;
+		nx++;
+		}
+
+	if (nx == 0)
+		out << "    (no taxa deleted)" << endl;
+
+	out << "  Data matrix:" << endl;
+	DebugShowMatrix(out, false, "    ");
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns NxsCharactersBlock object to the state it was in when first created: the counts are zeroed, the format 
+|	flags and special symbols are given their default values, `symbols' and `equates' are set up for the standard 
+|	datatype by ResetSymbols, and the label, state-label and eliminated-character containers are emptied.
+|
+|	NOTE(review): `matrix', `charPos', `taxonPos', `activeTaxon', `activeChar' and `symbols' are all assigned NULL 
+|	before the `!= NULL' tests further down, so as written none of the delete statements in this function can 
+|	execute, and ResetSymbols always allocates a fresh `symbols' buffer. If any of these pointers owns memory when 
+|	Reset is called, that memory appears to be leaked. Moving the deletes ahead of the NULL assignments is only safe 
+|	if the constructor sets these pointers to NULL before its first call to Reset - confirm against the constructor 
+|	and destructor before changing the order.
+*/
+void NxsCharactersBlock::Reset()
+	{ 
+	// Reset base class data members that could have changed
+	//
+	errormsg.clear();
+	isEnabled      = true;
+	isEmpty        = true;
+	isUserSupplied = false;
+
+	ntax				= 0;
+	ntaxTotal			= 0;
+	nchar				= 0;
+	ncharTotal			= 0;
+	newchar				= true;
+	newtaxa				= false;
+	interleaving		= false;
+	transposing			= false;
+	respectingCase		= false;
+	formerly_datablock	= false;
+	labels				= true;
+	tokens				= false;
+	datatype			= NxsCharactersBlock::standard;
+	missing				= '?';
+	gap					= '\0';
+	matchchar			= '\0';
+	matrix				= NULL;
+	charPos				= NULL;
+	taxonPos			= NULL;
+	activeTaxon			= NULL;
+	activeChar			= NULL;
+	symbols				= NULL;
+
+	ResetSymbols();
+
+	charLabels.clear();
+	charStates.clear();
+	equates.clear();
+	eliminated.clear();
+
+	if (matrix != NULL)
+		{
+		delete matrix;
+		matrix = NULL;
+		}
+
+	if (charPos != NULL)
+		{
+		delete [] charPos;
+		charPos = NULL;
+		}
+
+	if (taxonPos != NULL)
+		{
+		delete [] taxonPos;
+		taxonPos = NULL;
+		}
+
+	if (activeTaxon != NULL)
+		{
+		delete [] activeTaxon;
+		activeTaxon = NULL;
+		}
+
+	if (activeChar != NULL)
+		{
+		delete [] activeChar;
+		activeChar = NULL;
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Installs the standard symbol list for the current `datatype' ("ACGT" for dna and nucleotide, "ACGU" for rna, the 
+|	amino acid codes plus '*' for protein, "01" for everything else), allocating the `symbols' buffer first if it is 
+|	NULL. Also flushes the equates list and installs the standard equate macros for the current `datatype'.
+*/
+void NxsCharactersBlock::ResetSymbols()
+	{
+	// Symbols might be NULL (if a different NxsCharactersBlock has consumed this one)
+	//
+	if (symbols == NULL)
+		symbols = new char[NCL_MAX_STATES+1];
+
+	// Pick the symbol list for this datatype; "01" serves every datatype not listed
+	//
+	const char *standardSymbols = "01";
+	switch(datatype)
+		{
+		case NxsCharactersBlock::dna:
+		case NxsCharactersBlock::nucleotide:
+			standardSymbols = "ACGT";
+			break;
+
+		case NxsCharactersBlock::rna:
+			standardSymbols = "ACGU";
+			break;
+
+		case NxsCharactersBlock::protein:
+			standardSymbols = "ACDEFGHIKLMNPQRSTVWY*";
+			break;
+
+		default:
+			break;
+		}
+	strcpy(symbols, standardSymbols);
+
+	// Standard equates, as key/value pairs, for the nucleic acid datatypes and for protein
+	//
+	static const char *const nucleicEquates[][2] =
+		{
+		{"R", "{AG}"},  {"Y", "{CT}"},  {"M", "{AC}"},  {"K", "{GT}"},
+		{"S", "{CG}"},  {"W", "{AT}"},  {"H", "{ACT}"}, {"B", "{CGT}"},
+		{"V", "{ACG}"}, {"D", "{AGT}"}, {"N", "{ACGT}"}, {"X", "{ACGT}"}
+		};
+	static const char *const proteinEquates[][2] =
+		{
+		{"B", "{DN}"}, {"Z", "{EQ}"}
+		};
+
+	equates.clear();
+
+	const bool nucleic = (datatype == NxsCharactersBlock::dna
+		|| datatype == NxsCharactersBlock::rna
+		|| datatype == NxsCharactersBlock::nucleotide);
+
+	if (nucleic)
+		{
+		const unsigned numPairs = sizeof(nucleicEquates)/sizeof(nucleicEquates[0]);
+		for (unsigned p = 0; p < numPairs; p++)
+			equates[ NxsString(nucleicEquates[p][0]) ] = NxsString(nucleicEquates[p][1]);
+		}
+	else if (datatype == NxsCharactersBlock::protein)
+		{
+		const unsigned numPairs = sizeof(proteinEquates)/sizeof(proteinEquates[0]);
+		for (unsigned p = 0; p < numPairs; p++)
+			equates[ NxsString(proteinEquates[p][0]) ] = NxsString(proteinEquates[p][1]);
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	File-local helper for NxsCharactersBlock::ShowStateLabels. Writes to `out' the label stored at index `s' in the 
+|	entry of `labelMap' for character `j'. If there is no entry for `j', or the entry holds no label at index `s', 
+|	the numeric state is written instead, followed by a marker saying that no label was found.
+*/
+static void NxsWriteStateLabel(
+  ostream &out,							/* the output stream on which to write */
+  const NxsStringVectorMap &labelMap,	/* the map from character index to that character's state labels */
+  unsigned j,							/* the character whose labels are to be consulted */
+  unsigned s)							/* the state whose label is wanted */
+	{
+	NxsStringVectorMap::const_iterator ci = labelMap.find(j);
+
+	// OPEN ISSUE: need to eliminate state labels for characters that have
+	// been eliminated
+	//
+	if (ci == labelMap.end() || s >= (*ci).second.size())
+		out << "  " << s << "[<-no label found]";
+	else
+		out << "  " << (*ci).second[s];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Looks up the state(s) at row `i', column `j' of matrix and writes it (or them) to out. If there is uncertainty or 
+|	polymorphism, the list of states is surrounded by the appropriate set of symbols (i.e., parentheses for 
+|	polymorphism, curly brackets for uncertainty). If TOKENS was specified, the output takes the form of the defined 
+|	state labels (or the numeric state plus a marker when no label is stored for it); otherwise, the correct symbol 
+|	is looked up in `symbols' and output. If `first_taxon' is not UINT_MAX and `i' is greater than `first_taxon', a 
+|	'.' is written in place of a state that is identical to the state of taxon `first_taxon' for the same character.
+*/
+void NxsCharactersBlock::ShowStateLabels(
+  ostream &out,				/* the output stream on which to write */
+  unsigned i,				/* the taxon, in range [0..`ntax') */
+  unsigned j,				/* the character, in range [0..`nchar') */
+  unsigned first_taxon)		/* the index of the first taxon (if UINT_MAX, don't use matchchar) */
+	{
+	// Only rows below the reference taxon are candidates for the match character
+	//
+	const bool compare_to_first = (first_taxon != UINT_MAX /*BQM: modified from '>= 0' */ && i > first_taxon);
+
+	if (tokens)
+		{
+		unsigned n = matrix->GetNumStates(i, j);
+		if (n == 0 && matrix->IsGap(i, j))
+			out << gap;
+		else if (n == 0 && matrix->IsMissing(i, j))
+			out << missing;
+		else if (n == 1) 
+			{
+			int s = matrix->GetState(i, j);
+			bool use_matchchar = false;
+			if (compare_to_first)
+				{
+				int firsts = matrix->GetState(first_taxon, j);
+				if (firsts == s)
+					use_matchchar = true;
+				}
+
+			if (use_matchchar)
+				{
+				// Show matchchar symbol '.'
+				//
+				out << "  .";
+				}
+			else
+				NxsWriteStateLabel(out, charStates, j, (unsigned)s);
+			}
+		else 
+			{
+			// TODO: handle matchchar possibility here too
+			//
+			const bool polymorphic = matrix->IsPolymorphic(i, j);
+			out << (polymorphic ? "  (" : "  {");
+			for (unsigned k = 0; k < n; k++)
+				{
+				unsigned s = matrix->GetState(i, j, k);
+				NxsWriteStateLabel(out, charStates, j, s);
+				}
+			out << (polymorphic ? ')' : '}');
+			}
+		}
+	else
+		{
+		if (compare_to_first)
+			{
+			// Render both cells as symbol strings and compare the text
+			//
+			char s[NCL_MAX_STATES + 3];
+			WriteStates(matrix->GetDiscreteDatum(i, j), s, NCL_MAX_STATES + 3);
+
+			char ss[NCL_MAX_STATES + 3];
+			WriteStates(matrix->GetDiscreteDatum(first_taxon, j), ss, NCL_MAX_STATES + 3);
+
+			if (strcmp(s, ss) == 0)
+				out << '.';
+			else
+				ShowStates(out, i, j);
+			}
+		else 
+			ShowStates(out, i, j);
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Writes out the state (or states) stored in the NxsDiscreteDatum object `d' to the buffer `s' as a null-terminated 
+|	string, using the `symbols' array to translate the numeric state values to state symbols. A missing datum is 
+|	written as the `missing' symbol and a gap as the `gap' symbol. When the datum does not hold exactly one state, 
+|	the list of states is surrounded by parentheses if the datum is polymorphic and by curly brackets otherwise. 
+|	Assumes `s' is non-NULL and long enough to hold everything printed (checked only by assert).
+*/
+void NxsCharactersBlock::WriteStates(
+  NxsDiscreteDatum &d,	/* the datum to be queried */
+  char *s,				/* the buffer to which to print */
+  unsigned slen)		/* the length of the buffer `s' */
+	{
+	assert(s != NULL);
+	assert(slen > 1);
+
+	// Missing data and gaps are each written as a single special symbol
+	//
+	if (matrix->IsMissing(d))
+		{
+		s[0] = missing;
+		s[1] = '\0';
+		return;
+		}
+
+	if (matrix->IsGap(d))
+		{
+		s[0] = gap;
+		s[1] = '\0';
+		return;
+		}
+
+	assert(symbols != NULL);
+	const unsigned symbolListLen = strlen(symbols);
+
+	// One character per state, plus the two delimiters when more than one state is present
+	//
+	const unsigned numStates = matrix->GetNumStates(d);
+	const unsigned numCharsNeeded = (numStates > 1 ? numStates + 2 : numStates);
+	assert(slen > numCharsNeeded);
+
+	if (numStates == 1)
+		{
+		const unsigned onlyState = matrix->GetState(d);
+		assert(onlyState < symbolListLen);
+		s[0] = symbols[onlyState];
+		s[1] = '\0';
+		return;
+		}
+
+	// State list: opening delimiter, one symbol per state, closing delimiter
+	//
+	const bool polymorphic = matrix->IsPolymorphic(d);
+	char *cursor = s;
+	*cursor++ = (polymorphic ? '(' : '{');
+	for (unsigned k = 0; k < numStates; k++)
+		{
+		const unsigned v = matrix->GetState(d, k);
+		assert(v < symbolListLen);
+		*cursor++ = symbols[v];
+		}
+	*cursor++ = (polymorphic ? ')' : '}');
+	*cursor = '\0';
+	}
diff --git a/ncl/nxscharactersblock.h b/ncl/nxscharactersblock.h
new file mode 100644
index 0000000..a329829
--- /dev/null
+++ b/ncl/nxscharactersblock.h
@@ -0,0 +1,800 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSCHARACTERSBLOCK_H
+#define NCL_NXSCHARACTERSBLOCK_H
+
+class NxsTaxaBlock;
+class NxsAssumptionsBlock;
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This class handles reading and storage for the NEXUS block CHARACTERS. It overrides the member functions Read and 
+|	Reset, which are abstract virtual functions in the base class NxsBlock. The issue of bookkeeping demands a careful
+|	explanation. Users are allowed to control the number of characters analyzed either by "eliminating" or "excluding"
+|	characters. Characters can be eliminated (by using the ELIMINATE command) at the time of execution of the data 
+|	file, but not thereafter. Characters can, however, be excluded at any time after the data are read. No storage is 
+|	provided for eliminated characters, whereas excluded characters must be stored because at any time they could be 
+|	restored to active status. Because one can depend on eliminated characters continuing to be eliminated, it would 
+|	be inefficient to constantly have to check whether a character has been eliminated. Hence, the characters are 
+|	renumbered so that one can efficiently traverse the entire range of non-eliminated characters. The original range 
+|	of characters will be hereafter denoted [0..`ncharTotal'), whereas the new, reduced range will be denoted 
+|	[0..`nchar'). The two ranges exactly coincide if `ncharTotal' = `nchar' (i.e., no ELIMINATE command was specified
+|	in the CHARACTERS block). The possibility for eliminating and excluding characters creates a very confusing situation
+|	that is exacerbated by the fact that character indices used in the code begin at 0 whereas character numbers in the
+|	data file begin at 1. The convention used hereafter will be to specify "character number k" when discussing 
+|	1-offset character numbers in the data file and either "character index k" or simply "character k" when discussing 
+|	0-offset character indices.
+|	
+|	There are several functions (and data structures) that provide services related to keeping track of the 
+|	correspondence between character indices in the stored data matrix compared to character numbers in the original 
+|	data file. The array `charPos' can be used to find the index of one of the original characters in the matrix. 
+|	The function GetCharPos provides public access to the protected `charPos' array. For example, if character 9 
+|	(= character number 10) was the only one eliminated, GetCharPos(9) would return UINT_MAX, indicating that 
+|	character 9 no longer exists. GetCharPos(10) returns 9, indicating that character 10 in the data file corresponds 
+|	to character 9 in the stored data matrix. All public functions in which a character number must be supplied (such 
+|	as GetInternalRepresentation) assume that the character number is the current position of the character in the data
+|	matrix. This allows one to quickly traverse the data matrix without having to constantly check whether or not a 
+|	character was eliminated. Note that GetNChar returns `nchar', not `ncharTotal', and this function should be used 
+|	to obtain the end point for a traversal of characters in the matrix. Other functions requiring a (current) character
+|	index are: 
+|>
+|	GetInternalRepresentation
+|	GetNumStates
+|	GetNumStates
+|	GetObsNumStates
+|	GetOrigCharIndex
+|	GetOrigCharNumber
+|	GetState
+|	HandleNextState
+|	HandleTokenState
+|	IsGapState
+|	IsMissingState
+|	IsPolymorphic
+|	ShowStateLabels
+|>
+|	The function IsEliminated is exceptional in requiring (by necessity) the original character index. The function 
+|	GetOrigCharIndex returns the original character index for any current character index. This is useful only when 
+|	outputting information that will be seen by the user, and in this case, it is really the character number that 
+|	should be output. To get the original character number, either add 1 to GetOrigCharIndex or call GetOrigCharNumber
+|	function (which simply returns GetOrigCharIndex + 1).
+|	
+|	A character may be excluded by calling the function ExcludeCharacter and providing the current character index or 
+|	by calling the function ApplyExset and supplying an exclusion set comprising original character indices. These 
+|	functions manipulate a bool array, `activeChar', which can be queried using one of two functions: IsActiveChar
+|	or IsExcluded. The array `activeChar' is `nchar' elements long, so IsActiveChar and IsExcluded both accept only 
+|	current character indices. Thus, a normal loop through all characters in the data matrix should look something 
+|	like this:
+|>
+|	for(unsigned j = 0; j < nchar; j++)
+|		{
+|		if (IsExcluded(j))
+|			continue;
+|		.
+|		.
+|		.
+|		}
+|>
+|	A corresponding set of data structures and functions exists to provide the same services for taxa. Thus, `ntax'
+|	 holds the current number of taxa, whereas `ntaxTotal' holds the number of taxa specified in the TAXA block. 
+|	If data is provided in the MATRIX command for all taxa listed in the TAXA block, ntax will be equal to `ntaxTotal'.
+|	If data is not provided for some of the taxa, the ones left out are treated just like eliminated characters. The 
+|	function GetTaxPos can be used to query the `taxonPos' array, which behaves like the `charPos' array does for 
+|	characters: UINT_MAX for element `i' means that the taxon whose original index was `i' has been eliminated and no 
+|	data is stored for it in the matrix. Otherwise, GetTaxPos(i) returns the current index corresponding to the taxon 
+|	with an original index of `i'. The function GetNTax returns `ntax', whereas GetNTaxTotal must be used to gain 
+|	access to `ntaxTotal' (but this is seldom necessary). The functions GetOrigTaxonIndex and GetOrigTaxonNumber behave 
+|	like their character counterparts, GetOrigCharIndex and GetOrigCharNumber. Like characters, taxa can be temporarily
+|	inactivated so that they do not participate in any analyses until they are reactivated by the user. Inactivation 
+|	of a taxon is referred to as deleting the taxon, whereas restoring a taxon means reactivating it. Thus, the 
+|	ApplyDelset, DeleteTaxon, and RestoreTaxon functions correspond to the ApplyExset, ExcludeCharacter, and 
+|	IncludeCharacter functions for characters. To query whether a taxon is currently deleted, use either 
+|	IsActiveTaxon or IsDeleted. A normal loop across all active taxa can be constructed as follows:
+|>
+|	for (unsigned i = 0; i < ntax; i++)
+|		{
+|		if (IsDeleted(i))
+|			continue;
+|		.
+|		.
+|		.
+|		}
+|>
+|	Below is a table showing the correspondence between the elements of a CHARACTERS block in a NEXUS file and the
+|	variables and member functions of the NxsCharactersBlock class that can be used to access each piece of information
+|	stored. Items in parenthesis should be viewed as "see also" items.
+|>
+|	NEXUS         Command        Data           Member
+|   Command       Attribute      Member         Functions
+|	---------------------------------------------------------------------
+|	DIMENSIONS    NEWTAXA        newtaxa
+|	
+|	              NTAX           ntax           GetNTax
+|                                (ntaxTotal)    (GetNumMatrixRows)
+|	
+|	              NCHAR          nchar          GetNChar
+|	                             (ncharTotal)   (GetNumMatrixCols)
+|	
+|	FORMAT        DATATYPE       datatype       GetDataType
+|	
+|	              RESPECTCASE    respectingCase IsRespectCase
+|	
+|	              MISSING        missing        GetMissingSymbol
+|	
+|	              GAP            gap            GetGapSymbol
+|	
+|	              SYMBOLS        symbols        GetSymbols
+|	
+|	              EQUATE         equates        GetEquateKey
+|	                                            GetEquateValue
+|	                                            GetNumEquates
+|	
+|	              MATCHCHAR      matchchar      GetMatchcharSymbol
+|	
+|	              (NO)LABELS     labels         IsLabels
+|	
+|	              TRANSPOSE      transposing    IsTranspose
+|	
+|	              INTERLEAVE     interleaving   IsInterleave
+|	
+|	              ITEMS          (Note: only STATES implemented)
+|	
+|	              STATESFORMAT   (Note: only STATESPRESENT implemented)
+|	
+|	              (NO)TOKENS     tokens         IsTokens
+|	
+|	ELIMINATE                    eliminated     IsEliminated
+|	                                            GetNumEliminated
+|	
+|	MATRIX                       matrix         GetState
+|	                                            GetInternalRepresentation
+|	                                            GetNumStates
+|	                                            GetNumMatrixRows
+|	                                            GetNumMatrixCols
+|	                                            IsPolymorphic
+|>
+*/
+class NxsCharactersBlock
+  : public NxsBlock
+	{
+	friend class NxsAssumptionsBlock;
+
+	public:
+
+		enum DataTypesEnum		/* values used to represent different basic types of data stored in a CHARACTERS block, and used with the data member `datatype' */
+			{
+			standard = 1,		/* indicates `matrix' holds characters with arbitrarily-assigned, discrete states, such as discrete morphological data */
+			dna,				/* indicates `matrix' holds DNA sequences (states A, C, G, T) */
+			rna,				/* indicates `matrix' holds RNA sequences (states A, C, G, U) */
+			nucleotide,			/* indicates `matrix' holds nucleotide sequences */
+			protein,			/* indicates `matrix' holds amino acid sequences */
+			continuous			/* indicates `matrix' holds continuous data */
+			};
+
+								NxsCharactersBlock(NxsTaxaBlock *tb, NxsAssumptionsBlock *ab);
+		virtual					~NxsCharactersBlock();
+
+		unsigned				ApplyDelset(NxsUnsignedSet &delset);
+		unsigned				ApplyExset(NxsUnsignedSet &exset);
+		unsigned				ApplyIncludeset(NxsUnsignedSet &inset);
+		unsigned				ApplyRestoreset(NxsUnsignedSet &restoreset);
+		unsigned				GetCharPos(unsigned origCharIndex);
+		unsigned				GetTaxPos(unsigned origTaxonIndex);
+		unsigned				GetDataType();
+		int						GetInternalRepresentation(unsigned i, unsigned j, unsigned k = 0);
+		unsigned				GetNTax();
+		unsigned				GetNChar();
+		unsigned				GetNCharTotal();
+		unsigned				GetNTaxTotal();
+		unsigned				GetNumActiveChar();
+		unsigned				GetNumActiveTaxa();
+		unsigned				GetNumEliminated();
+		unsigned				GetNumEquates();
+		unsigned				GetNumMatrixCols();
+		unsigned				GetNumMatrixRows();
+		unsigned				GetNumStates(unsigned i, unsigned j);
+		unsigned				GetOrigCharIndex(unsigned j);
+		unsigned				GetOrigCharNumber(unsigned j);
+		unsigned				GetOrigTaxonIndex(unsigned j);
+		unsigned				GetOrigTaxonNumber(unsigned j);
+		char					GetGapSymbol();
+		char					GetMatchcharSymbol();
+		char					GetMissingSymbol();
+		bool					IsGapState(unsigned i, unsigned j);
+		bool					IsInterleave();
+		bool					IsLabels();
+		bool					IsMissingState(unsigned i, unsigned j);
+		bool					IsPolymorphic(unsigned i, unsigned j);
+		bool					IsRespectCase();
+		bool					IsTokens();
+		bool					IsTranspose();
+		bool					IsEliminated(unsigned origCharIndex);
+		void					Consume(NxsCharactersBlock &other);
+		void					ExcludeCharacter(unsigned i);
+		void					IncludeCharacter(unsigned i);
+		bool					IsActiveChar(unsigned j);
+		bool					IsExcluded(unsigned j);
+		void					DeleteTaxon(unsigned i);
+		void					RestoreTaxon(unsigned i);
+		bool					IsActiveTaxon(unsigned i);
+		bool					IsDeleted(unsigned i);
+		void					ShowStateLabels(ostream &out, unsigned i, unsigned c, unsigned first_taxon = -1);
+		unsigned				GetStateSymbolIndex(unsigned i, unsigned j, unsigned k = 0);	// added by mth for standard data types
+		char					GetState(unsigned i, unsigned j, unsigned k = 0);
+		char					*GetSymbols();
+		bool					*GetActiveTaxonArray();
+		bool					*GetActiveCharArray();
+		NxsString				GetCharLabel(unsigned i);
+		NxsString				GetStateLabel(unsigned i, unsigned j);
+		NxsString				GetTaxonLabel(unsigned i);
+		virtual unsigned		CharLabelToNumber(NxsString s);
+		virtual unsigned		TaxonLabelToNumber(NxsString s);
+		virtual unsigned		GetMaxObsNumStates();
+		virtual unsigned		GetObsNumStates(unsigned j);
+		virtual void			DebugShowMatrix(ostream &out, bool use_matchchar, const char *marginText = 0);
+		virtual void			Report(ostream &out);
+		virtual void			Reset();
+
+	protected:
+
+		void					BuildCharPosArray(bool check_eliminated = false);
+		bool					IsInSymbols(char ch);
+		void					HandleCharlabels(NxsToken &token);
+		void					HandleCharstatelabels(NxsToken &token);
+		void					HandleDimensions(NxsToken &token, NxsString newtaxaLabel, NxsString ntaxLabel, NxsString ncharLabel);
+		void					HandleEliminate(NxsToken &token);
+		void					HandleEndblock(NxsToken &token, NxsString charToken);
+		virtual void			HandleFormat(NxsToken &token);
+		virtual void			HandleMatrix(NxsToken &token);
+		virtual bool			HandleNextState(NxsToken &token, unsigned i, unsigned c);
+		virtual void			HandleStdMatrix(NxsToken &token);
+		virtual unsigned		HandleTokenState(NxsToken &token, unsigned c);
+		virtual void			HandleTransposedMatrix(NxsToken &token);
+		virtual void			Read(NxsToken &token);
+		unsigned				PositionInSymbols(char ch);
+		void					HandleStatelabels(NxsToken &token);
+		void					HandleTaxlabels(NxsToken &token);
+		void					ResetSymbols();
+		void					ShowStates(ostream &out, unsigned i, unsigned j);
+		void					WriteStates(NxsDiscreteDatum &d, char *s, unsigned slen);
+
+		NxsTaxaBlock			*taxa;				/* pointer to the TAXA block in which taxon labels are stored */
+		NxsAssumptionsBlock		*assumptionsBlock;	/* pointer to the ASSUMPTIONS block in which exsets, taxsets and charsets are stored */
+
+		unsigned				ntax;				/* number of rows in matrix (same as `ntaxTotal' unless fewer taxa appeared in CHARACTERS MATRIX command than were specified in the TAXA block, in which case `ntaxTotal' > `ntax') */
+		unsigned				ntaxTotal;			/* number of taxa (same as `ntax' unless fewer taxa appeared in CHARACTERS MATRIX command than were specified in the TAXA block, in which case `ntaxTotal' > `ntax') */
+		unsigned				nchar;				/* number of columns in matrix (same as `ncharTotal' unless some characters were eliminated, in which case `ncharTotal' > `nchar') */
+		unsigned				ncharTotal;			/* total number of characters (same as `nchar' unless some characters were eliminated, in which case `ncharTotal' > `nchar') */
+
+		bool					newtaxa;			/* true if NEWTAXA keyword encountered in DIMENSIONS command */
+		bool					newchar;			/* true unless CHARLABELS or CHARSTATELABELS command read */
+
+		bool					formerly_datablock;	/* true if this object was originally read in as a DATA block rather than as a CHARACTERS block, false otherwise */
+		bool					respectingCase;		/* if true, RESPECTCASE keyword specified in FORMAT command */
+		bool					transposing;		/* indicates matrix will be in transposed format */
+		bool					interleaving;		/* indicates matrix will be in interleaved format */
+		bool					tokens;				/* if false, data matrix entries must be single symbols; if true, multicharacter entries are allows */
+		bool					labels;				/* indicates whether or not labels will appear on left side of matrix */
+
+		char					missing;			/* missing data symbol */
+		char					gap;				/* gap symbol for use with molecular data */
+		char					matchchar;			/* match symbol to use in matrix */
+
+		char					*symbols;			/* list of valid character state symbols */
+
+		NxsStringMap			equates;			/* list of associations defined by EQUATE attribute of FORMAT command */
+
+		NxsDiscreteMatrix		*matrix;			/* storage for discrete data */
+		unsigned				*charPos;			/* maps character numbers in the data file to column numbers in matrix (necessary if some characters have been eliminated) */
+		unsigned				*taxonPos;			/* maps taxon numbers in the data file to row numbers in matrix (necessary if fewer taxa appear in CHARACTERS block MATRIX command than are specified in the TAXA block) */
+		NxsUnsignedSet			eliminated;			/* array of (0-offset) character numbers that have been eliminated (will remain empty if no ELIMINATE command encountered) */
+
+		bool					*activeChar;		/* `activeChar[i]' true if character `i' not excluded; `i' is in range [0..`nchar') */
+		bool					*activeTaxon;		/* `activeTaxon[i]' true if taxon `i' not deleted; `i' is in range [0..`ntax') */
+
+		NxsStringVector			charLabels;			/* storage for character labels (if provided) */
+		NxsStringVectorMap		charStates;			/* storage for character state labels (if provided) */
+
+	private:
+
+		DataTypesEnum			datatype;			/* flag variable (see datatypes enum) */
+	};
+
+typedef NxsCharactersBlock CharactersBlock;	// NOTE(review): looks like a legacy alias kept for older client code -- confirm
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Marks the taxon at 0-offset current index `i' as deleted by clearing `activeTaxon[i]'. Repeating the call is harmless.
+*/
+inline void NxsCharactersBlock::DeleteTaxon(
+  unsigned i)	/* current index of the taxon to delete, expected in range [0..`ntax') */
+	{
+	this->activeTaxon[i] = false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Marks the character at 0-offset current index `i' as excluded by clearing `activeChar[i]'. Calling this for a
+|	character that is already excluded changes nothing.
+*/
+inline void NxsCharactersBlock::ExcludeCharacter(
+  unsigned i)	/* current index of the character to exclude, expected in range [0..`nchar') */
+	{
+	this->activeChar[i] = false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Hands out the `activeChar' data member, i.e. a pointer to the first element of the array of per-character active
+|	flags. The accessor exists because `activeChar' is protected: a class that merely stores a NxsCharactersBlock
+|	object, without being derived from NxsCharactersBlock, would otherwise be unable to reach the array. The pointer
+|	is returned as-is; no copy of the array is made.
+*/
+inline bool *NxsCharactersBlock::GetActiveCharArray()
+	{
+	return this->activeChar;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Hands out the `activeTaxon' data member, i.e. a pointer to the first element of the array of per-taxon active
+|	flags. The accessor exists because `activeTaxon' is protected: a class that merely stores a NxsCharactersBlock
+|	object, without being derived from NxsCharactersBlock, would otherwise be unable to reach the array. The pointer
+|	is returned as-is; no copy of the array is made.
+*/
+inline bool *NxsCharactersBlock::GetActiveTaxonArray()
+	{
+	return this->activeTaxon;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Looks up the label stored for character `i' in `charLabels'. When `i' lies beyond the stored labels (no label was
+|	specified), the result is a string holding a single blank (i.e., " ").
+*/
+inline NxsString NxsCharactersBlock::GetCharLabel(
+  unsigned i)	/* the character in range [0..`nchar') */
+	{
+	const bool hasLabel = (i < charLabels.size());
+	if (!hasLabel)
+		return NxsString(" ");
+	return charLabels[i];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns current index of character in matrix. This may differ from the original index if some characters were
+|	removed using an ELIMINATE command. For example, character number 9 in the original data matrix may now be at
+|	position 8 if the original character 8 was eliminated. The parameter `origCharIndex' is assumed to range from
+|	0 to `ncharTotal' - 1; this, and `charPos' being non-NULL, is checked with assert.
+*/
+inline unsigned NxsCharactersBlock::GetCharPos(
+  unsigned origCharIndex)	/* original index of character in range [0..`ncharTotal') */
+	{
+	assert(charPos);
+	// `origCharIndex' is unsigned, so a ">= 0" check would be vacuous; only the upper bound is tested.
+	assert(origCharIndex < ncharTotal);
+
+	return charPos[origCharIndex];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `gap', the gap symbol currently in effect ('\0' is expected when no gap symbol has been specified).
+*/
+inline char NxsCharactersBlock::GetGapSymbol()
+	{
+	return this->gap;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns current index of taxon in matrix. This may differ from the original index if some taxa were listed in the
+|	TAXA block but not in the DATA or CHARACTERS block. The parameter `origTaxonIndex' is assumed to range from 0 to
+|	`ntaxTotal' - 1; this, and `taxonPos' being non-NULL, is checked with assert.
+*/
+inline unsigned NxsCharactersBlock::GetTaxPos(
+  unsigned origTaxonIndex)	/* original index of taxon in range [0..`ntaxTotal') */
+	{
+	assert(taxonPos);
+	// `origTaxonIndex' is unsigned, so a ">= 0" check would be vacuous; only the upper bound is tested.
+	assert(origTaxonIndex < ntaxTotal);
+
+	return taxonPos[origTaxonIndex];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the value of `datatype', converted from DataTypesEnum to unsigned.
+*/
+inline unsigned NxsCharactersBlock::GetDataType()
+	{
+	return static_cast<unsigned>(datatype);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `matchchar', the match symbol currently in effect ('\0' is expected when none has been specified).
+*/
+inline char NxsCharactersBlock::GetMatchcharSymbol()
+	{
+	return this->matchchar;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns internal representation of the state for taxon `i', character `j'. In the normal situation, `k' is 0 meaning
+|	there is only one state with no uncertainty or polymorphism. If there are multiple states, specify a number in the 
+|	range [0..n) where n is the number of states returned by the GetNumStates function. Use the IsPolymorphic 
+|	function to determine whether the multiple states correspond to uncertainty in state assignment or polymorphism in 
+|	the taxon. The value returned from this function is one of the following:
+|~
+|	o -3 means gap state (see note below)
+|	o -2 means missing state (see note below)
+|	o an integer 0 or greater is internal representation of a state
+|~
+|	Note: gap and missing states are actually represented internally in a different way; for a description of the actual
+|	internal representation of states, see the documentation for NxsDiscreteDatum.
+*/
+inline int NxsCharactersBlock::GetInternalRepresentation(
+  unsigned i,	/* the taxon in range [0..`ntax') */
+  unsigned j,	/* the character in range [0..`nchar') */
+  unsigned k)	/* the 0-offset index of state to return */
+	{
+	// Gap and missing entries are reported through the sentinel codes -3 and -2 respectively.
+	if (IsGapState(i, j))
+		return -3;
+	if (IsMissingState(i, j))
+		return -2;
+	return matrix->GetState(i, j, k);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `missing', the missing data symbol currently in effect ('\0' is expected when none has been specified).
+*/
+inline char NxsCharactersBlock::GetMissingSymbol()
+	{
+	return this->missing;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `nchar', the number of columns in the matrix.
+*/
+inline unsigned NxsCharactersBlock::GetNChar()
+	{
+	return this->nchar;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `ncharTotal', the total number of characters (eliminated characters included).
+*/
+inline unsigned NxsCharactersBlock::GetNCharTotal()
+	{
+	return this->ncharTotal;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `ntax', the number of rows in the matrix.
+*/
+inline unsigned NxsCharactersBlock::GetNTax()
+	{
+	return this->ntax;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `ntaxTotal', the number of taxa, which can exceed `ntax' when the matrix has rows for only some of them.
+*/
+inline unsigned NxsCharactersBlock::GetNTaxTotal()
+	{
+	return this->ntaxTotal;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns how many characters were eliminated, computed as the difference `ncharTotal' - `nchar'.
+*/
+inline unsigned NxsCharactersBlock::GetNumEliminated()
+	{
+	return ncharTotal - nchar;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the number of entries currently held in the `equates' map, converted to unsigned.
+*/
+inline unsigned NxsCharactersBlock::GetNumEquates()
+	{
+	return static_cast<unsigned>(equates.size());
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the number of columns actually present in `matrix', which is simply `nchar'. The value can be smaller than
+|	`ncharTotal', because characters may have been eliminated.
+*/
+inline unsigned NxsCharactersBlock::GetNumMatrixCols()
+	{
+	return this->nchar;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the number of rows actually present in `matrix', which is simply `ntax'. The value can be smaller than
+|	`ntaxTotal', because data need not be supplied for every taxon listed in the TAXA block.
+*/
+inline unsigned NxsCharactersBlock::GetNumMatrixRows()
+	{
+	return this->ntax;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Forwards to `matrix' to obtain the number of states stored for taxon `i', character `j'.
+*/
+inline unsigned NxsCharactersBlock::GetNumStates(
+  unsigned i,	/* the taxon in range [0..`ntax') */
+  unsigned j)	/* the character in range [0..`nchar') */
+	{
+	return this->matrix->GetNumStates(i, j);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Forwards to `matrix' to obtain the number of states observed for character `j' over all taxa. Performance note
+|	carried over from the original documentation: the underlying computation visits every row and collects the states
+|	it meets in a set, so it is rather slow; callers needing the value often should compute it once per character and
+|	cache the results in an array.
+*/
+inline unsigned NxsCharactersBlock::GetObsNumStates(
+  unsigned j)	/* the character in range [0..`nchar') */
+	{
+	return this->matrix->GetObsNumStates(j);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the 1-offset original character number (as used in the NEXUS data file), i.e. GetOrigCharIndex(`j') + 1.
+|	Per the original notes, this equals `j' + 1 unless some characters were eliminated.
+*/
+inline unsigned NxsCharactersBlock::GetOrigCharNumber(
+  unsigned j)	/* the character in range [0..`nchar') */
+	{
+	return GetOrigCharIndex(j) + 1;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the 1-offset original taxon number (as used in the NEXUS data file), i.e. GetOrigTaxonIndex(`i') + 1.
+|	Per the original notes, this equals `i' + 1 unless data was not provided for some taxa of a preceding TAXA block.
+*/
+inline unsigned NxsCharactersBlock::GetOrigTaxonNumber(
+  unsigned i)	/* the taxon in range [0..`ntax') */
+	{
+	return GetOrigTaxonIndex(i) + 1;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns symbol from symbols list representing the state for taxon `i' and character `j'. The normal situation in 
+|	which there is only one state with no uncertainty or polymorphism is represented by `k' = 0. If there are multiple 
+|	states, specify a number in the range [0..n) where n is the number of states returned by the GetNumStates function.
+|	Use the IsPolymorphic function to determine whether the multiple states correspond to uncertainty in state 
+|	assignment or polymorphism in the taxon. Assumes `symbols' is non-NULL.
+*/
+inline char NxsCharactersBlock::GetState(
+  unsigned i,	/* the taxon in range [0..`ntax') */
+  unsigned j,	/* the character in range [0..`nchar') */
+  unsigned k)	/* the 0-offset index of the state to return */
+	{
+	assert(symbols);
+
+	// Ask the matrix for the stored state code, which is used as an offset into `symbols'.
+	const unsigned symbolIndex = matrix->GetState(i, j, k);
+
+	// Debug-build guard: the offset must address a character inside the NUL-terminated `symbols' string.
+	assert(symbolIndex < strlen(symbols));
+
+	return symbols[symbolIndex];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the `symbols' data member (list of valid character state symbols). Warning: the pointer may be NULL.
+*/
+inline char *NxsCharactersBlock::GetSymbols()
+	{
+	return this->symbols;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the label of taxon number `i' (`i' ranges from 0 to `ntax' - 1), as reported by the `taxa' block.
+*/
+inline NxsString NxsCharactersBlock::GetTaxonLabel(
+  unsigned i)	/* the taxon's position in the taxa block */
+	{
+	// The lookup is delegated to the NxsTaxaBlock this object points to; `taxa' is dereferenced unchecked.
+	return taxa->GetTaxonLabel(i);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Marks the character at 0-offset current index `i' as active by setting `activeChar[i]'. Repeating the call is harmless.
+*/
+inline void NxsCharactersBlock::IncludeCharacter(
+  unsigned i)	/* current index of the character to include, expected in range [0..`nchar') */
+	{
+	this->activeChar[i] = true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if character `j' is active, false if it has been excluded. `j' must lie in the range [0..`nchar');
+|	this, and `activeChar' being non-NULL, is checked with assert.
+*/
+inline bool NxsCharactersBlock::IsActiveChar(
+  unsigned j)	/* the character in question, in the range [0..`nchar') */
+	{
+	assert(activeChar);	// the flag array must exist; no ">= 0" test is needed because `j' is unsigned
+	assert(j < nchar);
+
+	return activeChar[j];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if taxon `i' is active, false if it has been deleted. `i' must lie in the range [0..`ntax'); this,
+|	and `activeTaxon' being non-NULL, is checked with assert.
+*/
+inline bool NxsCharactersBlock::IsActiveTaxon(
+  unsigned i)	/* the taxon in question, in the range [0..`ntax') */
+	{
+	assert(activeTaxon);	// the flag array must exist; no ">= 0" test is needed because `i' is unsigned
+	assert(i < ntax);
+
+	return activeTaxon[i];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if taxon number `i' has been deleted, false otherwise; this is the negation of IsActiveTaxon.
+*/
+inline bool NxsCharactersBlock::IsDeleted(
+  unsigned i)	/* the taxon in question, in the range [0..`ntax') */
+	{
+	return IsActiveTaxon(i) == false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if character `j' has been excluded and false if it is active; this is the negation of IsActiveChar,
+|	which requires `j' to lie in the range [0..`nchar').
+*/
+inline bool NxsCharactersBlock::IsExcluded(
+  unsigned j)	/* the character in question, in the range [0..`nchar') */
+	{
+	return IsActiveChar(j) == false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Asks `matrix' whether the entry for taxon `i', character `j' is the gap state and returns the answer. `matrix' must
+|	be non-NULL (checked with assert).
+*/
+inline bool NxsCharactersBlock::IsGapState(
+  unsigned i,	/* the taxon, in range [0..`ntax') */
+  unsigned j)	/* the character, in range [0..`nchar') */
+	{
+	assert(matrix != 0);
+	return this->matrix->IsGap(i, j);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the `interleaving' flag, which is true if INTERLEAVE was specified in the FORMAT command.
+*/
+inline bool NxsCharactersBlock::IsInterleave()
+	{
+	return this->interleaving;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the `labels' flag, which is true if LABELS was specified in the FORMAT command.
+*/
+inline bool NxsCharactersBlock::IsLabels()
+	{
+	return this->labels;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Asks `matrix' whether the entry for taxon `i', character `j' is the missing state and returns the answer. `matrix'
+|	must be non-NULL (checked with assert).
+*/
+inline bool NxsCharactersBlock::IsMissingState(
+  unsigned i,	/* the taxon, in range [0..`ntax') */
+  unsigned j)	/* the character, in range [0..`nchar') */
+	{
+	assert(matrix != 0);
+	return this->matrix->IsMissing(i, j);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Asks `matrix' whether taxon `i' is polymorphic for character `j' and returns the answer. `matrix' must be non-NULL
+|	(checked with assert). As noted in the original documentation, the result is false when only one state is stored,
+|	so this function cannot be used to tell whether there is uncertainty.
+*/
+inline bool NxsCharactersBlock::IsPolymorphic(
+  unsigned i,	/* the taxon in range [0..`ntax') */
+  unsigned j)	/* the character in range [0..`nchar') */
+	{
+	assert(matrix != 0);
+	return this->matrix->IsPolymorphic(i, j);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the `respectingCase' flag, which is true if RESPECTCASE was specified in the FORMAT command.
+*/
+inline bool NxsCharactersBlock::IsRespectCase()
+	{
+	return this->respectingCase;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the `tokens' flag, which is true if TOKENS was specified in the FORMAT command.
+*/
+inline bool NxsCharactersBlock::IsTokens()
+	{
+	return this->tokens;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the `transposing' flag, which is true if TRANSPOSE was specified in the FORMAT command.
+*/
+inline bool NxsCharactersBlock::IsTranspose()
+	{
+	return this->transposing;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Marks the taxon at 0-offset current index `i' as active again by setting `activeTaxon[i]'. Repeating the call is harmless.
+*/
+inline void NxsCharactersBlock::RestoreTaxon(
+  unsigned i)	/* current index of the taxon to restore, expected in range [0..`ntax') */
+	{
+	this->activeTaxon[i] = true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Shows the states for taxon `i', character `j', on the stream `out'. Uses `symbols' array to translate the states 
+|	from the way they are stored (as integers) to the symbol used in the original data matrix. Assumes `i' is in the 
+|	range [0..`ntax') and `j' is in the range [0..`nchar'). Also assumes `matrix' is non-NULL.
+*/
+inline void NxsCharactersBlock::ShowStates(
+  ostream &out,	/* the stream on which to show the state(s) */
+  unsigned i,	/* the (0-offset) index of the taxon in question */
+  unsigned j)	/* the (0-offset) index of the character in question */
+	{
+	// `i' and `j' are unsigned, so only their upper bounds need checking.
+	assert(i < ntax);
+	assert(j < nchar);
+	assert(matrix);
+
+	// Scratch buffer handed to WriteStates together with its capacity; it is then printed as a C string.
+	char s[NCL_MAX_STATES + 3];
+	WriteStates(matrix->GetDiscreteDatum(i, j), s, static_cast<unsigned>(sizeof(s)));
+
+	out << s;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Converts a taxon label to a number corresponding to the taxon's position within the list maintained by the 
+|	NxsTaxaBlock object. This method overrides the virtual function of the same name in the NxsBlock base class. If 
+|	`s' is not a valid taxon label, returns the value 0.
+*/
+inline unsigned NxsCharactersBlock::TaxonLabelToNumber(
+  NxsString s)	/* the taxon label to convert */
+	{
+	unsigned i = 0;	// 0 is the "not a valid taxon label" result
+	try
+		{
+		i = 1 + taxa->FindTaxon(s);	// shift FindTaxon's result by one so that 0 stays free to signal failure
+		}
+	catch (const NxsTaxaBlock::NxsX_NoSuchTaxon &)	// caught by const reference: no copy of the exception object
+		{
+		i = 0;
+		}
+
+	return i;
+	}
+
+
+
+
+
+#endif
diff --git a/ncl/nxsdatablock.cpp b/ncl/nxsdatablock.cpp
new file mode 100644
index 0000000..43a1d2e
--- /dev/null
+++ b/ncl/nxsdatablock.cpp
@@ -0,0 +1,54 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Forwards `tb' and `ab' to the NxsCharactersBlock constructor, then sets `newtaxa' to true and `id' to "DATA".
+*/
+NxsDataBlock::NxsDataBlock(
+  NxsTaxaBlock *tb,			/* the taxa block object for storing taxon labels */
+  NxsAssumptionsBlock *ab)	/* the assumptions block object for storing exsets */
+  : NxsCharactersBlock(tb, ab)
+	{
+	this->newtaxa = true;
+	this->id = "DATA";
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Performs the NxsCharactersBlock reset first, then puts `newtaxa' back to true so that the object is ready for
+|	another DATA block to be read.
+*/
+void NxsDataBlock::Reset()
+	{
+	this->NxsCharactersBlock::Reset();	// base-class reset must run before `newtaxa' is re-established
+	this->newtaxa = true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Hands this object's contents over to `charactersblock': the receiver is Reset and then Consumes this object, viewed
+|	as its NxsCharactersBlock base. Per the original notes, this NxsDataBlock afterwards reports itself as empty and
+|	the `formerly_datablock' data member of `charactersblock' ends up true (both effects come from Consume).
+*/
+void NxsDataBlock::TransferTo(
+  NxsCharactersBlock &charactersblock)	/* the NxsCharactersBlock object that will receive all the data from this object */
+	{
+	charactersblock.Reset();
+	charactersblock.Consume(static_cast<NxsCharactersBlock &>(*this));
+	}
diff --git a/ncl/nxsdatablock.h b/ncl/nxsdatablock.h
new file mode 100644
index 0000000..0896077
--- /dev/null
+++ b/ncl/nxsdatablock.h
@@ -0,0 +1,39 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSDATABLOCK_H
+#define NCL_NXSDATABLOCK_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reader and storage class for the NEXUS block DATA. It derives from NxsCharactersBlock and adds only a constructor,
+|	a Reset override and TransferTo, which moves the stored data into a NxsCharactersBlock object.
+*/
+class NxsDataBlock
+  : public NxsCharactersBlock
+	{
+	public:
+						NxsDataBlock(NxsTaxaBlock *tb, NxsAssumptionsBlock *ab);
+
+		virtual void	Reset();	/* overrides the virtual NxsCharactersBlock::Reset */
+		void			TransferTo(NxsCharactersBlock &charactersblock);
+	};
+
+typedef NxsDataBlock DataBlock;
+
+#endif
diff --git a/ncl/nxsdefs.h b/ncl/nxsdefs.h
new file mode 100644
index 0000000..84427d4
--- /dev/null
+++ b/ncl/nxsdefs.h
@@ -0,0 +1,79 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+#ifndef NCL_NXSDEFS_H
+#define NCL_NXSDEFS_H
+
+#define NCL_NAME_AND_VERSION  "NCL version 2.0"
+#define NCL_COPYRIGHT         "Copyright (c) 1999-2003 by Paul O. Lewis"
+#define NCL_HOMEPAGEURL       "http://lewis.eeb.uconn.edu/ncl/"
+
+// Maximum number of states that can be stored; the only limitation is that this
+// number be less than the maximum size of an int (not likely to be a problem).
+// A good number for this is 76, which is 96 (the number of distinct symbols
+// able to be input from a standard keyboard) less 20 (the number of
+// symbols disallowed by the NEXUS standard for use as state symbols).
+//
+#define NCL_MAX_STATES         76
+
+#if defined(__MWERKS__) || defined(__DECCXX) || defined(_MSC_VER)
+	typedef long		file_pos;	// compilers defining one of the macros above get a plain long
+#else
+	typedef streampos	file_pos;	// every other compiler uses the stream library's streampos
+#endif
+
+#define	SUPPORT_OLD_NCL_NAMES	// NOTE(review): presumably enables the deprecated pre-2.0 names -- confirm where it is tested
+
+class NxsString;	// forward declaration; only the name is needed for the typedefs below
+
+typedef vector<bool>										NxsBoolVector;		// vector of bool
+typedef vector<char>										NxsCharVector;		// vector of char
+typedef vector<unsigned>									NxsUnsignedVector;	// vector of unsigned
+typedef vector<NxsString>									NxsStringVector;	// vector of NxsString
+typedef vector<NxsStringVector>								NxsAllelesVector;	// vector of NxsStringVector
+
+typedef set< unsigned, less<unsigned> >						NxsUnsignedSet;		// set of unsigned, ordered by less<>
+
+typedef map< unsigned, NxsStringVector, less<unsigned> >	NxsStringVectorMap;	// unsigned key -> NxsStringVector
+typedef map< NxsString, NxsString, less<NxsString> >		NxsStringMap;		// NxsString key -> NxsString
+typedef map< NxsString, NxsUnsignedSet, less<NxsString> >	NxsUnsignedSetMap;	// NxsString key -> NxsUnsignedSet
+
+// The following typedefs exist only to keep existing client code compiling.
+// The names on the right (the aliases being defined) are deprecated; new code should use the Nxs* names on the left.
+//
+typedef	NxsBoolVector		BoolVect;
+typedef NxsUnsignedSet		IntSet;
+typedef NxsUnsignedSetMap	IntSetMap;
+typedef NxsAllelesVector	AllelesVect;
+typedef NxsStringVector		LabelList;
+typedef NxsStringVector		StrVec;
+typedef NxsStringVector		vecStr;
+typedef NxsStringVectorMap	LabelListBag;
+typedef NxsStringMap		AssocList;
+
+//class NxsTreesBlock;
+//class NxsTaxaBlock;
+//class NxsAllelesBlock;
+//class NxsAssumptionsBlock;
+//class NxsCharactersBlock;
+//class NxsDistancesBlock;
+//class NxsAssumptionsBlock;
+//class NxsDiscreteDatum;
+//class NxsDiscreteMatrix;
+
+#endif
diff --git a/ncl/nxsdiscretedatum.cpp b/ncl/nxsdiscretedatum.cpp
new file mode 100644
index 0000000..1866765
--- /dev/null
+++ b/ncl/nxsdiscretedatum.cpp
@@ -0,0 +1,88 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Default constructor. Starts with `states' set to NULL, i.e. no state array has been allocated yet.
+*/
+NxsDiscreteDatum::NxsDiscreteDatum()
+	: states(NULL)
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Frees the `states' array. Nothing happens if no array was ever allocated.
+*/
+NxsDiscreteDatum::~NxsDiscreteDatum()
+	{
+	// delete [] applied to a NULL pointer does nothing, so no test is needed.
+	delete [] states;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Makes this NxsDiscreteDatum object an exact copy of `other'. Useful for dealing with matchchar symbols in a matrix.
+|	Whatever this object held beforehand is discarded. Copying an object onto itself leaves it unchanged.
+*/
+void NxsDiscreteDatum::CopyFrom(
+  const NxsDiscreteDatum &other)	/* the source NxsDiscreteDatum object  */
+	{
+	// Self-copy must be a no-op. Without this test the array would be deleted
+	// below and then read back through `other', leaving a missing-data datum.
+	if (this == &other)
+		return;
+
+	if (states != NULL)
+		{
+		delete [] states;
+		states = NULL;
+		}
+
+	// A NULL array in the source means missing data, which is now our state too.
+	//
+	if (other.states == NULL)
+		return;
+
+	unsigned sz = other.states[0];
+	if (sz == 0)
+		{
+		// First element of other.states is zero, indicating that the gap state is present
+		//
+		states = new unsigned[1];
+		states[0] = 0;
+		}
+	else if (sz == 1)
+		{
+		// Exactly one state is stored, so there is no polymorphism indicator cell to copy.
+		//
+		states = new unsigned[2];
+		states[0] = 1;
+		states[1] = other.states[1];
+		}
+	else
+		{
+		// Ambiguity or polymorphism: `sz' states are followed by one polymorphism
+		// indicator cell, so cells 1 through sz + 1 are copied.
+		//
+		states = new unsigned[sz + 2];
+		states[0] = sz;
+		for (unsigned i = 1; i <= sz + 1; i++)
+			states[i] = other.states[i];
+		}
+	}
diff --git a/ncl/nxsdiscretedatum.h b/ncl/nxsdiscretedatum.h
new file mode 100644
index 0000000..e029121
--- /dev/null
+++ b/ncl/nxsdiscretedatum.h
@@ -0,0 +1,71 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSDISCRETEDATUM_H
+#define NCL_NXSDISCRETEDATUM_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Class for holding discrete states in a matrix. Note that there is no way to access the variables of this class 
+|	since they are all private and there are no public access functions. This class is designed to be manipulated by 
+|	the class NxsDiscreteMatrix, which is the only class that has been designated a friend of NxsDiscreteDatum. 
+|	The variable `states' is NULL if there is missing data, and non-NULL for any other state. If `states' is non-NULL,
+|	the first cell is used to store the number of states. This will be 0 if the state is the gap state, 1 if the state
+|	is unambiguous and nonpolymorphic (and not the gap state of course), and 2 or higher if there is either 
+|	polymorphism or uncertainty. If polymorphism or uncertainty apply, it becomes necessary to store information about 
+|	which of these two situations holds. Thus, the last cell in the array is set to either 1 (polymorphism) or 0 
+|	(uncertainty). While a little complicated, this scheme has the following benefits:
+|~
+|	o if the state is missing, the only memory allocated is for a pointer (`states')
+|	o if the state is unambiguous and not polymorphic, no storage is used for keeping track of whether polymorphism or 
+|	  uncertainty holds
+|	o it allows for a virtually unlimited number of states, which is important if it is to be general enough to store 
+|	  microsatellite data for a NxsAllelesBlock object, for example.
+|~
+|	Supposing the gap symbol is '-', the missing data symbol is '?', and the symbols list is "ACGT", the following 
+|	table shows the status of the states variable under several different possible data matrix entries:
+|>
+|	Matrix entry        states array
+|	--------------------------------
+|	     ?              NULL
+|	     -              [0]
+|	     G              [1][2]
+|	(AG) polymorphic    [2][0][2][1]
+|	{AG} ambiguous      [2][0][2][0]
+|	--------------------------------
+|>	
+*/
+class NxsDiscreteDatum
+	{
+	friend class NxsDiscreteMatrix;
+
+	public:
+		/* construction and destruction; the destructor releases the `states' array */
+					NxsDiscreteDatum();
+		virtual		~NxsDiscreteDatum();
+		/* deep copy; no copy constructor or operator= is declared, so plain assignment only copies the pointer */
+		void		CopyFrom(const NxsDiscreteDatum &other);
+
+	private:
+		/* NULL when the datum is missing data, otherwise an array allocated with new [] and owned by this object */
+		unsigned	*states;	/* holds information about state for a single taxon-character combination */
+	};
+
+typedef NxsDiscreteDatum DiscreteDatum;
+
+#endif
diff --git a/ncl/nxsdiscretematrix.cpp b/ncl/nxsdiscretematrix.cpp
new file mode 100644
index 0000000..116ba03
--- /dev/null
+++ b/ncl/nxsdiscretematrix.cpp
@@ -0,0 +1,637 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Creates a matrix with `rows' rows and `cols' columns. The dimensions are recorded in `nrows' and `ncols', and
+|	`data' receives one freshly allocated array of default-constructed NxsDiscreteDatum cells per row.
+*/
+NxsDiscreteMatrix::NxsDiscreteMatrix(
+  unsigned rows,	/* number of taxa */
+  unsigned cols)	/* number of characters */
+	{
+	nrows = rows;
+	ncols = cols;
+	data = new NxsDiscreteDatum*[rows];
+
+	for (unsigned r = 0; r != rows; ++r)
+		data[r] = new NxsDiscreteDatum[cols];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Frees every row of `data' and then the table of row pointers. Does nothing when `data' is NULL.
+*/
+NxsDiscreteMatrix::~NxsDiscreteMatrix()
+	{
+	if (data == NULL)
+		return;
+
+	for (unsigned r = 0; r != nrows; ++r)
+		delete [] data[r];
+	delete [] data;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Grows the matrix by `nAddRows' rows and updates `nrows' accordingly. Rows already present keep their contents (only
+|	the pointers to them are moved into a larger table); the new rows are appended at the bottom of the matrix.
+*/
+void NxsDiscreteMatrix::AddRows(
+  unsigned nAddRows)	/* the number of additional rows to allocate */
+	{
+	const unsigned old_nrows = nrows;
+	const unsigned total_rows = old_nrows + nAddRows;
+
+	// Build a larger table of row pointers and move the existing rows into it.
+	// The rows themselves are not reallocated.
+	//
+	NxsDiscreteDatum **grown = new NxsDiscreteDatum*[total_rows];
+	unsigned r = 0;
+	for (; r < old_nrows; ++r)
+		grown[r] = data[r];
+
+	// The old table is no longer needed; `data' now refers to the larger one.
+	//
+	delete [] data;
+	data = grown;
+
+	// Give each appended row its own array of default-constructed cells.
+	//
+	for (; r < total_rows; ++r)
+		data[r] = new NxsDiscreteDatum[ncols];
+
+	// Record the new row count only after all rows exist.
+	//
+	nrows = total_rows;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Adds `value' to the states already stored in the cell at row `i', column `j'. Assumes `data' is non-NULL, `i' is
+|	in the range [0..nrows), and `j' is in the range [0..ncols). The actual work is done by the private overload of
+|	AddState that takes a NxsDiscreteDatum reference; see its documentation for how the missing state, the gap state
+|	and existing states are treated.
+*/
+void NxsDiscreteMatrix::AddState(
+  unsigned i,		/* the (0-offset) index of the taxon in question */
+  unsigned j,		/* the (0-offset) index of the character in question */
+  unsigned value)	/* the state to be added */
+	{
+	// All three arguments are unsigned, so tests for negative values would
+	// always pass; only the upper bounds and `data' are worth asserting.
+	//
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	AddState(data[i][j], value);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Adds `value' to the states stored in `d'. A new `states' array is allocated and the old one (if any) is deleted.
+|	With n being the number of states stored on entry, the result is:
+|
+|	   n == 0 (missing or gap state)   [1][value]
+|	   n >= 1                          [n+1][old state 1]...[old state n][value][0]
+|
+|	The final cell in the second form is the polymorphism indicator, which is always written as 0 here (i.e. not
+|	polymorphic); use SetPolymorphic afterwards if that is wrong. Assumes that we are not trying to set either the
+|	missing state or the gap state here; the functions SetMissing or SetGap, respectively, should be used for those
+|	purposes. This function always adds states to those already present; use SetState to overwrite the state.
+*/
+void NxsDiscreteMatrix::AddState(
+  NxsDiscreteDatum &d,	/* the NxsDiscreteDatum object affected */
+  unsigned value)		/* the additional state to be added */
+	{
+	// Hold on to the current array: it is read while the replacement is
+	// built and is released at the very end.
+	//
+	unsigned *previous = d.states;
+	const unsigned nold = GetNumStates(d);
+
+	if (nold == 0)
+		{
+		// GetNumStates reports 0 for both the missing state (NULL array) and
+		// the gap state (a count of zero); either way `value' is stored as
+		// the one and only state.
+		//
+		d.states = new unsigned[2];
+		d.states[0] = 1;
+		d.states[1] = value;
+		}
+	else
+		{
+		// Room is needed for the count cell, the `nold' existing states, the
+		// new state and the trailing polymorphism cell.
+		//
+		const unsigned ncells = nold + 3;
+		unsigned *grown = new unsigned[ncells];
+
+		grown[0] = nold + 1;
+		for (unsigned k = 1; k <= nold; k++)
+			grown[k] = previous[k];
+		grown[ncells - 2] = value;
+		grown[ncells - 1] = 0;	// Assume not polymorphic unless told otherwise.
+
+		d.states = grown;
+		}
+
+	// The array that was in place on entry has been superseded. Deleting
+	// NULL is harmless, so no test is needed.
+	//
+	delete [] previous;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets state of taxon `i' and character `j' to state of first taxon for character `j'. Assumes `i' is in the range
+|	[0..nrows) and `j' is in the range [0..ncols). Also assumes `data' is non-NULL. Nothing is done when `i' is 0,
+|	since the first taxon already holds its own state; otherwise NxsDiscreteDatum::CopyFrom does the actual work.
+*/
+void NxsDiscreteMatrix::CopyStatesFromFirstTaxon(
+  unsigned i,	/* the (0-offset) index of the taxon in question */
+  unsigned j)	/* the (0-offset) index of the character in question */
+	{
+	assert(i < nrows);	// indices are unsigned, so only upper bounds can fail
+	assert(j < ncols);
+	assert(data != NULL);
+
+	// Copying row 0 onto itself must not disturb the stored state.
+	if (i != 0)
+		data[i][j].CopyFrom(data[0][j]);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Dumps `data' to `out': first the values of `nrows' and `ncols', then one line per row in which every cell is written
+|	with field width `colwidth'. Missing data is shown as '?', gaps as '-', and other cells via GetState.
+*/
+void NxsDiscreteMatrix::DebugSaveMatrix(
+  ostream &out,			/* the stream on which to dump the matrix contents */
+  unsigned colwidth)	/* the width of a data matrix column in characters */
+	{
+	out << endl << "nrows = " << nrows << endl;
+	out << "ncols = " << ncols << endl;
+	for (unsigned row = 0; row < nrows; row++)
+		{
+		for (unsigned col = 0; col < ncols; col++)
+			{
+			out << setw(colwidth);
+			if (IsMissing(row, col))
+				out << '?';
+			else if (IsGap(row, col))
+				out << '-';
+			else
+				out << GetState(row, col);
+			}
+		out << endl;
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Duplicates columns `startCol' to `endCol' (both inclusive) of row `row' into the rows that follow it. If additional
+|	storage is needed to accommodate the duplication, this is done automatically through the use of the AddRows method.
+|	Note that `count' includes the row already present, so if `count' is 10, rows `row'+1 through `row'+9 receive copies
+|	to make a total of 10 identical rows. Passing UINT_MAX for `endCol' means "up to the last column": it is reset to
+|	`ncols' - 1. Cells are copied with NxsDiscreteDatum::CopyFrom so that every cell owns its own `states' array.
+|	Return value is number of additional rows allocated to matrix (0 if no rows needed to be allocated). Assumes `data'
+|	is non-NULL, `row' is in the range [0..`nrows'), `startCol' is in the range [0..`ncols'), and `endCol' is either
+|	UINT_MAX or in the range [`startCol'..`ncols').
+*/
+unsigned NxsDiscreteMatrix::DuplicateRow(
+  unsigned row,			/* the row to be duplicated */
+  unsigned count,		/* the total number of copies needed */
+  unsigned startCol,	/* the starting column (inclusive) in the range of columns to be duplicated */
+  unsigned endCol)		/* the ending column (inclusive) in the range of columns to be duplicated */
+	{
+	assert(data != NULL);
+	assert(row < nrows);
+	assert(startCol < ncols);
+	if (endCol == UINT_MAX)
+		endCol = ncols - 1;
+	assert(endCol >= startCol);	// a single-column range is legitimate
+	assert(endCol < ncols);
+
+	// Expand matrix (if necessary) to accommodate additional rows.
+	//
+	unsigned nNewRows = 0;
+	if (row + count > nrows)
+		{
+		nNewRows = row + count - nrows;
+		AddRows(nNewRows);
+		}
+
+	// Deep-copy each cell: plain assignment would copy only the `states'
+	// pointer, so two cells would own (and eventually delete) one array.
+	for (unsigned i = 1; i < count; i++)
+		{
+		for (unsigned col = startCol; col <= endCol; col++)
+			data[row+i][col].CopyFrom(data[row][col]);
+		}
+
+	return nNewRows;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Frees every row of `data' and the row table itself, then leaves the matrix empty: `data' NULL, `nrows' and `ncols' 0.
+*/
+void NxsDiscreteMatrix::Flush()
+	{
+	// The loop never runs when `data' is NULL, and deleting NULL is harmless.
+	//
+	for (unsigned r = 0; data != NULL && r < nrows; r++)
+		delete [] data[r];
+	delete [] data;
+
+	// Mark the matrix as empty.
+	data	= NULL;
+	nrows	= 0;
+	ncols	= 0;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns a reference to the NxsDiscreteDatum object stored at row `i', column `j' of the matrix. Assumes that `data'
+|	is non-NULL, `i' is in the range [0..`nrows') and `j' is in the range [0..`ncols').
+*/
+NxsDiscreteDatum &NxsDiscreteMatrix::GetDiscreteDatum(
+  unsigned i,	/* the row of the matrix */
+  unsigned j)	/* the column of the matrix */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds and `data' are worth asserting.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	return data[i][j];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns number of states for taxon `i' and character `j' (0 for both the gap state and missing data). Assumes
+|	`data' is non-NULL, `i' is in the range [0..`nrows'), and `j' is in the range [0..`ncols'). Calls private member
+|	function GetNumStates to do the actual work.
+*/
+unsigned NxsDiscreteMatrix::GetNumStates(
+  unsigned i,	/* the (0-offset) index of the taxon in question */
+  unsigned j)	/* the (0-offset) index of the character in question */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds and `data' are worth asserting.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	return GetNumStates(data[i][j]);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the state count recorded for `d': 0 for missing data or the gap state, else the value of `d.states'[0].
+*/
+unsigned NxsDiscreteMatrix::GetNumStates(
+  NxsDiscreteDatum &d)	/* the datum in question */
+	{
+	// Missing data has no array at all; otherwise the count lives in the
+	// first cell (which is 0 for the gap state).
+	//
+	return (d.states != NULL ? d.states[0] : 0);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns number of distinct states observed for character `j' over all taxa. Note: this function is rather slow, as
+|	it must walk through each taxon for the specified character, adding the states encountered to a set, then finally
+|	returning the size of the set. Thus, if this function is called often, it would be advisable to initialize an array
+|	using this function, then refer to the array subsequently. Assumes `j' is in the range [0..`ncols') and `data' is
+|	non-NULL. Includes all taxa (i.e. there is no mechanism here for treating some taxa as deleted for a particular
+|	analysis). Missing and gap states are ignored.
+*/
+unsigned NxsDiscreteMatrix::GetObsNumStates(
+  unsigned j)	/* the (0-offset) index of the character in question */
+	{
+	assert(j < ncols);	// `j' is unsigned, so only the upper bound can fail
+	assert(data != NULL);
+
+	// Create a set object to hold all states seen for all taxa for character j
+	//
+	set< unsigned, less<unsigned> > stateset;
+
+	for (unsigned i = 0; i < nrows; i++)
+		{
+		NxsDiscreteDatum &d = data[i][j];
+		// Missing and gap cells report zero states, so the inner loop
+		// simply does not run for them.
+		//
+		unsigned ns = GetNumStates(d);
+		for (unsigned k = 0; k < ns; k++)
+			stateset.insert(GetState(d, k));
+		}
+
+	return (unsigned)stateset.size();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the `k'th state possessed by taxon `i' and character `j'. This taxon-character combination will have more
+|	than one state if there is ambiguity or polymorphism. Assumes that `data' is non-NULL, `i' is in the range
+|	[0..`nrows') and `j' is in the range [0..`ncols'). Also assumes that at least one state is present (i.e., not the
+|	gap or missing state). Use the function GetNumStates to determine the number of states present. Assumes `k' is in
+|	the range [0..ns), where ns is the value returned by GetNumStates.
+*/
+unsigned NxsDiscreteMatrix::GetState(
+  unsigned i,	/* the row of the matrix */
+  unsigned j,	/* the column of the matrix */
+  unsigned k)	/* the state to return */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds and `data' are worth asserting.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	return GetState(data[i][j], k);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the internal unsigned representation of the state stored in `d' at position `k' of the array `d.states'.
+|	Assumes that the state is not the missing or gap state. Use IsMissing and IsGap prior to calling this function to
+|	ensure this function will succeed. Assumes that `k' is in the range [ 0 .. `d.states'[0]).
+*/
+unsigned NxsDiscreteMatrix::GetState(
+  NxsDiscreteDatum &d,	/* the datum in question */
+  unsigned k)			/* the number of the state */
+	{
+	assert(!IsMissing(d));
+	assert(!IsGap(d));
+	assert(k < d.states[0]);	// `k' is unsigned: no lower bound to check
+
+	// Cell 0 holds the count, so state `k' lives in cell k + 1.
+	return d.states[k + 1];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if the state for taxon `i', character `j', is set to the gap symbol, false otherwise. Assumes `data'
+|	is non-NULL, `i' is in the range [0..`nrows') and `j' is in the range [0..`ncols').
+*/
+bool NxsDiscreteMatrix::IsGap(
+  unsigned i,	/* the (0-offset) index of the taxon in question */
+  unsigned j)	/* the (0-offset) index of the character in question */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds and `data' are worth asserting.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	return IsGap(data[i][j]);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true only if `d' holds the gap state, i.e. a `states' array whose first cell is 0. Missing data (a NULL
+|	`states' pointer) is not a gap: the two are often equated, but the distinction is made here.
+*/
+bool NxsDiscreteMatrix::IsGap(
+  NxsDiscreteDatum &d)	/* the datum in question */
+	{
+	// The count cell is unsigned, so "not greater than zero" is the same
+	// as "equal to zero".
+	//
+	return (d.states != NULL && d.states[0] == 0);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if the state for taxon `i', character `j', is set to the missing data symbol, false otherwise. Assumes
+|	`data' is non-NULL, `i' is in the range [0..`nrows') and `j' is in the range [0..`ncols').
+*/
+bool NxsDiscreteMatrix::IsMissing(
+  unsigned i,	/* the (0-offset) index of the taxon in question */
+  unsigned j)	/* the (0-offset) index of the character in question */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds and `data' are worth asserting.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	return IsMissing(data[i][j]);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if `d' represents missing data, i.e. its `states' pointer is NULL, and false otherwise. The gap state
+|	does not count as missing: the two are often equated, but the distinction is maintained here.
+*/
+bool NxsDiscreteMatrix::IsMissing(
+  NxsDiscreteDatum &d)	/* the datum in question */
+	{
+	// Missing data is the only state represented by the absence of an
+	// array; even the gap state allocates one.
+	//
+	return (d.states == NULL);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if character `j' is polymorphic in taxon `i', false otherwise. Assumes `data' is non-NULL, `i' is in
+|	the range [0..`nrows') and `j' is in the range [0..`ncols').
+*/
+bool NxsDiscreteMatrix::IsPolymorphic(
+  unsigned i,	/* the (0-offset) index of the taxon in question */
+  unsigned j)	/* the (0-offset) index of the character in question */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds and `data' are worth asserting.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	return IsPolymorphic(data[i][j]);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if `d' stores two or more states and its polymorphism cell is non-zero. Returns false for the missing
+|	state, the gap state, a single state, or several states flagged as uncertain rather than polymorphic.
+*/
+bool NxsDiscreteMatrix::IsPolymorphic(
+  NxsDiscreteDatum &d)	/* the datum in question */
+	{
+	if (d.states == NULL || d.states[0] < 2)
+		return false;
+
+	// With n >= 2 states the array is [n][state 1]...[state n][flag], so the
+	// polymorphism flag sits at index n + 1.
+	return (d.states[d.states[0] + 1] > 0);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Discards the current contents of the matrix and allocates a new one with `nrows' = `rows' and `ncols' = `cols', all
+|	of whose cells are default-constructed. Assumes `rows' and `cols' are both greater than 0.
+*/
+void NxsDiscreteMatrix::Reset(
+  unsigned rows,	/* the new number of rows (taxa) */
+  unsigned cols)	/* the new number of columns (characters) */
+	{
+	assert(rows > 0);
+	assert(cols > 0);
+
+	// Throw away the current contents: every row first, then the table of
+	// row pointers. If `data' is NULL the loop does not run and deleting
+	// NULL does nothing.
+	//
+	for (unsigned old = 0; data != NULL && old < nrows; old++)
+		delete [] data[old];
+	delete [] data;
+
+	// Adopt the new dimensions.
+	//
+	nrows = rows;
+	ncols = cols;
+
+	// Build the new matrix, one array of cells per row.
+	//
+	data = new NxsDiscreteDatum*[rows];
+	for (unsigned r = 0; r != rows; ++r)
+		data[r] = new NxsDiscreteDatum[cols];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets state stored at `data[i][j]' to the gap state. Assumes `data' is non-NULL, `i' is in the range [0..`nrows')
+|	and `j' is in the range [0..`ncols'). Calls the private SetGap member function to do the actual work.
+*/
+void NxsDiscreteMatrix::SetGap(
+  unsigned i,	/* the (0-offset) index of the taxon in question */
+  unsigned j)	/* the (0-offset) index of the character in question */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds and `data' are worth asserting.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	SetGap(data[i][j]);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Makes `d' the gap state, discarding whatever it held before. Internally the gap state is a `states' array of
+|	length one whose only element is 0.
+*/
+void NxsDiscreteMatrix::SetGap(
+  NxsDiscreteDatum &d)	/* the datum in question */
+	{
+	// Deleting a NULL array is harmless, so the old state is dropped unconditionally.
+	delete [] d.states;
+	d.states = new unsigned[1];
+	d.states[0] = 0;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets state stored at `data[i][j]' to the missing state. Assumes `data' is non-NULL, `i' is in the range [0..`nrows')
+|	and `j' is in the range [0..`ncols'). Calls the private member function SetMissing to do the actual work.
+*/
+void NxsDiscreteMatrix::SetMissing(
+  unsigned i,	/* the (0-offset) index of the taxon in question */
+  unsigned j)	/* the (0-offset) index of the character in question */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds and `data' are worth asserting.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	SetMissing(data[i][j]);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Makes `d' the missing state, discarding whatever it held before. Internally the missing state is simply a NULL
+|	`states' pointer, so no new array is allocated.
+*/
+void NxsDiscreteMatrix::SetMissing(
+  NxsDiscreteDatum &d)	/* the datum in question */
+	{
+	// Deleting a NULL array is harmless, so the old state is dropped unconditionally.
+	delete [] d.states;
+	d.states = NULL;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Specify 1 for `value' if taxon at row `i' is polymorphic at character in column `j', 0 for `value' if uncertain
+|	which state applies. Sets polymorphism state of taxon `i' and character `j' to `value'. Assumes `data' is non-NULL,
+|	`i' is in the range [0..`nrows') and `j' is in the range [0..`ncols'). Also assumes that the number of states
+|	stored is greater than 1. Calls private member function SetPolymorphic to do the actual work.
+*/
+void NxsDiscreteMatrix::SetPolymorphic(
+  unsigned i,		/* the (0-offset) index of the taxon in question */
+  unsigned j,		/* the (0-offset) index of the character in question */
+  unsigned value)	/* specify either 0 or 1, where 0 means ambiguity and 1 means polymorphism */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds, `data' and the legal values of `value' are asserted.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+	assert(value == 0 || value == 1);
+
+	SetPolymorphic(data[i][j], value);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Writes `value' into the polymorphism cell (the last cell of `d.states'). Warning: nothing happens if `d' stores
+|	fewer than 2 states, because such a datum has no polymorphism cell!
+*/
+void NxsDiscreteMatrix::SetPolymorphic(
+  NxsDiscreteDatum &d,	/* the datum in question */
+  unsigned value)		/* specify 1 if polymorphic, 0 if uncertain */
+	{
+	if (d.states == NULL || d.states[0] < 2)
+		return;
+
+	// With n >= 2 states the array is [n][state 1]...[state n][flag], so the
+	// polymorphism cell sits at index n + 1.
+	d.states[d.states[0] + 1] = value;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets state of taxon `i' and character `j' to `value'. Assumes `data' is non-NULL, `i' is in the range [0..`nrows')
+|	and `j' is in the range [0..`ncols'). Assumes that this function will not be called if there is missing data or the
+|	state is the gap state, in which case the functions SetMissing or SetGap, respectively, should be called instead.
+|	Calls the private member function SetState to do the actual work.
+*/
+void NxsDiscreteMatrix::SetState(
+  unsigned i,		/* the (0-offset) index of the taxon in question */
+  unsigned j,		/* the (0-offset) index of the character in question */
+  unsigned value)	/* the value to assign for this state */
+	{
+	// `i' and `j' are unsigned, so tests against zero would always pass;
+	// only the upper bounds and `data' are worth asserting.
+	assert(i < nrows);
+	assert(j < ncols);
+	assert(data != NULL);
+
+	SetState(data[i][j], value);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Makes `value' the single state stored in `d': the `states' array becomes [1][value], where the leading 1 indicates
+|	that there is only one state. Warning: any states already assigned to `d' (including the gap state) are forgotten.
+|	Use the function AddState if you want to preserve states already stored in `d'. Assumes state being set is not the
+|	missing state nor the gap state; use SetMissing or SetGap, respectively, for that.
+*/
+void NxsDiscreteMatrix::SetState(
+  NxsDiscreteDatum &d,	/* the datum in question */
+  unsigned value)		/* the value to assign for the state */
+	{
+	// Deleting a NULL array is harmless, so the old state is dropped unconditionally.
+	delete [] d.states;
+	d.states = new unsigned[2];
+	d.states[0] = 1;
+	d.states[1] = value;
+	}
+
diff --git a/ncl/nxsdiscretematrix.h b/ncl/nxsdiscretematrix.h
new file mode 100644
index 0000000..f933297
--- /dev/null
+++ b/ncl/nxsdiscretematrix.h
@@ -0,0 +1,90 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSDISCRETEMATRIX_H
+#define NCL_NXSDISCRETEMATRIX_H
+
+#include <limits.h>
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Class providing storage for the discrete data types (dna, rna, nucleotide, standard, and protein) inside a DATA or 
+|	CHARACTERS block. This class is also used to store the data for an ALLELES block. Maintains a matrix in which each 
+|	cell is an object of the class NxsDiscreteDatum. NxsDiscreteDatum stores the state for a particular combination of 
+|	taxon and character as an integer. Ordinarily, there will be a single state recorded for each taxon-character 
+|	combination, but exceptions exist if there is polymorphism for a taxon-character combination, or if there is 
+|	uncertainty about the state (e.g., in dna data, the data file might have contained an R or Y entry). Please consult 
+|	the documentation for the NxsDiscreteDatum class for the details about how states are stored. For data stored in an 
+|	ALLELES block, rows of the matrix correspond to individuals and columns to loci. Each NxsDiscreteDatum must 
+|	therefore store information about both genes at a single locus for a single individual in the case of diploid data.
+|	To do this, two macros HIWORD and LOWORD are used to divide up the unsigned value into two words. A maximum of 255 
+|	distinct allelic forms can be accommodated by this scheme, assuming at minimum a 32-bit architecture. Because it is
+|	not known in advance how many rows are going to be necessary, the NxsDiscreteMatrix class provides the AddRows 
+|	method, which expands the number of rows allocated for the matrix while preserving data already stored. 
+*/
+class NxsDiscreteMatrix
+	{
+	friend class NxsCharactersBlock;	/* friends have access to the private per-datum overloads below */
+	friend class NxsAllelesBlock;
+
+	public:	/* cells are addressed as (i, j): 0-offset taxon (row) index, 0-offset character (column) index */
+
+							NxsDiscreteMatrix(unsigned rows, unsigned cols);
+		virtual				~NxsDiscreteMatrix();
+
+		void				AddRows(unsigned nAddRows);	/* adds rows while preserving the data already stored */
+		void				AddState(unsigned i, unsigned j, unsigned value);
+		void				CopyStatesFromFirstTaxon(unsigned i, unsigned j);
+		void				DebugSaveMatrix(ostream &out, unsigned colwidth = 12);
+		unsigned			DuplicateRow(unsigned row, unsigned count, unsigned startCol = 0, unsigned endCol = UINT_MAX);	/* NOTE(review): UINT_MAX presumably means "through the last column" - confirm */
+		void				Flush();
+		unsigned			GetState(unsigned i, unsigned j, unsigned k = 0);
+		unsigned			GetNumStates(unsigned i, unsigned j);
+		unsigned			GetObsNumStates(unsigned j);
+		bool				IsGap(unsigned i, unsigned j);
+		bool				IsMissing(unsigned i, unsigned j);
+		bool				IsPolymorphic(unsigned i, unsigned j);
+		void				Reset(unsigned rows, unsigned cols);
+		void				SetGap(unsigned i, unsigned j);
+		void				SetMissing(unsigned i, unsigned j);
+		void				SetPolymorphic(unsigned i, unsigned j, unsigned value = 1);
+		void				SetState(unsigned i, unsigned j, unsigned value);	/* replaces the cell's states by the single state `value' */
+
+	private:
+
+		unsigned			nrows;	/* number of rows (taxa) in the data matrix */
+		unsigned			ncols;	/* number of columns (characters) in the data matrix */
+		NxsDiscreteDatum	**data;	/* storage for the data */
+
+		void				AddState(NxsDiscreteDatum &d, unsigned value);
+		bool				IsGap(NxsDiscreteDatum &d);
+		bool				IsMissing(NxsDiscreteDatum &d);
+		bool				IsPolymorphic(NxsDiscreteDatum &d);
+		NxsDiscreteDatum	&GetDiscreteDatum(unsigned i, unsigned j);
+		unsigned			GetNumStates(NxsDiscreteDatum &d);
+		unsigned			GetState(NxsDiscreteDatum &d, unsigned k = 0);
+		void				SetGap(NxsDiscreteDatum &d);
+		void				SetMissing(NxsDiscreteDatum &d);
+		void				SetPolymorphic(NxsDiscreteDatum &d, unsigned value);
+		void				SetState(NxsDiscreteDatum &d, unsigned value);	/* does the actual work for the public SetState */
+	};
+
+typedef NxsDiscreteMatrix DiscreteMatrix;	/* alternative name for the same class */
+
+
+#endif
diff --git a/ncl/nxsdistancedatum.cpp b/ncl/nxsdistancedatum.cpp
new file mode 100644
index 0000000..bb0ef4b
--- /dev/null
+++ b/ncl/nxsdistancedatum.cpp
@@ -0,0 +1,36 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Creates a datum that is flagged as missing and carries the placeholder distance 0.0.
+*/
+NxsDistanceDatum::NxsDistanceDatum()
+  : value(0.0),
+    missing(true)
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Does nothing: the data members `value' and `missing' are plain values, so there is no storage to release.
+*/
+NxsDistanceDatum::~NxsDistanceDatum()
+	{
+	}
diff --git a/ncl/nxsdistancedatum.h b/ncl/nxsdistancedatum.h
new file mode 100644
index 0000000..1530c5a
--- /dev/null
+++ b/ncl/nxsdistancedatum.h
@@ -0,0 +1,44 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSDISTANCEDATUM_H
+#define NCL_NXSDISTANCEDATUM_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This class stores a single pairwise distance value. Apart from its constructor and destructor it has no public 
+|	members, reflecting the fact that it is manipulated strictly by its only friend class, NxsDistancesBlock.
+*/
+class NxsDistanceDatum
+	{
+	friend class NxsDistancesBlock;	/* reads and writes `value' and `missing' directly */
+
+	public:
+
+					NxsDistanceDatum();	/* starts out missing, with value 0.0 */
+		virtual		~NxsDistanceDatum();
+
+	private:
+
+		double		value;		/* the pairwise distance value stored (0.0 while the datum is missing) */
+		bool		missing;	/* true if there is missing data for this pair */
+	};
+
+typedef NxsDistanceDatum DistanceDatum;	/* alternative name for the same class */
+
+#endif
diff --git a/ncl/nxsdistancesblock.cpp b/ncl/nxsdistancesblock.cpp
new file mode 100644
index 0000000..dd4cfde
--- /dev/null
+++ b/ncl/nxsdistancesblock.cpp
@@ -0,0 +1,896 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Stores `t' in `taxa' and sets `id' to "DISTANCES". Everything else starts out at its default: `triangle' is 
+|	`NxsDistancesBlockEnum::lower', `missing' is '?', `labels' and `diagonal' are true, `newtaxa' and `interleave' are 
+|	false, `ntax' and `nchar' are 0, and `matrix' and `taxonPos' are NULL. Assumes `t' is non-NULL.
+*/
+NxsDistancesBlock::NxsDistancesBlock(
+  NxsTaxaBlock *t)	/* the NxsTaxaBlock that will keep track of taxon labels */
+  : NxsBlock()
+	{
+	assert(t != NULL);
+
+	id			= "DISTANCES";
+	taxa		= t;
+	// No storage is held until a matrix has been read
+	matrix		= NULL;
+	taxonPos	= NULL;
+	ntax		= nchar = 0;
+	// Default settings
+	triangle	= NxsDistancesBlockEnum(lower);
+	missing		= '?';
+	diagonal	= labels = true;
+	newtaxa		= interleave = false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Frees the `ntax' rows of `matrix', then the `matrix' and `taxonPos' arrays themselves, the same way Reset does.
+*/
+NxsDistancesBlock::~NxsDistancesBlock()
+	{
+	for (unsigned i = 0; matrix != NULL && i < ntax; i++)
+		delete [] matrix[i];	// every row is a separate new[] allocation and has to be freed individually
+	delete [] matrix;			// array form required: plain `delete' on new[] storage is undefined behavior
+	delete [] taxonPos;			// deleting a NULL pointer is a no-op, so no guards are needed
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when DIMENSIONS command needs to be parsed from within the DISTANCES block. Deals with everything after the 
+|	token DIMENSIONS up to and including the semicolon that terminates the DIMENSIONS command. Throws an NxsException 
+|	for a malformed NTAX or NCHAR value or if the file ends first. A zero or absent NTAX is taken from the taxa block.
+*/
+void NxsDistancesBlock::HandleDimensionsCommand(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	for (;;)
+		{
+		// Token should either be ';' or the name of a subcommand
+		token.GetNextToken();
+		if (token.Equals(";"))
+			break;
+
+		// Unknown subcommands are skipped below, so a missing ';' would otherwise loop forever at end of file
+		if (token.AtEOF())
+			{
+			errormsg = "Unexpected end of file encountered";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		if (token.Equals("NEWTAXA"))
+			{
+			ntax = 0;
+			newtaxa = true;
+			continue;
+			}
+
+		const bool isNtax = token.Equals("NTAX");
+		if (!isNtax && !token.Equals("NCHAR"))
+			continue;	// not a subcommand handled here: ignore it
+		if (isNtax && !newtaxa)
+			{
+			errormsg = "Must specify NEWTAXA before NTAX if new taxa are being defined";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		// This should be the equals sign
+		token.GetNextToken();
+		if (!token.Equals("="))
+			{
+			errormsg = "Expecting '=' but found ";
+			errormsg += token.GetToken();
+			errormsg += " instead";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		// This should be the count: digits only, overflow-checked (atoi turns "abc" into 0 and "-1" into a huge unsigned)
+		token.GetNextToken();
+		const string digits = token.GetToken();
+		unsigned count = 0;
+		bool valid = !digits.empty();
+		for (string::size_type k = 0; valid && k < digits.length(); k++)
+			{
+			const unsigned d = (unsigned)(digits[k] - '0');	// anything that is not a digit maps to a value above 9
+			valid = (d <= 9 && count <= (UINT_MAX - d) / 10);
+			count = 10 * count + d;
+			}
+		if (!valid)
+			{
+			errormsg = "Expecting a non-negative integer but found ";
+			errormsg += token.GetToken();
+			errormsg += " instead";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+		(isNtax ? ntax : nchar) = count;
+		}
+
+	if (ntax == 0)
+		ntax = taxa->GetNumTaxonLabels();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when FORMAT command needs to be parsed from within the DISTANCES block. Deals with everything after the 
+|	token FORMAT up to and including the semicolon that terminates the FORMAT command. Subcommands and the data 
+|	members they set:
+|>
+|	TRIANGLE = LOWER | UPPER | BOTH    triangle
+|	DIAGONAL, NODIAGONAL               diagonal
+|	LABELS, NOLABELS                   labels
+|	INTERLEAVE, NOINTERLEAVE           interleave
+|	MISSING = <single character>       missing
+|>
+|	Settings that are not mentioned keep their current values. Any other token raises an NxsException.
+*/
+void NxsDistancesBlock::HandleFormatCommand(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	// Every trip through the loop consumes one subcommand; the loop ends at the terminating ';'
+	//
+	for (token.GetNextToken(); !token.Equals(";"); token.GetNextToken())
+		{
+		if (token.Equals("TRIANGLE"))
+			{
+			// An equals sign must come next
+			//
+			token.GetNextToken();
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// The value must be LOWER, UPPER, or BOTH
+			//
+			token.GetNextToken();
+			if (token.Equals("LOWER"))
+				{
+				triangle = NxsDistancesBlockEnum(lower);
+				}
+			else if (token.Equals("UPPER"))
+				{
+				triangle = NxsDistancesBlockEnum(upper);
+				}
+			else if (token.Equals("BOTH"))
+				{
+				triangle = NxsDistancesBlockEnum(both);
+				}
+			else
+				{
+				errormsg = "Expecting UPPER, LOWER, or BOTH but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			continue;
+			}
+
+		// On/off switches that take no value
+		//
+		if (token.Equals("DIAGONAL"))
+			diagonal = true;
+
+		else if (token.Equals("NODIAGONAL"))
+			diagonal = false;
+
+		else if (token.Equals("LABELS"))
+			labels = true;
+
+		else if (token.Equals("NOLABELS"))
+			labels = false;
+
+		else if (token.Equals("INTERLEAVE"))
+			interleave = true;
+
+		else if (token.Equals("NOINTERLEAVE"))
+			interleave = false;
+
+		else if (token.Equals("MISSING"))
+			{
+			// An equals sign must come next
+			//
+			token.GetNextToken();
+			if (!token.Equals("="))
+				{
+				errormsg = "Expecting '=' but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// Then the missing data symbol, which must be exactly one character long
+			//
+			token.GetNextToken();
+			if (token.GetTokenLength() != 1)
+				{
+				errormsg = "Missing data symbol specified (";
+				errormsg += token.GetToken();
+				errormsg += ") is invalid (must be a single character)";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			missing = token.GetToken()[0];
+			}
+
+		else
+			{
+			// Anything else is not a FORMAT subcommand
+			//
+			errormsg = "Token specified (";
+			errormsg += token.GetToken();
+			errormsg += ") is an invalid subcommand for the FORMAT command";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called from within HandleMatrixCommand to read one pass through the taxa, i.e. one page of a possibly interleaved 
+|	matrix. The local variable `jmax' records the number of columns read in the current page and is added to `offset' 
+|	on exit, giving the column offset for the next page. Returns true once the matrix is judged to be complete.
+*/
+bool NxsDistancesBlock::HandleNextPass(
+  NxsToken &token,	/* the token we are using for reading the data file */
+  unsigned &offset)	/* in: column offset of this page; out: increased by `jmax' */
+	{
+	unsigned i, j, k, jmax = 0; 
+	bool done = false;
+
+	unsigned i_first = 0;
+	if (triangle == NxsDistancesBlockEnum(lower))
+		i_first = offset;	// lower-triangular pages start at row `offset' rather than at row 0
+
+	unsigned i_last = ntax;	// may be lowered inside the loop for interleaved upper-triangular matrices
+
+	for (i = i_first; i < i_last; i++)
+		{
+		// Deal with taxon label if provided. Here are the four situations we need to deal with:
+		//   newtaxa  (offset > 0)  handled by
+		//      0           0         case 1
+		//      0           1         case 1
+		//      1           0         case 2
+		//      1           1         case 1
+		//
+		if (labels && (!newtaxa || offset > 0))
+			{
+			// Case 1: Expecting taxon labels, and also expecting them to already be in taxa
+			//
+			do
+				{
+				token.SetLabileFlagBit(NxsToken::newlineIsToken);
+				token.GetNextToken();
+				}
+			while(token.AtEOL());	// skip over newline tokens until the label itself is reached
+
+			try
+				{
+				// Look up position of taxon in NxsTaxaBlock list
+				//
+				k = taxa->FindTaxon(token.GetToken());
+
+				// Array taxonPos is initialized to UINT_MAX and filled in as taxa are encountered
+				//
+				if (taxonPos[i] == UINT_MAX)
+					{
+					taxonPos[i] = k;
+					}
+				else if (taxonPos[i] != k)
+					{
+					errormsg = "Taxon labeled ";
+					errormsg += token.GetToken();
+					errormsg += " is out of order compared to previous interleave pages";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+				}
+
+			catch (NxsTaxaBlock::NxsX_NoSuchTaxon)
+				{
+				errormsg = "Could not find ";
+				errormsg += token.GetToken();
+				errormsg += " among taxa previously defined";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			}
+
+		else if (labels && newtaxa)
+			{
+			// Case 2: Expecting taxon labels, and also expecting taxa block to be empty
+			//
+			do
+				{
+				token.SetLabileFlagBit(NxsToken::newlineIsToken);
+				token.GetNextToken();
+				}
+			while(token.AtEOL());	// skip over newline tokens until the label itself is reached
+
+			taxa->AddTaxonLabel(token.GetToken());
+			taxonPos[i] = i;
+			}
+
+		// Now deal with the row of distance values
+		//
+		unsigned true_j = 0;	// matrix column that position `j' on this page corresponds to
+		for (j = 0; j < ntax; j++)
+			{
+			if (i == ntax - 1 && j == ntax - 1)
+				{
+				done = true;	// NOTE(review): unlike the tests below this one does not break - confirm that is intended
+				}
+
+			if ((i == ntax - 1) && (true_j == ntax - 1))
+				{
+				done = true;
+				break;
+				}
+
+			if (i == ntax-1 && !diagonal && triangle == NxsDistancesBlockEnum(upper))
+				{
+				done = true;
+				break;
+				}
+
+			if (!diagonal && triangle == NxsDistancesBlockEnum(lower) && j == ntax - offset - 1)
+				{
+				done = true;
+				break;
+				}
+
+			token.SetLabileFlagBit(NxsToken::newlineIsToken);	// so that the end of the row shows up as AtEOL below
+			token.GetNextToken();
+
+			if (token.AtEOL())
+				{
+				if (j > jmax)
+					{
+					jmax = j;
+					if (!diagonal && triangle == NxsDistancesBlockEnum(upper) && i >= offset)
+						jmax++;
+					if (interleave && triangle == NxsDistancesBlockEnum(upper))
+						i_last = jmax + offset;
+					}
+				break;
+				}
+
+			true_j = j + offset;
+			if (triangle == NxsDistancesBlockEnum(upper) && i > offset)
+				true_j += (i - offset);
+			if (!diagonal && triangle == NxsDistancesBlockEnum(upper) && i >= offset)
+				true_j++;
+
+			if (true_j == ntax)
+				{
+				errormsg = "Too many distances specified in row just read in";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			string t = token.GetToken();
+			if (token.GetTokenLength() == 1 && t[0] == missing)
+				SetMissing(i, true_j);
+			else
+				SetDistance(i, true_j, atof(t.c_str()));	// NOTE(review): atof yields 0.0 for non-numeric text without any error
+			}
+		}
+
+	offset += jmax;	// the next page continues `jmax' columns further to the right
+
+	return done;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when MATRIX command needs to be parsed from within the DISTANCES block. Deals with everything after the 
+|	token MATRIX up to and including the semicolon that terminates the MATRIX command. Replaces any `matrix' and 
+|	`taxonPos' read earlier by freshly allocated ones, then fills them through repeated calls to HandleNextPass. 
+|	Throws an NxsException if there are no taxa, if NODIAGONAL is combined with TRIANGLE=BOTH, or if `;' is missing.
+*/
+void NxsDistancesBlock::HandleMatrixCommand(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	const unsigned prev_ntax = ntax;	// row count assumed for a matrix left over from an earlier MATRIX command
+
+	// With no NTAX available, fall back on the number of labels stored in the taxa block
+	//
+	if (ntax == 0)
+		ntax = taxa->GetNumTaxonLabels();
+	if (ntax == 0)
+		{
+		errormsg = "MATRIX command cannot be read if NTAX is zero";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	// NODIAGONAL cannot be combined with TRIANGLE=BOTH
+	//
+	if (!diagonal && triangle == NxsDistancesBlockEnum(both))
+		{
+		errormsg = "Cannot specify NODIAGONAL and TRIANGLE=BOTH at the same time";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	if (newtaxa)
+		taxa->Reset();
+
+	// Start over with a taxonPos array in which every entry is UINT_MAX, i.e. not yet assigned
+	//
+	delete [] taxonPos;
+	taxonPos = new unsigned[ntax];
+	for (unsigned t = 0; t < ntax; t++)
+		taxonPos[t] = UINT_MAX;
+
+	// Release the rows of any earlier matrix, then the array of row pointers itself
+	//
+	if (matrix != NULL)
+		{
+		assert(prev_ntax > 0);
+		for (unsigned r = 0; r < prev_ntax; r++)
+			delete [] matrix[r];
+		delete [] matrix;
+		}
+
+	// Build a fresh ntax x ntax matrix, one row at a time
+	//
+	matrix = new NxsDistanceDatum*[ntax];
+	for (unsigned r = 0; r < ntax; r++)
+		matrix[r] = new NxsDistanceDatum[ntax];
+
+	// Read one page after another until HandleNextPass reports that the matrix is complete
+	//
+	unsigned offset = 0;
+	while (!HandleNextPass(token, offset))
+		{
+		}
+
+	// Token should be equal to the terminating semicolon
+	//
+	token.GetNextToken();
+	if (!token.Equals(";"))
+		{
+		errormsg = "Expecting ';' to terminate MATRIX command, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when TAXLABELS command needs to be parsed from within the DISTANCES block. Reads `ntax' labels into the 
+|	taxa block, then the terminating semicolon, and finally clears `newtaxa'. Throws an NxsException on any violation.
+*/
+void NxsDistancesBlock::HandleTaxlabelsCommand(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	// TAXLABELS is only accepted after DIMENSIONS has announced new taxa and said how many there are
+	//
+	if (!newtaxa)
+		{
+		errormsg = "NEWTAXA must have been specified in DIMENSIONS command to use the TAXLABELS command in a ";
+		errormsg += id;
+		errormsg += " block";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	if (ntax == 0)
+		{
+		errormsg = "NTAX must be specified before TAXLABELS command";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	// Exactly `ntax' labels are expected; each one is handed on to the taxa block
+	//
+	unsigned remaining = ntax;
+	while (remaining-- > 0)
+		{
+		token.GetNextToken();
+		taxa->AddTaxonLabel(token.GetToken());
+		}
+
+	// This should be terminating semicolon
+	//
+	token.GetNextToken();
+	if (!token.Equals(";"))
+		{
+		errormsg = "Expecting ';' to terminate TAXLABELS command, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	// From here on the labels count as already defined. That they were introduced by this DISTANCES block rather 
+	// than by a preceding TAXA block is deliberately forgotten; it would only matter for recreating the data file.
+	//
+	newtaxa = false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function provides the ability to read everything following the block name (which is read by the NEXUS object)
+|	to the end or endblock statement. Characters are read from the input stream in. Overrides the abstract virtual 
+|	function in the base class. Sets `isEmpty' to false on entry. Commands are dispatched as follows:
+|>
+|	DIMENSIONS       HandleDimensionsCommand
+|	FORMAT           HandleFormatCommand
+|	TAXLABELS        HandleTaxlabelsCommand
+|	MATRIX           HandleMatrixCommand
+|	END, ENDBLOCK    consumes the following semicolon and returns
+|	anything else    reported via SkippingCommand, then skipped up to and including its semicolon
+|>
+|	Throws an NxsException if the semicolon after the block name or after END/ENDBLOCK is missing, or if the end of 
+|	the file is reached while a command is being skipped. Exceptions thrown by the handlers propagate to the caller.
+*/
+void NxsDistancesBlock::Read(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	isEmpty = false;
+
+	// This should be the semicolon after the block name
+	//
+	token.GetNextToken();
+	if (!token.Equals(";"))
+		{
+		errormsg = "Expecting ';' after ";
+		errormsg += id;
+		errormsg += " block name, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	// Dispatch on the command name until END or ENDBLOCK closes the block
+	//
+	for (;;)
+		{
+		token.GetNextToken();
+
+		if (token.Equals("DIMENSIONS"))
+			{
+			HandleDimensionsCommand(token);
+			}
+
+		else if (token.Equals("FORMAT"))
+			{
+			HandleFormatCommand(token);
+			}
+
+		else if (token.Equals("TAXLABELS"))
+			{
+			HandleTaxlabelsCommand(token);
+			}
+
+		else if (token.Equals("MATRIX"))
+			{
+			HandleMatrixCommand(token);
+			}
+
+		else
+			{
+			// END and ENDBLOCK both close the block and differ only in the wording of the error message
+			//
+			const bool isEnd = token.Equals("END");
+
+			if (isEnd || token.Equals("ENDBLOCK"))
+				{
+				// Get the semicolon following END or ENDBLOCK
+				//
+				token.GetNextToken();
+				if (!token.Equals(";"))
+					{
+					if (isEnd)
+						errormsg = "Expecting ';' to terminate the END command, but found ";
+					else
+						errormsg = "Expecting ';' to terminate the ENDBLOCK command, but found ";
+					errormsg += token.GetToken();
+					errormsg += " instead";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+
+				break;
+				}
+
+			// Any other command is reported through SkippingCommand and skipped up to its semicolon
+			//
+			SkippingCommand(token.GetToken());
+			for (token.GetNextToken(); !token.AtEOF() && !token.Equals(";"); token.GetNextToken())
+				{
+				}
+
+			if (token.AtEOF())
+				{
+				errormsg = "Unexpected end of file encountered";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			}
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Writes a brief report of the contents of this DISTANCES block to `out': a summary of the settings followed, if a 
+|	matrix has been read, by the matrix itself. Leaves the stream's format flags and precision as it found them.
+*/
+void NxsDistancesBlock::Report(
+  ostream &out)	/* the output stream to which to write the report */
+	{
+	unsigned ntaxTotal = ntax;
+	if (ntaxTotal == 0)
+		ntaxTotal = taxa->GetNumTaxonLabels();
+
+	out << endl;
+	out << id << " block contains ";
+	if (ntaxTotal == 0)
+		out << "no taxa" << endl;
+	else if (ntaxTotal == 1)
+		out << "one taxon" << endl;
+	else
+		out << ntaxTotal << " taxa" << endl;
+
+	if (IsLowerTriangular())
+		out << "  Matrix is lower-triangular" << endl;
+	else if (IsUpperTriangular())
+		out << "  Matrix is upper-triangular" << endl;
+	else
+		out << "  Matrix is rectangular" << endl;
+
+	if (IsInterleave())
+		out << "  Matrix is interleaved" << endl;
+	else
+		out << "  Matrix is non-interleaved" << endl;
+
+	if (IsLabels())
+		out << "  Taxon labels provided" << endl;
+	else
+		out << "  No taxon labels provided" << endl;
+
+	if (IsDiagonal())
+		out << "  Diagonal elements specified" << endl;
+	else
+		out << "  Diagonal elements not specified" << endl;
+
+	out << "  Missing data symbol is " << missing << endl;
+
+	// The matrix can only be printed once a MATRIX command has been read: `matrix' stays NULL if just a DIMENSIONS 
+	// command set `ntax', and IsMissing and GetDistance must not be called in that state
+	if (ntax == 0 || matrix == NULL)
+		return;
+
+	// Remember the caller's formatting; it is put back once the matrix has been written
+	const ios::fmtflags callerFlags = out.flags();
+	const std::streamsize callerPrecision = out.precision();
+	out.setf(ios::fixed, ios::floatfield);
+	out.setf(ios::showpoint);
+	for (unsigned i = 0; i < ntax; i++)
+		{
+		if (labels)
+			out << setw(20) << taxa->GetTaxonLabel(i);
+		else
+			out << "\t\t";
+
+		for (unsigned j = 0; j < ntax; j++)
+			{
+			if (triangle == NxsDistancesBlockEnum(upper) && j < i)
+				out << setw(12) << " ";	// blank cell below the diagonal keeps the columns aligned
+			else if (triangle == NxsDistancesBlockEnum(lower) && j > i)
+				continue;
+			else if (!diagonal && i == j)
+				out << setw(12) << " ";
+			else if (IsMissing(i, j))
+				out << setw(12) << missing;
+			else
+				out << setw(12) << setprecision(5) << GetDistance(i, j);
+			}
+
+		out << endl;
+		}
+	out.flags(callerFlags);
+	out.precision(callerPrecision);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Frees `matrix' and `taxonPos' and puts every setting back to the value given to it by the constructor.
+*/
+void NxsDistancesBlock::Reset()
+	{
+	// Storage first: the `ntax' rows of the matrix, the array of row pointers, then taxonPos
+	//
+	if (matrix != NULL)
+		{
+		for (unsigned row = 0; row < ntax; row++)
+			delete [] matrix[row];
+		delete [] matrix;
+		}
+
+	delete [] taxonPos;
+	matrix		= NULL;
+	taxonPos	= NULL;
+
+	// Base class data members that could have changed
+	//
+	errormsg.clear();
+	isEmpty			= true;
+	isEnabled		= true;
+	isUserSupplied	= false;
+
+	// Settings of this class go back to their defaults
+	//
+	ntax		= nchar = 0;
+	diagonal	= labels = true;
+	newtaxa		= interleave = false;
+	triangle	= NxsDistancesBlockEnum(lower);
+	missing		= '?';
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `ntax', the number of taxa (rows and columns) the distance matrix is dimensioned for; 0 after Reset.
+*/
+unsigned NxsDistancesBlock::GetNtax()
+	{
+	return this->ntax;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `nchar', as set by the NCHAR subcommand of DIMENSIONS or by SetNchar; 0 if neither has been used.
+*/
+unsigned NxsDistancesBlock::GetNchar()
+	{
+	return this->nchar;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the value of the (`i', `j')th element of `matrix'. Assumes `i' and `j' are both in the range [0..`ntax') 
+|	and the distance stored at `matrix[i][j]' is not missing (a missing element yields 0.0). Assumes `matrix' is not NULL.
+*/
+double NxsDistancesBlock::GetDistance(
+  unsigned i,	/* the row */
+  unsigned j)	/* the column */
+	{
+	// `i' and `j' are unsigned, so only their upper bounds need checking
+	//
+	assert(matrix != NULL);
+	assert(i < ntax);
+	assert(j < ntax);
+
+	return matrix[i][j].value;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `missing', the character that stands for a missing distance ('?' unless FORMAT MISSING changed it).
+*/
+char NxsDistancesBlock::GetMissingSymbol()
+	{
+	return this->missing;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `triangle' as an unsigned; it holds one of the NxsDistancesBlockEnum values lower, upper, or both.
+*/
+unsigned NxsDistancesBlock::GetTriangle()
+	{
+	return this->triangle;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Tells whether both triangles are present, i.e. whether `triangle' equals NxsDistancesBlockEnum(both).
+*/
+bool NxsDistancesBlock::IsRectangular()
+	{
+	return triangle == NxsDistancesBlockEnum(both);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Tells whether only the upper triangle is present, i.e. whether `triangle' equals NxsDistancesBlockEnum(upper).
+*/
+bool NxsDistancesBlock::IsUpperTriangular()
+	{
+	return triangle == NxsDistancesBlockEnum(upper);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Tells whether only the lower triangle is present, i.e. whether `triangle' equals NxsDistancesBlockEnum(lower).
+*/
+bool NxsDistancesBlock::IsLowerTriangular()
+	{
+	return triangle == NxsDistancesBlockEnum(lower);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `diagonal': true by default and after FORMAT DIAGONAL, false after FORMAT NODIAGONAL.
+*/
+bool NxsDistancesBlock::IsDiagonal()
+	{
+	return this->diagonal;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `interleave': false by default and after FORMAT NOINTERLEAVE, true after FORMAT INTERLEAVE.
+*/
+bool NxsDistancesBlock::IsInterleave()
+	{
+	return this->interleave;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `labels': true by default and after FORMAT LABELS, false after FORMAT NOLABELS.
+*/
+bool NxsDistancesBlock::IsLabels()
+	{
+	return this->labels;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if the (`i',`j')th distance is missing. Assumes `i' and `j' are both in the range [0..`ntax') and 
+|	`matrix' is not NULL.
+*/
+bool NxsDistancesBlock::IsMissing(
+  unsigned i,	/* the row */
+  unsigned j)	/* the column */
+	{
+	// `i' and `j' are unsigned, so only their upper bounds need checking
+	//
+	assert(matrix != NULL);
+	assert(i < ntax);
+	assert(j < ntax);
+
+	return matrix[i][j].missing;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets the value of the (`i',`j')th matrix element to `d' and marks the element as not missing. Assumes `i' and `j' 
+|	are both in the range [0..`ntax') and `matrix' is not NULL.
+*/
+void NxsDistancesBlock::SetDistance(
+  unsigned i,	/* the row */
+  unsigned j,	/* the column */
+  double d)		/* the distance value */
+	{
+	// `i' and `j' are unsigned, so only their upper bounds need checking
+	//
+	assert(matrix != NULL);
+	assert(i < ntax);
+	assert(j < ntax);
+
+	matrix[i][j].value = d;
+	matrix[i][j].missing = false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Marks the (`i', `j')th `matrix' element as missing and resets its value to 0.0. Assumes `i' and `j' are both in 
+|	the range [0..`ntax') and `matrix' is not NULL.
+*/
+void NxsDistancesBlock::SetMissing(
+  unsigned i,	/* the row */
+  unsigned j)	/* the column */
+	{
+	// `i' and `j' are unsigned, so only their upper bounds need checking
+	//
+	assert(matrix != NULL);
+	assert(i < ntax);
+	assert(j < ntax);
+	matrix[i][j].missing = true;
+	matrix[i][j].value = 0.0;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Stores `n' in `nchar', replacing any value set by the NCHAR subcommand of DIMENSIONS or by an earlier call.
+*/
+void NxsDistancesBlock::SetNchar(
+  unsigned n)	/* the number of characters */
+	{
+	this->nchar = n;
+	}
diff --git a/ncl/nxsdistancesblock.h b/ncl/nxsdistancesblock.h
new file mode 100644
index 0000000..a32747b
--- /dev/null
+++ b/ncl/nxsdistancesblock.h
@@ -0,0 +1,128 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSDISTANCESBLOCK_H
+#define NCL_NXSDISTANCESBLOCK_H
+
+class NxsDistanceDatum;
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This class handles reading and storage for the NEXUS block DISTANCES. It overrides the member functions Read and 
+|	Reset, which are abstract virtual functions in the base class NxsBlock. Below is a table showing the correspondence 
+|	between the elements of a DISTANCES block and the variables and member functions that can be used to access each 
+|	piece of information stored.
+|>
+|	NEXUS command   Command attribute  Data Members        Member Functions
+|	------------------------------------------------------------------------
+|	DIMENSIONS      NEWTAXA            newtaxa
+|	
+|	                NTAX               ntax                GetNtax 
+|	
+|	                NCHAR              nchar               GetNchar
+|	
+|	FORMAT          TRIANGLE           triangle            GetTriangle
+|	                                                       IsUpperTriangular
+|	                                                       IsLowerTriangular
+|	                                                       IsRectangular
+|	
+|	                [NO]DIAGONAL       diagonal            IsDiagonal
+|	
+|	                [NO]LABELS         labels              IsLabels
+|	
+|	                MISSING            missing             GetMissingSymbol
+|	
+|	                INTERLEAVE         interleave          IsInterleave
+|	
+|	                TAXLABELS          (stored in the      (access through
+|	                                   NxsTaxaBlock        data member taxa)
+|	                                   object)
+|	
+|	MATRIX                             matrix              GetDistance
+|	                                                       IsMissing
+|	                                                       SetMissing
+|	                                                       SetDistance
+|	------------------------------------------------------------------------
+|>
+*/
+class NxsDistancesBlock
+  : public NxsBlock
+	{
+	public:
+							NxsDistancesBlock(NxsTaxaBlock *t);
+		virtual				~NxsDistancesBlock();
+
+		double				GetDistance(unsigned i, unsigned j);
+		char				GetMissingSymbol();
+		unsigned			GetNchar();
+		unsigned			GetNtax();
+		unsigned			GetTriangle();
+		bool				IsRectangular();
+		bool				IsDiagonal();
+		bool				IsInterleave();
+		bool				IsLabels();
+		bool				IsLowerTriangular();
+		bool				IsMissing(unsigned i, unsigned j);	/* true if element (i,j) is flagged as missing */
+		bool				IsUpperTriangular();
+		virtual void		Report(std::ostream &out);
+		virtual void		Reset();
+		void				SetDistance(unsigned i, unsigned j, double d);	/* stores d in element (i,j) and clears its missing flag */
+		void				SetMissing(unsigned i, unsigned j);	/* flags element (i,j) as missing and zeroes its value */
+		void				SetNchar(unsigned n);	/* stores n in nchar; parameter named as in the definition (it is a count, not an index) */
+
+		enum NxsDistancesBlockEnum		/* used by data member triangle to determine which triangle(s) of the distance matrix is/are occupied */
+			{
+			upper			= 1,		/* matrix is upper-triangular */
+			lower			= 2,		/* matrix is lower-triangular */
+			both			= 3			/* matrix is rectangular */
+			};
+
+	protected:
+
+		void				HandleDimensionsCommand(NxsToken &token);
+		void				HandleFormatCommand(NxsToken &token);
+		void				HandleMatrixCommand(NxsToken &token);
+		bool				HandleNextPass(NxsToken &token, unsigned &offset);
+		void				HandleTaxlabelsCommand(NxsToken &token);
+		virtual void		Read(NxsToken &token);
+
+	private:
+
+		NxsTaxaBlock		*taxa;		/* pointer to NxsTaxaBlock object that stores the taxon labels */
+
+		bool				newtaxa;	/* true if new taxa were named in this DISTANCES block */
+		unsigned			ntax;		/* number of taxa (determines dimensions of the matrix) */
+		unsigned			nchar;		/* the number of characters used in generating the pairwise distances */
+
+		bool				diagonal;	/* true if diagonal elements provided when reading in DISTANCES block */
+		bool				interleave;	/* true if interleave format used when reading in DISTANCES block */
+		bool				labels;		/* true if taxon labels were provided when reading in DISTANCES block */
+
+		int					triangle;	/* indicates whether matrix is upper triangular, lower triangular, or rectangular, taking on one of the elements of the NxsDistancesBlockEnum enumeration */
+
+		char				missing;	/* the symbol used to represent missing data (e.g. '?') */
+
+		NxsDistanceDatum	**matrix;	/* the structure used for storing the pairwise distance matrix */
+		unsigned			*taxonPos;	/* array holding 0-offset index into the NxsTaxaBlock list of taxon labels (used to ensure that order of taxa is same for each interleaved block) */
+	};
+
+typedef NxsDistancesBlock	DistancesBlock;	/* second name for the class; the IsBoth macro on the next line expands to IsRectangular */
+#define IsBoth				IsRectangular
+
+#endif
+
diff --git a/ncl/nxsemptyblock.cpp b/ncl/nxsemptyblock.cpp
new file mode 100644
index 0000000..62085fd
--- /dev/null
+++ b/ncl/nxsemptyblock.cpp
@@ -0,0 +1,174 @@
+//	Copyright (C) 1999-2002 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library).
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Constructor. Stores the block name used in NEXUS data files ("EMPTY") in the base class data member `id'.
+*/
+NxsEmptyBlock::NxsEmptyBlock()
+	{
+	this->id = "EMPTY";
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Destructor. The body is empty; this function releases nothing itself.
+*/
+NxsEmptyBlock::~NxsEmptyBlock()
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Placeholder that ignores `s' and always returns 0. According to the original author's note, this code is identical 
+|	to the base class version, so it should either be given a real implementation or be removed from this derived 
+|	class. A real implementation is needed if the derived class constructs and runs a NxsSetReader object to read a 
+|	set involving characters, because the NxsSetReader object may call this function to look up a character label 
+|	encountered in the set. An override that performs such a lookup should return the character index in the range 
+|	[1..`nchar']; i.e., add one to the 0-offset index.
+*/
+unsigned NxsEmptyBlock::CharLabelToNumber(
+  NxsString s)	/* the character label to be translated to character number */
+	{
+	return 0;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Parses the remainder of an END or ENDBLOCK command inside the EMPTY block: the next token must be a semicolon, 
+|	otherwise a NxsException naming the offending token and its file location is thrown.
+*/
+void NxsEmptyBlock::HandleEndblock(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	// Advance to the token that follows END or ENDBLOCK
+	//
+	token.GetNextToken();
+
+	if (token.Equals(";"))
+		return;	// command is properly terminated; nothing more to do
+
+	errormsg = "Expecting ';' to terminate the END or ENDBLOCK command, but found ";
+	errormsg += token.GetToken();
+	errormsg += " instead";
+	throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads everything that follows the block name (the name itself is read by the NxsReader object) up to and 
+|	including the END or ENDBLOCK command. Every other command is handed to SkippingCommand and then skipped through 
+|	its terminating semicolon. Overrides the pure virtual function in the base class.
+*/
+void NxsEmptyBlock::Read(
+  NxsToken &token)	/* the token used to read from `in'*/
+	{
+	isEmpty = false;
+
+	// The block name must be followed immediately by a semicolon
+	//
+	token.GetNextToken();
+
+	if (!token.Equals(";"))
+		{
+		errormsg = "Expecting ';' after ";
+		errormsg += id;
+		errormsg += " block name, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	// Process one command per iteration until END or ENDBLOCK has been handled
+	//
+	bool blockClosed = false;
+	while (!blockClosed)
+		{
+		token.GetNextToken();
+
+		if (token.Equals("END") || token.Equals("ENDBLOCK"))
+			{
+			// Both spellings are finished off in the same way
+			//
+			HandleEndblock(token);
+			blockClosed = true;
+			}
+		else
+			{
+			// Any other command: pass its name to SkippingCommand, then discard
+			// tokens up to and including the next semicolon
+			//
+			SkippingCommand(token.GetToken());
+			token.GetNextToken();
+			while (!token.AtEOF() && !token.Equals(";"))
+				token.GetNextToken();
+
+			// The input ended before the command's semicolon was found
+			//
+			if (token.AtEOF())
+				{
+				errormsg = "Unexpected end of file encountered";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			}
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Marks the block as empty again (`isEmpty' becomes true) so that a new EMPTY block can be read. Overrides the 
+|	pure virtual function in the base class.
+*/
+void NxsEmptyBlock::Reset()
+	{
+	this->isEmpty = true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Writes a brief report for this EMPTY block to `out': a line break followed by the block name and the text 
+|	" block contains...". Overrides the pure virtual function in the base class.
+*/
+void NxsEmptyBlock::Report(
+  ostream &out)	/* the output stream to which to write the report */
+	{
+	// Same two insertions as before, written as a single chained expression
+	out << endl << id << " block contains...";
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when an unknown command named `commandName' is about to be skipped. This version has an empty body, so 
+|	`commandName' is ignored and no warning is issued that a command was unrecognized. According to the original 
+|	author's note it is identical to the base class version: either modify this virtual function to warn the user, 
+|	or eliminate it altogether and rely on the base class version. 
+*/
+void NxsEmptyBlock::SkippingCommand(
+  NxsString commandName)	/* the name of the command being skipped */
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Placeholder that ignores `s' and always returns 0. According to the original author's note, this code is identical 
+|	to the base class version, so it should either be given a real implementation or be removed from this derived 
+|	class. A real implementation is needed if the derived class constructs and runs a NxsSetReader object to read a 
+|	set involving taxa, because the NxsSetReader object may call this function to look up a taxon label encountered 
+|	in the set. An override that performs such a lookup should return the taxon index in the range [1..ntax]; i.e., 
+|	add one to the 0-offset index.
+*/
+unsigned NxsEmptyBlock::TaxonLabelToNumber(
+  NxsString s)	/* the taxon label to be translated to a taxon number */
+	{
+	return 0;
+	}
+
diff --git a/ncl/nxsemptyblock.h b/ncl/nxsemptyblock.h
new file mode 100644
index 0000000..7b55fec
--- /dev/null
+++ b/ncl/nxsemptyblock.h
@@ -0,0 +1,77 @@
+//	Copyright (C) 1999-2002 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library).
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+#ifndef NCL_NXSEMPTYBLOCK_H
+#define NCL_NXSEMPTYBLOCK_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This is a template that can be used to create a class representing a NEXUS block. Here are the steps to follow if
+|	you wish to create a new block specifically for use with your particular application. Suppose your application is
+|	called Phylome and you want to create a private block called a PHYLOME block that can appear in NEXUS data files
+|	and contains commands for your program.
+|~
+|	o Copy the files nxsemptyblock.h and nxsemptyblock.cpp and rename them (e.g. nxsphylomeblock.h and 
+|	  nxsphylomeblock.cpp)
+|	o In nxsphylomeblock.h and nxsphylomeblock.cpp, replace all instances of EMPTY (case-sensitive, whole word search)
+|	  with PHYLOME
+|	o In nxsphylomeblock.h, replace both instances of NCL_NXSEMPTYBLOCK_H at the top of the file with
+|	  NCL_NXSPHYLOMEBLOCK_H
+|	o In nxsphylomeblock.h and nxsphylomeblock.cpp, replace all instances of NxsEmptyBlock (case-sensitive, whole word
+|	  search) with NxsPhylomeBlock
+|	o Modify the Read function in nxsphylomeblock.cpp to interpret what comes after the BEGIN PHYLOME command in the
+|	  NEXUS data file
+|	o Modify the CharLabelToNumber and TaxonLabelToNumber functions if you need to read in sets of characters or 
+|	  taxa, respectively. These functions provide a way for NxsSetReader objects to translate character or taxon labels
+|	  to the corresponding numbers. If you do not need these capabilities, then it is safe to just delete these functions
+|	  from nxsphylomeblock.h and nxsphylomeblock.cpp because they are no different than the base class versions
+|	o Modify the SkippingCommand function if you want to notify users when commands within the PHYLOME block are not 
+|	  recognized and are being skipped
+|	o In nxsphylomeblock.h, replace this comment with something meaningful for your class. Start off with something
+|	  like "This class handles reading and storage for the NEXUS block PHYLOME. It overrides the member functions 
+|	  Read and Reset, which are abstract virtual functions in the base class NxsBlock"
+|~
+|	Adding a new data member? Don't forget to:
+|~
+|	o Describe it in the class declaration using a C-style comment. 
+|	o Initialize it (unless it is self-initializing) in the constructor and reinitialize it in the Reset function.
+|	o Describe the initial state in the constructor documentation. 
+|	o Delete memory allocated to it in both the destructor and Reset function. 
+|	o Report it in some way in the Report function. 
+|~
+*/
+class NxsEmptyBlock
+  : public NxsBlock
+	{
+	public:
+
+						NxsEmptyBlock();	/* sets the base class member `id' to "EMPTY" */
+		virtual			~NxsEmptyBlock();
+
+		virtual void	Report(std::ostream &out);	/* qualified so the header does not depend on a using-directive */
+
+	protected:
+
+		void			SkippingCommand(NxsString commandName);	/* empty body in this class */
+		unsigned		TaxonLabelToNumber(NxsString s);	/* always returns 0 in this class */
+		unsigned		CharLabelToNumber(NxsString s);	/* always returns 0 in this class */
+		void			HandleEndblock(NxsToken &token);	/* requires ';' after END or ENDBLOCK, else throws NxsException */
+		virtual void	Read(NxsToken &token);	/* reads through the END or ENDBLOCK command, skipping other commands */
+		virtual void	Reset();	/* sets `isEmpty' to true */
+	};
+
+#endif
diff --git a/ncl/nxsexception.cpp b/ncl/nxsexception.cpp
new file mode 100644
index 0000000..67ceced
--- /dev/null
+++ b/ncl/nxsexception.cpp
@@ -0,0 +1,49 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Builds the exception from explicit values: `s' is copied to `msg', and `fp', `fl' and `fc' are stored as the 
+|	file position, line and column (`pos', `line', `col') where parsing stopped.
+*/
+NxsException::NxsException(
+  NxsString s,	/* the message for the user */
+  file_pos fp,	/* the current file position */
+  long fl,		/* the current file line */
+  long fc)		/* the current file column */
+  : msg(s),
+	pos(fp),
+	line(fl),
+	col(fc)
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Builds the exception from message `s', taking the file position, line and column from the NxsToken `t'.
+*/
+NxsException::NxsException(
+  const NxsString &s,		/* message that describes the error */
+  const NxsToken &t)		/* NxsToken that was supplied the last token (the token that caused the error) */
+  : msg(s),
+	pos(t.GetFilePosition()),
+	line(t.GetFileLine()),
+	col(t.GetFileColumn())
+	{
+	}
diff --git a/ncl/nxsexception.h b/ncl/nxsexception.h
new file mode 100644
index 0000000..a233e33
--- /dev/null
+++ b/ncl/nxsexception.h
@@ -0,0 +1,42 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSEXCEPTION_H
+#define NCL_NXSEXCEPTION_H
+
+class NxsToken;
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Exception class that carries a message specific to the problem encountered plus the file location it refers to.
+*/
+class NxsException
+	{
+	public:
+		NxsString	msg;	/* NxsString to hold message */
+		file_pos	pos;	/* current file position */
+		long		line;	/* current line in file */
+		long		col;	/* column of current line */
+
+		NxsException(NxsString s, file_pos fp = 0, long fl = 0L, long fc = 0L);	/* location supplied explicitly; defaults are all zero */
+		NxsException(const NxsString &s, const NxsToken &t);	/* location read from the token `t' */
+	};
+
+typedef NxsException XNexus;	/* XNexus is a second name for NxsException */
+
+#endif
diff --git a/ncl/nxsindent.h b/ncl/nxsindent.h
new file mode 100644
index 0000000..29f5d04
--- /dev/null
+++ b/ncl/nxsindent.h
@@ -0,0 +1,56 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis and Mark T. Holder
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSINDENT_H
+#define NCL_NXSINDENT_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Manipulator intended for indenting text by `leftMarg' characters; the class itself only stores that margin.
+*/
+class Indent
+	{
+	public:
+					Indent(unsigned i);	/* stores `i' in `leftMarg' */
+
+		unsigned	leftMarg;	/* the amount by which to indent */
+	};
+	
+/*----------------------------------------------------------------------------------------------------------------------
+|	Constructs an Indent whose `leftMarg' is `i'.
+*/
+inline Indent::Indent(
+  unsigned i)	/* the amount (in characters) by which to indent */
+  : leftMarg(i)
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Output operator for the Indent manipulator. As written it inserts nothing: `i' is unused and `o' is returned as is.
+*/
+inline ostream &operator <<(
+  ostream &o,		/* the ostream object */
+  const Indent &i)	/* the Indent object to be sent to `o' */
+	{
+#if defined (HAVE_PRAGMA_UNUSED)
+#	pragma unused(i)
+#endif
+	return o;
+	}
+
+#endif
diff --git a/ncl/nxsreader.cpp b/ncl/nxsreader.cpp
new file mode 100644
index 0000000..7ceb07c
--- /dev/null
+++ b/ncl/nxsreader.cpp
@@ -0,0 +1,492 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Constructor. Both `blockList' and `currBlock' start out as NULL.
+*/
+NxsReader::NxsReader()
+	{
+	// No blocks have been added yet, and there is no current block
+	blockList = currBlock = NULL;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Destructor. The body is empty; in particular, the blocks linked from `blockList' are not deleted here.
+*/
+NxsReader::~NxsReader()
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Appends `newBlock' to the linked list of NxsBlock objects that starts at `blockList'; when the list is empty, 
+|	`newBlock' becomes its first element. Before linking, the SetNexus method of `newBlock' is called with this 
+|	NxsReader so that the block knows which reader now owns it. This is useful when the block needs to communicate 
+|	with the outside world through the NxsReader object, such as when it issues progress reports as it is reading the 
+|	contents of its block. The `next' pointer of `newBlock' itself is not touched.
+*/
+void NxsReader::Add(
+  NxsBlock *newBlock)	/* a pointer to an existing block object */
+	{
+	assert(newBlock != NULL);
+
+	newBlock->SetNexus(this);
+
+	// An empty list simply adopts `newBlock' as its head
+	if (blockList == NULL)
+		{
+		blockList = newBlock;
+		return;
+		}
+	// Otherwise walk to the last block and hang `newBlock' off it
+	NxsBlock *tail = blockList;
+	while (tail->next != NULL)
+		tail = tail->next;
+	assert(tail != NULL && tail->next == NULL);
+	tail->next = newBlock;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the 0-based position of block `b' in the list starting at `blockList' (the first block has position 0), 
+|	or UINT_MAX if `b' cannot be found in that list. Blocks are compared by address only.
+*/
+unsigned NxsReader::PositionInBlockList(
+  NxsBlock *b)	/* a pointer to an existing block object */
+	{
+	unsigned index = 0;
+
+	// Walk the list, counting the blocks that precede `b'
+	//
+	for (NxsBlock *node = blockList; node != NULL; node = node->next)
+		{
+		if (node == b)
+			return index;
+		++index;
+		}
+
+	// Ran off the end of the list without meeting `b'
+	//
+
+	return UINT_MAX;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Replaces `oldb' in `blockList' with `newb'. Call this when a block (`oldb') is about to be deleted (perhaps to 
+|	make way for new data): create `newb' first, then call Reassign, then delete `oldb'. On return `oldb' has a NULL 
+|	`next' pointer and a NULL reader. Assumes `oldb' exists and is in `blockList' (checked by assert only).
+*/
+void NxsReader::Reassign(
+  NxsBlock *oldb,	/* a pointer to the block object soon to be deleted */
+  NxsBlock *newb)	/* a pointer to oldb's replacement */
+	{
+	NxsBlock *before = NULL;
+	NxsBlock *victim = blockList;
+	newb->SetNexus(this);
+
+	// Locate `oldb', remembering the block that precedes it (if any)
+	while (victim != NULL && victim != oldb)
+		{
+		before = victim;
+		victim = victim->next;
+		}
+
+	assert(victim != NULL);
+
+	// Splice `newb' into the slot that `oldb' occupied
+	newb->next = victim->next;
+	if (before != NULL)
+		before->next = newb;
+	else
+		blockList = newb;
+	victim->next = NULL;
+	victim->SetNexus(NULL);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true while the `blockList' data member is NULL and false otherwise. `blockList' stops being NULL once 
+|	the Add function has been called to add a block object to the list.
+*/
+bool NxsReader::BlockListEmpty()
+	{
+	return blockList == NULL;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Debugging hook for new NxsBlock classes. This version has an empty body and ignores `nexusBlock'; to create an 
+|	active DebugReportBlock function, override it in the derived class and call the Report function of `nexusBlock'.
+|	This function is called whenever the main NxsReader Execute function encounters the [&spillall] command comment 
+|	between blocks in the data file. The Execute function goes through all blocks and passes them, in turn, to this 
+|	DebugReportBlock function so that their contents are displayed. Placing the [&spillall] command comment between
+|	different versions of a block allows multiple blocks of the same type to be tested using one long data file. Say 
+|	you are interested in testing whether the normal, transpose, and interleave format of a matrix can all be read 
+|	correctly. If you put three versions of the block in the data file one after the other, the second one will wipe 
+|	out the first, and the third one will wipe out the second, unless you have a way to report on each one before the 
+|	next one is read. This function provides that ability.
+*/
+void NxsReader::DebugReportBlock(
+  NxsBlock &nexusBlock)	/* the block that should be reported */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(nexusBlock)
+#	endif
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Unlinks `oldBlock' from the list of NxsBlock objects that starts at `blockList'. If `oldBlock' is the head of the 
+|	list, `blockList' is set to `oldBlock->next'; otherwise the block preceding `oldBlock' is linked to the block that 
+|	follows it. The object pointed to by `oldBlock' is not deleted, and its own `next' pointer is left unchanged. 
+|	Detaching a pointer that is not in the list (for example, one detached earlier) does no harm: Detach simply 
+|	returns quietly. If `oldBlock' is found, its SetNexus function is called with NULL to indicate that it is no 
+|	longer owned by (i.e., attached to) a NxsReader object.
+*/
+void NxsReader::Detach(
+  NxsBlock *oldBlock)	/* a pointer to an existing block object */
+	{
+	assert(oldBlock != NULL);
+
+	// An empty list has nothing to detach
+	//
+	if (blockList == NULL)
+		return;
+
+	// Case 1: `oldBlock' is the head, so the list now starts at its successor
+	//
+	if (blockList == oldBlock)
+		{
+		blockList = oldBlock->next;
+		oldBlock->SetNexus(NULL);
+		return;
+		}
+
+	// Case 2: look for the block whose `next' is `oldBlock'. Only that single
+	// link is changed, so blocks between the head and `oldBlock' stay attached
+	// (bug fix MTH 6/17/2002: old version detached intervening blocks as well).
+	//
+	NxsBlock *pred = blockList;
+	while (pred->next != NULL && pred->next != oldBlock)
+		pred = pred->next;
+
+	// Reaching the end of the list means `oldBlock' is not attached. To catch
+	// such calls while debugging, assert here that pred->next == oldBlock;
+	// doing so contradicts the "returns quietly" promise in the function
+	// description whenever asserts are active.
+	//
+	if (pred->next != oldBlock)
+		return;
+
+	pred->next = oldBlock->next;
+	oldBlock->SetNexus(NULL);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called by the NxsReader object when a block named `blockName' is entered. Allows a derived class overriding this
+|	function to notify the user of progress in parsing the NEXUS file. Also gives the program the opportunity to ask 
+|	the user if it is ok to purge data currently contained in this block. If the user is asked whether existing data 
+|	should be deleted and the answer comes back no, then the overriding function should return false; otherwise it 
+|	should return true. This (base class) version ignores `blockName' and always returns true.
+*/
+bool NxsReader::EnteringBlock(
+  NxsString blockName)	/* the name of the block just entered */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(blockName)
+#	endif
+	return true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called by the NxsReader object when a block named `blockName' is being exited. Allows a derived class overriding 
+|	this function to notify the user of progress in parsing the NEXUS file. This version has an empty body.
+*/
+void NxsReader::ExitingBlock(
+  NxsString blockName)	/* the name of the block being exited */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(blockName)
+#	endif
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads the NxsReader data file from the input stream provided by `token'. This function is responsible for reading 
+|	through the name of each block. Once it has read a block name, it searches `blockList' for a block object to 
+|	handle reading the remainder of the block's contents. The block object is responsible for reading the END or 
+|	ENDBLOCK command as well as the trailing semicolon. This function also handles reading comments that are outside 
+|	of blocks, as well as the initial "#NEXUS" keyword. The `notifyStartStop' argument is provided in case you do not 
+|	wish the ExecuteStarting and ExecuteStopping functions to be called. These functions are primarily used for 
+|	creating and destroying a dialog box to show progress, and nested Execute calls can thus cause problems (e.g., a 
+|	dialog box is destroyed when the inner Execute calls ExecuteStopping and the outer Execute still expects the 
+|	dialog box to be available). Specifying `notifyStartStop' false for all the nested Execute calls thus allows the 
+|	outermost Execute call to control creation and destruction of the dialog box.
+*/
+void NxsReader::Execute(
+  NxsToken	&token,				/* the token object used to grab NxsReader tokens */
+  bool		notifyStartStop)	/* if true, ExecuteStarting and ExecuteStopping will be called */
+	{
+	char id_str[256];
+	currBlock = NULL;
+
+	bool disabledBlock = false;
+	NxsString errormsg;
+
+	try
+		{
+		token.GetNextToken();
+		}
+	catch (NxsException x)
+		{
+		NexusError(token.errormsg, 0, 0, 0);
+		return;
+		}
+
+	if (!token.Equals("#NEXUS"))
+		{
+		errormsg = "Expecting #NEXUS to be the first token in the file, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		NexusError(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		return;
+		}
+
+	if (notifyStartStop)
+		ExecuteStarting();
+
+	for (;;)
+		{
+		token.SetLabileFlagBit(NxsToken::saveCommandComments);
+		token.GetNextToken();
+
+		if (token.AtEOF())
+			break;
+
+		if (token.Equals("BEGIN"))
+			{
+			disabledBlock = false;
+			token.GetNextToken();
+
+			for (currBlock = blockList; currBlock != NULL; currBlock = currBlock->next)
+				{
+				if (token.Equals(currBlock->GetID()))
+					{
+					if (currBlock->IsEnabled()) 
+						{
+						strcpy(id_str, currBlock->GetID().c_str());
+						bool ok_to_read = EnteringBlock(id_str);
+						if (!ok_to_read) 
+							currBlock = NULL;
+						else
+							{
+							currBlock->Reset();
+
+							// We need to back up currBlock, because the Read statement might trigger
+							// a recursive call to Execute (if the block contains instructions to execute 
+							// another file, then the same NxsReader object may be used and any member fields (e.g. currBlock)
+							//  could be trashed.
+							//
+							NxsBlock *tempBlock = currBlock;	
+
+							try 
+								{
+								currBlock->Read(token);
+								currBlock = tempBlock;
+								}
+
+							catch (NxsException x) 
+								{
+								currBlock = tempBlock;
+								if (currBlock->errormsg.length() > 0)
+									NexusError(currBlock->errormsg, x.pos, x.line, x.col);
+								else
+									NexusError(x.msg, x.pos, x.line, x.col);
+								currBlock = NULL;
+								return;
+								}	// catch (NxsException x) 
+							ExitingBlock(id_str /*currBlock->GetID()*/);
+							}	// else
+						}	// if (currBlock->IsEnabled()) 
+
+					else
+						{
+						disabledBlock = true;
+						SkippingDisabledBlock(token.GetToken());
+						}
+					break;
+					}	// if (token.Equals(currBlock->GetID()))
+				}	// for (currBlock = blockList; currBlock != NULL; currBlock = currBlock->next)
+
+			if (currBlock == NULL)
+				{
+				token.BlanksToUnderscores();
+				NxsString currBlockName = token.GetToken();
+
+				if (!disabledBlock) 
+					SkippingBlock(currBlockName);
+
+				for (;;)
+					{
+					token.GetNextToken();
+
+					if (token.Equals("END") || token.Equals("ENDBLOCK")) 
+						{
+						token.GetNextToken();
+
+						if (!token.Equals(";")) 
+							{
+							errormsg = "Expecting ';' after END or ENDBLOCK command, but found ";
+							errormsg += token.GetToken();
+							errormsg += " instead";
+							NexusError(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+							return;
+							}
+						break;
+						}
+
+					if (token.AtEOF()) 
+						{
+						errormsg = "Encountered end of file before END or ENDBLOCK in block ";
+						errormsg += currBlockName;
+						NexusError(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						return;
+						}
+					}	// for (;;)
+				}	// if (currBlock == NULL)
+			currBlock = NULL;
+			}	// if (token.Equals("BEGIN"))
+
+		else if (token.Equals("&SHOWALL"))
+			{
+			for (NxsBlock*  showBlock = blockList; showBlock != NULL; showBlock = showBlock->next)
+				{
+				DebugReportBlock(*showBlock);
+				}
+			}
+
+		else if (token.Equals("&LEAVE"))
+			{
+			break;
+			}
+
+		} // for (;;)
+
+	if (notifyStartStop)
+		ExecuteStopping();
+
+	currBlock = NULL;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns NCL_COPYRIGHT, a string containing the copyright notice for NCL (the Nexus Class Library), useful for 
+|	reporting the use of this library by programs that interact with the user.
+*/
+const char *NxsReader::NCLCopyrightNotice()
+	{
+	return NCL_COPYRIGHT;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns NCL_HOMEPAGEURL, a string containing the URL for the internet home page of NCL (the Nexus Class Library).
+*/
+const char *NxsReader::NCLHomePageURL()
+	{
+	return NCL_HOMEPAGEURL;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns NCL_NAME_AND_VERSION, a string containing the name and current version of NCL (the Nexus Class Library), 
+|	useful for reporting the use of this library by programs that interact with the user.
+*/
+const char *NxsReader::NCLNameAndVersion()
+	{
+	return NCL_NAME_AND_VERSION;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called just after Execute member function reads the opening "#NEXUS" token in a NEXUS data file. Override this 
+|	virtual base class function if your application needs to do anything at this point in the execution of a NEXUS data
+|	file (e.g. good opportunity to pop up a dialog box showing progress). Be sure to call the Execute function with the
+|	`notifyStartStop' argument set to true, otherwise ExecuteStarting will not be called.
+|	This (base class) version does nothing.
+*/
+void NxsReader::ExecuteStarting()
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when Execute member function encounters the end of the NEXUS data file, or the special comment [&LEAVE] is
+|	found between NEXUS blocks. Override this virtual base class function if your application needs to do anything at 
+|	this point in the execution of a NEXUS data file (e.g. good opportunity to hide or destroy a dialog box showing 
+|	progress). Be sure to call the Execute function with the `notifyStartStop' argument set to true, otherwise 
+|	ExecuteStopping will not be called. Note that Execute does not call this function on the paths where it returns 
+|	early after reporting an error through NexusError. This (base class) version does nothing.
+*/
+void NxsReader::ExecuteStopping()
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Called when an error is encountered in a NEXUS file. Allows program to give user details of the error as well as 
+|	the precise location of the error. This (base class) version does nothing with its arguments, so an error is only
+|	made visible to the user if a derived class overrides this function.
+*/
+void NxsReader::NexusError(
+  NxsString	msg,	/* the error message to be displayed */
+  file_pos	pos,	/* the current file position */
+  long	line,	/* the current file line */
+  long	col)	/* the current column within the current file line */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(msg, pos, line, col)
+#	endif
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function may be used to report progress while reading through a file. For example, the NxsAllelesBlock class 
+|	uses this function to report the name of the population it is currently reading so the user doesn't think the 
+|	program has hung on large data sets. This (base class) version discards the comment.
+*/
+void NxsReader::OutputComment(
+  const NxsString &comment)	/* a comment to be shown on the output */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(comment)
+#	endif
+	}
+	
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function is called by Execute when a block named `blockName' is about to be skipped, either because no block
+|	object in the block list has a matching ID or because EnteringBlock returned false for it. Override this virtual
+|	function to provide an indication of progress as the NEXUS file is being read. This (base class) version does 
+|	nothing.
+*/
+void NxsReader::SkippingBlock(
+  NxsString blockName)	/* the name of the block being skipped */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(blockName)
+#	endif
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function is called when a disabled block named `blockName' is encountered in a NEXUS data file being executed.
+|	Override this virtual function to handle this event in an appropriate manner. For example, the program may wish to
+|	inform the user that a data block was encountered in what is supposed to be a tree file. This (base class) version
+|	does nothing.
+*/
+void NxsReader::SkippingDisabledBlock(
+  NxsString blockName)	/* the name of the disabled block being skipped */
+	{
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(blockName)
+#	endif
+	}
+
diff --git a/ncl/nxsreader.h b/ncl/nxsreader.h
new file mode 100644
index 0000000..d488e5f
--- /dev/null
+++ b/ncl/nxsreader.h
@@ -0,0 +1,77 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSREADER_H
+#define NCL_NXSREADER_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This is the class that orchestrates the reading of a NEXUS data file. An object of this class should be created, 
+|	and objects of any block classes that are expected to be needed should be added to `blockList' using the Add 
+|	member function. The Execute member function is then called, which reads the data file until encountering a block 
+|	name, at which point the correct block is looked up in `blockList' and that object's Read method called. 
+|
+|	The virtual member functions (ExecuteStarting, ExecuteStopping, EnteringBlock, ExitingBlock, OutputComment, 
+|	NexusError, SkippingDisabledBlock, SkippingBlock and DebugReportBlock) are notification hooks intended to be 
+|	overridden by the application in a derived class.
+*/
+class NxsReader
+	{
+	public:
+		enum	NxsTolerateFlags	/* Flags intended for a `tolerate' data member, allowing some flexibility with respect to the NEXUS format. NOTE(review): no `tolerate' member is declared in this class -- confirm where these flags are used */
+			{
+			allowMissingInEquate	= 0x0001,	/* if set, equate symbols are allowed for missing data symbol */
+			allowPunctuationInNames	= 0x0002	/* if set, some punctuation is allowed within tokens representing labels for taxa, characters, and sets */
+			};
+
+						NxsReader();
+		virtual			~NxsReader();
+
+		bool			BlockListEmpty();
+		unsigned		PositionInBlockList(NxsBlock *b);
+		void			Add(NxsBlock *newBlock);
+		void			Detach(NxsBlock *newBlock);
+		void			Reassign(NxsBlock *oldb, NxsBlock *newb);
+		void			Execute(NxsToken& token, bool notifyStartStop = true);	/* reads the NEXUS file supplied through `token' */
+
+		virtual void	DebugReportBlock(NxsBlock &nexusBlock);	/* called by Execute for each block when "&SHOWALL" is read */
+
+		const char			*NCLNameAndVersion();
+		const char			*NCLCopyrightNotice();
+		const char			*NCLHomePageURL();
+
+		virtual void	ExecuteStarting();	/* called by Execute after "#NEXUS" has been read, if notifyStartStop is true */
+		virtual void	ExecuteStopping();	/* called by Execute at end of file or "&LEAVE", if notifyStartStop is true */
+
+		virtual bool	EnteringBlock(NxsString blockName);	/* return false to have Execute skip the block */
+		virtual void	ExitingBlock(NxsString blockName);	/* called after a block has been read successfully */
+
+		virtual void	OutputComment(const NxsString &comment);
+
+		virtual void	NexusError(NxsString msg, file_pos pos, long line, long col);	/* receives every error reported by Execute */
+
+		virtual void	SkippingDisabledBlock(NxsString blockName);
+		virtual void	SkippingBlock(NxsString blockName);
+
+	protected:
+
+		NxsBlock		*blockList;	/* pointer to first block in list of blocks */
+		NxsBlock		*currBlock;	/* pointer to current block in list of blocks */
+	};
+
+typedef NxsBlock NexusBlock;
+typedef NxsReader Nexus;
+
+#endif
+
diff --git a/ncl/nxssetreader.cpp b/ncl/nxssetreader.cpp
new file mode 100644
index 0000000..aaea7f5
--- /dev/null
+++ b/ncl/nxssetreader.cpp
@@ -0,0 +1,273 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Binds this reader to the token stream `t', the block `nxsblk' and the destination set `iset', records `maxValue' 
+|	as `max' and `type' as `settype', and finally empties `nxsset' so that Run starts from an empty set.
+*/
+NxsSetReader::NxsSetReader(
+  NxsToken			&t,			/* reference to the NxsToken being used to read in the NEXUS data file */
+  unsigned			maxValue,	/* maximum possible value allowed in this set (e.g. nchar or ntax) */
+  NxsUnsignedSet	&iset,		/* reference to the set object to store the set defined in the NEXUS data file */
+  NxsBlock			&nxsblk,	/* reference to the NxsBlock object (used for looking up taxon or character labels when encountered in the set definition) */
+  unsigned			type)		/* one of the elements in the NxsSetReaderEnum enumeration */
+  : block(nxsblk), token(t), nxsset(iset), max(maxValue), settype(type)
+	{
+	// All members are set in the initializer list (in declaration order); only the caller's set needs resetting.
+	//
+	nxsset.clear();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Inserts into `nxsset' the members of the range [`first'..`last'], optionally thinned by `modulus'. Both bounds are
+|	the 1-based numbers that appear in the data file (valid range [1..`max']); the values inserted are 0-based. When 
+|	`modulus' is non-zero only every `modulus'-th member, counting from `first', is inserted; when it is zero every 
+|	member is inserted. Example: "4-10\2" arrives as `first' = 4, `last' = 10, `modulus' = 2 and inserts 3, 5, 7, 9.
+|	Returns false, inserting nothing, if `first' is less than 1, `first' is greater than `last', or `last' is greater
+|	than `max'; returns true otherwise.
+*/
+bool NxsSetReader::AddRange(
+  unsigned first,		/* the first member of the range (inclusive, offset 1) */
+  unsigned last,		/* the last member of the range (inclusive, offset 1) */
+  unsigned modulus)		/* the modulus to use (if non-zero) */
+	{
+	const bool validRange = (first >= 1 && first <= last && last <= max);
+	if (!validRange)
+		return false;
+
+	// 0-based index of the first member; offsets within the range are measured from here
+	//
+	const unsigned start = first - 1;
+
+	for (unsigned idx = start; idx < last; ++idx)
+		{
+		const bool keep = (modulus == 0 || (idx - start) % modulus == 0);
+		if (keep)
+			nxsset.insert(idx);
+		}
+
+	return true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Converts the current token to a set member number. The token is first passed to atoi; if that yields 0, the token 
+|	is looked up as a character label (when `settype' is charset) or as a taxon label (when `settype' is taxset). If 
+|	the value is still 0 afterwards, an explanatory message is stored in `block.errormsg' and a NxsException carrying 
+|	that message and the current file position is thrown.
+*/
+unsigned NxsSetReader::GetTokenValue()
+	{
+	unsigned value = atoi(token.GetToken().c_str());
+
+	// A zero result means the token was not a usable number, so fall back to a label lookup where the set type 
+	// provides one (generic sets have no labels).
+	//
+	if (value == 0)
+		{
+		if (settype == NxsSetReader::charset)
+			value = block.CharLabelToNumber(token.GetToken());
+		else if (settype == NxsSetReader::taxset)
+			value = block.TaxonLabelToNumber(token.GetToken());
+		}
+
+	if (value != 0)
+		return value;
+
+	// Neither a number nor a known label: compose the message in the block and report the failure
+	//
+	block.errormsg = "Set element (";
+	block.errormsg += token.GetToken();
+	block.errormsg += ") not a number ";
+	if (settype == NxsSetReader::charset)
+		block.errormsg += "and not a valid character label";
+	else if (settype == NxsSetReader::taxset)
+		block.errormsg += "and not a valid taxon label";
+
+	throw NxsException(block.errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads in a set from a NEXUS data file, inserting its members into `nxsset'. Returns true if the set was terminated
+|	by a semicolon, false if it was terminated by a comma. On a malformed set definition a message is stored in 
+|	`block.errormsg' and a NxsException is thrown.
+|
+|	The function is a small state machine that always runs one token behind: a single value or a completed range is 
+|	only added to the set when the token FOLLOWING it has been read, because only then is it known whether the value
+|	stands alone, starts a range, or (for a range) is followed by a modulus. The state consists of `rangeBegin' 
+|	(UINT_MAX until a first value has been read), `rangeEnd' (UINT_MAX until the end of the current range has been 
+|	read), `insideRange' (true once a hyphen has been read) and `modValue' (0 unless a modulus was given for the 
+|	current range). `modValue' is reset to 0 whenever a range has been stored, so that a modulus applies only to the 
+|	range it follows (e.g. in "1-10\2 20-30" the second range is not thinned).
+*/
+bool NxsSetReader::Run()
+	{
+	bool ok;
+	bool retval = false;
+
+	unsigned rangeBegin = UINT_MAX;
+	unsigned rangeEnd = rangeBegin;
+	bool insideRange = false;
+	unsigned modValue = 0;
+
+	for (;;)
+		{
+		// Next token should be one of the following:
+		//   ';'        --> set definition finished
+		//   '-'        --> range being defined
+		//   <integer>  --> member of set (or beginning or end of a range)
+		//   '.'        --> signifies the number max
+		//   '\'        --> signifies modulus value coming next
+		//
+		token.GetNextToken();
+
+		if (token.Equals("-"))
+			{
+			// We should not be inside a range when we encounter a hyphenation symbol.
+			// The hyphen is what _puts_ us inside a range!
+			//
+			if (insideRange)
+				{
+				block.errormsg = "The symbol '-' is out of place here";
+				throw NxsException(block.errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			insideRange = true;
+			}
+
+		else if (token.Equals("."))
+			{
+			// We _should_ be inside a range if we encounter a period, as this
+			// is a range termination character
+			//
+			if (!insideRange)
+				{
+				block.errormsg = "The symbol '.' can only be used to specify the end of a range";
+				throw NxsException(block.errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			rangeEnd = max;
+			}
+
+		else if (token.Equals("\\"))
+			{
+			// The backslash character is used to specify a modulus to a range, and
+			// thus should only be encountered if currently inside a range
+			//
+			if (!insideRange)
+				{
+				block.errormsg = "The symbol '\\' can only be used after the end of a range has been specified";
+				throw NxsException(block.errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the modulus value
+			//
+			token.GetNextToken();
+			modValue = atoi(token.GetToken().c_str());
+
+			if (modValue <= 0)
+				{
+				block.errormsg = "The modulus value specified (";
+				block.errormsg += token.GetToken();
+				block.errormsg += ") is invalid; must be greater than 0";
+				throw NxsException(block.errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			}
+
+		else if (insideRange && rangeEnd == UINT_MAX)
+			{
+			// The beginning of the range and the hyphen symbol have been read
+			// already, just need to store the end of the range at this point
+			//
+			rangeEnd = GetTokenValue();
+			}
+
+		else if (insideRange)
+			{
+			// If insideRange is true, we must have already stored the beginning
+			// of the range and read in the hyphen character. We would not have
+			// made it this far if we had also not already stored the range end.
+			// Thus, we can go ahead and add the range.
+			//
+			ok = AddRange(rangeBegin, rangeEnd, modValue);
+
+			if (!ok)
+				{
+				block.errormsg = "Character number out of range (or range incorrectly specified) in set specification";
+				throw NxsException(block.errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// The modulus belongs to the range just stored; forget it so that it is
+			// not silently applied to a later range that specifies no modulus
+			//
+			modValue = 0;
+
+			// We have actually already read in the next token, so deal with it
+			// now so that we don't end up skipping a token
+			//
+			if (token.Equals(";"))
+				{
+				retval = true;
+				break;
+				}
+			else if (token.Equals(","))
+				{
+				break;
+				}
+
+			rangeBegin = GetTokenValue();
+			rangeEnd = UINT_MAX;
+			insideRange = false;
+			}
+
+		else if (rangeBegin != UINT_MAX)
+			{
+			// If we were inside a range, we would have not gotten this far.
+			// If not in a range, we are either getting ready to begin a new
+			// range or have previously read in a single value. Handle the
+			// latter possibility here.
+			//
+			ok = AddRange(rangeBegin, rangeBegin, modValue);
+
+			if (!ok)
+				{
+				block.errormsg = "Character number out of range (or range incorrectly specified) in set specification";
+				throw NxsException(block.errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			if (token.Equals(";"))
+				{
+				retval = true;
+				break;
+				}
+			else if (token.Equals(","))
+				{
+				break;
+				}
+
+			rangeBegin = GetTokenValue();
+			rangeEnd = UINT_MAX;
+			}
+
+		else if (token.Equals(";"))
+			{
+			retval = true;
+			break;
+			}
+
+		else if (token.Equals(","))
+			{
+			break;
+			}
+
+		else if (token.Equals("ALL"))
+			{
+			// Every member from 1 to max. The result of AddRange is not checked here.
+			//
+			rangeBegin = 1;
+			rangeEnd = max;
+			ok = AddRange(rangeBegin, rangeEnd);
+			}
+
+		else
+			{
+			// Can only get here if rangeBegin still equals UINT_MAX and thus we
+			// are reading in the very first token and that token is neither
+			// the word "all" nor is it a semicolon
+			//
+			rangeBegin = GetTokenValue();
+			rangeEnd = UINT_MAX;
+			}
+		}
+
+	return retval;
+	}
diff --git a/ncl/nxssetreader.h b/ncl/nxssetreader.h
new file mode 100644
index 0000000..b24d9f7
--- /dev/null
+++ b/ncl/nxssetreader.h
@@ -0,0 +1,79 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSSETREADER_H
+#define NCL_NXSSETREADER_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	A class for reading NEXUS set objects and storing them in a set of unsigned values. The NxsUnsignedSet `nxsset' 
+|	will be cleared, and `nxsset' will be built up as the set is read, with each element in the list storing a 
+|	member of the set (ranges are stored as individual elements). This class handles set descriptions of the following 
+|	form:
+|>
+|	4-7 15 20-.\3;
+|>
+|	The above set includes every number from 4 to 7 (inclusive), 15 and every third number from 20 to max, where `max' 
+|	would ordinarily be set to either the last character (if `settype' is `NxsSetReaderEnum::charset') or the last 
+|	taxon (if `settype' is `NxsSetReaderEnum::taxset'). If `max' equaled 30, the example above would be stored as
+|	follows (remember that internally the numbers are stored with offset 0, even though in the NEXUS data file the
+|	numbers always start at 1).
+|>
+|	3, 4, 5, 6, 14, 19, 22, 25, 28
+|>
+|	The following example of how NxsSetReader is used comes from the NxsCharactersBlock::HandleEliminate function:
+|>
+|	NxsSetReader(token, ncharTotal, eliminated, *this, NxsSetReader::charset).Run();
+|>
+|	This reads in a set of eliminated characters from a NEXUS data file, storing the resulting set in the data member
+|	`eliminated'. In this case `max' is set to `ncharTotal' (the total number of characters), and the block reference
+|	is set to the NxsCharactersBlock object, which provides the CharLabelToNumber function used to translate a 
+|	character label found in the set definition into a character number.
+*/
+class NxsSetReader
+	{
+	public:
+		
+		enum NxsSetReaderEnum	/* For use with the variable `settype' */
+			{
+			generic = 1,		/* means expect a generic set (say, characters weights) */
+			charset,			/* means expect a character set */
+			taxset				/* means expect a taxon set */
+			};
+
+						NxsSetReader(NxsToken &t, unsigned maxValue, NxsUnsignedSet &iset, NxsBlock &nxsblk, unsigned type);
+
+		bool			Run();
+
+	protected:
+
+		bool			AddRange(unsigned first, unsigned last, unsigned modulus = 0);
+
+	private:
+
+		unsigned		GetTokenValue();
+
+		NxsBlock		&block;		/* reference to the block object used for looking up labels */
+		NxsToken		&token;		/* reference to the token being used to parse the NEXUS data file */
+		NxsUnsignedSet	&nxsset;	/* reference to the NxsUnsignedSet set being read */
+		unsigned		max;		/* maximum number of elements in the set */
+		unsigned		settype;	/* the type of set being read (see the NxsSetReaderEnum enumeration) */
+	};
+
+typedef NxsSetReader SetReader;
+
+#endif
diff --git a/ncl/nxsstring.cpp b/ncl/nxsstring.cpp
new file mode 100644
index 0000000..964de39
--- /dev/null
+++ b/ncl/nxsstring.cpp
@@ -0,0 +1,877 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis and Mark T. Holder
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Capitalizes every character in the stored string (using toupper, so the result depends on the current C locale) and
+|	returns a reference to itself.
+*/
+NxsString &NxsString::ToUpper()
+	{
+	// toupper requires an argument representable as unsigned char (or EOF); passing a plain char that happens to be
+	// negative (e.g. a byte >= 0x80 where char is signed) is undefined behavior, hence the cast.
+	//
+	for (NxsString::iterator sIt = begin(); sIt != end(); ++sIt)
+		*sIt = (char) toupper((unsigned char) *sIt);
+	return *this;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Appends a string representation of the supplied double to the stored string and returns a reference to itself. The
+|	value is formatted in fixed notation with six decimal places, after which trailing zeros are removed, always leaving
+|	at least one digit after the decimal point (e.g. 2.5 is appended as "2.5" and 3.0 as "3.0").
+*/
+NxsString &NxsString::operator+=(
+  const double d)	/* the double value to append */
+	{
+	// Fixed notation can need over 300 digits for very large magnitudes, so the buffer is sized generously and the
+	// write is bounded by snprintf rather than relying on the value being small.
+	//
+	char tmp[512];
+
+	// Create a C-string representing the supplied double value. 
+	// The # causes a decimal point to always be output.
+	//
+	snprintf(tmp, sizeof(tmp), "%#3.6f", d);
+	tmp[sizeof(tmp) - 1] = '\0';
+	unsigned tmplen = (unsigned)strlen(tmp);
+
+	// If the C-string has a lot of trailing zeros, lop them off, but keep the
+	// zero that immediately follows the decimal point
+	//
+	for (;;)
+		{
+		if (tmplen < 3 || tmp[tmplen-1] != '0' || tmp[tmplen-2] == '.')
+			break;
+		tmp[tmplen-1] = '\0';
+		tmplen--;
+		}
+
+	append(tmp);
+	return *this;
+	}
+
+/*-------------------------------------------------------------------------------------------------------------------------- 
+|	Appends the character `c' to the stored string `n' times and returns a reference to itself. The character is 
+|	appended as a one-character C-string.
+*/
+NxsString &NxsString::AddTail(
+  char c,		/* the character to use in the appended tail */
+  unsigned n)	/* the number of times `c' is to be appended */
+	{
+	const char unit[2] = { c, '\0' };
+
+	unsigned remaining = n;
+	while (remaining > 0)
+		{
+		append(unit);
+		--remaining;
+		}
+
+	return *this;
+	}
+
+/*-------------------------------------------------------------------------------------------------------------------------- 
+|	Replaces the stored string with a single-quoted copy of itself: a single quote is placed at each end and every 
+|	single quote inside the string is doubled (the '' pair signifies a literal single quote). Returns a reference to 
+|	itself.
+*/
+NxsString &NxsString::AddQuotes() 
+	{
+	const int origLen = length();
+
+	NxsString quoted;
+	quoted.reserve(origLen + 4);
+
+	quoted += '\'';
+	for (int pos = 0; pos < origLen; ++pos)
+		{
+		const char ch = (*this)[pos];
+		quoted += ch;
+
+		// An embedded quote is written twice
+		//
+		if (ch == '\'')
+			quoted += '\'';
+		}
+	quoted += '\'';
+
+	*this = quoted;
+	return *this;
+	}
+ 
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Appends a printf-style formatted string onto the end of this NxsString and returns the number of characters added to the 
+|	string. For example, the following code would result in the string s being set to "ts-tv rate ratio = 4.56789":
+|>
+|	double kappa = 4.56789;
+|	NxsString s;
+|	s.PrintF("ts-tv rate ratio = %.5f", kappa);
+|>
+|	Output that fits in an internal 256-byte buffer is formatted there. Longer output is formatted into a temporary
+|	buffer that is enlarged until the text fits, up to a limit of about one megabyte. Both conventions for the return
+|	value of vsnprintf on overflow are handled: returning the length that would have been needed, and returning a 
+|	negative value. If formatting still fails at the size limit, the text is truncated to 255 characters, as earlier 
+|	versions of this function always did for long output, and the truncated length is returned.
+*/
+int NxsString::PrintF(
+  const char *formatStr,	/* the printf-style format string */
+  ...)						/* other arguments referred to by the format string */
+  	{
+  	const int kInitialBufferSize = 256;
+  	const int kMaxBufferSize = 1 << 20;
+  	char buf[kInitialBufferSize];
+
+	// First attempt: format into the stack buffer. The second argument to va_start
+	// (formatStr) is the last non-optional argument.
+	//
+  	va_list argList;
+  	va_start(argList, formatStr);
+  	int nAdded = vsnprintf(buf, kInitialBufferSize, formatStr, argList);
+  	va_end(argList);
+
+	if (nAdded >= 0 && nAdded < kInitialBufferSize)
+		{
+		*this << buf;
+		return nAdded;
+		}
+
+	// The stack buffer was too small. If vsnprintf reported the required length, use it; 
+	// if it reported a negative value, double the size until the text fits.
+	//
+	int bufSize = (nAdded >= kInitialBufferSize ? nAdded + 1 : 2*kInitialBufferSize);
+	while (bufSize > 0 && bufSize <= kMaxBufferSize)
+		{
+		// A std::string owns the temporary storage so that it is released even if the append throws
+		//
+		std::string bigBuf((size_t)bufSize, '\0');
+
+		// The argument list must be restarted for every formatting pass
+		//
+		va_start(argList, formatStr);
+		nAdded = vsnprintf(&bigBuf[0], (size_t)bufSize, formatStr, argList);
+		va_end(argList);
+
+		if (nAdded >= 0 && nAdded < bufSize)
+			{
+			*this << bigBuf.c_str();
+			return nAdded;
+			}
+
+		bufSize = (nAdded >= bufSize ? nAdded + 1 : 2*bufSize);
+		}
+
+	// Could not format the complete text within the size limit: fall back to the 
+	// truncated contents of the stack buffer
+	//
+	buf[kInitialBufferSize - 1] = '\0';
+	*this << buf;
+
+  	return (int)strlen(buf);
+  	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns true if the stored string is an abbreviation (or complete copy) of the argument `s', i.e. if the stored 
+|	string is non-empty, is no longer than `s', and matches `s' character for character over its whole length. The 
+|	comparison is case-sensitive if `respectCase' is true; otherwise both characters are passed through toupper before
+|	being compared. Always returns false if the stored string is empty.
+*/
+bool NxsString::IsStdAbbreviation(
+  const NxsString &s,	/* the string for which the stored string is potentially an abbreviation */
+  bool respectCase)		/* if true, comparison will be case-sensitive */
+  const
+	{
+	if (empty())
+		return false;
+
+	// s is the unabbreviated comparison string
+	//
+	const unsigned slen = static_cast<unsigned>(s.size());
+
+	// t is the stored string
+	//
+	const unsigned tlen = static_cast<unsigned>(size());
+
+	// t cannot be an abbreviation of s if it is longer than s
+	//
+	if (tlen > slen)
+		return false;
+
+	// Examine each character in t and return false (meaning "not an abbreviation")
+	// if at any point the corresponding character in s is different
+	//
+	for (unsigned k = 0; k < tlen; k++)
+		{
+		if (respectCase)
+			{
+			if ((*this)[k] != s[k])
+				return false;
+			}
+		else
+			{
+			// toupper must be given a value representable as unsigned char; a plain char
+			// may be negative, which would be undefined behavior
+			//
+			const int tUpper = toupper((unsigned char) (*this)[k]);
+			const int sUpper = toupper((unsigned char) s[k]);
+			if (tUpper != sUpper)
+				return false;
+			}
+		}
+
+	return true;
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns true if the stored string is a case-insensitive abbreviation (or complete copy) of `s' and the stored string 
+| 	has all of the characters that are in the initial capitalized portion of `s'. For example if `s' is "KAPpa" then 
+|	"kappa", "kapp", or "kap" (with any capitalization pattern) will return true and all other strings will return false. 
+|	Always returns false if the stored string has length of zero.
+*/
+bool NxsString::IsCapAbbreviation(
+  const NxsString &s)	/* the full string; its leading capitalized portion is the part every abbreviation must contain */
+  const
+	{
+	if (empty())	// an empty stored string is never an abbreviation
+		return false;
+
+	// s is the unabbreviated comparison string
+	//
+	const unsigned slen = static_cast<unsigned>(s.size());
+
+	// t is the stored string
+	//
+	const unsigned tlen = static_cast<unsigned>(size());
+
+	// If the stored string is longer than s then it cannot be an abbreviation of s
+	//
+	if (tlen > slen)
+		return false;
+	
+	unsigned k = 0;	// shared by both loops: the second loop resumes where the first one stops
+	for (; k < slen; k++) 
+		{
+		if (isupper(s[k]))	
+			{
+			// If still in the uppercase portion of s and we've run out of characters
+			// in t, then t is not a valid abbreviation of s
+			//
+			if (k >= tlen)
+				return false;
+
+			// If kth character in t (converted to upper case) is not equal to kth character in s, then
+			// t is not an abbreviation of s
+			//
+			char tokenChar = (char)toupper((*this)[k]);
+			if (tokenChar != s[k])
+				return false;
+			}
+		else if (!isalpha(s[k]))
+			{
+			// Get here if we are no longer in the upper case portion of s and 
+			// s[k] is not an alphabetic character. This section is necessary because
+			// we are dealing with a section of s that is not alphabetical and thus
+			// we cannot tell whether this should be part of the abbreviation or not
+			// (i.e. we cannot tell if it is capitalized or not). In this case, we
+			// pretend that we are still in the upper case portion of s and return
+			// false if we have run out of characters in t (meaning that the abbreviation
+			// was too short) or we find a mismatch.
+			//
+			if (k >= tlen)
+				return false;
+
+			if ((*this)[k] != s[k])	// exact comparison: no case folding is applied to non-alphabetic characters
+				return false;
+			}
+		else
+			{
+			// Get here if we are no longer in the upper case portion of s and
+			// s[k] is an alphabetic character. Just break because we have determined
+			// that t covers the whole mandatory (capitalized) portion of s.
+			//
+			break;
+			}
+		}
+
+	// Check the lower case portion of s and any corresponding characters in t for mismatches
+	// Even though the abbreviation is valid up to this point, it will become invalid if
+	// any mismatches are found beyond the upper case portion of s
+	//
+	for (; k < tlen; k++)
+		{
+  		const char tokenChar = (char)toupper((*this)[k]);
+  		const char otherChar = (char)toupper(s[k]);	// k < tlen <= slen here, so s[k] is in range
+		if (tokenChar != otherChar)
+			return false;
+		}
+
+	return true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Right-justifies `x' in a field `w' characters wide, using blank spaces to fill in unused portions on the left-hand 
+|	side of the field. Specify true for `clear_first' to first empty the string. `w' is expected to be large enough to 
+|	accommodate the string representation of `x' (this is asserted in debug builds); if it is not, `x' is appended
+|	without any padding. Returns a reference to this object.
+*/
+NxsString &NxsString::RightJustifyLong(
+  long x,			/* long value to right justify */
+  unsigned int w,	/* width of field */
+  bool clear_first)	/* if true, initialize string first to empty string */
+	{
+	const bool x_negative = (x < 0L);
+
+	// Negate in unsigned arithmetic: evaluating -x as a long overflows when x is LONG_MIN
+	//
+	const unsigned long xabs = (x_negative ? 0UL - static_cast<unsigned long>(x) : static_cast<unsigned long>(x));
+
+	// Count the decimal digits by repeated integer division. Going through log10 on a double can be off by one,
+	// because large long values are not exactly representable as double (e.g. 999999999999999999 becomes 1e18)
+	//
+	unsigned x_width = 1;
+	for (unsigned long rest = xabs / 10; rest != 0; rest /= 10)
+		x_width++;
+	if (x_negative)
+		x_width++;	// for the minus sign
+
+	// If w = 10 and x = 123, we need 7 blank spaces before x. Clamp at zero so that a field that is too narrow
+	// cannot make the unsigned subtraction wrap around when assertions are compiled out
+	//
+	assert(x_width <= w);
+	const unsigned num_spaces = (x_width < w ? w - x_width : 0);
+
+	if (clear_first)
+		erase();
+
+	for (unsigned k = 0; k < num_spaces; k++)
+		*this += ' ';
+
+	if (x_negative)
+		*this += '-';
+
+	*this += xabs;
+	return *this;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Right-justifies `x' in a field `w' characters wide with precision `p', using blank spaces to fill in unused 
+|	portions on the left-hand side of the field. Specify true for `clear_first' to first empty the string. If the
+|	specified width is not enough to accommodate the string representation of `x', the representation is appended 
+|	without any padding. Returns a reference to this object.
+*/
+NxsString &NxsString::RightJustifyDbl(
+  double x,				/* double value to right justify */
+  unsigned w,			/* width of field */
+  unsigned p,			/* precision to use when displaying `x' */
+  bool clear_first)		/* if true, initialize stored string first to the empty string */
+	{
+	if (clear_first)
+		erase();
+
+	// Build a format string such as "%.3f". `p' is unsigned, so it must be written with %u rather than %d
+	//
+	char fmtstr[81];
+	sprintf(fmtstr, "%%.%uf", p);
+	NxsString tmp;
+	tmp.PrintF(fmtstr, x);
+
+	// Clamp the padding at zero: the subtraction is unsigned, so a representation wider than the field would
+	// otherwise wrap around to a huge number of spaces
+	//
+	const unsigned tmp_width = static_cast<unsigned>(tmp.length());
+	const unsigned num_spaces = (tmp_width < w ? w - tmp_width : 0);
+
+	for (unsigned k = 0; k < num_spaces; k++)
+		*this += ' ';
+
+	*this += tmp;
+	return *this;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Right-justifies `s' in a field `w' characters wide, using blank spaces to fill in unused portions on the left-hand
+|	side of the field. Specify true for `clear_first' to first empty the string. If the specified width is not enough
+|	to accommodate `s', `s' is appended without any padding. Returns a reference to this object.
+*/
+NxsString &NxsString::RightJustifyString(
+  const NxsString &s,	/* string to right justify */
+  unsigned w,			/* width of field */
+  bool clear_first)		/* if true, initialize string first to the empty string */
+	{
+	if (clear_first)
+		erase();
+
+	// Clamp the padding at zero: the subtraction is unsigned, so a string longer than the field would otherwise
+	// wrap around to a huge number of spaces
+	//
+	const unsigned s_width = static_cast<unsigned>(s.length());
+	const unsigned num_spaces = (s_width < w ? w - s_width : 0);
+
+	for (unsigned k = 0; k < num_spaces; k++)
+		*this += ' ';
+
+	*this += s;
+	return *this;
+	}
+
+/*-------------------------------------------------------------------------------------------------------------------------- 
+|	Returns true if the string needs to be surrounded by single-quotes to make it a single nexus token.
+*/
+bool NxsString::QuotesNeeded() const
+	{
+	// A punctuation character only forces quoting when it is not the whole string
+	//
+	const bool moreThanOneChar = (length() > 1);
+
+	NxsString::const_iterator sIt = begin();
+	while (sIt != end())
+		{
+		const char c = *sIt++;
+
+		// The standard C function isgraph returns zero if c is either a space or is not a printable character;
+		// either way the string cannot stand as a single token without quotes
+		//
+		if (!isgraph(c))
+			return true;
+
+		// Any NEXUS punctuation mark except left square bracket ([) or apostrophe (')
+		//
+		if (strchr("(){}\"-]/\\,;:=*`+<>", c) != NULL)
+			{
+			if (moreThanOneChar)
+				return true;
+			continue;
+			}
+
+		// An apostrophe or left square bracket anywhere in the string always requires quotes
+		//
+		if (c == '\'' || c == '[')
+			return true;
+		}
+
+	return false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Converts any blank spaces found in the stored string to the underscore character.
+*/
+NxsString &NxsString::BlanksToUnderscores()
+	{
+	// Walk the characters from last to first, replacing every blank in place
+	//
+	for (unsigned k = length(); k-- > 0;)
+		{
+		if ((*this)[k] == ' ')
+			(*this)[k] = '_';
+		}
+	return *this;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Converts any underscore characters found in the stored string to blank spaces.
+*/
+NxsString &NxsString::UnderscoresToBlanks()
+	{
+	// Walk the characters from last to first, replacing every underscore in place
+	//
+	for (unsigned k = length(); k-- > 0;)
+		{
+		if ((*this)[k] == '_')
+			(*this)[k] = ' ';
+		}
+	return *this;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Shortens stored string to `n' characters: the first `n' - 3 characters are kept and followed by "...". If string is 
+|	already `n' characters or fewer in length, this function has no effect. This is useful when it is desirable to show some of the
+|	contents of a string, even when the string will not fit in its entirety into the space available for displaying it.
+|	Assumes that `n' is at least 4.
+*/
+NxsString &NxsString::ShortenTo(
+  unsigned n)	/* maximum number of characters available for displaying the string */
+	{
+	assert(n > 3);
+
+	// Nothing to do if the string already fits
+	//
+	if (length() <= static_cast<unsigned>(n))
+		return *this;
+
+	// Number of leading characters retained in front of the ellipsis. At least one character is always
+	// retained, even if `n' is 3 and assertions are compiled out.
+	//
+	unsigned keep = n - 3;
+	if (keep == 0)
+		keep = 1;
+
+	if (keep < length())
+		erase(keep);
+	append("...");
+	return *this;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Converts every character in the stored string to its lower case equivalent.
+*/
+NxsString &NxsString::ToLower()
+	{
+	// Overwrite each character with the result of the standard C function tolower
+	//
+	NxsString::iterator cur = begin();
+	while (cur != end())
+		{
+		*cur = (char)tolower(*cur);
+		++cur;
+		}
+	return *this;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if the stored string can be interpreted as a double value, and returns false otherwise.
+*/
+bool NxsString::IsADouble() const
+	{
+	const char	*cur			= c_str();
+	bool		seenPoint		= false;
+	bool		seenExp			= false;
+	bool		mantissaDigit	= false;
+	bool		expDigit		= false;
+
+	//	A single leading minus sign is allowed
+	//
+	if (*cur == '-')
+		++cur;
+
+	for (; *cur != '\0'; ++cur)
+		{
+		const char c = *cur;
+
+		if (isdigit(c))
+			{
+			//	Digits are always OK; remember which part of the number they belong to
+			//
+			if (seenExp)
+				expDigit = true;
+			else
+				mantissaDigit = true;
+			continue;
+			}
+
+		if (c == '.')
+			{
+			//	At most one decimal point, and never inside the exponent
+			//
+			if (seenExp || seenPoint)
+				return false;
+			seenPoint = true;
+			continue;
+			}
+
+		if (c == 'e' || c == 'E')
+			{
+			//	At most one exponent marker, and only after at least one mantissa digit
+			//
+			if (seenExp || !mantissaDigit)
+				return false;
+			seenExp = true;
+			continue;
+			}
+
+		//	The only other acceptable character is a minus sign placed directly after the exponent marker
+		//
+		if (c != '-' || !seenExp)
+			return false;
+
+		//	seenExp guarantees that at least one character precedes cur
+		//
+		const char prev = cur[-1];
+		if (prev != 'e' && prev != 'E')
+			return false;
+		}
+
+	//	With an exponent, the exponent needs a digit; without one, the mantissa does
+	//
+	return seenExp ? expDigit : mantissaDigit;
+	}
+	
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if stored string can be interpreted as a long integer.
+*/
+bool NxsString::IsALong() const
+	{
+	const char *cur = c_str();
+
+	//	A single leading minus sign is allowed
+	//
+	if (*cur == '-')
+		++cur;
+
+	//	At least one digit is required (this rejects "" and "-")
+	//
+	if (!isdigit(*cur))
+		return false;
+
+	//	Everything that follows, up to the terminating null character, must be a digit as well
+	//
+	for (++cur; *cur != '\0'; ++cur)
+		{
+		if (!isdigit(*cur))
+			return false;
+		}
+
+	return true;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns true if the stored string is a non-case-sensitive copy of the argument `s'. Note: will return true if both the
+|	stored string and `s' are empty strings.
+*/
+bool NxsString::EqualsCaseInsensitive(
+  const NxsString &s)	/* the comparison string */
+  const
+	{
+	const unsigned otherLen = s.size();
+	const unsigned myLen = size();
+
+	// Strings of different length can never be equal
+	//
+	if (otherLen != myLen)
+		return false;
+
+	// Compare upper-cased characters position by position (the order of traversal does not affect the result)
+	//
+	unsigned pos = myLen;
+	while (pos-- > 0)
+		{
+		if ((char)toupper((*this)[pos]) != (char)toupper(s[pos]))
+			return false;
+		}
+
+	return true;
+	}
+	
+/*----------------------------------------------------------------------------------------------------------------------
+|	Creates a string representation of the hexadecimal version of the long integer `p'. For example, if `p' equals 123,
+|	and if 2 was specified for `nFours', the resulting string would be "7B". If 4 was specified for `nFours', then the
+|	resulting string would be "007B". 
+*/
+NxsString NxsString::ToHex(
+  long p,			/* the value to display in hexadecimal */
+  unsigned nFours)	/* the number of hexadecimal digits to display */
+	{
+	static const char decod[] = "0123456789ABCDEF";
+
+	// Work on the unsigned bit pattern of `p': right-shifting a negative signed value is implementation-defined
+	//
+	const unsigned long bits = static_cast<unsigned long>(p);
+
+	// Number of hexadecimal digits (groups of four bits) that an unsigned long can hold
+	//
+	const unsigned maxDigits = static_cast<unsigned>(sizeof(bits) * CHAR_BIT / 4);
+
+	// Emit digits from most significant to least significant; an `nFours' of zero yields an empty string
+	//
+	NxsString s;
+	for (unsigned i = nFours; i-- > 0;)
+		{
+		// Shifting by the full width of the type (or more) is undefined, so digit positions beyond the width
+		// of an unsigned long are displayed as zero
+		//
+		const unsigned long masked = (i < maxDigits ? ((bits >> (4*i)) & 0x000fUL) : 0UL);
+		s += decod[masked];
+		}
+	return s;
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Checks to see if the stored string begins with upper case letters and, if so, returns all of the contiguous capitalized
+|	prefix. If the stored string begins with lower case letters, an empty string is returned.
+*/
+NxsString NxsString::UpperCasePrefix() const
+	{
+	NxsString prefix;
+
+	// Copy characters until the first one that isupper does not accept, or until the string runs out
+	//
+	for (unsigned i = 0; i < size(); ++i)
+		{
+		const char c = (*this)[i];
+		if (!isupper(c))
+			break;
+		prefix += c;
+		}
+	return prefix;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Converts the stored string to an unsigned int using the standard C function strtol, throwing NxsX_NotANumber if the 
+|	conversion fails. Returns UINT_MAX if the number is too large to fit in an unsigned (or was a negative number).
+*/
+unsigned NxsString::ConvertToUnsigned() const
+	{
+	// ConvertToLong throws NxsX_NotANumber if the stored string is not a number
+	//
+	const long value = ConvertToLong();
+
+	// Negative values and values too large for an unsigned are both reported as UINT_MAX
+	//
+	const bool representable = !(value < 0 || value >UINT_MAX);
+	return representable ? static_cast<unsigned> (value) : UINT_MAX;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Converts the stored string to an int using the standard C function strtol, throwing NxsX_NotANumber if the conversion 
+|	fails. Returns INT_MAX if the number is too large to fit in an int or -INT_MAX if it is too small.
+*/
+int NxsString::ConvertToInt() const
+	{
+	// ConvertToLong throws NxsX_NotANumber if the stored string is not a number
+	//
+	const long value = ConvertToLong();
+
+	// Saturate at INT_MAX on the high side ...
+	//
+	if (value > INT_MAX || value == LONG_MAX)
+		return INT_MAX;
+
+	// ... and at -INT_MAX on the low side
+	//
+	if (value <-INT_MAX || value == -LONG_MAX)
+		return -INT_MAX;
+
+	return static_cast<int> (value);
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Converts the stored string to a long using the standard C function strtol, throwing NxsX_NotANumber if the conversion 
+|	fails.
+*/
+long NxsString::ConvertToLong() const
+	{
+	// The stored string must be non-empty and must start with a digit or a minus sign
+	//
+	if (empty())
+		throw NxsX_NotANumber();
+	const char first = at(0);
+	if (first != '-' && !isdigit(first))
+		throw NxsX_NotANumber();
+
+	const char *text = c_str();
+	char *stop = NULL;
+	const long value = strtol(text, &stop, 10);
+
+	// strtol leaves the end pointer at the start of the text when no characters could be converted. Only this
+	// condition is tested; characters following a valid number are not examined.
+	//
+	if (stop == text && value == 0)
+		throw NxsX_NotANumber();
+	return value;
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Converts the stored string to a double using the standard C function strtod, throwing NxsX_NotANumber if the conversion
+|	fails. Returns DBL_MAX or -DBL_MAX if the number is out of bounds.
+*/
+double NxsString::ConvertToDouble() const
+	{
+	if (length() == 0)
+		throw NxsX_NotANumber();
+
+	// Reject early anything that does not start with a digit, a minus sign, a decimal point or the letter E
+	//
+	const char first = at(0);
+	const bool plausibleStart = (isdigit(first) || first == '-' || first == '.'|| toupper(first) == 'E');
+	if (!plausibleStart)
+		throw NxsX_NotANumber();
+
+	const char *text = c_str();
+	char *stop;
+	const double value = strtod(text, &stop);
+
+	// strtod leaves the end pointer at the start of the text when no characters could be converted
+	//
+	if (stop == text && value == 0.0)
+		throw NxsX_NotANumber();
+
+	// Replace the out-of-range markers returned by strtod with the largest finite values
+	//
+	if (value == HUGE_VAL)
+		return DBL_MAX;
+	if (value == -HUGE_VAL)
+		return -DBL_MAX;
+	return value;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Transforms the vector of NxsString objects by making them all lower case and then capitalizing the first portion of 
+|	them so that the capitalized portion is enough to uniquely specify each. Returns true if the strings are long enough 
+|	to uniquely specify each. Horrendously bad algorithm, but shouldn't be called often.
+*/
+bool SetToShortestAbbreviation(
+  NxsStringVector	&strVec,		/* vector of NxsString objects; modified in place */
+  bool 				allowTooShort)	/* if true, a string that runs out of characters before becoming unique is accepted instead of causing a false return */
+	{
+	NxsStringVector upperCasePortion;	// upperCasePortion[i] is the capitalized prefix chosen for strVec[i]
+	unsigned i;
+	for (i = 0; i < strVec.size(); i++)
+		{
+		// Change the next string to lower case
+		//
+		strVec[i].ToLower();
+
+		unsigned prefLen = 0;
+		NxsString pref;
+		
+		if (prefLen >= strVec[i].size())	// prefLen is 0 here, so this rejects an empty string
+			return false;
+		pref += (char) toupper(strVec[i][prefLen++]);
+		bool moreChars = true;
+
+		// Keep adding letters from the current string until pref is unique.
+		// Then add this pref to upperCasePortion (vector of previous prefs)
+		//
+		for (;moreChars;)
+			{
+			size_t prevInd = 0;
+			for (; prevInd < upperCasePortion.size(); prevInd++)
+				{
+				if (pref == upperCasePortion[prevInd])
+					{
+					// 	Conflict  - both abbreviations need to grow
+					//
+					if (prefLen >= strVec[i].size())
+						{
+						if (allowTooShort)
+							{
+							if (prefLen < strVec[prevInd].size())	// only the earlier string can still be lengthened
+								upperCasePortion[prevInd] += (char) toupper(strVec[prevInd][prefLen]);
+							moreChars = false;
+							break;
+							}
+						else
+							return false;
+						}
+					pref += (char) toupper(strVec[i][prefLen]);
+					if (prefLen >= strVec[prevInd].size())
+						{
+						if (allowTooShort)
+							{
+							prevInd = 0;
+							prefLen++;
+							break;
+							}
+						else
+							return false;
+						}
+					upperCasePortion[prevInd] += (char) toupper(strVec[prevInd][prefLen++]);
+					prevInd = 0;	// after the break, prevInd differs from upperCasePortion.size(), so the outer loop scans again
+					break;
+					}
+				else
+					{
+					unsigned j;
+					for (j = 0; j < prefLen; j++)
+						{
+						if (pref[j] != upperCasePortion[prevInd][j])
+							break;
+						}
+					if (j == prefLen)
+						{
+						//	pref agrees with the first part of another abbreviation, lengthen it.
+						//
+						if (prefLen >= strVec[i].size())
+							{
+							if (allowTooShort)
+								{
+								moreChars = false;
+								break;
+								}
+							else
+								return false;
+							}
+						pref += (char) toupper(strVec[i][prefLen++]);
+						break;
+						}
+					}
+				}
+			if (prevInd == upperCasePortion.size() || !moreChars)
+				{
+				// Either the scan reached the end without any conflict, or the string ran out of
+				// characters (allowTooShort): add this prefix as string i's upper case portion
+				//
+				upperCasePortion.push_back(pref);
+				break;
+				}
+			}
+		}
+
+	for (i = 0; i < strVec.size(); i++)
+		{
+		for (size_t j = 0; j < upperCasePortion[i].size(); j++)
+			strVec[i][j] = upperCasePortion[i][j];	// overwrite the leading lower case characters with the capitalized prefix
+		}
+
+	return true;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns a vector of NxsString objects that match the entire `testStr'.
+*/
+NxsStringVector GetVecOfPossibleAbbrevMatches(
+  const NxsString		&testStr,		/* string to match */
+  const NxsStringVector	&possMatches)	/* vector of possible matches */
+	{
+	NxsStringVector matches;
+
+	// Keep, in their original order, the candidates for which testStr.Abbreviates() returns true
+	//
+	for (NxsStringVector::const_iterator candIt = possMatches.begin(); candIt != possMatches.end(); ++candIt)
+		{
+		if (!testStr.Abbreviates(*candIt))
+			continue;
+		matches.push_back(*candIt);
+		}
+	return matches;
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Written to make it easy to initialize a vector of strings. Similar to the perl split function. Converts a string like
+|	this -- "A|bro|ken strin|g" -- to a vector of strings with four elements:  "A", "bro", "ken strin", and "g". Empty
+|	fields are preserved: "a||b" yields "a", "" and "b", a trailing pipe yields a final empty element, and an empty 
+|	input yields a single empty element.
+*/
+NxsStringVector BreakPipeSeparatedList(
+  const NxsString &strList)	/* the string submitted for splitting */
+  	{
+	NxsString::const_iterator p = strList.begin();
+	NxsString ss;
+	NxsStringVector retVec;
+	for (;;)
+		{
+		const bool done = (p == strList.end());
+		if (done || (*p == '|')) 
+			{
+			// End of a field: store what has been collected and start a new field
+			//
+			retVec.push_back(ss);
+			ss.clear();
+			if (done)
+				break;
+
+			// Step over the separator and re-examine the next position from the top of the loop. Falling through
+			// to the append below would copy a second consecutive '|' into the next field and, for a trailing 
+			// '|', would dereference and then step past the end iterator.
+			//
+			++p;
+			continue;
+			}
+		ss += *p;
+		++p;
+		}
+	return retVec;
+	}
diff --git a/ncl/nxsstring.h b/ncl/nxsstring.h
new file mode 100644
index 0000000..2b86b65
--- /dev/null
+++ b/ncl/nxsstring.h
@@ -0,0 +1,610 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis and Mark T. Holder
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSSTRING_H
+#define NCL_NXSSTRING_H
+
+#include <cassert>
+#include <cstring>
+#include "nxsindent.h"
+
+class IndexSet;
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	A string class for use with the Nexus Class Library. NxsString inherits most of its functionality from the standard
+|	template library class string, adding certain abilities needed for use in NCL, such as the ability to discern 
+|	whether a short string represents an abbreviation for the string currently stored. Another important addition is
+|	the member function PrintF, which accepts a format string and an arbitrary number of arguments, allowing a string
+|	to be built in a manner similar to the standard C function printf. Many operators are also provided for appending
+|	numbers to the ends of strings, an ability which is very useful for producing default labels (e.g. taxon1, taxon2,
+|	etc.).
+*/
+class NxsString
+  : public string	// NOTE(review): unqualified `string' presumably resolves to std::string through an included header -- confirm
+	{
+	public:
+
+		class NxsX_NotANumber {};	/* exception thrown if attempt to convert string to a number fails */
+
+		enum CmpEnum				/* enum that is used to specify string comparison modes */
+			{
+			respect_case,		
+			no_respect_case, 
+			abbrev
+			};
+
+							NxsString();
+							NxsString(const char *s);
+							NxsString(const NxsString &s);
+
+		//	Accessors
+		//
+		bool				Abbreviates(const NxsString &s, NxsString::CmpEnum mode = NxsString::no_respect_case) const;
+		unsigned			ConvertToUnsigned() const;	// UINT_MAX if negative or too large; throws NxsX_NotANumber
+		int					ConvertToInt() const;	// saturates at INT_MAX / -INT_MAX; throws NxsX_NotANumber
+		long				ConvertToLong() const;	// throws NxsX_NotANumber
+		double				ConvertToDouble() const;	// DBL_MAX / -DBL_MAX if out of range; throws NxsX_NotANumber
+		bool				Equals(const NxsString &s, NxsString::CmpEnum mode = respect_case) const;
+		bool				EqualsCaseInsensitive(const NxsString &s) const;
+		NxsString			GetQuoted() const;	// returns a copy on which AddQuotes has been called
+		bool				IsADouble() const;
+		bool				IsALong() const;
+		bool				IsCapAbbreviation(const NxsString &s) const;
+		bool				IsInVector(const NxsStringVector &s, NxsString::CmpEnum mode = respect_case) const;
+		bool				IsStdAbbreviation(const NxsString &s, bool respectCase) const;
+		bool				IsNexusPunctuation(const char c) const;
+		bool				QuotesNeeded() const;
+		NxsString 			UpperCasePrefix() const;
+		friend ostream		&operator<<(std::ostream &out, const NxsString &s);
+
+		//	Modifiers
+		//
+		//NxsString		   &operator=(const NxsString &s);
+		NxsString			&operator=(char);
+		NxsString			&operator=(const char *s);
+		NxsString			&operator+=(const char *s);
+		NxsString			&operator+=(const NxsString &s);
+		NxsString			&operator+=(const char c);
+		NxsString			&operator+=(const int i);
+		NxsString			&operator+=(unsigned i);
+		NxsString			&operator+=(unsigned long i);
+		NxsString			&operator+=(const long l);
+		NxsString			&operator+=(const double d);
+		NxsString			&operator+=(const IndexSet &d);
+		NxsString			&operator<<(int i);
+		NxsString			&operator<<(unsigned i);
+		NxsString			&operator<<(long l);
+		NxsString			&operator<<(unsigned long l);
+		NxsString			&operator<<(double d);
+		NxsString			&operator<<(const char *c);
+		NxsString			&operator<<(char c);
+		NxsString			&operator<<(const NxsString &s);
+		NxsString			&operator<<(const IndexSet &s);
+		NxsString			&operator<<(Indent) {return *this;}	//@temp need a system for handling indentation (currently a no-op)
+		NxsString			&operator<<(NxsString &(*funcPtr)(NxsString	&));
+
+		// Functions that should be in base class string but aren't
+		void				clear();	// implemented by calling erase()
+
+		int					PrintF(const char *formatStr, ...);	// printf-style; appends the formatted text to the stored string
+
+		unsigned char		*p_str(unsigned char *) const;
+
+		NxsString			&AddQuotes();
+		NxsString 			&AddTail(char c, unsigned n);
+		NxsString			&NumberThenWord(unsigned i, NxsString s);
+		NxsString 			&ShortenTo(unsigned n);	// truncates to `n' characters, the last three being "..."
+		NxsString			&AppendDouble(unsigned minFieldFormat, unsigned precFormat, double x);
+		NxsString 			&Capitalize();
+
+		NxsString 			&RightJustifyString(const NxsString &s, unsigned w, bool clear_first = false);
+		NxsString 			&RightJustifyLong(long x, unsigned w, bool clear_first = false);
+		NxsString 			&RightJustifyDbl(double x, unsigned w, unsigned p, bool clear_first = false);
+
+		NxsString 			&ToLower();
+		NxsString 			&ToUpper();
+
+		NxsString 			&BlanksToUnderscores();
+		NxsString 			&UnderscoresToBlanks();
+
+		//	Debugging
+		//	
+		static NxsString 	ToHex(long p, unsigned nFours);	// `nFours' is the number of hexadecimal digits produced
+	};
+
+#if defined (NXS_SUPPORT_OLD_NAMES)
+	typedef NxsString nxsstring;
+#endif
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Function object (Unary Predicate functor) that stores one string. The ()(const NxsString &) operator then returns the 
+|	result of a case-insensitive compare. Useful for STL find algorithms. Could be made faster than sequential case 
+|	insensitive comparisons, because the string stored in the object is just capitalized once.
+*/
+class NStrCaseInsensitiveEquals 
+	{
+	public :
+
+					NStrCaseInsensitiveEquals(const NxsString &s);	// stores a copy of `s' on which Capitalize() has been called
+		bool		operator()(const NxsString &s);	// compares a capitalized copy of `s' with the stored string; note: not declared const
+		
+	protected :
+
+		NxsString	compStr;	// the reference string, already passed through Capitalize()
+	};
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Function object (Unary Predicate functor) that stores one string. The ()(const NxsString &) operator then returns the 
+|	result of a case-sensitive compare. Useful for STL find algorithms.
+*/
+class NStrCaseSensitiveEquals 
+	{
+	public :
+
+					NStrCaseSensitiveEquals(const NxsString &s);	// stores an unmodified copy of `s'
+		bool		operator()(const NxsString &s) const;	// true if `s' compares equal to the stored string
+
+	protected :
+
+		NxsString	compStr;	// the reference string every argument is compared against
+	};
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Binary function class that performs case-Insensitive string compares.
+*/
+struct NxsStringEqual
+  : public binary_function<NxsString, NxsString, bool>	// NOTE(review): std::binary_function was deprecated in C++11 and removed in C++17 -- confirm the targeted standard
+	{
+	bool operator()(const NxsString &x, const NxsString &y) const;	// true if `x' and `y' are equal ignoring case (forwards to EqualsCaseInsensitive)
+	};
+
+// ############################# start NStrCaseInsensitiveEquals functions ##########################
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Creates a function object for case-insensitive comparisons of `s' to a container of strings. 
+*/
+inline NStrCaseInsensitiveEquals::NStrCaseInsensitiveEquals(
+  const NxsString &s)	/* the reference string that later arguments will be compared against */
+  : compStr(s)
+	{
+	// Normalize the stored copy once, up front, so that each later comparison only has to normalize its argument
+	//
+	compStr.Capitalize();
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns the result of a case-insensitive compare of `s' and the string stored when the NStrCaseInsensitiveEquals object  
+|	was created. Could be made more efficient (currently capitalizes the entire argument even though the first character may 
+|	be wrong).
+*/
+inline bool NStrCaseInsensitiveEquals::operator()(
+  const NxsString &s)	/* the string to be compared */
+	{
+	// Strings of different length cannot match, so skip the copy in that case
+	//
+	if (s.length() != compStr.length())
+		return false;
+
+	// Apply to a copy of the argument the same Capitalize() call that was applied to the stored string
+	//
+	NxsString normalized(s);
+	normalized.Capitalize();
+	return normalized == compStr;
+	}
+
+// ############################# start NStrCaseSensitiveEquals functions ##########################
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Creates a function object for case-sensitive comparisons of `s' to a container of strings. 
+*/
+inline NStrCaseSensitiveEquals::NStrCaseSensitiveEquals(
+  const NxsString &s)	/* the string that all other strings will be compared to when the (const NxsString &) operator is called */  
+  : compStr(s)
+	{
+	// The reference string is stored unmodified
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns the result of a case-sensitive compare of `s' and the string stored when the NStrCaseSensitiveEquals was 
+|	created.
+*/
+inline bool NStrCaseSensitiveEquals::operator()(
+  const NxsString &s)	/* the string to be compared */
+  const
+	{
+	// Exact, character-for-character comparison with the stored string
+	//
+	const bool identical = (compStr == s);
+	return identical;
+	}
+	
+// ############################# start NxsStringEqual functions ##########################
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if the strings `x' and `y' are identical (NOT case sensitive)
+*/
+inline bool NxsStringEqual::operator()(
+  const NxsString &x,	/* first string */
+  const NxsString &y)	/* second string to be compared with `x' */
+  const
+	{
+	// Delegate to the member function that ignores case
+	//
+	const bool sameIgnoringCase = x.EqualsCaseInsensitive(y);
+	return sameIgnoringCase;
+	}
+
+// ############################# start NxsString functions ##########################
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	The default constructor.
+*/
+inline NxsString::NxsString()
+  : string()	// default-constructs the base class, giving an empty string
+	{
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns a single-quoted version of the NxsString. The calling object is not altered. Written for ease of use. Simply 
+|	copies the stored string, then returns the copy after calling its AddQuotes function.
+*/
+inline NxsString NxsString::GetQuoted()
+  const
+	{
+	// Work on a copy so that the calling object is left untouched
+	//
+	NxsString quoted;
+	quoted.assign(*this);
+	quoted.AddQuotes();
+	return quoted;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Most containers in the standard template library can be completely erased using the clear function, but none is 
+|	provided for the class string and hence is provided here.
+*/
+inline void NxsString::clear()
+	{
+	// Remove every character, leaving an empty string
+	//
+	erase(begin(), end());
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns true if the Equals comparison function is true for this or any element in the vector `s'.
+*/
+inline bool NxsString::IsInVector(
+  const NxsStringVector &s, /* the vector of NxsString objects to be searched */
+  NxsString::CmpEnum mode)	/* the comparison mode handed to Equals for every element examined */
+  const	
+	{
+	// Advance until Equals reports a match or the vector is exhausted
+	//
+	NxsStringVector::const_iterator candidate = s.begin();
+	while (candidate != s.end() && !Equals(*candidate, mode))
+		++candidate;
+
+	// Stopping before the end means a match was found
+	//
+	return candidate != s.end();
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	A constructor taking a C-string argument (a converting constructor rather than a copy constructor).
+*/
+inline NxsString::NxsString(
+  const char *s)	/* the null-terminated C-string whose characters initialize the new NxsString object */
+  : string(s)
+	{
+	// The base class constructor copies the characters of `s'
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	A copy constructor taking a NxsString reference argument.
+*/
+inline NxsString::NxsString(
+  const NxsString &s)	/* reference to a NxsString to be used to create this copy */
+  : string(s)
+	{
+	// The base class copy constructor duplicates the characters of `s'
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Assignment from a C-string: replaces the stored string with a copy of `s' and returns a reference to this object.
+*/
+inline NxsString &NxsString::operator=(
+  const char *s)	/* the C-string whose characters are to be stored */
+	{
+	this->assign(s);
+	return *this;
+	}
+	
+//inline NxsString& NxsString::operator=(
+//  const NxsString &s)
+//	{
+//	assign(s);
+//	return *this;
+//	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Appends the characters of the supplied C-string `s' to the stored string and returns a reference to this object.
+*/
+inline NxsString &NxsString::operator+=(
+  const char *s)	/* the C-string to be appended */
+	{
+	append(s);
+	return *this;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Appends the characters held by the NxsString `s' to the stored string and returns a reference to this object.
+*/
+inline NxsString &NxsString::operator+=(
+  const NxsString &s)	/* the string to append */
+	{
+	this->append(s);
+	return *this;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Appends the character `c' to the stored string; a null character ('\0') appends nothing.
+*/
+inline NxsString &NxsString::operator+=(
+  const char c)	/* the character to append */
+	{
+	// A null character is deliberately skipped so that the stored string never gains an embedded '\0'
+	//
+	if (c != '\0')
+		append(1, c);
+	return *this;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Replaces the stored string with the single character `c': the string is emptied and `c' is then appended with +=.
+*/
+inline NxsString &NxsString::operator=(
+  char c)	/* the character to which the stored string should be set */
+	{
+	this->clear();
+	return this->operator+=(c);
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Appends the decimal text of the integer `i' to the stored string, formatted by the standard C sprintf function with
+|	format code %d. For example, if the stored string is "taxon" and `i' is 9, the result is "taxon9".
+*/
+inline NxsString &NxsString::operator+=(
+  const int i)	/* the int to append */
+	{
+	char digits[81];
+	sprintf(digits, "%d", i);
+	this->append(digits);
+	return *this;
+	}
+
+/*-------------------------------------------------------------------------------------------------------------------------- 
+|	Convenience wrapper that calls ToUpper on the stored string and returns a reference to this object.
+*/
+inline NxsString &NxsString::Capitalize()
+	{
+	this->ToUpper();
+	return *this;
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns true if the stored string is an abbreviation (or complete copy) of the supplied string `s'.
+*/
+inline bool NxsString::Abbreviates(
+  const NxsString	&s,		/* the full comparison string */
+  NxsString::CmpEnum	mode)	/* abbrev selects IsCapAbbreviation; any other value selects IsStdAbbreviation, which respects case only for respect_case */
+  const
+	{
+	// The capitalization-based test applies only in abbrev mode; every other mode
+	// uses the standard test, which is told whether case is to be respected
+	const bool capsMode = (mode == NxsString::abbrev);
+	return capsMode ? IsCapAbbreviation(s) : IsStdAbbreviation(s, mode == respect_case);
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Appends the decimal text of the unsigned integer `i' to the stored string, formatted by sprintf with code %u.
+*/
+inline NxsString& NxsString::operator+=(
+  unsigned i)	/* the integer to be appended */
+	{
+	char digits[81];
+	sprintf(digits, "%u", i);
+	this->append(digits);
+	return *this;
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Appends the decimal text of the long integer `l' to the stored string, formatted by sprintf with code %ld.
+*/
+inline NxsString& NxsString::operator+=(
+  const long l)	/* the long integer to be appended */
+	{
+	char digits[81];
+	sprintf(digits, "%ld", l);
+	this->append(digits);
+	return *this;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Appends the decimal text of the unsigned long integer `l' to the stored string, formatted by sprintf with code %lu.
+*/
+inline NxsString& NxsString::operator+=(
+  const unsigned long l)	/* the unsigned long integer to be appended */
+	{
+	char digits[81];
+	sprintf(digits, "%lu", l);
+	this->append(digits);
+	return *this;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Compares this string with `s' using the comparison function selected by `mode' and returns that function's result.
+*/
+inline bool NxsString::Equals(
+  const NxsString &s,		/* the string to which *this is compared */
+  NxsString::CmpEnum mode)	/* should be one of these three: respect_case, no_respect_case or abbrev */
+  const	
+	{
+	if (mode == NxsString::respect_case)
+		return (strcmp(c_str(), s.c_str()) == 0);
+	if (mode == NxsString::no_respect_case)
+		return EqualsCaseInsensitive(s);
+	if (mode == NxsString::abbrev)
+		return IsCapAbbreviation(s);
+
+	// Any other value is an incorrect setting for mode: trip the assertion in
+	// debug builds and report "not equal" when assertions are compiled out
+	assert(0);
+	return false;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Applies the function `funcPtr' to this string and returns the reference it returns, which lets functions such as
+|	the NxsString endl function be placed in a series of << operators.
+*/
+inline NxsString &NxsString::operator<<(
+  NxsString &(*funcPtr)(NxsString &))	/* pointer to a function returning a reference to a NxsString */
+	{
+	return (*funcPtr)(*this);
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Returns true if `c' is any Nexus punctuation character (the null character is never treated as punctuation):
+|>
+|	()[]{}/\,;:=*'"`-+<>
+|>
+*/
+inline bool NxsString::IsNexusPunctuation(
+  const char c)	/* the character in question */
+  const
+	{
+	return (c != '\0' && strchr("()[]{}/\\,;:=*\'\"`-+<>", c) != 0);	// strchr alone would match the list's own terminator
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Replaces the stored string with the number `i', a space and the word `s', adding a trailing 's' unless `i' is 1
+|	(giving e.g. "0 characters", "1 character", "2 characters"), and returns a reference to this string.
+*/
+inline NxsString &NxsString::NumberThenWord(
+  unsigned i,			/* the number */
+  const NxsString s)	/* the string needing to be pluralized */
+	{
+	erase();
+	*this << i;
+	*this << ' ';
+	*this << s;
+	if (i != 1)
+		*this << 's';
+	return *this;
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Stream-style spelling of += for an int, so that a NxsString can be written to like an ostream.
+*/
+inline NxsString &NxsString::operator<<(
+  int i)	/* the integer to append */
+	{
+	return operator+=(i);
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Stream-style spelling of += for an unsigned; it uses the unsigned overload so values above INT_MAX print correctly.
+*/
+inline NxsString &NxsString::operator<<(
+  unsigned i)	/* the unsigned integer to append */
+	{
+	return (*this += i);
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Stream-style spelling of += for a long, so that a NxsString can be written to like an ostream.
+*/
+inline NxsString &NxsString::operator<<(
+  long l)	/* the long integer to append */
+	{
+	return operator+=(l);
+	}	
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Stream-style spelling of += for an unsigned long, so that a NxsString can be written to like an ostream.
+*/
+inline NxsString &NxsString::operator<<(
+  unsigned long l)	/* the unsigned long integer to append */
+	{
+	return operator+=(l);
+	}	
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Stream-style spelling of += for a double, so that a NxsString can be written to like an ostream.
+*/
+inline NxsString &NxsString::operator<<(
+  double d)	/* the double floating point value to append */
+	{
+	return operator+=(d);
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Stream-style spelling of += for a C-string, so that a NxsString can be written to like an ostream.
+*/
+inline NxsString &NxsString::operator<<(
+  const char *c)	/* the C-string to append */
+	{
+	return operator+=(c);
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Stream-style spelling of += for a single character, so that a NxsString can be written to like an ostream.
+*/
+inline NxsString &NxsString::operator<<(
+  char c)	/* the char to append */
+	{
+	return operator+=(c);
+	}
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Stream-style spelling of += for another NxsString, so that a NxsString can be written to like an ostream.
+*/
+inline NxsString &NxsString::operator<<(
+  const NxsString &s)	/* the NxsString to append */
+	{
+	return operator+=(s);
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Writes the string into `buffer' in Pascal form (length byte, then at most 255 characters) and returns `buffer'.
+*/
+inline unsigned char *NxsString::p_str(
+  unsigned char *buffer)	/* buffer to receive current string in Pascal form (i.e. length in first byte) */
+  const
+	{
+	buffer[0] = (unsigned char)(length() > 255 ? 255 : length());	// one length byte caps a Pascal string at 255 characters
+	memmove(buffer + 1, c_str(), buffer[0]);	// copy only as many characters as the length byte records
+	return buffer;
+	}
+
+// ############################# start of standalone functions ##########################
+
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Appends a newline character to the string `s' and returns a reference to `s'. Intended for use with the NxsString
+|	<< operator that accepts a function pointer, so that strings can be written to like ostreams.
+*/
+inline NxsString &endl(
+  NxsString &s)	/* the string to which the newline character is to be appended */
+	{
+	return s.operator+=('\n');
+	}
+	
+/*--------------------------------------------------------------------------------------------------------------------------
+|	Writes the string `s' to the ostream `out' and returns `out' so that insertions can be chained.
+*/
+inline ostream &operator<<(
+  ostream &out,			/* the stream to which the string `s' is to be written */
+  const NxsString &s)	/* the string to write */
+	{
+	// The text is inserted through its C-string form
+	return (out << s.c_str());
+	}
+
+NxsStringVector 	BreakPipeSeparatedList(const NxsString &strList);
+NxsStringVector 	GetVecOfPossibleAbbrevMatches(const NxsString &testStr,const NxsStringVector &possMatches);
+bool 				SetToShortestAbbreviation(NxsStringVector &strVec, bool allowTooShort = false);
+
+#endif
diff --git a/ncl/nxstaxablock.cpp b/ncl/nxstaxablock.cpp
new file mode 100644
index 0000000..51ff10f
--- /dev/null
+++ b/ncl/nxstaxablock.cpp
@@ -0,0 +1,352 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Default constructor: the block starts out with no taxa (ntax is 0) and carries the block identifier "TAXA".
+*/
+NxsTaxaBlock::NxsTaxaBlock()
+  : NxsBlock(),
+  ntax(0)
+	{
+	id = "TAXA";
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Destructor: empties the taxonLabels vector.
+*/
+NxsTaxaBlock::~NxsTaxaBlock()
+	{
+	taxonLabels.clear();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads everything following the block name (which is read by the NxsReader object) up to the END or ENDBLOCK
+|	statement, handling the DIMENSIONS and TAXLABELS commands and skipping any other command. Throws NxsException on
+|	malformed input, including a TAXLABELS command that ends before NTAX labels were read. Overrides the base class.
+*/
+void NxsTaxaBlock::Read(
+  NxsToken &token)	/* the token used to read from in */
+	{
+	ntax				= 0;
+	int nominal_ntax	= 0;
+	isEmpty				= false;
+	isUserSupplied		= true;
+
+	// This should be the semicolon after the block name
+	token.GetNextToken();
+
+	if (!token.Equals(";")) 
+		{
+		errormsg = "Expecting ';' after TAXA block name, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	for (;;)
+		{
+		token.GetNextToken();
+
+		if (token.Equals("DIMENSIONS"))
+			{
+			// This should be the NTAX keyword
+			token.GetNextToken(); 
+
+			if (!token.Equals("NTAX"))
+				{
+				errormsg = "Expecting NTAX keyword, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the equals sign
+			token.GetNextToken(); 
+
+			if (!token.Equals("=")) 
+				{
+				errormsg = "Expecting '=', but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the number of taxa (atoi yields 0 for non-numeric text, which is rejected below)
+			token.GetNextToken();
+
+			nominal_ntax = atoi(token.GetToken().c_str());
+			if (nominal_ntax <= 0)
+				{
+				errormsg = "NTAX should be greater than zero (";
+				errormsg += token.GetToken();
+				errormsg += " was specified)";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			// This should be the terminating semicolon
+			token.GetNextToken(); 
+
+			if (!token.Equals(";"))
+				{
+				errormsg = "Expecting ';' to terminate DIMENSIONS command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			}	// if (token.Equals("DIMENSIONS"))
+
+		else if (token.Equals("TAXLABELS")) 
+			{
+			if (nominal_ntax <= 0) 
+				{
+				errormsg = "NTAX must be specified before TAXLABELS command";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+
+			for (unsigned i = 0; (int)i < nominal_ntax; i++)
+				{
+				token.GetNextToken();
+				// A ';' or end of file here means fewer labels than NTAX were given; do not store it as a label
+				if (token.AtEOF() || token.Equals(";"))
+					{
+					errormsg = "TAXLABELS command ended before NTAX taxon labels were read";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+				AddTaxonLabel(token.GetToken());
+				}
+
+			// This should be terminating semicolon
+			//
+			token.GetNextToken(); 
+
+			if (!token.Equals(";"))
+				{
+				errormsg = "Expecting ';' to terminate TAXLABELS command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			}	// if (token.Equals("TAXLABELS")) 
+
+		else if (token.Equals("END") || token.Equals("ENDBLOCK"))
+			{
+			// Get the semicolon following END
+			//
+			token.GetNextToken();
+
+			if (!token.Equals(";"))
+				{
+				errormsg = "Expecting ';' to terminate the ENDBLOCK command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			break;
+			}	// if (token.Equals("END") || token.Equals("ENDBLOCK"))
+
+		else
+			{
+			SkippingCommand(token.GetToken());
+			do
+				{
+				token.GetNextToken();
+				}
+			while (!token.AtEOF() && !token.Equals(";"));
+
+			if (token.AtEOF())
+				{
+				errormsg = "Unexpected end of file encountered";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			}	// token not END, ENDBLOCK, TAXLABELS, or DIMENSIONS
+		}	// GetNextToken loop
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Writes a brief report of this taxa block to `out': a blank line, the block id with the number of taxa it holds, and
+|	then one tab-indented line per taxon giving its 1-based number and its label. Overrides the base class function.
+*/
+void NxsTaxaBlock::Report(
+  ostream &out)	/* the output stream to which to write the report */
+	{
+	out << endl << id << " block contains ";
+
+	switch (ntax)
+		{
+		case 0 :
+			// Nothing to list for an empty block
+			//
+			out << "no taxa" << endl;
+			return;
+		case 1 :
+			out << "one taxon" << endl;
+			break;
+		default :
+			out << ntax << " taxa" << endl;
+		}
+
+	// One line per taxon: number (counting from 1), then label
+	for (unsigned taxon = 0; taxon < ntax; ++taxon)
+		out << '\t' << (taxon + 1) << '\t' << taxonLabels[taxon] << endl;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the block to its empty state in preparation for reading a new TAXA block.
+*/
+void NxsTaxaBlock::Reset()
+	{
+	// Drop the stored taxa first, then restore the block-level flags and clear the error message
+	taxonLabels.clear();
+	needsQuotes.clear();
+	ntax			= 0;
+	isUserSupplied	= false;
+	isEnabled		= true;
+	isEmpty			= true;
+	errormsg.clear();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Appends taxon label `s' to the list of taxon labels, records whether it needs quoting, marks the block as non-empty
+|	and increments ntax by 1. Returns the value ntax held before the increment, i.e. the index of the label just added.
+*/
+unsigned NxsTaxaBlock::AddTaxonLabel(
+  NxsString s)	/* the taxon label to add */
+	{
+	isEmpty = false;
+
+	// The flag and the label are pushed together so that needsQuotes[k]
+	// always describes taxonLabels[k]
+	//
+	needsQuotes.push_back(s.QuotesNeeded() ? true : false);
+	taxonLabels.push_back(s);
+
+	return ntax++;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Replaces the label of taxon `i' with `s' and updates the matching needsQuotes entry. `i' must be a valid index.
+*/
+void NxsTaxaBlock::ChangeTaxonLabel(
+  unsigned i,	/* the taxon label number to change */
+  NxsString s)	/* the string used to replace label i */
+	{
+	assert(i < (unsigned)taxonLabels.size());
+
+	// Keep the quoting flag in step with the replacement label
+	//
+	const bool quoted = (s.QuotesNeeded() ? true : false);
+
+	needsQuotes[i] = quoted;
+	taxonLabels[i] = s;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the length of the longest of the first ntax taxon labels, or 0 if there are none. Useful for formatting
+|	purposes when outputting the data matrix (i.e., you want the left edge of the matrix to line up).
+*/
+unsigned NxsTaxaBlock::GetMaxTaxonLabelLength()
+	{
+	assert(ntax == (unsigned)taxonLabels.size());
+
+	unsigned longest = 0;
+	unsigned k = 0;
+	while (k < ntax)
+		{
+		const unsigned len = taxonLabels[k++].size();
+		longest = (len > longest ? len : longest);
+		}
+	return longest;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns a copy of the label stored for taxon `i'. `i' must be less than the number of stored labels.
+*/
+NxsString NxsTaxaBlock::GetTaxonLabel(
+  unsigned i)	/* the taxon label number to return */
+	{
+	// `i' is unsigned, so only the upper bound needs checking
+	assert(i < (unsigned)taxonLabels.size());
+
+	return taxonLabels[i];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the needsQuotes flag recorded for taxon `i', i.e. whether QuotesNeeded reported true for that label when it
+|	was added or last changed, meaning it should be quoted when output in NEXUS format.
+*/
+bool NxsTaxaBlock::NeedsQuotes(
+  unsigned i)	/* the taxon label number in question */
+	{
+	// `i' is unsigned, so only the upper bound needs checking
+	assert(i < (unsigned)taxonLabels.size());
+
+	return needsQuotes[i];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if a taxon label equal to `s' is present in the taxonLabels list, and false otherwise.
+*/
+bool NxsTaxaBlock::IsAlreadyDefined(
+  NxsString s)	/* the s to attempt to find in the taxonLabels list */
+	{
+	// find returns the end iterator when no element compares equal to `s'
+	//
+	return (find(taxonLabels.begin(), taxonLabels.end(), s) != taxonLabels.end());
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the index of the first label in the taxonLabels list that equals `s'. If no label matches, or if the list
+|	holds no labels at all, throws a NxsX_NoSuchTaxon exception.
+*/
+unsigned NxsTaxaBlock::FindTaxon(
+  NxsString s)	/* the string to attempt to find in the taxonLabels list */
+	{
+	const NxsStringVector &labels = taxonLabels;
+	const unsigned nLabels = (unsigned)labels.size();
+
+	// Linear scan; the first label equal to `s' wins
+	//
+	for (unsigned k = 0; k < nLabels; ++k)
+		{
+		if (labels[k] == s)
+			return k;
+		}
+
+	// Either the list is empty or no label matched
+	throw NxsTaxaBlock::NxsX_NoSuchTaxon();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the number of taxon labels currently held in the taxonLabels vector.
+*/
+unsigned NxsTaxaBlock::GetNumTaxonLabels()
+	{
+	return static_cast<unsigned>(taxonLabels.size());
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Overwrites ntax with `n'; the taxonLabels and needsQuotes vectors are left as they are.
+*/
+void NxsTaxaBlock::SetNtax(
+  unsigned n)	/* the number of taxa */
+	{
+	this->ntax = n;
+	}
diff --git a/ncl/nxstaxablock.h b/ncl/nxstaxablock.h
new file mode 100644
index 0000000..74080c7
--- /dev/null
+++ b/ncl/nxstaxablock.h
@@ -0,0 +1,69 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+#ifndef NCL_NXSTAXABLOCK_H
+#define NCL_NXSTAXABLOCK_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This class handles reading and storage for the NxsReader block TAXA. It overrides the member functions Read and 
+|	Reset, which are abstract virtual functions in the base class NxsBlock. The taxon names are stored in a vector of
+|	strings (taxonLabels) that is accessible through the member functions GetTaxonLabel(unsigned), 
+|	AddTaxonLabel(NxsString), ChangeTaxonLabel(unsigned, NxsString), and GetNumTaxonLabels().
+*/
+class NxsTaxaBlock
+  : public NxsBlock
+	{
+	friend class NxsDataBlock;
+	friend class NxsAllelesBlock;
+	friend class NxsCharactersBlock;
+	friend class NxsDistancesBlock;
+
+	public:
+							NxsTaxaBlock();	/* sets ntax to 0 and id to "TAXA" */
+		virtual				~NxsTaxaBlock();	/* erases the taxonLabels vector */
+
+		virtual unsigned	AddTaxonLabel(NxsString s);	/* appends label s and returns its index */
+		void  				ChangeTaxonLabel(unsigned i, NxsString s);	/* replaces label i with s */
+		unsigned			FindTaxon(NxsString label);	/* index of label; throws NxsX_NoSuchTaxon if absent */
+		bool  				IsAlreadyDefined(NxsString label);	/* true if label is in taxonLabels */
+		unsigned			GetMaxTaxonLabelLength();	/* length of the longest stored label */
+		unsigned			GetNumTaxonLabels();	/* number of labels in taxonLabels */
+		NxsString 			GetTaxonLabel(unsigned i);	/* copy of label i */
+		bool 				NeedsQuotes(unsigned i);	/* needsQuotes flag of label i */
+		virtual void		Report(ostream &out);	/* writes a brief summary of the block to out */
+		virtual void 		Reset();	/* clears the labels and sets ntax to 0 */
+
+		class NxsX_NoSuchTaxon {};	/* thrown if FindTaxon cannot locate a supplied taxon label in the taxonLabels vector */
+
+	protected:
+		unsigned		ntax;			/* number of taxa */
+		NxsStringVector	taxonLabels;	/* storage for list of taxon labels */
+		NxsBoolVector 	needsQuotes;	/* needsQuotes[i] true if label i needs to be quoted when output */
+
+		virtual void 	Read(NxsToken &token);	/* parses the block's commands up to END or ENDBLOCK */
+
+	private:
+		void 			SetNtax(unsigned n);	/* sets ntax to n without touching the label vectors */
+	};
+
+// The following typedef maintains compatibility with existing code.
+// The TaxaBlock class name is deprecated; please use NxsTaxaBlock instead.
+//
+typedef NxsTaxaBlock TaxaBlock;
+
+#endif
diff --git a/ncl/nxstoken.cpp b/ncl/nxstoken.cpp
new file mode 100644
index 0000000..90a1deb
--- /dev/null
+++ b/ncl/nxstoken.cpp
@@ -0,0 +1,622 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Binds the istream reference data member in to the supplied istream `i' and puts the token into its starting state:
+|	atEOF and atEOL are false, comment is the empty string, filecol and fileline are 1, filepos is 0, labileFlags is 0,
+|	and saved and special are the null character. Also fills the whitespace and punctuation character lists, each of
+|	which is stored as a null-terminated array.
+*/
+NxsToken::NxsToken(
+  istream &i)	/* the istream object to which the token is to be associated */
+  : in(i)
+	{
+	// Nothing has been read yet, so the position counters start at the top of the stream
+	//
+	filepos		= 0L;
+	fileline	= 1L;
+	filecol		= 1L;
+	atEOF		= false;
+	atEOL		= false;
+
+	// No comment text gathered, no flags set and no character held over
+	//
+	comment.clear();
+	labileFlags	= 0;
+	saved		= '\0';
+	special		= '\0';
+
+	// Whitespace characters: space, tab and newline. sizeof counts the literal's
+	// terminating null character, so the loop copies the terminator as well.
+	//
+	static const char whitespaceChars[] = " \t\n";
+	for (unsigned w = 0; w < sizeof(whitespaceChars); ++w)
+		{
+		whitespace[w] = whitespaceChars[w];
+		}
+
+	// Punctuation characters, copied together with the terminating null character.
+	// This is the same set of characters that NxsString::IsNexusPunctuation tests
+	// for, although '+' and '-' appear here in the opposite order.
+	//
+	static const char punctuationChars[] = "()[]{}/\\,;:=*'\"`+-<>";
+	for (unsigned p = 0; p < sizeof(punctuationChars); ++p)
+		{
+		punctuation[p] = punctuationChars[p];
+		}
+
+	// All members are now initialized; no characters are read from `in' here
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Destructor with an empty body: it performs no explicit cleanup of its own.
+*/
+NxsToken::~NxsToken()
+	{
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads rest of comment (starting '[' already input) and acts accordingly. Nested comments are tracked by bracket 
+|	depth, so reading stops at the ']' matching the opening '['. If the comment is an output comment (first character
+|	'!'), its text is gathered and passed to OutputComment. If it is a command comment (first character '&') and the
+|	labileFlags bit saveCommandComments is set, its text (without the outer square brackets) is appended to token.
+*/
+void NxsToken::GetComment()
+	{
+	// Set comment level to 1 initially.  Every ']' encountered reduces
+	// level by one, so that we know we can stop when level becomes 0.
+	//
+	int level = 1;
+
+	// Get first character
+	char ch = GetNextChar();
+	if (atEOF)
+		{
+		errormsg = "Unexpected end of file inside comment";
+		throw NxsException( errormsg, GetFilePosition(), GetFileLine(), GetFileColumn());
+		}
+
+	// See if first character is the output comment symbol ('!')
+	// or command comment symbol (&)
+	//
+	int printing = 0;
+	int command = 0;
+	if (ch == '!')
+		printing = 1;
+	else if (ch == '&' && labileFlags & saveCommandComments)
+		{
+		command = 1;
+		AppendToToken(ch);
+		}
+	else if (ch == ']')
+		return;
+	else if (ch == '[')
+		level++;	// a nested comment that opens immediately after the outer '[' must be counted too
+
+	// Now read the rest of the comment (end of file inside the loop ends the comment without an error)
+	for(;;)
+		{
+		ch = GetNextChar();
+		if (atEOF)
+			break;
+
+		if (ch == ']')
+			level--;
+		else if (ch == '[')
+			level++;
+
+		if (level == 0)
+			break;
+
+		if (printing)
+			AppendToComment(ch);
+		else if (command)
+			AppendToToken(ch);
+		}
+
+	if (printing)
+		{
+		// Allow output comment to be printed or displayed in most appropriate
+		// manner for target operating system
+		//
+		OutputComment(comment);
+
+		// NOTE(review): the statement that emptied `comment' after output is disabled below, so text from
+		// successive output comments may accumulate unless it is cleared elsewhere -- confirm this is intended
+		//comment;
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Appends to token the rest of a curly-bracketed phrase (the starting '{' has already been input), up to and including
+|	the matching '}' character. Nested curly-bracketed phrases are included. Reading also stops at end of file.
+*/
+void NxsToken::GetCurlyBracketedToken()
+	{
+	// depth counts the '{' characters still awaiting their '}'; it starts at 1
+	// because the opening bracket has already been consumed
+	//
+	int depth = 1;
+
+	while (depth > 0)
+		{
+		const char next = GetNextChar();
+		if (atEOF)
+			break;
+
+		switch (next)
+			{
+			case '{' : depth++; break;
+			case '}' : depth--; break;
+			default  : break;
+			}
+
+		// Brackets are part of the token, including the final matching '}'
+		//
+		AppendToToken(next);
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Gets remainder of a double-quoted NEXUS word (the first double quote character was read in already by GetNextToken).
+|	Characters are appended to token until the next double quote (or end of file) is encountered; the closing double
+|	quote itself is not stored. Tandem double quotes are therefore not an escape: they end the current word and begin
+|	the next double-quoted word. Single quotes receive no special treatment here, so tandem single quotes are saved as
+|	two separate single quote characters and a lone single quote is saved as itself.
+|	No other character is interpreted specially inside the word.
+*/
+void NxsToken::GetDoubleQuotedToken()
+	{
+	for(;;)
+		{
+		const char next = GetNextChar();
+
+		// Stop at end of file, or at the closing double quote, which is
+		// consumed from the input but not added to the token
+		//
+		if (atEOF || next == '\"')
+			break;
+
+		// Anything else belongs to the word
+		AppendToToken(next);
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Gets remainder of a quoted NEXUS word (the first single quote character was read in already by GetNextToken). This
+|	function reads characters until the next single quote is encountered. An exception occurs if two single quotes occur
+|	one after the other, in which case the function continues to gather characters until an isolated single quote is
+|	found. The tandem quotes are stored as a single quote character in the token NxsString.
+*/
+void NxsToken::GetQuoted()
+	{
+	for(;;)
+		{
+		const char c = GetNextChar();
+		if (atEOF)
+			return;
+
+		if (saved == '\'')
+			{
+			// The previous character was a single quote that has not been stored yet.
+			if (c != '\'')
+				{
+				// It was the closing quote: hand the current character back through
+				// `saved' so that it starts the next token, and stop.
+				//
+				saved = c;
+				return;
+				}
+
+			// Tandem single quotes collapse into one literal quote in the token.
+			//
+			AppendToToken(c);
+			saved = '\0';
+			}
+		else if (c == '\'' && saved == '\0')
+			{
+			// Hold the quote back until we know whether another one follows it.
+			//
+			saved = '\'';
+			}
+		else
+			AppendToToken(c);
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads rest of parenthetical token (starting '(' already input) up to and including the matching ')' character.  All
+|	nested parenthetical phrases will be included.
+*/
+void NxsToken::GetParentheticalToken()
+	{
+	// `depth' is the number of parentheses still open. The caller has already
+	// consumed the opening '(', so depth starts at 1 and the token is complete
+	// once depth drops back to 0.
+	int depth = 1;
+
+	while (depth > 0)
+		{
+		const char c = GetNextChar();
+
+		// Unbalanced input: stop at end of file and keep what was gathered so far.
+		if (atEOF)
+			return;
+
+		if (c == '(')
+			++depth;
+		else if (c == ')')
+			--depth;
+
+		// Parentheses are part of the token, including the final closing one.
+		//
+		AppendToToken(c);
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if token begins with the capitalized portion of `s' and, if token is longer than that capitalized
+|	portion, the remaining characters match the corresponding characters of `s' (a token longer than `s' never
+|	matches). The comparison is case insensitive. Use this function instead of Begins if you wish to allow
+|	abbreviations of commands while ensuring the user has not typed a word that does not correspond to any command.
+*/
+bool NxsToken::Abbreviation(
+  NxsString s)	/* the comparison string */
+	{
+	int k;
+	int slen = s.size();
+	int tlen = token.size();
+	char tokenChar, otherChar;
+
+	// mlen is the length of the "mandatory" portion of s, i.e. its leading run of upper-case
+	// characters. Characters are cast to unsigned char before being handed to the <cctype>
+	// functions because passing a negative char value to them is undefined behavior.
+	int mlen;
+	for (mlen = 0; mlen < slen; mlen++)
+		{
+		if (!isupper((unsigned char)s[mlen]))
+			break;
+		}
+
+	// User must have typed at least mlen characters in
+	// for there to even be a chance at a match
+	//
+	if (tlen < mlen)
+		return false;
+
+	// If user typed in more characters than are contained in s,
+	// then there must be a mismatch
+	//
+	if (tlen > slen)
+		return false;
+
+	// Check the mandatory portion for mismatches (s is upper case here by construction)
+	//
+	for (k = 0; k < mlen; k++)
+		{
+		tokenChar = (char)toupper((unsigned char)token[k]);
+		otherChar = s[k];
+		if (tokenChar != otherChar)
+			return false;
+		}
+
+	// Check the auxiliary portion for mismatches (if necessary)
+	//
+	for (k = mlen; k < tlen; k++)
+		{
+		tokenChar = (char)toupper((unsigned char)token[k]);
+		otherChar = (char)toupper((unsigned char)s[k]);
+		if (tokenChar != otherChar)
+			return false;
+		}
+
+	return true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if token NxsString begins with the NxsString `s'. This function should be used instead of the Equals 
+|	function if you wish to allow for abbreviations of commands.
+*/
+bool NxsToken::Begins(
+  NxsString s,			/* the comparison string */
+  bool respect_case)	/* determines whether comparison is case sensitive */
+	{
+	unsigned k;
+	char tokenChar, otherChar;
+
+	unsigned slen = s.size();
+	if (slen > token.size())
+		return false;	// a token shorter than s cannot begin with s
+
+	for (k = 0; k < slen; k++)
+		{
+		if (respect_case)
+			{
+			tokenChar = token[k];
+			otherChar = s[k];
+			}
+		else
+			{
+			tokenChar = (char)toupper((unsigned char)token[k]);
+			otherChar = (char)toupper((unsigned char)s[k]);
+			}
+		// (unsigned char casts above: toupper is undefined for negative char values)
+		if (tokenChar != otherChar)
+			return false;
+		}
+
+	return true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if token NxsString exactly equals `s'. If abbreviations are to be allowed, either Begins or 
+|	Abbreviation should be used instead of Equals.
+*/
+bool NxsToken::Equals(
+  NxsString s,			/* the string for comparison to the string currently stored in this token */
+  bool respect_case)	/* if true, comparison will be case-sensitive */
+	{
+	unsigned k;
+	char tokenChar, otherChar;
+
+	unsigned slen = s.size();
+	if (slen != token.size())
+		return false;	// strings of different length can never be equal
+
+	for (k = 0; k < token.size(); k++)
+		{
+		if (respect_case)
+			{
+			tokenChar = token[k];
+			otherChar = s[k];
+			}
+		else
+			{
+			tokenChar = (char)toupper((unsigned char)token[k]);	// unsigned char: toupper is
+			otherChar = (char)toupper((unsigned char)s[k]);		// undefined for negative values
+			}
+		if (tokenChar != otherChar)
+			return false;
+		}
+
+	return true;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads characters from in until a complete token has been read and stored in token. GetNextToken performs a number 
+|	of useful operations in the process of retrieving tokens:
+|~
+|	o any underscore characters encountered are stored as blank spaces (unless the labile flag bit preserveUnderscores
+|	  is set)
+|	o if the first character of the next token is an isolated single quote, then the entire quoted NxsString is saved 
+|	  as the next token
+|	o paired single quotes are automatically converted to single quotes before being stored
+|	o comments are handled automatically (normal comments are treated as whitespace and output comments are passed to 
+|	  the function OutputComment which does nothing in the NxsToken class but can be overridden in a derived class to 
+|	  handle these in an appropriate fashion)
+|	o leading whitespace (including comments) is automatically skipped
+|	o if the end of the file is reached on reading this token, the atEOF flag is set and may be queried using the AtEOF 
+|	  member function
+|	o punctuation characters are always returned as individual tokens (see the Maddison, Swofford, and Maddison paper 
+|	  for the definition of punctuation characters) unless the flag ignorePunctuation is set in labileFlags,
+|	  in which case the normal punctuation symbols are treated just like any other darkspace character.
+|~
+|	The behavior of GetNextToken may be altered by using labile flags. For example, the labile flag saveCommandComments 
+|	can be set using the member function SetLabileFlagBit. This will cause comments of the form [&X] to be saved as 
+|	tokens (without the square brackets), but only for the acquisition of the next token. Labile flags are cleared after 
+|	each application.
+*/
+void NxsToken::GetNextToken()
+	{
+	ResetToken();
+
+	char ch = ' ';
+	if (saved == '\0' || IsWhitespace(saved))
+		{
+		// Nothing useful is pending in saved: skip leading whitespace and park the
+		// first character that stops the scan in saved for the main loop below.
+		while( IsWhitespace(ch) && !atEOF)
+			ch = GetNextChar();
+		saved = ch;
+		}
+
+	for(;;)
+		{
+		// Break now if singleCharacterToken mode on and token length > 0.
+		//
+		if (labileFlags & singleCharacterToken && token.size() > 0)
+			break;
+
+		// Get next character either from saved or from input stream.
+		//
+		if (saved != '\0')
+			{
+			ch = saved;
+			saved = '\0';
+			}
+		else
+			ch = GetNextChar();
+
+		// Break now if we've hit EOF.
+		//
+		if (atEOF)
+			break;
+
+		if (ch == '\n' && labileFlags & newlineIsToken)
+			{
+			if (token.size() > 0)
+				{
+				// Newline came after token, save newline until next time when it will be 
+				// reported as a separate token.
+				//
+				atEOL = 0;
+				saved = ch;
+				}
+			else
+				{
+				atEOL = 1;
+				AppendToToken(ch);
+				}
+			break;
+			}
+
+		else if (IsWhitespace(ch))
+			{
+			// Break only if we've begun adding to token (remember, if we hit a comment before a token,
+			// there might be further white space between the comment and the next token).
+			//
+			if (token.size() > 0)
+				break;
+			}
+
+		else if (ch == '_')
+			{
+			// If underscores are discovered in unquoted tokens, they should be 
+			// automatically converted to spaces (unless preserveUnderscores is set).
+			//
+			if (!(labileFlags & preserveUnderscores))
+				ch = ' ';
+			AppendToToken(ch);
+			}
+
+		else if (ch == '[')
+			{
+			// Get rest of comment and deal with it, but notice that we only break if the comment ends a token,
+			// not if it starts one (comment counts as whitespace). In the case of command comments 
+			// (if saveCommandComment) GetComment will add to the token NxsString, causing us to break because
+			// token.size() will be greater than 0.
+			//
+			GetComment();
+			if (token.size() > 0)
+			break;	// controlled by the if above, despite the missing indentation
+			}
+
+		else if (ch == '(' && labileFlags & parentheticalToken)
+			{
+			AppendToToken(ch);
+
+			// Get rest of parenthetical token.
+			//
+			GetParentheticalToken();
+			break;
+			}
+
+		else if (ch == '{' && labileFlags & curlyBracketedToken)
+			{
+			AppendToToken(ch);
+
+			// Get rest of curly-bracketed token.
+			//
+			GetCurlyBracketedToken();
+			break;
+			}
+
+		else if (ch == '\"' && labileFlags & doubleQuotedToken)
+			{
+			// Get rest of double-quoted token (the opening quote itself is not stored).
+			//
+			GetDoubleQuotedToken();
+			break;
+			}
+
+		else if (ch == '\'')
+			{
+			if (token.size() > 0)
+				{
+				// We've encountered a single quote after a token has
+				// already begun to be read; should be another tandem
+				// single quote character immediately following.
+				//
+				ch = GetNextChar();
+				if (ch == '\'')
+					AppendToToken(ch);
+				else
+					{
+					errormsg = "Expecting second single quote character";
+					throw NxsException( errormsg, GetFilePosition(), GetFileLine(), GetFileColumn());
+					}
+				}
+			else
+				{
+				// Get rest of quoted NEXUS word and break, since
+				// we will have eaten one token after calling GetQuoted.
+				//
+				GetQuoted();
+				}
+			break;	// the token ends here in both of the cases above
+			}
+
+		else if (IsPunctuation(ch))
+			{
+			if (token.size() > 0)
+				{
+				// If we've already begun reading the token, encountering
+				// a punctuation character means we should stop, saving
+				// the punctuation character for the next token.
+				//
+				saved = ch;
+				break;
+				}
+			else
+				{
+				// If we haven't already begun reading the token, encountering
+				// a punctuation character means we should stop and return
+				// the punctuation character as this token (i.e., the token
+				// is just the single punctuation character).
+				//
+				AppendToToken(ch);
+				break;
+				}
+			}
+
+		else
+			{
+			AppendToToken(ch);
+			}
+
+		}
+
+	labileFlags = 0;	// labile flags only ever apply to the token just read
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Strips whitespace from currently-stored token. Removes leading, trailing, and embedded whitespace characters.
+*/
+void NxsToken::StripWhitespace()
+	{
+	// Rebuild the token from its darkspace characters only, preserving their order.
+	NxsString stripped;
+	for (unsigned pos = 0; pos < token.size(); ++pos)
+		{
+		if (!IsWhitespace(token[pos]))
+			stripped += token[pos];
+		}
+	token = stripped;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Converts all alphabetical characters in token to upper case.
+*/
+void NxsToken::ToUpper()
+	{
+	for (unsigned i = 0; i < token.size(); i++)
+		token[i] = (char)toupper((unsigned char)token[i]);	// unsigned char: toupper is undefined for negative values
+	}
+
diff --git a/ncl/nxstoken.h b/ncl/nxstoken.h
new file mode 100644
index 0000000..f534fae
--- /dev/null
+++ b/ncl/nxstoken.h
@@ -0,0 +1,533 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSTOKEN_H
+#define NCL_NXSTOKEN_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	NxsToken objects are used by NxsReader to extract words (tokens) from a NEXUS data file. NxsToken objects know to
+|	correctly skip NEXUS comments and understand NEXUS punctuation, making reading a NEXUS file as simple as repeatedly
+|	calling the GetNextToken() function and then interpreting the token returned. If the token object is not attached 
+|	to an input stream, calls to GetNextToken() will have no effect. If the token object is not attached to an output
+|	stream, output comments will be discarded (i.e., not output anywhere) and calls to Write or Writeln will be 
+|	ineffective. If input and output streams have been attached to the token object, however, tokens are read one at a
+|	time from the input stream, and comments are correctly read and either written to the output stream (if an output
+|	comment) or ignored (if not an output comment). Sequences of characters surrounded by single quotes are read in as
+|	single tokens. A pair of adjacent single quotes are stored as a single quote, and underscore characters are stored
+|	as blanks.
+*/
+class NxsToken
+	{
+	public:
+
+		enum NxsTokenFlags	/* For use with the variable labileFlags */
+			{
+			saveCommandComments		= 0x0001,	/* if set, command comments of the form [&X] are not ignored but are instead saved as regular tokens (without the square brackets, however) */
+			parentheticalToken		= 0x0002,	/* if set, and if next character encountered is a left parenthesis, token will include everything up to the matching right parenthesis */
+			curlyBracketedToken		= 0x0004,	/* if set, and if next character encountered is a left curly bracket, token will include everything up to the matching right curly bracket */
+			doubleQuotedToken		= 0x0008,	/* if set, grabs entire phrase surrounded by double quotes */
+			singleCharacterToken	= 0x0010,	/* if set, next non-whitespace character returned as token */
+			newlineIsToken			= 0x0020,	/* if set, newline character treated as a token and atEOL set if newline encountered */
+			tildeIsPunctuation		= 0x0040,	/* if set, tilde character treated as punctuation and returned as a separate token */
+			useSpecialPunctuation	= 0x0080,	/* if set, character specified by the data member special is treated as punctuation and returned as a separate token */
+			hyphenNotPunctuation	= 0x0100,	/* if set, the hyphen character is not treated as punctuation (it is normally returned as a separate token) */
+			preserveUnderscores		= 0x0200,	/* if set, underscore characters inside tokens are not converted to blank spaces (normally, all underscores are automatically converted to blanks) */
+			ignorePunctuation		= 0x0400	/* if set, the normal punctuation symbols are treated the same as any other darkspace characters */
+			};
+
+		NxsString		errormsg;
+
+						NxsToken(istream &i);
+		virtual			~NxsToken();
+
+		bool			AtEOF();
+		bool			AtEOL();
+		bool			Abbreviation(NxsString s);
+		bool			Begins(NxsString s, bool respect_case = false);
+		void			BlanksToUnderscores();
+		bool			Equals(NxsString s, bool respect_case = false);
+		long			GetFileColumn() const;
+		file_pos		GetFilePosition() const;
+		long			GetFileLine() const;
+		void			GetNextToken();
+		NxsString		GetToken(bool respect_case = true);
+		const char		*GetTokenAsCStr(bool respect_case = true);
+		const NxsString	&GetTokenReference();
+		int				GetTokenLength() const;
+		bool			IsPlusMinusToken();
+		bool			IsPunctuationToken();
+		bool			IsWhitespaceToken();
+		void			ReplaceToken(const NxsString s);
+		void			ResetToken();
+		void			SetSpecialPunctuationCharacter(char c);
+		void			SetLabileFlagBit(int bit);
+		bool			StoppedOn(char ch);
+		void			StripWhitespace();
+		void			ToUpper();
+		void			Write(ostream &out);
+		void			Writeln(ostream &out);
+
+		virtual void	OutputComment(const NxsString &msg);
+		void GetNextContiguousToken(char stop_char); // Added by BQM
+	protected:
+
+		void			AppendToComment(char ch);
+		void			AppendToToken(char ch);
+		char			GetNextChar();
+		void			GetComment();
+		void			GetCurlyBracketedToken();
+		void			GetDoubleQuotedToken();
+		void			GetQuoted();
+		void			GetParentheticalToken();
+		bool			IsPunctuation(char ch);
+		bool			IsWhitespace(char ch);
+
+	private:
+
+		istream			&in;				/* reference to input stream from which tokens will be read */
+		file_pos		filepos;			/* current file position (for Metrowerks compiler, type is streampos rather than long) */
+		long			fileline;			/* current file line */
+		long			filecol;			/* current column in current line (refers to column immediately following token just read) */
+		NxsString		token;				/* the character buffer used to store the current token */
+		NxsString		comment;			/* temporary buffer used to store output comments while they are being built */
+		char			saved;				/* either '\0' or is last character read from input stream */
+		bool			atEOF;				/* true if end of file has been encountered */
+		bool			atEOL;				/* true if newline encountered while newlineIsToken labile flag set */
+		char			special;			/* ad hoc punctuation character; default value is '\0' */
+		int				labileFlags;		/* storage for flags in the NxsTokenFlags enum */
+		char			punctuation[21];	/* stores the 20 NEXUS punctuation characters */
+		char			whitespace[4];		/* stores the 3 whitespace characters: blank space, tab and newline */
+	};
+
+typedef NxsToken NexusToken;
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the token for functions that only need read only access - faster than GetToken.
+*/
+inline const NxsString &NxsToken::GetTokenReference()
+	{
+	return token;	// hands out the stored token itself, so no copy is made
+	}
+	
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function is called whenever an output comment (i.e., a comment beginning with an exclamation point) is found 
+|	in the data file. This version of OutputComment does nothing; override this virtual function to display the output 
+|	comment in the most appropriate way for the platform you are supporting.
+*/
+inline void NxsToken::OutputComment(
+  const NxsString &msg)	/* the contents of the printable comment discovered in the NEXUS data file */
+	{	// default implementation: the comment text is deliberately ignored
+#	if defined(HAVE_PRAGMA_UNUSED)
+#		pragma unused(msg)
+#	endif
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Adds `ch' to end of comment NxsString.
+*/
+inline void NxsToken::AppendToComment(
+  char ch)	/* character to be appended to comment */
+	{
+	comment += ch;	// comment is the buffer in which output comments are built up
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Adds `ch' to end of current token.
+*/
+inline void NxsToken::AppendToToken(
+  char ch)	/* character to be appended to token */
+	{
+	// The character is appended as a one-character C string rather than as a
+	// bare char. This detour proved necessary to keep Borland's implementation
+	// of STL from crashing under some circumstances (it may no longer be
+	// needed, but is kept so that behavior stays exactly as before).
+	//
+	const char buf[2] = { ch, '\0' };
+
+	token += buf;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Reads next character from in and does all of the following before returning it to the calling function:
+|~
+|	o if character read is either a carriage return or line feed, the variable fileline is incremented by one and the
+|	  variable filecol is reset to one
+|	o if character read is a carriage return, and a peek at the next character to be read reveals that it is a line
+|	  feed, then the next (line feed) character is read
+|	o if either a carriage return or line feed is read, the character returned to the calling function is '\n'
+|	o if character read is neither a carriage return nor a line feed, filecol is incremented by one and the
+|	  character is returned as is to the calling function
+|	o in all cases, the variable filepos is updated using a call to the tellg function of istream.
+|~
+*/
+inline char NxsToken::GetNextChar()
+	{
+	const int CR = 13;
+	const int LF = 10;
+
+	int c = in.get();
+	if (in.bad())
+		{
+		errormsg = "Unknown error reading data file (check to make sure file exists)";
+		throw NxsException(errormsg);
+		}
+
+	if (c == CR || c == LF)
+		{
+		fileline++;
+		filecol = 1L;
+
+		// Swallow the LF of a CR/LF pair so that the pair counts as one line break.
+		if (c == CR && (int)in.peek() == LF)
+			c = in.get();
+
+		atEOL = 1;
+		}
+	else if (c == EOF)
+		atEOF = 1;
+	else
+		{
+		filecol++;
+		atEOL = 0;
+		}
+
+#	if defined(__DECCXX)
+		filepos = 0L;
+#	else
+		filepos = in.tellg();
+#	endif
+
+	if (atEOF)
+		return '\0';
+	return atEOL ? '\n' : (char)c;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if character supplied is considered a punctuation character. The following twenty characters are 
+|	considered punctuation characters:
+|>
+|	()[]{}/\,;:=*'"`+-<>
+|>
+|	Exceptions:
+|~
+|	o The tilde character ('~') is also considered punctuation if the tildeIsPunctuation labile flag is set
+|	o The special punctuation character (specified using the SetSpecialPunctuationCharacter) is also considered 
+|	  punctuation if the useSpecialPunctuation labile flag is set
+|	o The hyphen (i.e., minus sign) character ('-') is not considered punctuation if the hyphenNotPunctuation 
+|	  labile flag is set
+|~
+|	Use the SetLabileFlagBit method to set one or more NxsLabileFlags flags in `labileFlags'
+*/
+inline bool NxsToken::IsPunctuation(
+  char ch)	/* the character in question */
+	{
+	// The hyphenNotPunctuation labile flag overrides every other rule for '-'.
+	//
+	if (ch == '-' && (labileFlags & hyphenNotPunctuation))
+		return false;
+
+	// Standard NEXUS punctuation. (PAUP 4.0b10 is more lenient: it allows ]`<>
+	// inside taxon names and `<> inside taxset names.)
+	//
+	if (strchr(punctuation, ch))
+		return true;
+
+	const bool tilde = (labileFlags & tildeIsPunctuation) && ch == '~';
+	const bool adhoc = (labileFlags & useSpecialPunctuation) && ch == special;
+
+	return tilde || adhoc;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if character supplied is considered a whitespace character. Note: treats '\n' as darkspace if labile
+|	flag newlineIsToken is in effect.
+*/
+inline bool NxsToken::IsWhitespace(
+  char ch)	/* the character in question */
+	{
+	// While the newlineIsToken labile flag is set, a newline counts as
+	// darkspace: it has to be reported as a token of its own instead of being
+	// skipped along with the other whitespace.
+	//
+	const bool newlineIsDarkspace = (labileFlags & newlineIsToken) != 0;
+	if (ch == '\n' && newlineIsDarkspace)
+		return false;
+
+	// Otherwise ch is whitespace exactly when it is found in the whitespace array.
+	//
+	if (strchr(whitespace, ch))
+		return true;
+
+	return false;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if and only if last call to GetNextToken encountered the end-of-file character (or for some reason the 
+|	input stream is now out of commission).
+*/
+inline bool NxsToken::AtEOF()
+	{
+	return atEOF;	// set by GetNextChar once the input stream yields EOF
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if and only if last call to GetNextToken encountered the newline character while the newlineIsToken 
+|	labile flag was in effect.
+*/
+inline bool NxsToken::AtEOL()
+	{
+	return atEOL;	// flag maintained by GetNextChar and GetNextToken
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Converts all blanks in token to underscore characters. Normally, underscores found in the tokens read from a NEXUS
+|	file are converted to blanks automatically as they are read; this function reverts the blanks back to underscores. 
+*/
+inline void NxsToken::BlanksToUnderscores()
+	{
+	token.BlanksToUnderscores();	// the conversion itself is implemented by NxsString
+	}
+	
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns value stored in `filecol', which keeps track of the current column in the data file (i.e., number of 
+|	characters since the last new line was encountered).
+*/
+inline long  NxsToken::GetFileColumn() const
+	{
+	return filecol;	// GetNextChar sets this to 1 at each line break and increments it otherwise
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns value stored in filepos, which keeps track of the current position in the data file (i.e., number of 
+|	characters since the beginning of the file).  Note: for Metrowerks compiler, you must use the offset() method of 
+|	the streampos class to use the value returned.
+*/
+inline file_pos  NxsToken::GetFilePosition() const
+	{
+	return filepos;	// refreshed from in.tellg() by GetNextChar (fixed at 0 under __DECCXX)
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns value stored in `fileline', which keeps track of the current line in the data file (i.e., number of new 
+|	lines encountered thus far).
+*/
+inline long  NxsToken::GetFileLine() const
+	{
+	return fileline;	// incremented by GetNextChar at each line break
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the data member `token'. Specifying false for`respect_case' parameter causes all characters in `token'
+|	to be converted to upper case before `token' is returned. Specifying true results in GetToken returning exactly 
+|	what it read from the file.
+*/
+inline NxsString NxsToken::GetToken(
+  bool respect_case)	/* determines whether token is converted to upper case before being returned */
+	{
+	if (!respect_case)
+		ToUpper();	// note: upper-cases the stored token itself, not only the returned copy
+
+	return token;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the data member `token' as a C-style string. Specifying false for`respect_case' parameter causes all 
+|	characters in `token' to be converted to upper case before the `token' C-string is returned. Specifying true 
+|	results in GetTokenAsCStr returning exactly what it read from the file.
+*/
+inline const char *NxsToken::GetTokenAsCStr(
+  bool respect_case)	/* determines whether token is converted to upper case before being returned */
+	{
+	if (!respect_case)
+		ToUpper();	// note: upper-cases the stored token itself
+
+	return token.c_str();	// points into the token's buffer; presumably invalid once the token changes - TODO confirm
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns token.size().
+*/
+inline int NxsToken::GetTokenLength() const
+	{
+	return token.size();	// the size is converted to int for the caller
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if current token is a single character and this character is either '+' or '-'.
+*/
+inline bool NxsToken::IsPlusMinusToken()
+	{
+	// A plus/minus token is exactly one character long: '+' or '-'.
+	if (token.size() != 1)
+		return false;
+	return token[0] == '+' || token[0] == '-';
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if current token is a single character and this character is a punctuation character (as defined in 
+|	IsPunctuation function).
+*/
+inline bool NxsToken::IsPunctuationToken()
+	{
+	// Only a token consisting of exactly one character can qualify.
+	if (token.size() != 1)
+		return false;
+	return IsPunctuation(token[0]);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if current token is a single character and this character is a whitespace character (as defined in 
+|	IsWhitespace function).
+*/
+inline bool NxsToken::IsWhitespaceToken()
+	{
+	// Only a token consisting of exactly one character can qualify.
+	if (token.size() != 1)
+		return false;
+	return IsWhitespace(token[0]);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Replaces current token NxsString with s.
+*/
+inline void NxsToken::ReplaceToken(
+  const NxsString s)	/* NxsString to replace current token NxsString */
+	{
+	token = s;	// the previous token contents are discarded
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets token to the empty NxsString ("").
+*/
+inline void NxsToken::ResetToken()
+	{
+	token.clear();	// only the token is emptied; saved and labileFlags are left untouched
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets the special punctuation character to `c'. If the labile bit useSpecialPunctuation is set, this character will 
+|	be added to the standard list of punctuation symbols, and will be returned as a separate token like the other 
+|	punctuation characters.
+*/
+inline void NxsToken::SetSpecialPunctuationCharacter(
+  char c)	/* the character to which `special' is set */
+	{
+	special = c;	// consulted by IsPunctuation when the useSpecialPunctuation labile flag is set
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Sets the bit specified in the variable `labileFlags'. The available bits are specified in the NxsTokenFlags enum.
+|	All bits in `labileFlags' are cleared after each token is read.
+*/
+inline void NxsToken::SetLabileFlagBit(
+  int bit)	/* the bit (see NxsTokenFlags enum) to set in `labileFlags' */
+	{
+	labileFlags |= bit;	// bits accumulate until GetNextToken resets labileFlags to 0
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Checks character stored in the variable saved to see if it matches supplied character `ch'. Good for checking such 
+|	things as whether token stopped reading characters because it encountered a newline (and labileFlags bit 
+|	newlineIsToken was set):
+|>
+|	StoppedOn('\n');
+|>
+|	or whether token stopped reading characters because of a punctuation character such as a comma:
+|>
+|	StoppedOn(',');
+|>
+*/
+inline bool NxsToken::StoppedOn(
+  char ch)	/* the character to compare with saved character */
+	{
+	// `saved' holds the character, if any, that was read but not consumed by
+	// the last token, so a match means the token stopped on `ch'.
+	//
+	return (saved == ch);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Simply outputs the current NxsString stored in `token' to the output stream `out'. Does not send a newline to the 
+|	output stream afterwards.
+*/
+inline void NxsToken::Write(
+  ostream &out)	/* the output stream to which to write token NxsString */
+	{
+	out << token;	// no newline is appended
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Simply outputs the current NxsString stored in `token' to the output stream `out'. Sends a newline to the output 
+|	stream afterwards.
+*/
+inline void NxsToken::Writeln(
+  ostream &out)	/* the output stream to which to write `token' */
+	{
+	out << token << endl;	// endl writes the newline and also flushes the stream
+	}
+
+/**
+ * Added by BQM: return the contiguous string (including white space) as token
+ * until hitting stop_char
+ * @param stop_char a character to stop reading in
+ */
+inline void NxsToken::GetNextContiguousToken(char stop_char) {
+	ResetToken();
+
+	// Unless a darkspace character is already pending in `saved', skip the
+	// leading whitespace and park the character that ended the scan in `saved'
+	// so that the loop below starts with it.
+	if (saved == '\0' || IsWhitespace(saved))
+	{
+		char first = ' ';
+		while (IsWhitespace(first) && !atEOF)
+			first = GetNextChar();
+		saved = first;
+	}
+
+	for (;;) {
+		// Take the pending character first, then read from the input stream.
+		char c;
+		if (saved == '\0')
+			c = GetNextChar();
+		else
+			{
+			c = saved;
+			saved = '\0';
+			}
+
+		if (atEOF)
+			break;
+
+		// The stop character is not part of the token: hand it back via `saved'.
+		if (c == stop_char) {
+			saved = c;
+			break;
+		}
+		AppendToToken(c);
+	}
+
+	// Trim trailing whitespace; whitespace inside the token is kept.
+	NxsString::iterator tail = token.end();
+	while (tail != token.begin() && IsWhitespace(*(tail-1)))
+		--tail;
+	if (tail != token.end()) token.erase(tail, token.end());
+}
+
+#endif
diff --git a/ncl/nxstreesblock.cpp b/ncl/nxstreesblock.cpp
new file mode 100644
index 0000000..31553c6
--- /dev/null
+++ b/ncl/nxstreesblock.cpp
@@ -0,0 +1,557 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#include "ncl.h"
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Initializes `id' to "TREES", `ntrees' to 0, `defaultTree' to 0, and `taxa' to `tb'. Assumes `tb' is non-NULL.
+*/
+NxsTreesBlock::NxsTreesBlock(
+  NxsTaxaBlock *tb)	/* the NxsTaxaBlock object to be queried for taxon names appearing in tree descriptions */
+  : NxsBlock(), taxa(tb), ntrees(0), defaultTree(0)
+	{
+	assert(tb != NULL);
+
+	// `taxa', `ntrees' and `defaultTree' are each set exactly once, in the initializer list above (listed in their
+	// declaration order). `id' is assigned here because it is not a member declared by NxsTreesBlock itself.
+	//
+	id = "TREES";
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Empties `treeDescription', `treeName', `rooted' and `translateList'; the object `taxa' points to is left alone.
+*/
+NxsTreesBlock::~NxsTreesBlock()
+	{
+	treeDescription.clear();
+	treeName.clear();
+	rooted.clear();
+	translateList.clear();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Makes data member `taxa' point to `tb' rather than the NxsTaxaBlock object it was previously pointing to. The old
+|	object is neither deleted nor modified here. Assumes `tb' is non-NULL (checked with assert only).
+*/
+void NxsTreesBlock::ReplaceTaxaBlockPtr(
+  NxsTaxaBlock *tb)		/* pointer to new NxsTaxaBlock object (does not attempt to delete the object previously pointed to) */
+	{
+	assert(tb != NULL);
+
+	taxa = tb;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Takes control from the Read member function when a TREE or UTREE command is encountered. If a TREE command is found,
+|	the HandleTreeDescription member function is called with `utree' equal to false. If a UTREE command is found, 
+|	`utree' equals true.
+*/
+void NxsTreesBlock::HandleTreeDescription(
+  NxsToken &token,	/* the token used to read from `in' */
+  bool utree)		/* true if handling UTREE command, false if handling TREE command */
+	{
+	// Start off assuming that there will be no command comments contradicting
+	// the rooted/unrooted status implied by the use of the TREE/UTREE command
+	//
+	bool tree_is_unrooted = utree;
+	NxsString cmdName = (utree ? "UTREE" : "TREE");
+
+	// This should be either an asterisk or a tree name
+	//
+	token.GetNextToken();
+
+	if (token.Equals("*"))
+		{
+		// ntrees is not incremented until entire tree command has been read
+		//
+		defaultTree = ntrees; 
+
+		// This should be tree name
+		//
+		token.GetNextToken();
+		}
+
+	// Save the tree name as the key
+	//
+	NxsString skey = token.GetToken();
+
+	// This should be an equals sign
+	//
+	token.GetNextToken();
+
+	if (!token.Equals("="))
+		{
+		errormsg = "Expecting '=' after tree name in ";
+		errormsg += cmdName;
+		errormsg += " command, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	// This should be either a tree description or a command comment specifying
+	// whether this tree is to be rooted ([&R]) or unrooted ([&U]).
+	//
+	token.SetLabileFlagBit(NxsToken::saveCommandComments);
+	token.SetLabileFlagBit(NxsToken::parentheticalToken);
+	token.GetNextToken();
+
+	NxsString s = token.GetToken();
+	if (s.size() < 2)
+		{
+		errormsg = "Expecting command comment or tree description in ";
+		errormsg += cmdName;
+		errormsg += " command, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	if (s[0] == '&') 
+		{
+		// Command comment found
+		//
+		if (s[1] == 'R' || s[1] == 'r')
+			{
+			tree_is_unrooted = false;
+			}
+		else if (s[1] == 'U' || s[1] == 'u')
+			{
+			tree_is_unrooted = true;
+			}
+		else
+			{
+			errormsg = "[";
+			errormsg += token.GetToken();
+			errormsg += "] is not a valid command comment in a ";
+			errormsg += cmdName;
+			errormsg += " command";
+			throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+			}
+
+		// This should be only the tree description
+		//
+		token.SetLabileFlagBit(NxsToken::parentheticalToken);
+		token.GetNextToken();
+		}
+
+	NxsString sval = token.GetToken();
+
+	// This should be a semicolon
+	//
+	token.GetNextToken();
+
+	if (!token.Equals(";"))
+		{
+		errormsg = "Expecting ';' to terminate the ";
+		errormsg += cmdName;
+		errormsg += " command, but found ";
+		errormsg += token.GetToken();
+		errormsg += " instead";
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	ntrees++;
+	treeName.push_back(skey);
+	treeDescription.push_back(sval);
+
+	// `rooted' stores the opposite sense of the local flag
+	rooted.push_back(!tree_is_unrooted);
+
+	assert(rooted.size() == (unsigned)ntrees);
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function provides the ability to read everything following the block name (which is read by the NxsReader 
+|	object) to the END or ENDBLOCK command. Characters are read from the input stream `in'. Overrides the abstract 
+|	virtual function in the base class.
+*/
+void NxsTreesBlock::Read(
+  NxsToken &token)	/* the token used to read from `in' */
+	{
+	isEmpty = false;
+	isUserSupplied = true;
+
+	// This should be the semicolon after the block name
+	//
+	token.GetNextToken();
+
+	if (!token.Equals(";"))
+		{
+		errormsg.PrintF("Expecting ';' after TREES block name, but found %s instead.", token.GetTokenAsCStr());
+		throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+		}
+
+	for (;;)
+		{
+		token.GetNextToken();
+
+		if (token.Equals("TRANSLATE")) 
+			{
+			// Note that numEntries will be 0 if no taxa block
+			// has been created
+			//
+			unsigned numEntries = taxa->GetNumTaxonLabels();
+			bool building_taxa_block = (numEntries == 0);
+
+			for (unsigned k = 0; ; k++) 
+				{
+				if (numEntries > 0 && k == numEntries)
+					break;
+
+				// Create the Association
+
+				// Get the key
+				//
+				token.GetNextToken();
+				NxsString skey = token.GetToken();
+
+				// Get the value
+				//
+				token.GetNextToken();
+				NxsString sval = token.GetToken();
+
+				// Add the taxon label to the TAXA block (if we are building one)
+				// and check to make sure taxon label is one that is in the taxa
+				// block (if we are not building up the taxa block as we go)
+				//
+				if (building_taxa_block)
+					taxa->AddTaxonLabel(sval);
+				else
+					{
+					try
+						{
+						taxa->FindTaxon(sval);
+						}
+					catch(const NxsTaxaBlock::NxsX_NoSuchTaxon &)
+						{
+						errormsg.clear();
+						errormsg.PrintF("The taxon \"%s\" was found in the TRANSLATE command of the TREES block, but was not found in the TAXA block", sval.c_str());
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+					}
+
+				// Add the Association object to the translate list
+				//
+				translateList[skey] = sval;
+
+				// This should be a comma, unless we are at the last pair, in
+				// which case it should be a semicolon. If it is a semicolon,
+				// and a TAXA block exists, we should have read in exactly
+				// numEntries taxon labels at this point (k + 1 so far)
+				//
+				token.GetNextToken();
+
+				if (token.Equals(";")) 
+					{
+					if (numEntries > 0 && k != numEntries - 1)
+						{
+						errormsg.clear();
+						errormsg.PrintF("There were only %u entries in TRANSLATE command but %u taxa in the TAXA block.", k + 1, numEntries);
+						errormsg += "\nThe number of TRANSLATE entries should equal the number of taxa.";
+						throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+						}
+					break;
+					}
+
+				else if (!token.Equals(","))
+					{
+					errormsg.clear();
+					errormsg.PrintF("Expecting ',' to terminate each number/name pair in TRANSLATE command, but found %s instead.", token.GetTokenAsCStr());
+					errormsg += "\nPerhaps there were fewer taxa in the tree file than previously defined.";
+					throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+					}
+				}	// for (unsigned k = 0; ; k++) 
+			}	// if (token.Equals("TRANSLATE")) 
+
+		else if (token.Equals("TREE")) 
+			{
+			HandleTreeDescription(token, false);
+			}	
+
+		else if (token.Equals("UTREE")) 
+			{
+			HandleTreeDescription(token, true);
+			}	
+
+		else if (token.Equals("END")) 
+			{
+			// Get the semicolon following END
+			//
+			token.GetNextToken();
+
+			if (!token.Equals(";"))
+				{
+				errormsg = "Expecting ';' to terminate the END command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			break;
+			}	// else if (token.Equals("END")) 
+
+		else if (token.Equals("ENDBLOCK")) 
+			{
+			// Get the semicolon following ENDBLOCK
+			//
+			token.GetNextToken();
+
+			if (!token.Equals(";"))
+				{
+				errormsg = "Expecting ';' to terminate the ENDBLOCK command, but found ";
+				errormsg += token.GetToken();
+				errormsg += " instead";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			break;
+			}	// else if (token.Equals("ENDBLOCK")) 
+
+		else
+			{
+			SkippingCommand(token.GetToken());
+
+			do
+				{
+				token.GetNextToken();
+				}
+			while (!token.AtEOF() && !token.Equals(";"));
+
+			if (token.AtEOF())
+				{
+				errormsg = "Unexpected end of file encountered";
+				throw NxsException(errormsg, token.GetFilePosition(), token.GetFileLine(), token.GetFileColumn());
+				}
+			}
+		}	// for (;;)
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Flushes `treeName', `treeDescription', `translateList' and `rooted', and sets `ntrees' and `defaultTree' both to 0
+|	in preparation for reading a new TREES block.
+*/
+void NxsTreesBlock::Reset()
+	{
+	// Forget everything stored from the previously read TREES block
+	translateList.clear();
+	treeName.clear();
+	treeDescription.clear();
+	rooted.clear();
+
+	defaultTree		= 0;
+	ntrees			= 0;
+
+	// Reset base class data members that could have changed
+	isUserSupplied = false;
+	isEmpty        = true;
+	isEnabled      = true;
+	errormsg.clear();
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the 0-offset index of the default tree: the tree whose TREE/UTREE command had an asterisk before the tree
+|	name, or 0 if no command was marked that way (which is also the value when no trees are stored). If several
+|	commands carry an asterisk, the last one read wins.
+*/
+unsigned NxsTreesBlock::GetNumDefaultTree()
+	{
+	return defaultTree;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns `ntrees', the number of TREE/UTREE commands read completely into this object since the last Reset.
+*/
+unsigned NxsTreesBlock::GetNumTrees()
+	{
+	return ntrees;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the name of the tree stored at position `i' in `treeName'. Assumes that `i' will be in the range 
+|	[0..`ntrees').
+*/
+NxsString NxsTreesBlock::GetTreeName(
+  unsigned i)	/* the index of the tree for which the name is to be returned */
+	{
+	// `i' is unsigned, so a lower-bound check would be vacuous; only the upper bound is asserted
+	assert(i < ntrees);
+
+	return treeName[i];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the description of the tree stored at position `i' in `treeName'. Assumes that `i' will be in the range 
+|	[0..ntrees). Node numbers will be translated to names in the resulting tree description. Use GetTreeDescription if 
+|	translation is not desired.
+*/
+NxsString NxsTreesBlock::GetTranslatedTreeDescription(
+  unsigned i)	/* the index of the tree for which the description is to be returned */
+	{
+	// `i' is unsigned, so a lower-bound check would be vacuous; only the upper bound is asserted
+	//
+	assert(i < ntrees);
+
+	// s is the original tree definition string 
+	//
+	const NxsString &s = treeDescription[i];
+	const unsigned slen = s.size();
+	assert(slen > 1);
+
+	// x is the new tree definition string that will be built
+	// using s as the template
+	//
+	NxsString x;
+	x += s[0];
+
+	for (unsigned k = 1; k < slen; k++)
+		{
+		const char prev = s[k - 1];
+		const char curr = s[k];
+
+		// Anything that is not a digit directly after '(' or ',' is copied through unchanged.
+		// The cast keeps isdigit well defined for characters with negative values.
+		//
+		if (!(isdigit((unsigned char)curr) && (prev == '(' || prev == ',')))
+			{
+			x += curr;
+			continue;
+			}
+
+		// Discovered a number where a taxon label should be in the tree description.
+		// Collect the whole run of digits without reading past the end of s, leaving
+		// k on the last digit so the loop increment moves to the character after it.
+		//
+		NxsString ns;
+		ns += curr;
+		while (k + 1 < slen && isdigit((unsigned char)s[k + 1]))
+			ns += s[++k];
+
+		// Look the number up with find rather than operator[], which would insert an empty
+		// entry into translateList for an unknown key and drop the label from the result.
+		// A number with no translation is kept as it appears in the stored description.
+		//
+		NxsStringMap::const_iterator found = translateList.find(ns);
+		if (found != translateList.end())
+			ns = found->second;
+		x += ns;
+		}
+
+	return x;
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns the description of the tree stored at position `i' in `treeDescription'. Assumes that `i' will be in the 
+|	range [0..`ntrees').
+*/
+NxsString NxsTreesBlock::GetTreeDescription(
+  unsigned i)	/* the index of the tree for which the description is to be returned */
+	{
+	// `i' is unsigned, so a lower-bound check would be vacuous; only the upper bound is asserted
+	assert(i < ntrees);
+
+	return treeDescription[i];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if the `i'th tree (0-offset) is the default tree, false otherwise. Assumes that `i' will be in the 
+|	range [0..ntrees).
+*/
+bool NxsTreesBlock::IsDefaultTree(
+  unsigned i)	/* the index of the tree in question */
+	{
+	// `i' is unsigned, so a lower-bound check would be vacuous; only the upper bound is asserted
+	//
+	assert(i < ntrees);
+
+	// The comparison already yields the bool to return
+	//
+	return (i == GetNumDefaultTree());
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Returns true if the `i'th tree (0-offset) is rooted, false otherwise. Assumes that `i' will be in the 
+|	range [0..ntrees).
+*/
+bool NxsTreesBlock::IsRootedTree(
+  unsigned i)	/* the index of the tree in question */
+	{
+	// `i' is unsigned, so a lower-bound check would be vacuous; only the upper bound is asserted
+	assert(i < ntrees);
+
+	return rooted[i];
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This function outputs a brief report of the contents of this block. Overrides the abstract virtual function in the 
+|	base class.
+*/
+void NxsTreesBlock::Report(
+  ostream &out)	/* the output stream to which to write the report */
+	{
+	out << endl;
+	out << id << " block contains ";
+
+	// An empty block gets the summary line only
+	//
+	if (ntrees == 0)
+		{
+		out << "no trees" << endl;
+		return;
+		}
+
+	if (ntrees == 1)
+		out << "one tree" << endl;
+	else
+		out << ntrees << " trees" << endl;
+
+	// One line per tree: tab, 1-offset number, tab, name, tab, then the
+	// rooting status and (for the default tree only) the default marker
+	//
+	for (unsigned t = 0; t < ntrees; ++t)
+		{
+		const char *rooting = (rooted[t] ? "rooted" : "unrooted");
+		const char *closing = (defaultTree == t ? ",default tree)" : ")");
+
+		out << '\t' << (t + 1) << '\t' << treeName[t];
+		out << "\t(" << rooting << closing << endl;
+		}
+	}
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	Outputs a brief description of this block's contents to the referenced NxsString. An example of the output of this 
+|	command is shown below:
+|>
+|	TREES block contains 102 trees
+|>
+*/
+void NxsTreesBlock::BriefReport(
+  NxsString &s)	/* reference to the string in which to store the contents of the brief report */
+	{
+	s = "\n\n";
+	s += id;
+	s += " block contains ";
+	switch (ntrees)
+		{
+		case 0:		s += "no trees\n";	break;
+		case 1:		s += "one tree\n";	break;
+		default:
+			s += ntrees;
+			s += " trees\n";
+			break;
+		}
+	}
diff --git a/ncl/nxstreesblock.h b/ncl/nxstreesblock.h
new file mode 100644
index 0000000..d8817d6
--- /dev/null
+++ b/ncl/nxstreesblock.h
@@ -0,0 +1,83 @@
+//	Copyright (C) 1999-2003 Paul O. Lewis
+//
+//	This file is part of NCL (Nexus Class Library) version 2.0.
+//
+//	NCL is free software; you can redistribute it and/or modify
+//	it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation; either version 2 of the License, or
+//	(at your option) any later version.
+//
+//	NCL is distributed in the hope that it will be useful,
+//	but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//	GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License
+//	along with NCL; if not, write to the Free Software Foundation, Inc., 
+//	59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+
+#ifndef NCL_NXSTREESBLOCK_H
+#define NCL_NXSTREESBLOCK_H
+
+/*----------------------------------------------------------------------------------------------------------------------
+|	This class handles reading and storage for the NEXUS block TREES. It overrides the member functions Read and Reset,
+|	which are abstract virtual functions in the base class NxsBlock. The translation table (if one is supplied) is 
+|	stored in the `translateList'. The tree names are stored in `treeName' and the tree descriptions in 
+|	`treeDescription'. Information about rooting of trees is stored in `rooted'. Note that no checking is done to 
+|	ensure that the tree descriptions are valid. The validity of the tree descriptions could be checked after the TREES
+|	block has been read (but before the next block in the file has been read) by overriding the NxsReader::ExitingBlock
+|	member function, but no functionality for this is provided by the NCL. Below is a table showing the correspondence
+|	between the elements of a TREES block and the variables and member functions that can be used to access each piece 
+|	of information stored. 
+|>
+|	NEXUS command     Data members    Member functions
+|	-----------------------------------------------------
+|	TRANSLATE         translateList
+|	
+|	TREE              treeName        GetTreeName
+|	                                  GetTreeDescription
+|	                                  GetNumTrees
+|	                                  GetNumDefaultTree
+|	                                  IsDefaultTree
+|	
+|	                  rooted          IsRootedTree
+|	-----------------------------------------------------
+|>
+*/
+class NxsTreesBlock 
+  : public NxsBlock
+	{
+ 	public:
+							NxsTreesBlock(NxsTaxaBlock *tb);
+		virtual				~NxsTreesBlock();
+
+				void		ReplaceTaxaBlockPtr(NxsTaxaBlock *tb);
+				unsigned	GetNumDefaultTree();
+				unsigned	GetNumTrees();
+				NxsString	GetTreeName(unsigned i);
+				NxsString	GetTreeDescription(unsigned i);				/* description exactly as stored */
+				NxsString	GetTranslatedTreeDescription(unsigned i);	/* description with numbers looked up in translateList */
+				bool		IsDefaultTree(unsigned i);
+				bool		IsRootedTree(unsigned i);
+		virtual void		Report(std::ostream &out);
+		virtual void		BriefReport(NxsString &s);
+		virtual void		Reset();
+
+	protected :
+
+		NxsStringMap		translateList;		/* TRANSLATE table mapping each key token to its taxon label; empty if no TRANSLATE command was read */
+		NxsStringVector		treeName;			/* tree names, in the order the TREE/UTREE commands were read */
+		NxsStringVector		treeDescription;	/* untranslated tree descriptions, parallel to treeName */
+		NxsBoolVector		rooted;				/* rooted[i] is true if tree i is rooted, parallel to treeName */
+		NxsTaxaBlock		*taxa;				/* pointer to existing NxsTaxaBlock object (never deleted by this class) */
+		unsigned			ntrees;				/* number of trees stored */
+		unsigned			defaultTree;		/* 0-offset index of default tree specified by user, or 0 if user failed to specify a default tree using an asterisk in the NEXUS data file */
+
+		virtual	void		Read(NxsToken &token);
+		void				HandleTreeDescription(NxsToken &token, bool utree);	/* parses one TREE (utree false) or UTREE (utree true) command */
+	};
+
+typedef NxsTreesBlock TreesBlock;
+
+#endif
diff --git a/ngs.cpp b/ngs.cpp
new file mode 100644
index 0000000..801af40
--- /dev/null
+++ b/ngs.cpp
@@ -0,0 +1,1226 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+/*
+	collection of classes for Next-generation sequencing
+*/
+
+#include "ngs.h"
+//#include "modeltest_wrapper.h"
+
+/****************************************************************************
+        NGSAlignment
+ ****************************************************************************/
+
+NGSAlignment::NGSAlignment(PhyloTree *atree) : AlignmentPairwise() {
+    tree = atree; // only the tree pointer is stored; this body does not set pair_freq, ncategory or num_states
+}
+
+NGSAlignment::NGSAlignment(const char *filename) : AlignmentPairwise() {
+    readFritzFile(filename); // fills ncategory, num_states and pair_freq from the file
+}
+
+NGSAlignment::NGSAlignment(int nstate, int ncat, double *freq) : AlignmentPairwise() {
+    ncategory = ncat;
+    num_states = nstate;
+    // one num_states x num_states block of pair counts per category, deep-copied from freq
+    const int n_entries = ncategory*num_states*num_states;
+    memcpy(pair_freq = new double[n_entries], freq, n_entries * sizeof(double));
+}
+
+NGSAlignment::NGSAlignment(int nstate, string &seq1, string &seq2) {
+    num_states = nstate;
+    ncategory = 1;
+    pair_freq = new double[nstate*nstate];
+    memset(pair_freq, 0, sizeof(double)*nstate*nstate);
+    assert(seq1.length() == seq2.length());
+    // count every aligned column whose two states are both below num_states
+    const int n_sites = seq1.length();
+    for (int site = 0; site < n_sites; site++) {
+        const int row = convertState(seq1[site], SEQ_DNA);
+        const int col = convertState(seq2[site], SEQ_DNA);
+        if (row >= num_states || col >= num_states) continue;
+        pair_freq[row*num_states+col] += 1;
+    }
+}
+
+char NGSAlignment::convertState(char state, SeqType seq_type) {
+    // seq_type is not consulted: the character is always decoded with SEQ_DNA
+    const char dna = Alignment::convertState(state, SEQ_DNA);
+    if (dna == STATE_UNKNOWN) return 4;
+    return (dna >= 4) ? char(5) : dna;
+}
+
+
+void NGSAlignment::readFritzFile(const char *filename) {
+    cout << "Reading Fritz file " << filename << " ..." << endl;
+    try { // tokens read: ncategory, num_states, then ncategory*num_states*num_states non-negative counts
+        ifstream in;
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(filename);
+        in.clear();
+        int i, total_size;
+        string tmp;
+        in >> tmp;
+        ncategory = convert_int(tmp.c_str());
+        if (ncategory < 1) throw "Wrong number of positions";
+        in >> tmp;
+        num_states = convert_int(tmp.c_str());
+        if (num_states < 1) throw "Wrong number of states";
+        total_size = ncategory*num_states*num_states; // computed only after both factors were validated
+        pair_freq = new double[total_size];
+        for (i=0; i < total_size; i++) {
+            in >> tmp;
+            double count = convert_double(tmp.c_str());
+            if (count < 0) throw "Wrong count";
+            pair_freq[i] = count;
+        }
+        // stream exceptions stay enabled while the file is closed
+        in.exceptions(ios::failbit | ios::badbit);
+        in.close();
+    } catch (const char *str) {
+        outError(str);
+    } catch (string &str) { // caught by reference to avoid copying the exception object
+        outError(str);
+    } catch (const ios::failure &) {
+        outError(ERR_READ_INPUT);
+    }
+
+    cout << ncategory << " matrices of size " << num_states << endl;
+}
+
+void NGSAlignment::computeStateFreq (double *stateFrqArr, size_t num_unknown_states) {
+    int cat, i, j, id; // note: num_unknown_states is not used by this implementation
+    double *state_count = new double[num_states];
+    memset(state_count, 0, sizeof(double)*num_states);
+    for (cat = 0, id = 0; cat < ncategory; cat++) { // each pair count is credited to both of its states
+        for (i = 0; i < num_states; i++)
+            for (j = 0; j < num_states; j++, id++) {
+                state_count[i] += pair_freq[id];
+                state_count[j] += pair_freq[id];
+            }
+    }
+    double sum_count = 0;
+    for (i = 0; i < num_states; i++) sum_count += state_count[i];
+    // release the scratch buffer before throwing so the error path does not leak it
+    if (sum_count == 0) { delete [] state_count; throw "Empty data observed"; }
+    for (i = 0; i < num_states; i++) stateFrqArr[i] = double(state_count[i]) / sum_count;
+    /*if (verbose_mode >= VB_MIN)*/ {
+        cout << "Empirical state frequencies: ";
+        for (i = 0; i < num_states; i++)
+            cout << stateFrqArr[i] << " ";
+        cout << endl;
+    }
+    delete [] state_count;
+}
+
+void NGSAlignment::computeSumPairFreq (double *sum_pair_freq) {
+    // add the per-category matrices entry by entry into one num_states x num_states matrix
+    const int n_entries = num_states*num_states;
+    memset(sum_pair_freq, 0, sizeof(double)*n_entries);
+    const double *block = pair_freq;
+    for (int cat = 0; cat < ncategory; cat++, block += n_entries) {
+        for (int e = 0; e < n_entries; e++)
+            sum_pair_freq[e] += block[e];
+    }
+}
+
+void NGSAlignment::computeEmpiricalRate (double *rates) {
+    int i, j, k;
+    assert(rates);
+    // The rates are normalised by the count of the last state pair, so at least two states are needed
+    assert(num_states >= 2);
+
+    // Pair counts summed over all categories, held row-major in one flat buffer (entry (i,j) is at
+    // i*num_states+j). A single buffer needs no table of row pointers and is released on both the
+    // normal path and the exception path below.
+    double *pair_sum = new double[num_states*num_states];
+    computeSumPairFreq(pair_sum);
+
+    double last_rate = pair_sum[(num_states-2)*num_states + num_states-1]
+                     + pair_sum[(num_states-1)*num_states + num_states-2];
+    if (last_rate == 0.0) {
+        delete [] pair_sum;
+        throw "Last rate entry is ZERO";
+    }
+
+    // rates[] receives the upper triangle row by row: (0,1), (0,2), ..., (num_states-2,num_states-1)
+    k = 0;
+    for (i = 0; i < num_states-1; i++)
+        for (j = i+1; j < num_states; j++)
+            rates[k++] = (pair_sum[i*num_states+j] + pair_sum[j*num_states+i]) / last_rate;
+    delete [] pair_sum;
+
+    /*if (verbose_mode >= VB_MIN)*/ {
+        cout << "Empirical rates: ";
+        for (k = 0; k < num_states*(num_states-1)/2; k++)
+            cout << rates[k] << " ";
+        cout << endl;
+    }
+}
+
+double NGSAlignment::computeEmpiricalDist(int cat) {
+    const int n_entries = num_states*num_states;
+    const double *block = pair_freq + (cat*n_entries);
+    // diagonal entries of the category's matrix are the matching pairs
+    double n_match = 0, n_total = 0;
+    for (int s = 0; s < num_states; s++)
+        n_match += block[s*num_states+s];
+    for (int e = 0; e < n_entries; e++)
+        n_total += block[e];
+    if (n_total == 0) n_total = 1;
+    return (double)(n_total - n_match) / n_total;
+}
+
+
+double NGSAlignment::computeFunctionCat(int cat, double value) {
+    int trans_size = num_states*num_states;
+    double lh = 0.0;
+    double *trans_mat = new double[trans_size];
+    int i;
+    tree->getModelFactory()->computeTransMatrix(value, trans_mat);
+    double *pair_pos = pair_freq + cat*trans_size;
+
+    // accumulates -pair_pos[i]*log(trans_mat[i]) over the entries of category cat with a count above 1e-6
+    for (i = 0; i < trans_size; i++) if (pair_pos[i] > 1e-6) {
+            // free the scratch matrix before throwing so the error path does not leak it
+            if (trans_mat[i] <= 0) { delete [] trans_mat; throw "Negative transition probability"; }
+            lh -= pair_pos[i] * log(trans_mat[i]);
+        }
+    delete [] trans_mat;
+    return lh;
+}
+
+
+void NGSAlignment::computeFuncDervCat(int cat, double value, double &df, double &ddf) {
+    // Accumulates, over the entries of category cat with a count above 1e-6,
+    //   derv1 = sum count * d1   and   derv2 = sum count * (trans_derv2/trans_mat - d1*d1),
+    // with d1 = trans_derv1/trans_mat taken from the three arrays filled by computeTransDerv.
+    // The outputs are df = -derv1 and ddf = -derv2.
+    int trans_size = num_states*num_states;
+    int i;
+    bool bad_prob = false;
+    double derv1 = 0.0, derv2 = 0.0;
+    double *trans_mat = new double[trans_size];
+    double *trans_derv1 = new double[trans_size];
+    double *trans_derv2 = new double[trans_size];
+
+    df = 0.0;
+    ddf = 0.0;
+    tree->getModelFactory()->computeTransDerv(value, trans_mat, trans_derv1, trans_derv2);
+    double *pair_pos = pair_freq + cat*trans_size;
+    for (i = 0; i < trans_size; i++) if (pair_pos[i] > 1e-6) {
+            if (trans_mat[i] <= 0) { bad_prob = true; break; }
+            double d1 = trans_derv1[i] / trans_mat[i];
+            derv1 += pair_pos[i] * d1;
+            derv2 += pair_pos[i] * (trans_derv2[i]/trans_mat[i] - d1 * d1);
+        }
+    // the scratch arrays are released on every path, including the error path just below
+    delete [] trans_derv2;
+    delete [] trans_derv1;
+    delete [] trans_mat;
+    if (bad_prob) throw "Negative transition probability";
+    df -= derv1;
+    ddf -= derv2;
+}
+
+/****************************************************************************
+        NGSRate
+ ****************************************************************************/
+NGSRate::NGSRate(PhyloTree *tree) {
+    NGSAlignment *ngs_aln = (NGSAlignment*)tree->aln;
+    phylo_tree = tree;
+    ncategory = ngs_aln->ncategory;
+    rates = new double[ncategory];
+    // start every category at its empirical distance, raised to 1e-6 when it is smaller
+    for (int cat = 0; cat < ncategory; cat++) {
+        const double dist = ngs_aln->computeEmpiricalDist(cat);
+        rates[cat] = (dist < 1e-6) ? 1e-6 : dist;
+    }
+
+    name = "+F";
+    name += convertIntToString(ncategory);
+    full_name = name;
+    is_categorized = true;
+}
+
+double NGSRate::optimizeParameters(double epsilon) {
+    // each category rate is optimised on its own, searching between 1e-6 and 10.0
+    const double tolerance = max(epsilon, 1e-6);
+    double negative_lh;
+    for (int cat = 0; cat < ncategory; cat++) {
+        optimizing_cat = cat;
+        rates[cat] = phylo_tree->optimize_by_newton
+            ? minimizeNewtonSafeMode(1e-6, rates[cat], 10.0, tolerance, negative_lh)
+            : minimizeOneDimenSafeMode(1e-6, rates[cat], 10.0, tolerance, &negative_lh);
+    }
+    return phylo_tree->computeLikelihood();
+}
+
+
+double NGSRate::computeFunction(double value) {
+    return ((NGSAlignment*)phylo_tree->aln)->computeFunctionCat(optimizing_cat, value); // evaluated for the category last stored in optimizing_cat
+}
+void NGSRate::computeFuncDerv(double value, double &df, double &ddf) {
+    ((NGSAlignment*)phylo_tree->aln)->computeFuncDervCat(optimizing_cat, value, df, ddf); // df and ddf are filled for the category in optimizing_cat
+}
+
+void NGSRate::writeInfo(ostream &out) { // empty body: nothing is written to out
+}
+
+/****************************************************************************
+        NGSRateCat
+ ****************************************************************************/
+NGSRateCat::NGSRateCat(PhyloTree *tree, int ncat) {
+    phylo_tree = tree;
+    ncategory = ncat;
+    rates = new double[ncategory];
+    proportion = new double[ncategory];
+    // random starting rates, equal starting proportions
+    for (int cat = 0; cat < ncategory; cat++) {
+        rates[cat] = random_double();
+        proportion[cat] = 1.0/ncategory;
+    }
+
+    const int n_states = tree->aln->num_states;
+    sum_pair_freq = new double[n_states * n_states];
+    ((NGSAlignment*)tree->aln)->computeSumPairFreq(sum_pair_freq);
+
+    name = "+FC";
+    name += convertIntToString(ncategory);
+    full_name = name;
+    is_categorized = true;
+}
+
+
+/**
+	@return the number of free parameters: ncategory rates plus
+	ncategory-1 proportions (the last proportion is derived from the others)
+*/
+int NGSRateCat::getNDim() {
+    return ncategory + (ncategory - 1);
+}
+
+/**
+    Pack the current parameters into an optimizer vector.
+    Index 0 is left untouched; rates go to [1..ncategory] and the first
+    ncategory-1 proportions follow directly after.
+    @param variables (OUT) destination vector
+*/
+void NGSRateCat::setVariables(double *variables) {
+    double *rate_slot = variables + 1;
+    double *prop_slot = rate_slot + ncategory;
+    memcpy(rate_slot, rates, ncategory * sizeof(double));
+    memcpy(prop_slot, proportion, (ncategory-1)*sizeof(double));
+}
+
+/**
+    Unpack an optimizer vector (layout as written by setVariables) into
+    rates and proportions. The last proportion is not stored in the vector;
+    it is set to 1 minus the sum of the others.
+    @param variables source vector
+*/
+void NGSRateCat::getVariables(double *variables) {
+    const double *rate_slot = variables + 1;
+    const double *prop_slot = rate_slot + ncategory;
+    memcpy(rates, rate_slot, ncategory * sizeof(double));
+    memcpy(proportion, prop_slot, (ncategory-1)*sizeof(double));
+
+    const int last = ncategory - 1;
+    double used = 0.0;
+    for (int cat = 0; cat < last; cat++)
+        used += proportion[cat];
+    proportion[last] = 1.0 - used;
+}
+
+
+/**
+	objective handed to the multi-dimensional minimizer
+	@param x packed parameter vector; it is unpacked into rates/proportion first
+	@return negated phylo_tree->computeLikelihood(), or the penalty 1e9 when the
+	        derived last proportion is not above 1e-6
+*/
+double NGSRateCat::targetFunk(double x[]) {
+    getVariables(x);
+    const bool last_prop_too_small = (proportion[ncategory-1] <= 1e-6);
+    if (last_prop_too_small)
+        return 1e9;
+    return -phylo_tree->computeLikelihood();
+}
+
+
+/**
+    Optimize all rates and proportions jointly with minimizeMultiDimen.
+    Every parameter is bounded below by 1e-4; rates are bounded above by 100.0
+    and proportions by 1.0.
+    @param epsilon tolerance; values below 1e-6 are raised to 1e-6
+    @return negated result of minimizeMultiDimen, or 0.0 if there is nothing to optimize
+*/
+double NGSRateCat::optimizeParameters(double epsilon) {
+    const int ndim = getNDim();
+
+    // nothing to optimize
+    if (ndim == 0) return 0.0;
+
+    cout << "Optimizing " << name << " model parameters..." << endl;
+
+    // the vectors are indexed from 1, hence one extra element each
+    const int len = ndim + 1;
+    double *variables = new double[len];
+    double *upper_bound = new double[len];
+    double *lower_bound = new double[len];
+    bool *bound_check = new bool[len];
+
+    setVariables(variables);
+
+    // proportions occupy the last ncategory-1 slots of the vector
+    const int first_prop = ndim - ncategory + 2;
+    for (int k = 1; k <= ndim; k++) {
+        lower_bound[k] = 1e-4;
+        upper_bound[k] = (k >= first_prop) ? 1.0 : 100.0;
+        bound_check[k] = false;
+    }
+
+    // by BFGS algorithm
+    double score = -minimizeMultiDimen(variables, ndim, lower_bound, upper_bound, bound_check, max(epsilon, 1e-6));
+
+    // store the optimized values back into rates/proportion
+    getVariables(variables);
+
+    delete [] bound_check;
+    delete [] lower_bound;
+    delete [] upper_bound;
+    delete [] variables;
+
+    return score;
+}
+
+
+/**
+    Write the rates, the proportions and the proportion-weighted average rate.
+    All three lines go to the stream supplied by the caller; the average used
+    to be sent to cout regardless of which stream was passed in.
+    @param out output stream
+*/
+void NGSRateCat::writeInfo(ostream &out) {
+    int i;
+    double avg = 0.0;
+    out << "Rates: ";
+    for (i = 0; i < ncategory; i++)
+        out << " " << rates[i];
+    out << endl << "Proportion: ";
+    for (i = 0; i < ncategory; i++)
+        out << " " << proportion[i];
+    out << endl;
+    for (i = 0; i < ncategory; i++)
+        avg += rates[i]*proportion[i];
+    out << avg << endl;
+}
+
+/****************************************************************************
+        NGSTree
+ ****************************************************************************/
+
+/**
+    Attach the alignment and start without model, rate or model factory.
+    @param params supplies optimize_by_newton
+    @param alignment alignment stored in aln
+*/
+NGSTree::NGSTree(Params &params, NGSAlignment *alignment) {
+    // nothing model-related is attached yet
+    model_factory = NULL;
+    site_rate = NULL;
+    model = NULL;
+
+    aln = alignment;
+    optimize_by_newton = params.optimize_by_newton;
+    setLikelihoodKernel(LK_NORMAL);
+}
+
+/**
+    @param pattern_lh not used by this implementation
+    @return negated NGSAlignment::computeFunction(1.0) of the attached alignment
+*/
+double NGSTree::computeLikelihood(double *pattern_lh) {
+    NGSAlignment *ngs_aln = (NGSAlignment*)aln;
+    return -ngs_aln->computeFunction(1.0);
+}
+
+/**
+    No branches are optimized here; all three arguments are ignored.
+    @return computeLikelihood()
+*/
+double NGSTree::optimizeAllBranches(int my_iterations, double tolerance, int maxNRStep) {
+    double logl = computeLikelihood();
+    return logl;
+}
+
+
+/****************************************************************************
+        NGSTreeCat
+ ****************************************************************************/
+
+/**
+    Forwards both arguments to the NGSTree constructor; adds nothing of its own.
+*/
+NGSTreeCat::NGSTreeCat(Params &params, NGSAlignment *alignment)
+    : NGSTree(params, alignment)
+{
+}
+
+/**
+    Sum of pair_freq[i] * log(m[i]) over all state pairs, where m is the
+    proportion-weighted sum of the per-category transition matrices.
+    @param pattern_lh not used by this implementation
+    @return the sum described above
+*/
+double NGSTreeCat::computeLikelihood(double *pattern_lh) {
+    const int nstates = getModel()->num_states;
+    const int ncells = nstates*nstates;
+    double *mixed = new double[ncells];
+    double *cur = new double[ncells];
+    NGSRateCat *cat_rate = (NGSRateCat*)getRate();
+
+    // accumulate the weighted transition matrices category by category
+    memset(mixed, 0, ncells * sizeof(double));
+    for (int cat = 0; cat < cat_rate->getNDiscreteRate(); cat++) {
+        getModel()->computeTransMatrix(cat_rate->getRate(cat), cur);
+        for (int k = 0; k < ncells; k++)
+            mixed[k] += cat_rate->proportion[cat]*cur[k];
+    }
+
+    double logl = 0.0;
+    for (int k = 0; k < ncells; k++)
+        logl += ((NGSAlignment*)aln)->pair_freq[k] * log(mixed[k]);
+
+    delete [] cur;
+    delete [] mixed;
+    return logl;
+}
+
+
+/****************************************************************************
+        NGSRead
+ ****************************************************************************/
+
+/**
+    Reset all read fields and allocate the pair-frequency table.
+    @param atree forwarded to the NGSAlignment constructor; when no tree is
+           attached afterwards, 4 states are assumed
+*/
+NGSRead::NGSRead(PhyloTree *atree) : NGSAlignment(atree) {
+    init();
+    num_states = tree ? tree->aln->num_states : 4;
+    // the table is allocated with one extra row and column beyond num_states
+    const int side = num_states + 1;
+    pair_freq = new double[side * side];
+}
+
+/**
+    Reset the per-read fields to their start values.
+    id is set to -2, the value parseNextGen() tests for to recognise the
+    first "ID: " field of a record.
+*/
+void NGSRead::init() {
+    // sequences
+    read.clear();
+    scaff.clear();
+
+    // numeric fields
+    id = -2;
+    match_pos = -2;
+    identity = -2;
+    times = 1.0;
+
+    // boolean fields
+    flag = true;
+    direction = true;
+}
+
+/**
+    Count, for every (scaffold state, read state) pair, how many aligned
+    positions show it. Positions where either state is >= num_states are skipped.
+    The result is stored in pair_freq as a num_states x num_states table.
+*/
+void NGSRead::computePairFreq() {
+    int len = scaff.length();
+    assert(len == read.length());
+    memset(pair_freq, 0, sizeof(double)*num_states*num_states);
+    for (int pos = 0; pos < len; pos++) {
+        if (scaff[pos] >= num_states || read[pos] >= num_states)
+            continue;
+        pair_freq[scaff[pos]*num_states+read[pos]] += 1;
+    }
+}
+
+
+/**
+    Negative log-likelihood of the read against the scaffold.
+    With homo_rate > 0 one transition matrix at value*homo_rate is combined
+    with pair_freq; otherwise every accepted position uses its own rate from
+    the tree's rate object.
+    @param value distance being evaluated
+    @return accumulated -log of the transition probabilities
+*/
+double NGSRead::computeFunction(double value) {
+
+    RateHeterogeneity *site_rate = tree->getRate();
+    const int nptn = scaff.length();
+    double neg_logl = 0.0;
+
+    if (homo_rate > 0.0) {
+        // homogeneous rate
+        const int ncells = num_states*num_states;
+        double *trans_mat = new double[ncells];
+        tree->getModelFactory()->computeTransMatrix(value * homo_rate, trans_mat);
+        for (int k = 0; k < ncells; k++) {
+            // pairs that were (practically) never observed contribute nothing
+            if (!(pair_freq[k] > 1e-6)) continue;
+            neg_logl -= pair_freq[k] * log(trans_mat[k]);
+        }
+        delete [] trans_mat;
+        return neg_logl;
+    }
+
+    // site-specific rates; rate_id advances only on accepted positions
+    int rate_id = 0;
+    for (int pos = 0; pos < nptn; pos++) {
+        const int state1 = scaff[pos];
+        const int state2 = read[pos];
+        if (state1 >= num_states || state2 >= num_states) continue;
+        const double rate_val = site_rate->getRate(rate_id);
+        double trans = tree->getModelFactory()->computeTrans(value * rate_val, state1, state2);
+        neg_logl -= log(trans);
+        rate_id++;
+    }
+    return neg_logl;
+}
+
+/**
+    First and second derivative accumulators matching computeFunction().
+    Both outputs are reset to 0 and then decreased term by term, using the
+    same two branches (homogeneous vs. site-specific rates) as computeFunction().
+    @param value distance being evaluated
+    @param df (OUT) first-derivative accumulator
+    @param ddf (OUT) second-derivative accumulator
+*/
+void NGSRead::computeFuncDerv(double value, double &df, double &ddf) {
+    RateHeterogeneity *site_rate = tree->getRate();
+    const int nptn = scaff.length();
+    df = 0.0;
+    ddf = 0.0;
+
+    if (homo_rate > 0.0) { // homogeneous rate
+        const int ncells = num_states*num_states;
+        double *p = new double[ncells];
+        double *dp = new double[ncells];
+        double *ddp = new double[ncells];
+        tree->getModelFactory()->computeTransDerv(value * homo_rate, p, dp, ddp);
+        for (int k = 0; k < ncells; k++) {
+            if (!(pair_freq[k] > 1e-6)) continue;
+            const double ratio = dp[k] / p[k];
+            df -=  pair_freq[k] * ratio;
+            ddf -= pair_freq[k] * (ddp[k]/p[k] - ratio*ratio);
+        }
+        // the matrices were evaluated at value*homo_rate, so scale accordingly
+        df *= homo_rate;
+        ddf *= homo_rate * homo_rate;
+        delete [] ddp;
+        delete [] dp;
+        delete [] p;
+        return;
+    }
+
+    // site-specific rates; rate_id advances only on accepted positions
+    int rate_id = 0;
+    for (int pos = 0; pos < nptn; pos++) {
+        const int state1 = scaff[pos];
+        const int state2 = read[pos];
+        if (state1 >= num_states || state2 >= num_states) continue;
+        const double rate_val = site_rate->getRate(rate_id);
+        const double rate_sqr = rate_val * rate_val;
+        double derv1, derv2;
+        double trans = tree->getModelFactory()->computeTrans(value * rate_val, state1, state2, derv1, derv2);
+        const double ratio = derv1 / trans;
+        df -= rate_val * ratio;
+        ddf -= rate_sqr * (derv2/trans - ratio*ratio);
+        rate_id++;
+    }
+}
+
+/****************************************************************************
+        NGSReadSet
+ ****************************************************************************/
+
+/**
+    Replace str by its upper-case reverse complement.
+    A<->T and G<->C are swapped; every other character is kept (upper-cased).
+    @param str (IN/OUT) sequence to transform in place
+*/
+void reverseComplement(string &str) {
+    string out;
+    out.reserve(str.length());
+    for (string::reverse_iterator it = str.rbegin(); it != str.rend(); ++it) {
+        // toupper() is only defined for EOF and unsigned char values, so a
+        // plain (possibly negative) char must be converted first
+        char c = (char)toupper((unsigned char)*it);
+        switch (c) {
+        case 'A':
+            c = 'T';
+            break;
+        case 'T':
+            c = 'A';
+            break;
+        case 'G':
+            c = 'C';
+            break;
+        case 'C':
+            c = 'G';
+            break;
+        default:
+            break;
+        }
+        out += c;
+    }
+    str = out;
+}
+
+//("File","total",0.8,-1)
+/**
+    Parse a mapped-read file and pass every accepted read to processReadWhileParsing().
+    A record starts with a line beginning with "Se". That line is scanned for the
+    field endings "ID: ", "me: ", "re: ", "at: " and "ld: " and for the words
+    "forward"/"backward". The following line supplies the "es: " and "ty: " values,
+    and the two lines after that hold the aligned scaffold and read sequences.
+    Lines are read into a fixed buffer; parsing stops with a warning if the stream
+    fails before end of file (for example on a line that does not fit the buffer).
+    @param filename input file; the program terminates if it cannot be opened
+    @param ref_ID only records whose "ld: " value equals this are used; "total" accepts all
+    @param ident minimum identity ("ty: " value) for a read to be accepted
+    @param mismatches required number of mismatching non-gap columns; negative accepts any
+*/
+void NGSReadSet::parseNextGen(string filename, string ref_ID,double ident,int mismatches)
+{
+    string a= "total";
+    size_t buffer_size = 1200;
+    ifstream myfile;
+    myfile.open(filename.c_str(),ifstream::in);
+    if (!myfile.good()) {
+        cout<<"No such file "<<filename.c_str()<<endl;
+        // a missing input file is an error, so do not report success to the caller
+        exit(EXIT_FAILURE);
+    }
+    char* line = new char[buffer_size];
+
+    NGSRead tempread(tree);
+
+    myfile.getline(line,buffer_size);
+    // Stop on any stream failure, not only on EOF: getline() sets failbit without
+    // eofbit when a line does not fit into the buffer, after which every further
+    // getline() is a no-op. Testing eof() alone would then loop forever on the
+    // same stale buffer contents.
+    while (myfile.good()) {
+        if (line[0]=='S'&& line[1]=='e') {
+            for (size_t i=0;i<buffer_size;i++) {
+                if (line[i]=='\0' ||line[i]=='\n' ) {
+                    break;
+                }
+                if (tempread.id ==-2 && strncmp(&line[i],"ID: ",4)==0) {
+                    // first "ID: " of the record
+                    tempread.id = atoi(&line[i+4]);
+                } else if (tempread.id !=-2 && strncmp(&line[i],"ID: ",4)==0) {
+                    // any later "ID: " only sets the flag
+                    int id = atoi(&line[i+4]);
+
+                    if (id==0) {
+                        tempread.flag=true;
+                    } else {
+                        tempread.flag=false;
+                    }
+
+                } else if (strncmp(&line[i],"forward",7)==0) {
+                    tempread.direction=true;
+                } else if (strncmp(&line[i],"backward",8)==0) {
+                    tempread.direction=false;
+                }
+
+                if (strncmp(&line[i],"me: ",4)==0) {
+                    i=i+4;
+                    // also stop at the terminator, so bytes left over from an
+                    // earlier, longer line are never appended to the name
+                    while (i<buffer_size && line[i]!=' ' && line[i]!='\0') {
+                        tempread.name+=line[i];
+                        i++;
+                    }
+                    if (i>=buffer_size || line[i]=='\0') {
+                        break;
+                    }
+                }
+                if (strncmp(&line[i],"re: ",4)==0) {
+                    tempread.score= atoi(&line[i+4]);
+                    break;
+                }
+
+                if (strncmp(&line[i],"at: ",4)==0) {
+                    tempread.match_pos= atoi(&line[i+4])+1;
+                }
+                if (strncmp(&line[i],"ld: ",4)==0) {
+                    tempread.chr.clear();
+                    size_t t=i+4;
+                    while (t<buffer_size && line[t]!='\n' &&  line[t]!='\0') {
+                        if ( line[t]==' ') {
+                            break;
+                        }
+                        tempread.chr+=line[t];
+                        t++;
+                    }
+                }
+            }
+
+            if ( (strcmp(tempread.chr.c_str(),ref_ID.c_str())==0 || strcmp(a.c_str(),ref_ID.c_str())==0 )) {
+
+                myfile.getline(line,buffer_size);
+                for (size_t i=0;i<buffer_size;i++) {
+                    if (line[i]=='\0' ||line[i]=='\n' ) {
+                        break;
+                    }
+                    if (strncmp(&line[i],"es: ",4)==0) {
+                        tempread.times= atof(&line[i+4]);
+                    }
+                    if (strncmp(&line[i],"ty: ",4)==0) {
+                        tempread.identity=atof(&line[i+4]);
+                        break;
+                    }
+                }
+
+                if (tempread.identity>=ident) {
+                    string scaff;
+                    string read;
+                    myfile.getline(line,buffer_size);
+                    size_t i=0;
+                    while (i<buffer_size &&line[i]!=' '  &&line[i]!='\t'&&line[i]!='\0'&&line[i]!='\n') {
+                        scaff+=line[i];
+                        i++;
+                    }
+
+                    myfile.getline(line,buffer_size);
+                    i=0;
+                    int count=0;
+                    while (i<buffer_size && line[i]!=' ' &&line[i]!='\t'&&line[i]!='\0'&&line[i]!='\n') {
+                        read+=line[i];
+                        // compare only columns that exist in the scaffold; indexing
+                        // the scaffold past its end is out of bounds
+                        if (i<scaff.length() && line[i]!='-' && scaff[i]!='-' && scaff[i]!=line[i]) {
+                            count++;
+                        }
+                        i++;
+                    }
+
+                    tempread.scaff=scaff;
+                    tempread.read=read;
+
+                    if (count==mismatches || mismatches < 0) {
+                        processReadWhileParsing(tempread);
+                    }
+                }
+            }
+            tempread.chr.clear();
+            // the header scan appends to the name and init() does not reset it,
+            // so clear it here to keep names from piling up across records
+            tempread.name.clear();
+            tempread.init();
+        }
+
+        myfile.getline(line,buffer_size);
+        if (size()>0 && size() % 10000 == 0) cout << size() << " reads processed" << endl;
+
+    }
+
+    if (myfile.fail() && !myfile.eof()) {
+        cout << "WARNING: stopped reading " << filename.c_str()
+             << " before end of file (read error or a line longer than "
+             << buffer_size-1 << " characters)" << endl;
+    }
+
+    cout << size() << " reads processed in total" << endl;
+
+    myfile.close();
+    delete [] line;
+}
+
+/**
+    Fold one parsed read into the per-position count tables and, when a tree
+    is attached, estimate its distances and append a ReadInfo entry.
+    The read is modified: both sequences are reverse-complemented when
+    direction is false and then converted with convertStateStr(.., SEQ_DNA).
+    @param tempread (IN/OUT) the read just parsed
+*/
+void NGSReadSet::processReadWhileParsing(NGSRead &tempread) {
+
+    if (!tempread.direction) {
+        reverseComplement(tempread.scaff);
+        reverseComplement(tempread.read);
+    }
+    tempread.convertStateStr(tempread.scaff, SEQ_DNA);
+    tempread.convertStateStr(tempread.read, SEQ_DNA);
+    assert(tempread.scaff.length() == tempread.read.length());
+
+    // 4 states, plus one more unless gaps are ignored
+    const int nstates = ngs_ignore_gaps ? 4 : 5;
+
+    // col indexes accepted positions only; skipped positions do not advance it
+    int col = 0;
+    for (int pos = 0; pos < tempread.scaff.length(); pos++) {
+        const int state1 = tempread.scaff[pos];
+        const int state2 = tempread.read[pos];
+        if (state1 >= nstates || state2 >= nstates) continue;
+
+        // grow both tables on demand with zero-filled rows
+        while (col >= state_freq.size()) {
+            double *fresh = new double[nstates];
+            memset(fresh, 0, sizeof(double)*(nstates));
+            state_freq.push_back(fresh);
+        }
+        while (col >= pair_freq.size()) {
+            double *fresh = new double[(nstates) * (nstates)];
+            memset(fresh, 0, sizeof(double)*(nstates) * (nstates));
+            pair_freq.push_back(fresh);
+        }
+
+        // each read contributes 1/times to the counts
+        state_freq[col][state2] += 1.0/tempread.times;
+        pair_freq[col][state1*(nstates) + state2] += 1.0/tempread.times;
+        col++;
+    }
+
+    if (!tree) return;
+
+    ReadInfo read_info;
+    // first pass: homogeneous rate, started from the Hamming distance
+    tempread.homo_rate = homo_rate;
+    tempread.computePairFreq();
+    read_info.homo_distance = tempread.optimizeDist(1.0-tempread.identity);
+    read_info.homo_logl = -tempread.computeFunction(read_info.homo_distance);
+    // second pass: homo_rate 0 switches computeFunction to site-specific rates
+    tempread.homo_rate = 0.0;
+    read_info.distance = tempread.optimizeDist(read_info.homo_distance);
+    read_info.logl = -tempread.computeFunction(read_info.distance);
+    read_info.id = tempread.id;
+    read_info.identity = tempread.identity;
+    push_back(read_info);
+}
+
+/**
+    Currently reports nothing.
+*/
+void NGSReadSet::writeInfo() {
+    // intentionally empty
+}
+
+/**
+    Write the per-position pair-count matrices.
+    The first line holds the number of positions and the number of states;
+    then one tab-separated, rounded matrix per position follows, each closed
+    by an empty line. When gaps are not ignored, the bottom-right cell is
+    computed from the state count c of the last state as c*(c-1)/2 instead of
+    being taken from the pair table.
+    @param out output stream
+*/
+void NGSReadSet::writeFreqMatrix(ostream &out) {
+    const int num_states = ngs_ignore_gaps ? 4 : 5;
+    const int last = num_states - 1;
+    out << pair_freq.size() << " " << num_states << endl;
+
+    vector<double*>::iterator pair_it = pair_freq.begin();
+    vector<double*>::iterator state_it = state_freq.begin();
+    for (; pair_it != pair_freq.end(); pair_it++, state_it++) {
+        for (int i = 0; i < num_states; i++) {
+            for (int j = 0; j < num_states; j++) {
+                if (!ngs_ignore_gaps && i == last && j == last) {
+                    out << int(round((*state_it)[i]*((*state_it)[i]-1)/2));
+                } else {
+                    out << int(round((*pair_it)[i*num_states+j]));
+                    if (j < last) out << "\t";
+                }
+            }
+            out << endl;
+        }
+        out << endl;
+    }
+}
+
+/****************************************************************************
+        main function
+ ****************************************************************************/
+
+/**
+    Write the analysis report to file_name and announce it on cout.
+    The report lists the input file, model name, rate parameters as a matrix,
+    state frequencies, Q matrix, log-likelihood and one line per category with
+    its rate followed by the values of the matching rate_info row.
+    @param file_name output path
+    @param params supplies ngs_file for the header line
+    @param aln alignment providing num_states and ncategory
+    @param tree tree whose model and rate objects are reported
+    @param rate_info extra values per category; row i is printed on the line of category i+1
+    @param rate_name column titles for the rate_info values
+*/
+void reportNGSAnalysis(const char *file_name, Params &params, NGSAlignment &aln, NGSTree &tree,
+                       DoubleMatrix &rate_info, StrVector &rate_name) {
+    ofstream out(file_name);
+    out.setf(ios::fixed,ios::floatfield);
+
+    int row, col, k;
+
+    double *rate_param = new double[aln.num_states * aln.num_states];
+    double *rate_matrix = new double[aln.num_states * aln.num_states];
+
+    out << "Input file: " << params.ngs_file << endl;
+    out << "Model of evolution: " << tree.getModel()->name << endl << endl;
+
+    out << "Substitution process assuming one homogeneous model among all positions:" << endl;
+
+    out << "Rate parameters: " << endl;
+
+    tree.getModel()->getRateMatrix(rate_param);
+
+    // expand the flat parameter list into a square matrix (diagonal stays unset
+    // and is printed as "-" below)
+    const bool unrest = (tree.getModel()->name == "UNREST");
+    if (unrest) {
+        // one parameter per off-diagonal cell, row by row
+        k = 0;
+        for (row = 0; row < aln.num_states; row++)
+            for (col = 0; col < aln.num_states; col++) {
+                if (row == col) continue;
+                rate_matrix[row*aln.num_states+col] = rate_param[k++];
+            }
+    } else {
+        // one parameter per unordered pair, mirrored across the diagonal
+        k = 0;
+        for (row = 0; row < aln.num_states-1; row++)
+            for (col = row+1; col < aln.num_states; col++, k++) {
+                rate_matrix[col*aln.num_states+row] = rate_param[k];
+                rate_matrix[row*aln.num_states+col] = rate_param[k];
+            }
+    }
+
+    for (row = 0; row < aln.num_states; row++) {
+        for (col = 0; col < aln.num_states; col++) {
+            if (col > 0) out << " \t";
+            if (col == row) out << "-";
+            else out << rate_matrix[row*aln.num_states+col];
+        }
+        out << endl;
+    }
+    out << endl;
+    out << "State frequencies: ";
+    // unknown frequency types print no annotation (and no line break)
+    const char *freq_note = NULL;
+    switch (tree.getModel()->getFreqType()) {
+    case FREQ_EMPIRICAL:
+        freq_note = "(empirical counts from alignment)";
+        break;
+    case FREQ_ESTIMATE:
+        freq_note = "(estimated with maximum likelihood)";
+        break;
+    case FREQ_USER_DEFINED:
+        freq_note = "(user-defined)";
+        break;
+    case FREQ_EQUAL:
+        freq_note = "(equal frequencies)";
+        break;
+    default:
+        break;
+    }
+    if (freq_note) out << freq_note << endl;
+
+    double *state_freq = new double[aln.num_states];
+    tree.getModel()->getStateFrequency(state_freq);
+
+    for (row = 0; row < aln.num_states; row++) out << state_freq[row] << " \t";
+    out << endl << endl;
+
+    out << "Q matrix can be obtained by multiplying rate parameters with state frequencies" << endl << endl;
+
+    double *q_mat = new double[tree.aln->num_states * tree.aln->num_states];
+    tree.getModel()->getQMatrix(q_mat);
+
+    k = 0;
+    for (row = 0; row < tree.aln->num_states; row++) {
+        for (col = 0; col < tree.aln->num_states; col++, k++)
+            out << "  " << q_mat[k];
+        out << endl;
+    }
+
+    delete [] q_mat;
+
+    out << endl;
+
+    out << "Log-likelihood: " << tree.computeLikelihood() << endl << endl;
+
+    out << "Inferred posisiton-specific rates under one model or position-specific model: " << endl;
+
+    // table header
+    out << "Position\tSeq_error";
+    for (StrVector::iterator nit = rate_name.begin(); nit != rate_name.end(); nit++)
+        out << "\t" << (*nit);
+    out << endl;
+
+    // one table line per category
+    for (int cat = 0; cat < aln.ncategory; cat++) {
+        out << cat+1 << '\t' << tree.getRate()->getRate(cat);
+        DoubleVector &extra = rate_info[cat];
+        for (DoubleVector::iterator dit = extra.begin(); dit != extra.end(); dit++)
+            out << "\t" << *dit;
+        out << endl;
+    }
+    out.close();
+    cout << endl << "Results written to: " << file_name << endl << endl;
+    delete [] state_freq;
+    delete [] rate_matrix;
+    delete [] rate_param;
+}
+
+/*
+bool checkFreq(int *pair_freq, int n) {
+	int i, count = 0;
+	for (i=0; i < n*n; i++)
+		if (pair_freq[i] != 0) count++;
+	if (count <= n) return false;
+	return true;
+}*/
+
+/**
+    Fit a single-category ("+F1") model to one pair-frequency table and append
+    the results to rate_info: the fitted rate, the rate-matrix entries and, if
+    the model of the reference tree is reversible, the state frequencies.
+    If model setup or optimization throws, rate_name.size() copies of
+    MIN_SITE_RATE are appended instead and the function returns.
+    @param params program parameters; params.model_name is overwritten
+    @param aln alignment supplying the number of states
+    @param tree reference tree, only queried for isReversible()
+    @param model base model name; "GTR" is used when empty
+    @param freq pair-frequency table handed to the temporary alignment
+    @param rate_info (OUT) receives the fitted values
+    @param rate_name column names; its size determines the fallback fill count
+    @param write_info forwarded to ModelFactory::optimizeParameters
+    @param report_file if not NULL, a report is written there via reportNGSAnalysis
+*/
+void testSingleRateModel(Params &params, NGSAlignment &aln, NGSTree &tree, string model,
+                         double *freq, DoubleVector &rate_info, StrVector &rate_name,
+                         bool write_info, const char *report_file)
+{
+    NGSAlignment sum_aln(aln.num_states, 1, freq);
+    ModelsBlock *models_block = new ModelsBlock;
+
+    NGSTree sum_tree(params, &sum_aln);
+    sum_aln.tree = &sum_tree;
+
+    // Build the name in a string: formatting into a fixed char[20] with
+    // sprintf overflowed for model names longer than 16 characters.
+    string model_name = model.empty() ? string("GTR") : model;
+    model_name += "+F1";
+    try {
+        params.model_name = model_name;
+        sum_tree.setModelFactory(new ModelFactory(params, &sum_tree, models_block));
+        sum_tree.setModel(sum_tree.getModelFactory()->model);
+        sum_tree.setRate(sum_tree.getModelFactory()->site_rate);
+        double bestTreeScore = sum_tree.getModelFactory()->optimizeParameters(false, write_info);
+        cout << "LogL: " << bestTreeScore;
+        cout << " / Rate: " << sum_tree.getRate()->getRate(0) << endl;
+    } catch (...) {
+        cout << "Skipped due to sparse matrix" << endl;
+        rate_info.insert(rate_info.end(), rate_name.size(), MIN_SITE_RATE);
+        // release the block on this early exit as well
+        delete models_block;
+        return;
+    }
+    rate_info.push_back(sum_tree.getRate()->getRate(0));
+
+    // scratch buffer, reused for the state frequencies below
+    double *rate_mat = new double[aln.num_states*aln.num_states];
+    memset(rate_mat, 0, aln.num_states*aln.num_states*sizeof(double));
+    sum_tree.getModel()->getRateMatrix(rate_mat);
+    rate_info.insert(rate_info.end(), rate_mat, rate_mat+sum_tree.getModel()->getNumRateEntries());
+
+    if (tree.getModel()->isReversible()) {
+        sum_tree.getModel()->getStateFrequency(rate_mat);
+        rate_info.insert(rate_info.end(), rate_mat, rate_mat+aln.num_states);
+    }
+    delete [] rate_mat;
+    delete models_block;
+
+    if (report_file) {
+        DoubleMatrix tmp(1);
+        tmp[0] = rate_info;
+        reportNGSAnalysis(report_file, params, sum_aln, sum_tree, tmp, rate_name);
+    }
+}
+
+/**
+    Fit a two-category ("+FC2") model to one pair-frequency table and print
+    the log-likelihood and the first rate on cout. Nothing is stored in
+    rate_info; tree, rate_name and report_file are currently unused.
+    @param params program parameters; params.model_name is overwritten
+    @param aln alignment supplying the number of states
+    @param model base model name; "GTR" is used when empty
+    @param freq pair-frequency table handed to the temporary alignment
+    @param write_info forwarded to ModelFactory::optimizeParameters
+*/
+void testTwoRateModel(Params &params, NGSAlignment &aln, NGSTree &tree, string model,
+                      double *freq, DoubleVector &rate_info, StrVector &rate_name,
+                      bool write_info, const char *report_file)
+{
+    NGSAlignment sum_aln(aln.num_states, 1, freq);
+
+
+    NGSTreeCat sum_tree(params, &sum_aln);
+    sum_aln.tree = &sum_tree;
+
+    ModelsBlock *models_block = new ModelsBlock;
+
+    // Build the name in a string: formatting into a fixed char[20] with
+    // sprintf overflowed for model names longer than 15 characters.
+    string model_name = model.empty() ? string("GTR") : model;
+    model_name += "+FC2";
+    try {
+        params.model_name = model_name;
+        sum_tree.setModelFactory(new ModelFactory(params, &sum_tree, models_block));
+        sum_tree.setModel(sum_tree.getModelFactory()->model);
+        sum_tree.setRate(sum_tree.getModelFactory()->site_rate);
+        double bestTreeScore = sum_tree.getModelFactory()->optimizeParameters(false, write_info);
+        cout << "LogL: " << bestTreeScore;
+        cout << " / Rate: " << sum_tree.getRate()->getRate(0) << endl;
+    } catch (const char*) {
+        cout << "Skipped due to sparse matrix" << endl;
+        //rate_info.insert(rate_info.end(), rate_name.size(), MIN_SITE_RATE);
+        // release the block on this early exit as well
+        delete models_block;
+        return;
+    } catch (string &str) {
+        cout << str;
+        delete models_block;
+        return;
+    }
+    delete models_block;
+    //return sum_tree.getRate()->getRate(0);
+
+    /*
+    	rate_info.push_back(sum_tree.getRate()->getRate(0));
+
+        double rate_mat[aln.num_states*aln.num_states];
+        memset(rate_mat, 0, aln.num_states*aln.num_states*sizeof(double));
+        sum_tree.getModel()->getRateMatrix(rate_mat);
+        rate_info.insert(rate_info.end(), rate_mat, rate_mat+sum_tree.getModel()->getNumRateEntries());
+
+    	if (tree.getModel()->isReversible()) {
+    		sum_tree.getModel()->getStateFrequency(rate_mat);
+    		rate_info.insert(rate_info.end(), rate_mat, rate_mat+aln.num_states);
+        }
+
+    	if (report_file) {
+    		DoubleMatrix tmp(1);
+    		tmp[0] = rate_info;
+    		reportNGSAnalysis(report_file, params, sum_aln, sum_tree, tmp, rate_name);
+    	}*/
+}
+
+/*
+
+void testSingleRateModel(Params &params, NGSAlignment &aln, NGSTree &tree, string model, int *sum_freq) {
+	char model_name[20];
+
+	NGSAlignment sum_aln(aln.num_states, 1, sum_freq);
+
+	NGSTree sum_tree(params, &sum_aln);
+	sum_aln.tree = &sum_tree;
+
+	if (model == "")
+		sprintf(model_name, "GTR+F1");
+	else
+		sprintf(model_name, "%s+F1", model.c_str());
+	params.model_name = model_name;
+    sum_tree.setModelFactory(new ModelFactory(params, &sum_tree));
+    sum_tree.setModel(sum_tree.getModelFactory()->model);
+    sum_tree.setRate(sum_tree.getModelFactory()->site_rate);
+
+    double bestTreeScore = sum_tree.getModelFactory()->optimizeParameters(false, false);
+    cout << "Log-likelihood of null model: " << bestTreeScore << endl;
+    cout << "Rate (or distance) of null model: " << sum_tree.getRate()->getRate(0) << endl;
+    double lh_diff = 2*(tree.computeLikelihood() - bestTreeScore);
+    cout << "2(lnL1 - lnL0) = " << lh_diff << endl;
+    cout << "p-value (chi-square test, df = " << aln.ncategory-1 << "): " << computePValueChiSquare(lh_diff, aln.ncategory-1) << endl;
+
+	string out_file = params.out_prefix;
+	out_file += ".ngs_e";
+	DoubleVector tmp;
+	reportNGSAnalysis(out_file.c_str(), params, sum_aln, sum_tree, tmp);
+
+}*/
+
+/**
+    Write two files: the per-read distance table to file_name and the
+    position-specific pair counts to "<params.ngs_mapped_reads>.freq".
+    Both file names are announced on cout.
+    @param file_name path of the distance table
+    @param params supplies ngs_mapped_reads for the name of the count file
+    @param ngs_reads parsed reads and their count tables
+*/
+void reportNGSReads(const char *file_name, Params &params, NGSReadSet &ngs_reads)
+{
+    ofstream out(file_name);
+    out.setf(ios::fixed,ios::floatfield);
+    out << "Read\tHamm_dist\tHomo_dist\tHete_dist\tHomo_logl\tHete_logl" << endl;
+    for (int i = 0; i < ngs_reads.size(); i++) {
+        // the Hamming distance is printed as 1 - identity
+        out << ngs_reads[i].id << '\t' << 1.0 - ngs_reads[i].identity;
+        out << '\t' << ngs_reads[i].homo_distance << '\t' << ngs_reads[i].distance;
+        out << '\t' << ngs_reads[i].homo_logl << '\t' << ngs_reads[i].logl << endl;
+    }
+    out.close();
+    cout << endl << "Read distances to the reference written to: " << file_name << endl << endl;
+
+    // the same stream object is reused for the second file
+    string count_file = params.ngs_mapped_reads;
+    count_file += ".freq";
+    out.open(count_file.c_str());
+    ngs_reads.writeFreqMatrix(out);
+    out.close();
+    cout << "Position-specific pair counts written to: " << count_file << endl << endl;
+}
+
+/**
+    Parse the mapped-read file named in params.ngs_mapped_reads and write the
+    resulting reports through reportNGSReads ("<file>.dist" and, from there,
+    "<file>.freq").
+    @param params supplies ngs_mapped_reads and ngs_ignore_gaps
+    @param tree tree handed to the read set; may be NULL
+    @param homo_rate homogeneous rate handed to the read set
+*/
+void computePairCount(Params &params, NGSTree *tree, double homo_rate) {
+    NGSReadSet ngs_reads;
+    ngs_reads.ngs_ignore_gaps = params.ngs_ignore_gaps;
+    ngs_reads.homo_rate = homo_rate;
+    ngs_reads.tree = tree;
+
+    cout << "Computing read distances to reference from file " << params.ngs_mapped_reads << " ... " << endl;
+    ngs_reads.parseNextGen(params.ngs_mapped_reads);
+    ngs_reads.writeInfo();
+
+    string dist_file = params.ngs_mapped_reads;
+    dist_file += ".dist";
+    reportNGSReads(dist_file.c_str(), params, ngs_reads);
+}
+
+
+
+/**
+    Entry point of the NGS analysis.
+    Without params.ngs_file only the read file is processed (computePairCount
+    with no tree). Otherwise the steps are:
+      1. load the alignment and fit "<model>+F<ncategory>";
+      2. fit a single-rate model to every position separately;
+      3. fit the equal-rate null model to the summed counts (report: <prefix>.ngs_e);
+      4. fit the two-rate model to the summed counts;
+      5. write the main report (<prefix>.ngs);
+      6. if params.ngs_mapped_reads is set, process the reads with the null-model rate.
+    @param params program parameters; model_name is overwritten and freq_type
+           is set to FREQ_ESTIMATE when no model name was given
+*/
+void runNGSAnalysis(Params &params) {
+
+    time_t begin_time;
+    time(&begin_time);
+
+    if (!params.ngs_file) {
+        computePairCount(params, NULL, 0.0);
+        return;
+    }
+
+    // read input file, initialize NGSAlignment
+    NGSAlignment aln(params.ngs_file);
+    cout.setf(ios::fixed,ios::floatfield);
+
+    //params.freq_type = FREQ_ESTIMATE;
+
+    // initialize NGSTree
+    NGSTree tree(params, &aln);
+    aln.tree = &tree;
+    ModelsBlock *models_block = new ModelsBlock;
+
+    // initialize Model
+    string original_model = params.model_name;
+    // Compose "<model>+F<ncategory>" in a stream: formatting into a fixed
+    // char[20] with sprintf overflowed for long model names.
+    stringstream model_name;
+    if (params.model_name == "") {
+        model_name << "GTR";
+        params.freq_type = FREQ_ESTIMATE;
+    }
+    else
+        model_name << params.model_name;
+    model_name << "+F" << aln.ncategory;
+    params.model_name = model_name.str();
+    tree.setModelFactory(new ModelFactory(params, &tree, models_block));
+    tree.setModel(tree.getModelFactory()->model);
+    tree.setRate(tree.getModelFactory()->site_rate);
+
+    delete models_block;
+
+    int model_df = tree.getModel()->getNDim() + tree.getRate()->getNDim();
+    cout << endl;
+    cout << "Model of evolution: " << tree.getModelName() << " (" << model_df << " free parameters)" << endl;
+    cout << endl;
+
+    // optimize model parameters and rate scaling factors
+    cout << "Optimizing model parameters" << endl;
+    double bestTreeScore = tree.getModelFactory()->optimizeParameters(false, true);
+    cout << "Log-likelihood: " << bestTreeScore << endl;
+
+
+    DoubleMatrix part_rate(aln.ncategory);
+    StrVector rate_name;
+
+
+    int i, j;
+
+    // column names for the values testSingleRateModel appends per position
+    rate_name.push_back("Hete_error");
+
+    if (tree.getModel()->isReversible()) {
+        // one column per unordered state pair, then one per state
+        for (i = 0; i < aln.num_states-1; i++)
+            for (j = i+1; j < aln.num_states; j++) {
+                stringstream x;
+                x << aln.convertStateBackStr(i) << "<->" << aln.convertStateBackStr(j);
+                rate_name.push_back(x.str());
+            }
+        for (i = 0; i < aln.num_states; i++) {
+            stringstream x;
+            x << aln.convertStateBackStr(i);
+            rate_name.push_back(x.str());
+        }
+    } else {
+        // one column per ordered state pair
+        for (i = 0; i < aln.num_states; i++)
+            for (j = 0; j < aln.num_states; j++) if (j != i) {
+                    stringstream x;
+                    x << aln.convertStateBackStr(i) << "->" << aln.convertStateBackStr(j);
+                    rate_name.push_back(x.str());
+                }
+    }
+
+
+    // silence the per-position fits
+    VerboseMode vb_saved = verbose_mode;
+    verbose_mode = VB_QUIET;
+
+    cout << endl << "--> INFERING RATE ASSUMING POSITION-SPECIFIC MODEL..." << endl << endl;
+    for (int pos = 0; pos < aln.ncategory; pos++) {
+        cout << "Position " << pos+1 << " / ";
+        // pair-frequency table of this position inside the alignment's flat array
+        double *pair_pos = aln.pair_freq + (pos*aln.num_states*aln.num_states);
+        testSingleRateModel(params, aln, tree, original_model, pair_pos, part_rate[pos], rate_name, false, NULL);
+    }
+
+
+    verbose_mode = vb_saved;
+
+    double *sum_freq = new double[aln.num_states*aln.num_states];
+    cout << endl << "-->INFERING RATE UNDER EQUAL-RATE NULL MODEL..." << endl << endl;
+    aln.computeSumPairFreq(sum_freq);
+    DoubleVector null_rate;
+    string out_file = params.out_prefix;
+    out_file += ".ngs_e";
+    for (i = 0; i < aln.num_states*aln.num_states; i++)
+        cout << sum_freq[i] << " ";
+    cout << endl;
+    testSingleRateModel(params, aln, tree, original_model, sum_freq, null_rate, rate_name, true, out_file.c_str());
+
+    DoubleVector two_rate;
+
+    cout << endl << "-->INFERING RATE UNDER TWO-RATE MODEL..." << endl << endl;
+    testTwoRateModel(params, aln, tree, original_model, sum_freq, two_rate, rate_name, true, NULL);
+
+
+    // report running results
+    out_file = params.out_prefix;
+    out_file += ".ngs";
+    reportNGSAnalysis(out_file.c_str(), params, aln, tree, part_rate, rate_name);
+
+    if (params.ngs_mapped_reads) {
+        // the first null-model value is the fitted rate (or the fallback fill value)
+        computePairCount(params, &tree, null_rate[0]);
+    }
+
+
+    time_t end_time;
+    time(&end_time);
+
+    cout << "Total run time: " << difftime(end_time, begin_time) << " seconds" << endl << endl;
+    delete [] sum_freq;
+}
diff --git a/ngs.h b/ngs.h
new file mode 100644
index 0000000..458732b
--- /dev/null
+++ b/ngs.h
@@ -0,0 +1,430 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef NGS_H
+#define NGS_H
+
+#include "phylotree.h"
+#include "alignmentpairwise.h"
+#include "model/ratemeyerdiscrete.h"
+
+/*
+	collection of classes for Next-generation sequencing 
+*/
+
+/**
+Pairwise alignment specialised for next-generation sequencing (NGS) data.
+Extends AlignmentPairwise and adds a category count (see ncategory).
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class NGSAlignment : public AlignmentPairwise
+{
+public:
+
+	/**
+		constructors: from a tree, or (overload below) from a file
+		@param filename file in Fritz's format (const char* overload only)
+	*/
+	NGSAlignment(PhyloTree *atree);
+
+    NGSAlignment(const char *filename);
+
+	/**
+		constructor
+		@param nstate number of states
+		@param ncat number of categories
+		@param freq pair-state frequencies for all categories
+	*/
+	NGSAlignment(int nstate, int ncat, double *freq);
+
+	NGSAlignment(int nstate, string &seq1, string &seq2);
+
+	virtual char convertState(char state, SeqType seq_type);
+
+
+	/**
+		read file in Fritz's format
+	*/
+	void readFritzFile(const char *filename);
+
+	/**
+		compute empirical state frequencies from the alignment
+		@param state_freq (OUT) is filled with state frequencies, assuming state_freq was allocated with
+			at least num_states entries.
+	*/
+	virtual void computeStateFreq(double *state_freq, size_t num_unknown_states = 0);
+
+	/**
+		compute the sum of pair state frequencies over all categories
+		@param sum_pair_freq (OUT) will be filled in with num_states*num_states entries.
+			Memory has to be allocated before calling this function.
+	*/
+	void computeSumPairFreq (double *sum_pair_freq);
+
+	/**
+		compute empirical rates between state pairs
+		@param rates (OUT) vector of size num_states*(num_states-1)/2 for the rates
+	*/
+	virtual void computeEmpiricalRate (double *rates);
+
+	/**
+		compute the empirical distance for a category, used to initialize rate scaling factor
+		@param cat specific category, between 0 and ncategory-1
+	*/
+	double computeEmpiricalDist(int cat);
+
+	/**
+		negative likelihood function for a category with a rate scaling factor
+		@param cat specific category, between 0 and ncategory-1
+		@param value a rate scaling factor
+		@return negative log-likelihood (for minimization purpose)
+	*/
+	double computeFunctionCat(int cat, double value);
+
+	/**
+		1st and 2nd derivative of the negative likelihood for a category with a rate scaling factor
+		@param cat specific category, between 0 and ncategory-1
+		@param value a rate scaling factor
+		@param df (OUT) 1st derivative
+		@param ddf (OUT) 2nd derivative
+		NOTE: declared void - nothing is returned; results are delivered through df and ddf
+	*/
+	void computeFuncDervCat(int cat, double value, double &df, double &ddf);
+
+	/**
+		number of categories
+	*/
+	int ncategory;
+
+	//double *pair_freq;
+};
+
+
+class NGSTree : public PhyloTree { // PhyloTree variant constructed from an NGSAlignment
+
+public:
+
+    /**
+     * Constructor with given alignment
+     * @param params program parameters
+     * @param alignment the NGS alignment handed to this tree
+     */
+	NGSTree(Params &params, NGSAlignment *alignment);	
+
+    /**
+            compute the tree likelihood
+            @param pattern_lh (OUT) if not NULL, the function will assign pattern log-likelihoods to this vector
+                            assuming pattern_lh has the size of the number of patterns
+            @return tree likelihood
+     */
+    virtual double computeLikelihood(double *pattern_lh = NULL);
+
+    /**
+            optimize all branch lengths of the tree (tolerance and maxNRStep are not described here)
+            @param my_iterations number of iterations to loop through all branches
+            @return the likelihood of the tree
+     */
+    virtual double optimizeAllBranches(int my_iterations = 100, double tolerance = TOL_LIKELIHOOD, int maxNRStep = 100);
+
+};
+
+class NGSRate : public RateMeyerDiscrete {
+public:
+
+	/**
+		@param tree must be NGSTree type
+	*/
+	NGSRate(PhyloTree *tree);
+
+	/**
+		get rate category of a specified site-pattern.
+		@param ptn pattern ID (ignored by this override)
+		@return always 0 in this class
+	*/
+	virtual int getPtnCat(int ptn) { return 0; }
+
+	/**
+		optimize rates of all site-patterns
+		(original note: categorized rates from the "continuous" Meyer & von Haeseler rates via k-means/k-means++;
+		NOTE(review): possibly wording inherited from the base class - confirm it applies to this override)
+	*/
+	virtual double optimizeParameters(double epsilon);
+
+
+	/**
+		This function is inherited from Optimization class for optimizing site rates
+		@param value x-value of the function
+		@return f(value) of function f you want to minimize
+	*/
+	virtual double computeFunction(double value);
+
+	/**
+		This function calculates the first derivative f'(value) and 2nd derivative f''(value).
+		@param value x-value of the function
+		@param df (OUT) first derivative
+		@param ddf (OUT) second derivative
+		NOTE: declared void - f(value) itself is not returned
+	*/
+	virtual void computeFuncDerv(double value, double &df, double &ddf);
+
+	/**
+		classify rates into categories: a no-op in this class.
+		@param tree_lh the current tree log-likelihood, returned unchanged
+	*/
+	virtual double classifyRates(double tree_lh) { return tree_lh; }
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+};
+
+class NGSRateCat : public RateMeyerDiscrete {
+public:
+
+	/**
+		@param tree must be NGSTree type (ncat: presumably the number of categories - TODO confirm)
+	*/
+	NGSRateCat(PhyloTree *tree, int ncat);
+
+	/**
+		optimize rates of all site-patterns
+		(original note: categorized rates from the "continuous" Meyer & von Haeseler rates via k-means/k-means++;
+		NOTE(review): possibly wording inherited from the base class - confirm it applies to this override)
+	*/
+	virtual double optimizeParameters(double epsilon);
+
+
+	/**
+		return the number of dimensions
+	*/
+	virtual int getNDim();
+	
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]);
+
+	/**
+		write information
+		@param out output stream
+	*/
+	virtual void writeInfo(ostream &out);
+
+	/**
+		proportion of position categories (raw pointer; ownership is not visible in this header)
+	*/
+	double *proportion;
+
+	/**
+		sum of pair freq from all positions (raw pointer; ownership is not visible in this header)
+	*/
+	double *sum_pair_freq;
+
+protected:
+
+	/**
+		this function serves the multi-dimensional optimization. It should pack the model parameters
+		into a vector that is indexed from 1 (NOTE: not from 0)
+		@param variables (OUT) vector of variables, indexed from 1
+	*/
+	virtual void setVariables(double *variables);
+
+	/**
+		this function serves the multi-dimensional optimization. It should assign the model parameters
+		from a vector of variables that is indexed from 1 (NOTE: not from 0)
+		@param variables vector of variables, indexed from 1
+	*/
+	virtual void getVariables(double *variables);
+};
+
+
+class NGSTreeCat : public NGSTree { // NGSTree with its own computeLikelihood() override
+
+public:
+
+    /**
+     * Constructor with given alignment
+     * @param params program parameters
+     * @param alignment the NGS alignment handed to this tree
+     */
+	NGSTreeCat(Params &params, NGSAlignment *alignment);	
+    /**
+            compute the tree likelihood
+            @param pattern_lh (OUT) if not NULL, the function will assign pattern log-likelihoods to this vector
+                            assuming pattern_lh has the size of the number of patterns
+            @return tree likelihood
+     */
+    virtual double computeLikelihood(double *pattern_lh = NULL);
+};
+
+
+class NGSRead : public NGSAlignment {
+public:
+
+	/**
+		constructor
+	*/
+	NGSRead(PhyloTree *atree);
+
+	void init();
+
+	//int orig_length;
+
+	/**
+		alignment score
+	*/
+	int score; // do I need this???
+
+	/**
+		read ID
+	*/
+	int id;
+
+	//int scaff_id;
+
+	/**
+		matched position in the reference sequence
+	*/
+	int match_pos;
+
+	/**
+		TRUE for mapping forward strand, FALSE for backward
+	*/
+	bool direction;
+
+	/**
+		name of the reference sequence for which match found
+	*/
+	string chr;
+
+	/**
+		mapped portion of reference sequence
+	*/
+	string scaff;
+
+	/**
+		mapped portion of read
+	*/
+	string read;
+
+	/**
+		number of times that read is mapped (multiple optimal alignment scores)
+	*/
+	double times;
+
+	/**
+		TRUE if it is the first match, FALSE otherwise (in case of multiple hits)
+	*/
+	bool flag;
+
+	/**
+		alignment identity
+	*/
+	float identity;
+
+	/**
+		read name
+	*/
+	string name;
+
+	double homo_rate; // NOTE(review): undocumented; its meaning is not visible in this header
+
+	void computePairFreq();
+
+	/**
+		compute the likelihood for a distance between two sequences. Used for the ML optimization of the distance.
+		@param value x-value of the function
+		@return log-likelihood
+	*/
+	virtual double computeFunction(double value);
+
+
+	/**
+		This function calculates the first derivative f'(value) and 2nd derivative f''(value),
+		used by Newton raphson method to minimize the function.
+		@param value x-value of the function
+		@param df (OUT) first derivative
+		@param ddf (OUT) second derivative
+		NOTE: declared void - f(value) itself is not returned
+	*/
+	virtual void computeFuncDerv(double value, double &df, double &ddf);
+	
+};
+
+struct ReadInfo { // compact per-read record; the element type of NGSReadSet
+	int id; // presumably the read ID (cf. NGSRead::id) - TODO confirm
+	float identity; // presumably the alignment identity (cf. NGSRead::identity) - TODO confirm
+	float distance;
+	float logl;
+	float homo_distance;
+	float homo_logl;
+};
+
+
+class NGSReadSet : public vector<ReadInfo>  { // the set itself is the vector of accepted ReadInfo records
+public:
+
+	/**
+		read in file containing mapped reads to the reference
+		@param filename file name
+		@param ref_ID reference sequence name to accept reads
+		@param ident identity threshold to accept reads
+		@param mismatches number of exact mismatches to accept reads
+	*/
+	void parseNextGen(string filename, string ref_ID="total",double ident=0.0,int mismatches=-1);
+
+	/**
+		this function will be called every time a read is accepted by parseNextGen()
+		@param tempread read at current position while parsing
+	*/
+	virtual void processReadWhileParsing(NGSRead &tempread);
+
+	void writeFreqMatrix(ostream &out);
+
+	/**
+		write information
+	*/
+	void writeInfo();
+
+	PhyloTree *tree;
+
+	double homo_rate;
+
+	vector<double*> pair_freq; // raw pointers; who frees them is not visible in this header
+
+	vector<double*> state_freq; // raw pointers; who frees them is not visible in this header
+
+	bool ngs_ignore_gaps;
+
+};
+
+/**
+	Main function
+	@param params input program parameters
+*/
+void runNGSAnalysis(Params &params);
+
+#endif
diff --git a/node.cpp b/node.cpp
new file mode 100644
index 0000000..65cd70f
--- /dev/null
+++ b/node.cpp
@@ -0,0 +1,249 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "node.h"
+
+//#include <sys/time.h>
+//#include <time.h>
+#include <math.h>
+
+//#define INFINITY 1000000000
+
+/*********************************************
+        class Node
+ *********************************************/
+
+Node::Node(int aid) : id(aid), height(-1), highestNei(NULL) {
+    // The name is left empty. highestNei used to be left uninitialized by
+    // this constructor; it now starts as NULL and receives its real value
+    // from calcHeight().
+}
+
+Node::Node(int aid, int aname) : id(aid), height(-1), highestNei(NULL) {
+    // highestNei used to be left uninitialized; calcHeight() sets it later.
+    // The name is the decimal text of aname.
+    char str[20]; // enough for a 32-bit int with sign and terminating NUL
+    sprintf(str, "%d", aname);
+    name = str;
+}
+
+Node::Node(int aid, const char *aname) : id(aid), height(-1), highestNei(NULL) {
+    // A NULL aname leaves the name empty. highestNei was previously left
+    // uninitialized; it stays NULL until calcHeight() assigns it.
+    if (aname)
+        name = aname;
+}
+
+bool Node::isLeaf() {
+    return neighbors.size() < 2; // zero or one incident branch
+}
+
+/**
+    A leaf is "in a cherry" when its single neighbor is a cherry node.
+    @return true only for a leaf whose neighbor satisfies isCherry(); a node
+        without any neighbor (which isLeaf() also accepts) yields false
+        instead of indexing an empty neighbor list
+ */
+bool Node::isInCherry() {
+    if (!isLeaf() || neighbors.empty())
+        return false;
+    return neighbors[0]->node->isCherry();
+}
+
+bool Node::isCherry() {
+    int leaf_count = 0;
+    for (size_t i = 0; i < neighbors.size(); i++)
+        if (neighbors[i]->node->isLeaf()) ++leaf_count;
+    return leaf_count >= 2; // cherry: adjacent to at least two leaves
+}
+
+int Node::degree() {
+    return (int) neighbors.size(); // number of incident branches
+}
+
+/** compute the height of the subtree hanging below this node when it is
+        entered from dad, and remember the branch that realises it.
+        @param dad the node this one is entered from (NULL for the start node)
+        @return the leaf at the far end of the highest path; height and highestNei are updated.
+ */
+Node *Node::calcHeight(Node *dad) {
+    const bool non_root_leaf = isLeaf() && dad != NULL;
+    if (non_root_leaf) {
+        highestNei = NULL;
+        height = 0;
+        return this;
+    }
+    Node *farthest = NULL;
+    height = -INFINITY; // any child found below beats this starting value
+    for (size_t i = 0; i < neighbors.size(); i++) {
+        Neighbor *nei = neighbors[i];
+        if (nei->node == dad) continue;
+        Node *leaf = nei->node->calcHeight(this);
+        if (nei->node->height + nei->length > height) {
+            height = nei->node->height + nei->length;
+            highestNei = nei;
+            farthest = leaf;
+        }
+    }
+    return farthest;
+}
+
+/**
+    Distance, counted in branches, from this node to the leaf `partner`.
+    Meant for binary trees: only the first and the last non-dad neighbor of a node are visited.
+    @param partner the leaf to look for
+    @param dad node the walk arrived from (NULL for the starting node)
+    @param curLen branches walked so far
+    @return the branch count, or 0 when partner is not met on this walk
+ */
+int Node::calDist(Node* partner, Node* dad, int curLen) {
+    if (isLeaf()) {
+        // A leaf reached from its parent ends the walk. Returning curLen for the partner itself
+        // covers the two-node tree, where this code used to dereference a NULL child.
+        if (dad != NULL)
+            return (this == partner) ? curLen : 0;
+        // starting leaf: step onto its only neighbor, if it has one
+        if (neighbors.empty())
+            return 0;
+        return neighbors[0]->node->calDist(partner, this, 1);
+    }
+    // first and last neighbor other than dad; right stays NULL if there is only one
+    Node* left = NULL;
+    Node* right = NULL;
+    for (NeighborVec::iterator it = neighbors.begin(); it != neighbors.end(); it++) {
+        if ((*it)->node == dad)
+            continue;
+        if (left == NULL)
+            left = (*it)->node;
+        else
+            right = (*it)->node;
+    }
+    curLen++;
+    Node* children[2] = { left, right };
+    int sum = 0;
+    for (int i = 0; i < 2; i++) {
+        Node* child = children[i];
+        if (child == NULL)
+            continue; // missing child: previously a NULL dereference
+        if (!child->isLeaf())
+            sum += child->calDist(partner, this, curLen);
+        else if (child == partner)
+            return curLen;
+    }
+    return sum;
+}
+
+
+/**
+        longest path found with two farthest-leaf sweeps; must be called on a leaf (asserted)
+ */
+double Node::longestPath2(Node* &node1, Node* &node2) {
+    // step 1: find the farthest leaf from this node (as a leaf)
+    assert(isLeaf());
+    node1 = calcHeight();
+    // step 2: find the farthest leaf from node1
+    node2 = node1->calcHeight();
+    return node1->height; // height of node1 as set by the second calcHeight() call
+}
+
+Neighbor *Node::findNeighbor(Node *node) {
+    // linear scan over the incident branches
+    for (NeighborVec::iterator it = neighbors.begin(); it != neighbors.end(); ++it) {
+        if ((*it)->node == node)
+            return *it;
+    }
+    // not found: report, then stop in builds that keep assertions
+    cout << "ERROR : Could not find neighbors of node " << node->id << endl;
+    assert(0);
+    // only reached when assert() is compiled out
+    return NULL;
+}
+
+bool Node::isNeighbor(Node* node) {
+    NeighborVec::iterator it = neighbors.begin();
+    while (it != neighbors.end() && (*it)->node != node)
+        ++it;
+    return it != neighbors.end(); // true iff some branch leads to `node`
+}
+
+NeighborVec::iterator Node::findNeighborIt(Node *node) {
+    NeighborVec::iterator pos = neighbors.begin();
+    while (pos != neighbors.end() && (*pos)->node != node) ++pos;
+    if (pos == neighbors.end())
+        assert(0); // no branch leads to `node`
+    return pos;
+}
+
+void Node::addNeighbor(Node *node, double length, int id) {
+    neighbors.insert(neighbors.end(), new Neighbor(node, length, id)); // append a freshly allocated branch end
+}
+
+void Node::updateNeighbor(NeighborVec::iterator nei_it, Neighbor *newnei) { // overwrite the slot nei_it points at
+    assert(nei_it != neighbors.end());
+    *nei_it = newnei; // the Neighbor previously stored in the slot is not deleted here
+}
+
+void Node::updateNeighbor(NeighborVec::iterator nei_it, Neighbor *newnei, double newlen) { // same, and set the length
+    assert(nei_it != neighbors.end());
+    *nei_it = newnei; // the Neighbor previously stored in the slot is not deleted here
+    newnei->length = newlen; // the length is written into newnei itself
+}
+
+void Node::updateNeighbor(Node *node, Neighbor *newnei) { // replace the entry whose far end is `node`
+    NeighborVec::iterator nei_it = findNeighborIt(node); // findNeighborIt() asserts when nothing matches
+    assert(nei_it != neighbors.end());
+    *nei_it = newnei; // the Neighbor previously stored in the slot is not deleted here
+}
+
+void Node::updateNeighbor(Node *node, Neighbor *newnei, double newlen) { // same, and set the length
+    NeighborVec::iterator nei_it = findNeighborIt(node); // findNeighborIt() asserts when nothing matches
+    assert(nei_it != neighbors.end());
+    *nei_it = newnei; // the Neighbor previously stored in the slot is not deleted here
+    newnei->length = newlen; // the length is written into newnei itself
+}
+
+void Node::updateNeighbor(Node* node, Node *newnode, double newlen) {
+    for (size_t i = 0; i < neighbors.size(); i++) {
+        if (neighbors[i]->node != node) continue;
+        neighbors[i]->node = newnode;
+        neighbors[i]->length = newlen;
+        return; // only the first matching branch is changed; no match means no change
+    }
+}
+
+double Node::updateNeighbor(Node* node, Node *newnode) {
+    for (size_t i = 0; i < neighbors.size(); i++) {
+        if (neighbors[i]->node != node) continue;
+        neighbors[i]->node = newnode;
+        return neighbors[i]->length; // the length itself is left untouched
+    }
+    return -1; // no branch leads to `node`
+}
+
+void Node::deleteNode() {
+    // free the Neighbor objects back to front, then empty the list
+    for (size_t i = neighbors.size(); i > 0; i--)
+        delete neighbors[i - 1];
+    neighbors.clear();
+}
+
+Node::~Node() { // releases the Neighbor objects held in `neighbors`
+    deleteNode(); // inside a destructor this always runs Node::deleteNode(), never an override
+}
+
diff --git a/node.h b/node.h
new file mode 100644
index 0000000..b4e6500
--- /dev/null
+++ b/node.h
@@ -0,0 +1,416 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef NODE_H
+#define NODE_H
+
+#include <vector>
+#include <string>
+#include <set>
+#include <map>
+#include <iostream>
+#include <fstream>
+#include <stdio.h>
+#include <stdlib.h>
+
+//#include <sys/time.h>
+//#include <time.h>
+#include <math.h>
+#include "ncl/ncl.h"
+
+#include "tools.h"
+
+using namespace std;
+
+/*--------------------------------------------------------------*/
+
+class Node;
+
+/**
+    One end of a branch as seen from a node: it names the node on the
+    other side together with the branch length and the branch ID.
+    Node keeps these objects by pointer in its neighbor list.
+ */
+class Neighbor {
+public:
+
+    /**
+        node at the far end of the branch
+     */
+    Node *node;
+
+    /**
+        length of the branch
+     */
+    double length;
+
+    /**
+        ID of the branch; -1 is what the two-argument
+        constructor stores
+     */
+    int id;
+
+    /**
+        build a neighbor without a branch ID (id becomes -1)
+        @param anode node at the far end of the branch
+        @param alength branch length
+     */
+    Neighbor(Node *anode, double alength)
+        : node(anode), length(alength), id(-1) {
+    }
+
+    /**
+        build a neighbor with an explicit branch ID
+        @param anode node at the far end of the branch
+        @param alength branch length
+        @param aid branch ID
+     */
+    Neighbor(Node *anode, double alength, int aid)
+        : node(anode), length(alength), id(aid) {
+    }
+
+    /**
+        copy node, length and id from an existing neighbor
+        (takes a pointer: this is not a C++ copy constructor)
+        @param nei neighbor to copy from; must not be NULL,
+            it is dereferenced
+     */
+    Neighbor(Neighbor *nei)
+        : node(nei->node), length(nei->length), id(nei->id) {
+    }
+
+    /**
+        virtual destructor, so that deleting through a Neighbor
+        pointer also runs the destructor of a derived type
+     */
+    virtual ~Neighbor() {
+    }
+};
+
+/**
+    Neighbor vector
+ */
+typedef vector<Neighbor*> NeighborVec;
+
+/**
+    Node vector
+ */
+typedef vector<Node*> NodeVector;
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+A Node in the tree
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+ */
+class Node {
+public:
+    /**
+        node id.
+     */
+    int id;
+
+    /**
+        node name
+     */
+    string name;
+
+    /**
+        list of neighbors
+     */
+    NeighborVec neighbors;
+
+    /**
+        the height of subtree rooted at this node, used for greedy algorithm
+     */
+    double height;
+
+    /**
+        child of maximal height of subtree rooted at this node, used for greedy algorithm
+     */
+    Neighbor *highestNei;
+
+
+    /**
+     *      List of closest leaves to the current node.
+     */
+    NodeVector closestLeaves;
+
+    /**
+        default constructor: id and height start at -1, highestNei at NULL
+     */
+    Node() : id(-1), height(-1), highestNei(NULL) {
+        // highestNei used to be left uninitialized; calcHeight() assigns
+        // its real value later
+    };
+
+
+    /**
+        constructor
+        @param aid id of this node
+     */
+    Node(int aid);
+
+    /**
+        constructor
+        @param aid id of this node
+        @param aname name of this node
+     */
+    Node(int aid, int aname);
+
+    /**
+        constructor
+        @param aid id of this node
+        @param aname name of this node
+     */
+    Node(int aid, const char *aname);
+
+    /**
+        destructor
+     */
+    virtual ~Node();
+
+    /**
+        used for the destructor
+     */
+    virtual void deleteNode();
+
+
+    /**
+        @return true if this is a leaf
+     */
+    bool isLeaf();
+
+    /**
+     *  @return TRUE if this node is a leaf in a cherry
+     */
+    bool isInCherry();
+
+    /**
+        @return TRUE if this node is a cherry, FALSE otherwise
+     */
+    bool isCherry();
+
+    /**
+        @return the number of adjacent nodes
+     */
+    int degree();
+
+    /** calculate the height of the subtree rooted at this node,
+        given the dad. Also return the lowestLeaf.
+        @param dad the dad of this node
+        @return the leaf at the lowest level. Also modify the height, highestNei of this class.
+     */
+    Node *calcHeight(Node *dad = NULL);
+
+
+    /**
+     * Calculate the distance between 2 nodes. Only for binary tree.
+     * @param parner the other node
+     * @return the distance
+     */
+    int calDist(Node *parner, Node *dad = NULL, int curLen = 0);
+
+    /** calculate the longest path in the subtree (version 2: more efficient)
+        @param node1 the returned node1 of the one end of the path
+        @param node2 the returned node2 of the other end of the path
+        @return the length of the longest path
+     */
+    double longestPath2(Node* &node1, Node* &node2);
+
+    /**
+        @param node the target node
+        @return the neighbor that has the node; a missing one is treated as a programming error (see the definition)
+     */
+    Neighbor *findNeighbor(Node *node);
+
+    /**
+     * @brief check whether the two nodes are neighbors
+     * @param[in] node the other node
+     */
+    bool isNeighbor(Node *node);
+
+    /**
+        @param node the target node
+        @return the iterator to the neighbor that has the node; a missing one is treated as a programming error
+     */
+    NeighborVec::iterator findNeighborIt(Node *node);
+
+    /**
+        update the neighbor node with the newnode
+        @param node old neighbor node
+        @param newnode new neighbor node
+        @param newlen new length applied for the corresponding branch
+     */
+    void updateNeighbor(Node* node, Node *newnode, double newlen);
+
+    /**
+        update the neighbor node with the newnode
+        @param node old neighbor node
+        @param newnode new neighbor node
+        @return length applied for the corresponding branch
+     */
+    double updateNeighbor(Node* node, Node *newnode);
+
+    /**
+        update the neighbor node with the newnode
+        @param nei_it iterator to the neighbor
+        @param newnei new neighbor
+     */
+    void updateNeighbor(NeighborVec::iterator nei_it, Neighbor *newnei);
+
+    /**
+        update the neighbor node with the newnode
+        @param nei_it iterator to the neighbor
+        @param newnei new neighbor
+        @param newlen new branch length
+     */
+    void updateNeighbor(NeighborVec::iterator nei_it, Neighbor *newnei, double newlen);
+
+    /**
+        update the neighbor node with the newnode
+        @param node old neighbor node
+        @param newnei new neighbor
+     */
+    void updateNeighbor(Node *node, Neighbor *newnei);
+
+    /**
+        update the neighbor node with the newnode
+        @param node old neighbor node
+        @param newnei new neighbor
+        @param newlen new branch length
+     */
+    void updateNeighbor(Node *node, Neighbor *newnei, double newlen);
+
+    /**
+        add a neighbor
+        @param node the neighbor node
+        @param length branch length
+        @param id branch ID
+     */
+    virtual void addNeighbor(Node *node, double length, int id = -1);
+};
+/*
+class Branch {
+public:
+    Node* node1;
+
+    Node* node2;
+
+    string key;
+
+    Branch(Node* node1, Node* node2) {
+        assert(node1->isNeighbor(node2));
+        assert(node2->isNeighbor(node1));
+
+        if (node1->id < node2->id) {
+            this->node1 = node1;
+            this->node2 = node2;
+        } else {
+            this->node1 = node2;
+            this->node2 = node1;
+        }
+
+        key = convertIntToString(this->node1->id) + convertIntToString(this->node2->id);
+    }
+
+    inline string getKey() {
+        return key;
+    }
+};
+*/
+
+/*
+    macros to traverse the neighbors of mynode, skipping mydad; each expands to a for plus a bare `if`, so an `else` placed after the body binds to that `if`
+ */
+#define FOR_NEIGHBOR(mynode, mydad, it) \
+	for (it = (mynode)->neighbors.begin(); it != (mynode)->neighbors.end(); it++) \
+		if ((*it)->node != (mydad))
+
+#define FOR_NEIGHBOR_IT(mynode, mydad, it) \
+	for (NeighborVec::iterator it = (mynode)->neighbors.begin(); it != (mynode)->neighbors.end(); it++) \
+		if ((*it)->node != (mydad))
+
+#define FOR_NEIGHBOR_DECLARE(mynode, mydad, it) \
+	NeighborVec::iterator it; \
+	for (it = (mynode)->neighbors.begin(); it != (mynode)->neighbors.end(); it++) \
+		if ((*it)->node != (mydad))
+
+
+
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+    orders nodes by the length of their first incident branch (pruning algorithm)
+ */
+struct nodecmp {
+
+    /**
+        @return true if the first branch of s1 is strictly shorter than that of s2
+     */
+    bool operator()(const Node* s1, const Node* s2) const {
+        return (s2->neighbors[0]->length) > (s1->neighbors[0]->length);
+    }
+};
+
+inline int nodenamecmp(const Node* a, const Node* b) {
+    return (b->name > a->name); // 1 when a's name sorts before b's, else 0
+}
+
+/**
+    set of leaves, sorted in ascending order by the length of the incident branch.
+    For pruning algorithm
+ */
+typedef multiset<Node*, nodecmp> LeafSet;
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+    map from leaf name to node class
+ */
+typedef map<const string, Node*> LeafMapName;
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+    orders branches by (branch length + height of the far node), largest first (greedy algorithm)
+ */
+struct neighborcmp {
+
+    /**
+        @return true if s1's length plus far-node height is strictly greater than s2's
+     */
+    bool operator()(const Neighbor* s1, const Neighbor* s2) const {
+        return ((s2->length + s2->node->height) < (s1->length + s1->node->height));
+    }
+};
+
+/**
+    set of branches, sorted in descending order by the height of the corresponding subtree.
+    For greedy algorithm.
+ */
+typedef multiset<Neighbor*, neighborcmp> NeighborSet;
+
+
+#endif
diff --git a/optimization.cpp b/optimization.cpp
new file mode 100644
index 0000000..40fd5cb
--- /dev/null
+++ b/optimization.cpp
@@ -0,0 +1,1083 @@
+//
+// C++ Implementation: optimization
+//
+// Description: 
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#include "optimization.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include "lbfgsb/lbfgsb_new.h"
+#include "tools.h"
+
+
+using namespace std;
+
+const double ERROR_X = 1.0e-4;
+
+double ran1(long *idum);
+double *new_vector(long nl, long nh);
+void free_vector(double *v, long nl, long nh);
+double **new_matrix(long nrl, long nrh, long ncl, long nch);
+void free_matrix(double **m, long nrl, long nrh, long ncl, long nch);
+void fixBound(double x[], double lower[], double upper[], int n);
+
+#define NR_END 1
+#define FREE_ARG char*
+
+#define GET_PSUM \
+					for (n=1;n<=ndim;n++) {\
+					for (sum=0.0,m=1;m<=mpts;m++) sum += p[m][n];\
+					psum[n]=sum;}
+
+
+#define IA 16807
+#define IM 2147483647
+#define AM (1.0/IM)
+#define IQ 127773
+#define IR 2836
+#define NTAB 32
+#define NDIV (1+(IM-1)/NTAB)
+#define EPS 1.2e-7
+#define RNMX (1.0-EPS)
+
+/**
+	Uniform pseudo-random number generator that keeps its state in the static
+	variables iy and iv[], so all callers share one sequence.
+	NOTE(review): constants and structure look like ran1 from Numerical Recipes -- confirm.
+	@param idum (IN/OUT) generator state, overwritten on every call.
+		The table iv[] is (re)filled when *idum <= 0 or while iy is still 0.
+		In that case a negative *idum is replaced by its negation and any other
+		value (zero or positive) is replaced by 1 before the table is filled.
+	@return AM*iy, capped at RNMX
+*/
+double ran1(long *idum) {
+	int j;
+	long k;
+	static long iy=0;
+	static long iv[NTAB];
+	double temp;
+
+	if (*idum <= 0 || !iy) {
+		if (-(*idum) < 1) *idum=1;
+		else *idum = -(*idum);
+		// the first 8 steps (j >= NTAB) are discarded, the remaining NTAB fill iv[] from the top down
+		for (j=NTAB+7;j>=0;j--) {
+			// (IA * idum) mod IM, evaluated through IQ = IM/IA and IR = IM%IA
+			k=(*idum)/IQ;
+			*idum=IA*(*idum-k*IQ)-IR*k;
+			if (*idum < 0) *idum += IM;
+			if (j < NTAB) iv[j] = *idum;
+		}
+		iy=iv[0];
+	}
+	// advance the generator by one step
+	k=(*idum)/IQ;
+	*idum=IA*(*idum-k*IQ)-IR*k;
+	if (*idum < 0) *idum += IM;
+	// the previous output selects the table slot: that slot is emitted and refilled
+	j=iy/NDIV;
+	iy=iv[j];
+	iv[j] = *idum;
+	if ((temp=AM*iy) > RNMX) return RNMX;
+	else return temp;
+}
+#undef IA
+#undef IM
+#undef AM
+#undef IQ
+#undef IR
+#undef NTAB
+#undef NDIV
+#undef EPS
+#undef RNMX
+
+
+long idum = 123456;
+double tt;
+
+void nrerror(const char *error_text)
+/* Numerical Recipes standard error handler: prints the message to cerr, then aborts the process */
+{
+	cerr << "NUMERICAL ERROR: " << error_text << endl;
+	//exit(1);
+	// abort() does not return, so callers never continue past nrerror()
+	abort();
+}
+
+double *new_vector(long nl, long nh)
+/* allocate a double vector with subscript range v[nl..nh].
+   The elements are not initialised (plain malloc). The returned pointer is shifted
+   relative to the malloc'ed address, so it must be released with
+   free_vector(v, nl, nh) and never with free(). Allocation failure ends in nrerror(). */
+{
+	double *v;
+
+	/* NR_END extra element(s) are allocated in front of the vector */
+	v=(double *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(double)));
+	if (!v) nrerror("allocation failure in vector()");
+	/* shift so that v[nl] is the element NR_END past the start of the allocation */
+	return v-nl+NR_END;
+}
+
+double **new_matrix(long nrl, long nrh, long ncl, long nch)
+/* allocate a double matrix with subscript range m[nrl..nrh][ncl..nch].
+   Two malloc calls are made: one for the row pointers and one contiguous block
+   for all elements (not initialised). Both pointers are shifted, so the matrix
+   must be released with free_matrix(m, nrl, nrh, ncl, nch).
+   Allocation failure ends in nrerror(). */
+{
+	long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;
+	double **m;
+
+	/* allocate pointers to rows */
+	m=(double **) malloc((size_t)((nrow+NR_END)*sizeof(double*)));
+	if (!m) nrerror("allocation failure 1 in matrix()");
+	m += NR_END;
+	m -= nrl;
+
+	/* allocate rows and set pointers to them */
+	m[nrl]=(double *) malloc((size_t)((nrow*ncol+NR_END)*sizeof(double)));
+	if (!m[nrl]) nrerror("allocation failure 2 in matrix()");
+	m[nrl] += NR_END;
+	m[nrl] -= ncl;
+
+	/* every following row starts ncol elements after the previous one */
+	for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;
+
+	/* return pointer to array of pointers to rows */
+	return m;
+}
+
+
+void free_vector(double *v, long nl, long nh)
+/* free a double vector allocated with new_vector(); nl must be the lower
+   subscript given at allocation (it undoes the pointer shift), nh is not used */
+{
+	free((FREE_ARG) (v+nl-NR_END));
+}
+
+void free_matrix(double **m, long nrl, long nrh, long ncl, long nch)
+/* free a double matrix allocated by new_matrix(); nrl and ncl must be the lower
+   subscripts given at allocation (they undo the pointer shifts), nrh and nch are not used */
+{
+	/* element block first (reached through m[nrl]), then the row-pointer array */
+	free((FREE_ARG) (m[nrl]+ncl-NR_END));
+	free((FREE_ARG) (m+nrl-NR_END));
+}
+
+/**
+	Clamp a 1-based vector into its box constraints, in place.
+	@param x (IN/OUT) values; entries x[1..n] are clamped
+	@param lower lower bounds, entries 1..n
+	@param upper upper bounds, entries 1..n
+	@param n number of entries
+*/
+void fixBound(double x[], double lower[], double upper[], int n) {
+	int k = 0;
+	while (k < n) {
+		k++;
+		const double lo = lower[k];
+		if (x[k] < lo) {
+			x[k] = lo;
+			continue;
+		}
+		// the upper bound is only consulted when the lower bound did not apply
+		const double hi = upper[k];
+		if (x[k] > hi)
+			x[k] = hi;
+	}
+}
+
+/**********************************************
+	Optimization routines
+**********************************************/
+Optimization::Optimization()
+{
+}
+
+
+Optimization::~Optimization()
+{
+}
+
+/*****************************************************
+	One dimensional optimization with Brent method
+*****************************************************/
+	
+#define ITMAX 100
+#define CGOLD 0.3819660
+#define GOLD 1.618034
+#define GLIMIT 100.0
+#define TINY 1.0e-20
+#define ZEPS 1.0e-10
+#define SHFT(a,b,c,d) (a)=(b);(b)=(c);(c)=(d);
+#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
+
+/**
+	Brent's method in one dimension, started from three points that the caller
+	has already evaluated. Minimizes computeFunction().
+	@param ax one end of the search interval (ax and cx may come in either order)
+	@param bx starting point of the search
+	@param cx other end of the search interval
+	@param tol relative tolerance on x; the absolute step floor is tol*|x| + ZEPS
+	@param foptx (OUT) function value at the returned x
+	@param f2optx (OUT) second derivative of the parabola through the points
+		(x,fx), (w,fw), (v,fv) tracked by the iteration; there is no guard
+		against a zero denominator
+	@param fax function value at ax
+	@param fbx function value at bx
+	@param fcx function value at cx
+	@return the x at which the search stopped (after convergence or ITMAX iterations)
+*/
+double Optimization::brent_opt (double ax, double bx, double cx, double tol,
+                          double *foptx, double *f2optx, double fax, double fbx, double fcx) {
+	int iter;
+	double a,b,d=0,etemp,fu,fv,fw,fx,p,q,r,tol1,tol2,u,v,w,x,xm;
+	double xw,wv,vx;
+	double e=0.0;
+
+	/* [a,b] is the current interval, x the current best point */
+	a=(ax < cx ? ax : cx);
+	b=(ax > cx ? ax : cx);
+	x=bx;
+	fx=fbx;
+	/* w starts at the interval end with the smaller value, v at the other end */
+	if (fax < fcx) {
+		w=ax;
+		fw=fax;
+		v=cx;
+		fv=fcx;
+	} else {
+		w=cx;
+		fw=fcx;
+		v=ax;
+		fv=fax;
+	}
+
+	for (iter=1;iter<=ITMAX;iter++) {
+		xm=0.5*(a+b);
+		tol2=2.0*(tol1=tol*fabs(x)+ZEPS);
+		/* converged: the interval around x is small enough */
+		if (fabs(x-xm) <= (tol2-0.5*(b-a))) {
+			*foptx = fx;
+			xw = x-w;
+			wv = w-v;
+			vx = v-x;
+			*f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/
+			          (v*v*xw + x*x*wv + w*w*vx);
+			return x;
+		}
+
+		if (fabs(e) > tol1) {
+			/* try a parabolic step through x, w and v */
+			r=(x-w)*(fx-fv);
+			q=(x-v)*(fx-fw);
+			p=(x-v)*q-(x-w)*r;
+			q=2.0*(q-r);
+			if (q > 0.0)
+				p = -p;
+			q=fabs(q);
+			etemp=e;
+			e=d;
+			/* reject the parabolic step if it is too large or leaves (a,b): use a CGOLD-scaled step instead */
+			if (fabs(p) >= fabs(0.5*q*etemp) || p <= q*(a-x) || p >= q*(b-x))
+				d=CGOLD*(e=(x >= xm ? a-x : b-x));
+			else {
+				d=p/q;
+				u=x+d;
+				if (u-a < tol2 || b-u < tol2)
+					d=SIGN(tol1,xm-x);
+			}
+		} else {
+			d=CGOLD*(e=(x >= xm ? a-x : b-x));
+		}
+
+		/* never step by less than tol1 */
+		u=(fabs(d) >= tol1 ? x+d : x+SIGN(tol1,d));
+		fu=computeFunction(u);
+		if (fu <= fx) {
+			/* u is the new best point: shrink the interval and shift v <- w <- x <- u */
+			if (u >= x)
+				a=x;
+			else
+				b=x;
+
+			SHFT(v,w,x,u)
+			SHFT(fv,fw,fx,fu)
+		} else {
+			/* u is worse than x: it becomes an interval end and possibly replaces w or v */
+			if (u < x)
+				a=u;
+			else
+				b=u;
+			if (fu <= fw || w == x) {
+				v=w;
+				w=u;
+				fv=fw;
+				fw=fu;
+			} else
+				if (fu <= fv || v == x || v == w) {
+					v=u;
+					fv=fu;
+				}
+		}
+	}
+
+	/* ITMAX iterations used up: report the best point found so far without an error */
+	*foptx = fx;
+	xw = x-w;
+	wv = w-v;
+	vx = v-x;
+	*f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/(v*v*xw + x*x*wv + w*w*vx);
+
+	return x;
+}
+
+#undef ITMAX
+#undef CGOLD
+#undef ZEPS
+#undef SHFT
+#undef SIGN
+#undef GOLD
+#undef GLIMIT
+#undef TINY
+
+
+/**
+	Minimize computeFunction() on [xmin, xmax] with brent_opt().
+	A tight bracket of half-width |xguess|*tolerance*50 around the guess is tried
+	first; if it does not enclose a minimum, or has zero width, the whole
+	interval [xmin, xmax] is used instead.
+	@param xmin lower bound
+	@param xguess first guess, clamped into [xmin, xmax]
+	@param xmax upper bound
+	@param tolerance tolerance handed to brent_opt()
+	@param fx (OUT) function value at the returned x
+	@param f2x (OUT) second-derivative estimate written by brent_opt()
+	@return the minimizing x, or the (clamped) guess if the search ended at a worse value
+*/
+double Optimization::minimizeOneDimen(double xmin, double xguess, double xmax, double tolerance, double *fx, double *f2x) {
+	double eps, optx, ax, bx, cx, fa, fb, fc;
+
+	/* first attempt to bracketize minimum */
+	if (xguess < xmin) xguess = xmin;
+	if (xguess > xmax) xguess = xmax;
+	/* fabs() keeps ax <= bx <= cx for a negative guess, so that the two clamps
+	   below really keep both ends inside [xmin, xmax] */
+	eps = fabs(xguess)*tolerance*50.0;
+	ax = xguess - eps;
+	if (ax < xmin) ax = xmin;
+	bx = xguess;
+	cx = xguess + eps;
+	if (cx > xmax) cx = xmax;
+
+	/* check if this works */
+	fa = computeFunction(ax);
+	fb = computeFunction(bx);
+	fc = computeFunction(cx);
+
+	/* if it works use these borders else be conservative.
+	   A zero-width bracket (e.g. xguess == 0) would make brent_opt() stop at
+	   once without searching, so it is widened as well. */
+	if ((fa < fb) || (fc < fb) || (ax == cx)) {
+		if (ax != xmin) fa = computeFunction(xmin);
+		if (cx != xmax) fc = computeFunction(xmax);
+		ax = xmin;
+		cx = xmax;
+	}
+
+	optx = brent_opt(ax, bx, cx, tolerance, fx, f2x, fa, fb, fc);
+	if (*fx > fb) // if worse, return initial value
+	{
+		/* evaluated again so that the last computeFunction() call is made at the returned point */
+		*fx = computeFunction(bx);
+		return bx;
+	}
+
+	return optx; /* return optimal x */
+}
+
+/**
+	minimizeOneDimen() followed by an explicit look at both bounds: a bound
+	replaces the optimum if its function value is not worse by more than tolerance.
+	@param xmin lower bound
+	@param xguess first guess
+	@param xmax upper bound
+	@param tolerance tolerance for the search and for the comparison with the bounds
+	@param f (OUT) function value at the returned x
+	@return the selected x
+*/
+double Optimization::minimizeOneDimenSafeMode(double xmin, double xguess, double xmax, double tolerance, double *f)
+{
+	double second_deriv;	// required by minimizeOneDimen(), not used here
+	double best_x = minimizeOneDimen(xmin, xguess, xmax, tolerance, f, &second_deriv);
+
+	// upper bound first
+	if (best_x < xmax) {
+		const double f_upper = computeFunction(xmax);
+		if (f_upper <= *f+tolerance) {
+			best_x = xmax;
+			*f = f_upper;
+		}
+	}
+	// then the lower bound, compared against whatever was selected so far
+	if (best_x > xmin) {
+		const double f_lower = computeFunction(xmin);
+		if (f_lower <= *f+tolerance) {
+			best_x = xmin;
+			*f = f_lower;
+		}
+	}
+	return best_x;
+}
+
+/*****************************************************
+	One dimensional optimization with Newton Raphson 
+	only applicable if 1st and 2nd derivatives are easy to compute
+*****************************************************/
+
+
+/**
+	minimizeNewton() followed by an explicit look at both bounds: a bound
+	replaces the optimum if its function value is not worse by more than tolerance.
+	@param xmin lower bound
+	@param xguess first guess
+	@param xmax upper bound
+	@param tolerance tolerance for the search and for the comparison with the bounds
+	@param f (OUT) function value (computeFunction) at the returned x
+	@return the selected x
+*/
+double Optimization::minimizeNewtonSafeMode(double xmin, double xguess, double xmax, double tolerance, double &f)
+{
+	// the fifth argument of minimizeNewton() receives a derivative, not the
+	// function value, so it must not be used for the comparisons below
+	double d2l;
+	double optx = minimizeNewton(xmin, xguess, xmax, tolerance, d2l);
+	f = computeFunction(optx);
+	double fnew;
+	// check value at the boundary
+	if ((optx < xmax) && (fnew = computeFunction(xmax)) <= f+tolerance) {
+		optx = xmax;
+		f = fnew;
+	}
+	if ((optx > xmin) && (fnew = computeFunction(xmin)) <= f+tolerance) {
+		optx = xmin;
+		f = fnew;
+	}
+	return optx;
+}
+
+/**
+	Newton-Raphson iteration on the outputs of computeFuncDerv(), with a bisection
+	fallback inside the bracket [xl, xh].
+	Locals: f receives the first output of computeFuncDerv() (the first derivative
+	according to optimization.h) and df the second one (the second derivative).
+	@param x1 lower bound
+	@param xguess first guess, clamped into [x1, x2]
+	@param x2 upper bound
+	@param xacc tolerance, applied both to |f| and to the step size |dx|
+	@param d2l (OUT) the last df value that was computed
+	@param maxNRStep maximum number of iterations
+	@return the x at which the iteration stopped
+*/
+double Optimization::minimizeNewton(double x1, double xguess, double x2, double xacc, double &d2l, int maxNRStep)
+{
+	int j;
+	double df,dx,dxold,f;
+	double temp,xh,xl,rts, rts_old, xinit;
+
+	rts = xguess;
+	if (rts < x1) rts = x1;
+	if (rts > x2) rts = x2;
+	// xinit is only read by the commented-out code further down
+	xinit = xguess;
+//	finit = fold = fm = computeFuncDerv(rts,f,df);
+    computeFuncDerv(rts,f,df);
+	d2l = df;
+	if (!isfinite(f) || !isfinite(df)) {
+		nrerror("Wrong computeFuncDerv");
+	}
+	if (df >= 0.0 && fabs(f) < xacc) return rts;
+	// the sign of f decides on which side of rts the bracket is opened
+	if (f < 0.0) {
+		xl = rts;
+		xh = x2;
+	} else {
+		xh = rts;
+		xl = x1;	
+	}
+
+	dx=dxold=fabs(xh-xl);
+	for (j=1;j<=maxNRStep;j++) {
+		rts_old = rts;
+		if (
+			(df <= 0.0) // function is concave
+//			|| (fm > fold + xacc) // increasing
+			|| (((rts-xh)*df-f)*((rts-xl)*df-f) >= 0.0) // out of bound
+			//|| (fabs(2.0*f) > fabs(dxold*df))  // converge too slow
+			) {
+			// bisection step
+			dxold=dx;
+			dx=0.5*(xh-xl);
+			rts=xl+dx;
+            d2l = df;
+			if (xl == rts) return rts;
+		} else {
+			// Newton step
+			dxold=dx;
+			dx=f/df;
+			temp=rts;
+			rts -= dx;
+			d2l = df;
+			if (temp == rts) return rts;
+		}
+		// small step or last iteration: the point from before this step is
+		// returned, i.e. the last point at which f and df were evaluated
+		if (fabs(dx) < xacc || (j == maxNRStep)) {
+//			if (fm > finit) {
+//				// happen in rare cases that it is worse than starting point: revert init value
+//				fm = computeFunction(xinit);
+//				return xinit;
+//			}
+			return rts_old;
+//			return rts;
+		}
+//		fold = fm;
+//		fm = computeFuncDerv(rts,f,df);
+        computeFuncDerv(rts,f,df);
+		if (!isfinite(f) || !isfinite(df)) nrerror("Wrong computeFuncDerv");
+		if (df > 0.0 && fabs(f) < xacc) {
+			d2l = df;
+//			if (fm > finit) {
+//				// happen in rare cases that it is worse than starting point: revert init value
+//				fm = computeFunction(xinit);
+//				return xinit;
+//			}
+			return rts;
+		}
+		// keep the bracket around the sign change of f
+		if (f < 0.0)
+			xl=rts;
+		else
+			xh=rts;
+	}
+	// only reached when maxNRStep < 1: the loop itself returns at j == maxNRStep
+	nrerror("Maximum number of iterations exceeded in minimizeNewton");
+	d2l = 0.0;
+	return 0.0;
+}
+
+/**
+	Convenience overload of minimizeNewton() for callers that do not need the
+	derivative output.
+	@return the x reported by the six-argument overload
+*/
+double Optimization::minimizeNewton(double x1, double xguess, double x2, double xacc, int maxNRStep)
+{
+	double discarded_d2l;	// receives the derivative output, which this overload drops
+	return minimizeNewton(x1, xguess, x2, xacc, discarded_d2l, maxNRStep);
+}
+
+/*****************************************************
+	Multi dimensional optimization with BFGS method
+*****************************************************/
+
+#define ALF 1.0e-4
+#define TOLX 1.0e-7
+static double maxarg1,maxarg2;
+#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ?\
+        (maxarg1) : (maxarg2))
+
+/**
+	Backtracking line search along direction p, starting from xold.
+	All arrays are 1-based (entries 1..n are used). Every trial point is
+	clamped into [lower, upper] by fixBound() before targetFunk() is evaluated.
+	@param n number of dimensions
+	@param xold starting point
+	@param fold function value at xold
+	@param g gradient at xold
+	@param p (IN/OUT) search direction; rescaled in place if longer than stpmax
+	@param x (OUT) new point
+	@param f (OUT) function value at the last trial point
+	@param stpmax maximum length of the step
+	@param check (OUT) 0 if the decrease condition was met, 1 if the step became
+		too small; in the latter case x is reset to xold while *f still holds the
+		value of the rejected trial point
+	@param lower lower bounds
+	@param upper upper bounds
+	NOTE: the FMAX macro stores its arguments in the file-level statics maxarg1/maxarg2.
+*/
+void Optimization::lnsrch(int n, double xold[], double fold, double g[], double p[], double x[],
+                   double *f, double stpmax, int *check, double lower[], double upper[]) {
+	int i;
+	double a,alam,alam2=0,alamin,b,disc,f2=0,fold2=0,rhs1,rhs2,slope,sum,temp,
+	test,tmplam;
+
+	*check=0;
+	/* limit the length of the step to stpmax */
+	for (sum=0.0,i=1;i<=n;i++) sum += p[i]*p[i];
+	sum=sqrt(sum);
+	if (sum > stpmax)
+		for (i=1;i<=n;i++) p[i] *= stpmax/sum;
+	/* directional derivative g.p */
+	for (slope=0.0,i=1;i<=n;i++)
+		slope += g[i]*p[i];
+	test=0.0;
+	for (i=1;i<=n;i++) {
+		temp=fabs(p[i])/FMAX(fabs(xold[i]),1.0);
+		if (temp > test) test=temp;
+	}
+	/* smallest step fraction that is still accepted (test is 0 if p is all zero) */
+	alamin=TOLX/test;
+	alam=1.0;
+	/*
+	int rep = 0;
+	do {
+		for (i=1;i<=n;i++) x[i]=xold[i]+alam*p[i];
+		if (!checkRange(x))
+			alam *= 0.5;
+		else
+			break;
+		rep++;
+	} while (rep < 10);
+	*/
+	bool first_time = true;
+	for (;;) {
+		for (i=1;i<=n;i++) x[i]=xold[i]+alam*p[i];
+		fixBound(x, lower, upper, n);
+		//checkRange(x);
+		*f=targetFunk(x);
+		if (alam < alamin) {
+			/* step too small: give up and restore the starting point */
+			for (i=1;i<=n;i++) x[i]=xold[i];
+			*check=1;
+			return;
+		} else if (*f <= fold+ALF*alam*slope) return;
+		else {
+			/* backtrack: quadratic model on the first pass, cubic model afterwards */
+			if (first_time)
+				tmplam = -slope/(2.0*(*f-fold-slope));
+			else {
+				rhs1 = *f-fold-alam*slope;
+				rhs2=f2-fold2-alam2*slope;
+				a=(rhs1/(alam*alam)-rhs2/(alam2*alam2))/(alam-alam2);
+				b=(-alam2*rhs1/(alam*alam)+alam*rhs2/(alam2*alam2))/(alam-alam2);
+				if (a == 0.0) tmplam = -slope/(2.0*b);
+				else {
+					disc=b*b-3.0*a*slope;
+					if (disc<0.0) //nrerror("Roundoff problem in lnsrch.");
+						tmplam = 0.5 * alam;
+					else if (b <= 0.0) tmplam=(-b+sqrt(disc))/(3.0*a);
+					else tmplam = -slope/(b+sqrt(disc));
+				}
+				if (tmplam>0.5*alam)
+					tmplam=0.5*alam;
+			}
+		}
+		alam2=alam;
+		f2 = *f;
+		fold2=fold;
+		/* the next step is at least a tenth of the current one */
+		alam=FMAX(tmplam,0.1*alam);
+		first_time = false;
+	}
+}
+#undef ALF
+#undef TOLX
+
+
+const int MAX_ITER = 3;
+extern double random_double();
+
+/**
+	Multi-dimensional minimization by dfpmin(), restarted from a random point
+	(at most MAX_ITER runs in total) whenever a run ends within 1e-4 of a bound
+	in a dimension flagged in bound_check. All arrays are 1-based (entries
+	1..ndim are used).
+	@param guess (IN/OUT) starting point; on return the best point found
+	@param ndim number of dimensions
+	@param lower lower bounds
+	@param upper upper bounds
+	@param bound_check per-dimension flag: true if ending at a bound triggers a restart
+	@param gtol gradient tolerance handed to dfpmin()
+	@return the function value at the returned point
+*/
+double Optimization::minimizeMultiDimen(double guess[], int ndim, double lower[], double upper[], bool bound_check[], double gtol) {
+	int i, iter;
+	double fret, minf = 0.0;
+	double *minx = new double [ndim+1];
+	int count = 0;
+	bool restart;
+	do {
+		dfpmin(guess, ndim, lower, upper, gtol, &iter, &fret);
+		// The first run always initializes the best-so-far record, so that
+		// minx holds a real point whatever the magnitude of the function values.
+		if (count == 0 || fret < minf) {
+			minf = fret;
+			for (i = 1; i <= ndim; i++)
+				minx[i] = guess[i];
+		}
+		count++;
+		// restart the search if at the boundary
+		// it's likely to end at a local optimum at the boundary
+		restart = false;
+		for (i = 1; i <= ndim; i++)
+			if (bound_check[i])
+			if (fabs(guess[i]-lower[i]) < 1e-4 || fabs(guess[i]-upper[i]) < 1e-4) {
+				restart = true;
+				break;
+			}
+
+		if (!restart)
+			break;
+
+		if (count == MAX_ITER)
+			break;
+
+		// new starting point, drawn from the lower third of each range
+		for (i = 1; i <= ndim; i++) {
+			guess[i] = random_double() * (upper[i] - lower[i])/3 + lower[i];
+		}
+		cout << "Restart estimation at the boundary... " << std::endl;
+	} while (count < MAX_ITER);
+	// after a restart the last run is not necessarily the best one
+	if (count > 1) {
+		for (i = 1; i <= ndim; i++)
+			guess[i] = minx[i];
+		fret = minf;
+	}
+	delete [] minx;
+
+	return fret;
+}
+
+
+#define ITMAX 200
+static double sqrarg;
+#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
+#define EPS 3.0e-8
+#define TOLX (4*EPS)
+#define STPMX 100.0
+
+#define FREEALL free_vector(xi,1,n);free_vector(pnew,1,n); \
+free_matrix(hessin,1,n,1,n);free_vector(hdg,1,n);free_vector(g,1,n); \
+free_vector(dg,1,n);
+
+
+
+/**
+	Quasi-Newton minimization of targetFunk() with gradients from derivativeFunk()
+	and steps chosen by lnsrch(). All arrays are 1-based (entries 1..n are used).
+	@param p (IN/OUT) starting point; on return the last accepted point
+	@param n number of dimensions
+	@param lower lower bounds, passed on to lnsrch()
+	@param upper upper bounds, passed on to lnsrch()
+	@param gtol convergence tolerance on the scaled gradient
+	@param iter (OUT) number of iterations performed
+	@param fret (OUT) function value written by the last lnsrch() call
+	NOTE: after ITMAX iterations the function returns silently (the nrerror call
+	is commented out), and the check flag set by lnsrch() is not inspected.
+*/
+void Optimization::dfpmin(double p[], int n, double lower[], double upper[], double gtol, int *iter, double *fret) {
+	int check,i,its,j;
+	double den,fac,fad,fae,fp,stpmax,sum=0.0,sumdg,sumxi,temp,test;
+	double *dg,*g,*hdg,**hessin,*pnew,*xi;
+
+	dg=new_vector(1,n);
+	g=new_vector(1,n);
+	hdg=new_vector(1,n);
+	hessin=new_matrix(1,n,1,n);
+	pnew=new_vector(1,n);
+	xi=new_vector(1,n);
+	fp = derivativeFunk(p,g);
+	/* hessin starts as the identity matrix, the first direction is -gradient */
+	for (i=1;i<=n;i++) {
+		for (j=1;j<=n;j++) hessin[i][j]=0.0;
+		hessin[i][i]=1.0;
+		xi[i] = -g[i];
+		sum += p[i]*p[i];
+	}
+	//checkBound(p, xi, lower, upper, n);
+	//checkDirection(p, xi);
+
+	stpmax=STPMX*FMAX(sqrt(sum),(double)n);
+	for (its=1;its<=ITMAX;its++) {
+		*iter=its;
+		lnsrch(n,p,fp,g,xi,pnew,fret,stpmax,&check, lower, upper);
+		fp = *fret;
+		/* xi becomes the step that was actually taken, p the new point */
+		for (i=1;i<=n;i++) {
+			xi[i]=pnew[i]-p[i];
+			p[i]=pnew[i];
+		}
+		/* first stopping rule: relative step size below TOLX */
+		test=0.0;
+		for (i=1;i<=n;i++) {
+			temp=fabs(xi[i])/FMAX(fabs(p[i]),1.0);
+			if (temp > test) test=temp;
+		}
+		if (test < TOLX) {
+			FREEALL
+			return;
+		}
+		/* keep the old gradient in dg, then evaluate the gradient at the new point */
+		for (i=1;i<=n;i++) dg[i]=g[i];
+		derivativeFunk(p,g);
+		/* second stopping rule: scaled gradient below gtol */
+		test=0.0;
+		den=FMAX(fabs(*fret),1.0); // fix bug found by Tung, as also suggested by NR author
+		for (i=1;i<=n;i++) {
+			temp=fabs(g[i])*FMAX(fabs(p[i]),1.0)/den;
+			if (temp > test) test=temp;
+		}
+		if (test < gtol) {
+			FREEALL
+			return;
+		}
+		/* dg = gradient difference, hdg = hessin * dg */
+		for (i=1;i<=n;i++) dg[i]=g[i]-dg[i];
+		for (i=1;i<=n;i++) {
+			hdg[i]=0.0;
+			for (j=1;j<=n;j++) hdg[i] += hessin[i][j]*dg[j];
+		}
+		fac=fae=sumdg=sumxi=0.0;
+		for (i=1;i<=n;i++) {
+			fac += dg[i]*xi[i];
+			fae += dg[i]*hdg[i];
+			sumdg += SQR(dg[i]);
+			sumxi += SQR(xi[i]);
+		}
+		/* hessin is only updated when fac is sufficiently positive relative to sumdg*sumxi */
+		if (fac*fac > EPS*sumdg*sumxi)
+		{
+			fac=1.0/fac;
+			fad=1.0/fae;
+			for (i=1;i<=n;i++) dg[i]=fac*xi[i]-fad*hdg[i];
+			for (i=1;i<=n;i++) {
+				for (j=1;j<=n;j++) {
+					hessin[i][j] += fac*xi[i]*xi[j]
+					                -fad*hdg[i]*hdg[j]+fae*dg[i]*dg[j];
+				}
+			}
+		}
+		/* next search direction: -hessin * gradient */
+		for (i=1;i<=n;i++) {
+			xi[i]=0.0;
+			for (j=1;j<=n;j++) xi[i] -= hessin[i][j]*g[j];
+		}
+		//checkBound(p, xi, lower, upper, n);
+		//checkDirection(p, xi);
+		//if (*iter > 200) cout << "iteration=" << *iter << endl;
+	}
+	// BQM: TODO disable this message!
+	//nrerror("too many iterations in dfpmin");
+	FREEALL
+}
+#undef ITMAX
+#undef SQR
+#undef EPS
+#undef TOLX
+#undef STPMX
+#undef FREEALL
+#undef FMAX
+
+
+/**
+	the approximated derivative function
+	@param x the input vector x
+	@param dfx the derivative at x
+	@return the function value at x
+*/
+double Optimization::derivativeFunk(double x[], double dfx[]) {
+	const int num_dim = getNDim();
+	// step[k] is the perturbation that was really applied to x[k] (1-based)
+	double *step = new double[num_dim+1];
+
+	// pass 1: evaluate the function with one coordinate perturbed at a time;
+	// dfx temporarily holds these function values
+	for (int k = 1; k <= num_dim; k++) {
+		const double saved = x[k];
+		const double wanted = ERROR_X * fabs(saved);
+		x[k] = saved + ((wanted == 0.0) ? ERROR_X : wanted);
+		// the difference is taken from the stored value, so that it is the exact step used
+		step[k] = x[k] - saved;
+		dfx[k] = targetFunk(x);
+		x[k] = saved;
+	}
+
+	// the unperturbed point is evaluated last, so the final targetFunk() call is made at x itself
+	const double f_at_x = targetFunk(x);
+
+	// pass 2: turn the stored function values into forward differences
+	for (int k = 1; k <= num_dim; k++)
+		dfx[k] = (dfx[k] - f_at_x) / step[k];
+
+	delete [] step;
+	return f_at_x;
+}
+
+
+/*#define NRANSI
+#define ITMAX 100
+#define CGOLD 0.3819660
+#define ZEPS 1.0e-10
+#define SHFT(a,b,c,d) (a)=(b);(b)=(c);(c)=(d);
+#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
+
+double Optimization::brent(double ax, double bx, double cx, double tol,
+	double *xmin)
+{
+	int iter;
+	double a,b,d=0.0,etemp,fu,fv,fw,fx,p,q,r,tol1,tol2,u,v,w,x,xm;
+	double e=0.0;
+
+	a=(ax < cx ? ax : cx);
+	b=(ax > cx ? ax : cx);
+	x=w=v=bx;
+	fw=fv=fx=computeFunction(x);
+	for (iter=1;iter<=ITMAX;iter++) {
+		xm=0.5*(a+b);
+		tol2=2.0*(tol1=tol*fabs(x)+ZEPS);
+		if (fabs(x-xm) <= (tol2-0.5*(b-a))) {
+			*xmin=x;
+			return fx;
+		}
+		if (fabs(e) > tol1) {
+			r=(x-w)*(fx-fv);
+			q=(x-v)*(fx-fw);
+			p=(x-v)*q-(x-w)*r;
+			q=2.0*(q-r);
+			if (q > 0.0) p = -p;
+			q=fabs(q);
+			etemp=e;
+			e=d;
+			if (fabs(p) >= fabs(0.5*q*etemp) || p <= q*(a-x) || p >= q*(b-x))
+				d=CGOLD*(e=(x >= xm ? a-x : b-x));
+			else {
+				d=p/q;
+				u=x+d;
+				if (u-a < tol2 || b-u < tol2)
+					d=SIGN(tol1,xm-x);
+			}
+		} else {
+			d=CGOLD*(e=(x >= xm ? a-x : b-x));
+		}
+		u=(fabs(d) >= tol1 ? x+d : x+SIGN(tol1,d));
+		fu=computeFunction(u);
+		if (fu <= fx) {
+			if (u >= x) a=x; else b=x;
+			SHFT(v,w,x,u)
+			SHFT(fv,fw,fx,fu)
+		} else {
+			if (u < x) a=u; else b=u;
+			if (fu <= fw || w == x) {
+				v=w;
+				w=u;
+				fv=fw;
+				fw=fu;
+			} else if (fu <= fv || v == x || v == w) {
+				v=u;
+				fv=fu;
+			}
+		}
+	}
+	nrerror("Too many iterations in brent");
+	*xmin=x;
+	return fx;
+}
+
+#undef SIGN
+#undef ITMAX
+#undef CGOLD
+#undef ZEPS
+#undef SHFT
+#undef NRANSI*/
+
+/*#define JMAX 20
+
+double Optimization::minimizeNewton(double xmin, double xguess, double xmax, double tolerance, double &f)
+{
+	return rtsafe(xmin, xguess, xmax, tolerance, f);
+	//double fe;
+	//return minimizeOneDimen(xmin, rtn, xmax, tolerance, &f, &fe);
+
+	int j;
+	double df,ddf,dx,rtn,rtnold, fstart=0, fnew;
+
+	rtn=xguess;
+	if (rtn < xmin) rtn = xmin;
+	if (rtn > xmax) rtn = xmax;
+	
+
+	for (j=1;j<=JMAX;j++) {
+		f = computeFuncDerv(rtn,df,ddf);
+		if (!isfinite(f)) 
+			return 0;
+		if (j == 1) fstart = f;
+		if (ddf == 0.0) break;
+		dx=(df/fabs(ddf));
+		if (fabs(dx) <= tolerance) break;
+
+		rtnold = rtn; rtn = rtn-dx;
+		if (rtn < xmin) rtn = xmin;
+		if (rtn > xmax) rtn = xmax;
+		dx = rtnold-rtn;
+
+		while (fabs(dx) > tolerance && (fnew = computeFunction(rtn)) > f + tolerance) {
+			dx /= 2;
+			rtn = rtnold - dx;
+		}
+		if (fabs(dx) <= tolerance) { rtn = rtnold; break; }
+	}
+	//if (j > JMAX)
+		//nrerror("Maximum number of iterations exceeded in Newton-Raphson");
+	if (f <= fstart && j <= JMAX && (j > 1 || xguess > xmin+tolerance)) 
+		return rtn;
+	// Newton does not work, turn to other method
+	double fe;
+	return minimizeOneDimen(xmin, xguess, xmax, tolerance, &f, &fe);
+}*/
+
+/*
+double Optimization::minimizeNewton(double xmin, double xguess, double xmax, double tolerance, double &f)
+{
+	int j;
+	double df,ddf,dx,dxold,rtn,temp, fold, fstart;
+	double xmin_orig = xmin, xmax_orig = xmax;
+
+	rtn=xguess;
+	dx=dxold=(xmax-xmin);
+	fstart = fold = f = computeFuncDerv(rtn,df,ddf);
+
+	for (j=1;j<=JMAX;j++) {
+		if (ddf <= 0.0) break;
+		if ((((rtn-xmax)*ddf-df) * ((rtn-xmin)*ddf-df) > 0) || // run out of range
+			(fabs(2.0*df) > fabs(dxold*ddf)) // dx not decreasing fast enough
+			) // f even increase
+		{
+			dxold = dx;
+			dx = 0.5*(xmax-xmin);
+			rtn = xmin+dx;
+			if (xmin == rtn) break;
+		} else {
+			dxold=dx;
+			if (ddf == 0.0)
+				nrerror("2nd derivative is zero");
+			dx=df/ddf;
+			temp=rtn;
+			//if (f > fold) dx /= 2.0;
+			//if (ddf < 0) dx = -dx;
+			rtn -= dx;
+			//if (rtn < xmin) rtn = xmin;
+			//if (rtn > xmax) rtn = xmax;
+			dx = temp - rtn;
+			if (temp == rtn) break;
+		}
+		if (fabs(dx) < tolerance) break;
+		fold = f;
+		f = computeFuncDerv(rtn,df,ddf);
+		//if (f > fold) break; // Does not decrease function, escape
+		if (df < 0.0) 
+			xmin = rtn;
+		else
+			xmax = rtn;
+	}
+	if (j > JMAX)
+	nrerror("Maximum number of iterations exceeded in Newton-Raphson");
+	if (f <= fstart) return rtn;
+	// Newton does not work (find a max instead of min), turn to other method
+	double fe;
+	return minimizeOneDimen(xmin_orig, xguess, xmax_orig, tolerance, &f, &fe);
+	//return 0.0;
+}
+#undef JMAX*/
+
+
+/**
+	Front end of lbfgsb(): fills in the fixed settings, marks every variable as
+	bounded on both sides and runs the optimizer. Arrays are 0-based here.
+	@param n number of variables
+	@param x (IN/OUT) initial values, updated by the optimizer
+	@param l lower bounds
+	@param u upper bounds
+	@param pgtol tolerance on the projected gradient
+	@param maxit maximum number of iterations
+	@return the function value reported by lbfgsb()
+*/
+double Optimization::L_BFGS_B(int n, double* x, double* l, double* u, double pgtol, int maxit) {
+	// number of BFGS updates retained in the "L-BFGS-B" method
+	const int num_updates = 5;
+
+	// Convergence occurs when the reduction in the objective is within this
+	// factor of the machine tolerance.
+	const double factr = 1e+10;
+
+	// Tracing is switched on from verbosity level VB_MED; a report is then
+	// printed every report_every iterations.
+	const int trace = (verbose_mode >= VB_MED) ? 1 : 0;
+	const int report_every = 10;
+
+	// bound type per variable
+	// 0: unbounded; 1: lower bounded; 2: both lower & upper; 3: upper bounded
+	int *bound_type = new int[n];
+	for (int k = 0; k < n; k++)
+		bound_type[k] = 2;
+
+	// outputs of lbfgsb(); only the function value is passed on to the caller
+	double Fmin;
+	int fail;
+	int fncount;
+	int grcount;
+	char msg[100];
+
+	lbfgsb(n, num_updates, x, l, u, bound_type, &Fmin, &fail,
+			factr, pgtol, &fncount, &grcount, maxit, msg, trace, report_every);
+
+	delete[] bound_type;
+
+	return Fmin;
+}
+
+/**
+	Driver loop around setulb(): evaluates optimGradient() whenever setulb()
+	asks for function and gradient ("FG") and stops on convergence, warning,
+	error or when maxit iterations are exceeded. Arrays are 0-based.
+	@param n number of variables
+	@param m number of BFGS updates retained
+	@param x (IN/OUT) variables
+	@param l lower bounds
+	@param u upper bounds
+	@param nbd bound type of every variable
+	@param Fmin (OUT) last function value
+	@param fail (OUT) 0: ok, 1: maxit exceeded, 51: setulb warning, 52: setulb error or unknown task
+	@param factr convergence factor handed to setulb()
+	@param pgtol projected gradient tolerance handed to setulb()
+	@param fncount (OUT) taken from isave[33]
+	@param grcount (OUT) same value as fncount
+	@param maxit maximum number of iterations
+	@param msg (OUT) final task string of setulb(); needs room for 60 characters
+	@param trace tracing level
+	@param nREPORT report every nREPORT iterations when trace is 1; must be > 0
+*/
+void Optimization::lbfgsb(int n, int m, double *x, double *l, double *u, int *nbd,
+		double *Fmin, int *fail,
+		double factr, double pgtol,
+		int *fncount, int *grcount, int maxit, char *msg,
+		int trace, int nREPORT)
+{
+	char task[60];
+	/* f starts at 0 so that *Fmin is defined even if setulb() stops before
+	   the first "FG" request */
+	double f = 0.0, *g, dsave[29], *wa;
+	int tr = -1, iter = 0, *iwa, isave[44], lsave[4];
+
+	/* shut up gcc -Wall in 4.6.x */
+
+	for(int i = 0; i < 4; i++) lsave[i] = 0;
+
+	if(n == 0) { /* not handled in setulb */
+		*fncount = 1;
+		*grcount = 0;
+		*Fmin = optimFunc(n, u);
+		strcpy(msg, "NOTHING TO DO");
+		*fail = 0;
+		return;
+	}
+	if (nREPORT <= 0) {
+		cerr << "REPORT must be > 0 (method = \"L-BFGS-B\")" << endl;
+		exit(1);
+	}
+	switch(trace) {
+	case 2: tr = 0; break;
+	case 3: tr = nREPORT; break;
+	case 4: tr = 99; break;
+	case 5: tr = 100; break;
+	case 6: tr = 101; break;
+	default: tr = -1; break;
+	}
+
+	*fail = 0;
+	g = (double*) malloc ((size_t)n * sizeof(double));
+	/* this needs to be zeroed for snd in mainlb to be zeroed: calloc, not malloc */
+	size_t wa_len = (size_t)2*m*n + (size_t)4*n + (size_t)11*m*m + (size_t)8*m;
+	wa = (double *) calloc(wa_len, sizeof(double));
+	iwa = (int *) malloc((size_t)3*n * sizeof(int));
+	/* nrerror() aborts, so nothing has to be released here */
+	if (!g || !wa || !iwa) nrerror("allocation failure in lbfgsb()");
+	strcpy(task, "START");
+	while(1) {
+		/* Main workhorse setulb() from ../appl/lbfgsb.c : */
+		setulb(n, m, x, l, u, nbd, &f, g, factr, &pgtol, wa, iwa, task,
+				tr, lsave, isave, dsave);
+		/*    Rprintf("in lbfgsb - %s\n", task);*/
+		if (strncmp(task, "FG", 2) == 0) {
+			f = optimGradient(n, x, g);
+			if (!isfinite(f)) {
+				cerr << "L-BFGS-B needs finite values of 'fn'" << endl;
+				exit(1);
+			}
+
+		} else if (strncmp(task, "NEW_X", 5) == 0) {
+			iter++;
+			if(trace == 1 && (iter % nREPORT == 0)) {
+				cout << "iter " << iter << " value " << f << endl;
+			}
+			if (iter > maxit) {
+				*fail = 1;
+				break;
+			}
+		} else if (strncmp(task, "WARN", 4) == 0) {
+			*fail = 51;
+			break;
+		} else if (strncmp(task, "CONV", 4) == 0) {
+			break;
+		} else if (strncmp(task, "ERROR", 5) == 0) {
+			*fail = 52;
+			break;
+		} else { /* some other condition that is not supposed to happen */
+			*fail = 52;
+			break;
+		}
+	}
+	*Fmin = f;
+	*fncount = *grcount = isave[33];
+	if (trace) {
+		cout << "final value " << *Fmin << endl;
+		if (iter < maxit && *fail == 0)
+			cout << "converged" << endl;
+		else
+			cout << "stopped after " << iter << " iterations\n";
+	}
+	strcpy(msg, task);
+	free(g);
+	free(wa);
+	free(iwa);
+}
+
+/**
+	Function value for the L-BFGS-B driver. vars is 0-based, whereas the other
+	optimizers in this file call targetFunk() with 1-based arrays; the pointer is
+	therefore shifted by one so that targetFunk() sees vars[0] as element 1.
+	@param nvar number of variables (not used)
+	@param vars variables, 0-based
+	@return targetFunk() at vars
+*/
+double Optimization::optimFunc(int nvar, double *vars) { 
+    return targetFunk(vars-1);
+}
+
+
+/**
+	Function value and gradient for the L-BFGS-B driver. Both arrays are 0-based
+	and are shifted by one before being handed to derivativeFunk(), which
+	indexes its arguments from 1.
+	@param nvar number of variables (not used)
+	@param x variables, 0-based
+	@param dfx (OUT) gradient, 0-based
+	@return the function value returned by derivativeFunk()
+*/
+double Optimization::optimGradient(int nvar, double *x, double *dfx) {
+    return derivativeFunk(x-1, dfx-1);
+//    const double ERRORX = 1e-5;
+//	double fx = optimFunc(nvar, x);
+//	double h, temp;
+//	for (int dim = 0; dim <= nvar; dim++ ){
+//		temp = x[dim];
+//		h = ERRORX * fabs(temp);
+//		if (h == 0.0) h = ERRORX;
+//		x[dim] = temp + h;
+//		h = x[dim] - temp;
+//		dfx[dim] = (optimFunc(nvar, x) - fx) / h;
+//		x[dim] = temp;
+//	}
+//	return fx;
+}
diff --git a/optimization.h b/optimization.h
new file mode 100644
index 0000000..8fbeb39
--- /dev/null
+++ b/optimization.h
@@ -0,0 +1,195 @@
+//
+// C++ Interface: optimization
+//
+// Description:
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#ifndef OPTIMIZATION_H
+#define OPTIMIZATION_H
+
+/**
+Optimization class, implement some methods like Brent, Newton-Raphson (for 1 variable function), BFGS (for multi-dimensional function)
+
+	@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
+*/
+class Optimization{
+public:
+    Optimization();
+
+
+	/*****************************************************
+		One dimensional optimization with Brent method
+	*****************************************************/
+	/**
+		This function calculate f(value) of the f() function, used by other general optimization method to minimize it.
+		Please always override this function to adapt to likelihood or parsimony score.
+		The default is for function f(x)=x.
+		@param value x-value of the function
+		@return f(value) of function f you want to minimize
+	*/
+	virtual double computeFunction(double value) { return value; }
+
+	/**
+		the brent method to find the value that minimizes the computeFunction().
+		@return the x-value that minimize the function
+		@param xmin lower bound
+		@param xmax upper bound
+		@param xguess first guess
+		@param tolerance tolerance
+		@param fx (OUT) function value at the minimum x found
+		@param ferror (OUT) second derivative of the parabola through the last three points of the Brent search (f2optx of brent_opt())
+	*/
+	double minimizeOneDimen(double xmin, double xguess, double xmax, double tolerance, double *fx, double *ferror);
+
+	double minimizeOneDimenSafeMode(double xmin, double xguess, double xmax, double tolerance, double *fx);
+
+	/*****************************************************
+		One dimensional optimization with Newton Raphson
+		only applicable if 1st and 2nd derivatives are easy to compute
+	*****************************************************/
+
+	/**
+		This function calculates the first derivative f'(value) and the 2nd derivative f''(value);
+		the function value f(value) itself is not returned.
+		Used by the Newton-Raphson method to minimize the function.
+		Please always override this function to adapt to likelihood or parsimony score.
+		The default is for function f(x) = x^2.
+		@param value x-value of the function
+		@param df (OUT) first derivative
+		@param ddf (OUT) second derivative
+	*/
+	virtual void computeFuncDerv(double value, double &df, double &ddf) {
+		df = 2.0*value; ddf = 2.0;
+//		return value*value+1.0;
+	}
+
+	/**
+		Newton-Raphson method to minimize computeFuncDerv()
+		@return the x-value that minimize the function
+		@param xmin lower bound
+		@param xmax upper bound
+		@param xguess first guess
+		@param tolerance tolerance of x-value to stop the iterations
+		@param d2l (OUT, second overload only) last second derivative obtained from computeFuncDerv()
+		@param maxNRStep max number of NR steps
+	*/
+	double minimizeNewton(double xmin, double xguess, double xmax, double tolerance, int maxNRStep = 100);
+
+	double minimizeNewton(double xmin, double xguess, double xmax, double tolerance, double &d2l, int maxNRStep = 100);
+
+	double minimizeNewtonSafeMode(double xmin, double xguess, double xmax, double tolerance, double &f);
+
+
+//	double rtsafe(double x1, double xguess, double x2, double xacc, double &f);
+
+	/*****************************************************
+		Multi dimensional optimization with BFGS method
+	*****************************************************/
+
+	/**
+		return the number of dimensions
+	*/
+	virtual int getNDim() { return 0; }
+
+
+	/**
+		the target function which needs to be optimized
+		@param x the input vector x
+		@return the function value at x
+	*/
+	virtual double targetFunk(double x[]) { return 0.0; }
+
+	/**
+		the approximated derivative function
+		@param x the input vector x
+		@param dfx the derivative at x
+		@return the function value at x
+	*/
+	virtual double derivativeFunk(double x[], double dfx[]);
+
+	/**
+		multi dimensional optimization by BFGS method
+		@param guess the initial starting point
+		@param ndim number of dimension
+		@param gtol tolerance
+		@param lower the lower bound vector
+		@param upper the upper bound vector
+		@param bound_check bound checking vector
+		@return the minimum function value obtained
+	*/
+	double minimizeMultiDimen(double guess[], int ndim, double lower[], double upper[],
+		bool bound_check[], double gtol);
+
+	/*****************************************************
+		NEW 2015-08-19: Multi dimensional optimization with L-BFGS-B method
+	*****************************************************/
+
+    /**
+     Function to access the L-BFGS-B function, taken from HAL_HAS software package
+     
+     1. int nvar : The number of the variables
+     2. double* vars : initial values of the variables
+     3. double* lower : lower bounds of the variables
+     4. double* upper : upper bounds of the variables
+     5. double pgtol: gradient tolerance
+     6. int maxit : max # of iterations
+     @return minimized function value
+     After the function is invoked, the values of x will be updated
+    */
+    double L_BFGS_B(int nvar, double* vars, double* lower, double* upper, double pgtol = 1e-5, int maxit = 1000);
+
+    /** internal function called by L_BFGS_B
+        should return function value 
+        @param nvar number of variables
+        @param vars variables
+    */
+    virtual double optimFunc(int nvar, double *vars);
+    
+    /** internal function called by L_BFGS_B
+        should return gradient value
+        @param nvar number of variables
+        @param vars variables
+        @param gradient (OUT) function gradient
+        @return function value
+    */
+    virtual double optimGradient(int nvar, double *vars, double *gradient);
+    
+
+    ~Optimization();
+
+	/**
+		original numerical recipes method
+		NOTE: the only implementation in optimization.cpp is commented out
+	*/
+	double brent(double ax, double bx, double cx, double tol, double *xmin);
+
+private:
+
+
+	double brent_opt (double ax, double bx, double cx, double tol,
+		double *foptx, double *f2optx, double fax, double fbx, double fcx);
+
+	double dbrent(double ax, double bx, double cx, double tol, double *xmin);
+
+	void dfpmin(double p[], int n, double lower[], double upper[], double gtol, int *iter, double *fret);
+
+	void lnsrch(int n, double xold[], double fold, double g[], double p[], double x[],
+		double *f, double stpmax, int *check, double lower[], double upper[]);
+
+    void lbfgsb(int n, int m, double *x, double *l, double *u, int *nbd,
+		double *Fmin, int *fail,
+		double factr, double pgtol,
+		int *fncount, int *grcount, int maxit, char *msg,
+		int trace, int nREPORT);
+    
+};
+
+
+void nrerror(const char *error_text);
+
+
+#endif
diff --git a/parsmultistate.cpp b/parsmultistate.cpp
new file mode 100644
index 0000000..5cf4c02
--- /dev/null
+++ b/parsmultistate.cpp
@@ -0,0 +1,36 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include "phylotree.h"
+#include "tinatree.h"
+#include "parsmultistate.h"
+#include "alignment.h"
+
+/**
+	Load an alignment and a user tree, attach the alignment to the tree,
+	draw the tree on stdout and print two parsimony scores for it.
+	@param params program parameters; reads aln_file, sequence_type, intype,
+		user_file and is_rooted
+*/
+void doParsMultiState(Params &params) {
+	// NOTE(review): looks like a leftover debugging trace - confirm before removing
+	cout << "Here\n";
+    Alignment alignment(params.aln_file, params.sequence_type, params.intype);
+    TinaTree tree;
+    tree.readTree(params.user_file, params.is_rooted);
+	tree.setAlignment(&alignment);
+	tree.drawTree(cout);
+	// the score is printed twice, obtained from two different TinaTree methods
+	cout << "Parsimony score is: " << tree.computeParsimonyScore() << endl;
+	cout << "Parsimony score ver2 is: " << tree.computeParsimony() << endl;
+	//tree.printParsimonyStates();
+}
diff --git a/parsmultistate.h b/parsmultistate.h
new file mode 100644
index 0000000..c1c3eb1
--- /dev/null
+++ b/parsmultistate.h
@@ -0,0 +1,28 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef PARSMULTISTATE_H
+#define PARSMULTISTATE_H
+
+#include "tools.h"
+
+void doParsMultiState(Params &params);
+
+#endif
diff --git a/pattern.cpp b/pattern.cpp
new file mode 100644
index 0000000..faef093
--- /dev/null
+++ b/pattern.cpp
@@ -0,0 +1,59 @@
+//
+// C++ Implementation: pattern
+//
+// Description:
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#include "pattern.h"
+#include "alignment.h"
+
+/** Create an empty pattern: zero frequency, no flags set, const_char = 255. */
+Pattern::Pattern()
+        : string(),
+          frequency(0),
+          is_const(false),
+          is_informative(false),
+          const_char(255),
+          num_chars(0)
+{
+}
+
+/** Copy the character string and every statistic field of another pattern. */
+Pattern::Pattern(const Pattern &pat)
+        : string(pat),
+          frequency(pat.frequency),
+          is_const(pat.is_const),
+          is_informative(pat.is_informative),
+          const_char(pat.const_char),
+          num_chars(pat.num_chars)
+{
+}
+
+/** Destructor; the body is intentionally empty. */
+Pattern::~Pattern()
+{
+}
+
+/**
+    Count the characters of this pattern whose code is >= num_states.
+    @param num_states number of states of the model
+    @return how many characters have a code of num_states or above
+*/
+int Pattern::computeAmbiguousChar(int num_states) {
+    int ambiguous = 0;
+    for (size_type pos = 0; pos < size(); ++pos) {
+        if ((*this)[pos] >= num_states)
+            ++ambiguous;
+    }
+    return ambiguous;
+}
+
+/**
+    Count the characters of this pattern that equal STATE_UNKNOWN.
+    @param num_states number of states of the model (not used by the count)
+    @param STATE_UNKNOWN character code to look for
+    @return how many characters equal STATE_UNKNOWN
+*/
+int Pattern::computeGapChar(int num_states, int STATE_UNKNOWN) {
+    int gaps = 0;
+    for (size_type pos = 0; pos < size(); ++pos) {
+        if ((*this)[pos] == STATE_UNKNOWN)
+            ++gaps;
+    }
+    return gaps;
+}
+
+//Pattern &Pattern::operator= (Pattern pat) {
+//    assign(pat);
+//    frequency = pat.frequency;
+//    is_const = pat.is_const;
+//    const_char = pat.const_char;
+//    return *this;
+//}
diff --git a/pattern.h b/pattern.h
new file mode 100644
index 0000000..c854259
--- /dev/null
+++ b/pattern.h
@@ -0,0 +1,74 @@
+//
+// C++ Interface: pattern
+//
+// Description: 
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#ifndef PATTERN_H
+#define PATTERN_H
+
+#include <iostream>
+#include <string>
+
+using namespace std;
+
+/**
+	Site-patterns in a multiple sequence alignment
+	@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
+*/
+class Pattern : public string
+{
+public:
+	/** 
+		default constructor: empty pattern with frequency 0, both flags
+		false, const_char = 255 and num_chars = 0
+	*/
+    Pattern();
+
+	/**
+		copy constructor: copies the characters and all statistic fields
+		@param pat pattern to copy from
+	*/
+    Pattern(const Pattern &pat);
+
+    /**
+		@param num_states number of states of the model
+		@return the number of characters whose code is >= num_states
+			(ambiguous characters incl. gaps)
+	*/
+	int computeAmbiguousChar(int num_states);
+
+	/**
+		@param num_states number of states of the model (not used by the count)
+		@param STATE_UNKNOWN character code that is counted as a gap
+		@return the number of characters equal to STATE_UNKNOWN
+	*/
+	int computeGapChar(int num_states, int STATE_UNKNOWN);
+
+//    Pattern &operator= (Pattern pat);
+
+	/** 
+		destructor
+	*/
+    virtual ~Pattern();
+
+	/**
+		number of times the pattern appears
+	*/
+	int frequency;
+
+	/**
+		true if this is a constant pattern
+		2015-03-04: is_const will also be true for pattern like "AA-A--AAA"
+	*/
+	bool is_const;
+    
+    /** true if pattern is informative, false otherwise */
+    bool is_informative;
+
+	/**
+		2015-03-04: if is_const is true, this will store the const character for the pattern;
+		the default constructor sets it to 255
+	*/
+	char const_char;
+
+    /** number of different character states */
+    int num_chars;
+};
+
+#endif
diff --git a/pda.cpp b/pda.cpp
new file mode 100644
index 0000000..c7e9b81
--- /dev/null
+++ b/pda.cpp
@@ -0,0 +1,2400 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <iqtree_config.h>
+
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+//#include <winsock2.h>
+//#include <windows.h>
+//extern __declspec(dllexport) int gethostname(char *name, int namelen);
+#else
+#include <sys/resource.h>
+#endif
+
+//#include "Eigen/Core"
+#include <stdio.h>
+#include "phylotree.h"
+#include <signal.h>
+#include <cmath>
+#include <cstdio>
+#include <streambuf>
+#include <iostream>
+#include <sstream>
+#include <cstdlib>
+#include <errno.h>
+#include "greedy.h"
+#include "pruning.h"
+//#include "naivegreedy.h"
+#include "splitgraph.h"
+#include "circularnetwork.h"
+#include "mtreeset.h"
+#include "mexttree.h"
+#include "ncl/ncl.h"
+#include "msetsblock.h"
+#include "myreader.h"
+#include "phyloanalysis.h"
+#include "matree.h"
+#include "ngs.h"
+#include "parsmultistate.h"
+#include "gss.h"
+#include "maalignment.h" //added by MA
+#include "ncbitree.h"
+#include "ecopd.h"
+#include "upperbounds.h"
+#include "ecopdmtreeset.h"
+#include "gurobiwrapper.h"
+#include "timeutil.h"
+//#include <unistd.h>
+#include <stdlib.h>
+#include "vectorclass/vectorclass.h"
+
+#ifdef _OPENMP
+	#include <omp.h>
+#endif
+
+using namespace std;
+
+
+
+/**
+	Generate random trees, a random circular split network or random taxa
+	sets, depending on params.tree_gen, and write them to file.
+	- YULE_HARDING / CATERPILLAR / BALANCED / UNIFORM / STAR_TREE:
+	  params.repeated_time trees are written to params.user_file; when
+	  params.second_tree is given that tree is read first.
+	- CIRCULAR_SPLIT_GRAPH: a circular split network is generated.
+	- TAXA_SET: taxa sets are written to params.pdtaxa_file.
+	When params.num_zero_len is set, a collapsed copy of every tree is also
+	written to "<user_file>.collapsed".
+	@param params program parameters
+*/
+void generateRandomTree(Params &params)
+{
+	// fewer than 3 taxa is only accepted when an alignment file is supplied
+	if (params.sub_size < 3 && !params.aln_file) {
+		outError(ERR_FEW_TAXA);
+	}
+
+	if (!params.user_file) {
+		outError("Please specify an output tree file name");
+	}
+	////cout << "Random number seed: " << params.ran_seed << endl << endl;
+
+	SplitGraph sg;
+
+	try {
+
+		if (params.tree_gen == YULE_HARDING || params.tree_gen == CATERPILLAR ||
+			params.tree_gen == BALANCED || params.tree_gen == UNIFORM || params.tree_gen == STAR_TREE) {
+			if (!overwriteFile(params.user_file)) return;
+			// NOTE(review): neither stream below enables exceptions nor checks
+			// is_open(), so an open failure is not reported here - confirm
+			ofstream out;
+			out.open(params.user_file);
+			MTree itree;
+
+			if (params.second_tree) {
+				cout << "Generating random branch lengths on tree " << params.second_tree << " ..." << endl;
+				itree.readTree(params.second_tree, params.is_rooted);
+			} else
+			switch (params.tree_gen) {
+			case YULE_HARDING:
+				cout << "Generating random Yule-Harding tree..." << endl;
+				break;
+			case UNIFORM:
+				cout << "Generating random uniform tree..." << endl;
+				break;
+			case CATERPILLAR:
+				cout << "Generating random caterpillar tree..." << endl;
+				break;
+			case BALANCED:
+				cout << "Generating random balanced tree..." << endl;
+				break;
+			case STAR_TREE:
+				cout << "Generating star tree with random external branch lengths..." << endl;
+				break;
+			default: break;
+			}
+			// second output stream, opened only when collapsed trees are requested
+			ofstream out2;
+			if (params.num_zero_len) {
+				cout << "Setting " << params.num_zero_len << " internal branches to zero length..." << endl;
+				string str = params.user_file;
+				str += ".collapsed";
+				out2.open(str.c_str());
+			}
+			for (int i = 0; i < params.repeated_time; i++) {
+				MExtTree mtree;
+				// itree.root is presumably non-NULL only after second_tree was read - verify
+				if (itree.root) {
+					mtree.copyTree(&itree);
+					mtree.generateRandomBranchLengths(params);
+				} else {
+					mtree.generateRandomTree(params.tree_gen, params);
+				}
+				if (params.num_zero_len) {
+					mtree.setZeroInternalBranches(params.num_zero_len);
+					// the collapsed version is printed from a copy; mtree itself
+					// is printed below with its zero-length branches kept
+					MExtTree collapsed_tree;
+					collapsed_tree.copyTree(&mtree);
+					collapsed_tree.collapseZeroBranches();
+					collapsed_tree.printTree(out2);
+					out2 << endl;
+				}
+				mtree.printTree(out);
+				out << endl;
+			}
+			out.close();
+			cout << params.repeated_time << " tree(s) printed to " << params.user_file << endl;
+			if (params.num_zero_len) {
+				out2.close();
+				cout << params.repeated_time << " collapsed tree(s) printed to " << params.user_file << ".collapsed" << endl;
+			}
+		}
+		// Generate random trees if optioned
+		else if (params.tree_gen == CIRCULAR_SPLIT_GRAPH) {
+			cout << "Generating random circular split network..." << endl;
+			if (!overwriteFile(params.user_file)) return;
+			sg.generateCircular(params);
+		} else if (params.tree_gen == TAXA_SET) {
+			sg.init(params);
+			cout << "Generating random taxa set of size " << params.sub_size <<
+				" overlap " << params.overlap << " with " << params.repeated_time << " times..." << endl;
+			if (!overwriteFile(params.pdtaxa_file)) return;
+			sg.generateTaxaSet(params.pdtaxa_file, params.sub_size, params.overlap, params.repeated_time);
+		}
+	} catch (bad_alloc) {
+		outError(ERR_NO_MEMORY);
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, params.user_file);
+	}
+
+	// calculate the distance
+	// (only the circular split network case is handled; the tree case is commented out)
+	if (params.run_mode == CALC_DIST) {
+		if (params.tree_gen == CIRCULAR_SPLIT_GRAPH) {
+			cout << "Calculating distance matrix..." << endl;
+			sg.calcDistance(params.dist_file);
+			cout << "Distances printed to " << params.dist_file << endl;
+		}// else {
+			//mtree.calcDist(params.dist_file);
+		//}
+	}
+
+}
+
+/**
+	Write a horizontal rule, surrounded by line breaks, to a stream.
+	@param out stream to write to
+	@param type 0 for a rule of '=' characters, 1 for a shorter rule of '-'
+		characters; any other value writes nothing
+*/
+inline void separator(ostream &out, int type = 0) {
+	if (type == 0) {
+		out << endl << "==============================================================================" << endl;
+		return;
+	}
+	if (type == 1) {
+		out << endl << "-----------------------------------------------------------" << endl;
+	}
+}
+
+
+/**
+	Print the program banner: program name, version, target platform,
+	pointer width, build date and the copyright line.
+	The program name and copyright text depend on whether IQ_TREE is defined.
+	@param out stream to write to
+*/
+void printCopyright(ostream &out) {
+#ifdef IQ_TREE
+	out << "IQ-TREE";
+	#ifdef _OPENMP
+	out << " multicore";
+	#endif
+	out << " version ";
+#else
+	out << "PDA - Phylogenetic Diversity Analyzer version ";
+#endif
+	out << iqtree_VERSION_MAJOR << "." << iqtree_VERSION_MINOR << "." << iqtree_VERSION_PATCH;
+
+#if defined _WIN32 || defined WIN32
+	out << " for Windows";
+#elif defined __APPLE__ || defined __MACH__
+	out << " for Mac OS X";
+#elif defined __linux__
+	out << " for Linux";
+#elif defined __unix__ || defined __unix
+	out << " for Unix";
+#else
+	// this branch used to lack its terminating ';' and did not compile
+	out << " for unknown platform";
+#endif
+
+	// pointer size in bits tells a 32-bit from a 64-bit build
+	out << " " << 8*sizeof(void*) << "-bit" << " built " << __DATE__;
+#if defined DEBUG
+	out << " - debug mode";
+#endif
+
+#ifdef IQ_TREE
+	out << endl << "Copyright (c) 2011-2015 Nguyen Lam Tung, Olga Chernomor, Arndt von Haeseler and Bui Quang Minh." << endl << endl;
+#else
+	out << endl << "Copyright (c) 2006-2014 Olga Chernomor, Arndt von Haeseler and Bui Quang Minh." << endl << endl;
+#endif
+}
+
+/**
+	Print the human-readable name of a search algorithm.
+	Values without a name are reported through outError(ERR_INTERNAL).
+	@param out stream to write to
+	@param run_mode algorithm to name
+*/
+void printRunMode(ostream &out, RunMode run_mode) {
+	const char *label = NULL;
+	switch (run_mode) {
+		case DETECTED:
+			label = "Detected";
+			break;
+		case GREEDY:
+			label = "Greedy";
+			break;
+		case PRUNING:
+			label = "Pruning";
+			break;
+		case BOTH_ALG:
+			label = "Greedy and Pruning";
+			break;
+		case EXHAUSTIVE:
+			label = "Exhaustive";
+			break;
+		case DYNAMIC_PROGRAMMING:
+			label = "Dynamic Programming";
+			break;
+		case LINEAR_PROGRAMMING:
+			label = "Integer Linear Programming";
+			break;
+		default:
+			outError(ERR_INTERNAL);
+	}
+	if (label != NULL)
+		out << label;
+}
+
+/**
+	Write the header of a result report: program banner, input files,
+	type of diversity measure, search settings and type of analysis.
+	@param out stream to write to
+	@param params program parameters
+	@param budget_constraint selects the "Budget constraint" wording over
+		"Subset size k" in the type-of-analysis line
+	@param analysis_type IN_NEWICK selects the tree / phylogenetic diversity
+		wording, any other value the split network / split diversity wording
+*/
+void summarizeHeader(ostream &out, Params &params, bool budget_constraint, InputType analysis_type) {
+	printCopyright(out);
+	out << "Input tree/split network file name: " << params.user_file << endl;
+	if(params.eco_dag_file)
+		out << "Input food web file name: "<<params.eco_dag_file<<endl;
+ 	out << "Input file format: " << ((params.intype == IN_NEWICK) ? "Newick" : ( (params.intype == IN_NEXUS) ? "Nexus" : "Unknown" )) << endl;
+	if (params.initial_file != NULL)
+		out << "Initial taxa file: " << params.initial_file << endl;
+	if (params.param_file != NULL)
+		out << "Parameter file: " << params.param_file << endl;
+	out << endl;
+	// reported as rooted when a root name is given or the is_rooted flag is set
+	out << "Type of measure: " << ((params.root != NULL || params.is_rooted) ? "Rooted": "Unrooted") <<
+			(analysis_type== IN_NEWICK ? " phylogenetic diversity (PD)" : " split diversity (SD)");
+	if (params.root != NULL) out << " at " << params.root;
+	out << endl;
+	// the search settings are omitted for distance computation and user-defined sets
+	if (params.run_mode != CALC_DIST && params.run_mode != PD_USER_SET) {
+		out << "Search objective: " << ((params.find_pd_min) ? "Minimum" : "Maximum") << endl;
+		out << "Search algorithm: ";
+		printRunMode(out, params.run_mode);
+		if (params.run_mode == DETECTED) {
+			// also name the algorithm stored in params.detected_mode
+			out << " -> ";
+			printRunMode(out, params.detected_mode);
+		}
+		out << endl;
+		out << "Search option: " << ((params.find_all) ? "Multiple optimal sets" : "Single optimal set") << endl;
+	}
+	out << endl;
+	out << "Type of analysis: ";
+	switch (params.run_mode) {
+		case PD_USER_SET: out << "PD/SD of user sets";
+			if (params.pdtaxa_file) out << " (" << params.pdtaxa_file << ")"; break;
+		case CALC_DIST: out << "Distance matrix computation"; break;
+		default:
+			out << ((budget_constraint) ? "Budget constraint " : "Subset size k ");
+			if (params.intype == IN_NEWICK)
+				out << ((analysis_type == IN_NEWICK) ? "on tree" : "on tree -> split network");
+			else
+				out << "on split network";
+	}
+	out << endl;
+	//out << "Random number seed: " << params.ran_seed << endl;
+}
+
+/**
+	Write the footer of a result report: a separator, the run time stored in
+	params.run_time and the current date and time.
+	@param out stream to write to
+	@param params program parameters; run_time is read
+*/
+void summarizeFooter(ostream &out, Params &params) {
+	separator(out);
+
+	time_t now = time(NULL);
+	const char *finished = ctime(&now);
+
+	out << "Time used: " << params.run_time  << " seconds." << endl
+		<< "Finished time: " << finished << endl;
+}
+
+
+/**
+	@param setName list of names
+	@return length of the longest name, 0 for an empty list
+*/
+int getMaxNameLen(vector<string> &setName) {
+	string::size_type longest = 0;
+	for (size_t idx = 0; idx < setName.size(); ++idx) {
+		const string::size_type cur = setName[idx].length();
+		if (cur > longest)
+			longest = cur;
+	}
+	return (int) longest;
+}
+
+/**
+	Print a table with one row per user-defined taxa set: its name, its PD
+	score and, when requested in params, exclusive PD, PD endemism and PD
+	complementarity. A light separator is written after the table.
+	@param out stream to write to
+	@param params program parameters selecting the optional columns
+	@param pd_more per-set names and scores
+*/
+void printPDUser(ostream &out, Params &params, PDRelatedMeasures &pd_more) {
+	out << "List of user-defined sets of taxa with PD score computed" << endl << endl;
+
+	// the name column is as wide as the longest name plus two characters
+	const int name_width = getMaxNameLen(pd_more.setName)+2;
+
+	// heading row
+	out.width(name_width);
+	out << "Name" << "     PD";
+	if (params.exclusive_pd)
+		out << "   excl.-PD";
+	if (params.endemic_pd)
+		out << "   PD-Endem.";
+	if (params.complement_area)
+		out << "   PD-Compl. given area " << params.complement_area;
+	out << endl;
+
+	// one row per set
+	for (size_t row = 0; row < pd_more.setName.size(); ++row) {
+		out.width(name_width);
+		out << pd_more.setName[row] << " ";
+		out.width(7);
+		out << pd_more.PDScore[row] << "  ";
+		if (params.exclusive_pd) {
+			out.width(7);
+			out << pd_more.exclusivePD[row] << "  ";
+		}
+		if (params.endemic_pd) {
+			out.width(7);
+			out << pd_more.PDEndemism[row] << "  ";
+		}
+		if (params.complement_area) {
+			out.width(8);
+			out << pd_more.PDComplementarity[row];
+		}
+		out << endl;
+	}
+
+	separator(out, 1);
+}
+
+/**
+	Write the result report of a PD analysis on a tree to params.out_file,
+	or to "<out_prefix>.pda" when no output file is given.
+	Side effect: every entry of taxa_set and taxa_set itself are cleared.
+	@param params program parameters
+	@param tree tree the analysis was run on (leafNum and nodeNum are reported)
+	@param taxa_set computed taxa sets, one per subset size or per user set
+	@param pd_more extra measures, printed only in PD_USER_SET mode
+*/
+void summarizeTree(Params &params, PDTree &tree, vector<PDTaxaSet> &taxa_set,
+	PDRelatedMeasures &pd_more) {
+	string filename;
+	if (params.out_file == NULL) {
+		filename = params.out_prefix;
+		filename += ".pda";
+	} else
+		filename = params.out_file;
+
+	try {
+		ofstream out;
+		// make open/write failures throw so that the handler below reports them
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename.c_str());
+
+		summarizeHeader(out, params, false, IN_NEWICK);
+		// is_rooted is subtracted from the counts, presumably to leave out an
+		// artificial root taxon - verify
+		out << "Tree size: " << tree.leafNum-params.is_rooted << " taxa, " <<
+			tree.nodeNum-1-params.is_rooted << " branches" << endl;
+		separator(out);
+
+		vector<PDTaxaSet>::iterator tid;
+
+		if (params.run_mode == PD_USER_SET) {
+			printPDUser(out, params, pd_more);
+		}
+		else if (taxa_set.size() > 1)
+			out << "Optimal PD-sets with k = " << params.min_size-params.is_rooted <<
+			" to " << params.sub_size-params.is_rooted << endl << endl;
+
+
+		// subsize labels each set: the set number in PD_USER_SET mode, k otherwise
+		int subsize = params.min_size-params.is_rooted;
+		if (params.run_mode == PD_USER_SET) subsize = 1;
+		for (tid = taxa_set.begin(); tid != taxa_set.end(); tid++, subsize++) {
+			if (tid != taxa_set.begin())
+				separator(out, 1);
+			if (params.run_mode == PD_USER_SET) {
+				out << "Set " << subsize << " has PD score of " << tid->score << endl;
+			}
+			else {
+				out << "For k = " << subsize << " the optimal PD score is " << (*tid).score << endl;
+				out << "The optimal PD set has " << subsize << " taxa:" << endl;
+			}
+			// list the taxa, skipping the node named ROOT_NAME
+			for (NodeVector::iterator it = (*tid).begin(); it != (*tid).end(); it++)
+				if ((*it)->name != ROOT_NAME){
+					out << (*it)->name << endl;
+				}
+			if (!tid->tree_str.empty()) {
+				out << endl << "Corresponding sub-tree: " << endl;
+				out << tid->tree_str << endl;
+			}
+			tid->clear();
+		}
+		taxa_set.clear();
+
+		summarizeFooter(out, params);
+		out.close();
+		cout << endl << "Results are summarized in " << filename << endl << endl;
+
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+}
+
+
+/**
+	Write the PD score and the taxa of every computed set to output files.
+	The scores file is "<out_prefix>.score". When params.nr_output == 1 all
+	taxa lists go to "<out_prefix>.pdtaxa". When params.nr_output > 10 every
+	set gets its own files "<out_prefix>.<k>.pdtree" (or ".greedy" /
+	".pruning" in BOTH_ALG mode) and "<out_prefix>.<k>.pdtaxa".
+	@param params program parameters
+	@param taxa_set computed taxa sets, in order of increasing subset size
+	@param cur_mode algorithm that produced the sets; selects the file
+		extension in BOTH_ALG mode
+*/
+void printTaxaSet(Params &params, vector<PDTaxaSet> &taxa_set, RunMode cur_mode) {
+	int subsize = params.min_size-params.is_rooted;
+	ofstream out;
+	ofstream scoreout;
+	string filename;
+	filename = params.out_prefix;
+	filename += ".score";
+	scoreout.open(filename.c_str());
+	if (!scoreout.is_open())
+		outError(ERR_WRITE_OUTPUT, filename);
+	cout << "PD scores printed to " << filename << endl;
+
+	if (params.nr_output == 1) {
+		filename = params.out_prefix;
+		filename += ".pdtaxa";
+		out.open(filename.c_str());
+		if (!out.is_open())
+			outError(ERR_WRITE_OUTPUT, filename);
+	}
+	// NOTE(review): for nr_output between 2 and 10 the else-branch below writes
+	// to 'out' although it was never opened - confirm the intended thresholds
+	for (vector<PDTaxaSet>::iterator tid = taxa_set.begin(); tid != taxa_set.end(); tid++, subsize++) {
+		if (params.nr_output > 10) {
+			// Build "<out_prefix>.<k>" with k as decimal digits. Appending the
+			// int straight to a std::string would add one raw character
+			// whose code is k instead.
+			ostringstream per_size_prefix;
+			per_size_prefix << params.out_prefix << "." << subsize;
+
+			filename = per_size_prefix.str();
+			if (params.run_mode == BOTH_ALG) {
+				if (cur_mode == GREEDY)
+					filename += ".greedy";
+				else
+					filename += ".pruning";
+			} else {
+				filename += ".pdtree";
+			}
+			(*tid).printTree((char*)filename.c_str());
+
+			filename = per_size_prefix.str();
+			filename += ".pdtaxa";
+			(*tid).printTaxa((char*)filename.c_str());
+		} else {
+			out << subsize << " " << (*tid).score << endl;
+			scoreout << subsize << " " << (*tid).score << endl;
+			(*tid).printTaxa(out);
+		}
+	}
+
+	if (params.nr_output == 1) {
+		out.close();
+		cout << "All taxa list(s) printed to " << filename << endl;
+	}
+
+	scoreout.close();
+}
+
+
+/**
+	run PD algorithm on trees.
+	Depending on params.run_mode this
+	- CALC_DIST: writes the distance matrix of the user tree and returns;
+	- PD_USER_SET: scores the user-defined taxa sets, writes the report and returns;
+	- otherwise: runs the greedy and/or the pruning algorithm and writes a
+	  report for each algorithm that was run.
+	params.run_time and params.detected_mode are updated along the way.
+	@param params program parameters
+*/
+void runPDTree(Params &params)
+{
+
+	if (params.run_mode == CALC_DIST) {
+		bool is_rooted = false;
+		MExtTree tree(params.user_file, is_rooted);
+		cout << "Tree contains " << tree.leafNum << " taxa." << endl;
+		cout << "Calculating distance matrix..." << endl;
+		tree.calcDist(params.dist_file);
+		cout << "Distances printed to " << params.dist_file << endl;
+		return;
+	}
+
+	double t_begin, t_end;
+	//char filename[300];
+	//int idx;
+
+	vector<PDTaxaSet> taxa_set;
+
+	if (params.run_mode == PD_USER_SET) {
+		// compute score of user-defined sets
+		t_begin = getCPUTime();
+		cout << "Computing PD score for user-defined set of taxa..." << endl;
+		PDTree tree(params);
+		PDRelatedMeasures pd_more;
+		tree.computePD(params, taxa_set, pd_more);
+
+		if (params.endemic_pd)
+			tree.calcPDEndemism(taxa_set, pd_more.PDEndemism);
+		if (params.complement_area != NULL)
+			tree.calcPDComplementarity(taxa_set, params.complement_area, pd_more.PDComplementarity);
+
+		t_end = getCPUTime();
+		params.run_time = (t_end-t_begin);
+		summarizeTree(params, tree, taxa_set, pd_more);
+		return;
+	}
+
+
+	/*********************************************
+		run greedy algorithm
+	*********************************************/
+
+	if (params.sub_size < 2) {
+		outError(ERR_NO_K);
+	}
+
+	// greedy is the default choice unless pruning was requested explicitly
+	bool detected_greedy = (params.run_mode != PRUNING);
+
+	Greedy test_greedy;
+
+	test_greedy.init(params);
+
+	if (params.root == NULL && !params.is_rooted)
+		cout << endl << "Running PD algorithm on UNROOTED tree..." << endl;
+	else
+		cout << endl << "Running PD algorithm on ROOTED tree..." << endl;
+
+	if (verbose_mode >= VB_DEBUG)
+		test_greedy.drawTree(cout, WT_INT_NODE + WT_BR_SCALE + WT_BR_LEN);
+
+	if (params.run_mode == GREEDY || params.run_mode == BOTH_ALG ||
+		(params.run_mode == DETECTED)) {
+
+		// automatic detection falls back to pruning when the requested set
+		// covers at least 70% of the leaves and min_size is below 2
+		if (params.run_mode == DETECTED && params.sub_size >= test_greedy.leafNum * 7 / 10
+			&& params.min_size < 2)
+			detected_greedy = false;
+
+		if (detected_greedy) {
+			params.detected_mode = GREEDY;
+			t_begin=getCPUTime();
+			cout << endl << "Greedy Algorithm..." << endl;
+
+			taxa_set.clear();
+			test_greedy.run(params, taxa_set);
+
+			t_end=getCPUTime();
+			params.run_time = (t_end-t_begin);
+			cout << "Time used: " << params.run_time << " seconds." << endl;
+			// NOTE(review): taxa_set[0] assumes run() produced at least one set - confirm
+			if (params.min_size == params.sub_size)
+				cout << "Resulting tree length = " << taxa_set[0].score << endl;
+
+			if (params.nr_output > 0)
+				printTaxaSet(params, taxa_set, GREEDY);
+
+			PDRelatedMeasures pd_more;
+
+			summarizeTree(params, test_greedy, taxa_set, pd_more);
+		}
+	}
+
+	/*********************************************
+		run pruning algorithm
+	*********************************************/
+	if (params.run_mode == PRUNING || params.run_mode == BOTH_ALG ||
+		(params.run_mode == DETECTED)) {
+
+		Pruning test_pruning;
+
+		if (params.run_mode == PRUNING || params.run_mode == BOTH_ALG) {
+			//Pruning test_pruning(params);
+			test_pruning.init(params);
+		} else if (!detected_greedy) {
+			// DETECTED mode that chose pruning: initialise from the tree
+			// object that was already set up for greedy
+			test_pruning.init(test_greedy);
+		} else {
+			// DETECTED mode that already ran greedy: nothing more to do
+			return;
+		}
+		params.detected_mode = PRUNING;
+		t_begin=getCPUTime();
+		cout << endl << "Pruning Algorithm..." << endl;
+		taxa_set.clear();
+		test_pruning.run(params, taxa_set);
+
+		t_end=getCPUTime();
+		params.run_time = (t_end-t_begin) ;
+		cout << "Time used: " << params.run_time << " seconds.\n";
+		if (params.min_size == params.sub_size)
+			cout << "Resulting tree length = " << taxa_set[0].score << endl;
+
+		if (params.nr_output > 0)
+			printTaxaSet(params, taxa_set, PRUNING);
+
+		PDRelatedMeasures pd_more;
+
+		summarizeTree(params, test_pruning, taxa_set, pd_more);
+
+	}
+
+}
+
+/**
+	Report groups of taxa whose pairwise split-distance is (almost) zero.
+	For every taxon i, all taxa j > i with |dist[i][j]| <= 1e-5 are listed on
+	one line together with i. An explanatory note is printed before the first
+	such line and a separator after the last one; nothing is written when no
+	such pair exists.
+	@param out stream to write to
+	@param sg split network providing the distances and the taxon labels
+*/
+void checkSplitDistance(ostream &out, PDNetwork &sg) {
+	matrix(double) dist;
+	sg.calcDistance(dist);
+	int ntaxa = sg.getNTaxa();
+	int i, j;
+	bool found = false;	// true once any near-zero pair has been reported
+	for (i = 0; i < ntaxa-1; i++) {
+		bool first = true;	// true until taxon i has started its own line
+		for (j = i+1; j < ntaxa; j++)
+			// fabs() always works on the double value; an unqualified abs()
+			// can resolve to abs(int) and truncate every distance below 1 to 0
+			if (fabs(dist[i][j]) <= 1e-5) {
+				if (!found) {
+					out << "The following sets of taxa (each set in a line) have very small split-distance" << endl;
+					out << "( <= 1e-5) as computed from the split system. To avoid a lot of multiple" << endl;
+					out << "optimal PD sets to be reported, one should only keep one taxon from each set" << endl;
+					out << "and exclude the rest from the analysis." << endl << endl;
+				}
+				if (first)
+					out << sg.getTaxa()->GetTaxonLabel(i);
+				found = true;
+				first = false;
+				out << ", " << sg.getTaxa()->GetTaxonLabel(j);
+			}
+		if (!first) out << endl;
+	}
+	if (found)
+		separator(out);
+}
+
+
+
+/**
+	check if the set are nested and there are no multiple optimal sets.
+	If yes, return the ranking as could be produced by a greedy algorithm.
+
+	ranking and indices are filled as parallel vectors (entry k of indices
+	belongs to entry k of ranking); both are emptied first.
+	- ranking holds taxon IDs, interleaved with negative markers:
+	  -10 where a size has more than one optimal set, -1 and -2 around a
+	  step whose previous set is not contained in the current one.
+	- indices holds, for a taxon of the first set, a running number; for a
+	  later step, the step number for taxa that were added and its negative
+	  for taxa that were dropped; 0 for every marker entry.
+	Only the first set of each size is examined; empty sizes are skipped.
+	@param pd_set optimal sets, one SplitSet per size
+	@param indices (OUT) step numbers, parallel to ranking
+	@param ranking (OUT) taxon IDs and markers
+	@return true if every size has a single set and consecutive sets are nested
+*/
+bool makeRanking(vector<SplitSet> &pd_set, IntVector &indices, IntVector &ranking) {
+	vector<SplitSet>::iterator it;
+	IntVector::iterator inti;
+	ranking.clear();
+	// indices must start empty as well, otherwise it no longer lines up
+	// with ranking
+	indices.clear();
+	bool nested = true;
+	Split *cur_sp = NULL;	// first set of the previous non-empty size
+	int id = 1;
+	for (it = pd_set.begin(); it != pd_set.end(); it++) {
+		if ((*it).empty()) continue;
+		if ((*it).size() > 1) {
+			// several optimal sets for this size: mark it
+			nested = false;
+			ranking.push_back(-10);
+			indices.push_back(0);
+		}
+		Split *sp = (*it)[0];
+
+		if (!cur_sp) {
+			// first set: all of its taxa enter the ranking, numbered 1, 2, ...
+			IntVector sp_tax;
+			sp->getTaxaList(sp_tax);
+			ranking.insert(ranking.end(), sp_tax.begin(), sp_tax.end());
+			for (inti = sp_tax.begin(); inti != sp_tax.end(); inti++)
+				indices.push_back(id++);
+		} else {
+			if ( !cur_sp->subsetOf(*sp)) {
+				ranking.push_back(-1);
+				indices.push_back(0);
+				nested = false;
+			}
+			// sp_diff: current minus previous, sp_diff2: previous minus current
+			// (operator-= is presumably set difference - verify against Split)
+			Split sp_diff(*sp);
+			sp_diff -= *cur_sp;
+			Split sp_diff2(*cur_sp);
+			sp_diff2 -= *sp;
+			IntVector sp_tax;
+			sp_diff2.getTaxaList(sp_tax);
+			ranking.insert(ranking.end(), sp_tax.begin(), sp_tax.end());
+			for (inti = sp_tax.begin(); inti != sp_tax.end(); inti++)
+				indices.push_back(-id);
+			sp_diff.getTaxaList(sp_tax);
+			ranking.insert(ranking.end(), sp_tax.begin(), sp_tax.end());
+			for (inti = sp_tax.begin(); inti != sp_tax.end(); inti++)
+				indices.push_back(id);
+			if ( !cur_sp->subsetOf(*sp)) {
+				ranking.push_back(-2);
+				indices.push_back(0);
+			}
+			id++;
+		}
+		cur_sp = sp;
+	}
+	return nested;
+}
+
+
+/**
+	Write all optimal sets to a NEXUS file as a Sets block. Every set becomes
+	one TAXSET named Opt_<set size>_<running number within its size>, listing
+	area names when sg.isPDArea() holds and taxon labels otherwise.
+	A failure to open or write the file is reported through
+	outError(ERR_WRITE_OUTPUT, filename).
+	@param filename name of the file to write
+	@param sg split network providing the taxon labels / area names
+	@param pd_set optimal sets, one SplitSet per size
+*/
+void printNexusSets(const char *filename, PDNetwork &sg, vector<SplitSet> &pd_set) {
+	try {
+		ofstream out;
+		// without this the stream never throws, the handler below is dead code
+		// and a failed open would still be announced as success
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename);
+		out << "#NEXUS" << endl << "BEGIN Sets;" << endl;
+		vector<SplitSet>::iterator it;
+		for (it = pd_set.begin(); it != pd_set.end(); it++) {
+			int id = 1;	// running number of the set within its size
+			for (SplitSet::iterator sit = (*it).begin(); sit != (*it).end(); sit++, id++) {
+				IntVector taxa;
+				(*sit)->getTaxaList(taxa);
+				out << "   TAXSET Opt_" << taxa.size() << "_" << id << " =";
+				for (IntVector::iterator iit = taxa.begin(); iit != taxa.end(); iit++) {
+					if (sg.isPDArea())
+						out << " '" << sg.getSetsBlock()->getSet(*iit)->name << "'";
+					else
+						out << " '" << sg.getTaxa()->GetTaxonLabel(*iit) << "'";
+				}
+				out << ";" << endl;
+			}
+		}
+		out << "END; [Sets]" << endl;
+		out.close();
+		cout << endl << "Optimal sets are written to nexus file " << filename << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+
+}
+
+
+
+/**
+	For every taxon, compute the fraction of the given sets that contain it.
+	The number of taxa is taken from the first set.
+	@param taxa_set sets to examine; must not be empty
+	@param freq (OUT) resized to the number of taxa; entry i receives the
+		count of sets containing taxon i divided by the number of sets
+*/
+void computeTaxaFrequency(SplitSet &taxa_set, DoubleVector &freq) {
+	assert(taxa_set.size());
+	const int ntaxa = taxa_set[0]->getNTaxa();
+
+	freq.resize(ntaxa, 0);
+
+	// count, per taxon, the sets it occurs in
+	for (SplitSet::iterator member = taxa_set.begin(); member != taxa_set.end(); member++) {
+		for (int taxon = 0; taxon < ntaxa; taxon++) {
+			if ((*member)->containTaxon(taxon))
+				freq[taxon] += 1.0;
+		}
+	}
+
+	// turn the counts into fractions
+	for (int taxon = 0; taxon < ntaxa; taxon++) {
+		freq[taxon] /= taxa_set.size();
+	}
+}
+
+/**
+	summarize the running results
+*/
+void summarizeSplit(Params &params, PDNetwork &sg, vector<SplitSet> &pd_set, PDRelatedMeasures &pd_more, bool full_report) {
+	int i;
+
+
+	if (params.nexus_output) {
+		string nex_file = params.out_prefix;
+		nex_file += ".pdsets.nex";
+		printNexusSets(nex_file.c_str(), sg, pd_set);
+	}
+	string filename;
+	if (params.out_file == NULL) {
+		filename = params.out_prefix;
+		filename += ".pda";
+	} else
+		filename = params.out_file;
+
+	try {
+		ofstream out;
+		out.open(filename.c_str());
+		/****************************/
+		/********** HEADER **********/
+		/****************************/
+		summarizeHeader(out, params, sg.isBudgetConstraint(), IN_NEXUS);
+
+		out << "Network size: " << sg.getNTaxa()-params.is_rooted << " taxa, " <<
+			sg.getNSplits()-params.is_rooted << " splits (of which " <<
+			sg.getNTrivialSplits() << " are trivial splits)" << endl;
+		out << "Network type: " << ((sg.isCircular()) ? "Circular" : "General") << endl;
+
+		separator(out);
+
+		checkSplitDistance(out, sg);
+
+		int c_num = 0;
+		//int subsize = (sg.isBudgetConstraint()) ? params.budget : (params.sub_size-params.is_rooted);
+		//subsize -= pd_set.size()-1;
+		int subsize = (sg.isBudgetConstraint()) ? params.min_budget : params.min_size-params.is_rooted;
+		int stepsize = (sg.isBudgetConstraint()) ? params.step_budget : params.step_size;
+		if (params.detected_mode != LINEAR_PROGRAMMING) stepsize = 1;
+		vector<SplitSet>::iterator it;
+		SplitSet::iterator it2;
+
+
+		if (params.run_mode == PD_USER_SET) {
+			printPDUser(out, params, pd_more);
+		}
+
+		/****************************/
+		/********** SUMMARY *********/
+		/****************************/
+
+		if (params.run_mode != PD_USER_SET && !params.num_bootstrap_samples) {
+			out << "Summary of the PD-score and the number of optimal PD-sets with the same " << endl << "optimal PD-score found." << endl;
+
+			if (sg.isBudgetConstraint())
+				out << endl << "Budget   PD-score   %PD-score   #PD-sets" << endl;
+			else
+				out << endl << "Size-k   PD-score   %PD-score   #PD-sets" << endl;
+
+			int sizex = subsize;
+			double total = sg.calcWeight();
+
+			for (it = pd_set.begin(); it != pd_set.end(); it++, sizex+=stepsize) {
+				out.width(6);
+				out << right << sizex << " ";
+				out.width(10);
+				out << right << (*it).getWeight() << " ";
+				out.width(10);
+				out << right << ((*it).getWeight()/total)*100.0 << " ";
+				out.width(6);
+				out << right << (*it).size();
+				out << endl;
+			}
+
+			out << endl;
+			if (!params.find_all)
+				out << "Note: You did not choose the option to find multiple optimal PD sets." << endl <<
+					"That's why we only reported one PD-set per size-k or budget. If you want" << endl <<
+					"to determine all multiple PD-sets, use the '-a' option.";
+			else {
+				out << "Note: The number of multiple optimal PD sets to be reported is limited to " << params.pd_limit << "." << endl <<
+					"There might be cases where the actual #PD-sets exceeds that upper-limit but" << endl <<
+					"won't be listed here. Please refer to the above list to identify such cases." << endl <<
+					"To increase the upper-limit, use the '-lim <limit_number>' option.";
+			}
+			out << endl;
+			separator(out);
+		}
+
+		if (!full_report) {
+			out.close();
+			return;
+		}
+
+
+		/****************************/
+		/********* BOOTSTRAP ********/
+		/****************************/
+		if (params.run_mode != PD_USER_SET && params.num_bootstrap_samples) {
+			out << "Summary of the bootstrap analysis " << endl;
+			for (it = pd_set.begin(); it != pd_set.end(); it++) {
+				DoubleVector freq;
+				computeTaxaFrequency((*it), freq);
+				out << "For k/budget = " << subsize << " the " << ((sg.isPDArea()) ? "areas" : "taxa")
+					<< " supports are: " << endl;
+				for (i = 0; i < freq.size(); i++)
+					out << ((sg.isPDArea()) ? sg.getSetsBlock()->getSet(i)->name : sg.getTaxa()->GetTaxonLabel(i))
+						<< "\t" << freq[i] << endl;
+				if ((it+1) != pd_set.end()) separator(out, 1);
+			}
+			out << endl;
+			separator(out);
+		}
+
+		/****************************/
+		/********** RANKING *********/
+		/****************************/
+
+		if (params.run_mode != PD_USER_SET && !params.num_bootstrap_samples) {
+
+
+			IntVector ranking;
+			IntVector index;
+
+			out << "Ranking based on the optimal sets" << endl;
+
+
+			if (!makeRanking(pd_set, index, ranking)) {
+				out << "WARNING: Optimal sets are not nested, so ranking should not be considered stable" << endl;
+			}
+			if (subsize > 1) {
+				out << "WARNING: The first " << subsize << " ranks should be treated equal" << endl;
+			}
+			out << endl << "Rank*   ";
+			if (!sg.isPDArea())
+				out << "Taxon names" << endl;
+			else
+				out << "Area names" << endl;
+
+
+			for (IntVector::iterator intv = ranking.begin(), intid = index.begin(); intv != ranking.end(); intv ++, intid++) {
+				if (*intv == -10)
+					out << "<--- multiple optimal set here --->" << endl;
+				else if (*intv == -1)
+					out << "<--- BEGIN: greedy does not work --->" << endl;
+				else if (*intv == -2)
+					out << "<--- END --->" << endl;
+				else {
+					out.width(5);
+					out <<  right << *intid << "   ";
+					if (sg.isPDArea())
+						out << sg.getSetsBlock()->getSet(*intv)->name << endl;
+					else
+						out << sg.getTaxa()->GetTaxonLabel(*intv) << endl;
+				}
+			}
+			out << endl;
+			out <<  "(*) Negative ranks indicate the point at which the greedy algorithm" << endl <<
+					"    does not work. In that case, the corresponding taxon/area names" << endl <<
+					"    should be deleted from the optimal set of the same size" << endl;
+			separator(out);
+		}
+
+		int max_len = sg.getTaxa()->GetMaxTaxonLabelLength();
+
+		/****************************/
+		/***** DETAILED SETS ********/
+		/****************************/
+
+		if (params.run_mode != PD_USER_SET)
+			out << "Detailed information of all taxa found in the optimal PD-sets" << endl;
+
+		if (pd_set.size() > 1) {
+			if (sg.isBudgetConstraint())
+				out << "with budget = " << params.min_budget <<
+					" to " << params.budget << endl << endl;
+			else
+				out << "with k = " << params.min_size-params.is_rooted <<
+					" to " << params.sub_size-params.is_rooted << endl << endl;
+		}
+
+		if (params.run_mode != PD_USER_SET)
+			separator(out,1);
+
+		for (it = pd_set.begin(); it != pd_set.end(); it++, subsize+=stepsize) {
+
+			// check if the pd-sets are the same as previous one
+			if (sg.isBudgetConstraint() && it != pd_set.begin()) {
+				vector<SplitSet>::iterator prev, next;
+				for (next=it, prev=it-1; next != pd_set.end() && next->getWeight() == (*prev).getWeight() &&
+					next->size() == (*prev).size(); next++ ) ;
+				if (next != it) {
+					// found something in between!
+					out << endl;
+					//out << endl << "**************************************************************" << endl;
+					out << "For budget = " << subsize << " -> " << subsize+(next-it-1)*stepsize <<
+						" the optimal PD score and PD sets" << endl;
+					out << "are identical to the case when budget = " << subsize-stepsize << endl;
+					//out << "**************************************************************" << endl;
+					subsize += (next-it)*stepsize;
+					it = next;
+					if (it == pd_set.end()) break;
+				}
+			}
+
+			if (it != pd_set.begin()) separator(out, 1);
+
+			int num_sets = (*it).size();
+			double weight = (*it).getWeight();
+
+			if (params.run_mode != PD_USER_SET) {
+				out << "For " << ((sg.isBudgetConstraint()) ? "budget" : "k") << " = " << subsize;
+				out << " the optimal PD score is " << weight << endl;
+
+				if (num_sets == 1) {
+					if (!sg.isBudgetConstraint())
+						out << "The optimal PD set has " << (*it)[0]->countTaxa()-params.is_rooted <<
+							((sg.isPDArea()) ? " areas" : " taxa");
+					else
+						out << "The optimal PD set has " << (*it)[0]->countTaxa()-params.is_rooted <<
+						((sg.isPDArea()) ? " areas" : " taxa") << " and requires " << sg.calcCost(*(*it)[0]) << " budget";
+					if (!sg.isPDArea()) out << " and covers " << sg.countSplits(*(*it)[0]) <<
+						" splits (of which " << sg.countInternalSplits(*(*it)[0]) << " are internal splits)";
+					out << endl;
+				}
+				else
+					out << "Found " << num_sets << " PD sets with the same optimal score." << endl;
+			}
+			for (it2 = (*it).begin(), c_num=1; it2 != (*it).end(); it2++, c_num++){
+				Split *this_set = *it2;
+
+				if (params.run_mode == PD_USER_SET && it2 != (*it).begin())
+					separator(out, 1);
+
+				if (params.run_mode == PD_USER_SET) {
+					if (!sg.isBudgetConstraint())
+						out << "Set " << c_num << " has PD score of " << this_set->getWeight();
+					else
+						out << "Set " << c_num << " has PD score of " << this_set->getWeight() <<
+						" and requires " << sg.calcCost(*this_set) << " budget";
+				} else if (num_sets > 1) {
+					if (!sg.isBudgetConstraint())
+						out << endl << "PD set " << c_num;
+					else
+						out << endl << "PD set " << c_num << " has " << this_set->countTaxa()-params.is_rooted <<
+						" taxa and requires " << sg.calcCost(*this_set) << " budget";
+				}
+
+				if (!sg.isPDArea() && (num_sets > 1 || params.run_mode == PD_USER_SET ))
+					out << " and covers " << sg.countSplits(*(*it)[0]) << " splits (of which "
+					<< sg.countInternalSplits(*(*it)[0]) << " are internal splits)";
+				out << endl;
+
+				if (params.run_mode != PD_USER_SET && sg.isPDArea()) {
+					for (i = 0; i < sg.getSetsBlock()->getNSets(); i++)
+						if (this_set->containTaxon(i)) {
+							if (sg.isBudgetConstraint()) {
+								out.width(max_len);
+								out << left << sg.getSetsBlock()->getSet(i)->name << "\t";
+								out.width(10);
+								out << right << sg.getPdaBlock()->getCost(i);
+								out << endl;
+
+							} else {
+								out << sg.getSetsBlock()->getSet(i)->name << endl;
+							}
+						}
+
+					Split sp(sg.getNTaxa());
+					for (i = 0; i < sg.getSetsBlock()->getNSets(); i++)
+						if (this_set->containTaxon(i))
+							sp += *(sg.area_taxa[i]);
+					out << endl << "which contains " << sp.countTaxa() - params.is_rooted << " taxa: " << endl;
+					for (i = 0; i < sg.getNTaxa(); i++)
+						if (sg.getTaxa()->GetTaxonLabel(i) != ROOT_NAME && sp.containTaxon(i))
+							out << sg.getTaxa()->GetTaxonLabel(i) << endl;
+
+				} else
+				for ( i = 0; i < sg.getNTaxa(); i++)
+					if (sg.getTaxa()->GetTaxonLabel(i) != ROOT_NAME && this_set->containTaxon(i)) {
+						if (sg.isBudgetConstraint()) {
+							out.width(max_len);
+							out << left << sg.getTaxa()->GetTaxonLabel(i) << "\t";
+							out.width(10);
+							out << right << sg.getPdaBlock()->getCost(i);
+							out << endl;
+
+						} else {
+							out << sg.getTaxa()->GetTaxonLabel(i) << endl;
+						}
+					}
+			}
+		}
+
+		/****************************/
+		/********** FOOTER **********/
+		/****************************/
+
+		summarizeFooter(out, params);
+
+		out.close();
+		cout << endl << "Results are summarized in " << filename << endl << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+}
+
+/**
+	Write a PD gain matrix to a text file, one matrix row per line.
+	Each line starts with a counter (start_k for the first row, increased by
+	one for every following row) and is followed by the entries of that row,
+	each preceded by two spaces. A write failure is reported through outError.
+	@param filename name of the output file
+	@param delta_gain matrix whose rows are written
+	@param start_k counter printed in front of the first row
+*/
+void printGainMatrix(char *filename, matrix(double) &delta_gain, int start_k) {
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename);
+		int row_label = start_k;
+		matrix(double)::iterator row = delta_gain.begin();
+		while (row != delta_gain.end()) {
+			out << row_label;
+			for (int col = 0; col < row->size(); col++)
+				out << "  " << (*row)[col];
+			out << endl;
+			++row;
+			++row_label;
+		}
+		out.close();
+		cout << "PD gain matrix printed to " << filename << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+}
+
+/**
+	Run the PD algorithm on a split network.
+	A CircularNetwork is built from params. Depending on params.run_mode the
+	function either scores the user-defined set (PD_USER_SET) or calls findPD,
+	once for the network itself or once per tree when
+	params.num_bootstrap_samples is set. The elapsed CPU time is stored in
+	params.run_time, the result is handed to printOutputSetScore and
+	summarizeSplit, and with params.calc_pdgain the gain matrix is written to
+	<out_prefix>.pdgain.
+	@param params program parameters; run_mode and run_time may be modified
+*/
+void runPDSplit(Params &params) {
+
+	cout << "Using NCL - Nexus Class Library" << endl << endl;
+
+	// initialize a split graph from the parameters
+	CircularNetwork sg(params);
+	int i;
+
+	// this vector of SplitSet stores all the optimal PD sets
+	vector<SplitSet> pd_set;
+	// this defines an order of taxa (circular order in case of circular networks)
+	vector<int> taxa_order;
+	// this stores a particular taxa set (not referenced again in this function)
+	Split taxa_set;
+
+
+	if (sg.isCircular()) {
+		// is a circular network, get circular order
+		for (i = 0; i < sg.getNTaxa(); i++)
+			taxa_order.push_back(sg.getCircleId(i));
+	} else
+		// otherwise, get the incremental order
+		for (i = 0; i < sg.getNTaxa(); i++)
+			taxa_order.push_back(i);
+
+	PDRelatedMeasures pd_more;
+
+	// beginning time of the algorithm run
+	double time_begin = getCPUTime();
+	//time(&time_begin);
+	// check parameters: for PD-area input without budget/size and without a proportion, only user sets are scored
+	if (sg.isPDArea()) {
+		if (sg.isBudgetConstraint()) {
+			int budget = (params.budget >= 0) ? params.budget : sg.getPdaBlock()->getBudget(); // command-line value wins over the PDA block
+			if (budget < 0 && params.pd_proportion == 0.0) params.run_mode = PD_USER_SET;
+		} else {
+			int sub_size = (params.sub_size >= 1) ? params.sub_size : sg.getPdaBlock()->getSubSize(); // command-line value wins over the PDA block
+			if (sub_size < 1 && params.pd_proportion == 0.0) params.run_mode = PD_USER_SET;
+
+		}
+	}
+
+	if (params.run_mode == PD_USER_SET) {
+		// compute score of user-defined sets
+		cout << "Computing PD score for user-defined set of taxa..." << endl;
+		pd_set.resize(1);
+		sg.computePD(params, pd_set[0], pd_more);
+		if (params.endemic_pd)
+			sg.calcPDEndemism(pd_set[0], pd_more.PDEndemism);
+
+		if (params.complement_area != NULL)
+			sg.calcPDComplementarity(pd_set[0], params.complement_area, pd_more.setName, pd_more.PDComplementarity);
+
+	} else {
+		// otherwise, call the main function
+		if (params.num_bootstrap_samples) {
+			cout << endl << "======= START BOOTSTRAP ANALYSIS =======" << endl;
+			MTreeSet *mtrees = sg.getMTrees();
+			if (mtrees->size() < 100)
+				cout << "WARNING: bootstrap may be unstable with less than 100 trees" << endl;
+			vector<string> taxname;
+			sg.getTaxaName(taxname);
+			i = 1; // 1-based tree counter used only for the progress line
+			for (MTreeSet::iterator it = mtrees->begin(); it != mtrees->end(); it++, i++) {
+				cout << "---------- TREE " << i << " ----------" << endl;
+				// convert tree into split system
+				SplitGraph sg2;
+				(*it)->convertSplits(taxname, sg2);
+				// change the current split system
+				for (SplitGraph::reverse_iterator it = sg.rbegin(); it != sg.rend(); it++) { // NOTE: this 'it' shadows the tree iterator of the enclosing loop
+					delete *it;
+				}
+				sg.clear();
+				sg.insert(sg.begin(), sg2.begin(), sg2.end());
+				sg2.clear(); // the split pointers were copied into sg above; presumably cleared so sg2 does not free them -- TODO confirm
+
+				// now findPD on the converted tree-split system
+				sg.findPD(params, pd_set, taxa_order);
+			}
+			cout << "======= DONE BOOTSTRAP ANALYSIS =======" << endl << endl;
+		} else {
+			sg.findPD(params, pd_set, taxa_order);
+		}
+	}
+
+	// ending time
+	double time_end = getCPUTime();
+	//time(&time_end);
+	params.run_time = time_end - time_begin;
+
+	cout << "Time used: " << (double) (params.run_time) << " seconds." << endl;
+
+	if (verbose_mode >= VB_DEBUG && !sg.isPDArea()) {
+		cout << "PD set(s) with score(s): " << endl;
+		for (vector<SplitSet>::iterator it = pd_set.begin(); it != pd_set.end(); it++)
+		for (SplitSet::iterator it2 = (*it).begin(); it2 != (*it).end(); it2++ ){
+			//(*it)->report(cout);
+			cout << "  " << (*it2)->getWeight() << "    ";
+			for (i = 0; i < sg.getNTaxa(); i++)
+				if ((*it2)->containTaxon(i))
+				cout << sg.getTaxa()->GetTaxonLabel(i) << "  ";
+			if (sg.isBudgetConstraint())
+				cout << " (budget = " << sg.calcCost(*(*it2)) << ")";
+			cout << endl;
+		}
+	}
+
+	sg.printOutputSetScore(params, pd_set);
+
+
+	summarizeSplit(params, sg, pd_set, pd_more, true);
+
+	if (params.calc_pdgain) {
+		matrix(double) delta_gain;
+		sg.calcPDGain(pd_set, delta_gain);
+		string filename = params.out_prefix;
+		filename += ".pdgain";
+		printGainMatrix((char*)filename.c_str(), delta_gain, pd_set.front().front()->countTaxa()); // NOTE(review): assumes pd_set and its first SplitSet are non-empty
+		//cout << delta_gain;
+	}
+
+
+	//for (i = pd_set.size()-1; i >= 0; i--)
+	//	delete pd_set[i];
+
+}
+
+/**
+	Report the taxa of a split graph on cout, followed by every split whose
+	weight exceeds 50 and which contains more than one taxon.
+	@param sg split graph to report
+	@param hash_ss not used by this function
+*/
+void printSplitSet(SplitGraph &sg, SplitIntMap &hash_ss) {
+	sg.getTaxa()->Report(cout);
+	SplitGraph::iterator split_it = sg.begin();
+	while (split_it != sg.end()) {
+		// weight is tested first; the taxon count is only queried for heavy splits
+		if ((*split_it)->getWeight() > 50) {
+			if ((*split_it)->countTaxa() > 1)
+				(*split_it)->report(cout);
+		}
+		++split_it;
+	}
+}
+
+void readTaxaOrder(char *taxa_order_file, StrVector &taxa_order) {
+	// empty stub: both parameters are ignored and taxa_order is left unchanged
+}
+
+/**
+	Create the clusters of the user tree and write them to files.
+	The tree is read from params.user_file, its taxa are sorted with
+	nodenamecmp and passed to createCluster. The tree is printed to
+	<out_prefix>.clu-id; for the cnt-th cluster (counting from 1) two files
+	are written: <out_prefix>.<cnt>.clu (a fixed sequence of input lines
+	followed by one letter per taxon) and <out_prefix>.<cnt>.name-clu
+	(taxon name and its letter). Taxa named ROOT_NAME are left out.
+	@param params program parameters; taxa_order_file must be non-NULL
+*/
+void calcTreeCluster(Params &params) {
+	assert(params.taxa_order_file);
+	MExtTree tree(params.user_file, params.is_rooted);
+	NodeVector taxa;
+	matrix(int) clusters;
+	clusters.reserve(tree.leafNum - 3);
+	tree.getTaxa(taxa);
+	sort(taxa.begin(), taxa.end(), nodenamecmp);
+	tree.createCluster(taxa, clusters);
+	int cnt = 1;
+
+	string treename = params.out_prefix;
+	treename += ".clu-id";
+	tree.printTree(treename.c_str());
+
+	for (matrix(int)::iterator it = clusters.begin(); it != clusters.end(); it++, cnt++) {
+		ostringstream filename;
+		filename << params.out_prefix << "." << cnt << ".clu";
+		ofstream out(filename.str().c_str());
+
+		ostringstream filename2;
+		filename2 << params.out_prefix << "." << cnt << ".name-clu";
+		ofstream out2(filename2.str().c_str());
+
+		out << "w" << endl << "c" << endl << "4" << endl << "b" << endl << "g" << endl << 4-params.is_rooted << endl;
+		IntVector::iterator it2;
+		NodeVector::iterator it3;
+		// cluster entries and the sorted taxa are walked in parallel
+		for (it2 = (*it).begin(), it3 = taxa.begin(); it2 != (*it).end(); it2++, it3++)
+			if ((*it3)->name != ROOT_NAME) {
+				out << char((*it2)+'a') << endl;
+				out2 << (*it3)->name << "  " << char((*it2)+'a') << endl;
+			}
+		out << "y" << endl;
+		out.close();
+		out2.close();
+		// Print the names via str(): inserting the rdbuf() of an output-only
+		// ostringstream transfers no characters and sets failbit on cout,
+		// which would suppress all console output from here on.
+		cout << "Cluster " << cnt << " printed to " << filename.str() << " and " << filename2.str() << endl;
+	}
+}
+
+
+/**
+	Write the sorted taxon names of the user tree to <out_prefix>.taxa,
+	one line per taxon. For a taxon named ROOT_NAME only the line break is
+	written, i.e. it yields an empty line.
+	@param params program parameters (user_file, is_rooted, out_prefix are read)
+*/
+void printTaxa(Params &params) {
+	MTree mytree(params.user_file, params.is_rooted);
+	vector<string> taxname;
+	taxname.resize(mytree.leafNum);
+	mytree.getTaxaName(taxname);
+	sort(taxname.begin(), taxname.end());
+
+	string filename = string(params.out_prefix) + ".taxa";
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename.c_str());
+		for (size_t pos = 0; pos < taxname.size(); pos++) {
+			const string &label = taxname[pos];
+			if (label != ROOT_NAME)
+				out << label;
+			out << endl;
+		}
+		out.close();
+		cout << "All taxa names printed to " << filename << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+}
+
+/**
+	Read the sets block of the NEXUS file params.user_file and write the name
+	of every set to <out_prefix>.names, one per line.
+	@param params program parameters (user_file and out_prefix are read)
+*/
+void printAreaList(Params &params) {
+	MSetsBlock *sets = new MSetsBlock();
+	cout << "Reading input file " << params.user_file << "..." << endl;
+
+	// register the block with the reader, then parse the file
+	MyReader nexus(params.user_file);
+	nexus.Add(sets);
+	MyToken token(nexus.inf);
+	nexus.Execute(token);
+
+	TaxaSetNameVector *allsets = sets->getSets();
+
+	string filename = string(params.out_prefix) + ".names";
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename.c_str());
+		TaxaSetNameVector::iterator area = allsets->begin();
+		while (area != allsets->end()) {
+			out << (*area)->name << endl;
+			++area;
+		}
+		out.close();
+		cout << "All area names printed to " << filename << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+
+	delete sets;
+}
+
+/**
+	Scale the branch lengths (run_mode SCALE_BRANCH_LEN) or otherwise the clade
+	supports of the input tree by params.scaling_factor and print the tree to
+	params.out_file, or to cout when no output file is given.
+	@param params program parameters; is_rooted is set to true
+*/
+void scaleBranchLength(Params &params) {
+	params.is_rooted = true;
+	PDTree tree(params);
+	const bool scale_lengths = (params.run_mode == SCALE_BRANCH_LEN);
+	const char *announcement = scale_lengths
+		? "Scaling branch length with a factor of "
+		: "Scaling clade support with a factor of ";
+	cout << announcement << params.scaling_factor << " ..." << endl;
+	if (scale_lengths)
+		tree.scaleLength(params.scaling_factor, false);
+	else
+		tree.scaleCladeSupport(params.scaling_factor, false);
+
+	if (params.out_file == NULL) {
+		tree.printTree(cout);
+		cout << endl;
+		return;
+	}
+	tree.printTree(params.out_file);
+}
+
+/**
+	Write PD scores of randomized taxon sets to <out_prefix>.randompd.
+	For every size from params.min_size to params.sub_size (in steps of
+	params.step_size) one line is written: the size followed by
+	params.sample_size weights, each obtained by randomizing a Split of that
+	size and passing it to calcPD.
+	@param params program parameters
+*/
+void calcDistribution(Params &params) {
+
+	PDTree mytree(params);
+
+	string filename = string(params.out_prefix) + ".randompd";
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename.c_str());
+		int size = params.min_size;
+		while (size <= params.sub_size) {
+			out << size;
+			int sample = 0;
+			while (sample < params.sample_size) {
+				Split taxset(mytree.leafNum);
+				taxset.randomize(size);
+				mytree.calcPD(taxset);
+				out << "  " << taxset.getWeight();
+				sample++;
+			}
+			out << endl;
+			size += params.step_size;
+		}
+		out.close();
+		cout << "PD distribution is printed to " << filename << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+}
+
+/**
+	Print Robinson-Foulds distances to a stream.
+	In RF_ADJACENT_PAIR mode a single row labelled "XXX" with the first n
+	entries of rfdist is printed. In every other mode rfdist is read as an
+	n x m table stored row by row and printed with one "Tree<i>" line per row.
+	@param out output stream
+	@param rfdist distances to print
+	@param n number of rows (number of entries in adjacent-pair mode)
+	@param m number of columns (only used outside adjacent-pair mode)
+	@param rf_dist_mode layout selector
+*/
+void printRFDist(ostream &out, int *rfdist, int n, int m, int rf_dist_mode) {
+	if (rf_dist_mode != RF_ADJACENT_PAIR) {
+		// all pairs
+		out << n << " " << m << endl;
+		for (int row = 0; row < n; row++) {
+			out << "Tree" << row << "      ";
+			for (int col = 0; col < m; col++)
+				out << " " << rfdist[row*m+col];
+			out << endl;
+		}
+		return;
+	}
+
+	out << "XXX        ";
+	out << 1 << " " << n << endl;
+	for (int pos = 0; pos < n; pos++)
+		out << " " << rfdist[pos];
+	out << endl;
+}
+
+/**
+	Compute Robinson-Foulds distances between each tree of one file and the
+	trees of a second file, and write them to an output file.
+	The trees of trees1 are read one at a time; for each of them
+	MTree::computeRFDist(trees2, dist) supplies one row of distances. All rows
+	are printed with printRFDist in the all-pairs layout.
+	@param trees1 file the trees are read from one by one
+	@param trees2 file name passed to MTree::computeRFDist for every tree
+	@param filename name of the output file
+*/
+void computeRFDistExtended(const char *trees1, const char *trees2, const char *filename) {
+	cout << "Reading input trees 1 file " << trees1 << endl;
+	int ntrees = 0, ntrees2 = 0;
+	int *rfdist_raw = NULL;
+	try {
+		ifstream in;
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(trees1);
+		IntVector rfdist;
+		for (ntrees = 1; !in.eof(); ntrees++) {
+			MTree tree;
+			bool is_rooted = false;
+
+			// read in the tree and convert into split system for indexing
+			tree.readTree(in, is_rooted);
+			if (verbose_mode >= VB_DEBUG)
+				cout << ntrees << " " << endl;
+			IntVector dist;
+			tree.computeRFDist(trees2, dist);
+			ntrees2 = dist.size();
+			rfdist.insert(rfdist.end(), dist.begin(), dist.end());
+			// look for the start of another tree; exceptions are switched off
+			// so that running into the end of the file here does not throw
+			char ch;
+			in.exceptions(ios::goodbit);
+			(in) >> ch;
+			if (in.eof()) break;
+			in.unget();
+			in.exceptions(ios::failbit | ios::badbit);
+
+		}
+
+		in.close();
+		assert(ntrees * ntrees2 == rfdist.size());
+		rfdist_raw = new int[rfdist.size()];
+		copy(rfdist.begin(), rfdist.end(), rfdist_raw);
+
+	} catch (const ios::failure &) {
+		outError(ERR_READ_INPUT, trees1);
+	}
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename);
+		printRFDist(out, rfdist_raw, ntrees, ntrees2, RF_TWO_TREE_SETS_EXTENDED);
+		out.close();
+		cout << "Robinson-Foulds distances printed to " << filename << endl;
+	} catch (const ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+
+	// the flat copy was previously never released
+	delete [] rfdist_raw;
+}
+
+/**
+	Compute Robinson-Foulds distances for the trees in params.user_file and
+	write them to <out_prefix>.rfdist.
+	- RF_TWO_TREE_SETS_EXTENDED: delegated to computeRFDistExtended.
+	- RF_TWO_TREE_SETS: distances between the trees of user_file and those of
+	  params.second_tree (n x m table); with verbose_mode >= VB_MAX the
+	  incompatible-split counts are also written to <out_prefix>.incomp.
+	- any other mode: distances within the single tree set (n x n storage),
+	  computed according to params.rf_dist_mode.
+	With verbose_mode >= VB_MED the distances are echoed to cout as well.
+	@param params program parameters
+*/
+void computeRFDist(Params &params) {
+
+	if (!params.user_file) outError("User tree file not provided");
+
+	string filename = params.out_prefix;
+	filename += ".rfdist";
+
+	if (params.rf_dist_mode == RF_TWO_TREE_SETS_EXTENDED) {
+		computeRFDistExtended(params.user_file, params.second_tree, filename.c_str());
+		return;
+	}
+
+	MTreeSet trees(params.user_file, params.is_rooted, params.tree_burnin, params.tree_max_count);
+	int n = trees.size(), m = trees.size();
+	int *rfdist;
+	int *incomp_splits = NULL;
+	string infoname = params.out_prefix;
+	infoname += ".rfinfo";
+	string treename = params.out_prefix;
+	treename += ".rftree";
+	if (params.rf_dist_mode == RF_TWO_TREE_SETS) {
+		MTreeSet treeset2(params.second_tree, params.is_rooted, params.tree_burnin, params.tree_max_count);
+		cout << "Computing Robinson-Foulds distances between two sets of trees" << endl;
+		m = treeset2.size();
+		// element count is formed in size_t so that n*m cannot overflow int
+		size_t num_pairs = (size_t) n * (size_t) m;
+		rfdist = new int [num_pairs];
+		memset(rfdist, 0, num_pairs * sizeof(int));
+		if (verbose_mode >= VB_MAX) {
+			incomp_splits = new int [num_pairs];
+			memset(incomp_splits, 0, num_pairs * sizeof(int));
+		}
+		if (verbose_mode >= VB_MED)
+			trees.computeRFDist(rfdist, &treeset2, infoname.c_str(),treename.c_str(), incomp_splits);
+		else
+			trees.computeRFDist(rfdist, &treeset2);
+	} else {
+		size_t num_pairs = (size_t) n * (size_t) n;
+		rfdist = new int [num_pairs];
+		memset(rfdist, 0, num_pairs * sizeof(int));
+		trees.computeRFDist(rfdist, params.rf_dist_mode, params.split_weight_threshold);
+	}
+
+	if (verbose_mode >= VB_MED) printRFDist(cout, rfdist, n, m, params.rf_dist_mode);
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename.c_str());
+		printRFDist(out, rfdist, n, m, params.rf_dist_mode);
+		out.close();
+		cout << "Robinson-Foulds distances printed to " << filename << endl;
+	} catch (const ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+
+	if (incomp_splits) {
+		try {
+			filename = params.out_prefix;
+			filename += ".incomp";
+			ofstream out;
+			out.exceptions(ios::failbit | ios::badbit);
+			out.open(filename.c_str());
+			printRFDist(out, incomp_splits, n, m, params.rf_dist_mode);
+			out.close();
+			// message wording repaired (was "... splits in printed to")
+			cout << "Number of incompatible splits printed to " << filename << endl;
+		} catch (const ios::failure &) {
+			outError(ERR_WRITE_OUTPUT, filename);
+		}
+	}
+
+	// delete[] on a null pointer is a no-op, so no guard is needed
+	delete [] incomp_splits;
+	delete [] rfdist;
+}
+
+
+/**
+	Build a SplitGraph from params and report on cout whether the split
+	system is weakly compatible.
+	@param params program parameters
+*/
+void testInputFile(Params &params) {
+	SplitGraph sg(params);
+	const char *verdict = sg.isWeaklyCompatible()
+		? "The split system is weakly compatible."
+		: "The split system is NOT weakly compatible.";
+	cout << verdict << endl;
+
+}
+
+/**MINH ANH: for some statistics about the branches on the input tree.
+	The tree is drawn on cout, printBrInfo is written to params.out_file (or,
+	if that is not set, to <prefix>.stats) and the lengths of all inner
+	branches are written to <prefix>.inlen. <prefix> is params.out_prefix, or
+	params.user_file when no prefix is set.
+	@param params program parameters
+*/
+void branchStats(Params &params){
+	MaTree mytree(params.user_file, params.is_rooted);
+	mytree.drawTree(cout,WT_TAXON_ID + WT_INT_NODE);
+	// prefix shared by both output names; out_prefix may be NULL
+	const char *prefix = (params.out_prefix) ? params.out_prefix : params.user_file;
+	//report to output file
+	string output;
+	if (params.out_file)
+		output = params.out_file;
+	else {
+		output = prefix;
+		output += ".stats";
+	}
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(output.c_str());
+		mytree.printBrInfo(out);
+	} catch (const ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, output);
+	}
+	cout << "Information about branch lengths of the tree is printed to: " << output << endl;
+
+	/***** Following added by BQM to print internal branch lengths */
+	NodeVector nodes1, nodes2;
+	mytree.getAllInnerBranches(nodes1, nodes2);
+	// previously built from params.out_prefix unconditionally, which is
+	// undefined behaviour for a NULL prefix; use the same fallback as above
+	output = prefix;
+	output += ".inlen";
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(output.c_str());
+		for (size_t i = 0; i < nodes1.size(); i++)
+			out << nodes1[i]->findNeighbor(nodes2[i])->length << " ";
+		out << endl;
+	} catch (const ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, output);
+	}
+	cout << "Internal branch lengths printed to: " << output << endl;
+}
+
+/**MINH ANH: for comparison between the input tree and each tree in a given set of trees.
+	The reference tree (params.second_tree) is drawn to <second_tree>.draw and
+	compared to every tree of params.user_file via comparedTo. One line per
+	tree (branch columns, RF and BSD) is written to params.out_file or to
+	<prefix>.compare. Matrix entries equal to -2 are left out.
+*/
+void compare(Params &params){
+	MaTree mytree(params.second_tree, params.is_rooted);
+	// number the taxa by sorted name, to be consistent with MTreeSet
+	NodeVector taxa;
+	mytree.getTaxa(taxa);
+	sort(taxa.begin(), taxa.end(), nodenamecmp);
+	int next_id = 0;
+	for (NodeVector::iterator leaf = taxa.begin(); leaf != taxa.end(); ++leaf)
+		(*leaf)->id = next_id++;
+
+	string drawFile = string(params.second_tree) + ".draw";
+	try {
+		ofstream out1;
+		out1.exceptions(ios::failbit | ios::badbit);
+		out1.open(drawFile.c_str());
+		mytree.drawTree(out1,WT_TAXON_ID + WT_INT_NODE);
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, drawFile);
+	}
+	cout << "Tree with branchID (nodeID) was printed to: " << drawFile << endl;
+
+
+	MTreeSet trees(params.user_file,params.is_rooted, params.tree_burnin, params.tree_max_count);
+	DoubleMatrix brMatrix;
+	DoubleVector BSDs;
+	IntVector RFs;
+	mytree.comparedTo(trees, brMatrix, RFs, BSDs);
+	int numTree = trees.size();
+	int numNode = mytree.nodeNum;
+
+	// explicit output file wins; otherwise derive the name from prefix or input
+	string output;
+	if (params.out_file) {
+		output = params.out_file;
+	} else {
+		output = (params.out_prefix) ? params.out_prefix : params.user_file;
+		output += ".compare";
+	}
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(output.c_str());
+		// header: the first matrix row decides which branch columns exist
+		out << "tree  " ;
+		for (int col = 0; col < numNode; col++) {
+			if ( brMatrix[0][col] == -2 )
+				continue;
+			out << "br_" << col << "  ";
+		}
+		out << "RF  BSD" << endl;
+		for (int row = 0; row < numTree; row++) {
+			out << row << "  ";
+			for (int col = 0; col < numNode; col++) {
+				if ( brMatrix[row][col] == -2 )
+					continue;
+				out << brMatrix[row][col] << "  ";
+			}
+			out << RFs[row] << "  " << BSDs[row] << endl;
+		}
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, output);
+	}
+	cout << "Comparison with the given set of trees is printed to: " << output << endl;
+}
+
+/**MINH ANH: to compute 'guided bootstrap' alignment.
+	Reads the alignment and its site log-likelihoods, prints the pattern
+	frequencies to <out_prefix>.patInfo, generates the expected alignment into
+	<out_prefix>.gbo (PHYLIP) and writes the value returned through prob to
+	<out_prefix>.gbo.logP with 10 digits of precision.
+*/
+void guidedBootstrap(Params &params)
+{
+	// the three output names only depend on the prefix
+	const string prefix = params.out_prefix;
+	string outFre_name = prefix + ".patInfo";
+	string gboAln_name = prefix + ".gbo";
+	string outProb_name = prefix + ".gbo.logP";
+
+	MaAlignment inputAlign(params.aln_file,params.sequence_type, params.intype);
+	inputAlign.readLogLL(params.siteLL_file);
+	inputAlign.printPatObsExpFre(outFre_name.c_str());
+
+	MaAlignment gboAlign;
+	double prob;
+	gboAlign.generateExpectedAlignment(&inputAlign, prob);
+	gboAlign.printPhylip(gboAln_name.c_str());
+
+	try {
+		ofstream outProb;
+		outProb.exceptions(ios::failbit | ios::badbit);
+		outProb.open(outProb_name.c_str());
+		outProb.precision(10);
+		outProb << prob << endl;
+		outProb.close();
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, outProb_name);
+	}
+
+	cout << "Information about patterns in the input alignment is printed to: " << outFre_name << endl;
+	cout << "A 'guided bootstrap' alignment is printed to: " << gboAln_name << endl;
+	cout << "Log of the probability of the new alignment is printed to: " << outProb_name << endl;
+}
+
+/**MINH ANH: to compute the probability of an alignment given the multinomial distribution of patterns frequencies derived from a reference alignment.
+	The reference is params.second_align, the evaluated alignment is
+	params.aln_file. The value is written to <out_prefix>.mprob with 10 digits
+	of precision and also reported on cout.
+*/
+void computeMulProb(Params &params)
+{
+	Alignment refAlign(params.second_align, params.sequence_type, params.intype);
+	Alignment inputAlign(params.aln_file, params.sequence_type, params.intype);
+	double prob;
+	inputAlign.multinomialProb(refAlign,prob);
+
+	// write the value to file
+	string outProb_name = string(params.out_prefix) + ".mprob";
+	try {
+		ofstream outProb;
+		outProb.exceptions(ios::failbit | ios::badbit);
+		outProb.open(outProb_name.c_str());
+		outProb.precision(10);
+		outProb << prob << endl;
+		outProb.close();
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, outProb_name);
+	}
+	cout << "Probability of alignment " << params.aln_file << " given alignment " << params.second_align << " is: " << prob << endl;
+	cout << "The probability is printed to: " << outProb_name << endl;
+}
+
+/**
+	Read an NCBI tree (and optionally the NCBI names file) and print it,
+	terminated by ";", to params.out_file or to <user_file>.tree.
+	The "printed to" message is emitted before the file is written.
+	@param params program parameters
+*/
+void processNCBITree(Params &params) {
+	NCBITree tree;
+	Node *dad = tree.readNCBITree(params.user_file, params.ncbi_taxid, params.ncbi_taxon_level, params.ncbi_ignore_level);
+	if (params.ncbi_names_file)
+		tree.readNCBINames(params.ncbi_names_file);
+
+	cout << "Dad ID: " << dad->name << " Root ID: " << tree.root->name << endl;
+
+	// default name is derived from the input; an explicit output file overrides it
+	string str = string(params.user_file) + ".tree";
+	if (params.out_file)
+		str = params.out_file;
+
+	cout << "NCBI tree printed to " << str << endl;
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(str.c_str());
+		tree.printTree(out, WT_SORT_TAXA | WT_BR_LEN | WT_TAXON_ID, tree.root, dad);
+		out << ";" << endl;
+		out.close();
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, str);
+	}
+}
+
+/*
+	Stream buffer that writes simultaneously to cout/cerr and a file.
+	open() installs this buffer on cout and cerr; every character then arrives
+	in overflow(), which hands it to the buffer cout had before (only when
+	verbose_mode >= VB_MIN) and to the buffer of the log file. close()
+	restores the original buffers of cout and cerr and closes the file.
+*/
+class outstreambuf : public streambuf {
+public:
+    outstreambuf* open( const char* name, ios::openmode mode = ios::out); // returns this, or NULL if the file could not be opened
+    outstreambuf* close(); // returns this, or NULL if no file was open
+    ~outstreambuf() { close(); }
+    
+protected:
+	ofstream fout; // the log file
+	streambuf *cout_buf; // buffer cout had before open()
+	streambuf *cerr_buf; // buffer cerr had before open()
+	streambuf *fout_buf; // buffer of fout
+    virtual int     overflow( int c = EOF);
+    virtual int     sync();
+};
+
+
+/**
+	Open the log file and route cout and cerr through this buffer.
+	The buffers cout and cerr had before are remembered for close().
+	@param name name of the log file
+	@param mode open mode passed to the file stream
+	@return this on success, NULL (after a message on cout) if the file could not be opened
+*/
+outstreambuf* outstreambuf::open( const char* name, ios::openmode mode) {
+	fout.open(name, mode);
+	if (fout.is_open()) {
+		// rdbuf(sb) installs sb and hands back the previous buffer
+		cout_buf = cout.rdbuf(this);
+		cerr_buf = cerr.rdbuf(this);
+		fout_buf = fout.rdbuf();
+		return this;
+	}
+	cout << "Could not open " << name << " for logging" << endl;
+	return NULL;
+}
+
+/**
+	Flush, give cout and cerr their original buffers back and close the file.
+	@return this if a file was open, NULL otherwise (nothing is done then)
+*/
+outstreambuf* outstreambuf::close() {
+	if (!fout.is_open())
+		return NULL;
+
+	sync();
+	cout.rdbuf(cout_buf);
+	cerr.rdbuf(cerr_buf);
+	fout.close();
+	return this;
+}
+
+/**
+	Forward one character to the saved cout buffer (only when
+	verbose_mode >= VB_MIN) and to the log file buffer.
+	@param c character to write, or EOF when there is nothing to append
+	@return EOF if one of the targets failed, a value different from EOF otherwise
+*/
+int outstreambuf::overflow( int c) { // used for output buffer only
+	// overflow(EOF) means "no character to append": report success without
+	// writing, instead of emitting EOF converted to a char (a stray 0xFF byte)
+	if (c == EOF)
+		return traits_type::not_eof(c);
+	if (verbose_mode >= VB_MIN) {
+		if (cout_buf->sputc(c) == EOF) return EOF;
+	}
+	if (fout_buf->sputc(c) == EOF) return EOF;
+	return c;
+}
+
+/**
+	Synchronize the saved cout buffer (only when verbose_mode >= VB_MIN) and
+	the log file buffer.
+	@return result of synchronizing the log file buffer
+*/
+int outstreambuf::sync() { // used for output buffer only
+	const bool echo_to_console = (verbose_mode >= VB_MIN);
+	if (echo_to_console) {
+		cout_buf->pubsync();
+	}
+	int file_status = fout_buf->pubsync();
+	return file_status;
+}
+
+outstreambuf _out_buf; // buffer opened/closed by startLogFile, appendLogFile and endLogFile
+string _log_file; // name of the log file passed to _out_buf.open()
+int _exit_wait_optn = FALSE; // when non-zero, funcExit waits for [return] before finishing
+
+
+extern "C" void startLogFile() {
+	_out_buf.open(_log_file.c_str()); // default mode ios::out; on success cout and cerr are routed through _out_buf
+}
+
+extern "C" void appendLogFile() {
+	_out_buf.open(_log_file.c_str(), ios::app); // like startLogFile, but opens the log file in append mode
+}
+
+extern "C" void endLogFile() {
+	_out_buf.close(); // restores the buffers of cout/cerr and closes the file; does nothing if no log file is open
+}
+
+/**
+	Exit hook: if _exit_wait_optn is set, prompt and wait until the user
+	presses [return], then stop logging via endLogFile().
+*/
+void funcExit(void) {
+	if(_exit_wait_optn) {
+		printf("\npress [return] to finish: ");
+		fflush(stdout);
+		// also stop at end of input: with stdin closed or exhausted getchar()
+		// keeps returning EOF and the former loop never terminated
+		int ch;
+		do {
+			ch = getchar();
+		} while (ch != '\n' && ch != EOF);
+	}
+
+	endLogFile();
+}
+
+/**
+	Handler for fatal signals. Prints a stack trace (GCC/clang builds other
+	than Windows/Cygwin), a crash banner naming the signal and the log file,
+	runs funcExit() and finally resets the signal to its default action.
+	Returning from the handler does not keep the program alive.
+	@param signal_number signal that triggered the handler
+*/
+extern "C" void funcAbort(int signal_number)
+{
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(WIN32) && !defined(__CYGWIN__)
+	print_stacktrace(cerr);
+#endif
+
+	// readable name of the signal; stays NULL for signals not listed here
+	const char *signal_name = NULL;
+	if (signal_number == SIGABRT)
+		signal_name = "ABORTED";
+	else if (signal_number == SIGFPE)
+		signal_name = "ERRONEOUS NUMERIC";
+	else if (signal_number == SIGILL)
+		signal_name = "ILLEGAL INSTRUCTION";
+	else if (signal_number == SIGSEGV)
+		signal_name = "SEGMENTATION FAULT";
+
+	cout << endl << "*** IQ-TREE CRASHES WITH SIGNAL ";
+	if (signal_name != NULL)
+		cout << signal_name;
+	cout << endl;
+	cout << "*** For bug report please send to developers:" << endl << "***    Log file: " << _log_file;
+	cout << endl << "***    Alignment files (if possible)" << endl;
+	funcExit();
+	signal(signal_number, SIG_DFL);
+}
+
+/** Discard the rest of the current stdin line; stops at EOF as well as at '\n'. */
+static void intargvSkipLine(void) {
+	int c;
+	do {
+		c = getc(stdin);
+	} while (c != '\n' && c != EOF);
+}
+
+/**
+ * Read a one-letter answer and consume the rest of its line.
+ * @return the lower-cased first character of the line, or EOF when stdin is exhausted
+ */
+static int intargvReadAnswer(void) {
+	int c = getc(stdin);
+	if (c == EOF) return EOF;
+	if (c != '\n') intargvSkipLine();
+	return tolower(c);
+}
+
+/**
+ * Read one whitespace-delimited parameter into a 100-byte slot and consume
+ * the rest of the input line.
+ * @param slot destination buffer of at least 100 bytes
+ * @return true when a parameter was stored; false for the lone "!" sentinel,
+ *         for EOF, or when the token did not fit (a message is printed then)
+ */
+static bool intargvReadParameter(char *slot) {
+	slot[0] = '\0';
+	// The field width must stay one below the slot size used in getintargv().
+	int count = fscanf(stdin, "%99s", slot);
+	int next = (count == 1) ? getc(stdin) : EOF;
+	// A non-blank character right after the token means it was cut off at 99.
+	bool too_long = (next != EOF && !isspace(next));
+	while (next != '\n' && next != EOF)
+		next = getc(stdin);
+
+	if (count != 1 || strcmp(slot, "!") == 0)
+		return false;
+	if (too_long) {
+		fprintf(stdout, "\nParameter too long!!!\n");
+		return false;
+	}
+	return true;
+}
+
+/**
+ * Interactively collect command line parameters from stdin (used when the
+ * program was "click-started").
+ *
+ * Slot 0 of the returned vector is left as an empty string; entered
+ * parameters start at index 1. The returned storage is heap allocated and is
+ * not freed by this function.
+ *
+ * @param argc receives the number of slots in use (1 when nothing was entered,
+ *             0 when the storage could not be allocated)
+ * @param argv receives the parameter vector (NULL when allocation failed)
+ */
+extern "C" void getintargv(int *argc, char **argv[]) 
+{
+	enum { MAX_SLOTS = 100, SLOT_SIZE = 100 };
+
+	char  *argtmp = (char  *)calloc(MAX_SLOTS * SLOT_SIZE + SLOT_SIZE, sizeof(char));
+	char **argstr = (char **)calloc(MAX_SLOTS, sizeof(char*));
+	if (argtmp == NULL || argstr == NULL) {
+		// Without storage there is nothing to collect; report "no parameters".
+		free(argtmp);
+		free(argstr);
+		*argc = 0;
+		*argv = NULL;
+		return;
+	}
+	for (int i = 0; i < MAX_SLOTS; i++) {
+		argstr[i] = &(argtmp[i * SLOT_SIZE]);
+	}
+	int n = 1; // next free slot
+
+	fprintf(stdout, "\nYou seem to have click-started this program,");
+	fprintf(stdout, "\ndo you want to enter commandline parameters: [y]es, [n]o: ");
+	fflush(stdout);
+
+	if (intargvReadAnswer() == 'y') {
+		fprintf(stdout, "\nEnter single parameter [! for none]: ");
+		fflush(stdout);
+		if (intargvReadParameter(argstr[n]))
+			n++;
+
+		bool done = false;
+		while (!done) {
+			fprintf(stdout, "\nCurrent commandline: ");
+			for (int l = 1; l < n; l++) {
+				fprintf(stdout, "%s ", argstr[l]);
+			}
+			fprintf(stdout, "\nQuit [q]; confirm [y]%s%s%s: ",
+				(n<99 ? ", extend [e]" : ""),
+				(n>1 ? ", delete last [l]" : ""),
+				(n>1 ? ", delete all [a]" : ""));
+			fflush(stdout);
+
+			switch (intargvReadAnswer()) {
+				case EOF: // no further input possible: keep what was entered
+				case 'y':
+					done = true;
+					break;
+				case 'e':
+					// Only extend while the menu offers it, so that n never
+					// runs past the slot table.
+					if (n < MAX_SLOTS - 1) {
+						fprintf(stdout, "\nEnter single parameter [! for none]: ");
+						fflush(stdout);
+						if (intargvReadParameter(argstr[n]))
+							n++;
+					}
+					break;
+				case 'l':
+					if (n>1) n--;
+					break;
+				case 'a':
+					n=1;
+					break;
+				case 'q':
+					if(_exit_wait_optn) {
+						printf("\npress [return] to finish: ");
+						fflush(stdout);
+						intargvSkipLine();
+					}
+					exit(0);
+					break;
+				default:
+					break;
+			}
+		}
+	}
+
+	*argc = n;
+	*argv = argstr;
+} /* getintargv */
+
+/*********************************************************************************************************************************
+	Olga: ECOpd - phylogenetic diversity with ecological constraint: choosing a viable subset of species which maximizes PD/SD
+*********************************************************************************************************************************/
+
+void processECOpd(Params &params) { // ECOpd driver: writes an IP model for the input tree / split network plus food web, solves it with gurobi_solve() and prints the results
+	double startTime = getCPUTime();
+	params.detected_mode = LINEAR_PROGRAMMING;
+	cout<<"----------------------------------------------------------------------------------------"<<endl;
+	int i;
+	double score; // objective value filled in by gurobi_solve()
+	double *variables; // solution vector, allocated per branch with new[] and released with delete[]
+	int threads = params.gurobi_threads;
+	params.gurobi_format=true;
+
+	string model_file,subFoodWeb,outFile; // all three names are derived from params.out_prefix below
+
+	model_file = params.out_prefix;
+	model_file += ".lp";
+
+	subFoodWeb = params.out_prefix;
+	subFoodWeb += ".subFoodWeb";
+
+	outFile = params.out_prefix;
+	outFile += ".pda";
+
+	//Defining the input phylo type: t - rooted/unrooted tree, n - split network
+	params.intype=detectInputFile(params.user_file);
+	if(params.intype == IN_NEWICK){
+		params.eco_type = "t";
+	} else if(params.intype == IN_NEXUS){
+		params.eco_type = "n";
+	} // NOTE(review): for any other input type eco_type keeps whatever value it had before this call - confirm it is always valid for the strcmp() below
+
+	// Checking whether to treat the food web as weighted or non weighted
+	if(params.diet_max == 0){
+		params.eco_weighted = false;
+	}else if(params.diet_max > 100 || params.diet_max < 0){
+		cout<<"The minimum percentage of the diet to be conserved for each predator"<<endl;
+		cout<<"d = "<<params.diet_max<<endl;
+		cout<<"ERROR: Wrong value of parameter d. It must be within the range 0 <= d <= 100"<<endl;
+		exit(0); // NOTE(review): error path exits with status 0 - confirm this is intended
+	}else{
+		params.eco_weighted = true;
+	}
+
+	if(strcmp(params.eco_type,"t")==0){
+	/*--------------------------------- EcoPD Trees ---------------------------------*/
+		ECOpd tree(params.user_file,params.is_rooted);
+
+		// Setting all the information-----------------
+		tree.phyloType = "t";
+		tree.TaxaNUM = tree.leafNum;
+		if(verbose_mode == VB_MAX){
+			cout<<"TaxaNUM = "<<tree.TaxaNUM<<endl;
+			cout<<"LeafNUM = "<<tree.leafNum<<endl;
+			cout<<"root_id = "<<tree.root->id<<" root_name = "<<tree.root->name<<endl;
+
+			for(i=0; i<tree.leafNum; i++){
+				cout<<i<<" "<<tree.findNodeID(i)->name <<endl;
+			}
+		}
+
+		//Getting Species Names from tree
+		for(i = 0; i < tree.TaxaNUM; i++)
+			(tree.phyloNames).push_back(tree.findNodeID(i)->name);
+		//for(i=0;i<tree.phyloNames.size();i++)
+		//	cout<<"["<<i<<"] "<<tree.phyloNames[i]<<endl;
+
+		// Full species list including info from tree and food web. Here adding names from phyloInput.
+		for(i=0; i<tree.TaxaNUM; i++)
+			tree.names.push_back(&(tree.phyloNames[i])); // stores addresses of phyloNames elements, so phyloNames is filled completely by the loop above first
+
+		// Read the taxa to be included in the final optimal subset
+		if(params.initial_file)
+			tree.readInitialTaxa(params.initial_file);
+
+		// Read the DAG file, Synchronize species on the Tree and in the Food Web
+		tree.weighted = params.eco_weighted;
+		tree.T = params.diet_max*0.01; // diet_max is a percentage (validated as 0..100 above); T is the same value as a fraction
+		tree.readDAG(params.eco_dag_file);
+		tree.defineK(params);
+
+		// IP formulation
+		cout<<"Formulating an IP problem..."<<endl;
+		if(tree.rooted){
+			tree.printECOlpRooted(model_file.c_str(),tree);
+		} else {
+			tree.printECOlpUnrooted(model_file.c_str(),tree);
+		}
+
+		// Solve IP problem
+		cout<<"Solving the problem..."<<endl;
+		variables = new double[tree.nvar];
+		int g_return = gurobi_solve((char*)model_file.c_str(), tree.nvar, &score, variables, verbose_mode, threads); // the cast drops const from c_str()
+		if(verbose_mode == VB_MAX){
+			cout<<"GUROBI finished with "<<g_return<<" return."<<endl;
+			for(i=0; i<tree.nvar; i++)
+				cout<<"x"<<i<<" = "<<variables[i]<<endl;
+			cout<<"score = "<<score<<endl;
+		}
+		tree.dietConserved(variables);
+		params.run_time = getCPUTime() - startTime; // CPU time measured from the start of this function
+		tree.printResults((char*)outFile.c_str(),variables,score,params);
+		tree.printSubFoodWeb((char*)subFoodWeb.c_str(),variables);
+		delete[] variables;
+
+	} else if(strcmp(params.eco_type,"n")==0){
+	/*----------------------------- EcoPD SplitNetwork ------------------------------*/
+		params.intype=detectInputFile(params.user_file); // repeats the detection already done at the top of the function
+		PDNetwork splitSYS(params);
+		ECOpd ecoInfDAG;
+
+		// Get the species names from SplitNetwork
+		splitSYS.speciesList(&(ecoInfDAG.phyloNames));
+		//for(i=0;i<ecoInfDAG.phyloNames.size();i++)
+		//	cout<<"["<<i<<"] "<<ecoInfDAG.phyloNames[i]<<endl;
+
+		ecoInfDAG.phyloType = "n";
+		ecoInfDAG.TaxaNUM = splitSYS.getNTaxa();
+
+		// Full species list including info from tree and food web
+		for(i=0; i<ecoInfDAG.TaxaNUM; i++)
+			ecoInfDAG.names.push_back(&(ecoInfDAG.phyloNames[i])); // stores addresses of phyloNames elements
+
+		ecoInfDAG.weighted = params.eco_weighted;
+		// Read the taxa to be included in the final optimal subset
+		if(params.initial_file)
+			ecoInfDAG.readInitialTaxa(params.initial_file);
+		ecoInfDAG.T = params.diet_max*0.01; // percentage converted to a fraction, as in the tree branch
+		ecoInfDAG.readDAG(params.eco_dag_file);
+		ecoInfDAG.defineK(params);
+
+		cout<<"Formulating an IP problem..."<<endl;
+		splitSYS.transformEcoLP(params, model_file.c_str(), 0);
+		/**
+		 * (subset_size-4) - influences constraints for conserved splits.
+		 * should be less than taxaNUM in the split system.
+		 * With 0 prints all the constraints.
+		 * Values different of 0 reduce the # of constraints.
+		 **/
+
+		ecoInfDAG.printInfDAG(model_file.c_str(),splitSYS,params); // called on the same model file right after transformEcoLP()
+		cout<<"Solving the problem..."<<endl;
+		variables = new double[ecoInfDAG.nvar];
+		int g_return = gurobi_solve((char*)model_file.c_str(), ecoInfDAG.nvar, &score, variables, verbose_mode, threads);
+		if(verbose_mode == VB_MAX){
+			cout<<"GUROBI finished with "<<g_return<<" return."<<endl;
+			for(i=0; i<ecoInfDAG.nvar; i++)
+				cout<<"x"<<i<<" = "<<variables[i]<<endl;
+			cout<<"score = "<<score<<endl;
+		}
+		ecoInfDAG.splitsNUM = splitSYS.getNSplits();
+		ecoInfDAG.totalSD = splitSYS.calcWeight();
+		ecoInfDAG.dietConserved(variables);
+		params.run_time = getCPUTime() - startTime;
+		ecoInfDAG.printResults((char*)outFile.c_str(),variables, score,params);
+		ecoInfDAG.printSubFoodWeb((char*)subFoodWeb.c_str(),variables);
+		delete[] variables;
+	}
+}
+
+/********************************************************
+	main function
+********************************************************/
+/*
+int main(){
+	IQTree tree;
+	char * str = "(1, (2, 345));";
+	string k;
+	tree.pllConvertTaxaID2IQTreeForm(str, k);
+	cout << str << endl;
+	cout << k << endl;
+	cout << "WHAT" << endl;
+	return 0;
+}
+*/
+
+/*
+Instruction set ID reported by vectorclass::instrset_detect
+0           = 80386 instruction set
+1  or above = SSE (XMM) supported by CPU (not testing for O.S. support)
+2  or above = SSE2
+3  or above = SSE3
+4  or above = Supplementary SSE3 (SSSE3)
+5  or above = SSE4.1
+6  or above = SSE4.2
+7  or above = AVX supported by CPU and operating system
+8  or above = AVX2
+9  or above = AVX512F
+*/
+int instruction_set; // instruction set ID assigned in main() from instrset_detect(); main() may lower it to at most 6
+
+int main(int argc, char *argv[]) // entry point: optional interactive parameter entry, argument parsing, log/signal setup, environment report, then dispatch to the analysis selected in Params
+{
+
+	/*************************/
+	{ /* local scope */
+		int found=FALSE;              /* "click" found in cmd name? */
+		int n, dummyint;
+		char *tmpstr;
+		int     intargc; 
+		char  **intargv; 
+		intargc = 0; 
+		intargv = NULL; 
+		
+		for (n = strlen(argv[0]) - 5; // try every start position from the end of argv[0] back to the last '/' or '\\'
+		    (n >= 0) && !found && (argv[0][n] != '/')
+		             && (argv[0][n] != '\\'); n--) {
+
+			tmpstr = &(argv[0][n]);
+			dummyint = 0;
+			(void)sscanf(tmpstr, "click%n", &dummyint); // %n stores the number of characters consumed; 5 means the whole literal matched
+			if (dummyint == 5) found = TRUE;
+			else {
+				dummyint = 0;
+				(void)sscanf(tmpstr, "CLICK%n", &dummyint);
+				if (dummyint == 5) found = TRUE;
+				else {
+					dummyint = 0;
+					(void)sscanf(tmpstr, "Click%n", &dummyint);
+					if (dummyint == 5) found = TRUE;
+				}
+			}
+		}
+		if(found) _exit_wait_optn = TRUE; // exit paths will wait for [return]
+
+		if (_exit_wait_optn) { // get commandline parameters from keyboard
+			getintargv(&intargc, &intargv); 
+			fprintf(stdout, "\n\n");
+			if(intargc > 1) { // if there were option entered, use them as argc/argv
+				argc = intargc; 
+				argv = intargv; 
+			} 
+		}
+	} /* local scope */
+	/*************************/
+
+	//Params params;
+	parseArg(argc, argv, Params::getInstance()); // all settings live in the Params singleton from here on
+
+	_log_file = Params::getInstance().out_prefix;
+	_log_file += ".log";
+	startLogFile();
+	atexit(funcExit); // funcExit() closes the log file on normal exit
+	signal(SIGABRT, &funcAbort); // the same crash reporter handles all four signals
+	signal(SIGFPE, &funcAbort);
+	signal(SIGILL, &funcAbort);
+	signal(SIGSEGV, &funcAbort);
+	printCopyright(cout);
+	/*
+	double x=1e-100;
+	double y=1e-101;
+	if (x > y) cout << "ok!" << endl;
+	else cout << "shit!" << endl;
+	*/
+	//FILE *pfile = popen("hostname","r");
+	char hostname[100]; // NOTE(review): not initialised and the gethostname() result is not checked - contents are indeterminate if the call fails
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+    WSADATA wsaData;
+    WSAStartup(MAKEWORD(2, 2), &wsaData);
+    gethostname(hostname, sizeof(hostname));
+    WSACleanup();
+#else
+	gethostname(hostname, sizeof(hostname));
+#endif
+	//fgets(hostname, sizeof(hostname), pfile);
+	//pclose(pfile);
+
+	instruction_set = instrset_detect();
+#if defined(BINARY32) || defined(__NOAVX__)
+    instruction_set = min(instruction_set, 6);
+#endif
+	if (instruction_set < 3) outError("Your CPU does not support SSE3!");
+	bool has_fma3 = (instruction_set >= 7) && hasFMA3(); // FMA is only probed when the detected level is 7 or above
+	bool has_fma4 = (instruction_set >= 7) && hasFMA4();
+
+#ifdef __FMA__
+	bool has_fma =  has_fma3 || has_fma4;
+	if (!has_fma) {
+		outError("Your CPU does not support FMA instruction, quiting now...");
+	}
+#endif
+
+	cout << "Host:    " << hostname << " (";
+	switch (instruction_set) { // values below 3 were rejected above; every value above 8 is reported as AVX512F
+	case 3: cout << "SSE3, "; break;
+	case 4: cout << "SSSE3, "; break;
+	case 5: cout << "SSE4.1, "; break;
+	case 6: cout << "SSE4.2, "; break;
+	case 7: cout << "AVX, "; break;
+	case 8: cout << "AVX2, "; break;
+	default: cout << "AVX512F, "; break;
+	}
+	if (has_fma3) cout << "FMA3, ";
+	if (has_fma4) cout << "FMA4, ";
+//#if defined __APPLE__ || defined __MACH__
+	cout << (int)(((getMemorySize()/1024.0)/1024)/1024) << " GB RAM)" << endl; // getMemorySize() divided by 1024 three times, truncated to an int
+//#else
+//	cout << (int)(((getMemorySize()/1000.0)/1000)/1000) << " GB RAM)" << endl;
+//#endif
+
+	cout << "Command:";
+	for (int i = 0; i < argc; i++)
+		cout << " " << argv[i];
+	cout << endl;
+
+	cout << "Seed:    " << Params::getInstance().ran_seed <<  " ";
+	init_random(Params::getInstance().ran_seed);
+
+	time_t cur_time;
+	time(&cur_time);
+	cout << "Time:    " << ctime(&cur_time);
+
+	if (Params::getInstance().lk_no_avx)
+		instruction_set = min(instruction_set, 6); // cap at 6 so the "instruction_set >= 7" test below reports SSE3
+
+	cout << "Kernel:  ";
+	if (Params::getInstance().pll) {
+#ifdef __AVX__
+		cout << "PLL-AVX";
+#else
+		cout << "PLL-SSE3";
+#endif
+	} else {
+		switch (Params::getInstance().SSE) {
+		case LK_NORMAL: cout << "Slow"; break;
+		case LK_SSE: cout << "Slow SSE3"; break;
+		case LK_EIGEN: cout << "No SSE"; break;
+		case LK_EIGEN_SSE:
+			if (instruction_set >= 7) {
+				cout << "AVX";
+			} else {
+				cout << "SSE3";
+			}
+
+#ifdef __FMA__
+			cout << "+FMA";
+#endif
+			break;
+		}
+	}
+
+
+
+#ifdef _OPENMP
+	if (Params::getInstance().num_threads == 0) { // OpenMP builds require an explicit thread count
+		cout << endl << endl;
+		outError("Please specify the number of cores to use (-nt option)!");
+	}
+	if (Params::getInstance().num_threads) omp_set_num_threads(Params::getInstance().num_threads);
+//	int max_threads = omp_get_max_threads();
+	Params::getInstance().num_threads = omp_get_max_threads(); // store the value OpenMP reports back
+	int max_procs = countPhysicalCPUCores();
+	cout << " - " << Params::getInstance().num_threads  << " threads (" << max_procs << " CPU cores detected)";
+	if (Params::getInstance().num_threads  > max_procs) {
+		cout << endl;
+		outError("You have specified more threads than CPU cores available");
+	}
+	omp_set_nested(false); // don't allow nested OpenMP parallelism
+#else
+	if (Params::getInstance().num_threads != 1) {
+		cout << endl << endl;
+		outError("Number of threads must be 1 for sequential version.");
+	}
+    int num_procs = countPhysicalCPUCores();
+    if (num_procs > 1) {
+        cout << endl << endl << "NOTE: Consider using the multicore version because your CPU has " << num_procs << " cores!";
+    }
+#endif
+	//cout << "sizeof(int)=" << sizeof(int) << endl;
+	cout << endl << endl;
+
+	cout.precision(3);
+	cout.setf(ios::fixed);
+
+	// call the main function
+	if (Params::getInstance().tree_gen != NONE) { // the first matching condition in this chain selects the analysis
+		generateRandomTree(Params::getInstance());
+	} else if (Params::getInstance().do_pars_multistate) {
+		doParsMultiState(Params::getInstance());
+	} else if (Params::getInstance().rf_dist_mode != 0) {
+		computeRFDist(Params::getInstance());
+	} else if (Params::getInstance().test_input != TEST_NONE) {
+		Params::getInstance().intype = detectInputFile(Params::getInstance().user_file);
+		testInputFile(Params::getInstance());
+	} else if (Params::getInstance().run_mode == PRINT_TAXA) {
+		printTaxa(Params::getInstance());
+	} else if (Params::getInstance().run_mode == PRINT_AREA) {
+		printAreaList(Params::getInstance());
+	} else if (Params::getInstance().run_mode == SCALE_BRANCH_LEN || Params::getInstance().run_mode == SCALE_NODE_NAME) {
+		scaleBranchLength(Params::getInstance());
+	} else if (Params::getInstance().run_mode == PD_DISTRIBUTION) {
+		calcDistribution(Params::getInstance());
+	} else if (Params::getInstance().run_mode == STATS){ /**MINH ANH: for some statistics on the input tree*/
+		branchStats(Params::getInstance()); // MA
+	} else if (Params::getInstance().branch_cluster > 0) {
+		calcTreeCluster(Params::getInstance());
+	} else if (Params::getInstance().ncbi_taxid) {
+		processNCBITree(Params::getInstance());
+	} else if (Params::getInstance().user_file && Params::getInstance().eco_dag_file) { /**ECOpd analysis*/
+		processECOpd(Params::getInstance());
+	} else if (Params::getInstance().aln_file || Params::getInstance().partition_file) {
+		if ((Params::getInstance().siteLL_file || Params::getInstance().second_align) && !Params::getInstance().gbo_replicates)
+		{
+			if (Params::getInstance().siteLL_file)
+				guidedBootstrap(Params::getInstance());
+			if (Params::getInstance().second_align)
+				computeMulProb(Params::getInstance());
+		} else {
+			runPhyloAnalysis(Params::getInstance());
+		}
+	} else if (Params::getInstance().ngs_file || Params::getInstance().ngs_mapped_reads) {
+		runNGSAnalysis(Params::getInstance());
+	} else if (Params::getInstance().pdtaxa_file && Params::getInstance().gene_scale_factor >=0.0 && Params::getInstance().gene_pvalue_file) {
+		runGSSAnalysis(Params::getInstance());
+	} else if (Params::getInstance().consensus_type != CT_NONE) {
+		MExtTree tree; // only passed to assignBootstrapSupport() below
+		switch (Params::getInstance().consensus_type) {
+			case CT_CONSENSUS_TREE:
+				computeConsensusTree(Params::getInstance().user_file, Params::getInstance().tree_burnin, Params::getInstance().tree_max_count, Params::getInstance().split_threshold,
+					Params::getInstance().split_weight_threshold, Params::getInstance().out_file, Params::getInstance().out_prefix, Params::getInstance().tree_weight_file, &Params::getInstance());
+				break;
+			case CT_CONSENSUS_NETWORK:
+				computeConsensusNetwork(Params::getInstance().user_file, Params::getInstance().tree_burnin, Params::getInstance().tree_max_count, Params::getInstance().split_threshold,
+					Params::getInstance().split_weight_summary, Params::getInstance().split_weight_threshold, Params::getInstance().out_file, Params::getInstance().out_prefix, Params::getInstance().tree_weight_file);
+				break;
+			case CT_ASSIGN_SUPPORT:
+				assignBootstrapSupport(Params::getInstance().user_file, Params::getInstance().tree_burnin, Params::getInstance().tree_max_count, 
+					Params::getInstance().second_tree, Params::getInstance().is_rooted, Params::getInstance().out_file,
+					Params::getInstance().out_prefix, tree, Params::getInstance().tree_weight_file, &Params::getInstance());
+				break;
+			case CT_ASSIGN_SUPPORT_EXTENDED:
+				assignBranchSupportNew(Params::getInstance());
+				break;
+			case CT_NONE: break;
+			/**MINH ANH: for some comparison*/
+			case COMPARE: compare(Params::getInstance()); break; //MA
+		}
+	} else { // none of the above: PD analysis on a tree (runPDTree) or on splits (runPDSplit)
+		Params::getInstance().intype = detectInputFile(Params::getInstance().user_file);
+		if (Params::getInstance().intype == IN_NEWICK && Params::getInstance().pdtaxa_file && Params::getInstance().tree_gen == NONE) {
+			if (Params::getInstance().budget_file) {
+				//if (Params::getInstance().budget < 0) Params::getInstance().run_mode = PD_USER_SET;
+			} else {
+				if (Params::getInstance().sub_size < 1 && Params::getInstance().pd_proportion == 0.0)
+					Params::getInstance().run_mode = PD_USER_SET;
+			}
+			// input is a tree, check if it is a reserve selection -> convert to splits
+			if (Params::getInstance().run_mode != PD_USER_SET) Params::getInstance().multi_tree = true;
+		}
+
+
+		if (Params::getInstance().intype == IN_NEWICK && !Params::getInstance().find_all && Params::getInstance().budget_file == NULL &&
+			Params::getInstance().find_pd_min == false && Params::getInstance().calc_pdgain == false &&
+			Params::getInstance().run_mode != LINEAR_PROGRAMMING && Params::getInstance().multi_tree == false)
+			runPDTree(Params::getInstance());
+		else if (Params::getInstance().intype == IN_NEXUS || Params::getInstance().intype == IN_NEWICK) {
+			if (Params::getInstance().run_mode == LINEAR_PROGRAMMING && Params::getInstance().find_pd_min)
+				outError("Current linear programming does not support finding minimal PD sets!");
+			if (Params::getInstance().find_all && Params::getInstance().run_mode == LINEAR_PROGRAMMING)
+				Params::getInstance().binary_programming = true;
+			runPDSplit(Params::getInstance());
+		} else {
+			outError("Unknown file input format");
+		}
+	}
+
+	time(&cur_time);
+	cout << "Date and Time: " << ctime(&cur_time);
+
+	finish_random();
+	return EXIT_SUCCESS;
+}
diff --git a/pdnetwork.cpp b/pdnetwork.cpp
new file mode 100644
index 0000000..6f679cd
--- /dev/null
+++ b/pdnetwork.cpp
@@ -0,0 +1,1964 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "mtree.h"
+#include "pdnetwork.h"
+#include "ncl/ncl.h"
+#include "msetsblock.h"
+#include "myreader.h" 
+#include "lpwrapper.h"
+#include "gurobiwrapper.h"
+
+extern void summarizeSplit(Params &params, PDNetwork &sg, vector<SplitSet> &pd_set, PDRelatedMeasures &pd_more, bool full_report);
+
+
+/** Default constructor: starts with extra_pd at zero and min_pd switched off. */
+PDNetwork::PDNetwork()
+ : SplitGraph()
+{
+	min_pd = false;
+	extra_pd = 0;
+}
+
+PDNetwork::PDNetwork(Params &params) : SplitGraph(params) { // builds the network from params, then applies the optional root, weight, budget, PD-min and initial-set settings in the order below
+	extra_pd = 0;
+	min_pd = false;
+
+	if (params.is_rooted) 
+		readRootNode(ROOT_NAME); // makes the ROOT_NAME taxon the only entry of initialset
+
+	// read the parameter file
+	if (params.param_file != NULL) 
+		readParams(params);
+
+	if (params.budget_file != NULL) {
+		if (isPDArea())
+			pda->readBudgetAreaFile(params);
+		else
+			pda->readBudgetFile(params);
+	}
+	// identify the root
+	if (params.root != NULL) 
+		readRootNode(params.root); // readRootNode() clears initialset, so this replaces an entry made for ROOT_NAME above
+
+	// initial PD min
+	if (params.find_pd_min)
+		initPDMin(); // negates every split weight
+
+	// read the initial set of taxa, incorporate info into the split system
+	if (params.initial_file != NULL && params.eco_dag_file == NULL) // skipped when an eco_dag_file is given
+		readInitialSet(params);
+
+	if (!initialset.empty() && !isPDArea())
+		proceedInitialSet();
+
+	if (params.initial_area_file != NULL)
+		readInitialAreas(params);
+
+
+
+}
+
+
+/**
+	Look up the root taxon by name and make it the sole member of the
+	initial set. An unknown name is reported through outError().
+	@param root_name name of the root node
+*/
+void PDNetwork::readRootNode(const char *root_name)
+{
+	int root_id = -1;
+	try {
+		root_id = taxa->FindTaxon(root_name);
+	}
+	catch (NxsTaxaBlock::NxsX_NoSuchTaxon) {
+		outError(ERR_NO_TAXON, root_name);
+	}
+
+	// any previously collected initial taxa are dropped
+	initialset.clear();
+	initialset.push_back(root_id);
+}
+
+
+
+/**
+	Read the taxon weights from the parameter file and fold them into the
+	split system: every split weight is multiplied by the scale factor and
+	every trivial split additionally receives the weight of its taxon.
+	When params.scaling_factor is non-negative it replaces the scale read
+	from the file and the taxon weights are multiplied by (1 - scale).
+	@param params program parameters
+*/
+void PDNetwork::readParams(Params &params) {
+	int ntaxa = getNTaxa() - params.is_rooted;
+
+	// read parameters from file
+	double scale;
+	StrVector tax_name;
+	DoubleVector ori_weight, tax_weight;
+	readWeightFile(params, ntaxa, scale, tax_name, ori_weight);
+
+	// Convert the weights into a table indexed by taxon id. The table holds
+	// one entry per taxon id (getNTaxa() entries): both FindTaxon() and
+	// trivial() below return raw taxon ids, so a table with only
+	// getNTaxa() - is_rooted entries could be indexed past its end.
+	tax_weight.resize(getNTaxa(), 0);
+	for (size_t i = 0; i < tax_name.size(); i++) {
+		int id = -1;
+		try {
+			string name(tax_name[i]);
+			id = taxa->FindTaxon(NxsString(name.c_str()));
+		} catch (NxsTaxaBlock::NxsX_NoSuchTaxon) {
+			outError(ERR_NO_TAXON, tax_name[i]);
+		}
+		// never index with the "not found" marker
+		if (id >= 0)
+			tax_weight[id] = ori_weight[i];
+	}
+
+	if (params.scaling_factor >= 0) {
+		if (params.scaling_factor > 1) outError("Scaling factor must be between 0 and 1");
+		cout << "Rescaling split weights with " << params.scaling_factor << 
+			" and taxa weights with " << 1 - params.scaling_factor << endl;
+		scale = params.scaling_factor;
+		for (DoubleVector::iterator it = tax_weight.begin(); it != tax_weight.end(); it++)
+			(*it) *= (1 - scale);
+	}
+
+	// incorporate into the split system
+	for (iterator it = begin(); it != end(); it++) {
+		int id = (*it)->trivial();
+		// first, multiply split weight with the coefficient
+		(*it)->weight *= scale;
+
+		// if a trivial split, add the weight of its taxon
+		if (id >= 0)
+			(*it)->weight += tax_weight[id];
+	}
+}
+
+
+/**
+	Read the taxa that must be part of the PD set from the initial-taxa file
+	and append their ids to initialset. Unless areas are in use, the set is
+	then checked against the budget or against the subset size k.
+	@param params program parameters
+*/
+void PDNetwork::readInitialSet(Params &params) {
+	extra_pd = 0.0;
+
+	int ntaxa = getNTaxa() - params.is_rooted;
+	StrVector tax_name;
+	readInitTaxaFile(params, ntaxa, tax_name);
+	if (tax_name.empty()) {
+		outError("No taxa found");
+	}
+
+	// translate every name into its taxon id
+	for (StrVector::size_type idx = 0; idx < tax_name.size(); ++idx) {
+		int taxon_id = -1;
+		try {
+			string name(tax_name[idx]);
+			taxon_id = taxa->FindTaxon(NxsString(name.c_str()));
+		}
+		catch (NxsTaxaBlock::NxsX_NoSuchTaxon) {
+			outError(ERR_NO_TAXON, tax_name[idx]);
+		}
+		initialset.push_back(taxon_id);
+	}
+
+	// no size or budget check when working with areas
+	if (isPDArea())
+		return;
+
+	if (!isBudgetConstraint()) {
+		// a command line k above 1 takes precedence over the value stored in pda
+		int sub_size = (params.sub_size > 1) ? params.sub_size : pda->sub_size;
+		if (initialset.size() > sub_size)
+			outError(ERR_TOO_SMALL_K);
+		return;
+	}
+
+	// a non-negative command line budget takes precedence over the value stored in pda
+	int budget = (params.budget >= 0) ? params.budget : pda->budget;
+	if (calcCost(initialset) > budget)
+		outError(ERR_TOO_SMALL_BUDGET);
+}
+
+/**
+	Raise the weight of the trivial split of every initial taxon by
+	trunc(|total weight| + 1) and accumulate the added amount in extra_pd.
+*/
+void PDNetwork::proceedInitialSet() {
+	const double bonus = trunc(abs(calcWeight())+1);
+
+	// ids of the initial taxa, for fast membership tests
+	set<int> forced(initialset.begin(), initialset.end());
+
+	for (iterator split = begin(); split != end(); ++split) {
+		// trivial() yields the taxon id of a trivial split, a negative value otherwise
+		const int taxon = (*split)->trivial();
+		if (taxon >= 0 && forced.count(taxon) > 0) {
+			(*split)->weight += bonus;
+			extra_pd += bonus;
+		}
+	}
+}
+
+/**
+	Read the areas that must be part of the solution from the initial-area
+	file and append their ids to initialareas, then check them against the
+	budget or against the subset size k.
+	@param params program parameters
+*/
+void PDNetwork::readInitialAreas(Params &params) {
+	if (!isPDArea()) {
+		outError("Invalid -ia option: no areas specified");
+	}
+
+	int nareas = sets->getNSets();
+	StrVector area_name;
+	readInitAreaFile(params, nareas, area_name);
+	if (area_name.empty()) {
+		outError("No area found");
+	}
+
+	// translate every area name into its id
+	for (StrVector::size_type idx = 0; idx < area_name.size(); ++idx) {
+		int area_id = sets->findArea(area_name[idx]);
+		if (area_id < 0)
+			outError(ERR_NO_AREA, area_name[idx]);
+		initialareas.push_back(area_id);
+	}
+
+	if (!isBudgetConstraint()) {
+		// a command line k of at least 1 takes precedence over the value stored in pda
+		int sub_size = (params.sub_size >= 1) ? params.sub_size : pda->sub_size;
+		if (initialareas.size() > sub_size)
+			outError(ERR_TOO_SMALL_K);
+		return;
+	}
+
+	// a non-negative command line budget takes precedence over the value stored in pda
+	int budget = (params.budget >= 0) ? params.budget : pda->budget;
+	if (calcCost(initialareas) > budget)
+		outError(ERR_TOO_SMALL_BUDGET);
+}
+
+
+/** Switch min_pd on and flip the sign of every split weight. */
+void PDNetwork::initPDMin() {
+	min_pd = true;
+
+	iterator split = begin();
+	while (split != end()) {
+		(*split)->weight = -((*split)->weight);
+		++split;
+	}
+}
+
+
+/**
+	compute the required costs to conserve a taxa set
+
+	The costs in pda->costs are doubles, so they are summed in a double and
+	converted to int once at the end; adding them into an int would drop
+	the fractional part after every single addition.
+	@param taxset set of taxa (ids used as indices into pda->costs)
+	@return sum of the costs, truncated to int
+*/
+int PDNetwork::calcCost(IntVector &taxset) {
+	double sum = 0.0;
+	for (IntVector::iterator it = taxset.begin(); it != taxset.end(); it++)
+		sum += pda->costs[*it];
+	return (int)sum;
+}
+
+/**
+	compute the required costs to conserve the taxa contained in a split
+	@param taxset taxa set given as a split
+	@return result of calcCost() on the list of taxa obtained from taxset
+*/
+int PDNetwork::calcCost(Split &taxset)
+{
+	IntVector taxon_ids;
+	taxset.getTaxaList(taxon_ids);
+	const int total_cost = calcCost(taxon_ids);
+	return total_cost;
+}
+
+
+
+/********************************************************
+	Now comes PD related stuff
+********************************************************/
+
+
+
+/**
+	Store in id_set.weight the weight of id_set extended by all taxa of
+	initialset; id_set's own taxa are left untouched.
+	@param id_set taxa set whose weight field is updated
+*/
+void PDNetwork::calcPD(Split &id_set) {
+	if (!initialset.empty()) {
+		// work on a copy so that the caller's taxa set is not extended
+		Split with_initial(id_set);
+		for (IntVector::iterator taxon = initialset.begin(); taxon != initialset.end(); ++taxon) {
+			with_initial.addTaxon(*taxon);
+		}
+		id_set.weight = calcWeight(with_initial);
+	} else {
+		id_set.weight = calcWeight(id_set);
+	}
+}
+
+/**
+	Store in id_set.weight the total weight minus the PD of the inverted
+	set. id_set is inverted for the computation and inverted back afterwards.
+	@param id_set taxa set whose weight field is updated
+*/
+void PDNetwork::calcExclusivePD(Split &id_set)
+{
+	// PD of the inverted set ends up in id_set.weight
+	id_set.invert();
+	calcPD(id_set);
+	id_set.invert();
+
+	const double total_weight = calcWeight();
+	id_set.weight = total_weight - id_set.weight;
+}
+
+
+
+/**
+	Compute the PD of every taxa set stored in sets.
+	For each set a new Split is allocated and appended to pd_set; its name and
+	PD score (and the exclusive PD when params.exclusive_pd is set) are
+	appended to pd_more.
+	@param params program parameters (exclusive_pd is read)
+	@param pd_set (OUT) receives one newly allocated Split per taxa set
+	@param pd_more (OUT) receives names, PD scores and exclusive PD values
+*/
+void PDNetwork::computePD(Params &params, SplitSet &pd_set, PDRelatedMeasures &pd_more) {
+	TaxaSetNameVector *allsets = sets->getSets();
+
+	for (TaxaSetNameVector::iterator set_it = allsets->begin(); set_it != allsets->end(); ++set_it) {
+		Split *members = new Split(getNTaxa());
+
+		// add every taxon of this set that the taxa block knows about
+		vector<string> &names = (*set_it)->taxlist;
+		for (vector<string>::size_type k = 0; k < names.size(); ++k) {
+			int taxon_id = -1;
+			try {
+				taxon_id = taxa->FindTaxon(NxsString(names[k].c_str()));
+			}
+			catch (NxsTaxaBlock::NxsX_NoSuchTaxon) {
+				outError(ERR_NO_TAXON, names[k]);
+			}
+			if (taxon_id < 0)
+				continue;
+			members->addTaxon(taxon_id);
+		}
+
+		pd_more.setName.push_back((*set_it)->name);
+
+		// exclusive PD first: calcPD() below overwrites the weight field again
+		if (params.exclusive_pd) {
+			calcExclusivePD(*members);
+			pd_more.exclusivePD.push_back(members->getWeight());
+		}
+
+		calcPD(*members);
+		pd_more.PDScore.push_back(members->weight);
+		pd_set.push_back(members);
+	}
+}
+
+
+
+/********************************************************
+	EXHAUSTIVE FUNCTION
+********************************************************/
+
+/**
+	Record curset in the list of best sets.
+	When curset is strictly better than the sets stored so far (compared with
+	the first stored entry), the stored sets are deleted first; in every case
+	a copy of curset is appended.
+	@param curset candidate set
+	@param best_set (IN/OUT) list of best sets; may be empty on entry
+*/
+void PDNetwork::updateSplitVector(Split &curset, SplitSet &best_set) 
+{
+	// Only look at best_set[0] when it exists: indexing an empty list is
+	// undefined behaviour.
+	if (best_set.size() > 0 && curset.weight > best_set[0]->weight) {
+		for (int it = best_set.size()-1; it >= 0; it--) 
+			delete best_set[it];
+		best_set.clear();
+	}
+	best_set.push_back(new Split(curset));
+}
+
+/**
+	calculate sum of weights of preserved splits in the taxa_set
+
+	Only the splits whose indices lie in [rem_splits.begin(), rem_it) are
+	examined. Every index whose split is preserved by taxa_set is swapped to
+	the position just before rem_it and rem_it is moved back by one, so on
+	return the indices of the newly preserved splits sit at rem_it and after.
+	@param taxa_set a set of taxa
+	@param rem_splits list of split indices; reordered in place
+	@param rem_it (IN/OUT) end of the range still to be examined
+	@return sum of the weights of the splits found to be preserved
+*/
+double PDNetwork::calcRaisedWeight(Split &taxa_set, 
+	IntList &rem_splits, IntList::iterator &rem_it)
+{
+	double sum = 0.0;
+	for (IntList::iterator it = rem_splits.begin(); it != rem_it;) // no increment here: after a swap the same position holds an unexamined index
+		if ((*this)[*it]->preserved(taxa_set)) {
+			sum += (*this)[*it]->weight;
+			IntList::iterator prev_it = rem_it;
+			prev_it--; // last position of the range still to be examined
+			int temp = *it;
+			*it = *prev_it;
+			*prev_it = temp;
+			rem_it = prev_it; // shrink the range; when it == prev_it the loop ends here
+		} else it++;
+	return sum;
+}
+
+/**
+	Sum of all costs stored in pda->costs.
+
+	The costs are doubles, so they are summed in a double and converted to
+	int once at the end; adding them into an int would drop the fractional
+	part after every single addition.
+	@return total cost, truncated to int
+*/
+int PDNetwork::calcMaxBudget() {
+	double sum = 0.0;
+	for (DoubleVector::iterator it = pda->costs.begin(); it != pda->costs.end(); it++)
+		sum += (*it);
+	return (int)sum;
+}
+
+
+void PDNetwork::enterFindPD(Params &params) { // checks and completes the budget / subset-size settings in params before a PD search and prints the split weight totals
+	// check parameters
+	if (params.pd_proportion == 0.0) { // the checks below are skipped when a PD proportion was given
+		if (isBudgetConstraint()) {
+			int budget = (params.budget >= 0) ? params.budget : pda->getBudget(); // local copy, used for the check only
+			if (budget < 0) {
+				outError(ERR_NO_BUDGET);
+			}
+		} else {
+			int min_accepted = !isPDArea() + 1; // 2 for taxa, 1 for areas
+			int sub_size = (params.sub_size >= min_accepted) ? params.sub_size : pda->getSubSize();
+			if (sub_size < min_accepted && params.pdtaxa_file == NULL) {
+				outError(ERR_NO_K);
+			}
+			
+		}
+	}
+	if (initialset.size() > 0) {
+		cout << "Consider split network as ROOTED." << endl;
+	} else {
+		cout << "Consider split network as UNROOTED." << endl;
+	}
+
+	cout << "Total split weights: " << calcWeight() << endl;
+	cout << "  Internal split weights: " << calcWeight() - calcTrivialWeight() << endl;
+	cout << "  Trivial split weights : " << calcTrivialWeight() << endl;
+
+	if (params.pd_proportion == 0.0) {
+	
+		if (isBudgetConstraint()) {
+			// fix the budget and min_budget first
+			if (params.budget < 0) params.budget = pda->budget;
+			if (verbose_mode >= VB_DEBUG) {
+				pda->Report(cout);
+			}
+			cout << "Budget constraint with budget = " << params.budget << " ..." << endl;
+			if (params.min_budget < 0)
+				params.min_budget = pda->min_budget;
+			if (params.min_budget < 0) params.min_budget = params.budget; // still unset: fall back to the budget itself
+	
+			// resize the taxa_set
+			int max_budget = calcMaxBudget();
+			if (params.budget > max_budget) {
+				cout << "Only maximum budget of " << max_budget << " required, truncating to that value..." << endl;
+				params.budget = max_budget;
+				if (params.min_budget > params.budget)
+					params.min_budget = params.budget;
+			}
+		
+		} else	{
+			int min_accepted = !isPDArea() + 1; // 2 for taxa, 1 for areas
+			if (params.sub_size <= 0) params.sub_size = pda->sub_size;
+			if (!isPDArea()) {
+				if (params.sub_size < 2 || params.sub_size > getNTaxa()) { // NOTE(review): the bound tested is getNTaxa() but the message reports getNTaxa()-params.is_rooted - confirm which one is intended
+					ostringstream str;
+					str <<"k must be between 2 and " << getNTaxa()-params.is_rooted;
+					outError(str.str());
+				}
+			} else if (params.sub_size < 1 || params.sub_size > sets->getNSets()) {
+					ostringstream str;
+					str << "k must be between 1 and " << sets->getNSets();
+					outError(str.str());
+				}
+			if (params.min_size < min_accepted) params.min_size = params.sub_size; // a min_size below the minimum is replaced by sub_size
+		}
+	} 	
+}
+
+/**
+	Print which LP solver is used. Only the GUROBI case prints a line;
+	the LP_SOLVE version report is commented out.
+	@param gurobi_format true if GUROBI is used
+*/
+void printLPVersion(bool gurobi_format) {
+	if (!gurobi_format) {
+		//int lp_majorversion, lp_minorversion, lp_release, lp_build;
+		//lp_solve_version_info(&lp_majorversion, &lp_minorversion, &lp_release, &lp_build);
+		//cout << "Using LP_SOLVE " << lp_majorversion << "." << lp_minorversion << "." << lp_release << "." << lp_build << endl;
+		return;
+	}
+	cout << "Using GUROBI" << endl;
+}
+
+/**
+	Run the PD search selected by the parameters.
+	Area data always goes through linear programming; otherwise the run mode
+	selects greedy search, linear programming (every mode other than GREEDY and
+	EXHAUSTIVE) or exhaustive search (by budget or by subset size).
+	enterFindPD() is called first and leaveFindPD() last.
+	@param params program parameters; detected_mode is set here
+	@param taxa_set (OUT) the resulting PD set(s)
+	@param taxa_order order of taxa, passed on to the greedy/exhaustive searches
+*/
+void PDNetwork::findPD(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order) {
+
+	// call the entering function
+	enterFindPD(params);
+
+	const int ntaxa = getNTaxa();
+	const int nsplits = getNSplits();
+	Split curset(ntaxa, 0.0);
+
+	// list of all split indices 0..nsplits-1, with the marker placed at its end
+	IntList rem_splits;
+	int split_id = 0;
+	while (split_id < nsplits)
+		rem_splits.push_back(split_id++);
+	IntList::iterator rem_it = rem_splits.end();
+
+	// default; overwritten below by the LP branches
+	params.detected_mode = EXHAUSTIVE;
+
+	if (isPDArea()) {
+		params.detected_mode = LINEAR_PROGRAMMING;
+		printLPVersion(params.gurobi_format);
+		cout << "Optimizing PD over " << sets->getNSets() << " areas..." << endl;
+		cout << "Linear programming on general split network..." << endl;
+		findPDArea_LP(params, taxa_set);
+	} else if (params.run_mode == GREEDY) {
+		// greedy search, not ensure to give the optimal sets!
+		cout << "Start greedy search..." << endl;
+		greedyPD(params.sub_size, curset, taxa_order);
+		localSearchPD(params.sub_size, curset, taxa_order);
+		taxa_set.resize(1);
+		taxa_set[0].push_back(new Split(curset));
+	} else if (params.run_mode != EXHAUSTIVE) {
+		params.detected_mode = LINEAR_PROGRAMMING;
+		printLPVersion(params.gurobi_format);
+		cout << "Linear programming on general split network..." << endl;
+		findPD_LP(params, taxa_set);
+	} else {
+		// exhaustive search by the order, either under a budget or for a fixed size
+		const bool by_budget = isBudgetConstraint();
+		cout << endl << "Start exhaustive search..." << endl;
+		taxa_set.resize(1);
+		taxa_set[0].push_back(new Split(ntaxa, 0.0));
+		if (by_budget)
+			exhaustPDBudget(params.budget, -1, curset, params.find_all, taxa_set[0], taxa_order, rem_splits, rem_it);
+		else
+			exhaustPD2(params.sub_size, -1, curset, params.find_all, taxa_set[0], taxa_order, rem_splits, rem_it);
+	}
+
+	// call the leaving function
+	leaveFindPD(taxa_set);
+}
+
+/**
+	Post-process the weights of all resulting sets:
+	first subtract extra_pd from every weight (if extra_pd > 0),
+	then flip the sign of every weight (if min_pd is set).
+	@param taxa_set the sets whose split weights are adjusted in place
+*/
+void PDNetwork::leaveFindPD(vector<SplitSet> &taxa_set) {
+	vector<SplitSet>::iterator set_it;
+	SplitSet::iterator sp_it;
+
+	// subtract the weights from the extra_pd
+	if (extra_pd > 0) {
+		for (set_it = taxa_set.begin(); set_it != taxa_set.end(); ++set_it) {
+			for (sp_it = set_it->begin(); sp_it != set_it->end(); ++sp_it)
+				(*sp_it)->weight -= extra_pd;
+		}
+	}
+
+	// second, separate pass: negate the weights
+	if (min_pd) {
+		for (set_it = taxa_set.begin(); set_it != taxa_set.end(); ++set_it) {
+			for (sp_it = set_it->begin(); sp_it != set_it->end(); ++sp_it)
+				(*sp_it)->weight = -(*sp_it)->weight;
+		}
+	}
+}
+
+
+/**
+	exhaustive search VERSION 2 for maximal phylogenetic diversity of a given size
+	@param subsize number of taxa that still have to be added
+	@param cur_tax position (in taxa_order) of the taxon added last, -1 at the start
+	@param curset current set; modified during the search and restored before returning
+	@param find_all only passed on to the recursive calls here
+	@param best_set (IN/OUT) best set(s) found so far; best_set[0] holds the best score
+	@param taxa_order order in which the taxa are tried
+	@param rem_splits (IN) remaining splits, handed to calcRaisedWeight()
+	@param rem_it marker into rem_splits; restored after every tried taxon
+	@return the PD score stored in best_set[0]
+*/
+double PDNetwork::exhaustPD2(int subsize, int cur_tax, Split &curset, 
+	bool find_all,SplitSet &best_set, vector<int> &taxa_order, 
+	IntList &rem_splits, IntList::iterator &rem_it ) 
+{
+	// last position that still leaves room for the remaining taxa
+	const int last_pos = getNTaxa() - subsize;
+	const double weight_before = curset.weight;
+
+	int pos = cur_tax + 1;
+	while (pos <= last_pos) {
+		curset.addTaxon(taxa_order[pos]);
+		IntList::iterator it_before = rem_it;
+		curset.weight += calcRaisedWeight(curset, rem_splits, rem_it);
+
+		if (subsize <= 1) {
+			// set is complete: keep it if it is at least as good as the best one
+			if (curset.weight >= best_set[0]->weight)
+				updateSplitVector(curset, best_set);
+		} else {
+			exhaustPD2(subsize-1, pos, curset, find_all, best_set, taxa_order, rem_splits, rem_it);
+		}
+
+		// undo the insertion
+		curset.removeTaxon(taxa_order[pos]);
+		curset.weight = weight_before;
+		rem_it = it_before;
+		pos++;
+	}
+	return best_set[0]->weight;
+}
+
+
+
+/**
+	exhaustive search for maximal phylogenetic diversity under a budget
+	@param cur_budget budget that is still available
+	@param cur_tax position (in taxa_order) of the taxon added last, -1 at the start
+	@param curset current set; modified during the search and restored before returning
+	@param find_all only passed on to the recursive calls here
+	@param best_set (IN/OUT) best set(s) found so far; best_set[0] holds the best score
+	@param taxa_order order in which the taxa are tried
+	@param rem_splits (IN) remaining splits, handed to calcRaisedWeight()
+	@param rem_it marker into rem_splits; restored after every tried taxon
+	@return the PD score stored in best_set[0]
+*/
+double PDNetwork::exhaustPDBudget(int cur_budget, int cur_tax, Split &curset, 
+	bool find_all,SplitSet &best_set, vector<int> &taxa_order, 
+	IntList &rem_splits, IntList::iterator &rem_it ) 
+{
+	const int ntaxa = getNTaxa();
+	const double weight_before = curset.weight;
+
+	for (int pos = cur_tax + 1; pos < ntaxa; pos++) {
+		// skip taxa that do not fit into the remaining budget
+		if (!(pda->costs[taxa_order[pos]] <= cur_budget))
+			continue;
+
+		curset.addTaxon(taxa_order[pos]);
+		IntList::iterator it_before = rem_it;
+		curset.weight += calcRaisedWeight(curset, rem_splits, rem_it);
+
+		// every affordable set is a candidate, not only the complete ones
+		if (curset.weight >= best_set[0]->weight)
+			updateSplitVector(curset, best_set);
+
+		// try to extend the set with the taxa that follow
+		if (pos < ntaxa-1) {
+			exhaustPDBudget(cur_budget - pda->costs[taxa_order[pos]], pos, 
+				curset, find_all, best_set, taxa_order, rem_splits, rem_it);
+		}
+
+		// undo the insertion
+		curset.removeTaxon(taxa_order[pos]);
+		curset.weight = weight_before;
+		rem_it = it_before;
+	}
+	return best_set[0]->weight;
+}
+
+
+/********************************************************
+	GREEDY SEARCH!
+********************************************************/
+
+/**
+	greedy algorithm for phylogenetic diversity of a given size.
+	Starts from the best pair of taxa and then repeatedly adds the taxon that
+	gives the largest weight. The result is not guaranteed to be optimal.
+	@param subsize the subset size
+	@param taxa_set (OUT) the set of taxa in the PD-set
+	@param taxa_order (OUT) the taxa in the order in which they were inserted;
+		stays empty when the network has fewer than two taxa
+	@return the PD score of the resulting set, also returned in taxa_set.weight
+*/
+double PDNetwork::greedyPD(int subsize, Split &taxa_set, vector<int> &taxa_order) {
+	int ntaxa = getNTaxa();
+	taxa_set.setNTaxa(ntaxa);
+	taxa_set.weight = 0;
+	taxa_order.clear();
+	taxa_order.reserve(ntaxa);
+
+	// -1 means "nothing chosen yet"; the first candidate is always accepted,
+	// so the indices are never used uninitialised even if all weights are 0
+	int besti = -1, bestj = -1, i, j;
+
+	// start from the PD-2 set: look at every unordered pair {i, j} exactly once
+	for (i = 0; i < ntaxa - 1; i++)
+		for (j = i + 1; j < ntaxa; j++) {
+			Split curset;
+			curset.setNTaxa(ntaxa);
+			curset.addTaxon(i);
+			curset.addTaxon(j);
+			curset.weight = calcWeight(curset);
+			if (besti < 0 || curset.weight > taxa_set.weight) {
+				taxa_set = curset;
+				besti = i;
+				bestj = j;
+			}
+		}
+
+	// fewer than two taxa: there is no pair to start from
+	if (besti < 0)
+		return taxa_set.getWeight();
+
+	//taxa_set.report(cout);
+	taxa_order.push_back(besti);
+	taxa_order.push_back(bestj);
+
+	for (int step = 2; step < subsize; step++) {
+		Split pdk_set = taxa_set;
+		besti = -1;
+		for (i = 0; i < ntaxa; i++) 
+		if (!pdk_set.containTaxon(i)) {
+			Split curset = pdk_set;
+			curset.addTaxon(i);
+			curset.weight = calcWeight(curset);
+			if (curset.weight > taxa_set.weight || besti == -1) {
+				taxa_set = curset;
+				besti = i;
+			}
+		}
+		// every taxon is already in the set: nothing left to add
+		if (besti == -1)
+			break;
+		//taxa_set.report(cout);
+		taxa_order.push_back(besti);
+	}
+	return taxa_set.getWeight();
+}
+
+
+/**
+	local search for phylogenetic diversity of a given size:
+	starting from the first subsize taxa of taxa_order, one taxon of the set is
+	exchanged for one taxon outside of it as long as this increases the weight.
+	Every improvement is reported to cout.
+	@param subsize the subset size
+	@param taxa_set (OUT) the set of taxa in the PD-set
+	@param taxa_order taxa whose first subsize entries form the starting set
+	@return the PD score of the final set, also returned in taxa_set.weight
+*/
+double PDNetwork::localSearchPD(int subsize, Split &taxa_set, vector<int> &taxa_order) {
+	const int ntaxa = getNTaxa();
+	taxa_set.setNTaxa(ntaxa);
+	for (int k = 0; k < subsize; k++) 
+		taxa_set.addTaxon(taxa_order[k]);
+	taxa_set.weight = calcWeight(taxa_set);
+	taxa_set.report(cout);
+
+	bool improved;
+	do {
+		improved = false;
+		// restart the scan from scratch after the first improving exchange
+		for (int out_tax = 0; out_tax < ntaxa && !improved; out_tax++) {
+			if (!taxa_set.containTaxon(out_tax))
+				continue;
+			for (int in_tax = 0; in_tax < ntaxa; in_tax++) {
+				if (taxa_set.containTaxon(in_tax))
+					continue;
+				// tentatively exchange out_tax for in_tax
+				taxa_set.addTaxon(in_tax);
+				taxa_set.removeTaxon(out_tax);
+				double swapped_weight = calcWeight(taxa_set);
+				if (swapped_weight > taxa_set.weight) {
+					taxa_set.weight = swapped_weight;
+					improved = true;
+					taxa_set.report(cout);
+					break;
+				}
+				// no gain: undo the exchange
+				taxa_set.removeTaxon(in_tax);
+				taxa_set.addTaxon(out_tax);
+			}
+		}
+	} while (improved);
+	return taxa_set.getWeight();
+}
+
+
+/**
+	For every PD set, compute the weight gained by adding one absent taxon.
+	Only the first split of each set is used. The number of taxa is taken from
+	the first split of the first set.
+	@param pd_set the PD sets; each of them must be non-empty
+	@param delta (OUT) delta[s][t] = calcWeight(first split of set s plus taxon t)
+		minus the weight of that split; entries of taxa already contained are not written
+*/
+void PDNetwork::calcPDGain(vector<SplitSet> &pd_set, matrix(double) &delta) {
+	int ntaxa = pd_set.front().front()->getNTaxa();
+
+	// one row per PD set, one column per taxon
+	delta.resize(pd_set.size());
+	for (size_t row = 0; row < delta.size(); row++) 
+		delta[row].resize(ntaxa, 0);
+
+	for (size_t set_id = 0; set_id < pd_set.size(); set_id++) {
+		assert(!pd_set[set_id].empty());
+		// take only the first split for calculation
+		Split *first = pd_set[set_id].front();
+		for (int tax = 0; tax < ntaxa; tax++) {
+			if (first->containTaxon(tax))
+				continue;
+			// add the taxon temporarily to measure its gain
+			first->addTaxon(tax);
+			delta[set_id][tax] = calcWeight(*first) - first->weight;
+			first->removeTaxon(tax);
+		}
+	}
+}
+
+/**
+	Compute the PD endemism of every area: the PD of the union of all areas
+	minus the PD of the union of all other areas.
+	@param area_set the taxon sets of the areas
+	@param pd_endem (OUT) one value per area, in the order of area_set
+*/
+void PDNetwork::calcPDEndemism(SplitSet &area_set, DoubleVector &pd_endem) {
+	const size_t nsets = area_set.size();
+
+	// PD of the union of all sets
+	Split id_union(getNTaxa());
+	for (size_t a = 0; a < nsets; a++) 
+		id_union += *area_set[a];
+	calcPD(id_union);
+
+	// now calculate PD endemism
+	pd_endem.clear();
+	for (size_t cur = 0; cur < nsets; cur++) {
+		// PD of the union of every set except the current one
+		Split id_other(getNTaxa());
+		for (size_t other = 0; other < nsets; other++) {
+			if (other == cur)
+				continue;
+			id_other += *area_set[other];
+		}
+		calcPD(id_other);
+
+		pd_endem.push_back(id_union.weight - id_other.weight);
+	}
+}
+
+/**
+	Compute the PD complementarity of every area with respect to the given areas:
+	the PD of (area united with the given areas) minus the PD of the given areas.
+	@param area_set the taxon sets of the areas
+	@param area_names names of the given areas, parsed by parseAreaName()
+	@param all_names names of all areas, parallel to area_set
+	@param pd_comp (OUT) one value per area, in the order of area_set
+*/
+void PDNetwork::calcPDComplementarity(SplitSet &area_set, char *area_names, 
+	vector<string> &all_names, DoubleVector &pd_comp) {
+
+	set<string> given_areas;
+	parseAreaName(area_names, given_areas);
+
+	const size_t nsets = area_set.size();
+	Split given_id(getNTaxa());
+
+	// convert taxa set to id set: unite the taxa of all areas whose name was given
+	for (size_t a = 0; a < nsets; a++) {
+		if (given_areas.count(all_names[a]) > 0)
+			given_id += *area_set[a];
+	}
+
+	if (given_id.countTaxa() == 0)
+		outError("Complementary area name(s) not correct");
+
+	calcPD(given_id);
+
+	// now calculate PD complementarity
+	pd_comp.clear();
+	for (size_t a = 0; a < nsets; a++) {
+		// PD of the area together with the given areas
+		Split id_both(*area_set[a]);
+		id_both += given_id;
+		calcPD(id_both);
+		pd_comp.push_back(id_both.weight - given_id.weight);
+	}
+
+}
+
+/**
+	Write the LP formulation of the taxon-selection problem to a file.
+	The taxa of initialset are passed to the bound/binary sections as the
+	already included ones. A write failure is reported via outError().
+	@param params program parameters
+	@param outfile name of the LP file to write
+	@param total_size subset size or budget, passed on to the lp* writers
+	@param make_bin true to also write the binary-variable section
+*/
+void PDNetwork::transformLP2(Params &params, const char *outfile, int total_size, bool make_bin) {
+	Split included_tax(getNTaxa());
+	for (size_t idx = 0; idx < initialset.size(); idx++)
+		included_tax.addTaxon(initialset[idx]);
+
+	try {
+		ofstream lp_file;
+		// make stream errors throw so that they end up in the catch below
+		lp_file.exceptions(ios::failbit | ios::badbit);
+		lp_file.open(outfile);
+
+		vector<int> y_value;
+		checkYValue(total_size, y_value);
+
+		lpObjectiveMaxSD(lp_file, params, y_value, total_size);
+		lpSplitConstraint_TS(lp_file, params, y_value, total_size);
+		lpK_BudgetConstraint(lp_file, params, total_size);
+		lpVariableBound(lp_file, params, included_tax, y_value);
+		if (make_bin) {
+			lpVariableBinary(lp_file, params, included_tax);
+		}
+
+		lp_file.close();
+		//cout << "Transformed LP problem printed to " << outfile << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, outfile);
+	}
+}
+
+//Olga:ECOpd split system
+/**
+	Write objective and split constraints of the LP to a file, with the
+	y value of every split set to -1. A write failure is reported via outError().
+	@param params program parameters
+	@param outfile name of the LP file to write
+	@param total_size passed on to the lp* writers
+*/
+void PDNetwork::transformEcoLP(Params &params, const char *outfile, int total_size) {
+	try {
+		ofstream lp_file;
+		// make stream errors throw so that they end up in the catch below
+		lp_file.exceptions(ios::failbit | ios::badbit);
+		lp_file.open(outfile);
+
+		vector<int> y_value(getNSplits(), -1);
+		lpObjectiveMaxSD(lp_file, params, y_value, total_size);
+		lpSplitConstraint_TS(lp_file, params, y_value, total_size);
+
+		lp_file.close();
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, outfile);
+	}
+}
+
+/**
+	Find PD sets of taxa by linear programming, one set for every k (or budget)
+	from the minimum to the maximum value in steps of the configured step.
+	The relaxed LP is solved first; when the solver returns code 7, or when
+	params.binary_programming is set, the problem is solved with binary variables.
+	Multiple optimal sets (params.find_all) are not supported.
+	@param params program parameters
+	@param taxa_set (OUT) taxa_set[(k - min_k) / step_k] receives the set found for k;
+		the Split objects are allocated here with new
+*/
+void PDNetwork::findPD_LP(Params &params, vector<SplitSet> &taxa_set) {
+	if (params.find_all)
+		outError("Current linear programming does not support multiple optimal sets!");
+
+	string ofile = params.out_prefix;
+	ofile += ".lp";
+	double score;
+	int lp_ret, i, ntaxa = getNTaxa();
+	int k, min_k, max_k, step_k, index;
+
+	// solution vector filled by the solver, one entry per taxon
+	double *variables = new double[ntaxa];
+
+	if (isBudgetConstraint()) { // budget case
+		min_k = params.min_budget;
+		max_k = params.budget;
+		step_k = params.step_budget;
+	} else { // subset-size case
+		min_k = params.min_size;
+		max_k = params.sub_size;
+		step_k = params.step_size;
+	}
+	taxa_set.resize((max_k - min_k)/step_k + 1);
+
+	// now construction the optimal PD sets
+	if (isBudgetConstraint())
+		cout << "running budget = ";
+	else
+		cout << "running k = ";
+	for (k = min_k; k <= max_k; k += step_k) {
+		index = (k - min_k) / step_k;
+		if (!params.binary_programming) {
+			transformLP2(params, ofile.c_str(), k, false);
+			cout << " " << k;
+			cout.flush();
+			if (params.gurobi_format)
+				lp_ret = gurobi_solve((char*)ofile.c_str(), ntaxa, &score, variables, verbose_mode, params.gurobi_threads);
+			else
+				lp_ret = lp_solve((char*)ofile.c_str(), ntaxa, &score, variables, verbose_mode);
+		} else lp_ret = 7; // go straight to the binary formulation
+		if (lp_ret != 0 && lp_ret != 7)
+			outError("Something went wrong with LP solver!");
+		if (lp_ret == 7) { // fail with non-binary case, do again with strict binary
+			if (params.binary_programming)
+				transformLP2(params, ofile.c_str(), k, true);
+			else 
+				lpVariableBinary(ofile.c_str(), params, initialset);
+			cout << " " << k << "(bin)";
+			cout.flush();
+			if (params.gurobi_format)
+				lp_ret = gurobi_solve((char*)ofile.c_str(), ntaxa, &score, variables, verbose_mode, params.gurobi_threads);
+			else
+				lp_ret = lp_solve((char*)ofile.c_str(), ntaxa, &score, variables, verbose_mode);
+			if (lp_ret != 0) // check error again without allowing non-binary
+				outError("Something went wrong with LP solver!");
+		}	
+
+		// a taxon is selected when its variable is within tolerance of 1
+		Split *pd_set = new Split(ntaxa, score);
+		for (i = 0; i < ntaxa; i++)
+			if (1.0 - variables[i] < tolerance) {
+				//pd_set->addTaxon(taxa_order[i]);
+				pd_set->addTaxon(i);
+			}
+		calcPD(*pd_set);
+		taxa_set[index].push_back(pd_set);
+	}
+	cout << endl;
+	// allocated with new[], so it must be released with delete[]
+	delete [] variables;
+}
+
+/**
+	Write the LP formulation of the area-selection problem to a file.
+	The areas of initialareas are passed to the bound/binary sections as the
+	already included ones. A write failure is reported via outError().
+	@param params program parameters
+	@param outfile name of the LP file to write
+	@param total_size number of areas or budget, passed on to the lp* writers
+	@param make_bin true to also write the binary-variable section
+*/
+void PDNetwork::transformLP_Area2(Params &params, const char *outfile, int total_size, bool make_bin) {
+	Split included_area(getNAreas());
+	for (size_t idx = 0; idx < initialareas.size(); idx++)
+		included_area.addTaxon(initialareas[idx]);
+
+	try {
+		ofstream lp_file;
+		// make stream errors throw so that they end up in the catch below
+		lp_file.exceptions(ios::failbit | ios::badbit);
+		lp_file.open(outfile);
+
+		vector<int> y_value, count1, count2;
+		checkYValue_Area(total_size, y_value, count1, count2);
+
+		lpObjectiveMaxSD(lp_file, params, y_value, total_size);
+		lpSplitConstraint_RS(lp_file, params, y_value, count1, count2, total_size);
+		lpInitialArea(lp_file, params);
+		lpK_BudgetConstraint(lp_file, params, total_size);
+		lpBoundaryConstraint(lp_file, params);
+		lpVariableBound(lp_file, params, included_area, y_value);
+		if (make_bin) {
+			lpVariableBinary(lp_file, params, included_area);
+		}
+
+		lp_file.close();
+		//cout << "Transformed LP problem printed to " << outfile << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, outfile);
+	}
+
+}
+
+/**
+	Write the LP that minimises the number of areas (or budget) under a minimum
+	PD-proportion constraint to a file. The areas of initialareas are passed to
+	the bound/binary sections as the already included ones.
+	A write failure is reported via outError().
+	@param params program parameters
+	@param outfile name of the LP file to write
+	@param pd_proportion proportion passed to lpMinSDConstraint()
+	@param make_bin true to also write the binary-variable section
+*/
+void PDNetwork::transformMinK_Area2(Params &params, const char *outfile, double pd_proportion, bool make_bin) {
+	Split included_area(getNAreas());
+	for (size_t idx = 0; idx < initialareas.size(); idx++)
+		included_area.addTaxon(initialareas[idx]);
+
+	try {
+		ofstream lp_file;
+		// make stream errors throw so that they end up in the catch below
+		lp_file.exceptions(ios::failbit | ios::badbit);
+		lp_file.open(outfile);
+
+		vector<int> y_value, count1, count2;
+		checkYValue_Area(0, y_value, count1, count2);
+
+		lpObjectiveMinK(lp_file, params);
+		lpMinSDConstraint(lp_file, params, y_value, pd_proportion);
+		lpSplitConstraint_RS(lp_file, params, y_value, count1, count2, 0);
+		lpInitialArea(lp_file, params);
+		lpBoundaryConstraint(lp_file, params);
+		lpVariableBound(lp_file, params, included_area, y_value);
+		if (make_bin) {
+			lpVariableBinary(lp_file, params, included_area);
+		}
+
+		lp_file.close();
+		//cout << "Transformed LP problem printed to " << outfile << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, outfile);
+	}
+}
+
+
+/**
+	Solve the minimum-k (or minimum-budget) area problem for a given PD proportion
+	by linear programming. The relaxed LP is solved first; when the solver returns
+	code 7, or when params.binary_programming is set, the problem is solved with
+	binary variables.
+	@param params program parameters
+	@param filename name of the LP file that is written and handed to the solver
+	@param pd_proportion the PD proportion to be conserved
+	@param area (OUT) the selected areas; its weight is set by calcPDArea()
+	@return the cost of the selected areas under a budget constraint,
+		otherwise the number of selected areas
+*/
+double PDNetwork::findMinKArea_LP(Params &params, const char* filename, double pd_proportion, Split &area) {
+	int nareas = area_taxa.size();
+	// solution vector filled by the solver, one entry per area
+	double *variables = new double[nareas];
+	double score;
+	int lp_ret, i;
+
+
+	if (!params.binary_programming) {
+		cout << " " << pd_proportion;
+		cout.flush();
+		transformMinK_Area2(params, filename, pd_proportion, false);
+		if (params.gurobi_format)
+			lp_ret = gurobi_solve((char*)filename, nareas, &score, variables, verbose_mode, params.gurobi_threads);
+		else
+			lp_ret = lp_solve((char*)filename, nareas, &score, variables, verbose_mode);
+	} else lp_ret = 7; // go straight to the binary formulation
+	if (lp_ret != 0 && lp_ret != 7)
+		outError("Something went wrong with LP solver!");
+	if (lp_ret == 7) { // fail with non-binary case, do again with strict binary
+		cout << " " << pd_proportion << "(bin)";
+		cout.flush();
+		if (params.binary_programming)
+			transformMinK_Area2(params, filename, pd_proportion, true);
+		else
+			lpVariableBinary(filename, params, initialareas);
+		if (params.gurobi_format)
+			lp_ret = gurobi_solve((char*)filename, nareas, &score, variables, verbose_mode, params.gurobi_threads);
+		else
+			lp_ret = lp_solve((char*)filename, nareas, &score, variables, verbose_mode);
+		if (lp_ret != 0) // check error again without allowing non-binary
+			outError("Something went wrong with LP solver!");
+	}	
+	// an area is selected when its variable is within tolerance of 1
+	area.setNTaxa(nareas);
+	for (i = 0; i < nareas; i++)
+		if (1.0 - variables[i] < tolerance) {
+			//pd_set->addTaxon(taxa_order[i]);
+			area.addTaxon(i);
+		}
+	calcPDArea(area);
+	cout << " score: " << area.weight;
+	double budget_k;
+	if (isBudgetConstraint()) {
+		budget_k = calcCost(area);
+	} else {
+		budget_k = area.countTaxa();
+	}
+	// allocated with new[], so it must be released with delete[]
+	delete [] variables;
+	return budget_k;
+}
+
+/**
+	Determine which budget values can be reached exactly as a sum of costs
+	(every distinct cost may be used repeatedly).
+	Without a budget constraint the vector is resized to params.sub_size+1 with fill value 1.
+	Non-integer costs and all-zero costs are reported via outError().
+	@param params program parameters (budget or sub_size is read)
+	@param ok_budget (OUT) ok_budget[b] is non-zero if b is marked feasible
+*/
+void PDNetwork::computeFeasibleBudget(Params &params, IntVector &ok_budget) {
+	if (!isBudgetConstraint()) {
+		ok_budget.resize(params.sub_size+1, 1);
+		return;
+	}
+	cout << "Computing feasible budget values..." << endl;
+
+	// cost_present[c] is set to 1 once a non-zero cost c has been seen
+	IntVector cost_present;
+	cost_present.resize((*max_element(pda->costs.begin(), pda->costs.end())) + 1, 0);
+	int num_cost = 0;
+	for (DoubleVector::iterator cost_it = pda->costs.begin(); cost_it != pda->costs.end(); ++cost_it) {
+		if ((*cost_it) != round(*cost_it)) {
+			outError("Non integer cost detected.");
+		}
+		if ((*cost_it) == 0 || cost_present[*cost_it])
+			continue;
+		num_cost++;	
+		cost_present[*cost_it] = 1;
+	}
+	if (num_cost == 0) outError("All costs are zero! Please check the input budget file.");
+	if (cost_present[1]) {
+		// if cost of 1 detected, all budget values are feasible
+		ok_budget.resize(params.budget+1, 1);
+		return;
+	}
+
+	// collect the distinct costs in increasing order
+	IntVector unique_cost;
+	for (int cost = 0; cost < (int)cost_present.size(); cost++) {
+		if (cost_present[cost])
+			unique_cost.push_back(cost);
+	}
+	assert(unique_cost.size() == num_cost);
+
+	ok_budget.resize(params.budget+1, 0);
+	// initialize all entry with corresponding cost
+	for (size_t u = 0; u < unique_cost.size(); u++) {
+		if (unique_cost[u] < ok_budget.size())
+			ok_budget[unique_cost[u]] = 1;
+	}
+
+	// now use dynamic programming to find feasible budgets:
+	// a budget is feasible if removing one cost leaves a feasible budget
+	for (int budget = 0; budget <= params.budget; budget++) {
+		for (size_t u = 0; u < unique_cost.size(); u++) {
+			int rest = budget - unique_cost[u];
+			if (rest >= 0 && ok_budget[rest]) {
+				ok_budget[budget] = 1;
+				break;
+			}
+		}
+	}
+
+	if (verbose_mode < VB_MED)
+		return;
+	cout << "Feasible budgets:";
+	for (size_t b = 0; b < ok_budget.size(); b++) {
+		if (ok_budget[b])
+			cout << " " << b;
+	}
+	cout << endl;
+}
+
+
+/**
+	Write the PD sets and their scores to files named after params.out_prefix.
+	With params.nr_output == 1 everything goes into one list file
+	(<prefix>.pdtaxa or <prefix>.pdarea) plus <prefix>.score; with
+	params.nr_output > 10 every set gets its own file
+	<prefix>.<count>.pdtaxa[.<number>].
+	File names are built with string/ostringstream instead of sprintf() into
+	fixed 300-byte buffers, so a long output prefix can no longer overflow them.
+	@param params program parameters (out_prefix, nr_output, run_mode, boundary_modifier)
+	@param pd_set the PD sets to print; empty sets are skipped
+*/
+void PDNetwork::printOutputSetScore(Params &params, vector<SplitSet> &pd_set) {
+	const string prefix = params.out_prefix;
+	string filename;
+	int c_num = 0, i;
+	ofstream scoreout;
+	ofstream out;
+	if (params.nr_output == 1) {
+		if (params.run_mode == PD_USER_SET || !isPDArea()) {
+			filename = prefix + ".pdtaxa";
+			cout << "All taxa list(s) printed to " << filename << endl;
+		} else { 
+			filename = prefix + ".pdarea";
+			cout << "All area list(s) printed to " << filename << endl;
+		}
+		out.open(filename.c_str());
+		string scorename = prefix + ".score";
+		scoreout.open(scorename.c_str());
+	}
+	double total_weight = calcWeight();
+
+	for (vector<SplitSet>::iterator it = pd_set.begin(); it != pd_set.end(); it++) {
+		if ((*it).empty()) continue;
+		c_num = 0;
+		if (params.nr_output == 1)
+			scoreout << (*it)[0]->countTaxa() << "  " << (it)->getWeight() << endl;
+
+		// c_num numbers the alternative sets within one SplitSet
+		for (SplitSet::iterator it2 = (*it).begin(); it2 != (*it).end(); it2++, c_num++ ){
+			Split *this_set = *it2;
+			int count = this_set->countTaxa();
+
+			if (params.nr_output > 10) {
+				// one file per set; alternatives get their number as suffix
+				ostringstream name;
+				name << prefix << "." << count << ".pdtaxa";
+				if (c_num != 0)
+					name << "." << c_num;
+				filename = name.str();
+				out.open(filename.c_str());
+				if (params.run_mode == PD_USER_SET || !isPDArea()) {
+					for (i = 0; i < getNTaxa(); i++) 
+						if (this_set->containTaxon(i))
+							out << getTaxa()->GetTaxonLabel(i) << endl;
+				} else {
+					for (i = 0; i < getSetsBlock()->getNSets(); i++) 
+						if (this_set->containTaxon(i))
+							out << getSetsBlock()->getSet(i)->name << endl;
+				}
+				out.close();
+				//cout << "Taxa list printed to " << filename << endl;
+			} else if (params.nr_output == 1) {
+				// header line: size, weight, weight proportion, cost, boundary, modifier
+				out << count << "  " << this_set->getWeight() << " " << 
+					this_set->getWeight()  / total_weight << " " <<
+					calcCost(*this_set) << " " << computeBoundary(*this_set) << " " <<
+					params.boundary_modifier << endl;
+
+				if (params.run_mode == PD_USER_SET || !isPDArea()) {
+					for (i = 0; i < getNTaxa(); i++) 
+						if (this_set->containTaxon(i))
+							out << getTaxa()->GetTaxonLabel(i) << endl;
+				} else {
+					for (i = 0; i < getSetsBlock()->getNSets(); i++) 
+						if (this_set->containTaxon(i))
+							out << getSetsBlock()->getSet(i)->name << endl;
+				}
+			}
+		}
+	}
+
+	if (params.nr_output == 1) {
+		out.close();
+		scoreout.close();
+		//cout << "PD scores printed to " << scorename << endl;
+	}
+}
+
+
+/**
+	Find PD sets of areas by linear programming.
+	Three cases are handled:
+	 - pd_proportion == 1 and min_proportion == 0: return the minimal area coverage;
+	 - pd_proportion != 0: for every proportion from min_proportion to pd_proportion
+	   find the minimum k/budget conserving that proportion;
+	 - otherwise: for every feasible k/budget from the minimum to the maximum find
+	   the PD-maximising areas.
+	Multiple optimal sets (params.find_all) are not supported.
+	@param params program parameters; budget/min_budget or sub_size/min_size may be lowered
+	@param areas_set (OUT) the resulting area sets; the Split objects are allocated here
+*/
+void PDNetwork::findPDArea_LP(Params &params, vector<SplitSet> &areas_set) {
+	if (params.find_all)
+		outError("Current linear programming does not support multiple optimal sets!");
+	PDRelatedMeasures pd_more;
+	// get the taxa in the areas, only if EMPTY!
+	Split *area_coverage = new Split();
+	int num_area_coverage = params.sub_size;
+	if (area_taxa.empty()) {
+		computePD(params, area_taxa, pd_more);
+		if (params.root || params.is_rooted) {
+			// rooted case: the root (first taxon of initialset) is added to every area
+			assert(!initialset.empty());
+			int root_id = initialset[0];
+			for (SplitSet::iterator it = area_taxa.begin(); it != area_taxa.end(); it++)
+				(*it)->addTaxon(root_id);
+		}
+		checkAreaCoverage();
+		num_area_coverage = findMinAreas(params, *area_coverage);
+		calcPDArea(*area_coverage);
+		cout << "We found ";
+		if (isBudgetConstraint())
+			cout << "a budget of " << num_area_coverage << " is enough";
+		else
+			cout << "a number of " << num_area_coverage << " areas are enough";
+		cout << " to cover all feasible taxa" << endl;
+		// cap the requested budget / k at the value found by findMinAreas()
+		if (isBudgetConstraint()) {
+			if (params.budget > num_area_coverage) {
+				params.budget = num_area_coverage;
+				if (params.min_budget > params.budget)
+					params.min_budget = params.budget;
+				cout << "budget is therefore set to a maximum of " << num_area_coverage << endl;
+			}
+		} else
+		if (params.sub_size > num_area_coverage) {
+			params.sub_size = num_area_coverage;
+			if (params.min_size > params.sub_size) 
+				params.min_size = params.sub_size;
+			cout << "k is therefore set to a maximum of " << num_area_coverage << endl;
+		}
+	}
+
+	string ofile = params.out_prefix;
+	ofile += ".lp";
+	double score;
+	int lp_ret, i;
+	int nareas = area_taxa.size();
+	int k, min_k, max_k, step_k, index;
+
+	// full proportion requested: the coverage set itself is the answer
+	if (params.pd_proportion == 1.0 && params.min_proportion == 0.0) {
+		if (area_coverage->empty()) num_area_coverage = findMinAreas(params, *area_coverage);
+		areas_set.resize(1);
+		// area_coverage is handed over to areas_set here and therefore not deleted
+		areas_set[0].push_back(area_coverage);
+		if (isBudgetConstraint()) {
+			params.budget = params.min_budget = num_area_coverage;
+		} else {
+			params.sub_size = params.min_size = num_area_coverage;
+		}
+		return;
+	}
+
+
+	// solution vector filled by the solver, one entry per area
+	double *variables = new double[nareas];
+
+	// identifying minimum k/budget to conserve the proportion of SD
+	if (params.pd_proportion != 0.0) {
+		if (params.min_proportion == 0.0) params.min_proportion = params.pd_proportion;
+		cout << "running p = ";
+		double prop;
+		areas_set.resize(round((params.pd_proportion-params.min_proportion)/params.step_proportion) + 1);
+		// the 1e-6 slack lets the loop still reach pd_proportion after repeated additions
+		for (prop = params.min_proportion, index = 0; prop <= params.pd_proportion + 1e-6; prop += params.step_proportion, index++) {
+			Split *area = new Split(nareas);
+			if (prop < 1.0) 
+				findMinKArea_LP(params, ofile.c_str(), prop, *area);
+			else
+				// proportion of 1 or more: reuse a copy of the coverage set
+				*area = *area_coverage;
+ 			areas_set[index].push_back(area);
+		}
+/*		if (params.min_proportion != 0.0)
+			min_bk = findMinKArea_LP(params, ofile.c_str(), params.min_proportion);
+		if (isBudgetConstraint()) {
+			params.budget = bk;
+			params.min_budget = min_bk;
+		} else {
+			params.sub_size = bk;
+			params.min_size = min_bk;
+		}
+		cout << endl << (isBudgetConstraint() ? "budget" : "k") << " from " << min_bk << " to " << bk << endl;*/
+		cout << endl;
+		delete [] variables;	
+		delete area_coverage;
+		return;
+	}
+
+	// list_k[k] is non-zero for every k/budget marked feasible by computeFeasibleBudget()
+	IntVector list_k;
+
+	if (isBudgetConstraint()) { // budget case
+		min_k = params.min_budget;
+		max_k = params.budget;
+		step_k = params.step_budget;
+	} else { // subset-size case
+		min_k = params.min_size;
+		max_k = params.sub_size;
+		step_k = params.step_size;
+	}
+	areas_set.resize((max_k - min_k)/step_k + 1);
+	computeFeasibleBudget(params, list_k);
+
+	// start time, used to write intermediate results during long runs
+	time_t time_init;
+	time(&time_init);
+	// now construction the optimal PD sets
+	if (isBudgetConstraint())
+		cout << "running budget = ";
+	else
+		cout << "running k = ";
+	for (k = min_k; k <= max_k; k += step_k) {
+		// skip values not marked as feasible
+		if (!list_k[k]) continue;
+		index = (k - min_k) / step_k;
+		if (!params.binary_programming) {
+			cout << " " << k;
+			cout.flush();
+			transformLP_Area2(params, ofile.c_str(), k, false);
+			if (params.gurobi_format)
+				lp_ret = gurobi_solve((char*)ofile.c_str(), nareas, &score, variables, verbose_mode, params.gurobi_threads);
+			else
+				lp_ret = lp_solve((char*)ofile.c_str(), nareas, &score, variables, verbose_mode);
+		} else lp_ret = 7; // go straight to the binary formulation
+
+		if (lp_ret != 0 && lp_ret != 7)
+			outError("Something went wrong with LP solver!");
+		if (lp_ret == 7) { // fail with non-binary case, do again with strict binary
+			cout << " " << k << "(bin)";
+			cout.flush();
+			if (params.binary_programming)
+				transformLP_Area2(params, ofile.c_str(), k, true);
+			else
+				lpVariableBinary(ofile.c_str(), params, initialareas);
+			if (params.gurobi_format)
+				lp_ret = gurobi_solve((char*)ofile.c_str(), nareas, &score, variables, verbose_mode, params.gurobi_threads);
+			else
+				lp_ret = lp_solve((char*)ofile.c_str(), nareas, &score, variables, verbose_mode);
+			if (lp_ret != 0) // check error again without allowing non-binary
+				outError("Something went wrong with LP solver!");
+		}	
+
+		// an area is selected when its variable is within tolerance of 1
+		Split *area = new Split(nareas, score);
+		for (i = 0; i < nareas; i++)
+			if (1.0 - variables[i] < tolerance) {
+				//pd_set->addTaxon(taxa_order[i]);
+				area->addTaxon(i);
+			}
+		calcPDArea(*area);
+		areas_set[index].push_back(area);
+		time_t time_cur;
+		time(&time_cur);
+		if (difftime(time_cur, time_init) > 10) {
+			// write output if more than 10 seconds have elapsed
+			printOutputSetScore(params, areas_set);
+			PDRelatedMeasures pd_more; // just for called function, nothing
+			summarizeSplit(params, *this, areas_set, pd_more, false);
+			time_init = time_cur;
+		}
+	}
+	cout << endl;
+	delete [] variables;	
+	delete area_coverage;
+}
+
+
+/**
+	@return true if the sets block contains at least one set
+*/
+bool PDNetwork::isPDArea() {
+	const bool has_sets = sets->getNSets() > 0;
+	return has_sets;
+}
+
+/**
+	Compute the PD of a set of areas: the taxa of all selected areas are united
+	and the PD of this union is stored as the weight of the area set.
+	@param area_id_set (IN/OUT) the selected area IDs; its weight is overwritten
+*/
+void PDNetwork::calcPDArea(Split &area_id_set) {
+	Split covered_taxa(getNTaxa());
+	const int nareas = area_taxa.size();
+	for (int area = 0; area != nareas; ++area) {
+		if (!area_id_set.containTaxon(area))
+			continue;
+		covered_taxa += *area_taxa[area];
+	}
+	calcPD(covered_taxa);
+	area_id_set.weight = covered_taxa.weight;
+}
+
+/**
+	Check whether a taxon occurs in exactly one area.
+	@param taxon the taxon ID
+	@param area (OUT) the first area containing the taxon, or -1 if there is none
+	@return true if exactly one area contains the taxon
+*/
+bool PDNetwork::isUniquelyCovered(int taxon, int &area) {
+	area = -1;
+	for (int idx = 0; idx < getNAreas(); idx++) {
+		if (!area_taxa[idx]->containTaxon(taxon))
+			continue;
+		// a second containing area means the taxon is not uniquely covered
+		if (area >= 0)
+			return false;
+		area = idx;
+	}
+	return (area >= 0);
+}
+
+
+/**
+	Write the LP for the minimal area coverage problem to a file.
+	Initial areas and areas that uniquely cover some taxon are marked as included;
+	for every remaining taxon that occurs in at least one area a constraint
+	"sum of x over the areas containing it >= 1" is written.
+	A write failure is reported via outError().
+	@param outfile name of the LP file to write
+	@param params program parameters
+	@param included_area (IN/OUT) the areas that are forced into the solution
+*/
+void PDNetwork::transformLP_Area_Coverage(const char *outfile, Params &params, Split &included_area) {
+	const int ntaxa = getNTaxa();
+	const int nareas = getNAreas();
+	int area, tax;
+
+	// taxa that are already covered by forced areas
+	Split tax_cover(ntaxa);
+	for (size_t idx = 0; idx < initialareas.size(); idx++) {
+		tax_cover += *(area_taxa[initialareas[idx]]);
+		included_area.addTaxon(initialareas[idx]);
+	}
+	for (tax = 0; tax < ntaxa; tax++) {
+		if (!isUniquelyCovered(tax, area))
+			continue;
+		if (verbose_mode >= VB_MED) {
+			cout << "Taxon " << taxa->GetTaxonLabel(tax) << " is uniquely covered by " << sets->getSet(area)->name << endl;
+		}
+		included_area.addTaxon(area);
+		tax_cover.addTaxon(tax);
+	}	
+
+	try {
+		ofstream lp_file;
+		// make stream errors throw so that they end up in the catch below
+		lp_file.exceptions(ios::failbit | ios::badbit);
+		lp_file.open(outfile);
+		iterator spit;
+
+		lpObjectiveMinK(lp_file, params);
+
+		// add constraint: every taxon should be covered by some area
+		for (tax = 0; tax < ntaxa; tax++) {
+			if (tax_cover.containTaxon(tax)) continue;
+			int num_terms = 0;
+			for (area = 0; area < nareas; area++) {
+				if (!area_taxa[area]->containTaxon(tax))
+					continue;
+				lp_file << " +x" << area;
+				num_terms++;
+			}
+			// taxon occurs in no area at all: no constraint
+			if (num_terms == 0) continue;
+			lp_file << " >= 1";
+			if (!params.gurobi_format)
+				lp_file << ";";
+			lp_file << endl;
+		}
+		lpBoundaryConstraint(lp_file, params);
+
+		// add bound for variable x
+		IntVector y_value;
+		lpVariableBound(lp_file, params, included_area, y_value);
+		lp_file.close();
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, outfile);
+	}
+}
+
+
+/**
+	Solve the area coverage LP written by transformLP_Area_Coverage().
+	When the solver returns code 7 the problem is solved again with binary
+	variables. The selected areas are written to <out_prefix>.cover.
+	@param params program parameters
+	@param area_id (OUT) the selected areas
+	@return the summed cost of the selected areas under a budget constraint,
+		otherwise their number
+*/
+int PDNetwork::findMinAreas(Params &params, Split &area_id) {
+	string ofile = params.out_prefix;
+	ofile += ".lp";
+	const int nareas = getNAreas();
+	int area;
+	// solution vector filled by the solver, one entry per area
+	double *variables = new double[nareas];
+	double score;
+	Split included_area(nareas);
+	transformLP_Area_Coverage(ofile.c_str(), params, included_area);
+
+	int lp_ret = params.gurobi_format
+		? gurobi_solve((char*)ofile.c_str(), nareas, &score, variables, verbose_mode, params.gurobi_threads)
+		: lp_solve((char*)ofile.c_str(), nareas, &score, variables, verbose_mode);
+
+	if (lp_ret != 0 && lp_ret != 7)
+		outError("Something went wrong with LP solver!");
+	if (lp_ret == 7) { // fail with non-binary case, do again with strict binary
+		lpVariableBinary(ofile.c_str(), params, included_area);
+
+		lp_ret = params.gurobi_format
+			? gurobi_solve((char*)ofile.c_str(), nareas, &score, variables, verbose_mode, params.gurobi_threads)
+			: lp_solve((char*)ofile.c_str(), nareas, &score, variables, verbose_mode);
+		if (lp_ret != 0) // check error again without allowing non-binary
+			outError("Something went wrong with LP solver!");
+	}
+
+	area_id.setNTaxa(nareas);
+	int count = 0;
+	// for checking purpose
+	Split taxon_coverage(getNTaxa());
+
+	// an area is selected when its variable is within tolerance of 1
+	for (area = 0; area < nareas; area++) {
+		if (!(1.0 - variables[area] < tolerance))
+			continue;
+		//pd_set->addTaxon(taxa_order[i]);
+		area_id.addTaxon(area);
+		taxon_coverage += *(area_taxa[area]);
+		if (isBudgetConstraint())
+			count += pda->getCost(area);
+		else
+			count++;
+	}
+
+	// report the selected areas
+	ofile = params.out_prefix;
+	ofile += ".cover";
+	try {
+		ofstream cover_file;
+		cover_file.exceptions(ios::failbit | ios::badbit);
+		cover_file.open(ofile.c_str());
+		cover_file << area_id.countTaxa() << " " << count << " " << computeBoundary(area_id) << " " << params.boundary_modifier << endl;
+		for (area = 0; area < nareas; area++) {
+			if (area_id.containTaxon(area))
+				cover_file << sets->getSet(area)->name << endl;
+		}
+		cover_file.close();
+		//cout << "Transformed LP problem printed to " << outfile << endl;
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, ofile);
+	}
+
+	/*
+	if (taxon_coverage.countTaxa() != getNTaxa()) {
+		outError("Something wrong with LP in determining taxon coverage");
+	}*/
+	delete [] variables;
+	return (count);
+}
+
+
+// Check that every taxon belongs to at least one area; otherwise print the uncovered taxa and return false.
+bool PDNetwork::checkAreaCoverage() {
+	const int num_taxa = getNTaxa();
+	Split covered(num_taxa);
+	for (int a = 0; a < (int)area_taxa.size(); a++)
+		covered += *area_taxa[a]; // add the taxa of area a
+	if (covered.countTaxa() == num_taxa)
+		return true;
+	cout << "WARNING: some taxa are not covered by any area including: ";
+	for (int t = 0; t < num_taxa; t++) {
+		if (covered.containTaxon(t)) continue;
+		cout << taxa->GetTaxonLabel(t) << " ";
+	}
+	cout << endl;
+	return false;
+}
+
+
+/***********************************************
+***********************************************
+	LP-aided functions
+***********************************************/
+
+
+// Emit the LP objective: maximize the weighted sum of the split variables (total_size is not used here).
+void PDNetwork::lpObjectiveMaxSD(ostream &out, Params &params, IntVector &y_value, int total_size) {
+	if (params.gurobi_format)
+		out << "Maximize" << endl;
+	else
+		out << "max: ";
+
+	// code < 0: split k is represented by its own variable y<k>; code >= 2: by the
+	// variable x<code-2>; any other code adds no term to the objective
+	int k = 0;
+	for (iterator split_it = begin(); split_it != end(); ++split_it, ++k) {
+		const int code = y_value[k];
+		if (code < 0)
+			out << " +" << (*split_it)->getWeight() << " y" << k;
+		else if (code >= 2)
+			out << " +" << (*split_it)->getWeight() << " x" << code - 2;
+	}
+	if (params.gurobi_format)
+		out << endl << "Subject to" << endl;
+	else
+		out << ";" << endl;
+}
+
+///// TODO FOR taxon selection
+// Emit the LP objective to minimize: one weighted x term per area plus, optionally, shared-boundary terms.
+void PDNetwork::lpObjectiveMinK(ostream &out, Params &params) {
+	const int num_areas = area_taxa.size();
+	int a, b;
+	if (params.gurobi_format)
+		out << "Minimize" << endl;
+	else
+		out << "min: ";
+	// linear part: area cost (1 when there is no budget constraint) plus the
+	// diagonal entry of areas_boundary scaled by the boundary modifier
+	for (a = 0; a < num_areas; a++) {
+		double weight = 1.0;
+		if (isBudgetConstraint())
+			weight = getPdaBlock()->getCost(a);
+		if (areas_boundary)
+			weight += areas_boundary[a*num_areas+a] * params.boundary_modifier;
+		if (a > 0) out << " +";
+		out << weight << " x" << a;
+	}
+	if (areas_boundary && params.boundary_modifier != 0.0) {
+		const char *pair_prefix = params.quad_programming ? " x" : " y";
+		const char *pair_infix = params.quad_programming ? " * x" : "_";
+		if (params.quad_programming)
+			out << " + [";
+		// pairs a < b with a positive off-diagonal entry: subtract twice that entry (scaled)
+		for (a = 0; a < num_areas-1; a++)
+			for (b = a+1; b < num_areas; b++) {
+				if (!(areas_boundary[a*num_areas+b] > 0.0)) continue;
+				double shared = 2*areas_boundary[a*num_areas+b] * params.boundary_modifier;
+				out << " -" << shared << pair_prefix << a << pair_infix << b;
+			}
+		if (params.quad_programming)
+			out << " ] / 2";
+	}
+
+	if (params.gurobi_format)
+		out << endl << "Subject to" << endl;
+	else
+		out << ";" << endl;
+}
+
+/**
+	Emit the size/budget constraint of the LP: the weighted sum of the x
+	variables (minus the shared-boundary terms, if any) must not exceed total_size.
+	@param out output stream of the LP file
+	@param params program parameters (gurobi_format and boundary_modifier are read)
+	@param total_size right-hand side of the constraint
+*/
+void PDNetwork::lpK_BudgetConstraint(ostream &out, Params &params, int total_size) {
+	// one variable per area in area mode, otherwise one per taxon
+	const int num_vars = isPDArea() ? (int)area_taxa.size() : getNTaxa();
+	int a, b;
+
+	// linear part: cost of variable a (1 when there is no budget constraint), plus
+	// the diagonal entry of areas_boundary scaled by the boundary modifier
+	for (a = 0; a < num_vars; a++) {
+		double weight = 1.0;
+		if (isBudgetConstraint())
+			weight = getPdaBlock()->getCost(a);
+		if (areas_boundary)
+			weight += areas_boundary[a*num_vars+a] * params.boundary_modifier;
+		if (a > 0)
+			out << " +";
+		out << weight << " x" << a;
+	}
+
+	// pairs a < b with a positive off-diagonal entry: subtract twice that entry (scaled)
+	if (areas_boundary && params.boundary_modifier != 0.0) {
+		for (a = 0; a < num_vars-1; a++) {
+			for (b = a+1; b < num_vars; b++) {
+				if (!(areas_boundary[a*num_vars+b] > 0.0))
+					continue;
+				double shared = 2*areas_boundary[a*num_vars+b] * params.boundary_modifier;
+				out << " -" << shared << " y" << a << "_" << b;
+			}
+		}
+	}
+
+	// right-hand side; outside the Gurobi format every statement ends with ';'
+	out << " <= " << total_size;
+	if (params.gurobi_format)
+		out << endl;
+	else
+		out << ";" << endl;
+}
+
+/**
+	Emit, for every pair of areas a < b whose areas_boundary entry is positive, the
+	two constraints "x<a> - y<a>_<b> >= 0" and "x<b> - y<a>_<b> >= 0".
+	Nothing is written without boundary data, with a zero boundary modifier,
+	or when quadratic programming is requested.
+*/
+void PDNetwork::lpBoundaryConstraint(ostream &out, Params &params) {
+	if (!areas_boundary || params.boundary_modifier == 0.0 || params.quad_programming)
+		return;
+
+	const int num_areas = area_taxa.size();
+	const char *terminator = params.gurobi_format ? "" : ";"; // statement end of the chosen format
+
+	for (int a = 0; a < num_areas-1; a++) {
+		for (int b = a+1; b < num_areas; b++) {
+			if (!(areas_boundary[a*num_areas+b] > 0.0))
+				continue;
+			// the pair variable is bounded by each of its two area variables
+			out << "x" << a << " - y" << a << "_" << b << " >= 0" << terminator << endl;
+			out << "x" << b << " - y" << a << "_" << b << " >= 0" << terminator << endl;
+		}
+	}
+}
+
+// Emit the area-based split constraints: every split that keeps its own variable y<i> is bounded by the
+// sum of the x<j> of the areas overlapping the split and, separately, of those overlapping its complement.
+void PDNetwork::lpSplitConstraint_RS(ostream &out, Params &params, IntVector &y_value, IntVector &count1, IntVector &count2, int total_size) {
+	iterator spit;
+	int i,j;
+	int nareas = area_taxa.size();
+	// count1[i] / count2[i] are presumably the numbers of areas overlapping split i and its
+	// complement, as filled in by checkYValue_Area -- TODO confirm at the call sites
+	// adding the constraint for splits
+	for (spit = begin(),i=0; spit != end(); spit++,i++) {
+		if (y_value[i] >= 0) continue; // split was reduced (y_value >= 0): no constraint is written for it
+		Split *sp = (*spit);
+
+		if (count1[i] < nareas && (isBudgetConstraint() || count1[i] <= nareas - total_size))
+		{
+			out << "y" << i;
+			if (!params.gurobi_format)
+				out << " <="; // non-Gurobi form: "y<i> <= +x.. +x..;"
+			for (j = 0; j < nareas; j++) {
+				if (sp->overlap(*area_taxa[j])) {
+					if (params.gurobi_format)
+						out << " -x" << j; // Gurobi form: "y<i> -x.. -x.. <= 0"
+					else
+						out << " +x" << j;
+				}
+			}
+			if (params.gurobi_format)
+				out << " <= 0" << endl;
+			else
+				out << ";" << endl;
+		}
+
+		if (count2[i] < nareas && (isBudgetConstraint() || count2[i] <= nareas - total_size))
+		{
+			sp->invert(); // scan the invert
+			out << "y" << i;
+			if (!params.gurobi_format)
+				out << " <=";
+			for (j = 0; j < nareas; j++) {
+				if (sp->overlap(*area_taxa[j])) {
+					if (params.gurobi_format)
+						out << " -x" << j;
+					else
+						out << " +x" << j;
+				}
+			}
+			if (params.gurobi_format)
+				out << " <= 0" << endl;
+			else
+				out << ";" << endl;
+			sp->invert(); // invert back to original
+		}
+	}
+}
+
+/**
+	Emit, for every split that keeps its own y variable (y_value < 0), up to two
+	constraints "y<i> - sum of x<j> <= 0": one over the taxa inside the split and
+	one over the taxa outside it. A side is skipped when it contains a taxon of
+	initialset, or (without budget constraint) when its size test on total_size fails.
+*/
+void PDNetwork::lpSplitConstraint_TS(ostream &out, Params &params, IntVector &y_value, int total_size) {
+	const int num_taxa = getNTaxa();
+	const char *terminator = params.gurobi_format ? "" : ";";
+	int idx = 0, t;
+
+	for (iterator split_it = begin(); split_it != end(); ++split_it, ++idx) {
+		if (y_value[idx] >= 0) continue;
+		Split *split = *split_it;
+
+		// side 1: the taxa contained in the split
+		bool side_has_initial = split->containAny(initialset);
+		if (!side_has_initial && (isBudgetConstraint() || split->countTaxa() <= num_taxa - total_size)) {
+			out << "y" << idx;
+			for (t = 0; t < num_taxa; t++)
+				if (split->containTaxon(t))
+					out << " -x" << t;
+			out << " <= 0" << terminator << endl;
+		}
+
+		// side 2: the complement; it is only inspected when initialset is not empty
+		side_has_initial = false;
+		if (!initialset.empty()) {
+			split->invert();
+			side_has_initial = split->containAny(initialset);
+			split->invert(); // back to the original orientation
+		}
+		if (!side_has_initial && (isBudgetConstraint() || split->countTaxa() >= total_size)) {
+			out << "y" << idx;
+			for (t = 0; t < num_taxa; t++)
+				if (!split->containTaxon(t))
+					out << " -x" << t;
+			out << " <= 0" << terminator << endl;
+		}
+	}
+}
+
+
+// Emit the constraint that the weighted sum of the split variables is at least pd_proportion of the total split weight.
+void PDNetwork::lpMinSDConstraint(ostream &out, Params &params, IntVector &y_value, double pd_proportion) {
+	iterator spit;
+	int i;
+	double total_weight = calcWeight();
+	double required_sd = total_weight * pd_proportion;
+	if (required_sd > total_weight) required_sd = total_weight;
+	required_sd -= 1e-6; // small slack, presumably to absorb rounding of the printed weights
+	// adding constraint for min conserved PD proportion
+	for (spit = begin(),i=0; spit != end(); spit++,i++)	{
+		if (y_value[i] < 0)
+			out << " +" << (*spit)->getWeight() << " y" << i;
+		else if (y_value[i] >= 2)
+			out << " +" << (*spit)->getWeight() << " x" << y_value[i] - 2;
+		else if (y_value[i] == 1) required_sd -= (*spit)->getWeight(); // y fixed to 1: move its weight to the right-hand side
+	}
+	streamsize old_precision = out.precision(12); // print the bound with precision 12 ...
+	out << " >= " << required_sd;
+	out.precision(old_precision); // ... then restore the caller's precision instead of forcing 6
+	if (params.gurobi_format)
+		out << endl;
+	else
+		out << ";" << endl;
+}
+
+/**
+	Emit the bounds of the LP variables. With params.gurobi_format the section
+	starts with a "Bounds" line and every range carries an explicit "0 <= ";
+	otherwise only the upper bound is written and each statement ends with ';'.
+	@param out output stream of the LP file
+	@param params program parameters
+	@param included_vars x variables that are fixed to 1 instead of getting a range
+	@param y_value one code per split; only splits with a negative code get a
+		bound on y<i>. May be empty: then no split variable is bounded.
+*/
+void PDNetwork::lpVariableBound(ostream &out, Params &params, Split &included_vars, IntVector &y_value) {
+	const int num_vars = included_vars.getNTaxa();
+	const char *lower = params.gurobi_format ? "0 <= " : "";
+	const char *terminator = params.gurobi_format ? "" : ";";
+	int a, b;
+
+	if (params.gurobi_format)
+		out << "Bounds" << endl;
+
+	// x variables: an equality for the fixed ones, a range for all others
+	for (a = 0; a < num_vars; a++) {
+		if (included_vars.containTaxon(a))
+			out << "x" << a << " = 1";
+		else
+			out << lower << "x" << a << " <= 1";
+		out << terminator << endl;
+	}
+
+	// y variables of the splits; splits with a non-negative code have no
+	// variable of their own and are skipped
+	if (!y_value.empty()) {
+		for (a = 0; a < getNSplits(); a++) {
+			if (y_value[a] >= 0)
+				continue;
+			out << lower << "y" << a << " <= 1" << terminator << endl;
+		}
+	}
+
+	// pair variables y<a>_<b>: written only when boundary data exists, the modifier is
+	// non-zero and quadratic programming is off; one per pair with a positive entry
+	if (areas_boundary && params.boundary_modifier != 0.0 && !params.quad_programming) {
+		for (a = 0; a < num_vars-1; a++) {
+			for (b = a+1; b < num_vars; b++) {
+				if (!(areas_boundary[a*num_vars+b] > 0.0))
+					continue;
+				out << lower << "y" << a << "_" << b << " <= 1" << terminator << endl;
+			}
+		}
+	}
+}
+
+/**
+	Emit the declaration of the binary x variables: every variable except those
+	in included_vars. Gurobi format: a "Binary" line followed by the names
+	separated by blanks; otherwise "bin " followed by a comma-separated list and ';'.
+	Nothing is written when no variable qualifies.
+	@param out output stream of the LP file
+	@param params program parameters
+	@param included_vars variables to leave out of the declaration
+*/
+void PDNetwork::lpVariableBinary(ostream &out, Params &params, Split &included_vars) {
+	// one variable per area in area mode, otherwise one per taxon
+	const int num_vars = isPDArea() ? (int)area_taxa.size() : getNTaxa();
+	int num_written = 0;
+
+	for (int v = 0; v < num_vars; v++) {
+		if (included_vars.containTaxon(v))
+			continue;
+		// a separator before every name but the first, the section keyword before the first
+		if (num_written > 0)
+			out << (params.gurobi_format ? " " : ", ");
+		else if (params.gurobi_format)
+			out << "Binary" << endl;
+		else
+			out << "bin ";
+		out << "x" << v;
+		num_written++;
+	}
+
+	// close the list only if it was opened
+	if (num_written > 0)
+		out << (params.gurobi_format ? "" : ";") << endl;
+}
+
+
+/**
+	Append the binary-variable declarations to the file outfile; included_vars marks the variables to leave out.
+*/
+void PDNetwork::lpVariableBinary(const char *outfile, Params &params, Split &included_vars) {
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(outfile, ios::app); // append mode: whatever the file already holds is kept
+		lpVariableBinary(out, params, included_vars);
+		out.close();
+	} catch (const ios::failure &) { // caught by reference: no copy or slicing of the exception object
+		outError(ERR_WRITE_OUTPUT, outfile);
+	}	
+}
+
+
+/**
+	Convenience overload: turns the list of variable IDs into a Split and passes
+	it on, so that all remaining variables are declared binary in the file outfile.
+*/
+void PDNetwork::lpVariableBinary(const char *outfile, Params &params, IntVector &initialset) {
+	const int num_vars = isPDArea() ? (int)area_taxa.size() : getNTaxa();
+	Split fixed_vars(num_vars);
+	for (size_t k = 0; k < initialset.size(); k++)
+		fixed_vars.addTaxon(initialset[k]);
+	lpVariableBinary(outfile, params, fixed_vars);
+}
+
+/**
+	Emit one constraint "1 <= +x<a> +x<b> ..." per taxon of initialset, summing over
+	the areas that contain the taxon. The first entry of initialset is skipped when
+	params.root or params.is_rooted is set. Calls outError() if a taxon belongs
+	to no area.
+*/
+void PDNetwork::lpInitialArea(ostream &out, Params &params) {
+	const int num_areas = getNAreas();
+	for (size_t k = 0; k < initialset.size(); k++) {
+		if (k == 0 && (params.root || params.is_rooted)) // ignore the root
+			continue;
+		const int taxon = initialset[k];
+		int num_terms = 0;
+		out << "1 <= ";
+		for (int a = 0; a < num_areas; a++) {
+			if (!area_taxa[a]->containTaxon(taxon)) continue;
+			out << " +x" << a;
+			num_terms++;
+		}
+		out << (params.gurobi_format ? "" : ";") << endl;
+		if (num_terms == 0) // note: the term-less constraint line has already been written at this point
+			outError("No area contains taxon ", taxa->GetTaxonLabel(taxon));
+	}
+}
+
+// Classify every split for the taxon-based LP: y_value[i] stays -1, or becomes id+2 (y equals x<id>) or 1 (y equals 1).
+void PDNetwork::checkYValue(int total_size, vector<int> &y_value) {
+	iterator spit;
+	int ntaxa = getNTaxa();
+	int i;
+	y_value.resize(getNSplits(), -1); // newly added entries start at -1; entries that already exist are kept
+	for (spit = begin(),i=0; spit != end(); spit++,i++) {
+		Split *sp = (*spit);
+		int id = -1;
+		int cnt = sp->countTaxa();
+		if (cnt > ntaxa / 2) {
+			sp->invert(); // NOTE(review): the split is not inverted back afterwards -- confirm this is intended
+			cnt = ntaxa - cnt; // cnt now refers to the other side of the split
+		}
+		if (cnt == 1)
+			id = sp->firstTaxon(); // the single taxon on this side
+		if (id >= 0) {
+			// if the split is external -> y[i] = x[id]
+			y_value[i] = id+2;
+			continue;
+		}
+		if (!isBudgetConstraint()) {
+			if (cnt > ntaxa - total_size && cnt < total_size) {
+				// if both constraints can be dropped -> y[i] = 1
+				y_value[i] = 1;
+			}
+		}
+	}
+	
+}
+
+
+
+
+// Classify every split for the area-based LP and count the areas overlapping the split (count1) and its complement (count2).
+void PDNetwork::checkYValue_Area(int total_size, vector<int> &y_value, vector<int> &count1, vector<int> &count2) {
+	iterator spit;
+	int nareas = area_taxa.size();
+	int i, j;
+	y_value.resize(getNSplits(), -1); // -1: nothing decided yet for the split
+	count1.resize(getNSplits(), 0);
+	count2.resize(getNSplits(), 0);
+	for (spit = begin(),i=0; spit != end(); spit++,i++) {
+		Split *sp = (*spit);
+		int id1 = -1, id2 = -1; // last area found overlapping the split / its complement
+		for (j = 0; j < nareas; j++) {
+			if (sp->overlap(*area_taxa[j])) { 
+				count1[i]++;
+				id1 = j;
+			}
+		}
+		sp->invert();
+		for (j = 0; j < nareas; j++) {
+			if (sp->overlap(*area_taxa[j])) { count2[i]++; id2 = j; }
+		}
+		sp->invert(); // invert back to original
+		if (count1[i] == 0 || count2[i] == 0) 
+			y_value[i] = 0; // one side overlaps no area at all
+		else {
+			// every area overlaps both sides of the split -> code 1
+			if (count1[i] == nareas && count2[i] == nareas) {
+				y_value[i] = 1;
+				continue;
+			}
+			if (isBudgetConstraint())
+				continue;
+			// the reductions below use total_size and are applied only without a budget constraint
+			if (count1[i] == 1 && count2[i] > nareas - total_size) {
+				y_value[i] = id1 + 2; // tie y to the only area overlapping the split
+			} else if (count2[i] == 1 && count1[i] > nareas - total_size) {
+				y_value[i] = id2 + 2;
+				// tie y to the only area overlapping the complement
+			} else if (count1[i] > nareas - total_size && count2[i] > nareas - total_size) {
+				y_value[i] = 1;
+			}
+		}
+	}
+	
+}
+
+// Append the label of every taxon, in taxon-ID order, to *speciesNames.
+void PDNetwork::speciesList(vector<string> *speciesNames)
+{
+	for (int t = 0; t < getNTaxa(); ++t)
+		speciesNames->push_back(taxa->GetTaxonLabel(t));
+}
diff --git a/pdnetwork.h b/pdnetwork.h
new file mode 100644
index 0000000..af3c0c7
--- /dev/null
+++ b/pdnetwork.h
@@ -0,0 +1,416 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef PDNETWORK_H
+#define PDNETWORK_H
+
+#include "splitgraph.h"
+
+/**
+General Split Network for Phylogenetic Diversity Algorithm
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class PDNetwork : public SplitGraph
+{
+public:
+
+	friend class MTree;
+	friend class ECOpd;
+
+	/**
+		empty constructor
+	*/
+    PDNetwork();
+
+	/**
+		construct PD network from a NEXUS or NEWICK file, e.g. produced by SplitsTree
+		@param params program parameters
+	*/
+    PDNetwork(Params &params);
+
+
+	/**
+		Identify the root node if specified, include it into the initial set
+		@param root_name name of the root node
+	*/
+	void readRootNode(const char *root_name);
+
+	/**
+		read the parameters from the file and incorporate them into the split system
+		@param params program parameters
+	*/
+	void readParams(Params &params);
+
+	/**
+		read the initial set of taxa to be included into PD-tree
+		@param params program parameters
+	*/
+	void readInitialSet(Params &params);
+
+
+	/**
+		read the initial areas to be included into PD set
+		@param params program parameters
+	*/
+	void readInitialAreas(Params &params);
+	
+	/**
+		increase the weight of the split associated with initial set
+	*/
+	void proceedInitialSet();
+
+	/**
+		initialize when PD min specified
+	*/
+	void initPDMin();
+
+
+	/**
+		compute the minimum required costs to conserve a taxa set
+		@param taxset set of taxa, given as an IntVector
+		@return budget required
+	*/
+	int calcCost(IntVector &taxset);
+
+	/**
+		compute the minimum required costs to conserve a taxa set
+		@param taxset set of taxa, given as a Split
+		@return budget required
+	*/
+	int calcCost(Split &taxset);
+
+	void printOutputSetScore(Params &params, vector<SplitSet> &pd_set);
+
+	/**
+		compute the PD score of a given taxa set in filename
+		@param params program parameters
+		@param taxa_set (OUT) corresponding set of taxa
+		@param pd_more (OUT) more computed PD measures will be stored here
+	*/
+	void computePD(Params &params, SplitSet &taxa_set, PDRelatedMeasures &pd_more);
+
+	/**
+		this will be called by findPD at the beginning
+		@param params program parameters
+	*/
+	virtual void enterFindPD(Params &params);
+
+	/**
+		main function to search for maximal phylogenetic diversity
+		@param params program parameters
+		@param taxa_set (OUT) the vector of set of taxa in the maximal PD set
+		@param taxa_order (OUT) order of inserted taxa
+	*/
+	virtual void findPD(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order);
+
+
+	/**
+		this function will be called by findPD at the end
+		@param taxa_set (IN/OUT) the vector of set of taxa in the maximal PD set
+	*/
+	virtual void leaveFindPD(vector<SplitSet> &taxa_set);
+
+	/**
+		calculate the PD gain matrix in terms of delta_k^j = pd(PD_k \/ {j}) - pd_k
+		@param pd_set set of optimal PD sets
+		@param delta (OUT) PD gain matrix
+	*/
+	void calcPDGain(vector<SplitSet> &pd_set, matrix(double) &delta);
+
+	/**
+		compute the PD score of a given taxa set with name in taxa_name, result is written to id_set.weight. 
+		The difference from calcWeight() is that calcPD takes initialset into account
+		@param id_set (IN/OUT) corresponding set of taxa
+		
+	*/
+	void calcPD(Split &id_set);
+
+	/**
+		compute PD of a set of areas. It implicitly takes area_taxa map into account.
+		@param area_id_set IDs of areas in the set
+	*/
+	void calcPDArea(Split &area_id_set);
+
+	/**
+		compute the EXCLUSIVE PD score of a given taxa set with name in taxa_name, result is written to id_set.weight
+		@param id_set (IN/OUT) corresponding set of taxa IDs
+	*/
+	void calcExclusivePD(Split &id_set);
+
+	/**
+		compute the area's PD ENDEMISM of set of area
+		@param area_set set of area
+		@param pd_endem (OUT) corresponding PD endemism
+	*/
+	void calcPDEndemism(SplitSet &area_set, DoubleVector &pd_endem);
+
+	/**
+		compute the area's PD complementarity given a specific area
+		@param area_set set of area
+		@param area_names given area names as string separated by commas
+		@param all_names all area names
+		@param pd_comp (OUT) corresponding PD complementarity
+	*/
+	void calcPDComplementarity(SplitSet &area_set, char *area_names, 
+		vector<string> &all_names, DoubleVector &pd_comp);
+
+
+	/**
+		transform the problem into an Integer Linear Programming and write to .lp file
+		@param params program parameters
+		@param outfile name of output file in LP format
+		@param total_size k for PD_k or total budget
+		@param make_bin TRUE if creating binary programming
+	*/
+	void transformLP(Params &params, const char *outfile, int total_size, bool make_bin);
+	void transformLP2(Params &params, const char *outfile, int total_size, bool make_bin);
+	void transformEcoLP(Params &params, const char *outline, int total_size);
+
+	/**
+		transform the problem into an Integer Linear Programming and write to .lp file
+		@param params program parameters
+		@param outfile name of output file in LP format
+		@param total_size k for PD_k or total budget
+		@param make_bin TRUE if creating binary programming
+	*/
+	void transformLP_Area(Params &params, const char *outfile, int total_size, bool make_bin);
+	void transformLP_Area2(Params &params, const char *outfile, int total_size, bool make_bin);
+
+	/**
+		transform the problem into an Integer Linear Programming and write to .lp file
+		@param params program parameters
+		@param outfile name of output file in LP format
+		@param pd_proportion minimum PD proportion to be conserved
+		@param make_bin TRUE if creating binary programming
+	*/
+	void transformMinK_Area(Params &params, const char *outfile, double pd_proprotion, bool make_bin);
+	void transformMinK_Area2(Params &params, const char *outfile, double pd_proportion, bool make_bin);
+
+	/**
+		transform the PD problem into linear programming and solve it
+		@param params program parameters
+		@param taxa_set (OUT) the vector of set of taxa in the maximal PD set
+	*/
+	void findPD_LP(Params &params, vector<SplitSet> &taxa_set);
+
+	/**
+		transform the PD problem into linear programming and solve it
+		@param params program parameters
+		@param areas_set (OUT) the vector of set of areas in the maximal PD set
+	*/
+	void findPDArea_LP(Params &params, vector<SplitSet> &areas_set);
+
+	double findMinKArea_LP(Params &params, const char* filename, double pd_proportion, Split &area);
+
+	/**
+		@return TRUE if we are doing PD area optimization
+	*/
+	virtual bool isPDArea();
+
+	/**
+		check if all taxa are covered by the set of areas
+		@return false if there exists some taxon which is not covered by any area
+	*/
+	bool checkAreaCoverage();
+
+	/** transform the area-coverage problem into an Integer Linear Programming and write to .lp file
+		@param outfile name of output file in LP format
+		@param params program parameters
+		@param included_area (OUT) collection of areas that should always be included
+	*/
+	void transformLP_Area_Coverage(const char *outfile, Params &params, Split &included_area);
+
+
+	/**
+		@return the minimum number of areas needed to cover all taxa (the sum of their costs under a budget constraint)
+		@param params program parameters
+		@param area_id (OUT) minimal set of areas which cover all taxa
+	*/
+	int findMinAreas(Params &params, Split &area_id);
+
+
+
+	/**
+		the set of areas, each item contains the set of taxa in the area.
+	*/
+	SplitSet area_taxa;
+
+	/**
+	 	speciesList is used in ECOpd analysis for synchronization of species in SplitNetwork with species in FoodWeb
+	 */
+	void speciesList(vector<string> *speciesNames);
+
+protected:
+
+	/**
+		extra PD when integrating initial set
+	*/
+	double extra_pd;
+
+	/**
+		when computing PD min (instead of PD max)
+	*/
+	bool min_pd;
+
+	/**
+		taxa set to be included into optimal PD set (with -i option)
+	*/
+	IntVector initialset;
+
+
+	/**
+		areas to be included into optimal PD set (with -ia option)
+	*/
+	IntVector initialareas;
+
+	/**
+		calculate the total maximum budget required 
+		@return total maximum budget required 
+	*/
+	int calcMaxBudget();
+
+/********************************************************
+	hill-climbing and greedy heuristics
+********************************************************/
+
+	/**
+		greedy algorithm for phylogenetic diversity of a given size 
+		@param subsize the subset size
+		@param taxa_set (OUT) the set of taxa in the PD-set
+		@param taxa_order (OUT) order of inserted taxa
+		@return the PD score of the maximal set, also returned in taxa_set.weight
+	*/
+	double greedyPD(int subsize, Split &taxa_set, vector<int> &taxa_order);
+
+
+	/**
+		local search algorithm for phylogenetic diversity of a given size 
+		@param subsize the subset size
+		@param taxa_set (OUT) the set of taxa in the PD-set
+		@param taxa_order (IN) order of inserted taxa
+		@return the PD score of the maximal set, also returned in taxa_set.weight
+	*/
+	double localSearchPD(int subsize, Split &taxa_set, vector<int> &taxa_order);
+	
+/********************************************************
+	exhaustive search
+********************************************************/
+
+	/**
+		exhaustive search version 2 for maximal phylogenetic diversity of a given size 
+		@param subsize the subset size
+		@param cur_tax current taxon
+		@param curset current set
+		@param find_all TRUE if wanting all max PD set
+		@param best_set (OUT) the set of taxa in the maximal PD set
+		@param taxa_order (OUT) order of inserted taxa
+		@param rem_splits (IN) remaining splits
+		@param rem_it (IN) begin of remaining iterator
+		@return the PD score of the maximal set
+	*/
+	double exhaustPD2(int subsize, int cur_tax, Split &curset, 
+		bool find_all,SplitSet &best_set, vector<int> &taxa_order, 
+		IntList &rem_splits, IntList::iterator &rem_it);
+
+	/**
+		exhaustive search for maximal PD under a cost constraint
+		@param cur_budget  current budget
+		@param cur_tax current taxon
+		@param curset current set
+		@param find_all TRUE if wanting all max PD set
+		@param best_set (OUT) the set of taxa in the maximal PD set
+		@param taxa_order (OUT) order of inserted taxa
+		@param rem_splits (IN) remaining splits
+		@param rem_it (IN) begin of remaining iterator
+		@return the PD score of the maximal set
+	*/
+	double exhaustPDBudget(int cur_budget, int cur_tax, Split &curset, 
+		bool find_all,SplitSet &best_set, vector<int> &taxa_order, 
+		IntList &rem_splits, IntList::iterator &rem_it);
+
+	/**
+		calculate sum of weights of preserved splits in the taxa_set
+		@param taxa_set a set of taxa
+		@param rem_splits remaining splits
+		@param rem_it begin iterator of remaining splits
+	*/
+	double calcRaisedWeight(Split &taxa_set, IntList &rem_splits, IntList::iterator & rem_it);
+
+	/**
+		update the best taxa set during the search
+		@param curset the current taxa set
+		@param best_set the list of best taxa set so far
+	*/
+	void updateSplitVector(Split &curset, SplitSet &best_set);
+
+/********************************************************
+	linear programming support
+********************************************************/
+
+	/**
+		y variables in the LP formulation, check if it can be dropped or equals some x variable.
+		@param total_size k for PD_k or total budget
+		@param y_value (OUT): vector of: -1 if cannot reduce, 1 if equals 1, or id+2 where id is the single taxon of a trivial split
+	*/
+	void checkYValue(int total_size, vector<int> &y_value);
+
+	/**
+		y variables in the LP formulation for PD area optimization, check if it can be dropped or equals some x variable.
+		@param total_size k for PD_k or total budget
+		@param y_value (OUT) vector of: -1 if cannot reduce, 0 if one side of the split overlaps no area, 1 if can be dropped, or id+2 where id is the trivial area id
+		@param count1 (OUT) per split, the number of areas overlapping the split (x variables in inequality 1)
+		@param count2 (OUT) per split, the number of areas overlapping its complement (x variables in inequality 2)
+	*/
+	void checkYValue_Area(int total_size, vector<int> &y_value, vector<int> &count1, vector<int> &count2);
+
+	/**
+		check if a taxon is uniquely covered by one area
+		@param taxon the taxon ID
+		@param area (OUT) the ID of the area that covers taxon
+		@return TRUE if the 'taxon' is uniquely covered by only one area. Otherwise FALSE.
+	*/
+	bool isUniquelyCovered(int taxon, int &area);
+
+	void lpObjectiveMaxSD(ostream &out, Params &params, IntVector &y_value, int total_size); // objective: maximize the weighted sum of the split variables
+
+	void lpObjectiveMinK(ostream &out, Params &params); // objective: minimize the weighted sum of the x variables
+
+	void lpSplitConstraint_RS(ostream &out, Params &params, IntVector &y_value, IntVector &count1, IntVector &count2, int total_size); // split constraints over areas
+	void lpSplitConstraint_TS(ostream &out, Params &params, IntVector &y_value, int total_size); // split constraints over taxa
+
+	void lpK_BudgetConstraint(ostream &out, Params &params, int total_size); // weighted sum of the x variables <= total_size
+
+	void lpMinSDConstraint(ostream &out, Params &params, IntVector &y_value, double pd_proportion); // weighted split sum >= pd_proportion of the total weight
+
+	void lpVariableBound(ostream &out, Params &params, Split &included_vars, IntVector &y_value); // bounds of the x and y variables
+
+	void lpBoundaryConstraint(ostream &out, Params &params); // constraints tying y<i>_<j> to x<i> and x<j> for areas with a shared boundary
+
+	void lpVariableBinary(ostream &out, Params &params, Split &included_vars); // declaration of the binary x variables
+
+	void lpVariableBinary(const char *outfile, Params &params, Split &included_vars); // same, appended to the file outfile
+	void lpVariableBinary(const char *outfile, Params &params, IntVector &initialset); // same, excluded variables given as an ID list
+	void lpInitialArea(ostream &out, Params &params); // "1 <= sum of x" over the areas containing each taxon of initialset
+
+	void computeFeasibleBudget(Params &params, IntVector &list_k);
+
+};
+
+#endif
diff --git a/pdtree.cpp b/pdtree.cpp
new file mode 100644
index 0000000..bc92ff4
--- /dev/null
+++ b/pdtree.cpp
@@ -0,0 +1,400 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "ncl/ncl.h"
+#include "tools.h"
+#include "pdtree.h"
+#include "msetsblock.h"
+#include "myreader.h"
+
+/*********************************************
+	class PDTree
+*********************************************/
+PDTree::PDTree(Params &params)
+{
+	init(params); // all construction work is delegated to init(Params&)
+}
+
+void PDTree::init(Params &params) {
+	// load the tree; the rooted flag is handed to MTree::init together with the file name
+	MTree::init(params.user_file, params.is_rooted);
+	if (params.is_rooted) {
+		// both requested sizes are raised by one for a rooted tree
+		params.sub_size++;
+		params.min_size++;
+		// a rooted tree and a user-specified root are reported as a conflict
+		if (params.root != NULL)
+			outError(ERR_CONFLICT_ROOT);
+	}
+	if (leafNum < params.sub_size) {
+		// the message subtracts the rooted flag again from both numbers
+		ostringstream msg;
+		msg << "Subset size k = " << params.sub_size-params.is_rooted;
+		msg << " is greater than the number of taxa = " << leafNum-params.is_rooted;
+		outError(msg.str());
+	}
+
+	// a rooted tree always has its root in the initial set
+	if (params.is_rooted)
+		initialset.push_back(root);
+	// read the parameter file
+	if (params.param_file != NULL)
+		readParams(params);
+	// identify the root
+	if (params.root != NULL)
+		readRootNode(params.root);
+	// read the initial set of taxa, incorporate info into the split system
+	if (params.initial_file != NULL)
+		readInitialSet(params);
+}
+
+
+/**
+	constructor: sets this tree up from another PDTree by delegating to init(PDTree&)
+*/
+PDTree::PDTree(PDTree &tree)
+{
+	init(tree);
+}
+
+void PDTree::init(PDTree &tree) {
+	MTree::init(tree); // base-class part of the initialisation
+	// NOTE(review): the entries copied below are tree's own Node pointers -- confirm this is intended if MTree::init clones the nodes
+	initialset = tree.initialset;
+}
+
+
+void PDTree::buildLeafMapName(LeafMapName &lsn, Node *node, Node* dad) {
+	// record name -> node for every leaf reached from node; a NULL node means start at the root
+	if (node == NULL) node = root;
+	if (node->isLeaf()) {
+		// the same leaf name must not be registered twice
+		const bool already_known = (lsn.find(node->name) != lsn.end());
+		if (already_known) outError(ERR_DUPLICATED_TAXA);
+		lsn[node->name] = node;
+	}
+	FOR_NEIGHBOR_IT(node, dad, it)
+		buildLeafMapName(lsn, (*it)->node, node);
+}
+
+/*
+Node *PDTree::findNode(char *name, Node *node, Node *dad) {
+	if (!node) node = root;
+	// check the name if a leaf
+	if (node->isLeaf()) {
+		if (node->name == name)
+			return node;
+	}
+	// recursive search
+	//for (NeighborVec::iterator it = node->neighbors.begin(); it != node->neighbors.end(); it++) 
+		//if ((*it)->node != dad) {
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		Node *res = findNode(name, (*it)->node, node);
+		if (res != NULL)
+			return res;
+	}
+	return NULL;
+}
+*/
+
+void PDTree::readRootNode(const char *root_name) {
+	string wanted(root_name);
+	Node *found = findNodeName(wanted);
+	// an unknown name is reported via outError; otherwise the node joins the initial set
+	if (!found) outError(ERR_NO_ROOT, root_name);
+	initialset.push_back(found);
+}
+
+
+/**
+	read the taxa that must be in the PD set; internal-node names are replaced by their nearest leaf
+*/
+void PDTree::readInitialSet(Params &params) {
+	LeafMapName lsn;
+	buildLeafMapName(lsn);
+	int ntaxa = leafNum - params.is_rooted;
+	StrVector tax_name;
+	readInitTaxaFile(params, ntaxa, tax_name);
+	for (StrVector::iterator name = tax_name.begin(); name != tax_name.end(); name++) {
+		LeafMapName::iterator leaf = lsn.find(*name);
+		if (leaf != lsn.end()) {
+			initialset.push_back(leaf->second);
+			continue;
+		}
+		Node *inner = findNodeName(*name);
+		if (inner == NULL) {
+			cout << "Find no taxon with name " << *name << endl;
+			continue;
+		}
+		Node *taxon;
+		int distance = findNearestTaxon(taxon, inner);
+		cout << "Replace internal node " << inner->name << " by taxon " << taxon->name << " (" << distance << " branches away)" << endl;
+		initialset.push_back(taxon);
+	}
+	cout << initialset.size() - rooted << " initial taxa" << endl;
+}
+
+
+void PDTree::readParams(Params &params) {
+	int ntaxa = leafNum - params.is_rooted;
+	// readWeightFile is handed the scale, the taxon names and their weights to fill in
+	double scale;
+	StrVector tax_name;
+	DoubleVector ori_weight, tax_weight;
+	readWeightFile(params, ntaxa, scale, tax_name, ori_weight);
+
+	// re-index the weights by leaf id; ids that are never named keep weight 0
+	LeafMapName lsn;
+	buildLeafMapName(lsn);
+	tax_weight.resize(ntaxa, 0);
+	for (size_t pos = 0; pos < tax_name.size(); pos++) {
+		LeafMapName::iterator leaf = lsn.find(tax_name[pos]);
+		if (leaf == lsn.end())
+			outError(ERR_NO_TAXON, tax_name[pos]);
+		tax_weight[leaf->second->id] = ori_weight[pos];
+	}
+
+	// a non-negative params.scaling_factor replaces the scale obtained above
+	if (params.scaling_factor >= 0) {
+		if (params.scaling_factor > 1) outError("Scaling factor must be between 0 and 1");
+		cout << "Rescaling branch lengths with " << params.scaling_factor << 
+			" and taxa weights with " << 1 - params.scaling_factor << endl;
+		scale = params.scaling_factor;
+		for (size_t pos = 0; pos < tax_weight.size(); pos++)
+			tax_weight[pos] *= (1 - scale);
+	}
+
+	// incorporate them into the tree
+	incoporateParams(scale, tax_weight);
+}
+
+void PDTree::incoporateParams(double &scale, DoubleVector &tax_weight, Node* node, Node* dad) {
+	if (!node) node = root; // NULL means: start at the root
+	FOR_NEIGHBOR_DECLARE(node, NULL, it) { // NULL is passed as dad here, presumably so that no neighbor is skipped -- confirm against the macro
+		double newlen;
+		newlen = (*it)->length * scale; // rescale the branch
+		if (node->isLeaf())
+			newlen += tax_weight[node->id]; // node is a leaf: add its own weight. NOTE(review): callers size tax_weight to leafNum - is_rooted; confirm every leaf id stays below that
+		else if ((*it)->node->isLeaf())
+			newlen += tax_weight[(*it)->node->id]; // the neighbor is a leaf: add the neighbor's weight
+		(*it)->length = newlen;
+	}
+	FOR_NEIGHBOR(node, dad, it) // presumably reuses the 'it' introduced by FOR_NEIGHBOR_DECLARE above
+		incoporateParams(scale, tax_weight, (*it)->node, node);
+			
+}
+
+void PDTree::computePD(Params &params, vector<PDTaxaSet> &taxa_set, PDRelatedMeasures &pd_more) {
+	// name -> leaf lookup, used below to check the taxon names of every set
+	LeafMapName lsn;
+	buildLeafMapName(lsn);
+
+	// container for the sets read from params.pdtaxa_file; released at the end of this function
+	MSetsBlock *sets = new MSetsBlock();
+
+	cout << "Reading taxa sets in file " << params.pdtaxa_file << "..." << endl;
+
+	// NEXUS input goes through MyReader, everything else through readTaxaSets
+	if (detectInputFile(params.pdtaxa_file) == IN_NEXUS) {
+		MyReader nexus(params.pdtaxa_file);
+		nexus.Add(sets);
+		MyToken token(nexus.inf);
+		nexus.Execute(token);
+	} else {
+		readTaxaSets(params.pdtaxa_file, sets);
+	}
+
+	// one output entry per input set
+	TaxaSetNameVector *allsets = sets->getSets();
+	taxa_set.resize(sets->getNSets());
+
+	vector<PDTaxaSet>::iterator dst = taxa_set.begin();
+	for (TaxaSetNameVector::iterator src = allsets->begin(); src != allsets->end(); src++, dst++) {
+		// every set implicitly contains the taxa of the initial set
+		set<string> taxa_name;
+		for (NodeVector::iterator must = initialset.begin(); must != initialset.end(); must++)
+			taxa_name.insert((*must)->name);
+
+		// add the listed taxa; each of them has to be a leaf name of this tree
+		for (vector<string>::iterator member = (*src)->taxlist.begin(); member != (*src)->taxlist.end(); member++) {
+			if (lsn.find(*member) == lsn.end())
+				outError(ERR_NO_TAXON, *member);
+			taxa_name.insert(*member);
+		}
+
+		// names -> nodes -> taxon-id set, on which the PD values are computed
+		PDTaxaSet &result = *dst;
+		Split id_set;
+		makeTaxaSet(taxa_name, result);
+		result.makeIDSet(leafNum, id_set);
+		if (params.exclusive_pd) {
+			calcExclusivePD(id_set);
+			pd_more.exclusivePD.push_back(id_set.getWeight());
+		}
+		calcPD(id_set);
+		result.score = id_set.getWeight();
+		result.name = (*src)->name;
+		pd_more.PDScore.push_back(id_set.getWeight());
+		pd_more.setName.push_back((*src)->name);
+	}
+
+	delete sets;
+}
+
+
+
+void PDTree::makeTaxaSet(set<string> &taxa_name, PDTaxaSet &taxa_set, Node *node, Node *dad) {
+	// append to taxa_set every leaf reached from node whose name is listed in taxa_name
+	if (node == NULL) node = root;
+	const bool wanted_leaf = node->isLeaf() && taxa_name.count(node->name) > 0;
+	if (wanted_leaf)
+		taxa_set.push_back(node);
+	FOR_NEIGHBOR_IT(node, dad, it)
+		makeTaxaSet(taxa_name, taxa_set, (*it)->node, node);
+}
+
+bool PDTree::calcPD(Split &id_set, double cur_len, Node *node, Node *dad) {
+	if (!node) { // top-level call: reset the score and pick the node to start from
+		node = root; 
+		id_set.weight = 0.0;
+		if (!rooted && !id_set.containTaxon(node->id)) { // unrooted tree whose root is not in the set
+			int id = id_set.firstTaxon();
+			if (id < 0) return false; // a negative id is treated as "nothing to start from"; the weight stays 0
+			node = findNodeID(id); // start from the node with that id instead
+		}
+	}
+
+	bool resval = false; // becomes true once a taxon of id_set is found at or below node
+
+	if (node->isLeaf() && id_set.containTaxon(node->id)) {
+		id_set.weight += cur_len; // add the length accumulated on the way to this leaf
+		resval = true;
+	}
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		if (calcPD(id_set, cur_len + (*it)->length, (*it)->node, node)) {
+			cur_len = 0.0; // the accumulated length has been added once; later neighbors must not add it again
+			resval = true;
+		}
+	}
+	return resval;
+}
+
+void PDTree::calcExclusivePD(Split &id_set) {
+	id_set.invert(); // presumably switches id_set to its complement -- confirm in Split
+	calcPD(id_set); // PD of the inverted set ends up in id_set.weight
+	id_set.invert(); // invert a second time, presumably restoring the original set
+	id_set.weight = treeLength() - id_set.weight; // exclusive PD = tree length minus the PD computed above
+}
+
+
+void PDTree::calcPDEndemism(vector<PDTaxaSet> &area_set, DoubleVector &pd_endem) {
+	// fills pd_endem with one value per area, in the order of area_set
+	const size_t num_areas = area_set.size();
+
+	// convert every area (taxa set) to a taxon-id set
+	vector<Split> id_sets;
+	id_sets.resize(num_areas);
+	for (size_t a = 0; a < num_areas; a++)
+		area_set[a].makeIDSet(leafNum, id_sets[a]);
+
+	// PD of the union of all areas
+	Split id_union(leafNum);
+	for (size_t a = 0; a < num_areas; a++)
+		id_union += id_sets[a];
+	calcPD(id_union);
+
+	// endemism of an area = PD(all areas) - PD(all areas except this one)
+	pd_endem.clear();
+	for (size_t a = 0; a < num_areas; a++) {
+		// union of all the other areas
+		Split id_other(leafNum);
+		for (size_t b = 0; b < num_areas; b++) {
+			if (b == a)
+				continue;
+			id_other += id_sets[b];
+		}
+		calcPD(id_other);
+
+		// whatever is lost when this area is left out
+		pd_endem.push_back(id_union.weight - id_other.weight);
+	}
+}
+
+
+void PDTree::calcPDComplementarity(vector<PDTaxaSet> &area_set, char *area_names, DoubleVector &pd_comp) {
+	// fills pd_comp with one value per area, in the order of area_set:
+	// the PD of the area joined with the given areas minus the PD of the given areas alone
+
+	// individual area names extracted from area_names by parseAreaName
+	set<string> given_areas;
+	parseAreaName(area_names, given_areas);
+
+	// taxon-id sets: one per area, plus the merged set of the given areas
+	const size_t num_areas = area_set.size();
+	vector<Split> id_sets;
+	Split given_id(leafNum);
+
+	// convert every area to a taxon-id set; areas named in given_areas are merged into given_id
+	id_sets.resize(num_areas);
+	for (size_t a = 0; a < num_areas; a++) {
+		PDTaxaSet &area = area_set[a];
+		area.makeIDSet(leafNum, id_sets[a]);
+		if (given_areas.count(area.name) > 0)
+			given_id += id_sets[a];
+	}
+
+	// no taxon collected: the given names are reported as incorrect
+	if (given_id.countTaxa() == 0)
+		outError("Complementary area name(s) not correct");
+
+	// PD of the given areas alone
+	calcPD(given_id);
+
+	// now calculate PD complementarity
+	pd_comp.clear();
+	for (size_t a = 0; a < num_areas; a++) {
+		// union of this area with the given areas
+		Split id_both(id_sets[a]);
+		id_both += given_id;
+
+		// PD of the union
+		calcPD(id_both);
+
+		// gain over the given areas alone
+		pd_comp.push_back(id_both.weight - given_id.weight);
+	}
+
+}
+int PDTree::findNearestTaxon(Node* &taxon, Node *node, Node *dad) {
+	// returns the number of branches from node to the closest leaf found; that leaf is stored in taxon
+	if (node->isLeaf()) {
+		taxon = node;
+		return 0;
+	}
+	taxon = NULL;
+	int best = 10000000; // start value; assumed to exceed any real branch count
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		Node *candidate;
+		int steps = findNearestTaxon(candidate, (*it)->node, node);
+		if (steps >= best) continue;
+		best = steps;
+		taxon = candidate;
+	}
+	return best+1;
+}
diff --git a/pdtree.h b/pdtree.h
new file mode 100644
index 0000000..61bb6eb
--- /dev/null
+++ b/pdtree.h
@@ -0,0 +1,171 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef PDTREE_H
+#define PDTREE_H
+
+#include "mtree.h"
+#include "split.h"
+
+
+/**
+Specialized Tree for Phylogenetic Diversity Algorithms
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class PDTree: public MTree{
+public:
+	/**
+		construct from program parameters
+		@param params program parameters
+	*/
+    PDTree(Params &params);
+
+	/**
+		constructor, get from another tree
+		@param tree another PDTree
+	*/
+    PDTree(PDTree &tree);
+
+	/**
+		default constructor, only invokes the MTree default constructor
+	*/
+	PDTree() : MTree() {};
+
+
+/********************************************************
+	INITIALIZATION
+********************************************************/
+	/**
+		initialize the tree from program parameters
+		@param params program parameters
+	*/
+	void init(Params &params);
+
+	/**
+		initialize tree, get from another tree
+		@param tree another PDTree
+	*/
+	void init(PDTree &tree);
+
+	/**
+		read the parameters (branch scale and taxa weights) from the file and apply them to the tree
+		@param params program parameters
+	*/
+	void readParams(Params &params);
+
+	/**
+		Identify the root node if specified, include it into the initial set
+		@param root_name name of the root node
+	*/
+	void readRootNode(const char *root_name);
+
+	/**
+		read the initial set of taxa to be included into PD-tree
+		@param params program parameters
+	*/
+	void readInitialSet(Params &params);
+
+	/**
+		incorporate the parameters into the tree: scale the branch lengths and add the taxa weights
+		@param scale branch scaling factor (passed by reference, only read by the implementation)
+		@param tax_weight taxa weights, indexed by node id (only read by the implementation)
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+	*/
+	void incoporateParams(double &scale, DoubleVector &tax_weight, Node *node = NULL, Node* dad = NULL);
+
+	/**
+		build a map from leaf name to leaf node, returned in lsn
+		@param lsn (OUT) leaf name map
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+	*/
+	void buildLeafMapName(LeafMapName &lsn, Node *node = NULL, Node* dad = NULL);
+
+	int findNearestTaxon(Node* &taxon, Node *node, Node *dad = NULL); // returns the branch count from node to its nearest leaf, which is stored in taxon
+
+/********************************************************
+	Computing PD of area (user-defined set of taxa)
+********************************************************/
+
+	/**
+		compute the PD score of the taxa sets listed in the file params.pdtaxa_file
+		@param params program parameters
+		@param taxa_set (OUT) corresponding set of taxa
+		@param pd_more (OUT) more computed PD measures will be stored here
+	*/
+	void computePD(Params &params, vector<PDTaxaSet> &taxa_set, PDRelatedMeasures &pd_more);
+
+	/**
+		collect the leaves whose names are listed in taxa_name
+		@param taxa_name set of taxon names to look for
+		@param taxa_set (OUT) the matching leaves are appended here
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+	*/
+	void makeTaxaSet(set<string> &taxa_name, PDTaxaSet &taxa_set, Node *node = NULL, Node *dad = NULL);
+
+	/**
+		compute the PD score of the taxa in id_set; the score is accumulated in id_set.weight
+		@param id_set (IN/OUT) corresponding set of taxa
+		@param curlen current length so far
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+		@return TRUE if the below subtree contains taxon in id_set
+	*/
+	bool calcPD(Split &id_set, double curlen = 0.0, Node *node = NULL, Node *dad = NULL);
+
+	/**
+		compute the EXCLUSIVE PD score of a given taxa set; the score is stored in id_set.weight
+		@param id_set (IN/OUT) corresponding set of taxa IDs
+	*/
+	void calcExclusivePD(Split &id_set);
+
+	/**
+		compute the area's PD ENDEMISM of set of area
+		@param area_set set of area
+		@param pd_endem (OUT) corresponding PD endemism
+	*/
+	void calcPDEndemism(vector<PDTaxaSet> &area_set, DoubleVector &pd_endem);
+
+	/**
+		compute the area's PD complementarity given a specific area
+		@param area_set set of area
+		@param area_name given area names as string separated by commas
+		@param pd_comp (OUT) corresponding PD complementarity
+	*/
+	void calcPDComplementarity(vector<PDTaxaSet> &area_set, char *area_name, DoubleVector &pd_comp);
+
+
+/********************************************************
+	VARIABLES
+********************************************************/
+
+	/**
+		initial set of taxa which must be included into the final PD set
+	*/
+	NodeVector initialset;
+
+protected:
+
+
+};
+
+
+#endif
diff --git a/pdtreeset.cpp b/pdtreeset.cpp
new file mode 100644
index 0000000..dd7d305
--- /dev/null
+++ b/pdtreeset.cpp
@@ -0,0 +1,135 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "pdtreeset.h"
+
+PDTreeSet::PDTreeSet()
+ : MTreeSet()
+{ // nothing to set up beyond the MTreeSet base
+}
+
+
+PDTreeSet::PDTreeSet(Params &params) {
+	init(params); // all set-up work happens in init(Params&)
+}
+
+void PDTreeSet::init(Params &params) {
+	MTreeSet::init(params.user_file, params.is_rooted, params.tree_burnin, params.tree_max_count);
+
+	// rootedness is taken from the first tree of the set
+	const bool rooted_trees = isRootedTrees();
+	if (rooted_trees) {
+		// both requested sizes are raised by one for rooted trees
+		params.sub_size++;
+		params.min_size++;
+		// rooted trees and a user-specified root are reported as a conflict
+		if (params.root != NULL)
+			outError(ERR_CONFLICT_ROOT);
+	}
+
+	if (getNTaxa() < params.sub_size) {
+		// the message subtracts params.is_rooted again from both numbers
+		ostringstream msg;
+		msg << "Subset size k = " << params.sub_size - params.is_rooted;
+		msg << " is greater than the number of taxa = " << getNTaxa() - params.is_rooted;
+		outError(msg.str());
+	}
+
+	// rooted trees: the node named ROOT_NAME is added to the initial set of every tree
+	if (rooted_trees)
+		readRootNode((char*)ROOT_NAME);
+	// read the parameter file
+	if (params.param_file != NULL)
+		readParams(params);
+	// identify the root
+	if (params.root != NULL)
+		readRootNode(params.root);
+	// read the initial set of taxa, incorporate info into the split system
+	if (params.initial_file != NULL)
+		readInitialSet(params);
+}
+
+bool PDTreeSet::isRootedTrees() {
+	assert(size() > 0); // the answer is read from the first tree, so there must be one
+	return front()->rooted;
+}
+
+int PDTreeSet::getNTaxa() {
+	assert(size() > 0); // the count is read from the first tree, so there must be one
+	return front()->leafNum; // leaf count of the first tree only; other trees are not inspected here
+}
+
+void PDTreeSet::readRootNode(const char *root_name) {
+	// remember the name for the whole set, then let every tree resolve it to its own node
+	init_taxa.push_back(string(root_name));
+	for (iterator tree = begin(); tree != end(); ++tree)
+		((PDTree*)(*tree))->readRootNode(root_name);
+}
+
+void PDTreeSet::readParams(Params &params) {
+	int ntaxa = getNTaxa() - params.is_rooted;
+
+	// the weight file is read once; its content is then applied to every tree of the set
+	double scale;
+	StrVector tax_name;
+	DoubleVector ori_weight;
+	readWeightFile(params, ntaxa, scale, tax_name, ori_weight);
+
+	for (iterator tree = begin(); tree != end(); tree++) {
+		PDTree *pd_tree = (PDTree*)(*tree);
+
+		// re-index the weights by this tree's leaf ids; ids that are never named keep weight 0
+		LeafMapName lsn;
+		pd_tree->buildLeafMapName(lsn);
+		DoubleVector tax_weight;
+		tax_weight.resize(ntaxa, 0);
+		for (size_t pos = 0; pos < tax_name.size(); pos++) {
+			LeafMapName::iterator leaf = lsn.find(tax_name[pos]);
+			if (leaf == lsn.end())
+				outError(ERR_NO_TAXON, tax_name[pos]);
+			tax_weight[leaf->second->id] = ori_weight[pos];
+		}
+
+		// incorporate them into the tree
+		pd_tree->incoporateParams(scale, tax_weight);
+	}
+}
+
+/**
+	read the initial set of taxa to be included into PD-tree; every name must be a leaf of every tree
+*/
+void PDTreeSet::readInitialSet(Params &params) {
+	int ntaxa = getNTaxa() - params.is_rooted;
+	StrVector tax_name;
+	readInitTaxaFile(params, ntaxa, tax_name);
+	// keep the names for the set as a whole ...
+	init_taxa.insert(init_taxa.end(), tax_name.begin(), tax_name.end());
+	// ... and resolve them to nodes separately in each tree
+	for (iterator tree = begin(); tree != end(); tree++) {
+		PDTree *pd_tree = (PDTree*)(*tree);
+		LeafMapName lsn;
+		pd_tree->buildLeafMapName(lsn);
+		for (size_t pos = 0; pos < tax_name.size(); pos++) {
+			LeafMapName::iterator leaf = lsn.find(tax_name[pos]);
+			if (leaf == lsn.end())
+				outError(ERR_NO_TAXON, tax_name[pos]);
+			pd_tree->initialset.push_back(leaf->second);
+		}
+	}
+}
diff --git a/pdtreeset.h b/pdtreeset.h
new file mode 100644
index 0000000..c720616
--- /dev/null
+++ b/pdtreeset.h
@@ -0,0 +1,94 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef PDTREESET_H
+#define PDTREESET_H
+
+#include "mtreeset.h"
+#include "pdtree.h"
+
+/**
+Vector of PDTree
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class PDTreeSet : public MTreeSet
+{
+public:
+    PDTreeSet();
+
+	/**
+		constructor, read trees from user file
+		@param params program parameters
+	*/
+	PDTreeSet(Params &params);
+
+	/**
+		@return a newly allocated, default-constructed PDTree
+	*/
+	virtual MTree *newTree() { return  (new PDTree()); }
+
+	/**
+		@return true if trees are rooted (taken from the first tree of the set)
+	*/
+	bool isRootedTrees();
+
+	/**
+		@return number of taxa (taken from the first tree of the set)
+	*/
+	int getNTaxa();
+
+
+/********************************************************
+	INITIALIZATION
+********************************************************/
+	/**
+		initialize the tree set from program parameters
+		@param params program parameters
+	*/
+	void init(Params &params);
+
+	/**
+		read the parameters from the file and apply them to every tree of the set
+		@param params program parameters
+	*/
+	void readParams(Params &params);
+
+	/**
+		Identify the root node if specified, include it into the initial set of every tree
+		@param root_name name of the root node
+	*/
+	void readRootNode(const char *root_name);
+
+	/**
+		read the initial set of taxa to be included into PD-tree
+		@param params program parameters
+	*/
+	void readInitialSet(Params &params);
+
+protected:
+	
+	/**
+		name of initial taxa, to be included into PD set
+	*/
+	StrVector init_taxa;
+
+};
+
+#endif
diff --git a/phyloanalysis.cpp b/phyloanalysis.cpp
new file mode 100644
index 0000000..e92b5ba
--- /dev/null
+++ b/phyloanalysis.cpp
@@ -0,0 +1,2760 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <iqtree_config.h>
+#include "phylotree.h"
+#include "phylosupertree.h"
+#include "phylosupertreeplen.h"
+#include "phyloanalysis.h"
+#include "alignment.h"
+#include "superalignment.h"
+#include "iqtree.h"
+#include "model/modelgtr.h"
+#include "model/modeldna.h"
+#include "myreader.h"
+#include "model/rateheterogeneity.h"
+#include "model/rategamma.h"
+#include "model/rateinvar.h"
+#include "model/rategammainvar.h"
+//#include "modeltest_wrapper.h"
+#include "model/modelprotein.h"
+#include "model/modelbin.h"
+#include "model/modelcodon.h"
+#include "stoprule.h"
+
+#include "mtreeset.h"
+#include "mexttree.h"
+#include "model/ratemeyerhaeseler.h"
+#include "whtest_wrapper.h"
+#include "model/partitionmodel.h"
+#include "model/modelmixture.h"
+#include "guidedbootstrap.h"
+#include "model/modelset.h"
+#include "timeutil.h"
+#include "upperbounds.h"
+
+
+/**
+	Write the "how to cite" paragraphs of the report.
+	The IQ-TREE reference is always printed; the UFBoot reference is added
+	only when ultrafast bootstrap replicates were requested.
+	@param params program parameters (only gbo_replicates is read)
+	@param out output stream of the report file
+	@param original_model model string given by the user (not used here)
+*/
+void reportReferences(Params &params, ofstream &out, string &original_model) {
+	out << "To cite IQ-TREE please use:" << endl << endl;
+	out << "Lam-Tung Nguyen, Heiko A. Schmidt, Arndt von Haeseler, and Bui Quang Minh (2015)" << endl;
+	out << "IQ-TREE: A fast and effective stochastic algorithm for estimating" << endl;
+	out << "maximum likelihood phylogenies. Mol. Biol. Evol., 32:268-274." << endl << endl;
+
+	if (!params.gbo_replicates)
+		return;
+
+	out << "Since you also used ultrafast bootstrap (UFBoot) please cite: " << endl << endl;
+	out << "Bui Quang Minh, Minh Anh Thi Nguyen, and Arndt von Haeseler (2013) Ultrafast" << endl;
+	out << "approximation for phylogenetic bootstrap. Mol. Biol. Evol., 30:1188-1195." << endl << endl;
+}
+
+/**
+	Write a short summary of a single alignment: number of sequences,
+	number and kind of sites, number of constant sites and of site patterns.
+	@param out output stream of the report file
+	@param alignment alignment to summarize
+	@param nremoved_seqs number of sequences removed beforehand; it is added
+		back to the sequence count that is printed
+*/
+void reportAlignment(ofstream &out, Alignment &alignment, int nremoved_seqs) {
+	// human-readable name of the sequence type; anything that is not
+	// binary/DNA/protein/codon is reported as morphological
+	const char *site_kind;
+	switch (alignment.seq_type) {
+	case SEQ_BINARY:
+		site_kind = "binary";
+		break;
+	case SEQ_DNA:
+		site_kind = "nucleotide";
+		break;
+	case SEQ_PROTEIN:
+		site_kind = "amino-acid";
+		break;
+	case SEQ_CODON:
+		site_kind = "codon";
+		break;
+	default:
+		site_kind = "morphological";
+		break;
+	}
+
+	out << "Input data: " << alignment.getNSeq()+nremoved_seqs << " sequences with "
+		<< alignment.getNSite() << " " << site_kind << " sites" << endl;
+	out << "Number of constant sites: "
+		<< round(alignment.frac_const_sites * alignment.getNSite())
+		<< " (= " << alignment.frac_const_sites * 100 << "% of all sites)" << endl;
+	out << "Number of site patterns: " << alignment.size() << endl << endl;
+}
+
+/**
+	Restrict and reorder the model-selection records so that they follow the
+	partitions of the super tree: for every partition (in partition order)
+	all records whose set_name equals the partition name are kept, in their
+	original relative order. Records matching no partition are dropped.
+	@param model_info (IN/OUT) model-selection records, replaced by the pruned list
+	@param tree super tree whose part_info supplies the partition names
+*/
+void pruneModelInfo(vector<ModelInfo> &model_info, PhyloSuperTree *tree) {
+	vector<ModelInfo> kept;
+	for (size_t part = 0; part < tree->part_info.size(); part++) {
+		for (size_t rec = 0; rec < model_info.size(); rec++) {
+			if (model_info[rec].set_name == tree->part_info[part].name)
+				kept.push_back(model_info[rec]);
+		}
+	}
+	model_info = kept;
+}
+
+/**
+	Write the model-selection section of the report: the best-fit model(s),
+	a table of log-likelihoods and AIC/AICc/BIC scores with their weights,
+	and a legend.
+	@param out output stream of the report file
+	@param params program parameters (model_test_criterion selects the label)
+	@param model_info model-selection records; must be non-empty when
+		is_partitioned is false because the first record is printed
+	@param is_partitioned true to print one line per partition (first record
+		of each run of equal set_name), false to print every record
+*/
+void reportModelSelection(ofstream &out, Params &params, vector<ModelInfo> &model_info, bool is_partitioned) {
+	// label of the criterion used for model selection
+	const char *criterion = "AICc";
+	if (params.model_test_criterion == MTC_BIC)
+		criterion = "BIC";
+	else if (params.model_test_criterion == MTC_AIC)
+		criterion = "AIC";
+
+	out << "Best-fit model according to " << criterion << ": ";
+	if (!is_partitioned) {
+		out << model_info[0].name;
+		out << endl << endl << "List of models sorted by " << criterion
+			<< " scores: " << endl << endl;
+	} else {
+		// comma-separated "model:partition" list, one entry per run of equal set_name
+		string last_set = "";
+		for (size_t idx = 0; idx < model_info.size(); idx++) {
+			const ModelInfo &info = model_info[idx];
+			if (info.set_name == last_set)
+				continue;
+			if (last_set != "")
+				out << ",";
+			out << info.name << ":" << info.set_name;
+			last_set = info.set_name;
+		}
+		out << endl << endl << "List of best-fit models per partition:" << endl << endl;
+		out << "  ID  ";
+	}
+	out << "Model             LogL          AIC      w-AIC      AICc     w-AICc       BIC      w-BIC" << endl;
+
+	int setid = 1;
+	for (size_t idx = 0; idx < model_info.size(); idx++) {
+		const ModelInfo &cur = model_info[idx];
+		// records without a score are not listed (and do not advance the ID)
+		if (cur.AIC_score == DBL_MAX)
+			continue;
+		const bool has_prev = (idx > 0);
+		const bool same_set = has_prev && cur.set_name == model_info[idx-1].set_name;
+		if (has_prev && !same_set)
+			setid++;
+		// partitioned output shows only the first record of each set
+		if (is_partitioned && same_set)
+			continue;
+
+		if (is_partitioned) {
+			out.width(4);
+			out << right << setid << "  ";
+		}
+		// width() applies to the next inserted item only, hence one call per column
+		out.width(15);
+		out << left << cur.name << " ";
+		out.width(11);
+		out << right << cur.logl << " ";
+		out.width(11);
+		out << cur.AIC_score << (cur.AIC_conf ? " + " : " - ") << cur.AIC_weight << " ";
+		out.width(11);
+		out << cur.AICc_score << (cur.AICc_conf ? " + " : " - ") << cur.AICc_weight << " ";
+		out.width(11);
+		out << cur.BIC_score << (cur.BIC_conf ? " + " : " - ") << cur.BIC_weight;
+		out << endl;
+	}
+	out << endl;
+
+	// legend
+	out << "AIC, w-AIC   : Akaike information criterion scores and weights." << endl;
+	out << "AICc, w-AICc : Corrected AIC scores and weights." << endl;
+	out << "BIC, w-BIC   : Bayesian information criterion scores and weights." << endl << endl;
+	out << "Plus signs denote the 95% confidence sets." << endl;
+	out << "Minus signs denote significant exclusion." <<endl;
+	out << endl;
+}
+
+/**
+	Write the parameters of one substitution model.
+	For models with at most 4 states the rate parameters R are listed first
+	and the full Q matrix last; in between, the origin of the state
+	frequencies and (unless user-defined or equal) their values are printed.
+	For site-specific models only a note about the frequencies is printed.
+	Side effect: for models with at most 4 states the 'fixed' float format
+	flag of the stream is cleared.
+	@param out output stream of the report file
+	@param aln alignment, used to convert state indices back to characters;
+		must have the same number of states as the model
+	@param m substitution model to report
+*/
+void reportModel(ofstream &out, Alignment *aln, ModelSubst *m) {
+	assert(aln->num_states == m->num_states);
+	const int nstates = m->num_states;
+	const bool small_alphabet = (nstates <= 4);
+
+	if (small_alphabet) {
+		out << "Rate parameter R:" << endl << endl;
+
+		double *rates = new double[nstates * nstates];
+		// a site-specific model is a set of models; report the first one
+		if (m->isSiteSpecificModel())
+			((ModelSet*)m)->front()->getRateMatrix(rates);
+		else
+			m->getRateMatrix(rates);
+
+		int pos = 0; // running index into rates
+		if (m->isReversible()) {
+			// one rate per unordered pair of distinct states
+			for (int from = 0; from < nstates - 1; from++) {
+				for (int to = from + 1; to < nstates; to++) {
+					out << "  " << aln->convertStateBackStr(from) << "-" << aln->convertStateBackStr(to) << ": "
+						<< rates[pos];
+					out << endl;
+					pos++;
+				}
+			}
+		} else {
+			// non-reversible model: one rate per ordered pair of distinct states
+			for (int from = 0; from < nstates; from++) {
+				for (int to = 0; to < nstates; to++) {
+					if (from == to)
+						continue;
+					out << "  " << aln->convertStateBackStr(from) << "-" << aln->convertStateBackStr(to)
+						<< ": " << rates[pos];
+					out << endl;
+					pos++;
+				}
+			}
+		}
+
+		out << endl;
+		out.unsetf(ios_base::fixed);
+		delete[] rates;
+	}
+
+	out << "State frequencies: ";
+	if (m->isSiteSpecificModel()) {
+		out << "(site specific frequencies)" << endl << endl;
+		return;
+	}
+
+	// where the frequencies come from
+	if (!m->isReversible()) {
+		out << "(inferred from Q matrix)" << endl;
+	} else {
+		const char *freq_origin = NULL;
+		switch (m->getFreqType()) {
+		case FREQ_EMPIRICAL:
+			freq_origin = "(empirical counts from alignment)";
+			break;
+		case FREQ_ESTIMATE:
+			freq_origin = "(estimated with maximum likelihood)";
+			break;
+		case FREQ_USER_DEFINED:
+			freq_origin = (aln->seq_type == SEQ_PROTEIN) ? "(model)" : "(user-defined)";
+			break;
+		case FREQ_EQUAL:
+			freq_origin = "(equal frequencies)";
+			break;
+		default:
+			break;
+		}
+		if (freq_origin)
+			out << freq_origin << endl;
+	}
+	out << endl;
+
+	// the values themselves are omitted for user-defined and equal frequencies
+	if (!(m->getFreqType() == FREQ_USER_DEFINED || m->getFreqType() == FREQ_EQUAL)) {
+		double *freqs = new double[nstates];
+		m->getStateFrequency(freqs);
+		// codon frequencies go four per line, everything else one per line
+		const int per_line = (aln->seq_type == SEQ_CODON) ? 4 : 1;
+		for (int state = 0; state < nstates; state++) {
+			out << "  pi(" << aln->convertStateBackStr(state) << ") = " << freqs[state];
+			if (state % per_line == per_line-1)
+				out << endl;
+		}
+		delete[] freqs;
+		out << endl;
+	}
+
+	if (small_alphabet) {
+		// report Q matrix, one row per state
+		double *q_mat = new double[nstates * nstates];
+		m->getQMatrix(q_mat);
+
+		out << "Rate matrix Q:" << endl << endl;
+		int cell = 0;
+		for (int row = 0; row < nstates; row++) {
+			out << "  " << aln->convertStateBackStr(row);
+			for (int col = 0; col < nstates; col++) {
+				out << "  ";
+				out.width(8);
+				out << q_mat[cell];
+				cell++;
+			}
+			out << endl;
+		}
+		out << endl;
+		delete[] q_mat;
+	}
+}
+
+/**
+	Write the substitution-model part of the report for a tree.
+	For a mixture model a table with one row per component (number, name,
+	rate, weight, parameters) is printed; otherwise the single model is
+	reported via reportModel(out, aln, model).
+	@param out output stream of the report file
+	@param tree tree whose model is reported
+*/
+void reportModel(ofstream &out, PhyloTree &tree) {
+	if (!tree.getModel()->isMixture()) {
+		out << "Model of substitution: " << tree.getModelName() << endl << endl;
+		reportModel(out, tree.aln, tree.getModel());
+		return;
+	}
+
+	out << "Mixture model of substitution: " << tree.params->model_name << endl;
+	out << "Full name: " << tree.getModelName() << endl;
+	ModelMixture *mixture = (ModelMixture*) tree.getModel();
+	out << endl << "  No  Component      Rate    Weight   Parameters" << endl;
+
+	int comp = 0; // index of the current component, parallel to the iterator
+	ModelMixture::iterator cur = mixture->begin();
+	while (cur != mixture->end()) {
+		out.width(4);
+		out << right << comp+1 << "  ";
+		out.width(12);
+		out << left << (*cur)->name << "  ";
+		out.width(7);
+		out << (*cur)->total_num_subst << "  ";
+		out.width(7);
+		out << mixture->prop[comp] << "  " << (*cur)->getNameParams() << endl;
+		cur++;
+		comp++;
+	}
+	out << endl;
+}
+
+/**
+	Write the rate-heterogeneity part of the report for a tree: the name of
+	the rate model, whatever the model prints via writeInfo(), and - if there
+	is more than one discrete rate category or a positive proportion of
+	invariable sites - a table of categories with relative rate and proportion.
+	@param out output stream of the report file
+	@param tree tree whose rate model is reported
+*/
+void reportRate(ofstream &out, PhyloTree &tree) {
+	int i;
+	RateHeterogeneity *rate_model = tree.getRate();
+	out << "Model of rate heterogeneity: " << rate_model->full_name << endl;
+	rate_model->writeInfo(out);
+
+	if (rate_model->getNDiscreteRate() > 1 || rate_model->getPInvar() > 0.0) {
+		out << endl << " Category  Relative_rate  Proportion" << endl;
+		// invariable sites are shown as category 0 with rate 0
+		if (rate_model->getPInvar() > 0.0)
+			out << "  0         0              " << rate_model->getPInvar()
+					<< endl;
+		int cats = rate_model->getNDiscreteRate();
+		DoubleVector prop;
+		if (rate_model->getGammaShape() > 0 || rate_model->getPtnCat(0) < 0) {
+			// proportions are provided by the rate model itself
+			prop.resize(cats);
+			for (i = 0; i < cats; i++)
+				prop[i] = rate_model->getProp(i);
+		} else {
+			// proportions are derived from the per-pattern category assignment:
+			// sum of pattern frequencies in a category divided by the number of sites
+			prop.resize(cats, 0.0);
+			for (i = 0; i < tree.aln->getNPattern(); i++)
+				prop[rate_model->getPtnCat(i)] += tree.aln->at(i).frequency;
+			for (i = 0; i < cats; i++)
+				prop[i] /= tree.aln->getNSite();
+		}
+		for (i = 0; i < cats; i++) {
+			out << "  " << i + 1 << "         ";
+			out.width(14);
+			out << left << rate_model->getRate(i) << " " << prop[i];
+			out << endl;
+		}
+		if (rate_model->isGammaRate()) {
+			// dynamic_cast yields NULL if the object is not a RateGamma;
+			// check before dereferencing instead of crashing
+			RateGamma *gamma_model = dynamic_cast<RateGamma*>(rate_model);
+			if (gamma_model) {
+				out << "Relative rates are computed as " << (gamma_model->isCutMedian() ? "MEDIAN" : "MEAN") <<
+					" of the portion of the Gamma distribution falling in the category." << endl;
+			}
+		}
+	}
+	out << endl;
+}
+
+/**
+	Write the tree section of the report: log-likelihood and information
+	criteria, an optional small-sample warning, tree length statistics,
+	warnings about near-zero internal and too long branches, several notes,
+	an ASCII drawing of the tree and the tree in newick format.
+	Side effects: sets the 'fixed' float format on the stream and calls
+	tree.sortTaxa().
+	@param out output stream of the report file
+	@param params program parameters
+	@param tree tree to report
+	@param tree_lh log-likelihood of the tree
+	@param lh_variance variance of the log-likelihood (its square root is
+		printed as standard error)
+	@param main_tree non-zero for the main tree: enables the K>=n warning and
+		the legend of the branch support values
+*/
+void reportTree(ofstream &out, Params &params, PhyloTree &tree, double tree_lh, double lh_variance, double main_tree) {
+	// branches shorter than one substitution per alignment are "near-zero"
+	double epsilon = 1.0 / tree.getAlnNSite();
+	double totalLen = tree.treeLength();
+	int df = tree.getModelFactory()->getNParameters();
+	int ssize = tree.getAlnNSite();
+	double AIC_score, AICc_score, BIC_score;
+	computeInformationScores(tree_lh, df, ssize, AIC_score, AICc_score, BIC_score);
+
+	out << "Log-likelihood of the tree: " << fixed << tree_lh << " (s.e. "
+			<< sqrt(lh_variance) << ")" << endl;
+	out << "Unconstrained log-likelihood (without tree): " << tree.aln->computeUnconstrainedLogL() << endl;
+
+	out << "Number of free parameters (#branches + #model parameters): " << df << endl;
+	out << "Akaike information criterion (AIC) score: " << AIC_score << endl;
+	out << "Corrected Akaike information criterion (AICc) score: " << AICc_score << endl;
+	out << "Bayesian information criterion (BIC) score: " << BIC_score << endl;
+
+	// more parameters than sites: warn, for the main tree only
+	if (ssize <= df && main_tree) {
+		out << endl
+			<< "**************************** WARNING ****************************" << endl
+			<< "Number of parameters (K): " << df << endl
+			<< "Sample size (n):          " << ssize << endl << endl
+			<< "Given that K>=n, the model parameters are not identifiable." << endl
+			<< "The program will still try to estimate the parameter values," << endl
+			<< "but because of the small sample size, the parameter estimates" << endl
+			<< "are likely to be inaccurate." << endl << endl
+
+			<< "Phylogenetic estimates obtained under these conditions should be" << endl
+			<< "interpreted with extreme caution." << endl << endl
+
+			<< "Ideally, it is desirable that n >> K. When selecting optimal" << endl
+			<< "models," << endl
+			<< "1. use AIC or BIC if n > 40K;" << endl
+			<< "2. use AICc or BIC if 40K >= n > K;" << endl
+			<< "3. be extremely cautious if n <= K (because model parameters" << endl
+			<< "   are not identifiable)." << endl << endl
+
+			<< "To improve the situation (3), consider the following options:" << endl
+			<< "  1. Increase the sample size (n)" << endl
+			<< "  2. Decrease the number of parameters (K) to be estimated. If" << endl
+			<< "     possible:" << endl
+			<< "     a. Remove the least important sequences from the alignment" << endl
+			<< "     b. Specify some of the parameter values for the substitution"<< endl
+			<< "        model (e.g., the nucleotide or amino acid frequencies)" << endl
+			<< "     c. Specify some of the parameter values for the rates-across-" << endl
+			<< "        sites model (e.g., the shape parameter for the discrete" << endl
+			<< "        Gamma distribution, the proportion of invariable sites, or" << endl
+			<< "        the rates of change for different rate categories under" << endl
+			<< "        the FreeRate model)" << endl << endl
+			<< "Reference:" << endl
+			// first author is Kenneth P. Burnham (was misprinted as "KR")
+			<< "Burnham KP, Anderson DR (2002). Model Selection and Multimodel" << endl
+			<< "Inference: A Practical Information-Theoretic Approach. Springer," << endl
+			<< "New York." << endl
+			<< "************************ END OF WARNING ***********************" << endl;
+	}
+	out << endl;
+
+	out << "Total tree length (sum of branch lengths): " << totalLen << endl;
+	double totalLenInternal = tree.treeLengthInternal(epsilon);
+	out << "Sum of internal branch lengths: " << totalLenInternal << " (" << totalLenInternal*100.0 / totalLen << "% of tree length)" << endl;
+	out << endl;
+
+	int zero_internal_branches = tree.countZeroInternalBranches(NULL, NULL, epsilon);
+	if (zero_internal_branches > 0) {
+		out << "WARNING: " << zero_internal_branches
+				<< " near-zero internal branches (<" << epsilon << ") should be treated with caution"
+				<< endl;
+		out << "         Such branches are denoted by '**' in the figure below"
+				<< endl << endl;
+	}
+	int long_branches = tree.countLongBranches(NULL, NULL, MAX_BRANCH_LEN-0.2);
+	if (long_branches > 0) {
+		out << "WARNING: " << long_branches << " too long branches (>"
+			<< MAX_BRANCH_LEN-0.2 << ") should be treated with caution!" << endl;
+	}
+
+	tree.sortTaxa();
+	out << "NOTE: Tree is UNROOTED although outgroup taxon '" << tree.root->name << "' is drawn at root" << endl;
+
+	if (tree.isSuperTree() && params.partition_type == 0)
+		out << "NOTE: Branch lengths are weighted average over all partitions" << endl
+			<< "      (weighted by the number of sites in the partitions)" << endl;
+
+	// a super tree counts as codon data only if every partition is codon data
+	bool is_codon = tree.aln->seq_type == SEQ_CODON;
+	if (tree.isSuperTree()) {
+		PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
+		is_codon = true;
+		for (PhyloSuperTree::iterator sit = stree->begin(); sit != stree->end(); sit++)
+			if ((*sit)->aln->seq_type != SEQ_CODON) {
+				is_codon = false;
+				break;
+			}
+	}
+	if (is_codon)
+		out << endl << "NOTE: Branch lengths are interpreted as number of nucleotide substitutions per codon site!"
+			<< endl << "      Rescale them by 1/3 if you want to have #nt substitutions per nt site" << endl;
+
+	const bool std_boot = (params.num_bootstrap_samples && params.compute_ml_tree);
+	if (main_tree && (params.aLRT_replicates > 0 || params.gbo_replicates || std_boot)) {
+		// legend of the support values; entries are joined by " / ".
+		// The separator depends on whether ANY entry was already printed,
+		// not only on SH-aLRT being present.
+		bool printed_any = false;
+		out << "Numbers in parentheses are ";
+		if (params.aLRT_replicates > 0) {
+			out << "SH-aLRT supports";
+			if (params.localbp_replicates)
+				out << " / local bootstrap (LBP)";
+			printed_any = true;
+		}
+		if (std_boot) {
+			if (printed_any)
+				out << " / ";
+			out << "standard bootstrap supports";
+			printed_any = true;
+		}
+		if (params.gbo_replicates) {
+			if (printed_any)
+				out << " / ";
+			out << "ultrafast bootstrap supports";
+		}
+		out << " (%)" << endl;
+	}
+	out << endl;
+
+	tree.drawTree(out, WT_BR_SCALE, epsilon);
+
+	out << "Tree in newick format:" << endl << endl;
+
+	tree.printTree(out, WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA);
+
+	out << endl << endl;
+}
+
+/**
+	Write the CREDITS section of the report, i.e. the references of the
+	third-party packages and libraries whose code is used.
+	@param out output stream of the report file
+*/
+void reportCredits(ofstream &out) {
+	out << "CREDITS" << endl << "-------" << endl << endl;
+	out << "Some parts of the code were taken from the following packages/libraries:"
+		<< endl << endl;
+
+	// TREE-PUZZLE
+	out << "Schmidt HA, Strimmer K, Vingron M, and von Haeseler A (2002)" << endl;
+	out << "TREE-PUZZLE: maximum likelihood phylogenetic analysis using quartets" << endl;
+	out << "and parallel computing. Bioinformatics, 18(3):502-504." << endl << endl;
+
+	// BIONJ
+	out << "Gascuel O (1997) BIONJ: an improved version of the NJ algorithm" << endl;
+	out << "based on a simple model of sequence data. Mol. Bio. Evol., 14:685-695." << endl << endl;
+
+	// Nexus Class Library
+	out << "Paul O. Lewis (2003) NCL: a C++ class library for interpreting data files in" << endl;
+	out << "NEXUS format. Bioinformatics, 19(17):2330-2331." << endl << endl;
+
+	// SPRNG
+	out << "Mascagni M and Srinivasan A (2000) Algorithm 806: SPRNG: A Scalable Library" << endl;
+	out << "for Pseudorandom Number Generation. ACM Transactions on Mathematical Software," << endl;
+	out << "26: 436-461." << endl << endl;
+
+	// Eigen
+	out << "Guennebaud G, Jacob B, et al. (2010) Eigen v3. http://eigen.tuxfamily.org" << endl << endl;
+}
+
+/***********************************************************
+ * CREATE REPORT FILE
+ ***********************************************************/
+// Map defined in another translation unit. reportPhyloAnalysis() below reads
+// it when params.count_trees is set: its size is reported as the number of
+// distinct trees evaluated and the mapped values as occurrence counts.
+extern StringIntMap pllTreeCounter;
+
+// Forward declarations; the definitions are not in this part of the file.
+// NOTE(review): judging by the names these deal with the Gamma+Invar rate
+// model - confirm against the definitions.
+void exhaustiveSearchGAMMAInvar(Params &params, IQTree &iqtree);
+
+void searchGAMMAInvarByRestarting(IQTree &iqtree);
+
+void computeLoglFromUserInputGAMMAInvar(Params &params, IQTree &iqtree);
+
+void reportPhyloAnalysis(Params &params, string &original_model,
+		IQTree &tree, vector<ModelInfo> &model_info) {
+	if (params.count_trees) {
+		// addon: print #distinct trees
+		cout << endl << "NOTE: " << pllTreeCounter.size() << " distinct trees evaluated during whole tree search" << endl;
+
+		IntVector counts;
+		for (StringIntMap::iterator i = pllTreeCounter.begin(); i != pllTreeCounter.end(); i++) {
+			if (i->second > counts.size())
+				counts.resize(i->second+1, 0);
+			counts[i->second]++;
+		}
+		for (IntVector::iterator i2 = counts.begin(); i2 != counts.end(); i2++) {
+		    if (*i2 != 0) {
+	            cout << "#Trees occuring " << (i2-counts.begin()) << " times: " << *i2 << endl;
+		    }
+		}
+	}
+	string outfile = params.out_prefix;
+
+	outfile += ".iqtree";
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(outfile.c_str());
+		out << "IQ-TREE " << iqtree_VERSION_MAJOR << "." << iqtree_VERSION_MINOR
+				<< "." << iqtree_VERSION_PATCH << " built " << __DATE__ << endl
+				<< endl;
+		if (params.partition_file)
+			out << "Partition file name: " << params.partition_file << endl;
+		if (params.aln_file)
+			out << "Input file name: " << params.aln_file << endl;
+
+		if (params.user_file)
+			out << "User tree file name: " << params.user_file << endl;
+		out << "Type of analysis: ";
+        if (original_model.find("TEST") != string::npos && original_model.find("ONLY") != string::npos) {
+            out << "model selection";
+        } else {
+            if (params.compute_ml_tree)
+                out << "tree reconstruction";
+            if (params.num_bootstrap_samples > 0) {
+                if (params.compute_ml_tree)
+                    out << " + ";
+                out << "non-parametric bootstrap (" << params.num_bootstrap_samples
+                        << " replicates)";
+            }
+            if (params.gbo_replicates > 0) {
+                out << " + ultrafast bootstrap (" << params.gbo_replicates << " replicates)";
+            }
+        }
+		out << endl;
+		out << "Random seed number: " << params.ran_seed << endl << endl;
+		out << "REFERENCES" << endl << "----------" << endl << endl;
+		reportReferences(params, out, original_model);
+
+		out << "SEQUENCE ALIGNMENT" << endl << "------------------" << endl
+				<< endl;
+		if (tree.isSuperTree()) {
+			out << "Input data: " << tree.aln->getNSeq()+tree.removed_seqs.size() << " taxa with "
+					<< tree.aln->getNSite() << " partitions and "
+					<< tree.getAlnNSite() << " total sites ("
+					<< ((SuperAlignment*)tree.aln)->computeMissingData()*100 << "% missing data)" << endl << endl;
+
+			PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
+			int namelen = stree->getMaxPartNameLength();
+			int part;
+			out.width(max(namelen+6,10));
+			out << left << "  ID  Name" << "  Type  #Seqs  #Sites  #Patterns  #Const_Sites" << endl;
+			//out << string(namelen+54, '-') << endl;
+			part = 0;
+			for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++, part++) {
+				//out << "FOR PARTITION " << stree->part_info[part].name << ":" << endl << endl;
+				//reportAlignment(out, *((*it)->aln));
+				out.width(4);
+				out << right << part+1 << "  ";
+				out.width(max(namelen,4));
+				out << left << stree->part_info[part].name << "  ";
+				out.width(6);
+				switch ((*it)->aln->seq_type) {
+				case SEQ_BINARY: out << "BIN"; break;
+				case SEQ_CODON: out << "CODON"; break;
+				case SEQ_DNA: out << "DNA"; break;
+				case SEQ_MORPH: out << "MORPH"; break;
+				case SEQ_MULTISTATE: out << "TINA"; break;
+				case SEQ_PROTEIN: out << "AA"; break;
+				case SEQ_UNKNOWN: out << "???"; break;
+				}
+				out.width(5);
+				out << right << (*it)->aln->getNSeq() << "  ";
+				out.width(6);
+				out << (*it)->aln->getNSite() << "  ";
+				out.width(6);
+				out << (*it)->aln->getNPattern() << "      ";
+				out << round((*it)->aln->frac_const_sites*100) << "%" << endl;
+			}
+			out << endl;
+		} else
+			reportAlignment(out, *(tree.aln), tree.removed_seqs.size());
+
+		out.precision(4);
+		out << fixed;
+
+		if (!model_info.empty()) {
+			out << "MODEL SELECTION" << endl << "---------------" << endl << endl;
+			if (tree.isSuperTree())
+				pruneModelInfo(model_info, (PhyloSuperTree*)&tree);
+			reportModelSelection(out, params, model_info, tree.isSuperTree());
+		}
+
+		out << "SUBSTITUTION PROCESS" << endl << "--------------------" << endl
+				<< endl;
+		if (tree.isSuperTree()) {
+			if(params.partition_type)
+				out	<< "Proportional partition model with joint branch lengths and separate models between partitions" << endl << endl;
+			else
+				out	<< "Full partition model with separate branch lengths and models between partitions" << endl << endl;
+			PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
+			PhyloSuperTree::iterator it;
+			int part;
+			if(params.partition_type)
+				out << "  ID  Model           Speed  Parameters" << endl;
+			else
+				out << "  ID  Model         TreeLen  Parameters" << endl;
+			//out << "-------------------------------------" << endl;
+			for (it = stree->begin(), part = 0; it != stree->end(); it++, part++) {
+				out.width(4);
+				out << right << (part+1) << "  ";
+				out.width(14);
+				if(params.partition_type)
+					out << left << (*it)->getModelName() << " " << stree->part_info[part].part_rate  << "  " << (*it)->getModelNameParams() << endl;
+				else
+					out << left << (*it)->getModelName() << " " << (*it)->treeLength() << "  " << (*it)->getModelNameParams() << endl;
+			}
+			out << endl;
+			/*
+			for (it = stree->begin(), part = 0; it != stree->end(); it++, part++) {
+				reportModel(out, *(*it));
+				reportRate(out, *(*it));
+			}*/
+		} else {
+			reportModel(out, tree);
+			reportRate(out, tree);
+		}
+
+		/*
+		out << "RATE HETEROGENEITY" << endl << "------------------" << endl
+				<< endl;
+		if (tree.isSuperTree()) {
+			PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
+			int part = 0;
+			for (PhyloSuperTree::iterator it = stree->begin();
+					it != stree->end(); it++, part++) {
+				out << "FOR PARTITION " << stree->part_info[part].name << ":"
+						<< endl << endl;
+				reportRate(out, *(*it));
+			}
+		} else
+			reportRate(out, tree);
+		*/
+		// Bootstrap analysis:
+		//Display as outgroup: a
+
+		if (original_model == "WHTEST") {
+			out << "TEST OF MODEL HOMOGENEITY" << endl
+					<< "-------------------------" << endl << endl;
+			out << "Delta of input data:                 "
+					<< params.whtest_delta << endl;
+			out << ".95 quantile of Delta distribution:  "
+					<< params.whtest_delta_quantile << endl;
+			out << "Number of simulations performed:     "
+					<< params.whtest_simulations << endl;
+			out << "P-value:                             "
+					<< params.whtest_p_value << endl;
+			if (params.whtest_p_value < 0.05) {
+				out
+						<< "RESULT: Homogeneity assumption is rejected (p-value cutoff 0.05)"
+						<< endl;
+			} else {
+				out
+						<< "RESULT: Homogeneity assumption is NOT rejected (p-value cutoff 0.05)"
+						<< endl;
+			}
+			out << endl << "*** For this result please cite:" << endl << endl;
+			out
+					<< "G. Weiss and A. von Haeseler (2003) Testing substitution models"
+					<< endl
+					<< "within a phylogenetic tree. Mol. Biol. Evol, 20(4):572-578"
+					<< endl << endl;
+		}
+/*
+		out << "TREE SEARCH" << endl << "-----------" << endl << endl
+				<< "Stopping rule: "
+				<< ((params.stop_condition == SC_STOP_PREDICT) ? "Yes" : "No")
+				<< endl << "Number of iterations: "
+				<< tree.stop_rule.getNumIterations() << endl
+				<< "Probability of deleting sequences: " << params.p_delete
+				<< endl << "Number of representative leaves: "
+				<< params.k_representative << endl
+				<< "NNI log-likelihood cutoff: " << tree.getNNICutoff() << endl
+				<< endl;
+*/
+		if (params.compute_ml_tree) {
+			if (original_model.find("ONLY") != string::npos)
+				out << "TREE USED FOR MODEL SELECTION" << endl
+					<< "-----------------------------" << endl << endl;
+			else
+				out << "MAXIMUM LIKELIHOOD TREE" << endl
+					<< "-----------------------" << endl << endl;
+
+			tree.setRootNode(params.root);
+            
+            if (params.gbo_replicates) {
+                if (tree.boot_consense_logl > tree.candidateTrees.getBestScore()) {
+                    out << endl << "**NOTE**: Consensus tree has higher likelihood than ML tree found! Please use consensus tree below." << endl;
+                }
+            }
+
+			reportTree(out, params, tree, tree.candidateTrees.getBestScore(), tree.logl_variance, true);
+
+			if (tree.isSuperTree() && verbose_mode >= VB_MED) {
+				PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
+//				stree->mapTrees();
+//				int empty_branches = stree->countEmptyBranches();
+//				if (empty_branches) {
+//					stringstream ss;
+//					ss << empty_branches << " branches in the overall tree with no phylogenetic information due to missing data!";
+//					outWarning(ss.str());
+//				}
+				
+				int part = 0;
+				for (PhyloSuperTree::iterator it = stree->begin();
+						it != stree->end(); it++, part++) {
+					out << "FOR PARTITION " << stree->part_info[part].name
+							<< ":" << endl << endl;
+					string root_name;
+					if (params.root)
+						root_name = params.root;
+					else
+						root_name = (*it)->aln->getSeqName(0);
+					(*it)->root = (*it)->findNodeName(root_name);
+					assert((*it)->root);
+					reportTree(out, params, *(*it), (*it)->computeLikelihood(), (*it)->computeLogLVariance(), false);
+				}
+			}
+
+		}
+		/*
+		 if (params.write_intermediate_trees) {
+		 out << endl << "CONSENSUS OF INTERMEDIATE TREES" << endl << "-----------------------" << endl << endl
+		 << "Number of intermediate trees: " << tree.stop_rule.getNumIterations() << endl
+		 << "Split threshold: " << params.split_threshold << endl
+		 << "Burn-in: " << params.tree_burnin << endl << endl;
+		 }*/
+
+		if (params.consensus_type == CT_CONSENSUS_TREE) {
+			out << "CONSENSUS TREE" << endl << "--------------" << endl << endl;
+			out << "Consensus tree is constructed from "
+					<< (params.num_bootstrap_samples ? params.num_bootstrap_samples : params.gbo_replicates)
+					<< " bootstrap trees";
+            if (params.gbo_replicates) {
+                out << endl << "Log-likelihood of consensus tree: " << tree.boot_consense_logl;
+            }
+			string con_file = params.out_prefix;
+			con_file += ".contree";
+
+            IntVector rfdist;
+            tree.computeRFDist(con_file.c_str(), rfdist);
+            out << endl << "Robinson-Foulds distance between ML tree and consensus tree: " << rfdist[0] << endl;
+            
+            out << endl << "Branches with bootstrap support >"
+					<< floor(params.split_threshold * 1000) / 10 << "% are kept";
+			if (params.split_threshold == 0.0)
+				out << " (extended consensus)";
+			if (params.split_threshold == 0.5)
+				out << " (majority-rule consensus)";
+			if (params.split_threshold >= 0.99)
+				out << " (strict consensus)";
+
+			out << endl << "Branch lengths are optimized by maximum likelihood on original alignment" << endl;
+			out << "Numbers in parentheses are bootstrap supports (%)" << endl << endl;
+
+			bool rooted = false;
+			MTree contree;
+			contree.readTree(con_file.c_str(), rooted);
+			contree.drawTree(out, WT_BR_SCALE);
+			out << endl << "Consensus tree in newick format: " << endl << endl;
+			contree.printTree(out);
+			out << endl << endl;
+//			tree.freeNode();
+//			tree.root = NULL;
+//			tree.readTree(con_file.c_str(), rooted);
+//			if (removed_seqs.size() > 0) {
+//				tree.reinsertIdenticalSeqs(tree.aln, removed_seqs, twin_seqs);
+//			}
+//			tree.setAlignment(tree.aln);
+
+			// bug fix
+//			if ((tree.sse == LK_EIGEN || tree.sse == LK_EIGEN_SSE) && !tree.isBifurcating()) {
+//				cout << "NOTE: Changing to old kernel as consensus tree is multifurcating" << endl;
+//				tree.changeLikelihoodKernel(LK_SSE);
+//			}
+
+//			tree.initializeAllPartialLh();
+//			tree.fixNegativeBranch(false);
+//			if (tree.isSuperTree())
+//				((PhyloSuperTree*) &tree)->mapTrees();
+//			tree.optimizeAllBranches();
+//			tree.printTree(con_file.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA);
+//			tree.sortTaxa();
+//			tree.drawTree(out, WT_BR_SCALE);
+//			out << endl << "Consensus tree in newick format: " << endl << endl;
+//			tree.printResultTree(out);
+//			out << endl << endl;
+		}
+
+
+		/* evaluate user trees */
+		vector<TreeInfo> info;
+		IntVector distinct_trees;
+		if (params.treeset_file) {
+			evaluateTrees(params, &tree, info, distinct_trees);
+			out.precision(4);
+
+			out << endl << "USER TREES" << endl << "----------" << endl << endl;
+			out << "See " << params.treeset_file << ".trees for trees with branch lengths." << endl << endl;
+			if (params.topotest_replicates && info.size() > 1) {
+				if (params.do_weighted_test) {
+					out << "Tree      logL    deltaL  bp-RELL    p-KH     p-SH    p-WKH    p-WSH    c-ELW" << endl;
+					out << "-------------------------------------------------------------------------------" << endl;
+				} else {
+					out << "Tree      logL    deltaL  bp-RELL    p-KH     p-SH    c-ELW" << endl;
+					out << "-------------------------------------------------------------" << endl;
+
+				}
+			} else {
+				out << "Tree      logL    deltaL" << endl;
+				out << "-------------------------" << endl;
+
+			}
+			double maxL = -DBL_MAX;
+			int tid, orig_id;
+			for (tid = 0; tid < info.size(); tid++)
+				if (info[tid].logl > maxL) maxL = info[tid].logl;
+			for (orig_id = 0, tid = 0; orig_id < distinct_trees.size(); orig_id++) {
+				out.width(3);
+				out << right << orig_id+1 << " ";
+				if (distinct_trees[orig_id] >= 0) {
+					out << " = tree " << distinct_trees[orig_id]+1 << endl;
+					continue;
+				}
+				out.precision(3);
+				out.width(12);
+				out << info[tid].logl << " ";
+				out.width(7);
+				out << maxL - info[tid].logl;
+				if (!params.topotest_replicates || info.size() <= 1) {
+					out << endl;
+					tid++;
+					continue;
+				}
+				out.precision(4);
+				out << "  ";
+				out.width(6);
+				out << info[tid].rell_bp;
+				if (info[tid].rell_confident)
+					out << " + ";
+				else
+					out << " - ";
+				out.width(6);
+				out << right << info[tid].kh_pvalue;
+				if (info[tid].kh_pvalue < 0.05)
+					out << " - ";
+				else
+					out << " + ";
+				out.width(6);
+				out << right << info[tid].sh_pvalue;
+				if (info[tid].sh_pvalue < 0.05)
+					out << " - ";
+				else
+					out << " + ";
+				if (params.do_weighted_test) {
+					out.width(6);
+					out << right << info[tid].wkh_pvalue;
+					if (info[tid].wkh_pvalue < 0.05)
+						out << " - ";
+					else
+						out << " + ";
+					out.width(6);
+					out << right << info[tid].wsh_pvalue;
+					if (info[tid].wsh_pvalue < 0.05)
+						out << " - ";
+					else
+						out << " + ";
+				}
+				out.width(6);
+				out << info[tid].elw_value;
+				if (info[tid].elw_confident)
+					out << " +";
+				else
+					out << " -";
+				out << endl;
+				tid++;
+			}
+			out << endl;
+
+			if (params.topotest_replicates) {
+				out <<  "deltaL  : logL difference from the maximal logl in the set." << endl
+					 << "bp-RELL : bootstrap proportion using RELL method (Kishino et al. 1990)." << endl
+					 << "p-KH    : p-value of one sided Kishino-Hasegawa test (1989)." << endl
+					 << "p-SH    : p-value of Shimodaira-Hasegawa test (2000)." << endl;
+				if (params.do_weighted_test) {
+					out << "p-WKH   : p-value of weighted KH test." << endl
+					 << "p-WSH   : p-value of weighted SH test." << endl;
+				}
+				out	 << "c-ELW   : Expected Likelihood Weight (Strimmer & Rambaut 2002)." << endl << endl
+					 << "Plus signs denote the 95% confidence sets." << endl
+					 << "Minus signs denote significant exclusion."  << endl
+					 << "All tests performed "
+					 << params.topotest_replicates << " resamplings using the RELL method."<<endl;
+			}
+			out << endl;
+		}
+
+
+		time_t cur_time;
+		time(&cur_time);
+
+		char *date_str;
+		date_str = ctime(&cur_time);
+		out.unsetf(ios_base::fixed);
+		out << "TIME STAMP" << endl << "----------" << endl << endl
+				<< "Date and time: " << date_str << "Total CPU time used: "
+				<< (double) params.run_time << " seconds (" << convert_time(params.run_time) << ")" << endl
+				<< "Total wall-clock time used: " << getRealTime() - params.start_real_time
+				<< " seconds (" << convert_time(getRealTime() - params.start_real_time) << ")" << endl << endl;
+
+		//reportCredits(out); // not needed, now in the manual
+		out.close();
+
+	} catch (ios::failure) {
+		outError(ERR_WRITE_OUTPUT, outfile);
+	}
+
+	cout << endl << "Analysis results written to: " << endl
+			<< "  IQ-TREE report:                " << params.out_prefix << ".iqtree"
+			<< endl;
+	if (params.compute_ml_tree) {
+		if (original_model.find("ONLY") == string::npos)
+			cout << "  Maximum-likelihood tree:       " << params.out_prefix << ".treefile" << endl;
+		else
+			cout << "  Tree used for model selection: " << params.out_prefix << ".treefile" << endl;
+		if (params.snni && params.write_local_optimal_trees) {
+			cout << "  Locally optimal trees (" << tree.candidateTrees.getNumLocalOptTrees() << "):    " << params.out_prefix << ".suboptimal_trees" << endl;
+		}
+	}
+	if (!params.user_file && params.start_tree == STT_BIONJ) {
+		cout << "  BIONJ tree:                    " << params.out_prefix << ".bionj"
+				<< endl;
+	}
+	if (!params.dist_file) {
+		//cout << "  Juke-Cantor distances:    " << params.out_prefix << ".jcdist" << endl;
+		if (params.compute_ml_dist)
+		cout << "  Likelihood distances:          " << params.out_prefix
+					<< ".mldist" << endl;
+		if (params.print_conaln)
+		cout << "  Concatenated alignment:        " << params.out_prefix
+					<< ".conaln" << endl;
+	}
+	if (original_model.find("TEST") != string::npos && tree.isSuperTree()) {
+		cout << "  Best partitioning scheme:      " << params.out_prefix << ".best_scheme.nex" << endl;
+		bool raxml_format_printed = true;
+
+		for (vector<PartitionInfo>::iterator it = ((PhyloSuperTree*)&tree)->part_info.begin();
+				it != ((PhyloSuperTree*)&tree)->part_info.end(); it++)
+			if (!it->aln_file.empty()) {
+				raxml_format_printed = false;
+				break;
+			}
+		if (raxml_format_printed)
+			 cout << "           in RAxML format:      " << params.out_prefix << ".best_scheme" << endl;
+	}
+	if (tree.getRate()->getGammaShape() > 0 && params.print_site_rate)
+		cout << "  Gamma-distributed rates:       " << params.out_prefix << ".rate"
+				<< endl;
+
+	if ((tree.getRate()->isSiteSpecificRate() || tree.getRate()->getPtnCat(0) >= 0) && params.print_site_rate)
+		cout << "  Site-rates by MH model:        " << params.out_prefix << ".rate"
+				<< endl;
+
+	if (params.print_site_lh)
+		cout << "  Site log-likelihoods:          " << params.out_prefix << ".sitelh"
+				<< endl;
+
+	if (params.write_intermediate_trees)
+		cout << "  All intermediate trees:        " << params.out_prefix << ".treels"
+				<< endl;
+
+	if (params.gbo_replicates) {
+		cout << endl << "Ultrafast bootstrap approximation results written to:" << endl
+			 << "  Split support values:          " << params.out_prefix << ".splits.nex" << endl
+			 << "  Consensus tree:                " << params.out_prefix << ".contree" << endl;
+		if (params.print_ufboot_trees)
+		cout << "  UFBoot trees:                  " << params.out_prefix << ".ufboot" << endl;
+
+	}
+
+	if (params.treeset_file) {
+		cout << "  Evaluated user trees:          " << params.out_prefix << ".trees" << endl;
+
+		if (params.print_tree_lh) {
+		cout << "  Tree log-likelihoods:          " << params.out_prefix << ".treelh" << endl;
+		}
+		if (params.print_site_lh) {
+		cout << "  Site log-likelihoods:          " << params.out_prefix << ".sitelh" << endl;
+		}
+	}
+	cout << "  Screen log file:               " << params.out_prefix << ".log" << endl;
+	/*	if (original_model == "WHTEST")
+	 cout <<"  WH-TEST report:           " << params.out_prefix << ".whtest" << endl;*/
+	cout << endl;
+
+}
+
+/**
+ * Warn about sequences that are at (near) zero distance from each other.
+ *
+ * For every sequence i not already reported, all later sequences j with
+ * dist[i * ntaxa + j] <= 1e-6 are collected and a single warning naming
+ * i and all such j is emitted via outWarning(). Sequences listed in a
+ * warning are not used again as the start of another group.
+ *
+ * @param aln  alignment supplying the number of sequences and their names
+ * @param dist distance matrix, read as dist[i * ntaxa + j]
+ */
+void checkZeroDist(Alignment *aln, double *dist) {
+	const int ntaxa = aln->getNSeq();
+	IntVector checked;
+	checked.resize(ntaxa, 0);
+	for (int i = 0; i < ntaxa - 1; i++) {
+		if (checked[i])
+			continue;
+		checked[i] = 1;
+		// Widen before multiplying: i * ntaxa overflows int for large matrices.
+		const double *row = dist + (size_t) i * (size_t) ntaxa;
+		string str;
+		for (int j = i + 1; j < ntaxa; j++) {
+			if (row[j] <= 1e-6) {
+				if (str.empty())
+					str = "ZERO distance between sequences "
+							+ aln->getSeqName(i);
+				str += ", " + aln->getSeqName(j);
+				checked[j] = 1;
+			}
+		}
+		if (!str.empty())
+			outWarning(str);
+	}
+}
+
+
+/**
+ * Print a summary of the analysis settings to standard output: model name
+ * and state-frequency type, number of free parameters, tree search and
+ * termination settings, and likelihood kernel options.
+ *
+ * @param model_df number of free model parameters to report
+ * @param iqtree   tree whose model and search settings are printed
+ * @param params   program parameters
+ */
+void printAnalysisInfo(int model_df, IQTree& iqtree, Params& params) {
+	cout << "Model of evolution: ";
+	if (!iqtree.isSuperTree()) {
+		cout << iqtree.getModelName() << " with ";
+		// Description of the frequency type; stays NULL for unknown types,
+		// which are reported through outError().
+		const char *freq_desc = NULL;
+		switch (iqtree.getModel()->getFreqType()) {
+		case FREQ_EQUAL:
+			freq_desc = "equal";
+			break;
+		case FREQ_EMPIRICAL:
+			freq_desc = "counted";
+			break;
+		case FREQ_USER_DEFINED:
+			freq_desc = "user-defined";
+			break;
+		case FREQ_ESTIMATE:
+			freq_desc = "optimized";
+			break;
+		case FREQ_CODON_1x4:
+			freq_desc = "counted 1x4";
+			break;
+		case FREQ_CODON_3x4:
+			freq_desc = "counted 3x4";
+			break;
+		case FREQ_CODON_3x4C:
+			freq_desc = "counted 3x4-corrected";
+			break;
+		default:
+			outError("Wrong specified state frequencies");
+		}
+		if (freq_desc)
+			cout << freq_desc;
+		cout << " frequencies (" << model_df << " free parameters)" << endl;
+	} else {
+		cout << iqtree.getModelName() << " (" << model_df << " free parameters)" << endl;
+	}
+
+	cout << "Fixed branch lengths: ";
+	if (params.fixed_branch_length)
+		cout << "Yes";
+	else
+		cout << "No";
+	cout << endl;
+
+	if (params.min_iterations > 0) {
+		cout << "Tree search algorithm: ";
+		if (params.snni)
+			cout << "Stochastic nearest neighbor interchange";
+		else
+			cout << "IQPNNI";
+		cout << endl;
+
+		cout << "Termination condition: ";
+		if (params.stop_condition == SC_REAL_TIME) {
+			cout << "after " << params.maxtime << " minutes" << endl;
+		} else if (params.stop_condition == SC_UNSUCCESS_ITERATION) {
+			cout << "after " << params.unsuccess_iteration << " unsuccessful iterations" << endl;
+		} else if (params.stop_condition == SC_FIXED_ITERATION) {
+			cout << params.min_iterations << " iterations" << endl;
+		} else if (params.stop_condition == SC_WEIBULL) {
+			cout << "predicted in [" << params.min_iterations << ",";
+			cout << params.max_iterations << "] (confidence ";
+			cout << params.stop_confidence << ")" << endl;
+		} else if (params.stop_condition == SC_BOOTSTRAP_CORRELATION) {
+			cout << "min " << params.min_correlation << " correlation coefficient" << endl;
+		}
+
+		if (!params.snni) {
+			cout << "Number of representative leaves  : " << params.k_representative << endl;
+			cout << "Probability of deleting sequences: " << iqtree.getProbDelete() << endl;
+			cout << "Number of leaves to be deleted   : " << iqtree.getDelete() << endl;
+			const char *quartet_basis = "Bootstrap";
+			if (params.iqp_assess_quartet == IQP_DISTANCE)
+				quartet_basis = "Distance";
+			else if (params.iqp_assess_quartet == IQP_PARSIMONY)
+				quartet_basis = "Parsimony";
+			cout << "Important quartets assessed on: " << quartet_basis << endl;
+		}
+		cout << "NNI assessed on: " << ((params.nni5) ? "5 branches" : "1 branch") << endl;
+	}
+
+	cout << "Phylogenetic likelihood library: " << (params.pll ? "Yes" : "No") << endl;
+	cout << "Branch length optimization method: ";
+	if (iqtree.optimize_by_newton)
+		cout << "Newton";
+	else
+		cout << "Brent";
+	cout << endl;
+	cout << "Number of Newton-Raphson steps in NNI evaluation and branch length optimization: "
+			<< NNI_MAX_NR_STEP << " / " << PLL_NEWZPERCYCLE << endl;
+	cout << "SSE instructions: " << ((iqtree.sse) ? "Yes" : "No") << endl;
+	cout << endl;
+}
+
+/**
+ * Compute pairwise ML distances (and their variances) with the current
+ * model and copy them into iqtree.dist_matrix / iqtree.var_matrix,
+ * allocating those matrices first if they are still NULL.
+ *
+ * A warning is issued when the longest distance exceeds 99% of
+ * MAX_GENETIC_DIST; the matrices are copied in either case.
+ *
+ * @param params     program parameters, passed through to computeDist()
+ * @param iqtree     tree providing the alignment and receiving the matrices
+ * @param dist_file  distance file name, passed through to computeDist()
+ * @param begin_time CPU time reference used for the elapsed-time message
+ */
+void computeMLDist(Params& params, IQTree& iqtree, string &dist_file, double begin_time) {
+	stringstream best_tree_string;
+	iqtree.printTree(best_tree_string, WT_BR_LEN + WT_TAXON_ID);
+	cout << "Computing ML distances based on estimated model parameters...";
+	double *ml_dist = NULL;
+	double *ml_var = NULL;
+	double longest_dist = iqtree.computeDist(params, iqtree.aln, ml_dist, ml_var, dist_file);
+	cout << " " << (getCPUTime() - begin_time) << " sec" << endl;
+	if (longest_dist > MAX_GENETIC_DIST * 0.99) {
+		outWarning("Some pairwise ML distances are too long (saturated)");
+	}
+
+	// Number of matrix cells, computed in size_t: the product of two ints
+	// overflows for alignments with more than ~46000 sequences.
+	const size_t nseq = iqtree.aln->getNSeq();
+	const size_t ncell = nseq * nseq;
+
+	if (!iqtree.dist_matrix) {
+		iqtree.dist_matrix = new double[ncell];
+	}
+	if (!iqtree.var_matrix) {
+		iqtree.var_matrix = new double[ncell];
+	}
+	memmove(iqtree.dist_matrix, ml_dist, sizeof(double) * ncell);
+	memmove(iqtree.var_matrix, ml_var, sizeof(double) * ncell);
+
+	delete[] ml_dist;
+	delete[] ml_var;
+}
+
+/**
+ * Announce how the initial distance matrix is obtained and, when JC
+ * distances, observed distances or a partition file are requested,
+ * compute it into iqtree.dist_matrix / iqtree.var_matrix.
+ *
+ * After computing, zero distances are reported through checkZeroDist() and
+ * a warning is issued if the longest distance exceeds 99% of
+ * MAX_GENETIC_DIST.
+ *
+ * @param params    program parameters
+ * @param iqtree    tree providing the alignment and receiving the matrices
+ * @param dist_file distance file name, passed through to computeDist()
+ */
+void computeInitialDist(Params &params, IQTree &iqtree, string &dist_file) {
+	// Progress message: only the first matching source is announced.
+	if (params.dist_file)
+		cout << "Reading distance matrix file " << params.dist_file << " ..." << endl;
+	else if (params.compute_jc_dist)
+		cout << "Computing Juke-Cantor distances..." << endl;
+	else if (params.compute_obs_dist)
+		cout << "Computing observed distances..." << endl;
+
+	const bool must_compute = params.compute_jc_dist || params.compute_obs_dist || params.partition_file;
+	if (!must_compute)
+		return;
+
+	double longest_dist = iqtree.computeDist(params, iqtree.aln, iqtree.dist_matrix, iqtree.var_matrix, dist_file);
+	checkZeroDist(iqtree.aln, iqtree.dist_matrix);
+	if (longest_dist > MAX_GENETIC_DIST * 0.99)
+		outWarning("Some pairwise distances are too long (saturated)");
+}
+
+/**
+ * Resolve the substitution model name before the tree search starts.
+ *
+ * If params.model_name begins with "TEST", testModel() is run and its
+ * return value replaces params.model_name; per-model results are written
+ * to "<out_prefix>.model". "WHTEST" is accepted for DNA data only and is
+ * replaced by "GTR+G". With bootstrap replicates (gbo_replicates) set,
+ * speed_conf is forced to 1.0.
+ *
+ * @param params     program parameters; model_name and timing fields may be updated
+ * @param iqtree     tree (or super tree) the model is selected for
+ * @param model_info model results; reused from the model file or cleared and refilled
+ */
+void initializeParams(Params &params, IQTree &iqtree, vector<ModelInfo> &model_info) {
+//    iqtree.setCurScore(-DBL_MAX);
+    // Remember whether the model name contains "ONLY" before it is overwritten below.
+    bool test_only = params.model_name.find("ONLY") != string::npos;
+    /* initialize substitution model */
+    if (params.model_name.substr(0, 4) == "TEST") {
+        if (iqtree.isSuperTree())
+            ((PhyloSuperTree*) &iqtree)->mapTrees();
+        // Captured before model selection; written back into params afterwards.
+        double start_cpu_time = getCPUTime();
+        double start_real_time = getRealTime();
+        ofstream fmodel;
+        string fmodel_str = ((string)params.out_prefix + ".model"); 
+
+        // The existing model file is only consulted when site likelihoods are
+        // not requested and re-testing was not asked for.
+        bool ok_model_file = false;
+        if (!params.print_site_lh && !params.model_test_again) {
+            ok_model_file = checkModelFile(fmodel_str, iqtree.isSuperTree(), model_info);
+        }
+
+        // An accepted file that yielded no entries is treated as unusable.
+        ok_model_file &= model_info.size() > 0;
+        if (ok_model_file) {
+            cout << "Reusing information from model file " << fmodel_str << endl;
+            fmodel.open(fmodel_str.c_str(), ios::app);
+            if (!fmodel.is_open())
+                outError("cannot append to file ", fmodel_str);            
+        } else {
+            // Start a fresh model file and write its tab-separated header.
+            fmodel.open(fmodel_str.c_str());
+            if (!fmodel.is_open())
+                outError("cannot write to file ", fmodel_str);
+            // print header
+            SeqType seq_type = iqtree.aln->seq_type;
+            if (iqtree.isSuperTree()) {
+                fmodel << "Charset\t";
+                // For a super tree the header columns follow the first partition's sequence type.
+                seq_type = ((PhyloSuperTree*)&iqtree)->front()->aln->seq_type;
+            }
+            fmodel << "Model\tdf\tLnL\tTreeLen";
+            if (seq_type == SEQ_BINARY)
+                fmodel << "\t0\t1";
+            else if (seq_type == SEQ_DNA)
+                fmodel << "\tA-C\tA-G\tA-T\tC-G\tC-T\tG-T\tA\tC\tG\tT";
+            fmodel << "\talpha\tpinv\tTree" << endl;
+            model_info.clear();
+        }
+        fmodel.precision(4);
+        fmodel << fixed;
+
+        params.model_name = testModel(params, &iqtree, model_info, fmodel, "", true);
+        fmodel.close();
+        // Set the run's start times to the values taken before model selection.
+        params.startCPUTime = start_cpu_time;
+        params.start_real_time = start_real_time;
+        cout << "CPU time for model selection: " << getCPUTime() - start_cpu_time << " seconds." << endl;
+//        alignment = iqtree.aln;
+        if (test_only) {
+            // Model name contained "ONLY": zero the iteration count.
+            params.min_iterations = 0;
+        }
+    }
+
+    if (params.model_name == "WHTEST") {
+        if (iqtree.aln->seq_type != SEQ_DNA)
+            outError("Weiss & von Haeseler test of model homogeneity only works for DNA");
+        params.model_name = "GTR+G";
+    }
+
+    assert(iqtree.aln);
+    if (params.gbo_replicates)
+        params.speed_conf = 1.0;
+
+    if (iqtree.isSuperTree())
+        ((PhyloSuperTree*) &iqtree)->mapTrees();
+
+    // set parameter for the current tree
+//    iqtree.setParams(params);
+}
+
+
+/**
+ * Optionally test all branches and collapse stable clades, then re-optimize
+ * the tree if any taxa were pruned.
+ *
+ * The branch test runs only when params.aLRT_threshold <= 100 and at least
+ * one of aLRT_replicates / localbp_replicates is positive.
+ *
+ * @param params      program parameters (thresholds, replicate counts, root)
+ * @param iqtree      tree to test, prune and re-optimize
+ * @param pattern_lh  buffer handed to computePatternLikelihood() and testAllBranches()
+ * @param pruned_taxa filled by collapseStableClade()
+ * @param linked_name filled by collapseStableClade()
+ */
+void pruneTaxa(Params &params, IQTree &iqtree, double *pattern_lh, NodeVector &pruned_taxa, StrVector &linked_name) {
+	const bool run_branch_test = params.aLRT_threshold <= 100
+			&& (params.aLRT_replicates > 0 || params.localbp_replicates > 0);
+
+	if (run_branch_test) {
+		double test_start = getCPUTime();
+		cout << "Testing tree branches by SH-like aLRT with " << params.aLRT_replicates << " replicates..." << endl;
+		iqtree.setRootNode(params.root);
+		double curScore = iqtree.getCurScore();
+		iqtree.computePatternLikelihood(pattern_lh, &curScore);
+		int num_low_support = iqtree.testAllBranches(params.aLRT_threshold, curScore,
+				pattern_lh, params.aLRT_replicates, params.localbp_replicates);
+		iqtree.printResultTree();
+		cout << "  " << getCPUTime() - test_start << " sec." << endl;
+		cout << num_low_support << " branches show low support values (<= " << params.aLRT_threshold << "%)" << endl;
+
+		cout << "Collapsing stable clades..." << endl;
+		iqtree.collapseStableClade(params.aLRT_threshold, pruned_taxa, linked_name, iqtree.dist_matrix);
+		cout << pruned_taxa.size() << " taxa were pruned from stable clades" << endl;
+	}
+
+	if (pruned_taxa.empty())
+		return;
+
+	cout << "Pruned alignment contains " << iqtree.aln->getNSeq()
+			<< " sequences and " << iqtree.aln->getNSite() << " sites and "
+			<< iqtree.aln->getNPattern() << " patterns" << endl;
+	// Rebuild the partial likelihoods, then re-optimize branches and topology.
+	iqtree.initializeAllPartialLh();
+	iqtree.clearAllPartialLH();
+	iqtree.setCurScore(iqtree.optimizeAllBranches());
+	int nni_count, nni_steps;
+	iqtree.setCurScore(iqtree.optimizeNNI(nni_count, nni_steps));
+	cout << "Log-likelihood after optimizing partial tree: "
+			<< iqtree.getCurScore() << endl;
+}
+
+/**
+ * Undo a previous pruning: restore the stable clades, replace the tree's
+ * distance matrix with the saved one, and re-optimize the tree.
+ * Does nothing when pruned_taxa is empty.
+ *
+ * @param iqtree         tree to restore
+ * @param saved_dist_mat distance matrix installed as iqtree.dist_matrix
+ *                       (the current matrix is deleted first)
+ * @param pruned_taxa    taxa passed to restoreStableClade()
+ * @param linked_name    names passed to restoreStableClade()
+ */
+void restoreTaxa(IQTree &iqtree, double *saved_dist_mat, NodeVector &pruned_taxa, StrVector &linked_name) {
+	if (pruned_taxa.empty())
+		return;
+
+	cout << "Restoring full tree..." << endl;
+	iqtree.restoreStableClade(iqtree.aln, pruned_taxa, linked_name);
+
+	// Swap the pruned-tree distance matrix for the saved full one.
+	delete[] iqtree.dist_matrix;
+	iqtree.dist_matrix = saved_dist_mat;
+
+	iqtree.initializeAllPartialLh();
+	iqtree.clearAllPartialLH();
+	iqtree.setCurScore(iqtree.optimizeAllBranches());
+
+	int nni_count, nni_steps;
+	iqtree.setCurScore(iqtree.optimizeNNI(nni_count, nni_steps));
+	cout << "Log-likelihood	after reoptimizing full tree: " << iqtree.getCurScore() << endl;
+}
+/**
+ * Shared reporting step for runApproximateBranchLengths(): recompute the
+ * likelihood for the branch lengths just assigned, write the tree to
+ * "<out_prefix><tree_suffix>", optionally write the branch lengths to
+ * "<out_prefix><brlen_suffix>", and print the total tree length.
+ *
+ * @param params                program parameters (out_prefix, print_branch_lengths, ...)
+ * @param iqtree                tree whose branch lengths were just set
+ * @param label                 method label used in messages ("LS", "MP", "Bayesian")
+ * @param tree_suffix           file suffix for the tree file
+ * @param brlen_suffix          file suffix for the branch length file
+ * @param allow_analytic_approx apply approxAllBranches() before writing branch
+ *                              lengths when params.manuel_analytic_approx is set
+ */
+static void reportApproxBranchLengths(Params &params, IQTree &iqtree, const char *label,
+		const char *tree_suffix, const char *brlen_suffix, bool allow_analytic_approx) {
+	iqtree.clearAllPartialLH();
+	iqtree.setCurScore(iqtree.computeLikelihood());
+	string filename = params.out_prefix;
+	filename += tree_suffix;
+	iqtree.printTree(filename.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
+	cout << "Logl of tree with " << label << " branch lengths: " << iqtree.getCurScore() << endl;
+	cout << "Tree with " << label << " branch lengths written to " << filename << endl;
+	if (params.print_branch_lengths) {
+		if (allow_analytic_approx && params.manuel_analytic_approx) {
+			cout << "Applying Manuel's analytic approximation.." << endl;
+			iqtree.approxAllBranches();
+		}
+		ofstream out;
+		filename = params.out_prefix;
+		filename += brlen_suffix;
+		out.open(filename.c_str());
+		iqtree.printBranchLengths(out);
+		out.close();
+		cout << label << " Branch lengths written to " << filename << endl;
+	}
+	cout << "Total " << label << " tree length: " << iqtree.treeLength() << endl;
+}
+
+/**
+ * Assign approximate branch lengths by each method enabled in params
+ * (least squares, parsimony, Bayesian) and report the result of each.
+ *
+ * Least-squares lengths are computed only when branch lengths are not fixed
+ * and params.leastSquareBranch is set. The methods run in the order
+ * LS, MP, Bayesian, each overwriting the tree's current score.
+ *
+ * @param params program parameters selecting the methods and output prefix
+ * @param iqtree tree whose branch lengths are recomputed
+ */
+void runApproximateBranchLengths(Params &params, IQTree &iqtree) {
+	if (!params.fixed_branch_length && params.leastSquareBranch) {
+		cout << endl << "Computing Least Square branch lengths..." << endl;
+		iqtree.optimizeAllBranchesLS();
+		// Only the LS variant applies the optional analytic approximation.
+		reportApproxBranchLengths(params, iqtree, "LS", ".lstree", ".lsbrlen", true);
+	}
+
+	if (params.pars_branch_length) {
+		cout << endl << "Computing parsimony branch lengths..." << endl;
+		iqtree.fixNegativeBranch(true);
+		reportApproxBranchLengths(params, iqtree, "MP", ".mptree", ".mpbrlen", false);
+	}
+
+	if (params.bayes_branch_length) {
+		cout << endl << "Computing Bayesian branch lengths..." << endl;
+		iqtree.computeAllBayesianBranchLengths();
+		reportApproxBranchLengths(params, iqtree, "Bayesian", ".batree", ".babrlen", false);
+	}
+}
+
+/**
+ * Write the optional auxiliary output files selected in params: site
+ * log-likelihoods, mixture category assignments, branch lengths, partition
+ * descriptions, Meyer & von Haeseler site rates and per-site rates.
+ * Each section is independent and controlled by its own params flag.
+ *
+ * @param params     program parameters (flags and output prefix)
+ * @param iqtree     tree the information is taken from
+ * @param pattern_lh pattern likelihood buffer handed to printSiteLh()
+ */
+void printMiscInfo(Params &params, IQTree &iqtree, double *pattern_lh) {
+	// --- site log-likelihoods (skipped when PLL is in use) ---
+	if (params.print_site_lh && !params.pll) {
+		const string sitelh_name = string(params.out_prefix) + ".sitelh";
+		if (params.print_site_lh != 1)
+			printSiteLhCategory(sitelh_name.c_str(), &iqtree);
+		else
+			printSiteLh(sitelh_name.c_str(), &iqtree, pattern_lh);
+	}
+
+	// --- mixture categories per pattern ---
+	if (params.print_site_posterior) {
+		cout << "Computing mixture posterior probabilities" << endl;
+		IntVector pattern_cat;
+		int num_mix = iqtree.computePatternCategories(&pattern_cat);
+		cout << num_mix << " mixture components are necessary" << endl;
+
+		string mix_name = string(params.out_prefix) + ".sitemix";
+		ofstream out(mix_name.c_str());
+		if (!out.is_open())
+			outError("File " + mix_name + " could not be opened");
+		out << "Ptn\tFreq\tNumMix" << endl;
+		for (int ptn = 0; ptn < pattern_cat.size(); ptn++) {
+			out << ptn << "\t" << (int)iqtree.ptn_freq[ptn] << "\t" << pattern_cat[ptn] << endl;
+		}
+		out.close();
+		cout << "Pattern mixtures printed to " << mix_name << endl;
+
+		// Second file: one 0/1 flag per category for every pattern.
+		mix_name = string(params.out_prefix) + ".sitemixall";
+		out.open(mix_name.c_str());
+		int ncat = iqtree.getRate()->getNRate();
+		if (iqtree.getModel()->isMixture() && !iqtree.getModelFactory()->fused_mix_rate)
+			ncat = iqtree.getModel()->getNMixtures();
+		out << "Ptn\tFreq\tNumMix\tCat" << endl;
+
+		for (int ptn = 0; ptn < iqtree.ptn_cat_mask.size(); ptn++) {
+			int num_cat = popcount_lauradoux((unsigned*)&iqtree.ptn_cat_mask[ptn], 2);
+			out << ptn << "\t" << (int)iqtree.ptn_freq[ptn] << "\t" << num_cat << "\t";
+			for (int c = 0; c < ncat; c++) {
+				const bool cat_used = iqtree.ptn_cat_mask[ptn] & ((uint64_t)1<<c);
+				out << (cat_used ? "1" : "0");
+			}
+			out << endl;
+		}
+		out.close();
+	}
+
+	// --- branch lengths ---
+	if (params.print_branch_lengths) {
+		if (params.manuel_analytic_approx) {
+			cout << "Applying Manuel's analytic approximation.." << endl;
+			iqtree.approxAllBranches();
+		}
+		const string brlen_name = string(params.out_prefix) + ".brlen";
+		ofstream brlen_out;
+		brlen_out.open(brlen_name.c_str());
+		iqtree.printBranchLengths(brlen_out);
+		brlen_out.close();
+		cout << "Branch lengths written to " << brlen_name << endl;
+	}
+
+	// --- partition descriptions (super trees only) ---
+	if (params.print_partition_info && iqtree.isSuperTree()) {
+		PhyloSuperTree *super_tree = (PhyloSuperTree*)(&iqtree);
+		const string nexus_name = string(params.out_prefix) + ".partinfo.nex";
+		super_tree->printPartition(nexus_name.c_str());
+		const string raxml_name = string(params.out_prefix) + ".partitions";
+		super_tree->printPartitionRaxml(raxml_name.c_str());
+	}
+
+	// --- site-specific rates by the Meyer & von Haeseler procedure ---
+	if (params.mvh_site_rate) {
+		RateMeyerHaeseler *rate_mvh = new RateMeyerHaeseler(params.rate_file,
+				&iqtree, params.rate_mh_type);
+		cout << endl << "Computing site-specific rates by "
+				<< rate_mvh->full_name << "..." << endl;
+		rate_mvh->runIterativeProc(params, iqtree);
+		cout << endl << "BEST SCORE FOUND : " << iqtree.candidateTrees.getBestScore()<< endl;
+		const string mhrate_name = string(params.out_prefix) + ".mhrate";
+		iqtree.getRate()->writeSiteRates(mhrate_name.c_str());
+
+		if (params.print_site_lh) {
+			const string mhsitelh_name = string(params.out_prefix) + ".mhsitelh";
+			printSiteLh(mhsitelh_name.c_str(), &iqtree);
+		}
+	}
+
+	// --- per-site rates ---
+	if (params.print_site_rate) {
+		const string rate_name = string(params.out_prefix) + ".rate";
+		iqtree.getRate()->writeSiteRates(rate_name.c_str());
+		if (iqtree.isSuperTree()) {
+			// For a super tree the same file is reopened and rewritten
+			// with one section per partition.
+			PhyloSuperTree *super_tree = (PhyloSuperTree*) &iqtree;
+			int part = 0;
+			try {
+				ofstream rate_out;
+				rate_out.exceptions(ios::failbit | ios::badbit);
+				rate_out.open(rate_name.c_str());
+				PhyloSuperTree::iterator it = super_tree->begin();
+				while (it != super_tree->end()) {
+					rate_out << "SITE RATES FOR PARTITION " << super_tree->part_info[part].name << ":" << endl;
+					(*it)->getRate()->writeSiteRates(rate_out);
+					++it;
+					++part;
+				}
+				cout << "Site rates printed to " << rate_name << endl;
+				rate_out.close();
+			} catch (ios::failure) {
+				outError(ERR_WRITE_OUTPUT, rate_name);
+			}
+		}
+	}
+}
+
+/**
+ * Print the closing statistics of the tree search: total tree length,
+ * NNI evaluation counts per partition (super trees at maximum verbosity
+ * only), iteration count and CPU / wall-clock timings.
+ * Also stores the total CPU time in params.run_time.
+ *
+ * @param params           program parameters; run_time is updated
+ * @param iqtree           tree that was searched
+ * @param search_cpu_time  CPU seconds spent in the tree search
+ * @param search_real_time wall-clock seconds spent in the tree search
+ */
+void printFinalSearchInfo(Params &params, IQTree &iqtree, double search_cpu_time, double search_real_time) {
+	cout << "Total tree length: " << iqtree.treeLength() << endl;
+
+	if (iqtree.isSuperTree() && verbose_mode >= VB_MAX) {
+		PhyloSuperTree *super_tree = (PhyloSuperTree*) &iqtree;
+		cout << super_tree->evalNNIs << " NNIs evaluated from " << super_tree->totalNNIs
+				<< " all possible NNIs ( "
+				<< (int)(((super_tree->evalNNIs+1.0)/(super_tree->totalNNIs+1.0))*100.0)
+				<< " %)" << endl;
+		cout<<"Details for subtrees:"<<endl;
+		int part = 0;
+		while (part < super_tree->size()) {
+			cout << part+1 <<". "<<super_tree->part_info[part].name<<": "
+				<<super_tree->part_info[part].evalNNIs<<" ( "
+				<< (int)(((super_tree->part_info[part].evalNNIs+1.0)/((super_tree->totalNNIs+1.0) / super_tree->size()))*100.0)
+				<< " %)" << endl;
+			part++;
+		}
+	}
+
+	params.run_time = (getCPUTime() - params.startCPUTime);
+	cout << endl;
+	cout << "Total number of iterations: " << iqtree.stop_rule.getCurIt() << endl;
+
+	cout << "CPU time used for tree search: " << search_cpu_time;
+	cout << " sec (" << convert_time(search_cpu_time) << ")" << endl;
+
+	cout << "Wall-clock time used for tree search: " << search_real_time;
+	cout << " sec (" << convert_time(search_real_time) << ")" << endl;
+
+	cout << "Total CPU time used: " << (double) params.run_time << " sec (";
+	cout << convert_time((double) params.run_time) << ")" << endl;
+
+	// The wall clock is sampled separately for the number and the formatted text.
+	cout << "Total wall-clock time used: ";
+	cout << getRealTime() - params.start_real_time << " sec (";
+	cout << convert_time(getRealTime() - params.start_real_time) << ")" << endl;
+}
+
+/**
+ * Write the tree strings returned by candidateTrees.getTopTrees() to the
+ * file "<out_prefix><suffix>", one per line.
+ *
+ * @param iqtree tree holding the candidate set
+ * @param params program parameters supplying out_prefix
+ * @param suffix file name suffix appended to out_prefix
+ */
+void printSuboptimalTrees(IQTree& iqtree, Params& params, string suffix) {
+	vector<string> top_trees = iqtree.candidateTrees.getTopTrees();
+	const string out_name = string(params.out_prefix) + suffix;
+	ofstream treesOut(out_name.c_str(), ofstream::out);
+	for (size_t idx = 0; idx < top_trees.size(); ++idx) {
+		treesOut << top_trees[idx];
+		treesOut << endl;
+	}
+	treesOut.close();
+}
+
+/************************************************************
+ *  MAIN TREE RECONSTRUCTION
+ ***********************************************************/
+/**
+ * Run one complete tree reconstruction on iqtree and print/write the results.
+ *
+ * Stages, in the order executed below: record start times, optional PLL setup,
+ * initial distances and initial tree, parameter/model initialisation, memory
+ * check, optimisation of the initial tree, optional BIONJ tree, candidate-set
+ * initialisation, tree search (or SPR), final model optimisation, optional
+ * branch tests / bootstrap, and result output.
+ *
+ * @param params          run options; several fields are overwritten here
+ *                        (startCPUTime, start_real_time, start_tree,
+ *                        lh_mem_save, compute_ml_dist, aLRT_replicates)
+ * @param original_model  only compared against "WHTEST" in this function
+ * @param iqtree          tree object that is initialised, searched and printed
+ * @param model_info      forwarded to initializeParams()
+ *
+ * NOTE: the upper_bound, alpha_invar_file and exh_ai modes terminate the
+ * process with exit(0) from inside this function.
+ */
+void runTreeReconstruction(Params &params, string &original_model, IQTree &iqtree, vector<ModelInfo> &model_info) {
+
+    string dist_file;
+    params.startCPUTime = getCPUTime();
+    params.start_real_time = getRealTime();
+
+    // Make sure that no partial likelihood of IQ-TREE is initialized when PLL is used to save memory
+    if (params.pll) {
+        iqtree.deleteAllPartialLh();
+    }
+
+//    if (params.count_trees && pllTreeCounter == NULL)
+//    	pllTreeCounter = new StringIntMap;
+
+    // Temporary fix since PLL only supports DNA/Protein: switch to IQ-TREE parsimony kernel
+    if (params.start_tree == STT_PLL_PARSIMONY) {
+		if (iqtree.isSuperTree()) {
+			PhyloSuperTree *stree = (PhyloSuperTree*)&iqtree;
+			// one partition that is neither DNA nor protein is enough to switch;
+			// note that the supertree case falls back to STT_BIONJ, the
+			// single-alignment case below to STT_PARSIMONY
+			for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++)
+				if ((*it)->aln->seq_type != SEQ_DNA && (*it)->aln->seq_type != SEQ_PROTEIN)
+					params.start_tree = STT_BIONJ;
+		} else if (iqtree.aln->seq_type != SEQ_DNA && iqtree.aln->seq_type != SEQ_PROTEIN)
+			params.start_tree = STT_PARSIMONY;
+    }
+
+    /***************** Initialization for PLL and sNNI ******************/
+    if (params.start_tree == STT_PLL_PARSIMONY || params.pll) {
+        /* Initialized all data structure for PLL*/
+    	iqtree.initializePLL(params);
+    }
+
+
+    /********************* Compute pairwise distances *******************/
+    if (params.start_tree == STT_BIONJ || params.iqp || params.leastSquareBranch) {
+    	computeInitialDist(params, iqtree, dist_file);
+    }
+
+    /******************** Pass the parameter object params to IQTree *******************/
+    iqtree.setParams(&params);
+
+    /********************** Create an initial tree **********************/
+    iqtree.computeInitialTree(dist_file, params.SSE);
+    
+    //*** FOR TUNG: This is wrong! a NULL root was already treated correctly
+//    if (params.root == NULL) {
+//    	params.root = iqtree.aln->getSeqName(0).c_str();
+//    	iqtree.setRootNode(params.root);
+//    }
+   	iqtree.setRootNode(params.root);
+
+    /*************** SET UP PARAMETERS and model testing ****************/
+
+   	// FOR TUNG: swapping the order cause bug for -m TESTLINK
+//    iqtree.initSettings(params);
+    initializeParams(params, iqtree, model_info);
+    iqtree.initSettings(params);
+
+    /*********************** INITIAL MODEL OPTIMIZATION *****************/
+
+    iqtree.initializeModel(params);
+
+    // UpperBounds analysis. Here, to analyse the initial tree without any tree search or optimization
+    if (params.upper_bound) {
+    	iqtree.setCurScore(iqtree.computeLikelihood());
+    	cout<<iqtree.getCurScore()<<endl;
+    	UpperBounds(&params, iqtree.aln, &iqtree);
+    	exit(0);
+	}
+
+    // degree of freedom
+    cout << endl;
+    if (verbose_mode >= VB_MED) {
+    	cout << "ML-TREE SEARCH START WITH THE FOLLOWING PARAMETERS:" << endl;
+        int model_df = iqtree.getModelFactory()->getNParameters();
+    	printAnalysisInfo(model_df, iqtree, params);
+    }
+
+    // Memory check; skipped entirely when PLL is used
+    if (!params.pll) {
+        uint64_t mem_size = iqtree.getMemoryRequired();
+        uint64_t total_mem = getMemorySize();
+        if (mem_size >= total_mem) {
+            if (params.lh_mem_save == LM_DETECT) {
+                // switch to memory saving technique that reduces memory requirement to 1/3
+                params.lh_mem_save = LM_PER_NODE;
+                // re-query the requirement after changing the memory mode
+                mem_size = iqtree.getMemoryRequired();
+            }
+        }
+//#if defined __APPLE__ || defined __MACH__
+        cout << "NOTE: " << (mem_size / 1024) / 1024 << " MB RAM is required!" << endl;
+//#else
+//        cout << "NOTE: " << ((double) mem_size / 1000.0) / 1000 << " MB RAM is required!" << endl;
+//#endif
+        if (mem_size >= total_mem) {
+            outError("Memory required exceeds your computer RAM size!");
+        }
+#ifdef BINARY32
+        if (mem_size >= 2000000000) {
+            outError("Memory required exceeds 2GB limit of 32-bit executable");
+        }
+#endif
+        int max_procs = countPhysicalCPUCores();
+        // cross-multiplied form of: mem_size / num_threads > total_mem / max_procs
+        if (mem_size * max_procs > total_mem * params.num_threads) {
+            outWarning("Memory required per CPU-core (" + convertDoubleToString((double)mem_size/params.num_threads/1024/1024/1024)+
+            " GB) is higher than your computer RAM per CPU-core ("+convertIntToString(total_mem/max_procs/1024/1024/1024)+
+            " GB), thus multiple runs may exceed RAM!");
+        }
+    }
+
+    iqtree.initializeAllPartialLh();
+	// the initial optimisation uses a 10x larger epsilon unless min_iterations == 0
+	double initEpsilon = params.min_iterations == 0 ? params.modeps : (params.modeps*10);
+	string initTree;
+
+	// special modes that only apply when the rate model name contains "+I+G"
+	if (iqtree.getRate()->name.find("+I+G") != string::npos) {
+		if (params.alpha_invar_file != NULL) { // COMPUTE TREE LIKELIHOOD BASED ON THE INPUT ALPHA AND P_INVAR VALUE
+			computeLoglFromUserInputGAMMAInvar(params, iqtree);
+			exit(0);
+		}
+
+		if (params.exh_ai) {
+			exhaustiveSearchGAMMAInvar(params, iqtree);
+			exit(0);
+		}
+
+		if (params.testAlpha) { // DO RESTART ON ALPHA AND P_INVAR
+			// NOTE(review): stime/etime are taken from getRealTime(), yet the
+			// message printed below labels the difference as "CPU seconds"
+			double stime = getRealTime();
+			searchGAMMAInvarByRestarting(iqtree);
+			double etime = getRealTime();
+            cout << "Testing alpha took: " << etime -stime << " CPU seconds" << endl;
+            cout << endl;
+		}
+	}
+
+    // Optimize model parameters and branch lengths using ML for the initial tree
+	iqtree.clearAllPartialLH();
+	initTree = iqtree.optimizeModelParameters(true, initEpsilon);
+
+
+    /****************** NOW PERFORM MAXIMUM LIKELIHOOD TREE RECONSTRUCTION ******************/
+
+    // Update best tree
+    iqtree.candidateTrees.update(initTree, iqtree.getCurScore());
+
+    // Compute maximum likelihood distance
+    // ML distance is only needed for IQP
+//    if ( params.start_tree != STT_BIONJ && ((params.snni && !params.iqp) || params.min_iterations == 0)) {
+//        params.compute_ml_dist = false;
+//    }
+    if (params.min_iterations == 0 && params.start_tree != STT_BIONJ)
+        params.compute_ml_dist = false;
+    
+    if ((params.user_file || params.start_tree == STT_RANDOM_TREE) && params.snni && !params.iqp) {
+        params.compute_ml_dist = false;
+    }
+//    if ( params.user_file && params.min_iterations == 0) {
+//        params.compute_ml_dist = false;
+//    }
+
+    if ((!params.dist_file && params.compute_ml_dist) || params.leastSquareBranch) {
+        computeMLDist(params, iqtree, dist_file, getCPUTime());
+        if (!params.user_file && params.start_tree != STT_RANDOM_TREE) {
+            // NEW 2015-08-10: always compute BIONJ tree into the candidate set
+            iqtree.resetCurScore();
+            double start_bionj = getRealTime();
+            iqtree.computeBioNJ(params, iqtree.aln, dist_file);
+            cout << getRealTime() - start_bionj << " seconds" << endl;
+            if (iqtree.isSuperTree())
+                iqtree.wrapperFixNegativeBranch(true);
+            else
+                iqtree.wrapperFixNegativeBranch(false);
+            // full model optimisation only if BIONJ is the start tree,
+            // otherwise only branch lengths are optimised
+            if (params.start_tree == STT_BIONJ) {
+                initTree = iqtree.optimizeModelParameters(params.min_iterations==0, initEpsilon);
+            } else {
+                initTree = iqtree.optimizeBranches();
+            }
+            cout << "Log-likelihood of BIONJ tree: " << iqtree.getCurScore() << endl;
+            iqtree.candidateTrees.update(initTree, iqtree.getCurScore());
+        }
+    }
+
+	// reference points for the search timings reported by printFinalSearchInfo()
+	double cputime_search_start = getCPUTime();
+    double realtime_search_start = getRealTime();
+
+    if (params.min_iterations > 0) {
+        double initTime = getCPUTime();
+
+        if (!params.user_file && (params.start_tree == STT_PARSIMONY || params.start_tree == STT_PLL_PARSIMONY)) {
+        	// first argument is numInitTrees minus the trees already in the
+        	// candidate set (presumably the number still to be generated -- TODO confirm)
+        	iqtree.initCandidateTreeSet(params.numInitTrees - iqtree.candidateTrees.size(), params.numNNITrees);
+        	assert(iqtree.candidateTrees.size() != 0);
+        	cout << "Finish initializing candidate tree set. ";
+        	cout << "Number of distinct locally optimal trees: " << iqtree.candidateTrees.size() << endl;
+        	if (params.write_local_optimal_trees) {
+        		printSuboptimalTrees(iqtree, params, ".init_suboptimal_trees");
+        	}
+        } else {
+            int nni_count = 0;
+            int nni_steps = 0;
+            cout << "Doing NNI on the initial tree ... " << endl;
+            string tree = iqtree.doNNISearch(nni_count, nni_steps);
+        	iqtree.candidateTrees.update(tree, iqtree.getCurScore(), true);
+
+        }
+        cout << "Current best tree score: " << iqtree.candidateTrees.getBestScore() << " / CPU time: "
+                << getCPUTime() - initTime << endl;
+	}
+
+
+    if (params.leastSquareNNI) {
+    	iqtree.computeSubtreeDists();
+    }
+    /* TUNG: what happens if params.root is not set? This is usually the case.
+     * I added code to ininialize the root above.
+     */
+    //iqtree.setRootNode(params.root); // Important for NNI below
+
+	if (original_model == "WHTEST") {
+		cout << endl << "Testing model homogeneity by Weiss & von Haeseler (2003)..." << endl;
+		WHTest(params, iqtree);
+	}
+
+	NodeVector pruned_taxa;
+	StrVector linked_name;
+	// remembered here and handed back to restoreTaxa() after the search
+	double *saved_dist_mat = iqtree.dist_matrix;
+	double *pattern_lh;
+
+	// one entry per alignment pattern; released with delete[] near the end of this function
+	pattern_lh = new double[iqtree.getAlnNPattern()];
+
+	// prune stable taxa
+	pruneTaxa(params, iqtree, pattern_lh, pruned_taxa, linked_name);
+
+	if (params.min_iterations > 1) {
+		// start the search from the first tree returned by getTopTrees()
+		iqtree.readTreeString(iqtree.candidateTrees.getTopTrees()[0]);
+		iqtree.doTreeSearch();
+		iqtree.setAlignment(iqtree.aln);
+        cout << "TREE SEARCH COMPLETED AFTER " << iqtree.stop_rule.getCurIt() << " ITERATIONS" << endl << endl;
+	} else {
+		/* do SPR with likelihood function */
+		if (params.tree_spr) {
+			//tree.optimizeSPRBranches();
+			cout << "Doing SPR Search" << endl;
+			cout << "Start tree.optimizeSPR()" << endl;
+			double spr_score = iqtree.optimizeSPR();
+			cout << "Finish tree.optimizeSPR()" << endl;
+			//double spr_score = tree.optimizeSPR(tree.curScore, (PhyloNode*) tree.root->neighbors[0]->node);
+			if (spr_score <= iqtree.getCurScore()) {
+				cout << "SPR search did not found any better tree" << endl;
+			}
+		}
+	}
+
+	// restore pruned taxa
+	restoreTaxa(iqtree, saved_dist_mat, pruned_taxa, linked_name);
+
+	double search_cpu_time = getCPUTime() - cputime_search_start;
+	double search_real_time = getRealTime() - realtime_search_start;
+
+    // COMMENT THIS OUT BECAUSE IT DELETES ALL BRANCH LENGTHS OF SUBTREES!
+//	if (iqtree.isSuperTree())
+//			((PhyloSuperTree*) &iqtree)->mapTrees();
+
+	if (params.snni && params.min_iterations && verbose_mode >= VB_MED) {
+		cout << "Log-likelihoods of best " << params.popSize << " trees: " << endl;
+		iqtree.printBestScores(params.popSize);
+		cout << endl;
+	}
+
+	// final model optimisation on the first tree returned by getBestTrees();
+	// skipped when min_iterations == 0
+	if (params.min_iterations) {
+		iqtree.readTreeString(iqtree.candidateTrees.getBestTrees()[0]);
+        iqtree.initializeAllPartialLh();
+        iqtree.clearAllPartialLH();
+        cout << "--------------------------------------------------------------------" << endl;
+        cout << "|                    FINALIZING TREE SEARCH                        |" << endl;
+        cout << "--------------------------------------------------------------------" << endl;
+        cout << "Performs final model parameters optimization" << endl;
+		string tree;
+        if (params.testAlpha)
+            tree = iqtree.optimizeModelParameters(true, 0.001);
+        else
+            tree = iqtree.optimizeModelParameters(true);
+        
+		iqtree.candidateTrees.update(tree, iqtree.getCurScore(), true);
+    }
+
+	if (iqtree.isSuperTree())
+		((PhyloSuperTree*) &iqtree)->computeBranchLengths();
+
+	cout << "BEST SCORE FOUND : " << iqtree.getCurScore() << endl;
+
+	if (params.write_local_optimal_trees) {
+		printSuboptimalTrees(iqtree, params, ".suboptimal_trees");
+	}
+
+	if (params.pll)
+		iqtree.inputModelPLL2IQTree();
+
+	/* root the tree at the first sequence */
+	iqtree.root = iqtree.findLeafName(iqtree.aln->getSeqName(0));
+	assert(iqtree.root);
+
+
+	// pattern likelihoods and log-likelihood variance are only computed without PLL
+	if (!params.pll) {
+	    iqtree.computeLikelihood(pattern_lh);
+	    // compute logl variance
+	    iqtree.logl_variance = iqtree.computeLogLVariance();
+	}
+
+	printMiscInfo(params, iqtree, pattern_lh);
+
+	/****** perform SH-aLRT test ******************/
+	if ((params.aLRT_replicates > 0 || params.localbp_replicates > 0) && !params.pll) {
+		double mytime = getCPUTime();
+		// params is modified: aLRT_replicates is raised to at least localbp_replicates
+		params.aLRT_replicates = max(params.aLRT_replicates, params.localbp_replicates);
+		cout << endl << "Testing tree branches by SH-like aLRT with "
+				<< params.aLRT_replicates << " replicates..." << endl;
+		iqtree.setRootNode(params.root);
+		iqtree.testAllBranches(params.aLRT_threshold, iqtree.getCurScore(),
+				pattern_lh, params.aLRT_replicates, params.localbp_replicates);
+		cout << "CPU Time used:  " << getCPUTime() - mytime << " sec." << endl;
+	}
+
+	if (params.gbo_replicates > 0) {
+		if (!params.online_bootstrap)
+			runGuidedBootstrap(params, iqtree.aln, iqtree);
+		else
+			iqtree.summarizeBootstrap(params);
+	}
+
+	printFinalSearchInfo(params, iqtree, search_cpu_time, search_real_time);
+
+	// BUG FIX: readTreeString(bestTreeString) not needed before this line
+	iqtree.printResultTree();
+
+	// append one tab-separated summary line to <out_prefix>.UB.NNI.main
+	if(params.upper_bound_NNI){
+		string out_file_UB = params.out_prefix;
+		out_file_UB += ".UB.NNI.main";
+		ofstream out_UB;
+		out_UB.exceptions(ios::failbit | ios::badbit);
+		out_UB.open((char*)out_file_UB.c_str(),std::ofstream::out | std::ofstream::app);
+		out_UB<<iqtree.leafNum<<"\t"<<iqtree.aln->getNSite()<<"\t"<<iqtree.params->upper_bound_frac<<"\t"
+				  <<iqtree.skippedNNIub<<"\t"<< iqtree.totalNNIub<<"\t"<<iqtree.candidateTrees.getBestScore() <<endl;
+					//iqtree.minUB << "\t" << iqtree.meanUB/iqtree.skippedNNIub << "\t" << iqtree.maxUB << endl;
+		out_UB.close();
+		}
+
+	if (params.out_file)
+		iqtree.printTree(params.out_file);
+
+	delete[] pattern_lh;
+
+	runApproximateBranchLengths(params, iqtree);
+
+}
+
+/**
+ * Evaluate the tree for every (alpha, p_invar) pair listed in
+ * params.alpha_invar_file and write one result line per pair to
+ * <out_prefix>_<alpha_invar_file>.results.
+ *
+ * The input file is read as whitespace-separated "alpha p_invar" pairs. For
+ * each pair, gamma shape and p_invar are set to the given values (both are
+ * marked as fixed), optimizeParameters() of the model factory is run, and the
+ * returned score plus the tree length are written. Branch lengths are saved
+ * before and restored after every pair.
+ *
+ * @param params  run options; alpha_invar_file, out_prefix,
+ *                fixed_branch_length and start_real_time are read
+ * @param iqtree  tree whose rate model must be a RateGammaInvar
+ */
+void computeLoglFromUserInputGAMMAInvar(Params &params, IQTree &iqtree) {
+	RateGammaInvar *site_rates = dynamic_cast<RateGammaInvar *>(iqtree.getRate());
+	// dynamic_cast yields NULL for any other rate type; fail loudly instead of dereferencing it
+	if (!site_rates)
+		outError("Rate model is not of type Gamma+Invar, cannot evaluate alpha and p_invar values");
+	site_rates->setFixPInvar(true);
+	site_rates->setFixGammaShape(true);
+
+	// read all (alpha, p_invar) pairs up front
+	vector<double> alphas, p_invars;
+	ifstream aiFile;
+	aiFile.open(params.alpha_invar_file, ios_base::in);
+	if (aiFile.good()) {
+		double alpha, p_invar;
+		while (aiFile >> alpha >> p_invar) {
+			alphas.push_back(alpha);
+			p_invars.push_back(p_invar);
+		}
+		aiFile.close();
+		cout << "Computing tree logl based on the alpha and p_invar values in " << params.alpha_invar_file << " ..." <<
+		endl;
+	} else {
+		stringstream errMsg;
+		errMsg << "Could not find file: " << params.alpha_invar_file;
+		outError(errMsg.str().c_str());
+	}
+
+	string aiResultsFileName = string(params.out_prefix) + "_" + string(params.alpha_invar_file) + ".results";
+	ofstream aiFileResults;
+	aiFileResults.open(aiResultsFileName.c_str());
+	// do not silently drop all results if the output file cannot be created
+	if (!aiFileResults.is_open())
+		outError(ERR_WRITE_OUTPUT, aiResultsFileName);
+	aiFileResults << fixed;
+	aiFileResults.precision(4);
+	DoubleVector lenvec;
+	aiFileResults << "Alpha P_Invar Logl TreeLength\n";
+	for (size_t i = 0; i < alphas.size(); i++) {
+		iqtree.saveBranchLengths(lenvec);
+		aiFileResults << alphas.at(i) << " " << p_invars.at(i) << " ";
+		site_rates->setGammaShape(alphas.at(i));
+		site_rates->setPInvar(p_invars.at(i));
+		site_rates->computeRates();
+		iqtree.clearAllPartialLH();
+		double lh = iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, false, 0.001);
+		aiFileResults << lh << " " << iqtree.treeLength() << "\n";
+		// every pair starts from the same branch lengths
+		iqtree.restoreBranchLengths(lenvec);
+	}
+	aiFileResults.close();
+	cout << "Results were written to: " << aiResultsFileName << endl;
+	cout << "Wall clock time used: " << getRealTime() - params.start_real_time << endl;
+}
+
+/**
+ * Search for good starting values of the gamma shape (alpha) and the
+ * proportion of invariable sites by restarting the model optimization from
+ * ten initial alphas (0.1 ... 1.0).
+ *
+ * Every restart begins from the same branch lengths, rate matrix, state
+ * frequencies and p_invar that were in place on entry. The parameters of the
+ * restart with the highest score are installed in the tree at the end. If no
+ * restart scores higher than the starting score, the starting state itself is
+ * re-installed (the "best" state is seeded with it), so no uninitialized
+ * buffer or empty branch-length vector is ever applied.
+ *
+ * Side effects: updates the current score of iqtree and sets
+ * Params::getInstance().testAlpha to false.
+ *
+ * @param iqtree  tree whose rate model must be a RateGammaInvar
+ */
+void searchGAMMAInvarByRestarting(IQTree &iqtree) {
+    if (!Params::getInstance().fixed_branch_length)
+		iqtree.setCurScore(iqtree.optimizeAllBranches(1));
+	else
+		iqtree.setCurScore(iqtree.computeLikelihood());
+	RateGammaInvar* site_rates = dynamic_cast<RateGammaInvar*>(iqtree.getRate());
+	// dynamic_cast yields NULL for any other rate type; fail loudly instead of dereferencing it
+	if (!site_rates)
+		outError("Rate model is not of type Gamma+Invar, cannot test initial alpha values");
+	double initAlphas[] = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 };
+	const int numInitAlphas = (int) (sizeof(initAlphas) / sizeof(initAlphas[0]));
+	double initPInvar = iqtree.getRate()->getPInvar();
+
+	// the "best" state starts out as the state on entry
+	double bestLogl = iqtree.getCurScore();
+	double bestAlpha = iqtree.getRate()->getGammaShape();
+	double bestPInvar = initPInvar;
+
+    /* Back up branch lengths and substitutional rates */
+	DoubleVector lenvec;
+	iqtree.saveBranchLengths(lenvec);
+	DoubleVector bestLens = lenvec;
+    int numRateEntries = iqtree.getModel()->getNumRateEntries();
+    double *rates = new double[numRateEntries];
+    double *bestRates = new double[numRateEntries];
+    iqtree.getModel()->getRateMatrix(rates);
+    for (int r = 0; r < numRateEntries; r++)
+        bestRates[r] = rates[r];
+    int numStates = iqtree.aln->num_states;
+    double *state_freqs = new double[numStates];
+    iqtree.getModel()->getStateFrequency(state_freqs);
+    double *bestStateFreqs =  new double[numStates];
+    for (int s = 0; s < numStates; s++)
+        bestStateFreqs[s] = state_freqs[s];
+
+    for (int i = 0; i < numInitAlphas; i++) {
+		cout << endl;
+		cout << "Testing alpha: " << initAlphas[i] << endl;
+        // Initialize model parameters
+        iqtree.restoreBranchLengths(lenvec);
+        ((ModelGTR*) iqtree.getModel())->setRateMatrix(rates);
+        ((ModelGTR*) iqtree.getModel())->setStateFrequency(state_freqs);
+        iqtree.getModel()->decomposeRateMatrix();
+        site_rates->setGammaShape(initAlphas[i]);
+		site_rates->setPInvar(initPInvar);
+		site_rates->computeRates();
+		iqtree.clearAllPartialLH();
+		iqtree.optimizeModelParameters(verbose_mode >= VB_MED, 0.1);
+        double estAlpha = iqtree.getRate()->getGammaShape();
+        double estPInv = iqtree.getRate()->getPInvar();
+        double logl = iqtree.getCurScore();
+		cout << "Est. alpha: " << estAlpha << " / Est. pinv: " << estPInv
+        << " / Logl: " << logl << endl;
+
+		// remember the complete parameter set of the best restart so far
+		if (logl > bestLogl) {
+			bestLogl = logl;
+			bestAlpha = estAlpha;
+			bestPInvar = estPInv;
+			bestLens.clear();
+			iqtree.saveBranchLengths(bestLens);
+            iqtree.getModel()->getRateMatrix(bestRates);
+            iqtree.getModel()->getStateFrequency(bestStateFreqs);
+        }
+    }
+
+	// install the best parameter set and recompute the score for it
+	site_rates->setGammaShape(bestAlpha);
+	site_rates->setFixGammaShape(false);
+	site_rates->setPInvar(bestPInvar);
+	site_rates->setFixPInvar(false);
+    ((ModelGTR*) iqtree.getModel())->setRateMatrix(bestRates);
+    ((ModelGTR*) iqtree.getModel())->setStateFrequency(bestStateFreqs);
+	iqtree.restoreBranchLengths(bestLens);
+    iqtree.getModel()->decomposeRateMatrix();
+    site_rates->computeRates();
+	iqtree.clearAllPartialLH();
+    iqtree.setCurScore(iqtree.computeLikelihood());
+    cout << endl;
+    cout << "Best initial alpha: " << bestAlpha << " / initial pinv: " << bestPInvar << " / ";
+    cout << "Logl: " << iqtree.getCurScore() << endl;
+
+    delete [] rates;
+    delete [] state_freqs;
+    delete [] bestRates;
+    delete [] bestStateFreqs;
+	// the restart search is done only once per run
+	Params::getInstance().testAlpha = false;
+}
+
+/**
+ * Exhaustively evaluate a grid of fixed (alpha, p_invar) values:
+ * alpha = 0.01, 0.02, ..., 14.99 and p_invar = 0.01, 0.02, ..., 0.99
+ * (step size 0.01, upper bounds 15.00 and 1.00 are exclusive).
+ *
+ * For each grid point both rate parameters are set (and marked as fixed),
+ * optimizeParameters() of the model factory is run, and one line
+ * "alpha p_invar logl tree_len" is collected. Branch lengths are restored to
+ * their values on entry after every grid point. All lines are written to
+ * <out_prefix>.ai_results at the end.
+ *
+ * Grid points are derived from integer indices rather than by repeatedly
+ * adding the step to a double, so the number of evaluated points always
+ * matches the counts printed up front and no rounding drift accumulates.
+ *
+ * @param params  run options; out_prefix, fixed_branch_length and
+ *                start_real_time are read
+ * @param iqtree  tree whose rate model must be a RateGammaInvar
+ */
+void exhaustiveSearchGAMMAInvar(Params &params, IQTree &iqtree) {
+	const double alphaMin = 0.01;
+	const double alphaMax = 15.00;
+	const double p_invarMin = 0.01;
+	const double p_invarMax = 1.00;
+	const double stepSize = 0.01;
+	// round to nearest: (max - min)/step is only approximately an integer in binary floating point
+	const int numAlpha = (int) floor((alphaMax - alphaMin)/stepSize + 0.5);
+	const int numInvar = (int) floor((p_invarMax - p_invarMin)/stepSize + 0.5);
+
+	cout << "EVALUATING COMBINATIONS OF " << numAlpha << " ALPHAS AND " << numInvar << " P_INVARS ... " << endl;
+
+	vector<string> results;
+	results.reserve((size_t) numAlpha * (size_t) numInvar);
+	DoubleVector lenvec;
+	iqtree.saveBranchLengths(lenvec);
+
+	RateGammaInvar* site_rates = dynamic_cast<RateGammaInvar*>(iqtree.getRate());
+	// dynamic_cast yields NULL for any other rate type; fail loudly instead of dereferencing it
+	if (!site_rates)
+		outError("Rate model is not of type Gamma+Invar, cannot search alpha and p_invar values");
+	site_rates->setFixPInvar(true);
+	site_rates->setFixGammaShape(true);
+
+    for (int ai = 0; ai < numAlpha; ai++) {
+        const double alpha = alphaMin + ai * stepSize;
+        for (int pi = 0; pi < numInvar; pi++) {
+            const double p_invar = p_invarMin + pi * stepSize;
+            site_rates->setGammaShape(alpha);
+            site_rates->setPInvar(p_invar);
+            site_rates->computeRates();
+            iqtree.clearAllPartialLH();
+            double lh = iqtree.getModelFactory()->optimizeParameters(params.fixed_branch_length, false, 0.001);
+            stringstream ss;
+            ss << fixed << setprecision(2) << alpha << " " << p_invar << " " << lh << " " << iqtree.treeLength();
+            //cout << ss.str() << endl;
+            results.push_back(ss.str());
+            // every grid point starts from the same branch lengths
+            iqtree.restoreBranchLengths(lenvec);
+        }
+    }
+	string aiResultsFileName = string(params.out_prefix) + ".ai_results";
+	ofstream aiFileResults;
+	aiFileResults.open(aiResultsFileName.c_str());
+	// do not silently drop all results if the output file cannot be created
+	if (!aiFileResults.is_open())
+		outError(ERR_WRITE_OUTPUT, aiResultsFileName);
+	aiFileResults << fixed;
+	aiFileResults.precision(4);
+	aiFileResults << "alpha p_invar logl tree_len\n";
+	for (vector<string>::iterator it = results.begin(); it != results.end(); it++) {
+		aiFileResults << (*it) << endl;
+	}
+	aiFileResults.close();
+	cout << "Results were written to: " << aiResultsFileName << endl;
+	cout << "Wall clock time used: " << getRealTime() - params.start_real_time << endl;
+}
+
+
+/**********************************************************
+ * STANDARD NON-PARAMETRIC BOOTSTRAP
+ ***********************************************************/
+/**
+ * Standard non-parametric bootstrap.
+ *
+ * For each of params.num_bootstrap_samples replicates a bootstrap alignment
+ * is created from `alignment`, a tree is reconstructed for it with
+ * runTreeReconstruction(), and the tree read back from <out_prefix>.treefile
+ * is appended to <out_prefix>.boottrees. Afterwards, depending on params, a
+ * consensus tree is computed and/or the original alignment is analysed and
+ * its tree annotated with bootstrap supports.
+ *
+ * @param params          run options; aLRT_replicates is set to 0 for the
+ *                        replicates and restored only when
+ *                        params.compute_ml_tree is set
+ * @param original_model  model name, passed on to runTreeReconstruction()
+ *                        and reportPhyloAnalysis()
+ * @param alignment       original alignment the replicates are drawn from
+ * @param tree            tree object for the original alignment
+ */
+void runStandardBootstrap(Params &params, string &original_model, Alignment *alignment, IQTree *tree) {
+	vector<ModelInfo> *model_info = new vector<ModelInfo>;
+
+	// turn off aLRT test
+	int saved_aLRT_replicates = params.aLRT_replicates;
+	params.aLRT_replicates = 0;
+	string treefile_name = params.out_prefix;
+	treefile_name += ".treefile";
+	string boottrees_name = params.out_prefix;
+	boottrees_name += ".boottrees";
+	string bootaln_name = params.out_prefix;
+	bootaln_name += ".bootaln";
+	string bootlh_name = params.out_prefix;
+	bootlh_name += ".bootlh";
+	// first empty the boottrees file
+	try {
+		ofstream tree_out;
+		tree_out.exceptions(ios::failbit | ios::badbit);
+		tree_out.open(boottrees_name.c_str());
+		tree_out.close();
+	} catch (const ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, boottrees_name);
+	}
+
+	// empty the bootaln file
+	if (params.print_bootaln) {
+		try {
+			ofstream tree_out;
+			tree_out.exceptions(ios::failbit | ios::badbit);
+			tree_out.open(bootaln_name.c_str());
+			tree_out.close();
+		} catch (const ios::failure &) {
+			outError(ERR_WRITE_OUTPUT, bootaln_name);
+		}
+	}
+
+	double start_time = getCPUTime();
+
+	// do bootstrap analysis
+	for (int sample = 0; sample < params.num_bootstrap_samples; sample++) {
+		cout << endl << "===> START BOOTSTRAP REPLICATE NUMBER "
+				<< sample + 1 << endl << endl;
+
+		Alignment* bootstrap_alignment;
+		cout << "Creating bootstrap alignment..." << endl;
+		if (alignment->isSuperAlignment())
+			bootstrap_alignment = new SuperAlignment;
+		else
+			bootstrap_alignment = new Alignment;
+		bootstrap_alignment->createBootstrapAlignment(alignment, NULL, params.bootstrap_spec);
+		if (params.print_tree_lh) {
+			double prob;
+			bootstrap_alignment->multinomialProb(*alignment, prob);
+			ofstream boot_lh;
+			// the first replicate truncates the file, later ones append
+			if (sample == 0)
+				boot_lh.open(bootlh_name.c_str());
+			else
+				boot_lh.open(bootlh_name.c_str(), ios_base::out | ios_base::app);
+			boot_lh << "0\t" << prob << endl;
+			boot_lh.close();
+		}
+		IQTree *boot_tree;
+		if (alignment->isSuperAlignment()){
+			if(params.partition_type){
+				boot_tree = new PhyloSuperTreePlen((SuperAlignment*) bootstrap_alignment, (PhyloSuperTree*) tree);
+			} else {
+				boot_tree = new PhyloSuperTree((SuperAlignment*) bootstrap_alignment, (PhyloSuperTree*) tree);
+			}
+		} else
+			boot_tree = new IQTree(bootstrap_alignment);
+		if (params.print_bootaln)
+			bootstrap_alignment->printPhylip(bootaln_name.c_str(), true);
+		runTreeReconstruction(params, original_model, *boot_tree, *model_info);
+		// read in the output tree file
+		string tree_str;
+		try {
+			ifstream tree_in;
+			tree_in.exceptions(ios::failbit | ios::badbit);
+			tree_in.open(treefile_name.c_str());
+			tree_in >> tree_str;
+			tree_in.close();
+		} catch (const ios::failure &) {
+			outError(ERR_READ_INPUT, treefile_name);
+		}
+		// write the tree into .boottrees file
+		try {
+			ofstream tree_out;
+			tree_out.exceptions(ios::failbit | ios::badbit);
+			tree_out.open(boottrees_name.c_str(), ios_base::out | ios_base::app);
+			tree_out << tree_str << endl;
+			tree_out.close();
+		} catch (const ios::failure &) {
+			outError(ERR_WRITE_OUTPUT, boottrees_name);
+		}
+		// fix bug: set the model for original tree after testing
+		if (original_model.substr(0,4) == "TEST" && tree->isSuperTree()) {
+			PhyloSuperTree *stree = ((PhyloSuperTree*)tree);
+			stree->part_info =  ((PhyloSuperTree*)boot_tree)->part_info;
+//			for (int i = 0; i < ((PhyloSuperTree*)tree)->part_info.size(); i++)
+//				((PhyloSuperTree*)tree)->part_info[i].model_name = ((PhyloSuperTree*)boot_tree)->part_info[i].model_name;
+		}
+		if (params.num_bootstrap_samples == 1)
+			reportPhyloAnalysis(params, original_model, *boot_tree, *model_info);
+		// The tree's alignment might have been replaced during reconstruction,
+		// so take the pointer from the tree -- and do so BEFORE the tree is
+		// deleted (reading boot_tree->aln afterwards is a use-after-free).
+		bootstrap_alignment = boot_tree->aln;
+		delete boot_tree;
+		delete bootstrap_alignment;
+	}
+
+	if (params.consensus_type == CT_CONSENSUS_TREE) {
+
+		cout << endl << "===> COMPUTE CONSENSUS TREE FROM "
+				<< params.num_bootstrap_samples << " BOOTSTRAP TREES" << endl << endl;
+		computeConsensusTree(boottrees_name.c_str(), 0, 1e6, -1,
+				params.split_threshold, NULL, params.out_prefix, NULL, &params);
+	}
+
+	if (params.compute_ml_tree) {
+		cout << endl << "===> START ANALYSIS ON THE ORIGINAL ALIGNMENT" << endl << endl;
+		params.aLRT_replicates = saved_aLRT_replicates;
+		runTreeReconstruction(params, original_model, *tree, *model_info);
+
+		cout << endl << "===> ASSIGN BOOTSTRAP SUPPORTS TO THE TREE FROM ORIGINAL ALIGNMENT" << endl << endl;
+		MExtTree ext_tree;
+		assignBootstrapSupport(boottrees_name.c_str(), 0, 1e6,
+				treefile_name.c_str(), false, treefile_name.c_str(),
+				params.out_prefix, ext_tree, NULL, &params);
+		tree->copyTree(&ext_tree);
+		reportPhyloAnalysis(params, original_model, *tree, *model_info);
+	} else if (params.consensus_type == CT_CONSENSUS_TREE) {
+		// run the reconstruction with min_iterations == 0 and a fixed-iteration
+		// stop rule, then put the user's settings back
+		int mi = params.min_iterations;
+		STOP_CONDITION sc = params.stop_condition;
+		params.min_iterations = 0;
+		params.stop_condition = SC_FIXED_ITERATION;
+		runTreeReconstruction(params, original_model, *tree, *model_info);
+		params.min_iterations = mi;
+		params.stop_condition = sc;
+		tree->stop_rule.initialize(params);
+		reportPhyloAnalysis(params, original_model, *tree, *model_info);
+	} else
+		cout << endl;
+
+	cout << "Total CPU time for bootstrap: " << (getCPUTime() - start_time) << " seconds." << endl << endl;
+	cout << "Non-parametric bootstrap results written to:" << endl;
+	if (params.print_bootaln)
+		cout << "  Bootstrap alignments:     " << params.out_prefix << ".bootaln" << endl;
+	cout << "  Bootstrap trees:          " << params.out_prefix << ".boottrees" << endl;
+	if (params.consensus_type == CT_CONSENSUS_TREE)
+		cout << "  Consensus tree:           " << params.out_prefix << ".contree" << endl;
+	cout << endl;
+
+    delete model_info;
+}
+
+/**
+ * Write the alignment of iqtree (or a bootstrap resample of it) to
+ * params.aln_output in the requested form.
+ *
+ * If params.num_bootstrap_samples or params.print_bootaln is set, a bootstrap
+ * alignment is created from iqtree->aln and written instead of the original.
+ * The original alignment is left untouched and stays owned by the tree, so
+ * iqtree->aln and the caller's pointer remain valid; the temporary bootstrap
+ * alignment is released before returning.
+ *
+ * Output selection, first match wins: super alignment (combined alignment,
+ * plus sub-alignments if params.print_subaln), gap-masked alignment
+ * (params.gap_masked_aln), PHYLIP, FASTA.
+ *
+ * @param params  run options controlling resampling and output format
+ * @param iqtree  tree whose alignment is converted
+ */
+void convertAlignment(Params &params, IQTree *iqtree) {
+	Alignment *alignment = iqtree->aln;
+	// non-NULL only if a bootstrap resample was created below
+	Alignment *bootstrap_alignment = NULL;
+	if (params.num_bootstrap_samples || params.print_bootaln) {
+		// create bootstrap alignment
+		cout << "Creating bootstrap alignment..." << endl;
+		if (alignment->isSuperAlignment())
+			bootstrap_alignment = new SuperAlignment;
+		else
+			bootstrap_alignment = new Alignment;
+		bootstrap_alignment->createBootstrapAlignment(alignment, NULL, params.bootstrap_spec);
+		// write the resample; do NOT delete the original, the tree still points to it
+		alignment = bootstrap_alignment;
+	}
+	if (alignment->isSuperAlignment()) {
+		((SuperAlignment*)alignment)->printCombinedAlignment(params.aln_output);
+		if (params.print_subaln)
+			((SuperAlignment*)alignment)->printSubAlignments(params, ((PhyloSuperTree*)iqtree)->part_info);
+
+	} else if (params.gap_masked_aln) {
+		Alignment out_aln;
+		Alignment masked_aln(params.gap_masked_aln, params.sequence_type, params.intype);
+		out_aln.createGapMaskedAlignment(&masked_aln, alignment);
+		out_aln.printPhylip(params.aln_output, false, params.aln_site_list,
+				params.aln_nogaps, params.aln_no_const_sites, params.ref_seq_name);
+		string str = params.gap_masked_aln;
+		str += ".sitegaps";
+		out_aln.printSiteGaps(str.c_str());
+	} else if (params.aln_output_format == ALN_PHYLIP)
+		alignment->printPhylip(params.aln_output, false, params.aln_site_list,
+				params.aln_nogaps, params.aln_no_const_sites, params.ref_seq_name);
+	else if (params.aln_output_format == ALN_FASTA)
+		alignment->printFasta(params.aln_output, false, params.aln_site_list,
+				params.aln_nogaps, params.aln_no_const_sites, params.ref_seq_name);
+
+	// deleting NULL is a no-op, so this is safe when no resample was made
+	delete bootstrap_alignment;
+}
+
+
+/**********************************************************
+ * TOP-LEVEL FUNCTION
+ ***********************************************************/
+void runPhyloAnalysis(Params &params) {
+	Alignment *alignment;
+	IQTree *tree;
+
+	/****************** read in alignment **********************/
+	if (params.partition_file) {
+		// Partition model analysis
+		if(params.partition_type){
+			// since nni5 does not work yet, stop the programm
+/*			if(params.nni5)
+				outError("-nni5 option is unsupported yet for proportitional partition model. please use -nni1 option");*/
+//			if(params.aLRT_replicates || params.localbp_replicates)
+//				outError("-alrt or -lbp option is unsupported yet for joint/proportional partition model");
+			// initialize supertree - Proportional Edges case, "-spt p" option
+			tree = new PhyloSuperTreePlen(params);
+		} else {
+			// initialize supertree stuff if user specifies partition file with -sp option
+			tree = new PhyloSuperTree(params);
+		}
+		// this alignment will actually be of type SuperAlignment
+		alignment = tree->aln;
+	} else {
+		alignment = new Alignment(params.aln_file, params.sequence_type, params.intype);
+		if (params.freq_const_patterns) {
+			int orig_nsite = alignment->getNSite();
+			alignment->addConstPatterns(params.freq_const_patterns);
+			cout << "INFO: " << alignment->getNSite() - orig_nsite << " const sites added into alignment" << endl;
+		}
+		tree = new IQTree(alignment);
+	}
+
+	string original_model = params.model_name;
+
+	if (params.concatenate_aln) {
+		Alignment aln(params.concatenate_aln, params.sequence_type, params.intype);
+		cout << "Concatenating " << params.aln_file << " with " << params.concatenate_aln << " ..." << endl;
+		alignment->concatenateAlignment(&aln);
+	}
+
+    if (params.compute_seq_identity_along_tree) {
+        if (!params.user_file)
+            outError("Please supply a user tree file!");
+        tree->readTree(params.user_file, params.is_rooted);
+        if (!tree->rooted && !params.root) {
+            outError("Tree is unrooted, thus you have to specify a root with -o option");
+        }
+        tree->setAlignment(tree->aln);
+        if (!tree->rooted)
+            tree->setRootNode(params.root);
+        tree->computeSeqIdentityAlongTree();
+        if (verbose_mode >= VB_MED)
+            tree->drawTree(cout);
+        string out_tree = (string)params.out_prefix + ".seqident_tree";
+        tree->printTree(out_tree.c_str());
+        cout << "Tree with sequence identity printed to " << out_tree << endl;
+	} else if (params.aln_output) {
+		/************ convert alignment to other format and write to output file *************/
+		convertAlignment(params, tree);
+	} else if (params.gbo_replicates > 0 && params.user_file && params.second_tree) {
+		// run one of the UFBoot analysis
+		runGuidedBootstrap(params, alignment, *tree);
+	} else if (params.avh_test) {
+		// run one of the wondering test for Arndt
+		runAvHTest(params, alignment, *tree);
+	} else if (params.bootlh_test) {
+		// run Arndt's plot of tree likelihoods against bootstrap alignments
+		runBootLhTest(params, alignment, *tree);
+	} else if (params.num_bootstrap_samples == 0) {
+		// the main Maximum likelihood tree reconstruction
+		vector<ModelInfo> *model_info = new vector<ModelInfo>;
+		alignment->checkGappySeq(params.remove_empty_seq);
+
+		// remove identical sequences
+        if (params.ignore_identical_seqs) {
+            tree->removeIdenticalSeqs(params);
+        }
+        alignment = NULL; // from now on use tree->aln instead
+
+		// call main tree reconstruction
+        runTreeReconstruction(params, original_model, *tree, *model_info);
+		if (params.gbo_replicates && params.online_bootstrap) {
+			if (params.print_ufboot_trees)
+				tree->writeUFBootTrees(params);
+
+			cout << endl << "Computing bootstrap consensus tree..." << endl;
+			string splitsfile = params.out_prefix;
+			splitsfile += ".splits.nex";
+			computeConsensusTree(splitsfile.c_str(), 0, 1e6, params.split_threshold,
+					params.split_weight_threshold, NULL, params.out_prefix, NULL, &params);
+			// now optimize branch lengths of the consensus tree
+			string current_tree = tree->getTreeString();
+			splitsfile = params.out_prefix;
+			splitsfile += ".contree";
+			tree->readTreeFile(splitsfile);
+			// bug fix
+			if ((tree->sse == LK_EIGEN || tree->sse == LK_EIGEN_SSE) && !tree->isBifurcating()) {
+				cout << "NOTE: Changing to old kernel as consensus tree is multifurcating" << endl;
+                if (tree->sse == LK_EIGEN)
+                    tree->changeLikelihoodKernel(LK_NORMAL);
+                else
+                    tree->changeLikelihoodKernel(LK_SSE);
+			}
+
+			tree->initializeAllPartialLh();
+			tree->fixNegativeBranch(true);
+//	        if (tree->isSuperTree()) {
+//	        	if (params.partition_type == 0) {
+//	        		PhyloSuperTree *stree = (PhyloSuperTree*) tree;
+//	        		tree->clearAllPartialLH();
+//	        		// full partition model
+//	        		for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++) {
+//	        			(*it)->fixNegativeBranch(true);
+//	        		}
+//	        		tree->clearAllPartialLH();
+//	        	} else {
+//	        		// joint/prop. partition model
+//					tree->assignRandomBranchLengths(true);
+//					((PhyloSuperTree*)tree)->mapTrees();
+//	        	}
+//	        } else {
+//	        	tree->fixNegativeBranch(true);
+//	    	}
+
+			tree->boot_consense_logl = tree->optimizeAllBranches();
+			cout << "Log-likelihood of consensus tree: " << tree->boot_consense_logl << endl;
+		    tree->setRootNode(params.root);
+		    tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
+			tree->printTree(splitsfile.c_str(), WT_BR_LEN | WT_BR_LEN_FIXED_WIDTH | WT_SORT_TAXA | WT_NEWLINE);
+			// revert the best tree
+			tree->readTreeString(current_tree);
+//			if (tree->isSuperTree()) {
+//				tree->optimizeAllBranches();
+//				((PhyloSuperTree*)tree)->computeBranchLengths();
+//			}
+		}
+		// reinsert identical sequences
+		if (tree->removed_seqs.size() > 0) {
+			// BUG HERE!
+//			delete tree->aln;
+//			tree->reinsertIdenticalSeqs(alignment);
+			// BUG FIX: dont use reinsertIdenticalSeqs anymore
+			tree->insertTaxa(tree->removed_seqs, tree->twin_seqs);
+			tree->printResultTree();
+		}
+		reportPhyloAnalysis(params, original_model, *tree, *model_info);
+        delete model_info;
+	} else {
+		// the classical non-parameter bootstrap (SBS)
+		if (params.model_name.find("LINK") != string::npos || params.model_name.find("MERGE") != string::npos)
+			outError("-m TESTMERGE is not allowed when doing standard bootstrap. Please first\nfind partition scheme on the original alignment and use it for bootstrap analysis");
+		runStandardBootstrap(params, original_model, alignment, tree);
+	}
+
+//	if (params.upper_bound) {
+//			UpperBounds(&params, alignment, tree);
+//	}
+
+	if(verbose_mode >= VB_MED){
+		if(tree->isSuperTree() && params.partition_type){
+			((PhyloSuperTreePlen*) tree)->printNNIcasesNUM();
+		}
+	}
+	delete tree;
+	// BUG FIX: alignment can be changed, should delete tree->aln instead
+	alignment = tree->aln;
+	delete alignment;
+}
+
+/**
+ * Assign branch supports taken from the trees file params.user_file onto the
+ * tree stored in params.second_tree, and write the annotated tree to
+ * "<params.second_tree>.suptree".
+ * @param params program parameters; user_file and second_tree must be set
+ */
+void assignBranchSupportNew(Params &params) {
+	// both input files are mandatory
+	if (params.user_file == NULL) {
+		outError("No trees file provided");
+	}
+	if (params.second_tree == NULL) {
+		outError("No target tree file provided");
+	}
+
+	// load the target tree and report its size
+	cout << "Reading tree " << params.second_tree << " ..." << endl;
+	MTree target(params.second_tree, params.is_rooted);
+	cout << target.leafNum << " taxa and " << target.branchNum << " branches" << endl;
+
+	target.assignBranchSupport(params.user_file);
+
+	// the result is written next to the target tree file
+	string out_name(params.second_tree);
+	out_name.append(".suptree");
+	target.printTree(out_name.c_str());
+	cout << "Tree with assigned branch supports written to " << out_name << endl;
+
+	if (verbose_mode >= VB_DEBUG) {
+		target.drawTree(cout);
+	}
+}
+
+
+
+/**
+ * assign split occurrence frequencies from a set of input trees onto a target tree
+ * NOTE: input trees must have the same taxon set
+ * @param input_trees file containing NEWICK tree strings; a file that
+ *        detectInputFile() reports as IN_NEXUS is loaded as a split system instead
+ * @param burnin number of beginning trees to discard
+ * @param max_count max number of trees to read in
+ * @param target_tree the target tree
+ * @param rooted TRUE if trees are rooted, false for unrooted trees
+ * @param output_tree file name to write output tree with assigned support values.
+ *        If NULL, out_prefix (or target_tree if out_prefix is NULL as well)
+ *        appended with ".suptree" is used
+ * @param out_prefix prefix of output file
+ * @param mytree (OUT) resulting tree with support values assigned from target_tree
+ * @param tree_weight_file file containing INTEGER weights of input trees
+ * @param params program parameters, must not be NULL
+ */
+void assignBootstrapSupport(const char *input_trees, int burnin, int max_count,
+		const char *target_tree, bool rooted, const char *output_tree,
+		const char *out_prefix, MExtTree &mytree, const char* tree_weight_file,
+		Params *params) {
+	// params is dereferenced unconditionally further down (scaling_factor,
+	// support_tag, numeric_precision), so a NULL pointer is rejected here
+	// instead of being tested only after its first use
+	if (!params) {
+		outError("Program parameters must not be NULL");
+		return;
+	}
+
+	// read the tree file
+	cout << "Reading tree " << target_tree << " ..." << endl;
+	mytree.init(target_tree, rooted);
+	// reindex the taxa in the tree to alphabetical names
+	NodeVector taxa;
+	mytree.getTaxa(taxa);
+	sort(taxa.begin(), taxa.end(), nodenamecmp);
+	int i = 0;
+	for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++) {
+		(*it)->id = i++;
+	}
+
+	SplitGraph sg;
+	SplitIntMap hash_ss;
+	// make the taxa name
+	vector<string> taxname;
+	taxname.resize(mytree.leafNum);
+	mytree.getTaxaName(taxname);
+
+	// default scale is 100; a positive params->scaling_factor overrides it
+	double scale = 100.0;
+	if (params->scaling_factor > 0)
+		scale = params->scaling_factor;
+
+	// read the bootstrap tree file
+	MTreeSet boot_trees;
+	if (detectInputFile((char*) input_trees) == IN_NEXUS) {
+		// NOTE(review): computeConsensusTree() temporarily points
+		// params->user_file at input_trees around sg.init(); this function
+		// does not -- confirm callers always pass params->user_file here
+		sg.init(*params);
+		for (SplitGraph::iterator it = sg.begin(); it != sg.end(); it++)
+			hash_ss.insertSplit((*it), (*it)->getWeight());
+		// renumber the leaves of the target tree in the taxon order of sg
+		StrVector sgtaxname;
+		sg.getTaxaName(sgtaxname);
+		i = 0;
+		for (StrVector::iterator sit = sgtaxname.begin();
+				sit != sgtaxname.end(); sit++, i++) {
+			Node *leaf = mytree.findLeafName(*sit);
+			if (!leaf)
+				outError("Tree does not contain taxon ", *sit);
+			leaf->id = i;
+		}
+		scale /= sg.maxWeight();
+	} else {
+		boot_trees.init(input_trees, rooted, burnin, max_count,
+				tree_weight_file);
+		boot_trees.convertSplits(taxname, sg, hash_ss, SW_COUNT, -1, params->support_tag);
+		scale /= boot_trees.sumTreeWeights();
+	}
+	cout << "Rescaling split weights by " << scale << endl;
+	if (params->scaling_factor < 0)
+		sg.scaleWeight(scale, true);
+	else {
+		sg.scaleWeight(scale, false, params->numeric_precision);
+	}
+
+	cout << sg.size() << " splits found" << endl;
+	cout << "Creating bootstrap support values..." << endl;
+	mytree.createBootstrapSupport(taxname, boot_trees, sg, hash_ss, params->support_tag);
+
+	// an explicit output_tree wins; otherwise derive the name from
+	// out_prefix, falling back to target_tree
+	string out_file;
+	if (output_tree)
+		out_file = output_tree;
+	else {
+		if (out_prefix)
+			out_file = out_prefix;
+		else
+			out_file = target_tree;
+		out_file += ".suptree";
+	}
+
+	mytree.printTree(out_file.c_str());
+	cout << "Tree with assigned bootstrap support written to " << out_file
+			<< endl;
+}
+
+/**
+ * Compute the consensus tree from a collection of input trees (or from a
+ * NEXUS file loaded as a split system) and write it to a file.
+ * @param input_trees file with the input trees; if detectInputFile() reports
+ *        IN_NEXUS the file is loaded through SplitGraph::init() instead
+ * @param burnin number of beginning trees to discard
+ * @param max_count max number of trees to read in
+ * @param cutoff forwarded to MTreeSet::convertSplits() (tree-file input only)
+ * @param weight_threshold forwarded to MTreeSet::convertSplits() (tree-file input only)
+ * @param output_tree output file name. If NULL, out_prefix (or input_trees if
+ *        out_prefix is NULL as well) appended with ".contree" is used
+ * @param out_prefix prefix of output file
+ * @param tree_weight_file file with weights of the input trees
+ * @param params program parameters, must not be NULL. For NEXUS input
+ *        params->split_weight_summary is left set to SW_COUNT on return
+ */
+void computeConsensusTree(const char *input_trees, int burnin, int max_count,
+		double cutoff, double weight_threshold, const char *output_tree,
+		const char *out_prefix, const char *tree_weight_file, Params *params) {
+	// params is dereferenced unconditionally below (scaling_factor, root,
+	// numeric_precision, print_splits_file), so a NULL pointer is rejected
+	// here instead of being tested only after its first use
+	if (!params) {
+		outError("Program parameters must not be NULL");
+		return;
+	}
+
+	bool rooted = false;
+
+	SplitGraph sg;
+	SplitIntMap hash_ss;
+
+	// default scale is 100; a positive params->scaling_factor overrides it
+	double scale = 100.0;
+	if (params->scaling_factor > 0)
+		scale = params->scaling_factor;
+
+	// read the bootstrap tree file
+	MTreeSet boot_trees;
+	if (detectInputFile((char*) input_trees) == IN_NEXUS) {
+		// temporarily point params->user_file at input_trees while sg.init()
+		// runs, then restore the caller's value
+		char *user_file = params->user_file;
+		params->user_file = (char*) input_trees;
+		params->split_weight_summary = SW_COUNT; // count number of splits
+		sg.init(*params);
+		params->user_file = user_file;
+		for (SplitGraph::iterator it = sg.begin(); it != sg.end(); it++)
+			hash_ss.insertSplit((*it), (*it)->getWeight());
+		scale /= sg.maxWeight();
+	} else {
+		boot_trees.init(input_trees, rooted, burnin, max_count,
+				tree_weight_file);
+		boot_trees.convertSplits(sg, cutoff, SW_COUNT, weight_threshold);
+		scale /= boot_trees.sumTreeWeights();
+		cout << sg.size() << " splits found" << endl;
+	}
+	if (verbose_mode >= VB_MED)
+		cout << "Rescaling split weights by " << scale << endl;
+	if (params->scaling_factor < 0)
+		sg.scaleWeight(scale, true);
+	else {
+		sg.scaleWeight(scale, false, params->numeric_precision);
+	}
+
+	// extract compatible splits and convert that split system into a tree
+	MTree mytree;
+	SplitGraph maxsg;
+	sg.findMaxCompatibleSplits(maxsg);
+
+	if (verbose_mode >= VB_MAX)
+		maxsg.saveFileStarDot(cout);
+	mytree.convertToTree(maxsg);
+
+	// use params->root as root leaf if given, otherwise the first taxon of sg;
+	// the root is left unchanged when no leaf with that name exists
+	string taxname;
+	if (params->root)
+		taxname = params->root;
+	else
+		taxname = sg.getTaxa()->GetTaxonLabel(0);
+	Node *node = mytree.findLeafName(taxname);
+	if (node)
+		mytree.root = node;
+
+	// name of the consensus tree file
+	string out_file;
+
+	if (output_tree)
+		out_file = output_tree;
+	else {
+		if (out_prefix)
+			out_file = out_prefix;
+		else
+			out_file = input_trees;
+		out_file += ".contree";
+	}
+
+	mytree.printTree(out_file.c_str(), WT_BR_CLADE);
+	cout << "Consensus tree written to " << out_file << endl;
+
+	// name of the splits file
+	// NOTE(review): with a non-NULL output_tree this is the same path as the
+	// consensus tree written above -- confirm this is intended
+	if (output_tree)
+		out_file = output_tree;
+	else {
+		if (out_prefix)
+			out_file = out_prefix;
+		else
+			out_file = input_trees;
+		out_file += ".splits";
+	}
+
+	if (params->print_splits_file) {
+		sg.saveFile(out_file.c_str(), IN_OTHER, true);
+		cout << "Non-trivial split supports printed to star-dot file " << out_file << endl;
+	}
+
+}
+
+/**
+ * Convert a collection of input trees into a split system and save it as a
+ * NEXUS file; with verbose_mode >= VB_MED the splits are additionally saved
+ * in star-dot format.
+ * Unless output_tree is given, file names are built from out_prefix (or
+ * input_trees if out_prefix is NULL) plus ".nex" / ".splits".
+ */
+void computeConsensusNetwork(const char *input_trees, int burnin, int max_count,
+		double cutoff, int weight_summary, double weight_threshold, const char *output_tree,
+		const char *out_prefix, const char* tree_weight_file) {
+	// input trees are read as unrooted
+	bool rooted = false;
+	MTreeSet tree_set(input_trees, rooted, burnin, max_count,
+			tree_weight_file);
+
+	SplitGraph splits;
+	tree_set.convertSplits(splits, cutoff, weight_summary, weight_threshold);
+
+	// consensus network in NEXUS format
+	string network_file;
+	if (output_tree) {
+		network_file = output_tree;
+	} else {
+		network_file = out_prefix ? out_prefix : input_trees;
+		network_file += ".nex";
+	}
+	splits.saveFile(network_file.c_str(), IN_NEXUS);
+	cout << "Consensus network printed to " << network_file << endl;
+
+	// optional star-dot dump of the splits
+	if (verbose_mode >= VB_MED) {
+		string splits_file;
+		if (output_tree) {
+			splits_file = output_tree;
+		} else {
+			splits_file = out_prefix ? out_prefix : input_trees;
+			splits_file += ".splits";
+		}
+		splits.saveFile(splits_file.c_str(), IN_OTHER, true);
+		cout << "Non-trivial split supports printed to star-dot file " << splits_file << endl;
+	}
+
+}
+
diff --git a/phyloanalysis.h b/phyloanalysis.h
new file mode 100644
index 0000000..069f17f
--- /dev/null
+++ b/phyloanalysis.h
@@ -0,0 +1,92 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef PHYLOANALYSIS_H
+#define PHYLOANALYSIS_H
+
+#include "tools.h"
+#include "mexttree.h"
+#include "phylotesting.h"
+#include "upperbounds.h" // Olga: functions for Upper Bounds analysis
+#include "pllnni.h"
+
+class PhyloTree;
+class IQTree;
+
+/**
+	main function to carry out phylogenetic inference
+	@param params program parameters
+*/
+void runPhyloAnalysis(Params &params);
+
+void runTreeReconstruction(Params &params, string &original_model,
+		IQTree &tree, vector<ModelInfo> &model_info);
+
+/**
+	Take the collection of trees from input_trees, assign support values to target_tree
+	and print the resulting tree to output_tree.
+	@param input_trees collection of input trees to infer split supports
+	@param burnin the number of trees at the beginning of input_trees to be discarded
+	@param max_count max number of trees to load
+	@param target_tree tree to assign support values to
+	@param rooted TRUE if trees are rooted, false for unrooted trees
+	@param output_tree (OUT, OVERWRITE IF EXIST) Resulting tree will be written to this file. If NULL,
+		the file is named out_prefix (or target_tree if out_prefix is NULL) appended with ".suptree"
+	@param out_prefix prefix of output file, only used when output_tree is NULL
+	@param mytree (OUT) resulting tree with support values assigned from target_tree
+	@param tree_weight_file file containing INTEGER weights of input trees
+	@param params program parameters
+*/
+void assignBootstrapSupport(const char *input_trees, int burnin, int max_count, const char *target_tree, 
+	bool rooted, const char *output_tree, const char *out_prefix, MExtTree &mytree, 
+	const char* tree_weight_file, Params *params);
+
+/**
+ * assign branch supports from the params.user_file trees file to the tree in params.second_tree
+ * @param params program parameters
+ */
+void assignBranchSupportNew(Params &params);
+
+/**
+	Compute the consensus tree from the collection of trees from input_trees
+	and print the resulting tree to output_tree.
+	@param input_trees collection of input trees to infer split supports
+	@param burnin the number of trees at the beginning of input_trees to be discarded
+	@param max_count max number of trees to load
+	@param cutoff only incorporate those splits that have support values more than cutoff
+	@param weight_threshold minimum weight cutoff
+	@param output_tree (OUT, OVERWRITE IF EXIST) Resulting consensus tree will be written to this file. If NULL,
+		the file is named out_prefix (or input_trees if out_prefix is NULL) appended with ".contree"
+	@param out_prefix prefix of output file, only used when output_tree is NULL
+	@param tree_weight_file file containing weights of input trees
+	@param params program parameters
+*/
+void computeConsensusTree(const char *input_trees, int burnin, int max_count, double cutoff, double weight_threshold,
+	const char *output_tree, const char *out_prefix, const char* tree_weight_file, Params *params);
+
+/**
+	Compute the consensus network from the collection of trees in input_trees
+	and print the consensus network to output_tree.
+	@param input_trees collection of input trees to infer split supports
+	@param burnin the number of trees at the beginning of input_trees to be discarded
+	@param max_count max number of trees to load
+	@param cutoff only incorporate those splits that have support values more than cutoff
+	@param weight_summary how split weights are summarized (passed on to the split conversion)
+	@param weight_threshold minimum weight cutoff
+	@param output_tree (OUT, OVERWRITE IF EXIST) Resulting consensus network will be written to this file. If NULL,
+		the file is named out_prefix (or input_trees if out_prefix is NULL) appended with ".nex"
+	@param out_prefix prefix of output file, only used when output_tree is NULL
+	@param tree_weight_file file containing weights of input trees
+*/
+void computeConsensusNetwork(const char *input_trees, int burnin, int max_count, double cutoff,
+		int weight_summary, double weight_threshold,
+	const char *output_tree, const char *out_prefix, const char* tree_weight_file);
+
+#endif
diff --git a/phylokernel.h b/phylokernel.h
new file mode 100644
index 0000000..5497634
--- /dev/null
+++ b/phylokernel.h
@@ -0,0 +1,1535 @@
+/*
+ * phylokernel.h
+ *
+ *  Created on: Dec 14, 2014
+ *      Author: minh
+ */
+
+#ifndef PHYLOKERNEL_H_
+#define PHYLOKERNEL_H_
+
+#include "phylotree.h"
+#include "vectorclass/vectorclass.h"
+#include "vectorclass/vectormath_exp.h"
+
+/**
+ * Horizontal sums of two Vec2d vectors in one SSE3 instruction.
+ * @param x array of two vectors
+ * @return {x[0][0]+x[0][1], x[1][0]+x[1][1]}
+ */
+inline Vec2d horizontal_add(Vec2d x[2]) {
+#if  INSTRSET < 3  // older than SSE3
+#error "You must compile with SSE3 enabled!"
+#else
+    return _mm_hadd_pd(x[0], x[1]);
+#endif
+}
+
+/**
+ * @return the larger of the two elements of a
+ */
+inline double horizontal_max(Vec2d const &a) {
+    // copy both lanes to memory and compare them as scalars
+    double lanes[2];
+    a.store(lanes);
+    return max(lanes[0], lanes[1]);
+}
+
+#ifdef __AVX__
+
+/**
+ * Horizontal sums of four Vec4d vectors a=x[0], b=x[1], c=x[2], d=x[3].
+ * @param x array of four vectors
+ * @return {sum of a, sum of b, sum of c, sum of d}
+ */
+inline Vec4d horizontal_add(Vec4d x[4]) {
+	// pairwise sums inside each 128-bit half:
+	// ab = {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
+	__m256d ab = _mm256_hadd_pd(x[0], x[1]);
+	// cd = {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
+	__m256d cd = _mm256_hadd_pd(x[2], x[3]);
+
+	// upper half of ab followed by lower half of cd:
+	// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
+	__m256d crossed = _mm256_permute2f128_pd(ab, cd, 0x21);
+	// lower half of ab followed by upper half of cd (mask 0b1100):
+	// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
+	__m256d straight = _mm256_blend_pd(ab, cd, 12);
+
+	return _mm256_add_pd(crossed, straight);
+}
+
+/**
+ * @return the largest of the four elements of a
+ */
+inline double horizontal_max(Vec4d const &a) {
+	// element-wise maximum of the lower and the upper 128-bit half
+	__m128d upper = _mm256_extractf128_pd(a,1);
+	__m128d lower = _mm256_castpd256_pd128(a);
+	__m128d pair_max = _mm_max_pd(lower, upper);
+	// reduce the remaining two lanes as scalars
+	double lanes[2];
+	_mm_storeu_pd(lanes, pair_max);
+	return max(lanes[0], lanes[1]);
+}
+
+#endif // __AVX__
+
+/**
+ * Vectorized dot product of x and y.
+ * The first VCSIZE elements are always read and further elements are read in
+ * chunks of VCSIZE while the chunk start is below size, so size is expected
+ * to be a positive multiple of VCSIZE.
+ * NOTE(review): load_a is presumably the aligned load of the vector class --
+ * confirm that callers pass suitably aligned arrays.
+ * @param x first array
+ * @param y second array
+ * @param size number of elements
+ * @return sum over i of x[i]*y[i]
+ */
+template <class Numeric, class VectorClass, const int VCSIZE>
+Numeric PhyloTree::dotProductSIMD(Numeric *x, Numeric *y, int size) {
+	// first chunk initializes the accumulator
+	VectorClass acc = VectorClass().load_a(x) * VectorClass().load_a(y);
+	int pos = VCSIZE;
+	while (pos < size) {
+		VectorClass vx, vy;
+		vx.load_a(x + pos);
+		vy.load_a(y + pos);
+		// acc += vx * vy
+		acc = mul_add(vx, vy, acc);
+		pos += VCSIZE;
+	}
+	return horizontal_add(acc);
+}
+
+/************************************************************************************************
+ *
+ *   Highly optimized vectorized versions of likelihood functions
+ *
+ *************************************************************************************************/
+
+
+template <class VectorClass, const int VCSIZE, const int nstates>
+void PhyloTree::computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    // don't recompute the likelihood
+	assert(dad);
+    if (dad_branch->partial_lh_computed & 1)
+        return;
+    dad_branch->partial_lh_computed |= 1;
+
+    size_t nptn = aln->size() + model_factory->unobserved_ptns.size();
+    PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+	if (node->isLeaf()) {
+	    dad_branch->lh_scale_factor = 0.0;
+	    //memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+
+		if (!tip_partial_lh_computed)
+			computeTipPartialLikelihood();
+		return;
+	}
+
+    size_t ptn, c;
+    size_t orig_ntn = aln->size();
+
+    size_t ncat = site_rate->getNRate();
+    assert(nstates == aln->num_states && nstates >= VCSIZE && VCSIZE == VectorClass().size());
+    assert(model->isReversible()); // only works with reversible model!
+    const size_t nstatesqr=nstates*nstates;
+    size_t i, x, j;
+    size_t block = nstates * ncat;
+
+	// internal node
+	assert(node->degree() == 3); // it works only for strictly bifurcating tree
+	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+	}
+
+	if (!left->node->isLeaf() && right->node->isLeaf()) {
+		// swap left and right
+		PhyloNeighbor *tmp = left;
+		left = right;
+		right = tmp;
+	}
+	if ((left->partial_lh_computed & 1) == 0)
+		computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(left, node);
+	if ((right->partial_lh_computed & 1) == 0)
+		computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(right, node);
+
+    if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
+        // re-orient partial_lh
+        bool done = false;
+        FOR_NEIGHBOR_IT(node, dad, it2) {
+            PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
+            if (backnei->partial_lh) {
+                dad_branch->partial_lh = backnei->partial_lh;
+                dad_branch->scale_num = backnei->scale_num;
+                backnei->partial_lh = NULL;
+                backnei->scale_num = NULL;
+                backnei->partial_lh_computed &= ~1; // clear bit
+                done = true;
+                break;
+            }
+        }
+        assert(done && "partial_lh is not re-oriented");
+    }
+
+	double *evec = model->getEigenvectors();
+	double *inv_evec = model->getInverseEigenvectors();
+
+	VectorClass vc_inv_evec[nstates*nstates/VCSIZE];
+	assert(inv_evec && evec);
+	for (i = 0; i < nstates; i++) {
+		for (x = 0; x < nstates/VCSIZE; x++)
+			// inv_evec is not aligned!
+			vc_inv_evec[i*nstates/VCSIZE+x].load_a(&inv_evec[i*nstates+x*VCSIZE]);
+	}
+	double *eval = model->getEigenvalues();
+
+	dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
+
+	VectorClass *eleft = (VectorClass*)aligned_alloc<double>(block*nstates);
+	VectorClass *eright = (VectorClass*)aligned_alloc<double>(block*nstates);
+
+	// precompute information buffer
+	for (c = 0; c < ncat; c++) {
+		VectorClass vc_evec;
+		VectorClass expleft[nstates/VCSIZE];
+		VectorClass expright[nstates/VCSIZE];
+		double len_left = site_rate->getRate(c) * left->length;
+		double len_right = site_rate->getRate(c) * right->length;
+		for (i = 0; i < nstates/VCSIZE; i++) {
+			// eval is not aligned!
+			expleft[i] = exp(VectorClass().load_a(&eval[i*VCSIZE]) * VectorClass(len_left));
+			expright[i] = exp(VectorClass().load_a(&eval[i*VCSIZE]) * VectorClass(len_right));
+		}
+		for (x = 0; x < nstates; x++)
+			for (i = 0; i < nstates/VCSIZE; i++) {
+				// evec is not be aligned!
+				vc_evec.load_a(&evec[x*nstates+i*VCSIZE]);
+				eleft[c*nstatesqr/VCSIZE+x*nstates/VCSIZE+i] = (vc_evec * expleft[i]);
+				eright[c*nstatesqr/VCSIZE+x*nstates/VCSIZE+i] = (vc_evec * expright[i]);
+			}
+	}
+
+	if (left->node->isLeaf() && right->node->isLeaf()) {
+		// special treatment for TIP-TIP (cherry) case
+
+		// pre compute information for both tips
+		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+		double *partial_lh_right = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+			VectorClass vleft[VCSIZE];
+			size_t addr = state*nstates;
+			for (i = 0; i < nstates/VCSIZE; i++)
+				vc_partial_lh_tmp[i].load_a(&tip_partial_lh[addr+i*VCSIZE]);
+			for (x = 0; x < block; x+=VCSIZE) {
+				addr = x*nstates/VCSIZE;
+				for (j = 0; j < VCSIZE; j++)
+					vleft[j] = eleft[addr+j*nstates/VCSIZE] * vc_partial_lh_tmp[0];
+				for (i = 1; i < nstates/VCSIZE; i++) {
+					for (j = 0; j < VCSIZE; j++)
+						vleft[j] = mul_add(eleft[addr+j*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vleft[j]);
+				}
+				horizontal_add(vleft).store_a(&partial_lh_left[state*block+x]);
+			}
+		}
+
+		for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
+			int state = (*it);
+			VectorClass vright[VCSIZE];
+			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+
+			for (i = 0; i < nstates/VCSIZE; i++)
+				vc_partial_lh_tmp[i].load_a(&tip_partial_lh[state*nstates+i*VCSIZE]);
+			for (x = 0; x < block; x+=VCSIZE) {
+				for (j = 0; j < VCSIZE; j++)
+					vright[j] = eright[(x+j)*nstates/VCSIZE] * vc_partial_lh_tmp[0];
+				for (i = 1; i < nstates/VCSIZE; i++) {
+					for (j = 0; j < VCSIZE; j++)
+						vright[j] = mul_add(eright[(x+j)*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vright[j]);
+				}
+				horizontal_add(vright).store_a(&partial_lh_right[state*block+x]);
+			}
+		}
+
+		size_t addr_unknown = aln->STATE_UNKNOWN * block;
+		for (x = 0; x < block; x++) {
+			partial_lh_left[addr_unknown+x] = 1.0;
+			partial_lh_right[addr_unknown+x] = 1.0;
+		}
+
+		// assign pointers for left and right partial_lh
+		double **lh_left_ptr = aligned_alloc<double*>(nptn);
+		double **lh_right_ptr = aligned_alloc<double*>(nptn);
+		for (ptn = 0; ptn < orig_ntn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+			lh_right_ptr[ptn] = &partial_lh_right[block * (aln->at(ptn))[right->node->id]];
+		}
+		for (ptn = orig_ntn; ptn < nptn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+			lh_right_ptr[ptn] = &partial_lh_right[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+		}
+
+		// scale number must be ZERO
+	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+		VectorClass res[VCSIZE];
+
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, c, x, i, j, vc_partial_lh_tmp, res)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+	        double *partial_lh = dad_branch->partial_lh + ptn*block;
+
+	        double *lh_left = lh_left_ptr[ptn];
+	        double *lh_right = lh_right_ptr[ptn];
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+
+				for (x = 0; x < nstates/VCSIZE; x++) {
+					vc_partial_lh_tmp[x] = (VectorClass().load_a(&lh_left[x*VCSIZE]) * VectorClass().load_a(&lh_right[x*VCSIZE]));
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(i+j)*nstates/VCSIZE];
+					}
+					for (x = 1; x < nstates/VCSIZE; x++)
+						for (j = 0; j < VCSIZE; j++) {
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+						}
+					horizontal_add(res).store_a(&partial_lh[i]);
+				}
+
+				lh_left += nstates;
+				lh_right += nstates;
+				partial_lh += nstates;
+			}
+		}
+
+	    aligned_free(lh_left_ptr);
+	    aligned_free(lh_right_ptr);
+		aligned_free(partial_lh_right);
+		aligned_free(partial_lh_left);
+	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
+		// special treatment to TIP-INTERNAL NODE case
+		// only take scale_num from the right subtree
+		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+
+		// pre compute information for left tip
+		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			VectorClass vc_tip_lh[nstates/VCSIZE];
+			VectorClass vleft[VCSIZE];
+			for (i = 0; i < nstates/VCSIZE; i++)
+				vc_tip_lh[i].load_a(&tip_partial_lh[state*nstates+i*VCSIZE]);
+			for (x = 0; x < block; x+=VCSIZE) {
+				for (j = 0; j < VCSIZE; j++)
+					vleft[j] = eleft[(x+j)*nstates/VCSIZE] * vc_tip_lh[0];
+				for (i = 1; i < nstates/VCSIZE; i++) {
+					for (j = 0; j < VCSIZE; j++)
+						vleft[j] = mul_add(eleft[(x+j)*nstates/VCSIZE+i], vc_tip_lh[i], vleft[j]);
+				}
+				horizontal_add(vleft).store_a(&partial_lh_left[state*block+x]);
+			}
+		}
+
+		size_t addr_unknown = aln->STATE_UNKNOWN * block;
+		for (x = 0; x < block; x++) {
+			partial_lh_left[addr_unknown+x] = 1.0;
+		}
+
+		// assign pointers for partial_lh_left
+		double **lh_left_ptr = aligned_alloc<double*>(nptn);
+		for (ptn = 0; ptn < orig_ntn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+		}
+		for (ptn = orig_ntn; ptn < nptn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+		}
+
+		double sum_scale = 0.0;
+		VectorClass vc_lh_right[nstates/VCSIZE];
+		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+		VectorClass res[VCSIZE];
+		VectorClass vc_max; // maximum of partial likelihood, for scaling check
+		VectorClass vright[VCSIZE];
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private (ptn, c, x, i, j, vc_lh_right, vc_partial_lh_tmp, res, vc_max, vright)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+	        double *partial_lh = dad_branch->partial_lh + ptn*block;
+	        double *partial_lh_right = right->partial_lh + ptn*block;
+
+	        double *lh_left = lh_left_ptr[ptn];
+			vc_max = 0.0;
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				for (i = 0; i < nstates/VCSIZE; i++)
+					vc_lh_right[i].load_a(&partial_lh_right[i*VCSIZE]);
+
+				for (x = 0; x < nstates/VCSIZE; x++) {
+					size_t addr = c*nstatesqr/VCSIZE+x*nstates;
+					for (j = 0; j < VCSIZE; j++) {
+						vright[j] = eright[addr+nstates*j/VCSIZE] * vc_lh_right[0];
+					}
+					for (i = 1; i < nstates/VCSIZE; i++)
+						for (j = 0; j < VCSIZE; j++) {
+							vright[j] = mul_add(eright[addr+i+nstates*j/VCSIZE], vc_lh_right[i], vright[j]);
+						}
+					vc_partial_lh_tmp[x] = VectorClass().load_a(&lh_left[x*VCSIZE])
+							* horizontal_add(vright);
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(i+j)*nstates/VCSIZE];
+					}
+					for (x = 1; x < nstates/VCSIZE; x++) {
+						for (j = 0; j < VCSIZE; j++) {
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+						}
+					}
+					VectorClass sum_res = horizontal_add(res);
+					sum_res.store_a(&partial_lh[i]);
+					vc_max = max(vc_max, abs(sum_res)); // take the maximum for scaling check
+				}
+				lh_left += nstates;
+				partial_lh_right += nstates;
+				partial_lh += nstates;
+			}
+            // check if one should scale partial likelihoods
+			double lh_max = horizontal_max(vc_max);
+            if (lh_max < SCALING_THRESHOLD) {
+            	// now do the likelihood scaling
+            	partial_lh -= block; // revert its pointer
+            	VectorClass scale_thres(SCALING_THRESHOLD_INVER);
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
+				}
+				// unobserved const pattern will never have underflow
+				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1;
+				partial_lh += block; // increase the pointer again
+            }
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+
+	    aligned_free(lh_left_ptr);
+		aligned_free(partial_lh_left);
+
+	} else {
+		// both left and right are internal node
+
+		double sum_scale = 0.0;
+		VectorClass vc_max; // maximum of partial likelihood, for scaling check
+		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+		VectorClass vc_lh_left[nstates/VCSIZE], vc_lh_right[nstates/VCSIZE];
+		VectorClass res[VCSIZE];
+		VectorClass vleft[VCSIZE], vright[VCSIZE];
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction (+: sum_scale) private(ptn, c, x, i, j, vc_max, vc_partial_lh_tmp, vc_lh_left, vc_lh_right, res, vleft, vright)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+	        double *partial_lh = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_left = left->partial_lh + ptn*block;
+			double *partial_lh_right = right->partial_lh + ptn*block;
+
+			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
+			vc_max = 0.0;
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				for (i = 0; i < nstates/VCSIZE; i++) {
+					vc_lh_left[i].load_a(&partial_lh_left[i*VCSIZE]);
+					vc_lh_right[i].load_a(&partial_lh_right[i*VCSIZE]);
+				}
+
+				for (x = 0; x < nstates/VCSIZE; x++) {
+					size_t addr = c*nstatesqr/VCSIZE+x*nstates;
+					for (j = 0; j < VCSIZE; j++) {
+						size_t addr_com = addr+j*nstates/VCSIZE;
+						vleft[j] = eleft[addr_com] * vc_lh_left[0];
+						vright[j] = eright[addr_com] * vc_lh_right[0];
+					}
+					for (i = 1; i < nstates/VCSIZE; i++) {
+						for (j = 0; j < VCSIZE; j++) {
+							size_t addr_com = addr+i+j*nstates/VCSIZE;
+							vleft[j] = mul_add(eleft[addr_com], vc_lh_left[i], vleft[j]);
+							vright[j] = mul_add(eright[addr_com], vc_lh_right[i], vright[j]);
+						}
+					}
+					vc_partial_lh_tmp[x] = horizontal_add(vleft) * horizontal_add(vright);
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(i+j)*nstates/VCSIZE];
+					}
+					for (x = 1; x < nstates/VCSIZE; x++)
+						for (j = 0; j < VCSIZE; j++)
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+
+					VectorClass sum_res = horizontal_add(res);
+					sum_res.store_a(&partial_lh[i]);
+					vc_max = max(vc_max, abs(sum_res)); // take the maximum for scaling check
+				}
+				partial_lh += nstates;
+				partial_lh_left += nstates;
+				partial_lh_right += nstates;
+			}
+
+            // check if one should scale partial likelihoods
+			double lh_max = horizontal_max(vc_max);
+            if (lh_max < SCALING_THRESHOLD) {
+				// now do the likelihood scaling
+            	partial_lh -= block; // revert its pointer
+            	VectorClass scale_thres(SCALING_THRESHOLD_INVER);
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
+				}
+				// unobserved const pattern will never have underflow
+				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1;
+				partial_lh += block; // increase the pointer again
+            }
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+
+	}
+
+	aligned_free(eright);
+	aligned_free(eleft);
+}
+
+/**
+ * Compute the first and second derivatives of the tree log-likelihood with
+ * respect to the length of the branch described by dad_branch, using the
+ * eigenvalues of the substitution model and SIMD vectors of VCSIZE doubles.
+ *
+ * For every rate category c and eigenvalue index i the function precomputes
+ *   val0 = exp(eval[i]*rate[c]*len) * prop[c],
+ *   val1 = eval[i]*rate[c] * val0,
+ *   val2 = eval[i]*rate[c] * val1,
+ * and combines them with the per-pattern products "theta" (dad-side times
+ * node-side partial likelihoods). theta is cached in theta_all and only
+ * recomputed while theta_computed is false.
+ *
+ * Side effects: may trigger computePartialLikelihoodEigenSIMD() on both sides
+ * of the branch, fills theta_all and sets theta_computed.
+ *
+ * @param dad_branch neighbor record of the branch as seen from dad
+ * @param dad        node at one end of the branch
+ * @param[out] df    first derivative (reset to 0 together with ddf when the
+ *                   pattern sum is NaN or infinite)
+ * @param[out] ddf   second derivative
+ */
+template <class VectorClass, const int VCSIZE, const int nstates>
+void PhyloTree::computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    // if the far end is a leaf, swap the roles so that the leaf (if any) is 'dad'
+    if (node->isLeaf()) {
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    // bit 1 of partial_lh_computed marks an up-to-date partial likelihood
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
+    df = ddf = 0.0;
+    size_t ncat = site_rate->getNRate();
+
+    size_t block = ncat * nstates; // number of doubles per pattern
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, j;
+    size_t orig_nptn = aln->size();
+    // observed patterns followed by the unobserved (ascertainment) patterns
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    // pattern count rounded up so that whole vectors can be processed
+    size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
+    maxptn = max(maxptn, aln->size()+((model_factory->unobserved_ptns.size()+VCSIZE-1)/VCSIZE)*VCSIZE);
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+	// per (category, eigenvalue) coefficients for likelihood, 1st and 2nd derivative
+	VectorClass *vc_val0 = (VectorClass*)aligned_alloc<double>(block);
+	VectorClass *vc_val1 = (VectorClass*)aligned_alloc<double>(block);
+	VectorClass *vc_val2 = (VectorClass*)aligned_alloc<double>(block);
+
+	VectorClass vc_len = dad_branch->length;
+	for (c = 0; c < ncat; c++) {
+		VectorClass vc_rate = site_rate->getRate(c);
+		VectorClass vc_prop = site_rate->getProp(c);
+		for (i = 0; i < nstates/VCSIZE; i++) {
+			VectorClass cof = VectorClass().load_a(&eval[i*VCSIZE]) * vc_rate;
+			VectorClass val = exp(cof*vc_len) * vc_prop;
+			VectorClass val1_ = cof*val;
+			vc_val0[c*nstates/VCSIZE+i] = val;
+			vc_val1[c*nstates/VCSIZE+i] = val1_;
+			vc_val2[c*nstates/VCSIZE+i] = cof*val1_;
+		}
+	}
+
+	assert(theta_all);
+	if (!theta_computed) {
+		theta_computed = true;
+		// precompute theta for fast branch length optimization
+
+		if (dad->isLeaf()) {
+	    	// special treatment for TIP-INTERNAL NODE case
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i)
+#endif
+			for (ptn = 0; ptn < orig_nptn; ptn++) {
+			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+				double *theta = theta_all + ptn*block;
+				double *lh_dad = &tip_partial_lh[(aln->at(ptn))[dad->id] * nstates];
+				// the tip vector has nstates entries and is reused for every category
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&lh_dad[i%nstates]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
+				}
+			}
+			// ascertainment bias correction
+			for (ptn = orig_nptn; ptn < nptn; ptn++) {
+			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+				double *theta = theta_all + ptn*block;
+				double *lh_dad = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * nstates];
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&lh_dad[i%nstates]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
+				}
+			}
+	    } else {
+	    	// both dad and node are internal nodes
+		    double *partial_lh_node = node_branch->partial_lh;
+		    double *partial_lh_dad = dad_branch->partial_lh;
+	    	size_t all_entries = nptn*block;
+#ifdef _OPENMP
+#pragma omp parallel for private(i)
+#endif
+	    	for (i = 0; i < all_entries; i+=VCSIZE) {
+				(VectorClass().load_a(&partial_lh_node[i]) * VectorClass().load_a(&partial_lh_dad[i]))
+						.store_a(&theta_all[i]);
+			}
+	    }
+		if (nptn < maxptn) {
+			// copy dummy values
+			for (ptn = nptn; ptn < maxptn; ptn++)
+				memcpy(&theta_all[ptn*block], theta_all, block*sizeof(double));
+		}
+	}
+
+
+
+	VectorClass vc_ptn[VCSIZE], vc_df[VCSIZE], vc_ddf[VCSIZE], vc_theta[VCSIZE];
+	VectorClass vc_unit = 1.0;
+	VectorClass vc_freq;
+	VectorClass df_final = 0.0, ddf_final = 0.0;
+	// these store the values of VCSIZE consecutive patterns
+	VectorClass lh_ptn, df_ptn, ddf_ptn, inv_lh_ptn;
+
+	// process VCSIZE patterns at the same time for SSE/AVX efficiency
+
+#ifdef _OPENMP
+#pragma omp parallel private (ptn, i, j, vc_freq, vc_ptn, vc_df, vc_ddf, vc_theta, inv_lh_ptn, lh_ptn, df_ptn, ddf_ptn)
+	{
+	// per-thread accumulators, merged in the critical section below
+	VectorClass df_final_th = 0.0;
+	VectorClass ddf_final_th = 0.0;
+#pragma omp for nowait
+#endif
+	for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+		double *theta = theta_all + ptn*block;
+		// initialization
+		for (i = 0; i < VCSIZE; i++) {
+			vc_theta[i].load_a(theta+i*block);
+			vc_ptn[i] = vc_val0[0] * vc_theta[i];
+			vc_df[i] = vc_val1[0] * vc_theta[i];
+			vc_ddf[i] = vc_val2[0] * vc_theta[i];
+		}
+
+		for (i = 1; i < block/VCSIZE; i++) {
+			for (j = 0; j < VCSIZE; j++) {
+				vc_theta[j].load_a(&theta[i*VCSIZE+j*block]);
+				vc_ptn[j] = mul_add(vc_theta[j], vc_val0[i], vc_ptn[j]);
+				vc_df[j] = mul_add(vc_theta[j], vc_val1[i], vc_df[j]);
+				vc_ddf[j] = mul_add(vc_theta[j], vc_val2[i], vc_ddf[j]);
+			}
+		}
+		// lane k of lh_ptn is the likelihood of pattern ptn+k
+		lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+
+		inv_lh_ptn = vc_unit / abs(lh_ptn);
+
+		vc_freq.load_a(&ptn_freq[ptn]);
+
+		// d/dt log L = L'/L ; d2/dt2 log L = L''/L - (L'/L)^2
+		df_ptn = horizontal_add(vc_df) * inv_lh_ptn;
+		ddf_ptn = horizontal_add(vc_ddf) * inv_lh_ptn;
+		ddf_ptn = nmul_add(df_ptn, df_ptn, ddf_ptn);
+
+#ifdef _OPENMP
+		df_final_th = mul_add(df_ptn, vc_freq, df_final_th);
+		ddf_final_th = mul_add(ddf_ptn, vc_freq, ddf_final_th);
+#else
+		df_final = mul_add(df_ptn, vc_freq, df_final);
+		ddf_final = mul_add(ddf_ptn, vc_freq, ddf_final);
+#endif
+
+	}
+
+#ifdef _OPENMP
+#pragma omp critical
+	{
+		df_final += df_final_th;
+		ddf_final += ddf_final_th;
+	}
+}
+#endif
+	df = horizontal_add(df_final);
+	ddf = horizontal_add(ddf_final);
+    if (isnan(df) || isinf(df)) {
+        df = 0.0;
+        ddf = 0.0;
+//        outWarning("Numerical instability (some site-likelihood = 0)");
+    }
+
+
+//	assert(isnormal(tree_lh));
+	if (orig_nptn < nptn) {
+		// ascertainment bias correction
+		VectorClass lh_final = 0.0;
+		df_final = 0.0;
+		ddf_final = 0.0;
+		lh_ptn = 0.0;
+		df_ptn = 0.0;
+		ddf_ptn = 0.0;
+		// initialised so that the values are defined even if the switch
+		// below falls into its default branch with assertions disabled
+		double prob_const = 0.0, df_const = 0.0, ddf_const = 0.0;
+		double *theta = &theta_all[orig_nptn*block];
+		for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+			// add the previous batch; the last batch is added lane by lane
+			// in the switch below because it may be only partially valid
+			lh_final += lh_ptn;
+			df_final += df_ptn;
+			ddf_final += ddf_ptn;
+
+			// initialization
+			for (i = 0; i < VCSIZE; i++) {
+				vc_theta[i].load_a(theta+i*block);
+				vc_ptn[i] = vc_val0[0] * vc_theta[i];
+				vc_df[i] = vc_val1[0] * vc_theta[i];
+				vc_ddf[i] = vc_val2[0] * vc_theta[i];
+			}
+
+			for (i = 1; i < block/VCSIZE; i++) {
+				for (j = 0; j < VCSIZE; j++) {
+					vc_theta[j].load_a(&theta[i*VCSIZE+j*block]);
+					vc_ptn[j] = mul_add(vc_theta[j], vc_val0[i], vc_ptn[j]);
+					vc_df[j] = mul_add(vc_theta[j], vc_val1[i], vc_df[j]);
+					vc_ddf[j] = mul_add(vc_theta[j], vc_val2[i], vc_ddf[j]);
+				}
+			}
+			theta += block*VCSIZE;
+
+			// ptn_invar[ptn] is not aligned
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+			// FIX: reduce the derivative accumulators of this batch as well;
+			// previously df_ptn/ddf_ptn stayed 0 so df_const and ddf_const
+			// were always zero and the correction of df/ddf was lost
+			df_ptn = horizontal_add(vc_df);
+			ddf_ptn = horizontal_add(vc_ddf);
+
+		}
+		// only the first (nptn-orig_nptn) % VCSIZE lanes of the last batch
+		// are real unobserved patterns (all lanes when the remainder is 0)
+		switch ((nptn-orig_nptn) % VCSIZE) {
+		case 0:
+			prob_const = horizontal_add(lh_final+lh_ptn);
+			df_const = horizontal_add(df_final+df_ptn);
+			ddf_const = horizontal_add(ddf_final+ddf_ptn);
+			break;
+		case 1:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0];
+			df_const = horizontal_add(df_final)+df_ptn[0];
+			ddf_const = horizontal_add(ddf_final)+ddf_ptn[0];
+			break;
+		case 2:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1];
+			df_const = horizontal_add(df_final)+df_ptn[0]+df_ptn[1];
+			ddf_const = horizontal_add(ddf_final)+ddf_ptn[0]+ddf_ptn[1];
+			break;
+		case 3:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2];
+			df_const = horizontal_add(df_final)+df_ptn[0]+df_ptn[1]+df_ptn[2];
+			ddf_const = horizontal_add(ddf_final)+ddf_ptn[0]+ddf_ptn[1]+ddf_ptn[2];
+			break;
+		default:
+			assert(0);
+			break;
+		}
+		// derivatives of -nsites*log(1 - prob_const) with respect to the branch length
+    	prob_const = 1.0 - prob_const;
+    	double df_frac = df_const / prob_const;
+    	double ddf_frac = ddf_const / prob_const;
+    	int nsites = aln->getNSite();
+    	df += nsites * df_frac;
+    	ddf += nsites *(ddf_frac + df_frac*df_frac);
+	}
+    assert(!isnan(df));
+    aligned_free(vc_val2);
+    aligned_free(vc_val1);
+    aligned_free(vc_val0);
+}
+
+
+/**
+ * Compute the tree log-likelihood at the branch described by dad_branch,
+ * using the eigenvalues of the substitution model and SIMD vectors of VCSIZE
+ * doubles. VCSIZE patterns are processed per loop iteration.
+ *
+ * Side effects: may trigger computePartialLikelihoodEigenSIMD() on both sides
+ * of the branch, writes the per-pattern log-likelihoods into _pattern_lh, and
+ * overwrites the padding entries [nptn, maxptn) of the partial_lh arrays with
+ * copies of pattern 0 so that whole vectors can be loaded.
+ *
+ * @param dad_branch neighbor record of the branch as seen from dad
+ * @param dad        node at one end of the branch
+ * @return the log-likelihood, including both lh_scale_factor terms and, when
+ *         unobserved patterns exist, the ascertainment bias correction
+ */
+template <class VectorClass, const int VCSIZE, const int nstates>
+double PhyloTree::computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    // if the far end is a leaf, swap the roles so that the leaf (if any) is 'dad'
+    if (node->isLeaf()) {
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    // bit 1 of partial_lh_computed is tested before recomputing a partial likelihood
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
+    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+    size_t ncat = site_rate->getNRate();
+
+    size_t block = ncat * nstates; // number of doubles per pattern
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, j;
+    size_t orig_nptn = aln->size();
+    // observed patterns followed by the unobserved (ascertainment) patterns
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    // pattern count rounded up so that whole vectors can be processed
+    size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
+    maxptn = max(maxptn, aln->size()+((model_factory->unobserved_ptns.size()+VCSIZE-1)/VCSIZE)*VCSIZE);
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+    // vc_val[c*nstates/VCSIZE+i] = exp(eval * rate[c] * length) * prop[c]
+    VectorClass *vc_val = (VectorClass*)aligned_alloc<double>(block);
+
+
+	for (c = 0; c < ncat; c++) {
+		double len = site_rate->getRate(c)*dad_branch->length;
+		VectorClass vc_len(len);
+		VectorClass vc_prop(site_rate->getProp(c));
+		for (i = 0; i < nstates/VCSIZE; i++) {
+			// eval is not aligned!
+			// NOTE(review): the comment above contradicts the aligned load_a() below - confirm the alignment of eval
+			vc_val[c*nstates/VCSIZE+i] = exp(VectorClass().load_a(&eval[i*VCSIZE]) * vc_len) * vc_prop;
+		}
+	}
+
+	// summed likelihood of the unobserved patterns (ascertainment bias correction)
+	double prob_const = 0.0;
+
+	if (dad->isLeaf()) {
+    	// special treatment for TIP-INTERNAL NODE case
+    	VectorClass vc_tip_partial_lh[nstates];
+    	VectorClass vc_partial_lh_dad[VCSIZE], vc_ptn[VCSIZE];
+    	VectorClass lh_final(0.0), vc_freq;
+		VectorClass lh_ptn; // store likelihoods of VCSIZE consecutive patterns
+
+    	// per-pattern pointer to the tip likelihood vector of the state observed at dad
+    	double **lh_states_dad = aligned_alloc<double*>(maxptn);
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		lh_states_dad[ptn] = &tip_partial_lh[(aln->at(ptn))[dad->id] * nstates];
+    	for (ptn = orig_nptn; ptn < nptn; ptn++)
+    		lh_states_dad[ptn] = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * nstates];
+    	// initialize beyond #patterns for efficiency
+    	for (ptn = nptn; ptn < maxptn; ptn++)
+    		lh_states_dad[ptn] = &tip_partial_lh[aln->STATE_UNKNOWN * nstates];
+
+		// copy dummy values because VectorClass will access beyond nptn
+		for (ptn = nptn; ptn < maxptn; ptn++)
+			memcpy(&dad_branch->partial_lh[ptn*block], dad_branch->partial_lh, block*sizeof(double));
+
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, j, vc_tip_partial_lh, vc_partial_lh_dad, vc_ptn, vc_freq, lh_ptn)
+    {
+    	// per-thread accumulator, merged in the critical section below
+    	VectorClass lh_final_th = 0.0;
+#pragma omp for nowait
+#endif
+   		// main loop over all patterns with a step size of VCSIZE
+		for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+
+			// initialize vc_tip_partial_lh
+			for (j = 0; j < VCSIZE; j++) {
+				double *lh_dad = lh_states_dad[ptn+j];
+				for (i = 0; i < nstates/VCSIZE; i++) {
+					vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load_a(&lh_dad[i*VCSIZE]);
+				}
+				vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
+				vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
+			}
+
+			// compute vc_ptn
+			// the tip vector only spans nstates entries, hence the modulo to reuse it per category
+			for (i = 1; i < block/VCSIZE; i++)
+				for (j = 0; j < VCSIZE; j++) {
+					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
+					vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
+							vc_partial_lh_dad[j], vc_ptn[j]);
+				}
+
+			vc_freq.load_a(&ptn_freq[ptn]);
+			// lane k of lh_ptn is the likelihood of pattern ptn+k
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+			lh_ptn = log(abs(lh_ptn));
+			lh_ptn.store_a(&_pattern_lh[ptn]);
+
+			// multiply with pattern frequency
+#ifdef _OPENMP
+			lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
+#else
+			lh_final = mul_add(lh_ptn, vc_freq, lh_final);
+#endif
+		}
+
+#ifdef _OPENMP
+#pragma omp critical
+		{
+			lh_final += lh_final_th;
+    	}
+    }
+#endif
+		tree_lh += horizontal_add(lh_final);
+        if (isnan(tree_lh) || isinf(tree_lh)) {
+            // report the offending sites, then recompute tree_lh with the
+            // non-finite pattern log-likelihoods replaced by a fixed value
+            cout << "WARNING: Numerical underflow caused by alignment sites";
+            i = aln->getNSite();
+            for (j = 0; j < i; j++) {
+                ptn = aln->getPatternID(j);
+                if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                	cout << " " << j+1;
+                }
+            }
+            tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+            for (ptn = 0; ptn < orig_nptn; ptn++) {
+                if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                	_pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
+                }
+            	tree_lh += _pattern_lh[ptn] * ptn_freq[ptn];
+            }
+            cout << endl;
+//            cout << "WARNING: Tree log-likelihood is set to " << tree_lh << endl;
+        }
+
+		// ascertainment bias correction
+		if (orig_nptn < nptn) {
+			lh_final = 0.0;
+			lh_ptn = 0.0;
+			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+				double *partial_lh_dad = &dad_branch->partial_lh[ptn*block];
+				// add the previous batch; the last batch is added lane by lane in the switch below
+				lh_final += lh_ptn;
+
+				// initialize vc_tip_partial_lh
+				for (j = 0; j < VCSIZE; j++) {
+					double *lh_dad = lh_states_dad[ptn+j];
+					for (i = 0; i < nstates/VCSIZE; i++) {
+						vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load(&lh_dad[i*VCSIZE]); // lh_dad is not aligned!
+					}
+					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
+					vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
+				}
+
+				// compute vc_ptn
+				for (i = 1; i < block/VCSIZE; i++)
+					for (j = 0; j < VCSIZE; j++) {
+						vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
+						vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
+								vc_partial_lh_dad[j], vc_ptn[j]);
+					}
+				// ptn_invar[ptn] is not aligned
+				lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+			}
+			// only the first (nptn-orig_nptn)%VCSIZE lanes of the last batch are
+			// real unobserved patterns (all lanes when the remainder is 0)
+			switch ((nptn-orig_nptn)%VCSIZE) {
+			case 0: prob_const = horizontal_add(lh_final+lh_ptn); break;
+			case 1: prob_const = horizontal_add(lh_final)+lh_ptn[0]; break;
+			case 2: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]; break;
+			case 3: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2]; break;
+			default: assert(0); break;
+			}
+		}
+		aligned_free(lh_states_dad);
+    } else {
+    	// both dad and node are internal nodes
+    	VectorClass vc_partial_lh_node[VCSIZE];
+    	VectorClass vc_partial_lh_dad[VCSIZE], vc_ptn[VCSIZE];
+    	VectorClass lh_final(0.0), vc_freq;
+		VectorClass lh_ptn;
+
+		// copy dummy values because VectorClass will access beyond nptn
+		for (ptn = nptn; ptn < maxptn; ptn++) {
+			memcpy(&dad_branch->partial_lh[ptn*block], dad_branch->partial_lh, block*sizeof(double));
+			memcpy(&node_branch->partial_lh[ptn*block], node_branch->partial_lh, block*sizeof(double));
+		}
+
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, j, vc_partial_lh_node, vc_partial_lh_dad, vc_ptn, vc_freq, lh_ptn)
+		{
+		// per-thread accumulator, merged in the critical section below
+		VectorClass lh_final_th = 0.0;
+#pragma omp for nowait
+#endif
+		for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_node = node_branch->partial_lh + ptn*block;
+
+			for (j = 0; j < VCSIZE; j++)
+				vc_ptn[j] = 0.0;
+
+			// vc_ptn[j] accumulates val * node * dad over the block of pattern ptn+j
+			for (i = 0; i < block; i+=VCSIZE) {
+				for (j = 0; j < VCSIZE; j++) {
+					vc_partial_lh_node[j].load_a(&partial_lh_node[i+j*block]);
+					vc_partial_lh_dad[j].load_a(&partial_lh_dad[i+j*block]);
+					vc_ptn[j] = mul_add(vc_val[i/VCSIZE] * vc_partial_lh_node[j], vc_partial_lh_dad[j], vc_ptn[j]);
+				}
+			}
+
+			vc_freq.load_a(&ptn_freq[ptn]);
+
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+
+			lh_ptn = log(abs(lh_ptn));
+			lh_ptn.store_a(&_pattern_lh[ptn]);
+#ifdef _OPENMP
+			lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
+#else
+			lh_final = mul_add(lh_ptn, vc_freq, lh_final);
+#endif
+		}
+#ifdef _OPENMP
+#pragma omp critical
+		{
+			lh_final += lh_final_th;
+		}
+	}
+#endif
+
+		tree_lh += horizontal_add(lh_final);
+		// NOTE(review): unlike the tip case above, a non-finite value only trips this assertion here
+		assert(!isnan(tree_lh) && !isinf(tree_lh));
+
+		if (orig_nptn < nptn) {
+			// ascertainment bias correction
+			lh_final = 0.0;
+			lh_ptn = 0.0;
+			double *partial_lh_node = &node_branch->partial_lh[orig_nptn*block];
+			double *partial_lh_dad = &dad_branch->partial_lh[orig_nptn*block];
+
+			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+				// add the previous batch; the last batch is added lane by lane in the switch below
+				lh_final += lh_ptn;
+
+				for (j = 0; j < VCSIZE; j++)
+					vc_ptn[j] = 0.0;
+
+				for (i = 0; i < block; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						vc_partial_lh_node[j].load_a(&partial_lh_node[i+j*block]);
+						vc_partial_lh_dad[j].load_a(&partial_lh_dad[i+j*block]);
+						vc_ptn[j] = mul_add(vc_val[i/VCSIZE] * vc_partial_lh_node[j], vc_partial_lh_dad[j], vc_ptn[j]);
+					}
+				}
+
+				// ptn_invar[ptn] is not aligned
+				lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+				partial_lh_node += block*VCSIZE;
+				partial_lh_dad += block*VCSIZE;
+			}
+			// only the first (nptn-orig_nptn)%VCSIZE lanes of the last batch are
+			// real unobserved patterns (all lanes when the remainder is 0)
+			switch ((nptn-orig_nptn)%VCSIZE) {
+			case 0: prob_const = horizontal_add(lh_final+lh_ptn); break;
+			case 1: prob_const = horizontal_add(lh_final)+lh_ptn[0]; break;
+			case 2: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]; break;
+			case 3: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2]; break;
+			default: assert(0); break;
+			}
+		}
+    }
+
+	if (orig_nptn < nptn) {
+    	// ascertainment bias correction
+    	// subtract log(1 - prob_const) from every pattern and once per site from the total
+    	prob_const = log(1.0 - prob_const);
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+    	tree_lh -= aln->getNSite()*prob_const;
+    }
+
+    aligned_free(vc_val);
+    return tree_lh;
+}
+
+/**
+ * Compute the tree log-likelihood from the cached theta_all buffer instead of
+ * the partial likelihood vectors. theta_all must already be computed (this is
+ * asserted). The branch length is taken from current_it and the scaling
+ * factors from current_it and current_it_back.
+ *
+ * Side effects: writes the per-pattern log-likelihoods into _pattern_lh and
+ * prints a warning to cout when the total is NaN or infinite.
+ *
+ * @return the log-likelihood, including the ascertainment bias correction
+ *         when unobserved patterns exist
+ */
+template <class VectorClass, const int VCSIZE, const int nstates>
+double PhyloTree::computeLikelihoodFromBufferEigenSIMD() {
+
+
+	assert(theta_all && theta_computed);
+
+	double tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+
+    size_t ncat = site_rate->getNRate();
+    size_t block = ncat * nstates; // number of doubles per pattern
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, j;
+    size_t orig_nptn = aln->size();
+    // observed patterns followed by the unobserved (ascertainment) patterns
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+//    size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+	// vc_val0[c*nstates/VCSIZE+i] = exp(eval * rate[c] * length) * prop[c]
+	VectorClass *vc_val0 = (VectorClass*)aligned_alloc<double>(block);
+
+	VectorClass vc_len = current_it->length;
+	for (c = 0; c < ncat; c++) {
+		VectorClass vc_rate = site_rate->getRate(c);
+		VectorClass vc_prop = site_rate->getProp(c);
+		for (i = 0; i < nstates/VCSIZE; i++) {
+			VectorClass cof = VectorClass().load_a(&eval[i*VCSIZE]) * vc_rate;
+			VectorClass val = exp(cof*vc_len) * vc_prop;
+			vc_val0[c*nstates/VCSIZE+i] = val;
+		}
+	}
+
+	VectorClass vc_ptn[VCSIZE];
+	VectorClass vc_freq;
+	VectorClass lh_final = 0.0;
+	// this stores the values of VCSIZE consecutive patterns
+	VectorClass lh_ptn;
+
+	// process VCSIZE patterns at the same time for SSE/AVX efficiency
+
+#ifdef _OPENMP
+#pragma omp parallel private (ptn, i, j, vc_freq, vc_ptn, lh_ptn)
+	{
+	// per-thread accumulator, merged in the critical section below
+	VectorClass lh_final_th = 0.0;
+#pragma omp for nowait
+#endif
+	for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+		double *theta = theta_all + ptn*block;
+		// initialization
+		for (i = 0; i < VCSIZE; i++) {
+			vc_ptn[i] = vc_val0[0] * VectorClass().load_a(theta+i*block);
+		}
+
+		// vc_ptn[j] accumulates theta * val0 over the block of pattern ptn+j
+		for (i = 1; i < block/VCSIZE; i++) {
+			for (j = 0; j < VCSIZE; j++) {
+				vc_ptn[j] = mul_add(VectorClass().load_a(&theta[i*VCSIZE+j*block]), vc_val0[i], vc_ptn[j]);
+			}
+		}
+		// lane k of lh_ptn is the likelihood of pattern ptn+k
+		lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+		lh_ptn = log(abs(lh_ptn));
+		lh_ptn.store_a(&_pattern_lh[ptn]);
+		vc_freq.load_a(&ptn_freq[ptn]);
+
+#ifdef _OPENMP
+		lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
+#else
+		lh_final = mul_add(lh_ptn, vc_freq, lh_final);
+#endif
+
+	}
+
+#ifdef _OPENMP
+#pragma omp critical
+	{
+		lh_final += lh_final_th;
+	}
+}
+#endif
+	tree_lh += horizontal_add(lh_final);
+    if (isnan(tree_lh) || isinf(tree_lh)) {
+        // report at most 10 offending sites, then recompute tree_lh with the
+        // non-finite pattern log-likelihoods replaced by a fixed value
+        cout << "WARNING: Numerical underflow caused by alignment sites";
+        i = aln->getNSite();
+        for (j = 0, c = 0; j < i; j++) {
+            ptn = aln->getPatternID(j);
+            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                cout << " " << j+1;
+                c++;
+                if (c >= 10) {
+                    cout << " ...";
+                    break;
+                }
+            }
+        }
+        cout << endl;
+        tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+        for (ptn = 0; ptn < orig_nptn; ptn++) {
+            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
+            }
+            tree_lh += _pattern_lh[ptn] * ptn_freq[ptn];
+        }
+    }
+
+	if (orig_nptn < nptn) {
+		// ascertainment bias correction
+		lh_final = 0.0;
+		lh_ptn = 0.0;
+		double prob_const;// df_const, ddf_const;
+		double *theta = &theta_all[orig_nptn*block];
+		for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+			// add the previous batch; the last batch is added lane by lane in the switch below
+			lh_final += lh_ptn;
+
+			// initialization
+			for (i = 0; i < VCSIZE; i++) {
+				vc_ptn[i] = vc_val0[0] * VectorClass().load_a(theta+i*block);
+			}
+
+			for (i = 1; i < block/VCSIZE; i++) {
+				for (j = 0; j < VCSIZE; j++) {
+					vc_ptn[j] = mul_add(VectorClass().load_a(&theta[i*VCSIZE+j*block]), vc_val0[i], vc_ptn[j]);
+				}
+			}
+			theta += block*VCSIZE;
+
+			// ptn_invar[ptn] is not aligned
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+
+		}
+		// only the first (nptn-orig_nptn) % VCSIZE lanes of the last batch are
+		// real unobserved patterns (all lanes when the remainder is 0)
+		switch ((nptn-orig_nptn) % VCSIZE) {
+		case 0:
+			prob_const = horizontal_add(lh_final+lh_ptn);
+			break;
+		case 1:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0];
+			break;
+		case 2:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1];
+			break;
+		case 3:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2];
+			break;
+		default:
+			assert(0);
+			break;
+		}
+		// subtract log(1 - prob_const) once per site from the total and from every pattern
+    	prob_const = log(1.0 - prob_const);
+    	tree_lh -= aln->getNSite() * prob_const;
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+	}
+
+    aligned_free(vc_val0);
+
+    return tree_lh;
+}
+
+/****************************************************************************
+        Highly optimized Parsimony function
+ ****************************************************************************/
+
+// Portable 32-byte alignment for local buffers: MSVC takes the alignment
+// specifier before the declaration, other compilers take an attribute after it.
+// Usage: MEM_ALIGN_BEGIN type name[n] MEM_ALIGN_END;
+#ifdef _MSC_VER
+	#define MEM_ALIGN_BEGIN __declspec(align(32))
+	#define MEM_ALIGN_END
+#else
+	#define MEM_ALIGN_BEGIN
+	#define MEM_ALIGN_END __attribute__((aligned(32)))
+#endif
+
+/**
+ * Spill the four 32-bit lanes of x into an aligned buffer and return the
+ * result of popcount_lauradoux() over those four words.
+ */
+inline UINT fast_popcount(Vec4ui &x) {
+    MEM_ALIGN_BEGIN UINT lanes[4] MEM_ALIGN_END;
+    x.store_a(lanes);
+    const UINT bit_total = popcount_lauradoux(lanes, 4);
+    return bit_total;
+}
+
+/**
+ * Return the number of set bits in the 256-bit vector x. The vector is
+ * spilled as four 64-bit words, each word is counted with the hardware
+ * popcnt instruction, and the four counts are summed.
+ */
+inline UINT fast_popcount(Vec8ui &x) {
+    // view the 8 x 32-bit lanes as 4 x 64-bit words
+    MEM_ALIGN_BEGIN uint64_t words[4] MEM_ALIGN_END;
+    x.store_a(words);
+#if defined (__GNUC__) || defined(__clang__)
+    MEM_ALIGN_BEGIN uint64_t counts[4] MEM_ALIGN_END;
+    for (int k = 0; k < 4; k++) {
+        __asm("popcntq %1, %0" : "=r"(counts[k]) : "r"(words[k]) : );
+    }
+    // each 64-bit count fills two 32-bit lanes, the upper one being zero
+    Vec8ui packed;
+    packed.load_a(counts);
+    return horizontal_add(packed);
+#else
+    MEM_ALIGN_BEGIN int counts[4] MEM_ALIGN_END;
+    for (int k = 0; k < 4; k++) {
+        counts[k] = _mm_popcnt_u64(words[k]);
+    }
+    Vec4ui packed;
+    packed.load_a(counts);
+    return horizontal_add(packed);
+#endif
+}
+
+
+/**
+ * Replace every 32-bit lane of x, in place, by vml_popcnt() of that lane.
+ */
+inline void horizontal_popcount(Vec4ui &x) {
+    MEM_ALIGN_BEGIN UINT lanes[4] MEM_ALIGN_END;
+    x.store_a(lanes);
+    for (int k = 0; k < 4; k++) {
+        lanes[k] = vml_popcnt(lanes[k]);
+    }
+    x.load_a(lanes);
+}
+
+/**
+ * Replace every 32-bit lane of x, in place, by vml_popcnt() of that lane.
+ */
+inline void horizontal_popcount(Vec8ui &x) {
+    MEM_ALIGN_BEGIN UINT lanes[8] MEM_ALIGN_END;
+    x.store_a(lanes);
+    for (int k = 0; k < 8; k++) {
+        lanes[k] = vml_popcnt(lanes[k]);
+    }
+    x.load_a(lanes);
+}
+
+template<class VectorClass>
+void PhyloTree::computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    if (dad_branch->partial_lh_computed & 2)
+        return;
+    Node *node = dad_branch->node;
+    int nstates = aln->num_states;
+    int site;
+    const int VCSIZE = VectorClass::size();
+    const int NUM_BITS = VectorClass::size() * UINT_BITS;
+
+    dad_branch->partial_lh_computed |= 2;
+
+    if (node->isLeaf() && dad) {
+        // external node
+        if (aln->ordered_pattern.empty())
+            aln->orderPatternByNumChars();
+        int leafid = node->id;
+        int pars_size = getBitsBlockSize();
+        memset(dad_branch->partial_pars, 0, pars_size*sizeof(UINT));
+//        int ptn;
+//        int nptn = aln->size();
+    	int ambi_aa[] = {2, 3, 5, 6, 9, 10}; // {4+8, 32+64, 512+1024};
+//        int max_sites = ((aln->num_informative_sites+UINT_BITS-1)/UINT_BITS)*UINT_BITS;
+//        UINT *x = dad_branch->partial_pars - (nstates*VCSIZE);
+        UINT *x = dad_branch->partial_pars;
+        Alignment::iterator pat;
+    	switch (aln->seq_type) {
+    	case SEQ_DNA:
+            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
+            	int state = pat->at(leafid);
+                int freq = pat->frequency;
+                if (state < 4) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        if (site == NUM_BITS) {
+                            x += 4*VCSIZE;
+                            site = 0;
+                        }
+                        x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
+                    }
+                } else if (state == aln->STATE_UNKNOWN) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        if (site == NUM_BITS) {
+                            x += 4*VCSIZE;
+                            site = 0;
+                        }
+                        UINT bit1 = (1 << (site%UINT_BITS));
+                        UINT *p = x+(site/UINT_BITS);
+                        p[0] |= bit1;
+                        p[VCSIZE] |= bit1;
+                        p[2*VCSIZE] |= bit1;
+                        p[3*VCSIZE] |= bit1;
+                    }
+                } else {
+                	state -= 3;
+                    for (int j = 0; j < freq; j++, site++) {
+                        if (site == NUM_BITS) {
+                            x += 4*VCSIZE;
+                            site = 0;
+                        }
+                        UINT *p = x + ((site/UINT_BITS));
+                        
+                        UINT bit1 = (1 << (site%UINT_BITS));
+                        for (int i = 0; i < 4; i++)
+                            if (state & (1<<i))
+                                p[i*VCSIZE] |= bit1;
+                    }
+                }
+            }
+    		break;
+    	case SEQ_PROTEIN:
+            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
+            	int state = pat->at(leafid);
+                int freq = pat->frequency;
+                if (state < 20) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        if (site == NUM_BITS) {
+                            x += 20*VCSIZE;
+                            site = 0;
+                        }
+                        x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
+                    }
+                } else if (state == aln->STATE_UNKNOWN) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        if (site == NUM_BITS) {
+                            x += 20*VCSIZE;
+                            site = 0;
+                        }
+                        UINT bit1 = (1 << (site%UINT_BITS));
+                        UINT *p = x+(site/UINT_BITS);
+                        for (int i = 0; i < 20; i++)
+                            p[i*VCSIZE] |= bit1;
+                    }
+                } else {
+                	assert(state < 23);
+            		state = (state-20)*2;
+                    for (int j = 0; j < freq; j++, site++) {
+                        if (site == NUM_BITS) {
+                            x += 20*VCSIZE;
+                            site = 0;
+                        }
+                        UINT *p = x + ((site/UINT_BITS));
+                        UINT bit1 = (1 << (site%UINT_BITS));
+
+                        p[ambi_aa[state]*VCSIZE] |= bit1;
+                        p[ambi_aa[state+1]*VCSIZE] |= bit1;
+                    }
+                }
+            }
+    		break;
+    	default:
+            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
+            	int state = pat->at(leafid);
+                int freq = pat->frequency;
+                if (state < nstates) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        if (site == NUM_BITS) {
+                            x += nstates*VCSIZE;
+                            site = 0;
+                        }
+                        x[state*VCSIZE + site/UINT_BITS] |= (1 << (site % UINT_BITS));
+                    }
+                } else if (state == aln->STATE_UNKNOWN) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        if (site == NUM_BITS) {
+                            x += nstates*VCSIZE;
+                            site = 0;
+                        }
+                        UINT bit1 = (1 << (site%UINT_BITS));
+                        UINT *p = x+(site/UINT_BITS);
+                        for (int i = 0; i < nstates; i++)
+                            p[i*VCSIZE] |= bit1;
+                    }
+                } else {
+                	assert(0);
+                }
+            }
+    		break;
+    	}
+        // add dummy states
+        if (site > 0) {
+            x += site/UINT_BITS;
+        	*x |= ~((1<<(site%UINT_BITS)) - 1);
+            x++;
+            int max_sites = ((site+UINT_BITS-1)/UINT_BITS);
+            memset(x, 255, (VCSIZE - max_sites)*sizeof(UINT));
+        }
+    } else {
+        // internal node
+        assert(node->degree() == 3); // it works only for strictly bifurcating tree
+        PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+        FOR_NEIGHBOR_IT(node, dad, it) {
+            PhyloNeighbor* pit = (PhyloNeighbor*) (*it);
+            if ((*it)->node->name != ROOT_NAME && (pit->partial_lh_computed & 2) == 0) {
+                computePartialParsimonyFastSIMD<VectorClass>(pit, (PhyloNode*) node);
+            }
+            if (!left) left = pit; else right = pit;
+        }
+//        VectorClass score = 0;
+        UINT score = 0;
+        int nsites = (aln->num_informative_sites+NUM_BITS-1)/NUM_BITS;
+        int entry_size = nstates * VCSIZE;
+        
+        switch (nstates) {
+        case 4:
+            #ifdef _OPENMP
+            #pragma omp parallel for private (site) reduction(+: score) if(nsites>200)
+            #endif
+			for (site = 0; site<nsites; site++) {
+                size_t offset = 4*VCSIZE*site;
+                VectorClass *x = (VectorClass*)(left->partial_pars + offset);
+                VectorClass *y = (VectorClass*)(right->partial_pars + offset);
+                VectorClass *z = (VectorClass*)(dad_branch->partial_pars + offset);
+                z[0] = x[0] & y[0];
+                z[1] = x[1] & y[1];
+                z[2] = x[2] & y[2];
+                z[3] = x[3] & y[3];
+                VectorClass w = z[0] | z[1] | z[2] | z[3];
+				w = ~w;
+                z[0] |= w & (x[0] | y[0]);
+                z[1] |= w & (x[1] | y[1]);
+                z[2] |= w & (x[2] | y[2]);
+                z[3] |= w & (x[3] | y[3]);
+//				horizontal_popcount(w);
+//                score += w;
+                score += fast_popcount(w);
+//                x += 4;
+//                y += 4;
+//                z += 4;
+			}
+
+			break;
+        default:
+            #ifdef _OPENMP
+            #pragma omp parallel for private (site) reduction(+: score) if(nsites > 800/nstates)
+            #endif
+			for (site = 0; site<nsites; site++) {
+                size_t offset = entry_size*site;
+                VectorClass *x = (VectorClass*)(left->partial_pars + offset);
+                VectorClass *y = (VectorClass*)(right->partial_pars + offset);
+                VectorClass *z = (VectorClass*)(dad_branch->partial_pars + offset);
+				int i;
+				VectorClass w = 0;
+				for (i = 0; i < nstates; i++) {
+                    z[i] = x[i] & y[i];
+                    w |= z[i];
+				}
+				w = ~w;
+				for (i = 0; i < nstates; i++) {
+                    z[i] |= w & (x[i] | y[i]);
+				}
+//				horizontal_popcount(w);
+//                score += w;
+                score += fast_popcount(w);
+                x += nstates;
+                y += nstates;
+                z += nstates;
+			}
+			break;
+        }
+//        UINT sum_score = horizontal_add(score); 
+//        UINT *zscore = (UINT*)z;
+//        UINT *xscore = (UINT*)x;
+//        UINT *yscore = (UINT*)y;
+        dad_branch->partial_pars[nstates*VCSIZE*nsites] = score + left->partial_pars[nstates*VCSIZE*nsites] + right->partial_pars[nstates*VCSIZE*nsites];
+    }
+}
+
+/**
+ * Parsimony score of the tree, evaluated on the branch between dad and dad_branch->node.
+ * Partial parsimony vectors missing at either end of the branch are computed on demand.
+ * @param dad_branch neighbor of dad pointing to the node at the other end of the branch
+ * @param dad node at this end of the branch
+ * @param branch_subst (OUT) if not NULL, receives the score contributed by this branch
+ *        alone, i.e. the total minus the two scores stored with the partial vectors
+ * @return the accumulated score. Without OpenMP and with branch_subst == NULL the scan
+ *         stops once the running score reaches best_pars_score, so it may be partial.
+ */
+template<class VectorClass>
+int PhyloTree::computeParsimonyBranchFastSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
+    PhyloNode *child = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *back_branch = (PhyloNeighbor*) child->findNeighbor(dad);
+    assert(back_branch);
+    if (!central_partial_pars)
+        initializeAllPartialPars();
+    // flag value 2 in partial_lh_computed marks an up-to-date parsimony vector
+    if (!(dad_branch->partial_lh_computed & 2))
+        computePartialParsimonyFastSIMD<VectorClass>(dad_branch, dad);
+    if (!(back_branch->partial_lh_computed & 2))
+        computePartialParsimonyFastSIMD<VectorClass>(back_branch, child);
+
+    int blk; // index of the current group of NUM_BITS sites
+    int nstates = aln->num_states;
+    const int NUM_BITS = VectorClass::size() * UINT_BITS;
+    int nblocks = (aln->num_informative_sites + NUM_BITS - 1)/NUM_BITS;
+    int entry_size = nstates * VectorClass::size(); // UINTs occupied by one group
+
+    // each vector keeps its own score right behind the last group of sites
+    int scoreid = nblocks*entry_size;
+    UINT sum_end_node = dad_branch->partial_pars[scoreid] + back_branch->partial_pars[scoreid];
+    UINT score = sum_end_node;
+    // the early exit is effectively disabled when the caller wants the per-branch count
+    UINT lower_bound = INT_MAX;
+    if (!branch_subst)
+        lower_bound = best_pars_score;
+
+    if (nstates == 4) {
+        // unrolled variant for 4-state data
+        #ifdef _OPENMP
+        #pragma omp parallel for private (blk) reduction(+: score) if(nblocks>200)
+        #endif
+        for (blk = 0; blk < nblocks; blk++) {
+            size_t offset = entry_size*blk;
+            VectorClass *vdad = (VectorClass*)(dad_branch->partial_pars + offset);
+            VectorClass *vnode = (VectorClass*)(back_branch->partial_pars + offset);
+            // a bit stays set where the two ends share at least one state
+            VectorClass common = (vdad[0] & vnode[0]) | (vdad[1] & vnode[1])
+                               | (vdad[2] & vnode[2]) | (vdad[3] & vnode[3]);
+            common = ~common;
+            score += fast_popcount(common);
+            #ifndef _OPENMP
+            if (score >= lower_bound)
+                break;
+            #endif
+        }
+    } else {
+        // generic variant for any number of states
+        #ifdef _OPENMP
+        #pragma omp parallel for private (blk) reduction(+: score) if(nblocks > 800/nstates)
+        #endif
+        for (blk = 0; blk < nblocks; blk++) {
+            size_t offset = entry_size*blk;
+            VectorClass *vdad = (VectorClass*)(dad_branch->partial_pars + offset);
+            VectorClass *vnode = (VectorClass*)(back_branch->partial_pars + offset);
+            VectorClass common = vdad[0] & vnode[0];
+            for (int s = 1; s < nstates; s++)
+                common |= vdad[s] & vnode[s];
+            common = ~common;
+            score += fast_popcount(common);
+            #ifndef _OPENMP
+            if (score >= lower_bound)
+                break;
+            #endif
+        }
+    }
+
+    if (branch_subst)
+        *branch_subst = score - sum_end_node;
+    return score;
+}
+
+
+#endif /* PHYLOKERNEL_H_ */
diff --git a/phylokernelmixrate.h b/phylokernelmixrate.h
new file mode 100644
index 0000000..44e27f0
--- /dev/null
+++ b/phylokernelmixrate.h
@@ -0,0 +1,1113 @@
+/*
+ * phylokernelmixrate.h
+ *
+ *  Created on: Jan 7, 2015
+ *      Author: minh
+ */
+
+#ifndef PHYLOKERNELMIXRATE_H_
+#define PHYLOKERNELMIXRATE_H_
+
+#include "model/modelmixture.h"
+
+
+
+/************************************************************************************************
+ *
+ *   Highly optimized vectorized versions of likelihood functions
+ *
+ *************************************************************************************************/
+// Fills dad_branch->partial_lh with the partial likelihood vector (stored after multiplication with the inverse eigenvectors)
+// of the subtree behind dad_branch, recursing into both child branches first. Rate category c uses the c-th eigen-decomposition.
+template <class VectorClass, const int VCSIZE, const int nstates> // asserted below: VCSIZE == VectorClass().size(), nstates == aln->num_states
+void PhyloTree::computeMixratePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    // flag value 1 in partial_lh_computed marks an up-to-date vector: don't recompute the likelihood
+	assert(dad);
+    if (dad_branch->partial_lh_computed & 1)
+        return;
+    dad_branch->partial_lh_computed |= 1;
+
+    size_t nptn = aln->size() + model_factory->unobserved_ptns.size(); // alignment patterns followed by the unobserved patterns
+    PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+	if (node->isLeaf()) {
+	    dad_branch->lh_scale_factor = 0.0;
+	    //memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+
+		if (!tip_partial_lh_computed)
+			computeTipPartialLikelihood();
+		return; // nothing is written to dad_branch->partial_lh for a leaf; the tip cases below read tip_partial_lh instead
+	}
+
+    size_t ptn, c;
+    size_t orig_ntn = aln->size(); // number of patterns taken from the alignment itself
+
+    size_t ncat = site_rate->getNRate();
+    assert(ncat == model->getNMixtures());
+    assert(nstates == aln->num_states && nstates >= VCSIZE && VCSIZE == VectorClass().size());
+    assert(model->isReversible()); // only works with reversible model!
+    const size_t nstatesqr=nstates*nstates;
+    size_t i, x, j;
+    size_t block = nstates * ncat; // doubles per pattern: nstates values for each of the ncat categories
+
+	// internal node
+	assert(node->degree() == 3); // it works only for strictly bifurcating tree
+	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+	}
+	// if exactly one child is a leaf, make it the left one (the TIP-INTERNAL branch below tests for exactly that order)
+	if (!left->node->isLeaf() && right->node->isLeaf()) {
+		// swap left and right
+		PhyloNeighbor *tmp = left;
+		left = right;
+		right = tmp;
+	}
+	if ((left->partial_lh_computed & 1) == 0)
+		computeMixratePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(left, node);
+	if ((right->partial_lh_computed & 1) == 0)
+		computeMixratePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(right, node);
+
+    if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
+        // re-orient partial_lh: take over the buffers of a child's back-branch (the one pointing to this node) and invalidate that branch
+        bool done = false;
+        FOR_NEIGHBOR_IT(node, dad, it2) {
+            PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
+            if (backnei->partial_lh) {
+                dad_branch->partial_lh = backnei->partial_lh;
+                dad_branch->scale_num = backnei->scale_num;
+                backnei->partial_lh = NULL;
+                backnei->scale_num = NULL;
+                backnei->partial_lh_computed &= ~1; // clear bit
+                done = true;
+                break;
+            }
+        }
+        assert(done && "partial_lh is not re-oriented");
+    }
+
+	double *evec = model->getEigenvectors();
+	double *inv_evec = model->getInverseEigenvectors();
+
+	VectorClass *vc_inv_evec = aligned_alloc<VectorClass>(ncat*nstates*nstates/VCSIZE); // inv_evec repacked as vectors, released at the end
+	assert(inv_evec && evec);
+	for (c = 0; c < ncat; c++)
+	for (i = 0; i < nstates; i++) {
+		for (x = 0; x < nstates/VCSIZE; x++)
+			// NOTE(review): this comment used to read "inv_evec is not aligned!", yet load_a (not load) is used - confirm inv_evec alignment
+			vc_inv_evec[(c*nstates+i)*nstates/VCSIZE+x].load_a(&inv_evec[c*nstatesqr+i*nstates+x*VCSIZE]);
+	}
+	double *eval = model->getEigenvalues();
+
+	dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
+
+	VectorClass *eleft = (VectorClass*)aligned_alloc<double>(block*nstates);
+	VectorClass *eright = (VectorClass*)aligned_alloc<double>(block*nstates);
+
+	// precompute, per category c: eleft/eright(x,k) = evec_c(x,k) * exp(eval_c(k) * rate_c * length of the left/right branch)
+	for (c = 0; c < ncat; c++) {
+		VectorClass vc_evec;
+		VectorClass expleft[nstates/VCSIZE];
+		VectorClass expright[nstates/VCSIZE];
+		double len_left = site_rate->getRate(c) * left->length;
+		double len_right = site_rate->getRate(c) * right->length;
+		for (i = 0; i < nstates/VCSIZE; i++) {
+			// NOTE(review): this comment used to read "eval is not aligned!", yet load_a (not load) is used - confirm eval alignment
+			expleft[i] = exp(VectorClass().load_a(&eval[c*nstates+i*VCSIZE]) * VectorClass(len_left));
+			expright[i] = exp(VectorClass().load_a(&eval[c*nstates+i*VCSIZE]) * VectorClass(len_right));
+		}
+		for (x = 0; x < nstates; x++)
+			for (i = 0; i < nstates/VCSIZE; i++) {
+				// NOTE(review): this comment used to say evec is not aligned, yet load_a (not load) is used - confirm evec alignment
+				vc_evec.load_a(&evec[c*nstatesqr+x*nstates+i*VCSIZE]);
+				eleft[c*nstatesqr/VCSIZE+x*nstates/VCSIZE+i] = (vc_evec * expleft[i]);
+				eright[c*nstatesqr/VCSIZE+x*nstates/VCSIZE+i] = (vc_evec * expright[i]);
+			}
+	}
+
+	if (left->node->isLeaf() && right->node->isLeaf()) {
+		// special treatment for TIP-TIP (cherry) case
+
+		// pre compute information for both tips: one lookup table per tip with a block-sized row per state code (row s starts at s*block)
+		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+		double *partial_lh_right = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+		// only rows of the states listed in aln->seq_states for the tip are filled: row(s, c) = eleft_c (resp. eright_c) times tip_partial_lh(s, c)
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+			VectorClass vleft[VCSIZE];
+			for (c = 0; c < ncat; c++) {
+				size_t addrtip = state*block+c*nstates;
+				for (i = 0; i < nstates/VCSIZE; i++)
+					vc_partial_lh_tmp[i].load_a(&tip_partial_lh[addrtip+i*VCSIZE]);
+				for (x = 0; x < nstates; x+=VCSIZE) {
+					size_t addr = (c*nstates+x)*nstates/VCSIZE; // first vector of row x of eleft for category c
+					for (j = 0; j < VCSIZE; j++)
+						vleft[j] = eleft[addr+j*nstates/VCSIZE] * vc_partial_lh_tmp[0];
+					for (i = 1; i < nstates/VCSIZE; i++) {
+						for (j = 0; j < VCSIZE; j++)
+							vleft[j] = mul_add(eleft[addr+j*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vleft[j]);
+					}
+					horizontal_add(vleft).store_a(&partial_lh_left[addrtip+x]); // VCSIZE row sums (rows x .. x+VCSIZE-1) stored at once
+				}
+			}
+		}
+
+		for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
+			int state = (*it);
+			VectorClass vright[VCSIZE];
+			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+			for (c = 0; c < ncat; c++) {
+				size_t addrtip = state*block+c*nstates;
+				for (i = 0; i < nstates/VCSIZE; i++)
+					vc_partial_lh_tmp[i].load_a(&tip_partial_lh[addrtip+i*VCSIZE]);
+				for (x = 0; x < nstates; x+=VCSIZE) {
+					size_t addr = (c*nstates+x)*nstates/VCSIZE; // first vector of row x of eright for category c
+					for (j = 0; j < VCSIZE; j++)
+						vright[j] = eright[addr+j*nstates/VCSIZE] * vc_partial_lh_tmp[0];
+					for (i = 1; i < nstates/VCSIZE; i++) {
+						for (j = 0; j < VCSIZE; j++)
+							vright[j] = mul_add(eright[addr+j*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vright[j]);
+					}
+					horizontal_add(vright).store_a(&partial_lh_right[addrtip+x]);
+				}
+			}
+		}
+		// the row of STATE_UNKNOWN is forced to all ones in both tables (replacing whatever the loops above may have stored there)
+		size_t addr_unknown = aln->STATE_UNKNOWN * block;
+		for (x = 0; x < block; x++) {
+			partial_lh_left[addr_unknown+x] = 1.0;
+			partial_lh_right[addr_unknown+x] = 1.0;
+		}
+
+		// assign pointers for left and right partial_lh: per pattern, the table row of the state found at the left/right tip
+		double **lh_left_ptr = aligned_alloc<double*>(nptn);
+		double **lh_right_ptr = aligned_alloc<double*>(nptn);
+		for (ptn = 0; ptn < orig_ntn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+			lh_right_ptr[ptn] = &partial_lh_right[block * (aln->at(ptn))[right->node->id]];
+		}
+		for (ptn = orig_ntn; ptn < nptn; ptn++) { // unobserved patterns: the same entry of unobserved_ptns serves as state code of both tips
+			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+			lh_right_ptr[ptn] = &partial_lh_right[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+		}
+
+		// scale number must be ZERO
+	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+		VectorClass res[VCSIZE];
+
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, c, x, i, j, vc_partial_lh_tmp, res)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+	        double *partial_lh = dad_branch->partial_lh + ptn*block;
+
+	        double *lh_left = lh_left_ptr[ptn];
+	        double *lh_right = lh_right_ptr[ptn];
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				VectorClass *this_inv_evec = &vc_inv_evec[c*nstatesqr/VCSIZE];
+				// element-wise product of the two precomputed tip rows
+				for (x = 0; x < nstates/VCSIZE; x++) {
+					vc_partial_lh_tmp[x] = (VectorClass().load_a(&lh_left[x*VCSIZE]) * VectorClass().load_a(&lh_right[x*VCSIZE]));
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						res[j] = vc_partial_lh_tmp[0] * this_inv_evec[(i+j)*nstates/VCSIZE];
+					}
+					for (x = 1; x < nstates/VCSIZE; x++)
+						for (j = 0; j < VCSIZE; j++) {
+							res[j] = mul_add(vc_partial_lh_tmp[x], this_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+						}
+					horizontal_add(res).store_a(&partial_lh[i]);
+				}
+				// advance all three pointers to the next category
+				lh_left += nstates;
+				lh_right += nstates;
+				partial_lh += nstates;
+			}
+		}
+
+	    aligned_free(lh_left_ptr);
+	    aligned_free(lh_right_ptr);
+		aligned_free(partial_lh_right);
+		aligned_free(partial_lh_left);
+	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
+		// special treatment to TIP-INTERNAL NODE case
+		// only take scale_num from the right subtree
+		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+
+		// pre compute information for left tip: lookup table with a block-sized row per state code (row s starts at s*block)
+		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+			VectorClass vleft[VCSIZE];
+			for (c = 0; c < ncat; c++) {
+				size_t addrtip = state*block+c*nstates;
+				for (i = 0; i < nstates/VCSIZE; i++)
+					vc_partial_lh_tmp[i].load_a(&tip_partial_lh[addrtip+i*VCSIZE]);
+				for (x = 0; x < nstates; x+=VCSIZE) {
+					size_t addr = (c*nstates+x)*nstates/VCSIZE; // first vector of row x of eleft for category c
+					for (j = 0; j < VCSIZE; j++)
+						vleft[j] = eleft[addr+j*nstates/VCSIZE] * vc_partial_lh_tmp[0];
+					for (i = 1; i < nstates/VCSIZE; i++) {
+						for (j = 0; j < VCSIZE; j++)
+							vleft[j] = mul_add(eleft[addr+j*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vleft[j]);
+					}
+					horizontal_add(vleft).store_a(&partial_lh_left[addrtip+x]);
+				}
+			}
+		}
+		// the row of STATE_UNKNOWN is forced to all ones
+		size_t addr_unknown = aln->STATE_UNKNOWN * block;
+		for (x = 0; x < block; x++) {
+			partial_lh_left[addr_unknown+x] = 1.0;
+		}
+
+		// assign pointers for partial_lh_left: per pattern, the table row of the state found at the left tip
+		double **lh_left_ptr = aligned_alloc<double*>(nptn);
+		for (ptn = 0; ptn < orig_ntn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+		}
+		for (ptn = orig_ntn; ptn < nptn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+		}
+
+		double sum_scale = 0.0; // accumulated over all patterns that get rescaled, added to lh_scale_factor after the loop
+		VectorClass vc_lh_right[nstates/VCSIZE];
+		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+		VectorClass res[VCSIZE];
+		VectorClass vc_max; // maximum of partial likelihood, for scaling check
+		VectorClass vright[VCSIZE];
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private (ptn, c, x, i, j, vc_lh_right, vc_partial_lh_tmp, res, vc_max, vright)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+	        double *partial_lh = dad_branch->partial_lh + ptn*block;
+	        double *partial_lh_right = right->partial_lh + ptn*block;
+
+	        double *lh_left = lh_left_ptr[ptn];
+			vc_max = 0.0;
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				VectorClass *this_inv_evec = &vc_inv_evec[c*nstatesqr/VCSIZE];
+
+				for (i = 0; i < nstates/VCSIZE; i++)
+					vc_lh_right[i].load_a(&partial_lh_right[i*VCSIZE]);
+				// right side: eright_c times the stored vector of the right child; left side: precomputed tip row
+				for (x = 0; x < nstates/VCSIZE; x++) {
+					size_t addr = c*nstatesqr/VCSIZE+x*nstates; // first vector of row x*VCSIZE of category c (each row spans nstates/VCSIZE vectors)
+					for (j = 0; j < VCSIZE; j++) {
+						vright[j] = eright[addr+nstates*j/VCSIZE] * vc_lh_right[0];
+					}
+					for (i = 1; i < nstates/VCSIZE; i++)
+						for (j = 0; j < VCSIZE; j++) {
+							vright[j] = mul_add(eright[addr+i+nstates*j/VCSIZE], vc_lh_right[i], vright[j]);
+						}
+					vc_partial_lh_tmp[x] = VectorClass().load_a(&lh_left[x*VCSIZE])
+							* horizontal_add(vright);
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						res[j] = vc_partial_lh_tmp[0] * this_inv_evec[(i+j)*nstates/VCSIZE];
+					}
+					for (x = 1; x < nstates/VCSIZE; x++) {
+						for (j = 0; j < VCSIZE; j++) {
+							res[j] = mul_add(vc_partial_lh_tmp[x], this_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+						}
+					}
+					VectorClass sum_res = horizontal_add(res);
+					sum_res.store_a(&partial_lh[i]);
+					vc_max = max(vc_max, abs(sum_res)); // take the maximum for scaling check
+				}
+				lh_left += nstates;
+				partial_lh_right += nstates;
+				partial_lh += nstates;
+			}
+            // check if one should scale partial likelihoods
+			double lh_max = horizontal_max(vc_max);
+            if (lh_max < SCALING_THRESHOLD) { // every value just stored for this pattern is below the threshold in magnitude
+            	// now do the likelihood scaling
+            	partial_lh -= block; // revert its pointer
+            	VectorClass scale_thres(SCALING_THRESHOLD_INVER);
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
+				}
+				// unobserved const pattern will never have underflow
+				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1; // one more rescaling on top of the count copied from the right child
+				partial_lh += block; // increase the pointer again
+            }
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+
+	    aligned_free(lh_left_ptr);
+		aligned_free(partial_lh_left);
+
+	} else {
+		// both left and right are internal node
+
+		double sum_scale = 0.0; // accumulated over all patterns that get rescaled, added to lh_scale_factor after the loop
+		VectorClass vc_max; // maximum of partial likelihood, for scaling check
+		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+		VectorClass vc_lh_left[nstates/VCSIZE], vc_lh_right[nstates/VCSIZE];
+		VectorClass res[VCSIZE];
+		VectorClass vleft[VCSIZE], vright[VCSIZE];
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction (+: sum_scale) private(ptn, c, x, i, j, vc_max, vc_partial_lh_tmp, vc_lh_left, vc_lh_right, res, vleft, vright)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+	        double *partial_lh = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_left = left->partial_lh + ptn*block;
+			double *partial_lh_right = right->partial_lh + ptn*block;
+			// start from the rescaling counts of both children
+			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
+			vc_max = 0.0;
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				VectorClass *this_inv_evec = &vc_inv_evec[c*nstatesqr/VCSIZE];
+
+				for (i = 0; i < nstates/VCSIZE; i++) {
+					vc_lh_left[i].load_a(&partial_lh_left[i*VCSIZE]);
+					vc_lh_right[i].load_a(&partial_lh_right[i*VCSIZE]);
+				}
+				// eleft_c / eright_c times the stored vectors of the two children, multiplied element-wise
+				for (x = 0; x < nstates/VCSIZE; x++) {
+					size_t addr = c*nstatesqr/VCSIZE+x*nstates; // first vector of row x*VCSIZE of category c (each row spans nstates/VCSIZE vectors)
+					for (j = 0; j < VCSIZE; j++) {
+						size_t addr_com = addr+j*nstates/VCSIZE;
+						vleft[j] = eleft[addr_com] * vc_lh_left[0];
+						vright[j] = eright[addr_com] * vc_lh_right[0];
+					}
+					for (i = 1; i < nstates/VCSIZE; i++) {
+						for (j = 0; j < VCSIZE; j++) {
+							size_t addr_com = addr+i+j*nstates/VCSIZE;
+							vleft[j] = mul_add(eleft[addr_com], vc_lh_left[i], vleft[j]);
+							vright[j] = mul_add(eright[addr_com], vc_lh_right[i], vright[j]);
+						}
+					}
+					vc_partial_lh_tmp[x] = horizontal_add(vleft) * horizontal_add(vright);
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						res[j] = vc_partial_lh_tmp[0] * this_inv_evec[(i+j)*nstates/VCSIZE];
+					}
+					for (x = 1; x < nstates/VCSIZE; x++)
+						for (j = 0; j < VCSIZE; j++)
+							res[j] = mul_add(vc_partial_lh_tmp[x], this_inv_evec[(i+j)*nstates/VCSIZE+x], res[j]);
+
+					VectorClass sum_res = horizontal_add(res);
+					sum_res.store_a(&partial_lh[i]);
+					vc_max = max(vc_max, abs(sum_res)); // take the maximum for scaling check
+				}
+				partial_lh += nstates;
+				partial_lh_left += nstates;
+				partial_lh_right += nstates;
+			}
+
+            // check if one should scale partial likelihoods
+			double lh_max = horizontal_max(vc_max);
+            if (lh_max < SCALING_THRESHOLD) { // every value just stored for this pattern is below the threshold in magnitude
+				// now do the likelihood scaling
+            	partial_lh -= block; // revert its pointer
+            	VectorClass scale_thres(SCALING_THRESHOLD_INVER);
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
+				}
+				// unobserved const pattern will never have underflow
+				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1;
+				partial_lh += block; // increase the pointer again
+            }
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+
+	}
+	// release the per-call scratch buffers allocated above
+	aligned_free(eright);
+	aligned_free(eleft);
+	aligned_free(vc_inv_evec);
+}
+
+template <class VectorClass, const int VCSIZE, const int nstates>
+void PhyloTree::computeMixrateLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    if (node->isLeaf()) {
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computeMixratePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computeMixratePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
+    df = ddf = 0.0;
+    size_t ncat = site_rate->getNRate();
+
+    size_t block = ncat * nstates;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, j;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
+    maxptn = max(maxptn, aln->size()+((model_factory->unobserved_ptns.size()+VCSIZE-1)/VCSIZE)*VCSIZE);
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+	VectorClass *vc_val0 = (VectorClass*)aligned_alloc<double>(block);
+	VectorClass *vc_val1 = (VectorClass*)aligned_alloc<double>(block);
+	VectorClass *vc_val2 = (VectorClass*)aligned_alloc<double>(block);
+
+	VectorClass vc_len = dad_branch->length;
+	for (c = 0; c < ncat; c++) {
+		VectorClass vc_rate = site_rate->getRate(c);
+		VectorClass vc_prop = site_rate->getProp(c);
+		for (i = 0; i < nstates/VCSIZE; i++) {
+			VectorClass cof = VectorClass().load_a(&eval[c*nstates+i*VCSIZE]) * vc_rate;
+			VectorClass val = exp(cof*vc_len) * vc_prop;
+			VectorClass val1_ = cof*val;
+			vc_val0[c*nstates/VCSIZE+i] = val;
+			vc_val1[c*nstates/VCSIZE+i] = val1_;
+			vc_val2[c*nstates/VCSIZE+i] = cof*val1_;
+		}
+	}
+
+	assert(theta_all);
+	if (!theta_computed) {
+		theta_computed = true;
+		// precompute theta for fast branch length optimization
+
+		if (dad->isLeaf()) {
+	    	// special treatment for TIP-INTERNAL NODE case
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i)
+#endif
+			for (ptn = 0; ptn < orig_nptn; ptn++) {
+			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+				double *theta = theta_all + ptn*block;
+				double *lh_dad = &tip_partial_lh[(aln->at(ptn))[dad->id] * nstates * ncat];
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&lh_dad[i]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
+				}
+			}
+			// ascertainment bias correction
+			for (ptn = orig_nptn; ptn < nptn; ptn++) {
+			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+				double *theta = theta_all + ptn*block;
+				double *lh_dad = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * nstates * ncat];
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&lh_dad[i]) * VectorClass().load_a(&partial_lh_dad[i])).store_a(&theta[i]);
+				}
+			}
+	    } else {
+	    	// both dad and node are internal nodes
+		    double *partial_lh_node = node_branch->partial_lh;
+		    double *partial_lh_dad = dad_branch->partial_lh;
+	    	size_t all_entries = nptn*block;
+#ifdef _OPENMP
+#pragma omp parallel for private(i)
+#endif
+	    	for (i = 0; i < all_entries; i+=VCSIZE) {
+				(VectorClass().load_a(&partial_lh_node[i]) * VectorClass().load_a(&partial_lh_dad[i]))
+						.store_a(&theta_all[i]);
+			}
+	    }
+		if (nptn < maxptn) {
+			// copy dummy values
+			for (ptn = nptn; ptn < maxptn; ptn++)
+				memcpy(&theta_all[ptn*block], theta_all, block*sizeof(double));
+		}
+	}
+
+
+
+	VectorClass vc_ptn[VCSIZE], vc_df[VCSIZE], vc_ddf[VCSIZE], vc_theta[VCSIZE];
+	VectorClass vc_unit = 1.0;
+	VectorClass vc_freq;
+	VectorClass df_final = 0.0, ddf_final = 0.0;
+	// these stores values of 2 consecutive patterns
+	VectorClass lh_ptn, df_ptn, ddf_ptn, inv_lh_ptn;
+
+	// perform 2 sites at the same time for SSE/AVX efficiency
+
+#ifdef _OPENMP
+#pragma omp parallel private (ptn, i, j, vc_freq, vc_ptn, vc_df, vc_ddf, vc_theta, inv_lh_ptn, lh_ptn, df_ptn, ddf_ptn)
+	{
+	VectorClass df_final_th = 0.0;
+	VectorClass ddf_final_th = 0.0;
+#pragma omp for nowait
+#endif
+	for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+		double *theta = theta_all + ptn*block;
+		// initialization
+		for (i = 0; i < VCSIZE; i++) {
+			vc_theta[i].load_a(theta+i*block);
+			vc_ptn[i] = vc_val0[0] * vc_theta[i];
+			vc_df[i] = vc_val1[0] * vc_theta[i];
+			vc_ddf[i] = vc_val2[0] * vc_theta[i];
+		}
+
+		for (i = 1; i < block/VCSIZE; i++) {
+			for (j = 0; j < VCSIZE; j++) {
+				vc_theta[j].load_a(&theta[i*VCSIZE+j*block]);
+				vc_ptn[j] = mul_add(vc_theta[j], vc_val0[i], vc_ptn[j]);
+				vc_df[j] = mul_add(vc_theta[j], vc_val1[i], vc_df[j]);
+				vc_ddf[j] = mul_add(vc_theta[j], vc_val2[i], vc_ddf[j]);
+			}
+		}
+		lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+
+		inv_lh_ptn = vc_unit / abs(lh_ptn);
+
+		vc_freq.load_a(&ptn_freq[ptn]);
+
+		df_ptn = horizontal_add(vc_df) * inv_lh_ptn;
+		ddf_ptn = horizontal_add(vc_ddf) * inv_lh_ptn;
+		ddf_ptn = nmul_add(df_ptn, df_ptn, ddf_ptn);
+
+#ifdef _OPENMP
+		df_final_th = mul_add(df_ptn, vc_freq, df_final_th);
+		ddf_final_th = mul_add(ddf_ptn, vc_freq, ddf_final_th);
+#else
+		df_final = mul_add(df_ptn, vc_freq, df_final);
+		ddf_final = mul_add(ddf_ptn, vc_freq, ddf_final);
+#endif
+
+	}
+
+#ifdef _OPENMP
+#pragma omp critical
+	{
+		df_final += df_final_th;
+		ddf_final += ddf_final_th;
+	}
+}
+#endif
+	df = horizontal_add(df_final);
+	ddf = horizontal_add(ddf_final);
+    if (isnan(df) || isinf(df)) {
+        df = 0.0;
+        ddf = 0.0;
+//        outWarning("Numerical instability (some site-likelihood = 0)");
+    }
+
+
+//	assert(isnormal(tree_lh));
+	if (orig_nptn < nptn) {
+		// ascertaiment bias correction
+		VectorClass lh_final = 0.0;
+		df_final = 0.0;
+		ddf_final = 0.0;
+		lh_ptn = 0.0;
+		df_ptn = 0.0;
+		ddf_ptn = 0.0;
+		double prob_const, df_const, ddf_const;
+		double *theta = &theta_all[orig_nptn*block];
+		for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+			lh_final += lh_ptn;
+			df_final += df_ptn;
+			ddf_final += ddf_ptn;
+
+			// initialization
+			for (i = 0; i < VCSIZE; i++) {
+				vc_theta[i].load_a(theta+i*block);
+				vc_ptn[i] = vc_val0[0] * vc_theta[i];
+				vc_df[i] = vc_val1[0] * vc_theta[i];
+				vc_ddf[i] = vc_val2[0] * vc_theta[i];
+			}
+
+			for (i = 1; i < block/VCSIZE; i++) {
+				for (j = 0; j < VCSIZE; j++) {
+					vc_theta[j].load_a(&theta[i*VCSIZE+j*block]);
+					vc_ptn[j] = mul_add(vc_theta[j], vc_val0[i], vc_ptn[j]);
+					vc_df[j] = mul_add(vc_theta[j], vc_val1[i], vc_df[j]);
+					vc_ddf[j] = mul_add(vc_theta[j], vc_val2[i], vc_ddf[j]);
+				}
+			}
+			theta += block*VCSIZE;
+
+			// ptn_invar[ptn] is not aligned
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+
+		}
+		switch ((nptn-orig_nptn) % VCSIZE) {
+		case 0:
+			prob_const = horizontal_add(lh_final+lh_ptn);
+			df_const = horizontal_add(df_final+df_ptn);
+			ddf_const = horizontal_add(ddf_final+ddf_ptn);
+			break;
+		case 1:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0];
+			df_const = horizontal_add(df_final)+df_ptn[0];
+			ddf_const = horizontal_add(ddf_final)+ddf_ptn[0];
+			break;
+		case 2:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1];
+			df_const = horizontal_add(df_final)+df_ptn[0]+df_ptn[1];
+			ddf_const = horizontal_add(ddf_final)+ddf_ptn[0]+ddf_ptn[1];
+			break;
+		case 3:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2];
+			df_const = horizontal_add(df_final)+df_ptn[0]+df_ptn[1]+df_ptn[2];
+			ddf_const = horizontal_add(ddf_final)+ddf_ptn[0]+ddf_ptn[1]+ddf_ptn[2];
+			break;
+		default:
+			assert(0);
+			break;
+		}
+    	prob_const = 1.0 - prob_const;
+    	double df_frac = df_const / prob_const;
+    	double ddf_frac = ddf_const / prob_const;
+    	int nsites = aln->getNSite();
+    	df += nsites * df_frac;
+    	ddf += nsites *(ddf_frac + df_frac*df_frac);
+	}
+
+    aligned_free(vc_val2);
+    aligned_free(vc_val1);
+    aligned_free(vc_val0);
+}
+
+
+template <class VectorClass, const int VCSIZE, const int nstates>
+double PhyloTree::computeMixrateLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) { // returns tree_lh: both scale factors + sum over patterns of ptn_freq * log|pattern likelihood|; also fills _pattern_lh
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    if (node->isLeaf()) { // swap the two endpoints so that a leaf endpoint, if any, ends up as dad
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0) // bit 0 not set: partial likelihood still has to be computed
+        computeMixratePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computeMixratePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
+    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+    size_t ncat = site_rate->getNRate();
+
+    size_t block = ncat * nstates;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, j;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE; // nptn rounded up to a multiple of VCSIZE
+    maxptn = max(maxptn, aln->size()+((model_factory->unobserved_ptns.size()+VCSIZE-1)/VCSIZE)*VCSIZE); // also cover the unobserved patterns rounded up separately, since their loop starts at orig_nptn
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+    VectorClass *vc_val = aligned_alloc<VectorClass>(block/VCSIZE);
+    // vc_val packs, for every (category c, state) entry of a block,
+    // exp(eval * rate_c * branch length) * prop_c
+	for (c = 0; c < ncat; c++) {
+		double len = site_rate->getRate(c)*dad_branch->length;
+		VectorClass vc_len(len);
+		VectorClass vc_prop(site_rate->getProp(c));
+		for (i = 0; i < nstates/VCSIZE; i++) {
+			// NOTE(review): original comment says "eval is not aligned!", yet load_a (presumably the aligned load) is used below -- confirm
+			vc_val[c*nstates/VCSIZE+i] = exp(VectorClass().load_a(&eval[c*nstates+i*VCSIZE]) * vc_len) * vc_prop;
+		}
+	}
+
+	double prob_const = 0.0; // summed likelihood of the unobserved patterns (used for the ascertainment bias correction)
+
+	if (dad->isLeaf()) {
+    	// special treatment for TIP-INTERNAL NODE case
+//    	VectorClass vc_tip_partial_lh[nstates];
+//    	VectorClass vc_partial_lh_dad[VCSIZE];
+    	VectorClass vc_ptn[VCSIZE];
+    	VectorClass lh_final(0.0), vc_freq;
+		VectorClass lh_ptn; // store likelihoods of VCSIZE consecutive patterns
+
+    	// precompute vc_val * tip_partial_lh for every state of the tip dad (plus STATE_UNKNOWN)
+    	double *partial_lh_node = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+    	IntVector states_dad = aln->seq_states[dad->id];
+    	states_dad.push_back(aln->STATE_UNKNOWN);
+    	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
+    		double *lh_node = partial_lh_node + (*it)*block;
+    		double *lh_tip = tip_partial_lh + (*it)*block;
+    		for (i = 0; i < block; i+=VCSIZE)
+    			(vc_val[i/VCSIZE]*VectorClass().load_a(&lh_tip[i])).store_a(&lh_node[i]);
+    	}
+
+    	// now do the real computation
+
+
+//    	double **lh_states_dad = aligned_alloc<double*>(maxptn);
+//    	for (ptn = 0; ptn < orig_nptn; ptn++)
+//    		lh_states_dad[ptn] = &tip_partial_lh[(aln->at(ptn))[dad->id] * nstates * ncat];
+//    	for (ptn = orig_nptn; ptn < nptn; ptn++)
+//    		lh_states_dad[ptn] = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * nstates * ncat];
+//    	// initialize beyond #patterns for efficiency
+//    	for (ptn = nptn; ptn < maxptn; ptn++)
+//    		lh_states_dad[ptn] = &tip_partial_lh[aln->STATE_UNKNOWN * nstates * ncat];
+		// state of the tip dad at every pattern: alignment patterns first, then the unobserved patterns
+		int *ptn_states_dad = aligned_alloc<int>(maxptn);
+		for (ptn = 0; ptn < orig_nptn; ptn++)
+			ptn_states_dad[ptn] = (aln->at(ptn))[dad->id];
+		for (ptn = orig_nptn; ptn < nptn; ptn++)
+			ptn_states_dad[ptn] = model_factory->unobserved_ptns[ptn-orig_nptn];
+		// initialize beyond #patterns for efficiency
+		for (ptn = nptn; ptn < maxptn; ptn++)
+			ptn_states_dad[ptn] = aln->STATE_UNKNOWN;
+
+		// copy dummy values because VectorClass will access beyond nptn
+		for (ptn = nptn; ptn < maxptn; ptn++)
+			memcpy(&dad_branch->partial_lh[ptn*block], dad_branch->partial_lh, block*sizeof(double));
+		// with OpenMP every thread sums into its own lh_final_th, merged into lh_final in the critical section
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, j, vc_ptn, vc_freq, lh_ptn)
+    {
+    	VectorClass lh_final_th = 0.0;
+#pragma omp for nowait
+#endif
+   		// main loop over all patterns with a step size of VCSIZE
+		for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+			for (j = 0; j < VCSIZE; j++) {
+				vc_ptn[j] = 0.0;
+				double *partial_lh_dad = dad_branch->partial_lh + (ptn+j)*block;
+				int state_dad = ptn_states_dad[ptn+j];
+				double *lh_node = &partial_lh_node[state_dad*block];
+				for (i = 0; i < block; i+=VCSIZE) {
+					vc_ptn[j] = mul_add(VectorClass().load_a(&lh_node[i]),
+							VectorClass().load_a(&partial_lh_dad[i]), vc_ptn[j]);
+				}
+			}
+			vc_freq.load_a(&ptn_freq[ptn]);
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+			lh_ptn = log(abs(lh_ptn));
+			lh_ptn.store_a(&_pattern_lh[ptn]);
+
+			// multiply with pattern frequency
+#ifdef _OPENMP
+			lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
+#else
+			lh_final = mul_add(lh_ptn, vc_freq, lh_final);
+#endif
+		}
+
+#ifdef _OPENMP
+#pragma omp critical
+		{
+			lh_final += lh_final_th;
+    	}
+    }
+#endif
+		tree_lh += horizontal_add(lh_final);
+		assert(!isnan(tree_lh) && !isinf(tree_lh));
+
+		// ascertainment bias correction: sum the raw (non-log) likelihoods of the unobserved patterns into prob_const
+		if (orig_nptn < nptn) {
+			lh_final = 0.0;
+			lh_ptn = 0.0;
+			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+				lh_final += lh_ptn; // adds the vector of the previous iteration; the last vector is added in the switch below
+				for (j = 0; j < VCSIZE; j++) {
+					vc_ptn[j] = 0.0;
+					double *partial_lh_dad = &dad_branch->partial_lh[(ptn+j)*block];
+					int state_dad = ptn_states_dad[ptn+j];
+					double *lh_node = &partial_lh_node[state_dad*block];
+
+					for (i = 0; i < block; i+=VCSIZE) {
+						vc_ptn[j] = mul_add(VectorClass().load_a(&lh_node[i]),
+								VectorClass().load_a(&partial_lh_dad[i]), vc_ptn[j]);
+					}
+				}
+				// ptn_invar[ptn] is not aligned
+				lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+			}
+			switch ((nptn-orig_nptn)%VCSIZE) { // number of valid lanes in the last vector (0: all lanes); only remainders 0..3 are handled
+			case 0: prob_const = horizontal_add(lh_final+lh_ptn); break;
+			case 1: prob_const = horizontal_add(lh_final)+lh_ptn[0]; break;
+			case 2: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]; break;
+			case 3: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2]; break;
+			default: assert(0); break;
+			}
+		}
+//		aligned_free(lh_states_dad);
+		aligned_free(ptn_states_dad);
+		aligned_free(partial_lh_node);
+    } else {
+    	// both dad and node are internal nodes
+    	VectorClass vc_partial_lh_node[VCSIZE];
+    	VectorClass vc_partial_lh_dad[VCSIZE], vc_ptn[VCSIZE];
+    	VectorClass lh_final(0.0), vc_freq;
+		VectorClass lh_ptn;
+
+		// copy dummy values because VectorClass will access beyond nptn
+		for (ptn = nptn; ptn < maxptn; ptn++) {
+			memcpy(&dad_branch->partial_lh[ptn*block], dad_branch->partial_lh, block*sizeof(double));
+			memcpy(&node_branch->partial_lh[ptn*block], node_branch->partial_lh, block*sizeof(double));
+		}
+		// with OpenMP every thread sums into its own lh_final_th, merged into lh_final in the critical section
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, j, vc_partial_lh_node, vc_partial_lh_dad, vc_ptn, vc_freq, lh_ptn)
+		{
+		VectorClass lh_final_th = 0.0;
+#pragma omp for nowait
+#endif
+		for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_node = node_branch->partial_lh + ptn*block;
+
+			for (j = 0; j < VCSIZE; j++)
+				vc_ptn[j] = 0.0;
+			// vc_ptn[j] accumulates sum_i vc_val[i] * node[i] * dad[i] for pattern ptn+j
+			for (i = 0; i < block; i+=VCSIZE) {
+				for (j = 0; j < VCSIZE; j++) {
+					vc_partial_lh_node[j].load_a(&partial_lh_node[i+j*block]);
+					vc_partial_lh_dad[j].load_a(&partial_lh_dad[i+j*block]);
+					vc_ptn[j] = mul_add(vc_val[i/VCSIZE] * vc_partial_lh_node[j], vc_partial_lh_dad[j], vc_ptn[j]);
+				}
+			}
+
+			vc_freq.load_a(&ptn_freq[ptn]);
+
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+
+			lh_ptn = log(abs(lh_ptn));
+			lh_ptn.store_a(&_pattern_lh[ptn]);
+#ifdef _OPENMP
+			lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
+#else
+			lh_final = mul_add(lh_ptn, vc_freq, lh_final);
+#endif
+		}
+#ifdef _OPENMP
+#pragma omp critical
+		{
+			lh_final += lh_final_th;
+		}
+	}
+#endif
+
+		tree_lh += horizontal_add(lh_final);
+		assert(!isnan(tree_lh) && !isinf(tree_lh));
+
+		if (orig_nptn < nptn) {
+			// ascertainment bias correction: sum the raw (non-log) likelihoods of the unobserved patterns into prob_const
+			lh_final = 0.0;
+			lh_ptn = 0.0;
+			double *partial_lh_node = &node_branch->partial_lh[orig_nptn*block];
+			double *partial_lh_dad = &dad_branch->partial_lh[orig_nptn*block];
+
+			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+				lh_final += lh_ptn; // adds the vector of the previous iteration; the last vector is added in the switch below
+
+				for (j = 0; j < VCSIZE; j++)
+					vc_ptn[j] = 0.0;
+
+				for (i = 0; i < block; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						vc_partial_lh_node[j].load_a(&partial_lh_node[i+j*block]);
+						vc_partial_lh_dad[j].load_a(&partial_lh_dad[i+j*block]);
+						vc_ptn[j] = mul_add(vc_val[i/VCSIZE] * vc_partial_lh_node[j], vc_partial_lh_dad[j], vc_ptn[j]);
+					}
+				}
+
+				// ptn_invar[ptn] is not aligned
+				lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+				partial_lh_node += block*VCSIZE;
+				partial_lh_dad += block*VCSIZE;
+			}
+			switch ((nptn-orig_nptn)%VCSIZE) { // number of valid lanes in the last vector (0: all lanes); only remainders 0..3 are handled
+			case 0: prob_const = horizontal_add(lh_final+lh_ptn); break;
+			case 1: prob_const = horizontal_add(lh_final)+lh_ptn[0]; break;
+			case 2: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]; break;
+			case 3: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2]; break;
+			default: assert(0); break;
+			}
+		}
+    }
+
+	if (orig_nptn < nptn) {
+    	// ascertainment bias correction: subtract log(1 - prob_const) from every pattern, and getNSite() times that from tree_lh
+    	prob_const = log(1.0 - prob_const);
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+    	tree_lh -= aln->getNSite()*prob_const;
+    }
+
+    aligned_free(vc_val);
+    return tree_lh;
+}
+
+template <class VectorClass, const int VCSIZE, const int nstates>
+double PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD() {
+	// Computes the tree log-likelihood from the precomputed buffer theta_all for the branch current_it:
+	// per pattern lh = sum_i vc_val0[i] * theta[i] + ptn_invar; returns scale factors + sum of ptn_freq * log|lh|, and fills _pattern_lh.
+	assert(theta_all && theta_computed);
+
+	double tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+
+    size_t ncat = site_rate->getNRate();
+    size_t block = ncat * nstates;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, j;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+//    size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+	VectorClass *vc_val0 = (VectorClass*)aligned_alloc<double>(block);
+	// vc_val0 holds exp(eval * rate_c * branch length) * prop_c for every (category c, state) entry of a block
+	VectorClass vc_len = current_it->length;
+	for (c = 0; c < ncat; c++) {
+		VectorClass vc_rate = site_rate->getRate(c);
+		VectorClass vc_prop = site_rate->getProp(c);
+		for (i = 0; i < nstates/VCSIZE; i++) {
+			VectorClass cof = VectorClass().load_a(&eval[c*nstates+i*VCSIZE]) * vc_rate;
+			VectorClass val = exp(cof*vc_len) * vc_prop;
+			vc_val0[c*nstates/VCSIZE+i] = val;
+		}
+	}
+
+	VectorClass vc_ptn[VCSIZE];
+	VectorClass vc_freq;
+	VectorClass lh_final = 0.0;
+	// this stores the values of VCSIZE consecutive patterns
+	VectorClass lh_ptn;
+
+	// process VCSIZE patterns at the same time for SSE/AVX efficiency
+	// with OpenMP every thread sums into its own lh_final_th, merged into lh_final in the critical section
+#ifdef _OPENMP
+#pragma omp parallel private (ptn, i, j, vc_freq, vc_ptn, lh_ptn)
+	{
+	VectorClass lh_final_th = 0.0;
+#pragma omp for nowait
+#endif
+	for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+		double *theta = theta_all + ptn*block;
+		// initialization
+		for (i = 0; i < VCSIZE; i++) {
+			vc_ptn[i] = vc_val0[0] * VectorClass().load_a(theta+i*block);
+		}
+		// accumulate the remaining entries of the block for each of the VCSIZE patterns
+		for (i = 1; i < block/VCSIZE; i++) {
+			for (j = 0; j < VCSIZE; j++) {
+				vc_ptn[j] = mul_add(VectorClass().load_a(&theta[i*VCSIZE+j*block]), vc_val0[i], vc_ptn[j]);
+			}
+		}
+		lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+		lh_ptn = log(abs(lh_ptn));
+		lh_ptn.store_a(&_pattern_lh[ptn]);
+		vc_freq.load_a(&ptn_freq[ptn]);
+
+#ifdef _OPENMP
+		lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
+#else
+		lh_final = mul_add(lh_ptn, vc_freq, lh_final);
+#endif
+
+	}
+
+#ifdef _OPENMP
+#pragma omp critical
+	{
+		lh_final += lh_final_th;
+	}
+}
+#endif
+	tree_lh += horizontal_add(lh_final);
+    if (isnan(tree_lh) || isinf(tree_lh)) { // fallback: report the offending sites, clamp their pattern values and re-sum tree_lh
+        cout << "WARNING: Numerical underflow caused by alignment sites";
+        i = aln->getNSite(); // i, j, c are reused here as site count, site index and number of sites printed
+        for (j = 0, c = 0; j < i; j++) {
+            ptn = aln->getPatternID(j);
+            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                cout << " " << j+1;
+                c++;
+                if (c >= 10) { // print at most 10 sites
+                    cout << " ...";
+                    break;
+                }
+            }
+        }
+        cout << endl;
+        tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+        for (ptn = 0; ptn < orig_nptn; ptn++) {
+            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
+            }
+            tree_lh += _pattern_lh[ptn] * ptn_freq[ptn];
+        }
+    }
+
+	if (orig_nptn < nptn) {
+		// ascertainment bias correction: sum the raw (non-log) likelihoods of the unobserved patterns into prob_const
+		lh_final = 0.0;
+		lh_ptn = 0.0;
+		double prob_const;// df_const, ddf_const;
+		double *theta = &theta_all[orig_nptn*block];
+		for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+			lh_final += lh_ptn; // adds the vector of the previous iteration; the last vector is added in the switch below
+
+			// initialization
+			for (i = 0; i < VCSIZE; i++) {
+				vc_ptn[i] = vc_val0[0] * VectorClass().load_a(theta+i*block);
+			}
+
+			for (i = 1; i < block/VCSIZE; i++) {
+				for (j = 0; j < VCSIZE; j++) {
+					vc_ptn[j] = mul_add(VectorClass().load_a(&theta[i*VCSIZE+j*block]), vc_val0[i], vc_ptn[j]);
+				}
+			}
+			theta += block*VCSIZE;
+
+			// ptn_invar[ptn] is not aligned
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+
+		}
+		switch ((nptn-orig_nptn) % VCSIZE) { // number of valid lanes in the last vector (0: all lanes); only remainders 0..3 are handled
+		case 0:
+			prob_const = horizontal_add(lh_final+lh_ptn);
+			break;
+		case 1:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0];
+			break;
+		case 2:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1];
+			break;
+		case 3:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2];
+			break;
+		default:
+			assert(0);
+			break;
+		}
+    	prob_const = log(1.0 - prob_const); // from here on prob_const holds log(1 - summed likelihood of the unobserved patterns)
+    	tree_lh -= aln->getNSite() * prob_const;
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+	}
+
+    aligned_free(vc_val0);
+
+    return tree_lh;
+}
+
+#endif /* PHYLOKERNELMIXRATE_H_ */
diff --git a/phylokernelmixture.h b/phylokernelmixture.h
new file mode 100644
index 0000000..747c4a2
--- /dev/null
+++ b/phylokernelmixture.h
@@ -0,0 +1,1197 @@
+/*
+ * phylokernelmixture.h
+ *
+ *  Created on: Dec 19, 2014
+ *      Author: minh
+ */
+
+#ifndef PHYLOKERNELMIXTURE_H_
+#define PHYLOKERNELMIXTURE_H_
+
+#include "model/modelmixture.h"
+
+
+/************************************************************************************************
+ *
+ * Highly-optimized vectorized likelihood functions for mixture models
+ *
+ *************************************************************************************************/
+
+template <class VectorClass, const int VCSIZE, const int nstates>
+void PhyloTree::computeMixturePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    // don't recompute the likelihood
+	assert(dad);
+    if (dad_branch->partial_lh_computed & 1)
+        return;
+    dad_branch->partial_lh_computed |= 1;
+
+    size_t nptn = aln->size() + model_factory->unobserved_ptns.size();
+    PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+	if (node->isLeaf()) {
+	    dad_branch->lh_scale_factor = 0.0;
+	    //memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+
+		if (!tip_partial_lh_computed)
+			computeTipPartialLikelihood();
+		return;
+	}
+
+    size_t ptn, c;
+    size_t orig_ntn = aln->size();
+
+    size_t ncat = site_rate->getNRate();
+    size_t nmixture = model->getNMixtures();
+    assert(nstates == aln->num_states && nstates >= VCSIZE && VCSIZE == VectorClass().size());
+    assert(model->isReversible()); // only works with reversible model!
+    const size_t nstatesqr=nstates*nstates;
+    size_t i, x, j, m;
+    size_t statecat = nstates * ncat;
+    size_t block = statecat * nmixture;
+
+	// internal node
+	assert(node->degree() == 3); // it works only for strictly bifurcating tree
+	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+	}
+
+	if (!left->node->isLeaf() && right->node->isLeaf()) {
+		// swap left and right
+		PhyloNeighbor *tmp = left;
+		left = right;
+		right = tmp;
+	}
+	if ((left->partial_lh_computed & 1) == 0)
+		computeMixturePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(left, node);
+	if ((right->partial_lh_computed & 1) == 0)
+		computeMixturePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(right, node);
+
+    if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
+        // re-orient partial_lh
+        bool done = false;
+        FOR_NEIGHBOR_IT(node, dad, it2) {
+            PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
+            if (backnei->partial_lh) {
+                dad_branch->partial_lh = backnei->partial_lh;
+                dad_branch->scale_num = backnei->scale_num;
+                backnei->partial_lh = NULL;
+                backnei->scale_num = NULL;
+                backnei->partial_lh_computed &= ~1; // clear bit
+                done = true;
+                break;
+            }
+        }
+        assert(done && "partial_lh is not re-oriented");
+    }
+
+	double *evec = model->getEigenvectors();
+	double *inv_evec = model->getInverseEigenvectors();
+
+	VectorClass *vc_inv_evec = aligned_alloc<VectorClass>(nmixture*nstatesqr/VCSIZE);
+	assert(inv_evec && evec);
+	for (m = 0; m < nmixture; m++) {
+		for (i = 0; i < nstates; i++) {
+			for (x = 0; x < nstates/VCSIZE; x++)
+				// inv_evec is not aligned!
+				vc_inv_evec[m*nstatesqr/VCSIZE + i*nstates/VCSIZE+x].load_a(&inv_evec[m*nstatesqr + i*nstates+x*VCSIZE]);
+		}
+	}
+	double *eval = model->getEigenvalues();
+
+	dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
+
+	VectorClass *eleft = (VectorClass*)aligned_alloc<double>(block*nstates);
+	VectorClass *eright = (VectorClass*)aligned_alloc<double>(block*nstates);
+
+	// precompute information buffer
+	for (c = 0; c < ncat; c++) {
+		VectorClass vc_evec;
+		VectorClass expleft[nstates/VCSIZE];
+		VectorClass expright[nstates/VCSIZE];
+		double len_left = site_rate->getRate(c) * left->length;
+		double len_right = site_rate->getRate(c) * right->length;
+		for (m = 0; m < nmixture; m++) {
+			size_t addr = (m*ncat+c)*nstatesqr/VCSIZE;
+			for (i = 0; i < nstates/VCSIZE; i++) {
+				// eval is not aligned!
+				expleft[i] = exp(VectorClass().load_a(&eval[m*nstates+i*VCSIZE]) * VectorClass(len_left));
+				expright[i] = exp(VectorClass().load_a(&eval[m*nstates+i*VCSIZE]) * VectorClass(len_right));
+			}
+			for (x = 0; x < nstates; x++)
+				for (i = 0; i < nstates/VCSIZE; i++) {
+					// evec is not be aligned!
+					vc_evec.load_a(&evec[m*nstatesqr+x*nstates+i*VCSIZE]);
+					eleft[addr+x*nstates/VCSIZE+i] = (vc_evec * expleft[i]);
+					eright[addr+x*nstates/VCSIZE+i] = (vc_evec * expright[i]);
+				}
+		}
+	}
+
+	if (left->node->isLeaf() && right->node->isLeaf()) {
+		// special treatment for TIP-TIP (cherry) case
+
+		// pre compute information for both tips
+		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+		double *partial_lh_right = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+			VectorClass vleft[VCSIZE];
+			for (m = 0; m < nmixture; m++) {
+				double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
+				VectorClass *this_eleft = &eleft[m*ncat*nstatesqr/VCSIZE];
+				double *this_partial_lh_left = &partial_lh_left[state*block+m*statecat];
+
+				for (i = 0; i < nstates/VCSIZE; i++)
+					vc_partial_lh_tmp[i].load_a(&this_tip_partial_lh[i*VCSIZE]);
+
+				for (x = 0; x < statecat; x+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++)
+						vleft[j] = this_eleft[(x+j)*nstates/VCSIZE] * vc_partial_lh_tmp[0];
+					for (i = 1; i < nstates/VCSIZE; i++) {
+						for (j = 0; j < VCSIZE; j++)
+							vleft[j] = mul_add(this_eleft[(x+j)*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vleft[j]);
+					}
+					horizontal_add(vleft).store_a(&this_partial_lh_left[x]);
+				}
+			}
+		}
+
+		for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
+			int state = (*it);
+			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+			VectorClass vright[VCSIZE];
+			for (m = 0; m < nmixture; m++) {
+				double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
+				VectorClass *this_eright = &eright[m*ncat*nstatesqr/VCSIZE];
+				double *this_partial_lh_right = &partial_lh_right[state*block+m*statecat];
+
+				for (i = 0; i < nstates/VCSIZE; i++)
+					vc_partial_lh_tmp[i].load_a(&this_tip_partial_lh[i*VCSIZE]);
+
+				for (x = 0; x < statecat; x+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++)
+						vright[j] = this_eright[(x+j)*nstates/VCSIZE] * vc_partial_lh_tmp[0];
+					for (i = 1; i < nstates/VCSIZE; i++) {
+						for (j = 0; j < VCSIZE; j++)
+							vright[j] = mul_add(this_eright[(x+j)*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vright[j]);
+					}
+					horizontal_add(vright).store_a(&this_partial_lh_right[x]);
+				}
+			}
+
+		}
+
+		size_t addr_unknown = aln->STATE_UNKNOWN * block;
+		for (x = 0; x < block; x++) {
+			partial_lh_left[addr_unknown+x] = 1.0;
+			partial_lh_right[addr_unknown+x] = 1.0;
+		}
+
+		// assign pointers for left and right partial_lh
+		double **lh_left_ptr = aligned_alloc<double*>(nptn);
+		double **lh_right_ptr = aligned_alloc<double*>(nptn);
+		for (ptn = 0; ptn < orig_ntn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+			lh_right_ptr[ptn] = &partial_lh_right[block * (aln->at(ptn))[right->node->id]];
+		}
+		for (ptn = orig_ntn; ptn < nptn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+			lh_right_ptr[ptn] = &partial_lh_right[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+		}
+
+		// scale number must be ZERO
+	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+		VectorClass res[VCSIZE];
+
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, c, x, i, j, m, vc_partial_lh_tmp, res)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+	        double *partial_lh = dad_branch->partial_lh + ptn*block;
+
+	        double *lh_left = lh_left_ptr[ptn];
+	        double *lh_right = lh_right_ptr[ptn];
+	        for (m = 0; m < nmixture; m++) {
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+
+				for (x = 0; x < nstates/VCSIZE; x++) {
+					vc_partial_lh_tmp[x] = (VectorClass().load_a(&lh_left[x*VCSIZE]) * VectorClass().load_a(&lh_right[x*VCSIZE]));
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(m*nstates+i+j)*nstates/VCSIZE];
+					}
+					for (x = 1; x < nstates/VCSIZE; x++)
+						for (j = 0; j < VCSIZE; j++) {
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(m*nstates+i+j)*nstates/VCSIZE+x], res[j]);
+						}
+					horizontal_add(res).store_a(&partial_lh[i]);
+				}
+				lh_left += nstates;
+				lh_right += nstates;
+				partial_lh += nstates;
+			}
+	        }
+		}
+
+	    aligned_free(lh_left_ptr);
+	    aligned_free(lh_right_ptr);
+		aligned_free(partial_lh_right);
+		aligned_free(partial_lh_left);
+	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
+		// special treatment to TIP-INTERNAL NODE case
+		// only take scale_num from the right subtree
+		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+
+		// pre compute information for left tip
+		double *partial_lh_left = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+			VectorClass vleft[VCSIZE];
+			for (m = 0; m < nmixture; m++) {
+				double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
+				VectorClass *this_eleft = &eleft[m*ncat*nstatesqr/VCSIZE];
+				double *this_partial_lh_left = &partial_lh_left[state*block+m*statecat];
+
+				for (i = 0; i < nstates/VCSIZE; i++)
+					vc_partial_lh_tmp[i].load_a(&this_tip_partial_lh[i*VCSIZE]);
+
+				for (x = 0; x < statecat; x+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++)
+						vleft[j] = this_eleft[(x+j)*nstates/VCSIZE] * vc_partial_lh_tmp[0];
+					for (i = 1; i < nstates/VCSIZE; i++) {
+						for (j = 0; j < VCSIZE; j++)
+							vleft[j] = mul_add(this_eleft[(x+j)*nstates/VCSIZE+i], vc_partial_lh_tmp[i], vleft[j]);
+					}
+					horizontal_add(vleft).store_a(&this_partial_lh_left[x]);
+				}
+			}
+		}
+
+		size_t addr_unknown = aln->STATE_UNKNOWN * block;
+		for (x = 0; x < block; x++) {
+			partial_lh_left[addr_unknown+x] = 1.0;
+		}
+
+		// assign pointers for partial_lh_left
+		double **lh_left_ptr = aligned_alloc<double*>(nptn);
+		for (ptn = 0; ptn < orig_ntn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block *  (aln->at(ptn))[left->node->id]];
+		}
+		for (ptn = orig_ntn; ptn < nptn; ptn++) {
+			lh_left_ptr[ptn] = &partial_lh_left[block * model_factory->unobserved_ptns[ptn-orig_ntn]];
+		}
+
+		double sum_scale = 0.0;
+		VectorClass vc_lh_right[nstates/VCSIZE];
+		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+		VectorClass res[VCSIZE];
+		VectorClass vc_max; // maximum of partial likelihood, for scaling check
+		VectorClass vright[VCSIZE];
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private (ptn, c, x, i, j, m, vc_lh_right, vc_partial_lh_tmp, res, vc_max, vright)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+	        double *partial_lh = dad_branch->partial_lh + ptn*block;
+	        double *partial_lh_right = right->partial_lh + ptn*block;
+
+	        double *lh_left = lh_left_ptr[ptn];
+			vc_max = 0.0;
+			for (m = 0; m < nmixture; m++)
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				for (i = 0; i < nstates/VCSIZE; i++)
+					vc_lh_right[i].load_a(&partial_lh_right[i*VCSIZE]);
+
+				for (x = 0; x < nstates/VCSIZE; x++) {
+					size_t addr = (m*ncat+c)*nstatesqr/VCSIZE+x*nstates;
+					for (j = 0; j < VCSIZE; j++) {
+						vright[j] = eright[addr+nstates*j/VCSIZE] * vc_lh_right[0];
+					}
+					for (i = 1; i < nstates/VCSIZE; i++)
+						for (j = 0; j < VCSIZE; j++) {
+							vright[j] = mul_add(eright[addr+i+nstates*j/VCSIZE], vc_lh_right[i], vright[j]);
+						}
+					vc_partial_lh_tmp[x] = VectorClass().load_a(&lh_left[x*VCSIZE])
+							* horizontal_add(vright);
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(m*nstates+i+j)*nstates/VCSIZE];
+					}
+					for (x = 1; x < nstates/VCSIZE; x++) {
+						for (j = 0; j < VCSIZE; j++) {
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(m*nstates+i+j)*nstates/VCSIZE+x], res[j]);
+						}
+					}
+					VectorClass sum_res = horizontal_add(res);
+					sum_res.store_a(&partial_lh[i]);
+					vc_max = max(vc_max, abs(sum_res)); // take the maximum for scaling check
+				}
+				lh_left += nstates;
+				partial_lh_right += nstates;
+				partial_lh += nstates;
+			}
+            // check if one should scale partial likelihoods
+			double lh_max = horizontal_max(vc_max);
+            if (lh_max < SCALING_THRESHOLD) {
+            	// now do the likelihood scaling
+            	partial_lh -= block; // revert its pointer
+            	VectorClass scale_thres(SCALING_THRESHOLD_INVER);
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
+				}
+				// unobserved const pattern will never have underflow
+				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1;
+				partial_lh += block; // increase the pointer again
+            }
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+
+	    aligned_free(lh_left_ptr);
+		aligned_free(partial_lh_left);
+
+	} else {
+		// both left and right are internal node
+
+		double sum_scale = 0.0;
+		VectorClass vc_max; // maximum of partial likelihood, for scaling check
+		VectorClass vc_partial_lh_tmp[nstates/VCSIZE];
+		VectorClass vc_lh_left[nstates/VCSIZE], vc_lh_right[nstates/VCSIZE];
+		VectorClass res[VCSIZE];
+		VectorClass vleft[VCSIZE], vright[VCSIZE];
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction (+: sum_scale) private(ptn, c, x, i, j, m, vc_max, vc_partial_lh_tmp, vc_lh_left, vc_lh_right, res, vleft, vright)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+	        double *partial_lh = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_left = left->partial_lh + ptn*block;
+			double *partial_lh_right = right->partial_lh + ptn*block;
+
+			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
+			vc_max = 0.0;
+			for (m = 0; m < nmixture; m++)
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				for (i = 0; i < nstates/VCSIZE; i++) {
+					vc_lh_left[i].load_a(&partial_lh_left[i*VCSIZE]);
+					vc_lh_right[i].load_a(&partial_lh_right[i*VCSIZE]);
+				}
+
+				for (x = 0; x < nstates/VCSIZE; x++) {
+					size_t addr = (m*ncat+c)*nstatesqr/VCSIZE+x*nstates;
+					for (j = 0; j < VCSIZE; j++) {
+						size_t addr_com = addr+j*nstates/VCSIZE;
+						vleft[j] = eleft[addr_com] * vc_lh_left[0];
+						vright[j] = eright[addr_com] * vc_lh_right[0];
+					}
+					for (i = 1; i < nstates/VCSIZE; i++) {
+						for (j = 0; j < VCSIZE; j++) {
+							size_t addr_com = addr+i+j*nstates/VCSIZE;
+							vleft[j] = mul_add(eleft[addr_com], vc_lh_left[i], vleft[j]);
+							vright[j] = mul_add(eright[addr_com], vc_lh_right[i], vright[j]);
+						}
+					}
+					vc_partial_lh_tmp[x] = horizontal_add(vleft) * horizontal_add(vright);
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						res[j] = vc_partial_lh_tmp[0] * vc_inv_evec[(m*nstates+i+j)*nstates/VCSIZE];
+					}
+					for (x = 1; x < nstates/VCSIZE; x++)
+						for (j = 0; j < VCSIZE; j++)
+							res[j] = mul_add(vc_partial_lh_tmp[x], vc_inv_evec[(m*nstates+i+j)*nstates/VCSIZE+x], res[j]);
+
+					VectorClass sum_res = horizontal_add(res);
+					sum_res.store_a(&partial_lh[i]);
+					vc_max = max(vc_max, abs(sum_res)); // take the maximum for scaling check
+				}
+				partial_lh += nstates;
+				partial_lh_left += nstates;
+				partial_lh_right += nstates;
+			}
+
+            // check if one should scale partial likelihoods
+			double lh_max = horizontal_max(vc_max);
+            if (lh_max < SCALING_THRESHOLD) {
+				// now do the likelihood scaling
+            	partial_lh -= block; // revert its pointer
+            	VectorClass scale_thres(SCALING_THRESHOLD_INVER);
+				for (i = 0; i < block; i+=VCSIZE) {
+					(VectorClass().load_a(&partial_lh[i]) * scale_thres).store_a(&partial_lh[i]);
+				}
+				// unobserved const pattern will never have underflow
+				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1;
+				partial_lh += block; // increase the pointer again
+            }
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+
+	}
+
+	aligned_free(eright);
+	aligned_free(eleft);
+	aligned_free(vc_inv_evec);
+}
+
+/**
+ * Compute the first and second derivative of the log-likelihood with respect
+ * to the length of one branch, for mixture models, using the
+ * eigen-decomposition SIMD kernel.
+ *
+ * Partial likelihoods on both ends of the branch are computed on demand.
+ * When theta_computed is false, the element-wise products of the two partial
+ * likelihood vectors are cached in theta_all (and theta_computed is set), so
+ * that later calls only have to re-weight them with the branch-length
+ * dependent factors.
+ *
+ * If unobserved patterns are present (ascertainment bias correction), the
+ * derivatives of the correction term are added to df and ddf.
+ *
+ * @param dad_branch neighbor describing the branch as seen from dad
+ * @param dad node at one end of the branch
+ * @param df (OUT) first derivative; reset to 0 if it evaluates to NaN/inf
+ * @param ddf (OUT) second derivative; reset to 0 together with df
+ */
+template <class VectorClass, const int VCSIZE, const int nstates>
+void PhyloTree::computeMixtureLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    // orient the branch such that a leaf end, if any, is always 'dad'
+    if (node->isLeaf()) {
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computeMixturePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computeMixturePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
+    df = ddf = 0.0;
+    size_t ncat = site_rate->getNRate();
+    size_t nmixture = model->getNMixtures();
+
+    size_t block = ncat * nstates * nmixture;
+    size_t statemix = nstates * nmixture;
+    size_t statecat = nstates * ncat;
+
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, j, m;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    // number of patterns rounded up to a multiple of VCSIZE
+    size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
+    maxptn = max(maxptn, aln->size()+((model_factory->unobserved_ptns.size()+VCSIZE-1)/VCSIZE)*VCSIZE);
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+	// per (mixture, category, state) weights for the likelihood (val0) and its
+	// first (val1) and second (val2) derivative w.r.t. the branch length
+	VectorClass *vc_val0 = (VectorClass*)aligned_alloc<double>(block);
+	VectorClass *vc_val1 = (VectorClass*)aligned_alloc<double>(block);
+	VectorClass *vc_val2 = (VectorClass*)aligned_alloc<double>(block);
+
+	VectorClass vc_len = dad_branch->length;
+	for (c = 0; c < ncat; c++) {
+		VectorClass vc_rate = site_rate->getRate(c);
+		for (m = 0; m < nmixture; m++) {
+			VectorClass vc_prop = VectorClass(site_rate->getProp(c) * ((ModelMixture*)model)->prop[m]);
+			for (i = 0; i < nstates/VCSIZE; i++) {
+				VectorClass cof = VectorClass().load_a(&eval[m*nstates+i*VCSIZE]) * vc_rate;
+				VectorClass val = exp(cof*vc_len) * vc_prop;
+				VectorClass val1_ = cof*val;
+				vc_val0[(m*ncat+c)*nstates/VCSIZE+i] = val;
+				vc_val1[(m*ncat+c)*nstates/VCSIZE+i] = val1_;
+				vc_val2[(m*ncat+c)*nstates/VCSIZE+i] = cof*val1_;
+			}
+		}
+	}
+
+	assert(theta_all);
+	if (!theta_computed) {
+		theta_computed = true;
+		// precompute theta for fast branch length optimization
+
+		if (dad->isLeaf()) {
+	    	// special treatment for TIP-INTERNAL NODE case
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i, m)
+#endif
+			for (ptn = 0; ptn < orig_nptn; ptn++) {
+			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+				double *theta = theta_all + ptn*block;
+				double *lh_dad = &tip_partial_lh[(aln->at(ptn))[dad->id] * statemix];
+                for (m = 0; m < nmixture; m++) {
+                    // the tip vector of mixture m is reused for every rate category
+                    for (i = 0; i < statecat; i+=VCSIZE) {
+                        (VectorClass().load_a(&lh_dad[i%nstates]) * VectorClass().load_a(&partial_lh_dad[i])).
+                        		store_a(&theta[i]);
+                    }
+                    partial_lh_dad += statecat;
+                    theta += statecat;
+                    lh_dad += nstates;
+                }
+			}
+			// ascertainment bias correction
+			for (ptn = orig_nptn; ptn < nptn; ptn++) {
+			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+				double *theta = theta_all + ptn*block;
+				double *lh_dad = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * statemix];
+                for (m = 0; m < nmixture; m++) {
+                    for (i = 0; i < statecat; i+=VCSIZE) {
+                        (VectorClass().load_a(&lh_dad[i%nstates]) * VectorClass().load_a(&partial_lh_dad[i])).
+                        		store_a(&theta[i]);
+                    }
+                    partial_lh_dad += statecat;
+                    theta += statecat;
+                    lh_dad += nstates;
+                }
+			}
+	    } else {
+	    	// both dad and node are internal nodes
+		    double *partial_lh_node = node_branch->partial_lh;
+		    double *partial_lh_dad = dad_branch->partial_lh;
+	    	size_t all_entries = nptn*block;
+#ifdef _OPENMP
+#pragma omp parallel for private(i)
+#endif
+	    	for (i = 0; i < all_entries; i+=VCSIZE) {
+				(VectorClass().load_a(&partial_lh_node[i]) * VectorClass().load_a(&partial_lh_dad[i]))
+						.store_a(&theta_all[i]);
+			}
+	    }
+		if (nptn < maxptn) {
+			// copy dummy values
+			// (the vectorized loops below read VCSIZE patterns at a time and
+			// therefore touch entries beyond nptn)
+			for (ptn = nptn; ptn < maxptn; ptn++)
+				memcpy(&theta_all[ptn*block], &theta_all[(ptn-1)*block], block*sizeof(double));
+		}
+	}
+
+
+
+	VectorClass vc_ptn[VCSIZE], vc_df[VCSIZE], vc_ddf[VCSIZE], vc_theta[VCSIZE];
+	VectorClass vc_unit = 1.0;
+	VectorClass vc_freq;
+	VectorClass df_final = 0.0, ddf_final = 0.0;
+	// these store values of VCSIZE consecutive patterns
+	VectorClass lh_ptn, df_ptn, ddf_ptn, inv_lh_ptn;
+
+	// process VCSIZE patterns at the same time for SSE/AVX efficiency
+
+#ifdef _OPENMP
+#pragma omp parallel private (ptn, i, j, vc_freq, vc_ptn, vc_df, vc_ddf, vc_theta, inv_lh_ptn, lh_ptn, df_ptn, ddf_ptn)
+	{
+	// per-thread accumulators, merged in the critical section below
+	VectorClass df_final_th = 0.0;
+	VectorClass ddf_final_th = 0.0;
+#pragma omp for nowait
+#endif
+	for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+		double *theta = theta_all + ptn*block;
+		// initialization
+		for (i = 0; i < VCSIZE; i++) {
+			vc_theta[i].load_a(theta+i*block);
+			vc_ptn[i] = vc_val0[0] * vc_theta[i];
+			vc_df[i] = vc_val1[0] * vc_theta[i];
+			vc_ddf[i] = vc_val2[0] * vc_theta[i];
+		}
+
+		for (i = 1; i < block/VCSIZE; i++) {
+			for (j = 0; j < VCSIZE; j++) {
+				vc_theta[j].load_a(&theta[i*VCSIZE+j*block]);
+				vc_ptn[j] = mul_add(vc_theta[j], vc_val0[i], vc_ptn[j]);
+				vc_df[j] = mul_add(vc_theta[j], vc_val1[i], vc_df[j]);
+				vc_ddf[j] = mul_add(vc_theta[j], vc_val2[i], vc_ddf[j]);
+			}
+		}
+		lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+
+		inv_lh_ptn = vc_unit / abs(lh_ptn);
+
+		vc_freq.load_a(&ptn_freq[ptn]);
+
+		// derivatives of log(lh): lh'/lh and lh''/lh - (lh'/lh)^2
+		df_ptn = horizontal_add(vc_df) * inv_lh_ptn;
+		ddf_ptn = horizontal_add(vc_ddf) * inv_lh_ptn;
+		ddf_ptn = nmul_add(df_ptn, df_ptn, ddf_ptn);
+
+#ifdef _OPENMP
+		df_final_th = mul_add(df_ptn, vc_freq, df_final_th);
+		ddf_final_th = mul_add(ddf_ptn, vc_freq, ddf_final_th);
+#else
+		df_final = mul_add(df_ptn, vc_freq, df_final);
+		ddf_final = mul_add(ddf_ptn, vc_freq, ddf_final);
+#endif
+
+	}
+
+#ifdef _OPENMP
+#pragma omp critical
+	{
+		df_final += df_final_th;
+		ddf_final += ddf_final_th;
+	}
+}
+#endif
+	df = horizontal_add(df_final);
+	ddf = horizontal_add(ddf_final);
+    if (isnan(df) || isinf(df)) {
+        df = 0.0;
+        ddf = 0.0;
+//        outWarning("Numerical instability (some site-likelihood = 0)");
+    }
+
+//	assert(isnormal(tree_lh));
+	if (orig_nptn < nptn) {
+		// ascertainment bias correction
+		VectorClass lh_final = 0.0;
+		df_final = 0.0;
+		ddf_final = 0.0;
+		lh_ptn = 0.0;
+		df_ptn = 0.0;
+		ddf_ptn = 0.0;
+		double prob_const, df_const, ddf_const;
+		double *theta = &theta_all[orig_nptn*block];
+		for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+			// add the previous group; the last group is handled separately
+			// after the loop because it may be only partially filled
+			lh_final += lh_ptn;
+			df_final += df_ptn;
+			ddf_final += ddf_ptn;
+
+			// initialization
+			for (i = 0; i < VCSIZE; i++) {
+				vc_theta[i].load_a(theta+i*block);
+				vc_ptn[i] = vc_val0[0] * vc_theta[i];
+				vc_df[i] = vc_val1[0] * vc_theta[i];
+				vc_ddf[i] = vc_val2[0] * vc_theta[i];
+			}
+
+			for (i = 1; i < block/VCSIZE; i++) {
+				for (j = 0; j < VCSIZE; j++) {
+					vc_theta[j].load_a(&theta[i*VCSIZE+j*block]);
+					vc_ptn[j] = mul_add(vc_theta[j], vc_val0[i], vc_ptn[j]);
+					vc_df[j] = mul_add(vc_theta[j], vc_val1[i], vc_df[j]);
+					vc_ddf[j] = mul_add(vc_theta[j], vc_val2[i], vc_ddf[j]);
+				}
+			}
+			theta += block*VCSIZE;
+
+			// ptn_invar[ptn] is not aligned
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+			// reduce the derivative accumulators as well; without this
+			// df_const and ddf_const below would always stay 0
+			df_ptn = horizontal_add(vc_df);
+			ddf_ptn = horizontal_add(vc_ddf);
+
+		}
+		// only the valid lanes of the last group are added
+		// (covers vector widths up to 4)
+		switch ((nptn-orig_nptn) % VCSIZE) {
+		case 0:
+			prob_const = horizontal_add(lh_final+lh_ptn);
+			df_const = horizontal_add(df_final+df_ptn);
+			ddf_const = horizontal_add(ddf_final+ddf_ptn);
+			break;
+		case 1:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0];
+			df_const = horizontal_add(df_final)+df_ptn[0];
+			ddf_const = horizontal_add(ddf_final)+ddf_ptn[0];
+			break;
+		case 2:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1];
+			df_const = horizontal_add(df_final)+df_ptn[0]+df_ptn[1];
+			ddf_const = horizontal_add(ddf_final)+ddf_ptn[0]+ddf_ptn[1];
+			break;
+		case 3:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2];
+			df_const = horizontal_add(df_final)+df_ptn[0]+df_ptn[1]+df_ptn[2];
+			ddf_const = horizontal_add(ddf_final)+ddf_ptn[0]+ddf_ptn[1]+ddf_ptn[2];
+			break;
+		default:
+			assert(0);
+			break;
+		}
+    	// derivatives of -nsites*log(1 - prob_const)
+    	prob_const = 1.0 - prob_const;
+    	double df_frac = df_const / prob_const;
+    	double ddf_frac = ddf_const / prob_const;
+    	int nsites = aln->getNSite();
+    	df += nsites * df_frac;
+    	ddf += nsites *(ddf_frac + df_frac*df_frac);
+	}
+
+    aligned_free(vc_val2);
+    aligned_free(vc_val1);
+    aligned_free(vc_val0);
+}
+
+
+/**
+ * Compute the log-likelihood of the tree across one branch, for mixture
+ * models, using the eigen-decomposition SIMD kernel.
+ *
+ * Partial likelihoods on both ends of the branch are computed on demand.
+ * The log-likelihood of every observed pattern is stored in _pattern_lh.
+ * If unobserved patterns are present, the ascertainment bias correction
+ * log(1 - prob_const) is subtracted from every pattern log-likelihood and,
+ * multiplied by the number of sites, from the returned value.
+ *
+ * Side effect: the partial_lh buffers of the branch are padded with dummy
+ * copies for pattern indices in [nptn, maxptn).
+ *
+ * @param dad_branch neighbor describing the branch as seen from dad
+ * @param dad node at one end of the branch
+ * @return the tree log-likelihood
+ */
+template <class VectorClass, const int VCSIZE, const int nstates>
+double PhyloTree::computeMixtureLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    // orient the branch such that a leaf end, if any, is always 'dad'
+    if (node->isLeaf()) {
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computeMixturePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computeMixturePartialLikelihoodEigenSIMD<VectorClass, VCSIZE, nstates>(node_branch, node);
+    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+    size_t ncat = site_rate->getNRate();
+    size_t nmixture = model->getNMixtures();
+
+    size_t block = ncat * nstates * nmixture;
+    size_t statemix = nstates * nmixture;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, j, m;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    // number of patterns rounded up to a multiple of VCSIZE
+    size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
+    maxptn = max(maxptn, aln->size()+((model_factory->unobserved_ptns.size()+VCSIZE-1)/VCSIZE)*VCSIZE);
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+    // per (mixture, category, state) weight: exp(eval*rate*length) * proportion
+    VectorClass *vc_val = (VectorClass*)aligned_alloc<double>(block);
+
+	for (c = 0; c < ncat; c++) {
+		double len = site_rate->getRate(c)*dad_branch->length;
+		VectorClass vc_len(len);
+		for (m = 0; m < nmixture; m++) {
+			VectorClass vc_prop = VectorClass(site_rate->getProp(c) * ((ModelMixture*)model)->prop[m]);
+			for (i = 0; i < nstates/VCSIZE; i++) {
+				// eval is not aligned!
+				// NOTE(review): the remark above conflicts with the aligned load
+				// (load_a) used below -- confirm the alignment of getEigenvalues()
+				vc_val[(m*ncat+c)*nstates/VCSIZE+i] = exp(VectorClass().load_a(&eval[m*nstates+i*VCSIZE]) * vc_len) * vc_prop;
+			}
+		}
+	}
+
+	double prob_const = 0.0;
+
+	if (dad->isLeaf()) {
+    	// special treatment for TIP-INTERNAL NODE case
+    	// lookup table: for each state occurring at dad (plus STATE_UNKNOWN),
+    	// the tip vector already multiplied with vc_val
+    	double *partial_lh_node = aligned_alloc<double>((aln->STATE_UNKNOWN+1)*block);
+    	IntVector states_dad = aln->seq_states[dad->id];
+    	states_dad.push_back(aln->STATE_UNKNOWN);
+    	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
+    		double *lh_node = partial_lh_node + (*it)*block;
+    		double *lh_tip = tip_partial_lh + (*it)*statemix;
+    		VectorClass *vc_val_tmp = vc_val;
+			for (m = 0; m < nmixture; m++) {
+				for (c = 0; c < ncat; c++) {
+					for (i = 0; i < nstates; i+=VCSIZE) {
+						(vc_val_tmp[i/VCSIZE] * VectorClass().load_a(&lh_tip[m*nstates+i])).store_a(&lh_node[i]);
+					}
+					lh_node += nstates;
+					vc_val_tmp += nstates/VCSIZE;
+				}
+			}
+    	}
+
+//    	VectorClass vc_tip_partial_lh[nstates];
+//    	VectorClass vc_partial_lh_dad[VCSIZE];
+    	VectorClass vc_ptn[VCSIZE];
+    	VectorClass lh_final(0.0), vc_freq;
+		VectorClass lh_ptn; // store likelihoods of VCSIZE consecutive patterns
+
+//    	double **lh_states_dad = aligned_alloc<double*>(maxptn);
+//    	for (ptn = 0; ptn < orig_nptn; ptn++)
+//    		lh_states_dad[ptn] = &tip_partial_lh[(aln->at(ptn))[dad->id] * nstates];
+//    	for (ptn = orig_nptn; ptn < nptn; ptn++)
+//    		lh_states_dad[ptn] = &tip_partial_lh[model_factory->unobserved_ptns[ptn-orig_nptn] * nstates];
+//    	// initialize beyond #patterns for efficiency
+//    	for (ptn = nptn; ptn < maxptn; ptn++)
+//    		lh_states_dad[ptn] = &tip_partial_lh[aln->STATE_UNKNOWN * nstates];
+
+		// state of dad for every pattern, used as index into partial_lh_node
+		int *ptn_states_dad = aligned_alloc<int>(maxptn);
+		for (ptn = 0; ptn < orig_nptn; ptn++)
+			ptn_states_dad[ptn] = (aln->at(ptn))[dad->id];
+		for (ptn = orig_nptn; ptn < nptn; ptn++)
+			ptn_states_dad[ptn] = model_factory->unobserved_ptns[ptn-orig_nptn];
+		// initialize beyond #patterns for efficiency
+		for (ptn = nptn; ptn < maxptn; ptn++)
+			ptn_states_dad[ptn] = aln->STATE_UNKNOWN;
+
+		// copy dummy values because VectorClass will access beyond nptn
+		for (ptn = nptn; ptn < maxptn; ptn++)
+			memcpy(&dad_branch->partial_lh[ptn*block], dad_branch->partial_lh, block*sizeof(double));
+
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, j, vc_ptn, vc_freq, lh_ptn)
+    {
+    	// per-thread accumulator, merged in the critical section below
+    	VectorClass lh_final_th = 0.0;
+#pragma omp for nowait
+#endif
+   		// main loop over all patterns with a step size of VCSIZE
+		for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+//			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			for (j = 0; j < VCSIZE; j++) {
+				vc_ptn[j] = 0.0;
+				double *partial_lh_dad = dad_branch->partial_lh + (ptn+j)*block;
+				int state_dad = ptn_states_dad[ptn+j];
+				double *lh_node = &partial_lh_node[state_dad*block];
+				for (i = 0; i < block; i+=VCSIZE) {
+					vc_ptn[j] = mul_add(VectorClass().load_a(&lh_node[i]),
+							VectorClass().load_a(&partial_lh_dad[i]), vc_ptn[j]);
+				}
+			}
+
+//			// initialize vc_tip_partial_lh
+//			for (j = 0; j < VCSIZE; j++) {
+//				double *lh_dad = lh_states_dad[ptn+j];
+//				for (i = 0; i < nstates/VCSIZE; i++) {
+//					vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load_a(&lh_dad[i*VCSIZE]);
+//				}
+//				vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
+//				vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
+//			}
+//
+//			// compute vc_ptn
+//			for (i = 1; i < block/VCSIZE; i++)
+//				for (j = 0; j < VCSIZE; j++) {
+//					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
+//					vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
+//							vc_partial_lh_dad[j], vc_ptn[j]);
+//				}
+
+			vc_freq.load_a(&ptn_freq[ptn]);
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+			lh_ptn = log(abs(lh_ptn));
+			lh_ptn.store_a(&_pattern_lh[ptn]);
+
+			// multiply with pattern frequency
+#ifdef _OPENMP
+			lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
+#else
+			lh_final = mul_add(lh_ptn, vc_freq, lh_final);
+#endif
+		}
+
+#ifdef _OPENMP
+#pragma omp critical
+		{
+			lh_final += lh_final_th;
+    	}
+    }
+#endif
+		tree_lh += horizontal_add(lh_final);
+		// on numerical failure dump the model parameters before aborting
+		if (isnan(tree_lh) || isinf(tree_lh)) {
+			cout.setf(ios::scientific);
+			cout.precision(10);
+			model->writeInfo(cout);
+			site_rate->writeInfo(cout);
+			assert(0);
+		}
+
+		// ascertainment bias correction
+		if (orig_nptn < nptn) {
+			lh_final = 0.0;
+			lh_ptn = 0.0;
+			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+//				double *partial_lh_dad = &dad_branch->partial_lh[ptn*block];
+				// add the previous group; the last group is handled by the
+				// switch below because it may be only partially filled
+				lh_final += lh_ptn;
+				for (j = 0; j < VCSIZE; j++) {
+					vc_ptn[j] = 0.0;
+					double *partial_lh_dad = dad_branch->partial_lh + (ptn+j)*block;
+					int state_dad = ptn_states_dad[ptn+j];
+					double *lh_node = &partial_lh_node[state_dad*block];
+					for (i = 0; i < block; i+=VCSIZE) {
+						vc_ptn[j] = mul_add(VectorClass().load_a(&lh_node[i]),
+								VectorClass().load_a(&partial_lh_dad[i]), vc_ptn[j]);
+					}
+				}
+
+//				// initialize vc_tip_partial_lh
+//				for (j = 0; j < VCSIZE; j++) {
+//					double *lh_dad = lh_states_dad[ptn+j];
+//					for (i = 0; i < nstates/VCSIZE; i++) {
+//						vc_tip_partial_lh[j*(nstates/VCSIZE)+i].load_a(&lh_dad[i*VCSIZE]);
+//					}
+//					vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block]);
+//					vc_ptn[j] = vc_val[0] * vc_tip_partial_lh[j*(nstates/VCSIZE)] * vc_partial_lh_dad[j];
+//				}
+//
+//				// compute vc_ptn
+//				for (i = 1; i < block/VCSIZE; i++)
+//					for (j = 0; j < VCSIZE; j++) {
+//						vc_partial_lh_dad[j].load_a(&partial_lh_dad[j*block+i*VCSIZE]);
+//						vc_ptn[j] = mul_add(vc_val[i] * vc_tip_partial_lh[j*(nstates/VCSIZE)+i%(nstates/VCSIZE)],
+//								vc_partial_lh_dad[j], vc_ptn[j]);
+//					}
+				// ptn_invar[ptn] is not aligned
+				lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+			}
+			// only the valid lanes of the last group are added
+			switch ((nptn-orig_nptn)%VCSIZE) {
+			case 0: prob_const = horizontal_add(lh_final+lh_ptn); break;
+			case 1: prob_const = horizontal_add(lh_final)+lh_ptn[0]; break;
+			case 2: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]; break;
+			case 3: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2]; break;
+			default: assert(0); break;
+			}
+		}
+		aligned_free(ptn_states_dad);
+		aligned_free(partial_lh_node);
+    } else {
+    	// both dad and node are internal nodes
+    	VectorClass vc_partial_lh_node[VCSIZE];
+    	VectorClass vc_partial_lh_dad[VCSIZE], vc_ptn[VCSIZE];
+    	VectorClass lh_final(0.0), vc_freq;
+		VectorClass lh_ptn;
+
+		// copy dummy values because VectorClass will access beyond nptn
+		for (ptn = nptn; ptn < maxptn; ptn++) {
+			memcpy(&dad_branch->partial_lh[ptn*block], dad_branch->partial_lh, block*sizeof(double));
+			memcpy(&node_branch->partial_lh[ptn*block], node_branch->partial_lh, block*sizeof(double));
+		}
+
+#ifdef _OPENMP
+#pragma omp parallel private(ptn, i, j, vc_partial_lh_node, vc_partial_lh_dad, vc_ptn, vc_freq, lh_ptn)
+		{
+		// per-thread accumulator, merged in the critical section below
+		VectorClass lh_final_th = 0.0;
+#pragma omp for nowait
+#endif
+		for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_node = node_branch->partial_lh + ptn*block;
+
+			for (j = 0; j < VCSIZE; j++)
+				vc_ptn[j] = 0.0;
+
+			// vc_ptn[j] accumulates the weighted product for pattern ptn+j
+			for (i = 0; i < block; i+=VCSIZE) {
+				for (j = 0; j < VCSIZE; j++) {
+					vc_partial_lh_node[j].load_a(&partial_lh_node[i+j*block]);
+					vc_partial_lh_dad[j].load_a(&partial_lh_dad[i+j*block]);
+					vc_ptn[j] = mul_add(vc_val[i/VCSIZE] * vc_partial_lh_node[j], vc_partial_lh_dad[j], vc_ptn[j]);
+				}
+			}
+
+			vc_freq.load_a(&ptn_freq[ptn]);
+
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+
+			lh_ptn = log(abs(lh_ptn));
+			lh_ptn.store_a(&_pattern_lh[ptn]);
+#ifdef _OPENMP
+			lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
+#else
+			lh_final = mul_add(lh_ptn, vc_freq, lh_final);
+#endif
+		}
+#ifdef _OPENMP
+#pragma omp critical
+		{
+			lh_final += lh_final_th;
+		}
+	}
+#endif
+
+		tree_lh += horizontal_add(lh_final);
+		assert(!isnan(tree_lh) && !isinf(tree_lh));
+
+		if (orig_nptn < nptn) {
+			// ascertainment bias correction
+			lh_final = 0.0;
+			lh_ptn = 0.0;
+			double *partial_lh_node = &node_branch->partial_lh[orig_nptn*block];
+			double *partial_lh_dad = &dad_branch->partial_lh[orig_nptn*block];
+
+			for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+				// add the previous group; the last group is handled by the
+				// switch below because it may be only partially filled
+				lh_final += lh_ptn;
+
+				for (j = 0; j < VCSIZE; j++)
+					vc_ptn[j] = 0.0;
+
+				for (i = 0; i < block; i+=VCSIZE) {
+					for (j = 0; j < VCSIZE; j++) {
+						vc_partial_lh_node[j].load_a(&partial_lh_node[i+j*block]);
+						vc_partial_lh_dad[j].load_a(&partial_lh_dad[i+j*block]);
+						vc_ptn[j] = mul_add(vc_val[i/VCSIZE] * vc_partial_lh_node[j], vc_partial_lh_dad[j], vc_ptn[j]);
+					}
+				}
+
+				// ptn_invar[ptn] is not aligned
+				lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+				partial_lh_node += block*VCSIZE;
+				partial_lh_dad += block*VCSIZE;
+			}
+			// only the valid lanes of the last group are added
+			switch ((nptn-orig_nptn)%VCSIZE) {
+			case 0: prob_const = horizontal_add(lh_final+lh_ptn); break;
+			case 1: prob_const = horizontal_add(lh_final)+lh_ptn[0]; break;
+			case 2: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]; break;
+			case 3: prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2]; break;
+			default: assert(0); break;
+			}
+		}
+    }
+
+	if (orig_nptn < nptn) {
+    	// ascertainment bias correction
+    	// prob_const is the summed likelihood of the unobserved patterns
+    	prob_const = log(1.0 - prob_const);
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+    	tree_lh -= aln->getNSite()*prob_const;
+    }
+
+    aligned_free(vc_val);
+    return tree_lh;
+}
+
+/**
+ * Compute the tree log-likelihood for mixture models from the cached theta_all
+ * buffer, using the length of the current_it branch.
+ *
+ * Requires theta_all to be allocated and theta_computed to be set (asserted).
+ * The log-likelihood of every observed pattern is stored in _pattern_lh.
+ * If the result is NaN or infinite, a warning listing up to 10 affected
+ * alignment sites is printed, the offending pattern log-likelihoods are
+ * replaced by LOG_SCALING_THRESHOLD*4 and the sum is recomputed.
+ * If unobserved patterns are present, the ascertainment bias correction is
+ * applied to both the returned value and _pattern_lh.
+ *
+ * @return the tree log-likelihood
+ */
+template <class VectorClass, const int VCSIZE, const int nstates>
+double PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD() {
+
+
+	assert(theta_all && theta_computed);
+
+	double tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+
+    size_t ncat = site_rate->getNRate();
+    size_t nmixture = model->getNMixtures();
+    size_t block = nstates * ncat * nmixture;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, j, m;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+//    size_t maxptn = ((nptn+VCSIZE-1)/VCSIZE)*VCSIZE;
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+	// per (mixture, category, state) weight: exp(eval*rate*length) * proportion
+	VectorClass *vc_val0 = (VectorClass*)aligned_alloc<double>(block);
+
+	VectorClass vc_len = current_it->length;
+	for (c = 0; c < ncat; c++) {
+		VectorClass vc_rate = site_rate->getRate(c);
+		for (m = 0; m < nmixture; m++) {
+			VectorClass vc_prop = site_rate->getProp(c)*((ModelMixture*)model)->prop[m];
+			for (i = 0; i < nstates/VCSIZE; i++) {
+				VectorClass cof = VectorClass().load_a(&eval[m*nstates+i*VCSIZE]) * vc_rate;
+				VectorClass val = exp(cof*vc_len) * vc_prop;
+				vc_val0[(m*ncat+c)*nstates/VCSIZE+i] = val;
+			}
+		}
+	}
+
+	VectorClass vc_ptn[VCSIZE];
+	VectorClass vc_freq;
+	VectorClass lh_final = 0.0;
+	// this stores values of VCSIZE consecutive patterns
+	VectorClass lh_ptn;
+
+	// process VCSIZE patterns at the same time for SSE/AVX efficiency
+
+#ifdef _OPENMP
+#pragma omp parallel private (ptn, i, j, vc_freq, vc_ptn, lh_ptn)
+	{
+	// per-thread accumulator, merged in the critical section below
+	VectorClass lh_final_th = 0.0;
+#pragma omp for nowait
+#endif
+	for (ptn = 0; ptn < orig_nptn; ptn+=VCSIZE) {
+		double *theta = theta_all + ptn*block;
+		// initialization
+		for (i = 0; i < VCSIZE; i++) {
+			vc_ptn[i] = vc_val0[0] * VectorClass().load_a(theta+i*block);
+		}
+
+		// vc_ptn[j] accumulates the weighted theta entries of pattern ptn+j
+		for (i = 1; i < block/VCSIZE; i++) {
+			for (j = 0; j < VCSIZE; j++) {
+				vc_ptn[j] = mul_add(VectorClass().load_a(&theta[i*VCSIZE+j*block]), vc_val0[i], vc_ptn[j]);
+			}
+		}
+		lh_ptn = horizontal_add(vc_ptn) + VectorClass().load_a(&ptn_invar[ptn]);
+		lh_ptn = log(abs(lh_ptn));
+		lh_ptn.store_a(&_pattern_lh[ptn]);
+		vc_freq.load_a(&ptn_freq[ptn]);
+
+#ifdef _OPENMP
+		lh_final_th = mul_add(lh_ptn, vc_freq, lh_final_th);
+#else
+		lh_final = mul_add(lh_ptn, vc_freq, lh_final);
+#endif
+
+	}
+
+#ifdef _OPENMP
+#pragma omp critical
+	{
+		lh_final += lh_final_th;
+	}
+}
+#endif
+	tree_lh += horizontal_add(lh_final);
+    if (isnan(tree_lh) || isinf(tree_lh)) {
+        cout << "WARNING: Numerical underflow caused by alignment sites";
+        // i, j and c are reused here as site count, site index and number of
+        // reported sites, respectively
+        i = aln->getNSite();
+        for (j = 0, c = 0; j < i; j++) {
+            ptn = aln->getPatternID(j);
+            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                cout << " " << j+1;
+                c++;
+                if (c >= 10) {
+                    cout << " ...";
+                    break;
+                }
+            }
+        }
+        cout << endl;
+        // recompute the sum with the invalid pattern log-likelihoods clamped
+        tree_lh = current_it->lh_scale_factor + current_it_back->lh_scale_factor;
+        for (ptn = 0; ptn < orig_nptn; ptn++) {
+            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
+            }
+            tree_lh += _pattern_lh[ptn] * ptn_freq[ptn];
+        }
+    }
+
+	if (orig_nptn < nptn) {
+		// ascertainment bias correction
+		lh_final = 0.0;
+		lh_ptn = 0.0;
+		double prob_const;// df_const, ddf_const;
+		double *theta = &theta_all[orig_nptn*block];
+		for (ptn = orig_nptn; ptn < nptn; ptn+=VCSIZE) {
+			// add the previous group; the last group is handled by the
+			// switch below because it may be only partially filled
+			lh_final += lh_ptn;
+
+			// initialization
+			for (i = 0; i < VCSIZE; i++) {
+				vc_ptn[i] = vc_val0[0] * VectorClass().load_a(theta+i*block);
+			}
+
+			for (i = 1; i < block/VCSIZE; i++) {
+				for (j = 0; j < VCSIZE; j++) {
+					vc_ptn[j] = mul_add(VectorClass().load_a(&theta[i*VCSIZE+j*block]), vc_val0[i], vc_ptn[j]);
+				}
+			}
+			theta += block*VCSIZE;
+
+			// ptn_invar[ptn] is not aligned
+			lh_ptn = horizontal_add(vc_ptn) + VectorClass().load(&ptn_invar[ptn]);
+
+		}
+		// only the valid lanes of the last group are added
+		switch ((nptn-orig_nptn) % VCSIZE) {
+		case 0:
+			prob_const = horizontal_add(lh_final+lh_ptn);
+			break;
+		case 1:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0];
+			break;
+		case 2:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1];
+			break;
+		case 3:
+			prob_const = horizontal_add(lh_final)+lh_ptn[0]+lh_ptn[1]+lh_ptn[2];
+			break;
+		default:
+			assert(0);
+			break;
+		}
+    	// prob_const is the summed likelihood of the unobserved patterns
+    	prob_const = log(1.0 - prob_const);
+    	tree_lh -= aln->getNSite() * prob_const;
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+	}
+
+    aligned_free(vc_val0);
+
+    return tree_lh;
+}
+
+
+
+
+#endif /* PHYLOKERNELMIXTURE_H_ */
diff --git a/phylolib.h b/phylolib.h
new file mode 100755
index 0000000..3c80012
--- /dev/null
+++ b/phylolib.h
@@ -0,0 +1,26 @@
+/*
+ *  phylolib.h
+ *
+ *  Created on: Nov 19, 2012
+ *  Author: tung
+ */
+
+#ifndef PHYLOLIB_H_
+#define PHYLOLIB_H_
+
+//#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
+
+#endif /* PHYLOLIB_H_ */
diff --git a/phylonode.cpp b/phylonode.cpp
new file mode 100644
index 0000000..d59129f
--- /dev/null
+++ b/phylonode.cpp
@@ -0,0 +1,94 @@
+//
+// C++ Implementation: phylonode
+//
+// Description: 
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#include "phylonode.h"
+
+
+// Mark the partial likelihood of this neighbor as not computed, then do the
+// same recursively for every neighbor of 'node' that does not lead back to dad.
+void PhyloNeighbor::clearForwardPartialLh(Node *dad) {
+	clearPartialLh();
+	NeighborVec::iterator nei = node->neighbors.begin();
+	while (nei != node->neighbors.end()) {
+		if ((*nei)->node != dad) {
+			PhyloNeighbor *forward = (PhyloNeighbor*)*nei;
+			forward->clearForwardPartialLh(node);
+		}
+		nei ++;
+	}
+}
+
+// Make sure this neighbor owns a partial_lh buffer. If it has none, take over
+// partial_lh and scale_num from the first reverse neighbor (found via the
+// neighbors of 'node' other than dad) that holds one; that reverse neighbor
+// loses its buffers and its "computed" bit.
+void PhyloNeighbor::reorientPartialLh(Node *dad) {
+    if (partial_lh)
+        return;
+    bool done = false;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        PhyloNeighbor *donor = (PhyloNeighbor*)(*it)->node->findNeighbor(node);
+        if (!donor->partial_lh)
+            continue;
+        // move ownership of both buffers from the donor to this neighbor
+        partial_lh = donor->partial_lh;
+        donor->partial_lh = NULL;
+        scale_num = donor->scale_num;
+        donor->scale_num = NULL;
+        donor->partial_lh_computed &= ~1; // clear bit
+        done = true;
+        break;
+    }
+    assert(done && "partial_lh is not re-oriented");
+}
+
+
+// Reset the "computed" flags of the neighbor pointing from this node to dad,
+// then recurse into all other adjacent nodes with this node as their dad.
+void PhyloNode::clearReversePartialLh(PhyloNode *dad) {
+	PhyloNeighbor *toward_dad = (PhyloNeighbor*)findNeighbor(dad);
+	assert(toward_dad);
+	toward_dad->partial_lh_computed = 0;
+	NeighborVec::iterator nei = neighbors.begin();
+	while (nei != neighbors.end()) {
+		if ((*nei)->node != dad) {
+			PhyloNode *child = (PhyloNode*)(*nei)->node;
+			child->clearReversePartialLh(this);
+		}
+		nei ++;
+	}
+}
+
+// Reset the "computed" flags on both directions of the branch between this
+// node and dad, optionally setting the partial_lh pointers to NULL, and
+// recurse into all other adjacent nodes with this node as their dad.
+void PhyloNode::clearAllPartialLh(bool make_null, PhyloNode *dad) {
+	// direction this node -> dad
+	PhyloNeighbor *toward_dad = (PhyloNeighbor*)findNeighbor(dad);
+	toward_dad->partial_lh_computed = 0;
+	if (make_null)
+		toward_dad->partial_lh = NULL;
+
+	// direction dad -> this node
+	PhyloNeighbor *from_dad = (PhyloNeighbor*)dad->findNeighbor(this);
+	from_dad->partial_lh_computed = 0;
+	if (make_null)
+		from_dad->partial_lh = NULL;
+
+	NeighborVec::iterator nei = neighbors.begin();
+	while (nei != neighbors.end()) {
+		if ((*nei)->node != dad) {
+			PhyloNode *child = (PhyloNode*)(*nei)->node;
+			child->clearAllPartialLh(make_null, this);
+		}
+		nei ++;
+	}
+}
+
+
+// All constructors forward their arguments to the matching Node constructor
+// and then call init().
+
+PhyloNode::PhyloNode() : Node() {
+	init();
+}
+
+
+PhyloNode::PhyloNode(int aid)
+ : Node(aid)
+{
+	init();
+}
+
+PhyloNode::PhyloNode(int aid, int aname)
+ : Node(aid, aname)
+{
+	init();
+}
+
+
+PhyloNode::PhyloNode(int aid, const char *aname)
+ : Node(aid, aname)
+{
+	init();
+}
+
+// Common initialization hook called by every PhyloNode constructor.
+// Currently a no-op: its only statement is commented out.
+void PhyloNode::init() {
+	//partial_lh = NULL;
+}
+
+
+// Append a newly allocated PhyloNeighbor (pointing to 'node', with the given
+// branch length and branch ID) to this node's neighbor list.
+void PhyloNode::addNeighbor(Node *node, double length, int id) {
+	PhyloNeighbor *new_nei = new PhyloNeighbor(node, length, id);
+	neighbors.push_back(new_nei);
+}
diff --git a/phylonode.h b/phylonode.h
new file mode 100644
index 0000000..5bc4563
--- /dev/null
+++ b/phylonode.h
@@ -0,0 +1,201 @@
+//
+// C++ Interface: phylonode
+//
+// Description:
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+#ifndef PHYLONODE_H
+#define PHYLONODE_H
+
+#include "node.h"
+
+// NOTE(review): despite its name this is a signed short int, not an unsigned byte.
+// It is the element type of PhyloNeighbor::scale_num below.
+typedef short int UBYTE;
+
+/**
+A neighbor in a phylogenetic tree: a Neighbor extended with pointers to the
+partial likelihood, scaling and partial parsimony vectors of the branch.
+
+    @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
+ */
+class PhyloNeighbor : public Neighbor {
+    friend class PhyloNode;
+    friend class PhyloTree;
+    friend class IQTree;
+    friend class PhyloSuperTree;
+
+public:
+    friend class TinaTree;
+    friend class PhyloSuperTreePlen;
+
+    /**
+        construct class with a node and length; all vectors start out as NULL
+        and the partial likelihood is flagged as not computed
+        @param anode the other end of the branch
+        @param alength length of branch
+     */
+    PhyloNeighbor(Node *anode, double alength)
+        : Neighbor(anode, alength),
+          partial_lh_computed(0),
+          partial_lh(NULL),
+          lh_scale_factor(0.0),
+          scale_num(NULL),
+          partial_pars(NULL)
+    {
+    }
+
+    /**
+        construct class with a node, length and branch ID; all vectors start out
+        as NULL and the partial likelihood is flagged as not computed
+        @param anode the other end of the branch
+        @param alength length of branch
+        @param aid branch ID
+     */
+    PhyloNeighbor(Node *anode, double alength, int aid)
+        : Neighbor(anode, alength, aid),
+          partial_lh_computed(0),
+          partial_lh(NULL),
+          lh_scale_factor(0.0),
+          scale_num(NULL),
+          partial_pars(NULL)
+    {
+    }
+
+    /**
+        flag the partial likelihood vector as not computed
+     */
+    inline void clearPartialLh() {
+        partial_lh_computed = 0;
+    }
+
+    /**
+        flag the partial likelihood vector as computed
+     */
+    inline void unclearPartialLh() {
+        partial_lh_computed = 1;
+    }
+
+    /**
+        clear all partial likelihood recursively in forward direction
+        @param dad dad of this neighbor
+     */
+    void clearForwardPartialLh(Node *dad);
+
+    /**
+        if partial_lh is NULL, reorient partial_lh (LM_PER_NODE technique)
+        @param dad dad of this neighbor
+    */
+    void reorientPartialLh(Node *dad);
+
+    /**
+        For Upper Bounds analysis: read access to the partial likelihood vector
+        @return the partial_lh pointer (may be NULL)
+     */
+    double* get_partial_lh() {
+        return partial_lh;
+    }
+
+    /**
+        For Upper Bounds analysis: read access to the likelihood scaling factor
+        @return lh_scale_factor
+     */
+    double get_lh_scale_factor() {
+        return lh_scale_factor;
+    }
+
+    /**
+        For Upper Bounds analysis: read access to the computed flag
+        @return partial_lh_computed
+     */
+    int get_partial_lh_computed() {
+        return partial_lh_computed;
+    }
+
+private:
+
+    /**
+        non-zero if the partial likelihood was computed; set to 0 / 1 by
+        clearPartialLh() / unclearPartialLh(), and bit 0 is also cleared
+        individually (&= ~1) when partial_lh is re-oriented
+     */
+    int partial_lh_computed;
+
+    /**
+        vector containing the partial likelihoods
+     */
+    double *partial_lh;
+
+    /**
+        likelihood scaling factor
+     */
+    double lh_scale_factor;
+
+    /**
+        vector containing number of scaling events per pattern
+     */
+    UBYTE *scale_num;
+
+    /**
+        vector containing the partial parsimony scores
+     */
+    UINT *partial_pars;
+
+};
+
+/**
+A node in a phylogenetic tree; its neighbors are created as PhyloNeighbor objects
+(see addNeighbor).
+
+    @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
+ */
+class PhyloNode : public Node {
+    friend class PhyloTree;
+
+public:
+    /**
+        default constructor
+     */
+    PhyloNode();
+
+    /**
+        constructor
+        @param aid id of this node
+     */
+    PhyloNode(int aid);
+
+    /**
+        constructor
+        @param aid id of this node
+        @param aname integer name of this node
+     */
+    PhyloNode(int aid, int aname);
+
+    /**
+        constructor
+        @param aid id of this node
+        @param aname string name of this node
+     */
+    PhyloNode(int aid, const char *aname);
+
+    /**
+        initialization, called by every constructor
+     */
+    void init();
+
+    /**
+        add a neighbor (allocated as a PhyloNeighbor)
+        @param node the neighbor node
+        @param length branch length
+        @param id branch ID
+     */
+    virtual void addNeighbor(Node *node, double length, int id = -1);
+
+
+
+    /**
+        tell that all partial likelihood vectors below this node are not computed
+        @param make_null true to also reset the partial_lh pointers to NULL
+        @param dad neighbor of this node from which the traversal arrived
+     */
+    void clearAllPartialLh(bool make_null, PhyloNode *dad);
+
+    /**
+        tell that all partial likelihood vectors (in reverse direction) below this node are not computed
+        @param dad neighbor of this node from which the traversal arrived
+     */
+    void clearReversePartialLh(PhyloNode *dad);
+
+    /**
+        NOTE(review): undocumented upstream; presumably the counterpart of
+        clearReversePartialLh that (re)computes the reverse partial likelihoods - confirm
+        @param dad presumably the neighbor from which the traversal arrived - confirm
+     */
+    void computeReversePartialLh(PhyloNode *dad);
+
+};
+
+
+/**
+    Node vector
+ */
+typedef vector<PhyloNode*> PhyloNodeVector;
+
+
+#endif
diff --git a/phylosupertree.cpp b/phylosupertree.cpp
new file mode 100644
index 0000000..9c42346
--- /dev/null
+++ b/phylosupertree.cpp
@@ -0,0 +1,1449 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "phylosupertree.h"
+#include "superalignment.h"
+#include "superalignmentpairwise.h"
+#include "msetsblock.h"
+#include "myreader.h"
+#include "phylotesting.h"
+
+/**
+    Default constructor: zero the NNI counters and switch off codon branch length
+    rescaling. No partition exists at this point, so the per-partition counters of
+    evaluated NNIs are not initialized here.
+ */
+PhyloSuperTree::PhyloSuperTree() : IQTree() {
+	evalNNIs = 0;
+	totalNNIs = 0;
+	rescale_codon_brlen = false;
+}
+
+/**
+    Build a super tree on a super alignment, taking the partition information and the
+    codon rescaling flag from an existing super tree.
+    @param alignment super alignment; one PhyloTree is created per entry of alignment->partitions
+    @param super_tree super tree whose part_info and rescale_codon_brlen are copied
+ */
+PhyloSuperTree::PhyloSuperTree(SuperAlignment *alignment, PhyloSuperTree *super_tree) :  IQTree(alignment) {
+	evalNNIs = 0;
+	totalNNIs = 0;
+	rescale_codon_brlen = super_tree->rescale_codon_brlen;
+	part_info = super_tree->part_info;
+
+	// one sub-tree per partition alignment
+	for (size_t i = 0; i < alignment->partitions.size(); i++)
+		push_back(new PhyloTree(alignment->partitions[i]));
+
+	// reset the counter for evaluated NNIs of every sub-tree
+	for (size_t part = 0; part < size(); part++)
+		part_info[part].evalNNIs = 0.0;
+
+	aln = alignment;
+}
+
+/**
+    Read a comma-separated partition file with one partition per line:
+        name, model, alignment file, sequence type, position specification
+    Empty model / alignment file / sequence type fields fall back to the corresponding
+    values in params. For every record a PartitionInfo is appended to part_info and a
+    PhyloTree on the (possibly site-extracted) alignment is appended to this super tree.
+    I/O failures and thrown strings are reported through outError().
+    @param params program parameters (partition_file, model_name, aln_file, sequence_type, intype)
+ */
+void PhyloSuperTree::readPartition(Params &params) {
+	try {
+		ifstream in;
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(params.partition_file);
+		// after a successful open only badbit throws, so that reaching the end of
+		// the file inside getline() terminates the loop instead of raising
+		in.exceptions(ios::badbit);
+		Params origin_params = params;
+		PartitionInfo info;
+
+		while (!in.eof()) {
+			getline(in, info.name, ',');
+			if (in.eof()) break;
+			getline(in, info.model_name, ',');
+			if (info.model_name == "") info.model_name = params.model_name;
+			getline(in, info.aln_file, ',');
+			if (info.aln_file == "" && params.aln_file) info.aln_file = params.aln_file;
+			getline(in, info.sequence_type, ',');
+			if (info.sequence_type=="" && params.sequence_type) info.sequence_type = params.sequence_type;
+			getline(in, info.position_spec);
+            trimString(info.sequence_type);
+			cout << endl << "Reading partition " << info.name << " (model=" << info.model_name << ", aln=" <<
+					info.aln_file << ", seq=" << info.sequence_type << ", pos=" << info.position_spec << ") ..." << endl;
+
+			info.nniMoves[0].ptnlh = NULL;
+			info.nniMoves[1].ptnlh = NULL;
+			info.cur_ptnlh = NULL;
+			part_info.push_back(info);
+			Alignment *part_aln = new Alignment((char*)info.aln_file.c_str(), (char*)info.sequence_type.c_str(), params.intype);
+			if (!info.position_spec.empty()) {
+				// keep only the requested sites; the full alignment is no longer needed
+				Alignment *new_aln = new Alignment();
+				new_aln->extractSites(part_aln, info.position_spec.c_str());
+				delete part_aln;
+				part_aln = new_aln;
+			}
+			PhyloTree *tree = new PhyloTree(part_aln);
+			push_back(tree);
+			params = origin_params;
+		}
+
+		in.clear();
+		// set the failbit again
+		in.exceptions(ios::failbit | ios::badbit);
+		in.close();
+	} catch (ios::failure &) {
+		// caught by reference, consistent with the print*Partition functions of this file
+		outError(ERR_READ_INPUT);
+	} catch (const string &str) {
+		outError(str);
+	}
+}
+
+/**
+    Read a RAxML-style partition file. Every line has the form
+        MODEL, partition_name = position specification
+    MODEL is translated as follows: an "ASC_" prefix becomes "+ASC", a trailing 'F'
+    (except for DAYHOFF) becomes "+F", a trailing 'X' (except for LG4X) becomes "+FO",
+    BIN / DNA / MULTI / CODON* select the data type together with the GTR2 / GTR / MK / GY
+    model, and anything else is taken as a protein model. The rate heterogeneity suffix
+    of params.model_name (everything from the first '+' or '*') is appended to each model.
+    All partitions are extracted from the single alignment params.aln_file.
+    @param params program parameters (partition_file, aln_file, sequence_type, intype,
+        model_name, remove_empty_seq)
+ */
+void PhyloSuperTree::readPartitionRaxml(Params &params) {
+	try {
+		ifstream in;
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(params.partition_file);
+		// after a successful open only badbit throws, so EOF ends the loop normally
+		in.exceptions(ios::badbit);
+		PartitionInfo info;
+        Alignment *input_aln = NULL;
+        if (!params.aln_file)
+            outError("Please supply an alignment with -s option");
+
+        input_aln = new Alignment(params.aln_file, params.sequence_type, params.intype);
+
+        cout << endl << "Partition file is not in NEXUS format, assuming RAxML-style partition file..." << endl;
+
+        size_t pos = params.model_name.find_first_of("+*");
+        string rate_type = "";
+        if (pos != string::npos) rate_type = params.model_name.substr(pos);
+
+		while (!in.eof()) {
+			getline(in, info.model_name, ',');
+			if (in.eof()) break;
+            trimString(info.model_name);
+
+            bool is_ASC = info.model_name.substr(0,4) == "ASC_";
+            if (is_ASC) info.model_name.erase(0, 4);
+            StateFreqType freq = FREQ_UNKNOWN;
+            // the emptiness test must come first: *rbegin() on an empty string is undefined
+            if (!info.model_name.empty() && info.model_name.find_first_of("*+{") == string::npos ) {
+                if (*info.model_name.rbegin() == 'F' && info.model_name != "DAYHOFF") {
+                    freq = FREQ_EMPIRICAL;
+                    info.model_name.erase(info.model_name.length()-1);
+                } else if (*info.model_name.rbegin() == 'X' && info.model_name != "LG4X") {
+                    freq = FREQ_ESTIMATE;
+                    info.model_name.erase(info.model_name.length()-1);
+                }
+            }
+
+            if (info.model_name.empty())
+                outError("Please give model names in partition file!");
+            if (info.model_name == "BIN") {
+                info.sequence_type = "BIN";
+                info.model_name = "GTR2";
+            } else if (info.model_name == "DNA") {
+                info.sequence_type = "DNA";
+                info.model_name = "GTR";
+            } else if (info.model_name == "MULTI") {
+                info.sequence_type = "MORPH";
+                info.model_name = "MK";
+            } else if (info.model_name.substr(0,5) == "CODON") {
+                info.sequence_type = info.model_name;
+                info.model_name = "GY";
+            } else {
+                info.sequence_type = "AA";
+                if (*info.model_name.begin() == '[') {
+                    if (*info.model_name.rbegin() != ']')
+                        outError("User-defined protein model should be [myProtenSubstitutionModelFileName]");
+                    info.model_name = info.model_name.substr(1, info.model_name.length()-2);
+                }
+            }
+
+            if (freq == FREQ_EMPIRICAL)
+                info.model_name += "+F";
+            else if (freq == FREQ_ESTIMATE)
+                info.model_name += "+FO";
+            if (is_ASC)
+                info.model_name += "+ASC";
+            info.model_name += rate_type;
+
+			getline(in, info.name, '=');
+            trimString(info.name);
+            if (info.name.empty())
+                outError("Please give partition names in partition file!");
+
+			getline(in, info.position_spec);
+            trimString(info.position_spec);
+            if (info.position_spec.empty())
+                outError("Please specify alignment positions for partition " + info.name);
+            // RAxML separates site ranges by commas; extractSites() is given blanks instead
+            std::replace(info.position_spec.begin(), info.position_spec.end(), ',', ' ');
+
+			cout << "Reading partition " << info.name << " (model=" << info.model_name << ", seq=" << info.sequence_type << ", pos=" << info.position_spec << ") ..." << endl;
+
+			info.nniMoves[0].ptnlh = NULL;
+			info.nniMoves[1].ptnlh = NULL;
+			info.cur_ptnlh = NULL;
+			part_info.push_back(info);
+            Alignment *part_aln = new Alignment();
+            part_aln->extractSites(input_aln, info.position_spec.c_str());
+
+			Alignment *new_aln;
+			if (params.remove_empty_seq)
+				new_aln = part_aln->removeGappySeq();
+			else
+				new_aln = part_aln;
+		    // also rebuild states set of each sequence for likelihood computation
+		    new_aln->buildSeqStates();
+
+			if (part_aln != new_aln) delete part_aln;
+			PhyloTree *tree = new PhyloTree(new_aln);
+            push_back(tree);
+            cout << new_aln->getNSeq() << " sequences and " << new_aln->getNSite() << " sites extracted" << endl;
+		}
+
+		// every partition owns an extracted copy, so the full input alignment can be
+		// released here (readPartitionNexus releases its input alignment the same way)
+		delete input_aln;
+
+		in.clear();
+		// set the failbit again
+		in.exceptions(ios::failbit | ios::badbit);
+		in.close();
+	} catch (ios::failure &) {
+		outError(ERR_READ_INPUT);
+	} catch (const string &str) {
+		outError(str);
+	}
+}
+
+/**
+    Read the partitions from the SETS block of the NEXUS file params.partition_file.
+    For every selected CharSet a PartitionInfo is appended to part_info and a PhyloTree
+    on the partition alignment is appended to this super tree. If no CharSet carries a
+    model name, all CharSets are used; otherwise only those with a non-empty char_partition.
+    @param params program parameters (partition_file, aln_file, sequence_type, intype,
+        model_name, remove_empty_seq); restored to their entry values after each partition
+ */
+void PhyloSuperTree::readPartitionNexus(Params &params) {
+	Params origin_params = params;
+	// NOTE(review): sets_block is never deleted in this function - confirm who owns it
+	MSetsBlock *sets_block = new MSetsBlock();
+    MyReader nexus(params.partition_file);
+    nexus.Add(sets_block);
+    MyToken token(nexus.inf);
+    nexus.Execute(token);
+
+    // alignment given on the command line, shared by all CharSets without their own file
+    Alignment *input_aln = NULL;
+    if (params.aln_file) {
+    	input_aln = new Alignment(params.aln_file, params.sequence_type, params.intype);
+    }
+
+    // "empty" means that no CharSet has a model assigned
+    bool empty_partition = true;
+    vector<CharSet*>::iterator it;
+    for (it = sets_block->charsets.begin(); it != sets_block->charsets.end(); it++)
+    	if ((*it)->model_name != "") {
+    		empty_partition = false;
+    		break;
+    	}
+    if (empty_partition) {
+    	cout << "NOTE: No CharPartition defined, use all CharSets" << endl;
+    }
+
+    for (it = sets_block->charsets.begin(); it != sets_block->charsets.end(); it++)
+    	if (empty_partition || (*it)->char_partition != "") {
+			PartitionInfo info;
+			info.name = (*it)->name;
+			info.model_name = (*it)->model_name;
+			if (info.model_name == "")
+				info.model_name = params.model_name;
+			info.aln_file = (*it)->aln_file;
+			//if (info.aln_file == "" && params.aln_file) info.aln_file = params.aln_file;
+			if (info.aln_file == "" && !params.aln_file)
+				outError("No input data for partition ", info.name);
+			info.sequence_type = (*it)->sequence_type;
+			if (info.sequence_type=="" && params.sequence_type) info.sequence_type = params.sequence_type;
+            
+            if (info.sequence_type == "" && !info.model_name.empty()) {
+                // try to get sequence type from model
+                info.sequence_type = getSeqType(info.model_name.substr(0, info.model_name.find_first_of("+*")));
+            }
+			info.position_spec = (*it)->position_spec;
+			trimString(info.sequence_type);
+			cout << endl << "Reading partition " << info.name << " (model=" << info.model_name << ", aln=" <<
+				info.aln_file << ", seq=" << info.sequence_type << ", pos=" << info.position_spec << ") ..." << endl;
+            if (info.sequence_type != "" && Alignment::getSeqType(info.sequence_type.c_str()) == SEQ_UNKNOWN)
+                outError("Unknown sequence type " + info.sequence_type);
+			//info.mem_ptnlh = NULL;
+			info.nniMoves[0].ptnlh = NULL;
+			info.nniMoves[1].ptnlh = NULL;
+			info.cur_ptnlh = NULL;
+			part_info.push_back(info);
+			// part_aln is either freshly loaded from the CharSet's own file or the shared input_aln
+			Alignment *part_aln;
+			if (info.aln_file != "") {
+				part_aln = new Alignment((char*)info.aln_file.c_str(), (char*)info.sequence_type.c_str(), params.intype);
+			} else {
+				part_aln = input_aln;
+			}
+			// restrict to the requested sites; the shared input_aln must survive for later CharSets
+			if (!info.position_spec.empty() && info.position_spec != "*") {
+				Alignment *new_aln = new Alignment();
+				new_aln->extractSites(part_aln, info.position_spec.c_str());
+				if (part_aln != input_aln) delete part_aln;
+				part_aln = new_aln;
+			}
+            // DNA data declared as CODON... / NT2AA... is converted; the text after the
+            // 5-character prefix is passed on to convertToCodonOrAA
+            if (part_aln->seq_type == SEQ_DNA && (info.sequence_type.substr(0, 5) == "CODON" || info.sequence_type.substr(0, 5) == "NT2AA")) {
+				Alignment *new_aln = new Alignment();
+                new_aln->convertToCodonOrAA(part_aln, &info.sequence_type[5], info.sequence_type.substr(0, 5) == "NT2AA");
+                if (part_aln != input_aln) delete part_aln;
+                part_aln = new_aln;
+            }
+			Alignment *new_aln;
+			if (params.remove_empty_seq)
+				new_aln = part_aln->removeGappySeq();
+			else
+				new_aln = part_aln;
+		    // also rebuild states set of each sequence for likelihood computation
+		    new_aln->buildSeqStates();
+
+			if (part_aln != new_aln && part_aln != input_aln) delete part_aln;
+			// NOTE(review): if no site extraction / conversion / sequence removal happened,
+			// new_aln is input_aln itself; the tree then keeps a pointer that is deleted
+			// at the end of this function - confirm whether that path can occur
+			PhyloTree *tree = new PhyloTree(new_aln);
+			push_back(tree);
+			params = origin_params;
+			cout << new_aln->getNSeq() << " sequences and " << new_aln->getNSite() << " sites extracted" << endl;
+    	}
+
+    if (input_aln)
+    	delete input_aln;
+}
+
+/**
+    Write a NEXUS SETS block describing the partitions as consecutive site ranges
+    (partition 1 starts at site 1) together with the model string of each partition.
+    '+' characters in partition names are written as '_'.
+    @param filename output file; write failures are reported through outError()
+ */
+void PhyloSuperTree::printPartition(const char *filename) {
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename);
+		out << "#nexus" << endl << "[ partition information for alignment written in .conaln file ]" << endl
+			<< "begin sets;" << endl;
+
+		// charsets: each partition covers the sites directly after the previous one
+		int first_site = 1;
+		for (int part = 0; part < part_info.size(); part++) {
+			string label = part_info[part].name;
+			replace(label.begin(), label.end(), '+', '_');
+			int last_site = first_site + at(part)->getAlnNSite() - 1;
+			out << "  charset " << label << " = " << first_site << "-" << last_site << ";" << endl;
+			first_site = last_site + 1;
+		}
+
+		// charpartition: comma-separated "model:name" entries, one per line
+		out << "  charpartition mymodels =" << endl;
+		for (int part = 0; part < part_info.size(); part++) {
+			string label = part_info[part].name;
+			replace(label.begin(), label.end(), '+', '_');
+			if (part != 0)
+				out << "," << endl;
+			out << "    " << at(part)->getModelNameParams() << ":" << label;
+		}
+		out << ";" << endl;
+		out << "end;" << endl;
+		out.close();
+		cout << "Partition information was printed to " << filename << endl;
+	} catch (ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+}
+
+/**
+    Write a NEXUS SETS block that repeats, for every partition, its alignment file (if any),
+    a "CODON, " marker for codon data and its original position specification, followed by
+    a charpartition listing the model name stored in part_info for each partition.
+    '+' characters in partition names are written as '_'.
+    @param filename output file; write failures are reported through outError()
+ */
+void PhyloSuperTree::printBestPartition(const char *filename) {
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename);
+		out << "#nexus" << endl
+			<< "begin sets;" << endl;
+
+		for (int part = 0; part < part_info.size(); part++) {
+			string label = part_info[part].name;
+			replace(label.begin(), label.end(), '+', '_');
+			out << "  charset " << label << " = ";
+			if (!part_info[part].aln_file.empty())
+				out << part_info[part].aln_file << ": ";
+			if (at(part)->aln->seq_type == SEQ_CODON)
+				out << "CODON, ";
+			// commas inside the position specification are written as blanks
+			string sites = part_info[part].position_spec;
+			replace(sites.begin(), sites.end(), ',' , ' ');
+			out << sites << ";" << endl;
+		}
+
+		out << "  charpartition mymodels =" << endl;
+		for (int part = 0; part < part_info.size(); part++) {
+			string label = part_info[part].name;
+			replace(label.begin(), label.end(), '+', '_');
+			if (part != 0)
+				out << "," << endl;
+			out << "    " << part_info[part].model_name << ": " << label;
+		}
+		out << ";" << endl;
+		out << "end;" << endl;
+		out.close();
+		cout << "Partition information was printed to " << filename << endl;
+	} catch (ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+}
+
+
+/**
+    Write the partitions in RAxML format as consecutive site ranges (partition 1 starts
+    at site 1). Nothing is written, and an INFO line is printed, as soon as one partition
+    has its own alignment file. '+' characters in partition names are written as '_'.
+    @param filename output file; write failures are reported through outError()
+ */
+void PhyloSuperTree::printPartitionRaxml(const char *filename) {
+	for (int p = 0; p < part_info.size(); p++) {
+		if (part_info[p].aln_file != "") {
+			cout << "INFO: Printing partition in RAxML format is not possible" << endl;
+			return;
+		}
+	}
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename);
+		int first_site = 1;
+		for (int part = 0; part < part_info.size(); part++) {
+			string label = part_info[part].name;
+			replace(label.begin(), label.end(), '+', '_');
+			int last_site = first_site + at(part)->getAlnNSite() - 1;
+
+			// data type keyword, or the model name for all remaining sequence types
+			if (at(part)->aln->seq_type == SEQ_DNA)
+				out << "DNA, ";
+			else if (at(part)->aln->seq_type == SEQ_BINARY)
+				out << "BIN, ";
+			else if (at(part)->aln->seq_type == SEQ_MORPH)
+				out << "MULTI, ";
+			else
+				out << at(part)->getModel()->name << ",";
+
+			out << label << " = " << first_site << "-" << last_site << endl;
+			first_site = last_site + 1;
+		}
+		out.close();
+		cout << "Partition information in Raxml format was printed to " << filename << endl;
+	} catch (ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+}
+
+/**
+    Write the partitions in RAxML format using the model names stored in part_info and
+    the original position specifications. "+ASC" in a model name is written as an "ASC_"
+    prefix, "+FO" as a trailing 'X' and "+F" as a trailing 'F'. Nothing is written, and an
+    INFO line is printed, as soon as one partition has its own alignment file.
+    @param filename output file; write failures are reported through outError()
+ */
+void PhyloSuperTree::printBestPartitionRaxml(const char *filename) {
+	for (int p = 0; p < part_info.size(); p++) {
+		if (part_info[p].aln_file != "") {
+			cout << "INFO: Printing partition in RAxML format is not possible" << endl;
+			return;
+		}
+	}
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename);
+		for (int part = 0; part < part_info.size(); part++) {
+			string label = part_info[part].name;
+			replace(label.begin(), label.end(), '+', '_');
+			string model = part_info[part].model_name;
+
+			if (model.find("+ASC") != string::npos)
+				out << "ASC_";
+			switch (at(part)->aln->seq_type) {
+			case SEQ_DNA:
+				out << "DNA";
+				break;
+			case SEQ_BINARY:
+				out << "BIN";
+				break;
+			case SEQ_MORPH:
+				out << "MULTI";
+				break;
+			case SEQ_PROTEIN:
+				// base model name only, i.e. everything before the first '*', '{' or '+'
+				out << model.substr(0, model.find_first_of("*{+"));
+				break;
+			case SEQ_CODON:
+				out << "CODON_" << model.substr(0, model.find_first_of("*{+"));
+				break;
+			default:
+				out << model;
+				break;
+			}
+			// "+FO" has to be tested first because it also contains "+F"
+			if (model.find("+FO") != string::npos)
+				out << "X";
+			else if (model.find("+F") != string::npos)
+				out << "F";
+
+			out << ", " << label << " = " << part_info[part].position_spec << endl;
+		}
+		out.close();
+		cout << "Partition information in Raxml format was printed to " << filename << endl;
+	} catch (ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+}
+
+
+/**
+    Build a super tree from a partition file: read the partitions (NEXUS or RAxML style,
+    decided by detectInputFile), create the super alignment, optionally print the
+    concatenated alignment, and decide whether codon branch lengths must be rescaled.
+    @param params program parameters (partition_file, print_conaln, out_prefix, num_threads, ...)
+ */
+PhyloSuperTree::PhyloSuperTree(Params &params) :  IQTree() {
+	evalNNIs = 0;
+	totalNNIs = 0;
+
+	cout << "Reading partition model file " << params.partition_file << " ..." << endl;
+	if (detectInputFile(params.partition_file) != IN_NEXUS) {
+		readPartitionRaxml(params);
+	} else {
+		readPartitionNexus(params);
+		if (part_info.empty()) {
+			outError("No partition found in SETS block. An example syntax looks like: \n#nexus\nbegin sets;\n  charset part1=1-100;\n  charset part2=101-300;\nend;");
+		}
+	}
+	if (part_info.empty())
+		outError("No partition found");
+
+	// reset the counter for evaluated NNIs of every sub-tree
+	for (size_t p = 0; p < size(); p++)
+		part_info[p].evalNNIs = 0.0;
+
+	aln = new SuperAlignment(this);
+	if (params.print_conaln) {
+		string conaln_file = params.out_prefix;
+		conaln_file += ".conaln";
+		((SuperAlignment*)aln)->printCombinedAlignment(conaln_file.c_str());
+	}
+
+	// this is important: rescale branch length of codon partitions to be compatible with other partitions.
+	// since for codon models, branch lengths = # nucleotide subst per codon site!
+	// Rescaling is needed only when codon and non-codon partitions are mixed.
+	bool has_codon = false;
+	bool has_other = false;
+	for (iterator it = begin(); it != end(); it++) {
+		if ((*it)->aln->seq_type == SEQ_CODON)
+			has_codon = true;
+		else
+			has_other = true;
+	}
+	rescale_codon_brlen = has_other && has_codon;
+	if (rescale_codon_brlen)
+		cout << "NOTE: Mixed codon and other data, branch lengths of codon partitions are rescaled by 3!" << endl;
+
+	cout << "Degree of missing data: " << ((SuperAlignment*)aln)->computeMissingData() << endl;
+
+#ifdef _OPENMP
+    if (params.num_threads > size()) {
+        outWarning("More threads (" + convertIntToString(params.num_threads) + ") than number of partitions (" + convertIntToString(size()) + ") is not necessary. ");
+        outWarning("Please rerun again with -nt " + convertIntToString(size()));
+    }
+#endif
+	cout << endl;
+}
+
+/**
+    Pass the program parameters to this super tree and to each of its partition trees.
+    @param params program parameters
+ */
+void PhyloSuperTree::setParams(Params* params) {
+	IQTree::setParams(params);
+	for (size_t part = 0; part < size(); part++)
+		at(part)->setParams(params);
+}
+
+/**
+    Initialize the settings of this super tree via IQTree::initSettings and copy the
+    parameter pointer, likelihood kernel and Newton optimization flag to every
+    partition tree.
+    @param params program parameters; each partition tree stores the address of this object
+ */
+void PhyloSuperTree::initSettings(Params &params) {
+	IQTree::initSettings(params);
+	for (iterator it = begin(); it != end(); it++) {
+		// address-of params (the archived text had this mangled into a pilcrow sign)
+		(*it)->params = &params;
+		(*it)->setLikelihoodKernel(params.SSE);
+		(*it)->optimize_by_newton = params.optimize_by_newton;
+	}
+}
+
+/**
+    Set the likelihood kernel of this super tree and of each of its partition trees.
+    @param lk likelihood kernel to use
+ */
+void PhyloSuperTree::setLikelihoodKernel(LikelihoodKernel lk) {
+	PhyloTree::setLikelihoodKernel(lk);
+	for (size_t part = 0; part < size(); part++) {
+		at(part)->setLikelihoodKernel(lk);
+	}
+}
+
+/**
+    Change the likelihood kernel. The work is delegated entirely to
+    PhyloTree::changeLikelihoodKernel; an earlier per-partition implementation
+    is no longer active.
+    @param lk likelihood kernel to switch to
+ */
+void PhyloSuperTree::changeLikelihoodKernel(LikelihoodKernel lk) {
+	PhyloTree::changeLikelihoodKernel(lk);
+}
+
+/**
+    Serialize the super tree followed by all partition trees, each printed with
+    branch lengths and terminated by a newline.
+    @return the concatenated tree strings
+ */
+string PhyloSuperTree::getTreeString() {
+	stringstream buffer;
+	const int format = WT_BR_LEN+WT_NEWLINE;
+	printTree(buffer, format);
+	for (size_t part = 0; part < size(); part++)
+		at(part)->printTree(buffer, format);
+	return buffer.str();
+}
+
+/**
+    Rebuild the super tree and all partition trees from a string holding one tree for
+    the super tree followed by one tree per partition (the layout getTreeString writes),
+    then re-link the trees and reset the current score.
+    @param tree_string the serialized trees
+ */
+void PhyloSuperTree::readTreeString(const string &tree_string) {
+	stringstream str;
+	str << tree_string;
+	str.seekg(0, ios::beg);
+
+	// the first tree in the stream is the super tree itself
+	freeNode();
+	readTree(str, rooted);
+	setAlignment(aln);
+	setRootNode(params->root);
+
+	// the remaining trees belong to the partitions, in order
+	for (size_t part = 0; part < size(); part++) {
+		PhyloTree *part_tree = at(part);
+		part_tree->freeNode();
+		part_tree->readTree(str, rooted);
+	}
+	linkTrees();
+
+	if (params->pll) {
+		// this combination is asserted never to be reached
+		assert(0);
+		pllReadNewick(getTreeString());
+	}
+	resetCurScore();
+}
+
+
+/**
+ * Save the branch lengths of the super tree followed by those of every partition
+ * tree into one vector, which is resized to startid plus the total branch count.
+ * @param lenvec (OUT) vector receiving the branch lengths
+ * @param startid index in lenvec at which the super tree's lengths start
+ * @param node not used by this override
+ * @param dad not used by this override
+ */
+void PhyloSuperTree::saveBranchLengths(DoubleVector &lenvec, int startid, PhyloNode *node, PhyloNode *dad) {
+	int total_branches = branchNum;
+	for (size_t part = 0; part < size(); part++)
+		total_branches += at(part)->branchNum;
+	lenvec.resize(startid + total_branches);
+
+	PhyloTree::saveBranchLengths(lenvec, startid);
+	startid += branchNum;
+	for (size_t part = 0; part < size(); part++) {
+		PhyloTree *part_tree = at(part);
+		part_tree->saveBranchLengths(lenvec, startid);
+		startid += part_tree->branchNum;
+	}
+}
+/**
+ * Restore the branch lengths of the super tree and of every partition tree from a
+ * vector laid out as saveBranchLengths fills it.
+ * @param lenvec vector holding the branch lengths
+ * @param startid index in lenvec at which the super tree's lengths start
+ * @param node not used by this override
+ * @param dad not used by this override
+ */
+void PhyloSuperTree::restoreBranchLengths(DoubleVector &lenvec, int startid, PhyloNode *node, PhyloNode *dad) {
+	PhyloTree::restoreBranchLengths(lenvec, startid);
+	startid += branchNum;
+	for (size_t part = 0; part < size(); part++) {
+		PhyloTree *part_tree = at(part);
+		part_tree->restoreBranchLengths(lenvec, startid);
+		startid += part_tree->branchNum;
+	}
+}
+
+/**
+    Allocate a node of the type used by super trees (SuperNode).
+    @param node_id id of the new node
+    @param node_name string name of the new node
+    @return the new node
+ */
+Node* PhyloSuperTree::newNode(int node_id, const char* node_name) {
+	SuperNode *created = new SuperNode(node_id, node_name);
+	return (Node*) created;
+}
+
+/**
+    Allocate a node of the type used by super trees (SuperNode).
+    @param node_id id of the new node
+    @param node_name integer name of the new node
+    @return the new node
+ */
+Node* PhyloSuperTree::newNode(int node_id, int node_name) {
+	SuperNode *created = new SuperNode(node_id, node_name);
+	return (Node*) created;
+}
+
+/**
+    @return the sum of getAlnNPattern() over all partition trees
+ */
+int PhyloSuperTree::getAlnNPattern() {
+	int total = 0;
+	for (size_t part = 0; part < size(); part++) {
+		total += at(part)->getAlnNPattern();
+	}
+	return total;
+}
+
+/**
+    @return the sum of getAlnNSite() over all partition trees
+ */
+int PhyloSuperTree::getAlnNSite() {
+	int total = 0;
+	for (size_t part = 0; part < size(); part++) {
+		total += at(part)->getAlnNSite();
+	}
+	return total;
+}
+
+/**
+    Compute the distance between two sequences. A zero initial_dist is first replaced by
+    the alignment-based distance (observed distance if params->compute_obs_dist is set).
+    That value is returned as is when it equals MAX_GENETIC_DIST or when no model factory
+    or site rate is available; var is not written in that case. Otherwise the distance is
+    optimized over all partitions with SuperAlignmentPairwise.
+    @param seq1 index of the first sequence
+    @param seq2 index of the second sequence
+    @param initial_dist starting distance, 0.0 to have it computed from the alignment
+    @param var (OUT) second argument of optimizeDist; untouched on the early return
+    @return the distance
+ */
+double PhyloSuperTree::computeDist(int seq1, int seq2, double initial_dist, double &var) {
+	if (initial_dist == 0.0) {
+		if (params->compute_obs_dist) {
+			initial_dist = aln->computeObsDist(seq1, seq2);
+		} else {
+			initial_dist = aln->computeDist(seq1, seq2);
+		}
+	}
+
+	// nothing to optimize: saturated distance, or no model / site rate specified
+	bool saturated = (initial_dist == MAX_GENETIC_DIST);
+	if (saturated || !model_factory || !site_rate) {
+		return initial_dist;
+	}
+
+	// now optimize the distance based on the model and site rate
+	SuperAlignmentPairwise pairwise(this, seq1, seq2);
+	return pairwise.optimizeDist(initial_dist, var);
+}
+
+/**
+    Set link_neighbors[part] of the two directions of one super tree branch, based on
+    the links already stored on the other branches incident to the node dad_nei->node.
+    Both links are left NULL when none of those branches is linked, or in the
+    "ping-pong" case handled below.
+    @param part partition index
+    @param nei neighbor whose ->node is the dad end of the branch
+    @param dad_nei neighbor whose ->node is the node end of the branch
+ */
+void PhyloSuperTree::linkBranch(int part, SuperNeighbor *nei, SuperNeighbor *dad_nei) {
+	SuperNode *node = (SuperNode*)dad_nei->node;
+	SuperNode *dad = (SuperNode*)nei->node;
+	nei->link_neighbors[part] = NULL;
+	dad_nei->link_neighbors[part] = NULL;
+	// part_vec: links of the linked branches leaving node;
+	// child_part_vec: links of the matching reverse branches
+	vector<PhyloNeighbor*> part_vec;
+	vector<PhyloNeighbor*> child_part_vec;
+
+	// presumably iterates over the neighbors of node other than dad - confirm against the macro
+	FOR_NEIGHBOR_DECLARE(node, dad, it) {
+		if (((SuperNeighbor*)*it)->link_neighbors[part]) {
+			part_vec.push_back(((SuperNeighbor*)*it)->link_neighbors[part]);
+			child_part_vec.push_back(((SuperNeighbor*)(*it)->node->findNeighbor(node))->link_neighbors[part]);
+			assert(child_part_vec.back()->node == child_part_vec.front()->node || child_part_vec.back()->id == child_part_vec.front()->id);
+		}
+	}
+
+	// no linked branch: both links stay NULL
+	if (part_vec.empty())
+		return;
+	// exactly one linked branch: this branch inherits its links
+	if (part_vec.size() == 1) {
+		nei->link_neighbors[part] = child_part_vec[0];
+		dad_nei->link_neighbors[part] = part_vec[0];
+		return;
+	}
+	if (part_vec[0] == child_part_vec[1]) {
+		// ping-pong, out of sub-tree
+		assert(part_vec[1] == child_part_vec[0]);
+		return;
+	}
+	// otherwise the partition tree node reached by the linked branches has exactly one
+	// neighbor that is not in part_vec (asserted below); that one is taken as dad
+	PhyloNode *node_part = (PhyloNode*) child_part_vec[0]->node;
+	PhyloNode *dad_part = NULL;
+	FOR_NEIGHBOR(node_part, NULL, it) {
+		bool appear = false;
+		for (vector<PhyloNeighbor*>::iterator it2 = part_vec.begin(); it2 != part_vec.end(); it2++){
+			if ((*it2) == (*it)) {
+				appear = true; break;
+			}
+		}
+		if (!appear) {
+			assert(!dad_part);
+			dad_part = (PhyloNode*)(*it)->node;
+		}
+	}
+	nei->link_neighbors[part] = (PhyloNeighbor*)node_part->findNeighbor(dad_part);
+	dad_nei->link_neighbors[part] = (PhyloNeighbor*)dad_part->findNeighbor(node_part);
+}
+
+/**
+    Recursively set link_neighbors[part] on the branches of the super tree so that they
+    point to neighbors of the partition tree. Leaf branches are linked directly through
+    part_taxa; internal branches are linked by linkBranch() after their subtrees.
+    @param part partition index
+    @param part_taxa indexed by super tree leaf id: the corresponding leaf of the
+        partition tree, or NULL if there is none
+    @param node current super tree node; NULL to start from the root
+    @param dad node from which the traversal arrived; NULL at the start
+ */
+void PhyloSuperTree::linkTree(int part, NodeVector &part_taxa, SuperNode *node, SuperNode *dad) {
+	if (!node) {
+		// start at the root, or at its only neighbor when the root itself is a leaf
+		if (!root->isLeaf())
+			node = (SuperNode*) root;
+		else
+			node = (SuperNode*)root->neighbors[0]->node;
+		assert(node);
+		if (node->isLeaf()) // two-taxa tree
+			dad = (SuperNode*)node->neighbors[0]->node;
+	}
+	SuperNeighbor *nei = NULL;
+	SuperNeighbor *dad_nei = NULL;
+	if (dad) {
+		nei = (SuperNeighbor*)node->findNeighbor(dad);
+		dad_nei = (SuperNeighbor*)dad->findNeighbor(node);
+		// allocate one link slot per partition on first use
+		if (nei->link_neighbors.empty()) nei->link_neighbors.resize(size());
+		if (dad_nei->link_neighbors.empty()) dad_nei->link_neighbors.resize(size());
+		nei->link_neighbors[part] = NULL;
+		dad_nei->link_neighbors[part] = NULL;
+	}
+	if (node->isLeaf()) {
+		assert(dad);
+		PhyloNode *node_part = (PhyloNode*)part_taxa[node->id];
+		// the links stay NULL when part_taxa has no entry for this leaf
+		if (node_part) {
+			PhyloNode *dad_part = (PhyloNode*)node_part->neighbors[0]->node;
+			assert(node_part->isLeaf());
+			nei->link_neighbors[part] = (PhyloNeighbor*) node_part->neighbors[0];
+			dad_nei->link_neighbors[part] = (PhyloNeighbor*)dad_part->findNeighbor(node_part);
+		}
+		return;
+	}
+
+	// link the subtrees first, then this branch from its children's links
+	FOR_NEIGHBOR_DECLARE(node, dad, it) {
+		linkTree(part, part_taxa, (SuperNode*) (*it)->node, (SuperNode*) node);
+	}
+	if (!dad) return;
+	linkBranch(part, nei, dad_nei);
+}
+
+/**
+ * Print, for every partition, the subtree followed by one line per supertree
+ * branch naming the subtree branch it is linked to (-1 where no link exists).
+ */
+void PhyloSuperTree::printMapInfo() {
+	NodeVector nodes1, nodes2;
+	getBranches(nodes1, nodes2);
+	int part = 0;
+	for (iterator tree_it = begin(); tree_it != end(); tree_it++, part++) {
+		cout << "Subtree for partition " << part << endl;
+		(*tree_it)->drawTree(cout, WT_BR_SCALE | WT_INT_NODE | WT_TAXON_ID | WT_NEWLINE);
+		for (int b = 0; b < nodes1.size(); b++) {
+			// both directions of the supertree branch and their links in this partition
+			SuperNeighbor *forward = (SuperNeighbor*)nodes1[b]->findNeighbor(nodes2[b]);
+			SuperNeighbor *backward = (SuperNeighbor*)nodes2[b]->findNeighbor(nodes1[b]);
+			PhyloNeighbor *nei1 = forward->link_neighbors[part];
+			PhyloNeighbor *nei2 = backward->link_neighbors[part];
+
+			// left-hand side: supertree branch id and its two end nodes
+			cout << nodes1[b]->findNeighbor(nodes2[b])->id << ":";
+			if (nodes1[b]->isLeaf()) {
+				cout << nodes1[b]->name;
+			} else {
+				cout << nodes1[b]->id;
+			}
+			cout << ",";
+			if (nodes2[b]->isLeaf()) {
+				cout << nodes2[b]->name;
+			} else {
+				cout << nodes2[b]->id;
+			}
+			cout << " -> ";
+
+			// right-hand side: linked subtree branch id and its two end nodes
+			if (!nei2) {
+				cout << -1;
+			} else {
+				cout << nei2->id << ":";
+				if (nei2->node->isLeaf()) {
+					cout << nei2->node->name;
+				} else {
+					cout << nei2->node->id;
+				}
+			}
+			cout << ",";
+			if (!nei1) {
+				cout << -1;
+			} else if (nei1->node->isLeaf()) {
+				cout << nei1->node->name;
+			} else {
+				cout << nei1->node->id;
+			}
+			cout << endl;
+		}
+	}
+}
+
+
+/**
+ * Rebuild every subtree as a copy of the supertree restricted to the taxa of
+ * its partition, then link supertree branches to the branches of each subtree.
+ */
+void PhyloSuperTree::mapTrees() {
+	assert(root);
+	if (verbose_mode >= VB_DEBUG)
+		drawTree(cout,  WT_BR_SCALE | WT_INT_NODE | WT_TAXON_ID | WT_NEWLINE | WT_BR_ID);
+
+	int part = 0;
+	for (iterator it = begin(); it != end(); it++, part++) {
+		PhyloTree *tree = *it;
+
+		// copy the supertree topology restricted to this partition's taxa
+		string taxa_set = aln->getPattern(part);
+		tree->copyTree(this, taxa_set);
+		if (tree->getModel())
+			tree->initializeAllPartialLh();
+		tree->resetCurScore();
+
+		// part_taxa[i]: leaf of the subtree for supertree taxon i, NULL if absent
+		NodeVector my_taxa, part_taxa;
+		tree->getOrderedTaxa(my_taxa);
+		part_taxa.resize(leafNum, NULL);
+		for (int taxon = 0; taxon < leafNum; taxon++) {
+			int id = ((SuperAlignment*)aln)->taxa_index[taxon][part];
+			if (id < 0)
+				continue;
+			part_taxa[taxon] = my_taxa[id];
+		}
+
+		if (verbose_mode >= VB_DEBUG) {
+			cout << "Subtree for partition " << part << endl;
+			tree->drawTree(cout,  WT_BR_SCALE | WT_INT_NODE | WT_TAXON_ID | WT_NEWLINE | WT_BR_ID);
+		}
+		linkTree(part, part_taxa);
+	}
+
+	if (verbose_mode >= VB_DEBUG)
+		printMapInfo();
+}
+
+/**
+ * Re-initialize every existing subtree (topology is kept, unlike mapTrees which
+ * copies it from the supertree) and rebuild all supertree-to-subtree branch links.
+ */
+void PhyloSuperTree::linkTrees() {
+	int part = 0;
+	for (iterator it = begin(); it != end(); it++, part++) {
+		PhyloTree *tree = *it;
+		tree->initializeTree();
+		tree->setAlignment(tree->aln);
+		if (tree->getModel())
+			tree->initializeAllPartialLh();
+		tree->resetCurScore();
+
+		// part_taxa[i]: leaf of the subtree for supertree taxon i, NULL if absent
+		NodeVector my_taxa, part_taxa;
+		tree->getOrderedTaxa(my_taxa);
+		part_taxa.resize(leafNum, NULL);
+		for (int taxon = 0; taxon < leafNum; taxon++) {
+			int id = ((SuperAlignment*)aln)->taxa_index[taxon][part];
+			if (id < 0)
+				continue;
+			part_taxa[taxon] = my_taxa[id];
+		}
+		linkTree(part, part_taxa);
+	}
+}
+
+/** Forward initializeAllPartialLh() to every subtree, in partition order. */
+void PhyloSuperTree::initializeAllPartialLh() {
+	iterator it = begin();
+	while (it != end()) {
+		(*it)->initializeAllPartialLh();
+		++it;
+	}
+}
+
+
+/** Forward deleteAllPartialLh() to every subtree, in partition order. */
+void PhyloSuperTree::deleteAllPartialLh() {
+	iterator it = begin();
+	while (it != end()) {
+		(*it)->deleteAllPartialLh();
+		++it;
+	}
+}
+
+/**
+ * Forward clearAllPartialLH() to every subtree, in partition order.
+ * @param make_null passed through unchanged to each subtree
+ */
+void PhyloSuperTree::clearAllPartialLH(bool make_null) {
+    iterator it = begin();
+    while (it != end()) {
+        (*it)->clearAllPartialLH(make_null);
+        ++it;
+    }
+}
+
+/**
+ * Parsimony score of the supertree, summed over all partitions.
+ * A partition whose subtree has a branch linked to the given supertree branch
+ * is scored on that linked branch; any other partition contributes its
+ * computeParsimony() score.
+ * @param dad_branch supertree branch (a SuperNeighbor) on which to compute
+ * @param dad node on the other side of dad_branch
+ * @param branch_subst (OUT) if not NULL, receives the sum over linked partitions
+ *        of the substitution counts reported for the linked branch
+ * @return summed parsimony score
+ */
+int PhyloSuperTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
+    int score = 0, part = 0;
+    SuperNeighbor *dad_nei = (SuperNeighbor*)dad_branch;
+    SuperNeighbor *node_nei = (SuperNeighbor*)(dad_branch->node->findNeighbor(dad));
+
+    // Reset the caller's counter. This must write through the pointer:
+    // assigning 0 to branch_subst itself would only null the local copy and
+    // silently disable every branch_subst code path below.
+    if (branch_subst)
+        *branch_subst = 0;
+    for (iterator it = begin(); it != end(); it++, part++) {
+        int this_subst = 0;
+        if (dad_nei->link_neighbors[part]) {
+            if (branch_subst)
+                score += (*it)->computeParsimonyBranch(dad_nei->link_neighbors[part], (PhyloNode*)node_nei->link_neighbors[part]->node, &this_subst);
+            else
+                score += (*it)->computeParsimonyBranch(dad_nei->link_neighbors[part], (PhyloNode*)node_nei->link_neighbors[part]->node);
+        } else
+            score += (*it)->computeParsimony();
+        // accumulate into the pointee (this_subst stays 0 for unlinked partitions)
+        if (branch_subst)
+            *branch_subst += this_subst;
+    }
+    return score;
+}
+
+/**
+ * Fill part_order and part_order_by_nptn (once; a non-empty part_order means
+ * the work is already done). With OpenMP the partition indices are arranged
+ * by quicksort() using negated cost keys: (#sequences x #patterns x #states)
+ * for part_order and (#patterns x #states) for part_order_by_nptn.
+ * Without OpenMP both are the identity order.
+ */
+void PhyloSuperTree::computePartitionOrder() {
+    if (!part_order.empty()) {
+        return;
+    }
+    int p, ntrees = size();
+    part_order.resize(ntrees);
+    part_order_by_nptn.resize(ntrees);
+#ifdef _OPENMP
+    int *id = new int[ntrees];
+    double *cost = new double[ntrees];
+
+    // order by the full cost key
+    for (p = 0; p < ntrees; p++) {
+        Alignment *part_aln = at(p)->aln;
+        id[p] = p;
+        cost[p] = -((double)part_aln->getNSeq())*part_aln->getNPattern()*part_aln->num_states;
+    }
+    quicksort(cost, 0, ntrees-1, id);
+    for (p = 0; p < ntrees; p++) {
+        part_order[p] = id[p];
+    }
+
+    // order by the pattern-only cost key
+    for (p = 0; p < ntrees; p++) {
+        Alignment *part_aln = at(p)->aln;
+        id[p] = p;
+        cost[p] = -((double)part_aln->getNPattern())*part_aln->num_states;
+    }
+    quicksort(cost, 0, ntrees-1, id);
+    for (p = 0; p < ntrees; p++) {
+        part_order_by_nptn[p] = id[p];
+    }
+
+    delete [] cost;
+    delete [] id;
+#else
+    // single-threaded build: keep the natural partition order
+    for (p = 0; p < ntrees; p++) {
+        part_order_by_nptn[p] = p;
+        part_order[p] = p;
+    }
+#endif // OPENMP
+}
+
+/**
+ * Log-likelihood of the supertree: the sum of the subtree log-likelihoods.
+ * Each partition's score is also stored in part_info[i].cur_score.
+ * @param pattern_lh (OUT) if not NULL, receives the pattern log-likelihoods of
+ *        all partitions back to back; this path runs sequentially
+ * @return summed log-likelihood
+ */
+double PhyloSuperTree::computeLikelihood(double *pattern_lh) {
+	double tree_lh = 0.0;
+	int ntrees = size();
+
+	if (!pattern_lh) {
+		// no pattern output requested: partitions are independent and may run in parallel
+		if (part_order.empty())
+			computePartitionOrder();
+		#ifdef _OPENMP
+		#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic)
+		#endif
+		for (int j = 0; j < ntrees; j++) {
+			int i = part_order[j];
+			double part_lh = at(i)->computeLikelihood();
+			part_info[i].cur_score = part_lh;
+			tree_lh += part_lh;
+		}
+		return tree_lh;
+	}
+
+	// each partition writes its block, then the write position advances past it
+	double *cursor = pattern_lh;
+	for (int i = 0; i < ntrees; i++) {
+		double part_lh = at(i)->computeLikelihood(cursor);
+		part_info[i].cur_score = part_lh;
+		tree_lh += part_lh;
+		cursor += at(i)->getAlnNPattern();
+	}
+	return tree_lh;
+}
+
+/**
+ * Collect the pattern log-likelihoods of all partitions back to back.
+ * @param pattern_lh (OUT) pattern log-likelihoods, one block per partition
+ * @param cur_logl if not NULL, the frequency-weighted sum of pattern_lh is
+ *        checked against this value (a mismatch above 0.001 is printed and asserted)
+ * @param ptn_lh_cat (OUT) if not NULL, per-category pattern likelihoods, one
+ *        block per partition
+ */
+void PhyloSuperTree::computePatternLikelihood(double *pattern_lh, double *cur_logl, double *ptn_lh_cat) {
+	int offset = 0, offset_lh_cat = 0;
+	for (iterator it = begin(); it != end(); it++) {
+		PhyloTree *tree = *it;
+		if (ptn_lh_cat)
+			tree->computePatternLikelihood(pattern_lh + offset, NULL, ptn_lh_cat + offset_lh_cat);
+		else
+			tree->computePatternLikelihood(pattern_lh + offset);
+		offset += tree->aln->getNPattern();
+
+		// category block size: patterns x rate categories, times the number of
+		// mixture classes for a mixture model whose rates are not fused
+		bool unfused_mixture = tree->getModel()->isMixture() && !tree->getModelFactory()->fused_mix_rate;
+		if (unfused_mixture)
+			offset_lh_cat += tree->aln->getNPattern() * tree->site_rate->getNDiscreteRate() * tree->model->getNMixtures();
+		else
+			offset_lh_cat += tree->aln->getNPattern() * tree->site_rate->getNDiscreteRate();
+	}
+
+	if (!cur_logl)
+		return;
+
+	// sanity check: pattern log-likelihoods weighted by pattern frequency must add up to cur_logl
+	double sum_logl = 0;
+	offset = 0;
+	for (iterator it = begin(); it != end(); it++) {
+		int nptn = (*it)->aln->getNPattern();
+		const double *part_lh = pattern_lh + offset;
+		for (int j = 0; j < nptn; j++)
+			sum_logl += part_lh[j] * (*it)->aln->at(j).frequency;
+		offset += (*it)->aln->getNPattern();
+	}
+	if (fabs(sum_logl - *cur_logl) > 0.001)
+		cout << *cur_logl << " " << sum_logl << endl;
+	assert(fabs(sum_logl - *cur_logl) < 0.001);
+}
+
+/**
+ * Optimize the branch lengths of every subtree and return the summed
+ * log-likelihood; each partition's score is stored in part_info[i].cur_score.
+ * When my_iterations >= 100 the supertree branch lengths are recomputed
+ * from the subtrees afterwards.
+ * @param my_iterations passed to each subtree's optimizeAllBranches
+ * @param tolerance overall tolerance; each subtree gets tolerance/min(ntrees,10)
+ * @param maxNRStep passed to each subtree's optimizeAllBranches
+ * @return summed log-likelihood over all partitions
+ */
+double PhyloSuperTree::optimizeAllBranches(int my_iterations, double tolerance, int maxNRStep) {
+	double tree_lh = 0.0;
+	int ntrees = size();
+	if (part_order.empty())
+		computePartitionOrder();
+	// same tolerance share for every partition
+	double part_tolerance = tolerance/min(ntrees,10);
+	#ifdef _OPENMP
+	#pragma omp parallel for reduction(+: tree_lh) schedule(dynamic)
+	#endif
+	for (int j = 0; j < ntrees; j++) {
+		int i = part_order[j];
+		double part_lh = at(i)->optimizeAllBranches(my_iterations, part_tolerance, maxNRStep);
+		part_info[i].cur_score = part_lh;
+		tree_lh += part_lh;
+		if (verbose_mode >= VB_MAX) {
+			at(i)->printTree(cout, WT_BR_LEN + WT_NEWLINE);
+		}
+	}
+
+	if (my_iterations >= 100) {
+		computeBranchLengths();
+	}
+	return tree_lh;
+}
+
+/**
+ * Release the per-partition pattern-likelihood buffers and delete all
+ * subtrees, both in reverse partition order.
+ */
+PhyloSuperTree::~PhyloSuperTree()
+{
+	vector<PartitionInfo>::reverse_iterator pit = part_info.rbegin();
+	while (pit != part_info.rend()) {
+		PartitionInfo &info = *pit;
+		// delete [] on a NULL pointer is a no-op, so no guards are needed
+		delete [] info.nniMoves[1].ptnlh;
+		info.nniMoves[1].ptnlh = NULL;
+		delete [] info.nniMoves[0].ptnlh;
+		info.nniMoves[0].ptnlh = NULL;
+		delete [] info.cur_ptnlh;
+		info.cur_ptnlh = NULL;
+		++pit;
+	}
+	part_info.clear();
+
+	reverse_iterator tree_it = rbegin();
+	while (tree_it != rend()) {
+		delete (*tree_it);
+		++tree_it;
+	}
+	clear();
+}
+
+
+/**
+ * Reset the bookkeeping of every partition: zero the score, size the branch
+ * length buffers (5 entries per branch for the NNI buffers when params->nni5
+ * is set, 1 otherwise), snapshot the current subtree branch lengths, and
+ * allocate the pattern-likelihood buffers if they are needed and still missing.
+ */
+void PhyloSuperTree::initPartitionInfo() {
+	int part = 0;
+	for (iterator it = begin(); it != end(); it++, part++) {
+		PartitionInfo &info = part_info[part];
+		info.cur_score = 0.0;
+
+		info.cur_brlen.resize((*it)->branchNum, 0.0);
+		if (!params->nni5) {
+			info.nni1_brlen.resize((*it)->branchNum, 0.0);
+			info.nni2_brlen.resize((*it)->branchNum, 0.0);
+		} else {
+			info.nni1_brlen.resize((*it)->branchNum * 5, 0.0);
+			info.nni2_brlen.resize((*it)->branchNum * 5, 0.0);
+		}
+
+		(*it)->getBranchLengths(info.cur_brlen);
+
+		if (!(save_all_trees == 2 || params->write_intermediate_trees >= 2))
+			continue;
+
+		// initialize ptnlh for ultrafast bootstrap; existing buffers are kept
+		int nptn = (*it)->getAlnNPattern();
+		if (info.cur_ptnlh == NULL)
+			info.cur_ptnlh = new double[nptn];
+		if (info.nniMoves[0].ptnlh == NULL)
+			info.nniMoves[0].ptnlh = new double [nptn];
+		if (info.nniMoves[1].ptnlh == NULL)
+			info.nniMoves[1].ptnlh = new double [nptn];
+	}
+}
+
+/** @return length of the longest partition name in part_info (0 if there is none) */
+int PhyloSuperTree::getMaxPartNameLength() {
+	int longest = 0;
+	for (size_t p = 0; p < part_info.size(); p++) {
+		int len = (int)part_info[p].name.length();
+		if (longest < len)
+			longest = len;
+	}
+	return longest;
+}
+
+/**
+ * Evaluate the two NNI candidates around the supertree branch (node1, node2).
+ * Each partition scores both candidates on its linked subtree branch through
+ * the subtree's own getBestNNIForBran(); the per-partition log-likelihoods are
+ * summed into two totals and the candidate with the larger total is returned
+ * (on a tie the second candidate wins).
+ * If save_all_trees == 2 or nniMoves is given, each candidate is additionally
+ * applied to the supertree temporarily so that pattern likelihoods can be
+ * collected and/or the tree saved; the topology is swapped back afterwards.
+ * @param node1 one end of the branch
+ * @param node2 the other end of the branch
+ * @param nniMoves (OUT) if not NULL, entries [nnino] receive newloglh and ptnlh
+ * @return the better move, with swap_id 1 or 2 and newloglh set to its total
+ */
+NNIMove PhyloSuperTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove *nniMoves) {
+    NNIMove myMove;
+    //myMove.newloglh = 0;
+	SuperNeighbor *nei1 = ((SuperNeighbor*)node1->findNeighbor(node2));
+	SuperNeighbor *nei2 = ((SuperNeighbor*)node2->findNeighbor(node1));
+	assert(nei1 && nei2);
+	SuperNeighbor *node1_nei = NULL;
+	SuperNeighbor *node2_nei = NULL;
+	SuperNeighbor *node2_nei_other = NULL;
+	// Each loop stops at its first hit, so node1_it / node2_it stay positioned on
+	// that neighbor (presumably the macros skip the second argument, i.e. the
+	// node across the branch -- macro definition not visible here).
+	FOR_NEIGHBOR_DECLARE(node1, node2, node1_it) {
+		node1_nei = (SuperNeighbor*)(*node1_it);
+		break;
+	}
+	FOR_NEIGHBOR_DECLARE(node2, node1, node2_it) {
+		node2_nei = (SuperNeighbor*)(*node2_it);
+		break;
+	}
+
+	// the first neighbor of node2 visited by the macro that differs from node2_nei
+	FOR_NEIGHBOR_IT(node2, node1, node2_it_other)
+	if ((*node2_it_other) != node2_nei) {
+		node2_nei_other = (SuperNeighbor*)(*node2_it_other);
+		break;
+	}
+
+	//double bestScore = optimizeOneBranch(node1, node2, false);
+
+	int ntrees = size(), part;
+	double nni_score1 = 0.0, nni_score2 = 0.0;
+	int local_totalNNIs = 0, local_evalNNIs = 0;
+
+    if (part_order.empty()) computePartitionOrder();
+	#ifdef _OPENMP
+	#pragma omp parallel for reduction(+: nni_score1, nni_score2, local_totalNNIs, local_evalNNIs) private(part) schedule(dynamic)
+	#endif
+	for (int treeid = 0; treeid < ntrees; treeid++) {
+        part = part_order_by_nptn[treeid];
+		// is_nni: every branch around node1 and node2 has a linked branch in this partition
+		bool is_nni = true;
+		local_totalNNIs++;
+		FOR_NEIGHBOR_DECLARE(node1, NULL, nit) {
+			if (! ((SuperNeighbor*)*nit)->link_neighbors[part]) { is_nni = false; break; }
+		}
+		FOR_NEIGHBOR(node2, NULL, nit) {
+			if (! ((SuperNeighbor*)*nit)->link_neighbors[part]) { is_nni = false; break; }
+		}
+		// Partition not affected by this NNI: reuse its current score for both
+		// candidates (a stored score of exactly 0.0 is treated as "not computed yet").
+		// NOTE(review): when is_nni is false and terrace_aware is off, execution
+		// falls through and the code below dereferences link_neighbors[part]
+		// entries of which at least one is NULL -- confirm this cannot happen.
+		if (!is_nni && params->terrace_aware) {
+			if (part_info[part].cur_score == 0.0)  {
+				part_info[part].cur_score = at(part)->computeLikelihood();
+				if (save_all_trees == 2 || nniMoves)
+					at(part)->computePatternLikelihood(part_info[part].cur_ptnlh, &part_info[part].cur_score);
+			}
+			nni_score1 += part_info[part].cur_score;
+			nni_score2 += part_info[part].cur_score;
+			continue;
+		}
+
+		local_evalNNIs++;
+		part_info[part].evalNNIs++;
+
+		PhyloNeighbor *nei1_part = nei1->link_neighbors[part];
+		PhyloNeighbor *nei2_part = nei2->link_neighbors[part];
+
+		// id of the linked subtree branch; indexes the nni1_brlen/nni2_brlen buffers below
+		int brid = nei1_part->id;
+
+		//NNIMove part_moves[2];
+		//part_moves[0].node1Nei_it = NULL;
+
+		// setup subtree NNI correspondingly
+		PhyloNode *node1_part = (PhyloNode*)nei2_part->node;
+		PhyloNode *node2_part = (PhyloNode*)nei1_part->node;
+		part_info[part].nniMoves[0].node1 = part_info[part].nniMoves[1].node1 = node1;
+		part_info[part].nniMoves[0].node2 = part_info[part].nniMoves[1].node2 = node2;
+		part_info[part].nniMoves[0].node1Nei_it = node1_part->findNeighborIt(node1_nei->link_neighbors[part]->node);
+		part_info[part].nniMoves[0].node2Nei_it = node2_part->findNeighborIt(node2_nei->link_neighbors[part]->node);
+
+		part_info[part].nniMoves[1].node1Nei_it = node1_part->findNeighborIt(node1_nei->link_neighbors[part]->node);
+		part_info[part].nniMoves[1].node2Nei_it = node2_part->findNeighborIt(node2_nei_other->link_neighbors[part]->node);
+
+		at(part)->getBestNNIForBran((PhyloNode*)nei2_part->node, (PhyloNode*)nei1_part->node, part_info[part].nniMoves);
+		// detect the corresponding NNIs and swap if necessary (the swapping refers to the swapping of NNI order)
+		// NOTE(review): outError("WRONG") runs before the swap; if outError does not
+		// return (not visible here), the three swap statements are dead code.
+		if (!((*part_info[part].nniMoves[0].node1Nei_it == node1_nei->link_neighbors[part] &&
+				*part_info[part].nniMoves[0].node2Nei_it == node2_nei->link_neighbors[part]) ||
+			(*part_info[part].nniMoves[0].node1Nei_it != node1_nei->link_neighbors[part] &&
+					*part_info[part].nniMoves[0].node2Nei_it != node2_nei->link_neighbors[part])))
+		{
+			outError("WRONG");
+			NNIMove tmp = part_info[part].nniMoves[0];
+			part_info[part].nniMoves[0] = part_info[part].nniMoves[1];
+			part_info[part].nniMoves[1] = tmp;
+		}
+		nni_score1 += part_info[part].nniMoves[0].newloglh;
+		nni_score2 += part_info[part].nniMoves[1].newloglh;
+		// remember the optimized branch lengths of both candidates (1 or 5 values per branch)
+		int numlen = 1;
+		if (params->nni5) numlen = 5;
+		for (int i = 0; i < numlen; i++) {
+			part_info[part].nni1_brlen[brid*numlen + i] = part_info[part].nniMoves[0].newLen[i];
+			part_info[part].nni2_brlen[brid*numlen + i] = part_info[part].nniMoves[1].newLen[i];
+		}
+
+	}
+	// fold the loop-local counters into the members after the (possibly parallel) loop
+	totalNNIs += local_totalNNIs;
+	evalNNIs += local_evalNNIs;
+	double nni_scores[2] = {nni_score1, nni_score2};
+
+	myMove.node1Nei_it = node1->findNeighborIt(node1_nei->node);
+	myMove.node1 = node1;
+	myMove.node2 = node2;
+	// pick the candidate with the larger total; equal totals select swap 2
+	if (nni_scores[0] > nni_scores[1]) {
+		myMove.swap_id = 1;
+		myMove.node2Nei_it = node2->findNeighborIt(node2_nei->node);
+		myMove.newloglh = nni_scores[0];
+	} else  {
+		myMove.swap_id = 2;
+		myMove.node2Nei_it = node2->findNeighborIt(node2_nei_other->node);
+		myMove.newloglh = nni_scores[1];
+	}
+
+	if (save_all_trees != 2 && !nniMoves) return myMove;
+
+	// for bootstrap now
+    //now setup pattern likelihoods per partition
+	double *save_lh_factor = new double [ntrees];
+	double *save_lh_factor_back = new double [ntrees];
+	// nnino counts the candidates in the order the macro visits node2's neighbors;
+	// it indexes nni_scores[], nniMoves[] and part_info[].nniMoves[]
+	// (presumably exactly two iterations -- confirm against the macro/tree shape).
+	int nnino = 0;
+	FOR_NEIGHBOR(node2, node1, node2_it) {
+
+		// do the NNI
+		node2_nei = (SuperNeighbor*)(*node2_it);
+        node1->updateNeighbor(node1_it, node2_nei);
+        node2_nei->node->updateNeighbor(node2, node1);
+        node2->updateNeighbor(node2_it, node1_nei);
+        node1_nei->node->updateNeighbor(node1, node2);
+
+        for (part = 0; part < ntrees; part++) {
+			bool is_nni = true;
+			FOR_NEIGHBOR_DECLARE(node1, NULL, nit) {
+				if (! ((SuperNeighbor*)*nit)->link_neighbors[part]) { is_nni = false; break; }
+			}
+			FOR_NEIGHBOR(node2, NULL, nit) {
+				if (! ((SuperNeighbor*)*nit)->link_neighbors[part]) { is_nni = false; break; }
+			}
+			// load the subtree's _pattern_lh with the current values (partition not
+			// affected) or with the values stored for this NNI candidate
+			if (!is_nni)
+				memcpy(at(part)->_pattern_lh, part_info[part].cur_ptnlh, at(part)->getAlnNPattern() * sizeof(double));
+			else
+				memcpy(at(part)->_pattern_lh, part_info[part].nniMoves[nnino].ptnlh, at(part)->getAlnNPattern() * sizeof(double));
+			// temporarily zero both scale factors; the saved values are restored below
+    		save_lh_factor[part] = at(part)->current_it->lh_scale_factor;
+    		save_lh_factor_back[part] = at(part)->current_it_back->lh_scale_factor;
+    		at(part)->current_it->lh_scale_factor = 0.0;
+    		at(part)->current_it_back->lh_scale_factor = 0.0;
+        }
+        if (nniMoves) {
+        	nniMoves[nnino].newloglh = nni_scores[nnino];
+       		computePatternLikelihood(nniMoves[nnino].ptnlh, &nni_scores[nnino]);
+        }
+        if (save_all_trees == 2)
+        	saveCurrentTree(nni_scores[nnino]);
+
+        // restore information
+        for (part = 0; part < ntrees; part++) {
+    		at(part)->current_it->lh_scale_factor = save_lh_factor[part];
+    		at(part)->current_it_back->lh_scale_factor = save_lh_factor_back[part];
+        }
+
+        // swap back to recover the tree
+        node1->updateNeighbor(node1_it, node1_nei);
+        node1_nei->node->updateNeighbor(node2, node1);
+        node2->updateNeighbor(node2_it, node2_nei);
+        node2_nei->node->updateNeighbor(node1, node2);
+        nnino++;
+
+	}
+
+	delete [] save_lh_factor_back;
+	delete [] save_lh_factor;
+	return myMove;
+}
+
+/**
+ * Apply an NNI to the supertree and bring every subtree in line with it:
+ * a partition in which all branches around the two nodes are linked performs
+ * the corresponding NNI on its subtree, any other partition only has the
+ * central branch relinked.
+ * @param move the NNI to apply on the supertree
+ * @param clearLH passed through to PhyloTree::doNNI and to each subtree's doNNI
+ */
+void PhyloSuperTree::doNNI(NNIMove &move, bool clearLH) {
+	// all four supertree neighbors are read before the supertree is changed
+	SuperNeighbor *nei1 = (SuperNeighbor*)move.node1->findNeighbor(move.node2);
+	SuperNeighbor *nei2 = (SuperNeighbor*)move.node2->findNeighbor(move.node1);
+	SuperNeighbor *node1_nei = (SuperNeighbor*)*move.node1Nei_it;
+	SuperNeighbor *node2_nei = (SuperNeighbor*)*move.node2Nei_it;
+	PhyloTree::doNNI(move, clearLH);
+
+	int part = 0;
+	for (iterator it = begin(); it != end(); it++, part++) {
+		// does every branch around node1 and node2 exist in this partition?
+		bool is_nni = true;
+		FOR_NEIGHBOR_DECLARE(move.node1, NULL, nit) {
+			if (! ((SuperNeighbor*)*nit)->link_neighbors[part]) { is_nni = false; break; }
+		}
+		FOR_NEIGHBOR(move.node2, NULL, nit) {
+			if (! ((SuperNeighbor*)*nit)->link_neighbors[part]) { is_nni = false; break; }
+		}
+
+		if (is_nni) {
+			// translate the supertree move into the subtree's own nodes and iterators
+			PhyloNeighbor *nei1_part = nei1->link_neighbors[part];
+			PhyloNeighbor *nei2_part = nei2->link_neighbors[part];
+			NNIMove part_move;
+			part_move.node1 = (PhyloNode*)nei2_part->node;
+			part_move.node2 = (PhyloNode*)nei1_part->node;
+			part_move.node1Nei_it = part_move.node1->findNeighborIt(node1_nei->link_neighbors[part]->node);
+			part_move.node2Nei_it = part_move.node2->findNeighborIt(node2_nei->link_neighbors[part]->node);
+			(*it)->doNNI(part_move, clearLH);
+		} else {
+			// relink the branch if it does not correspond to NNI for partition
+			linkBranch(part, nei1, nei2);
+		}
+	}
+}
+
+/**
+ * Apply, in every partition affected by the NNI, the branch lengths that were
+ * stored for it: nni1_brlen when move.swap_id == 1, nni2_brlen otherwise
+ * (1 value per branch, or 5 when params->nni5 is set).
+ * Partitions where some branch around the two nodes has no link are skipped.
+ * @param move the NNI on the supertree
+ */
+void PhyloSuperTree::changeNNIBrans(NNIMove move) {
+	SuperNeighbor *nei1 = (SuperNeighbor*)move.node1->findNeighbor(move.node2);
+	SuperNeighbor *nei2 = (SuperNeighbor*)move.node2->findNeighbor(move.node1);
+
+	int part = 0;
+	for (iterator it = begin(); it != end(); it++, part++) {
+		bool is_nni = true;
+		FOR_NEIGHBOR_DECLARE(move.node1, NULL, nit) {
+			if (! ((SuperNeighbor*)*nit)->link_neighbors[part]) { is_nni = false; break; }
+		}
+		FOR_NEIGHBOR(move.node2, NULL, nit) {
+			if (! ((SuperNeighbor*)*nit)->link_neighbors[part]) { is_nni = false; break; }
+		}
+		if (is_nni) {
+			PhyloNeighbor *nei1_part = nei1->link_neighbors[part];
+			PhyloNeighbor *nei2_part = nei2->link_neighbors[part];
+			NNIMove part_move;
+			// the id of the linked subtree branch selects the slot in the saved buffers
+			int brid = nei1_part->id;
+			part_move.node1 = (PhyloNode*)nei2_part->node;
+			part_move.node2 = (PhyloNode*)nei1_part->node;
+			int numlen = params->nni5 ? 5 : 1;
+			for (int i = 0; i < numlen; i++) {
+				if (move.swap_id == 1)
+					part_move.newLen[i] = part_info[part].nni1_brlen[brid*numlen + i];
+				else
+					part_move.newLen[i] = part_info[part].nni2_brlen[brid*numlen + i];
+			}
+			(*it)->changeNNIBrans(part_move);
+		}
+	}
+}
+
+/**
+ * Reset the branch lengths of every subtree to the values saved in
+ * part_info[part].cur_brlen. Both parameters are unused here.
+ */
+void PhyloSuperTree::restoreAllBrans(PhyloNode *node, PhyloNode *dad) {
+	iterator it = begin();
+	for (int part = 0; it != end(); ++part, ++it)
+		(*it)->setBranchLengths(part_info[part].cur_brlen);
+}
+
+/**
+ * Reinsert leaves through IQTree::reinsertLeaves, then rebuild all subtrees
+ * and their branch links with mapTrees().
+ * @param del_leaves passed unchanged to IQTree::reinsertLeaves
+ */
+void PhyloSuperTree::reinsertLeaves(PhyloNodeVector &del_leaves) {
+	IQTree::reinsertLeaves(del_leaves);
+	mapTrees();
+}
+
+/**
+ * Set every supertree branch length to a weighted average over the partitions
+ * that have a subtree branch linked to it.
+ * A partition contributes (linked length x #sites / k), where k is the number
+ * of supertree branches linked to that same subtree branch, and adds #sites to
+ * the divisor (3 x #sites for a codon partition when rescale_codon_brlen is set).
+ * Branches without any link keep length 0.
+ */
+void PhyloSuperTree::computeBranchLengths() {
+	if (verbose_mode >= VB_DEBUG)
+		cout << "Assigning branch lengths for full tree with weighted average..." << endl;
+
+	NodeVector nodes1, nodes2;
+	getBranches(nodes1, nodes2);
+
+	// both directions of every branch; the forward length serves as accumulator
+	vector<SuperNeighbor*> neighbors1;
+	vector<SuperNeighbor*> neighbors2;
+	IntVector occurence;
+	occurence.resize(nodes1.size(), 0);
+	int b;
+	for (b = 0; b < nodes1.size(); b++) {
+		neighbors1.push_back((SuperNeighbor*)nodes1[b]->findNeighbor(nodes2[b]) );
+		neighbors2.push_back((SuperNeighbor*)nodes2[b]->findNeighbor(nodes1[b]) );
+		neighbors1.back()->length = 0.0;
+	}
+
+	int part = 0;
+	for (iterator it = begin(); it != end(); it++, part++) {
+		// brfreq[id]: number of supertree branches linked to subtree branch id
+		IntVector brfreq;
+		brfreq.resize((*it)->branchNum, 0);
+		for (b = 0; b < nodes1.size(); b++) {
+			PhyloNeighbor *linked = neighbors1[b]->link_neighbors[part];
+			if (linked)
+				brfreq[linked->id]++;
+		}
+
+		for (b = 0; b < nodes1.size(); b++) {
+			PhyloNeighbor *linked = neighbors1[b]->link_neighbors[part];
+			if (!linked)
+				continue;
+			neighbors1[b]->length += (linked->length) * (*it)->aln->getNSite() / brfreq[linked->id];
+			// rescale branch length by 3: a codon partition counts three times in the divisor
+			if ((*it)->aln->seq_type == SEQ_CODON && rescale_codon_brlen)
+				occurence[b] += (*it)->aln->getNSite()*3;
+			else
+				occurence[b] += (*it)->aln->getNSite();
+		}
+	}
+
+	// normalize and mirror the result onto the reverse direction
+	for (b = 0; b < nodes1.size(); b++) {
+		if (occurence[b])
+			neighbors1[b]->length /= occurence[b];
+		neighbors2[b]->length = neighbors1[b]->length;
+	}
+}
+
+/** @return the fixed label "Partition model" */
+string PhyloSuperTree::getModelName() {
+	return (string)"Partition model";
+}
+
+/**
+ * Create a new tree copied from the supertree, restricted to the union of the
+ * taxon patterns of the given partitions (a position is set to 1 in the union
+ * when any of the patterns has 1 there).
+ * @param ids partition indices; an index outside [0, size()) is reported via outError
+ * @return newly allocated PhyloTree; the caller owns it
+ */
+PhyloTree *PhyloSuperTree::extractSubtree(IntVector &ids) {
+	string union_taxa;
+	for (int i = 0; i < ids.size(); i++) {
+		int id = ids[i];
+		if (id < 0 || id >= size())
+			outError("Internal error ", __func__);
+		string taxa_set = aln->getPattern(id);
+		if (i == 0) {
+			// the first partition seeds the union
+			union_taxa = taxa_set;
+			continue;
+		}
+		for (int j = 0; j < union_taxa.length(); j++) {
+			if (taxa_set[j] != 1)
+				continue;
+			union_taxa[j] = 1;
+		}
+	}
+	PhyloTree *tree = new PhyloTree;
+	tree->copyTree(this, union_taxa);
+	return tree;
+}
+
+/**
+ * @param ncategory passed through to each subtree
+ * @return sum of getMemoryRequired(ncategory) over all subtrees; the supertree
+ *         itself does not need any memory for likelihood vectors
+ */
+uint64_t PhyloSuperTree::getMemoryRequired(size_t ncategory) {
+	uint64_t mem_size = 0;
+	iterator it = begin();
+	while (it != end()) {
+		mem_size += (*it)->getMemoryRequired(ncategory);
+		++it;
+	}
+	return mem_size;
+}
+
+/**
+ * Count the supertree branches below `node` that have no linked branch in any
+ * partition (all link_neighbors entries NULL, or none at all).
+ * @param node current node; NULL starts at the root
+ * @param dad node the traversal arrived from
+ * @return number of such branches in the traversed part of the tree
+ */
+int PhyloSuperTree::countEmptyBranches(PhyloNode *node, PhyloNode *dad) {
+    if (!node)
+        node = (PhyloNode*)root;
+
+    int count = 0;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        SuperNeighbor *nei = (SuperNeighbor*)(*it);
+        // advance to the first partition that has a link on this branch
+        PhyloNeighborVec::iterator nit = nei->link_neighbors.begin();
+        while (nit != nei->link_neighbors.end() && !(*nit))
+            nit++;
+        if (nit == nei->link_neighbors.end())
+            count++;
+        count += countEmptyBranches((PhyloNode*)(*it)->node, node);
+    }
+    return count;
+}
+
+/** remove identical sequences from the tree */
+/**
+ * Remove identical sequences through IQTree::removeIdenticalSeqs and, if any
+ * were removed, point every subtree at the matching partition of the
+ * (reduced) super alignment.
+ */
+void PhyloSuperTree::removeIdenticalSeqs(Params &params) {
+	IQTree::removeIdenticalSeqs(params);
+	if (removed_seqs.empty())
+		return;
+
+	// now synchronize aln
+	SuperAlignment *super_aln = (SuperAlignment*)aln;
+	int part = 0;
+	for (iterator it = begin(); it != end(); it++, part++) {
+		if (verbose_mode >= VB_MED) {
+			// printed before the switch: new partition size vs. the subtree's old alignment
+			cout << "Partition " << part_info[part].name << " " << super_aln->partitions[part]->getNSeq() <<
+					" sequences from " << (*it)->aln->getNSeq() << " extracted" << endl;
+		}
+		(*it)->aln = super_aln->partitions[part];
+	}
+
+	if (verbose_mode >= VB_MED) {
+		cout << "Reduced alignment has " << aln->getNSeq() << " sequences with " << getAlnNSite() << " sites and "
+				<< getAlnNPattern() << " patterns" << endl;
+	}
+}
+
+/** reinsert identical sequences into the tree and reset original alignment */
+/**
+ * Reinsert previously removed identical sequences (no-op if none were removed):
+ * delegate to IQTree::reinsertIdenticalSeqs, point every subtree at the matching
+ * partition of the super alignment, then rebuild subtrees and links via mapTrees().
+ * @param orig_aln passed unchanged to IQTree::reinsertIdenticalSeqs
+ */
+void PhyloSuperTree::reinsertIdenticalSeqs(Alignment *orig_aln) {
+	if (removed_seqs.empty())
+		return;
+	IQTree::reinsertIdenticalSeqs(orig_aln);
+
+	// now synchronize aln
+	iterator it = begin();
+	for (int part = 0; it != end(); ++it, ++part) {
+		(*it)->aln = ((SuperAlignment*)aln)->partitions[part];
+	}
+	mapTrees();
+}
+
+/**
+ * Rebuild the subtrees with mapTrees(), let every subtree fix its own branches,
+ * then recompute the supertree branch lengths from the subtrees.
+ * @param force passed through to each subtree's fixNegativeBranch
+ * @param node unused here
+ * @param dad unused here
+ * @return sum of the values returned by the subtrees
+ */
+int PhyloSuperTree::fixNegativeBranch(bool force, Node *node, Node *dad) {
+	mapTrees();
+
+	int fixed = 0;
+	iterator it = begin();
+	while (it != end()) {
+		PhyloTree *tree = *it;
+		tree->initializeAllPartialPars();
+		tree->clearAllPartialLH();
+		fixed += tree->fixNegativeBranch(force);
+		tree->clearAllPartialLH();
+		++it;
+	}
+
+	computeBranchLengths();
+	return fixed;
+}
diff --git a/phylosupertree.h b/phylosupertree.h
new file mode 100644
index 0000000..dd3ce97
--- /dev/null
+++ b/phylosupertree.h
@@ -0,0 +1,322 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef PHYLOSUPERTREE_H
+#define PHYLOSUPERTREE_H
+
+#include "iqtree.h"
+#include "supernode.h"
+#include "superalignment.h"
+
+
+/**
+Phylogenetic tree for partition model (multi-gene alignment)
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class PhyloSuperTree : public IQTree, public vector<PhyloTree* >
+{
+public:
+	/**
+		constructor
+	*/
+    PhyloSuperTree();
+
+	/**
+		constructor
+	*/
+    PhyloSuperTree(SuperAlignment *alignment, PhyloSuperTree *super_tree);
+
+	/**
+		constructor
+	*/
+    PhyloSuperTree(Params &params);
+
+
+    ~PhyloSuperTree();
+
+    /** read partition model file */
+    void readPartition(Params &params);
+
+    /** read RAxML-style partition file */
+    void readPartitionRaxml(Params &params);
+
+    /** read partition model file in NEXUS format into variable info */
+    void readPartitionNexus(Params &params);
+
+    void printPartition(const char *filename);
+
+    void printPartitionRaxml(const char *filename);
+
+    void printBestPartition(const char *filename);
+    void printBestPartitionRaxml(const char *filename);
+
+    /** remove identical sequences from the tree */
+    virtual void removeIdenticalSeqs(Params &params);
+
+    /** reinsert identical sequences into the tree and reset original alignment */
+    virtual void reinsertIdenticalSeqs(Alignment *orig_aln);
+
+	virtual void setParams(Params* params);
+
+	/**
+	 * setup all necessary parameters  (declared as virtual needed for phylosupertree)
+	 */
+	virtual void initSettings(Params& params);
+
+    virtual void setLikelihoodKernel(LikelihoodKernel lk);
+
+    virtual void changeLikelihoodKernel(LikelihoodKernel lk);
+
+	/** @return always true for this class */
+	virtual bool isSuperTree() { return true; }
+
+    /**
+     * Return the tree string containing taxon names and branch lengths
+     * @return
+     */
+    virtual string getTreeString();
+
+    /**
+            Read the tree saved with Taxon Names and branch lengths.
+            @param tree_string tree string to read from
+            @param updatePLL if true, tree is read into PLL
+     */
+    virtual void readTreeString(const string &tree_string);
+
+    /**
+     * save branch lengths into a vector
+     */
+    virtual void saveBranchLengths(DoubleVector &lenvec, int startid = 0, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+    /**
+     * restore branch lengths from a vector previously called with saveBranchLengths
+     */
+    virtual void restoreBranchLengths(DoubleVector &lenvec, int startid = 0, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            allocate a new node. Override this if you have an inherited Node class.
+            @param node_id node ID
+            @param node_name node name
+            @return a new node
+     */
+    virtual Node* newNode(int node_id = -1, const char* node_name = NULL);
+
+    /**
+            allocate a new node. Override this if you have an inherited Node class.
+            @param node_id node ID
+            @param node_name node name issued by an integer
+            @return a new node
+     */
+    virtual Node* newNode(int node_id, int node_name);
+
+	/**
+	 *		@return number of alignment patterns
+	*/
+	virtual int getAlnNPattern();
+
+	/**
+	 *		@return number of alignment sites
+	*/
+	virtual int getAlnNSite();
+
+    /**
+            compute the distance between 2 sequences.
+            @param seq1 index of sequence 1
+            @param seq2 index of sequence 2
+            @param initial_dist initial distance
+            @return distance between seq1 and seq2
+     */
+    virtual double computeDist(int seq1, int seq2, double initial_dist, double &var);
+
+	/**
+		create sub-trees T|Y_1,...,T|Y_k of the current super-tree T
+		and map F={f_1,...,f_k} the edges of supertree T to edges of subtrees T|Y_i
+	*/
+	virtual void mapTrees();
+
+	/*
+	 * create one map f_i from supertree T to subtree indexed by part (called by mapTrees)
+	 * @param part index of subtree
+	 * @param part_taxa vector of taxa of T that are present in subtree
+	 * @param node the current node of the post-order tree traversal
+	 * @param dad the dad of that node used to direct the traversal
+	 */
+	void linkTree(int part, NodeVector &part_taxa, SuperNode *node = NULL, SuperNode *dad = NULL);
+
+	/**
+	 * Given current supertree T and subtrees T|Y_1,...,T|Y_k, build all maps f_1,...,f_k
+	 */
+	virtual void linkTrees();
+
+	/**
+	 * link a branch from supertree to subtree (called by linkTree)
+	 * @param part index of subtree
+	 * @param nei pointer to branch
+	 * @param dad_nei pointer to reverse branch
+	 */
+	void linkBranch(int part, SuperNeighbor *nei, SuperNeighbor *dad_nei);
+
+
+    /**
+            initialize partial_lh vector of all PhyloNeighbors, allocating central_partial_lh
+     */
+    virtual void initializeAllPartialLh();
+
+    /**
+            de-allocate central_partial_lh
+     */
+    virtual void deleteAllPartialLh();
+
+    /**
+     NEWLY ADDED (2014-12-04): clear all partial likelihood for a clean computation again
+     */
+    virtual void clearAllPartialLH(bool make_null = false);
+    
+
+    /**
+            compute the tree likelihood
+            @param pattern_lh (OUT) if not NULL, the function will assign pattern log-likelihoods to this vector
+                            assuming pattern_lh has the size of the number of patterns
+            @return tree likelihood
+     */
+    virtual double computeLikelihood(double *pattern_lh = NULL);
+
+    /**
+            compute pattern likelihoods only if the accumulated scaling factor is non-zero.
+            Otherwise, copy the pattern_lh attribute
+            @param pattern_lh (OUT) pattern log-likelihoods,
+                            assuming pattern_lh has the size of the number of patterns
+            @param cur_logl current log-likelihood (for sanity check)
+            @param pattern_lh_cat (OUT) if not NULL, store all pattern-likelihood per category
+     */
+    virtual void computePatternLikelihood(double *pattern_lh, double *cur_logl = NULL,
+    		double *pattern_lh_cat = NULL);
+
+    /**
+            reset the cached score of every partition, then optimize all branch lengths
+            through PhyloTree::optimizeAllBranches
+            @param my_iterations number of iterations to loop through all branches
+            @param tolerance passed unchanged to PhyloTree::optimizeAllBranches
+                            (presumably the log-likelihood convergence threshold -- TODO confirm)
+            @param maxNRStep passed unchanged to PhyloTree::optimizeAllBranches
+                            (presumably the maximum number of Newton-Raphson steps -- TODO confirm)
+            @return the likelihood of the tree
+     */
+    virtual double optimizeAllBranches(int my_iterations = 100, double tolerance = TOL_LIKELIHOOD, int maxNRStep = 100);
+
+    /**
+            search the best swap for a branch
+            @return NNIMove The best Move/Swap (swap_id is 1 or 2 depending on which of the
+                            two candidate moves has the larger newloglh)
+            @param node1 1 of the 2 nodes on the branch
+            @param node2 1 of the 2 nodes on the branch
+            @param nniMoves (IN/OUT) optional array of 2 NNI moves. If NULL, a temporary array
+                            is allocated and released inside the function. If given with
+                            nniMoves[0].node1 set, node1Nei_it and node2Nei_it of both moves
+                            are assumed to be already defined.
+     */
+    virtual NNIMove getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove *nniMoves = NULL);
+
+    /**
+            Do an NNI on the supertree and synchronize all subtrees respectively
+            @param move the single NNI
+            @param clearLH if TRUE, partial likelihoods of the affected subtree branches
+                            are cleared after the swap
+     */
+    virtual void doNNI(NNIMove &move, bool clearLH = true);
+
+    /**
+     *   Apply 5 new branch lengths stored in the NNI move
+     *   @param nnimove the NNI move currently in consideration
+     */
+    virtual void changeNNIBrans(NNIMove nnimove);
+
+    /**
+     * 	 Restore the branch lengths from the saved values
+	 * @param node the current node of the post-order tree traversal
+	 * @param dad the dad of that node used to direct the traversal
+     */
+    virtual void restoreAllBrans(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            reinsert the whole list of leaves back into the supertree then call mapTrees
+            @param del_leaves the list of deleted leaves, returned by deleteLeaves() function
+     */
+    virtual void reinsertLeaves(PhyloNodeVector &del_leaves);
+
+	/**
+		compute the weighted average of branch lengths over partitions
+	*/
+	virtual void computeBranchLengths();
+
+	/**
+	 * print debug information about all maps
+	 */
+	virtual void printMapInfo();
+
+	/**
+	 * initialize partition information for super tree
+	*/
+	virtual void initPartitionInfo();
+
+	int getMaxPartNameLength();
+
+	/**
+		partition information
+	*/
+	vector<PartitionInfo> part_info;
+
+    /* partition ID sorted in descending order of computation cost */
+    IntVector part_order;
+    IntVector part_order_by_nptn;
+
+    /* compute part_order vector */
+    void computePartitionOrder();
+
+    /**
+            get the name of the model
+    */
+    virtual string getModelName();
+	/**
+	 * extract subtree containing all taxa from partition IDs
+	 * @param ids partitions IDs
+	 * @return subtree
+	 */
+    PhyloTree *extractSubtree(IntVector &ids);
+
+    /**
+     * compute the memory size required for storing partial likelihood vectors
+     * @return memory size required in bytes
+     */
+    virtual uint64_t getMemoryRequired(size_t ncategory = 1);
+
+    /**
+     * count the number of super branches that map to no branches in gene trees
+     */
+    int countEmptyBranches(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            Neighbor-joining/parsimony tree might contain negative branch length. This
+            function will fix this.
+            @param force NOTE(review): meaning not documented here; the previous "fixed_length"
+                            tag matched no parameter -- confirm against the implementation
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @return The number of branches that have no/negative length
+     */
+    virtual int fixNegativeBranch(bool force = false, Node *node = NULL, Node *dad = NULL);
+
+    virtual int computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
+
+    /** True when mixed codon with other data type */
+    bool rescale_codon_brlen;
+    
+    int totalNNIs, evalNNIs;
+
+};
+
+#endif
diff --git a/phylosupertreeplen.cpp b/phylosupertreeplen.cpp
new file mode 100644
index 0000000..1efb692
--- /dev/null
+++ b/phylosupertreeplen.cpp
@@ -0,0 +1,2039 @@
+/*
+ * phylosupertreeplen.cpp
+ *
+ *  Created on: Aug 5, 2013
+ *      Author: olga
+ */
+
+#include "phylosupertreeplen.h"
+#include "superalignmentpairwise.h"
+#include <string.h>
+#include "timeutil.h"
+
+/**********************************************************
+ * class SuperAlignmentPairwisePlen
+**********************************************************/
+
+
+/**
+ * Construct the pairwise alignment of two sequences for a PhyloSuperTreePlen
+ * and remember where the tree keeps its partition information.
+ * @param atree the super tree
+ * @param seq1 index of the first sequence
+ * @param seq2 index of the second sequence
+ */
+SuperAlignmentPairwisePlen::SuperAlignmentPairwisePlen(PhyloSuperTreePlen *atree, int seq1, int seq2)
+	: SuperAlignmentPairwise(static_cast<PhyloSuperTree*>(atree), seq1, seq2)
+{
+	// the partition rates are read through this pointer when evaluating the target function
+	part_info = &atree->part_info;
+}
+
+/**
+ * Evaluate the target function at a given distance.
+ * Each partition is evaluated at the distance multiplied by its own rate.
+ * @param value the distance
+ * @return sum of the per-partition function values
+ */
+double SuperAlignmentPairwisePlen::computeFunction(double value) {
+	double total = 0.0;
+	const size_t npart = partitions.size();
+	for (size_t p = 0; p < npart; p++) {
+		total += partitions[p]->computeFunction(part_info->at(p).part_rate*value);
+	}
+	return total;
+}
+
+/**
+ * Compute first and second derivatives of the target function at a given distance.
+ * Each partition is evaluated at the rate-scaled distance; its derivatives are
+ * weighted by the rate (first) and the squared rate (second) before being summed.
+ * @param value the distance
+ * @param df (OUT) first derivative
+ * @param ddf (OUT) second derivative
+ */
+void SuperAlignmentPairwisePlen::computeFuncDerv(double value, double &df, double &ddf) {
+	df = 0.0;
+	ddf = 0.0;
+	const size_t npart = partitions.size();
+	for (size_t p = 0; p < npart; p++) {
+		double part_df, part_ddf;
+		partitions[p]->computeFuncDerv(part_info->at(p).part_rate*value, part_df, part_ddf);
+		df += part_info->at(p).part_rate*part_df;
+		ddf += part_info->at(p).part_rate*part_info->at(p).part_rate*part_ddf;
+	}
+}
+
+/** Destructor: this class releases nothing itself. */
+SuperAlignmentPairwisePlen::~SuperAlignmentPairwisePlen() {
+}
+
+/**********************************************************
+ * class PartitionModelPlen
+**********************************************************/
+
+//const double MIN_GENE_RATE = 0.001;
+//const double MAX_GENE_RATE = 1000.0;
+//const double TOL_GENE_RATE = 0.0001;
+
+/** Default constructor: only runs the PartitionModel default constructor. */
+PartitionModelPlen::PartitionModelPlen() : PartitionModel() {
+}
+
+/**
+ * Construct from program parameters; all arguments are handed to PartitionModel.
+ * @param params program parameters
+ * @param tree the super tree
+ * @param models_block model definitions
+ */
+PartitionModelPlen::PartitionModelPlen(Params &params, PhyloSuperTreePlen *tree, ModelsBlock *models_block)
+	: PartitionModel(params, tree, models_block) {
+}
+
+/** Destructor: this class releases nothing itself. */
+PartitionModelPlen::~PartitionModelPlen() {
+}
+
+/**
+ * Alternately optimize the model parameters of every partition, the
+ * partition-specific rates and the branch lengths until the log-likelihood
+ * changes by less than logl_epsilon or the iteration limit
+ * (params->num_param_iterations) is reached.
+ * @param fixed_len TRUE to keep branch lengths fixed
+ * @param write_info not used by this implementation
+ * @param logl_epsilon convergence threshold on the log-likelihood; also passed
+ *        as tolerance to optimizeAllBranches
+ * @param gradient_epsilon tolerance passed to the per-partition model
+ *        optimization and to optimizeGeneRate
+ * @return the log-likelihood after optimization
+ */
+double PartitionModelPlen::optimizeParameters(bool fixed_len, bool write_info, double logl_epsilon, double gradient_epsilon) {
+    PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
+    double tree_lh = 0.0, cur_lh = 0.0;
+    int ntrees = tree->size();
+
+	// invalidate the cached per-partition scores
+	for(int part = 0; part < ntrees; part++){
+		tree->part_info[part].cur_score = 0.0;
+	}
+	if (fixed_len) {
+		tree_lh = tree->computeLikelihood();
+	} else {
+		tree_lh = tree->optimizeAllBranches(1);
+	}
+
+    cout<<"Initial log-likelihood: "<<tree_lh<<endl;
+	double begin_time = getRealTime();
+	int i;
+	// number of rounds actually executed; "i-1" undercounts by one when the
+	// loop is left through the convergence break below
+	int rounds = 0;
+    for(i = 1; i < tree->params->num_param_iterations; i++){
+    	rounds = i;
+    	cur_lh = 0.0;
+        if (tree->part_order.empty()) tree->computePartitionOrder();
+        #ifdef _OPENMP
+        #pragma omp parallel for reduction(+: cur_lh) schedule(dynamic)
+        #endif
+    	for (int partid = 0; partid < ntrees; partid++) {
+            int part = tree->part_order[partid];
+    		// Subtree model parameters optimization
+        	tree->part_info[part].cur_score = tree->at(part)->getModelFactory()->optimizeParametersOnly(gradient_epsilon/min(min(i,ntrees),10));
+            // a zero score is treated as "not computed": evaluate the likelihood explicitly
+            if (tree->part_info[part].cur_score == 0.0)
+                tree->part_info[part].cur_score = tree->at(part)->computeLikelihood();
+        	cur_lh += tree->part_info[part].cur_score;
+
+        	// normalize rates s.t. branch lengths are #subst per site
+        	double mean_rate = tree->at(part)->getRate()->rescaleRates();
+        	if (mean_rate != 1.0) {
+        		if (tree->fixed_rates) {
+        			outError("Partition " + tree->part_info[part].name + " follows FreeRate heterogeneity. Please use proportion edge-linked partition model (-spp)");
+                }
+                tree->at(part)->scaleLength(mean_rate);
+        		tree->part_info[part].part_rate *= mean_rate;
+        	}
+
+    	}
+    	tree->clearAllPartialLH();
+    	// Optimizing gene rate
+    	if(!tree->fixed_rates){
+    		cur_lh = optimizeGeneRate(gradient_epsilon);
+    	}
+
+    	// Optimizing branch lengths
+    	int my_iter = min(5,i+1);
+
+    	if(!fixed_len){
+            double new_lh = tree->optimizeAllBranches(my_iter, logl_epsilon);
+            assert(new_lh > cur_lh - 1.0);
+            cur_lh = new_lh;
+    	}
+    	cout<<"Current log-likelihood at step "<<i<<": "<<cur_lh<<endl;
+    	if(fabs(cur_lh-tree_lh) < logl_epsilon) {
+            tree_lh = cur_lh;
+    		break;
+        }
+    	// make sure that the new logl is not so bad compared with previous logl
+    	assert(cur_lh > tree_lh - 1.0);
+    	tree_lh = cur_lh;
+    }
+    if (!tree->fixed_rates) {
+        cout << "Partition-specific rates: ";
+        for(int part = 0; part < ntrees; part++){
+            cout << " " << tree->part_info[part].part_rate;
+        }
+        cout << endl;
+    }
+	cout << "Parameters optimization took " << rounds << " rounds (" << getRealTime()-begin_time << " sec)" << endl << endl;
+
+    return tree_lh;
+}
+
+//double PartitionModelPlen::computeFunction(double value) {
+//	PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
+//    if (value != tree->part_info[optimizing_part].part_rate) {
+//        tree->part_info[optimizing_part].part_rate = value;
+//        tree->mapBranchLen(optimizing_part);
+//        tree->at(optimizing_part)->clearAllPartialLH();
+//    }
+//    return -tree->at(optimizing_part)->computeLikelihood();
+//}
+
+
+/**
+ * Optimize the rate of every partition individually, then normalize the rates.
+ *
+ * Each partition's tree-length scaling is optimized on its own subtree through
+ * optimizeTreeLengthScaling (which is handed the partition's current rate;
+ * presumably it updates the rate in place -- TODO confirm). Afterwards the
+ * site-weighted mean rate is computed, the supertree branch lengths are
+ * multiplied by it and all rates are divided by it.
+ * @param gradient_epsilon tolerance passed to optimizeTreeLengthScaling
+ * @return sum of the per-partition scores obtained during rate optimization
+ */
+double PartitionModelPlen::optimizeGeneRate(double gradient_epsilon)
+{
+	PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
+    // BQM 22-05-2015: change to optimize individual rates
+    double score = 0.0;
+
+    if (tree->part_order.empty()) tree->computePartitionOrder();
+    int ntrees = tree->size();
+    #ifdef _OPENMP
+    #pragma omp parallel for reduction(+: score) schedule(dynamic)
+    #endif
+    for (int j = 0; j < ntrees; j++) {
+        // loop-local, hence private to each thread without an explicit clause
+        int part = tree->part_order[j];
+        tree->part_info[part].cur_score = tree->at(part)->optimizeTreeLengthScaling(tree->part_info[part].part_rate, gradient_epsilon);
+        score += tree->part_info[part].cur_score;
+    }
+    // now normalize the rates
+    double sum = 0.0;
+    size_t nsite = 0;
+    for (int part = 0; part < ntrees; part++) {
+        sum += tree->part_info[part].part_rate * tree->at(part)->aln->getNSite();
+        // codon sites count three times when codon branch lengths are rescaled
+        if (tree->at(part)->aln->seq_type == SEQ_CODON && tree->rescale_codon_brlen)
+            nsite += 3*tree->at(part)->aln->getNSite();
+        else
+            nsite += tree->at(part)->aln->getNSite();
+    }
+    sum /= nsite;
+    tree->scaleLength(sum);
+    sum = 1.0/sum;
+    for (int part = 0; part < ntrees; part++)
+        tree->part_info[part].part_rate *= sum;
+    return score;
+}
+
+//double PartitionModelPlen::targetFunk(double x[]) {
+//	PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
+//
+//	double sum = 0.0;
+//	int part;
+//	for( part = 0; part < tree->size()-1; part ++){
+//		sum += x[part+1];
+//	}
+//	if (tree->size() - sum < 1e-4) return 1.0e+12;
+//
+//	for( part = 0, sum = 0.0; part < tree->size(); part ++){
+//		double rate;
+//		if (part < tree->size() - 1)
+//			rate = x[part+1];
+//		else
+//			rate = tree->size() - sum;
+//		sum += rate;
+//		if(tree->part_info[part].part_rate != rate){
+//			tree->at(part)->clearAllPartialLH();
+//			//tree->at(part)->scaleLength(rate/tree->part_info[part].part_rate);
+//			tree->part_info[part].part_rate = rate;
+//			tree->part_info[part].cur_score = 0.0;
+//		}
+//	}
+//	tree->mapBranchLen();
+//	//getVariables(x);
+//
+//	return -tree->computeLikelihood();
+//}
+
+//void PartitionModelPlen::getVariables(double *variables) {
+//	PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
+//	int ntrees = tree->size()-1;
+//	double sum = 0.0;
+//	for(int part = 0; part < ntrees; part++){
+//		tree->part_info[part].part_rate = variables[part+1];
+//		sum += variables[part+1];
+//	}
+//	tree->part_info[ntrees].part_rate = tree->size() - sum;
+//}
+//
+//void PartitionModelPlen::setVariables(double *variables) {
+//	PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
+//	int ntrees = tree->size()-1;
+//	for(int part = 0; part < ntrees; part++){
+//		variables[part+1] = tree->part_info[part].part_rate;
+//	}
+//}
+
+/**
+ * Count the free parameters of the whole partition model.
+ * For every partition this adds the dimensions of its substitution model and
+ * of its site-rate model, plus num_states-1 when the state frequencies are
+ * FREQ_EMPIRICAL. The supertree's branchNum is added once, and size()-1
+ * partition rates when the rates are not fixed.
+ * @return number of parameters
+ */
+int PartitionModelPlen::getNParameters() {
+	PhyloSuperTreePlen *tree = (PhyloSuperTreePlen*)site_rate->getTree();
+	int nparam = 0;
+	PhyloSuperTreePlen::iterator sub = tree->begin();
+	while (sub != tree->end()) {
+		nparam += (*sub)->getModelFactory()->model->getNDim()+(*sub)->getModelFactory()->site_rate->getNDim();
+		if ((*sub)->getModelFactory()->model->freq_type == FREQ_EMPIRICAL) {
+			nparam += (*sub)->getModelFactory()->model->num_states-1;
+		}
+		sub++;
+	}
+	nparam += tree->branchNum;
+	if (!tree->fixed_rates) {
+		nparam += tree->size()-1;
+	}
+	return nparam;
+}
+
+/**
+ * @return number of partitions of the super tree minus one
+ */
+int PartitionModelPlen::getNDim() {
+	PhyloSuperTreePlen *super_tree = (PhyloSuperTreePlen*)site_rate->getTree();
+	return super_tree->size() - 1;
+}
+
+
+/**********************************************************
+ * class PhyloSuperTreePlen
+**********************************************************/
+
+
+/** Default constructor: partition rates are not fixed, NNI case counters are zeroed. */
+PhyloSuperTreePlen::PhyloSuperTreePlen() : PhyloSuperTree() {
+	fixed_rates = false;
+	memset(allNNIcases_computed, 0, 5*sizeof(int));
+}
+
+/**
+ * Construct from program parameters.
+ * Rates are fixed when params.partition_type is 'j'. Every partition starts
+ * with rate 1.0, or 3.0 for a codon partition when rescale_codon_brlen is set.
+ * @param params program parameters
+ */
+PhyloSuperTreePlen::PhyloSuperTreePlen(Params &params) : PhyloSuperTree(params) {
+	memset(allNNIcases_computed, 0, 5*sizeof(int));
+	fixed_rates = (params.partition_type == 'j');
+	int idx = 0;
+	for (iterator sub = begin(); sub != end(); ++sub, ++idx) {
+		const bool triple_rate = ((*sub)->aln->seq_type == SEQ_CODON) && rescale_codon_brlen;
+		part_info[idx].part_rate = triple_rate ? 3.0 : 1.0;
+		part_info[idx].evalNNIs = 0.0;
+	}
+}
+
+/**
+ * Construct from a super alignment and an existing super tree; both are handed
+ * to the PhyloSuperTree constructor. Partition rates are not fixed.
+ * @param alignment the super alignment
+ * @param super_tree the existing super tree
+ */
+PhyloSuperTreePlen::PhyloSuperTreePlen(SuperAlignment *alignment, PhyloSuperTree *super_tree)
+	: PhyloSuperTree(alignment, super_tree) {
+	fixed_rates = false;
+	memset(allNNIcases_computed, 0, 5*sizeof(int));
+}
+
+/**
+ * De-allocate the likelihood buffers.
+ * The buffer pointers of every subtree are reset first so that the subtrees do
+ * not delete them; the actual release is done by PhyloTree::deleteAllPartialLh().
+ */
+void PhyloSuperTreePlen::deleteAllPartialLh() {
+	const size_t npart = size();
+	for (size_t p = 0; p < npart; p++) {
+		// detach only, never free: see function comment
+		at(p)->central_partial_lh = NULL;
+		at(p)->central_scale_num = NULL;
+		at(p)->central_partial_pars = NULL;
+		at(p)->_pattern_lh = NULL;
+		at(p)->_pattern_lh_cat = NULL;
+		at(p)->theta_all = NULL;
+		at(p)->ptn_freq = NULL;
+		at(p)->ptn_freq_computed = false;
+		at(p)->ptn_invar = NULL;
+		at(p)->nni_partial_lh = NULL;
+		at(p)->nni_scale_num = NULL;
+	}
+	PhyloTree::deleteAllPartialLh();
+}
+
+/**
+ * Destructor: resets the likelihood buffer pointers of every subtree so that
+ * the subtrees do not delete them.
+ */
+PhyloSuperTreePlen::~PhyloSuperTreePlen() {
+	const size_t npart = size();
+	for (size_t p = 0; p < npart; p++) {
+		at(p)->central_partial_lh = NULL;
+		at(p)->central_scale_num = NULL;
+		at(p)->central_partial_pars = NULL;
+		at(p)->_pattern_lh = NULL;
+		at(p)->_pattern_lh_cat = NULL;
+		at(p)->theta_all = NULL;
+		at(p)->ptn_freq = NULL;
+		at(p)->ptn_freq_computed = false;
+		at(p)->ptn_invar = NULL;
+		at(p)->nni_partial_lh = NULL;
+		at(p)->nni_scale_num = NULL;
+	}
+}
+
+
+// -------------------------------------------------------------------------------------------------------------
+/**
+ * Compute the distance between two sequences.
+ * @param seq1 index of the first sequence
+ * @param seq2 index of the second sequence
+ * @param initial_dist starting value; when 0.0 it is replaced by the distance
+ *        computed by the alignment
+ * @param var (OUT) filled by the pairwise optimization when it is carried out
+ * @return the starting distance when it equals MAX_GENETIC_DIST or when no
+ *         model factory / site rate is set, otherwise the optimized distance
+ */
+double PhyloSuperTreePlen::computeDist(int seq1, int seq2, double initial_dist, double &var) {
+	if (initial_dist == 0.0) {
+		initial_dist = aln->computeDist(seq1, seq2);
+	}
+	// nothing to optimize: saturated distance, or no model / site rate available
+	if (initial_dist == MAX_GENETIC_DIST || !(model_factory && site_rate)) {
+		return initial_dist;
+	}
+	// refine the distance under the model and the partition rates
+	SuperAlignmentPairwisePlen pairwise(this, seq1, seq2);
+	return pairwise.optimizeDist(initial_dist, var);
+}
+
+/**
+ * Create the subtrees T|Y_1,...,T|Y_k of the current supertree T and link the
+ * supertree edges to the subtree edges.
+ *
+ * For each partition the supertree is copied restricted to the partition's
+ * taxon pattern, its branch lengths are scaled by the partition rate, and
+ * linkTree() is called with a vector that maps every supertree taxon to the
+ * matching subtree leaf (NULL when the taxon is absent from the partition).
+ * Partial likelihoods are (re)initialized at the end when a model is set.
+ */
+void PhyloSuperTreePlen::mapTrees() {
+	assert(root);
+	int part = 0;
+	for (iterator it = begin(); it != end(); it++, part++) {
+		string taxa_set = ((SuperAlignment*)aln)->getPattern(part);
+		(*it)->copyTree(this, taxa_set);
+
+		// the only difference with PhyloSuperTree::mapTrees()
+		(*it)->scaleLength(part_info[part].part_rate);
+
+		NodeVector my_taxa, part_taxa;
+		(*it)->getOrderedTaxa(my_taxa);
+		part_taxa.resize(leafNum, NULL);
+		for (int i = 0; i < leafNum; i++) {
+			// negative index: taxon i does not occur in this partition
+			int id = ((SuperAlignment*)aln)->taxa_index[i][part];
+			if (id >=0) part_taxa[i] = my_taxa[id];
+		}
+		linkTree(part, part_taxa);
+	}
+	if (getModel())
+		initializeAllPartialLh();
+}
+
+/**
+ * Build all maps between the supertree and its subtrees.
+ * In this class that is done by a complete mapTrees() pass.
+ */
+void PhyloSuperTreePlen::linkTrees() {
+	this->mapTrees();
+}
+
+
+/**
+ * Reset the cached score of every partition, then optimize all branches
+ * through PhyloTree::optimizeAllBranches.
+ * @param my_iterations passed unchanged to the base implementation
+ * @param tolerance passed unchanged to the base implementation
+ * @param maxNRStep passed unchanged to the base implementation
+ * @return value returned by PhyloTree::optimizeAllBranches
+ */
+double PhyloSuperTreePlen::optimizeAllBranches(int my_iterations, double tolerance, int maxNRStep) {
+	// a score of 0.0 marks a partition whose likelihood must be recomputed
+	int remaining = size();
+	while (remaining-- > 0) {
+		part_info[remaining].cur_score = 0.0;
+	}
+	return PhyloTree::optimizeAllBranches(my_iterations, tolerance, maxNRStep);
+}
+
+/**
+ * Optimize the length of one supertree branch.
+ *
+ * The branch is made the current branch of the supertree and of every subtree
+ * that has a branch linked to it. The optimization itself is done by
+ * PhyloTree::optimizeOneBranch (always called with clearLH = false), after
+ * which the cached score of every linked partition is refreshed from its buffer.
+ * @param node1 one end of the branch
+ * @param node2 the other end of the branch
+ * @param clearLH if TRUE and the branch length changed, the reverse partial
+ *        likelihoods around the linked subtree branches are cleared
+ * @param maxNRStep passed unchanged to PhyloTree::optimizeOneBranch
+ */
+void PhyloSuperTreePlen::optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, bool clearLH, int maxNRStep) {
+
+	SuperNeighbor *nei1 = (SuperNeighbor*)node1->findNeighbor(node2);
+	SuperNeighbor *nei2 = (SuperNeighbor*)node2->findNeighbor(node1);
+	int ntrees = size();
+
+	current_it = (PhyloNeighbor*) node1->findNeighbor(node2);
+    current_it_back = (PhyloNeighbor*) node2->findNeighbor(node1);
+	for (int part = 0; part < ntrees; part++) {
+		if (((SuperNeighbor*)current_it)->link_neighbors[part]) {
+            at(part)->current_it = ((SuperNeighbor*)current_it)->link_neighbors[part];
+            at(part)->current_it_back = ((SuperNeighbor*)current_it_back)->link_neighbors[part];
+		}
+	}
+
+	// remembered to detect whether the optimization changed the length
+	double current_len = current_it->length;
+	for (int part = 0; part < ntrees; part++) {
+		at(part)->theta_computed = false;
+	}
+
+	PhyloTree::optimizeOneBranch(node1, node2, false, maxNRStep);
+
+    if (part_order.empty()) computePartitionOrder();
+	// bug fix: assign cur_score into part_info
+    #ifdef _OPENMP
+    #pragma omp parallel for schedule(dynamic)
+    #endif
+	for (int partid = 0; partid < ntrees; partid++) {
+        // loop-local, hence private to each thread without an explicit clause
+        int part = part_order_by_nptn[partid];
+		if (((SuperNeighbor*)current_it)->link_neighbors[part]) {
+			part_info[part].cur_score = at(part)->computeLikelihoodFromBuffer();
+		}
+	}
+
+	if(clearLH && current_len != current_it->length){
+		for (int part = 0; part < ntrees; part++) {
+			PhyloNeighbor *nei1_part = nei1->link_neighbors[part];
+			PhyloNeighbor *nei2_part = nei2->link_neighbors[part];
+			// both directions are dereferenced below, so both must be linked
+			// (same guard as in computeFunction / computeFuncDerv)
+			if(nei1_part && nei2_part){
+				((PhyloNode*)nei1_part->node)->clearReversePartialLh(((PhyloNode*)nei2_part->node));
+				((PhyloNode*)nei2_part->node)->clearReversePartialLh(((PhyloNode*)nei1_part->node));
+			}
+		}
+	}
+}
+
+/**
+ * Target function for the optimization of the current branch.
+ *
+ * Sets the current supertree branch to the given length. For every partition
+ * with a linked branch, the linked branch is changed by the same increment
+ * multiplied by the partition rate and the partition likelihood is recomputed
+ * on that branch. Partitions without a linked branch contribute their cached
+ * score (computed first when it is still 0.0).
+ * @param value the new branch length
+ * @return negative sum of the partition log-likelihoods
+ */
+double PhyloSuperTreePlen::computeFunction(double value) {
+	int ntrees = size();
+
+	if (!central_partial_lh) {
+		initializeAllPartialLh();
+	}
+
+	// increment relative to the length currently stored on the branch
+	double delta = value-current_it->length;
+	current_it->length = value;
+	current_it_back->length = value;
+
+	SuperNeighbor *fwd = (SuperNeighbor*)current_it_back->node->findNeighbor(current_it->node);
+	SuperNeighbor *bwd = (SuperNeighbor*)current_it->node->findNeighbor(current_it_back->node);
+	assert(fwd && bwd);
+
+	if (part_order.empty()) {
+		computePartitionOrder();
+	}
+
+	double sum_lh = 0.0;
+	#ifdef _OPENMP
+	#pragma omp parallel for reduction(+: sum_lh) schedule(dynamic)
+	#endif
+	for (int k = 0; k < ntrees; k++) {
+		int part = part_order_by_nptn[k];
+		PhyloNeighbor *fwd_part = fwd->link_neighbors[part];
+		PhyloNeighbor *bwd_part = bwd->link_neighbors[part];
+		if (!fwd_part || !bwd_part) {
+			// branch not present in this partition: reuse the cached score
+			if (part_info[part].cur_score == 0.0) {
+				part_info[part].cur_score = at(part)->computeLikelihood();
+			}
+			sum_lh += part_info[part].cur_score;
+			continue;
+		}
+		at(part)->current_it = fwd_part;
+		at(part)->current_it_back = bwd_part;
+		fwd_part->length += delta*part_info[part].part_rate;
+		bwd_part->length += delta*part_info[part].part_rate;
+		part_info[part].cur_score = at(part)->computeLikelihoodBranch(bwd_part,(PhyloNode*)fwd_part->node);
+		sum_lh += part_info[part].cur_score;
+	}
+	return -sum_lh;
+}
+
+/**
+ * Sum the cached scores of all partitions.
+ * Every cached score is asserted to be non-zero.
+ * @return sum of part_info[...].cur_score over all partitions
+ */
+double PhyloSuperTreePlen::computeLikelihoodFromBuffer() {
+	double total = 0.0;
+	const int npart = size();
+	for (int p = 0; p < npart; ++p) {
+		assert(part_info[p].cur_score != 0.0);
+		total += part_info[p].cur_score;
+	}
+	return total;
+}
+
+/**
+ * Derivatives of the target function for the optimization of the current branch.
+ *
+ * Sets the current supertree branch to the given length. For every partition
+ * with a linked branch, the linked branch is changed by the same increment
+ * multiplied by the partition rate and the partition derivatives are computed;
+ * they are weighted by the rate (first) and the squared rate (second).
+ * Partitions without a linked branch add nothing to the derivatives; only their
+ * cached score is computed when it is still 0.0.
+ * @param value the new branch length
+ * @param df_ret (OUT) negative sum of the weighted first derivatives
+ * @param ddf_ret (OUT) negative sum of the weighted second derivatives
+ */
+void PhyloSuperTreePlen::computeFuncDerv(double value, double &df_ret, double &ddf_ret) {
+	double df = 0.0;
+	double ddf = 0.0;
+
+	int ntrees = size();
+
+	if (!central_partial_lh) initializeAllPartialLh();
+
+	// increment relative to the length currently stored on the branch
+	double lambda = value-current_it->length;
+	current_it->length = value;
+    current_it_back->length = value;
+
+	SuperNeighbor *nei1 = (SuperNeighbor*)current_it_back->node->findNeighbor(current_it->node);
+	SuperNeighbor *nei2 = (SuperNeighbor*)current_it->node->findNeighbor(current_it_back->node);
+	assert(nei1 && nei2);
+
+    if (part_order.empty()) computePartitionOrder();
+    #ifdef _OPENMP
+    #pragma omp parallel for reduction(+: df, ddf) schedule(dynamic)
+    #endif
+	for (int partid = 0; partid < ntrees; partid++) {
+		int part = part_order_by_nptn[partid];
+		double df_aux, ddf_aux;
+		PhyloNeighbor *nei1_part = nei1->link_neighbors[part];
+		PhyloNeighbor *nei2_part = nei2->link_neighbors[part];
+		if (nei1_part && nei2_part) {
+			at(part)->current_it = nei1_part;
+			at(part)->current_it_back = nei2_part;
+
+			nei1_part->length += lambda*part_info[part].part_rate;
+			nei2_part->length += lambda*part_info[part].part_rate;
+			if(nei1_part->length<-1e-4){
+				cout<<"lambda = "<<lambda<<endl;
+				cout<<"NEGATIVE BRANCH len = "<<nei1_part->length<<endl<<" rate = "<<part_info[part].part_rate<<endl;
+				// descriptive replacement for the former placeholder message
+				outError("Negative partition branch length in ",__func__);
+			}
+			at(part)->computeLikelihoodDerv(nei2_part,(PhyloNode*)nei1_part->node, df_aux, ddf_aux);
+			df += part_info[part].part_rate*df_aux;
+			ddf += part_info[part].part_rate*part_info[part].part_rate*ddf_aux;
+		}
+		else {
+			// branch not present in this partition: make sure its score is cached
+			if (part_info[part].cur_score == 0.0)
+				part_info[part].cur_score = at(part)->computeLikelihood();
+		}
+	}
+    df_ret = -df;
+    ddf_ret = -ddf;
+}
+
+/**
+ * Evaluate the two NNI moves around an internal branch and return the better one.
+ *
+ * @param node1 one end of the branch (must have degree 3)
+ * @param node2 the other end of the branch (must have degree 3)
+ * @param nniMoves optional array of 2 moves. When NULL, a temporary pair is
+ *        allocated and freed here. When supplied with nniMoves[0].node1 set,
+ *        the neighbor iterators of both moves are taken as given (after a
+ *        sanity check); otherwise they are initialized here.
+ * @return copy of the move with the larger newloglh; swap_id is 1 for the
+ *         first move and 2 for the second. curScore is left unchanged.
+ */
+NNIMove PhyloSuperTreePlen::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove *nniMoves)
+{
+	assert(node1->degree() == 3 && node2->degree() == 3);
+
+	double saved_score = curScore;
+	SwapNNIParam nni_param;
+
+	// allocate a private pair of moves when the caller did not supply one
+	bool owns_moves = false;
+	if (nniMoves == NULL) {
+		owns_moves = true;
+		nniMoves = new NNIMove[2];
+		nniMoves[1].ptnlh = NULL;
+		nniMoves[0].ptnlh = NULL;
+		nniMoves[0].node1 = NULL;
+	}
+
+	if (!nniMoves[0].node1) {
+		// pair the first neighbor of node1 with each of the two neighbors of node2
+		FOR_NEIGHBOR_IT(node1, node2, first_it) {
+			int slot = 0;
+			FOR_NEIGHBOR_IT(node2, node1, second_it) {
+				nniMoves[slot].node1Nei_it = first_it;
+				nniMoves[slot].node2Nei_it = second_it;
+				slot++;
+			}
+			break;
+		}
+	} else {
+		// caller-defined iterators: check that they really belong to this branch
+		for (int k = 0; k < 2; k++) {
+			if (!node1->findNeighbor((*nniMoves[k].node1Nei_it)->node)) outError(__func__);
+			if (!node2->findNeighbor((*nniMoves[k].node2Nei_it)->node)) outError(__func__);
+		}
+	}
+
+	for (int k = 1; k >= 0; k--) {
+		nniMoves[k].node1 = node1;
+		nniMoves[k].node2 = node2;
+	}
+
+	this->swapNNIBranch(0.0, node1, node2, &nni_param, nniMoves);
+
+	// restore curScore
+	curScore = saved_score;
+
+	NNIMove best_move;
+	if (nniMoves[0].newloglh > nniMoves[1].newloglh) {
+		best_move = nniMoves[0];
+		best_move.swap_id = 1;
+	} else {
+		best_move = nniMoves[1];
+		best_move.swap_id = 2;
+	}
+	if (owns_moves) {
+		delete [] nniMoves;
+	}
+	return best_move;
+}
+
+/**
+ * Apply NNI moves through IQTree::doNNIs, then call mapBranchLen()
+ * (presumably to carry the supertree branch lengths over to the subtrees -- TODO confirm).
+ * @param nni2apply passed unchanged to IQTree::doNNIs
+ * @param changeBran passed unchanged to IQTree::doNNIs
+ */
+void PhyloSuperTreePlen::doNNIs(int nni2apply, bool changeBran) {
+	IQTree::doNNIs(nni2apply, changeBran);
+	this->mapBranchLen();
+}
+
+
+/**
+ * Classify, for every partition, an NNI on branch (node1,node2) by the number
+ * of branches that have no linked branch in that partition.
+ * All branches at node1 and the branches at node2 other than the one leading
+ * back to node1 are inspected. totalNNIs is incremented once per partition.
+ * @param node1 one end of the branch
+ * @param node2 the other end of the branch
+ * @param nni_type (OUT) resized to the number of partitions; entry is
+ *        NNI_NO_EPSILON, NNI_ONE_EPSILON, NNI_TWO_EPSILON, NNI_THREE_EPSILON
+ *        for 0..3 missing links and NNI_MANY_EPSILON otherwise
+ */
+void PhyloSuperTreePlen::getNNIType(PhyloNode *node1, PhyloNode *node2, vector<NNIType> &nni_type) {
+	int npart = size();
+	nni_type.resize(npart, NNI_NO_EPSILON);
+	for (int p = 0; p < npart; p++) {
+		totalNNIs++;
+		int missing = 0;
+
+		FOR_NEIGHBOR_DECLARE(node1, NULL, nit) {
+			if (!((SuperNeighbor*)*nit)->link_neighbors[p]) { missing++; }
+		}
+		FOR_NEIGHBOR(node2, node1, nit) {
+			if (!((SuperNeighbor*)*nit)->link_neighbors[p]) { missing++; }
+		}
+
+		switch (missing) {
+		case 0:
+			nni_type[p] = NNI_NO_EPSILON;
+			break;
+		case 1:
+			nni_type[p] = NNI_ONE_EPSILON;
+			break;
+		case 2:
+			nni_type[p] = NNI_TWO_EPSILON;
+			break;
+		case 3:
+			nni_type[p] = NNI_THREE_EPSILON;
+			break;
+		default:
+			nni_type[p] = NNI_MANY_EPSILON;
+			break;
+		}
+	}
+}
+
+/**
+ * Apply one NNI move on the super tree and carry out the matching action on
+ * every partition tree.
+ *
+ * Order of operations (the order matters):
+ *   1. every partition is classified with getNNIType();
+ *   2. for NNI_NO_EPSILON partitions the move is translated to the partition
+ *      tree through link_neighbors -- this reads the links as they are BEFORE
+ *      the super tree is modified;
+ *   3. the swap is done on the super tree via PhyloTree::doNNI(move, false);
+ *      clearLH is not forwarded to this call (the forwarding variant was disabled);
+ *   4. per partition:
+ *        NO_EPSILON    - the translated move is applied with doNNI(..., clearLH);
+ *        ONE_EPSILON   - linkBranch(), then (if clearLH) both linked neighbors and
+ *                        both reverse partial likelihoods are cleared;
+ *        TWO_EPSILON   - linkBranch(), then (if clearLH) the reverse partial
+ *                        likelihoods are cleared, using the endpoints read before
+ *                        relinking;
+ *        THREE_EPSILON - linkBranch(), then (if clearLH) the reverse partial
+ *                        likelihoods are cleared;
+ *        MANY_EPSILON  - nothing is done.
+ *
+ * @param move     the NNI to perform: node1, node2 and the iterators of the two
+ *                 neighbors to be swapped (node1Nei_it, node2Nei_it)
+ * @param clearLH  forwarded to the partition doNNI() and gates the
+ *                 partial-likelihood clearing on the epsilon partitions
+ */
+void PhyloSuperTreePlen::doNNI(NNIMove &move, bool clearLH)
+{
+	// Both directions of the central branch, and the two neighbors being swapped.
+	SuperNeighbor *nei1 = (SuperNeighbor*)move.node1->findNeighbor(move.node2);
+	SuperNeighbor *nei2 = (SuperNeighbor*)move.node2->findNeighbor(move.node1);
+	SuperNeighbor *node1_nei = (SuperNeighbor*)*move.node1Nei_it;
+	SuperNeighbor *node2_nei = (SuperNeighbor*)*move.node2Nei_it;
+
+	const int ntrees = size();
+	int part;
+
+	vector<NNIMove> part_move;
+	part_move.resize(ntrees);
+
+	vector<NNIType> is_nni;
+	getNNIType(move.node1, move.node2, is_nni);
+
+	// Translate the move for the partitions where all five branches have an image.
+	for (part = 0; part < ntrees; part++) {
+		if (is_nni[part] != NNI_NO_EPSILON)
+			continue;
+
+		NNIMove &sub_move = part_move[part];
+		sub_move.node1 = (PhyloNode*)nei2->link_neighbors[part]->node;
+		sub_move.node2 = (PhyloNode*)nei1->link_neighbors[part]->node;
+		sub_move.node1Nei_it = sub_move.node1->findNeighborIt(node1_nei->link_neighbors[part]->node);
+		sub_move.node2Nei_it = sub_move.node2->findNeighborIt(node2_nei->link_neighbors[part]->node);
+	}
+
+	// Swap on the super tree.
+	PhyloTree::doNNI(move, false);
+
+	// Mirror the swap on the partition trees.
+	part = 0;
+	for (iterator it = begin(); it != end(); it++, part++) {
+		const NNIType type = is_nni[part];
+
+		if (type == NNI_NO_EPSILON) {
+			(*it)->doNNI(part_move[part], clearLH);
+		} else if (type == NNI_TWO_EPSILON) {
+			// Endpoints of the image of the central branch, read before relinking.
+			PhyloNode *sub_node1 = (PhyloNode*)nei2->link_neighbors[part]->node;
+			PhyloNode *sub_node2 = (PhyloNode*)nei1->link_neighbors[part]->node;
+			linkBranch(part, nei1, nei2);
+			if (clearLH) {
+				// No check on the relinked neighbor is needed here: the branch
+				// lengths change during the optimization, so the reverse partial
+				// likelihoods have to be cleared in any case.
+				sub_node2->clearReversePartialLh(sub_node1);
+				sub_node1->clearReversePartialLh(sub_node2);
+			}
+		} else if (type == NNI_ONE_EPSILON || type == NNI_THREE_EPSILON) {
+			linkBranch(part, nei1, nei2);
+			if (clearLH) {
+				// Endpoints of the branch the central branch is linked to now.
+				PhyloNode *sub_node1 = (PhyloNode*)nei2->link_neighbors[part]->node;
+				PhyloNode *sub_node2 = (PhyloNode*)nei1->link_neighbors[part]->node;
+				if (type == NNI_ONE_EPSILON) {
+					// Only in the one-epsilon case are the linked neighbors themselves cleared.
+					nei1->link_neighbors[part]->clearPartialLh();
+					nei2->link_neighbors[part]->clearPartialLh();
+				}
+				sub_node2->clearReversePartialLh(sub_node1);
+				sub_node1->clearReversePartialLh(sub_node2);
+			}
+		}
+		// NNI_MANY_EPSILON: nothing to do on this partition.
+	}
+}
+
+double PhyloSuperTreePlen::swapNNIBranch(double cur_score, PhyloNode *node1, PhyloNode *node2, SwapNNIParam *nni_param, NNIMove *nniMoves) {
+
+//	for (iterator it = begin(); it != end(); it++)
+//		if ((*it)->sse != LK_EIGEN_SSE)
+//			outError("hey!");
+
+//	double score_mine = this->computeLikelihood();
+
+	//cout<<"starting NNI evaluation"<<endl;
+	//checkBranchLen();
+
+	int i = 0, id = 0;
+	int part, ntrees = size();
+
+	/*===========================================================================================
+	 * Identify NNIType for partitions
+	 *===========================================================================================*/
+	vector<NNIType> is_nni;
+	getNNIType(node1, node2, is_nni);
+	if(verbose_mode >= VB_MED){
+		for (part = 0; part < ntrees; part++)
+			switch (is_nni[part]) {
+			case NNI_NO_EPSILON:
+				allNNIcases_computed[0]++;
+				break;
+			case NNI_ONE_EPSILON:
+				allNNIcases_computed[1]++;
+				break;
+			case NNI_TWO_EPSILON:
+				allNNIcases_computed[2]++;
+				break;
+			case NNI_THREE_EPSILON:
+				allNNIcases_computed[3]++;
+				break;
+			case NNI_MANY_EPSILON:
+				allNNIcases_computed[4]++;
+				break;
+			}
+	}
+	//==================================================================================================
+	// SuperTREE: saving Neighbors and allocating new ones; assign which nodes/neighbors to be swapped.
+	//==================================================================================================
+	double old_brlen = node1->findNeighbor(node2)->length; // length of the branch between node1 and node2 on SuperTree before NNI
+	int IT_NUM = (params->nni5) ? 6 : 2;
+	NeighborVec::iterator it, saved_it[6], node_nei_it[4];
+	Node* neighbor_nodes[4];
+
+	saved_it[id++] = node1->findNeighborIt(node2);
+	saved_it[id++] = node2->findNeighborIt(node1);
+
+	//if (params->nni5) {
+		FOR_NEIGHBOR(node1, node2, it){
+			saved_it[id++] = (*it)->node->findNeighborIt(node1);
+			node_nei_it[i++] = it;
+			neighbor_nodes[i-1] = (*it)->node;
+		}
+		FOR_NEIGHBOR(node2, node1, it){
+			saved_it[id++] = (*it)->node->findNeighborIt(node2);
+			node_nei_it[i++] = it;
+			neighbor_nodes[i-1] = (*it)->node;
+		}
+	//}
+
+//		cout<<"------NODE_id check-----------------------------------"<<endl;
+//		for(part=0; part<ntrees; part++){
+//			cout<<"PART = "<<part<<endl;
+//			for(id=2; id<6; id++){
+//				if(node1->isNeighbor(neighbor_nodes[id-2])){
+//					if(((SuperNeighbor*)(node1->findNeighbor(neighbor_nodes[id-2])))->link_neighbors[part]){
+//						cout<<"node1: "<<"id = "<<id<<"; node_id = "<<
+//								((SuperNeighbor*)(node1->findNeighbor(neighbor_nodes[id-2])))->link_neighbors[part]->node->id<<";"<<endl;
+//					} else {
+//						cout<<"node1: "<<"id = "<<id<<"; no neighbor;"<<endl;
+//					}
+//				} else if(node2->isNeighbor(neighbor_nodes[id-2])){
+//					if(((SuperNeighbor*)(node2->findNeighbor(neighbor_nodes[id-2])))->link_neighbors[part]){
+//						cout<<"node2: "<<"id = "<<id<<"; node_id = "<<
+//								((SuperNeighbor*)(node2->findNeighbor(neighbor_nodes[id-2])))->link_neighbors[part]->node->id<<";"<<endl;
+//					} else {
+//						cout<<"node2: "<<"id = "<<id<<"; no neighbor;"<<endl;
+//					}
+//				}
+//			}
+//			cout<<"------"<<endl;
+//			for(id=2; id<6; id++){
+//				if(((SuperNeighbor*)(*node_nei_it[id-2]))->link_neighbors[part]){
+//					cout<<"id = "<<id<<"; node_id = "<<((SuperNeighbor*)(*node_nei_it[id-2]))->link_neighbors[part]->node->id<<";"<<endl;
+//				}
+//			}
+//		}
+//		cout<<"------------------------------------------------------"<<endl;
+
+
+
+	/*------------------------------------------------------------------------------------
+	 * Saving original neighbors:
+	 * saved_nei[0] - node2 as a neighbor of node1
+	 * saved_nei[1] - node1 as a neighbor of node2
+	 * IF(nni5Branches)
+	 * 		saved_nei[2(3)] - node1 as a neighbor of its nei1(nei2) different from node2
+	 * 		saved_nei[4(5)] - node2 as a neighbor of its nei1(nei2) different from node1
+	 *------------------------------------------------------------------------------------*/
+
+	SuperNeighbor *saved_nei[6];
+
+	// allocate new Super Neighbor pointers
+	for (id = 0; id < IT_NUM; id++) {
+		saved_nei[id] = (SuperNeighbor*)(*saved_it[id]);
+		*saved_it[id] = new SuperNeighbor(saved_nei[id]->node, saved_nei[id]->length);
+		(*saved_it[id])->id = saved_nei[id]->id;
+		for(part = 0; part < ntrees; part++)
+			((SuperNeighbor*)*saved_it[id])->link_neighbors.push_back(NULL);
+	}
+
+	// Getting NEW Neighbors: get the Neighbors again since they were saved for restoring purpose and replaced by new
+	SuperNeighbor *nei1_new = (SuperNeighbor*) node1->findNeighbor(node2);
+	SuperNeighbor *nei2_new = (SuperNeighbor*) node2->findNeighbor(node1);
+
+//	/* -------------------------------------------------------------------------------------------
+//	 *  NNI details: assigning nodes to be swapped on SuperTree
+//	 * -------------------------------------------------------------------------------------------*/
+//
+//	// node1_nei - one of the node1 neighbors, which is not node2
+//	NeighborVec::iterator node1_it = node1->findNeighborIt(nni_param->node1_nei->node);
+//	Neighbor *node1_nei = *node1_it;
+//
+//	// *node2_its[0] - one of the node2 neighbors, which is not node1
+//	// *node2_its[1] - second neighbor of node2,   which is not node1
+//	vector<NeighborVec::iterator> node2_its;
+//	node2_its.push_back(node2->findNeighborIt(nni_param->node2_nei->node));
+//
+//	FOR_NEIGHBOR_DECLARE(node2, node1, node2_it){
+//		FOR_NEIGHBOR_DECLARE(node2,(*node2_it)->node,node2_it2)
+//			node2_its.push_back(node2_it2);
+//	}
+//	assert(node2_its.size() == 2);
+
+	/* =================================================================================================
+	 * SubTREEs: saving Neighbors and allocating new ones.
+	 * =================================================================================================*/
+
+	/*------------------------------------------------------------------------------------
+	 * Variables to be used for saving/restoring purposes on SubTrees
+	 *------------------------------------------------------------------------------------*/
+
+	vector<PhyloNeighbor*> sub_saved_nei1,sub_saved_nei2,sub_saved_nei;
+	vector<NeighborVec::iterator> sub_saved_it;
+
+	// Saving linked neighbor of node1->findNei(node2)
+	sub_saved_nei1.resize(ntrees);
+	// Saving linked neighbor of node2->findNei(node1)
+	sub_saved_nei2.resize(ntrees);
+
+	sub_saved_nei.resize(6*ntrees);
+	sub_saved_it.resize(6*ntrees);
+
+	/*---------------------------------------------------------
+	 * For Restoring: saving branch lengths on SubTrees
+	 *---------------------------------------------------------*/
+	/* NO_EPS:  one/five branches need to be restored in nni1/nni5 cases respectively,
+	 * 			but these branches are saved in saved_nei->link_neighbors, we won't store them again
+	 * ONE_EPS: three branches need to be restored (stick to ids: 0,...,5)
+	 * TWO_EPS: the image of central branch needs to be restored (the id for restoring [6*part+0])
+	 * THREE_EPS: one branch needs to be restored: which the central is relinked to after NNI (the id for restoring [6*part+0])
+	 * MANY_EPS: nothing to be restored
+	 */
+	double *sub_saved_branch = new double[6*ntrees];
+
+	/* ---------------------------------------------------------
+	 * For Restoring: saving current likelihoods for SubTree
+	 * ---------------------------------------------------------*/
+	double *saved_cur_score = new double[part_info.size()];
+	for (i = 0; i < part_info.size(); i++)
+		saved_cur_score[i] = part_info[i].cur_score;
+
+	/* -------------------------------------------------------------------------------------------------------------------
+	 * Allocate new PhyloNeighbors:
+	 * NO_EPS:  2 or 6 for nni1 and nni5 respectively; update link_neighbors for corresponding SuperNeighbors.
+	 * ONE_EPS: 1 or 3 for nni1 and nni5 respectively; update link_neighbors for corresponding SuperNeighbors LATER
+	 * 			(since it depends on particular NNI).
+	 * -------------------------------------------------------------------------------------------------------------------*/
+
+	// Auxiliary variables: we allocate new PhyloNeighbor for [node_link->findNeighbor(nei_link)]
+	Node *node_link, *nei_link;
+	SuperNeighbor *nei;
+
+	// For ONE_epsilon case: saves "id" of the neighbors that have an empty image
+	int id_eps[part];
+
+	for(part = 0; part < ntrees; part++){
+		if(is_nni[part]==NNI_NO_EPSILON){
+			//evalNNIs++;
+			//part_info[part].evalNNIs++;
+
+			// one branch optimization ------------------------------------------------------------------
+			for(id = 0; id < 2; id++){
+				/*
+					for id=0, nei_link  = node1->find(node2)->link->node = node2_link
+				 	for id=0, node_link = node2->find(node1)->link->node = node1_link
+					for id=0, saving iterator of neighbor of node1_link, that is node2_link;
+						 	  then on this place we'll create a new PhyloNei
+				*/
+
+				nei_link  = saved_nei[id]->link_neighbors[part]->node;
+				node_link = saved_nei[1-id]->link_neighbors[part]->node;
+				sub_saved_it[part*6 + id] = node_link->findNeighborIt(nei_link);
+
+				// Create a new PhyloNeighbor, with new partial lhs, scale number and set the branch id as before
+				*sub_saved_it[part*6 + id] = new PhyloNeighbor(nei_link, saved_nei[id]->link_neighbors[part]->length);
+				((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->partial_lh = at(part)->newPartialLh();
+				((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->scale_num = at(part)->newScaleNum();
+				(*sub_saved_it[part*6 + id])->id = saved_nei[id]->link_neighbors[part]->id;
+
+				// update link_neighbor[part]: for New SuperNeighbor we set the corresponding new PhyloNeighbor on partition part
+				((SuperNeighbor*)*saved_it[id])->link_neighbors[part] = (PhyloNeighbor*)*sub_saved_it[part*6 + id];
+			}
+
+			// optimization on 5 branches ------------------------------------------------------------------
+			if(params->nni5){
+				for(id = 2; id < 6; id ++){
+					nei_link = saved_nei[id]->link_neighbors[part]->node;
+					node_link = ((SuperNeighbor*)(*node_nei_it[id-2]))->link_neighbors[part]->node;
+					sub_saved_it[part*6 + id] = node_link->findNeighborIt(nei_link);
+					*sub_saved_it[part*6 + id] = new PhyloNeighbor(nei_link, saved_nei[id]->link_neighbors[part]->length);
+					((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->partial_lh = at(part)->newPartialLh();
+					((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->scale_num = at(part)->newScaleNum();
+					(*sub_saved_it[part*6 + id])->id = saved_nei[id]->link_neighbors[part]->id;
+
+					// update link_neighbor[part]
+					((SuperNeighbor*)*saved_it[id])->link_neighbors[part] = (PhyloNeighbor*)*sub_saved_it[part*6 + id];
+				}
+			}
+
+		} else if(is_nni[part]==NNI_ONE_EPSILON){
+
+			// Make sure to update all the necessary link_neighbors and take care of branch lengths
+			// (increase/decrease by central branch where necessary).
+
+			nei1_new->link_neighbors[part] = saved_nei[0]->link_neighbors[part];
+			nei2_new->link_neighbors[part] = saved_nei[1]->link_neighbors[part];
+
+			// Change the length of branch, (node1,node2) WAS linked to (-=)
+			nei1_new->link_neighbors[part]->length -= old_brlen * part_info[part].part_rate;
+			nei2_new->link_neighbors[part]->length -= old_brlen * part_info[part].part_rate;
+			assert(nei1_new->link_neighbors[part]->length >= 0.0);
+
+			// Allocate three new PhyloNeighbors.
+			// For nni1 only one of it will be actually used and which one depends on the NNI.
+
+			// We have this if condition, since saved_nei will be newly allocated neis in nni5 case,
+			// while saved_it are the actual neighbors and we don't want to mess them up
+			for(id = 2; id < 6; id++){
+				if(params->nni5){
+					nei = saved_nei[id];
+				} else {
+					nei = (SuperNeighbor*)(*saved_it[id]);
+				}
+				if(nei->link_neighbors[part]){
+					// nei_link is either node1 or node2 on SubTrees
+					nei_link = nei->link_neighbors[part]->node;
+					// node_link are nodes neighbors of node1 and node2 on SubTrees
+					node_link = ((SuperNeighbor*)(*node_nei_it[id-2]))->link_neighbors[part]->node;
+					sub_saved_it[part*6 + id] = node_link->findNeighborIt(nei_link);
+
+					// Saving branch lengths
+					sub_saved_branch[6*part + id] = nei->link_neighbors[part]->length;
+
+					*sub_saved_it[part*6 + id] = new PhyloNeighbor(nei_link, nei->link_neighbors[part]->length);
+					((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->partial_lh = at(part)->newPartialLh();
+					((PhyloNeighbor*) (*sub_saved_it[part*6 + id]))->scale_num = at(part)->newScaleNum();
+					(*sub_saved_it[part*6 + id])->id = nei->link_neighbors[part]->id;
+
+					// If nni5 we update the link neighbors already here, otherwise
+					// they will be updated for each NNI within the loop.
+					if(params->nni5){
+						((SuperNeighbor*)*saved_it[id])->link_neighbors[part] = (PhyloNeighbor*)*sub_saved_it[part*6 + id];
+					}
+					//cout<<"saved_it["<<id<<"]; neighbor->node->id = "<<(*sub_saved_it[part*6 + id])->node->id<<endl;
+					//cout<<"saved_it["<<id<<"];           node->id = "<<node_link->id<<endl;
+				} else {
+					id_eps[part] = id;
+				}
+			}
+		}else if(is_nni[part]==NNI_THREE_EPSILON && params->nni5){
+			// you fill out link neighbors vector for newly allocated SuperNeighbors
+			for(id = 2; id < 6; id++){
+				if(saved_nei[id]->link_neighbors[part]){
+					((SuperNeighbor*)*saved_it[id])->link_neighbors[part] = saved_nei[id]->link_neighbors[part];
+				}
+			}
+		}else if(is_nni[part]==NNI_TWO_EPSILON && params->nni5){
+			// you fill out link neighbors vector for newly allocated SuperNeighbors
+			for(id = 2; id < 6; id++){
+				if(saved_nei[id]->link_neighbors[part]){
+					((SuperNeighbor*)*saved_it[id])->link_neighbors[part] = saved_nei[id]->link_neighbors[part];
+				}
+			}
+		}
+	}
+
+	/* -------------------------------------------------------------------
+	 * Variables to store the information about which nodes/neighbors
+	 * to be swapped on SubTrees for the corresponding NNI on SuperTree
+	 *
+	 * node1 -> node1_link[part]
+	 * node2 -> node2_link[part]
+	 * node1_nei -> node1_link_nei[part]
+	 * node2_nei -> node2_link_nei[part]
+	 * -------------------------------------------------------------------*/
+	vector<PhyloNode*> node1_link,node2_link;
+	vector<PhyloNeighbor*> node1_link_nei,node2_link_nei;
+	vector<NeighborVec::iterator> node1_link_it, node2_link_it;
+
+	// Nodes which correspond to node1 and node2 on partitions
+	node1_link.resize(ntrees);
+	node2_link.resize(ntrees);
+	// Neighbors of node1_link and node2_link to be swapped during NNI
+	node1_link_nei.resize(ntrees);
+	node2_link_nei.resize(ntrees);
+	// iterators for the neighbors of node1_link and node2_link to be swapped
+	node1_link_it.resize(ntrees);
+	node2_link_it.resize(ntrees);
+
+	/*===========================================================================================
+	 * 	MAIN:
+	 * 	- do the NNI swap on SuperTree and perform the corresponding actions on SubTrees;
+	 *	- compute the likelihood of swapped topology;
+	 *  - swap back;
+	 *	- restore if necessary.
+	 *===========================================================================================*/
+	int cnt;
+	for (cnt = 0; cnt < 2; cnt++) {
+		//cout<<"NNI Loop-----------------------------NNI."<<cnt<<endl;
+
+    	NeighborVec::iterator node1_it = nniMoves[cnt].node1Nei_it;
+    	NeighborVec::iterator node2_it = nniMoves[cnt].node2Nei_it;
+        Neighbor *node1_nei = *node1_it;
+        Neighbor *node2_nei = *node2_it;
+
+		//node2_it = node2_its[cnt];
+		//Neighbor *node2_nei = *node2_it;
+
+		// Define which nodes/neighbors to be swapped on SubTree ----------------------------
+		for(part=0; part<ntrees; part++)
+			if(is_nni[part]==NNI_NO_EPSILON){
+				node1_link[part] = (PhyloNode*) nei2_new->link_neighbors[part]->node;
+				node2_link[part] = (PhyloNode*) nei1_new->link_neighbors[part]->node;
+				node1_link_nei[part] = ((SuperNeighbor*)node1_nei)->link_neighbors[part];
+				node1_link_it[part] = node1_link[part]->findNeighborIt(node1_link_nei[part]->node);
+				node2_link_nei[part] = ((SuperNeighbor*)node2_nei)->link_neighbors[part];
+				node2_link_it[part] = node2_link[part]->findNeighborIt(node2_link_nei[part]->node);
+			}
+
+		// Do the NNI swap on SuperTrees ----------------------------------------------------
+		node1->updateNeighbor(node1_it, node2_nei);
+		node2_nei->node->updateNeighbor(node2, node1);
+		node2->updateNeighbor(node2_it, node1_nei);
+		node1_nei->node->updateNeighbor(node1, node2);
+
+		// Perform actions in accordance with the type of NNI for a given partition ---------
+		for(part = 0; part < ntrees; part++){
+			//cout<<"Partition: "<<part<<endl;
+
+			if(is_nni[part]==NNI_NO_EPSILON){
+				//cout<<part<<"- NO_EPS: do NNI swap"<<endl;
+				//allNNIcases_computed[0] += 1;
+
+				// Do NNI swap on partition
+				node1_link[part]->updateNeighbor(node1_link_it[part], node2_link_nei[part]);
+				node2_link_nei[part]->node->updateNeighbor(node2_link[part], node1_link[part]);
+				node2_link[part]->updateNeighbor(node2_link_it[part], node1_link_nei[part]);
+				node1_link_nei[part]->node->updateNeighbor(node1_link[part], node2_link[part]);
+
+				for(id=0; id<IT_NUM; id++){
+					((PhyloNeighbor*)(*sub_saved_it[part*6+id]))->clearPartialLh();
+				}
+				//checkBranchLen();
+			} else if(is_nni[part]==NNI_MANY_EPSILON){
+				//cout<<part<<"- MANY_EPS: do nothing"<<endl;
+				// the NNI on SuperTree does not change anything on SubTree
+
+			} else if(is_nni[part]==NNI_THREE_EPSILON){
+				//cout<<part<<"- THREE_EPS: relink"<<endl;
+
+				// The central branch had no image before the NNI.
+				// Relink the central branch and take care of branch lengths.
+				// In the end restore one branch (valid for both nni1 and nni5).
+
+				linkBranch(part, nei1_new, nei2_new);
+				assert(nei1_new->link_neighbors[part]);
+
+				// Save the branch length
+				if(cnt == 0)
+					sub_saved_branch[6*part] = nei1_new->link_neighbors[part]->length;
+
+				nei1_new->link_neighbors[part]->length += old_brlen * part_info[part].part_rate;
+				nei2_new->link_neighbors[part]->length += old_brlen * part_info[part].part_rate;
+
+				// since the branch length was changed we have to recompute the likelihood of the branch
+				part_info[part].cur_score = at(part)->computeLikelihoodBranch(nei1_new->link_neighbors[part],
+						(PhyloNode*)nei2_new->link_neighbors[part]->node);
+
+			}else if(is_nni[part]==NNI_TWO_EPSILON){
+				//cout<<part<<"- TWO_EPS: relink"<<endl;
+
+				/* In fact, before relinking the image of central branch is NULL (because we allocated
+				 * new SuperNeighbor and filled the link_neighbors with NULL for all partitions).
+				 * After relinking it can be either NULL or it should relink to the same branch as before.
+				 * In the end restore one branch (valid for both nni1 and nni5).*/
+
+				// Save the branch length
+				if(cnt == 0)
+					sub_saved_branch[6*part] = saved_nei[0]->link_neighbors[part]->length;
+
+				linkBranch(part, nei1_new, nei2_new);
+				if(!nei1_new->link_neighbors[part]){
+					saved_nei[0]->link_neighbors[part]->length -= old_brlen * part_info[part].part_rate;
+					saved_nei[1]->link_neighbors[part]->length -= old_brlen * part_info[part].part_rate;
+					part_info[part].cur_score = at(part)->computeLikelihoodBranch(saved_nei[0]->link_neighbors[part],
+							(PhyloNode*)saved_nei[1]->link_neighbors[part]->node);
+				}
+
+			}else if(is_nni[part] == NNI_ONE_EPSILON){
+				//cout<<part<<"- ONE_EPS: relink, update the link_neighbors"<<endl;
+
+				/* The crazy case, which absorbs most of the bugs:(
+				 * Lets say on SuperTree there are five branches, a,b,c,d and central e, and d has an empty image.
+				 * The corresponding SubTree has 3 branches, a',b',c'.
+				 * Before NNI central branch, e, has an image. Lets say it maps to a'.
+				 * After NNI it will be remapped either to b' or c', depending on which nodes will be swapped.
+				 * Update the corresponding link_neighbors. Make sure that link_neighbors for central branch e
+				 * and for the one it is now mapped to (b' or c'), are the same.
+				 * Decrease a' (done before). Increase b' or c' depending on the NNI. Restore three branches.*/
+
+				linkBranch(part, nei1_new, nei2_new);
+				assert(nei1_new->link_neighbors[part]);
+
+				//cout<<"nei1_new->link_nei["<<part<<"]->node->id"<<nei1_new->link_neighbors[part]->node->id<<endl;
+				//cout<<"nei2_new->link_nei["<<part<<"]->node->id"<<nei2_new->link_neighbors[part]->node->id<<endl;
+
+				assert(nei1_new->link_neighbors[part]->node->findNeighbor(nei2_new->link_neighbors[part]->node));
+				assert(nei2_new->link_neighbors[part]->node->findNeighbor(nei1_new->link_neighbors[part]->node));
+
+				// nni1:
+				// - you need to update only one link_neighbor with new PhyloNeighbor
+				//	 (either node1->findNeighbor(node2) or node2->findNeighbor(node1))
+				// - the second is already linked to some existing PhyloNeighbor after linkBranch().
+				for(id=2; id<6; id++){
+					if(node2->isNeighbor(neighbor_nodes[id-2])){
+						// nei2_new should be updated
+						if(((SuperNeighbor*)node2->findNeighbor(neighbor_nodes[id-2]))->link_neighbors[part]){
+							//cout<<"node2: "<<"id = "<<id<<"; node_id = "<<((SuperNeighbor*)(node2->findNeighbor(neighbor_nodes[id-2])))->link_neighbors[part]->node->id<<";"<<endl;
+							if(((SuperNeighbor*)node2->findNeighbor(neighbor_nodes[id-2]))->link_neighbors[part]->node
+									== nei1_new->link_neighbors[part]->node){
+								//assert(((SuperNeighbor*)node2->findNeighbor(neighbor_nodes[id-2]))->link_neighbors[part]->node->id
+									//	== (*sub_saved_it[part*6 + id])->node->id);
+								nei2_new->link_neighbors[part] = (PhyloNeighbor*)(*sub_saved_it[part*6 + id]);
+								//cout<<"   nei id = "<<id<<"; node_id = "<<((SuperNeighbor*)(*node_nei_it[id-2]))->link_neighbors[part]->node->id<<";"<<endl;
+								//cout<<"   sub "<<"id = "<<id<<"; node_id = "<<(*sub_saved_it[part*6 + id])->node->id<<";"<<endl;
+								break;
+							}
+						}
+					} else {
+						// nei1_new should be updated
+						assert(node1->isNeighbor(neighbor_nodes[id-2]));
+						if(((SuperNeighbor*)node1->findNeighbor(neighbor_nodes[id-2]))->link_neighbors[part]){
+							//cout<<"node1: "<<"id = "<<id<<"; node_id = "<<((SuperNeighbor*)(node1->findNeighbor(neighbor_nodes[id-2])))->link_neighbors[part]->node->id<<";"<<endl;
+							if(((SuperNeighbor*)node1->findNeighbor(neighbor_nodes[id-2]))->link_neighbors[part]->node
+									== nei2_new->link_neighbors[part]->node){
+								//assert(((SuperNeighbor*)node1->findNeighbor(neighbor_nodes[id-2]))->link_neighbors[part]->node->id
+									//	== (*sub_saved_it[part*6 + id])->node->id);
+								nei1_new->link_neighbors[part] = (PhyloNeighbor*)(*sub_saved_it[part*6 + id]);
+								//cout<<"   nei id = "<<id<<"; node_id = "<<((SuperNeighbor*)(*node_nei_it[id-2]))->link_neighbors[part]->node->id<<";"<<endl;
+								//cout<<"   sub "<<"id = "<<id<<"; node_id = "<<(*sub_saved_it[part*6 + id])->node->id<<";"<<endl;
+								break;
+							}
+						}
+					}
+				}
+
+				// Clear partial likelihoods for all three neighbors nei1/2->find(node1/2)
+				if(params->nni5 && cnt == 1){
+					for(id=2; id<6; id++){
+						if(id != id_eps[part]){
+							((PhyloNeighbor*)(*sub_saved_it[part*6 + id]))->clearPartialLh();
+						}
+					}
+				}
+				//cout<<"nei1_new->link_nei["<<part<<"]->node->id"<<nei1_new->link_neighbors[part]->node->id<<endl;
+				//cout<<"nei2_new->link_nei["<<part<<"]->node->id"<<nei2_new->link_neighbors[part]->node->id<<endl;
+
+				assert(nei1_new->link_neighbors[part]->node->findNeighbor(nei2_new->link_neighbors[part]->node));
+				assert(nei2_new->link_neighbors[part]->node->findNeighbor(nei1_new->link_neighbors[part]->node));
+
+				// Increase the branch to which the central is relinked.
+				nei1_new->link_neighbors[part]->length += old_brlen * part_info[part].part_rate;
+				nei2_new->link_neighbors[part]->length += old_brlen * part_info[part].part_rate;
+
+			} // end of else ONE_EPS case
+		} // end of part loop
+
+/*===============================================================================================================================*
+ * 											Compute the score of the swapped topology 				  							 *
+ *===============================================================================================================================*/
+		//cout<<"Before optimization"<<endl;
+		//mapBranchLen();
+		//checkBranchLen();
+
+		optimizeOneBranch(node1, node2, false, NNI_MAX_NR_STEP);
+//		double score = computeLikelihoodFromBuffer();
+		nniMoves[cnt].newLen[0] = node1->findNeighbor(node2)->length;
+
+//		if (verbose_mode >= VB_MED) {
+//			cout << "After_nni1 [" << score << "] ";
+//			printTree(cout);
+//			cout << endl;
+//    		//for(part = 0; part < ntrees; part++)
+//    		//	cout << is_nni[part] << " ";
+//    		//cout << endl;
+//    		//cout<<"NNI count = "<<cnt<<endl;
+//		}
+		//cout<<"After optimization"<<endl;
+		//checkBranchLen();
+
+		// %%%%%%%%%%%%%%%%%%%%%%%%  FIVE BRANCH OPTIMIZATION  %%%%%%%%%%%%%%%%%%%%%%%%
+		i=1;
+	    if (params->nni5) {
+
+	    	// ------ Optimization of branches incident to node1 ---------------
+	    	FOR_NEIGHBOR(node1, node2, it){
+	    		// Clear the partial likelihood of node1 neighbor: only for NO or ONE epsilon cases
+	    		for(part = 0; part < ntrees; part++)
+	    			if(((SuperNeighbor*)(*it))->link_neighbors[part] && (is_nni[part]==NNI_NO_EPSILON || is_nni[part]==NNI_ONE_EPSILON)){
+	    				node_link = ((SuperNeighbor*)(*it))->link_neighbors[part]->node;
+	    				nei_link  = nei2_new->link_neighbors[part]->node; // this should be node 1 on subtree
+	    				// the problem is that for ONE_epsilon case node1 on subtree is equal to its neighbor node on subtree
+	    				// in this case we have to set nei_link to node2 on subtree
+	    				if(node_link->id == nei_link->id){
+	    					nei_link = nei1_new->link_neighbors[part]->node;
+	    				}
+	    				//cout<<"HERE it is: "<<((SuperNeighbor*)(*it))->link_neighbors[part]->node->id<<endl;
+	    				//cout<<nei2_new->link_neighbors[part]->node->id<<endl;
+						((PhyloNeighbor*)node_link->findNeighbor(nei_link))->clearPartialLh();
+						//cout<<"CASE:"<<is_nni[part]<<"Cleared partial likelihood"<<endl;
+	    			}
+	    		// Optimize the branch incident to node1
+	    		//cout<<"NNI5 : node1 : Before optimization"<<endl;
+	    		//checkBranchLen();
+	    		optimizeOneBranch(node1, (PhyloNode*) (*it)->node, false, NNI_MAX_NR_STEP);
+				nniMoves[cnt].newLen[i] = node1->findNeighbor((*it)->node)->length;
+				i++;
+
+
+	    		//cout<<"NNI5 : node1 : After optimization"<<endl;
+	    		//checkBranchLen();
+	    	}
+
+	    	// ------ Clear the partial likelihood on the central branch -------
+	    	for(part = 0; part < ntrees; part++)
+	    		if(((SuperNeighbor*)node2->findNeighbor(node1))->link_neighbors[part] && (is_nni[part]==NNI_NO_EPSILON || is_nni[part]==NNI_ONE_EPSILON)){
+	    			((SuperNeighbor*)node2->findNeighbor(node1))->link_neighbors[part]->clearPartialLh();
+	    		}
+
+	    	// ------ Optimization of branches incident to node2 ---------------
+	    	FOR_NEIGHBOR(node2, node1, it){
+	    		// Clear the partial likelihood of node2 neighbor: only for NO or ONE epsilon cases
+	    		for(part = 0; part < ntrees; part++){
+	    			if(((SuperNeighbor*)(*it))->link_neighbors[part] && (is_nni[part]==NNI_NO_EPSILON || is_nni[part]==NNI_ONE_EPSILON)){
+	    				node_link = ((SuperNeighbor*)(*it))->link_neighbors[part]->node;
+	    				nei_link  = nei1_new->link_neighbors[part]->node;
+	    				if(node_link->id == nei_link->id){
+	    					nei_link = nei2_new->link_neighbors[part]->node;
+	    				}
+	    				((PhyloNeighbor*)node_link->findNeighbor(nei_link))->clearPartialLh();
+	    				//cout<<"CASE:"<<is_nni[part]<<"Cleared partial likelihood"<<endl;
+	    			}
+	    		}
+	    		// Optimize the branch incident to node2
+	    		optimizeOneBranch(node2, (PhyloNode*) (*it)->node, false, NNI_MAX_NR_STEP);
+				nniMoves[cnt].newLen[i] = node2->findNeighbor((*it)->node)->length;
+				i++;
+	    	}
+	    }
+
+		double score = computeLikelihoodFromBuffer();
+		if (verbose_mode >= VB_DEBUG)
+			cout << "Log-likelihood: " << score << endl;
+
+		// %%%%%%%%%%%%%%%%%%%%%%%%%%%  END of nni5branch  %%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+		nniMoves[cnt].newloglh = score;
+
+//	    if (verbose_mode >= VB_MED) {
+//			//this->clearAllPartialLH();
+//			//for(part = 0; part<ntrees; part++){
+//			//	at(part)->clearAllPartialLH();
+//			//}
+//			//cout << "[" << this->computeLikelihood() << "] ";
+//			cout << "After_nni5 " << score << " ";
+//			printTree(cout);
+//			cout << endl;
+//			for(part = 0; part < ntrees; part++)
+//				cout << is_nni[part] << " ";
+//			cout << endl;
+//		}
+
+		// FOR SH-aLRT test
+		if (nniMoves[cnt].ptnlh)
+			computePatternLikelihood(nniMoves[cnt].ptnlh, &score);
+
+	    // Save current tree for ufboot analysis
+	    if (save_all_trees == 2) {
+	    		saveCurrentTree(score);
+	    }
+
+//	    // *************************** STORE INFO ABOUT NNI ***************************
+//
+//	    // Store information about this NNI for NNImove for SuperTree
+//		if (nni_param) {
+////			if (verbose_mode >= VB_MAX)
+////				printTree(cout, WT_BR_LEN + WT_NEWLINE);
+//			if (cnt == 0) {
+//				nni_param->nni1_score = score;
+//				nni_param->nni1_brlen = nei1_new->length;
+//			} else {
+//				nni_param->nni2_score = score;
+//				nni_param->nni2_brlen = nei1_new->length;
+//			}
+//		}
+//		// ***************************************************************************
+
+		// =============================== RESTORE INFO ==============================
+		// Restore the cur_score for partitions
+		for (i = 0; i < part_info.size(); i++)
+			part_info[i].cur_score = saved_cur_score[i];
+
+		//Restoring branch length on Super Tree
+		nei1_new->length = old_brlen;
+		nei2_new->length = old_brlen;
+
+// Swap back on SuperTree --------------------------------------------------------------------------------------------------------------
+		node1->updateNeighbor(node1_it, node1_nei);
+		node1_nei->node->updateNeighbor(node2, node1);
+		node2->updateNeighbor(node2_it, node2_nei);
+		node2_nei->node->updateNeighbor(node1, node2);
+
+		//Restoring 4 branches around central?
+		if(params->nni5){
+			for(id=2;id<6;id++){
+				(*saved_it[id])->length = saved_nei[id]->length;
+				(*node_nei_it[id-2])->length = saved_nei[id]->length;
+			}
+		}
+
+// Swap back or relink back on SubTrees------------------------------------------------------------------------------------------------
+		for(part = 0; part < ntrees; part++){
+
+			if(is_nni[part]==NNI_NO_EPSILON){
+				node1_link[part]->updateNeighbor(node1_link_it[part], node1_link_nei[part]);
+				node1_link_nei[part]->node->updateNeighbor(node2_link[part], node1_link[part]);
+				node2_link[part]->updateNeighbor(node2_link_it[part], node2_link_nei[part]);
+				node2_link_nei[part]->node->updateNeighbor(node1_link[part], node2_link[part]);
+
+				//Restoring the branch length on the SubTree
+				node1_link[part]->findNeighbor(node2_link[part])->length = saved_nei[0]->link_neighbors[part]->length;
+				node2_link[part]->findNeighbor(node1_link[part])->length = saved_nei[0]->link_neighbors[part]->length;
+
+				if(params->nni5){
+					for(id = 2; id < 6; id++){
+						((SuperNeighbor*)(*saved_it[id]))->link_neighbors[part]->length = saved_nei[id]->link_neighbors[part]->length;
+						((SuperNeighbor*)(*node_nei_it[id-2]))->link_neighbors[part]->length = saved_nei[id]->link_neighbors[part]->length;
+					}
+				}
+				//mapBranchLen();
+
+			} else if(is_nni[part]==NNI_ONE_EPSILON){
+				//linkCheckRe(part,node1,node2,sub_saved_nei2[part],sub_saved_nei1[part]);
+				//linkCheckRe(part,node2,node1,sub_saved_nei1[part],sub_saved_nei2[part]);
+
+				// Relink back
+				linkBranch(part, nei1_new, nei2_new);
+				assert(nei1_new->link_neighbors[part]->node == saved_nei[0]->link_neighbors[part]->node);
+				assert(nei2_new->link_neighbors[part]->node == saved_nei[1]->link_neighbors[part]->node);
+
+				// Restore three branches
+				for(id=2; id<6; id++){
+					if(((SuperNeighbor*)*saved_it[id])->link_neighbors[part]){
+						(*sub_saved_it[part*6+id])->length = sub_saved_branch[6*part + id];
+						((SuperNeighbor*)*saved_it[id])->link_neighbors[part]->length = sub_saved_branch[6*part + id];
+						((SuperNeighbor*)*node_nei_it[id-2])->link_neighbors[part]->length = sub_saved_branch[6*part + id];
+					}
+				}
+
+			} else if(is_nni[part]==NNI_THREE_EPSILON){
+				nei1_new->link_neighbors[part]->length = sub_saved_branch[6*part];
+				nei2_new->link_neighbors[part]->length = sub_saved_branch[6*part];
+				//linkBranch(part, nei1_new, nei2_new);
+			} else if(is_nni[part]==NNI_TWO_EPSILON){
+				//linkBranch(part, nei1_new, nei2_new);
+				saved_nei[0]->link_neighbors[part]->length = sub_saved_branch[6*part];
+				saved_nei[1]->link_neighbors[part]->length = sub_saved_branch[6*part];
+				nei1_new->link_neighbors[part] = NULL;
+				nei2_new->link_neighbors[part] = NULL;
+			} else if(is_nni[part]==NNI_MANY_EPSILON){
+				// There is no need to restore anything
+			}
+		}
+
+		//cout<<"in NNI1end ---- logL = "<<this->computeLikelihood()<<endl;
+	} // end of for(cnt)
+
+
+//=============================================================================================================================================================
+// 							Restoring after 2 NNIs
+//=============================================================================================================================================================
+// Restoring information for SuperTree ------------------------------------------------------------------------------------------------------------
+	// restore the Neighbors*
+	for (id = IT_NUM-1; id >= 0; id--) {
+		if (*saved_it[id] == current_it) current_it = (SuperNeighbor*) saved_nei[id];
+		if (*saved_it[id] == current_it_back) current_it_back = (SuperNeighbor*) saved_nei[id];
+
+		delete (*saved_it[id]);
+		(*saved_it[id]) = saved_nei[id];
+	 }
+	// restore the length of 4 branches around node1, node2
+	// since you have restored the neighbors and by this also the correct branch lengths,
+	// now just restore branch lengths of the second corresponding neighbors
+	FOR_NEIGHBOR(node1, node2, it)
+		(*it)->length = (*it)->node->findNeighbor(node1)->length;
+	FOR_NEIGHBOR(node2, node1, it)
+		(*it)->length = (*it)->node->findNeighbor(node2)->length;
+
+// Restoring information for SubTrees ------------------------------------------------------------------------------------------------------------
+		for(part = 0; part < ntrees; part++){
+			if(is_nni[part] == NNI_NO_EPSILON){
+				// restore the Neighbors*
+				for (i = IT_NUM-1; i >= 0; i--) {
+					if((*sub_saved_it[part*6+i])){
+						aligned_free(((PhyloNeighbor*) *sub_saved_it[part*6+i])->scale_num);
+						aligned_free(((PhyloNeighbor*) *sub_saved_it[part*6+i])->partial_lh);
+						if (*sub_saved_it[part*6+i] == at(part)->current_it) at(part)->current_it = saved_nei[i]->link_neighbors[part];
+						if (*sub_saved_it[part*6+i] == at(part)->current_it_back) at(part)->current_it_back = saved_nei[i]->link_neighbors[part];
+
+						delete (*sub_saved_it[part*6+i]);
+						(*sub_saved_it[part*6+i]) = saved_nei[i]->link_neighbors[part];
+					}
+				}
+				// restore the length of 4 branches around node1_link[part], node2_link[part]
+				node1_link[part] = (PhyloNode*)(saved_nei[1]->link_neighbors[part]->node);
+				node2_link[part] = (PhyloNode*)(saved_nei[0]->link_neighbors[part]->node);
+				FOR_NEIGHBOR(node1_link[part], node2_link[part], it)
+					(*it)->length = (*it)->node->findNeighbor(node1_link[part])->length;
+				FOR_NEIGHBOR(node2_link[part], node1_link[part], it)
+					(*it)->length = (*it)->node->findNeighbor(node2_link[part])->length;
+
+			} else if(is_nni[part] == NNI_ONE_EPSILON){
+
+				// Delete the allocated neighbors and restore from saved neighbors
+				for (id = 5; id >= 2; id--) {
+					//if((*sub_saved_it[part*6+id])){
+					if(((SuperNeighbor*)(*node_nei_it[id-2]))->link_neighbors[part]){
+						aligned_free(((PhyloNeighbor*) *sub_saved_it[part*6+id])->scale_num);
+						aligned_free(((PhyloNeighbor*) *sub_saved_it[part*6+id])->partial_lh);
+
+						// It was commented, not sure why.. Just keep in mind------------------
+						if (*sub_saved_it[part*6+id] == at(part)->current_it)
+							at(part)->current_it = saved_nei[id]->link_neighbors[part];
+						if (*sub_saved_it[part*6+id] == at(part)->current_it_back)
+							at(part)->current_it_back = saved_nei[id]->link_neighbors[part];
+						//---------------------------------------------------------------------
+
+						delete (*sub_saved_it[part*6+id]);
+						(*sub_saved_it[part*6+id]) = ((SuperNeighbor*)(*saved_it[id]))->link_neighbors[part];
+					}
+				}
+				// Increase the central branch, since the length that was saved, was decreased
+				saved_nei[0]->link_neighbors[part]->length += old_brlen * part_info[part].part_rate;
+				saved_nei[1]->link_neighbors[part]->length += old_brlen * part_info[part].part_rate;
+			}
+		}
+		//mapBranchLen();
+		//cout<<"In the end of swap NNI"<<endl;
+		//checkBranchLen();
+//------------------------------------------------------------------------------------------------------------------------------------------------
+	//if(score_mine != this->computeLikelihood())
+	//	cout<<"Something weird happens during NNI evaluation..." << score_mine << " " << computeLikelihood() <<endl;
+
+	delete [] saved_cur_score;
+	delete [] sub_saved_branch;
+	return cur_score;
+}
+
+void PhyloSuperTreePlen::linkCheck(int part,Node* node, Node* dad, PhyloNeighbor* saved_link_dad_nei){ // beyond branch (dad,node): re-point every branch of partition `part` still mapped to saved_link_dad_nei
+	SuperNeighbor *from_dad = (SuperNeighbor*)dad->findNeighbor(node), *from_node = (SuperNeighbor*)node->findNeighbor(dad);
+	NeighborVec::iterator child;
+	FOR_NEIGHBOR(node, dad, child){
+		SuperNeighbor *outward = (SuperNeighbor*)(*child);
+		if(outward->link_neighbors[part] == saved_link_dad_nei){ // still mapped to the saved image
+			outward->link_neighbors[part] = from_dad->link_neighbors[part]; // same image as direction dad -> node
+			((SuperNeighbor*)((*child)->node->findNeighbor(node)))->link_neighbors[part] = from_node->link_neighbors[part]; // reverse direction
+			linkCheck(part, (*child)->node, node, saved_link_dad_nei); // continue outwards from the child
+		}
+	}
+}
+
+void PhyloSuperTreePlen::linkCheckRe(int part,Node* node, Node* dad, PhyloNeighbor* saved_link_dad_nei,PhyloNeighbor* saved_link_node_nei){ // beyond branch (dad,node): set link_neighbors[part] to the saved pair on every branch sharing the image of (dad,node)
+	NeighborVec::iterator it;
+	FOR_NEIGHBOR(node, dad, it){ // presumably visits every neighbor of node except dad -- confirm against the macro definition
+		if(((SuperNeighbor*)(*it))->link_neighbors[part] == ((SuperNeighbor*)dad->findNeighbor(node))->link_neighbors[part]){ // same image in `part` as branch (dad,node)
+			linkCheckRe(part, (*it)->node, node, saved_link_dad_nei, saved_link_node_nei); // recurse BEFORE overwriting: the callee compares against node->findNeighbor(child), presumably the entry *it changed below
+			((SuperNeighbor*)(*it))->link_neighbors[part] = saved_link_dad_nei; // direction node -> child
+			((SuperNeighbor*)((*it)->node->findNeighbor(node)))->link_neighbors[part] = saved_link_node_nei; // direction child -> node
+		}
+	}
+}
+void PhyloSuperTreePlen::restoreAllBrans(PhyloNode *node, PhyloNode *dad) { // restore through the IQTree implementation, then re-run mapTrees()
+	IQTree::restoreAllBrans(node,dad);
+	mapTrees(); // per its header doc: creates the sub-trees and maps supertree edges onto subtree edges
+}
+
+bool PhyloSuperTreePlen::checkBranchLen(){
+	// NOTE: the whole consistency check below is commented out, so this function currently always returns true.
+//	NodeVector nodes1,nodes2;
+//	int i;
+//	getBranches(nodes1, nodes2);
+//	double *checkVAL = new double[branchNum];
+//	for(int part = 0; part < size(); part++){
+//		memset(checkVAL, 0, at(part)->branchNum*sizeof(double));
+//		for (i = 0; i < nodes1.size(); i++){
+//			if(((SuperNeighbor*)nodes1[i]->findNeighbor(nodes2[i]))->link_neighbors[part])
+//				checkVAL[((SuperNeighbor*)nodes1[i]->findNeighbor(nodes2[i]))->link_neighbors[part]->id] += nodes1[i]->findNeighbor(nodes2[i])->length * part_info[part].part_rate;
+//		}
+//		NodeVector nodes1_sub, nodes2_sub;
+//		at(part)->getBranches(nodes1_sub, nodes2_sub);
+//		for(int j = 0; j<nodes1_sub.size();j++)
+//			if(fabs(nodes1_sub[j]->findNeighbor(nodes2_sub[j])->length-checkVAL[nodes1_sub[j]->findNeighbor(nodes2_sub[j])->id])>0.0001){
+//				//drawTree(cout, WT_BR_SCALE + WT_INT_NODE + WT_BR_LEN);
+//				printMapInfo();
+//				cout<<endl;
+//				cout<<"Partition = "<<part<<", Branch id = "<<nodes1_sub[j]->findNeighbor(nodes2_sub[j])->id<<endl;
+//				outError("Branches on SuperTree and SubTree do not match!!",__func__);
+//			}
+//
+//	}
+//	delete [] checkVAL;
+	// (disabled check: it would compare each subtree branch length with the rate-scaled sum of its mapped supertree branches, tolerance 0.0001)
+	return true;
+}
+
+void PhyloSuperTreePlen::mapBranchLen()
+{	// for every partition: subtree branch length = sum over the supertree branches mapped onto it of (length * part_rate)
+	NodeVector super_a, super_b;
+	getBranches(super_a, super_b);
+	double *image_len = new double[branchNum];	// accumulator indexed by subtree branch id, reused for each partition
+	for(int part = 0; part < size(); part++){
+		memset(image_len, 0, at(part)->branchNum*sizeof(double));
+		for (int b = 0; b < super_a.size(); b++){
+			PhyloNeighbor *image = ((SuperNeighbor*)super_a[b]->findNeighbor(super_b[b]))->link_neighbors[part];
+			if(!image) continue;	// this supertree branch has no image in partition `part`
+			image_len[image->id] += super_a[b]->findNeighbor(super_b[b])->length * part_info[part].part_rate;
+		}
+		NodeVector sub_a, sub_b;
+		at(part)->getBranches(sub_a, sub_b);
+		for(int j = 0; j < sub_a.size(); j++){
+			const double len = image_len[sub_a[j]->findNeighbor(sub_b[j])->id];
+			sub_a[j]->findNeighbor(sub_b[j])->length = len;	// write the same value in both directions
+			sub_b[j]->findNeighbor(sub_a[j])->length = len;
+		}
+	}
+	delete [] image_len;
+}
+
+void PhyloSuperTreePlen::mapBranchLen(int part)
+{	// single-partition variant: subtree branch length = sum over mapped supertree branches of (length * part_rate)
+	NodeVector super_a, super_b;
+	getBranches(super_a, super_b);
+	double *image_len = new double[branchNum];	// accumulator indexed by subtree branch id
+	memset(image_len, 0, at(part)->branchNum*sizeof(double));
+	for (int b = 0; b < super_a.size(); b++){
+		PhyloNeighbor *image = ((SuperNeighbor*)super_a[b]->findNeighbor(super_b[b]))->link_neighbors[part];
+		if(image)	// branches without an image in this partition contribute nothing
+			image_len[image->id] += super_a[b]->findNeighbor(super_b[b])->length * part_info[part].part_rate;
+	}
+	NodeVector sub_a, sub_b;
+	at(part)->getBranches(sub_a, sub_b);
+	for(int j = 0; j < sub_a.size(); j++){
+		const double len = image_len[sub_a[j]->findNeighbor(sub_b[j])->id];
+		sub_a[j]->findNeighbor(sub_b[j])->length = len;	// write the same value in both directions
+		sub_b[j]->findNeighbor(sub_a[j])->length = len;
+	}
+	delete [] image_len;
+}
+
+void PhyloSuperTreePlen::printMapInfo() { // dump to cout: the supertree, then for each partition its subtree and the branch mapping
+	NodeVector nodes1, nodes2;
+	getBranches(nodes1, nodes2);
+	int part = 0;
+	drawTree(cout, WT_BR_SCALE | WT_INT_NODE | WT_TAXON_ID | WT_NEWLINE | WT_BR_LEN);
+	for (iterator it = begin(); it != end(); it++, part++) {
+		cout << "Subtree for partition " << part << endl;
+		(*it)->drawTree(cout, WT_BR_SCALE | WT_INT_NODE | WT_TAXON_ID | WT_NEWLINE | WT_BR_LEN);
+		for (int i = 0; i < nodes1.size(); i++) { // one output line per supertree branch
+			PhyloNeighbor *nei1 = ((SuperNeighbor*)nodes1[i]->findNeighbor(nodes2[i]))->link_neighbors[part]; // image of direction nodes1[i] -> nodes2[i], may be NULL
+			PhyloNeighbor *nei2 = ((SuperNeighbor*)nodes2[i]->findNeighbor(nodes1[i]))->link_neighbors[part]; // image of the opposite direction, may be NULL
+			cout << nodes1[i]->findNeighbor(nodes2[i])->id << ":"; // supertree side: branch id, then both end nodes (name for leaves, id otherwise) and the length
+			if (nodes1[i]->isLeaf()) cout << nodes1[i]->name; else cout << nodes1[i]->id;
+			cout << ",";
+			if (nodes2[i]->isLeaf()) cout << nodes2[i]->name; else cout << nodes2[i]->id;
+			cout <<"("<<nodes1[i]->findNeighbor(nodes2[i])->length<<")"<< " -> ";
+			if (nei2) { // subtree side: nei2 is printed first, together with the image's branch id
+				cout << nei2->id << ":";
+				if (nei2->node->isLeaf())
+					cout << nei2->node->name;
+				else cout << nei2->node->id;
+			}
+			else cout << -1; // -1 marks a missing image
+			cout << ",";
+			if (nei1){ // then nei1, together with the image's length
+				if (nei1->node->isLeaf())
+					cout << nei1->node->name;
+				else cout << nei1->node->id;
+				cout <<"("<<nei1->length<<")";
+			}
+			else cout << -1;
+			cout << endl;
+		}
+	}
+}
+
+void PhyloSuperTreePlen::initPartitionInfo() {
+	// Replace unset (exactly 0.0) partition rates by 1.0 and reset the cached score of every partition.
+	for (int idx = 0; idx < size(); ++idx) {
+		if (part_info[idx].part_rate == 0.0)
+			part_info[idx].part_rate = 1.0;
+		part_info[idx].cur_score = 0.0; }
+}
+
+void PhyloSuperTreePlen::printNNIcasesNUM(){
+	// Print the five counters of allNNIcases_computed, one labelled line per NNI case (labels in counter order).
+	static const char *const case_label[5] = {"Case 1: NO_EPS    = ", "Case 2: ONE_EPS   = ", "Case 3: TWO_EPS   = ", "Case 4: THREE_EPS = ", "Case 5: MANY_EPS  = "};
+	cout<<"For each \"NNI case\" on subtree the number of times it appeared during NNI evaluation:"<<endl;
+	for (int k = 0; k < 5; k++) {
+		cout<<case_label[k]<<allNNIcases_computed[k]<<endl;
+	}
+}
+
+void PhyloSuperTreePlen::computeBranchLengths()
+{ // empty body: this class's version does nothing (fixNegativeBranch calls PhyloSuperTree::computeBranchLengths() explicitly when it needs the base behaviour)
+	}
+
+int PhyloSuperTreePlen::fixNegativeBranch(bool force, Node *node, Node *dad) {
+	// Calls fixNegativeBranch(force) on every partition tree and returns the sum of their results; node and dad are not used in this body.
+	mapTrees();
+	int fixed = 0;
+	for (iterator it = begin(); it != end(); it++) {
+		(*it)->initializeAllPartialPars();
+		(*it)->clearAllPartialLH();
+		fixed += (*it)->fixNegativeBranch(force);
+		(*it)->clearAllPartialLH();
+	}
+    // FOR OLGA: because this check is not performed, branch lengths of user tree will change even with -fixbr command line
+    if (fixed) { // something was fixed in at least one partition: recompute supertree lengths with the PhyloSuperTree implementation
+        PhyloSuperTree::computeBranchLengths();
+
+        // it is necessary to map the branch lengths from supertree into gene trees!
+        mapTrees();
+    }
+
+	return fixed;
+}
+
+void PhyloSuperTreePlen::changeNNIBrans(NNIMove nnimove) {
+	// Forwards the move unchanged to PhyloTree::changeNNIBrans; the subtree remapping (mapBranchLen) below is disabled.
+	PhyloTree::changeNNIBrans(nnimove);
+	//mapBranchLen();
+
+}
+
+
+/**
+        Lay out the likelihood work buffers of all partitions in shared allocations (each created only while still unset) and give every PhyloNeighbor its slice of central_partial_lh / central_scale_num
+ */
+void PhyloSuperTreePlen::initializeAllPartialLh() {
+	iterator it;
+	int part;
+	int ntrees = size();
+
+	block_size.resize(ntrees);
+	scale_block_size.resize(ntrees);
+
+	vector<uint64_t> mem_size, lh_cat_size;
+	mem_size.resize(ntrees);
+	lh_cat_size.resize(ntrees);
+	uint64_t total_mem_size = 0, total_block_size = 0, total_lh_cat_size = 0;
+	// pass 1: per-partition buffer sizes and their totals
+	for (it = begin(), part = 0; it != end(); it++, part++) {
+		size_t nptn = (*it)->getAlnNPattern() + (*it)->aln->num_states; // extra #numStates for ascertainment bias correction
+		if (instruction_set >= 7)
+			mem_size[part] = ((nptn +3)/4)*4; // round up to a multiple of 4
+		else
+			mem_size[part] = ((nptn % 2) == 0) ? nptn : (nptn + 1); // round up to an even number
+		scale_block_size[part] = nptn;
+		block_size[part] = mem_size[part] * (*it)->aln->num_states * (*it)->getRate()->getNRate() *
+				(((*it)->model_factory->fused_mix_rate)? 1 : (*it)->getModel()->getNMixtures());
+
+		lh_cat_size[part] = mem_size[part] * (*it)->getRate()->getNDiscreteRate() *
+				(((*it)->model_factory->fused_mix_rate)? 1 : (*it)->getModel()->getNMixtures());
+		total_mem_size += mem_size[part];
+		total_block_size += block_size[part];
+		total_lh_cat_size += lh_cat_size[part];
+	}
+	// allocate each shared buffer once (only while still unset); the first partition points at its start
+    if (!_pattern_lh)
+        _pattern_lh = aligned_alloc<double>(total_mem_size);
+    front()->_pattern_lh = _pattern_lh;
+    if (!_pattern_lh_cat)
+        _pattern_lh_cat = aligned_alloc<double>(total_lh_cat_size);
+    front()->_pattern_lh_cat = _pattern_lh_cat;
+    if (!theta_all)
+        theta_all = aligned_alloc<double>(total_block_size);
+    front()->theta_all = theta_all;
+    if (!ptn_freq) {
+        ptn_freq = aligned_alloc<double>(total_mem_size);
+        ptn_freq_computed = false;
+    }
+    front()->ptn_freq = ptn_freq;
+    front()->ptn_freq_computed = false;
+    if (!ptn_invar)
+        ptn_invar = aligned_alloc<double>(total_mem_size);
+    front()->ptn_invar = ptn_invar;
+
+    size_t IT_NUM = (params->nni5) ? 6 : 2;
+    if (!nni_partial_lh) {
+        nni_partial_lh = aligned_alloc<double>(IT_NUM*total_block_size);
+    }
+    front()->nni_partial_lh = nni_partial_lh;
+    
+    if (!nni_scale_num) {
+        nni_scale_num = aligned_alloc<UBYTE>(IT_NUM*total_mem_size);
+    }
+    front()->nni_scale_num = nni_scale_num;
+	// every later partition starts where the previous one ends: `it` starts at begin()+1 while `part` starts at 0, so the sizes indexed by part belong to the previous partition
+	for (it = begin()+1, part = 0; it != end(); it++, part++) {
+		(*it)->_pattern_lh = (*(it-1))->_pattern_lh + mem_size[part];
+		(*it)->_pattern_lh_cat = (*(it-1))->_pattern_lh_cat + lh_cat_size[part];
+		(*it)->theta_all = (*(it-1))->theta_all + block_size[part];
+		(*it)->ptn_freq = (*(it-1))->ptn_freq + mem_size[part];
+		(*it)->ptn_freq_computed = false;
+		(*it)->ptn_invar = (*(it-1))->ptn_invar + mem_size[part];
+        (*it)->nni_partial_lh = (*(it-1))->nni_partial_lh + IT_NUM*block_size[part];
+        (*it)->nni_scale_num = (*(it-1))->nni_scale_num + IT_NUM*mem_size[part];
+	}
+
+	// compute total memory for all partitions
+	uint64_t total_partial_lh_entries = 0, total_scale_num_entries = 0, total_partial_pars_entries = 0;
+	partial_lh_entries.resize(ntrees);
+	scale_num_entries.resize(ntrees);
+	partial_pars_entries.resize(ntrees);
+	for (it = begin(), part = 0; it != end(); it++, part++) {
+		(*it)->getMemoryRequired(partial_lh_entries[part], scale_num_entries[part], partial_pars_entries[part]);
+		total_partial_lh_entries += partial_lh_entries[part];
+		total_scale_num_entries += scale_num_entries[part];
+		total_partial_pars_entries += partial_pars_entries[part];
+	}
+
+	// allocate central memory for all partitions
+	if (!central_partial_lh) {
+        try {
+        	central_partial_lh = aligned_alloc<double>(total_partial_lh_entries);
+        	central_scale_num = aligned_alloc<UBYTE>(total_scale_num_entries);
+        } catch (std::bad_alloc &ba) {
+        	outError("Not enough memory for partial likelihood vectors (bad_alloc)");
+        }
+	}
+//    if (!central_partial_pars) {
+//        try {
+//        	central_partial_pars = aligned_alloc<UINT>(total_partial_pars_entries);
+//        } catch (std::bad_alloc &ba) {
+//        	outError("Not enough memory for partial parsimony vectors (bad_alloc)");
+//        }
+//    }
+
+    // assign individual chunk just to prevent reallocation of memory, they will not be used
+	for (it = begin(); it != end(); it++) {
+		(*it)->central_partial_lh = central_partial_lh;
+		(*it)->central_scale_num = central_scale_num;
+//		(*it)->central_partial_pars = central_partial_pars;
+	}
+
+	double *lh_addr = central_partial_lh;
+	UBYTE *scale_addr = central_scale_num;
+	UINT *pars_addr = central_partial_pars;
+	clearAllPartialLH(true);
+	// walk the supertree and hand out slices; lh_addr / scale_addr are advanced by reference
+	initializeAllPartialLh(lh_addr, scale_addr, pars_addr);
+    assert(lh_addr > central_partial_lh && (uint64_t)(lh_addr - central_partial_lh) <= total_partial_lh_entries); // pointer difference counts doubles, so compare with the entry count (not with a byte count)
+    tip_partial_lh = NULL;
+    for (it = begin(), part = 0; it != end(); it++, part++) { // the tip likelihood tables of the partitions follow the handed-out slices
+        (*it)->tip_partial_lh = lh_addr;
+        uint64_t tip_partial_lh_size = (*it)->aln->num_states * ((*it)->aln->STATE_UNKNOWN+1) * (*it)->model->getNMixtures();
+        lh_addr += tip_partial_lh_size;
+    }
+}
+
+void PhyloSuperTreePlen::initializeAllPartialLh(double* &lh_addr, UBYTE* &scale_addr, UINT* &pars_addr, PhyloNode *node, PhyloNode *dad) { // recursive walk; lh_addr/scale_addr advance past every slice handed out, pars_addr is only forwarded
+    if (!node)
+        node = (PhyloNode*) root;
+    if (dad) {
+        // assign a region in central_partial_lh to both Neighbors (dad->node, and node->dad) of every partition that has an image of this branch
+        SuperNeighbor *nei = (SuperNeighbor*) node->findNeighbor(dad);
+		SuperNeighbor *nei_back = (SuperNeighbor*) dad->findNeighbor(node);
+        for (int part = 0; part < size(); part++) {
+        	PhyloNeighbor *nei_part = nei->link_neighbors[part];
+        	if (!nei_part) continue;
+        	PhyloNeighbor *nei_part_back = nei_back->link_neighbors[part];
+            // LM_PER_NODE with an EIGEN kernel: only the dad->node image (nei_part_back) gets a slice, and only when
+            // it points to an internal node; in every other configuration both directions are handled by the else-branch.
+            if (params->lh_mem_save == LM_PER_NODE && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+                if (!nei_part_back->node->isLeaf()) {
+                    if (!nei_part_back->partial_lh) {
+                        nei_part_back->partial_lh = lh_addr;
+                        nei_part_back->scale_num = scale_addr;
+                        lh_addr = lh_addr + block_size[part];
+                        scale_addr = scale_addr + scale_block_size[part];
+                    }
+                } else {
+                    nei_part_back->partial_lh = NULL;
+                    nei_part_back->scale_num = NULL;
+                }
+//                nei_part->partial_lh = NULL;
+//                nei_part->scale_num = NULL;
+            } else {
+                if (nei_part->node->isLeaf() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+                    nei_part->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
+                    nei_part->scale_num = NULL;
+                } else if (!nei_part->partial_lh) {
+                    nei_part->partial_lh = lh_addr;
+                    nei_part->scale_num = scale_addr;
+                    lh_addr = lh_addr + block_size[part];
+                    scale_addr = scale_addr + scale_block_size[part];
+                }
+    //			nei_part->partial_pars = pars_addr;
+    //			pars_addr += partial_pars_entries[part];
+
+                nei_part = nei_back->link_neighbors[part]; // same treatment for the opposite direction
+                if (nei_part->node->isLeaf() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+                    nei_part->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
+                    nei_part->scale_num = NULL;
+                } else if (!nei_part->partial_lh) {
+                    nei_part->partial_lh = lh_addr;
+                    nei_part->scale_num = scale_addr;
+                    lh_addr = lh_addr + block_size[part];
+                    scale_addr = scale_addr + scale_block_size[part];
+                }
+    //			nei_part->partial_pars = pars_addr;
+    //			pars_addr += partial_pars_entries[part];
+            }
+        }
+    }
+    FOR_NEIGHBOR_IT(node, dad, it) initializeAllPartialLh(lh_addr, scale_addr, pars_addr, (PhyloNode*) (*it)->node, node); // recurse into the neighbors of node (presumably all except dad -- confirm against the macro)
+}
+
+void PhyloSuperTreePlen::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node, PhyloNode *dad) {
+	// this overload must not be used on this class: the assertion fails if it is accidentally called; no argument is read
+	assert(0);
+}
+
diff --git a/phylosupertreeplen.h b/phylosupertreeplen.h
new file mode 100644
index 0000000..0ccb320
--- /dev/null
+++ b/phylosupertreeplen.h
@@ -0,0 +1,322 @@
+/*
+ * phylosupertreeplen.h
+ *
+ *  Created on: Aug 5, 2013
+ *      Author: olga
+ */
+
+#ifndef PHYLOSUPERTREEPLEN_H_
+#define PHYLOSUPERTREEPLEN_H_
+
+#include "phylosupertree.h"
+#include "model/partitionmodel.h"
+#include "superalignmentpairwise.h"
+
+
+/**
+ * Classifies what an NNI on the supertree means for a subtree, by how many of the 5 involved branches have no image (epsilon) on that subtree
+ *
+ *  NNI_NO_EPSILON:   all 5 branches have images on subtree, this corresponds to change in subtree topology
+ * 					  2 partial_lh vectors for -nni1 or 6 partial_lh vectors for -nni5 options
+ *  NNI_ONE_EPSILON:  only one of the 5 branches has no image on subtree, this does not change subtree topology, but changes branch lengths of the subtree
+ * 					  we need to allocate partial likelihood memory (1 partial_lh vector for -nni1 option or 3 partial_lh vectors for -nni5 option)
+ * 	NNI_TWO_EPSILON:  two branches (on different sides of central branch) have no images, here after the NNI swap,
+ * 					  the image of central branch either does not change or is equal to epsilon (then we decrease the branch length)
+ * 					  and no allocation of partial_lh is needed
+ * 	NNI_THREE_EPSILON: central and two adjacent edges have no images: after the NNI swap, central branch will have an image and we need to relink it
+ * 					no allocation of partial_lh is needed
+ *  NNI_MANY_EPSILON: more than 3 branches have no images on subtree: nothing changes in subtree and no recomputation of partial likelihoods is required
+ */
+enum NNIType {NNI_NO_EPSILON, NNI_ONE_EPSILON, NNI_TWO_EPSILON, NNI_THREE_EPSILON, NNI_MANY_EPSILON};
+
+
+/**
+Edge lengths in subtrees are proportional to edge lengths in a supertree.
+
+	@author Olga Chernomor <olga.chernomor at univie.ac.at>
+*/
+
+class PhyloSuperTreePlen;
+
+// Auxiliary classes ====================================================================================
+// ======================================================================================================
+class SuperAlignmentPairwisePlen : public SuperAlignmentPairwise {
+
+	public:
+
+		/**
+		    default constructor
+		 */
+
+	    SuperAlignmentPairwisePlen();
+
+		/**
+			construct the pairwise alignment from two sequences of a multiple alignment
+			@param atree the PhyloSuperTreePlen this pairwise alignment belongs to (presumably supplies the alignment and partition info -- confirm in the definition)
+			@param seq1 ID of the first sequence
+			@param seq2 ID of the second sequence
+		*/
+		SuperAlignmentPairwisePlen(PhyloSuperTreePlen *atree, int seq1, int seq2);
+
+	    ~SuperAlignmentPairwisePlen();
+
+		/**
+			compute the likelihood for a distance between two sequences. Used for the ML optimization of the distance.
+			@param value x-value of the function
+			@return log-likelihood
+		*/
+		virtual double computeFunction(double value);
+
+		/**
+			This function calculates the first derivative f'(value) and 2nd derivative f''(value) of f at value;
+			used by the Newton-Raphson method to minimize the function.
+			@param value x-value of the function
+			@param df (OUT) first derivative
+			@param ddf (OUT) second derivative
+			(no return value: this override is declared void, results are delivered through df and ddf)
+		*/
+		virtual void computeFuncDerv(double value, double &df, double &ddf);
+
+		/**
+			pointer to the partition information (NOTE(review): ownership is not visible in this header)
+		*/
+		vector<PartitionInfo>* part_info;
+
+};
+// ======================================================================================================
+class PartitionModelPlen : public PartitionModel
+{
+public:
+    PartitionModelPlen();
+	/**
+		constructor: create partition model with possible rate heterogeneity. Create proper class objects
+		for two variables: model and site_rate. It takes the following fields of params into account:
+			model_name, num_rate_cats, freq_type, store_trans_matrix
+		@param params program parameters
+		@param tree associated phylogenetic super-tree
+		@param models_block models block passed to the constructor (NOTE(review): its exact role is not visible in this header)
+	*/
+	PartitionModelPlen(Params &params, PhyloSuperTreePlen *tree, ModelsBlock *models_block);
+
+    ~PartitionModelPlen();
+
+    /**
+     * @return #parameters of the model + # branches
+     */
+    virtual int getNParameters();
+    virtual int getNDim();
+
+	/**
+		optimize model parameters and tree branch lengths
+		@param fixed_len TRUE to fix branch lengths, default is false
+		@return the best likelihood
+	*/
+	virtual double optimizeParameters(bool fixed_len = false, bool write_info = true,
+                                      double logl_epsilon = 0.1, double gradient_epsilon = 0.001);
+
+	double optimizeGeneRate(double tol);
+
+//	virtual double targetFunk(double x[]);
+//	virtual void getVariables(double *variables);
+//	virtual void setVariables(double *variables);
+
+    /** partition ID currently under optimization of its rate (the member below is disabled) */
+//    int optimizing_part;
+
+    /**
+        compute the likelihood for a partition under rate optimization (optimizing_rate).
+        Used for the ML optimization of gene rate (the declaration below is disabled)
+        @param value x-value of the function
+        @return log-likelihood
+    */
+//    virtual double computeFunction(double value);
+
+
+};
+
+// ======================================================================================================
+// ======================================================================================================
+
+class PhyloSuperTreePlen : public PhyloSuperTree {
+
+public:
+	/**
+		constructors
+	*/
+	PhyloSuperTreePlen();
+	PhyloSuperTreePlen(Params &params);
+	PhyloSuperTreePlen(SuperAlignment *alignment, PhyloSuperTree *super_tree);
+
+	~PhyloSuperTreePlen();
+
+    /**
+            compute the distance between 2 sequences.
+            @param seq1 index of sequence 1
+            @param seq2 index of sequence 2
+            @param initial_dist initial distance
+            @return distance between seq1 and seq2
+     */
+
+    virtual double computeDist(int seq1, int seq2, double initial_dist, double &var);
+
+	/**
+		create sub-trees T|Y_1,...,T|Y_k of the current super-tree T
+		and map F={f_1,...,f_k} the edges of supertree T to edges of subtrees T|Y_i
+	*/
+	virtual void mapTrees();
+
+	/**
+	 * Given current supertree T and subtrees T|Y_1,...,T|Y_k, build all maps f_1,...,f_k
+	 */
+	virtual void linkTrees();
+
+    /**
+            initialize partial_lh vector of all PhyloNeighbors, allocating central_partial_lh
+     */
+    virtual void initializeAllPartialLh();
+
+    /**
+            initialize partial_lh vector of all PhyloNeighbors, allocating central_partial_lh
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @param index the index
+     */
+    virtual void initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    void initializeAllPartialLh(double* &lh_addr, UBYTE* &scale_addr, UINT* &pars_addr, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            de-allocate central_partial_lh
+     */
+    virtual void deleteAllPartialLh();
+
+	/**
+	 * @return the type of NNI around node1-node2 for partition part
+	 */
+	void getNNIType(PhyloNode *node1, PhyloNode *node2, vector<NNIType> &nni_type);
+
+	virtual void computeFuncDerv(double value, double &df, double &ddf);
+	virtual double computeFunction(double value);
+
+    /**
+            compute tree likelihood on a branch given buffer (theta_all), used after optimizing branch length
+            @return tree likelihood
+     */
+
+    virtual double computeLikelihoodFromBuffer();
+
+    /**
+            optimize all branch lengths of all subtrees, then compute branch lengths
+            of supertree as weighted average over all subtrees
+            @param iterations number of iterations to loop through all branches
+            @return the likelihood of the tree
+     */
+    virtual double optimizeAllBranches(int my_iterations = 100, double tolerance = TOL_LIKELIHOOD, int maxNRStep = 100);
+
+    /**
+            optimize one branch length by ML by optimizing all mapped branches of subtrees
+            (declared void: no likelihood score is returned)
+            @param node1 1st end node of the branch
+            @param node2 2nd end node of the branch
+            @param clearLH true to clear the partial likelihood, otherwise false
+            @param maxNRStep presumably the maximum number of Newton-Raphson steps (TODO confirm)
+     */
+    virtual void optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, bool clearLH = true, int maxNRStep = 100);
+
+    /**
+            search the best swap for a branch
+            @return NNIMove The best Move/Swap
+            @param node1 1 of the 2 nodes on the branch
+            @param node2 1 of the 2 nodes on the branch
+            @param nniMoves optional array of NNI moves, NULL by default (NOTE(review): exact role not visible here, confirm against the implementation)
+     */
+    virtual NNIMove getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove *nniMoves = NULL);
+
+
+    /**
+            Do an NNI on the supertree and synchronize all subtrees respectively
+            @param move the single NNI
+     */
+    virtual void doNNI(NNIMove &move, bool clearLH = true);
+    /**
+            apply nni2apply NNIs from the non-conflicting NNI list
+            @param nni2apply number of NNIs to apply from the list
+            @param changeBran whether or not the computed branch lengths should be applied
+     */
+    virtual void doNNIs(int nni2apply, bool changeBran = true);
+
+    /**
+     *   Apply 5 new branch lengths stored in the NNI move
+     *   @param nnimove the NNI move currently in consideration
+     */
+    virtual void changeNNIBrans(NNIMove nnimove);
+
+    /**
+            This is for ML. Try to swap the tree with nearest neighbor interchange at the branch connecting node1-node2.
+            If a swap shows better score, return the swapped tree and the score.
+            @param cur_score current likelihood score
+            @param node1 1st end node of the branch
+            @param node2 2nd end node of the branch
+            @param nni_param (OUT) if not NULL: swapping information returned
+            @param nniMoves optional array of NNI moves, NULL by default (NOTE(review): exact role not visible here, confirm against the implementation)
+            @return the likelihood of the tree
+     */
+    virtual double swapNNIBranch(double cur_score, PhyloNode *node1, PhyloNode *node2, SwapNNIParam *nni_param = NULL, NNIMove *nniMoves = NULL);
+
+    /**
+     *	used in swapNNIBranch to update link_neighbors of other SuperNeighbors that point to the same branch on SubTree as (node,dad)
+     *	@param saved_link_dad_nei   pointer to link_neighbor dad_nei
+     */
+    void linkCheck(int part, Node* node, Node* dad, PhyloNeighbor* saved_link_dad_nei);
+    void linkCheckRe(int part, Node* node, Node* dad, PhyloNeighbor* saved_link_dad_nei,PhyloNeighbor* saved_link_node_nei);
+
+	/**
+		compute the weighted average of branch lengths over partitions
+	*/
+	virtual void computeBranchLengths();
+
+	bool checkBranchLen();
+	void mapBranchLen();
+	void mapBranchLen(int part);
+	virtual void printMapInfo();
+
+	virtual void restoreAllBrans(PhyloNode *node, PhyloNode *dad);
+
+	/**
+	 * initialize partition information for super tree
+	*/
+	virtual void initPartitionInfo();
+
+	void printNNIcasesNUM();
+
+    /**
+     * 		indicates whether partition rates are fixed or not
+     */
+
+    bool fixed_rates;
+
+    /*
+     * 1 - # of is_nni on subtree
+     * 2 - # of relink branch to an empty one
+     * 3 - # of empty to empty
+     * 4 - # of relink branch to a  new one (50% saving on these cases compared to the previous implementation)
+     * 5 - # of relink branch to an old one + relink empty to some branch (100% saving on these cases)
+     */
+    int allNNIcases_computed[5];
+
+    /**
+            Neighbor-joining/parsimony tree might contain negative branch length. This
+            function will fix this.
+            @param fixed_length fixed branch length to set to negative branch lengths
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @return The number of branches that have no/negative length
+     */
+    virtual int fixNegativeBranch(bool force = false, Node *node = NULL, Node *dad = NULL);
+
+protected:
+	vector<uint64_t> partial_lh_entries, scale_num_entries, partial_pars_entries, block_size, scale_block_size;
+
+};
+
+
+
+#endif /* PHYLOSUPERTREEPLEN_H_ */
diff --git a/phylotesting.cpp b/phylotesting.cpp
new file mode 100644
index 0000000..59cc791
--- /dev/null
+++ b/phylotesting.cpp
@@ -0,0 +1,2029 @@
+/*
+ * phylotesting.cpp
+ *
+ *  Created on: Aug 23, 2013
+ *      Author: minh
+ */
+
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <iqtree_config.h>
+#include "phylotree.h"
+#include "iqtree.h"
+#include "phylosupertree.h"
+#include "phylotesting.h"
+
+#include "model/modelgtr.h"
+#include "model/modeldna.h"
+#include "myreader.h"
+#include "model/rateheterogeneity.h"
+#include "model/rategamma.h"
+#include "model/rateinvar.h"
+#include "model/rategammainvar.h"
+#include "model/ratefree.h"
+//#include "modeltest_wrapper.h"
+#include "model/modelprotein.h"
+#include "model/modelbin.h"
+#include "model/modelcodon.h"
+#include "model/modelmorphology.h"
+#include "timeutil.h"
+
+#include "phyloanalysis.h"
+
+
+/******* Binary model set ******/
+const char* bin_model_names[] = { "JC2", "GTR2" };
+
+
+/******* Morphological model set ******/
+const char* morph_model_names[] = {"MK", "ORDERED"};
+
+
+/******* DNA model set ******/
+const char* dna_model_names[] = { "JC", "F81", "K80", "HKY", "TNe",
+		"TN", "K81", "K81u", "TPM2", "TPM2u", "TPM3", "TPM3u", "TIMe", "TIM",
+		"TIM2e", "TIM2", "TIM3e", "TIM3", "TVMe", "TVM", "SYM", "GTR" };
+
+/* DNA models supported by PhyML/PartitionFinder */
+const char* dna_model_names_old[] ={"JC", "F81", "K80", "HKY", "TNe",
+ 	 	 "TN", "K81", "K81u", "TIMe", "TIM", "TVMe", "TVM", "SYM", "GTR"};
+
+/* DNA model supported by RAxML */
+const char* dna_model_names_rax[] ={"GTR"};
+
+/* DNA model supported by MrBayes */
+const char *dna_model_names_mrbayes[] = {"JC", "F81", "K80", "HKY", "SYM", "GTR"};
+
+
+/****** Protein model set ******/
+const char* aa_model_names[] = { "Dayhoff", "mtMAM", "JTT", "WAG",
+		"cpREV", "mtREV", "rtREV", "mtART", "mtZOA", "VT", "LG", "DCMut", "PMB",
+		"HIVb", "HIVw", "JTTDCMut", "FLU", "Blosum62" };
+        
+/* Protein models supported by PhyML/PartitionFinder */
+const char *aa_model_names_phyml[] = { "Dayhoff", "mtMAM", "JTT", "WAG",
+		"cpREV", "mtREV", "rtREV", "mtART", "VT", "LG", "DCMut",
+		"HIVb", "HIVw", "Blosum62" };
+
+/* Protein models supported by RAxML */
+const char *aa_model_names_rax[] = { "Dayhoff", "mtMAM", "JTT", "WAG",
+		"cpREV", "mtREV", "rtREV", "mtART", "mtZOA", "PMB", "HIVb", "HIVw", "JTTDCMut", "FLU", "VT", "LG", "DCMut", "Blosum62" };
+
+const char* aa_model_names_mrbayes[] = {"Poisson", "Dayhoff", "mtMAM", "JTT", "WAG",
+		"cpREV", "mtREV", "rtREV", "VT", "Blosum62" };
+
+const char *aa_model_names_nuclear[] = {"WAG", "Dayhoff","JTT", "LG", "VT", "DCMut", "PMB", "JTTDCMut", "Blosum62"};
+
+const char *aa_model_names_mitochondrial[] = {"mtREV", "mtMAM", "mtART", "mtZOA"};
+
+const char *aa_model_names_chloroplast[] = {"cpREV"};
+
+const char *aa_model_names_viral[] = {"HIVb", "HIVw", "FLU", "rtREV"};
+
+const char* aa_freq_names[] = {"", "+F"};
+
+
+/****** Codon models ******/
+//const char *codon_model_names[] = {"GY", "MG", "MGK", "KOSI07", "SCHN05","KOSI07_GY1KTV","SCHN05_GY1KTV"};
+//short int std_genetic_code[]    = {   0,    0,     0,        1,        1,              1,              1};
+/* std_genetic_code is parallel to codon_model_names (same length, same order).
+   A non-zero entry marks a model that getSeqType() reports as empirical and that
+   getModelList() leaves out when the alignment does not use the standard genetic code. */
+const char *codon_model_names[] = {"MG", "MGK", "GY", "KOSI07", "SCHN05"};
+short int std_genetic_code[]    = {   0,    0,     0,        1,        1};
+
+const char *codon_freq_names[] = {"", "+F1X4", "+F3X4", "+F"};
+
+const double TOL_LIKELIHOOD_MODELTEST = 0.01;
+const double TOL_GRADIENT_MODELTEST   = 0.001;
+
+/**
+ * Convert an array of C strings into a StrVector.
+ * @param cvec array holding at least n C strings
+ * @param n number of entries to copy
+ * @param strvec (OUT) resized to n entries and overwritten with the copies
+ * @param touppercase true to convert every copied string to upper case
+ */
+void copyCString(const char **cvec, int n, StrVector &strvec, bool touppercase = false) {
+	strvec.resize(n);
+	for (int idx = 0; idx < n; idx++) {
+		string &dest = strvec[idx];
+		dest = cvec[idx];
+		if (!touppercase)
+			continue;
+		std::transform(dest.begin(), dest.end(), dest.begin(), ::toupper);
+	}
+}
+
+/**
+ * Deduce the sequence type from a model name (compared case-insensitively).
+ * The binary, morphological, DNA and protein lists are checked for an exact
+ * match, the codon list for a prefix match; a later match overrides an earlier one.
+ * @param model_name name of the substitution model
+ * @param seq_type (OUT) detected sequence type, SEQ_UNKNOWN if no list matches
+ * @return 2 if the model is empirical (any protein model, or a codon model
+ *         flagged in std_genetic_code), otherwise 1
+ */
+int getSeqType(const char *model_name, SeqType &seq_type) {
+    string upper_name = model_name;
+    std::transform(upper_name.begin(), upper_name.end(), upper_name.begin(), ::toupper);
+
+    StrVector candidates;
+    bool is_empirical = false;
+    seq_type = SEQ_UNKNOWN;
+
+    copyCString(bin_model_names, sizeof(bin_model_names)/sizeof(char*), candidates, true);
+    if (std::find(candidates.begin(), candidates.end(), upper_name) != candidates.end())
+        seq_type = SEQ_BINARY;
+
+    copyCString(morph_model_names, sizeof(morph_model_names)/sizeof(char*), candidates, true);
+    if (std::find(candidates.begin(), candidates.end(), upper_name) != candidates.end())
+        seq_type = SEQ_MORPH;
+
+    copyCString(dna_model_names, sizeof(dna_model_names)/sizeof(char*), candidates, true);
+    if (std::find(candidates.begin(), candidates.end(), upper_name) != candidates.end())
+        seq_type = SEQ_DNA;
+
+    copyCString(aa_model_names, sizeof(aa_model_names)/sizeof(char*), candidates, true);
+    if (std::find(candidates.begin(), candidates.end(), upper_name) != candidates.end()) {
+        seq_type = SEQ_PROTEIN;
+        is_empirical = true;
+    }
+
+    // codon models may carry a suffix, so only the leading characters are compared
+    copyCString(codon_model_names, sizeof(codon_model_names)/sizeof(char*), candidates, true);
+    for (size_t k = 0; k < candidates.size(); k++) {
+        const string &prefix = candidates[k];
+        if (upper_name.compare(0, prefix.length(), prefix) != 0)
+            continue;
+        seq_type = SEQ_CODON;
+        if (std_genetic_code[k])
+            is_empirical = true;
+        break;
+    }
+
+    if (is_empirical)
+        return 2;
+    return 1;
+}
+
+/**
+ * Name the sequence type a model belongs to.
+ * @param model_name name of the substitution model
+ * @return "BIN", "MORPH", "DNA", "AA" or "CODON"; the empty string when the
+ *         model is not recognized
+ */
+string getSeqType(string model_name) {
+    SeqType detected;
+    getSeqType(model_name.c_str(), detected);
+    if (detected == SEQ_BINARY)
+        return "BIN";
+    if (detected == SEQ_MORPH)
+        return "MORPH";
+    if (detected == SEQ_DNA)
+        return "DNA";
+    if (detected == SEQ_PROTEIN)
+        return "AA";
+    if (detected == SEQ_CODON)
+        return "CODON";
+    return "";
+}
+
+/**
+ * Compute the AIC, AICc and BIC scores of a model.
+ * @param tree_lh log-likelihood of the tree
+ * @param df number of free parameters
+ * @param ssize sample size
+ * @param AIC (OUT) -2*tree_lh + 2*df
+ * @param AICc (OUT) AIC + 2*df*(df+1) / max(ssize-df-1, 1)
+ * @param BIC (OUT) -2*tree_lh + df*log(ssize)
+ */
+void computeInformationScores(double tree_lh, int df, int ssize, double &AIC, double &AICc, double &BIC) {
+	// the correction denominator is clamped to 1 so that it never becomes zero or negative
+	const int denom = max(ssize - df - 1, 1);
+	const double neg2lh = -2 * tree_lh;
+	AIC = neg2lh + 2 * df;
+	AICc = AIC + 2.0 * df * (df + 1) / denom;
+	BIC = neg2lh + df * log(ssize);
+}
+
+/**
+ * Compute a single information criterion score.
+ * @param tree_lh log-likelihood of the tree
+ * @param df number of free parameters
+ * @param ssize sample size
+ * @param mtc criterion to evaluate
+ * @return the AIC, AICc or BIC score; 0.0 for any other criterion
+ */
+double computeInformationScore(double tree_lh, int df, int ssize, ModelTestCriterion mtc) {
+	double aic_score, aicc_score, bic_score;
+	computeInformationScores(tree_lh, df, ssize, aic_score, aicc_score, bic_score);
+	switch (mtc) {
+	case MTC_AIC:
+		return aic_score;
+	case MTC_AICC:
+		return aicc_score;
+	case MTC_BIC:
+		return bic_score;
+	default:
+		return 0.0;
+	}
+}
+
+/**
+ * @param mtc model test criterion
+ * @return "AIC", "AICc" or "BIC"; the empty string for any other criterion
+ */
+string criterionName(ModelTestCriterion mtc) {
+	switch (mtc) {
+	case MTC_AIC:
+		return "AIC";
+	case MTC_AICC:
+		return "AICc";
+	case MTC_BIC:
+		return "BIC";
+	default:
+		return "";
+	}
+}
+
+/**
+ * Write the per-site log-likelihoods of a tree as one row of a site-likelihood file.
+ * @param filename output file
+ * @param tree tree whose alignment maps every site to its pattern
+ * @param ptn_lh per-pattern log-likelihoods, or NULL to compute them from tree
+ * @param append true to append one row to an existing file; false to start a
+ *        new file whose first line is "1 <number of sites>"
+ * @param linename row label (left-aligned in a field of width 10), or NULL to
+ *        use the label "Site_Lh   "
+ */
+void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh,
+		bool append, const char *linename) {
+	int i;
+	double *pattern_lh;
+	if (!ptn_lh) {
+		// no likelihoods supplied: compute them into a buffer owned by this function
+		pattern_lh = new double[tree->getAlnNPattern()];
+		tree->computePatternLikelihood(pattern_lh);
+	} else
+		pattern_lh = ptn_lh;
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		if (append) {
+			out.open(filename, ios::out | ios::app);
+		} else {
+			out.open(filename);
+			out << 1 << " " << tree->getAlnNSite() << endl;
+		}
+		IntVector pattern_index;
+		tree->aln->getSitePatternIndex(pattern_index);
+		if (!linename)
+			out << "Site_Lh   ";
+		else {
+			out.width(10);
+			out << left << linename;
+		}
+		// expand pattern likelihoods back to one value per alignment site
+		for (i = 0; i < tree->getAlnNSite(); i++)
+			out << " " << pattern_lh[pattern_index[i]];
+		out << endl;
+		out.close();
+		if (!append)
+			cout << "Site log-likelihoods printed to " << filename << endl;
+	} catch (const ios::failure &) {
+		// caught by const reference: no copy and no slicing of the exception object
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+
+	// only release the buffer allocated above, never the caller's array
+	if (!ptn_lh)
+		delete[] pattern_lh;
+}
+
+/**
+ * Write a table with, for every alignment site, its log-likelihood and its
+ * log-likelihood under each discrete rate category; for non-super trees the
+ * log-likelihoods of constant patterns are additionally printed to stdout.
+ * Mixture models are rejected via outError() unless fused_mix_rate is set.
+ * @param filename output file
+ * @param tree tree providing the model, the rates and the alignment
+ */
+void printSiteLhCategory(const char*filename, PhyloTree *tree) {
+    // TODO: mixture model!
+    if (tree->getModel()->isMixture() && !tree->getModelFactory()->fused_mix_rate)
+        outError("Unsupported feature, please contact author if you really need this", __func__);
+	double *pattern_lh, *pattern_lh_cat;
+	int i;
+	int discrete_cat = tree->getRate()->getNDiscreteRate();
+	pattern_lh = new double[tree->getAlnNPattern()];
+	// one slot per (pattern, category), stored pattern-major
+	pattern_lh_cat = new double[tree->getAlnNPattern()*(discrete_cat)];
+	tree->computePatternLikelihood(pattern_lh, NULL, pattern_lh_cat);
+
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+		out.open(filename);
+		out << "Note : P(D|M) is the probability of site D given the model M (i.e., the site likelihood)" << endl;
+		out << "P(D|M,rr[x]) is the probability of site D given the model M and the relative rate" << endl;
+		out << "of evolution rr[x], where x is the class of rate to be considered." << endl;
+		out << "We have P(D|M) = \\sum_x P(x) x P(D|M,rr[x])." << endl << endl;
+		out << "Site   logP(D|M)       ";
+		for (i = 0; i < discrete_cat; i++) {
+			out << "logP(D|M,rr[" << i+1 << "]=" << tree->getRate()->getRate(i)<< ") ";
+		}
+		out << endl;
+		IntVector pattern_index;
+		tree->aln->getSitePatternIndex(pattern_index);
+		for (i = 0; i < tree->getAlnNSite(); i++) {
+			out.width(6);
+			out << left << i+1 << " ";
+			out.width(15);
+			out << pattern_lh[pattern_index[i]] << " ";
+			for (int j = 0; j < discrete_cat; j++) {
+				out.width(15);
+				out << pattern_lh_cat[pattern_index[i]*discrete_cat+j] << " ";
+			}
+			out << endl;
+		}
+		out.close();
+		cout << "Site log-likelihoods per category printed to " << filename << endl;
+        if (!tree->isSuperTree()) {
+            cout << "Log-likelihood of constant sites: " << endl;
+            double const_prob = 0.0;
+            for (i = 0; i < tree->aln->getNPattern(); i++)
+                if (tree->aln->at(i).is_const) {
+                    Pattern pat = tree->aln->at(i);
+                    for (Pattern::iterator it = pat.begin(); it != pat.end(); it++)
+                        cout << tree->aln->convertStateBackStr(*it);
+                    cout << ": " << pattern_lh[i] << endl;
+                    const_prob += exp(pattern_lh[i]);
+                }
+            cout << "Probability of const sites: " << const_prob << endl;
+        }
+	} catch (const ios::failure &) {
+		// caught by const reference: no copy and no slicing of the exception object
+		outError(ERR_WRITE_OUTPUT, filename);
+	}
+
+	delete[] pattern_lh_cat;
+	delete[] pattern_lh;
+
+}
+
+/**
+ * Parse an already opened .model file.
+ * The header must consist of the tokens "Charset" (partitioned files only),
+ * "Model", "df", "LnL" and "TreeLen"; every following line is one record
+ * (set name for partitioned files, model name, df, LnL, tree length), with the
+ * tree taken from the last tab-separated field when the line ends in ';'.
+ * @param in input stream; its exception mask is changed to badbit only
+ * @param is_partitioned true if every record starts with a charset name
+ * @param infos (OUT) parsed records are appended; on failure the vector is
+ *        left exactly as it was on entry
+ * @return true if the header and all records could be read, false otherwise
+ */
+bool checkModelFile(ifstream &in, bool is_partitioned, vector<ModelInfo> &infos) {
+	if (!in.is_open()) return false;
+	in.exceptions(ios::badbit);
+	string str;
+	if (is_partitioned) {
+		in >> str;
+		if (str != "Charset")
+			return false;
+	}
+	in >> str;
+	if (str != "Model")
+		return false;
+	in >> str;
+	if (str != "df")
+		return false;
+	in >> str;
+	if (str != "LnL")
+		return false;
+	in >> str;
+	if (str != "TreeLen") {
+        outWarning(".model file was produced from a previous version of IQ-TREE");
+		return false;
+    }
+	getline(in, str);
+	const size_t num_infos_on_entry = infos.size();
+	while (!in.eof()) {
+		in >> str;
+		if (in.eof())
+			break;
+		ModelInfo info;
+		if (is_partitioned) {
+			info.set_name = str;
+			in >> str;
+		}
+		info.name = str;
+		in >> info.df >> info.logl >> info.tree_len;
+		if (in.fail()) {
+			// Malformed or truncated record. Once failbit is set every further
+			// extraction is a no-op and eof is never reached, so continuing
+			// would loop forever; reject the file and drop the partial records.
+			infos.erase(infos.begin() + num_infos_on_entry, infos.end());
+			in.clear();
+			return false;
+		}
+		getline(in, str);
+        info.tree = "";
+        // the remainder of the line may be empty, so test before looking at its last character
+        if (!str.empty() && *str.rbegin() == ';') {
+            size_t pos = str.rfind('\t');
+            if (pos != string::npos)
+                info.tree = str.substr(pos+1);
+        }
+		infos.push_back(info);
+	}
+	// reaching the end of the file sets eofbit (and possibly failbit): reset for the caller
+	in.clear();
+	return true;
+}
+
+/**
+ * Check whether a .model file exists and can be parsed.
+ * @param model_file path of the model file
+ * @param is_partitioned true if the file stores one charset name per record
+ * @param infos (OUT) receives the parsed records
+ * @return false if the file does not exist or its content is rejected by the
+ *         stream overload of checkModelFile(); true otherwise. An I/O failure
+ *         is reported through outError().
+ */
+bool checkModelFile(string model_file, bool is_partitioned, vector<ModelInfo> &infos) {
+	if (!fileExists(model_file))
+		return false;
+	ifstream in;
+	try {
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(model_file.c_str());
+		if (!checkModelFile(in, is_partitioned, infos)) {
+			// plain return instead of throwing a bool for control flow
+			in.close();
+			return false;
+		}
+		// the stream overload reduced the mask to badbit; set the failbit again
+		in.exceptions(ios::failbit | ios::badbit);
+		in.close();
+	} catch (const ios::failure &) {
+		// caught by const reference: no copy and no slicing of the exception object
+		outError("Cannot read file ", model_file);
+	}
+	return true;
+}
+
+/**
+ * get the list of candidate models (substitution model, optional state
+ * frequency suffix, optional rate heterogeneity suffix) for an alignment
+ * @param params program parameters; model_set, model_subset, state_freq_set,
+ *        ratehet_set, model_name, num_rate_cats, min_rate_cats and
+ *        max_rate_cats are read here
+ * @param aln the alignment; its seq_type and frac_const_sites are consulted
+ * @param models (OUT) vectors of model names; names are appended
+ * @param separate_rate true to append the model names followed by the
+ *        non-empty rate suffixes as separate entries, false to append every
+ *        model name combined with every rate suffix
+ * @return maximum number of rate categories (1 if no model name applies, in
+ *         which case models is left untouched)
+ */
+int getModelList(Params &params, Alignment *aln, StrVector &models, bool separate_rate = false) {
+	StrVector model_names;
+    StrVector freq_names;
+	SeqType seq_type = aln->seq_type;
+    
+	// each test_options_* array is parallel to rate_options: true enables the
+	// rate heterogeneity suffix at the same index
+	const char *rate_options[]    = {  "", "+I", "+ASC", "+G", "+I+G", "+ASC+G", "+R", "+ASC+R"};
+	bool test_options_default[]   = {true, true,  false, true,   true,    false,false,    false};
+	bool test_options_morph[]     = {true,false,   true, true,  false,     true,false,    false};    
+	bool test_options_asc[]       ={false,false,   true,false,  false,     true,false,    false};
+	bool test_options_new[]       = {true, true,  false, true,   true,    false, true,    false};
+	bool test_options_morph_new[] = {true,false,   true, true,  false,     true, true,     true};
+	bool test_options_asc_new[]   ={false,false,   true,false,  false,     true,false,     true};
+    bool *test_options = test_options_default;
+//	bool test_options_codon[] =  {true,false,  false,false,  false,    false};
+	const int noptions = sizeof(rate_options) / sizeof(char*);
+	int i, j;
+    
+	// step 1: pick the base model names for the sequence type; params.model_set
+	// names a predefined set or holds a user-supplied list
+	if (seq_type == SEQ_BINARY) {
+		copyCString(bin_model_names, sizeof(bin_model_names) / sizeof(char*), model_names);
+	} else if (seq_type == SEQ_MORPH) {
+		copyCString(morph_model_names, sizeof(morph_model_names) / sizeof(char*), model_names);
+	} else if (seq_type == SEQ_DNA) {
+		if (params.model_set == NULL) {
+			copyCString(dna_model_names, sizeof(dna_model_names) / sizeof(char*), model_names);
+		} else if (strcmp(params.model_set, "partitionfinder") == 0 || strcmp(params.model_set, "phyml") == 0) {
+			copyCString(dna_model_names_old, sizeof(dna_model_names_old) / sizeof(char*), model_names);
+		} else if (strcmp(params.model_set, "raxml") == 0) {
+			copyCString(dna_model_names_rax, sizeof(dna_model_names_rax) / sizeof(char*), model_names);
+		} else if (strcmp(params.model_set, "mrbayes") == 0) {
+			copyCString(dna_model_names_mrbayes, sizeof(dna_model_names_mrbayes) / sizeof(char*), model_names);
+		} else {
+			convert_string_vec(params.model_set, model_names);
+		}
+	} else if (seq_type == SEQ_PROTEIN) {
+		if (params.model_set == NULL) {
+			copyCString(aa_model_names, sizeof(aa_model_names) / sizeof(char*), model_names);
+		} else if (strcmp(params.model_set, "partitionfinder") == 0 || strcmp(params.model_set, "phyml") == 0) {
+			copyCString(aa_model_names_phyml, sizeof(aa_model_names_phyml) / sizeof(char*), model_names);
+		} else if (strcmp(params.model_set, "raxml") == 0) {
+			copyCString(aa_model_names_rax, sizeof(aa_model_names_rax) / sizeof(char*), model_names);
+		} else if (strcmp(params.model_set, "mrbayes") == 0) {
+			copyCString(aa_model_names_mrbayes, sizeof(aa_model_names_mrbayes) / sizeof(char*), model_names);
+		} else {
+			convert_string_vec(params.model_set, model_names);
+		}
+        copyCString(aa_freq_names, sizeof(aa_freq_names)/sizeof(char*), freq_names);
+        
+        if (params.model_subset) {
+            // only the first 3 characters of the subset name are compared
+            StrVector submodel_names;
+            if (strncmp(params.model_subset, "nuclear", 3) == 0) {
+                copyCString(aa_model_names_nuclear, sizeof(aa_model_names_nuclear) / sizeof(char*), submodel_names);
+            } else if (strncmp(params.model_subset, "mitochondrial", 3) == 0) {
+                copyCString(aa_model_names_mitochondrial, sizeof(aa_model_names_mitochondrial) / sizeof(char*), submodel_names);
+            } else if (strncmp(params.model_subset, "chloroplast", 3) == 0) {
+                copyCString(aa_model_names_chloroplast, sizeof(aa_model_names_chloroplast) / sizeof(char*), submodel_names);
+            } else if (strncmp(params.model_subset, "viral",3) == 0) {
+                copyCString(aa_model_names_viral, sizeof(aa_model_names_viral) / sizeof(char*), submodel_names);
+            } else {
+                outError("Wrong -msub option");
+            }
+            // keep only the models that also appear in the chosen subset
+            for (i = 0; i < model_names.size(); i++) {
+                bool appear = false;
+                for (j = 0; j < submodel_names.size(); j++) 
+                    if (model_names[i] == submodel_names[j]) {
+                        appear = true;
+                        break;
+                    }
+                if (!appear) {
+                    model_names.erase(model_names.begin()+i);
+                    // index i now holds the next element: examine it again
+                    i--;
+                }
+            }
+        }
+
+	} else if (seq_type == SEQ_CODON) {
+		if (params.model_set == NULL) {
+			if (aln->isStandardGeneticCode())
+				copyCString(codon_model_names, sizeof(codon_model_names) / sizeof(char*), model_names);
+			else {
+                // other genetic codes: skip the models flagged in std_genetic_code
+                i = sizeof(codon_model_names) / sizeof(char*);
+                for (j = 0; j < i; j++)
+                    if (!std_genetic_code[j])
+                        model_names.push_back(codon_model_names[j]);
+//				copyCString(codon_model_names, sizeof(codon_model_names) / sizeof(char*) - 1, model_names);
+            }
+		} else
+			convert_string_vec(params.model_set, model_names);
+        copyCString(codon_freq_names, sizeof(codon_freq_names) / sizeof(char*), freq_names);
+	}
+    
+	if (model_names.empty()) 
+        return 1;
+    
+    // step 2: state frequency suffixes; a user-supplied list replaces the
+    // defaults, entries are upper-cased and given a leading '+' if missing
+    if (params.state_freq_set)
+        convert_string_vec(params.state_freq_set, freq_names);
+    for (j = 0; j < freq_names.size(); j++) {
+        std::transform(freq_names[j].begin(), freq_names[j].end(), freq_names[j].begin(), ::toupper);
+//        for (i = 0; i < freq_names.size(); i++)
+//            cout << " " << freq_names[i];
+//        cout << endl;
+        if (freq_names[j] != "" && freq_names[j][0] != '+')
+            freq_names[j] = "+" + freq_names[j];
+    }
+    
+    // combine every model name with every frequency suffix
+    if (freq_names.size() > 0) {
+        StrVector orig_model_names = model_names;
+        model_names.clear();
+        for (j = 0; j < orig_model_names.size(); j++) {
+            if (aln->seq_type == SEQ_CODON) {
+                SeqType seq_type;
+                int model_type = getSeqType(orig_model_names[j].c_str(), seq_type);
+                for (i = 0; i < freq_names.size(); i++) {
+                    // disallow MG+F
+                    if (freq_names[i] == "+F" && orig_model_names[j].find("MG") != string::npos)
+                        continue;
+                    if (freq_names[i] != "" || model_type == 2) // empirical model also allow ""
+                        model_names.push_back(orig_model_names[j] + freq_names[i]);
+                }
+            } else {
+                for (i = 0; i < freq_names.size(); i++)
+                    model_names.push_back(orig_model_names[j] + freq_names[i]);
+            }
+        }
+    }
+
+    // step 3: choose the default rate heterogeneity table from the data and
+    // from the "NEW" / "ASC" substrings of params.model_name
+    bool with_new = params.model_name.find("NEW") != string::npos;
+    bool with_asc = params.model_name.find("ASC") != string::npos;
+
+//	if (seq_type == SEQ_CODON) {
+//		for (i = 0; i < noptions; i++)
+//			test_options[i] = test_options_codon[i];
+//	} else 
+    if (seq_type == SEQ_MORPH || aln->frac_const_sites == 0.0) {
+        // morphological or SNP data: activate +ASC
+        if (with_new) {
+            if (with_asc)
+                test_options = test_options_asc_new;
+            else
+                test_options = test_options_morph_new;
+        } else if (with_asc)
+            test_options = test_options_asc;
+        else
+            test_options = test_options_morph;
+	} else {
+        // normal data, use +I instead
+        if (with_new) {
+            // change +I+G to +R
+            if (with_asc)
+                test_options = test_options_asc_new;
+            else
+                test_options = test_options_new;
+        } else if (with_asc) {
+            test_options = test_options_asc;
+        } else
+            test_options = test_options_default;
+    }
+    
+
+    StrVector ratehet;
+    int max_cats = params.num_rate_cats;
+
+	if (params.ratehet_set) {
+		// take the rate_options from user-specified models
+		convert_string_vec(params.ratehet_set, ratehet);
+		// a leading "default" entry is replaced by the enabled default suffixes
+		if (!ratehet.empty() && ratehet[0] == "default") {
+			ratehet.erase(ratehet.begin());
+			StrVector ratedef;
+			for (j = 0; j < noptions; j++)
+				if (test_options[j])
+					ratedef.push_back(rate_options[j]);
+			ratehet.insert(ratehet.begin(), ratedef.begin(), ratedef.end());
+		}
+        for (j = 0; j < ratehet.size(); j++) {
+            if (ratehet[j] != "" && ratehet[j][0] != '+')
+                ratehet[j] = "+" + ratehet[j];
+            if (ratehet[j] == "+E") // for equal rate model 
+                ratehet[j] = "";
+        }
+    } else {
+        for (j = 0; j < noptions; j++)
+            if (test_options[j])
+                ratehet.push_back(rate_options[j]);
+        
+    }
+    
+    size_t pos;
+
+    // expand the first "+R" entry that has no explicit category count into
+    // +R<min_rate_cats> .. +R<max_rate_cats>; only that one entry is expanded
+    for (j = 0; j < ratehet.size(); j++)
+        if ( (pos = ratehet[j].find("+R")) != string::npos && (pos >= ratehet[j].length()-2 || !isdigit(ratehet[j][pos+2]) )) {
+            string str = ratehet[j];
+            ratehet[j].insert(pos+2, convertIntToString(params.min_rate_cats));
+            max_cats = max(max_cats, params.max_rate_cats);
+            for (int k = params.min_rate_cats+1; k <= params.max_rate_cats; k++) {
+                ratehet.insert(ratehet.begin()+j+k-params.min_rate_cats, str.substr(0, pos+2) + convertIntToString(k) + str.substr(pos+2));
+            }
+            break;
+        }
+
+    // step 4: emit the final list
+    if (separate_rate) {
+        for (i = 0; i < model_names.size(); i++) 
+            models.push_back(model_names[i]);
+        for (j = 0; j < ratehet.size(); j++)
+            if (ratehet[j] != "")
+                models.push_back(ratehet[j]);
+    } else {
+        for (i = 0; i < model_names.size(); i++)
+            for (j = 0; j < ratehet.size(); j++) {
+                models.push_back(model_names[i] + ratehet[j]);
+            }
+    }
+    return max_cats;
+}
+
+/*
+bool checkPartitionModel(Params &params, PhyloSuperTree *in_tree, vector<ModelInfo> &model_info) {
+	return true;
+
+	PhyloSuperTree::iterator it;
+	int i, all_models = 0;
+	for (it = in_tree->begin(), i = 0; it != in_tree->end(); it++, i++) {
+		int count = 0;
+		for (vector<ModelInfo>::iterator mit = model_info.begin(); mit != model_info.end(); mit++)
+			if (mit->set_name == in_tree->part_info[i].name)
+				count++;
+		int nstates = (*it)->aln->num_states;
+		int num_models;
+		getModelList(params, nstates, num_models);
+		if (count != num_models * 4) {
+			return false;
+		}
+		all_models += count;
+	}
+	return true;
+}
+*/
+/**
+ * Replace the records of one partition inside model_info by new_info.
+ * The partition is identified by the set_name of the first entry of new_info
+ * and its records are assumed to be stored consecutively in model_info.
+ * If the old and new record counts agree the records are overwritten in
+ * place; otherwise the old records are erased and new_info is appended at
+ * the end of model_info.
+ * @param model_info (IN/OUT) all model information
+ * @param new_info records of a single partition; nothing is done if empty
+ */
+void replaceModelInfo(vector<ModelInfo> &model_info, vector<ModelInfo> &new_info) {
+	// new_info.front() below is undefined for an empty vector
+	if (new_info.empty())
+		return;
+	vector<ModelInfo>::iterator first_info = model_info.end(), last_info = model_info.end();
+	vector<ModelInfo>::iterator mit;
+	// scan through models for this partition, assuming the information occurs consecutively
+	for (mit = model_info.begin(); mit != model_info.end(); mit++)
+		if (mit->set_name == new_info.front().set_name) {
+			if (first_info == model_info.end()) first_info = mit;
+		} else if (first_info != model_info.end()) {
+			last_info = mit;
+			break;
+		}
+	// last_info never precedes first_info, so the difference is non-negative
+	if (new_info.size() == (size_t)(last_info - first_info)) {
+		// replace sub vector
+		for (mit = first_info; mit != last_info; mit++)
+			*mit = new_info[mit - first_info];
+	} else {
+		if (first_info != model_info.end()) {
+			model_info.erase(first_info, last_info);
+		}
+		model_info.insert(model_info.end(), new_info.begin(), new_info.end());
+	}
+}
+
+/**
+ * Copy the records belonging to one partition out of model_info.
+ * Scanning stops at the first record of another partition once
+ * part_model_info is non-empty.
+ * @param set_name name of the partition
+ * @param model_info all model information
+ * @param part_model_info (OUT) matching records are appended
+ */
+void extractModelInfo(string set_name, vector<ModelInfo> &model_info, vector<ModelInfo> &part_model_info) {
+	for (size_t idx = 0; idx < model_info.size(); idx++) {
+		if (model_info[idx].set_name == set_name) {
+			part_model_info.push_back(model_info[idx]);
+			continue;
+		}
+		if (!part_model_info.empty())
+			break;
+	}
+}
+
+/**
+ * Merge the partitions of a super tree according to gene_sets.
+ * For every gene set a new partition is created from the concatenated
+ * alignments and the extracted subtree of its members; the old partition
+ * trees and the old super alignment are deleted and replaced.
+ * @param super_tree (IN/OUT) the super tree whose partitions are merged
+ * @param gene_sets for every new partition, the indices of the old partitions it combines
+ * @param model_names model name of every new partition, indexed like gene_sets
+ */
+void mergePartitions(PhyloSuperTree* super_tree, vector<IntVector> &gene_sets, StrVector &model_names) {
+	cout << "Merging into " << gene_sets.size() << " partitions..." << endl;
+	SuperAlignment *super_aln = (SuperAlignment*)super_tree->aln;
+	vector<PartitionInfo> merged_info;
+	vector<PhyloTree*> merged_trees;
+
+	for (size_t set_id = 0; set_id < gene_sets.size(); set_id++) {
+		IntVector &members = gene_sets[set_id];
+		PartitionInfo info;
+		info.name = "";
+		info.position_spec = "";
+		info.aln_file = "";
+		info.sequence_type = "";
+		info.model_name = model_names[set_id];
+		// BIG FIX: make -spp works with -m TESTMERGE now!
+		info.part_rate = 1.0;
+		info.evalNNIs = 0;
+
+		for (size_t m = 0; m < members.size(); m++) {
+			const PartitionInfo &src = super_tree->part_info[members[m]];
+			if (m > 0) {
+				info.name += "+";
+				info.position_spec += ", ";
+			}
+			info.name += src.name;
+			info.position_spec += src.position_spec;
+			// members that disagree on the alignment file or on the sequence
+			// type are marked with the placeholder "__NA__"
+			if (!src.aln_file.empty()) {
+				if (info.aln_file.empty())
+					info.aln_file = src.aln_file;
+				else if (info.aln_file != src.aln_file)
+					info.aln_file = "__NA__";
+			}
+			if (!src.sequence_type.empty()) {
+				if (info.sequence_type.empty())
+					info.sequence_type = src.sequence_type;
+				else if (info.sequence_type != src.sequence_type)
+					info.sequence_type = "__NA__";
+			}
+		}
+		info.cur_ptnlh = NULL;
+		info.nniMoves[0].ptnlh = NULL;
+		info.nniMoves[1].ptnlh = NULL;
+		merged_info.push_back(info);
+
+		Alignment *merged_aln = super_aln->concatenateAlignments(members);
+		PhyloTree *merged_tree = super_tree->extractSubtree(members);
+		merged_tree->setAlignment(merged_aln);
+		merged_trees.push_back(merged_tree);
+	}
+
+	// delete the old partition trees, last one first
+	for (size_t t = super_tree->size(); t > 0; t--)
+		delete super_tree->at(t - 1);
+	super_tree->clear();
+	super_tree->insert(super_tree->end(), merged_trees.begin(), merged_trees.end());
+	super_tree->part_info = merged_info;
+
+	// rebuild the super alignment from the merged partitions
+	delete super_tree->aln;
+	super_tree->aln = new SuperAlignment(super_tree);
+}
+
+/**
+ * Write one tab-separated record of a tested model to the model file:
+ * optional set name, model name, df, log-likelihood, tree length, rate matrix
+ * entries (DNA only), state frequencies (DNA and binary only), gamma shape,
+ * proportion of invariable sites (each "NA" if not positive) and the tree string.
+ * If params.print_site_lh is set, the site log-likelihoods are appended to
+ * <out_prefix>.sitelh. If params.model_test_and_tree is set, the tree is
+ * deleted before returning.
+ * @param fmodel output stream of the model file
+ * @param params program parameters
+ * @param tree tree holding the model and rate parameters
+ * @param info information of the model to print
+ * @param set_name name of the partition, empty for none
+ */
+void printModelFile(ostream &fmodel, Params &params, PhyloTree *tree, ModelInfo &info, string &set_name) {
+	string sitelh_file = params.out_prefix;
+	sitelh_file += ".sitelh";
+	SeqType seq_type = tree->aln->seq_type;
+	if (tree->isSuperTree()) {
+		// a super tree reports the sequence type of its first partition
+		PhyloSuperTree *stree = (PhyloSuperTree*)tree;
+		seq_type = stree->front()->aln->seq_type;
+	}
+	const bool is_dna = (seq_type == SEQ_DNA);
+
+	fmodel.precision(4);
+	fmodel << fixed;
+	if (!set_name.empty())
+		fmodel << set_name << "\t";
+	fmodel << info.name << "\t" << info.df << "\t" << info.logl << "\t" << info.tree_len;
+
+	if (is_dna) {
+		const int num_rates = tree->getModel()->getNumRateEntries();
+		double *rates = new double[num_rates];
+		tree->getModel()->getRateMatrix(rates);
+		for (int r = 0; r < num_rates; r++)
+			fmodel << "\t" << rates[r];
+		delete [] rates;
+	}
+	if (is_dna || seq_type == SEQ_BINARY) {
+		const int num_states = is_dna ? 4 : 2;
+		double *state_freqs = new double[num_states];
+		tree->getModel()->getStateFrequency(state_freqs);
+		for (int s = 0; s < num_states; s++)
+			fmodel << "\t" << state_freqs[s];
+		delete [] state_freqs;
+	}
+
+	double alpha = tree->getRate()->getGammaShape();
+	fmodel << "\t";
+	if (alpha > 0)
+		fmodel << alpha;
+	else
+		fmodel << "NA";
+	fmodel << "\t";
+	double pinvar = tree->getRate()->getPInvar();
+	if (pinvar > 0)
+		fmodel << pinvar;
+	else
+		fmodel << "NA";
+	fmodel << "\t";
+	fmodel << info.tree;
+	fmodel << endl;
+	fmodel.precision(4);
+
+	if (params.print_site_lh)
+		printSiteLh(sitelh_file.c_str(), tree, NULL, true, info.name.c_str());
+	if (params.model_test_and_tree) {
+		delete tree;
+		tree = NULL;
+	}
+}
+
+/**
+ * select models for all partitions
+ * @param model_info (IN/OUT) all model information
+ * @note the function is declared void: no value is returned
+ */
+void testPartitionModel(Params &params, PhyloSuperTree* in_tree, vector<ModelInfo> &model_info, ostream &fmodel) {
+//    params.print_partition_info = true;
+//    params.print_conaln = true;
+	int i = 0;
+//	PhyloSuperTree::iterator it;
+	DoubleVector lhvec; // log-likelihood for each partition
+	DoubleVector dfvec; // number of parameters for each partition
+    DoubleVector lenvec; // tree length for each partition
+	double lhsum = 0.0;
+	int dfsum = 0;
+	int ssize = in_tree->getAlnNSite();
+	int num_model = 0;
+    int total_num_model = in_tree->size();
+	if (params.model_name.find("LINK") != string::npos || params.model_name.find("MERGE") != string::npos) {
+        double p = params.partfinder_rcluster/100.0;
+        total_num_model += round(in_tree->size()*(in_tree->size()-1)*p/2);
+        for (i = in_tree->size()-2; i > 0; i--)
+            total_num_model += max(round(i*p), 1.0);
+    }
+    
+    double start_time = getRealTime();
+    
+	cout << "Selecting individual models for " << in_tree->size() << " charsets using " << criterionName(params.model_test_criterion) << "..." << endl;
+	//cout << " No. AIC         AICc        BIC         Charset" << endl;
+	cout << " No. Model        Score       Charset" << endl;
+
+	lhvec.resize(in_tree->size());
+	dfvec.resize(in_tree->size());
+	lenvec.resize(in_tree->size());
+
+    double *dist = new double[in_tree->size()*(in_tree->size()-1)/2];
+    int *distID = new int[in_tree->size()*(in_tree->size()-1)/2];
+    
+    // sort partition by computational cost for OpenMP effciency
+	for (i = 0; i < in_tree->size(); i++) {
+        distID[i] = i;
+        Alignment *this_aln = in_tree->at(i)->aln;
+        // computation cost is proportional to #sequences, #patterns, and #states
+        dist[i] = -((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
+    }
+    
+    if (params.num_threads > 1)
+        quicksort(dist, 0, in_tree->size()-1, distID);
+
+
+#ifdef _OPENMP
+//        for (i = 0; i < in_tree->size(); i++)
+//            cout << distID[i]+1 << "\t" << in_tree->part_info[distID[i]].name << "\t" << -dist[i] << endl;
+#pragma omp parallel for private(i) schedule(dynamic) reduction(+: lhsum, dfsum)
+#endif
+	for (int j = 0; j < in_tree->size(); j++) {
+        i = distID[j];
+        PhyloTree *this_tree = in_tree->at(i);
+		// scan through models for this partition, assuming the information occurs consecutively
+		vector<ModelInfo> part_model_info;
+		extractModelInfo(in_tree->part_info[i].name, model_info, part_model_info);
+        stringstream this_fmodel;
+		// do the computation
+//#ifdef _OPENMP
+		string model = testModel(params, this_tree, part_model_info, this_fmodel, in_tree->part_info[i].name);
+//#else
+//		string model = testModel(params, this_tree, part_model_info, fmodel, in_tree->part_info[i].name);
+//#endif
+		double score = computeInformationScore(part_model_info[0].logl,part_model_info[0].df,
+				this_tree->getAlnNSite(),params.model_test_criterion);
+		in_tree->part_info[i].model_name = model;
+		lhsum += (lhvec[i] = part_model_info[0].logl);
+		dfsum += (dfvec[i] = part_model_info[0].df);
+        lenvec[i] = part_model_info[0].tree_len;
+
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+        {
+//#ifdef _OPENMP
+            fmodel << this_fmodel.str();
+//#endif
+            num_model++;
+            cout.width(4);
+            cout << right << num_model << " ";
+            cout.width(12);
+            cout << left << model << " ";
+            cout.width(11);
+            cout << score << " " << in_tree->part_info[i].name;
+            if (num_model >= 10) {
+                double remain_time = (total_num_model-num_model)*(getRealTime()-start_time)/num_model;
+                cout << "\t" << convert_time(getRealTime()-start_time) << " (" 
+                    << convert_time(remain_time) << " left)";
+            }
+            cout << endl;
+            replaceModelInfo(model_info, part_model_info);
+        }
+    }
+
+	if (params.model_name.find("LINK") == string::npos && params.model_name.find("MERGE") == string::npos) {
+		in_tree->printBestPartition((string(params.out_prefix) + ".best_scheme.nex").c_str());
+		in_tree->printBestPartitionRaxml((string(params.out_prefix) + ".best_scheme").c_str());
+        delete [] distID;
+        delete [] dist;
+		return;
+	}
+
+	/* following implements the greedy algorithm of Lanfear et al. (2012) */
+//	int part1, part2;
+	double inf_score = computeInformationScore(lhsum, dfsum, ssize, params.model_test_criterion);
+	cout << "Full partition model " << criterionName(params.model_test_criterion) << " score: " << inf_score << " (lh=" << lhsum << "  df=" << dfsum << ")" << endl;
+	SuperAlignment *super_aln = ((SuperAlignment*)in_tree->aln);
+	vector<IntVector> gene_sets;
+	gene_sets.resize(in_tree->size());
+	StrVector model_names;
+	model_names.resize(in_tree->size());
+	StrVector greedy_model_trees;
+	greedy_model_trees.resize(in_tree->size());
+	for (i = 0; i < gene_sets.size(); i++) {
+		gene_sets[i].push_back(i);
+		model_names[i] = in_tree->part_info[i].model_name;
+		greedy_model_trees[i] = in_tree->part_info[i].name;
+	}
+	cout << "Merging models to increase model fit (about " << total_num_model << " total partition schemes)..." << endl;
+	int prev_part = -1;
+	while (gene_sets.size() >= 2) {
+		// stepwise merging charsets
+		double new_score = DBL_MAX;
+		double opt_lh = 0.0;
+		int opt_df = 0;
+        double opt_treelen = 0.0;
+		int opt_part1 = 0, opt_part2 = 1;
+		IntVector opt_merged_set;
+		string opt_set_name = "";
+		string opt_model_name = "";
+        int num_pairs = 0;
+        // 2015-06-24: begin rcluster algorithm
+        // compute distance between gene_sets
+		for (int part1 = 0; part1 < gene_sets.size()-1; part1++)
+			for (int part2 = part1+1; part2 < gene_sets.size(); part2++)
+			if (super_aln->partitions[gene_sets[part1][0]]->seq_type == super_aln->partitions[gene_sets[part2][0]]->seq_type)
+            {
+				// only merge partitions of the same data type
+                dist[num_pairs] = fabs(lenvec[part1] - lenvec[part2]);
+                distID[num_pairs] = (part1 << 16) | part2;
+                num_pairs++;
+            }
+        if (num_pairs > 0 && params.partfinder_rcluster < 100) {
+            // sort distance
+            quicksort(dist, 0, num_pairs-1, distID);
+            num_pairs = (int)round(num_pairs * (params.partfinder_rcluster/100.0));
+            if (num_pairs <= 0) num_pairs = 1;
+        }
+        // sort partition by computational cost for OpenMP effciency
+        for (i = 0; i < num_pairs; i++) {
+            // computation cost is proportional to #sequences, #patterns, and #states
+            Alignment *this_aln = in_tree->at(distID[i] >> 16)->aln;
+            dist[i] = -((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
+            this_aln = in_tree->at(distID[i] & ((1<<16)-1))->aln;
+            dist[i] -= ((double)this_aln->getNSeq())*this_aln->getNPattern()*this_aln->num_states;
+        }
+        if (params.num_threads > 1)
+            quicksort(dist, 0, num_pairs-1, distID);
+
+#ifdef _OPENMP
+#pragma omp parallel for private(i) schedule(dynamic)
+#endif
+        for (int pair = 0; pair < num_pairs; pair++) {
+            int part1 = distID[pair] >> 16;
+            int part2 = distID[pair] & ((1<<16)-1);
+            assert(part1 != part2);
+            IntVector merged_set;
+            merged_set.insert(merged_set.end(), gene_sets[part1].begin(), gene_sets[part1].end());
+            merged_set.insert(merged_set.end(), gene_sets[part2].begin(), gene_sets[part2].end());
+            string set_name = "";
+            for (i = 0; i < merged_set.size(); i++) {
+                if (i > 0)
+                    set_name += "+";
+                set_name += in_tree->part_info[merged_set[i]].name;
+            }
+            string model = "";
+            double logl = 0.0;
+            int df = 0;
+            double treelen = 0.0;
+            bool done_before = false;
+            if (prev_part >= 0 && part1 != prev_part && part2 != prev_part) {
+                // if pairs previously examined, reuse the information
+                for (vector<ModelInfo>::iterator mit = model_info.begin(); mit != model_info.end(); mit++)
+                    if (mit->set_name == set_name) {
+                        model = mit->name;
+                        logl = mit->logl;
+                        df = mit->df;
+                        treelen = mit->tree_len;
+                        done_before = true;
+                        break;
+                    }
+            }
+            vector<ModelInfo> part_model_info;
+            stringstream this_fmodel;
+            if (!done_before) {
+                Alignment *aln = super_aln->concatenateAlignments(merged_set);
+                PhyloTree *tree = in_tree->extractSubtree(merged_set);
+                tree->setAlignment(aln);
+                extractModelInfo(set_name, model_info, part_model_info);
+//#ifdef _OPENMP
+                model = testModel(params, tree, part_model_info, this_fmodel, set_name);
+//#else
+//                model = testModel(params, tree, part_model_info, fmodel, set_name);
+//#endif
+                logl = part_model_info[0].logl;
+                df = part_model_info[0].df;
+                treelen = part_model_info[0].tree_len;
+                delete tree;
+                delete aln;
+            }
+            double lhnew = lhsum - lhvec[part1] - lhvec[part2] + logl;
+            int dfnew = dfsum - dfvec[part1] - dfvec[part2] + df;
+            double score = computeInformationScore(lhnew, dfnew, ssize, params.model_test_criterion);
+#ifdef _OPENMP
+#pragma omp critical
+#endif
+			{
+				if (!done_before) {
+//#ifdef _OPENMP
+                    fmodel << this_fmodel.str();
+//#endif
+					replaceModelInfo(model_info, part_model_info);
+                    num_model++;
+					cout.width(4);
+					cout << right << num_model << " ";
+					cout.width(12);
+					cout << left << model << " ";
+					cout.width(11);
+					cout << score << " " << set_name;
+                    if (num_model >= 10) {
+                        double remain_time = max(total_num_model-num_model, 0)*(getRealTime()-start_time)/num_model;
+                        cout << "\t" << convert_time(getRealTime()-start_time) << " (" 
+                            << convert_time(remain_time) << " left)";
+                    }
+                    cout << endl;
+				}
+				if (score < new_score) {
+					new_score = score;
+					opt_part1 = part1;
+					opt_part2 = part2;
+					opt_lh = logl;
+					opt_df = df;
+                    opt_treelen = treelen;
+					opt_merged_set = merged_set;
+					opt_set_name = set_name;
+					opt_model_name = model;
+				}
+			}
+
+        }
+		if (new_score >= inf_score) break;
+		inf_score = new_score;
+
+		lhsum = lhsum - lhvec[opt_part1] - lhvec[opt_part2] + opt_lh;
+		dfsum = dfsum - dfvec[opt_part1] - dfvec[opt_part2] + opt_df;
+		cout << "Merging " << opt_set_name << " with " << criterionName(params.model_test_criterion) << " score: " << new_score << " (lh=" << lhsum << "  df=" << dfsum << ")" << endl;
+		// change entry opt_part1 to merged one
+		gene_sets[opt_part1] = opt_merged_set;
+		lhvec[opt_part1] = opt_lh;
+		dfvec[opt_part1] = opt_df;
+        lenvec[opt_part1] = opt_treelen;
+		model_names[opt_part1] = opt_model_name;
+		greedy_model_trees[opt_part1] = "(" + greedy_model_trees[opt_part1] + "," + greedy_model_trees[opt_part2] + ")" +
+				convertIntToString(in_tree->size()-gene_sets.size()+1) + ":" + convertDoubleToString(inf_score);
+		prev_part = opt_part1;
+
+		// delete entry opt_part2
+		lhvec.erase(lhvec.begin() + opt_part2);
+		dfvec.erase(dfvec.begin() + opt_part2);
+		lenvec.erase(lenvec.begin() + opt_part2);
+		gene_sets.erase(gene_sets.begin() + opt_part2);
+		model_names.erase(model_names.begin() + opt_part2);
+		greedy_model_trees.erase(greedy_model_trees.begin() + opt_part2);
+	}
+
+	string final_model_tree;
+	if (greedy_model_trees.size() == 1)
+		final_model_tree = greedy_model_trees[0];
+	else {
+		final_model_tree = "(";
+		for (i = 0; i < greedy_model_trees.size(); i++) {
+			if (i>0)
+				final_model_tree += ",";
+			final_model_tree += greedy_model_trees[i];
+		}
+		final_model_tree += ")";
+	}
+
+	cout << "BEST-FIT PARTITION MODEL: " << endl;
+	cout << "  charpartition " << criterionName(params.model_test_criterion) << " = ";
+	for (i = 0; i < gene_sets.size(); i++) {
+		if (i > 0)
+			cout << ", ";
+		cout << model_names[i] << ":";
+		for (int j = 0; j < gene_sets[i].size(); j++) {
+			cout << " " << in_tree->part_info[gene_sets[i][j]].name;
+		}
+	}
+	cout << ";" << endl;
+	cout << "Agglomerative model selection: " << final_model_tree << endl;
+    
+    delete [] distID;
+    delete [] dist;
+	mergePartitions(in_tree, gene_sets, model_names);
+	in_tree->printBestPartition((string(params.out_prefix) + ".best_scheme.nex").c_str());
+	in_tree->printBestPartitionRaxml((string(params.out_prefix) + ".best_scheme").c_str());
+}
+
+string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_info, ostream &fmodel, string set_name, bool print_mem_usage) {
+	SeqType seq_type = in_tree->aln->seq_type;
+	if (in_tree->isSuperTree())
+		seq_type = ((PhyloSuperTree*)in_tree)->front()->aln->seq_type;
+	if (seq_type == SEQ_UNKNOWN)
+		outError("Unknown data for model testing.");
+	string fmodel_str = params.out_prefix;
+	fmodel_str += ".model";
+	string sitelh_file = params.out_prefix;
+	sitelh_file += ".sitelh";
+	in_tree->params = &params;
+	StrVector model_names;
+	int max_cats = getModelList(params, in_tree->aln, model_names, params.model_test_separate_rate);
+	int model;
+
+    if (print_mem_usage) {
+        uint64_t mem_size = in_tree->getMemoryRequired(max_cats);
+        cout << "NOTE: MODEL SELECTION REQUIRES " << (mem_size / 1024) / 1024
+                << " MB MEMORY!" << endl;
+        if (mem_size >= getMemorySize()) {
+            outError("Memory required exceeds your computer RAM size!");
+        }
+#ifdef BINARY32
+        if (mem_size >= 2000000000) {
+            outError("Memory required exceeds 2GB limit of 32-bit executable");
+        }
+#endif
+    }
+
+
+	string best_model = "";
+	/* first check the model file */
+
+	if (in_tree->isSuperTree()) {
+		// select model for each partition
+		PhyloSuperTree *stree = (PhyloSuperTree*)in_tree;
+		testPartitionModel(params, stree, model_info, fmodel);
+		string res_models = "";
+		for (vector<PartitionInfo>::iterator it = stree->part_info.begin(); it != stree->part_info.end(); it++) {
+			if (it != stree->part_info.begin()) res_models += ",";
+			res_models += (*it).model_name;
+		}
+		return res_models;
+	}
+
+	in_tree->optimize_by_newton = params.optimize_by_newton;
+	in_tree->setLikelihoodKernel(params.SSE);
+
+    int num_rate_classes = 3 + params.max_rate_cats;
+
+	RateHeterogeneity ** rate_class = new RateHeterogeneity*[num_rate_classes];
+	rate_class[0] = new RateHeterogeneity();
+	rate_class[1] = new RateInvar(-1, NULL);
+	rate_class[2] = new RateGamma(params.num_rate_cats, params.gamma_shape, params.gamma_median, NULL);
+	rate_class[3] = new RateGammaInvar(params.num_rate_cats, params.gamma_shape, params.gamma_median, -1, params.optimize_model_rate_joint, NULL);
+    for (model = 4; model < num_rate_classes; model++)
+        rate_class[model] = new RateFree(model-2, params.gamma_shape, "", false, params.optimize_alg, NULL);
+        
+	ModelGTR *subst_model = NULL;
+	if (seq_type == SEQ_BINARY)
+		subst_model = new ModelBIN("JC2", "", FREQ_UNKNOWN, "", in_tree);
+	else if (seq_type == SEQ_DNA)
+		subst_model = new ModelDNA("JC", "", FREQ_UNKNOWN, "", in_tree);
+	else if (seq_type == SEQ_PROTEIN)
+		subst_model = new ModelProtein("WAG", "", FREQ_UNKNOWN, "", in_tree);
+	else if (seq_type == SEQ_MORPH)
+		subst_model = new ModelMorphology("MK", "", FREQ_UNKNOWN, "", in_tree);
+	else if (seq_type == SEQ_CODON)
+		subst_model = new ModelCodon("GY", "", FREQ_UNKNOWN, "", in_tree, false);
+
+	assert(subst_model);
+
+	ModelFactory *model_fac = new ModelFactory();
+	model_fac->joint_optimize = params.optimize_model_rate_joint;
+
+	int ssize = in_tree->aln->getNSite(); // sample size
+	if (params.model_test_sample_size)
+		ssize = params.model_test_sample_size;
+	if (set_name == "") {
+		cout << "Testing " << model_names.size() << " "
+			<< ((seq_type == SEQ_BINARY) ? "binary" : ((seq_type == SEQ_DNA) ? "DNA" :
+				((seq_type == SEQ_PROTEIN) ? "protein": ((seq_type == SEQ_CODON) ? "codon": "morphological"))))
+			<< " models (sample size: " << ssize << ") ..." << endl;
+        if (params.model_test_and_tree == 0)
+            cout << " No. Model         -LnL         df  AIC          AICc         BIC" << endl;
+	}
+	if (params.print_site_lh) {
+		ofstream sitelh_out(sitelh_file.c_str());
+		if (!sitelh_out.is_open())
+			outError("Cannot write to file ", sitelh_file);
+		sitelh_out << model_names.size() << " " << in_tree->getAlnNSite() << endl;
+		sitelh_out.close();
+	}
+	vector<ModelInfo>::iterator it;
+	for (it = model_info.begin(); it != model_info.end(); it++) {
+		it->AIC_score = DBL_MAX;
+		it->AICc_score = DBL_MAX;
+		it->BIC_score = DBL_MAX;
+	}
+
+	int num_cat = 0;
+    int model_aic = -1, model_aicc = -1, model_bic = -1;
+    string prev_tree_string = "";
+    int prev_model_id = -1;
+    int skip_model = 0;
+
+	for (model = 0; model < model_names.size(); model++) {
+		//cout << model_names[model] << endl;
+        if (model_names[model][0] == '+') {
+            // now switching to test rate heterogeneity
+            if (best_model == "")
+                switch (params.model_test_criterion) {
+                case MTC_AIC:
+                    best_model = model_info[model_aic].name;
+                    break;
+                case MTC_AICC:
+                    best_model = model_info[model_aicc].name;
+                    break;
+                case MTC_BIC:
+                    best_model = model_info[model_bic].name;
+                    break;
+                default: assert(0);
+                }
+            model_names[model] = best_model + model_names[model];
+        }
+		PhyloTree *tree = in_tree;
+        
+        if (model_names[model].find("+ASC") != string::npos) {
+            model_fac->unobserved_ptns = in_tree->aln->getUnobservedConstPatterns();
+            if (model_fac->unobserved_ptns.size() == 0) {
+                cout.width(3);
+                cout << right << model+1 << "  ";
+                cout.width(13);
+                cout << left << model_names[model] << " ";                
+                cout << "Skipped since +ASC is not applicable" << endl;
+                continue;
+            }
+            tree->aln->buildSeqStates(true);
+            if (model_fac->unobserved_ptns.size() < tree->aln->getNumNonstopCodons())
+                outError("Invalid use of +ASC because constant patterns are observed in the alignment");
+        } else {
+            model_fac->unobserved_ptns = "";
+            tree->aln->buildSeqStates(false);
+        }
+        // initialize tree
+        // initialize model
+        subst_model->setTree(tree);
+        StateFreqType freq_type = FREQ_UNKNOWN;
+        if (model_names[model].find("+F1X4") != string::npos)
+            freq_type = FREQ_CODON_1x4;
+        else if (model_names[model].find("+F3X4C") != string::npos)
+            freq_type = FREQ_CODON_3x4C;
+        else if (model_names[model].find("+F3X4") != string::npos)
+            freq_type = FREQ_CODON_3x4;
+        else if (model_names[model].find("+FQ") != string::npos)
+            freq_type = FREQ_EQUAL;
+        else if (model_names[model].find("+F") != string::npos)
+            freq_type = FREQ_EMPIRICAL;
+            
+        subst_model->init(model_names[model].substr(0, model_names[model].find('+')).c_str(), "", freq_type, "");
+        tree->params = &params;
+
+        tree->setModel(subst_model);
+        // initialize rate
+        size_t pos;
+        int ncat = 0;
+        if ((pos = model_names[model].find("+R")) != string::npos) {
+            ncat = params.num_rate_cats;
+            if (model_names[model].length() > pos+2 && isdigit(model_names[model][pos+2])) {
+                ncat = convert_int(model_names[model].c_str() + pos+2);
+//                tree->getRate()->setNCategory(ncat);
+            }
+            if (ncat <= 1) outError("Number of rate categories for " + model_names[model] + " is <= 1");
+            if (ncat > params.max_rate_cats)
+                outError("Number of rate categories for " + model_names[model] + " exceeds " + convertIntToString(params.max_rate_cats));
+            tree->setRate(rate_class[2+ncat]);
+        } else if (model_names[model].find("+I") != string::npos && (pos = model_names[model].find("+G")) != string::npos) {
+            tree->setRate(rate_class[3]);
+            if (model_names[model].length() > pos+2 && isdigit(model_names[model][pos+2])) {
+                int ncat = convert_int(model_names[model].c_str() + pos+2);
+                if (ncat < 1) outError("Wrong number of category for +G in " + model_names[model]);
+                tree->getRate()->setNCategory(ncat);
+            }
+        } else if ((pos = model_names[model].find("+G")) != string::npos) {
+            tree->setRate(rate_class[2]);
+            if (model_names[model].length() > pos+2 && isdigit(model_names[model][pos+2])) {
+                ncat = convert_int(model_names[model].c_str() + pos+2);
+                if (ncat < 1) outError("Wrong number of category for +G in " + model_names[model]);
+                tree->getRate()->setNCategory(ncat);
+            }
+        } else if (model_names[model].find("+I") != string::npos)
+            tree->setRate(rate_class[1]);
+        else
+            tree->setRate(rate_class[0]);
+
+        tree->getRate()->setTree(tree);
+
+        // initialize model factory
+        model_fac->model = subst_model;
+        model_fac->site_rate = tree->getRate();
+        tree->setModelFactory(model_fac);
+
+        tree->clearAllPartialLH();
+
+
+		// optimize model parameters
+		ModelInfo info;        
+		info.set_name = set_name;
+		info.df = tree->getModelFactory()->getNParameters();
+		info.name = tree->getModelName();
+		int model_id = -1;
+        if (skip_model) {
+            assert(prev_model_id>=0);
+            size_t pos_r = info.name.find("+R");
+            if (pos_r == string::npos || info.name.substr(0, pos_r) != model_info[prev_model_id].name.substr(0, pos_r))
+                skip_model = 0;
+        }
+		for (int i = 0; i < model_info.size(); i++)
+			if (info.name == model_info[i].name) {
+				model_id = i;
+				if (info.df != model_info[i].df)
+					outError("Inconsistent model file " + fmodel_str + ", please rerun using -mredo option");
+				break;
+			}
+		if (model_id >= 0) {
+			info.logl = model_info[model_id].logl;
+            info.tree_len = model_info[model_id].tree_len;
+            info.tree = model_info[model_id].tree;
+            prev_tree_string = model_info[model_id].tree;
+        } else if (skip_model) {
+            info.logl = model_info[prev_model_id].logl;
+            info.tree_len = model_info[prev_model_id].tree_len;
+//            info.tree = model_info[prev_model_id].tree;
+            prev_tree_string = model_info[prev_model_id].tree;
+//            cout << "Skipped " << info.name << endl;
+		} else {
+            if (params.model_test_and_tree) {
+                string original_model = params.model_name;
+                params.model_name = model_names[model];
+                char *orig_user_tree = params.user_file;
+                string new_user_tree = (string)params.out_prefix+".treefile";
+                if (params.model_test_and_tree == 1 && model>0 && fileExists(new_user_tree)) {
+                    params.user_file = (char*)new_user_tree.c_str();
+                }
+                if (in_tree->isSuperTree()) {
+                    outError("-mtree option is not supported for partition model");
+                }
+                IQTree *iqtree = new IQTree(in_tree->aln);
+                cout << endl << "===> Testing model " << model+1 << ": " << params.model_name << endl;
+                runTreeReconstruction(params, original_model, *iqtree, model_info);
+                info.logl = iqtree->computeLikelihood();
+                info.tree_len = iqtree->treeLength();
+//                info.tree = iqtree->getTreeString();
+                params.model_name = original_model;
+                params.user_file = orig_user_tree;
+                tree = iqtree;
+            } else {
+                if (tree->getRate()->getNRate() > num_cat) {
+                    tree->deleteAllPartialLh();
+                    num_cat = tree->getRate()->getNRate();
+                    tree->initializeAllPartialLh();
+                }
+                if (prev_tree_string != "") {
+                    tree->readTreeString(prev_tree_string);
+                }
+                prev_tree_string = "";
+                info.logl = tree->getModelFactory()->optimizeParameters(false, false, TOL_LIKELIHOOD_MODELTEST, TOL_GRADIENT_MODELTEST);
+                info.tree_len = tree->treeLength();
+                if (prev_model_id >= 0) {
+                    // check stop criterion for +R
+                    size_t prev_pos_r = model_info[prev_model_id].name.find("+R");
+                    size_t pos_r = info.name.find("+R");
+                    if ( prev_pos_r != string::npos &&  pos_r != string::npos && 
+                        model_info[prev_model_id].name.substr(0,prev_pos_r) == info.name.substr(0, pos_r) && 
+                        info.logl < model_info[prev_model_id].logl) 
+                    {
+                        if (verbose_mode >= VB_MED)
+                            cout << "reoptimizing from previous parameters of +R...." << endl;
+                        dynamic_cast<RateFree*>(rate_class[2+ncat])->setRateAndProp(dynamic_cast<RateFree*>(rate_class[1+ncat]));
+                        info.logl = tree->getModelFactory()->optimizeParameters(false, false, TOL_LIKELIHOOD_MODELTEST, TOL_GRADIENT_MODELTEST);
+                        info.tree_len = tree->treeLength();                        
+                    }
+                }
+//                info.tree = tree->getTreeString();
+            }
+			// print information to .model file
+            info.tree = tree->getTreeString();
+            printModelFile(fmodel, params, tree, info, set_name);
+		}
+		computeInformationScores(info.logl, info.df, ssize, info.AIC_score, info.AICc_score, info.BIC_score);
+        if (prev_model_id >= 0) {
+            // check stop criterion for +R
+            size_t prev_pos_r = model_info[prev_model_id].name.find("+R");
+            size_t pos_r = info.name.find("+R");
+            if ( prev_pos_r != string::npos &&  pos_r != string::npos && 
+            model_info[prev_model_id].name.substr(0,prev_pos_r) == info.name.substr(0, pos_r)) {
+                switch (params.model_test_stop_rule) {
+                case MTC_ALL:
+                    if (info.AIC_score > model_info[prev_model_id].AIC_score && info.AICc_score > model_info[prev_model_id].AICc_score &&
+                        info.BIC_score > model_info[prev_model_id].BIC_score) {
+                        // skip remaining model
+                        skip_model++;
+                    }
+                    break;
+                case MTC_AIC:
+                    if (info.AIC_score > model_info[prev_model_id].AIC_score) {
+                        // skip remaining model
+                        skip_model++;
+                    }
+                    break;
+                case MTC_AICC:
+                    if (info.AICc_score > model_info[prev_model_id].AICc_score) {
+                        // skip remaining model
+                        skip_model++;
+                    }
+                    break;
+                case MTC_BIC:
+                    if (info.BIC_score > model_info[prev_model_id].BIC_score) {
+                        // skip remaining model
+                        skip_model++;
+                    }
+                    break;
+                }
+            }
+        }
+        if (skip_model > 1)
+            info.AIC_score = DBL_MAX;
+        
+		if (model_id >= 0) {
+			model_info[model_id] = info;
+		} else {
+			model_info.push_back(info);
+            model_id = model_info.size()-1;
+		}
+		if (model_aic < 0 || model_info[model_id].AIC_score < model_info[model_aic].AIC_score)
+			model_aic = model_id;
+		if (model_aicc < 0 || model_info[model_id].AICc_score < model_info[model_aicc].AICc_score)
+			model_aicc = model_id;
+		if (model_bic < 0 || model_info[model_id].BIC_score < model_info[model_bic].BIC_score)
+			model_bic = model_id;
+        
+        in_tree->setModel(NULL);
+        in_tree->setModelFactory(NULL);
+        in_tree->setRate(NULL);
+
+        prev_model_id = model_id;
+
+		if (set_name != "") continue;
+
+		cout.width(3);
+		cout << right << model+1 << "  ";
+		cout.width(13);
+		cout << left << info.name << " ";
+        
+        if (skip_model > 1) {
+            cout << "Skipped " << endl;
+            continue;
+        }
+        
+		cout.precision(3);
+		cout << fixed;
+		cout.width(12);
+		cout << -info.logl << " ";
+		cout.width(3);
+		cout << info.df << " ";
+		cout.width(12);
+		cout << info.AIC_score << " ";
+		cout.width(12);
+		cout << info.AICc_score << " " << info.BIC_score;
+		cout << endl;
+
+
+	}
+
+    if (model_bic < 0) 
+        outError("No models were examined! Please check messages above");
+
+	//cout.unsetf(ios::fixed);
+	/*
+	for (it = model_info.begin(); it != model_info.end(); it++)
+		computeInformationScores(it->logl, it->df, ssize, it->AIC_score, it->AICc_score, it->BIC_score);
+*/
+//	for (it = model_info.begin(), model = 0; it != model_info.end(); it++, model++) {
+//		if ((*it).AIC_score < model_info[model_aic].AIC_score)
+//			model_aic = model;
+//		if ((*it).AICc_score < model_info[model_aicc].AICc_score)
+//			model_aicc = model;
+//		if ((*it).BIC_score < model_info[model_bic].BIC_score)
+//			model_bic = model;
+//	}
+	if (set_name == "") {
+		cout << "Akaike Information Criterion:           " << model_info[model_aic].name << endl;
+		cout << "Corrected Akaike Information Criterion: " << model_info[model_aicc].name << endl;
+		cout << "Bayesian Information Criterion:         " << model_info[model_bic].name << endl;
+	} else {
+		/*
+		cout.width(11);
+		cout << left << model_info[model_aic].name << " ";
+		cout.width(11);
+		cout << left << model_info[model_aicc].name << " ";
+		cout.width(11);
+		cout << left << model_info[model_bic].name << " ";
+		cout << set_name;
+		*/
+	}
+
+	/* computing model weights */
+	double AIC_sum = 0.0, AICc_sum = 0.0, BIC_sum = 0.0;
+	for (it = model_info.begin(); it != model_info.end(); it++) {
+		it->AIC_weight = exp(-0.5*(it->AIC_score-model_info[model_aic].AIC_score));
+		it->AICc_weight = exp(-0.5*(it->AICc_score-model_info[model_aicc].AICc_score));
+		it->BIC_weight = exp(-0.5*(it->BIC_score-model_info[model_bic].BIC_score));
+		it->AIC_conf = false;
+		it->AICc_conf = false;
+		it->BIC_conf = false;
+		AIC_sum += it->AIC_weight;
+		AICc_sum += it->AICc_weight;
+		BIC_sum += it->BIC_weight;
+	}
+	for (it = model_info.begin(); it != model_info.end(); it++) {
+		it->AIC_weight /= AIC_sum;
+		it->AICc_weight /= AICc_sum;
+		it->BIC_weight /= BIC_sum;
+	}
+
+	int *model_rank = new int[model_info.size()];
+	double *scores = new double[model_info.size()];
+
+	/* compute confidence set for BIC */
+    AIC_sum = 0.0;
+    AICc_sum = 0.0;
+    BIC_sum = 0.0;
+	for (model = 0; model < model_info.size(); model++)
+		scores[model] = model_info[model].BIC_score;
+	sort_index(scores, scores+model_info.size(), model_rank);
+	for (model = 0; model < model_info.size(); model++) {
+		model_info[model_rank[model]].BIC_conf = true;
+		BIC_sum += model_info[model_rank[model]].BIC_weight;
+		if (BIC_sum > 0.95) break;
+	}
+	/* compute confidence set for AIC */
+	for (model = 0; model < model_info.size(); model++)
+		scores[model] = model_info[model].AIC_score;
+	sort_index(scores, scores+model_info.size(), model_rank);
+	for (model = 0; model < model_info.size(); model++) {
+		model_info[model_rank[model]].AIC_conf = true;
+		AIC_sum += model_info[model_rank[model]].AIC_weight;
+		if (AIC_sum > 0.95) break;
+	}
+
+	/* compute confidence set for AICc */
+	for (model = 0; model < model_info.size(); model++)
+		scores[model] = model_info[model].AICc_score;
+	sort_index(scores, scores+model_info.size(), model_rank);
+	for (model = 0; model < model_info.size(); model++) {
+		model_info[model_rank[model]].AICc_conf = true;
+		AICc_sum += model_info[model_rank[model]].AICc_weight;
+		if (AICc_sum > 0.95) break;
+	}
+
+    string best_tree; // BQM 2015-07-21: With Lars find best model
+	/* sort models by their scores */
+	switch (params.model_test_criterion) {
+	case MTC_AIC:
+		for (model = 0; model < model_info.size(); model++)
+			scores[model] = model_info[model].AIC_score;
+		best_model = model_info[model_aic].name;
+        best_tree = model_info[model_aic].tree;
+		break;
+	case MTC_AICC:
+		for (model = 0; model < model_info.size(); model++)
+			scores[model] = model_info[model].AICc_score;
+		best_model = model_info[model_aicc].name;
+        best_tree = model_info[model_aicc].tree;
+		break;
+	case MTC_BIC:
+		for (model = 0; model < model_info.size(); model++)
+			scores[model] = model_info[model].BIC_score;
+		best_model = model_info[model_bic].name;
+        best_tree = model_info[model_bic].tree;
+		break;
+    default: assert(0);
+	}
+	sort_index(scores, scores + model_info.size(), model_rank);
+
+	vector<ModelInfo> sorted_info;
+	for (model = 0; model < model_info.size(); model++)
+		sorted_info.push_back(model_info[model_rank[model]]);
+	model_info = sorted_info;
+
+	delete [] model_rank;
+	delete [] scores;
+
+	delete model_fac;
+	delete subst_model;
+	for (int rate_type = num_rate_classes-1; rate_type >= 0; rate_type--) {
+		delete rate_class[rate_type];
+    }
+    delete [] rate_class;
+//	delete tree_hetero;
+//	delete tree_homo;
+	in_tree->deleteAllPartialLh();
+    
+    // BQM 2015-07-21 with Lars: load the best_tree
+//	if (params.model_test_and_tree)
+		in_tree->readTreeString(best_tree);
+
+    
+	if (set_name == "") {
+		cout << "Best-fit model: " << best_model << " chosen according to " << 
+            ((params.model_test_criterion == MTC_BIC) ? "BIC" :
+			((params.model_test_criterion == MTC_AIC) ? "AIC" : "AICc")) << endl;
+	}
+	if (params.print_site_lh)
+		cout << "Site log-likelihoods per model printed to " << sitelh_file << endl;
+	return best_model;
+}
+
+/**
+ * Count the trees stored in a tree file, optionally detecting repeated topologies.
+ * @param filename tree file to read
+ * @param rooted forwarded to readTree() for every tree that is parsed
+ * @param tree scratch tree used for parsing; its nodes are freed and replaced
+ * @param distinct_ids (OUT) one entry is appended per tree in the file: -1 for a
+ *        topology seen for the first time (and for every tree when exclude_duplicate
+ *        is false), otherwise the 0-based index of the earlier identical tree
+ * @param exclude_duplicate true to parse each tree and compare topologies,
+ *        false to merely skip ahead to each terminating ';'
+ * @return number of distinct topologies if exclude_duplicate is set,
+ *         otherwise the number of trees in the file
+ */
+int countDistinctTrees(const char *filename, bool rooted, IQTree *tree, IntVector &distinct_ids, bool exclude_duplicate) {
+	StringIntMap treels;
+	try {
+		ifstream in;
+		in.exceptions(ios::failbit | ios::badbit);
+		in.open(filename);
+		// remove the failbit
+		in.exceptions(ios::badbit);
+		int tree_id;
+		for (tree_id = 0; !in.eof(); tree_id++) {
+			if (exclude_duplicate) {
+				tree->freeNode();
+				tree->readTree(in, rooted);
+				tree->setAlignment(tree->aln);
+				tree->setRootNode((char*)tree->aln->getSeqName(0).c_str());
+				// the sorted, ID-based string serves as the lookup key for this topology;
+				// serialize it once and reuse it for lookup and insertion
+				ostringstream ostr;
+				tree->printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
+				const string topology = ostr.str();
+				StringIntMap::iterator it = treels.find(topology);
+				if (it != treels.end()) { // already in treels
+					distinct_ids.push_back(it->second);
+				} else {
+					distinct_ids.push_back(-1);
+					treels[topology] = tree_id;
+				}
+			} else {
+				// ignore tree
+				char ch;
+				do {
+					in >> ch;
+				} while (!in.eof() && ch != ';');
+				distinct_ids.push_back(-1);
+			}
+			// peek for another tree without letting a clean end-of-file throw
+			char ch;
+			in.exceptions(ios::goodbit);
+			(in) >> ch;
+			if (in.eof()) break;
+			in.unget();
+			in.exceptions(ios::failbit | ios::badbit);
+
+		}
+		in.close();
+	} catch (const ios::failure &) {
+		// caught by const reference: no copy and no slicing of the exception object
+		outError("Cannot read file ", filename);
+	}
+	if (exclude_duplicate)
+		return treels.size();
+	else
+		return distinct_ids.size();
+}
+
+//const double TOL_RELL_SCORE = 0.01;
+
+/**
+ * Evaluate every distinct tree in params.treeset_file and, if requested, run the
+ * bootstrap-based topology tests on them.
+ *
+ * Returns immediately when params.treeset_file is NULL. Each evaluated tree is
+ * written to <out_prefix>.trees; log-likelihoods go to <out_prefix>.treelh when
+ * params.print_tree_lh is set, and site likelihoods to <out_prefix>.sitelh when
+ * params.print_site_lh is set. The tests (RELL, KH, SH, the weighted KH/SH variants
+ * when params.do_weighted_test is set, and ELW) are run only when
+ * params.topotest_replicates is non-zero and more than one tree is evaluated.
+ *
+ * @param params program parameters
+ * @param tree scratch tree; its nodes are freed and replaced by each tree read
+ * @param info (OUT) resized to the number of evaluated trees, one entry per tree
+ * @param distinct_ids (OUT) filled by countDistinctTrees(), one entry per tree in the file
+ */
+void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVector &distinct_ids)
+{
+	if (!params.treeset_file)
+		return;
+	cout << endl;
+	//MTreeSet trees(params.treeset_file, params.is_rooted, params.tree_burnin, params.tree_max_count);
+	cout << "Reading trees in " << params.treeset_file << " ..." << endl;
+	// first pass over the file: count the trees and mark duplicates in distinct_ids
+	int ntrees = countDistinctTrees(params.treeset_file, params.is_rooted, tree, distinct_ids, params.distinct_trees);
+	if (ntrees < distinct_ids.size()) {
+		cout << "WARNING: " << distinct_ids.size() << " trees detected but only " << ntrees << " distinct trees will be evaluated" << endl;
+	} else {
+		cout << ntrees << (params.distinct_trees ? " distinct" : "") << " trees detected" << endl;
+	}
+	if (ntrees == 0) return;
+	// second pass: the file is reopened and the trees are read one by one below
+	// NOTE(review): the stream state is not checked after opening
+	ifstream in(params.treeset_file);
+
+	//if (trees.size() == 1) return;
+	//string tree_file = params.treeset_file;
+	string tree_file = params.out_prefix;
+	tree_file += ".trees";
+	ofstream treeout;
+	//if (!params.fixed_branch_length) {
+		treeout.open(tree_file.c_str());
+	//}
+	string score_file = params.out_prefix;
+	score_file += ".treelh";
+	ofstream scoreout;
+	if (params.print_tree_lh)
+		scoreout.open(score_file.c_str());
+	string site_lh_file = params.out_prefix;
+	site_lh_file += ".sitelh";
+	if (params.print_site_lh) {
+		// write the header line now; printSiteLh() is called in append mode per tree below
+		ofstream site_lh_out(site_lh_file.c_str());
+		site_lh_out << ntrees << " " << tree->getAlnNSite() << endl;
+		site_lh_out.close();
+	}
+
+	double time_start = getCPUTime();
+
+	// work arrays for the topology tests; they stay NULL when no test is run
+	int *boot_samples = NULL;
+	int boot;
+	//double *saved_tree_lhs = NULL;
+	double *tree_lhs = NULL;
+	double *pattern_lh = NULL;
+	double *pattern_lhs = NULL;
+	double *orig_tree_lh = NULL;
+	double *max_lh = NULL;
+	double *lhdiff_weights = NULL;
+	int nptn = tree->getAlnNPattern();
+	if (params.topotest_replicates && ntrees > 1) {
+		// estimate of the memory needed by the arrays allocated in this block
+		size_t mem_size = (size_t)params.topotest_replicates*nptn*sizeof(int) +
+				ntrees*params.topotest_replicates*sizeof(double) +
+				(nptn + ntrees*3 + params.topotest_replicates*2)*sizeof(double) +
+				ntrees*sizeof(TreeInfo) +
+				params.do_weighted_test*(ntrees * nptn * sizeof(double) + ntrees*ntrees*sizeof(double));
+		cout << "Note: " << ((double)mem_size/1024)/1024 << " MB of RAM required!" << endl;
+		if (mem_size > getMemorySize()-100000)
+			outWarning("The required memory does not fit in RAM!");
+		cout << "Creating " << params.topotest_replicates << " bootstrap replicates..." << endl;
+		// NOTE(review): plain operator new normally throws instead of returning NULL,
+		// so the NULL checks below presumably never fire -- confirm no nothrow/override is in play
+		if (!(boot_samples = new int [params.topotest_replicates*nptn]))
+			outError(ERR_NO_MEMORY);
+		// boot_samples holds one row of nptn entries per bootstrap replicate
+		for (boot = 0; boot < params.topotest_replicates; boot++)
+			tree->aln->createBootstrapAlignment(boot_samples + (boot*nptn), params.bootstrap_spec);
+		//if (!(saved_tree_lhs = new double [ntrees * params.topotest_replicates]))
+		//	outError(ERR_NO_MEMORY);
+		// tree_lhs holds one row of topotest_replicates entries per evaluated tree
+		if (!(tree_lhs = new double [ntrees * params.topotest_replicates]))
+			outError(ERR_NO_MEMORY);
+		if (params.do_weighted_test) {
+			if (!(lhdiff_weights = new double [ntrees * ntrees]))
+				outError(ERR_NO_MEMORY);
+			if (!(pattern_lhs = new double[ntrees* nptn]))
+				outError(ERR_NO_MEMORY);
+		}
+		if (!(pattern_lh = new double[nptn]))
+			outError(ERR_NO_MEMORY);
+		if (!(orig_tree_lh = new double[ntrees]))
+			outError(ERR_NO_MEMORY);
+		if (!(max_lh = new double[params.topotest_replicates]))
+			outError(ERR_NO_MEMORY);
+	}
+	// tree_index runs over all trees in the file, tid only over the evaluated (distinct) ones
+	int tree_index, tid, tid2;
+	info.resize(ntrees);
+	//for (MTreeSet::iterator it = trees.begin(); it != trees.end(); it++, tree_index++) {
+	for (tree_index = 0, tid = 0; tree_index < distinct_ids.size(); tree_index++) {
+
+		cout << "Tree " << tree_index + 1;
+		if (distinct_ids[tree_index] >= 0) {
+			cout << " / identical to tree " << distinct_ids[tree_index]+1 << endl;
+			// ignore tree
+			char ch;
+			do {
+				in >> ch;
+			} while (!in.eof() && ch != ';');
+			continue;
+		}
+		tree->freeNode();
+		tree->readTree(in, params.is_rooted);
+		tree->setAlignment(tree->aln);
+		if ((tree->sse == LK_EIGEN || tree->sse == LK_EIGEN_SSE) && !tree->isBifurcating()) {
+			cout << "NOTE: Changing to old kernel as user tree is multifurcating" << endl;
+			if (tree->sse == LK_EIGEN)
+				tree->changeLikelihoodKernel(LK_NORMAL);
+			else
+				tree->changeLikelihoodKernel(LK_SSE);
+		}
+
+		tree->initializeAllPartialLh();
+		tree->fixNegativeBranch(false);
+		if (tree->isSuperTree())
+			((PhyloSuperTree*) tree)->mapTrees();
+		// score the tree: optimize branch lengths unless they are fixed by the user
+		if (!params.fixed_branch_length) {
+			tree->setCurScore(tree->optimizeAllBranches(100, 0.001));
+		} else {
+			tree->setCurScore(tree->computeLikelihood());
+		}
+		treeout << "[ tree " << tree_index+1 << " lh=" << tree->getCurScore() << " ]";
+		tree->printTree(treeout);
+		treeout << endl;
+		if (params.print_tree_lh)
+			scoreout << tree->getCurScore() << endl;
+
+		cout << " / LogL: " << tree->getCurScore() << endl;
+
+		// pattern_lh is only allocated when the topology tests are enabled
+		if (pattern_lh) {
+			double curScore = tree->getCurScore();
+			tree->computePatternLikelihood(pattern_lh, &curScore);
+			if (params.do_weighted_test)
+				memcpy(pattern_lhs + tid*nptn, pattern_lh, nptn*sizeof(double));
+		}
+		if (params.print_site_lh) {
+			string tree_name = "Tree" + convertIntToString(tree_index+1);
+			printSiteLh(site_lh_file.c_str(), tree, pattern_lh, true, tree_name.c_str());
+		}
+		info[tid].logl = tree->getCurScore();
+
+		if (!params.topotest_replicates || ntrees <= 1) {
+			tid++;
+			continue;
+		}
+		// now compute RELL scores
+		// (per replicate: sum of pattern log-likelihoods weighted by the resampled pattern counts)
+		orig_tree_lh[tid] = tree->getCurScore();
+		double *tree_lhs_offset = tree_lhs + (tid*params.topotest_replicates);
+		for (boot = 0; boot < params.topotest_replicates; boot++) {
+			double lh = 0.0;
+			int *this_boot_sample = boot_samples + (boot*nptn);
+			for (int ptn = 0; ptn < nptn; ptn++)
+				lh += pattern_lh[ptn] * this_boot_sample[ptn];
+			tree_lhs_offset[boot] = lh;
+		}
+		tid++;
+	}
+
+	assert(tid == ntrees);
+
+	if (params.topotest_replicates && ntrees > 1) {
+		double *tree_probs = new double[ntrees];
+		memset(tree_probs, 0, ntrees*sizeof(double));
+		int *tree_ranks = new int[ntrees];
+
+		/* perform RELL BP method */
+		cout << "Performing RELL test..." << endl;
+		// per replicate: index of the winning tree, its score, and how many trees tied for it
+		int *maxtid = new int[params.topotest_replicates];
+		double *maxL = new double[params.topotest_replicates];
+		int *maxcount = new int[params.topotest_replicates];
+		memset(maxtid, 0, params.topotest_replicates*sizeof(int));
+		memcpy(maxL, tree_lhs, params.topotest_replicates*sizeof(double));
+		for (boot = 0; boot < params.topotest_replicates; boot++)
+			maxcount[boot] = 1;
+		for (tid = 1; tid < ntrees; tid++) {
+			double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
+			for (boot = 0; boot < params.topotest_replicates; boot++)
+				if (tree_lhs_offset[boot] > maxL[boot] + params.ufboot_epsilon) {
+					maxL[boot] = tree_lhs_offset[boot];
+					maxtid[boot] = tid;
+					maxcount[boot] = 1;
+				} else if (tree_lhs_offset[boot] > maxL[boot] - params.ufboot_epsilon &&
+						random_double() <= 1.0/(maxcount[boot]+1)) {
+					// scores within ufboot_epsilon count as a tie; the winner is replaced at random
+					maxL[boot] = max(maxL[boot],tree_lhs_offset[boot]);
+					maxtid[boot] = tid;
+					maxcount[boot]++;
+				}
+		}
+		for (boot = 0; boot < params.topotest_replicates; boot++)
+			tree_probs[maxtid[boot]] += 1.0;
+		for (tid = 0; tid < ntrees; tid++) {
+			tree_probs[tid] /= params.topotest_replicates;
+			info[tid].rell_confident = false;
+			info[tid].rell_bp = tree_probs[tid];
+		}
+		sort_index(tree_probs, tree_probs + ntrees, tree_ranks);
+		double prob_sum = 0.0;
+		// obtain the confidence set
+		// (walk tree_ranks from the back until the accumulated proportion exceeds 0.95)
+		for (tid = ntrees-1; tid >= 0; tid--) {
+			info[tree_ranks[tid]].rell_confident = true;
+			prob_sum += tree_probs[tree_ranks[tid]];
+			if (prob_sum > 0.95) break;
+		}
+
+		// sanity check
+		for (tid = 0, prob_sum = 0.0; tid < ntrees; tid++)
+			prob_sum += tree_probs[tid];
+		if (fabs(prob_sum-1.0) > 0.01)
+			outError("Internal error: Wrong ", __func__);
+
+		delete [] maxcount;
+		delete [] maxL;
+		delete [] maxtid;
+
+		/* now do the SH test */
+		cout << "Performing KH and SH test..." << endl;
+		// SH centering step
+		for (boot = 0; boot < params.topotest_replicates; boot++)
+			max_lh[boot] = -DBL_MAX;
+		// avg_lh[tid]: mean of the replicate scores of tree tid
+		// NOTE(review): no matching delete [] for avg_lh is visible in this function
+		double *avg_lh = new double[ntrees];
+		for (tid = 0; tid < ntrees; tid++) {
+			avg_lh[tid] = 0.0;
+			double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
+			for (boot = 0; boot < params.topotest_replicates; boot++)
+				avg_lh[tid] += tree_lhs_offset[boot];
+			avg_lh[tid] /= params.topotest_replicates;
+			for (boot = 0; boot < params.topotest_replicates; boot++) {
+				max_lh[boot] = max(max_lh[boot], tree_lhs_offset[boot] - avg_lh[tid]);
+			}
+		}
+
+		double orig_max_lh = orig_tree_lh[0];
+		int orig_max_id = 0;
+		double orig_2ndmax_lh = -DBL_MAX;
+		int orig_2ndmax_id = -1;
+		// find the max tree ID
+		for (tid = 1; tid < ntrees; tid++)
+			if (orig_max_lh < orig_tree_lh[tid]) {
+				orig_max_lh = orig_tree_lh[tid];
+				orig_max_id = tid;
+			}
+		// find the 2nd max tree ID
+		for (tid = 0; tid < ntrees; tid++)
+			if (tid != orig_max_id && orig_2ndmax_lh < orig_tree_lh[tid]) {
+				orig_2ndmax_lh = orig_tree_lh[tid];
+				orig_2ndmax_id = tid;
+			}
+
+
+		// SH compute p-value
+		for (tid = 0; tid < ntrees; tid++) {
+			double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
+			// SH compute original deviation from max_lh
+			info[tid].kh_pvalue = 0.0;
+			info[tid].sh_pvalue = 0.0;
+			// the best tree is compared against the second best, every other tree against the best
+			int max_id = (tid != orig_max_id) ? orig_max_id : orig_2ndmax_id;
+			double orig_diff = orig_tree_lh[max_id] - orig_tree_lh[tid] - avg_lh[tid];
+			double *max_kh = tree_lhs + (max_id * params.topotest_replicates);
+			for (boot = 0; boot < params.topotest_replicates; boot++) {
+				if (max_lh[boot] - tree_lhs_offset[boot] > orig_diff)
+					info[tid].sh_pvalue += 1.0;
+				//double max_kh_here = max(max_kh[boot]-avg_lh[max_id], tree_lhs_offset[boot]-avg_lh[tid]);
+				double max_kh_here = (max_kh[boot]-avg_lh[max_id]);
+				if (max_kh_here - tree_lhs_offset[boot] > orig_diff)
+					info[tid].kh_pvalue += 1.0;
+			}
+			info[tid].sh_pvalue /= params.topotest_replicates;
+			info[tid].kh_pvalue /= params.topotest_replicates;
+		}
+
+		if (params.do_weighted_test) {
+
+			cout << "Computing pairwise logl difference variance ..." << endl;
+			/* computing lhdiff_weights as 1/sqrt(lhdiff_variance) */
+			// lhdiff_weights is an ntrees x ntrees symmetric matrix with a zero diagonal
+			for (tid = 0; tid < ntrees; tid++) {
+				double *pattern_lh1 = pattern_lhs + (tid * nptn);
+				lhdiff_weights[tid*ntrees+tid] = 0.0;
+				for (tid2 = tid+1; tid2 < ntrees; tid2++) {
+					double lhdiff_variance = tree->computeLogLDiffVariance(pattern_lh1, pattern_lhs + (tid2*nptn));
+					lhdiff_weights[tid*ntrees+tid2] = 1.0/sqrt(lhdiff_variance);
+					lhdiff_weights[tid2*ntrees+tid] = lhdiff_weights[tid*ntrees+tid2];
+				}
+			}
+
+			// Weighted KH and SH test
+			cout << "Performing WKH and WSH test..." << endl;
+			for (tid = 0; tid < ntrees; tid++) {
+				double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
+				info[tid].wkh_pvalue = 0.0;
+				info[tid].wsh_pvalue = 0.0;
+				// largest weighted log-likelihood difference of any other tree to this one
+				double worig_diff = -DBL_MAX;
+				int max_id = -1;
+				for (tid2 = 0; tid2 < ntrees; tid2++)
+					if (tid2 != tid) {
+						double wdiff = (orig_tree_lh[tid2] - orig_tree_lh[tid])*lhdiff_weights[tid*ntrees+tid2];
+						if (wdiff > worig_diff) {
+							worig_diff = wdiff;
+							max_id = tid2;
+						}
+					}
+				for (boot = 0; boot < params.topotest_replicates; boot++) {
+					double wmax_diff = -DBL_MAX;
+					for (tid2 = 0; tid2 < ntrees; tid2++)
+						if (tid2 != tid)
+							wmax_diff = max(wmax_diff,
+									(tree_lhs[tid2*params.topotest_replicates+boot] - avg_lh[tid2] -
+									tree_lhs_offset[boot] + avg_lh[tid]) * lhdiff_weights[tid*ntrees+tid2]);
+					if (wmax_diff > worig_diff)
+						info[tid].wsh_pvalue += 1.0;
+					wmax_diff = (tree_lhs[max_id*params.topotest_replicates+boot] - avg_lh[max_id] -
+							tree_lhs_offset[boot] + avg_lh[tid]);
+					if (wmax_diff >  orig_tree_lh[max_id] - orig_tree_lh[tid])
+						info[tid].wkh_pvalue += 1.0;
+				}
+				info[tid].wsh_pvalue /= params.topotest_replicates;
+				info[tid].wkh_pvalue /= params.topotest_replicates;
+			}
+		}
+		/* now to ELW - Expected Likelihood Weight method */
+		cout << "Performing ELW test..." << endl;
+
+		// max_lh is reused here: per-replicate maximum over all trees (uncentered)
+		for (boot = 0; boot < params.topotest_replicates; boot++)
+			max_lh[boot] = -DBL_MAX;
+		for (tid = 0; tid < ntrees; tid++) {
+			double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
+			for (boot = 0; boot < params.topotest_replicates; boot++)
+				max_lh[boot] = max(max_lh[boot], tree_lhs_offset[boot]);
+		}
+		double *sumL = new double[params.topotest_replicates];
+		memset(sumL, 0, sizeof(double) * params.topotest_replicates);
+		// tree_lhs is overwritten in place with exp(score - per-replicate maximum)
+		for (tid = 0; tid < ntrees; tid++) {
+			double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
+			for (boot = 0; boot < params.topotest_replicates; boot++) {
+				tree_lhs_offset[boot] = exp(tree_lhs_offset[boot] - max_lh[boot]);
+				sumL[boot] += tree_lhs_offset[boot];
+			}
+		}
+		for (tid = 0; tid < ntrees; tid++) {
+			double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
+			tree_probs[tid] = 0.0;
+			for (boot = 0; boot < params.topotest_replicates; boot++) {
+				tree_probs[tid] += (tree_lhs_offset[boot] / sumL[boot]);
+			}
+			tree_probs[tid] /= params.topotest_replicates;
+			info[tid].elw_confident = false;
+			info[tid].elw_value = tree_probs[tid];
+		}
+
+		sort_index(tree_probs, tree_probs + ntrees, tree_ranks);
+		prob_sum = 0.0;
+		// obtain the confidence set
+		for (tid = ntrees-1; tid >= 0; tid--) {
+			info[tree_ranks[tid]].elw_confident = true;
+			prob_sum += tree_probs[tree_ranks[tid]];
+			if (prob_sum > 0.95) break;
+		}
+
+		// sanity check
+		for (tid = 0, prob_sum = 0.0; tid < ntrees; tid++)
+			prob_sum += tree_probs[tid];
+		if (fabs(prob_sum-1.0) > 0.01)
+			outError("Internal error: Wrong ", __func__);
+		delete [] sumL;
+
+		delete [] tree_ranks;
+		delete [] tree_probs;
+
+	}
+	// release the work arrays (each is NULL when the tests were not run)
+	if (max_lh)
+		delete [] max_lh;
+	if (orig_tree_lh)
+		delete [] orig_tree_lh;
+	if (pattern_lh)
+		delete [] pattern_lh;
+	if (pattern_lhs)
+		delete [] pattern_lhs;
+	if (lhdiff_weights)
+		delete [] lhdiff_weights;
+	if (tree_lhs)
+		delete [] tree_lhs;
+	//if (saved_tree_lhs)
+	//	delete [] saved_tree_lhs;
+	if (boot_samples)
+		delete [] boot_samples;
+
+	if (params.print_tree_lh) {
+		scoreout.close();
+	}
+
+	treeout.close();
+	in.close();
+
+	cout << "Time for evaluating all trees: " << getCPUTime() - time_start << " sec." << endl;
+
+}
+
+
+/**
+ * Convenience overload: evaluate the user trees and discard the per-tree
+ * results and the duplicate map once the evaluation has finished.
+ */
+void evaluateTrees(Params &params, IQTree *tree) {
+	IntVector unused_distinct_ids;
+	vector<TreeInfo> unused_info;
+	evaluateTrees(params, tree, unused_info, unused_distinct_ids);
+}
diff --git a/phylotesting.h b/phylotesting.h
new file mode 100644
index 0000000..d1b1585
--- /dev/null
+++ b/phylotesting.h
@@ -0,0 +1,113 @@
+/*
+ * phylotesting.h
+ *
+ *  Created on: Aug 23, 2013
+ *      Author: minh
+ */
+
+#ifndef PHYLOTESTING_H_
+#define PHYLOTESTING_H_
+
+#include "tools.h"
+
+class PhyloTree;
+class IQTree;
+
+
+/**
+ * Record describing one candidate model: its fit (log-likelihood, number of
+ * parameters) together with the derived AIC / AICc / BIC scores, weights and
+ * confidence-set flags.
+ * NOTE: the struct declares no constructor, so the members are not initialized
+ * by the struct itself.
+ */
+struct ModelInfo {
+	string set_name; // subset name
+	string name; // model name
+	double logl; // tree log likelihood
+	int df;      // #parameters
+    double tree_len; // tree length, added 2015-06-24 for rcluster algorithm
+    string tree; // added 2015-04-28: tree string
+	double AIC_score, AICc_score, BIC_score;    // scores
+	double AIC_weight, AICc_weight, BIC_weight; // weights
+	bool AIC_conf, AICc_conf, BIC_conf;         // in confidence set?
+};
+
+
+/**
+ * Per-tree result record of the tree evaluation and topology tests.
+ * NOTE: the struct declares no constructor, so the members are not initialized
+ * by the struct itself.
+ */
+struct TreeInfo {
+	double logl; // log likelihood
+	double se; // standard error of deltaL (logl difference to max), or square root of variance
+	double rell_bp; // bootstrap proportion by RELL method
+	bool rell_confident; // confidence set for RELL-BP
+	double sh_pvalue; // p-value by Shimodaira-Hasegawa test
+	double wsh_pvalue; // p-value by weighted Shimodaira-Hasegawa test
+	double kh_pvalue; // p-value by Kishino-Hasegawa test
+	double wkh_pvalue; // p-value by weighted Kishino-Hasegawa test
+	double elw_value; // ELW - expected likelihood weights test
+	bool elw_confident; // to represent confidence set of ELW test
+};
+
+
+/**
+ * computing AIC, AICc, and BIC scores
+ */
+void computeInformationScores(double tree_lh, int df, int ssize, double &AIC, double &AICc, double &BIC);
+
+/**
+ * check if the model file contains correct information
+ * @param model_file model file name
+ * @param is_partitioned presumably TRUE for a partitioned analysis -- TODO confirm against the definition
+ * @param infos (OUT) vector of per-model records (see ModelInfo: model name,
+ *        tree log-likelihood, degrees of freedom, ...)
+ * @return TRUE if success, FALSE failed.
+ */
+
+bool checkModelFile(string model_file, bool is_partitioned, vector<ModelInfo> &infos);
+
+/**
+ testing the best-fit model
+ return in params.freq_type and params.rate_type
+ @param params program parameters
+ @param in_tree phylogenetic tree
+ @param model_info (IN/OUT) information for all models considered
+ @param fmodel output stream (NOTE(review): its exact content is not documented here -- confirm against the definition)
+ @param set_name for partition model selection
+ @param print_mem_usage true to print RAM memory used (default: false) 
+ @return name of best-fit-model
+ */
+string testModel(Params &params, PhyloTree* in_tree, vector<ModelInfo> &model_info, ostream &fmodel,
+		string set_name = "", bool print_mem_usage = false);
+
+/**
+ * print site log likelihoods to a file
+ * @param filename output file name
+ * @param tree phylogenetic tree
+ * @param ptn_lh pattern log-likelihoods, will be computed if NULL
+ * @param append TRUE to append to existing file, FALSE otherwise
+ * @param linename name of the line, default "Site_Lh" if NULL
+ */
+void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh = NULL,
+		bool append = false, const char *linename = NULL);
+
+/**
+ * print site log likelihoods per category to a file
+ * @param filename output file name
+ * @param tree phylogenetic tree
+ */
+void printSiteLhCategory(const char*filename, PhyloTree *tree);
+
+/**
+ * Evaluate user-trees with possibility of tree topology tests
+ * @param params program parameters
+ * @param tree current tree
+ * @param info (OUT) output information
+ * @param distinct_ids IDs of distinct trees
+ */
+void evaluateTrees(Params &params, IQTree *tree, vector<TreeInfo> &info, IntVector &distinct_ids);
+
+/**
+ * Evaluate user-trees without returning the per-tree information to the caller
+ * @param params program parameters
+ * @param tree current tree
+ */
+void evaluateTrees(Params &params, IQTree *tree);
+
+/**
+    get sequence type for a model name
+    @param model_name model name string
+    @param seq_type (OUT) sequence type, SEQ_UNKNOWN if is not determined
+    @return 1 for parametric model, 2 for empirical model
+*/
+int getSeqType(const char *model_name, SeqType &seq_type);
+
+string getSeqType(string model_name);
+
+
+#endif /* PHYLOTESTING_H_ */
diff --git a/phylotree.cpp b/phylotree.cpp
new file mode 100644
index 0000000..65554be
--- /dev/null
+++ b/phylotree.cpp
@@ -0,0 +1,5076 @@
+//
+// C++ Implementation: phylotree
+//
+// Description:
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#include "phylotree.h"
+#include "bionj.h"
+//#include "rateheterogeneity.h"
+#include "alignmentpairwise.h"
+#include <algorithm>
+#include <limits>
+#include "timeutil.h"
+#include "pllnni.h"
+#include "phylosupertree.h"
+#include "phylosupertreeplen.h"
+#include "upperbounds.h"
+
+//const static int BINARY_SCALE = floor(log2(1/SCALING_THRESHOLD));
+//const static double LOG_BINARY_SCALE = -(log(2) * BINARY_SCALE);
+
+/****************************************************************************
+ SPRMoves class
+ ****************************************************************************/
+
+/**
+ * Record a candidate SPR move while keeping at most MAX_SPR_MOVES entries.
+ * When the container is already full, a move whose score does not exceed the
+ * score of the last stored element is ignored; otherwise the last stored
+ * element is removed to make room for the new move.
+ */
+void SPRMoves::add(PhyloNode *prune_node, PhyloNode *prune_dad, PhyloNode *regraft_node, PhyloNode *regraft_dad,
+        double score) {
+    if (size() >= MAX_SPR_MOVES) {
+        // full: reject the candidate unless it beats the last element
+        if (score <= rbegin()->score)
+            return;
+        // evict the last element
+        iterator last = end();
+        --last;
+        erase(last);
+    }
+
+    SPRMove move;
+    move.prune_node = prune_node;
+    move.prune_dad = prune_dad;
+    move.regraft_node = regraft_node;
+    move.regraft_dad = regraft_dad;
+    move.score = score;
+    insert(move);
+}
+
+/****************************************************************************
+ PhyloTree class
+ ****************************************************************************/
+
+/** Default constructor: all members are put into their initial state by init(). */
+PhyloTree::PhyloTree() : MTree() {
+    init();
+}
+
+/**
+ * Put every member into its initial state: pointers are set to NULL, flags and
+ * counters are reset, and the likelihood kernel is set to LK_SSE.
+ * Called by the constructors; it does not free anything.
+ */
+void PhyloTree::init() {
+    aln = NULL;
+    model = NULL;
+    site_rate = NULL;
+    optimize_by_newton = true;
+    central_partial_lh = NULL;
+    nni_partial_lh = NULL;
+    tip_partial_lh = NULL;
+    tip_partial_lh_computed = false;
+    ptn_freq_computed = false;
+    central_scale_num = NULL;
+    nni_scale_num = NULL;
+    central_partial_pars = NULL;
+    model_factory = NULL;
+//    tmp_partial_lh1 = NULL;
+//    tmp_partial_lh2 = NULL;
+//    tmp_anscentral_state_prob1 = NULL;
+//    tmp_anscentral_state_prob2 = NULL;
+    //tmp_ptn_rates = NULL;
+    //state_freqs = NULL;
+//    tmp_scale_num1 = NULL;
+//    tmp_scale_num2 = NULL;
+    discard_saturated_site = true;
+    _pattern_lh = NULL;
+    _pattern_lh_cat = NULL;
+    //root_state = STATE_UNKNOWN;
+    // NOTE(review): the literal 126 replaces the commented-out STATE_UNKNOWN above;
+    // confirm the two are meant to be the same value
+    root_state = 126;
+    theta_all = NULL;
+    ptn_freq = NULL;
+    ptn_invar = NULL;
+    subTreeDistComputed = false;
+    dist_matrix = NULL;
+    setLikelihoodKernel(LK_SSE);  // FOR TUNG: you forgot to initialize this variable!
+    save_all_trees = 0;
+    nodeBranchDists = NULL;
+    // FOR: upper bounds
+    mlCheck = 0;
+    skippedNNIub = 0;
+    totalNNIub = 0;
+    minStateFreq = 0.0;
+    //minUB = 0.0;
+    //meanUB = 0.0;
+    //maxUB = 0.0;
+    pllInst = NULL;
+    pllAlignment = NULL;
+    pllPartitions = NULL;
+//    lhComputed = false;
+    // -DBL_MAX marks the score as not yet computed (lowest possible log-likelihood)
+    curScore = -DBL_MAX;
+    root = NULL;
+    params = NULL;
+    current_scaling = 1.0;
+    is_opt_scaling = false;
+}
+
+/**
+ * Construct a tree in its initial state (see init()) and attach an alignment.
+ * @param aln alignment to associate with the tree; only the pointer is stored
+ */
+PhyloTree::PhyloTree(Alignment *aln) : MTree() {
+    init();
+    this->aln = aln;
+}
+
+/**
+ * Set the discard_saturated_site flag (init() sets it to true).
+ * @param val new value of the flag
+ */
+void PhyloTree::discardSaturatedSite(bool val) {
+    discard_saturated_site = val;
+}
+
+/**
+ * Destructor: frees the aligned work buffers and deletes the model factory,
+ * the substitution model, the site-rate object and the distance matrix that
+ * this tree holds pointers to.
+ */
+PhyloTree::~PhyloTree() {
+    // aligned buffers used for NNI and partial likelihood / parsimony computation
+    if (nni_scale_num) {
+        aligned_free(nni_scale_num);
+        nni_scale_num = NULL;
+    }
+    if (nni_partial_lh) {
+        aligned_free(nni_partial_lh);
+        nni_partial_lh = NULL;
+    }
+    if (central_partial_lh) {
+        aligned_free(central_partial_lh);
+        central_partial_lh = NULL;
+    }
+    if (central_scale_num) {
+        aligned_free(central_scale_num);
+        central_scale_num = NULL;
+    }
+    if (central_partial_pars) {
+        aligned_free(central_partial_pars);
+        central_partial_pars = NULL;
+    }
+
+    // model objects; deleting a null pointer is a no-op, so no guards are needed
+    delete model_factory;
+    delete model;
+    delete site_rate;
+
+    // per-pattern buffers
+    if (_pattern_lh_cat) {
+        aligned_free(_pattern_lh_cat);
+    }
+    if (_pattern_lh) {
+        aligned_free(_pattern_lh);
+    }
+    if (theta_all) {
+        aligned_free(theta_all);
+    }
+    if (ptn_freq) {
+        aligned_free(ptn_freq);
+    }
+    ptn_freq_computed = false;
+    if (ptn_invar) {
+        aligned_free(ptn_invar);
+    }
+
+    delete[] dist_matrix;
+}
+
+/**
+ * Read a tree from a file by delegating to MTree::readTree().
+ * @param infile input file name
+ * @param is_rooted forwarded unchanged to MTree::readTree()
+ */
+void PhyloTree::readTree(const char *infile, bool &is_rooted) {
+	MTree::readTree(infile, is_rooted);
+}
+
+/**
+ * Read a tree from a stream, drop the taxa listed in removed_seqs (if any) and
+ * collapse every internal node of degree 2 into a single branch whose length
+ * is the sum of the two branches it joined.
+ * @param in input stream positioned at the tree
+ * @param is_rooted NOTE(review): not used by this overload -- the member
+ *        `rooted` is forwarded to MTree::readTree() instead, unlike the
+ *        file-name overload; confirm whether that is intended
+ */
+void PhyloTree::readTree(istream &in, bool &is_rooted) {
+	MTree::readTree(in, rooted);
+
+	// remove taxa if necessary
+	if (removed_seqs.size() > 0) {
+		removeTaxa(removed_seqs);
+	}
+
+	// collapse any internal node of degree 2
+	NodeVector internal_nodes;
+	getInternalNodes(internal_nodes);
+	bool collapsed_any = false;
+	for (NodeVector::iterator nit = internal_nodes.begin(); nit != internal_nodes.end(); nit++) {
+		if ((*nit)->degree() != 2)
+			continue;
+		// connect the two neighbours directly, then drop the redundant node
+		Node *first = (*nit)->neighbors[0]->node;
+		Node *second = (*nit)->neighbors[1]->node;
+		double merged_len = (*nit)->neighbors[0]->length+(*nit)->neighbors[1]->length;
+		first->updateNeighbor((*nit), second, merged_len);
+		second->updateNeighbor((*nit), first, merged_len);
+		delete (*nit);
+		collapsed_any = true;
+		if (verbose_mode >= VB_MED)
+			cout << "Node of degree 2 collapsed" << endl;
+	}
+
+	// the tree changed, so it is initialized again
+	if (collapsed_any) {
+		initializeTree();
+	}
+}
+
+/**
+ * Recursively convert leaf names that hold a numeric ID into sequence names:
+ * each leaf's name is parsed as an integer, stored as the leaf ID, and the
+ * name is replaced by the alignment's sequence name for that ID.
+ * @param node current node; the root is used when NULL
+ * @param dad node the traversal came from
+ */
+void PhyloTree::assignLeafNames(Node *node, Node *dad) {
+    if (node == NULL) {
+        node = root;
+    }
+
+    if (node->isLeaf()) {
+        // the leaf name is expected to be the numeric taxon ID at this point
+        node->id = atoi(node->name.c_str());
+        assert(node->id >= 0 && node->id < leafNum);
+        node->name = aln->getSeqName(node->id).c_str();
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it)
+        assignLeafNames((*it)->node, node);
+}
+
+/**
+ * Copy the topology of another tree; when an alignment is attached, the leaf
+ * IDs are then reassigned from it via setAlignment().
+ * @param tree tree to copy from
+ */
+void PhyloTree::copyTree(MTree *tree) {
+    MTree::copyTree(tree);
+    // reset the ID with alignment
+    if (aln) {
+        setAlignment(aln);
+    }
+}
+
+/**
+ * Copy another tree via MTree::copyTree(tree, taxa_set); when an alignment is
+ * attached, the leaf IDs are then reassigned from it via setAlignment().
+ * @param tree tree to copy from
+ * @param taxa_set forwarded unchanged to MTree::copyTree()
+ */
+void PhyloTree::copyTree(MTree *tree, string &taxa_set) {
+    MTree::copyTree(tree, taxa_set);
+    // reset the ID with alignment
+    if (aln) {
+        setAlignment(aln);
+    }
+}
+
+/**
+ * Copy the topology of another PhyloTree and, when that tree has an alignment,
+ * adopt it through setAlignment().
+ * @param tree tree to copy from
+ */
+void PhyloTree::copyPhyloTree(PhyloTree *tree) {
+    MTree::copyTree(tree);
+    if (tree->aln) {
+        setAlignment(tree->aln);
+    }
+}
+
+/**
+ * Attach an alignment to the tree and give every leaf the index of its
+ * sequence as ID. Sequences missing from the tree and tree taxa missing from
+ * the alignment are each reported; a final error is raised if any mismatch
+ * was found.
+ * @param alignment alignment to associate with the tree
+ */
+void PhyloTree::setAlignment(Alignment *alignment) {
+    aln = alignment;
+    bool mismatch = false;
+
+    // pass 1: every alignment sequence must be a leaf of the tree
+    int num_seqs = aln->getNSeq();
+    for (int seq_id = 0; seq_id < num_seqs; seq_id++) {
+        string seq_name = aln->getSeqName(seq_id);
+        Node *leaf = findLeafName(seq_name);
+        if (leaf) {
+            assert(leaf->isLeaf());
+            leaf->id = seq_id;
+            continue;
+        }
+        string msg = "Alignment sequence ";
+        msg += seq_name;
+        msg += " does not appear in the tree";
+        mismatch = true;
+        outError(msg, false);
+    }
+
+    // pass 2: every taxon of the tree must be a sequence of the alignment
+    StrVector taxa_names;
+    getTaxaName(taxa_names);
+    for (StrVector::iterator tit = taxa_names.begin(); tit != taxa_names.end(); tit++) {
+        if (alignment->getSeqID(*tit) >= 0)
+            continue;
+        outError((string)"Tree taxon " + (*tit) + " does not appear in the alignment", false);
+        mismatch = true;
+    }
+
+    if (mismatch) {
+        outError("Tree taxa and alignment sequence do not match (see above)");
+    }
+}
+
+/**
+ * Make the node with the given name the root of the tree.
+ * @param my_root name of the root node; when NULL the name of the first
+ *        alignment sequence is used
+ */
+void PhyloTree::setRootNode(const char *my_root) {
+    string root_name = my_root ? string(my_root) : aln->getSeqName(0);
+    root = findNodeName(root_name);
+    // the named node must exist in the tree
+    assert(root);
+}
+
+/**
+ * Store the pointer to the program parameters; the object itself is not copied.
+ * @param params program parameters
+ */
+void PhyloTree::setParams(Params* params) {
+	this->params = params;
+}
+
+/**
+ * Replace the current tree by the one described in a tree string, then
+ * re-attach the alignment, re-root at params->root and reset the current score.
+ * Super trees additionally remap their partition trees, and the tree is also
+ * handed to PLL when params->pll is set.
+ * @param tree_string textual description of the tree
+ */
+void PhyloTree::readTreeString(const string &tree_string) {
+	// the stream is created with the tree text; reading starts at its beginning
+	stringstream tree_stream(tree_string);
+
+	freeNode();
+	readTree(tree_stream, rooted);
+	setAlignment(aln);
+	setRootNode(params->root);
+
+	if (isSuperTree())
+		((PhyloSuperTree*) this)->mapTrees();
+
+	if (params->pll)
+		pllReadNewick(getTreeString());
+
+	resetCurScore();
+}
+
+/**
+ * Run fixNegativeBranch() after (re)initializing the partial parsimony
+ * vectors and clearing the partial likelihoods, then hand the tree to PLL
+ * (when params->pll is set) and reset the current score.
+ * @param force_change forwarded to fixNegativeBranch()
+ * @return the value returned by fixNegativeBranch()
+ */
+int PhyloTree::wrapperFixNegativeBranch(bool force_change) {
+    // Initialize branch lengths for the parsimony tree
+    initializeAllPartialPars();
+    clearAllPartialLH();
+
+    const int num_fixed = fixNegativeBranch(force_change);
+
+    if (params->pll)
+        pllReadNewick(getTreeString());
+
+    resetCurScore();
+    return num_fixed;
+}
+
+/**
+ * Parse a Newick string with PLL, initialize the topology of the PLL instance
+ * (pllInst) from it, and destroy the parsed structure again.
+ * @param newickTree tree in Newick format
+ */
+void PhyloTree::pllReadNewick(string newickTree) {
+    const char *newick_text = newickTree.c_str();
+    pllNewickTree *parsed = pllNewickParseString(newick_text);
+    pllTreeInitTopologyNewick(pllInst, parsed, PLL_FALSE);
+    // the parsed structure is no longer needed
+    pllNewickParseDestroy(&parsed);
+}
+
+/**
+ * Replace the current tree by the one stored in a file.
+ * @param file_name path of the tree file to read
+ */
+void PhyloTree::readTreeFile(const string &file_name) {
+    ifstream tree_file(file_name.c_str());
+    freeNode();
+    readTree(tree_file, rooted);
+    setAlignment(aln);
+    if (!isSuperTree()) {
+        clearAllPartialLH();
+    } else {
+        ((PhyloSuperTree*) this)->mapTrees();
+    }
+    tree_file.close();
+}
+
+/**
+ * @return the tree as printed by printTree() with its default options
+ */
+string PhyloTree::getTreeString() {
+    ostringstream out;
+    printTree(out);
+    return out.str();
+}
+
+/**
+ * @return the tree printed with taxon IDs and sorted taxa
+ *         (WT_TAXON_ID + WT_SORT_TAXA)
+ * Side effect: the root is re-selected from params->root first.
+ */
+string PhyloTree::getTopology() {
+    // important: re-rooting first makes the topology string unique
+    setRootNode(params->root);
+    ostringstream out;
+    printTree(out, WT_TAXON_ID + WT_SORT_TAXA);
+    return out.str();
+}
+
+/**
+ * Restore the tree from a previously saved stream.
+ * @param best_tree_string stream holding the tree to restore; it is rewound
+ *        to its beginning before reading
+ */
+void PhyloTree::rollBack(istream &best_tree_string) {
+    // rewind so the stream can be read again from the start
+    best_tree_string.seekg(0, ios::beg);
+    // discard the current nodes before reading the saved tree
+    freeNode();
+    readTree(best_tree_string, rooted);
+    assignLeafNames();
+    // re-assign likelihood buffers to the new nodes, then mark them stale
+    initializeAllPartialLh();
+    clearAllPartialLH();
+}
+
+/**
+ * Store the substitution model used by this tree.
+ * @param amodel model to use (pointer is kept, not copied)
+ */
+void PhyloTree::setModel(ModelSubst *amodel) {
+    this->model = amodel;
+}
+
+/**
+ * Store the model factory; when its model is a mixture, the likelihood
+ * kernel is set again with the current `sse` value.
+ * @param model_fac factory to use, may be NULL
+ */
+void PhyloTree::setModelFactory(ModelFactory *model_fac) {
+    model_factory = model_fac;
+    if (!model_factory)
+        return;
+    if (model_factory->model->isMixture())
+        setLikelihoodKernel(sse);
+}
+
+/**
+ * Store the rate-heterogeneity object used by this tree.
+ * @param rate rate model to use, may be NULL
+ */
+void PhyloTree::setRate(RateHeterogeneity *rate) {
+    this->site_rate = rate;
+}
+
+/**
+ * @return the rate-heterogeneity object stored in site_rate (may be NULL)
+ */
+RateHeterogeneity *PhyloTree::getRate() {
+    return site_rate;
+}
+
+/**
+ * Allocate a new PhyloNode.
+ * @param node_id   ID passed to the PhyloNode constructor
+ * @param node_name name passed to the PhyloNode constructor
+ * @return the new node, as a Node pointer
+ */
+Node* PhyloTree::newNode(int node_id, const char* node_name) {
+    PhyloNode *created = new PhyloNode(node_id, node_name);
+    return (Node*) created;
+}
+
+/**
+ * Allocate a new PhyloNode whose name is given as an integer.
+ * @param node_id   ID passed to the PhyloNode constructor
+ * @param node_name integer name passed to the PhyloNode constructor
+ * @return the new node, as a Node pointer
+ */
+Node* PhyloTree::newNode(int node_id, int node_name) {
+    PhyloNode *created = new PhyloNode(node_id, node_name);
+    return (Node*) created;
+}
+
+/**
+ * Invalidate the partial likelihoods of the whole tree, starting at the
+ * first neighbor of the root, and mark the tip partial likelihoods as not
+ * computed. Does nothing when the tree has no root.
+ * @param make_null forwarded to PhyloNode::clearAllPartialLh()
+ */
+void PhyloTree::clearAllPartialLH(bool make_null) {
+    if (root) {
+        PhyloNode *start = (PhyloNode*) root->neighbors[0]->node;
+        start->clearAllPartialLh(make_null, (PhyloNode*) root);
+        tip_partial_lh_computed = false;
+    }
+}
+
+/**
+ * Recursively visit the tree and make sure the partial likelihood of every
+ * visited branch is computed in both directions.
+ * @param node current node; NULL means start from the root
+ * @param dad  node the traversal came from (presumably skipped by
+ *             FOR_NEIGHBOR_IT - confirm against the macro definition)
+ */
+void PhyloTree::computeAllPartialLh(PhyloNode *node, PhyloNode *dad) {
+	if (!node) node = (PhyloNode*)root;
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		// bit 0 of partial_lh_computed flags an up-to-date partial likelihood
+		if ((((PhyloNeighbor*)*it)->partial_lh_computed & 1) == 0)
+			computePartialLikelihood((PhyloNeighbor*)*it, node);
+		// same check for the opposite direction of this branch
+		PhyloNeighbor *rev = (PhyloNeighbor*) (*it)->node->findNeighbor(node);
+		if ((rev->partial_lh_computed & 1) == 0)
+			computePartialLikelihood(rev, (PhyloNode*)(*it)->node);
+		computeAllPartialLh((PhyloNode*)(*it)->node, node);
+	}
+}
+
+/**
+ * Build the short model name: substitution model name, "+ASC" when the model
+ * factory holds unobserved patterns, the rate model name and a suffix for
+ * the state frequency type.
+ * @return the assembled model name
+ */
+string PhyloTree::getModelName() {
+    string name = model->name;
+    if (model_factory->unobserved_ptns.size() > 0)
+        name += "+ASC";
+
+    // with fused mixture/rate the first character of the rate name is
+    // replaced by '*'
+    if (model_factory->fused_mix_rate)
+        name += "*" + site_rate->name.substr(1);
+    else
+        name += site_rate->name;
+
+    const bool is_dna = (aln->seq_type == SEQ_DNA);
+    switch (model->getFreqType()) {
+    case FREQ_EMPIRICAL:
+        name += "+F";
+        break;
+    case FREQ_CODON_1x4:
+        name += "+F1X4";
+        break;
+    case FREQ_CODON_3x4:
+        name += "+F3X4";
+        break;
+    case FREQ_CODON_3x4C:
+        name += "+F3X4C";
+        break;
+    case FREQ_ESTIMATE:
+        if (!is_dna)
+            name += "+FO";
+        break;
+    case FREQ_EQUAL:
+        if (!is_dna)
+            name += "+FQ";
+        break;
+    default:
+        break;
+    }
+    return name;
+}
+
+/**
+ * Build the model name including parameter values: model name with
+ * parameters, "+ASC" when the model factory holds unobserved patterns, the
+ * rate model name with parameters, and a state frequency suffix. For "+F"
+ * and "+FO" the state frequencies are appended as "{f0,f1,...}".
+ *
+ * The frequency list is produced in one place and held in a DoubleVector,
+ * so the buffer is released even if string formatting throws.
+ * @return the assembled model name
+ */
+string PhyloTree::getModelNameParams() {
+    string name = model->getNameParams();
+    if (model_factory->unobserved_ptns.size() > 0)
+        name += "+ASC";
+    string rate_name = site_rate->getNameParams();
+
+    // with fused mixture/rate the first character of the rate name is
+    // replaced by '*'
+    if (model_factory->fused_mix_rate) {
+        name += "*" + rate_name.substr(1);
+    } else {
+        name += rate_name;
+    }
+
+    // set when the chosen suffix is followed by the explicit frequency list
+    bool list_freqs = false;
+    if (model->getFreqType() == FREQ_EMPIRICAL || (model->getFreqType() == FREQ_USER_DEFINED && aln->seq_type == SEQ_DNA)) {
+        name += "+F";
+        list_freqs = true;
+    } else if (model->getFreqType() == FREQ_CODON_1x4)
+        name += "+F1X4";
+    else if (model->getFreqType() == FREQ_CODON_3x4)
+        name += "+F3X4";
+    else if (model->getFreqType() == FREQ_CODON_3x4C)
+        name += "+F3X4C";
+    else if (model->getFreqType() == FREQ_ESTIMATE) {
+        name += "+FO";
+        list_freqs = true;
+    }
+    else if (model->getFreqType() == FREQ_EQUAL && aln->seq_type != SEQ_DNA)
+        name += "+FQ";
+
+    if (list_freqs) {
+        DoubleVector state_freq(model->num_states);
+        model->getStateFrequency(&state_freq[0]);
+        name += "{" + convertDoubleToString(state_freq[0]);
+        for (size_t i = 1; i < state_freq.size(); i++)
+            name += "," + convertDoubleToString(state_freq[i]);
+        name += "}";
+    }
+    return name;
+}
+
+/**
+ * Recursively copy all branch lengths into a vector, indexed by branch ID.
+ * @param lenvec  destination; when empty on the top-level call it is resized
+ *                to branchNum + startid
+ * @param startid offset added to every branch ID when indexing lenvec
+ * @param node    current node; NULL means start from the root
+ * @param dad     node the traversal came from
+ */
+void PhyloTree::saveBranchLengths(DoubleVector &lenvec, int startid, PhyloNode *node, PhyloNode *dad) {
+    if (!node) {
+        node = (PhyloNode*) root;
+        assert(branchNum == nodeNum-1);
+        // a non-empty vector is used as supplied and is not resized here
+        if (lenvec.empty()) lenvec.resize(branchNum+startid);
+    }
+    FOR_NEIGHBOR_IT(node, dad, it){
+    	lenvec[(*it)->id + startid] = (*it)->length;
+    	// explicitly qualified call: always the PhyloTree implementation
+    	PhyloTree::saveBranchLengths(lenvec, startid, (PhyloNode*) (*it)->node, node);
+    }
+}
+
+/**
+ * Recursively write branch lengths from a vector back into the tree; both
+ * directions of every branch receive the same value.
+ * @param lenvec  source vector indexed by branch ID + startid; must not be
+ *                empty
+ * @param startid offset added to every branch ID when indexing lenvec
+ * @param node    current node; NULL means start from the root
+ * @param dad     node the traversal came from
+ */
+void PhyloTree::restoreBranchLengths(DoubleVector &lenvec, int startid, PhyloNode *node, PhyloNode *dad) {
+    if (!node) {
+        node = (PhyloNode*) root;
+        assert(!lenvec.empty());
+    }
+    FOR_NEIGHBOR_IT(node, dad, it){
+    	// update the neighbor record on both ends of the branch
+    	(*it)->length = (*it)->node->findNeighbor(node)->length = lenvec[(*it)->id + startid];
+    	// explicitly qualified call: always the PhyloTree implementation
+    	PhyloTree::restoreBranchLengths(lenvec, startid, (PhyloNode*) (*it)->node, node);
+    }
+}
+
+
+/****************************************************************************
+ Parsimony function
+ ****************************************************************************/
+
+/*
+ double PhyloTree::computeCorrectedParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+ //	double corrected_bran = 0;
+ //	int parbran;
+ //	int parscore = computeParsimonyBranch(node21_it, node2, &parbran);
+ //	if (site_rate->getGammaShape() != 0) {
+ //		corrected_bran = (aln->num_states - 1.0) / aln->num_states
+ //				* site_rate->getGammaShape()
+ //				* (pow( 1.0 - aln->num_states / (aln->num_states - 1.0) * ((double) parbran / aln->getNSite()),
+ //						-1.0 / site_rate->getGammaShape()) - 1.0);
+ //	} else {
+ //		corrected_bran = -((aln->num_states - 1.0) / aln->num_states)
+ //				* log(1.0 - (aln->num_states / (aln->num_states - 1.0)) * ((double) parbran / aln->getNSite()));
+ //	}
+ //	return corrected_bran;
+ }
+ */
+/**
+ * Assign a slice of the central parsimony buffer to every branch of the
+ * tree (via the recursive overload) and then invalidate all partial
+ * vectors with clearAllPartialLH().
+ */
+void PhyloTree::initializeAllPartialPars() {
+    int index = 0;
+    initializeAllPartialPars(index);
+    clearAllPartialLH();
+    //assert(index == (nodeNum - 1)*2);
+}
+
+/**
+ * Recursively assign a slice of central_partial_pars to both directions of
+ * every branch. On the top-level call (node == NULL) the central buffer is
+ * allocated if it does not exist yet and index is reset to 0.
+ *
+ * The buffer size is kept in size_t; it was previously narrowed to int,
+ * which can overflow for large alignments.
+ * @param index next free block number in central_partial_pars; advanced by
+ *              two for every branch
+ * @param node  current node; NULL means start from the root
+ * @param dad   node the traversal came from
+ */
+void PhyloTree::initializeAllPartialPars(int &index, PhyloNode *node, PhyloNode *dad) {
+    size_t pars_block_size = getBitsBlockSize();
+    if (!node) {
+        node = (PhyloNode*) root;
+        // allocate the big central partial pars memory
+        if (!central_partial_pars) {
+            size_t memsize = (size_t)(aln->getNSeq() - 1) * 4 * pars_block_size;
+            if (verbose_mode >= VB_MED)
+                cout << "Allocating " << memsize * sizeof(UINT) << " bytes for partial parsimony vectors" << endl;
+            central_partial_pars = aligned_alloc<UINT>(memsize);
+            if (!central_partial_pars)
+                outError("Not enough memory for partial parsimony vectors");
+        }
+        index = 0;
+    }
+    if (dad) {
+        // assign a region in central_partial_pars to both Neighbors
+        // (node->dad and dad->node)
+        PhyloNeighbor *nei = (PhyloNeighbor*) node->findNeighbor(dad);
+        nei->partial_pars = central_partial_pars + (index * pars_block_size);
+        nei = (PhyloNeighbor*) dad->findNeighbor(node);
+        nei->partial_pars = central_partial_pars + ((index + 1) * pars_block_size);
+        index += 2;
+    }
+    FOR_NEIGHBOR_IT(node, dad, it)initializeAllPartialPars(index, (PhyloNode*) (*it)->node, node);
+}
+
+#define SIMD_BITS 256
+
+/**
+ * @return number of UINT words in one parsimony block: num_states times the
+ *         per-state word count, plus 4, rounded up to a multiple of 8. The
+ *         site count used is the larger of the pattern count and
+ *         num_informative_sites.
+ */
+size_t PhyloTree::getBitsBlockSize() {
+    size_t nsites = max(aln->size(), (size_t)aln->num_informative_sites);
+    size_t words_per_state = (nsites + SIMD_BITS - 1) / UINT_BITS;
+    size_t len = aln->num_states * words_per_state + 4;
+    // round up to the next multiple of 8 words
+    return ((len+7)/8)*8;
+}
+
+/**
+ * @return (num_states + SIMD_BITS - 1) / UINT_BITS, used by callers as the
+ *         number of UINT words of a single bits entry
+ */
+int PhyloTree::getBitsEntrySize() {
+    const int padded_bits = aln->num_states + SIMD_BITS - 1;
+    return padded_bits / UINT_BITS;
+}
+
+/**
+ * Allocate one parsimony block of getBitsBlockSize() UINT words with
+ * aligned_alloc; callers in this file release it with aligned_free().
+ * @return the newly allocated block
+ */
+UINT *PhyloTree::newBitsBlock() {
+    const size_t nwords = getBitsBlockSize();
+    return aligned_alloc<UINT>(nwords);
+}
+
+/**
+ * Extract the num_states-bit entry number `index` from a packed bit vector.
+ * @param bit_vec    packed vector; entry `index` starts at bit
+ *                   index * num_states
+ * @param index      entry (pattern) number
+ * @param bits_entry output words receiving the extracted bits
+ * NOTE(review): assumes BITS_DIV / BITS_MODULO are log2(UINT_BITS) and
+ * UINT_BITS - 1 - confirm against their definitions.
+ */
+void PhyloTree::getBitsBlock(UINT *bit_vec, int index, UINT* &bits_entry) {
+    int nstates = aln->num_states;
+    int myindex = (index * nstates);
+    // word and in-word offset of the first bit and of the bit just past the end
+    int bit_pos_begin = myindex >> BITS_DIV;
+    int bit_off_begin = myindex & BITS_MODULO;
+    int bit_pos_end = (myindex + nstates) >> BITS_DIV;
+    int bit_off_end = (myindex + nstates) & BITS_MODULO;
+
+    // the entry lies inside a single word: shift and mask
+    if (bit_pos_begin == bit_pos_end) {
+        bits_entry[0] = (bit_vec[bit_pos_begin] >> bit_off_begin) & ((1 << nstates) - 1);
+        return;
+    }
+    UINT part1 = (bit_vec[bit_pos_begin] >> bit_off_begin);
+    int rest_bits = nstates;
+    int id;
+    // copy whole output words, stitching each from two adjacent input words
+    for (id = 0; rest_bits >= UINT_BITS; id++, rest_bits -= UINT_BITS, bit_pos_begin++) {
+        bits_entry[id] = part1;
+        if (bit_off_begin > 0)
+            bits_entry[id] |= (bit_vec[bit_pos_begin + 1] << (UINT_BITS - bit_off_begin));
+        part1 = (bit_vec[bit_pos_begin + 1] >> bit_off_begin);
+    }
+    // remaining bits all come from the current word
+    if (bit_pos_begin == bit_pos_end) {
+        bits_entry[id] = (bit_vec[bit_pos_begin] >> bit_off_begin) & ((1 << rest_bits) - 1);
+        return;
+    }
+    // remaining bits straddle two words: keep only bits below bit_off_end of the last one
+    UINT part2 = bit_vec[bit_pos_end];
+    if (bit_off_end < UINT_BITS)
+        part2 &= ((1 << bit_off_end) - 1);
+    bits_entry[id] = part1;
+    if (bit_off_begin > 0)
+        bits_entry[id] |= (part2 << (UINT_BITS - bit_off_begin));
+}
+
+/**
+ * Store a num_states-bit entry at position `index` of a packed bit vector,
+ * the counterpart of getBitsBlock().
+ * @param bit_vec    packed vector to modify; entry `index` starts at bit
+ *                   index * num_states
+ * @param index      entry (pattern) number
+ * @param bits_entry words holding the bits to store
+ * NOTE(review): assumes BITS_DIV / BITS_MODULO are log2(UINT_BITS) and
+ * UINT_BITS - 1 - confirm against their definitions.
+ */
+void PhyloTree::setBitsBlock(UINT* &bit_vec, int index, UINT *bits_entry) {
+    int nstates = aln->num_states;
+    int myindex = (index * nstates);
+    // word and in-word offset of the first bit and of the bit just past the end
+    int bit_pos_begin = myindex >> BITS_DIV;
+    int bit_off_begin = myindex & BITS_MODULO;
+    int bit_pos_end = (myindex + nstates) >> BITS_DIV;
+    int bit_off_end = (myindex + nstates) & BITS_MODULO;
+
+    //assert(value <= allstates);
+
+    // the entry lies inside a single word
+    if (bit_pos_begin == bit_pos_end) {
+        // first clear the bit between bit_off_begin and bit_off_end
+        int allstates = (1 << nstates) - 1;
+        bit_vec[bit_pos_begin] &= ~(allstates << bit_off_begin);
+        // now set the bit
+        bit_vec[bit_pos_begin] |= bits_entry[0] << bit_off_begin;
+        return;
+    }
+    // number of entry bits that fit into the first word
+    int len1 = UINT_BITS - bit_off_begin;
+    // clear bit from bit_off_begin to UINT_BITS
+    bit_vec[bit_pos_begin] &= (1 << bit_off_begin) - 1;
+    // set bit  from bit_off_begin to UINT_BITS
+    bit_vec[bit_pos_begin] |= (bits_entry[0] << bit_off_begin);
+    int rest_bits = nstates - len1;
+    int id;
+    // overwrite whole middle words, each stitched from two entry words
+    for (id = 0; rest_bits >= UINT_BITS; bit_pos_begin++, id++, rest_bits -= UINT_BITS) {
+        bit_vec[bit_pos_begin + 1] = (bits_entry[id + 1] << bit_off_begin);
+        if (len1 < UINT_BITS)
+            bit_vec[bit_pos_begin + 1] |= (bits_entry[id] >> len1);
+    }
+
+    assert(bit_pos_begin == bit_pos_end - 1);
+    // clear bit from 0 to bit_off_end
+    bit_vec[bit_pos_end] &= ~((1 << bit_off_end) - 1);
+    // now set the bit the value
+    if (len1 < UINT_BITS)
+        bit_vec[bit_pos_end] |= (bits_entry[id] >> len1);
+    rest_bits -= bit_off_begin;
+    if (rest_bits > 0)
+        bit_vec[bit_pos_end] |= (bits_entry[id + 1] << bit_off_begin);
+}
+
+/**
+ * @param bits_entry words of a single entry
+ * @return true when none of the first num_states bits is set
+ */
+bool PhyloTree::isEmptyBitsEntry(UINT *bits_entry) {
+    int word = 0;
+    int remaining = aln->num_states;
+    // full words first
+    while (remaining >= UINT_BITS) {
+        if (bits_entry[word])
+            return false;
+        remaining -= UINT_BITS;
+        word++;
+    }
+    // then the low `remaining` bits of the last word
+    return (bits_entry[word] & ((1 << remaining) - 1)) == 0;
+}
+
+/**
+ * Word-wise OR of two entries, over as many words as are needed to hold
+ * num_states bits.
+ * @param bits_entry1 first operand
+ * @param bits_entry2 second operand
+ * @param bits_union  destination (may alias an operand, as callers in this
+ *                    file do)
+ */
+void PhyloTree::unionBitsEntry(UINT *bits_entry1, UINT *bits_entry2, UINT* &bits_union) {
+    int word = 0;
+    int remaining = aln->num_states;
+    while (remaining > 0) {
+        bits_union[word] = bits_entry1[word] | bits_entry2[word];
+        remaining -= UINT_BITS;
+        word++;
+    }
+}
+
+/**
+ * Set bit number `id` of an entry.
+ * @param bits_entry entry words to modify
+ * @param id         bit number to set
+ */
+void PhyloTree::setBitsEntry(UINT* &bits_entry, int id) {
+    const int word = id >> BITS_DIV;
+    const int offset = id & BITS_MODULO;
+    bits_entry[word] |= (1 << offset);
+}
+
+/**
+ * @param bits_entry entry words to inspect
+ * @param id         bit number to test
+ * @return true when bit number `id` of the entry is set
+ */
+bool PhyloTree::getBitsEntry(UINT* &bits_entry, int id) {
+    const int word = id >> BITS_DIV;
+    const int offset = id & BITS_MODULO;
+    return (bits_entry[word] & (1 << offset)) != 0;
+}
+
+/**
+ * Set the first `num` bits of a packed bit vector to 1.
+ * All complete words are filled with 0xFF bytes; a trailing partial word is
+ * overwritten with a mask of its low bits.
+ *
+ * The mask is built from an unsigned 1 so that a shift by UINT_BITS - 1 does
+ * not overflow a signed int.
+ * NOTE(review): assumes UINT is an unsigned integer type - confirm.
+ * @param bit_vec vector to modify
+ * @param num     number of leading bits to set
+ */
+void setBitsAll(UINT* &bit_vec, int num) {
+    int size = num / UINT_BITS;
+    memset(bit_vec, 255, size * sizeof(UINT));
+    num &= BITS_MODULO;
+    if (num)
+        bit_vec[size] = ((UINT)1 << num) - 1;
+}
+
+/**
+ * Compute the partial parsimony of a branch by dispatching to the
+ * implementation currently stored in computePartialParsimonyPointer.
+ * @param dad_branch branch (neighbor record) to compute
+ * @param dad        node on the other side of dad_branch
+ */
+void PhyloTree::computePartialParsimony(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    (this->*computePartialParsimonyPointer)(dad_branch, dad);
+}
+
+/**
+ * Compute the partial parsimony on the branch of `node` that points back to
+ * `dad`, then do the same for every other neighbor of `node`, recursively.
+ * @param node current node
+ * @param dad  neighbor of `node` the traversal came from
+ */
+void PhyloTree::computeReversePartialParsimony(PhyloNode *node, PhyloNode *dad) {
+    PhyloNeighbor *back_branch = (PhyloNeighbor*)node->findNeighbor(dad);
+    assert(back_branch);
+    computePartialParsimony(back_branch, node);
+
+    NeighborVec::iterator nei = node->neighbors.begin();
+    while (nei != node->neighbors.end()) {
+        if ((*nei)->node != dad) {
+            computeReversePartialParsimony((PhyloNode*)(*nei)->node, node);
+        }
+        nei ++;
+    }
+}
+
+/**
+ * Naive (non-vectorized) computation of the partial parsimony vector of a
+ * branch using the Fitch algorithm. The result is stored in
+ * dad_branch->partial_pars; the last word of the block holds the parsimony
+ * score of the subtree. Bit 1 (value 2) of partial_lh_computed marks the
+ * vector as up to date, in which case nothing is recomputed.
+ *
+ * Changes against the previous version: the scratch buffers are released on
+ * the early-return path for protein states >= 23 (they used to leak), and
+ * the internal-node fill uses sizeof(UINT), the element type of the block.
+ * @param dad_branch branch whose partial parsimony is computed
+ * @param dad        node on the other side of dad_branch
+ */
+void PhyloTree::computePartialParsimonyNaive(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    // don't recompute the parsimony
+    if (dad_branch->partial_lh_computed & 2)
+        return;
+    Node *node = dad_branch->node;
+    int ptn;
+    int nstates = aln->num_states;
+    int pars_size = getBitsBlockSize();
+    int entry_size = getBitsEntrySize();
+    assert(dad_branch->partial_pars);
+    UINT *bits_entry = new UINT[entry_size];
+    UINT *bits_entry_child = new UINT[entry_size];
+
+    if (node->isLeaf() && dad) {
+        // external node
+        // bit masks used for the three ambiguous amino-acid codes 20..22
+        int ambi_aa[] = {4+8, 32+64, 512+1024};
+
+        // start with every state allowed, score 0
+        setBitsAll(dad_branch->partial_pars, nstates * aln->size());
+        dad_branch->partial_pars[pars_size - 1] = 0;
+        for (ptn = 0; ptn < aln->size(); ptn++)
+            if (!aln->at(ptn).is_const) {
+                char state;
+                if (node->name == ROOT_NAME) {
+                    state = aln->STATE_UNKNOWN;
+                } else {
+                    assert(node->id < aln->getNSeq());
+                    state = (aln->at(ptn))[node->id];
+                }
+                if (state == aln->STATE_UNKNOWN) {
+                    // all entries are already filled with bit 1
+                } else if (state < nstates) {
+                    memset(bits_entry, 0, sizeof(UINT) * entry_size);
+                    setBitsEntry(bits_entry, state);
+                    setBitsBlock(dad_branch->partial_pars, ptn, bits_entry);
+                } else if (aln->seq_type == SEQ_DNA) {
+                    // ambiguous character, for DNA, RNA
+                    state = state - (nstates - 1);
+                    memset(bits_entry, 0, sizeof(UINT) * entry_size);
+                    bits_entry[0] = state;
+                    setBitsBlock(dad_branch->partial_pars, ptn, bits_entry);
+                }  else if (aln->seq_type == SEQ_PROTEIN) {
+                    if (state >= 23) {
+                        // release the scratch buffers before bailing out
+                        delete[] bits_entry_child;
+                        delete[] bits_entry;
+                        return;
+                    }
+                    state -= 20;
+                    memset(bits_entry, 0, sizeof(UINT) * entry_size);
+                    bits_entry[0] = ambi_aa[(int)state];
+                    setBitsBlock(dad_branch->partial_pars, ptn, bits_entry);
+                } else {
+                    assert(0);
+                }
+            }
+    } else {
+        // internal node
+        memset(dad_branch->partial_pars, 255, pars_size * sizeof(UINT));
+        UINT *partial_pars_dad = dad_branch->partial_pars;
+        int partial_pars = 0;
+        // take the intersection of the child states (with &= bit operation)
+        FOR_NEIGHBOR_IT(node, dad, it)if ((*it)->node->name != ROOT_NAME) {
+            computePartialParsimonyNaive((PhyloNeighbor*) (*it), (PhyloNode*) node);
+            UINT *partial_pars_child = ((PhyloNeighbor*) (*it))->partial_pars;
+            for (int i = 0; i < pars_size - 1; i++)
+                partial_pars_dad[i] &= partial_pars_child[i];
+            // accumulate the scores of the child subtrees
+            partial_pars += partial_pars_child[pars_size - 1];
+        }
+        // now check if some intersection is empty, change to union (Fitch algorithm) and increase the parsimony score
+        memset(bits_entry, 0, entry_size * sizeof(UINT));
+        for (ptn = 0; ptn < aln->size(); ptn++)
+            if (!aln->at(ptn).is_const) {
+                getBitsBlock(partial_pars_dad, ptn, bits_entry);
+                if (isEmptyBitsEntry(bits_entry)) {
+                    FOR_NEIGHBOR_IT(node, dad, it2)if ((*it2)->node->name != ROOT_NAME) {
+                        UINT *partial_pars_child = ((PhyloNeighbor*) (*it2))->partial_pars;
+                        getBitsBlock(partial_pars_child, ptn, bits_entry_child);
+                        unionBitsEntry(bits_entry, bits_entry_child, bits_entry);
+                    }
+                    setBitsBlock(partial_pars_dad, ptn, bits_entry);
+                    partial_pars += aln->at(ptn).frequency;
+                }
+            }
+        partial_pars_dad[pars_size - 1] = partial_pars;
+    }
+    dad_branch->partial_lh_computed |= 2;
+    delete[] bits_entry_child;
+    delete[] bits_entry;
+}
+
+/**
+ * Compute the parsimony score at a branch by dispatching to the
+ * implementation currently stored in computeParsimonyBranchPointer.
+ * @param dad_branch   branch (neighbor record) to evaluate
+ * @param dad          node on the other side of dad_branch
+ * @param branch_subst forwarded to the implementation
+ * @return the value returned by the selected implementation
+ */
+int PhyloTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
+    return (this->*computeParsimonyBranchPointer)(dad_branch, dad, branch_subst);
+}
+
+/**
+ * Naive computation of the tree parsimony score, evaluated at one branch.
+ * The partial vectors of both directions of the branch are intersected;
+ * every non-constant pattern with an empty intersection contributes its
+ * frequency. The scores stored in the last word of both partial vectors are
+ * added to the result.
+ * @param dad_branch   branch to evaluate
+ * @param dad          node on the other side of dad_branch
+ * @param branch_subst when non-NULL, receives the contribution of this
+ *                     branch alone
+ * @return the parsimony score of the tree
+ */
+int PhyloTree::computeParsimonyBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
+        
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    assert(node_branch);
+    // lazily allocate the parsimony buffers
+    if (!central_partial_pars)
+        initializeAllPartialPars();
+    // swap node and dad if dad is a leaf
+    if (node->isLeaf()) {
+        PhyloNode *tmp_node = dad;
+        dad = node;
+        node = tmp_node;
+        PhyloNeighbor *tmp_nei = dad_branch;
+        dad_branch = node_branch;
+        node_branch = tmp_nei;
+        //cout << "swapped\n";
+    }
+    // bit 1 (value 2) of partial_lh_computed flags an up-to-date parsimony vector
+    if ((dad_branch->partial_lh_computed & 2) == 0)
+        computePartialParsimonyNaive(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 2) == 0)
+        computePartialParsimonyNaive(node_branch, node);
+    // now combine the partial parsimony vectors at the branch
+
+    int pars_size = getBitsBlockSize();
+    int entry_size = getBitsEntrySize();
+    //int nstates = aln->num_states;
+    int i, ptn;
+    int tree_pars = 0;
+    UINT *partial_pars = newBitsBlock();
+    UINT *bits_entry = new UINT[entry_size];
+    // intersect everything except the last word, which stores the score
+    for (i = 0; i < pars_size - 1; i++)
+        partial_pars[i] = (node_branch->partial_pars[i] & dad_branch->partial_pars[i]);
+
+    for (ptn = 0; ptn < aln->size(); ptn++)
+        if (!aln->at(ptn).is_const) {
+            getBitsBlock(partial_pars, ptn, bits_entry);
+            // empty intersection: this pattern changes state on the branch
+            if (isEmptyBitsEntry(bits_entry))
+                tree_pars += aln->at(ptn).frequency;
+        }
+    if (branch_subst)
+        *branch_subst = tree_pars;
+    tree_pars += node_branch->partial_pars[pars_size - 1] + dad_branch->partial_pars[pars_size - 1];
+    delete[] bits_entry;
+    aligned_free(partial_pars);
+    return tree_pars;
+}
+
+/**
+ * @return the parsimony score of the tree, evaluated at the first branch
+ *         of the root
+ */
+int PhyloTree::computeParsimony() {
+    PhyloNeighbor *first_branch = (PhyloNeighbor*) root->neighbors[0];
+    PhyloNode *start = (PhyloNode*) root;
+    return computeParsimonyBranch(first_branch, start);
+}
+
+/**
+ * Print, for the node behind dad_branch and recursively for all nodes
+ * below it, the set of parsimonious states at every alignment site.
+ *
+ * For constant patterns the printed set is derived from the pattern's
+ * state. The entry is now cleared before that state is set; previously the
+ * bits read from the partial vector stayed in the entry and were printed in
+ * addition to the constant state.
+ * @param dad_branch branch leading to the node to print; ignored when dad
+ *                   is NULL
+ * @param dad        node on the other side of dad_branch; NULL means start
+ *                   at the first branch of the root
+ */
+void PhyloTree::printParsimonyStates(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    if (!dad) {
+        dad = (PhyloNode*) root;
+        dad_branch = (PhyloNeighbor*) root->neighbors[0];
+        cout << "Parsimonious states for every node and site: " << endl;
+    }
+    int site;
+    cout << "States for node ";
+    int max_len = aln->getMaxSeqNameLength();
+    if (max_len < 3)
+        max_len = 3;
+    cout.width(max_len);
+    if (!dad_branch->node->name.empty())
+        cout << left << dad_branch->node->name;
+    else
+        cout << left << dad_branch->node->id;
+    cout << " are ";
+    const int entry_size = getBitsEntrySize();
+    UINT *bits_entry = new UINT[entry_size];
+    for (site = 0; site < aln->getNSite(); site++) {
+        int ptn = aln->getPatternID(site);
+        getBitsBlock(dad_branch->partial_pars, ptn, bits_entry);
+        if (aln->at(ptn).is_const) {
+            // constant site: show the pattern's own state only
+            int state = aln->at(ptn)[0];
+            memset(bits_entry, 0, sizeof(UINT) * entry_size);
+            if (state < aln->num_states)
+                setBitsEntry(bits_entry, state);
+            else
+                bits_entry[0] = state - (aln->num_states - 1);
+        }
+        cout << "{";
+        bool first = true;
+        for (int i = 0; i < aln->num_states; i++)
+            if (getBitsEntry(bits_entry, i)) {
+                cout << ((!first) ? "," : "") << i;
+                first = false;
+            }
+        cout << "}\t";
+    }
+    cout << endl;
+    delete[] bits_entry;
+    FOR_NEIGHBOR_IT(dad_branch->node, dad, it)printParsimonyStates((PhyloNeighbor*) (*it), (PhyloNode*) (dad_branch->node));
+}
+
+/**
+ * Recursive Fitch parsimony for a single pattern, with state sets held as
+ * bit masks in an int. Multifurcating nodes (degree > 3) are rejected via
+ * outError().
+ * @param ptn    pattern index
+ * @param states output: bit mask of the state set assigned to `node`
+ * @param node   current node; NULL means start from the root
+ * @param dad    node the traversal came from
+ * @return number of state changes counted in the subtree of `node`
+ */
+int PhyloTree::computeParsimonyScore(int ptn, int &states, PhyloNode *node, PhyloNode *dad) {
+    int score = 0;
+    states = 0;
+    if (!node)
+        node = (PhyloNode*) root;
+    if (node->degree() > 3)
+        outError("Does not work with multifurcating tree");
+    if (verbose_mode == VB_DEBUG)
+        cout << ptn << " " << node->id << "  " << node->name << endl;
+
+    if (node->isLeaf()) {
+        char state;
+        if (node->name == ROOT_NAME) {
+            state = aln->STATE_UNKNOWN;
+        } else {
+            assert(node->id < aln->getNSeq());
+            state = (*aln)[ptn][node->id];
+        }
+        if (state == aln->STATE_UNKNOWN) {
+            // unknown: every state is possible
+            states = (1 << aln->num_states) - 1;
+        } else if (state < aln->num_states)
+            states = (1 << state);
+        else {
+            // ambiguous character, for DNA, RNA
+            // NOTE(review): the constant 3 presumably is num_states - 1 for DNA - confirm
+            states = state - 3;
+        }
+    }
+    // the root is a leaf too, but its subtree still has to be traversed
+    if (!node->isLeaf() || node == root) {
+        int union_states = 0;
+        int intersect_states = (1 << aln->num_states) - 1;
+        if (states != 0) {
+            union_states = states;
+            intersect_states = states;
+        }
+
+        FOR_NEIGHBOR_IT(node, dad, it){
+        int states_child;
+        int score_child = computeParsimonyScore(ptn, states_child, (PhyloNode*) ((*it)->node), node);
+        union_states |= states_child;
+        intersect_states &= states_child;
+        score += score_child;
+    }
+        // Fitch rule: keep the intersection if non-empty, otherwise take
+        // the union and count one change
+        if (intersect_states)
+            states = intersect_states;
+        else {
+            states = union_states;
+            score++;
+        }
+    }
+    return score;
+}
+
+/**
+ * Parsimony score of the whole tree: the per-pattern score of every
+ * non-constant pattern, weighted by the pattern frequency. The root must be
+ * a leaf.
+ * @return the total parsimony score
+ */
+int PhyloTree::computeParsimonyScore() {
+    assert(root && root->isLeaf());
+
+    int total = 0;
+    for (int ptn = 0; ptn < aln->size(); ptn++) {
+        // constant patterns never require a change
+        if (aln->at(ptn).is_const)
+            continue;
+        int states;
+        total += computeParsimonyScore(ptn, states) * (*aln)[ptn].frequency;
+    }
+    return total;
+}
+
+/****************************************************************************
+ Nearest Neighbor Interchange with parsimony
+ ****************************************************************************/
+
+/**
+ * Try the nearest-neighbor interchanges around the branch node1-node2 and
+ * keep the first one that lowers the parsimony score.
+ * @param cur_score parsimony score of the current tree
+ * @param node1     first internal node of the branch (degree 3)
+ * @param node2     second internal node of the branch (degree 3)
+ * @return the score of the accepted swap (tree left modified), or
+ *         cur_score when no swap improves (tree restored)
+ */
+double PhyloTree::swapNNI(double cur_score, PhyloNode *node1, PhyloNode *node2) {
+    assert(node1->degree() == 3 && node2->degree() == 3);
+    // take one neighbor of node1 other than node2
+    // (presumably the first one - confirm against FOR_NEIGHBOR_DECLARE)
+    FOR_NEIGHBOR_DECLARE(node1, node2, it1)
+        break;
+    Node *node1_nei = (*it1)->node;
+
+    FOR_NEIGHBOR_IT(node2, node1, it2){
+    // do the NNI swap
+    Node *node2_nei = (*it2)->node;
+    node1->updateNeighbor(node1_nei, node2_nei);
+    node1_nei->updateNeighbor(node1, node2);
+    node2->updateNeighbor(node2_nei, node1_nei);
+    node2_nei->updateNeighbor(node2, node1);
+
+    // compute the score of the swapped topology
+    double score = computeParsimonyScore();
+    // if better: return
+    if (score < cur_score) return score;
+    // else, swap back
+    node1->updateNeighbor(node2_nei, node1_nei);
+    node1_nei->updateNeighbor(node2, node1);
+    node2->updateNeighbor(node1_nei, node2_nei);
+    node2_nei->updateNeighbor(node1, node2);
+}
+    return cur_score;
+}
+
+/**
+ * Depth-first search for the first NNI that improves the parsimony score.
+ * @param cur_score parsimony score of the current tree
+ * @param node      current node; NULL means start from the root
+ * @param dad       node the traversal came from
+ * @return the improved score as soon as one swap is accepted, otherwise
+ *         cur_score
+ */
+double PhyloTree::searchNNI(double cur_score, PhyloNode *node, PhyloNode *dad) {
+    if (!node)
+        node = (PhyloNode*) root;
+    // only branches between two internal nodes can be swapped
+    if (!node->isLeaf() && dad && !dad->isLeaf()) {
+        double score = swapNNI(cur_score, node, dad);
+        if (score < cur_score)
+            return score;
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it){
+    double score = searchNNI(cur_score, (PhyloNode*) (*it)->node, node);
+    if (score < cur_score) return score;
+}
+    return cur_score;
+}
+
+/**
+ * Hill-climbing NNI search with parsimony: keep applying improving swaps
+ * until a full pass over the tree finds none, reporting every improvement
+ * on cout.
+ */
+void PhyloTree::searchNNI() {
+    cout << "Search with Nearest Neighbor Interchange..." << endl;
+    double best = computeParsimonyScore();
+    for (;;) {
+        double candidate = searchNNI(best);
+        if (candidate >= best)
+            return;
+        cout << "Better score found: " << candidate << endl;
+        best = candidate;
+    }
+}
+
+
+/**
+ * Find the branch in the subtree below node-dad where inserting added_node
+ * gives the lowest parsimony score. The tree is left unchanged on return.
+ * @param added_node  internal node to insert; it must carry the two
+ *                    placeholder neighbors (Node*) 1 and (Node*) 2
+ * @param target_node output: one end of the best branch
+ * @param target_dad  output: other end of the best branch
+ * @param node        one end of the branch tried first
+ * @param dad         other end of the branch tried first
+ * @return the best parsimony score found
+ */
+int PhyloTree::addTaxonMP(Node *added_node, Node* &target_node, Node* &target_dad, Node *node, Node *dad) {
+    Neighbor *dad_nei = dad->findNeighbor(node);
+
+    // now insert the new node in the middle of the branch node-dad
+    double len = dad_nei->length;
+    node->updateNeighbor(dad, added_node, len / 2.0);
+    dad->updateNeighbor(node, added_node, len / 2.0);
+    added_node->updateNeighbor((Node*) 1, node, len / 2.0);
+    added_node->updateNeighbor((Node*) 2, dad, len / 2.0);
+    // compute the parsimony score with the node inserted here
+    //clearAllPartialLh();
+    int best_score = computeParsimonyScore();
+    target_node = node;
+    target_dad = dad;
+    // remove the added node
+    node->updateNeighbor(added_node, dad, len);
+    dad->updateNeighbor(added_node, node, len);
+    added_node->updateNeighbor(node, (Node*) 1, len);
+    added_node->updateNeighbor(dad, (Node*) 2, len);
+
+    // now traverse the tree downwards
+
+    FOR_NEIGHBOR_IT(node, dad, it){
+    Node *target_node2;
+    Node *target_dad2;
+    double score = addTaxonMP(added_node, target_node2, target_dad2, (*it)->node, node);
+    if (score < best_score) {
+        best_score = score;
+        target_node = target_node2;
+        target_dad = target_dad2;
+    }
+}
+    return best_score;
+}
+
+/**
+ * Build a tree by stepwise addition under maximum parsimony: start with a
+ * star of the first three sequences, then insert each further sequence on
+ * the branch chosen by addTaxonMP().
+ * @param alignment alignment to build the tree for; stored in aln. Fewer
+ *        than 3 sequences is reported through outError(ERR_FEW_TAXA).
+ */
+void PhyloTree::growTreeMP(Alignment *alignment) {
+
+    cout << "Stepwise addition using maximum parsimony..." << endl;
+    aln = alignment;
+    int size = aln->getNSeq();
+    if (size < 3)
+        outError(ERR_FEW_TAXA);
+
+    root = newNode();
+    Node *new_taxon;
+
+    // create initial tree with 3 taxa
+    for (leafNum = 0; leafNum < 3; leafNum++) {
+        if (verbose_mode >= VB_MAX)
+            cout << "Add " << aln->getSeqName(leafNum) << " to the tree" << endl;
+        new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
+        root->addNeighbor(new_taxon, 1.0);
+        new_taxon->addNeighbor(root, 1.0);
+    }
+    // re-root at the node with ID 0
+    root = findNodeID(0);
+    //optimizeAllBranches();
+
+    // stepwise adding the next taxon
+    for (leafNum = 3; leafNum < size; leafNum++) {
+        if (verbose_mode >= VB_MAX)
+            cout << "Add " << aln->getSeqName(leafNum) << " to the tree";
+        // allocate a new taxon and a new adjacent internal node
+        new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
+        Node *added_node = newNode();
+        added_node->addNeighbor(new_taxon, 1.0);
+        new_taxon->addNeighbor(added_node, 1.0);
+
+        // reserve two neighbor slots with placeholder pointers; they are
+        // replaced by real nodes when the node is inserted into a branch
+        added_node->addNeighbor((Node*) 1, 1.0);
+        added_node->addNeighbor((Node*) 2, 1.0);
+
+        Node *target_node = NULL;
+        Node *target_dad = NULL;
+        int score = addTaxonMP(added_node, target_node, target_dad, root->neighbors[0]->node, root);
+        if (verbose_mode >= VB_MAX)
+            cout << ", score = " << score << endl;
+        // now insert the new node in the middle of the branch node-dad
+        double len = target_dad->findNeighbor(target_node)->length;
+        target_node->updateNeighbor(target_dad, added_node, len / 2.0);
+        target_dad->updateNeighbor(target_node, added_node, len / 2.0);
+        added_node->updateNeighbor((Node*) 1, target_node, len / 2.0);
+        added_node->updateNeighbor((Node*) 2, target_dad, len / 2.0);
+        // compute the likelihood
+        //clearAllPartialLh();
+        //optimizeAllBranches();
+        //optimizeNNI();
+    }
+
+    // node count is set from the final leaf count
+    nodeNum = 2 * leafNum - 2;
+}
+
+/****************************************************************************
+ likelihood function
+ ****************************************************************************/
+
+/**
+ * Allocate the per-pattern likelihood buffers that do not exist yet
+ * (_pattern_lh, _pattern_lh_cat, theta_all, ptn_freq, ptn_invar), let the
+ * recursive overload assign the per-branch buffers, check the resulting
+ * index counters and finally invalidate all partial likelihoods.
+ */
+void PhyloTree::initializeAllPartialLh() {
+    // both counters are filled in by the recursive overload
+    // NOTE(review): they are passed uninitialized; presumably the overload
+    // resets them on its top-level call - confirm
+    int index, indexlh;
+    int numStates = model->num_states;
+	// Minh's question: why getAlnNSite() but not getAlnNPattern() ?
+    //size_t mem_size = ((getAlnNSite() % 2) == 0) ? getAlnNSite() : (getAlnNSite() + 1);
+    size_t nptn = getAlnNPattern() + numStates; // extra #numStates for ascertainment bias correction
+
+    // round the pattern count up: to a multiple of 4 when
+    // instruction_set >= 7, otherwise to an even number
+    size_t mem_size;
+    if (instruction_set >= 7)
+    	mem_size = ((nptn +3)/4)*4;
+    else
+    	mem_size = ((nptn % 2) == 0) ? nptn : (nptn + 1);
+
+    size_t block_size = mem_size * numStates * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
+    // make sure _pattern_lh size is divisible by 4 (e.g., 9->12, 14->16)
+    if (!_pattern_lh)
+        _pattern_lh = aligned_alloc<double>(mem_size);
+    if (!_pattern_lh_cat)
+        _pattern_lh_cat = aligned_alloc<double>(mem_size * site_rate->getNDiscreteRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures()));
+    if (!theta_all)
+        theta_all = aligned_alloc<double>(block_size);
+    if (!ptn_freq) {
+        ptn_freq = aligned_alloc<double>(mem_size);
+        ptn_freq_computed = false;
+    }
+    if (!ptn_invar)
+        ptn_invar = aligned_alloc<double>(mem_size);
+    // timing is reported only when central_partial_lh does not exist yet
+    // and verbosity is at least VB_MED
+    bool benchmark_mem = (!central_partial_lh && verbose_mode >= VB_MED);
+    if (benchmark_mem) {
+    	cout << "Measuring run time for allocating " << getMemoryRequired() << " bytes RAM" << endl;
+    }
+    double cpu_start_time = getCPUTime();
+    double wall_start_time = getRealTime();
+    initializeAllPartialLh(index, indexlh);
+    if (benchmark_mem) {
+    	cout << "CPU time for initializeAllPartialLh: " << getCPUTime() - cpu_start_time << " sec" << endl;
+    	cout << "Wall-clock time for initializeAllPartialLh: " << getRealTime() - wall_start_time << " sec" << endl;
+    }
+    // two index slots per branch
+    assert(index == (nodeNum - 1) * 2);
+    // the expected number of likelihood blocks depends on the kernel and
+    // on the memory saving mode
+    if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
+        if (params->lh_mem_save == LM_PER_NODE)
+            assert(indexlh == nodeNum-leafNum);
+        else
+            assert(indexlh == (nodeNum-1)*2-leafNum);
+    } else
+    	assert(indexlh == (nodeNum-1)*2);
+    clearAllPartialLH();
+
+}
+
+/**
+ * Free every buffer allocated by initializeAllPartialLh(), reset the
+ * corresponding pointers to NULL and call clearAllPartialLH().
+ */
+void PhyloTree::deleteAllPartialLh() {
+    // central buffers
+    if (central_partial_lh) {
+        aligned_free(central_partial_lh);
+        central_partial_lh = NULL;
+    }
+    if (central_scale_num) {
+        aligned_free(central_scale_num);
+        central_scale_num = NULL;
+    }
+    if (central_partial_pars) {
+        aligned_free(central_partial_pars);
+        central_partial_pars = NULL;
+    }
+
+    // NNI work buffers
+    if (nni_scale_num) {
+        aligned_free(nni_scale_num);
+        nni_scale_num = NULL;
+    }
+    if (nni_partial_lh) {
+        aligned_free(nni_partial_lh);
+        nni_partial_lh = NULL;
+    }
+
+    // per-pattern buffers
+    if (ptn_invar) {
+        aligned_free(ptn_invar);
+        ptn_invar = NULL;
+    }
+    if (ptn_freq) {
+        aligned_free(ptn_freq);
+        ptn_freq = NULL;
+    }
+    ptn_freq_computed = false;
+    if (theta_all) {
+        aligned_free(theta_all);
+        theta_all = NULL;
+    }
+    if (_pattern_lh_cat) {
+        aligned_free(_pattern_lh_cat);
+        _pattern_lh_cat = NULL;
+    }
+    if (_pattern_lh) {
+        aligned_free(_pattern_lh);
+        _pattern_lh = NULL;
+    }
+
+    // tip_partial_lh points into central_partial_lh, so it is reset but never freed
+    tip_partial_lh = NULL;
+
+    clearAllPartialLH();
+}
+ 
+/**
+ * Estimate the number of bytes needed for the partial likelihood and scaling
+ * buffers (plus the bootstrap buffer when params->gbo_replicates is set).
+ * @param ncategory number of rate categories to assume while site_rate is not set
+ * @return estimated memory requirement in bytes
+ */
+uint64_t PhyloTree::getMemoryRequired(size_t ncategory) {
+    // +num_states for ascertainment bias correction
+    const size_t nptn = aln->getNPattern() + aln->num_states;
+
+    // block size must be divisible by 4 if instruction_set >= 7, otherwise by 2
+    uint64_t block_size = nptn;
+    if (instruction_set >= 7) {
+        block_size = ((nptn+3)/4)*4;
+    } else if ((nptn % 2) != 0) {
+        block_size = nptn + 1;
+    }
+
+    block_size *= aln->num_states;
+    if (site_rate) {
+        block_size *= site_rate->getNRate();
+    } else {
+        block_size *= ncategory;
+    }
+    if (model && !model_factory->fused_mix_rate) {
+        block_size *= model->getNMixtures();
+    }
+
+    // bytes taken by one partial likelihood vector together with its scaling vector
+    const uint64_t vector_bytes = (uint64_t)block_size*sizeof(double) + nptn * sizeof(UBYTE);
+
+    uint64_t mem_size = ((uint64_t) leafNum*4) * block_size *sizeof(double) + 2 + (leafNum) * 4 * nptn * sizeof(UBYTE);
+    if (params->SSE == LK_EIGEN || params->SSE == LK_EIGEN_SSE) {
+        mem_size -= ((uint64_t)leafNum) * vector_bytes;
+        if (params->lh_mem_save == LM_PER_NODE) {
+            mem_size -= ((uint64_t)leafNum*2 - 4) * vector_bytes;
+        }
+    }
+    if (params->gbo_replicates) {
+        mem_size += params->gbo_replicates*nptn*sizeof(BootValType);
+    }
+    return mem_size;
+}
+
+/**
+ * Report how many entries (not bytes) the three central buffers need.
+ * @param partial_lh_entries [out] number of double entries for partial likelihoods,
+ *        including the tip partial likelihood table
+ * @param scale_num_entries [out] number of UBYTE entries for scaling numbers
+ * @param partial_pars_entries [out] number of entries for partial parsimony vectors
+ */
+void PhyloTree::getMemoryRequired(uint64_t &partial_lh_entries, uint64_t &scale_num_entries, uint64_t &partial_pars_entries) {
+    // +num_states for ascertainment bias correction
+    const size_t nptn = aln->getNPattern() + aln->num_states;
+
+    // block size must be divisible by 4 if instruction_set >= 7, otherwise by 2
+    uint64_t block_size = nptn;
+    if (instruction_set >= 7) {
+        block_size = ((nptn+3)/4)*4;
+    } else if ((nptn % 2) != 0) {
+        block_size = nptn + 1;
+    }
+
+    block_size *= aln->num_states;
+    if (site_rate) {
+        block_size *= site_rate->getNRate();
+    }
+    if (model && !model_factory->fused_mix_rate) {
+        block_size *= model->getNMixtures();
+    }
+
+    const uint64_t tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures();
+
+    // the number of vectors depends on the likelihood kernel and the memory saving mode
+    const bool eigen_kernel = (sse == LK_EIGEN || sse == LK_EIGEN_SSE);
+    if (!eigen_kernel) {
+        partial_lh_entries = ((uint64_t)leafNum * 4 - 6) * (uint64_t) block_size + 2 + tip_partial_lh_size;
+        scale_num_entries = (leafNum*4 - 4) * nptn;
+    } else if (params->lh_mem_save == LM_PER_NODE) {
+        partial_lh_entries = ((uint64_t)leafNum - 2) * (uint64_t) block_size + 2 + tip_partial_lh_size;
+        scale_num_entries = (leafNum - 2) * nptn;
+    } else {
+        partial_lh_entries = ((uint64_t)leafNum * 3 - 6) * (uint64_t) block_size + 2 + tip_partial_lh_size;
+        scale_num_entries = (leafNum*3 - 4) * nptn;
+    }
+
+    const size_t pars_block_size = getBitsBlockSize();
+    partial_pars_entries = (leafNum - 1) * 4 * pars_block_size;
+}
+
+void PhyloTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node, PhyloNode *dad) { /*
+     Recursive worker that hands out memory regions to the branches of the tree.
+
+     On the initial call (node == NULL) it starts from root, allocates
+     nni_partial_lh / nni_scale_num, central_partial_lh, central_scale_num and
+     central_partial_pars if they are still NULL, (re)assigns tip_partial_lh and
+     resets index and indexlh to 0. For every branch (node, dad) it then assigns
+     consecutive slots of the central buffers to the two PhyloNeighbor objects of
+     that branch and recurses via FOR_NEIGHBOR_IT(node, dad, it).
+
+     index   : in/out, number of partial_pars slots handed out so far
+     indexlh : in/out, number of partial_lh / scale_num slots handed out so far
+     node    : current node; NULL on the initial call
+     dad     : node the recursion came from; when NULL no slots are assigned
+               at this level
+     Calls outError() when one of the central buffers cannot be allocated.
+     */
+    size_t pars_block_size = getBitsBlockSize();
+    size_t nptn = aln->size()+aln->num_states; // +num_states for ascertainment bias correction
+    size_t block_size;
+    if (instruction_set >= 7)
+    	// block size must be divisible by 4
+    	block_size = ((nptn+3)/4)*4;
+	else
+		// block size must be divisible by 2
+		block_size = ((nptn % 2) == 0) ? nptn : (nptn + 1);
+
+    size_t scale_block_size = nptn; // one scale_num entry per pattern, not padded
+
+    block_size = block_size * model->num_states * site_rate->getNRate() * ((model_factory->fused_mix_rate)? 1 : model->getNMixtures());
+    if (!node) {
+        node = (PhyloNode*) root;
+        // allocate the big central partial likelihoods memory
+        if (!nni_partial_lh) {
+            // allocate memory only once!
+//            intptr_t MEM_ALIGNMENT = (instruction_set >= 7) ? 32 : 16;
+//            nni_partial_lh = aligned_alloc<double>(IT_NUM*partial_lh_size+MEM_ALIGNMENT/sizeof(double));
+//            nni_scale_num = aligned_alloc<UBYTE>(IT_NUM*scale_num_size+MEM_ALIGNMENT/sizeof(UBYTE));
+            size_t IT_NUM = (params->nni5) ? 6 : 2; // number of NNI work vectors: 6 if params->nni5 is set, else 2
+            nni_partial_lh = aligned_alloc<double>(IT_NUM*block_size);
+            nni_scale_num = aligned_alloc<UBYTE>(IT_NUM*scale_block_size);
+        }
+
+
+        if (!central_partial_lh) {
+        	uint64_t tip_partial_lh_size = aln->num_states * (aln->STATE_UNKNOWN+1) * model->getNMixtures();
+            uint64_t mem_size = ((uint64_t)leafNum * 4 - 6) * (uint64_t) block_size + 2 + tip_partial_lh_size; // counted in doubles, not bytes
+            if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
+                if (params->lh_mem_save == LM_PER_NODE)
+                    mem_size -= ((uint64_t)leafNum * 3 - 4) * (uint64_t)block_size; // leaves (leafNum - 2) blocks
+                else 
+                    mem_size -= (uint64_t)leafNum * (uint64_t)block_size; // leaves (3*leafNum - 6) blocks
+            }
+            if (verbose_mode >= VB_MED)
+                cout << "Allocating " << mem_size * sizeof(double) << " bytes for partial likelihood vectors" << endl;
+            try {
+            	central_partial_lh = aligned_alloc<double>(mem_size);
+            } catch (std::bad_alloc &ba) {
+            	outError("Not enough memory for partial likelihood vectors (bad_alloc)");
+            }
+            if (!central_partial_lh)
+                outError("Not enough memory for partial likelihood vectors");
+        }
+
+        // now always assign tip_partial_lh: it lives inside central_partial_lh, at an
+        // offset equal to the number of blocks that the per-branch assignment below is
+        // expected to consume (the same counts are asserted for indexlh by the caller)
+        if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
+            if (params->lh_mem_save == LM_PER_NODE)
+                tip_partial_lh = central_partial_lh + ((nodeNum - leafNum)*block_size);
+            else
+                tip_partial_lh = central_partial_lh + (((nodeNum - 1)*2-leafNum)*block_size);
+        } else
+            tip_partial_lh = central_partial_lh + (((nodeNum - 1)*2)*block_size);
+
+        if (!central_scale_num) {
+        	uint64_t mem_size = (leafNum - 1) * 4 * scale_block_size; // counted in UBYTE entries
+        	if (sse == LK_EIGEN || sse == LK_EIGEN_SSE) {
+                if (params->lh_mem_save == LM_PER_NODE)
+                    mem_size -= ((uint64_t)leafNum*3 - 2) * (uint64_t) scale_block_size; // leaves (leafNum - 2) blocks
+                else
+                    mem_size -= (uint64_t)leafNum * (uint64_t) scale_block_size; // leaves (3*leafNum - 4) blocks
+            }
+            if (verbose_mode >= VB_MED)
+                cout << "Allocating " << mem_size * sizeof(UBYTE) << " bytes for scale num vectors" << endl;
+            try {
+            	central_scale_num = aligned_alloc<UBYTE>(mem_size);
+            } catch (std::bad_alloc &ba) {
+            	outError("Not enough memory for scale num vectors (bad_alloc)");
+            }
+            if (!central_scale_num)
+                outError("Not enough memory for scale num vectors");
+        }
+
+        if (!central_partial_pars) {
+            if (verbose_mode >= VB_MED)
+                cout << "Allocating " << (leafNum - 1) * 4 * pars_block_size * sizeof(UINT)
+                        << " bytes for partial parsimony vectors" << endl;
+            try {
+            	central_partial_pars = aligned_alloc<UINT>((leafNum - 1) * 4 * pars_block_size);
+            } catch (std::bad_alloc &ba) {
+            	outError("Not enough memory for partial parsimony vectors (bad_alloc)");
+            }
+            if (!central_partial_pars)
+                outError("Not enough memory for partial parsimony vectors");
+        }
+        index = 0; // restart slot numbering for this traversal
+        indexlh = 0;
+    }
+    if (dad) {
+        // assign a region in central_partial_lh to both Neighbors (dad->node, and node->dad)
+        PhyloNeighbor *nei = (PhyloNeighbor*) node->findNeighbor(dad); // neighbor object looked up on node for dad
+        PhyloNeighbor *nei2 = (PhyloNeighbor*) dad->findNeighbor(node); // neighbor object looked up on dad for node
+        
+        // first initialize partial_pars: both directions always get their own slot
+        nei->partial_pars = central_partial_pars + (index * pars_block_size);
+        index++;
+        nei2->partial_pars = central_partial_pars + (index * pars_block_size);
+        index ++;
+        assert(index < nodeNum * 2 - 1);
+        
+        // now initialize partial_lh and scale_num
+        if (params->lh_mem_save == LM_PER_NODE && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+            // per-node memory saving: at most one slot per branch, given to nei2 only
+            if (!node->isLeaf()) { // only allocate memory to internal node
+                nei->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
+                nei->scale_num = NULL;
+                nei2->scale_num = central_scale_num + ((indexlh) * scale_block_size);
+                nei2->partial_lh = central_partial_lh + (indexlh * block_size);
+                indexlh++;
+            } else {
+                // node is a leaf: neither direction gets a slot
+                nei->partial_lh = NULL; 
+                nei->scale_num = NULL;
+                nei2->scale_num = NULL;
+                nei2->partial_lh = NULL;
+            }
+        } else {
+            // one slot per direction, except that with the eigen kernels a neighbor
+            // whose target node is a leaf gets none
+            if (nei->node->isLeaf() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+                nei->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
+                nei->scale_num = NULL;
+            } else {
+                nei->scale_num = central_scale_num + (indexlh * scale_block_size);
+                nei->partial_lh = central_partial_lh + (indexlh * block_size);
+                indexlh++;
+            }
+            if (nei2->node->isLeaf() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+                nei2->partial_lh = NULL; // do not allocate memory for tip, use tip_partial_lh instead
+                nei2->scale_num = NULL;
+            } else {
+                nei2->scale_num = central_scale_num + ((indexlh) * scale_block_size);
+                nei2->partial_lh = central_partial_lh + (indexlh * block_size);
+                indexlh++;
+            }
+        }
+    }
+    FOR_NEIGHBOR_IT(node, dad, it) initializeAllPartialLh(index, indexlh, (PhyloNode*) (*it)->node, node); // continue the traversal with node as the new dad
+}
+
+/**
+ * Allocate a stand-alone partial likelihood vector.
+ * @return newly allocated buffer obtained from aligned_alloc<double>()
+ */
+double *PhyloTree::newPartialLh() {
+    // patterns + num_states, plus 3 extra entries
+    size_t n_entries = aln->size() + aln->num_states + 3;
+    n_entries *= aln->num_states;
+    n_entries *= site_rate->getNRate();
+    // mixture classes are stored separately unless fused_mix_rate is set
+    if (!model_factory->fused_mix_rate) {
+        n_entries *= model->getNMixtures();
+    }
+    return aligned_alloc<double>(n_entries);
+}
+
+/**
+ * @return size in bytes of one partial likelihood block
+ */
+int PhyloTree::getPartialLhBytes() {
+    // +num_states for ascertainment bias correction
+    const size_t nptn = aln->size()+aln->num_states;
+
+    // block size must be divisible by 4 if instruction_set >= 7, otherwise by 2
+    size_t block_size = nptn;
+    if (instruction_set >= 7) {
+        block_size = ((nptn+3)/4)*4;
+    } else if ((nptn % 2) != 0) {
+        block_size = nptn + 1;
+    }
+
+    block_size *= model->num_states;
+    block_size *= site_rate->getNRate();
+    if (!model_factory->fused_mix_rate) {
+        block_size *= model->getNMixtures();
+    }
+    return block_size * sizeof(double);
+}
+
+/**
+ * @return size in bytes of one scaling-number vector
+ */
+int PhyloTree::getScaleNumBytes() {
+    // one UBYTE per pattern, plus num_states extra entries
+    const size_t n_entries = aln->size() + aln->num_states;
+    return n_entries * sizeof(UBYTE);
+}
+
+/**
+ * Allocate a stand-alone scaling-number vector.
+ * @return newly allocated buffer obtained from aligned_alloc<UBYTE>()
+ */
+UBYTE *PhyloTree::newScaleNum() {
+    const size_t n_entries = aln->size() + aln->num_states;
+    UBYTE *scale_num = aligned_alloc<UBYTE>(n_entries);
+    return scale_num;
+}
+
+/**
+ * Compute the log-likelihood of the tree on the branch between root and its
+ * first neighbor; current_it / current_it_back are set to that branch.
+ * @param pattern_lh (OUT) if not NULL, receives the pattern log-likelihoods copied
+ *        from _pattern_lh, corrected by scale_num when lh_scale_factor is negative
+ * @return the tree log-likelihood
+ */
+double PhyloTree::computeLikelihood(double *pattern_lh) {
+    assert(model);
+    assert(site_rate);
+    assert(root->isLeaf());
+    PhyloNeighbor *root_nei = (PhyloNeighbor*) root->neighbors[0];
+    current_it = root_nei;
+    assert(current_it);
+    current_it_back = (PhyloNeighbor*) root_nei->node->findNeighbor(root);
+    assert(current_it_back);
+
+    // a leaf named ROOT_NAME together with a known root state selects the rooted computation
+    string root_name = ROOT_NAME;
+    Node *vroot = findLeafName(root_name);
+    const bool use_rooted = (root_state != aln->STATE_UNKNOWN && vroot);
+
+    double score;
+    if (!use_rooted) {
+        score = computeLikelihoodBranch(root_nei, (PhyloNode*) root);
+    } else {
+        if (verbose_mode >= VB_DEBUG) {
+            cout << __func__ << " HIT ROOT STATE " << endl;
+        }
+        score = computeLikelihoodRooted((PhyloNeighbor*) vroot->neighbors[0], (PhyloNode*) vroot);
+    }
+
+    if (!pattern_lh) {
+        return score;
+    }
+
+    memmove(pattern_lh, _pattern_lh, aln->size() * sizeof(double));
+    if (root_nei->lh_scale_factor < 0.0) {
+        // add back the scaling that was applied on the root branch
+        const int nptn = aln->getNPattern();
+        for (int ptn = 0; ptn < nptn; ptn++) {
+            pattern_lh[ptn] += max(root_nei->scale_num[ptn], UBYTE(0)) * LOG_SCALING_THRESHOLD;
+        }
+    }
+    return score;
+}
+
+/**
+ * Compute the likelihood on a branch with the naive kernel and subtract the
+ * log of the model's state frequency of root_state.
+ * @param dad_branch branch to compute the likelihood on
+ * @param dad node the branch is attached to
+ * @return the corrected log-likelihood
+ */
+double PhyloTree::computeLikelihoodRooted(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    const double branch_score = computeLikelihoodBranchNaive(dad_branch, dad);
+    if (verbose_mode >= VB_DEBUG) {
+        printTransMatrices(dad_branch->node, dad);
+    }
+
+    // look up the frequency of the root state, then drop the temporary array
+    double *state_freq = new double[aln->num_states];
+    model->getStateFrequency(state_freq);
+    const double root_freq = state_freq[(int) root_state];
+    delete[] state_freq;
+
+    return branch_score - log(root_freq);
+}
+
+void PhyloTree::computePatternLikelihood(double *ptn_lh, double *cur_logl, double *ptn_lh_cat) { /*
+     Export per-pattern log-likelihoods for the branch current_it / current_it_back.
+
+     ptn_lh     : (OUT) receives aln->getNPattern() values copied from _pattern_lh;
+                  when the summed lh_scale_factor of both directions is negative,
+                  scale_num[i] * LOG_SCALING_THRESHOLD is added per pattern
+     cur_logl   : not used by the active code (only by the commented-out check below)
+     ptn_lh_cat : (OUT) optional; when given, the branch likelihood is recomputed first
+                  and then nptn * ncat values log(_pattern_lh_cat[..]) are written,
+                  with the same per-pattern scaling correction added
+     */
+    /*	if (!dad_branch) {
+     dad_branch = (PhyloNeighbor*) root->neighbors[0];
+     dad = (PhyloNode*) root;
+     }*/
+    int nptn = aln->getNPattern();
+    int i;
+    int ncat = site_rate->getNDiscreteRate(); // categories stored per pattern in _pattern_lh_cat
+    if (getModel()->isMixture() && !getModelFactory()->fused_mix_rate)
+        ncat *= getModel()->getNMixtures();
+    if (ptn_lh_cat) {
+    	// Right now only Naive version store _pattern_lh_cat!
+    	if (sse == LK_NORMAL || sse == LK_SSE)
+    		computeLikelihoodBranchNaive(current_it, (PhyloNode*)current_it_back->node);
+    	else {
+//    		switch (aln->num_states) {
+//    		case 4: computeLikelihoodBranchEigen<4>(current_it, (PhyloNode*)current_it_back->node); break;
+//    		case 20: computeLikelihoodBranchEigen<20>(current_it, (PhyloNode*)current_it_back->node); break;
+//    		case 2: computeLikelihoodBranchEigen<2>(current_it, (PhyloNode*)current_it_back->node); break;
+//    		case 64: computeLikelihoodBranchEigen<64>(current_it, (PhyloNode*)current_it_back->node); break;
+//    		default: outError("Option unsupported yet for this sequence type. Contact author if you really need it."); break;
+//    		}
+            if (!getModel()->isMixture()) {
+                computeLikelihoodBranchEigen(current_it, (PhyloNode*)current_it_back->node); 
+            } else if (getModelFactory()->fused_mix_rate) {
+                computeMixrateLikelihoodBranchEigen(current_it, (PhyloNode*)current_it_back->node); 
+            } else {
+                computeMixtureLikelihoodBranchEigen(current_it, (PhyloNode*)current_it_back->node); 
+            }
+        }
+    }
+    
+    double sum_scaling = current_it->lh_scale_factor + current_it_back->lh_scale_factor; // negative when at least one direction was scaled
+    //double sum_scaling = 0.0;
+    if (sum_scaling < 0.0) {
+    	if (current_it->lh_scale_factor == 0.0) { // only current_it_back was scaled
+			for (i = 0; i < nptn; i++) {
+				ptn_lh[i] = _pattern_lh[i] + (max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
+			}
+    	} else if (current_it_back->lh_scale_factor == 0.0){ // only current_it was scaled
+			for (i = 0; i < nptn; i++) {
+				ptn_lh[i] = _pattern_lh[i] + (max(UBYTE(0), current_it->scale_num[i])) * LOG_SCALING_THRESHOLD;
+			}
+    	} else { // both directions were scaled
+			for (i = 0; i < nptn; i++) {
+				ptn_lh[i] = _pattern_lh[i] + (max(UBYTE(0), current_it->scale_num[i]) +
+					max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
+			}
+    	}
+    } else {
+        memmove(ptn_lh, _pattern_lh, nptn * sizeof(double)); // no scaling: plain copy
+    }
+    if (ptn_lh_cat) {
+    	int offset = 0; // running index into the pattern-major (nptn x ncat) arrays
+    	if (sum_scaling == 0.0) {
+    		int nptncat = nptn * ncat;
+            for (i = 0; i < nptncat; i++) {
+            	ptn_lh_cat[i] = log(_pattern_lh_cat[i]);
+            }
+    	} else if (current_it->lh_scale_factor == 0.0) {
+			for (i = 0; i < nptn; i++) {
+				double scale = (max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
+				for (int j = 0; j < ncat; j++, offset++)
+					ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
+			}
+    	} else if (current_it_back->lh_scale_factor == 0.0) {
+			for (i = 0; i < nptn; i++) {
+				double scale = (max(UBYTE(0), current_it->scale_num[i])) * LOG_SCALING_THRESHOLD;
+				for (int j = 0; j < ncat; j++, offset++)
+					ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
+			}
+    	} else {
+			for (i = 0; i < nptn; i++) {
+				double scale = (max(UBYTE(0), current_it->scale_num[i]) +
+						max(UBYTE(0), current_it_back->scale_num[i])) * LOG_SCALING_THRESHOLD;
+				for (int j = 0; j < ncat; j++, offset++)
+					ptn_lh_cat[offset] = log(_pattern_lh_cat[offset]) + scale;
+			}
+    	}
+    }
+//    if (cur_logl) {
+//        double check_score = 0.0;
+//        for (int i = 0; i < nptn; i++) {
+//            check_score += (ptn_lh[i] * (aln->at(i).frequency));
+//        }
+//        if (fabs(check_score - *cur_logl) > 0.01) {
+//            cout << *cur_logl << " " << check_score << endl;
+//            assert(0);
+//        }
+//    }
+    //double score = computeLikelihoodBranch(dad_branch, dad, pattern_lh);
+    //return score;
+}
+
+int PhyloTree::computePatternCategories(IntVector *pattern_ncat) { /*
+     For every pattern, normalise the per-mixture (or per-rate-category) likelihoods
+     read from _pattern_lh_cat, order them by decreasing share and set in
+     ptn_cat_mask[ptn] the bits of the leading classes until their cumulative share
+     exceeds 0.99.
+
+     pattern_ncat : (OUT) optional; resized to the number of patterns and filled with
+                    the number of classes selected for each pattern
+     Returns the largest number of selected classes over all patterns.
+
+     NOTE(review): ptn_cat_mask is only resized when empty and bits are OR-ed in, so
+     bits set by an earlier call are kept - confirm that this is intended.
+     */
+    if (sse != LK_EIGEN) {
+        // compute _pattern_lh_cat
+        if (!getModel()->isMixture())
+            computeLikelihoodBranchEigen((PhyloNeighbor*)root->neighbors[0], (PhyloNode*)root);
+        else if (getModelFactory()->fused_mix_rate) {
+            computeMixrateLikelihoodBranchEigen((PhyloNeighbor*)root->neighbors[0], (PhyloNode*)root);
+            assert(getModel()->getNMixtures() == getRate()->getNRate());
+        } else {
+            computeMixtureLikelihoodBranchEigen((PhyloNeighbor*)root->neighbors[0], (PhyloNode*)root);
+        }
+    }
+    
+	size_t npattern = aln->getNPattern();
+    size_t ncat = getRate()->getNRate();
+    size_t nmixture; // number of classes ranked per pattern: mixtures, or rate categories when there is no separate mixture
+    if (getModel()->isMixture() && !getModelFactory()->fused_mix_rate)
+    	nmixture = getModel()->getNMixtures();
+    else
+    	nmixture = ncat;
+    size_t ptn, m, c;
+    if (pattern_ncat)
+        pattern_ncat->resize(npattern);
+    if (ptn_cat_mask.empty())
+        ptn_cat_mask.resize(npattern, 0);
+    
+    size_t num_best_mixture = 0;
+    assert(ncat < sizeof(uint64_t)*8 && nmixture < sizeof(uint64_t)*8); // each class needs its own bit in a 64-bit mask
+
+	double *lh_cat = _pattern_lh_cat; // cursor that advances through _pattern_lh_cat pattern by pattern
+//    double *cat_prob = new double[ncat];
+    double *lh_mixture = new double[nmixture];
+    double *sorted_lh_mixture = new double[nmixture];
+    int *id_mixture = new int[nmixture];
+    
+//    for (c = 0; c < ncat; c++)
+//        cat_prob[c] = getRate()->getProp(c);
+    
+//    cout << "Ptn\tFreq\tNumMix\tBestMix" << endl;
+    size_t sum_nmix = 0; // only accumulated; used by the commented-out report below
+	for (ptn = 0; ptn < npattern; ptn++) {
+		double sum_prob = 0.0, acc_prob = 0.0;
+        memset(lh_mixture, 0, nmixture*sizeof(double));
+        if (getModel()->isMixture() && !getModelFactory()->fused_mix_rate) {
+            // separate mixture: sum the ncat category values of each mixture class
+            for (m = 0; m < nmixture; m++) {
+                for (c = 0; c < ncat; c++) {
+//                    lh_mixture[m] += lh_cat[c] * cat_prob[c];
+                    lh_mixture[m] += lh_cat[c];
+                }
+//                lh_mixture[m] *= prop[m];
+                sum_prob += lh_mixture[m];
+                lh_cat += ncat;
+                id_mixture[m] = m;
+            }
+        } else {
+            // one value per class
+            for (m = 0; m < nmixture; m++) {
+//                lh_mixture[m] = lh_cat[m] * prop[m];
+                lh_mixture[m] = lh_cat[m];
+                sum_prob += lh_mixture[m];
+                id_mixture[m] = m;
+            }
+            lh_cat += nmixture;
+        }
+        sum_prob = 1.0 / sum_prob; // from here on sum_prob is the normalising factor
+        for (m = 0; m < nmixture; m++) {
+            lh_mixture[m] *= sum_prob;
+            sorted_lh_mixture[m] = -lh_mixture[m]; // negated so that the largest share gets the smallest key
+        }
+        quicksort(sorted_lh_mixture, 0, m-1, id_mixture); // presumably sorts ascending and permutes id_mixture alongside - TODO confirm
+        for (m = 0; m < nmixture && acc_prob <= 0.99; m++) {
+            acc_prob -= sorted_lh_mixture[m]; // keys are negated, so this adds the share
+            ptn_cat_mask[ptn] |= (uint64_t)1 << id_mixture[m];
+        }
+        // m is now the number of classes selected for this pattern
+        if (m > num_best_mixture)
+            num_best_mixture = m;
+        sum_nmix += m;
+        if (pattern_ncat)
+            (*pattern_ncat)[ptn] = m;
+
+        if (verbose_mode >= VB_MED) {
+            cout << ptn << "\t" << (int)ptn_freq[ptn] << "\t" << m << "\t" << id_mixture[0];
+            for (c = 0; c < m; c++)
+                cout  << "\t" << id_mixture[c] << "\t" << -sorted_lh_mixture[c];
+            cout << endl;
+        }
+	}
+//    cout << 100*(double(sum_nmix)/nmixture)/npattern << "% computation necessary" << endl;
+    delete [] id_mixture;
+    delete [] sorted_lh_mixture;
+    delete [] lh_mixture;
+//    delete [] cat_prob;
+    return num_best_mixture;
+}
+
+/**
+ * Compute the variance of the tree log-likelihood from the pattern
+ * log-likelihoods, weighting each pattern by its frequency.
+ * @param ptn_lh pattern log-likelihoods; when NULL they are computed here
+ * @param tree_lh tree log-likelihood; when 0.0 it is summed up from the patterns
+ * @return variance scaled by nsite / (nsite - 1), or 0.0 when there is at most one site
+ */
+double PhyloTree::computeLogLVariance(double *ptn_lh, double tree_lh) {
+    const int nptn = getAlnNPattern();
+    const int nsite = getAlnNSite();
+
+    // use the caller's values if given, otherwise compute them into a temporary array
+    const bool own_buffer = (ptn_lh == NULL);
+    double *pattern_lh = ptn_lh;
+    if (own_buffer) {
+        pattern_lh = new double[nptn];
+        computePatternLikelihood(pattern_lh);
+    }
+
+    IntVector pattern_freq;
+    aln->getPatternFreq(pattern_freq);
+
+    if (tree_lh == 0.0) {
+        for (int ptn = 0; ptn < nptn; ptn++) {
+            tree_lh += pattern_lh[ptn] * pattern_freq[ptn];
+        }
+    }
+
+    const double avg_site_lh = tree_lh / nsite;
+    double variance = 0.0;
+    for (int ptn = 0; ptn < nptn; ptn++) {
+        const double diff = pattern_lh[ptn] - avg_site_lh;
+        variance += diff * diff * pattern_freq[ptn];
+    }
+
+    if (own_buffer) {
+        delete[] pattern_lh;
+    }
+    return (nsite <= 1) ? 0.0 : variance * ((double) nsite / (nsite - 1.0));
+}
+
+/**
+ * Compute the variance of the log-likelihood difference between this tree and
+ * another set of pattern log-likelihoods, weighting each pattern by its frequency.
+ * @param pattern_lh_other pattern log-likelihoods to compare against
+ * @param ptn_lh pattern log-likelihoods of this tree; when NULL they are computed here
+ * @return variance scaled by nsite / (nsite - 1), or 0.0 when there is at most one site
+ */
+double PhyloTree::computeLogLDiffVariance(double *pattern_lh_other, double *ptn_lh) {
+    const int nptn = getAlnNPattern();
+    const int nsite = getAlnNSite();
+
+    // use the caller's values if given, otherwise compute them into a temporary array
+    const bool own_buffer = (ptn_lh == NULL);
+    double *pattern_lh = ptn_lh;
+    if (own_buffer) {
+        pattern_lh = new double[nptn];
+        computePatternLikelihood(pattern_lh);
+    }
+
+    IntVector pattern_freq;
+    aln->getPatternFreq(pattern_freq);
+
+    // average per-site difference
+    double avg_site_lh_diff = 0.0;
+    for (int ptn = 0; ptn < nptn; ptn++) {
+        avg_site_lh_diff += (pattern_lh[ptn] - pattern_lh_other[ptn]) * pattern_freq[ptn];
+    }
+    avg_site_lh_diff /= nsite;
+
+    double variance = 0.0;
+    for (int ptn = 0; ptn < nptn; ptn++) {
+        const double diff = pattern_lh[ptn] - pattern_lh_other[ptn] - avg_site_lh_diff;
+        variance += diff * diff * pattern_freq[ptn];
+    }
+
+    if (own_buffer) {
+        delete[] pattern_lh;
+    }
+    return (nsite <= 1) ? 0.0 : variance * ((double) nsite / (nsite - 1.0));
+}
+
+/**
+ * Compute the variance of the log-likelihood difference between this tree and
+ * other_tree.
+ * @param other_tree tree whose pattern log-likelihoods are compared against
+ * @param pattern_lh pattern log-likelihoods of this tree; forwarded unchanged to the
+ *        (double*, double*) overload, which computes them itself when NULL
+ * @return the value returned by computeLogLDiffVariance(double*, double*)
+ */
+double PhyloTree::computeLogLDiffVariance(PhyloTree *other_tree, double *pattern_lh) {
+    double *pattern_lh_other = new double[getAlnNPattern()];
+    other_tree->computePatternLikelihood(pattern_lh_other);
+    double res = computeLogLDiffVariance(pattern_lh_other, pattern_lh);
+    // free the temporary array only after the variance has been computed from it;
+    // it used to be freed before this call, which read freed memory
+    delete[] pattern_lh_other;
+    return res;
+}
+
+/**
+ * Collect every node that is not in markedNodeList and has exactly one
+ * neighbor that is not in markedNodeList either.
+ * @param unmarkedNodes (OUT) matching nodes are appended here
+ * @param node current node of the traversal; root is used when NULL
+ * @param dad node the traversal came from
+ */
+void PhyloTree::getUnmarkedNodes(PhyloNodeVector& unmarkedNodes, PhyloNode* node, PhyloNode* dad) {
+    if (!node)
+        node = (PhyloNode*) root;
+
+    const bool node_unmarked = (markedNodeList.find(node->id) == markedNodeList.end());
+    if (node_unmarked) {
+        // count the neighbors that are unmarked as well
+        int numUnmarkedNei = 0;
+        NeighborVec::iterator nei_it;
+        for (nei_it = node->neighbors.begin(); nei_it != node->neighbors.end(); ++nei_it) {
+            if (markedNodeList.find((*nei_it)->node->id) == markedNodeList.end())
+                ++numUnmarkedNei;
+        }
+        if (numUnmarkedNei == 1)
+            unmarkedNodes.push_back(node);
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        getUnmarkedNodes(unmarkedNodes, (PhyloNode*) (*it)->node, node);
+    }
+}
+
+double PhyloTree::optimizeOneBranchLS(PhyloNode *node1, PhyloNode *node2) { /*
+     Compute a least-squares estimate for the length of branch (node1, node2) from
+     the cached values in subTreeDists and, for the weighted variants, subTreeWeights.
+
+     If subTreeDistComputed is false, computeSubtreeDists() is called first (and for
+     WLS_PAUPLIN var_matrix is filled from nodeBranchDists before that).
+     The estimate is only returned; this function does not store it on the branch.
+
+     node1, node2 : the two end nodes of the branch
+     Returns the estimated branch length (lsBranch).
+
+     NOTE(review): the lookups use operator[] on subTreeDists / subTreeWeights and the
+     presence of the key is only checked by assert - presumably a missing key would be
+     silently inserted in builds without assertions; verify.
+     */
+    if (!subTreeDistComputed) {
+    	if (params->ls_var_type == WLS_PAUPLIN) {
+    		computeNodeBranchDists();
+    		for (int i = 0; i < leafNum; i++)
+    			for (int j = 0; j < leafNum; j++)
+    				var_matrix[i*leafNum+j] = pow(2.0,nodeBranchDists[i*nodeNum+j]); // note: var_matrix uses stride leafNum, nodeBranchDists stride nodeNum
+    	}
+        computeSubtreeDists();
+    }
+    double A, B, C, D; // values returned by getNumTaxa() for the subtrees hanging off the branch; 0 means "not set yet"
+    A = B = C = D = 0;
+    PhyloNode *nodeA = NULL, *nodeB = NULL, *nodeC = NULL, *nodeD = NULL;
+    double lsBranch;
+
+    // One of the node is a leaf
+    if (node1->isLeaf() || node2->isLeaf()) {
+        if (node1->isLeaf()) {
+            // nodeA and nodeB are children of node2
+            FOR_NEIGHBOR_IT(node2, node1, it){ // the first neighbor visited becomes A, the next one B
+				if (A == 0) {
+					A = getNumTaxa((*it)->node, node2);
+					nodeA = (PhyloNode*) (*it)->node;
+				} else {
+					B = getNumTaxa((*it)->node, node2);
+					nodeB = (PhyloNode*) (*it)->node;
+				}
+            }
+			// nodeC is now node1
+			nodeC = node1;
+		} else {
+			// nodeA and nodeB are children of node1
+			FOR_NEIGHBOR_IT(node1, node2, it) {
+				if (A == 0) {
+					A = getNumTaxa((*it)->node, node1);
+					nodeA = (PhyloNode*) (*it)->node;
+				} else {
+					B = getNumTaxa((*it)->node, node1);
+					nodeB = (PhyloNode*) (*it)->node;
+				}
+			}
+			// nodeC is now node2
+			nodeC = node2;
+		}
+		assert(A != 0);
+		assert(B != 0);
+		string keyAC = getBranchID(nodeA, nodeC);
+		assert(subTreeDists.count(keyAC));
+		double distAC = subTreeDists[keyAC];
+		double weightAC = subTreeWeights[keyAC];
+		string keyBC = getBranchID(nodeB, nodeC);
+		assert(subTreeDists.count(keyBC));
+		double distBC = subTreeDists[keyBC];
+		double weightBC = subTreeWeights[keyBC];
+		string keyAB = getBranchID(nodeA, nodeB);
+		assert(subTreeDists.count(keyAB));
+		double distAB = subTreeDists[keyAB];
+		double weightAB = subTreeWeights[keyAB];
+		if (params->ls_var_type == OLS/* || params->ls_var_type == FIRST_TAYLOR || params->ls_var_type == FITCH_MARGOLIASH
+				|| params->ls_var_type == SECOND_TAYLOR*/) {
+			lsBranch = 0.5 * (distAC / A + distBC / B - distAB / (A * B)); // ordinary least squares: distances divided by the subtree counts
+		} /*else if (params->ls_var_type == PAUPLIN) {
+			// TODO: never been tested
+			outError("Paulin formula not supported yet");
+			lsBranch = 0.5 * (distAC + distBC) - 0.5 * distAB;
+		}*/ else {
+			// weighted least square
+			lsBranch = 0.5*(distAC/weightAC + distBC/weightBC - distAB/weightAB);
+		}
+	} else { // Both node are internal node
+		FOR_NEIGHBOR_IT(node1, node2, it) { // A and B hang off node1
+			if (A == 0) {
+				A = getNumTaxa((*it)->node, node1);
+				nodeA = (PhyloNode*) (*it)->node;
+			} else {
+				B = getNumTaxa((*it)->node, node1);
+				nodeB = (PhyloNode*) (*it)->node;
+			}
+		}
+
+		FOR_NEIGHBOR_IT(node2, node1, it) { // C and D hang off node2
+			if (C == 0) {
+				C = getNumTaxa((*it)->node, node2);
+				nodeC = (PhyloNode*) (*it)->node;
+			} else {
+				D = getNumTaxa((*it)->node, node2);
+				nodeD = (PhyloNode*) (*it)->node;
+			}
+		}
+
+		string keyAC = getBranchID(nodeA, nodeC);
+		assert(subTreeDists.count(keyAC));
+		double distAC = subTreeDists[keyAC];
+		double weightAC = subTreeWeights[keyAC];
+
+		string keyBD = getBranchID(nodeB, nodeD);
+		assert(subTreeDists.count(keyBD));
+		double distBD = subTreeDists[keyBD];
+		double weightBD = subTreeWeights[keyBD];
+
+		string keyBC = getBranchID(nodeB, nodeC);
+		assert(subTreeDists.count(keyBC));
+		double distBC = subTreeDists[keyBC];
+		double weightBC = subTreeWeights[keyBC];
+
+		string keyAD = getBranchID(nodeA, nodeD);
+		assert(subTreeDists.count(keyAD));
+		double distAD = subTreeDists[keyAD];
+		double weightAD = subTreeWeights[keyAD];
+
+		string keyAB = getBranchID(nodeA, nodeB);
+		assert(subTreeDists.count(keyAB));
+		double distAB = subTreeDists[keyAB];
+		double weightAB = subTreeWeights[keyAB];
+
+		string keyCD = getBranchID(nodeC, nodeD);
+		assert(subTreeDists.count(keyCD));
+		double distCD = subTreeDists[keyCD];
+		double weightCD = subTreeWeights[keyCD];
+
+		/*if (params->ls_var_type == PAUPLIN) {
+			// this distance has a typo as also seen in Mihaescu & Pachter 2008
+			//lsBranch = 0.25 * (distAC + distBD + distAD + distBC) - 0.5 * (distAB - distCD);
+			outError("Paulin formula not supported yet");
+			lsBranch = 0.25 * (distAC + distBD + distAD + distBC) - 0.5 * (distAB + distCD);
+		} else*/ if (params->ls_var_type == OLS) {
+			double gamma = (B * C + A * D) / ((A + B)*(C + D)); // mixing weight between the (AC, BD) and the (BC, AD) pairings
+			lsBranch = 0.5 * (gamma * (distAC / (A * C) + distBD / (B * D))
+					+ (1 - gamma) * (distBC / (B * C) + distAD / (A * D))
+					- distAB / (A * B) - distCD / (C * D));
+		} else {
+			// weighted least square
+			double K = 1.0/weightAC + 1.0/weightBD + 1.0/weightAD + 1.0/weightBC;
+			lsBranch =
+					((distAC/weightAC+distBD/weightBD)*(weightAD+weightBC)/(weightAD*weightBC)+
+					(distAD/weightAD+distBC/weightBC)*(weightAC+weightBD)/(weightAC*weightBD))/K
+					- distAB/weightAB - distCD/weightCD;
+			lsBranch = 0.5*lsBranch;
+		}
+	}
+    return lsBranch;
+}
+
+/**
+ * Refresh the cached subtree-distance sums in subTreeDists for the NNI
+ * described by nnimove. subTreeDistComputed must already be set.
+ *
+ * Around the central branch (node1, node2) the four subtrees are laid out as
+ * ((A,C),(B,D)): A and C hang off node1, B and D hang off node2, and C and D
+ * are the two subtrees that get swapped.
+ *
+ * @param nnimove supplies node1, node2 and the iterators to the two swapped
+ *                neighbors
+ */
+void PhyloTree::updateSubtreeDists(NNIMove &nnimove) {
+    assert(subTreeDistComputed);
+
+    PhyloNode *node1 = nnimove.node1;
+    PhyloNode *node2 = nnimove.node2;
+    Neighbor *swappedNei1 = *(nnimove.node1Nei_it);
+    Neighbor *swappedNei2 = *(nnimove.node2Nei_it);
+
+    // node1 side: the swapped neighbor roots C, the remaining one roots A
+    PhyloNode *nodeA = NULL;
+    PhyloNode *nodeC = NULL;
+    FOR_NEIGHBOR_IT(node1, node2, it) {
+        if ((*it)->id == swappedNei1->id)
+            nodeC = (PhyloNode*) (*it)->node;
+        else
+            nodeA = (PhyloNode*) (*it)->node;
+    }
+    assert(nodeA);
+    assert(nodeC);
+
+    // node2 side: the swapped neighbor roots D, the remaining one roots B
+    PhyloNode *nodeB = NULL;
+    PhyloNode *nodeD = NULL;
+    FOR_NEIGHBOR_IT(node2, node1, it) {
+        if ((*it)->id == swappedNei2->id)
+            nodeD = (PhyloNode*) (*it)->node;
+        else
+            nodeB = (PhyloNode*) (*it)->node;
+    }
+    assert(nodeB);
+    assert(nodeD);
+
+    NodeVector nodeListA, nodeListB, nodeListC, nodeListD;
+    getAllNodesInSubtree(nodeA, node1, nodeListA);
+    getAllNodesInSubtree(nodeC, node1, nodeListC);
+    getAllNodesInSubtree(nodeB, node2, nodeListB);
+    getAllNodesInSubtree(nodeD, node2, nodeListD);
+
+    // every node of A: entry towards node2 becomes (towards B) + (towards D)
+    for (NodeVector::iterator cur = nodeListA.begin(); cur != nodeListA.end(); ++cur) {
+        double towardsB = subTreeDists.find(getBranchID((*cur), nodeB))->second;
+        double towardsD = subTreeDists.find(getBranchID((*cur), nodeD))->second;
+        StringDoubleMap::iterator entry = subTreeDists.find(getBranchID((*cur), node2));
+        assert(entry != subTreeDists.end());
+        entry->second = towardsB + towardsD;
+    }
+
+    // every node of B: entry towards node1 becomes (towards C) + (towards A)
+    for (NodeVector::iterator cur = nodeListB.begin(); cur != nodeListB.end(); ++cur) {
+        double towardsC = subTreeDists.find(getBranchID((*cur), nodeC))->second;
+        double towardsA = subTreeDists.find(getBranchID((*cur), nodeA))->second;
+        StringDoubleMap::iterator entry = subTreeDists.find(getBranchID((*cur), node1));
+        assert(entry != subTreeDists.end());
+        entry->second = towardsC + towardsA;
+    }
+
+    // every node of C: entry towards node2 becomes (towards D) + (towards B)
+    for (NodeVector::iterator cur = nodeListC.begin(); cur != nodeListC.end(); ++cur) {
+        double towardsD = subTreeDists.find(getBranchID((*cur), nodeD))->second;
+        double towardsB = subTreeDists.find(getBranchID((*cur), nodeB))->second;
+        StringDoubleMap::iterator entry = subTreeDists.find(getBranchID((*cur), node2));
+        assert(entry != subTreeDists.end());
+        entry->second = towardsD + towardsB;
+    }
+
+    // every node of D: entry towards node1 becomes (towards A) + (towards C)
+    for (NodeVector::iterator cur = nodeListD.begin(); cur != nodeListD.end(); ++cur) {
+        double towardsA = subTreeDists.find(getBranchID((*cur), nodeA))->second;
+        double towardsC = subTreeDists.find(getBranchID((*cur), nodeC))->second;
+        StringDoubleMap::iterator entry = subTreeDists.find(getBranchID((*cur), node1));
+        assert(entry != subTreeDists.end());
+        entry->second = towardsA + towardsC;
+    }
+
+    // finally the central branch itself: sum over the four cross pairs
+    double pairAB = subTreeDists.find(getBranchID(nodeA, nodeB))->second;
+    double pairAD = subTreeDists.find(getBranchID(nodeA, nodeD))->second;
+    double pairCB = subTreeDists.find(getBranchID(nodeC, nodeB))->second;
+    double pairCD = subTreeDists.find(getBranchID(nodeC, nodeD))->second;
+
+    subTreeDists.find(getBranchID(node1, node2))->second = pairAB + pairAD + pairCB + pairCD;
+
+}
+
+/**
+ * Rebuild subTreeDists and subTreeWeights from scratch.
+ *
+ * Nodes are processed in rounds, bottom up: each round asks getUnmarkedNodes()
+ * for the current candidates, runs computeAllSubtreeDistForOneNode() for each
+ * of them and then records the node in markedNodeList. The loop ends when no
+ * unmarked node is left. On exit markedNodeList is emptied again and
+ * subTreeDistComputed is set to true.
+ */
+void PhyloTree::computeSubtreeDists() {
+    PhyloNodeVector unmarkedNodes;
+    subTreeDists.clear();
+    subTreeWeights.clear();
+    do {
+        // Generate a list of unmarked nodes that are adjacent to exactly one unmarked node.
+        // Here we work up the tree in a bottom-up manner.
+        unmarkedNodes.clear();
+        getUnmarkedNodes(unmarkedNodes);
+        if (unmarkedNodes.size() == 0)
+            break;
+
+        for (PhyloNodeVector::iterator it = unmarkedNodes.begin(); it != unmarkedNodes.end(); ++it) {
+            // If the node is internal, its child nodes should already be marked:
+            // source_nei1 and source_nei2 are the 2 marked child nodes,
+            // nextNode is the remaining (unmarked) neighbor, used for traversal.
+            PhyloNode* source_nei1 = NULL;
+            PhyloNode* source_nei2 = NULL;
+            // Must start out as NULL: nodes are marked while this list is being
+            // walked, so an internal node can reach this point with all of its
+            // neighbors already marked, in which case nothing assigns nextNode.
+            PhyloNode* nextNode = NULL;
+            if (!(*it)->isLeaf()) {
+                // select the 2 marked child nodes
+                for (NeighborVec::iterator it2 = (*it)->neighbors.begin(); it2 != (*it)->neighbors.end(); ++it2) {
+                    if (markedNodeList.find((*it2)->node->id) != markedNodeList.end()) {
+                        if (!source_nei1) {
+                            source_nei1 = (PhyloNode*) (*it2)->node;
+                        } else {
+                            source_nei2 = (PhyloNode*) (*it2)->node;
+                        }
+                    } else {
+                        nextNode = (PhyloNode*) (*it2)->node;
+                    }
+                }
+                assert(source_nei1);
+                assert(source_nei2);
+            } else {
+                nextNode = (PhyloNode*) (*it)->neighbors[0]->node;
+            }
+            // Without an unmarked neighbor there is no direction left to
+            // traverse; passing an indeterminate pointer on would be undefined
+            // behaviour, so the traversal is skipped and the node is only marked.
+            if (nextNode)
+                computeAllSubtreeDistForOneNode((*it), source_nei1, source_nei2, (*it), nextNode);
+            markedNodeList.insert(IntPhyloNodeMap::value_type((*it)->id, (*it)));
+        }
+    } while (true);
+    markedNodeList.clear();
+    subTreeDistComputed = true;
+}
+
+/**
+ * Store in subTreeDists / subTreeWeights the summed distance and weight for the
+ * pair (source, dad), recursing away from source through dad where necessary.
+ *
+ * Nothing is done when dad is already in markedNodeList. Otherwise:
+ *  - source and dad both leaves: the value comes straight from dist_matrix
+ *    (and var_matrix unless params->ls_var_type is OLS);
+ *  - source internal, dad a leaf: the value is the sum of the entries already
+ *    stored for (source_nei1, dad) and (source_nei2, dad);
+ *  - dad internal: the two neighbors of dad other than node are processed
+ *    recursively and their entries for source are added up.
+ *
+ * @param source       node whose distances are being collected
+ * @param source_nei1  first marked child of source (NULL when source is a leaf)
+ * @param source_nei2  second marked child of source (NULL when source is a leaf)
+ * @param node         node the traversal arrived from
+ * @param dad          node on the far side of the current branch
+ */
+void PhyloTree::computeAllSubtreeDistForOneNode(PhyloNode* source, PhyloNode* source_nei1, PhyloNode* source_nei2,
+        PhyloNode* node, PhyloNode* dad) {
+    string key = getBranchID(source, dad);
+    double dist, weight;
+    if (markedNodeList.find(dad->id) != markedNodeList.end()) {
+        return;
+    } else if (source->isLeaf() && dad->isLeaf()) {
+        assert(dist_matrix);
+        int nseq = aln->getNSeq();
+        if (params->ls_var_type == OLS) {
+            dist = dist_matrix[dad->id * nseq + source->id];
+            weight = 1.0;
+        } else {
+            // this will take into account variances, also work for OLS since var = 1
+            weight = 1.0/var_matrix[dad->id * nseq + source->id];
+            dist = dist_matrix[dad->id * nseq + source->id] * weight;
+        }
+        subTreeDists.insert(StringDoubleMap::value_type(key, dist));
+        subTreeWeights.insert(StringDoubleMap::value_type(key, weight));
+    } else if (!source->isLeaf() && dad->isLeaf()) {
+        assert(source_nei1);
+        assert(source_nei2);
+        // Both child entries are read on the following lines, so they must
+        // already be present. The check used to be inverted (== end()), which
+        // asserted the very condition under which the read is invalid.
+        string key1 = getBranchID(source_nei1, dad);
+        assert(subTreeDists.find(key1) != subTreeDists.end());
+        double dist1 = subTreeDists.find(key1)->second;
+        double weight1 = subTreeWeights.find(key1)->second;
+        string key2 = getBranchID(source_nei2, dad);
+        assert(subTreeDists.find(key2) != subTreeDists.end());
+        double dist2 = subTreeDists.find(key2)->second;
+        double weight2 = subTreeWeights.find(key2)->second;
+        dist = dist1 + dist2;
+        weight = weight1 + weight2;
+        subTreeDists.insert(StringDoubleMap::value_type(key, dist));
+        subTreeWeights.insert(StringDoubleMap::value_type(key, weight));
+    } else {
+        // dad is internal: find its two neighbors that lie away from node
+        PhyloNode* dad_nei1 = NULL;
+        PhyloNode* dad_nei2 = NULL;
+        for (NeighborVec::iterator it = dad->neighbors.begin(); it != dad->neighbors.end(); ++it) {
+            if ((*it)->node != node) {
+                if (!dad_nei1) {
+                    dad_nei1 = (PhyloNode*) (*it)->node;
+                } else {
+                    dad_nei2 = (PhyloNode*) (*it)->node;
+                }
+            }
+        }
+        assert(dad_nei1);
+        assert(dad_nei2);
+        computeAllSubtreeDistForOneNode(source, source_nei1, source_nei2, dad, dad_nei1);
+        computeAllSubtreeDistForOneNode(source, source_nei1, source_nei2, dad, dad_nei2);
+        string key1 = getBranchID(source, dad_nei1);
+        string key2 = getBranchID(source, dad_nei2);
+        assert(subTreeDists.find(key1) != subTreeDists.end());
+        assert(subTreeDists.find(key2) != subTreeDists.end());
+        double dist1 = subTreeDists.find(key1)->second;
+        double weight1 = subTreeWeights.find(key1)->second;
+        double dist2 = subTreeDists.find(key2)->second;
+        double weight2 = subTreeWeights.find(key2)->second;
+        dist = dist1 + dist2;
+        weight = weight1 + weight2;
+        subTreeDists.insert(StringDoubleMap::value_type(key, dist));
+        subTreeWeights.insert(StringDoubleMap::value_type(key, weight));
+    }
+}
+
+/**
+ * Fill the nodeNum x nodeNum table nodeBranchDists with the number of branches
+ * between pairs of nodes, indexed by node id, and return the ids of all nodes
+ * in the subtree below node (node itself included).
+ *
+ * Calling with node == NULL starts the computation: the table is zeroed, the
+ * traversal begins at root (asserted to be a leaf) and, when asserts are
+ * enabled, every off-diagonal entry is verified to be non-zero afterwards.
+ * The table is allocated on first use.
+ *
+ * @param node  root of the subtree to process, or NULL for the whole tree
+ * @param dad   neighbor of node that is excluded from the traversal
+ */
+set<int> PhyloTree::computeNodeBranchDists(Node *node, Node *dad) {
+    if (!nodeBranchDists) {
+        cout << "nodeNum = " << nodeNum << endl;
+        nodeBranchDists = new int[nodeNum*nodeNum];
+    }
+
+    if (!node) {
+        // top-level call: reset the table and start from the root leaf
+        memset(nodeBranchDists, 0, sizeof(int)*nodeNum*nodeNum);
+        assert(root->isLeaf());
+        dad = root;
+        node = dad->neighbors[0]->node;
+        set<int> below = computeNodeBranchDists(node, dad);
+        // the root is one branch further away than its only neighbor
+        for (set<int>::iterator p = below.begin(); p != below.end(); p++) {
+            int steps = nodeBranchDists[(*p)*nodeNum + node->id] + 1;
+            nodeBranchDists[(dad->id)*nodeNum + (*p)] = steps;
+            nodeBranchDists[(*p)*nodeNum + dad->id] = steps;
+        }
+        // sanity check that all distances are filled
+        for (int x = 0; x < nodeNum; x++) {
+            for (int y = 0; y < nodeNum; y++) {
+                if (x == y) {
+                    assert(nodeBranchDists[x*nodeNum+y] == 0);
+                } else {
+                    assert(nodeBranchDists[x*nodeNum+y] != 0);
+                }
+            }
+        }
+        return below;
+    }
+
+    if (node->isLeaf()) {
+        set<int> self;
+        self.insert(node->id);
+        return self;
+    }
+
+    assert(node->degree() == 3);
+    Node *left = NULL;
+    Node *right = NULL;
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        if (left)
+            right = (*it)->node;
+        else
+            left = (*it)->node;
+    }
+    set<int> leftIds = computeNodeBranchDists(left, node);
+    set<int> rightIds = computeNodeBranchDists(right, node);
+    set<int>::iterator p, q;
+
+    // node is one branch above each child
+    for (p = leftIds.begin(); p != leftIds.end(); p++) {
+        int steps = nodeBranchDists[(*p)*nodeNum + left->id] + 1;
+        nodeBranchDists[(node->id)*nodeNum + (*p)] = steps;
+        nodeBranchDists[(*p)*nodeNum + node->id] = steps;
+    }
+    for (p = rightIds.begin(); p != rightIds.end(); p++) {
+        int steps = nodeBranchDists[(*p)*nodeNum + right->id] + 1;
+        nodeBranchDists[(node->id)*nodeNum + (*p)] = steps;
+        nodeBranchDists[(*p)*nodeNum + node->id] = steps;
+    }
+    // paths between the two child subtrees pass through node
+    for (p = leftIds.begin(); p != leftIds.end(); p++) {
+        for (q = rightIds.begin(); q != rightIds.end(); q++) {
+            int steps = nodeBranchDists[(*p)*nodeNum+node->id]+nodeBranchDists[(*q)*nodeNum+node->id];
+            nodeBranchDists[(*q)*nodeNum+(*p)] = steps;
+            nodeBranchDists[(*p)*nodeNum + (*q)] = steps;
+        }
+    }
+
+    leftIds.insert(rightIds.begin(), rightIds.end());
+    leftIds.insert(node->id);
+    return leftIds;
+}
+
+
+/*
+    Approximate the length of the branch (node, dad) that maximises the
+    likelihood, from three likelihood evaluations around an initial guess.
+
+    node, dad: the two ends of the branch
+    b0: initial guess for the maximum
+
+    The likelihood is evaluated at b0 and at b0 -/+ sqrt(b0/seqlen); the
+    branch length stored on both neighbor records is restored afterwards.
+    Returns 0 when the fitted extremum is a minimum (ddl > 0), otherwise the
+    position b_max of the fitted maximum.
+
+    NOTE(review): the t* expressions look like machine-generated closed forms,
+    presumably from fitting a curve in sqrt(b) through the three points --
+    confirm against the derivation before editing them.
+*/
+double PhyloTree::approxOneBranch(PhyloNode *node, PhyloNode *dad, double b0) {
+    double b_max, ddl, b1, b2, std, seqlen;
+    double t1, t3, t5, t11, t18, t21, t26, t29, t30, t32, t44, t46, t48;
+    double beps = 1/DBL_MAX; // tiny positive stand-in for a non-positive b1
+
+    /* sequence length, taken as the number of alignment sites */
+    seqlen = getAlnNSite();
+
+    /* use a robust first order approximation to the variance */
+    std = sqrt(b0/seqlen);
+
+    /* determine neighbour points */
+    b1 = b0 - std;
+    if (b1<=0) b1 = beps; /* only happens for b<=1 with small seq. len. */
+    b2 = b0 + std;
+
+    /* log-likelihoods l0, l1, l2 at b0, b1, b2; both directions of the branch
+       are set for each evaluation and reset to old_len at the end */
+    PhyloNeighbor *dad_nei = (PhyloNeighbor*)(dad->findNeighbor(node));
+    PhyloNeighbor *node_nei = (PhyloNeighbor*)(node->findNeighbor(dad));
+    double old_len = dad_nei->length;
+    dad_nei->length = node_nei->length = b0;
+    double l0 = computeLikelihoodBranch(dad_nei, dad);
+    dad_nei->length = node_nei->length = b1;
+    double l1 = computeLikelihoodBranch(dad_nei, dad);
+    dad_nei->length = node_nei->length = b2;
+    double l2 = computeLikelihoodBranch(dad_nei, dad);
+    dad_nei->length = node_nei->length = old_len;
+
+    t1 = sqrt(b0);
+    t3 = sqrt(b2);
+    t5 = sqrt(b1);
+    t11 = pow(-t1*l2+t3*l0+t5*l2+t1*l1-t5*l0-t3*l1,2.0);
+    t18 = -b0*l2+b2*l0+b1*l2+b0*l1-b1*l0-b2*l1;
+    t21 = t1-t5;
+    t26 = -t1*t3+t1*t5+b2-t5*t3;
+    t29 = t18*t18;
+    t30 = 1/t11;
+    t32 = sqrt(t29*t30);
+    ddl = -2.0*t11/t18/t21/t26/t32; // its sign decides minimum vs. maximum below
+
+    if (ddl > 0) {
+        /* the analytic extremum is a minimum,
+           so the maximum is at the lower bound */
+        b_max = 0;
+    } else {
+        t44 = pow(-t1*b2+t5*b2-t5*b0+t3*b0-t3*b1+t1*b1,2.0);
+        t46 = t21*t21;
+        t48 = t26*t26;
+        b_max = t29*t44/t46/t48*t30/4.0;
+    }
+
+    return(b_max);
+}
+
+/**
+ * Walk the tree away from dad and overwrite the length of every branch with
+ * the value returned by approxOneBranch(), seeded with the branch's current
+ * length. Both directions of a branch receive the same new length.
+ *
+ * @param node  current node; NULL starts the traversal at root
+ * @param dad   node the traversal came from; NULL on the initial call, in
+ *              which case no branch is updated at this level
+ */
+void PhyloTree::approxAllBranches(PhyloNode *node, PhyloNode *dad) {
+    if (!node)
+        node = (PhyloNode*) root;
+
+    if (dad) {
+        PhyloNeighbor *towardsDad = (PhyloNeighbor*) node->findNeighbor(dad);
+        PhyloNeighbor *towardsNode = (PhyloNeighbor*) dad->findNeighbor(node);
+        double estimate = approxOneBranch(node, dad, towardsNode->length);
+        towardsDad->length = estimate;
+        towardsNode->length = estimate;
+    }
+
+    // recurse into every neighbor except the one we arrived from
+    NeighborVec::iterator nei = node->neighbors.begin();
+    while (nei != node->neighbors.end()) {
+        if ((*nei)->node != dad)
+            approxAllBranches((PhyloNode*) (*nei)->node, node);
+        nei++;
+    }
+}
+
+/*
+ void PhyloTree::computeAllSubtreeDists(PhyloNode* node, PhyloNode* dad) {
+ if (!node) {
+ node = (PhyloNode*) root;
+ }
+
+ if (dad) {
+ // This function compute all pairwise subtree distance between subtree rooted at dad and others
+
+ computeSubtreeDists(node, dad);
+ }
+
+ FOR_NEIGHBOR_IT(node, dad, it) {
+
+ computeAllSubtreeDists((PhyloNode*) (*it)->node, node);
+ }
+ }
+
+ void PhyloTree::computeSubtreeDists(PhyloNode* node, PhyloNode* dad) {
+ // if both nodes are leaf then it is trivial, just retrieve the values from the distance matrix
+ if (dad->isLeaf() && node->isLeaf()) {
+ string key = nodePair2String(dad, node);
+ assert(dist_matrix);
+ int nseq = aln->getNSeq();
+ double dist = dist_matrix[dad->id * nseq + node->id];
+ interSubtreeDistances.insert(StringDoubleMap::value_type(key, dist));
+ } else if (!dad->isLeaf() && node->isLeaf()) {
+
+ FOR_NEIGHBOR_IT(node, dad, it) {
+
+ computeSubtreeDists(dad, (PhyloNode*) (*it)->node);
+ }
+
+ }
+ }
+ */
+
+/**
+ * Estimate the length of the branch (dad, dad_branch->node) as the fraction of
+ * alignment sites at which the two ends of the branch are judged to be in
+ * different states.
+ *
+ * For every pattern the partial likelihoods on each side are summed over rate
+ * categories, weighted by the state frequencies and normalised. The two ends
+ * count as "same state" when their most probable states coincide, or when some
+ * state exceeds 1/nstates on both sides. Patterns that fail both tests add
+ * their frequency to the count, which is then divided by getAlnNSite().
+ *
+ * @return the estimate, never smaller than MIN_BRANCH_LEN
+ */
+double PhyloTree::computeBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    assert(node_branch);
+
+    // make sure the partial likelihoods on both sides of the branch exist
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihood(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihood(node_branch, node);
+
+    int nstates = aln->num_states;
+    int numCat = site_rate->getNRate();
+    size_t block = numCat * nstates;
+    size_t nptn = aln->size();
+    double cutoff = 1.0/nstates;
+
+    double *stateFreq = new double[nstates];
+    double *postNodeSide = new double[nstates];
+    double *postDadSide = new double[nstates];
+
+    model->getStateFrequency(stateFreq);
+
+    double obsLen = 0.0;
+    for (size_t ptn = 0; ptn < nptn; ptn++) {
+        size_t offset = ptn * block;
+        double *lhNodeSide = node_branch->partial_lh + (offset);
+        double *lhDadSide = dad_branch->partial_lh + (offset);
+
+        // unnormalised state probabilities on each side of the branch
+        double totalNodeSide = 0.0, totalDadSide = 0.0;
+        for (int state = 0; state < nstates; state++) {
+            postNodeSide[state] = 0.0;
+            postDadSide[state] = 0.0;
+            for (int cat = 0; cat < numCat; cat++) {
+                postNodeSide[state] += lhNodeSide[nstates * cat + state];
+                postDadSide[state] += lhDadSide[nstates * cat + state];
+            }
+            postNodeSide[state] *= stateFreq[state];
+            postDadSide[state] *= stateFreq[state];
+            totalNodeSide += postNodeSide[state];
+            totalDadSide += postDadSide[state];
+        }
+
+        // normalise, track the best state per side and look for a shared state
+        int bestNodeSide = 0, bestDadSide = 0;
+        bool sharedAboveCutoff = false;
+        for (int state = 0; state < nstates; state++) {
+            postNodeSide[state] /= totalNodeSide;
+            postDadSide[state] /= totalDadSide;
+            if (postNodeSide[state] > postNodeSide[bestNodeSide])
+                bestNodeSide = state;
+            if (postDadSide[state] > postDadSide[bestDadSide])
+                bestDadSide = state;
+            if (postNodeSide[state] > cutoff && postDadSide[state] > cutoff)
+                sharedAboveCutoff = true;
+        }
+
+        if (!sharedAboveCutoff && bestNodeSide != bestDadSide)
+            obsLen += aln->at(ptn).frequency;
+    }
+
+    obsLen /= getAlnNSite();
+    if (obsLen < MIN_BRANCH_LEN)
+        obsLen = MIN_BRANCH_LEN;
+
+    delete[] postDadSide;
+    delete[] postNodeSide;
+    delete[] stateFreq;
+
+    return obsLen;
+}
+
+/**
+ * Convert an observed proportion of differences into a corrected branch
+ * length, using H = sum_i f_i * (1 - f_i) over the model's state frequencies.
+ *
+ * @param observedBran  observed proportion of differing sites
+ * @param alpha         gamma shape; values <= 0 select the formula without
+ *                      rate heterogeneity
+ * @return MAX_BRANCH_LEN when 1 - observedBran/H is not positive, otherwise
+ *         the corrected length limited to [MIN_BRANCH_LEN, MAX_BRANCH_LEN]
+ */
+double PhyloTree::correctBranchLengthF81(double observedBran, double alpha) {
+    double H = 0.0;
+    int i = 0;
+    while (i < model->num_states) {
+        H += model->state_freq[i] * (1 - model->state_freq[i]);
+        i++;
+    }
+
+    double remaining = 1.0 - observedBran / H;
+    if (remaining <= 0.0)
+        return MAX_BRANCH_LEN;
+
+    double corrected;
+    if (alpha > 0.0) {
+        // gamma-distributed rates
+        corrected = H * alpha * (pow(remaining, -1 / alpha) - 1);
+    } else {
+        // no gamma
+        corrected = -H * log(remaining);
+    }
+
+    if (corrected < MIN_BRANCH_LEN)
+        corrected = MIN_BRANCH_LEN;
+    if (corrected > MAX_BRANCH_LEN)
+        corrected = MAX_BRANCH_LEN;
+
+    return corrected;
+}
+
+/**
+ * Observed branch length from computeBayesianBranchLength(), passed through
+ * correctBranchLengthF81() with the gamma shape of site_rate.
+ */
+double PhyloTree::computeCorrectedBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    double observed = computeBayesianBranchLength(dad_branch, dad);
+    double corrected = correctBranchLengthF81(observed, site_rate->getGammaShape());
+    return corrected;
+}
+
+/**
+ * Traverse the tree away from dad and set every branch to the value returned
+ * by computeBayesianBranchLength(), writing it to both directions of the
+ * branch before descending further.
+ *
+ * @param node  current node; NULL starts at root
+ * @param dad   node the traversal came from
+ */
+void PhyloTree::computeAllBayesianBranchLengths(Node *node, Node *dad) {
+    if (!node)
+        node = root;
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        Node *child = (*it)->node;
+        double estimate = computeBayesianBranchLength((PhyloNeighbor*) (*it), (PhyloNode*) node);
+        (*it)->length = estimate;
+        // mirror the new length onto the backward branch
+        child->findNeighbor(node)->length = (*it)->length;
+        computeAllBayesianBranchLengths(child, node);
+    }
+}
+
+/**
+ * Straightforward (non-optimised) computation of the tree log-likelihood,
+ * evaluated across the branch between dad and dad_branch->node.
+ *
+ * The two ends are swapped when the far node is a leaf, or is the ROOT_NAME
+ * node with a known root_state, so that this special end is always treated
+ * as "dad". Partial likelihoods on either side are computed on demand.
+ *
+ * Side effects: _pattern_lh_cat[ptn * ncat + cat] is written for every
+ * pattern, _pattern_lh[ptn] for every observed pattern. When
+ * model_factory->unobserved_ptns is non-empty, those extra patterns only
+ * contribute to prob_const, which is then used for the ascertainment bias
+ * correction of the observed patterns.
+ *
+ * @param dad_branch  neighbor record of dad pointing at the other end
+ * @param dad         one end of the branch
+ * @return the log-likelihood, including the scaling factors of both sides
+ */
+double PhyloTree::computeLikelihoodBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    //assert(node_branch);
+    //assert(!site_rate->isSiteSpecificRate() || !model->isSiteSpecificModel());
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    // swap node and dad if dad is a leaf
+    // NEW: swap if root_state is given
+    if (node->isLeaf() || (node->name == ROOT_NAME && root_state != aln->STATE_UNKNOWN)) {
+        PhyloNode *tmp_node = dad;
+        dad = node;
+        node = tmp_node;
+        PhyloNeighbor *tmp_nei = dad_branch;
+        dad_branch = node_branch;
+        node_branch = tmp_nei;
+        //cout << "swapped\n";
+    }
+
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihood(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihood(node_branch, node);
+    // now combine likelihood at the branch
+
+    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+    int ncat = site_rate->getNRate();
+    double p_invar = site_rate->getPInvar();
+    double p_var_cat = (1.0 - p_invar) / (double) ncat;
+    int nstates = aln->num_states;
+    size_t block = ncat * nstates;
+    int trans_size = model->getTransMatrixSize();
+    size_t ptn; // for big data size > 4GB memory required
+    int cat, state1, state2;
+    // observed patterns come first, followed by the unobserved constant ones
+    size_t nptn = aln->size() + model_factory->unobserved_ptns.size();
+    size_t orig_nptn = aln->size();
+    int discrete_cat = site_rate->getNDiscreteRate();
+    double *trans_mat = new double[discrete_cat * trans_size];
+    double *state_freq = new double[nstates];
+    model->getStateFrequency(state_freq);
+
+    // one frequency-weighted transition matrix per discrete rate category
+    if (!site_rate->isSiteSpecificRate())
+        for (cat = 0; cat < discrete_cat; cat++) {
+            //trans_mat[cat] = model->newTransMatrix();
+            double *trans_cat = trans_mat + (cat * trans_size);
+            model_factory->computeTransMatrixFreq(dad_branch->length * site_rate->getRate(cat), state_freq, trans_cat);
+        }
+
+    bool not_ptn_cat = (site_rate->getPtnCat(0) < 0);
+    double prob_const = 0.0; // probability of unobserved const patterns
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, cat, state1, state2)
+#endif
+    for (ptn = 0; ptn < nptn; ptn++) {
+        double lh_ptn = 0.0; // likelihood of the pattern
+        int dad_state = 1000; // just something big enough
+        int ptn_cat = site_rate->getPtnCat(ptn);
+        if (dad->name == ROOT_NAME && root_state != aln->STATE_UNKNOWN) {
+            dad_state = root_state;
+        } else if (dad->isLeaf()) {
+            if (ptn < orig_nptn)
+                dad_state = (*aln)[ptn][dad->id];
+            else
+                dad_state = model_factory->unobserved_ptns[ptn-orig_nptn];
+        }
+        int dad_offset = dad_state * nstates;
+        // NOTE(review): with site-specific rates trans_mat is rewritten per
+        // pattern while being shared across OpenMP threads -- confirm this
+        // combination is never run in parallel.
+        if (site_rate->isSiteSpecificRate()) {
+            if (ptn < orig_nptn)
+                model_factory->computeTransMatrixFreq(dad_branch->length * site_rate->getPtnRate(ptn), state_freq, trans_mat);
+            else
+                model_factory->computeTransMatrixFreq(dad_branch->length, state_freq, trans_mat);
+        }
+        for (cat = 0; cat < ncat; cat++) {
+            double lh_cat = 0.0; // likelihood of the pattern's category
+            size_t lh_offset = cat * nstates + ptn * block;
+            double *partial_lh_site = node_branch->partial_lh + lh_offset;
+            double *partial_lh_child = dad_branch->partial_lh + lh_offset;
+            if (dad_state < nstates) { // single state
+                // external node
+                double *trans_state = trans_mat + ((not_ptn_cat ? cat : ptn_cat) * trans_size + dad_offset);
+                // Only observed patterns carry a per-pattern model ID. The
+                // guard used to read `ptn < nptn`, which is always true here
+                // and so also queried getPtnModelID() for unobserved constant
+                // patterns; computePartialLikelihoodNaive uses orig_nptn too.
+                if (model->isSiteSpecificModel() && ptn < orig_nptn)
+                    trans_state += (nstates * nstates * model->getPtnModelID(ptn));
+                for (state2 = 0; state2 < nstates; state2++)
+                    lh_cat += partial_lh_child[state2] * trans_state[state2];
+            } else {
+                // internal node, or external node but ambiguous character
+                for (state1 = 0; state1 < nstates; state1++) {
+                    double lh_state = 0.0; // likelihood of state1
+                    double *trans_state = trans_mat + ((not_ptn_cat ? cat : ptn_cat) * trans_size + state1 * nstates);
+                    // same restriction to observed patterns as above
+                    if (model->isSiteSpecificModel() && ptn < orig_nptn)
+                        trans_state += (nstates * nstates * model->getPtnModelID(ptn));
+                    for (state2 = 0; state2 < nstates; state2++)
+                        lh_state += partial_lh_child[state2] * trans_state[state2];
+                    lh_cat += lh_state * partial_lh_site[state1];
+                }
+            }
+            lh_ptn += lh_cat;
+            _pattern_lh_cat[ptn * ncat + cat] = lh_cat;
+        }
+        if (ptn < orig_nptn) {
+            // observed pattern: mix in the invariable-site proportion
+            lh_ptn *= p_var_cat;
+            if ((*aln)[ptn].const_char == nstates)
+                lh_ptn += p_invar;
+            else if ((*aln)[ptn].const_char < nstates) {
+                lh_ptn += p_invar * state_freq[(int) (*aln)[ptn].const_char];
+            }
+            //#ifdef DEBUG
+            if (lh_ptn <= 0.0)
+                cout << "Negative likelihood: " << lh_ptn << " " << site_rate->getPtnRate(ptn) << endl;
+            //#endif
+            lh_ptn = log(lh_ptn);
+            _pattern_lh[ptn] = lh_ptn;
+            // saturated sites keep their _pattern_lh entry but are left out of the total
+            if (discard_saturated_site && site_rate->isSiteSpecificRate() && site_rate->getPtnRate(ptn) >= MAX_SITE_RATE)
+                continue;
+            tree_lh += lh_ptn * aln->at(ptn).frequency;
+        } else {
+            // unobserved constant pattern: accumulate its probability only
+            lh_ptn = lh_ptn*p_var_cat + p_invar*state_freq[(int)model_factory->unobserved_ptns[ptn-orig_nptn]];
+            prob_const += lh_ptn;
+        }
+
+    }
+    if (orig_nptn < nptn) {
+        // ascertainment bias correction
+        prob_const = log(1.0 - prob_const);
+        for (ptn = 0; ptn < orig_nptn; ptn++)
+            _pattern_lh[ptn] -= prob_const;
+        tree_lh -= aln->getNSite()*prob_const;
+    }
+    delete[] state_freq;
+    delete[] trans_mat;
+
+    return tree_lh;
+}
+
+/**
+ * Likelihood of the tree with the branch (dad, dad_branch->node) temporarily
+ * collapsed to length zero. Both directions of the branch are set to 0.0 for
+ * the evaluation and afterwards both are set to the length dad_branch had on
+ * entry.
+ */
+double PhyloTree::computeLikelihoodZeroBranch(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    PhyloNeighbor *reverse_branch = (PhyloNeighbor*) dad_branch->node->findNeighbor(dad);
+    const double length_on_entry = dad_branch->length;
+
+    dad_branch->length = 0.0;
+    reverse_branch->length = 0.0;
+    const double lh_collapsed = computeLikelihoodBranch(dad_branch, dad);
+
+    // put the branch length back
+    dad_branch->length = length_on_entry;
+    reverse_branch->length = length_on_entry;
+
+    return lh_collapsed;
+}
+
+void PhyloTree::computePartialLikelihoodNaive(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    /* Naive computation of the partial likelihood vector dad_branch->partial_lh,
+     * i.e. for the subtree at dad_branch->node as seen from dad.
+     *
+     * Leaves (when dad is given) are filled directly from the alignment states;
+     * any other node is the product, over its neighbors except dad and except a
+     * ROOT_NAME node, of the neighbor's recursively computed vector pushed
+     * through the transition matrix of the connecting branch. Entries are
+     * divided by SCALING_THRESHOLD when a whole pattern drops to or below it;
+     * this is tracked in scale_num and lh_scale_factor.
+     *
+     * Bit 0 of partial_lh_computed marks a finished vector: don't recompute
+     * the likelihood in that case.
+     *
+     * NOTE(review): scale_num is cleared with sizeof(UBYTE) yet stores -1 and
+     * is tested with < 0 / >= 0 below -- confirm its element type is signed. */
+    if (dad_branch->partial_lh_computed & 1)
+        return;
+    Node * node = dad_branch->node;
+    size_t ptn, cat;
+    int ncat = site_rate->getNRate();
+    int nstates = aln->num_states;
+    size_t block = nstates * site_rate->getNRate(); // entries per pattern
+    int trans_size = model->getTransMatrixSize();
+    size_t lh_size = (aln->size()+model_factory->unobserved_ptns.size()) * block;
+    double *partial_lh_site;
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size(); // observed + unobserved patterns
+    size_t orig_nptn = aln->size(); // observed patterns only
+
+    dad_branch->lh_scale_factor = 0.0;
+    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+
+    assert(dad_branch->partial_lh);
+    //if (!dad_branch->partial_lh)
+    //	dad_branch->partial_lh = newPartialLh();
+    if (node->isLeaf() && dad) {
+        /* external node */
+        memset(dad_branch->partial_lh, 0, lh_size * sizeof(double));
+        for (ptn = 0; ptn < nptn; ptn++) {
+            char state;
+            partial_lh_site = dad_branch->partial_lh + (ptn * block);
+            if (node->name == ROOT_NAME) {
+                state = aln->STATE_UNKNOWN;
+            } else {
+                assert(node->id < aln->getNSeq());
+                if (ptn < orig_nptn)
+                	state = (aln->at(ptn))[node->id];
+                else // ascertainment bias correction
+                	state = model_factory->unobserved_ptns[ptn-orig_nptn];
+            }
+            if (state < nstates) { // unambiguous state: 1.0 for it in every category
+				for (cat = 0; cat < ncat; cat++)
+					partial_lh_site[cat * nstates + state] = 1.0;
+			} else if (state == aln->STATE_UNKNOWN) {
+                // fill all entries (also over rate category) with 1.0
+                dad_branch->scale_num[ptn] = -1; // -1 flags an all-ones pattern
+                for (int state2 = 0; state2 < block; state2++) {
+                    partial_lh_site[state2] = 1.0;
+                }
+            } else if (aln->seq_type == SEQ_DNA) {
+                // ambiguous character, for DNA, RNA
+                state = state - (nstates - 1); // remaining value is a bitmask over the states
+                for (int state2 = 0; state2 < nstates && state2 <= 6; state2++)
+                    if (state & (1 << state2)) {
+                        for (cat = 0; cat < ncat; cat++)
+                            partial_lh_site[cat * nstates + state2] = 1.0;
+                    }
+            } else if (aln->seq_type == SEQ_PROTEIN) {
+                // ambiguous character, for protein (this comment used to say DNA, RNA)
+                state = state - (nstates); // index of the ambiguity code, asserted below to be < 3
+                assert(state < 3);
+                int state_map[] = {4+8,32+64,512+1024}; // bitmasks: states 2|3, 5|6, 9|10
+                for (int state2 = 0; state2 < 11; state2++)
+                    if (state_map[(int)state] & (1 << state2)) {
+                        for (cat = 0; cat < ncat; cat++)
+                            partial_lh_site[cat * nstates + state2] = 1.0;
+                    }
+            } else {
+            	outError("Internal error ", __func__);
+            }
+        }
+    } else {
+        /* internal node */
+        int discrete_cat = site_rate->getNDiscreteRate();
+        double *trans_mat = new double[discrete_cat * trans_size];
+        //for (cat = 0; cat < discrete_cat; cat++) trans_mat[cat] = model->newTransMatrix();
+        for (ptn = 0; ptn < lh_size; ptn++) { // start from all ones, children are multiplied in below
+            dad_branch->partial_lh[ptn] = 1.0;
+        }
+        for (ptn = 0; ptn < nptn; ptn++)
+            dad_branch->scale_num[ptn] = -1; // stays -1 until an informative child is merged
+
+        FOR_NEIGHBOR_IT(node, dad, it)if ((*it)->node->name != ROOT_NAME) {
+            computePartialLikelihoodNaive((PhyloNeighbor*) (*it), (PhyloNode*) node);
+
+            dad_branch->lh_scale_factor += ((PhyloNeighbor*) (*it))->lh_scale_factor;
+
+            if (!site_rate->isSiteSpecificRate())
+            for (cat = 0; cat < discrete_cat; cat++)
+            model_factory->computeTransMatrix((*it)->length * site_rate->getRate(cat), trans_mat + (cat * trans_size));
+
+            bool not_ptn_cat = (site_rate->getPtnCat(0) < 0);
+
+            double sum_scale = 0.0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, cat, partial_lh_site)
+#endif
+            for (ptn = 0; ptn < nptn; ptn++)
+            if (((PhyloNeighbor*) (*it))->scale_num[ptn] >= 0) {
+                // avoid the case that all child partial likelihoods equal 1.0
+                //double *partial_lh_child = ((PhyloNeighbor*) (*it))->partial_lh + (ptn*block);
+                //if (partial_lh_child[0] < 0.0) continue;
+                //
+                if (dad_branch->scale_num[ptn] < 0) dad_branch->scale_num[ptn] = 0;
+                dad_branch->scale_num[ptn] += ((PhyloNeighbor*) (*it))->scale_num[ptn];
+                int ptn_cat = 0;
+                // NOTE(review): with site-specific rates the shared trans_mat is rewritten per pattern inside this parallel loop -- confirm that combination is never run multi-threaded
+                if (ptn < orig_nptn) {
+                	ptn_cat = site_rate->getPtnCat(ptn);
+					if (site_rate->isSiteSpecificRate())
+						model_factory->computeTransMatrix((*it)->length * site_rate->getPtnRate(ptn), trans_mat);
+                } else {
+					if (site_rate->isSiteSpecificRate())
+						model_factory->computeTransMatrix((*it)->length, trans_mat);
+                }
+                for (cat = 0; cat < ncat; cat++) {
+                    size_t lh_offset = cat * nstates + ptn*block;
+                    partial_lh_site = dad_branch->partial_lh + lh_offset;
+                    double *partial_lh_child = ((PhyloNeighbor*) (*it))->partial_lh + lh_offset;
+                    for (int state = 0; state < nstates; state++) {
+                        double lh_child = 0.0;
+                        double *trans_state = trans_mat + ((not_ptn_cat ? cat : ptn_cat) * trans_size + state * nstates);
+                        if (model->isSiteSpecificModel() && ptn < orig_nptn) // per-pattern model only for observed patterns
+                        	trans_state += (nstates * nstates * model->getPtnModelID(ptn));
+                        for (int state2 = 0; state2 < nstates; state2++)
+                        lh_child += trans_state[state2] * partial_lh_child[state2];
+
+                        if (!isfinite(lh_child))
+                        	outError("Numerical error with ", __func__);
+                        partial_lh_site[state] *= lh_child;
+                    }
+                }
+                // check if one should scale partial likelihoods
+                bool do_scale = true;
+                partial_lh_site = dad_branch->partial_lh + (ptn * block);
+                for (cat = 0; cat < block; cat++) // cat is reused here as a plain index over the whole pattern block
+                if (partial_lh_site[cat] > SCALING_THRESHOLD) {
+                    do_scale = false;
+                    break;
+                }
+                if (!do_scale) continue;
+                // now do the likelihood scaling
+                /*
+                 double lh_max = partial_lh_site[0];
+                 for (cat = 1; cat < block; cat++)
+                 if (lh_max < partial_lh_site[cat]) lh_max = partial_lh_site[cat];
+                 for (cat = 0; cat < block; cat++)
+                 partial_lh_site[cat] /= lh_max;
+                 dad_branch->lh_scale_factor += log(lh_max) * (*aln)[ptn].frequency;
+
+                 */
+                for (cat = 0; cat < block; cat++)
+                partial_lh_site[cat] /= SCALING_THRESHOLD;
+                // unobserved const pattern will never have underflow
+                sum_scale += LOG_SCALING_THRESHOLD * (*aln)[ptn].frequency;
+                dad_branch->scale_num[ptn] += 1;
+
+//                if (pattern_scale)
+//                pattern_scale[ptn] += LOG_SCALING_THRESHOLD;
+            }
+            dad_branch->lh_scale_factor += sum_scale;
+        }
+        delete[] trans_mat;
+        //for (cat = ncat - 1; cat >= 0; cat--)
+        //  delete [] trans_mat[cat];
+    }
+    dad_branch->partial_lh_computed |= 1; // mark the vector as up to date
+
+}
+
+/**
+ * Naive kernel for the first and second derivative terms of the tree
+ * log-likelihood with respect to the length of one branch.
+ *
+ * For every pattern the partial likelihoods stored on both sides of the
+ * branch are combined with the three matrices filled in by
+ * ModelFactory::computeTransDervFreq(), giving a pattern value lh and two
+ * companion sums lh' and lh''.  An observed pattern contributes
+ * frequency * lh'/lh to df and frequency * (lh''/lh - (lh'/lh)^2) to ddf.
+ * Patterns taken from model_factory->unobserved_ptns are summed separately
+ * and applied after the loop as the ascertainment bias correction.
+ *
+ * @param dad_branch neighbor record whose ->node is the node opposite dad
+ * @param dad node at the other end of the branch
+ * @param df (OUT) accumulated first-derivative term
+ * @param ddf (OUT) accumulated second-derivative term
+ */
+void PhyloTree::computeLikelihoodDervNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    assert(node_branch);
+    // swap node and dad if dad is a leaf
+    // NEW: swap if root_state is given
+    if (node->isLeaf() || (node->name == ROOT_NAME && root_state != aln->STATE_UNKNOWN)) {
+        PhyloNode *tmp_node = dad;
+        dad = node;
+        node = tmp_node;
+        PhyloNeighbor *tmp_nei = dad_branch;
+        dad_branch = node_branch;
+        node_branch = tmp_nei;
+    }
+    // make sure the partial likelihoods on both sides of the branch are available
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodNaive(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodNaive(node_branch, node);
+
+    // now combine likelihood at the branch
+    df = ddf = 0.0;
+    int ncat = site_rate->getNRate();
+    double p_invar = site_rate->getPInvar();
+    double p_var_cat = (1.0 - p_invar) / (double) ncat;
+    int nstates = aln->num_states;
+    size_t block = ncat * nstates;
+    int trans_size = model->getTransMatrixSize();
+    // observed patterns come first, followed by the unobserved ones
+    size_t nptn = aln->size() + model_factory->unobserved_ptns.size();
+    size_t orig_nptn = aln->size();
+    size_t ptn, cat, state1, state2;
+
+    int discrete_cat = site_rate->getNDiscreteRate();
+
+    // one trans_size-sized slice per discrete rate category in each buffer
+    double *trans_mat = new double[discrete_cat * trans_size];
+    double *trans_derv1 = new double[discrete_cat * trans_size];
+    double *trans_derv2 = new double[discrete_cat * trans_size];
+    double *state_freq = new double[nstates];
+    model->getStateFrequency(state_freq);
+
+    // without site-specific rates the matrices depend only on the category,
+    // so they are filled once up front
+    if (!site_rate->isSiteSpecificRate())
+        for (cat = 0; cat < discrete_cat; cat++) {
+            double *trans_cat = trans_mat + (cat * trans_size);
+            double *derv1_cat = trans_derv1 + (cat * trans_size);
+            double *derv2_cat = trans_derv2 + (cat * trans_size);
+            double rate_val = site_rate->getRate(cat);
+            model_factory->computeTransDervFreq(dad_branch->length, rate_val, state_freq, trans_cat, derv1_cat,
+                    derv2_cat);
+        }
+
+    // a negative category for pattern 0 means patterns are not bound to one category
+    bool not_ptn_cat = (site_rate->getPtnCat(0) < 0);
+    double derv1_frac;
+    double derv2_frac;
+
+    double my_df = 0.0;
+    double my_ddf = 0.0;
+    double prob_const = 0.0;
+    double prob_const_derv1 = 0.0, prob_const_derv2 = 0.0;
+
+    // NOTE(review): with site-specific rates the shared trans_mat/trans_derv*
+    // buffers are rewritten inside this loop; confirm that combination is never
+    // run with more than one OpenMP thread.
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: my_df, my_ddf, prob_const, prob_const_derv1, prob_const_derv2) private(ptn, cat, state1, state2, derv1_frac, derv2_frac)
+#endif
+    for (ptn = 0; ptn < nptn; ptn++) {
+        int ptn_cat = site_rate->getPtnCat(ptn);
+        // Skip saturated observed patterns.  The guard must test the pattern
+        // index: the former "nptn<orig_nptn" could never hold because nptn is
+        // orig_nptn plus the number of unobserved patterns.
+        if (discard_saturated_site && site_rate->isSiteSpecificRate() && ptn < orig_nptn && site_rate->getPtnRate(ptn) >= MAX_SITE_RATE)
+            continue;
+        double lh_ptn = 0.0; // likelihood of the pattern
+        double lh_ptn_derv1 = 0.0;
+        double lh_ptn_derv2 = 0.0;
+        int dad_state = aln->STATE_UNKNOWN;
+
+        if (dad->name == ROOT_NAME && root_state != aln->STATE_UNKNOWN)
+            dad_state = root_state;
+        else if (dad->isLeaf()) {
+            if (ptn < orig_nptn)
+                dad_state = (*aln)[ptn][dad->id];
+            else
+                dad_state = model_factory->unobserved_ptns[ptn-orig_nptn];
+        }
+        int dad_offset = dad_state * nstates;
+        if (site_rate->isSiteSpecificRate()) {
+            // per-pattern matrices; unobserved patterns use a rate of 1.0
+            if (ptn < orig_nptn)
+                model_factory->computeTransDervFreq(dad_branch->length, site_rate->getPtnRate(ptn), state_freq, trans_mat,
+                        trans_derv1, trans_derv2);
+            else
+                model_factory->computeTransDervFreq(dad_branch->length, 1.0, state_freq, trans_mat,
+                        trans_derv1, trans_derv2);
+        }
+        for (cat = 0; cat < ncat; cat++) {
+            size_t lh_offset = cat * nstates + ptn * block;
+            double *partial_lh_site = node_branch->partial_lh + lh_offset;
+            double *partial_lh_child = dad_branch->partial_lh + lh_offset;
+            if (dad_state < nstates) {
+                // external node
+                int cat2 = (not_ptn_cat ? cat : ptn_cat) * trans_size + dad_offset;
+                if (model->isSiteSpecificModel() && ptn < orig_nptn)
+                    cat2 += (nstates * nstates * model->getPtnModelID(ptn));
+                double *trans_state = trans_mat + cat2;
+                double *derv1_state = trans_derv1 + cat2;
+                double *derv2_state = trans_derv2 + cat2;
+                for (state2 = 0; state2 < nstates; state2++) {
+                    lh_ptn += partial_lh_child[state2] * trans_state[state2];
+                    lh_ptn_derv1 += partial_lh_child[state2] * derv1_state[state2];
+                    lh_ptn_derv2 += partial_lh_child[state2] * derv2_state[state2];
+                }
+            } else {
+                // internal node, or external node but ambiguous character
+                for (state1 = 0; state1 < nstates; state1++) {
+                    double lh_state = 0.0; // likelihood of state1
+                    double lh_state_derv1 = 0.0;
+                    double lh_state_derv2 = 0.0;
+                    int cat2 = (not_ptn_cat ? cat : ptn_cat) * trans_size + state1 * nstates;
+                    if (model->isSiteSpecificModel() && ptn < orig_nptn)
+                        cat2 += (nstates * nstates * model->getPtnModelID(ptn));
+                    double *trans_state = trans_mat + cat2;
+                    double *derv1_state = trans_derv1 + cat2;
+                    double *derv2_state = trans_derv2 + cat2;
+                    for (state2 = 0; state2 < nstates; state2++) {
+                        lh_state += partial_lh_child[state2] * trans_state[state2];
+                        lh_state_derv1 += partial_lh_child[state2] * derv1_state[state2];
+                        lh_state_derv2 += partial_lh_child[state2] * derv2_state[state2];
+                    }
+                    lh_ptn += lh_state * partial_lh_site[state1];
+                    lh_ptn_derv1 += lh_state_derv1 * partial_lh_site[state1];
+                    lh_ptn_derv2 += lh_state_derv2 * partial_lh_site[state1];
+                }
+            }
+        }
+        // Tung beo's trick
+        if (lh_ptn<=0) {
+            cout << "Abnormal " << __func__;
+            abort();
+        }
+
+        if (ptn < orig_nptn) {
+            // observed pattern: add the invariable-site share, then accumulate
+            lh_ptn = lh_ptn * p_var_cat;
+            if ((*aln)[ptn].const_char == nstates)
+                lh_ptn += p_invar;
+            else if ((*aln)[ptn].const_char < nstates) {
+                lh_ptn += p_invar * state_freq[(int) (*aln)[ptn].const_char];
+            }
+            // pad folds the p_var_cat scaling and the division by lh_ptn into
+            // one factor; fall back to two steps if that factor overflows
+            double pad = p_var_cat / lh_ptn;
+            if (std::isinf(pad)) {
+                lh_ptn_derv1 *= p_var_cat;
+                lh_ptn_derv2 *= p_var_cat;
+                derv1_frac = lh_ptn_derv1 / lh_ptn;
+                derv2_frac = lh_ptn_derv2 / lh_ptn;
+            } else {
+                derv1_frac = lh_ptn_derv1 * pad;
+                derv2_frac = lh_ptn_derv2 * pad;
+            }
+            double freq = (*aln)[ptn].frequency;
+            double tmp1 = derv1_frac * freq;
+            double tmp2 = derv2_frac * freq;
+            my_df += tmp1;
+            my_ddf += tmp2 - tmp1 * derv1_frac;
+            if (!isfinite(lh_ptn) || !isfinite(my_df) || !isfinite(my_ddf)) {
+                cout << "Abnormal " << __func__;
+                abort();
+            }
+        } else {
+            // unobserved pattern: only feeds the ascertainment correction
+            lh_ptn = lh_ptn*p_var_cat + p_invar*state_freq[(int)model_factory->unobserved_ptns[ptn-orig_nptn]];
+            prob_const += lh_ptn;
+            prob_const_derv1 += lh_ptn_derv1 * p_var_cat;
+            prob_const_derv2 += lh_ptn_derv2 * p_var_cat;
+        }
+
+    }
+    if (orig_nptn < nptn) {
+        // ascertainment bias correction
+        prob_const = 1.0 - prob_const;
+        derv1_frac = prob_const_derv1 / prob_const;
+        derv2_frac = prob_const_derv2 / prob_const;
+        int nsites = aln->getNSite();
+        my_df += nsites * derv1_frac;
+        my_ddf += nsites *(derv2_frac + derv1_frac*derv1_frac);
+    }
+    delete[] state_freq;
+    delete[] trans_derv2;
+    delete[] trans_derv1;
+    delete[] trans_mat;
+    df = my_df;
+    ddf = my_ddf;
+}
+
+/****************************************************************************
+ Branch length optimization by maximum likelihood
+ ****************************************************************************/
+
+// Lower bound, upper bound and minimum tolerance that
+// PhyloTree::optimizeTreeLengthScaling() passes to minimizeOneDimen().
+const double MIN_TREE_LENGTH_SCALE = 0.001;
+const double MAX_TREE_LENGTH_SCALE = 1000.0;
+const double TOL_TREE_LENGTH_SCALE = 0.001;
+
+
+/**
+ * Optimize one factor that rescales all branch lengths of the tree.
+ *
+ * While the one-dimensional minimizer runs, is_opt_scaling is set so that
+ * computeFunction() interprets its argument as a scaling factor.
+ *
+ * @param scaling (IN/OUT) starting factor; replaced by the minimizer's result
+ * @param gradient_epsilon tolerance, never used below TOL_TREE_LENGTH_SCALE
+ * @return value of computeLikelihood() after the final scaling is applied
+ */
+double PhyloTree::optimizeTreeLengthScaling(double &scaling, double gradient_epsilon) {
+    double neg_logl, err_estimate;
+
+    is_opt_scaling = true;
+    current_scaling = scaling;
+    scaling = minimizeOneDimen(MIN_TREE_LENGTH_SCALE, scaling, MAX_TREE_LENGTH_SCALE,
+            max(TOL_TREE_LENGTH_SCALE, gradient_epsilon), &neg_logl, &err_estimate);
+
+    // the last factor evaluated by the minimizer may differ from the one it returns
+    const bool already_applied = (scaling == current_scaling);
+    if (!already_applied) {
+        scaleLength(scaling / current_scaling);
+        current_scaling = scaling;
+        clearAllPartialLH();
+    }
+
+    is_opt_scaling = false;
+    return computeLikelihood();
+}
+
+/**
+ * Objective handed to the one-dimensional minimizer.
+ *
+ * @param value a tree-length scaling factor when is_opt_scaling is set,
+ *        otherwise a length for the branch current_it / current_it_back
+ * @return the negated log-likelihood for that value
+ */
+double PhyloTree::computeFunction(double value) {
+    if (is_opt_scaling) {
+        // rescale relative to the factor that is currently applied
+        if (value != current_scaling) {
+            scaleLength(value / current_scaling);
+            current_scaling = value;
+            clearAllPartialLH();
+        }
+        return -computeLikelihood();
+    }
+
+    // single-branch mode: both directions of the branch share the length
+    current_it->length = value;
+    current_it_back->length = value;
+    return -computeLikelihoodBranch(current_it, (PhyloNode*) current_it_back->node);
+}
+
+/**
+ * Derivative callback for the current branch: assigns the length to both
+ * directions of the branch and returns the negated derivative terms.
+ *
+ * @param value branch length to evaluate
+ * @param df (OUT) negated first-derivative term from computeLikelihoodDerv()
+ * @param ddf (OUT) negated second-derivative term from computeLikelihoodDerv()
+ */
+void PhyloTree::computeFuncDerv(double value, double &df, double &ddf) {
+    current_it->length = value;
+    current_it_back->length = value;
+
+    PhyloNode *far_node = (PhyloNode*) current_it_back->node;
+    computeLikelihoodDerv(current_it, far_node, df, ddf);
+
+    // the minimizer works on the negative log-likelihood
+    df = -df;
+    ddf = -ddf;
+}
+
+/**
+ * Optimize the length of the branch between node1 and node2.
+ *
+ * Uses minimizeNewton() when optimize_by_newton is set and
+ * minimizeOneDimen() otherwise, then stores the result on both directions
+ * of the branch.
+ *
+ * @param node1 one end of the branch
+ * @param node2 the other end of the branch
+ * @param clearLH if true and the length changed, clear the reverse partial
+ *        likelihoods of both end nodes
+ * @param maxNRStep step limit forwarded to minimizeNewton()
+ */
+void PhyloTree::optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, bool clearLH, int maxNRStep) {
+    current_it = (PhyloNeighbor*) node1->findNeighbor(node2);
+    assert(current_it);
+    current_it_back = (PhyloNeighbor*) node2->findNeighbor(node1);
+    assert(current_it_back);
+
+    double old_len = current_it->length;
+    assert(old_len >= 0.0);
+
+    double neg_logl, err_estimate, new_len;
+    theta_computed = false;
+
+    if (!optimize_by_newton) {
+        // Brent method
+        new_len = minimizeOneDimen(MIN_BRANCH_LEN, old_len, MAX_BRANCH_LEN, TOL_BRANCH_LEN, &neg_logl, &err_estimate);
+        if (verbose_mode >= VB_DEBUG) {
+            cout << "minimizeBrent logl: " << -neg_logl << endl;
+        }
+    } else {
+        // Newton-Raphson method
+        new_len = minimizeNewton(MIN_BRANCH_LEN, old_len, MAX_BRANCH_LEN, TOL_BRANCH_LEN, neg_logl, maxNRStep);
+        if (verbose_mode >= VB_DEBUG) {
+            cout << "minimizeNewton logl: " << computeLikelihoodFromBuffer() << endl;
+        }
+        if (new_len > MAX_BRANCH_LEN*0.95) {
+            // result is next to the upper bound: treat as divergence and keep
+            // the old length if it scores better
+            double lh_at_new = computeLikelihoodFromBuffer();
+            current_it->length = old_len;
+            current_it_back->length = old_len;
+            double lh_at_old = computeLikelihoodFromBuffer();
+            if (lh_at_old > lh_at_new) {
+                new_len = old_len;
+            }
+        }
+    }
+
+    current_it->length = new_len;
+    current_it_back->length = new_len;
+
+    if (!clearLH || old_len == new_len)
+        return;
+    node1->clearReversePartialLh(node2);
+    node2->clearReversePartialLh(node1);
+}
+
+/**
+ * Optimize every branch that joins node to a neighbor other than dad.
+ *
+ * @param node node whose incident branches are optimized
+ * @param dad neighbor to leave out
+ * @return value of computeLikelihoodFromBuffer() after the last optimization
+ */
+double PhyloTree::optimizeChildBranches(PhyloNode *node, PhyloNode *dad) {
+    NeighborVec::iterator nei;
+    for (nei = node->neighbors.begin(); nei != node->neighbors.end(); nei++) {
+        if ((*nei)->node != dad)
+            optimizeOneBranch((PhyloNode*) node, (PhyloNode*) (*nei)->node);
+    }
+    return computeLikelihoodFromBuffer();
+}
+
+/**
+ * Recursively set each branch to the length returned by
+ * optimizeOneBranchLS(), handling the branch (node, dad) before descending.
+ *
+ * @param node current node; the root is used when NULL
+ * @param dad node the traversal came from; NULL at the start
+ */
+void PhyloTree::optimizeAllBranchesLS(PhyloNode *node, PhyloNode *dad) {
+    if (node == NULL)
+        node = (PhyloNode*) root;
+
+    if (dad != NULL) {
+        const double ls_len = optimizeOneBranchLS(node, dad);
+        PhyloNeighbor *toward_dad = (PhyloNeighbor*) node->findNeighbor(dad);
+        PhyloNeighbor *toward_node = (PhyloNeighbor*) dad->findNeighbor(node);
+        // both directions of the branch carry the same length
+        toward_dad->length = ls_len;
+        toward_node->length = ls_len;
+    }
+
+    NeighborVec::iterator nei = node->neighbors.begin();
+    while (nei != node->neighbors.end()) {
+        if ((*nei)->node != dad)
+            optimizeAllBranchesLS((PhyloNode*) (*nei)->node, node);
+        nei++;
+    }
+}
+
+/**
+ * Post-order traversal that optimizes every branch of the subtree: the
+ * subtrees away from dad are processed first, then the branch (node, dad).
+ *
+ * @param node current node
+ * @param dad node the traversal came from; NULL for the starting node
+ * @param maxNRStep step limit forwarded to optimizeOneBranch()
+ */
+void PhyloTree::optimizeAllBranches(PhyloNode *node, PhyloNode *dad, int maxNRStep) {
+    NeighborVec::iterator nei = node->neighbors.begin();
+    while (nei != node->neighbors.end()) {
+        if ((*nei)->node != dad)
+            optimizeAllBranches((PhyloNode*) (*nei)->node, node, maxNRStep);
+        nei++;
+    }
+
+    if (!dad)
+        return;
+    // BQM 2014-02-24: the clearLH flag (true) was missing here
+    optimizeOneBranch(node, dad, true, maxNRStep);
+}
+
+/**
+ * Repeat full passes of branch-length optimization until the log-likelihood
+ * gain of a pass is within tolerance, the pass limit is reached, or a pass
+ * lowers the log-likelihood (in which case the lengths saved before that
+ * pass are restored).
+ *
+ * @param my_iterations maximum number of passes over the tree
+ * @param tolerance stop once a pass improves the log-likelihood by at most this
+ * @param maxNRStep step limit forwarded to the per-branch optimizer
+ * @return tree log-likelihood at the point of return
+ */
+double PhyloTree::optimizeAllBranches(int my_iterations, double tolerance, int maxNRStep) {
+    const bool very_verbose = (verbose_mode >= VB_MAX);
+    if (very_verbose)
+        cout << "Optimizing branch lengths (max " << my_iterations << " loops)..." << endl;
+    double best_lh = computeLikelihood();
+    if (very_verbose) {
+        cout << "Initial tree log-likelihood: " << best_lh << endl;
+    }
+
+    for (int pass = 0; pass < my_iterations; pass++) {
+        // snapshot so that a worsening pass can be undone
+        DoubleVector saved_lengths;
+        saveBranchLengths(saved_lengths);
+
+        optimizeAllBranches((PhyloNode*) root, NULL, maxNRStep);
+        double pass_lh = computeLikelihoodFromBuffer();
+
+        if (very_verbose) {
+            cout << "Likelihood after iteration " << pass + 1 << " : ";
+            cout << pass_lh << endl;
+        }
+
+        if (pass_lh < best_lh) {
+            // IN RARE CASE: tree log-likelihood decreases, revert the branch lengths and stop
+            if (verbose_mode >= VB_MED)
+                cout << "NOTE: Restoring branch lengths as tree log-likelihood decreases after branch length optimization: "
+                    << best_lh << " -> " << pass_lh << endl;
+
+            clearAllPartialLH();
+            restoreBranchLengths(saved_lengths);
+
+            pass_lh = computeLikelihood();
+            if (fabs(pass_lh-best_lh) > 1.0)
+                cout << "new_tree_lh: " << pass_lh << "   tree_lh: " << best_lh << endl;
+            assert(fabs(pass_lh-best_lh) < 1.0);
+            return pass_lh;
+        }
+
+        // converged: no decrease and the gain is within tolerance
+        if (best_lh <= pass_lh && pass_lh <= best_lh + tolerance)
+            return pass_lh;
+        best_lh = pass_lh;
+    }
+    return best_lh;
+}
+
+/****************************************************************************
+ Stepwise addition (greedy) by maximum likelihood
+ ****************************************************************************/
+
+/**
+ * Try inserting added_node in the middle of the branch (node, dad), score
+ * the result, undo the insertion, and then recurse into the neighbors of
+ * node visited by FOR_NEIGHBOR_IT(node, dad, it).
+ *
+ * added_node is expected to carry two placeholder neighbors, (Node*) 1 and
+ * (Node*) 2 (growTreeML() creates them); they are swapped for node and dad
+ * during the trial and swapped back afterwards.
+ *
+ * @param added_node internal node to which the new taxon is attached
+ * @param target_node (OUT) node end of the best-scoring branch found
+ * @param target_dad (OUT) dad end of the best-scoring branch found
+ * @param node lower end of the branch tried in this call
+ * @param dad upper end of the branch tried in this call
+ * @return best score returned by optimizeChildBranches() over all branches tried
+ */
+double PhyloTree::addTaxonML(Node *added_node, Node* &target_node, Node* &target_dad, Node *node, Node *dad) {
+
+    Neighbor *dad_nei = dad->findNeighbor(node);
+
+    // now insert the new node in the middle of the branch node-dad
+    double len = dad_nei->length;
+    node->updateNeighbor(dad, added_node, len / 2.0);
+    dad->updateNeighbor(node, added_node, len / 2.0);
+    // replace the two placeholder neighbors of added_node by node and dad
+    added_node->updateNeighbor((Node*) 1, node, len / 2.0);
+    added_node->updateNeighbor((Node*) 2, dad, len / 2.0);
+    // compute the likelihood
+    clearAllPartialLH();
+    double best_score = optimizeChildBranches((PhyloNode*) added_node);
+    target_node = node;
+    target_dad = dad;
+    // remove the added node
+    node->updateNeighbor(added_node, dad, len);
+    dad->updateNeighbor(added_node, node, len);
+    // put the placeholders back so that the next trial can reuse added_node
+    added_node->updateNeighbor(node, (Node*) 1, len);
+    added_node->updateNeighbor(dad, (Node*) 2, len);
+
+    // now traverse the tree downwards
+
+    FOR_NEIGHBOR_IT(node, dad, it){
+    Node *target_node2;
+    Node *target_dad2;
+    double score = addTaxonML(added_node, target_node2, target_dad2, (*it)->node, node);
+    // keep the branch with the strictly higher score
+    if (score > best_score) {
+
+        best_score = score;
+        target_node = target_node2;
+        target_dad = target_dad2;
+    }
+}
+    return best_score;
+}
+
+/**
+ * Build a tree by stepwise addition: start from a star tree on the first
+ * three sequences, then add the remaining sequences one at a time at the
+ * branch chosen by addTaxonML(), re-optimizing all branches after each step.
+ *
+ * Calls outError(ERR_FEW_TAXA) when the alignment has fewer than 3 sequences.
+ *
+ * @param alignment alignment to build the tree for; stored in aln
+ */
+void PhyloTree::growTreeML(Alignment *alignment) {
+
+    cout << "Stepwise addition using ML..." << endl;
+    aln = alignment;
+    int size = aln->getNSeq();
+    if (size < 3)
+        outError(ERR_FEW_TAXA);
+
+    root = newNode();
+    Node * new_taxon;
+
+    // create initial tree with 3 taxa
+    for (leafNum = 0; leafNum < 3; leafNum++) {
+        cout << "Add " << aln->getSeqName(leafNum) << " to the tree" << endl;
+        new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
+        root->addNeighbor(new_taxon, 1.0);
+        new_taxon->addNeighbor(root, 1.0);
+    }
+    // re-root at the node with ID 0
+    root = findNodeID(0);
+    optimizeAllBranches();
+
+    // stepwise adding the next taxon
+    for (leafNum = 3; leafNum < size; leafNum++) {
+
+        cout << "Add " << aln->getSeqName(leafNum) << " to the tree" << endl;
+        // allocate a new taxon and a new adjacent internal node
+        new_taxon = newNode(leafNum, aln->getSeqName(leafNum).c_str());
+        Node *added_node = newNode();
+        added_node->addNeighbor(new_taxon, 1.0);
+        new_taxon->addNeighbor(added_node, 1.0);
+
+        // reserve two neighbor slots using the placeholder pointers
+        // (Node*) 1 and (Node*) 2; they are replaced by real nodes below
+        added_node->addNeighbor((Node*) 1, 1.0);
+        added_node->addNeighbor((Node*) 2, 1.0);
+
+        Node *target_node = NULL;
+        Node *target_dad = NULL;
+        // search all branches below the root for the best insertion point
+        addTaxonML(added_node, target_node, target_dad, root->neighbors[0]->node, root);
+        // now insert the new node in the middle of the branch node-dad
+        double len = target_dad->findNeighbor(target_node)->length;
+        target_node->updateNeighbor(target_dad, added_node, len / 2.0);
+        target_dad->updateNeighbor(target_node, added_node, len / 2.0);
+        added_node->updateNeighbor((Node*) 1, target_node, len / 2.0);
+        added_node->updateNeighbor((Node*) 2, target_dad, len / 2.0);
+        // compute the likelihood
+        clearAllPartialLH();
+        optimizeAllBranches();
+        //optimizeNNI();
+    }
+
+    // leafNum equals size here
+    nodeNum = 2 * leafNum - 2;
+}
+
+/****************************************************************************
+ Distance function
+ ****************************************************************************/
+
+/**
+ * Compute the distance between two sequences.
+ *
+ * When initial_dist is 0.0 a starting value is taken from the alignment
+ * (Alignment::computeObsDist() if params->compute_obs_dist is set,
+ * Alignment::computeDist() otherwise).  If both a model factory and a site
+ * rate are present, that value is then refined by
+ * AlignmentPairwise::optimizeDist().
+ *
+ * @param seq1 index of the first sequence
+ * @param seq2 index of the second sequence
+ * @param initial_dist starting distance, or 0.0 to derive one
+ * @param d2l (OUT) filled in by optimizeDist(); left untouched when no model
+ *        or site rate is available
+ * @return the distance
+ */
+double PhyloTree::computeDist(int seq1, int seq2, double initial_dist, double &d2l) {
+    // if no model or site rate is specified, return JC distance
+    if (initial_dist == 0.0) {
+        if (params->compute_obs_dist)
+            initial_dist = aln->computeObsDist(seq1, seq2);
+        else
+            initial_dist = aln->computeDist(seq1, seq2);
+    }
+    if (!model_factory || !site_rate)
+        return initial_dist; // MANUEL: here no d2l is returned
+
+    // now optimize the distance based on the model and site rate;
+    // a scoped object is released even if optimizeDist() exits abnormally
+    AlignmentPairwise aln_pair(this, seq1, seq2);
+    return aln_pair.optimizeDist(initial_dist, d2l);
+}
+
+/**
+ * Convenience overload of computeDist() for callers that do not need the
+ * d2l output.
+ *
+ * @param seq1 index of the first sequence
+ * @param seq2 index of the second sequence
+ * @param initial_dist starting distance, forwarded unchanged
+ * @return the distance computed by the four-argument overload
+ */
+double PhyloTree::computeDist(int seq1, int seq2, double initial_dist) {
+    double discarded_d2l;
+    const double dist = computeDist(seq1, seq2, initial_dist, discarded_d2l);
+    return dist;
+}
+
+/**
+ * Replace each entry of a square distance matrix by the shortest-path
+ * distance between the two taxa (Floyd's algorithm), in place.
+ *
+ * @param dist_mat (IN/OUT) row-major matrix with aln->getNSeq() rows and columns
+ * @return the largest entry after correction (never less than 0.0)
+ */
+double PhyloTree::correctDist(double *dist_mat) {
+    const int ntaxa = aln->getNSeq();
+    const int ncells = ntaxa * ntaxa;
+
+    // use Floyd algorithm to find shortest path between all pairs of taxa
+    for (int via = 0; via < ntaxa; via++) {
+        double *via_row = dist_mat + via * ntaxa;
+        for (int from = 0; from < ntaxa; from++) {
+            double *from_row = dist_mat + from * ntaxa;
+            for (int to = 0; to < ntaxa; to++) {
+                double detour = from_row[via] + via_row[to];
+                if (from_row[to] > detour)
+                    from_row[to] = detour;
+            }
+        }
+    }
+
+    double max_dist = 0.0;
+    for (const double *cell = dist_mat; cell != dist_mat + ncells; cell++) {
+        if (*cell > max_dist)
+            max_dist = *cell;
+    }
+    return max_dist;
+}
+
+/**
+ * Fill a square distance matrix and the matching variance matrix for all
+ * sequence pairs.
+ *
+ * The upper triangle is computed pair by pair (in parallel under OpenMP),
+ * using the current entry of dist_mat as the starting distance; the lower
+ * triangle is then mirrored from it and the diagonal is set to 0.
+ *
+ * @param dist_mat (IN/OUT) row-major nseqs x nseqs distances
+ * @param var_mat (OUT) row-major nseqs x nseqs variances, chosen according
+ *        to params->ls_var_type
+ * @return the largest distance in the matrix (never less than 0.0)
+ */
+double PhyloTree::computeDist(double *dist_mat, double *var_mat) {
+    int nseqs = aln->getNSeq();
+    int pos = 0;
+    int num_pairs = nseqs * (nseqs - 1) / 2;
+    double longest_dist = 0.0;
+    int *row_id = new int[num_pairs];
+    int *col_id = new int[num_pairs];
+
+    // enumerate the (row, column) index of every upper-triangle cell;
+    // with fewer than two sequences there is no pair and the arrays are empty
+    if (num_pairs > 0) {
+        row_id[0] = 0;
+        col_id[0] = 1;
+    }
+    for (pos = 1; pos < num_pairs; pos++) {
+        row_id[pos] = row_id[pos - 1];
+        col_id[pos] = col_id[pos - 1] + 1;
+        if (col_id[pos] >= nseqs) {
+            row_id[pos]++;
+            col_id[pos] = row_id[pos] + 1;
+        }
+    }
+    // compute the upper-triangle of distance matrix
+#ifdef _OPENMP
+#pragma omp parallel for private(pos)
+#endif
+
+    for (pos = 0; pos < num_pairs; pos++) {
+        int seq1 = row_id[pos];
+        int seq2 = col_id[pos];
+        // NOTE(review): d2l stays unset when the pairwise computeDist()
+        // returns before optimizing; WLS_SECOND_TAYLOR would then read an
+        // indeterminate value -- confirm that combination cannot occur.
+        double d2l; // moved here for thread-safe (OpenMP)
+        int sym_pos = seq1 * nseqs + seq2;
+        dist_mat[sym_pos] = computeDist(seq1, seq2, dist_mat[sym_pos], d2l);
+        if (params->ls_var_type == OLS)
+            var_mat[sym_pos] = 1.0;
+        else if (params->ls_var_type == WLS_PAUPLIN)
+            var_mat[sym_pos] = 0.0;
+        else if (params->ls_var_type == WLS_FIRST_TAYLOR)
+            var_mat[sym_pos] = dist_mat[sym_pos];
+        else if (params->ls_var_type == WLS_FITCH_MARGOLIASH)
+            var_mat[sym_pos] = dist_mat[sym_pos] * dist_mat[sym_pos];
+        else if (params->ls_var_type == WLS_SECOND_TAYLOR)
+            var_mat[sym_pos] = -1.0 / d2l;
+    }
+
+    // copy upper-triangle into lower-triangle and set diagonal = 0
+    for (int seq1 = 0; seq1 < nseqs; seq1++)
+        for (int seq2 = 0; seq2 <= seq1; seq2++) {
+            pos = seq1 * nseqs + seq2;
+            if (seq1 == seq2) {
+                dist_mat[pos] = 0.0;
+                var_mat[pos] = 0.0;
+            }
+            else {
+                dist_mat[pos] = dist_mat[seq2 * nseqs + seq1];
+                var_mat[pos] = var_mat[seq2 * nseqs + seq1];
+            }
+            if (dist_mat[pos] > longest_dist)
+                longest_dist = dist_mat[pos];
+        }
+    delete[] col_id;
+    delete[] row_id;
+
+    // NOTE: Bionj does handle long distances already (thanks Manuel),
+    // so no saturation warning or correctDist() pass is applied here
+    return longest_dist;
+}
+
+/**
+ * Obtain the pairwise distance matrix for an alignment, either by computing
+ * it and writing it to a file, or by reading it from params.dist_file.
+ *
+ * @param params program parameters; their address is stored in this->params
+ * @param alignment alignment to use; stored in aln
+ * @param dist_mat (IN/OUT) distance matrix; allocated and zeroed when NULL
+ * @param var_mat (IN/OUT) variance matrix; allocated and set to 1.0 together
+ *        with dist_mat when dist_mat is NULL
+ * @param dist_file (OUT) name of the distance file that was written or read
+ * @return the longest distance reported by the compute or read step
+ */
+double PhyloTree::computeDist(Params &params, Alignment *alignment, double* &dist_mat, double* &var_mat,
+        string &dist_file) {
+    this->params = &params;
+    double longest_dist = 0.0;
+    aln = alignment;
+    dist_file = params.out_prefix;
+    // a separate ".jcdist" suffix was dropped to avoid too many files,
+    // so everything except observed distances goes to ".mldist"
+    if (!model_factory && params.compute_obs_dist)
+        dist_file += ".obsdist";
+    else
+        dist_file += ".mldist";
+
+    if (!dist_mat) {
+        int nseq = alignment->getNSeq();
+        dist_mat = new double[nseq * nseq];
+        memset(dist_mat, 0, sizeof(double) * nseq * nseq);
+        var_mat = new double[nseq * nseq];
+        // memset() fills bytes, so it cannot be used to set doubles to 1.0
+        for (int i = 0; i < nseq; i++)
+            for (int j = 0; j < nseq; j++)
+                var_mat[i*nseq+j] = 1.0;
+    }
+    if (!params.dist_file) {
+        longest_dist = computeDist(dist_mat, var_mat);
+        alignment->printDist(dist_file.c_str(), dist_mat);
+    } else {
+        longest_dist = alignment->readDist(params.dist_file, dist_mat);
+        dist_file = params.dist_file;
+    }
+    return longest_dist;
+}
+
+/**
+ * Fill a square matrix with the distances returned by
+ * Alignment::computeObsDist().
+ *
+ * Cells are visited in row-major order: the diagonal is 0, cells above it
+ * are computed, and cells below it are copied from the mirrored cell.
+ *
+ * @param dist_mat (OUT) row-major matrix with aln->getNSeq() rows and columns
+ * @return the largest entry written (never less than 0.0)
+ */
+double PhyloTree::computeObsDist(double *dist_mat) {
+    const int ntaxa = aln->getNSeq();
+    double max_dist = 0.0;
+    double *cell = dist_mat;
+
+    for (int row = 0; row < ntaxa; row++) {
+        for (int col = 0; col < ntaxa; col++, cell++) {
+            if (col > row)
+                *cell = aln->computeObsDist(row, col);
+            else if (col < row)
+                *cell = dist_mat[col * ntaxa + row];   // mirrored cell, already filled
+            else
+                *cell = 0.0;
+
+            if (*cell > max_dist)
+                max_dist = *cell;
+        }
+    }
+    return max_dist;
+}
+
+/**
+ * Compute the observed-distance matrix of an alignment and write it to
+ * "<out_prefix>.obsdist".
+ *
+ * @param params program parameters; out_prefix supplies the file name stem
+ * @param alignment alignment to use; stored in aln
+ * @param dist_mat (IN/OUT) distance matrix; allocated and zeroed when NULL
+ * @param dist_file (OUT) name of the file that was written
+ * @return the longest distance in the matrix
+ */
+double PhyloTree::computeObsDist(Params &params, Alignment *alignment, double* &dist_mat, string &dist_file) {
+    aln = alignment;
+
+    dist_file = params.out_prefix;
+    dist_file += ".obsdist";
+
+    if (dist_mat == NULL) {
+        const int nseq = alignment->getNSeq();
+        dist_mat = new double[nseq * nseq];
+        memset(dist_mat, 0, sizeof(double) * nseq * nseq);
+    }
+
+    const double max_dist = computeObsDist(dist_mat);
+    alignment->printDist(dist_file.c_str(), dist_mat);
+    return max_dist;
+}
+
+/****************************************************************************
+ compute BioNJ tree, a more accurate extension of Neighbor-Joining
+ ****************************************************************************/
+
+/**
+ * Build a BIONJ tree from a distance file, write it to
+ * "<out_prefix>.bionj" and read it back into this tree.
+ *
+ * @param params program parameters; out_prefix supplies the file name stem
+ * @param alignment not used by this function
+ * @param dist_file name of the distance file handed to BioNj::create()
+ */
+void PhyloTree::computeBioNJ(Params &params, Alignment *alignment, string &dist_file) {
+    string tree_path = params.out_prefix;
+    tree_path += ".bionj";
+
+    cout << "Computing BIONJ tree..." << endl;
+    BioNj builder;
+    builder.create(dist_file.c_str(), tree_path.c_str());
+
+    // remember whether a tree existed before it is replaced
+    const bool had_tree = (root != NULL);
+    readTreeFile(tree_path.c_str());
+
+    if (had_tree)
+        initializeAllPartialLh();
+}
+
+/**
+ * Recursively replace negative branch lengths (or all of them when force is
+ * set) by a length estimated from the parsimony substitution count of the
+ * branch, and raise any remaining non-positive length to MIN_BRANCH_LEN.
+ *
+ * @param force if true, re-estimate every branch, not only negative ones
+ * @param node current node; the root is used when NULL
+ * @param dad node the traversal came from
+ * @return number of branches whose length was re-estimated
+ */
+int PhyloTree::fixNegativeBranch(bool force, Node *node, Node *dad) {
+
+    if (!node)
+        node = root;
+    int fixed = 0;
+
+    FOR_NEIGHBOR_IT(node, dad, it){
+    if ((*it)->length < 0.0 || force) { // negative branch length detected
+        int branch_subst;
+        int pars_score = computeParsimonyBranch((PhyloNeighbor*) (*it), (PhyloNode*) node, &branch_subst);
+        // first compute the observed parsimony distance
+        // (a branch without substitutions is treated as having one)
+        double branch_length = (branch_subst > 0) ? ((double) branch_subst / getAlnNSite()) : (1.0 / getAlnNSite());
+        // now apply the Jukes-Cantor correction
+        double z = (double) aln->num_states / (aln->num_states - 1);
+        double x = 1.0 - (z * branch_length);
+        // the correction is only defined for x > 0; otherwise keep the raw value
+        if (x > 0) branch_length = -log(x) / z;
+        if (branch_length < MIN_BRANCH_LEN)
+            branch_length = MIN_BRANCH_LEN;
+//        if (verbose_mode >= VB_DEBUG)
+//        	cout << "Negative branch length " << (*it)->length << " was set to ";
+        //(*it)->length = fixed_length;
+        //(*it)->length = random_double()+0.1;
+        (*it)->length = branch_length;
+        if (verbose_mode >= VB_DEBUG)
+        cout << (*it)->length << " parsimony = " << pars_score << endl;
+        // set the backward branch length
+        (*it)->node->findNeighbor(node)->length = (*it)->length;
+        fixed++;
+    }
+    // zero-length branches are not counted in the return value
+    if ((*it)->length <= 0.0) {
+        (*it)->length = MIN_BRANCH_LEN;
+        (*it)->node->findNeighbor(node)->length = (*it)->length;
+    }
+    fixed += fixNegativeBranch(force, (*it)->node, node);
+}
+    return fixed;
+}
+
+//int PhyloTree::assignRandomBranchLengths(bool force, Node *node, Node *dad) {
+//
+//    if (!node)
+//        node = root;
+//    int fixed = 0;
+//
+//    FOR_NEIGHBOR_IT(node, dad, it){
+//		if ((*it)->length < 0.0 || force) { // negative branch length detected
+//			if (verbose_mode >= VB_DEBUG)
+//			cout << "Negative branch length " << (*it)->length << " was set to ";
+//			(*it)->length = random_double() + 0.1;
+//			if (verbose_mode >= VB_DEBUG)
+//			cout << (*it)->length << endl;
+//			// set the backward branch length
+//			(*it)->node->findNeighbor(node)->length = (*it)->length;
+//			fixed++;
+//		}
+//		if ((*it)->length <= 0.0) {
+//			(*it)->length = 1e-6;
+//			(*it)->node->findNeighbor(node)->length = (*it)->length;
+//		}
+//		fixed += assignRandomBranchLengths(force, (*it)->node, node);
+//    }
+//    return fixed;
+//}
+
+/****************************************************************************
+ Nearest Neighbor Interchange by maximum likelihood
+ ****************************************************************************/
+
+/**
+ * Perform one NNI across the inner branch (node1, node2): one neighbor of
+ * node1 (other than node2) and one neighbor of node2 (other than node1) are
+ * picked and exchanged between the two nodes.
+ * @param node1 first end of the inner branch
+ * @param node2 second end of the inner branch
+ */
+void PhyloTree::doOneRandomNNI(Node *node1, Node *node2) {
+    assert(isInnerBranch(node1, node2));
+    Neighbor *node1Nei = NULL;
+    Neighbor *node2Nei = NULL;
+    // Pick one neighbor on each side: take the current candidate when
+    // random_int(1) yields 0, otherwise take the next candidate.
+    // NOTE(review): this only randomizes if random_int(1) can return a
+    // non-zero value -- confirm against random_int's documented range.
+    bool chooseNext = false;
+    FOR_NEIGHBOR_IT(node1, node2, it) {
+        if (chooseNext) {
+            node1Nei = (*it);
+            break;
+        }
+        int randNum = random_int(1);
+        if (randNum == 0) {
+            node1Nei = (*it);
+            break;
+        } else {
+            chooseNext = true;
+        }
+    }
+    chooseNext = false;
+    FOR_NEIGHBOR_IT(node2, node1, it) {
+        if (chooseNext) {
+            node2Nei = (*it);
+            break;
+        }
+        int randNum = random_int(1);
+        if (randNum == 0) {
+            node2Nei = (*it);
+            break;
+        } else {
+            chooseNext = true;
+        }
+    }
+    assert(node1Nei != NULL && node2Nei != NULL);
+
+    NeighborVec::iterator node1NeiIt = node1->findNeighborIt(node1Nei->node);
+    NeighborVec::iterator node2NeiIt = node2->findNeighborIt(node2Nei->node);
+    // each iterator must be checked against the end() of its own container
+    assert(node1NeiIt != node1->neighbors.end());
+    assert(node2NeiIt != node2->neighbors.end());
+
+    // exchange the two chosen neighbors and re-point their back links
+    node1->updateNeighbor(node1NeiIt, node2Nei);
+    node2Nei->node->updateNeighbor(node2, node1);
+
+    node2->updateNeighbor(node2NeiIt, node1Nei);
+    node1Nei->node->updateNeighbor(node1, node2);
+}
+
+/**
+ * Apply the NNI described by move: the neighbor referenced by
+ * move.node1Nei_it and the neighbor referenced by move.node2Nei_it are
+ * exchanged between move.node1 and move.node2.
+ * @param move    the NNI to apply (end nodes and the two neighbor iterators)
+ * @param clearLH true to clear the partial likelihoods on and towards the
+ *                central branch after the swap
+ */
+void PhyloTree::doNNI(NNIMove &move, bool clearLH) {
+    PhyloNode *n1 = move.node1;
+    PhyloNode *n2 = move.node2;
+    NeighborVec::iterator swap_it1 = move.node1Nei_it;
+    NeighborVec::iterator swap_it2 = move.node2Nei_it;
+    Neighbor *swap_nei1 = *swap_it1;
+    Neighbor *swap_nei2 = *swap_it2;
+
+    assert(n1->degree() == 3 && n2->degree() == 3);
+
+    // the two directions of the central branch
+    PhyloNeighbor *nei_1to2 = (PhyloNeighbor*) n1->findNeighbor(n2);
+    PhyloNeighbor *nei_2to1 = (PhyloNeighbor*) n2->findNeighbor(n1);
+
+    // partial likelihoods have to be reoriented before the topology changes
+    if (params->lh_mem_save == LM_PER_NODE && !isSuperTree() && (sse == LK_EIGEN || sse == LK_EIGEN_SSE)) {
+        nei_1to2->reorientPartialLh(n1);
+        nei_2to1->reorientPartialLh(n2);
+    }
+
+    // hand n2's chosen neighbor over to n1 ...
+    n1->updateNeighbor(swap_it1, swap_nei2);
+    swap_nei2->node->updateNeighbor(n2, n1);
+
+    // ... and n1's chosen neighbor over to n2
+    n2->updateNeighbor(swap_it2, swap_nei1);
+    swap_nei1->node->updateNeighbor(n1, n2);
+
+    if (clearLH) {
+        // partial likelihoods on the central branch
+        nei_1to2->clearPartialLh();
+        nei_2to1->clearPartialLh();
+
+        // partial likelihoods pointing back towards the central branch
+        n2->clearReversePartialLh(n1);
+        n1->clearReversePartialLh(n2);
+    }
+
+    if (params->leastSquareNNI) {
+        updateSubtreeDists(move);
+    }
+}
+
+/**
+ * Copy the branch lengths stored in nnimove.newLen onto the tree.
+ * newLen[0] goes to the central branch (node1, node2). When params->nni5 is
+ * set, the following entries go to the branches around node1 and then to the
+ * branches around node2, in neighbor order.
+ * @param nnimove the NNI move holding the end nodes and the new lengths
+ */
+void PhyloTree::changeNNIBrans(NNIMove nnimove) {
+    PhyloNode *n1 = nnimove.node1;
+    PhyloNode *n2 = nnimove.node2;
+
+    // central branch, both directions
+    n1->findNeighbor(n2)->length = nnimove.newLen[0];
+    n2->findNeighbor(n1)->length = nnimove.newLen[0];
+
+    if (!params->nni5)
+        return;
+
+    int idx = 1;
+    // branches adjacent to n1
+    FOR_NEIGHBOR_IT(n1, n2, it1) {
+        Node *adj = (*it1)->node;
+        adj->findNeighbor(n1)->length = nnimove.newLen[idx];
+        n1->findNeighbor(adj)->length = nnimove.newLen[idx];
+        idx++;
+    }
+    // branches adjacent to n2
+    FOR_NEIGHBOR_IT(n2, n1, it2) {
+        Node *adj = (*it2)->node;
+        adj->findNeighbor(n2)->length = nnimove.newLen[idx];
+        n2->findNeighbor(adj)->length = nnimove.newLen[idx];
+        idx++;
+    }
+}
+
+/**
+ * Evaluate the two NNIs around the inner branch (node1, node2) and return the
+ * one with the higher log-likelihood (the second one on a tie).
+ *
+ * Each NNI is applied temporarily, the central branch (and, when params->nni5
+ * is set, the four surrounding branches) is optimized, the likelihood is
+ * recorded and the swap is undone. While this happens, the affected Neighbor
+ * objects are replaced by temporary copies that use the nni_partial_lh /
+ * nni_scale_num buffers; the originals are put back before returning and
+ * curScore is restored.
+ *
+ * @param node1    first end of the branch; must be an inner node of degree 3
+ * @param node2    second end of the branch; must be an inner node of degree 3
+ * @param nniMoves optional array of 2 moves that receives the results. When
+ *                 NULL a temporary array is allocated and freed here. When
+ *                 nniMoves[0].node1 is already set, node1Nei_it/node2Nei_it of
+ *                 both entries are taken as given (only sanity-checked).
+ * @return a copy of the better of the two evaluated moves, or the result of
+ *         getBestNNIForBranUB() when the upper-bound shortcut applies
+ */
+NNIMove PhyloTree::getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove* nniMoves) {
+
+	assert(!node1->isLeaf() && !node2->isLeaf());
+    assert(node1->degree() == 3 && node2->degree() == 3);
+
+	// number of Neighbor objects that are temporarily replaced: the two
+	// directions of the central branch, plus four more in nni5 mode
+	int IT_NUM = (params->nni5) ? 6 : 2;
+    size_t partial_lh_size = getPartialLhBytes()/sizeof(double);
+    size_t scale_num_size = getScaleNumBytes()/sizeof(UBYTE);
+
+    // Upper Bounds ---------------
+    totalNNIub += 2;
+    if(params->upper_bound_NNI){
+    	NNIMove resMove;
+    	resMove = getBestNNIForBranUB(node1,node2,this);
+    	/* if UB is smaller than the current likelihood, then we don't recompute the likelihood of the swapped topology.
+    	 * Otherwise, follow the normal procedure: evaluate NNIs and compute the likelihood.*/
+
+    	// here, we skip the NNI if its UB is below (1 + upper_bound_frac) * curScore
+    	if( resMove.newloglh < (1+params->upper_bound_frac)*this->curScore){
+    		//cout << "Skipping Likelihood evaluation of NNIs for this branch :) ........................"<<endl;
+    		return resMove;
+    	}
+    }
+
+    //-----------------------------
+
+	NeighborVec::iterator it;
+
+	// positions (inside the owning nodes' neighbor vectors) of the Neighbor
+	// objects that will be swapped for temporary copies
+	NeighborVec::iterator saved_it[6];
+	int id = 0;
+
+	saved_it[id++] = node1->findNeighborIt(node2);
+	saved_it[id++] = node2->findNeighborIt(node1);
+
+	if (params->nni5) {
+		FOR_NEIGHBOR(node1, node2, it)
+			saved_it[id++] = (*it)->node->findNeighborIt(node1);
+
+		FOR_NEIGHBOR(node2, node1, it)
+			saved_it[id++] = (*it)->node->findNeighborIt(node2);
+	}
+	assert(id == IT_NUM);
+
+	Neighbor *saved_nei[6];
+	// save Neighbor and allocate new Neighbor pointer
+	for (id = 0; id < IT_NUM; id++) {
+		saved_nei[id] = (*saved_it[id]);
+		*saved_it[id] = new PhyloNeighbor(saved_nei[id]->node, saved_nei[id]->length);
+		// the temporary copies use slices of the preallocated NNI buffers
+		((PhyloNeighbor*) (*saved_it[id]))->partial_lh = nni_partial_lh + id*partial_lh_size;
+		((PhyloNeighbor*) (*saved_it[id]))->scale_num = nni_scale_num + id*scale_num_size;
+//		((PhyloNeighbor*) (*saved_it[id]))->scale_num = newScaleNum();
+	}
+
+	// get the Neighbor again since it is replaced for saving purpose
+	PhyloNeighbor* node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
+	PhyloNeighbor* node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
+
+    int cnt;
+
+	//NNIMove nniMoves[2];
+    bool newNNIMoves = false;
+    if (!nniMoves) {
+		//   Initialize the 2 NNI moves
+    	newNNIMoves = true;
+    	nniMoves = new NNIMove[2];
+    	nniMoves[0].ptnlh = nniMoves[1].ptnlh = NULL;
+    	nniMoves[0].node1 = NULL;
+
+    }
+
+    if (nniMoves[0].node1) {
+    	// assuming that node1Nei_it and node2Nei_it is defined in nniMoves structure
+    	for (cnt = 0; cnt < 2; cnt++) {
+    		// sanity check
+    		if (!node1->findNeighbor((*nniMoves[cnt].node1Nei_it)->node)) outError(__func__);
+    		if (!node2->findNeighbor((*nniMoves[cnt].node2Nei_it)->node)) outError(__func__);
+    	}
+    } else {
+        // both moves use the first neighbor of node1 (other than node2); it is
+        // paired with each neighbor of node2 (other than node1) in turn
+        FOR_NEIGHBOR_IT(node1, node2, node1_it) {
+			cnt = 0;
+			FOR_NEIGHBOR_IT(node2, node1, node2_it) {
+				//   Initialize the 2 NNI moves
+				nniMoves[cnt].node1Nei_it = node1_it;
+				nniMoves[cnt].node2Nei_it = node2_it;
+				cnt++;
+			}
+			break;
+        }
+    }
+
+    // Initialize node1 and node2 in nniMoves
+	nniMoves[0].node1 = nniMoves[1].node1 = node1;
+	nniMoves[0].node2 = nniMoves[1].node2 = node2;
+
+    double backupScore = curScore;
+
+    for (cnt = 0; cnt < 2; cnt++) {
+        // do the NNI swap
+    	NeighborVec::iterator node1_it = nniMoves[cnt].node1Nei_it;
+    	NeighborVec::iterator node2_it = nniMoves[cnt].node2Nei_it;
+        Neighbor *node1_nei = *node1_it;
+        Neighbor *node2_nei = *node2_it;
+
+        node1->updateNeighbor(node1_it, node2_nei);
+        node2_nei->node->updateNeighbor(node2, node1);
+
+        node2->updateNeighbor(node2_it, node1_nei);
+        node1_nei->node->updateNeighbor(node1, node2);
+
+		// clear partial likelihood vector
+		node12_it->clearPartialLh();
+		node21_it->clearPartialLh();
+
+		// compute the score of the swapped topology
+//		double saved_len = node1_nei->length;
+
+		optimizeOneBranch(node1, node2, false, NNI_MAX_NR_STEP);
+		nniMoves[cnt].newLen[0] = node1->findNeighbor(node2)->length;
+
+		// newLen[1..] receive the lengths of the surrounding branches (nni5 only)
+		int i=1;
+        if (params->nni5) {
+			FOR_NEIGHBOR(node1, node2, it)
+			{
+				((PhyloNeighbor*) (*it)->node->findNeighbor(node1))->clearPartialLh();
+				optimizeOneBranch(node1, (PhyloNode*) (*it)->node, false, NNI_MAX_NR_STEP);
+				nniMoves[cnt].newLen[i] = node1->findNeighbor((*it)->node)->length;
+				i++;
+			}
+
+			 node21_it->clearPartialLh();
+
+			FOR_NEIGHBOR(node2, node1, it)
+			{
+				((PhyloNeighbor*) (*it)->node->findNeighbor(node2))->clearPartialLh();
+				optimizeOneBranch(node2, (PhyloNode*) (*it)->node, false, NNI_MAX_NR_STEP);
+				//node2_lastnei = (PhyloNeighbor*) (*it);
+				nniMoves[cnt].newLen[i] = node2->findNeighbor((*it)->node)->length;
+				i++;
+			}
+			 node12_it->clearPartialLh();
+		}
+		double score = computeLikelihoodFromBuffer();
+		nniMoves[cnt].newloglh = score;
+		// compute the pattern likelihoods if wanted
+		if (nniMoves[cnt].ptnlh)
+			computePatternLikelihood(nniMoves[cnt].ptnlh, &score);
+
+		if (save_all_trees == 2) {
+			saveCurrentTree(score); // BQM: for new bootstrap
+		}
+
+        // else, swap back, also recover the branch lengths
+		node1->updateNeighbor(node1_it, node1_nei);
+		node1_nei->node->updateNeighbor(node2, node1);
+		node2->updateNeighbor(node2_it, node2_nei);
+		node2_nei->node->updateNeighbor(node1, node2);
+		// ONLY FOR CHECKING WITH OLGA's PLEN MODEL
+		//node1_nei->length = node2_nei->length = saved_len;
+    }
+
+	 // restore the Neighbor*
+	 for (id = IT_NUM-1; id >= 0; id--) {
+//		 aligned_free(((PhyloNeighbor*) *saved_it[id])->scale_num);
+		 //delete[] ((PhyloNeighbor*) *saved_it[id])->partial_lh;
+		 // current_it / current_it_back must not keep pointing at a temporary
+		 // copy that is about to be deleted
+		 if (*saved_it[id] == current_it) current_it = (PhyloNeighbor*) saved_nei[id];
+		 if (*saved_it[id] == current_it_back) current_it_back = (PhyloNeighbor*) saved_nei[id];
+
+		 delete (*saved_it[id]);
+		 (*saved_it[id]) = saved_nei[id];
+	 }
+//	 aligned_free(new_partial_lh);
+
+	 // restore the length of 4 branches around node1, node2
+	 // (copied from the Neighbor object held by the adjacent node)
+	 FOR_NEIGHBOR(node1, node2, it)
+		 (*it)->length = (*it)->node->findNeighbor(node1)->length;
+	 FOR_NEIGHBOR(node2, node1, it)
+		 (*it)->length = (*it)->node->findNeighbor(node2)->length;
+
+	 // restore curScore
+	 curScore = backupScore;
+
+	 // pick the better move; on equal likelihoods the second move wins
+	 NNIMove res;
+	 if (nniMoves[0].newloglh > nniMoves[1].newloglh) {
+		 res = nniMoves[0];
+	 } else {
+		 res = nniMoves[1];
+	 }
+	// only free the array when it was allocated inside this function
+	if (newNNIMoves) {
+		delete [] nniMoves;
+	}
+	return res;
+}
+
+
+/****************************************************************************
+ Subtree Pruning and Regrafting by maximum likelihood
+ ****************************************************************************/
+
+/**
+ * Search for a likelihood-improving SPR move by pruning the subtree
+ * (dad-node) and handing it to swapSPR_old() for regrafting, then recursing
+ * into the neighbors of node.
+ * @param cur_score likelihood score that has to be beaten
+ * @param node      root of the subtree to prune; the tree root when NULL
+ * @param dad       neighbor of node on the other side of the pruned branch;
+ *                  no pruning is attempted here when NULL or a leaf
+ * @return the first score found that is greater than cur_score (the tree is
+ *         then left in the modified state), otherwise cur_score
+ */
+double PhyloTree::optimizeSPR_old(double cur_score, PhyloNode *node, PhyloNode *dad) {
+    if (!node)
+        node = (PhyloNode*) root;
+    PhyloNeighbor * dad1_nei = NULL;
+    PhyloNeighbor * dad2_nei = NULL;
+    PhyloNode * sibling1 = NULL;
+    PhyloNode * sibling2 = NULL;
+    double sibling1_len = 0.0, sibling2_len = 0.0;
+
+    if (dad && !dad->isLeaf()) {
+
+        assert(dad->degree() == 3);
+        // assign the sibling of node, with respect to dad
+
+        FOR_NEIGHBOR_DECLARE(dad, node, it) {
+            if (!sibling1) {
+                dad1_nei = (PhyloNeighbor*) (*it);
+                sibling1 = (PhyloNode*) (*it)->node;
+                sibling1_len = (*it)->length;
+            } else {
+
+                dad2_nei = (PhyloNeighbor*) (*it);
+                sibling2 = (PhyloNode*) (*it)->node;
+                sibling2_len = (*it)->length;
+            }
+        }
+        // remove the subtree leading to node
+        // (the two siblings are joined directly by a branch of the summed length)
+        double sum_len = sibling1_len + sibling2_len;
+        sibling1->updateNeighbor(dad, sibling2, sum_len);
+        sibling2->updateNeighbor(dad, sibling1, sum_len);
+        PhyloNeighbor* sibling1_nei = (PhyloNeighbor*) sibling1->findNeighbor(sibling2);
+        PhyloNeighbor* sibling2_nei = (PhyloNeighbor*) sibling2->findNeighbor(sibling1);
+        sibling1_nei->clearPartialLh();
+        sibling2_nei->clearPartialLh();
+
+        // now try to move the subtree to somewhere else
+        vector<PhyloNeighbor*> spr_path;
+
+        // candidate regraft branches reachable through sibling1
+        FOR_NEIGHBOR(sibling1, sibling2, it)
+        {
+            spr_path.push_back(sibling1_nei);
+            double score = swapSPR_old(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling1,
+                    spr_path);
+            // if likelihood score improves, return
+            if (score > cur_score)
+
+                return score;
+            spr_path.pop_back();
+        }
+
+        // candidate regraft branches reachable through sibling2
+        FOR_NEIGHBOR(sibling2, sibling1, it)
+        {
+            spr_path.push_back(sibling2_nei);
+            double score = swapSPR_old(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling2,
+                    spr_path);
+            // if likelihood score improves, return
+            if (score > cur_score)
+
+                return score;
+            spr_path.pop_back();
+        }
+        // if likelihood does not improve, swap back
+        sibling1->updateNeighbor(sibling2, dad, sibling1_len);
+        sibling2->updateNeighbor(sibling1, dad, sibling2_len);
+        // dad's neighbor records are re-pointed by swapSPR_old(); point them
+        // back at the original siblings and restore the original lengths
+        dad1_nei->node = sibling1;
+        dad1_nei->length = sibling1_len;
+        dad2_nei->node = sibling2;
+        dad2_nei->length = sibling2_len;
+        clearAllPartialLH();
+    }
+
+    // continue the search with each subtree below node
+    FOR_NEIGHBOR_IT(node, dad, it){
+    double score = optimizeSPR_old(cur_score, (PhyloNode*) (*it)->node, node);
+
+    if (score > cur_score) return score;
+}
+    return cur_score;
+}
+
+/**
+ * Move the subtree (dad1-node1) to the branch (dad2-node2), evaluate it, and
+ * continue the search along the neighbors of node2 up to spr_radius.
+ * @param cur_score  likelihood score that has to be beaten
+ * @param cur_depth  distance of the current regraft branch from the pruning point
+ * @param node1      root of the pruned subtree
+ * @param dad1       node that attaches the pruned subtree to the rest of the tree
+ * @param orig_node1 first node of the branch left behind by pruning
+ * @param orig_node2 second node of the branch left behind by pruning
+ * @param node2      far end of the candidate regraft branch
+ * @param dad2       near end of the candidate regraft branch
+ * @param spr_path   neighbors passed between the pruning point and the
+ *                   current regraft branch
+ * @return a score greater than cur_score if one was found (the tree is then
+ *         left in the regrafted state), otherwise cur_score
+ */
+double PhyloTree::swapSPR_old(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1, PhyloNode *orig_node1,
+        PhyloNode *orig_node2, PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path) {
+    PhyloNeighbor *node1_nei = (PhyloNeighbor*) node1->findNeighbor(dad1);
+    PhyloNeighbor *dad1_nei = (PhyloNeighbor*) dad1->findNeighbor(node1);
+    double node1_dad1_len = node1_nei->length;
+    PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
+
+    if (dad2) {
+        // now, connect (node1-dad1) to the branch (node2-dad2)
+
+        bool first = true;
+        // NOTE(review): these two shadow/duplicate the lookup done above
+        PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
+        PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
+        double len2 = node2_nei->length;
+
+        // insert dad1 in the middle of the branch: each half gets len2 / 2
+        FOR_NEIGHBOR_IT(dad1, node1, it){
+        if (first) {
+            (*it)->node = dad2;
+            (*it)->length = len2 / 2;
+            dad2->updateNeighbor(node2, dad1, len2 / 2);
+            first = false;
+        } else {
+            (*it)->node = node2;
+            (*it)->length = len2 / 2;
+            node2->updateNeighbor(dad2, dad1, len2 / 2);
+        }
+        ((PhyloNeighbor*) (*it))->clearPartialLh();
+    }
+        node2_nei->clearPartialLh();
+        dad2_nei->clearPartialLh();
+        node1_nei->clearPartialLh();
+        vector<PhyloNeighbor*>::iterator it2;
+        for (it2 = spr_path.begin(); it2 != spr_path.end(); it2++)
+            (*it2)->clearPartialLh();
+        clearAllPartialLH();
+        // optimize relevant branches
+        double score;
+
+        /* testing different branch optimization */
+        optimizeOneBranch(node1, dad1);
+        score = computeLikelihoodFromBuffer();
+        //score = optimizeOneBranch(dad2, dad1);
+        //score = optimizeOneBranch(node2, dad1);
+        /*
+         PhyloNode *cur_node = dad2;
+         for (int i = spr_path.size()-1; i >= 0; i--) {
+         score = optimizeOneBranch(cur_node, (PhyloNode*)spr_path[i]->node);
+         cur_node = (PhyloNode*)spr_path[i]->node;
+         }
+         */
+        //score = optimizeAllBranches(dad1);
+        // if score improves, return
+        if (score > cur_score)
+            return score;
+        // else, swap back
+        node2->updateNeighbor(dad1, dad2, len2);
+        dad2->updateNeighbor(dad1, node2, len2);
+        node2_nei->clearPartialLh();
+        dad2_nei->clearPartialLh();
+        node1_nei->length = node1_dad1_len;
+        dad1_nei->length = node1_dad1_len;
+
+        // add to candidate SPR moves
+        spr_moves.add(node1, dad1, node2, dad2, score);
+    }
+    // stop descending once the search radius is reached
+    if (cur_depth >= spr_radius)
+
+        return cur_score;
+    spr_path.push_back(node2_nei);
+
+    // NOTE(review): the recursion calls swapSPR(), not swapSPR_old() -- confirm
+    // whether that is intended
+    FOR_NEIGHBOR_IT(node2, dad2, it){
+    double score = swapSPR(cur_score, cur_depth + 1, node1, dad1, orig_node1, orig_node2, (PhyloNode*) (*it)->node, node2, spr_path);
+    if (score > cur_score) return score;
+}
+    spr_path.pop_back();
+
+    return cur_score;
+
+}
+
+/**
+ * Search for a likelihood-improving SPR move by pruning the subtree
+ * (dad-node) and handing it to swapSPR() for regrafting, then recursing into
+ * the neighbors of node. Unlike optimizeSPR_old(), the partial likelihood
+ * vectors of the joined sibling branch are saved and put back when no
+ * improvement is found, instead of clearing all partial likelihoods.
+ * @param cur_score likelihood score that has to be beaten
+ * @param node      root of the subtree to prune; the tree root when NULL
+ * @param dad       neighbor of node on the other side of the pruned branch;
+ *                  no pruning is attempted here when NULL or a leaf
+ * @return the first score found that is greater than cur_score (the tree is
+ *         then left in the modified state), otherwise cur_score
+ */
+double PhyloTree::optimizeSPR(double cur_score, PhyloNode *node, PhyloNode *dad) {
+    if (!node)
+        node = (PhyloNode*) root;
+    PhyloNeighbor * dad1_nei = NULL;
+    PhyloNeighbor * dad2_nei = NULL;
+    PhyloNode * sibling1 = NULL;
+    PhyloNode * sibling2 = NULL;
+    double sibling1_len = 0.0, sibling2_len = 0.0;
+
+    if (dad && !dad->isLeaf()) {
+
+        assert(dad->degree() == 3);
+        // assign the sibling of node, with respect to dad
+
+        FOR_NEIGHBOR_DECLARE(dad, node, it) {
+            if (!sibling1) {
+                dad1_nei = (PhyloNeighbor*) (*it);
+                sibling1 = (PhyloNode*) (*it)->node;
+                sibling1_len = (*it)->length;
+            } else {
+
+                dad2_nei = (PhyloNeighbor*) (*it);
+                sibling2 = (PhyloNode*) (*it)->node;
+                sibling2_len = (*it)->length;
+            }
+        }
+        // remove the subtree leading to node
+        // (the two siblings are joined directly by a branch of the summed length)
+        double sum_len = sibling1_len + sibling2_len;
+        sibling1->updateNeighbor(dad, sibling2, sum_len);
+        sibling2->updateNeighbor(dad, sibling1, sum_len);
+        PhyloNeighbor* sibling1_nei = (PhyloNeighbor*) sibling1->findNeighbor(sibling2);
+        PhyloNeighbor* sibling2_nei = (PhyloNeighbor*) sibling2->findNeighbor(sibling1);
+        // save partial likelihood
+        // (fresh buffers are used during the search, the originals are put back below)
+        double* sibling1_partial_lh = sibling1_nei->partial_lh;
+        double* sibling2_partial_lh = sibling2_nei->partial_lh;
+        sibling1_nei->partial_lh = newPartialLh();
+        sibling2_nei->partial_lh = newPartialLh();
+        sibling1_nei->clearPartialLh();
+        sibling2_nei->clearPartialLh();
+
+        // now try to move the subtree to somewhere else
+        vector<PhyloNeighbor*> spr_path;
+
+        // candidate regraft branches reachable through sibling1
+        FOR_NEIGHBOR(sibling1, sibling2, it)
+        {
+            spr_path.push_back(sibling1_nei);
+            double score = swapSPR(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling1,
+                    spr_path);
+            // if likelihood score improves, return
+            // NOTE(review): the saved sibling partial_lh pointers are not
+            // restored or released on this path -- confirm ownership
+            if (score > cur_score) {
+                cout << "cur_score = " << cur_score << endl;
+                cout << "Found new BETTER SCORE by SPR: " << score << endl;
+
+                return score;
+            }
+            spr_path.pop_back();
+        }
+
+        // candidate regraft branches reachable through sibling2
+        FOR_NEIGHBOR(sibling2, sibling1, it)
+        {
+            spr_path.push_back(sibling2_nei);
+            double score = swapSPR(cur_score, 1, node, dad, sibling1, sibling2, (PhyloNode*) (*it)->node, sibling2,
+                    spr_path);
+            // if likelihood score improves, return
+            if (score > cur_score) {
+                cout << "cur_score = " << cur_score << endl;
+                cout << "Found new BETTER SCORE by SPR: " << score << endl;
+
+                return score;
+            }
+            spr_path.pop_back();
+        }
+        // if likelihood does not improve, swap back
+        sibling1->updateNeighbor(sibling2, dad, sibling1_len);
+        sibling2->updateNeighbor(sibling1, dad, sibling2_len);
+        // dad's neighbor records are re-pointed by swapSPR(); point them back
+        // at the original siblings and restore the original lengths
+        dad1_nei->node = sibling1;
+        dad1_nei->length = sibling1_len;
+        dad2_nei->node = sibling2;
+        dad2_nei->length = sibling2_len;
+        // drop the temporary buffers and put the saved partial likelihoods back
+        delete[] sibling1_nei->partial_lh;
+        delete[] sibling2_nei->partial_lh;
+        sibling1_nei->partial_lh = sibling1_partial_lh;
+        sibling2_nei->partial_lh = sibling2_partial_lh;
+        //clearAllPartialLH();
+
+    }
+
+    // continue the search with each subtree below node
+    FOR_NEIGHBOR_IT(node, dad, it){
+    double score = optimizeSPR(cur_score, (PhyloNode*) (*it)->node, node);
+
+    if (score > cur_score) return score;
+}
+    return cur_score;
+}
+
+/**
+ * Move the subtree (dad1-node1) to the branch (dad2-node2), evaluate the
+ * resulting tree, and continue the search along the neighbors of node2 up to
+ * spr_radius. Regrafting is only attempted once cur_depth has reached
+ * SPR_DEPTH.
+ *
+ * If the regrafted tree does not score better than cur_score, the topology,
+ * the branch lengths and the partial likelihood buffers along spr_path are
+ * restored and the move is recorded in spr_moves.
+ *
+ * @param cur_score  likelihood score that has to be beaten
+ * @param cur_depth  distance of the current regraft branch from the pruning point
+ * @param node1      root of the pruned subtree
+ * @param dad1       node that attaches the pruned subtree to the rest of the tree
+ * @param orig_node1 first node of the branch left behind by pruning
+ * @param orig_node2 second node of the branch left behind by pruning
+ * @param node2      far end of the candidate regraft branch
+ * @param dad2       near end of the candidate regraft branch
+ * @param spr_path   neighbors passed between the pruning point and the
+ *                   current regraft branch
+ * @return a score greater than cur_score if one was found (the tree is then
+ *         left in the regrafted state), otherwise cur_score
+ */
+double PhyloTree::swapSPR(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1, PhyloNode *orig_node1,
+        PhyloNode *orig_node2, PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path) {
+
+    PhyloNeighbor *node1_nei = (PhyloNeighbor*) node1->findNeighbor(dad1);
+    PhyloNeighbor *dad1_nei = (PhyloNeighbor*) dad1->findNeighbor(node1);
+    double node1_dad1_len = node1_nei->length;
+    PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
+    PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
+
+    // state of the regraft branch, needed to undo a rejected move
+    double* node2dad2_lh_save = node2_nei->partial_lh;
+    double* dad2node2_lh_save = dad2_nei->partial_lh;
+    double node2dad2_scale = node2_nei->lh_scale_factor;
+    double dad2node_scale = dad2_nei->lh_scale_factor;
+
+    double len2 = node2_nei->length;
+    double newLen2 = sqrt(len2);
+
+    if (dad2 && cur_depth >= SPR_DEPTH) {
+        // now, connect (node1-dad1) to the branch (node2-dad2)
+
+        bool first = true;
+
+        FOR_NEIGHBOR_IT(dad1, node1, it) {
+            // the two neighbors of dad1 other than node1 become dad2 and node2
+            if (first) {
+                (*it)->node = dad2;
+                (*it)->length = newLen2;
+                dad2->updateNeighbor(node2, dad1, newLen2);
+                first = false;
+            } else {
+                (*it)->node = node2;
+                (*it)->length = newLen2;
+                node2->updateNeighbor(dad2, dad1, newLen2);
+            }
+            // clear the partial likelihood leading from dad1 to the new neighbor
+            ((PhyloNeighbor*) (*it))->clearPartialLh();
+        }
+
+        // clear partial likelihood from node2 to dad1
+        node2_nei->clearPartialLh();
+        // clear partial likelihood from dad2 to dad1
+        dad2_nei->clearPartialLh();
+        // clear partial likelihood from dad1 to node1
+        node1_nei->clearPartialLh();
+
+        // set new length as suggested by Alexis
+        node1_nei->length = 0.9;
+        dad1_nei->length = 0.9;
+
+        // Save the partial likelihoods from the removal point to the insertion
+        // point. The vector must start out empty: constructing it with
+        // spr_path.size() elements and then calling push_back() would put
+        // NULL entries in front of the saved pointers, and the restore loop
+        // below would read those NULLs instead of the saved buffers.
+        vector<PhyloNeighbor*>::iterator it2;
+        vector<double*> saved_partial_lhs;
+        saved_partial_lhs.reserve(spr_path.size());
+        for (it2 = spr_path.begin(); it2 != spr_path.end(); it2++) {
+            saved_partial_lhs.push_back((*it2)->partial_lh);
+            (*it2)->partial_lh = newPartialLh();
+            (*it2)->clearPartialLh();
+        }
+
+        // optimize relevant branches
+        double score;
+
+        /* testing different branch optimization */
+        optimizeOneBranch(node1, dad1);
+        optimizeOneBranch(dad2, dad1);
+        optimizeOneBranch(node2, dad1);
+        optimizeOneBranch(orig_node1, orig_node2);
+        score = computeLikelihoodFromBuffer();
+
+        // if score improves, return
+        if (score > cur_score) {
+            cout << score << endl;
+            return score;
+        }
+
+        // else, swap back
+        node2->updateNeighbor(dad1, dad2, len2);
+        dad2->updateNeighbor(dad1, node2, len2);
+        // restore partial likelihood vectors
+        node2_nei->partial_lh = node2dad2_lh_save;
+        node2_nei->lh_scale_factor = node2dad2_scale;
+        dad2_nei->partial_lh = dad2node2_lh_save;
+        dad2_nei->lh_scale_factor = dad2node_scale;
+        node2_nei->length = len2;
+        dad2_nei->length = len2;
+        node1_nei->length = node1_dad1_len;
+        dad1_nei->length = node1_dad1_len;
+        // release the temporary buffers and put the saved ones back, in the
+        // same order in which they were saved
+        int index = 0;
+        for (it2 = spr_path.begin(); it2 != spr_path.end(); it2++) {
+            delete[] (*it2)->partial_lh;
+            (*it2)->partial_lh = saved_partial_lhs.at(index);
+            (*it2)->unclearPartialLh();
+            index++;
+        }
+
+        // add to candidate SPR moves
+        // Tung : why adding negative SPR move ?
+        spr_moves.add(node1, dad1, node2, dad2, score);
+    }
+    // stop descending once the search radius is reached
+    if (cur_depth >= spr_radius)
+        return cur_score;
+    spr_path.push_back(node2_nei);
+
+    FOR_NEIGHBOR_IT(node2, dad2, it) {
+        double score = swapSPR(cur_score, cur_depth + 1, node1, dad1, orig_node1, orig_node2, (PhyloNode*) (*it)->node, node2, spr_path);
+        if (score > cur_score) return score;
+    }
+    spr_path.pop_back();
+
+    return cur_score;
+}
+
+/**
+ * Apply a recorded SPR move, optimize all branches and keep the move only if
+ * the likelihood improves.
+ * @param cur_score likelihood score that has to be beaten
+ * @param spr       the move: prune the subtree (prune_dad-prune_node) and
+ *                  regraft it onto the branch (regraft_dad-regraft_node)
+ * @return the new score if it is greater than cur_score (the tree is then
+ *         left in the regrafted state); otherwise cur_score, with the
+ *         topology and the saved branch lengths restored
+ */
+double PhyloTree::assessSPRMove(double cur_score, const SPRMove &spr) {
+
+    PhyloNode *dad = spr.prune_dad;
+    PhyloNode *node = spr.prune_node;
+    PhyloNode *dad2 = spr.regraft_dad;
+    PhyloNode *node2 = spr.regraft_node;
+
+    PhyloNeighbor *dad_nei1 = NULL;
+    PhyloNeighbor *dad_nei2 = NULL;
+    PhyloNode *sibling1 = NULL;
+    PhyloNode *sibling2 = NULL;
+    double sibling1_len = 0.0, sibling2_len = 0.0;
+
+    PhyloNeighbor *node1_nei = (PhyloNeighbor*) node->findNeighbor(dad);
+    PhyloNeighbor *dad1_nei = (PhyloNeighbor*) dad->findNeighbor(node);
+    double node1_dad1_len = node1_nei->length;
+
+    // assign the sibling of node, with respect to dad
+
+    FOR_NEIGHBOR_DECLARE(dad, node, it) {
+        if (!sibling1) {
+            dad_nei1 = (PhyloNeighbor*) (*it);
+            sibling1 = (PhyloNode*) (*it)->node;
+            sibling1_len = (*it)->length;
+        } else {
+
+            dad_nei2 = (PhyloNeighbor*) (*it);
+            sibling2 = (PhyloNode*) (*it)->node;
+            sibling2_len = (*it)->length;
+        }
+    }
+    // remove the subtree leading to node
+    // (the two siblings are joined directly by a branch of the summed length)
+    double sum_len = sibling1_len + sibling2_len;
+    sibling1->updateNeighbor(dad, sibling2, sum_len);
+    sibling2->updateNeighbor(dad, sibling1, sum_len);
+    // now try to move the subtree to somewhere else
+
+    bool first = true;
+    PhyloNeighbor *node2_nei = (PhyloNeighbor*) node2->findNeighbor(dad2);
+    //PhyloNeighbor *dad2_nei = (PhyloNeighbor*) dad2->findNeighbor(node2);
+    double len2 = node2_nei->length;
+
+    // insert dad in the middle of the regraft branch: each half gets len2 / 2
+    FOR_NEIGHBOR(dad, node, it)
+    {
+        if (first) {
+            (*it)->node = dad2;
+            (*it)->length = len2 / 2;
+            dad2->updateNeighbor(node2, dad, len2 / 2);
+            first = false;
+        } else {
+            (*it)->node = node2;
+            (*it)->length = len2 / 2;
+            node2->updateNeighbor(dad2, dad, len2 / 2);
+        }
+        ((PhyloNeighbor*) (*it))->clearPartialLh();
+    }
+
+    clearAllPartialLH();
+    // optimize branches
+    double score;
+    optimizeAllBranches(dad);
+    score = computeLikelihoodBranch((PhyloNeighbor*)dad->neighbors.back(), dad);
+
+    // if score improves, return
+    if (score > cur_score)
+        return score;
+    // else, swap back
+    node2->updateNeighbor(dad, dad2, len2);
+    dad2->updateNeighbor(dad, node2, len2);
+
+    node1_nei->length = node1_dad1_len;
+    dad1_nei->length = node1_dad1_len;
+
+    // reconnect both siblings to dad with their original lengths
+    sibling1->updateNeighbor(sibling2, dad, sibling1_len);
+    sibling2->updateNeighbor(sibling1, dad, sibling2_len);
+    dad_nei1->node = sibling1;
+    dad_nei1->length = sibling1_len;
+    dad_nei2->node = sibling2;
+    dad_nei2->length = sibling2_len;
+    clearAllPartialLH();
+
+    return cur_score;
+
+}
+
+/**
+ * Run up to 100 rounds of SPR search with a fixed radius of 10.
+ * Each round first runs optimizeSPR_old() from the first neighbor of the
+ * root. If that does not improve the score, the candidate moves collected in
+ * spr_moves are assessed one by one; the search stops when none of them
+ * improves the score either.
+ * @return the score tracked as current when the search stops
+ */
+double PhyloTree::optimizeSPR() {
+    double best_score = computeLikelihood();
+    spr_radius = 10;
+
+    for (int round = 0; round < 100; round++) {
+        cout << "i = " << round << endl;
+        spr_moves.clear();
+        double score = optimizeSPR_old(best_score, (PhyloNode*) root->neighbors[0]->node);
+        clearAllPartialLH();
+
+        if (!(score <= best_score)) {
+            // the direct search improved the tree: report the score after
+            // optimizing all branches, then carry on with the search score
+            best_score = optimizeAllBranches();
+            cout << "SPR " << round + 1 << " : " << best_score << endl;
+            best_score = score;
+            continue;
+        }
+
+        // no direct improvement: go through the recorded candidate moves
+        for (SPRMoves::iterator mv = spr_moves.begin(); mv != spr_moves.end(); mv++) {
+            score = assessSPRMove(best_score, *mv);
+            // stop at the first move that improves the likelihood
+            if (score > best_score)
+                break;
+        }
+        if (score <= best_score)
+            break;
+    }
+    return best_score;
+}
+
+/**
+ * Repeat optimizeSPR() for at most 100 rounds, stopping as soon as a round
+ * fails to improve the score by more than TOL_LIKELIHOOD.
+ * @return the best score accepted
+ */
+double PhyloTree::optimizeSPRBranches() {
+    cout << "Search with Subtree Pruning and Regrafting (SPR) using ML..." << endl;
+    double best_score = computeLikelihood();
+    int round = 0;
+    while (round < 100) {
+        double score = optimizeSPR();
+        // not a sufficient improvement: keep the previous best and stop
+        if (score <= best_score + TOL_LIKELIHOOD)
+            break;
+        best_score = score;
+        round++;
+    }
+    return best_score;
+}
+
+/**
+ * Detach the subtree (dad-node) from the tree and record in info everything
+ * about the cut: the first neighbor of dad other than node is stored as the
+ * "left" side, any later one as the "right" side. The left and right nodes
+ * are then linked to each other through dad's former neighbor records, which
+ * receive fresh partial likelihood buffers.
+ * @param node root of the subtree being pruned
+ * @param dad  node that attaches the subtree to the rest of the tree
+ * @param info (OUT) record of the pruning
+ */
+void PhyloTree::pruneSubtree(PhyloNode *node, PhyloNode *dad, PruningInfo &info) {
+
+    info.node = node;
+    info.dad = dad;
+
+    bool left_taken = false;
+    FOR_NEIGHBOR_IT(dad, node, nei_it) {
+        if (left_taken) {
+            info.dad_it_right = nei_it;
+            info.dad_nei_right = (*nei_it);
+            info.dad_lh_right = ((PhyloNeighbor*) (*nei_it))->partial_lh;
+            info.right_node = (*nei_it)->node;
+            info.right_len = (*nei_it)->length;
+        } else {
+            info.dad_it_left = nei_it;
+            info.dad_nei_left = (*nei_it);
+            info.dad_lh_left = ((PhyloNeighbor*) (*nei_it))->partial_lh;
+            info.left_node = (*nei_it)->node;
+            info.left_len = (*nei_it)->length;
+            left_taken = true;
+        }
+    }
+
+    // the records on the far side that currently point back at dad
+    info.left_it = info.left_node->findNeighborIt(dad);
+    info.right_it = info.right_node->findNeighborIt(dad);
+    info.left_nei = (*info.left_it);
+    info.right_nei = (*info.right_it);
+
+    // bypass dad: left now sees right and right now sees left
+    info.left_node->updateNeighbor(info.left_it, info.dad_nei_right);
+    info.right_node->updateNeighbor(info.right_it, info.dad_nei_left);
+    ((PhyloNeighbor*) info.dad_nei_right)->partial_lh = newPartialLh();
+    ((PhyloNeighbor*) info.dad_nei_left)->partial_lh = newPartialLh();
+}
+
+/**
+ * Re-attach a subtree previously removed by pruneSubtree() onto the branch (in_node, in_dad).
+ *
+ * NOTE(review): the implementation is unfinished. Only the two neighbor slots of
+ * info.dad are redirected; in_node and in_dad are not updated to point at info.dad.
+ *
+ * @param info    pruning record filled by pruneSubtree()
+ * @param in_node one end of the target branch
+ * @param in_dad  the other end of the target branch
+ */
+void PhyloTree::regraftSubtree(PruningInfo &info, PhyloNode *in_node, PhyloNode *in_dad) {
+    // Neighbor records describing the target branch as seen from each of its ends
+    NeighborVec::iterator slot_in_node = in_node->findNeighborIt(in_dad);
+    NeighborVec::iterator slot_in_dad = in_dad->findNeighborIt(in_node);
+    Neighbor *nei_from_dad = (*slot_in_dad);
+    Neighbor *nei_from_node = (*slot_in_node);
+
+    info.dad->updateNeighbor(info.dad_it_right, nei_from_dad);
+    info.dad->updateNeighbor(info.dad_it_left, nei_from_node);
+    // SOMETHING NEED TO BE DONE: in_dad / in_node still reference each other
+}
+
+/****************************************************************************
+ Approximate Likelihood Ratio Test with SH-like interpretation
+ ****************************************************************************/
+
+/*void PhyloTree::computeNNIPatternLh(double cur_lh, double &lh2, double *pattern_lh2, double &lh3, double *pattern_lh3,
+        PhyloNode *node1, PhyloNode *node2) {
+
+    assert(node1->degree() == 3 && node2->degree() == 3);
+
+    // recompute pattern scaling factors if necessary
+    PhyloNeighbor *node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
+    PhyloNeighbor *node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
+    NeighborVec::iterator it;
+    const int IT_NUM = 6;
+
+    NeighborVec::iterator saved_it[IT_NUM];
+    int id = 0;
+
+    FOR_NEIGHBOR(node1, node2, it)
+    {
+        saved_it[id++] = (*it)->node->findNeighborIt(node1);
+    } else {
+
+        saved_it[id++] = it;
+    }
+
+    FOR_NEIGHBOR(node2, node1, it)
+    {
+        saved_it[id++] = (*it)->node->findNeighborIt(node2);
+    } else {
+        saved_it[id++] = it;
+    }
+    assert(id == IT_NUM);
+
+    Neighbor * saved_nei[IT_NUM];
+    // save Neighbor and allocate new Neighbor pointer
+    for (id = 0; id < IT_NUM; id++) {
+        saved_nei[id] = (*saved_it[id]);
+        // NOTE BUG DOWN HERE!
+        *saved_it[id] = new PhyloNeighbor(saved_nei[id]->node, saved_nei[id]->length); // BUG for PhyloSuperTree!
+        ((PhyloNeighbor*) (*saved_it[id]))->partial_lh = newPartialLh();
+        ((PhyloNeighbor*) (*saved_it[id]))->scale_num = newScaleNum();
+    }
+
+    // get the Neighbor again since it is replaced for saving purpose
+    node12_it = (PhyloNeighbor*) node1->findNeighbor(node2);
+    node21_it = (PhyloNeighbor*) node2->findNeighbor(node1);
+
+    // PhyloNeighbor *node2_lastnei = NULL;
+
+    // save the first found neighbor of node 1 (excluding node2) in node1_it
+    FOR_NEIGHBOR_DECLARE(node1, node2, node1_it)
+
+        break;
+    Neighbor *node1_nei = *node1_it;
+
+    bool first = true;
+
+    FOR_NEIGHBOR_IT(node2, node1, node2_it) {
+		// do the NNI swap
+		Neighbor *node2_nei = *node2_it;
+		node1->updateNeighbor(node1_it, node2_nei);
+		node2_nei->node->updateNeighbor(node2, node1);
+
+		node2->updateNeighbor(node2_it, node1_nei);
+		node1_nei->node->updateNeighbor(node1, node2);
+
+		// re-optimize five adjacent branches
+		double old_score = -INFINITY, new_score = old_score;
+
+		// clear partial likelihood vector
+		node12_it->clearPartialLh();
+		node21_it->clearPartialLh();
+		int i;
+		for (i = 0; i < 2; i++) {
+
+			new_score = optimizeOneBranch(node1, node2, false);
+
+			FOR_NEIGHBOR(node1, node2, it) {
+				//for (id = 0; id < IT_NUM; id++)
+				//((PhyloNeighbor*)(*saved_it[id]))->clearPartialLh();
+				((PhyloNeighbor*) (*it)->node->findNeighbor(node1))->clearPartialLh();
+				new_score = optimizeOneBranch(node1, (PhyloNode*) (*it)->node, false);
+			}
+
+			node21_it->clearPartialLh();
+
+			FOR_NEIGHBOR(node2, node1, it) {
+				//for (id = 0; id < IT_NUM; id++)
+				//((PhyloNeighbor*)(*saved_it[id]))->clearPartialLh();
+				((PhyloNeighbor*) (*it)->node->findNeighbor(node2))->clearPartialLh();
+				new_score = optimizeOneBranch(node2, (PhyloNode*) (*it)->node, false);
+				//node2_lastnei = (PhyloNeighbor*) (*it);
+			}
+			node12_it->clearPartialLh();
+			if (new_score < old_score + TOL_LIKELIHOOD) break;
+			old_score = new_score;
+		}
+		saveCurrentTree(new_score); // BQM: for new bootstrap
+
+		//new_score = optimizeOneBranch(node1, node2, false);
+		if (new_score > cur_lh + TOL_LIKELIHOOD)
+		cout << "Alternative NNI shows better likelihood " << new_score << " > " << cur_lh << endl;
+		double *result_lh;
+		if (first) {
+			result_lh = pattern_lh2;
+			lh2 = new_score;
+		} else {
+			result_lh = pattern_lh3;
+			lh3 = new_score;
+		}
+		old_score = new_score;
+		computePatternLikelihood(result_lh);
+		// swap back and recover the branch lengths
+		node1->updateNeighbor(node1_it, node1_nei);
+		node1_nei->node->updateNeighbor(node2, node1);
+		node2->updateNeighbor(node2_it, node2_nei);
+		node2_nei->node->updateNeighbor(node1, node2);
+		first = false;
+	}
+
+// restore the Neighbor*
+    for (id = 0; id < IT_NUM; id++) {
+
+        delete[] ((PhyloNeighbor*) *saved_it[id])->scale_num;
+        delete[] ((PhyloNeighbor*) *saved_it[id])->partial_lh;
+        delete (*saved_it[id]);
+        (*saved_it[id]) = saved_nei[id];
+    }
+
+    // restore the length of 4 branches around node1, node2
+    FOR_NEIGHBOR(node1, node2, it)
+        (*it)->length = (*it)->node->findNeighbor(node1)->length;
+    FOR_NEIGHBOR(node2, node1, it)
+        (*it)->length = (*it)->node->findNeighbor(node2)->length;
+}*/
+
+/**
+ * Evaluate the two NNI alternatives around the branch (node1, node2).
+ * params->nni5 is forced to true for the duration of the call and restored afterwards.
+ *
+ * @param cur_lh      score of the current tree, used only for the diagnostic message
+ * @param lh2         (OUT) newloglh of the first NNI
+ * @param pattern_lh2 buffer handed to the first NNIMove as its ptnlh
+ * @param lh3         (OUT) newloglh of the second NNI
+ * @param pattern_lh3 buffer handed to the second NNIMove as its ptnlh
+ */
+void PhyloTree::computeNNIPatternLh(double cur_lh, double &lh2, double *pattern_lh2, double &lh3, double *pattern_lh3,
+        PhyloNode *node1, PhyloNode *node2) {
+    NNIMove alt_moves[2];
+    alt_moves[0].ptnlh = pattern_lh2;
+    alt_moves[1].ptnlh = pattern_lh3;
+    for (int k = 0; k < 2; k++) {
+        alt_moves[k].node1 = NULL;
+        alt_moves[k].node2 = NULL;
+    }
+
+    // always optimize 5 branches for accurate SH-aLRT
+    const bool saved_nni5 = params->nni5;
+    params->nni5 = true;
+    getBestNNIForBran(node1, node2, alt_moves);
+    params->nni5 = saved_nni5;
+
+    lh2 = alt_moves[0].newloglh;
+    lh3 = alt_moves[1].newloglh;
+    if (max(lh2,lh3) > cur_lh + TOL_LIKELIHOOD) {
+        cout << "Alternative NNI shows better log-likelihood " << max(lh2,lh3) << " > " << cur_lh << endl;
+    }
+}
+
+/**
+ * Draw one bootstrap frequency vector and compute the resampled sums of three
+ * pattern log-likelihood vectors.
+ *
+ * @param pat_lh three arrays with one value per alignment pattern
+ * @param lh_new (OUT) three sums, lh_new[k] = sum_i boot_freq[i] * pat_lh[k][i]
+ */
+void PhyloTree::resampleLh(double **pat_lh, double *lh_new) {
+    const int nptn = getAlnNPattern();
+    lh_new[0] = 0.0;
+    lh_new[1] = 0.0;
+    lh_new[2] = 0.0;
+
+    IntVector boot_freq;
+    aln->createBootstrapAlignment(boot_freq, params->bootstrap_spec);
+
+    for (int ptn = 0; ptn < nptn; ptn++) {
+        for (int k = 0; k < 3; k++) {
+            lh_new[k] += boot_freq[ptn] * pat_lh[k][ptn];
+        }
+    }
+}
+
+// Implementation of testBranch follows Guindon et al. (2010)
+
+/**
+ * SH-like aLRT test (and local bootstrap) for the internal branch (node1, node2),
+ * based on RELL resampling of the pattern log-likelihoods of the current tree and
+ * of its two NNI alternatives.
+ *
+ * Both statistics are computed over max(reps, lbp_reps) replicates.
+ *
+ * @param best_score  log-likelihood of the current tree
+ * @param pattern_lh  pattern log-likelihoods of the current tree
+ * @param reps        number of aLRT replicates
+ * @param lbp_reps    number of local bootstrap replicates
+ * @param lbp_support (OUT) fraction of replicates in which the current tree beats both NNIs
+ * @return fraction of replicates supporting the branch; 0 when no replicate is requested
+ */
+double PhyloTree::testOneBranch(double best_score, double *pattern_lh, int reps, int lbp_reps, PhyloNode *node1,
+        PhyloNode *node2, double &lbp_support) {
+    const int NUM_NNI = 3;
+    double lh[NUM_NNI];
+    double *pat_lh[NUM_NNI];
+    lh[0] = best_score;
+    pat_lh[0] = pattern_lh;
+    pat_lh[1] = new double[getAlnNPattern()];
+    pat_lh[2] = new double[getAlnNPattern()];
+    computeNNIPatternLh(best_score, lh[1], pat_lh[1], lh[2], pat_lh[2], node1, node2);
+
+    // test statistic: gap between the current tree and the better NNI alternative
+    double aLRT;
+    if (lh[1] > lh[2])
+        aLRT = (lh[0] - lh[1]);
+    else
+        aLRT = (lh[0] - lh[2]);
+
+    int support = 0;
+
+    lbp_support = 0.0;
+    int times = max(reps, lbp_reps);
+
+    for (int i = 0; i < times; i++) {
+        double lh_new[NUM_NNI];
+        // resampling estimated log-likelihood (RELL)
+        resampleLh(pat_lh, lh_new);
+        if (lh_new[0] > lh_new[1] && lh_new[0] > lh_new[2])
+            lbp_support += 1.0;
+
+        // centered sums: resampled value minus the original log-likelihood
+        double cs[NUM_NNI], cs_best, cs_2nd_best;
+        cs[0] = lh_new[0] - lh[0];
+        cs[1] = lh_new[1] - lh[1];
+        cs[2] = lh_new[2] - lh[2];
+
+        // pick the largest and second largest centered sum
+        if (cs[0] >= cs[1] && cs[0] >= cs[2]) {
+            cs_best = cs[0];
+            cs_2nd_best = (cs[1] > cs[2]) ? cs[1] : cs[2];
+        } else if (cs[1] >= cs[2]) {
+            cs_best = cs[1];
+            cs_2nd_best = (cs[0] > cs[2]) ? cs[0] : cs[2];
+        } else {
+            cs_best = cs[2];
+            cs_2nd_best = (cs[0] > cs[1]) ? cs[0] : cs[1];
+        }
+        if (aLRT > (cs_best - cs_2nd_best) + 0.05)
+            support++;
+    }
+    delete[] pat_lh[2];
+    delete[] pat_lh[1];
+
+    if (times <= 0) {
+        // no replicates requested: report zero support instead of dividing 0 by 0 (NaN)
+        lbp_support = 0.0;
+        return 0.0;
+    }
+    lbp_support /= times;
+
+    return ((double) support) / times;
+}
+
+/**
+ * Recursively run testOneBranch() on every internal branch and write the support
+ * values into the node names ("support" or "support/lbp_support", in percent).
+ *
+ * @param threshold  branches with support (percent) below this value are counted
+ * @param best_score log-likelihood of the current tree
+ * @param pattern_lh pattern log-likelihoods of the current tree
+ * @param reps       number of aLRT replicates
+ * @param lbp_reps   number of local bootstrap replicates (0 to omit from the name)
+ * @param node       current node; NULL starts the traversal at the root
+ * @param dad        node the traversal came from
+ * @return number of tested branches whose support is below threshold
+ */
+int PhyloTree::testAllBranches(int threshold, double best_score, double *pattern_lh, int reps, int lbp_reps,
+        PhyloNode *node, PhyloNode *dad) {
+    if (!node) {
+        // top-level call: start at the root and do the one-off preparation
+        node = (PhyloNode*) root;
+        root->neighbors[0]->node->name = "";
+        if (isSuperTree()) {
+            int saved_save_all_trees = save_all_trees;
+            save_all_trees = 2;
+            bool saved_nni5 = params->nni5;
+            params->nni5 = true; // always optimize 5 branches for accurate SH-aLRT
+            initPartitionInfo();
+            params->nni5 = saved_nni5;
+            save_all_trees = saved_save_all_trees;
+        }
+    }
+
+    int num_low_support = 0;
+    const bool internal_branch = dad && !node->isLeaf() && !dad->isLeaf();
+    if (internal_branch) {
+        double lbp_support;
+        int support = round(testOneBranch(best_score, pattern_lh, reps, lbp_reps, node, dad, lbp_support) * 100);
+        node->name = convertIntToString(support);
+        if (lbp_reps) {
+            node->name += "/" + convertIntToString(round(lbp_support * 100));
+        }
+        if (support < threshold) {
+            num_low_support = 1;
+        }
+        // mirror the support on both directions of the branch when partial_pars exists
+        if (((PhyloNeighbor*) node->findNeighbor(dad))->partial_pars) {
+            ((PhyloNeighbor*) node->findNeighbor(dad))->partial_pars[0] = support;
+            ((PhyloNeighbor*) dad->findNeighbor(node))->partial_pars[0] = support;
+        }
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        num_low_support += testAllBranches(threshold, best_score, pattern_lh, reps, lbp_reps,
+                (PhyloNode*) (*it)->node, node);
+    }
+
+    return num_low_support;
+}
+
+/****************************************************************************
+ Collapse stable (highly supported) clades by one representative
+ ****************************************************************************/
+
+/**
+ * Unlink a leaf from the tree: the two other neighbors of the leaf's adjacent node
+ * are connected directly, with a branch length equal to the sum of their two branches.
+ * Neither the leaf nor its adjacent node is freed or modified here.
+ *
+ * @param leaf leaf to remove; its adjacent node must have degree 3
+ */
+void PhyloTree::deleteLeaf(Node *leaf) {
+    Node *near_node = leaf->neighbors[0]->node;
+    assert(leaf->isLeaf() && near_node->degree() == 3);
+
+    Node *first_side = NULL;
+    Node *second_side = NULL;
+    double merged_len = 0.0;
+    FOR_NEIGHBOR_IT(near_node, leaf, it) {
+        merged_len += (*it)->length;
+        if (first_side)
+            second_side = (*it)->node;
+        else
+            first_side = (*it)->node;
+    }
+    // both remaining neighbors must have been found
+    assert(first_side && second_side);
+
+    // make the two sides point at each other instead of at near_node
+    first_side->updateNeighbor(near_node, second_side, merged_len);
+    second_side->updateNeighbor(near_node, first_side, merged_len);
+}
+
+/**
+ * Re-attach a leaf (together with its adjacent node) in the middle of the branch
+ * (node, dad). The branch is split into two halves; its length is first raised to
+ * at least 2 * MIN_BRANCH_LEN to avoid too small branch lengths.
+ *
+ * @param leaf leaf to insert; leaf->neighbors[0]->node is the adjacent node that gets rewired
+ * @param node one end of the target branch
+ * @param dad  the other end of the target branch
+ */
+void PhyloTree::reinsertLeaf(Node *leaf, Node *node, Node *dad) {
+    Node *adjacent_node = leaf->neighbors[0]->node;
+    Neighbor *nei = node->findNeighbor(dad);
+    double len = max(nei->length, MIN_BRANCH_LEN * 2);
+    const double half_len = len / 2;
+
+    bool first = true;
+    FOR_NEIGHBOR_IT(adjacent_node, leaf, it) {
+        // first free slot connects to node, every later one to dad
+        Node *attach_to = first ? node : dad;
+        Node *replaced = first ? dad : node;
+        (*it)->node = attach_to;
+        (*it)->length = half_len;
+        attach_to->updateNeighbor(replaced, adjacent_node, half_len);
+        first = false;
+    }
+}
+
+/**
+ * @param node        node to inspect
+ * @param min_support minimum value required in partial_pars[0]
+ * @return false if any branch from node to a non-leaf neighbor stores a
+ *         partial_pars[0] value below min_support, true otherwise
+ * NOTE(review): partial_pars is dereferenced without a NULL check.
+ */
+bool PhyloTree::isSupportedNode(PhyloNode* node, int min_support) {
+    FOR_NEIGHBOR_IT(node, NULL, it) {
+        // branches leading to leaves are not considered
+        if ((*it)->node->isLeaf())
+            continue;
+        if (((PhyloNeighbor*) * it)->partial_pars[0] < min_support)
+            return false;
+    }
+    return true;
+}
+
+int PhyloTree::collapseStableClade(int min_support, NodeVector &pruned_taxa, StrVector &linked_name,
+        double* &dist_mat) { // repeatedly prunes one taxon from each qualifying cherry; returns the number of pruned taxa
+    NodeVector taxa;
+    NodeVector::iterator tax_it;
+    StrVector::iterator linked_it;
+    getTaxa(taxa);
+    IntVector linked_taxid; // per taxon id: -1 while still in the tree, else id of the taxon it was collapsed onto
+    linked_taxid.resize(leafNum, -1);
+    int num_pruned_taxa; // number of taxa pruned in the current pass (reset at the start of each pass)
+    int ntaxa = leafNum; // original taxon count; also the row stride of the incoming dist_mat
+    do {
+        num_pruned_taxa = 0;
+        for (tax_it = taxa.begin(); tax_it != taxa.end(); tax_it++)
+            if (linked_taxid[(*tax_it)->id] < 0) {
+                Node *taxon = (*tax_it);
+                PhyloNode *near_node = (PhyloNode*) taxon->neighbors[0]->node;
+                Node *adj_taxon = NULL; // sibling leaf forming a cherry with taxon, if any
+                FOR_NEIGHBOR_DECLARE(near_node, taxon, it)
+                    if ((*it)->node->isLeaf()) {
+                        adj_taxon = (*it)->node;
+                        break;
+                    }
+                // if it is not a cherry
+                if (!adj_taxon)
+                    continue;
+                assert(linked_taxid[adj_taxon->id] < 0);
+                PhyloNeighbor * near_nei = NULL; // neighbor of near_node that is neither taxon nor adj_taxon
+                FOR_NEIGHBOR(near_node, taxon, it)
+                    if ((*it)->node != adj_taxon) {
+                        near_nei = (PhyloNeighbor*) (*it);
+                        break;
+                    }
+                assert(near_nei);
+                // skip the cherry only if it is not stable AND the two taxa are further apart than 2e-6;
+                // i.e. it is collapsed when it is stable OR the distance between the two taxa is near ZERO
+                if (!isSupportedNode((PhyloNode*) near_nei->node, min_support)
+                        && dist_mat[taxon->id * ntaxa + adj_taxon->id] > 2e-6)
+                    continue;
+                // now do the taxon pruning
+                Node * pruned_taxon = taxon, *stayed_taxon = adj_taxon;
+                // prune the taxon with the longer terminal branch (taxon itself on ties)
+                if (adj_taxon->neighbors[0]->length > taxon->neighbors[0]->length) {
+                    pruned_taxon = adj_taxon;
+                    stayed_taxon = taxon;
+                }
+                deleteLeaf(pruned_taxon);
+                linked_taxid[pruned_taxon->id] = stayed_taxon->id;
+                pruned_taxa.push_back(pruned_taxon);
+                linked_name.push_back(stayed_taxon->name);
+                num_pruned_taxa++;
+                // do not prune more than n-4 taxa
+                // NOTE(review): size_t vs int comparison; for ntaxa < 4 the right-hand side wraps to a huge unsigned value
+                if (pruned_taxa.size() >= ntaxa - 4)
+                    break;
+            }
+    } while (num_pruned_taxa && pruned_taxa.size() < ntaxa - 4); // repeat while the last pass pruned something
+
+    if (pruned_taxa.empty())
+        return 0;
+
+    if (verbose_mode >= VB_MED)
+        for (tax_it = pruned_taxa.begin(), linked_it = linked_name.begin(); tax_it != pruned_taxa.end();
+                tax_it++, linked_it++)
+            cout << "Delete " << (*tax_it)->name << " from " << (*linked_it) << endl;
+
+    // set root to the first taxon which was not deleted
+    for (tax_it = taxa.begin(); tax_it != taxa.end(); tax_it++)
+        if (linked_taxid[(*tax_it)->id] < 0) {
+            root = (*tax_it);
+            break;
+        }
+    // extract the sub alignment
+    IntVector stayed_id; // ids of the taxa that remain, in increasing order
+    int i, j;
+    for (i = 0; i < taxa.size(); i++)
+        if (linked_taxid[i] < 0)
+            stayed_id.push_back(i);
+    assert(stayed_id.size() + pruned_taxa.size() == leafNum);
+    Alignment * pruned_aln = new Alignment(); // handed to setAlignment() below; not deleted in this function
+    pruned_aln->extractSubAlignment(aln, stayed_id, 2); // at least 2 informative characters
+    nodeNum = leafNum = stayed_id.size();
+    initializeTree();
+    setAlignment(pruned_aln);
+
+    double *pruned_dist = new double[leafNum * leafNum]; // distance matrix restricted to the remaining taxa
+    for (i = 0; i < leafNum; i++)
+        for (j = 0; j < leafNum; j++)
+            pruned_dist[i * leafNum + j] = dist_mat[stayed_id[i] * ntaxa + stayed_id[j]];
+    dist_mat = pruned_dist; // caller's pointer is replaced; the previous matrix is not freed here (ownership: confirm with caller)
+
+    return pruned_taxa.size();
+}
+
+/**
+ * Undo collapseStableClade(): reinsert the pruned taxa in reverse pruning order,
+ * each one next to the leaf whose name was recorded for it, then switch back to
+ * the original alignment.
+ *
+ * @param original_aln alignment containing all taxa
+ * @param pruned_taxa  pruned leaves, in the order they were pruned
+ * @param linked_name  for each pruned leaf, the name of the leaf it is reinserted next to
+ * @return always 0
+ */
+int PhyloTree::restoreStableClade(Alignment *original_aln, NodeVector &pruned_taxa, StrVector &linked_name) {
+    NodeVector::reverse_iterator taxon_rit = pruned_taxa.rbegin();
+    StrVector::reverse_iterator name_rit = linked_name.rbegin();
+    while (taxon_rit != pruned_taxa.rend()) {
+        Node *linked_taxon = findNodeName((*name_rit));
+        assert(linked_taxon);
+        assert(linked_taxon->isLeaf());
+        leafNum++;
+        reinsertLeaf((*taxon_rit), linked_taxon, linked_taxon->neighbors[0]->node);
+        taxon_rit++;
+        name_rit++;
+    }
+    assert(leafNum == original_aln->getNSeq());
+
+    nodeNum = leafNum;
+    initializeTree();
+    setAlignment(original_aln);
+    // root at the first sequence of the restored alignment
+    root = findNodeName(aln->getSeqName(0));
+
+    return 0;
+}
+
+/**
+ * Check that, for every branch, the sum of lh_scale_factor over its two directions
+ * is the same (within 1e-6).
+ *
+ * @param sum_scaling (IN/OUT) reference sum; a positive value means "not set yet" and
+ *                    is replaced by the sum of the first branch visited
+ * @param node        current node; NULL starts at the root
+ * @param dad         node the traversal came from
+ * @return false at the first branch whose sum differs (both values are printed), true otherwise
+ */
+bool PhyloTree::checkEqualScalingFactor(double &sum_scaling, PhyloNode *node, PhyloNode *dad) {
+    if (!node)
+        node = (PhyloNode*) root;
+
+    if (dad) {
+        PhyloNeighbor *towards_dad = (PhyloNeighbor*) node->findNeighbor(dad);
+        PhyloNeighbor *towards_node = (PhyloNeighbor*) dad->findNeighbor(node);
+        double scaling = towards_dad->lh_scale_factor + towards_node->lh_scale_factor;
+        if (sum_scaling > 0)
+            sum_scaling = scaling;
+        if (fabs(sum_scaling - scaling) > 1e-6) {
+            cout << sum_scaling << " " << scaling << endl;
+            return false;
+        }
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        if (!checkEqualScalingFactor(sum_scaling, (PhyloNode*) (*it)->node, node))
+            return false;
+    }
+
+    return true;
+}
+
+/**
+ * Shuffle the neighbor list of every node, post-order: the subtrees below a node
+ * are processed before the node's own list is shuffled.
+ *
+ * @param node current node; NULL starts at the root
+ * @param dad  node the traversal came from
+ */
+void PhyloTree::randomizeNeighbors(Node *node, Node *dad) {
+    if (!node) {
+        node = root;
+    }
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        randomizeNeighbors((*it)->node, node);
+    }
+    my_random_shuffle(node->neighbors.begin(), node->neighbors.end());
+}
+
+/**
+ * Print, for every branch of the tree, the transition matrix computed by the model
+ * factory for (branch length * rate of category 0), one tab-separated row per state.
+ *
+ * @param node current node; NULL starts at the root
+ * @param dad  node the traversal came from (nothing is printed when NULL)
+ */
+void PhyloTree::printTransMatrices(Node *node, Node *dad) {
+    if (!node)
+        node = root;
+    int nstates = aln->num_states;
+
+    if (dad) {
+        double *trans_mat = new double[nstates * nstates];
+        model_factory->computeTransMatrix(dad->findNeighbor(node)->length * site_rate->getRate(0), trans_mat);
+        cout << "Transition matrix " << dad->name << " to " << node->name << endl;
+        for (int row = 0; row < nstates; row++) {
+            const double *row_values = trans_mat + row * nstates;
+            for (int col = 0; col < nstates; col++)
+                cout << "\t" << row_values[col];
+            cout << endl;
+        }
+        delete[] trans_mat;
+    }
+
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        printTransMatrices((*it)->node, node);
+    }
+}
+
+/**
+ * Ask the alignment to drop identical sequences and, if any were dropped, replace
+ * this tree's alignment by the reduced one (the old alignment is deleted).
+ * The removed names and their retained twins are stored in removed_seqs / twin_seqs.
+ *
+ * @param params program parameters; root (if set) and gbo_replicates > 0 are
+ *               forwarded to Alignment::removeIdenticalSeq
+ */
+void PhyloTree::removeIdenticalSeqs(Params &params) {
+	const bool gbo_enabled = params.gbo_replicates > 0;
+	Alignment *reduced_aln = params.root
+		? aln->removeIdenticalSeq((string)params.root, gbo_enabled, removed_seqs, twin_seqs)
+		: aln->removeIdenticalSeq("", gbo_enabled, removed_seqs, twin_seqs);
+
+	if (removed_seqs.empty())
+		return;
+
+	cout << "NOTE: " << removed_seqs.size() << " identical sequences will be ignored during tree search" << endl;
+	if (verbose_mode >= VB_MED) {
+		for (int i = 0; i < removed_seqs.size(); i++)
+			cout << removed_seqs[i] << " is identical to " << twin_seqs[i] << endl;
+	}
+	delete aln;
+	aln = reduced_aln;
+}
+
+/**
+ * Put back the sequences recorded by removeIdenticalSeqs() and switch to orig_aln.
+ * Does nothing when no sequence was removed.
+ *
+ * @param orig_aln alignment that contains all sequences
+ */
+void PhyloTree::reinsertIdenticalSeqs(Alignment *orig_aln) {
+	if (!removed_seqs.empty()) {
+		insertTaxa(removed_seqs, twin_seqs);
+		setAlignment(orig_aln);
+		// delete all partial_lh, which will be automatically recreated later
+		deleteAllPartialLh();
+		clearAllPartialLH();
+	}
+}
+
+/**
+ * Post-order traversal that collects in sp the taxa below node and appends to the
+ * node name the percentage of sites that are constant among those taxa.
+ * Only states below aln->num_states take part in the comparison.
+ *
+ * @param sp   (OUT) receives the taxa found below node
+ * @param node internal node to process
+ * @param dad  node the traversal came from; when NULL no percentage is written
+ */
+void PhyloTree::computeSeqIdentityAlongTree(Split &sp, Node *node, Node *dad) {
+    assert(node && !node->isLeaf());
+
+    // gather the taxa below this node, recursing into internal children
+    FOR_NEIGHBOR_IT(node, dad, it) {
+        Node *child = (*it)->node;
+        if (child->isLeaf()) {
+            sp.addTaxon(child->id);
+        } else {
+            Split child_sp(leafNum);
+            computeSeqIdentityAlongTree(child_sp, child, node);
+            sp += child_sp;
+        }
+    }
+    if (!dad) return;
+
+    // count the sites of patterns that are constant among the taxa in sp
+    int ident = 0, nseqs = aln->getNSeq();
+    for (Alignment::iterator pat = aln->begin(); pat != aln->end(); pat++) {
+        char state = aln->STATE_UNKNOWN;
+        bool is_const = true;
+        for (int seq = 0; seq != nseqs; seq++) {
+            if (!((*pat)[seq] < aln->num_states && sp.containTaxon(seq)))
+                continue;
+            if (state < aln->num_states && state != (*pat)[seq]) {
+                is_const = false;
+                break;
+            }
+            state = (*pat)[seq];
+        }
+        if (is_const)
+            ident += pat->frequency;
+    }
+    ident = (ident*100)/aln->getNSite();
+
+    const string ident_str = convertIntToString(ident);
+    if (node->name == "")
+        node->name = ident_str;
+    else
+        node->name += "/" + ident_str;
+}
+
+/**
+ * Entry point: start the sequence-identity traversal at the root, or at the
+ * root's only neighbor when the root itself is a leaf.
+ */
+void PhyloTree::computeSeqIdentityAlongTree() {
+    Split sp(leafNum);
+    Node *start = root->isLeaf() ? root->neighbors[0]->node : root;
+    computeSeqIdentityAlongTree(sp, start);
+}
+
+/**
+ * Replace this tree by a random tree of the requested shape whose leaves carry
+ * the sequence names of the alignment.
+ * params->sub_size is temporarily set to the number of sequences and restored afterwards.
+ *
+ * @param tree_type shape of the random tree; unknown values leave the generated tree as constructed
+ */
+void PhyloTree::generateRandomTree(TreeGenType tree_type) {
+    assert(aln);
+    const int saved_sub_size = params->sub_size;
+    params->sub_size = aln->getNSeq();
+
+    MExtTree random_tree;
+    switch (tree_type) {
+    case YULE_HARDING:
+        random_tree.generateYuleHarding(*params);
+        break;
+    case UNIFORM:
+        random_tree.generateUniform(params->sub_size);
+        break;
+    case CATERPILLAR:
+        random_tree.generateCaterpillar(params->sub_size);
+        break;
+    case BALANCED:
+        random_tree.generateBalanced(params->sub_size);
+        break;
+    case STAR_TREE:
+        random_tree.generateStarTree(*params);
+        break;
+    default:
+        break;
+    }
+    params->sub_size = saved_sub_size;
+
+    // name every generated leaf after the alignment sequence with the same id
+    NodeVector taxa;
+    random_tree.getTaxa(taxa);
+    assert(taxa.size() == aln->getNSeq());
+    for (size_t i = 0; i < taxa.size(); i++) {
+        Node *leaf = taxa[i];
+        leaf->name = aln->getSeqName(leaf->id);
+    }
+
+    // transfer the topology through its printed representation
+    stringstream tree_stream;
+    random_tree.printTree(tree_stream);
+    PhyloTree::readTreeString(tree_stream.str());
+}
diff --git a/phylotree.h b/phylotree.h
new file mode 100644
index 0000000..fbdf0b0
--- /dev/null
+++ b/phylotree.h
@@ -0,0 +1,1774 @@
+//
+// C++ Interface: phylotree
+//
+// Description:
+//
+//
+// Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>, (C) 2008
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#ifndef PHYLOTREE_H
+#define PHYLOTREE_H
+//#define NDEBUG
+// commented this out for Mac
+
+// PLEASE DONT TOUCH THESE VARIABLES ANYMORE!
+#define EIGEN_NO_AUTOMATIC_RESIZING
+//#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 32 // PLEASE DONT TOUCH THESE VARIABLES ANYMORE!
+//#define EIGEN_UNROLLING_LIMIT 1000 // PLEASE DONT TOUCH THESE VARIABLES ANYMORE!
+
+//#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (512*256)
+//#define EIGEN_TUNE_FOR_CPU_CACHE_SIZE (8*512*512)
+#include "Eigen/Core"
+#include "mtree.h"
+#include "alignment.h"
+#include "model/modelsubst.h"
+#include "model/modelfactory.h"
+#include "phylonode.h"
+#include "optimization.h"
+#include "model/rateheterogeneity.h"
+#include "pllrepo/src/pll.h"
+
+#define BOOT_VAL_FLOAT
+#define BootValType float
+//#define BootValType double
+
+extern int instruction_set;
+
+
+const double MIN_BRANCH_LEN = 0.000001; // minimum branch length. NEVER TOUCH THIS CONSTANT AGAIN PLEASE!
+const double MAX_BRANCH_LEN = 100.0; // maximum branch length
+const double TOL_BRANCH_LEN = 0.000001; // tolerance on branch lengths. NEVER TOUCH THIS CONSTANT AGAIN PLEASE!
+const double TOL_LIKELIHOOD = 0.001; // minimum score gain that counts as an improvement. NEVER TOUCH THIS CONSTANT AGAIN PLEASE!
+const double TOL_LIKELIHOOD_PARAMOPT = 0.001; // BQM: newly introduced for ModelFactory::optimizeParameters
+//const static double SCALING_THRESHOLD = sqrt(DBL_MIN);
+//const static double SCALING_THRESHOLD = 1e-100;
+//const static double SCALING_THRESHOLD_INVER = 1 / SCALING_THRESHOLD;
+//const static double LOG_SCALING_THRESHOLD = log(SCALING_THRESHOLD);
+//#include "pll/pll.h"
+// SCALING_THRESHOLD_INVER is 2^256 written out in decimal, so SCALING_THRESHOLD is 2^-256
+#define SCALING_THRESHOLD_INVER 115792089237316195423570985008687907853269984665640564039457584007913129639936.0
+#define SCALING_THRESHOLD (1.0/SCALING_THRESHOLD_INVER)
+#define LOG_SCALING_THRESHOLD log(SCALING_THRESHOLD) // NOTE: a macro, so log() is spelled out at every use site
+
+const int SPR_DEPTH = 2; // presumably the search depth for SPR moves -- TODO confirm against its users
+
+using namespace Eigen;
+
+/**
+ * Round cur_limit up to the next multiple of the block size used for doubles:
+ * 4 when instruction_set >= 7 (AVX), otherwise 2 (SSE).
+ */
+inline size_t get_safe_upper_limit(size_t cur_limit) {
+	const size_t block = (instruction_set >= 7) ? 4 : 2;
+	return ((cur_limit + block - 1) / block) * block;
+}
+
+/**
+ * Round cur_limit up to the next multiple of the block size used for floats:
+ * 8 when instruction_set >= 7 (AVX), otherwise 4 (SSE).
+ */
+inline size_t get_safe_upper_limit_float(size_t cur_limit) {
+	const size_t block = (instruction_set >= 7) ? 8 : 4;
+	return ((cur_limit + block - 1) / block) * block;
+}
+
+//inline double *aligned_alloc_double(size_t size) {
+//	size_t MEM_ALIGNMENT = (instruction_set >= 7) ? 32 : 16;
+//
+//#if defined WIN32 || defined _WIN32 || defined __WIN32__
+//	return (double*)_aligned_malloc(size*sizeof(double), MEM_ALIGNMENT);
+//#else
+//	void *res;
+//	posix_memalign(&res, MEM_ALIGNMENT, size*sizeof(double));
+//	return (double*)res;
+//#endif
+//}
+
+/**
+ * Allocate an uninitialized array of `size` elements of type T, aligned to
+ * 32 bytes when instruction_set >= 7 (AVX) and to 16 bytes otherwise.
+ * Release the result with aligned_free().
+ *
+ * Any allocation failure is reported through outError(); on GCC/Clang builds
+ * (non-Windows, non-Cygwin) a stack trace is printed to cerr first.
+ *
+ * @param size number of elements (not bytes)
+ * @return pointer to the aligned memory block
+ */
+template< class T>
+inline T *aligned_alloc(size_t size) {
+	size_t MEM_ALIGNMENT = (instruction_set >= 7) ? 32 : 16;
+	// start from NULL: posix_memalign may leave the pointer untouched when it fails
+	void *mem = NULL;
+
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+	mem = _aligned_malloc(size*sizeof(T), MEM_ALIGNMENT);
+#else
+	// every non-zero return code is a failure, not only ENOMEM
+	if (posix_memalign(&mem, MEM_ALIGNMENT, size*sizeof(T)) != 0)
+		mem = NULL;
+#endif
+	if (mem == NULL) {
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(WIN32) && !defined(__CYGWIN__)
+		print_stacktrace(cerr);
+#endif
+		outError("Not enough memory, allocation of " + convertInt64ToString(size*sizeof(T)) + " bytes failed (bad_alloc)");
+	}
+	return (T*)mem;
+}
+
+inline void aligned_free(void *mem) { // releases memory obtained from aligned_alloc<T>()
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+	_aligned_free(mem); // counterpart of _aligned_malloc used on Windows
+#else
+	free(mem); // posix_memalign memory is released with plain free()
+#endif
+}
+
+
+/**
+ *  Row Major Array For Eigen
+ */
+typedef Array<double, Dynamic, Dynamic, RowMajor> RowMajorArrayXXd;
+
+
+typedef std::map< string, double > StringDoubleMap;
+typedef std::map< int, PhyloNode* > IntPhyloNodeMap;
+
+#define MappedMat(NSTATES) Map<Matrix<double, NSTATES, NSTATES> > // Eigen Map over an NSTATES x NSTATES matrix
+#define MappedArr2D(NSTATES) Map<Array<double, NSTATES, NSTATES> > // Eigen Map over an NSTATES x NSTATES array
+#define MappedRowVec(NSTATES) Map<Matrix<double, 1, NSTATES> > // Eigen Map over a 1 x NSTATES row vector
+#define MappedVec(NSTATES) Map<Matrix<double, NSTATES, 1> > // Eigen Map over an NSTATES x 1 column vector
+#define Matrix(NSTATES) Matrix<double, NSTATES, NSTATES> // NOTE(review): function-like macro reusing the name Matrix; any later "Matrix(...)" token sequence is rewritten
+#define RowVector(NSTATES) Matrix<double, 1, NSTATES> // fixed-size 1 x NSTATES row vector type
+#define MappedRowArr2DDyn Map<Array<double, Dynamic, Dynamic, RowMajor> > // Eigen Map over a dynamic row-major 2D array
+#define MappedArrDyn Map<Array<double, Dynamic, 1> > // Eigen Map over a dynamic column array
+#define MappedVecDyn(NSTATES) Map<Matrix<double, Dynamic, NSTATES> > // Eigen Map over a matrix with dynamic rows and NSTATES columns
+
+const int MAX_SPR_MOVES = 20;
+
+/**
+        an SPR move.
+        Plain record of four node pointers and a score; SPR_compare orders these records by score.
+        NOTE(review): presumably the branch (prune_dad, prune_node) is the one cut and
+        (regraft_dad, regraft_node) the branch it is re-attached to -- confirm in SPRMoves::add callers.
+ */
+struct SPRMove {
+    PhyloNode *prune_dad;
+    PhyloNode *prune_node;
+    PhyloNode *regraft_dad;
+    PhyloNode *regraft_node;
+    double score; // score associated with the move
+};
+
+/**
+ * Strict ordering of SPR moves by descending score, used as the comparator of SPRMoves.
+ * Two moves with equal score compare equivalent.
+ */
+struct SPR_compare {
+
+    // arguments are taken by const reference so that comparisons do not copy the structs
+    bool operator()(const SPRMove &s1, const SPRMove &s2) const {
+        return s1.score > s2.score;
+    }
+};
+
+class SPRMoves : public set<SPRMove, SPR_compare> { // SPR moves kept sorted by descending score; moves with equal score are treated as duplicates by the set
+public:
+    void add(PhyloNode *prune_node, PhyloNode *prune_dad,
+            PhyloNode *regraft_node, PhyloNode *regraft_dad, double score); // defined elsewhere; NOTE(review): may cap the set at MAX_SPR_MOVES -- confirm
+};
+
+/*
+State saved by PhyloTree::pruneSubtree for the configuration below, where the
+subtree rooted at node is detached from dad:
+
+left_node-----------dad-----------right_node
+                     |
+                     |
+                     |
+                    node
+ */
+struct PruningInfo {
+    NeighborVec::iterator dad_it_left, dad_it_right, left_it, right_it; // neighbor slots: in dad towards left/right, and in left/right towards dad
+    Neighbor *dad_nei_left, *dad_nei_right, *left_nei, *right_nei; // Neighbor objects found in those four slots before pruning
+    Node *node, *dad, *left_node, *right_node; // the four nodes of the diagram
+    double left_len, right_len; // lengths of the branches dad-left_node and dad-right_node
+    double *dad_lh_left, *dad_lh_right; // partial_lh pointers of dad_nei_left / dad_nei_right before they were replaced
+
+};
+
+/**
+ * This Structure is used in PhyloSuperTreePlen.
+ * It holds, for each of two NNIs ("nni1" and "nni2"), a score, a branch length and
+ * a pointer to pattern likelihoods, plus two Neighbor pointers.
+ * NOTE(review): exact semantics of the fields are defined by the users of this struct -- confirm there.
+ */
+struct SwapNNIParam {
+    double nni1_score;
+    double nni1_brlen;
+    double nni2_score;
+    double nni2_brlen;
+    Neighbor* node1_nei;
+    Neighbor* node2_nei;
+    double *nni1_ptnlh; // not owned or allocated by this struct
+    double *nni2_ptnlh; // not owned or allocated by this struct
+};
+
+struct NNIMove { // description of one NNI and its evaluation result; no constructor, so fields start uninitialized
+    // Two nodes representing the central branch
+    PhyloNode *node1, *node2;
+    // Roots of the two subtree that are swapped
+    NeighborVec::iterator node1Nei_it, node2Nei_it;
+
+    // log-likelihood of the tree after applying the NNI
+    double newloglh;
+
+    int swap_id;
+
+    // old branch lengths of 5 branches before doing NNI
+    //double oldLen[5];
+
+    // new branch lengths of 5 branches corresponding to the NNI
+    double newLen[5];
+
+    // pattern likelihoods (buffer supplied by the caller; this struct neither allocates nor frees it)
+    double *ptnlh;
+
+    bool operator<(const NNIMove & rhs) const { // inverted on purpose: the move with the HIGHER newloglh sorts first
+        return newloglh > rhs.newloglh;
+        //return delta > rhs.delta;
+    }
+};
+
+
+
+struct LeafFreq { // pairs a leaf id with a frequency count
+    int leaf_id;
+
+    int freq;
+
+    bool operator<(const LeafFreq & rhs) const { // ascending order by freq; leaf_id does not take part in the comparison
+        return ( freq < rhs.freq);
+    }
+};
+
+/**
+Phylogenetic Tree class
+
+        @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui at univie.ac.at>
+ */
+class PhyloTree : public MTree, public Optimization {
+
+	friend class PhyloSuperTree;
+	friend class PhyloSuperTreePlen;
+	friend class RateGamma;
+	friend class RateGammaInvar;
+	friend class RateKategory;
+    friend class ModelMixture;
+    friend class RateFree;
+
+public:
+    /**
+       default constructor ( everything is initialized to NULL)
+     */
+    PhyloTree();
+
+    EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+    /**
+     * Constructor with given alignment
+     * @param alignment
+     */
+    PhyloTree(Alignment *aln);
+
+    void init();
+
+    /**
+            destructor
+     */
+    virtual ~PhyloTree();
+
+    /**
+            read the tree from the input file in newick format
+            @param infile the input file file.
+            @param is_rooted (IN/OUT) true if tree is rooted
+     */
+    virtual void readTree(const char *infile, bool &is_rooted);
+
+    /**
+            read the tree from the ifstream in newick format
+            @param in the input stream.
+            @param is_rooted (IN/OUT) true if tree is rooted
+     */
+    virtual void readTree(istream &in, bool &is_rooted);
+
+    /**
+            copy the phylogenetic tree structure into this tree, override to take sequence names
+            in the alignment into account
+            @param tree the tree to copy
+     */
+    virtual void copyTree(MTree *tree);
+    /**
+            copy the sub-tree structure into this tree
+            @param tree the tree to copy
+            @param taxa_set 0-1 string of length leafNum (1 to keep the leaf)
+     */
+    virtual void copyTree(MTree *tree, string &taxa_set);
+
+
+    /**
+            copy the phylogenetic tree structure into this tree, designed specifically for PhyloTree.
+            So there is some distinction with copyTree.
+            @param tree the tree to copy
+     */
+    void copyPhyloTree(PhyloTree *tree);
+
+
+    /**
+            Set the alignment, important to compute parsimony or likelihood score
+            Assign taxa ids according to their position in the alignment
+            @param alignment associated alignment
+     */
+    void setAlignment(Alignment *alignment);
+
+    /** set the root by name */
+    void setRootNode(const char *my_root);
+
+
+    /**
+            set the substitution model, important to compute the likelihood
+            @param amodel associated substitution model
+     */
+    void setModel(ModelSubst *amodel);
+
+    /**
+            set the model factory
+            @param model_fac model factory
+     */
+    void setModelFactory(ModelFactory *model_fac);
+
+    /**
+            set rate heterogeneity, important to compute the likelihood
+            @param rate associated rate heterogeneity class
+     */
+    void setRate(RateHeterogeneity *rate);
+
+    /**
+            get rate heterogeneity
+            @return associated rate heterogeneity class
+     */
+    RateHeterogeneity *getRate();
+
+    void discardSaturatedSite(bool val);
+
+    /**
+            get the name of the model
+     */
+    virtual string getModelName();
+
+	/**
+	 * @return model name with parameters in form of e.g. GTR{a,b,c,d,e,f}+I{pinvar}+G{alpha}
+	 */
+	virtual string getModelNameParams();
+
+    ModelSubst *getModel() { // returns the current `model` pointer as-is (no copy, no NULL check)
+        return model;
+    }
+
+    ModelFactory *getModelFactory() { // returns the current `model_factory` pointer as-is (no copy, no NULL check)
+        return model_factory;
+    }
+
+    virtual bool isSuperTree() { // always false in this class; virtual, so a derived class can override it
+        return false;
+    }
+
+    /**
+            allocate a new node. Override this if you have an inherited Node class.
+            @param node_id node ID
+            @param node_name node name
+            @return a new node
+     */
+    virtual Node* newNode(int node_id = -1, const char* node_name = NULL);
+
+    /**
+            allocate a new node. Override this if you have an inherited Node class.
+            @param node_id node ID
+            @param node_name node name given as an integer
+            @return a new node
+     */
+    virtual Node* newNode(int node_id, int node_name);
+
+    /**
+     *		@return number of alignment patterns, as reported by aln->getNPattern() (aln is dereferenced without a NULL check)
+     */
+    virtual int getAlnNPattern() {
+        return aln->getNPattern();
+    }
+
+    /**
+     *		@return number of alignment sites, as reported by aln->getNSite() (aln is dereferenced without a NULL check)
+     */
+    virtual int getAlnNSite() {
+        return aln->getNSite();
+    }
+
+    /**
+     * save branch lengths into a vector
+     */
+    virtual void saveBranchLengths(DoubleVector &lenvec, int startid = 0, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+    /**
+     * restore branch lengths from a vector previously called with saveBranchLengths
+     */
+    virtual void restoreBranchLengths(DoubleVector &lenvec, int startid = 0, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /****************************************************************************
+            Dot product
+     ****************************************************************************/
+    template <class Numeric, class VectorClass, const int VCSIZE>
+    Numeric dotProductSIMD(Numeric *x, Numeric *y, int size);
+
+    typedef BootValType (PhyloTree::*DotProductType)(BootValType *x, BootValType *y, int size);
+    DotProductType dotProduct;
+
+#if defined(BINARY32) || defined(__NOAVX__)
+    void setDotProductAVX() {} // empty inline stub when BINARY32 or __NOAVX__ is defined
+#else
+    void setDotProductAVX(); // declaration only; the definition is not part of this header chunk
+#endif
+    /**
+            this function return the parsimony or likelihood score of the tree. Default is
+            to compute the parsimony score. Override this function if you define a new
+            score function.
+            @return the tree score
+     */
+    //virtual double computeScore() { return -computeLikelihood(); }
+    //virtual double computeScore() { return (double)computeParsimonyScore(); }
+
+    /****************************************************************************
+            Parsimony function
+     ****************************************************************************/
+
+    /**
+     * 		Return the approximated branch length estimation using corrected parsimony branch length
+     * 		This is usually used as the starting point before using Newton-Raphson
+     */
+    double computeCorrectedParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    /**
+            initialize partial_pars vector of all PhyloNeighbors, allocating central_partial_pars
+     */
+    virtual void initializeAllPartialPars();
+
+    /**
+            initialize partial_pars vector of all PhyloNeighbors, allocating central_partial_pars
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @param index the index
+     */
+    virtual void initializeAllPartialPars(int &index, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            compute the tree parsimony score
+            @return parsimony score of the tree
+     */
+    int computeParsimony();
+
+    typedef void (PhyloTree::*ComputePartialParsimonyType)(PhyloNeighbor *, PhyloNode *);
+    ComputePartialParsimonyType computePartialParsimonyPointer;
+
+    /**
+            Compute partial parsimony score of the subtree rooted at dad
+            @param dad_branch the branch leading to the subtree
+            @param dad its dad, used to direct the traversal
+     */
+    virtual void computePartialParsimony(PhyloNeighbor *dad_branch, PhyloNode *dad);
+    void computePartialParsimonyNaive(PhyloNeighbor *dad_branch, PhyloNode *dad);
+    void computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode *dad);
+    template<class VectorClass>
+    void computePartialParsimonyFastSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    void computeReversePartialParsimony(PhyloNode *node, PhyloNode *dad);
+
+    typedef int (PhyloTree::*ComputeParsimonyBranchType)(PhyloNeighbor *, PhyloNode *, int *);
+    ComputeParsimonyBranchType computeParsimonyBranchPointer;
+
+    /**
+            compute tree parsimony score on a branch
+            @param dad_branch the branch leading to the subtree
+            @param dad its dad, used to direct the traversal
+            @param branch_subst (OUT) if not NULL, the number of substitutions on this branch
+            @return parsimony score of the tree
+     */
+    virtual int computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
+    int computeParsimonyBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
+    int computeParsimonyBranchFast(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
+    template<class VectorClass>
+    int computeParsimonyBranchFastSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst = NULL);
+
+
+    void printParsimonyStates(PhyloNeighbor *dad_branch = NULL, PhyloNode *dad = NULL);
+
+    virtual void setParsimonyKernel(LikelihoodKernel lk);
+#if defined(BINARY32) || defined(__NOAVX__)
+    virtual void setParsimonyKernelAVX() {} // empty inline stub when BINARY32 or __NOAVX__ is defined
+#else
+    virtual void setParsimonyKernelAVX(); // declaration only; the definition is not part of this header chunk
+#endif
+    /**
+            SLOW VERSION: compute the parsimony score of the tree, given the alignment
+            @return the parsimony score
+     */
+    int computeParsimonyScore();
+
+
+    /**
+            SLOW VERSION: compute the parsimony score of the tree, given the alignment
+            @return the parsimony score
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @param ptn pattern ID
+            @param states set of admissible states at the current node (in binary code)
+     */
+    int computeParsimonyScore(int ptn, int &states, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+
+    /****************************************************************************
+            likelihood function
+     ****************************************************************************/
+
+    /**
+            initialize partial_lh vector of all PhyloNeighbors, allocating central_partial_lh
+     */
+    virtual void initializeAllPartialLh();
+
+    /**
+            de-allocate central_partial_lh
+     */
+    virtual void deleteAllPartialLh();
+
+    /**
+            initialize partial_lh vector of all PhyloNeighbors, allocating central_partial_lh
+            @param index the index (by reference); indexlh is a second by-reference int - TODO(review): document its role
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+     */
+    virtual void initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+
+    /**
+            clear all partial likelihood for a clean computation again
+            @param make_null true to make all partial_lh become NULL
+     */
+    virtual void clearAllPartialLH(bool make_null = false);
+
+    /**
+     * compute all partial likelihoods if not computed before
+     */
+    void computeAllPartialLh(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+     * compute all partial parsimony vector if not computed before
+     */
+    void computeAllPartialPars(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            allocate memory for a partial likelihood vector
+     */
+    double *newPartialLh();
+
+    /** get the number of bytes occupied by partial_lh */
+    int getPartialLhBytes();
+
+    /**
+            allocate memory for a scale num vector
+     */
+    UBYTE *newScaleNum();
+
+    /** get the number of bytes occupied by scale_num */
+    int getScaleNumBytes();
+
+    /**
+     * this stores partial_lh for each state at the leaves of the tree because they are the same between leaves
+     * e.g. (1,0,0,0) for A,  (0,0,0,1) for T
+     */
+    double *tip_partial_lh;
+    bool tip_partial_lh_computed;
+
+    bool ptn_freq_computed;
+
+    /****************************************************************************
+            computing partial (conditional) likelihood of subtrees
+     ****************************************************************************/
+
+    void computeTipPartialLikelihood();
+    void computePtnInvar();
+    void computePtnFreq();
+
+    /**
+            compute the partial likelihood at a subtree
+            @param dad_branch the branch leading to the subtree
+            @param dad its dad, used to direct the traversal
+     */
+    virtual void computePartialLikelihood(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+    typedef void (PhyloTree::*ComputePartialLikelihoodType)(PhyloNeighbor *, PhyloNode *);
+    ComputePartialLikelihoodType computePartialLikelihoodPointer;
+
+    /**
+     * original naive version in IQ-TREE
+     */
+    void computePartialLikelihoodNaive(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
+    /**
+     * this implements the SSE version using Eigen library
+     */
+    template<int NSTATES>
+    void computePartialLikelihoodSSE(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
+    //template <const int nstates>
+    void computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
+    //template <const int nstates>
+    void computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
+    //template <const int nstates>
+    void computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    void computePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    void computeMixratePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    void computeMixturePartialLikelihoodEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad = NULL);
+
+    /****************************************************************************
+            computing likelihood on a branch
+     ****************************************************************************/
+
+    /**
+            compute tree likelihood on a branch. used to optimize branch length
+            @param dad_branch the branch leading to the subtree
+            @param dad its dad, used to direct the traversal
+            @return tree likelihood
+     */
+    virtual double computeLikelihoodBranch(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    typedef double (PhyloTree::*ComputeLikelihoodBranchType)(PhyloNeighbor*, PhyloNode*);
+    ComputeLikelihoodBranchType computeLikelihoodBranchPointer;
+
+    /**
+     * this implements the SSE version using Eigen library
+     */
+    template<int NSTATES>
+    double computeLikelihoodBranchSSE(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    /**
+     * MINH: this implements the fast alternative strategy for reversible model (March 2013)
+     * where partial likelihoods at nodes store real partial likelihoods times eigenvectors
+     */
+//    template<int NSTATES>
+//    inline double computeLikelihoodBranchFast(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    //template <const int nstates>
+    double computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    //template <const int nstates>
+    double computeMixtureLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    //template <const int nstates>
+    double computeMixrateLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    double computeLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    double computeMixrateLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    double computeMixtureLikelihoodBranchEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    double computeLikelihoodBranchNaive(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    /****************************************************************************
+            computing likelihood on a branch using buffer
+     ****************************************************************************/
+
+    /**
+            quickly compute tree likelihood on branch current_it <-> current_it_back given buffer (theta_all).
+            Used after optimizing branch length.
+            Takes no arguments: the branch and the buffer named above are not passed in,
+            and there is no pattern_lh output parameter in this signature.
+            @return tree likelihood
+     */
+    virtual double computeLikelihoodFromBuffer();
+    typedef double (PhyloTree::*ComputeLikelihoodFromBufferType)();
+    ComputeLikelihoodFromBufferType computeLikelihoodFromBufferPointer;
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    double computeLikelihoodFromBufferEigenSIMD();
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    double computeMixrateLikelihoodFromBufferEigenSIMD();
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    double computeMixtureLikelihoodFromBufferEigenSIMD();
+
+    /**
+            compute tree likelihood when a branch length collapses to zero
+            @param dad_branch the branch leading to the subtree
+            @param dad its dad, used to direct the traversal
+            @return tree likelihood
+     */
+    virtual double computeLikelihoodZeroBranch(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    /**
+        compute likelihood of rooted tree with virtual root (FOR TINA)
+        @param dad_branch the branch leading to the subtree
+        @param dad its dad, used to direct the traversal
+        @return tree likelihood
+     */
+    virtual double computeLikelihoodRooted(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    /**
+            compute the tree likelihood
+            @param pattern_lh (OUT) if not NULL, the function will assign pattern log-likelihoods to this vector
+                            assuming pattern_lh has the size of the number of patterns
+            @return tree likelihood
+     */
+    virtual double computeLikelihood(double *pattern_lh = NULL);
+
+    /**
+            compute pattern likelihoods only if the accumulated scaling factor is non-zero.
+            Otherwise, copy the pattern_lh attribute
+            @param pattern_lh (OUT) pattern log-likelihoods,
+                            assuming pattern_lh has the size of the number of patterns
+            @param cur_logl current log-likelihood (for sanity check)
+            @param pattern_lh_cat (OUT) if not NULL, store all pattern-likelihood per category
+     */
+    virtual void computePatternLikelihood(double *pattern_lh, double *cur_logl = NULL,
+    		double *pattern_lh_cat = NULL);
+
+    vector<uint64_t> ptn_cat_mask;
+
+    /**
+        compute categories for each pattern, update ptn_cat_mask
+        @return max number of categories necessary
+    */
+    virtual int computePatternCategories(IntVector *pattern_ncat = NULL);
+
+    /**
+            Compute the variance in tree log-likelihood
+            (Kishino & Hasegawa 1989, JME 29:170-179)
+            @param pattern_lh pattern log-likelihoods, will be computed if NULL
+            @param tree_lh tree log-likelihood, will be computed if ZERO
+     */
+    double computeLogLVariance(double *pattern_lh = NULL, double tree_lh = 0.0);
+
+    /**
+            Compute the variance in log-likelihood difference
+            between the current tree and another tree.
+            (Kishino & Hasegawa 1989, JME 29:170-179)
+            @param pattern_lh_other pattern log-likelihoods of the other tree
+            @param pattern_lh pattern log-likelihoods of current tree, will be computed if NULL
+     */
+    double computeLogLDiffVariance(double *pattern_lh_other, double *pattern_lh = NULL);
+
+    /**
+     *  \brief Estimate the observed branch length between \a dad_branch and \a dad analytically.
+     *	The ancestral states of the 2 nodes are first computed (Yang, 2006).
+     *	Branch lengths are then computed using an analytical formula.
+     *	@param[in] dad_branch must be an internal node
+     *	@param[in] dad must be an internal node
+     *	@return estimated branch length or -1.0 if one of the 2 nodes is leaf
+     */
+    double computeBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    /**
+     * \brief Approximate the branch length between \a dad_branch and \a dad using Least Squares instead
+     * of Newton-Raphson
+     * @param[in] dad_branch
+     * @param[in] dad
+     * @return approximated branch length
+     */
+    double computeLeastSquareBranLen(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    /**
+     * Update all subtree distances that are affected by doing an NNI on branch (node1-node2)
+     * @param nni NNI move that is carried out
+     */
+    void updateSubtreeDists(NNIMove &nni);
+
+    /**
+     * Compute all pairwise distance of subtree rooted at \a source and other subtrees
+     */
+    void computeSubtreeDists();
+
+    void getUnmarkedNodes(PhyloNodeVector& unmarkedNodes, PhyloNode* node = NULL, PhyloNode* dad = NULL);
+
+    void computeAllSubtreeDistForOneNode(PhyloNode* source, PhyloNode* nei1, PhyloNode* nei2, PhyloNode* node, PhyloNode* dad);
+
+    double correctBranchLengthF81(double observedBran, double alpha = -1.0);
+
+    double computeCorrectedBayesianBranchLength(PhyloNeighbor *dad_branch, PhyloNode *dad);
+
+    /**
+            Compute the variance in log-likelihood difference
+            between the current tree and another tree.
+            (Kishino & Hasegawa 1989, JME 29:170-179)
+            @param other_tree the other tree to compare
+            @param pattern_lh pattern log-likelihoods of current tree, will be computed if NULL
+     */
+    double computeLogLDiffVariance(PhyloTree *other_tree, double *pattern_lh = NULL);
+
+    /**
+            Roll back the tree saved with only Taxon IDs and branch lengths.
+            For this function to work, one must printTree before with WT_TAXON_ID + WT_BR_LEN
+            @param best_tree_string input stream to read from
+     */
+    void rollBack(istream &best_tree_string);
+
+    /**
+            Read the tree saved with Taxon Names and branch lengths.
+            The tree string is the only argument of this function.
+            @param tree_string tree string to read from
+     */
+    virtual void readTreeString(const string &tree_string);
+
+    /**
+            Read the tree saved with Taxon Names and branch lengths.
+            @param file_name name of the file to read from
+     */
+    void readTreeFile(const string &file_name);
+
+    /**
+     * Return the tree string containing taxon names and branch lengths
+     * @return the tree string
+     */
+    virtual string getTreeString();
+
+    /**
+     * Assign branch lengths for branch that has no or negative length
+     * With single model branch lengths are assigned using parsimony. With partition model
+     * branch lengths are assigned randomly
+     * @param force_change if true then force fixing also positive branch lengths
+     * @return number of branches fixed
+     */
+    int wrapperFixNegativeBranch(bool force_change);
+
+    /**
+     * Read the newick string into PLL kernel
+     * @param newickTree
+     */
+    void pllReadNewick(string newickTree);
+
+    /**
+     *  Return the sorted topology without branch length, used to compare tree topology
+     */
+    string getTopology();
+
+
+    bool checkEqualScalingFactor(double &sum_scaling, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /****************************************************************************
+            computing derivatives of likelihood function
+     ****************************************************************************/
+
+    void computeLikelihoodDervNaive(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    /**
+     * this implements the SSE version using Eigen library
+     */
+    template<int NSTATES>
+    void computeLikelihoodDervSSE(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    //template <const int nstates>
+    void computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    //template <const int nstates>
+    void computeMixtureLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    //template <const int nstates>
+    void computeMixrateLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    void computeLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    void computeMixrateLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    template <class VectorClass, const int VCSIZE, const int nstates>
+    void computeMixtureLikelihoodDervEigenSIMD(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    /**
+            compute tree likelihood and derivatives on a branch. used to optimize branch length
+            @param dad_branch the branch leading to the subtree
+            @param dad its dad, used to direct the traversal
+            @param df (OUT) first derivative
+            @param ddf (OUT) second derivative
+            Declared void: nothing is returned, results come back only through df and ddf
+     */
+    void computeLikelihoodDerv(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf);
+
+    typedef void (PhyloTree::*ComputeLikelihoodDervType)(PhyloNeighbor *, PhyloNode *, double &, double &);
+    ComputeLikelihoodDervType computeLikelihoodDervPointer;
+
+    /****************************************************************************
+            Stepwise addition (greedy) by maximum parsimony
+     ****************************************************************************/
+
+    /**
+            FAST VERSION: used internally by computeParsimonyTree() to find the best target branch to add into the tree
+            @param added_taxon NOTE(review): undocumented so far; presumably the taxon (leaf) being added - confirm
+            @param added_node node to add
+            NOTE(review): how the best branch is reported is not visible in this signature
+            (it has no (OUT) target_node / target_dad / target_partial_pars parameters)
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @return the parsimony score of the tree
+     */
+    int addTaxonMPFast(Node *added_taxon, Node *added_node, Node *node, Node *dad);
+
+
+    /**
+     * FAST VERSION: compute parsimony tree by step-wise addition
+     * @param out_prefix prefix for .parstree file
+     * @param alignment input alignment
+     * @return parsimony score
+     */
+    int computeParsimonyTree(const char *out_prefix, Alignment *alignment);
+
+    /**
+            SLOW VERSION: grow the tree by step-wise addition
+            @param alignment input alignment
+     */
+    void growTreeMP(Alignment *alignment);
+
+    /**
+            used internally by growTreeMP() to find the best target branch to add into the tree
+            @param added_node node to add
+            @param target_node (OUT) one end of the best branch found
+            @param target_dad (OUT) the other end of the best branch found
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @return the parsimony score of the tree
+     */
+    int addTaxonMP(Node *added_node, Node* &target_node, Node* &target_dad, Node *node, Node *dad);
+
+
+    /****************************************************************************
+            Nearest Neighbor Interchange with parsimony
+     ****************************************************************************/
+    /**
+            search by a nearest neighbor interchange with parsimony
+     */
+    void searchNNI();
+
+    /**
+            search by a nearest neighbor interchange with parsimony
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @param cur_score current score
+            @return best score
+     */
+    double searchNNI(double cur_score, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            try to swap the tree with nearest neighbor interchange at the branch connecting node1-node2.
+            If a swap shows better score, return the swapped tree and the score.
+            @param cur_score current score
+            @param node1 1st end node of the branch
+            @param node2 2nd end node of the branch
+            @return best score
+     */
+    double swapNNI(double cur_score, PhyloNode *node1, PhyloNode *node2);
+
+    /****************************************************************************
+            Branch length optimization by maximum likelihood
+     ****************************************************************************/
+
+    /**
+            optimize one branch length by ML
+            IMPORTANT: semantic change: this function does not return the score anymore, for efficiency purposes
+            @param node1 1st end node of the branch
+            @param node2 2nd end node of the branch
+            @param clearLH true to clear the partial likelihood, otherwise false
+            @param maxNRStep maximum number of Newton-Raphson steps
+            Declared void: no likelihood score is returned
+     */
+    virtual void optimizeOneBranch(PhyloNode *node1, PhyloNode *node2, bool clearLH = true, int maxNRStep = 100);
+
+    /**
+            optimize all branch lengths of the children of node
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @return the likelihood of the tree
+     */
+    double optimizeChildBranches(PhyloNode *node, PhyloNode *dad = NULL);
+
+    /**
+            optimize all branch lengths at the subtree rooted at node step-by-step.
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @param maxNRStep presumably the maximum number of Newton-Raphson steps, as for optimizeOneBranch (default 100). Declared void: nothing is returned
+     */
+    virtual void optimizeAllBranches(PhyloNode *node, PhyloNode *dad = NULL, int maxNRStep = 100);
+
+    /**
+     * optimize all branch lengths at the subtree rooted at node step-by-step.
+     * Using Least Squares instead of Newton Raphson.
+     * @param node the current node
+     * @param dad dad of the node, used to direct the search
+     */
+    void optimizeAllBranchesLS(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            optimize all branch lengths of the tree (tolerance defaults to TOL_LIKELIHOOD, maxNRStep to 100)
+            @param my_iterations number of iterations to loop through all branches
+            @return the likelihood of the tree
+     */
+    virtual double optimizeAllBranches(int my_iterations = 100, double tolerance = TOL_LIKELIHOOD, int maxNRStep = 100);
+
+    /**
+            inherited from Optimization class, to return the likelihood of the tree
+            when the current branch length is set to value
+            @param value current branch length
+            @return negative of likelihood (for minimization)
+     */
+    virtual double computeFunction(double value);
+
+    /**
+            Inherited from Optimization class.
+            This function calculates f(value), first derivative f'(value) and 2nd derivative f''(value).
+            Used by the Newton-Raphson method to minimize the function.
+            @param value current branch length
+            @param df (OUT) first derivative
+            @param ddf (OUT) second derivative
+            Declared void: f(value) itself (negative of likelihood, for minimization) is not returned, only df and ddf are
+     */
+    virtual void computeFuncDerv(double value, double &df, double &ddf);
+
+    /**
+        optimize the scaling factor for tree length, given all branch lengths fixed
+        @param scaling (IN/OUT) start value of scaling factor, and as output the optimal value
+        @param gradient_epsilon gradient epsilon
+        @return optimal tree log-likelihood
+    */
+    double optimizeTreeLengthScaling(double &scaling, double gradient_epsilon);
+
+
+     /****************************************************************************
+            Branch length optimization by Least Squares
+     ****************************************************************************/
+
+    /**
+     * Estimate the current branch using least squares
+     * @param node1 first node of the branch
+     * @param node2 second node of the branch
+     * @return a double - presumably the least-squares branch length estimate; TODO(review): confirm
+     */
+    double optimizeOneBranchLS(PhyloNode *node1, PhyloNode *node2);
+
+    /****************************************************************************
+            Auxiliary functions and variables for speeding up branch length optimization (RAxML Trick)
+     ****************************************************************************/
+
+    bool theta_computed;
+
+    /**
+     *	NSTATES x NUMCAT x (number of patterns) array
+     *	Used to store precomputed values when optimizing branch length
+     *	See Tung's report on 07.05.2012 for more information
+     */
+    double* theta_all;
+
+
+    /**
+     * frequencies of alignment patterns, used as buffer for likelihood computation
+     */
+    double *ptn_freq;
+
+    /**
+     * used as buffer for faster likelihood computation
+     * for const pattern: it stores product of p_invar and state frequency
+     * for other pattern: zero
+     */
+    double *ptn_invar;
+
+    /****************************************************************************
+            Nearest Neighbor Interchange by maximum likelihood
+     ****************************************************************************/
+
+    /**
+            search by a nearest neighbor interchange, then optimize branch lengths. Do it
+            until tree does not improve
+            @return the likelihood of the tree
+     */
+    double optimizeNNIBranches();
+
+    /**
+            search by a nearest neighbor interchange
+            @return the likelihood of the tree
+     */
+    double optimizeNNI();
+
+    /**
+            search by a nearest neighbor interchange
+            @param cur_score current likelihood score
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @return the likelihood of the tree
+     */
+    double optimizeNNI(double cur_score, PhyloNode *node = NULL, PhyloNode *dad = NULL
+            /*,ostream *out = NULL, int brtype = 0, ostream *out_lh = NULL, ostream *site_lh = NULL,
+    StringIntMap *treels = NULL, vector<double*> *treels_ptnlh = NULL, DoubleVector *treels_logl = NULL,
+    int *max_trees = NULL, double *logl_cutoff = NULL*/
+            );
+
+
+    /**
+       search for the best NNI move corresponding to this branch
+       @return NNIMove the best NNI, this NNI could be worse than the current tree
+       according to the evaluation scheme in use
+       @param node1 1 of the 2 nodes on the branch
+       @param node2 1 of the 2 nodes on the branch
+       @param nniMoves (IN/OUT) detailed information of the 2 NNIs, set .ptnlh to compute pattern likelihoods
+     */
+    virtual NNIMove getBestNNIForBran(PhyloNode *node1, PhyloNode *node2, NNIMove *nniMoves = NULL);
+
+    /**
+            Do an NNI
+            @param move reference to an NNI move object containing information about the move
+            @param clearLH decides whether or not the partial likelihood should be cleared
+     */
+    virtual void doNNI(NNIMove &move, bool clearLH = true);
+
+    /**
+     * Randomly choose and perform one of the two NNIs defined by branch node1-node2.
+     * This function also clears the corresponding partial likelihood vectors
+     * @param node1 one node of the branch
+     * @param node2 one node of the branch
+     */
+    void doOneRandomNNI(Node *node1, Node *node2);
+
+
+    /**
+     *   Apply 5 new branch lengths stored in the NNI move
+     *   @param nnimove the NNI move currently in consideration
+     */
+    virtual void changeNNIBrans(NNIMove nnimove);
+
+    /****************************************************************************
+            Stepwise addition (greedy) by maximum likelihood
+     ****************************************************************************/
+
+    /**
+            grow the tree by step-wise addition
+            @param alignment input alignment
+     */
+    void growTreeML(Alignment *alignment);
+
+    /**
+            used internally by growTreeML() to find the best target branch to add into the tree
+            @param added_node node to add
+            @param target_node (OUT) one end of the best branch found
+            @param target_dad (OUT) the other end of the best branch found
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @return the likelihood of the tree
+     */
+    double addTaxonML(Node *added_node, Node* &target_node, Node* &target_dad, Node *node, Node *dad);
+
+    /****************************************************************************
+            Distance function
+     ****************************************************************************/
+
+    /**
+            compute the distance between 2 sequences.
+            @param seq1 index of sequence 1
+            @param seq2 index of sequence 2
+            @param initial_dist initial distance
+            @param var (OUT) variance of distance between seq1 and seq2
+            @return distance between seq1 and seq2
+     */
+
+    virtual double computeDist(int seq1, int seq2, double initial_dist, double &var);
+
+    virtual double computeDist(int seq1, int seq2, double initial_dist);
+
+    /**
+            compute distance and variance matrix, assume dist_mat and var_mat are allocated by memory of size num_seqs * num_seqs.
+            @param dist_mat (OUT) distance matrix between all pairs of sequences in the alignment
+            @param var_mat (OUT) variance matrix for distance matrix
+            @return the longest distance
+     */
+    double computeDist(double *dist_mat, double *var_mat);
+
+    /**
+            compute observed distance matrix, assume dist_mat is allocated by memory of size num_seqs * num_seqs.
+            @param dist_mat (OUT) distance matrix between all pairs of sequences in the alignment
+            @return the longest distance
+     */
+    double computeObsDist(double *dist_mat);
+
+    /**
+            compute distance matrix, allocating memory if necessary
+            @param params program parameters
+            @param alignment input alignment
+            @param dist_mat (OUT) distance matrix between all pairs of sequences in the alignment
+            @param dist_file (OUT) name of the distance file
+            @return the longest distance
+     */
+    double computeDist(Params &params, Alignment *alignment, double* &dist_mat, double* &var_mat, string &dist_file);
+
+    /**
+            compute observed distance matrix, allocating memory if necessary
+            @param params program parameters
+            @param alignment input alignment
+            @param dist_mat (OUT) distance matrix between all pairs of sequences in the alignment
+            @param dist_file (OUT) name of the distance file
+            @return the longest distance
+     */
+    double computeObsDist(Params &params, Alignment *alignment, double* &dist_mat, string &dist_file);
+
+    /**
+            correct the distances to follow metric property of triangle inequalities.
+            Using the Floyd algorithm.
+            @param dist_mat (IN/OUT) the shortest path between all pairs of taxa
+    @return the longest distance
+     */
+    double correctDist(double *dist_mat);
+
+    /****************************************************************************
+            compute BioNJ tree, a more accurate extension of Neighbor-Joining
+     ****************************************************************************/
+
+    /**
+            compute BioNJ tree
+            @param params program parameters
+            @param alignment input alignment
+            @param dist_file distance matrix file
+     */
+    void computeBioNJ(Params &params, Alignment *alignment, string &dist_file);
+    /**
+            Neighbor-joining/parsimony tree might contain negative branch length. This
+            function will fix this.
+            @param fixed_length fixed branch length to set to negative branch lengths
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @return The number of branches that have no/negative length
+     */
+    virtual int fixNegativeBranch(bool force = false, Node *node = NULL, Node *dad = NULL);
+
+    // OBSOLETE: assignRandomBranchLengths no longer needed, use fixNegativeBranch instead!
+//    int assignRandomBranchLengths(bool force = false, Node *node = NULL, Node *dad = NULL);
+
+    /* compute Bayesian branch lengths based on ancestral sequence reconstruction */
+    void computeAllBayesianBranchLengths(Node *node = NULL, Node *dad = NULL);
+
+    /**
+        generate random tree
+    */
+    void generateRandomTree(TreeGenType tree_type);
+
+    /****************************************************************************
+            Subtree Pruning and Regrafting by maximum likelihood
+            NOTE: NOT DONE YET
+     ****************************************************************************/
+
+    /**
+            search by Subtree pruning and regrafting
+            @return the likelihood of the tree
+     */
+    double optimizeSPR();
+
+    /**
+            search by Subtree pruning and regrafting, then optimize branch lengths. Iterative until
+            no tree improvement found.
+            @return the likelihood of the tree
+     */
+    double optimizeSPRBranches();
+
+    /**
+            search by Subtree pruning and regrafting at a current subtree
+            @param cur_score current likelihood score
+            @param node the current node
+            @param dad dad of the node, used to direct the search
+            @return the likelihood of the tree
+     */
+    double optimizeSPR(double cur_score, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+     *  original implementation by Minh
+     */
+    double optimizeSPR_old(double cur_score, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+     *  original implementation by Minh
+     */
+    double swapSPR_old(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1,
+            PhyloNode *orig_node1, PhyloNode *orig_node2,
+            PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path);
+
+    /**
+            move the subtree (dad1-node1) to the branch (dad2-node2)
+     */
+    double swapSPR(double cur_score, int cur_depth, PhyloNode *node1, PhyloNode *dad1,
+            PhyloNode *orig_node1, PhyloNode *orig_node2,
+            PhyloNode *node2, PhyloNode *dad2, vector<PhyloNeighbor*> &spr_path);
+
+    double assessSPRMove(double cur_score, const SPRMove &spr);
+
+    void pruneSubtree(PhyloNode *node, PhyloNode *dad, PruningInfo &info);
+
+    void regraftSubtree(PruningInfo &info,
+            PhyloNode *in_node, PhyloNode *in_dad);
+
+    /****************************************************************************
+            Approximate Likelihood Ratio Test with SH-like interpretation
+     ****************************************************************************/
+
+    void computeNNIPatternLh(double cur_lh,
+            double &lh2, double *pattern_lh2,
+            double &lh3, double *pattern_lh3,
+            PhyloNode *node1, PhyloNode *node2);
+
+    /**
+            Resampling estimated log-likelihood (RELL)
+     */
+    void resampleLh(double **pat_lh, double *lh_new);
+
+    /**
+            Test one branch of the tree with aLRT SH-like interpretation
+     */
+    double testOneBranch(
+            double best_score, double *pattern_lh, int reps, int lbp_reps,
+            PhyloNode *node1, PhyloNode *node2, double &lbp_support);
+
+    /**
+            Test all branches of the tree with aLRT SH-like interpretation
+     */
+    int testAllBranches(int threshold,
+            double best_score, double *pattern_lh, int reps, int lbp_reps,
+            PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /****************************************************************************
+            Collapse stable (highly supported) clades by one representative
+     ****************************************************************************/
+
+    /**
+            delete a leaf from the tree, assume tree is bifurcating
+            @param leaf the leaf node to remove
+     */
+    void deleteLeaf(Node *leaf);
+
+    /**
+            reinsert one leaf back into the tree
+            @param leaf the leaf to reinsert
+            @param adjacent_node the node adjacent to the leaf, returned by deleteLeaves() function
+            @param node one end node of the reinsertion branch in the existing tree
+            @param dad the other node of the reinsertion branch in the existing tree
+     */
+    void reinsertLeaf(Node *leaf, Node *node, Node *dad);
+
+    bool isSupportedNode(PhyloNode* node, int min_support);
+
+    /**
+            Collapse stable (highly supported) clades by one representative
+            @return the number of taxa pruned
+     */
+    int collapseStableClade(int min_support, NodeVector &pruned_taxa, StrVector &linked_name, double* &dist_mat);
+
+    int restoreStableClade(Alignment *original_aln, NodeVector &pruned_taxa, StrVector &linked_name);
+
+    /**
+            randomize the neighbor orders of all nodes
+     */
+    void randomizeNeighbors(Node *node = NULL, Node *dad = NULL);
+
+    virtual void changeLikelihoodKernel(LikelihoodKernel lk);
+
+    virtual void setLikelihoodKernel(LikelihoodKernel lk);
+
+#if defined(BINARY32) || defined(__NOAVX__)
+    virtual void setLikelihoodKernelAVX() {} // empty inline stub when BINARY32 or __NOAVX__ is defined
+#else
+    virtual void setLikelihoodKernelAVX(); // declaration only; the body is defined out of line
+#endif
+    /****************************************************************************
+            Public variables
+     ****************************************************************************/
+
+    /**
+            associated alignment
+     */
+    Alignment *aln;
+
+    /**
+     * Distance matrix
+     */
+    double *dist_matrix;
+
+    /**
+     * Variance matrix
+     */
+    double *var_matrix;
+
+    /**
+            TRUE if you want to optimize branch lengths by Newton-Raphson method
+     */
+    bool optimize_by_newton;
+
+    /**
+     *      TRUE if the loglikelihood is computed using SSE
+     */
+    LikelihoodKernel sse;
+
+    /**
+     * for UpperBounds: Initial tree log-likelihood
+     */
+    double mlInitial;
+
+    /**
+     * for UpperBounds: Log-likelihood after optimization of model parameters in the beginning of tree search
+     */
+    double mlFirstOpt;
+
+    /**
+    * for Upper Bounds: how many NNIs have UB < L curScore, that is NNIs for which we don't need to compute likelihood
+    */
+	int skippedNNIub;
+
+	/**
+	* for Upper Bounds: how many NNIs were considered in total
+	*/
+	int totalNNIub;
+
+    /**
+     * for Upper Bounds: min, mean and max UB encountered during the tree search, such that UB < L curScore
+     */
+
+    //double minUB, meanUB, maxUB;
+
+    /*
+     * for UpperBounds: mlCheck = 1, if previous two values were already saved.
+     * Needed, because parameter optimization is done twice before and after tree search
+     */
+
+    int mlCheck;
+
+    /*
+     * for Upper Bounds: min base frequency
+     */
+
+	double minStateFreq;
+
+    /*
+     * 		Stores all the parameters for the program
+     */
+    Params* params;
+
+    /** sequence names that were removed */
+	StrVector removed_seqs;
+
+	/** sequences that are identical to one of the removed sequences */
+	StrVector twin_seqs;
+
+	/** remove identical sequences from the tree */
+    virtual void removeIdenticalSeqs(Params &params);
+
+    /** reinsert identical sequences into the tree and reset original alignment */
+    virtual void reinsertIdenticalSeqs(Alignment *orig_aln);
+
+
+    /**
+            assign the leaf names with the alignment sequence names, using the leaf ID for assignment.
+            @param node the starting node, NULL to start from the root
+            @param dad dad of the node, used to direct the search
+     */
+    void assignLeafNames(Node *node = NULL, Node *dad = NULL);
+
+    /**
+     * initialize partition information for super tree
+     */
+    virtual void initPartitionInfo() {
+    } // empty body in this class; declared virtual, so it can be overridden
+
+    /**
+     * print transition matrix for all branches
+     *
+     */
+    void printTransMatrices(Node *node = NULL, Node *dad = NULL);
+
+    /**
+     * compute the memory size required for storing partial likelihood vectors
+     * @return memory size required in bytes
+     */
+    virtual uint64_t getMemoryRequired(size_t ncategory = 1);
+
+    void getMemoryRequired(uint64_t &partial_lh_entries, uint64_t &scale_num_entries, uint64_t &partial_pars_entries);
+
+    /****** following variables are for ultra-fast bootstrap *******/
+    /** 2 to save all trees, 1 to save intermediate trees */
+    int save_all_trees;
+
+    set<int> computeNodeBranchDists(Node *node = NULL, Node *dad = NULL);
+
+    /*
+     * Manuel's approach for analytic approximation of branch length given initial guess
+        b0: initial guess for the maximum
+        @return approximated branch length
+    */
+    double approxOneBranch(PhyloNode *node, PhyloNode *dad, double b0);
+
+    void approxAllBranches(PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /** set pointer of params variable */
+	virtual void setParams(Params* params);
+
+	double getCurScore() {
+		return this->curScore; // the stored current score of the tree
+	}
+
+	void setCurScore(double curScore) { // overwrite the stored tree score
+		this->curScore = curScore; // this-> is required: the parameter shadows the member
+	}
+
+	/**
+	 * This will invalidate curScore variable, used whenever reading a tree!
+	 */
+	void resetCurScore(double score = 0.0) {
+		// a score of 0.0 (the default) means "no score supplied": the
+		// current score is then set to -DBL_MAX instead
+		curScore = (score == 0.0) ? -DBL_MAX : score;
+		// initializeAllPartialLh() is only called when a model is set
+		if (model != NULL) {
+			initializeAllPartialLh();
+		}
+	}
+
+    void computeSeqIdentityAlongTree(Split &resp, Node *node = NULL, Node *dad = NULL);
+    void computeSeqIdentityAlongTree();
+
+protected:
+
+    /**
+     *  Instance of the phylogenetic likelihood library. This is basically the tree data structure in RAxML
+     */
+    pllInstance *pllInst;
+
+    /**
+     *  Whether the partial likelihood vectors have been computed for PLL
+     */
+//    bool lhComputed;
+
+    /**
+     *	PLL data structure for alignment
+     */
+    pllAlignmentData *pllAlignment;
+
+    /**
+     *  PLL data structure for storing phylogenetic analysis options
+     */
+    pllInstanceAttr pllAttr;
+
+    /**
+     *  PLL partition list
+     */
+    partitionList * pllPartitions;
+
+    /**
+     *  is the subtree distance matrix need to be computed or updated
+     */
+    bool subTreeDistComputed;
+
+    /**
+     * Map data structure to store distance between subtree.
+     * The key is a string which is constructed by concatenating IDs of
+     * the 2 nodes, e.g. 15-16
+     */
+    StringDoubleMap subTreeDists;
+
+    StringDoubleMap subTreeWeights;
+
+    /** distance (# of branches) between 2 nodes */
+    int *nodeBranchDists;
+
+    /**
+     * A list containing all the marked nodes. This is used in the dynamic programming
+     * algorithm for computing inter-subtree distances
+     */
+    IntPhyloNodeMap markedNodeList;
+
+    /** converted root state, for Tina's zoombie domain */
+    char root_state;
+
+    /**
+            internal pattern log-likelihoods, always stored after calling computeLikelihood()
+            or related functions. Note that scaling factors are not incorporated here.
+            If you want to get real pattern log-likelihoods, please use computePatternLikelihood()
+     */
+    double *_pattern_lh;
+
+    /**
+            internal pattern likelihoods per category, always stored after calling computeLikelihood()
+            or related functions. Note that scaling factors are not incorporated here.
+            If you want to get real pattern likelihoods, please use computePatternLikelihood()
+     */
+    double *_pattern_lh_cat;
+
+    /**
+            associated substitution model
+     */
+    ModelSubst *model;
+
+    /**
+            Model factory includes SubstModel and RateHeterogeneity
+            stores transition matrices computed before for efficiency purpose, esp. AA or CODON model.
+     */
+    ModelFactory *model_factory;
+
+    /**
+            among-site rates
+     */
+    RateHeterogeneity *site_rate;
+
+    /**
+            current branch iterator, used by computeFunction() to optimize branch lengths
+            and by computePatternLikelihood() to compute all pattern likelihoods
+     */
+    PhyloNeighbor *current_it;
+    /**
+            current branch iterator of the other end, used by computeFunction() to optimize branch lengths
+            and by computePatternLikelihood() to compute all pattern likelihoods
+     */
+    PhyloNeighbor *current_it_back;
+
+    bool is_opt_scaling;
+
+    /** current scaling factor for optimizeTreeLengthScaling() */
+    double current_scaling;
+
+    /**
+            spr moves
+     */
+    SPRMoves spr_moves;
+
+    /**
+            SPR radius
+     */
+    int spr_radius;
+
+
+    /**
+            the main memory storing all partial likelihoods for all neighbors of the tree.
+            The variable partial_lh in PhyloNeighbor will be assigned to a region inside this variable.
+     */
+    double *central_partial_lh;
+    double *nni_partial_lh; // used for NNI functions
+
+    /**
+            the main memory storing all scaling event numbers for all neighbors of the tree.
+            The variable scale_num in PhyloNeighbor will be assigned to a region inside this variable.
+     */
+    UBYTE *central_scale_num;
+    UBYTE *nni_scale_num; // used for NNI functions
+
+    /**
+            the main memory storing all partial parsimony states for all neighbors of the tree.
+            The variable partial_pars in PhyloNeighbor will be assigned to a region inside this variable.
+     */
+    UINT *central_partial_pars;
+
+    /**
+            TRUE to discard saturated sites for Meyer & von Haeseler (2003) model
+     */
+    bool discard_saturated_site;
+
+    /**
+     * Temporary partial likelihood array: used when swapping branch and recalculate the
+     * likelihood --> avoid calling malloc every time
+     */
+//    double *tmp_partial_lh1;
+//    double *tmp_partial_lh2;
+
+    /**
+     *  Temporary array containing ancestral states.
+     *  Used to avoid calling malloc
+     */
+
+//    double *tmp_anscentral_state_prob1;
+//    double *tmp_anscentral_state_prob2;
+    /** pattern-specific rates */
+    //double *tmp_ptn_rates;
+
+    /**
+     * Temporary scale num array: used when swapping branch and recalculate the
+     * likelihood --> avoid calling malloc
+     */
+//    UBYTE *tmp_scale_num1;
+//    UBYTE *tmp_scale_num2;
+
+    /****************************************************************************
+            Vector of bit blocks, used for parsimony function
+     ****************************************************************************/
+
+    /**
+            @return size of the bits block vector for one node
+     */
+    size_t getBitsBlockSize();
+
+    /**
+            allocate new memory for a bit block vector
+            @return the allocated memory
+     */
+    UINT *newBitsBlock();
+
+    /**
+            @return size of the bits entry (for storing num_states bits)
+     */
+    int getBitsEntrySize();
+
+    /**
+            @param bits_entry
+            @return TRUE if bits_entry contains all 0s, FALSE otherwise
+     */
+    bool isEmptyBitsEntry(UINT *bits_entry);
+
+    /**
+            @param bits_entry1
+            @param bits_entry2
+            @param bits_union (OUT) union of bits_entry1 and bits_entry2
+     */
+    void unionBitsEntry(UINT *bits_entry1, UINT *bits_entry2, UINT* &bits_union);
+
+    /**
+            set a single bit to 1
+            @param bits_entry
+            @param id index of the bit in the entry to set to 1
+     */
+    void setBitsEntry(UINT* &bits_entry, int id);
+
+    /**
+            get a single bit content
+            @param bits_entry
+            @param id index of the bit in the entry
+            @return TRUE if bit ID is 1, FALSE otherwise
+     */
+    bool getBitsEntry(UINT* &bits_entry, int id);
+
+    /**
+            get bit blocks, each block spans num_state bits
+            @param bit_vec bit block vector
+            @param index block index
+            @param bits_entry (OUT) content of the block at index
+     */
+    void getBitsBlock(UINT *bit_vec, int index, UINT* &bits_entry);
+
+    /**
+            set bit blocks, each block spans num_state bits
+            @param bit_vec (OUT) bit block vector
+            @param index block index
+            @param bits_entry the content of the block at index
+     */
+    void setBitsBlock(UINT* &bit_vec, int index, UINT *bits_entry);
+
+    virtual void saveCurrentTree(double logl) {
+    } // save current tree; empty here (logl is unused), virtual so it can be overridden
+
+
+    /**
+     * Current score of the tree;
+     */
+    double curScore;
+    
+    /** current best parsimony score */
+    UINT best_pars_score;
+
+};
+
+#endif
diff --git a/phylotreeavx.cpp b/phylotreeavx.cpp
new file mode 100644
index 0000000..697e370
--- /dev/null
+++ b/phylotreeavx.cpp
@@ -0,0 +1,109 @@
+/*
+ * phylotreeavx.cpp
+ *
+ *  Created on: Dec 14, 2014
+ *      Author: minh
+ */
+
+
+#include "phylokernel.h"
+#include "phylokernelmixture.h"
+#include "phylokernelmixrate.h"
+#include "vectorclass/vectorclass.h"
+
+#ifndef __AVX__
+#error "You must compile this file with AVX enabled!"
+#endif
+
+void PhyloTree::setParsimonyKernelAVX() {
+	this->computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyFastSIMD<Vec8ui>; // partial-parsimony kernel, Vec8ui instantiation
+	this->computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchFastSIMD<Vec8ui>; // branch-parsimony kernel, Vec8ui instantiation
+}
+
+void PhyloTree::setDotProductAVX() {
+	// the BOOT_VAL_FLOAT macro decides between the float and the double instantiation
+#ifndef BOOT_VAL_FLOAT
+	dotProduct = &PhyloTree::dotProductSIMD<double, Vec4d, 4>;
+#else
+	dotProduct = &PhyloTree::dotProductSIMD<float, Vec8f, 8>;
+#endif
+}
+
+void PhyloTree::setLikelihoodKernelAVX() {
+	// Install the AVX (Vec4d) likelihood kernels that match the number of
+	// states of the alignment (4, 20 or 64). For each state count one of
+	// three kernel families is chosen: the plain kernels when there is no
+	// model factory or its model is not a mixture, the "Mixrate" kernels
+	// when fused_mix_rate is set, and the "Mixture" kernels otherwise.
+	// The parsimony kernels are switched to their AVX versions first.
+	setParsimonyKernelAVX();
+	switch (aln->num_states) {
+	case 4:
+		if (!model_factory || !model_factory->model->isMixture()) {
+			// "Fast-AVX": plain (non-mixture) kernels
+			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec4d, 4, 4>;
+			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec4d, 4, 4>;
+			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec4d, 4, 4>;
+			computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec4d, 4, 4>;
+		} else if (model_factory->fused_mix_rate) {
+			// "Fast-AVX-semi-mixture": Mixrate kernels (fused_mix_rate is set)
+			computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec4d, 4, 4>;
+			computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec4d, 4, 4>;
+			computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec4d, 4, 4>;
+			computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec4d, 4, 4>;
+		} else {
+			// "Fast-AVX-mixture": Mixture kernels
+			computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec4d, 4, 4>;
+			computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec4d, 4, 4>;
+			computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec4d, 4, 4>;
+			computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec4d, 4, 4>;
+		}
+		break;
+	case 20:
+		if (!model_factory || !model_factory->model->isMixture()) {
+			// "Fast-AVX": plain (non-mixture) kernels
+			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec4d, 4, 20>;
+			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec4d, 4, 20>;
+			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec4d, 4, 20>;
+			computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec4d, 4, 20>;
+		} else if (model_factory->fused_mix_rate) {
+			// "Fast-AVX-semi-mixture": Mixrate kernels (fused_mix_rate is set)
+			computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec4d, 4, 20>;
+			computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec4d, 4, 20>;
+			computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec4d, 4, 20>;
+			computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec4d, 4, 20>;
+		} else {
+			// "Fast-AVX-mixture": Mixture kernels
+			computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec4d, 4, 20>;
+			computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec4d, 4, 20>;
+			computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec4d, 4, 20>;
+			computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec4d, 4, 20>;
+		}
+		break;
+	case 64:
+		if (!model_factory || !model_factory->model->isMixture()) {
+			// "Fast-AVX": plain (non-mixture) kernels
+			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec4d, 4, 64>;
+			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec4d, 4, 64>;
+			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec4d, 4, 64>;
+			computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec4d, 4, 64>;
+		} else if (model_factory->fused_mix_rate) {
+			// "Fast-AVX-semi-mixture": Mixrate kernels (fused_mix_rate is set)
+			computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec4d, 4, 64>;
+			computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec4d, 4, 64>;
+			computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec4d, 4, 64>;
+			computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec4d, 4, 64>;
+		} else {
+			// "Fast-AVX-mixture": Mixture kernels
+			computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec4d, 4, 64>;
+			computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec4d, 4, 64>;
+			computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec4d, 4, 64>;
+			computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec4d, 4, 64>;
+		}
+		break;
+	default:
+		assert(0); // no kernel is selected here for any other number of states
+		break;
+	}
+}
+
diff --git a/phylotreeeigen.cpp b/phylotreeeigen.cpp
new file mode 100644
index 0000000..8e323a2
--- /dev/null
+++ b/phylotreeeigen.cpp
@@ -0,0 +1,11 @@
+/*
+ * phylotreeeigen.cpp
+ *
+ *  Created on: Sep 15, 2014
+ *      Author: minh
+ */
+
+
+
+#include "phylotree.h"
+#include "modelgtr.h"
diff --git a/phylotreepars.cpp b/phylotreepars.cpp
new file mode 100644
index 0000000..2be1e90
--- /dev/null
+++ b/phylotreepars.cpp
@@ -0,0 +1,463 @@
+/*
+ * phylotreepars.cpp
+ *
+ * Fast implementation of parsimony kernel
+ *
+ *  Created on: May 18, 2015
+ *      Author: minh
+ */
+
+#include "phylotree.h"
+#include "vectorclass/vectorclass.h"
+#include "phylosupertree.h"
+
+/***********************************************************/
+/****** optimized version of parsimony kernel **************/
+/***********************************************************/
+
+/**
+ * Fill dad_branch->partial_pars with the bit-packed parsimony state sets of the subtree
+ * behind dad_branch (seen from dad). Layout: for every group of UINT_BITS sites there are
+ * nstates consecutive UINT words (bit k of word s = "site k may be in state s"); the word
+ * after the last group holds the parsimony score accumulated inside the subtree.
+ * Returns immediately when bit 2 of dad_branch->partial_lh_computed is already set.
+ */
+void PhyloTree::computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    if (dad_branch->partial_lh_computed & 2)
+        return;
+    Node *node = dad_branch->node;
+    int nstates = aln->num_states;
+    int site;
+
+    dad_branch->partial_lh_computed |= 2;
+
+    if (node->isLeaf() && dad) {
+        // external node
+        if (aln->ordered_pattern.empty())
+            aln->orderPatternByNumChars();
+        int leafid = node->id;
+        int pars_size = getBitsBlockSize();
+        memset(dad_branch->partial_pars, 0, pars_size*sizeof(UINT));
+        // all per-site masks below are built as (UINT)1 << k: shifting a signed 1 into bit 31 is not portable
+    	int ambi_aa[] = {2, 3, 5, 6, 9, 10}; // {4+8, 32+64, 512+1024};
+        int max_sites = ((aln->num_informative_sites+UINT_BITS-1)/UINT_BITS)*UINT_BITS;
+        Alignment::iterator pat;
+    	switch (aln->seq_type) {
+    	case SEQ_DNA:
+            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
+            	int state = pat->at(leafid);
+                int freq = pat->frequency;
+                if (state < 4) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        dad_branch->partial_pars[(site/UINT_BITS)*4+state] |= ((UINT)1 << (site % UINT_BITS));
+                    }
+                } else if (state == aln->STATE_UNKNOWN) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*4);
+                        UINT bit1 = ((UINT)1 << (site%UINT_BITS));
+                        p[0] |= bit1;
+                        p[1] |= bit1;
+                        p[2] |= bit1;
+                        p[3] |= bit1;
+                    }
+                } else {
+                	// ambiguity code: state-3 is a 4-bit mask selecting the compatible states
+                	state -= 3;
+                    for (int j = 0; j < freq; j++, site++) {
+                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*4);
+                        UINT bit1 = ((UINT)1 << (site%UINT_BITS));
+                        for (int i = 0; i < 4; i++)
+                            if (state & (1<<i))
+                                p[i] |= bit1;
+                    }
+                }
+            }
+            assert(site == aln->num_informative_sites);
+            // add dummy states
+            // The last word group has unused bits beyond the real sites. Setting them in state 0
+            // for every leaf makes those positions intersect everywhere, so they never add to
+            // the score. The mask is computed in UINT because (1<<31)-1 overflows a signed int.
+            if (site < max_sites)
+            	dad_branch->partial_pars[(site/UINT_BITS)*4] |= ~(((UINT)1<<(site%UINT_BITS)) - 1);
+    		break;
+    	case SEQ_PROTEIN:
+            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
+                // same packing as for DNA with 20 words per site group; states 20..22 are
+                // two-fold ambiguity codes resolved to a pair of amino acids through ambi_aa
+            	int state = pat->at(leafid);
+                int freq = pat->frequency;
+                if (state < 20) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        dad_branch->partial_pars[(site/UINT_BITS)*20+state] |= ((UINT)1 << (site % UINT_BITS));
+                    }
+                } else if (state == aln->STATE_UNKNOWN) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*20);
+                        UINT bit1 = ((UINT)1 << (site%UINT_BITS));
+                        for (int i = 0; i < 20; i++)
+                                p[i] |= bit1;
+                    }
+                } else {
+                	assert(state < 23);
+            		state = (state-20)*2;
+                    for (int j = 0; j < freq; j++, site++) {
+                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*20);
+                        UINT bit1 = ((UINT)1 << (site%UINT_BITS));
+                        p[ambi_aa[state]] |= bit1;
+                        p[ambi_aa[state+1]] |= bit1;
+                    }
+                }
+            }
+            assert(site == aln->num_informative_sites);
+            // add dummy states
+            // (padding bits are handled exactly as in the DNA case)
+            if (site < max_sites)
+            	dad_branch->partial_pars[(site/UINT_BITS)*20] |= ~(((UINT)1<<(site%UINT_BITS)) - 1);
+    		break;
+    	default:
+            // any other alphabet: only plain states and the unknown character are accepted
+            for (pat = aln->ordered_pattern.begin(), site = 0; pat != aln->ordered_pattern.end(); pat++) {
+            	int state = pat->at(leafid);
+                int freq = pat->frequency;
+                if (state < nstates) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= ((UINT)1 << (site % UINT_BITS));
+                    }
+                } else if (state == aln->STATE_UNKNOWN) {
+                    for (int j = 0; j < freq; j++, site++) {
+                        UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
+                        UINT bit1 = ((UINT)1 << (site%UINT_BITS));
+                        for (int i = 0; i < nstates; i++)
+                                p[i] |= bit1;
+                    }
+                } else {
+                	assert(0);
+                }
+            }
+            assert(site == aln->num_informative_sites);
+            // add dummy states
+            // (padding bits are handled exactly as in the DNA case)
+            if (site < max_sites)
+            	dad_branch->partial_pars[(site/UINT_BITS)*nstates] |= ~(((UINT)1<<(site%UINT_BITS)) - 1);
+    		break;
+    	}
+
+    } else {
+        // internal node
+        assert(node->degree() == 3); // it works only for strictly bifurcating tree
+        PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+        FOR_NEIGHBOR_IT(node, dad, it) {
+            PhyloNeighbor* pit = (PhyloNeighbor*) (*it);
+            if ((*it)->node->name != ROOT_NAME && (pit->partial_lh_computed & 2) == 0) {
+                computePartialParsimonyFast(pit, (PhyloNode*) node);
+            }
+            if (!left) left = pit; else right = pit;
+        }
+        UINT score = 0;
+        int nsites = aln->num_informative_sites;
+        nsites = (nsites+UINT_BITS-1)/UINT_BITS;
+
+        // Fitch step, one UINT (UINT_BITS sites) at a time: z = x & y per state; sites whose
+        // intersection is empty in every state (the bits of w after the negation) cost one
+        // change and take the union x | y instead. Padding bits intersect in state 0 by
+        // construction of the leaf vectors and therefore never count.
+        switch (nstates) {
+        case 4:
+            #ifdef _OPENMP
+            #pragma omp parallel for private (site) reduction(+: score) if(nsites>200)
+            #endif
+			for (site = 0; site<nsites; site++) {
+				UINT w;
+                size_t offset = 4*site;
+                UINT *x = left->partial_pars + offset;
+                UINT *y = right->partial_pars + offset;
+                UINT *z = dad_branch->partial_pars + offset;
+				z[0] = x[0] & y[0];
+				z[1] = x[1] & y[1];
+				z[2] = x[2] & y[2];
+				z[3] = x[3] & y[3];
+				w = z[0] | z[1] | z[2] | z[3];
+				w = ~w;
+				score += vml_popcnt(w);
+				z[0] |= w & (x[0] | y[0]);
+				z[1] |= w & (x[1] | y[1]);
+				z[2] |= w & (x[2] | y[2]);
+				z[3] |= w & (x[3] | y[3]);
+			}
+			break;
+        default:
+            #ifdef _OPENMP
+            #pragma omp parallel for private (site) reduction(+: score) if(nsites > 800/nstates)
+            #endif
+			for (site = 0; site<nsites; site++) {
+				int i;
+				UINT w = 0;
+                size_t offset = nstates*site;
+                UINT *x = left->partial_pars + offset;
+                UINT *y = right->partial_pars + offset;
+                UINT *z = dad_branch->partial_pars + offset;
+
+				for (i = 0; i < nstates; i++) {
+					z[i] = x[i] & y[i];
+					w |= z[i];
+				}
+				w = ~w;
+				score += vml_popcnt(w);
+				for (i = 0; i < nstates; i++) {
+					z[i] |= w & (x[i] | y[i]);
+				}
+			}
+			break;
+        }
+        // score word (right after the state words): changes at this node plus both children's totals
+        dad_branch->partial_pars[nstates*nsites] = score + left->partial_pars[nstates*nsites] + right->partial_pars[nstates*nsites];
+    }
+}
+
+// Parsimony score across the branch dad--dad_branch->node: the stored scores of both end vectors plus the sites whose state sets do not intersect (that second part alone is written to *branch_subst when non-NULL).
+int PhyloTree::computeParsimonyBranchFast(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad); // the same branch seen from the other end
+    assert(node_branch);
+    if (!central_partial_pars)
+        initializeAllPartialPars();
+    if ((dad_branch->partial_lh_computed & 2) == 0) // bit 2 clear: this direction's parsimony vector is not computed yet
+        computePartialParsimonyFast(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 2) == 0)
+        computePartialParsimonyFast(node_branch, node);
+    int site;
+    int nsites = (aln->num_informative_sites + UINT_BITS-1) / UINT_BITS; // number of UINT_BITS-site groups
+    int nstates = aln->num_states;
+
+    int scoreid = ((aln->num_informative_sites+UINT_BITS-1)/UINT_BITS)*nstates; // index of the score word that follows the state words
+    UINT sum_end_node = (dad_branch->partial_pars[scoreid] + node_branch->partial_pars[scoreid]);
+    UINT score = sum_end_node;
+
+    UINT lower_bound = best_pars_score; // threshold for the early exit in the non-OpenMP loops below
+    if (branch_subst) lower_bound = INT_MAX; // the caller wants the full branch count: make the early exit unreachable in practice
+    switch (nstates) {
+    case 4:
+        #ifdef _OPENMP
+        #pragma omp parallel for private (site) reduction(+: score) if(nsites>200)
+        #endif
+		for (site = 0; site < nsites; site++) {
+            size_t offset = 4*site;
+            UINT *x = dad_branch->partial_pars + offset;
+            UINT *y = node_branch->partial_pars + offset;
+			UINT w = (x[0] & y[0]) | (x[1] & y[1]) | (x[2] & y[2]) | (x[3] & y[3]);
+			w = ~w; // bits now mark sites whose state sets share no state
+			score += vml_popcnt(w);
+            #ifndef _OPENMP
+            if (score >= lower_bound) // stop counting: the value returned is then only a partial sum >= lower_bound
+                break;
+            #endif
+		}
+		break;
+    default:
+        #ifdef _OPENMP
+        #pragma omp parallel for private (site) reduction(+: score) if(nsites > 800/nstates)
+        #endif
+		for (site = 0; site < nsites; site++) {
+            size_t offset = nstates*site;
+            UINT *x = dad_branch->partial_pars + offset;
+            UINT *y = node_branch->partial_pars + offset;
+			int i;
+			UINT w = x[0] & y[0];
+			for (i = 1; i < nstates; i++) {
+				w |= x[i] & y[i];
+			}
+			w = ~w; // bits now mark sites whose state sets share no state
+			score += vml_popcnt(w);
+            #ifndef _OPENMP
+            if (score >= lower_bound) // same early exit as in the 4-state loop
+                break;
+            #endif
+		}
+		break;
+    }
+    if (branch_subst)
+        *branch_subst = score - sum_end_node;
+//    (score was seeded with sum_end_node above, so it is not added again here)
+    return score;
+}
+
+// Depth-first walk from node (root when NULL): calls computePartialParsimony() for both directions of every branch whose partial_lh_computed bit 1 is clear.
+void PhyloTree::computeAllPartialPars(PhyloNode *node, PhyloNode *dad) {
+	if (node == NULL) node = (PhyloNode*)root;
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		PhyloNeighbor *down = (PhyloNeighbor*)*it; // branch node -> child
+		if (!(down->partial_lh_computed & 1)) computePartialParsimony(down, node); // NOTE(review): the fast kernel tracks parsimony in bit 2, this tests bit 1 - confirm
+		PhyloNeighbor *up = (PhyloNeighbor*) down->node->findNeighbor(node); // the same branch, child -> node
+		if (!(up->partial_lh_computed & 1)) computePartialParsimony(up, (PhyloNode*)down->node);
+		computeAllPartialPars((PhyloNode*)down->node, node);
+	}
+}
+
+
+/****************************************************************************
+ Stepwise addition (greedy) by maximum parsimony
+ ****************************************************************************/
+
+// computeParsimonyTree(): build a tree by stepwise addition in a shuffled taxon order. Starting
+// from a 3-taxon star, each further taxon is tried on every current branch (addTaxonMPFast) and
+// attached where the parsimony score is lowest. The tree is printed to <out_prefix>.parstree
+// when out_prefix is non-NULL. Returns best_pars_score as left by the last insertion.
+
+// alignment becomes this->aln; fewer than 3 sequences is reported through outError(ERR_FEW_TAXA).
+// NOTE(review): with exactly 3 taxa the insertion loop never runs, so the returned best_pars_score is whatever it held before the call.
+
+int PhyloTree::computeParsimonyTree(const char *out_prefix, Alignment *alignment) {
+    aln = alignment;
+    int size = aln->getNSeq();
+    if (size < 3)
+        outError(ERR_FEW_TAXA);
+
+    freeNode();
+
+    root = newNode(size);
+
+    IntVector taxon_order;
+    taxon_order.resize(size);
+    for (int i = 0; i < size; i++)
+        taxon_order[i] = i;
+    // randomize the addition order
+    my_random_shuffle(taxon_order.begin(), taxon_order.end());
+
+    // create initial tree with 3 taxa
+    for (leafNum = 0; leafNum < 3; leafNum++) {
+        if (verbose_mode >= VB_MAX)
+            cout << "Add " << aln->getSeqName(taxon_order[leafNum]) << " to the tree" << endl;
+        Node *new_taxon = newNode(taxon_order[leafNum], aln->getSeqName(taxon_order[leafNum]).c_str());
+        root->addNeighbor(new_taxon, -1.0);
+        new_taxon->addNeighbor(root, -1.0);
+    }
+    root = findNodeID(taxon_order[0]);
+    initializeAllPartialPars();
+    size_t index = 6; // NOTE(review): presumably blocks 0..5 of central_partial_pars already serve the star's 6 directed branches - confirm
+    size_t pars_block_size = getBitsBlockSize();
+
+    if (isSuperTree())
+        ((PhyloSuperTree*)this)->mapTrees();
+    
+    UINT *tmp_partial_pars; // scratch vector for the new taxon while placements are being tried
+    tmp_partial_pars = newBitsBlock();
+
+    // stepwise adding the next taxon
+    for (leafNum = 3; leafNum < size; leafNum++) {
+        if (verbose_mode >= VB_MAX)
+            cout << "Add " << aln->getSeqName(taxon_order[leafNum]) << " to the tree";
+        NodeVector nodes1, nodes2;
+        getBranches(nodes1, nodes2);
+        PhyloNode *target_node = NULL;
+        PhyloNode *target_dad = NULL;
+        best_pars_score = INT_MAX;
+        // allocate a new taxon and a new adjacent internal node
+        
+        UINT *new_taxon_partial_pars = central_partial_pars + ((index++) * pars_block_size); // receives a copy of the scratch vector for the best placement
+        
+        PhyloNode *new_taxon = (PhyloNode*)newNode(taxon_order[leafNum], aln->getSeqName(taxon_order[leafNum]).c_str());
+        PhyloNode *added_node = (PhyloNode*)newNode(size+leafNum-2);
+        added_node->addNeighbor(new_taxon, -1.0);
+        new_taxon->addNeighbor(added_node, -1.0);
+        ((PhyloNeighbor*) added_node->findNeighbor(new_taxon))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
+        ((PhyloNeighbor*) new_taxon->findNeighbor(added_node))->partial_pars = tmp_partial_pars;
+
+        // preserve two neighbors
+        added_node->addNeighbor((Node*) 1, -1.0); // (Node*)1 and (Node*)2 are placeholder pointers that
+        added_node->addNeighbor((Node*) 2, -1.0); // updateNeighbor() later swaps for the real end nodes
+
+        for (int nodeid = 0; nodeid < nodes1.size(); nodeid++) {
+            int score = addTaxonMPFast(new_taxon, added_node, nodes1[nodeid], nodes2[nodeid]);
+            if (score < best_pars_score) {
+                best_pars_score = score;
+                target_node = (PhyloNode*)nodes1[nodeid];
+                target_dad = (PhyloNode*)nodes2[nodeid];
+                memcpy(new_taxon_partial_pars, tmp_partial_pars, pars_block_size*sizeof(UINT));
+            }
+        }
+        
+        if (verbose_mode >= VB_MAX)
+            cout << ", score = " << best_pars_score << endl;
+        // now insert the new node in the middle of the branch node-dad
+        target_node->updateNeighbor(target_dad, added_node, -1.0);
+        target_dad->updateNeighbor(target_node, added_node, -1.0);
+        added_node->updateNeighbor((Node*) 1, target_node, -1.0);
+        added_node->updateNeighbor((Node*) 2, target_dad, -1.0);
+        ((PhyloNeighbor*) added_node->findNeighbor(target_node))->partial_pars =
+            ((PhyloNeighbor*) target_dad->findNeighbor(added_node))->partial_pars;
+        ((PhyloNeighbor*) added_node->findNeighbor(target_dad))->partial_pars =
+            ((PhyloNeighbor*) target_node->findNeighbor(added_node))->partial_pars;
+            
+        ((PhyloNeighbor*) added_node->findNeighbor(target_node))->partial_lh_computed = 
+            ((PhyloNeighbor*) target_dad->findNeighbor(added_node))->partial_lh_computed;
+        ((PhyloNeighbor*) added_node->findNeighbor(target_dad))->partial_lh_computed = 
+            ((PhyloNeighbor*) target_node->findNeighbor(added_node))->partial_lh_computed;
+        
+        ((PhyloNeighbor*) new_taxon->findNeighbor(added_node))->partial_lh_computed |= 2;
+        ((PhyloNeighbor*) new_taxon->findNeighbor(added_node))->partial_pars = new_taxon_partial_pars;
+
+        ((PhyloNeighbor*)target_dad->findNeighbor(added_node))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
+        ((PhyloNeighbor*)target_node->findNeighbor(added_node))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
+
+        target_dad->clearReversePartialLh(added_node);
+        target_node->clearReversePartialLh(added_node);
+
+    }
+
+    aligned_free(tmp_partial_pars);
+    
+    assert(index == 4*leafNum-6); // 6 initial blocks plus the 4 blocks taken per added taxon
+
+    nodeNum = 2 * leafNum - 2;
+    initializeTree();
+
+    setAlignment(alignment);
+//    initializeAllPartialPars();
+//    clearAllPartialLH();
+    fixNegativeBranch(true);
+    if (out_prefix) {
+		string file_name = out_prefix;
+		file_name += ".parstree";
+		printTree(file_name.c_str(), WT_NEWLINE);
+    }
+    return best_pars_score;
+}
+// Trial placement: splice added_node (which carries added_taxon) into the branch node--dad, return the parsimony score of that placement, then restore the original branch.
+int PhyloTree::addTaxonMPFast(Node *added_taxon, Node* added_node, Node* node, Node* dad) {
+    Neighbor *dad_nei = dad->findNeighbor(node);
+
+    // now insert the new node in the middle of the branch node-dad
+    double len = dad_nei->length;
+    node->updateNeighbor(dad, added_node, len / 2.0);
+    dad->updateNeighbor(node, added_node, len / 2.0);
+    added_node->updateNeighbor((Node*) 1, node, len / 2.0); // (Node*)1 / (Node*)2 are the placeholder neighbors
+    added_node->updateNeighbor((Node*) 2, dad, len / 2.0);  // reserved on added_node by computeParsimonyTree()
+    ((PhyloNeighbor*) added_node->findNeighbor(node))->partial_pars =
+        ((PhyloNeighbor*) dad->findNeighbor(added_node))->partial_pars;
+    ((PhyloNeighbor*) added_node->findNeighbor(dad))->partial_pars =
+        ((PhyloNeighbor*) node->findNeighbor(added_node))->partial_pars;
+    ((PhyloNeighbor*) added_node->findNeighbor(node))->partial_lh_computed = 
+        ((PhyloNeighbor*) dad->findNeighbor(added_node))->partial_lh_computed;
+    ((PhyloNeighbor*) added_node->findNeighbor(dad))->partial_lh_computed = 
+        ((PhyloNeighbor*) node->findNeighbor(added_node))->partial_lh_computed;
+    // score the placement by parsimony (despite the helper's name, no likelihood is computed here)
+    ((PhyloNeighbor*) added_taxon->findNeighbor(added_node))->clearPartialLh();
+    int score = computeParsimonyBranch((PhyloNeighbor*) added_node->neighbors[0], (PhyloNode*) added_node); // neighbors[0]: the first neighbor added to added_node (the new taxon in computeParsimonyTree)
+    // remove the added node
+    node->updateNeighbor(added_node, dad, len);
+    dad->updateNeighbor(added_node, node, len);
+    added_node->updateNeighbor(node, (Node*) 1, len);
+    added_node->updateNeighbor(dad, (Node*) 2, len);
+
+    // mark both directions of the restored branch as having their parsimony vector computed (bit 2)
+    ((PhyloNeighbor*)node->findNeighbor(dad))->partial_lh_computed |= 2;
+    ((PhyloNeighbor*)dad->findNeighbor(node))->partial_lh_computed |= 2;
+
+    // (disabled) recursive traversal of the tree downwards:
+
+//    FOR_NEIGHBOR_IT(node, dad, it){
+//        addTaxonMPFast(added_node, target_node, target_dad, target_partial_pars, (*it)->node, node);
+//    }
+    return score;
+
+}
diff --git a/phylotreesse.cpp b/phylotreesse.cpp
new file mode 100644
index 0000000..6a1bbb3
--- /dev/null
+++ b/phylotreesse.cpp
@@ -0,0 +1,2743 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "phylotree.h"
+#include "phylokernel.h"
+#include "phylokernelmixture.h"
+#include "phylokernelmixrate.h"
+#include "model/modelgtr.h"
+
+
+/* BQM: to ignore all-gapp subtree at an alignment site */
+//#define IGNORE_GAP_LH
+
+//#define USING_SSE
+
+// Select the pair of parsimony routines (branch score + partial vector) that goes with
+// likelihood kernel lk and store them in the two parsimony member-function pointers.
+void PhyloTree::setParsimonyKernel(LikelihoodKernel lk) {
+    if (lk == LK_EIGEN) {
+        // scalar bit-packed kernel
+        computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchFast;
+        computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyFast;
+        return;
+    }
+    if (lk == LK_EIGEN_SSE) {
+        // instruction_set >= 7 is the test this file uses for "CPU supports AVX"
+        if (instruction_set >= 7) {
+            // presumably assigns the same two pointers with AVX instantiations
+            setParsimonyKernelAVX();
+        } else {
+            computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchFastSIMD<Vec4ui>;
+            computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyFastSIMD<Vec4ui>;
+        }
+        return;
+    }
+    // LK_SSE and every remaining kernel value share the naive implementation
+    computeParsimonyBranchPointer = &PhyloTree::computeParsimonyBranchNaive;
+    computePartialParsimonyPointer = &PhyloTree::computePartialParsimonyNaive;
+}
+
+// Install the likelihood kernel function pointers (and, through setParsimonyKernel, the parsimony ones) for lk; sse records the kernel that was actually selected.
+void PhyloTree::setLikelihoodKernel(LikelihoodKernel lk) {
+    setParsimonyKernel(lk);
+
+	if (instruction_set >= 7) {
+		setDotProductAVX();
+	} else {
+#ifdef BOOT_VAL_FLOAT
+		dotProduct = &PhyloTree::dotProductSIMD<float, Vec4f, 4>;
+#else
+		dotProduct = &PhyloTree::dotProductSIMD<double, Vec2d, 2>;
+#endif
+	}
+	sse = lk; // may be downgraded below when no matching specialization exists
+    if (!aln || lk == LK_NORMAL) { // no alignment attached, or the naive kernel was requested
+        computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchNaive;
+        computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervNaive;
+        computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodNaive;
+        computeLikelihoodFromBufferPointer = NULL;
+        sse = LK_NORMAL;
+        return;
+    }
+    
+    if (sse == LK_EIGEN) { // non-SIMD Eigen kernels: chosen by model type only, not by number of states
+        if (model_factory && model_factory->model->isMixture()) {
+            if (model_factory->fused_mix_rate) {
+                computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigen;
+                computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigen;
+                computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigen;
+                computeLikelihoodFromBufferPointer = NULL;
+            } else {
+                computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigen;
+                computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigen;
+                computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigen;
+                computeLikelihoodFromBufferPointer = NULL;
+            }
+        } else {
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigen;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigen;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigen;
+            computeLikelihoodFromBufferPointer = NULL;
+        }
+        return;
+    }
+
+//    (debug output of the selected kernel name is disabled throughout this function)
+        
+    // remaining kernels (LK_SSE, LK_EIGEN_SSE) are template instantiations per number of states
+	switch(aln->num_states) {
+	case 4:
+		switch(sse) {
+		case LK_SSE:
+			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<4>;
+			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<4>;
+			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<4>;
+	        computeLikelihoodFromBufferPointer = NULL;
+			break;
+		case LK_EIGEN_SSE:
+			if (instruction_set >= 7) {
+				// CPU supports AVX
+				setLikelihoodKernelAVX();
+			} else {
+				// CPU does not support AVX
+				if (model_factory && model_factory->model->isMixture()) {
+					if (model_factory->fused_mix_rate) {
+						computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec2d, 2, 4>;
+						computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec2d, 2, 4>;
+						computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec2d, 2, 4>;
+						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec2d, 2, 4>;
+					} else {
+						computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec2d, 2, 4>;
+						computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec2d, 2, 4>;
+						computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec2d, 2, 4>;
+						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec2d, 2, 4>;
+					}
+				} else {
+					computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec2d, 2, 4>;
+					computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec2d, 2, 4>;
+					computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec2d, 2, 4>;
+					computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec2d, 2, 4>;
+				}
+			}
+			break;
+		default:
+			break;
+		}
+		break;
+	case 20:
+		switch(sse) {
+		case LK_SSE:
+			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<20>;
+			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<20>;
+			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<20>;
+	        computeLikelihoodFromBufferPointer = NULL;
+			break;
+		case LK_EIGEN_SSE:
+			if (instruction_set >= 7) {
+				setLikelihoodKernelAVX();
+			} else {
+				if (model_factory && model_factory->model->isMixture()) {
+					if (model_factory->fused_mix_rate) {
+						computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec2d, 2, 20>;
+						computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec2d, 2, 20>;
+						computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec2d, 2, 20>;
+						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec2d, 2, 20>;
+					} else {
+						computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec2d, 2, 20>;
+						computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec2d, 2, 20>;
+						computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec2d, 2, 20>;
+						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec2d, 2, 20>;
+					}
+				} else {
+					computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec2d, 2, 20>;
+					computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec2d, 2, 20>;
+					computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec2d, 2, 20>;
+					computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec2d, 2, 20>;
+				}
+			}
+			break;
+		default:
+			break;
+		}
+		break;
+
+	case 64: // CODON
+		switch(sse) {
+		case LK_SSE:
+			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<64>;
+			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<64>;
+			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<64>;
+			computeLikelihoodFromBufferPointer = NULL;
+			break;
+		case LK_EIGEN_SSE:
+			if (instruction_set >= 7) {
+				setLikelihoodKernelAVX();
+			} else {
+				if (model_factory && model_factory->model->isMixture()) {
+					if (model_factory->fused_mix_rate) {
+						computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigenSIMD<Vec2d, 2, 64>;
+						computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigenSIMD<Vec2d, 2, 64>;
+						computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigenSIMD<Vec2d, 2, 64>;
+						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixrateLikelihoodFromBufferEigenSIMD<Vec2d, 2, 64>;
+//						cout << "Fast-SSE-semi-mixture" << endl;
+					} else {
+						computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigenSIMD<Vec2d, 2, 64>;
+						computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigenSIMD<Vec2d, 2, 64>;
+						computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigenSIMD<Vec2d, 2, 64>;
+						computeLikelihoodFromBufferPointer = &PhyloTree::computeMixtureLikelihoodFromBufferEigenSIMD<Vec2d, 2, 64>;
+//						cout << "Fast-SSE-mixture" << endl;
+					}
+				} else {
+					computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec2d, 2, 64>;
+					computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec2d, 2, 64>;
+					computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec2d, 2, 64>;
+					computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec2d, 2, 64>;
+//					cout << "Fast-SSE" << endl;
+				}
+			}
+			break;
+		default:
+			break;
+		}
+		break;
+
+
+	case 2:
+		switch(sse) {
+		case LK_SSE:
+			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchSSE<2>;
+			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervSSE<2>;
+			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodSSE<2>;
+	        computeLikelihoodFromBufferPointer = NULL;
+			break;
+		case LK_EIGEN_SSE: // NOTE(review): unlike the 4/20/64-state cases this neither checks isMixture() nor dispatches to AVX - confirm this is intended
+			computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigenSIMD<Vec2d, 2, 2>;
+			computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigenSIMD<Vec2d, 2, 2>;
+			computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigenSIMD<Vec2d, 2, 2>;
+	        computeLikelihoodFromBufferPointer = &PhyloTree::computeLikelihoodFromBufferEigenSIMD<Vec2d, 2, 2>;
+			break;
+		default:
+			break;
+		}
+		break;
+
+	default: // any other number of states: fall back to the non-templated kernels
+        if (sse == LK_EIGEN_SSE) {
+            if (model_factory && model_factory->model->isMixture()) {
+                if (model_factory->fused_mix_rate) {
+                    computeLikelihoodBranchPointer = &PhyloTree::computeMixrateLikelihoodBranchEigen;
+                    computeLikelihoodDervPointer = &PhyloTree::computeMixrateLikelihoodDervEigen;
+                    computePartialLikelihoodPointer = &PhyloTree::computeMixratePartialLikelihoodEigen;
+                    computeLikelihoodFromBufferPointer = NULL;
+                } else {
+                    computeLikelihoodBranchPointer = &PhyloTree::computeMixtureLikelihoodBranchEigen;
+                    computeLikelihoodDervPointer = &PhyloTree::computeMixtureLikelihoodDervEigen;
+                    computePartialLikelihoodPointer = &PhyloTree::computeMixturePartialLikelihoodEigen;
+                    computeLikelihoodFromBufferPointer = NULL;
+                }
+            } else {
+                computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchEigen;
+                computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervEigen;
+                computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodEigen;
+                computeLikelihoodFromBufferPointer = NULL;
+            }
+            sse = LK_EIGEN; // record the downgrade from the SIMD Eigen kernel
+        } else {
+            computeLikelihoodBranchPointer = &PhyloTree::computeLikelihoodBranchNaive;
+            computeLikelihoodDervPointer = &PhyloTree::computeLikelihoodDervNaive;
+            computePartialLikelihoodPointer = &PhyloTree::computePartialLikelihoodNaive;
+            computeLikelihoodFromBufferPointer = NULL;
+            sse = LK_NORMAL; // record the downgrade to the naive kernel
+        }
+		break;
+	}
+}
+
+/**
+ * Switch this tree to the likelihood kernel lk.
+ * Does nothing when lk is already the active kernel (sse).
+ * When moving from an eigen kernel (LK_EIGEN, LK_EIGEN_SSE) to an old kernel
+ * (LK_NORMAL, LK_SSE), the per-node memory saving mode is upgraded to
+ * LM_ALL_BRANCH and all partial likelihood buffers are deleted, re-initialised
+ * and cleared. In every other case only setLikelihoodKernel() is called.
+ * @param lk the likelihood kernel to switch to
+ */
+void PhyloTree::changeLikelihoodKernel(LikelihoodKernel lk) {
+	if (lk == sse)
+		return;
+
+	// both predicates are evaluated before the kernel is replaced
+	const bool from_eigen_kernel = (sse == LK_EIGEN || sse == LK_EIGEN_SSE);
+	const bool to_old_kernel = (lk == LK_NORMAL || lk == LK_SSE);
+
+	if (!from_eigen_kernel || !to_old_kernel) {
+		// no buffer rebuild required: just install the requested kernel
+		setLikelihoodKernel(lk);
+		return;
+	}
+
+	// changing from new kernel to old kernel needs more memory
+	if (params->lh_mem_save == LM_PER_NODE)
+		params->lh_mem_save = LM_ALL_BRANCH;
+	setLikelihoodKernel(lk);
+	deleteAllPartialLh();
+	initializeAllPartialLh();
+	clearAllPartialLH();
+}
+
+/*******************************************************
+ *
+ * master function: wrapper for other optimized functions
+ *
+ ******************************************************/
+
+/**
+ * Dispatch to the partial-likelihood routine currently stored in
+ * computePartialLikelihoodPointer.
+ * @param dad_branch neighbor object forwarded unchanged to the selected routine
+ * @param dad node forwarded unchanged to the selected routine
+ */
+void PhyloTree::computePartialLikelihood(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+	(this->*computePartialLikelihoodPointer)(dad_branch, dad);
+}
+
+/**
+ * Dispatch to the branch-likelihood routine currently stored in
+ * computeLikelihoodBranchPointer.
+ * @param dad_branch neighbor object forwarded unchanged to the selected routine
+ * @param dad node forwarded unchanged to the selected routine
+ * @return whatever the selected routine returns
+ */
+double PhyloTree::computeLikelihoodBranch(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+	return (this->*computeLikelihoodBranchPointer)(dad_branch, dad);
+
+}
+
+/**
+ * Dispatch to the derivative routine currently stored in
+ * computeLikelihoodDervPointer.
+ * @param dad_branch neighbor object forwarded unchanged to the selected routine
+ * @param dad node forwarded unchanged to the selected routine
+ * @param[out] df passed by reference to the selected routine
+ * @param[out] ddf passed by reference to the selected routine
+ */
+void PhyloTree::computeLikelihoodDerv(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+	(this->*computeLikelihoodDervPointer)(dad_branch, dad, df, ddf);
+}
+
+
+/**
+ * Compute the likelihood on the branch given by current_it / current_it_back.
+ * Uses the routine in computeLikelihoodFromBufferPointer when one is installed;
+ * otherwise falls back to the routine in computeLikelihoodBranchPointer.
+ * Both current_it and current_it_back must be non-NULL (asserted).
+ * @return the value returned by whichever routine was called
+ */
+double PhyloTree::computeLikelihoodFromBuffer() {
+	assert(current_it && current_it_back);
+
+	if (!computeLikelihoodFromBufferPointer) {
+		// no buffer-based kernel installed: evaluate the branch directly
+		PhyloNode *back_node = (PhyloNode*)current_it_back->node;
+		return (this->*computeLikelihoodBranchPointer)(current_it, back_node);
+	}
+
+	return (this->*computeLikelihoodFromBufferPointer)();
+}
+
+/**
+ * Fill the tip_partial_lh lookup table and then initialise ptn_freq and
+ * ptn_invar. Runs once: later calls return immediately because
+ * tip_partial_lh_computed is set on the first call.
+ *
+ * Table layout as indexed below:
+ *   tip_partial_lh[state*nstates*nmixtures + m*nstates + i]
+ * where m is the mixture index and i a row of that mixture's inverse
+ * eigenvector matrix. The entry is the sum of inv_evec[i*nstates + x] over
+ * every base state x compatible with the (possibly ambiguous) state.
+ */
+void PhyloTree::computeTipPartialLikelihood() {
+	if (tip_partial_lh_computed)
+		return;
+	tip_partial_lh_computed = true;
+	int m, i, x, state, nstates = aln->num_states, nmixtures = model->getNMixtures();
+	double *all_inv_evec = model->getInverseEigenvectors();
+	assert(all_inv_evec);
+	assert(tip_partial_lh);
+
+	// unambiguous states: copy column `state` of each inverse eigenvector matrix
+	for (state = 0; state < nstates; state++) {
+		double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixtures];
+		for (m = 0; m < nmixtures; m++) {
+			double *inv_evec = &all_inv_evec[m*nstates*nstates];
+			for (i = 0; i < nstates; i++)
+				this_tip_partial_lh[m*nstates + i] = inv_evec[i*nstates+state];
+		}
+	}
+	// special treatment for unknown char
+	// (row sums of the inverse eigenvector matrix, i.e. all states compatible)
+	for (i = 0; i < nstates; i++) {
+		double *this_tip_partial_lh = &tip_partial_lh[aln->STATE_UNKNOWN*nstates*nmixtures];
+		for (m = 0; m < nmixtures; m++) {
+			double *inv_evec = &all_inv_evec[m*nstates*nstates];
+			double lh_unknown = 0.0;
+			for (x = 0; x < nstates; x++)
+				lh_unknown += inv_evec[i*nstates+x];
+			this_tip_partial_lh[m*nstates + i] = lh_unknown;
+		}
+	}
+
+	double lh_ambiguous;
+	// ambiguous characters
+	// each entry is a bit mask: bit x set means base state x is compatible
+	// NOTE(review): the standard IUPAC letter for "I or L" is J, not U;
+	// confirm against the alignment's state encoding.
+	int ambi_aa[] = {
+        4+8, // B = N or D
+        32+64, // Z = Q or E
+        512+1024 // U = I or L
+        };
+	switch (aln->seq_type) {
+	case SEQ_DNA:
+		// states 4..17 are ambiguity codes; cstate is the bit mask of compatible bases
+		// (presumably nstates == 4 here, giving masks 1..14 -- TODO confirm)
+		for (state = 4; state < 18; state++) {
+			int cstate = state-nstates+1;
+			double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixtures];
+			for (m = 0; m < nmixtures; m++) {
+				double *inv_evec = &all_inv_evec[m*nstates*nstates];
+				for (i = 0; i < nstates; i++) {
+					lh_ambiguous = 0.0;
+					for (x = 0; x < nstates; x++)
+						if ((cstate) & (1 << x))
+							lh_ambiguous += inv_evec[i*nstates+x];
+					this_tip_partial_lh[m*nstates+i] = lh_ambiguous;
+				}
+			}
+		}
+		break;
+	case SEQ_PROTEIN:
+		//map[(unsigned char)'B'] = 4+8+19; // N or D
+		//map[(unsigned char)'Z'] = 32+64+19; // Q or E
+		// ambiguity codes occupy states 20, 21, 22 (state+20 below);
+		// only bits 0..10 are tested because the highest bit used in ambi_aa is 1024
+		for (state = 0; state < sizeof(ambi_aa)/sizeof(int); state++) {
+			double *this_tip_partial_lh = &tip_partial_lh[(state+20)*nstates*nmixtures];
+			for (m = 0; m < nmixtures; m++) {
+				double *inv_evec = &all_inv_evec[m*nstates*nstates];
+				for (i = 0; i < nstates; i++) {
+					lh_ambiguous = 0.0;
+					for (x = 0; x < 11; x++)
+						if (ambi_aa[state] & (1 << x))
+							lh_ambiguous += inv_evec[i*nstates+x];
+					this_tip_partial_lh[m*nstates+i] = lh_ambiguous;
+				}
+			}
+		}
+		break;
+	default:
+		break;
+	}
+
+
+	//-------------------------------------------------------
+	// initialize ptn_freq and ptn_invar
+	//-------------------------------------------------------
+
+	computePtnFreq();
+	// for +I model
+	computePtnInvar();
+}
+
+/**
+ * Fill ptn_freq with the frequency of every alignment pattern and zero the
+ * remaining slots up to get_safe_upper_limit(nptn + number of unobserved
+ * patterns). Runs once: later calls return immediately because
+ * ptn_freq_computed is set on the first call.
+ */
+void PhyloTree::computePtnFreq() {
+	if (ptn_freq_computed) return;
+	ptn_freq_computed = true;
+	size_t nptn = aln->getNPattern();
+	size_t maxptn = get_safe_upper_limit(nptn+model_factory->unobserved_ptns.size());
+	// index is size_t like the bounds: avoids signed/unsigned comparison and
+	// narrowing of nptn into an int
+	size_t ptn;
+	for (ptn = 0; ptn < nptn; ptn++)
+		ptn_freq[ptn] = (*aln)[ptn].frequency;
+	// unobserved patterns and padding slots carry zero frequency
+	for (ptn = nptn; ptn < maxptn; ptn++)
+		ptn_freq[ptn] = 0.0;
+}
+
+/**
+ * Fill ptn_invar, the per-pattern contribution of the invariant-site class.
+ * The array is first zeroed over get_safe_upper_limit(observed + unobserved)
+ * entries. When the proportion of invariant sites is non-zero:
+ *  - an observed pattern with const_char == nstates gets p_invar,
+ *  - an observed pattern with const_char < nstates gets
+ *    p_invar * frequency of that state,
+ *  - each unobserved pattern gets p_invar * frequency of its state,
+ *  - remaining padding entries copy the preceding entry.
+ */
+void PhyloTree::computePtnInvar() {
+	const size_t num_observed = aln->getNPattern();
+	const size_t num_unobserved = model_factory->unobserved_ptns.size();
+	const size_t padded_size = get_safe_upper_limit(num_observed+num_unobserved);
+	const int num_states = aln->num_states;
+
+	double *freqs = aligned_alloc<double>(num_states);
+	model->getStateFrequency(freqs);
+
+	memset(ptn_invar, 0, padded_size*sizeof(double));
+
+	const double p_invar = site_rate->getPInvar();
+	if (p_invar == 0.0) {
+		// no invariant-site class: the zeroed array is the final result
+		aligned_free(freqs);
+		return;
+	}
+
+	size_t idx;
+
+	// observed patterns
+	for (idx = 0; idx < num_observed; idx++) {
+		if ((*aln)[idx].const_char == num_states) {
+			ptn_invar[idx] = p_invar;
+			continue;
+		}
+		if ((*aln)[idx].const_char < num_states)
+			ptn_invar[idx] = p_invar * freqs[(int) (*aln)[idx].const_char];
+	}
+
+	// ascertainment bias correction: unobserved patterns follow the observed ones
+	for (idx = 0; idx < num_unobserved; idx++)
+		ptn_invar[num_observed+idx] = p_invar * freqs[(int)model_factory->unobserved_ptns[idx]];
+
+	// dummy values for the padding slots
+	for (idx = num_observed+num_unobserved; idx < padded_size; idx++)
+		ptn_invar[idx] = ptn_invar[idx-1];
+
+	aligned_free(freqs);
+}
+
+/*******************************************************
+ *
+ * non-vectorized likelihood functions.
+ * this version uses Alexis' technique that stores the
+ * dot product of partial likelihoods and eigenvectors at node
+ * for faster branch length optimization
+ *
+ ******************************************************/
+
+//template <const int nstates>
+/**
+ * Compute the partial likelihood vector stored on dad_branch, i.e. for the
+ * subtree rooted at dad_branch->node when seen from dad. Child subtrees are
+ * computed recursively when their "computed" bit is not set.
+ *
+ * Values are stored after the dot product with the inverse eigenvectors
+ * (see the banner comment above), one block of nstates*ncat doubles per
+ * pattern. Unobserved patterns (model_factory->unobserved_ptns) are appended
+ * after the alignment patterns.
+ *
+ * For a leaf nothing is stored per pattern: only lh_scale_factor is reset
+ * and the tip lookup table is built if needed.
+ *
+ * Writes dad_branch->partial_lh, dad_branch->scale_num and
+ * dad_branch->lh_scale_factor, and sets bit 0 of partial_lh_computed.
+ *
+ * @param dad_branch neighbor whose partial likelihood is computed
+ * @param dad the node on the other side of the branch (must be non-NULL)
+ */
+void PhyloTree::computePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    // don't recompute the likelihood
+	assert(dad);
+    if (dad_branch->partial_lh_computed & 1)
+        return;
+    dad_branch->partial_lh_computed |= 1;
+    size_t nstates = aln->num_states;
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+	if (node->isLeaf()) {
+	    dad_branch->lh_scale_factor = 0.0;
+
+		if (!tip_partial_lh_computed)
+			computeTipPartialLikelihood();
+		return;
+	}
+    
+    size_t ptn, c;
+    size_t orig_ntn = aln->size();
+    size_t ncat = site_rate->getNRate();
+    const size_t nstatesqr=nstates*nstates;
+    size_t i, x;
+    size_t block = nstates * ncat;
+
+	double *evec = model->getEigenvectors();
+	double *inv_evec = model->getInverseEigenvectors();
+	assert(inv_evec && evec);
+	double *eval = model->getEigenvalues();
+
+    dad_branch->lh_scale_factor = 0.0;
+
+	// internal node
+	assert(node->degree() == 3); // it works only for strictly bifurcating tree
+	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+	}
+
+	// when exactly one child is a leaf, make it the left one
+	if (!left->node->isLeaf() && right->node->isLeaf()) {
+		PhyloNeighbor *tmp = left;
+		left = right;
+		right = tmp;
+	}
+	if ((left->partial_lh_computed & 1) == 0)
+		computePartialLikelihoodEigen(left, node);
+	if ((right->partial_lh_computed & 1) == 0)
+		computePartialLikelihoodEigen(right, node);
+        
+    if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
+        // re-orient partial_lh
+        // take over the buffers of a neighbor pointing back at this node and
+        // mark that neighbor as not computed
+        bool done = false;
+        FOR_NEIGHBOR_IT(node, dad, it2) {
+            PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
+            if (backnei->partial_lh) {
+                dad_branch->partial_lh = backnei->partial_lh;
+                dad_branch->scale_num = backnei->scale_num;
+                backnei->partial_lh = NULL;
+                backnei->scale_num = NULL;
+                backnei->partial_lh_computed &= ~1; // clear bit
+                done = true;
+                break;
+            }
+        }
+        assert(done && "partial_lh is not re-oriented");
+    }
+
+        
+        
+	dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
+	double *eleft = new double[block*nstates], *eright = new double[block*nstates];
+
+	// precompute information buffer
+	// eleft/eright[c][x][i] = evec[x][i] * exp(eval[i] * rate(c) * branch length)
+	for (c = 0; c < ncat; c++) {
+		double *expleft = new double[nstates];
+		double *expright = new double[nstates];
+		double len_left = site_rate->getRate(c) * left->length;
+		double len_right = site_rate->getRate(c) * right->length;
+		for (i = 0; i < nstates; i++) {
+			expleft[i] = exp(eval[i]*len_left);
+			expright[i] = exp(eval[i]*len_right);
+		}
+		for (x = 0; x < nstates; x++)
+			for (i = 0; i < nstates; i++) {
+				eleft[c*nstatesqr+x*nstates+i] = evec[x*nstates+i] * expleft[i];
+				eright[c*nstatesqr+x*nstates+i] = evec[x*nstates+i] * expright[i];
+			}
+		delete [] expright;
+		delete [] expleft;
+	}
+
+	if (left->node->isLeaf() && right->node->isLeaf()) {
+		// special treatment for TIP-TIP (cherry) case
+
+		// pre compute information for both tips
+		// one block per state; only states listed in aln->seq_states[...] and
+		// STATE_UNKNOWN are filled below
+		// NOTE(review): states of unobserved patterns are also looked up in
+		// these tables -- confirm they are always covered by seq_states.
+		double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
+		double *partial_lh_right = new double[(aln->STATE_UNKNOWN+1)*block];
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			for (x = 0; x < block; x++) {
+				double vleft = 0.0;
+				for (i = 0; i < nstates; i++) {
+					vleft += eleft[x*nstates+i] * tip_partial_lh[state*nstates+i];
+				}
+				partial_lh_left[state*block+x] = vleft;
+			}
+		}
+
+		for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
+			int state = (*it);
+			for (x = 0; x < block; x++) {
+				double vright = 0.0;
+				for (i = 0; i < nstates; i++) {
+					vright += eright[x*nstates+i] * tip_partial_lh[state*nstates+i];
+				}
+				partial_lh_right[state*block+x] = vright;
+			}
+		}
+
+		// unknown state: every entry is 1.0 (written last, so it overrides any
+		// value computed above for STATE_UNKNOWN)
+		for (x = 0; x < block; x++) {
+			size_t addr = aln->STATE_UNKNOWN * block;
+			partial_lh_left[addr+x] = 1.0;
+			partial_lh_right[addr+x] = 1.0;
+		}
+
+
+		// scale number must be ZERO
+	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+#ifdef _OPENMP
+//#pragma omp parallel for private(ptn, c, x, i, partial_lh_tmp)
+#pragma omp parallel for private(ptn, c, x, i) schedule(static)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+			// NOTE(review): nstates is a run-time value, so this is a
+			// variable-length array (a compiler extension in C++)
+			double partial_lh_tmp[nstates];
+			double *partial_lh = dad_branch->partial_lh + ptn*block;
+			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+			int state_right = (ptn < orig_ntn) ? (aln->at(ptn))[right->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				double *left = partial_lh_left + (state_left*block+c*nstates);
+				double *right = partial_lh_right + (state_right*block+c*nstates);
+				for (x = 0; x < nstates; x++) {
+					partial_lh_tmp[x] = left[x] * right[x];
+				}
+
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i++) {
+					double res = 0.0;
+					for (x = 0; x < nstates; x++) {
+						res += partial_lh_tmp[x]*inv_evec[i*nstates+x];
+					}
+					partial_lh[c*nstates+i] = res;
+				}
+			}
+		}
+		delete [] partial_lh_right;
+		delete [] partial_lh_left;
+	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
+		// special treatment to TIP-INTERNAL NODE case
+		// only take scale_num from the right subtree
+		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+
+		// pre compute information for left tip
+		double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			for (x = 0; x < block; x++) {
+				double vleft = 0.0;
+				for (i = 0; i < nstates; i++) {
+					vleft += eleft[x*nstates+i] * tip_partial_lh[state*nstates+i];
+				}
+				partial_lh_left[state*block+x] = vleft;
+			}
+		}
+		for (x = 0; x < block; x++) {
+			size_t addr = aln->STATE_UNKNOWN * block;
+			partial_lh_left[addr+x] = 1.0;
+		}
+
+
+		double sum_scale = 0.0;
+#ifdef _OPENMP
+//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, partial_lh_tmp)
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+			double partial_lh_tmp[nstates];
+			double *partial_lh = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_right = right->partial_lh + ptn*block;
+			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+            double lh_max = 0.0;
+            
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				for (x = 0; x < nstates; x++) {
+					double vleft = 0.0, vright = 0.0;
+					size_t addr = c*nstatesqr+x*nstates;
+					vleft = partial_lh_left[state_left*block+c*nstates+x];
+					for (i = 0; i < nstates; i++) {
+						vright += eright[addr+i] * partial_lh_right[c*nstates+i];
+					}
+					partial_lh_tmp[x] = vleft * (vright);
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i++) {
+					double res = 0.0;
+					for (x = 0; x < nstates; x++) {
+						res += partial_lh_tmp[x]*inv_evec[i*nstates+x];
+					}
+					partial_lh[c*nstates+i] = res;
+                    lh_max = max(fabs(res), lh_max);
+				}
+			}
+            // check if one should scale partial likelihoods
+            if (lh_max < SCALING_THRESHOLD) {
+            	if (lh_max == 0.0) {
+            		// for very shitty data
+            		// complete underflow: replace the vector with the unknown-state
+            		// tip vector, count 4 scaling steps and warn for each affected site
+            		// NOTE(review): outWarning is called from inside the OpenMP
+            		// parallel loop -- confirm it is safe to call concurrently.
+            		for (c = 0; c < ncat; c++)
+            			memcpy(&partial_lh[c*nstates], &tip_partial_lh[aln->STATE_UNKNOWN*nstates], nstates*sizeof(double));
+					sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+					//sum_scale += log(lh_max) * ptn_freq[ptn];
+					dad_branch->scale_num[ptn] += 4;
+					int nsite = aln->getNSite();
+					for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+						if (aln->getPatternID(i) == ptn) {
+							outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+							x++;
+						}
+            	} else {
+					// now do the likelihood scaling
+					for (i = 0; i < block; i++) {
+						partial_lh[i] *= SCALING_THRESHOLD_INVER;
+	                    //partial_lh[i] /= lh_max;
+					}
+					// unobserved const pattern will never have underflow
+					sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+					//sum_scale += log(lh_max) * ptn_freq[ptn];
+					dad_branch->scale_num[ptn] += 1;
+            	}
+            }
+
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+		delete [] partial_lh_left;
+
+	} else {
+		// both left and right are internal node
+
+		double sum_scale = 0.0;
+#ifdef _OPENMP
+//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, partial_lh_tmp)
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i) schedule(static)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+			double partial_lh_tmp[nstates];
+			double *partial_lh = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_left = left->partial_lh + ptn*block;
+			double *partial_lh_right = right->partial_lh + ptn*block;
+            double lh_max = 0.0;
+			// scaling counts of both subtrees add up
+			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
+
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				for (x = 0; x < nstates; x++) {
+					double vleft = 0.0, vright = 0.0;
+					size_t addr = c*nstatesqr+x*nstates;
+					for (i = 0; i < nstates; i++) {
+						vleft += eleft[addr+i] * partial_lh_left[c*nstates+i];
+						vright += eright[addr+i] * partial_lh_right[c*nstates+i];
+					}
+					partial_lh_tmp[x] = vleft*vright;
+//                    assert(partial_lh_tmp[x] != 0.0);
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i++) {
+					double res = 0.0;
+					for (x = 0; x < nstates; x++) {
+						res += partial_lh_tmp[x]*inv_evec[i*nstates+x];
+					}
+					partial_lh[c*nstates+i] = res;
+                    lh_max = max(lh_max, fabs(res));
+				}
+			}
+
+            // check if one should scale partial likelihoods
+            if (lh_max < SCALING_THRESHOLD) {
+            	if (lh_max == 0.0) {
+            		// for very shitty data
+            		// same complete-underflow handling as in the TIP-INTERNAL case
+            		for (c = 0; c < ncat; c++)
+            			memcpy(&partial_lh[c*nstates], &tip_partial_lh[aln->STATE_UNKNOWN*nstates], nstates*sizeof(double));
+					sum_scale += LOG_SCALING_THRESHOLD* 4 * ptn_freq[ptn];
+					//sum_scale += log(lh_max) * ptn_freq[ptn];
+					dad_branch->scale_num[ptn] += 4;
+					int nsite = aln->getNSite();
+					for (i = 0, x = 0; i < nsite && x < ptn_freq[ptn]; i++)
+						if (aln->getPatternID(i) == ptn) {
+							outWarning((string)"Numerical underflow for site " + convertIntToString(i+1));
+							x++;
+						}
+            	} else {
+					// now do the likelihood scaling
+					for (i = 0; i < block; i++) {
+						partial_lh[i] *= SCALING_THRESHOLD_INVER;
+	                    //partial_lh[i] /= lh_max;
+					}
+					// unobserved const pattern will never have underflow
+					sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+					//sum_scale += log(lh_max) * ptn_freq[ptn];
+					dad_branch->scale_num[ptn] += 1;
+            	}
+            }
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+
+	}
+
+	delete [] eright;
+	delete [] eleft;
+}
+
+//template <const int nstates>
+/**
+ * Compute the first and second derivatives of the log-likelihood with
+ * respect to the length of the branch between dad and dad_branch->node.
+ *
+ * The per-pattern product of the two partial likelihood vectors ("theta")
+ * is cached in theta_all and only recomputed while theta_computed is false.
+ * When unobserved patterns are present, the ascertainment bias correction
+ * is added to both derivatives.
+ *
+ * @param dad_branch neighbor of dad describing the branch
+ * @param dad one end node of the branch
+ * @param[out] df first derivative; set to 0 together with ddf when the
+ *             uncorrected sum is NaN or infinite
+ * @param[out] ddf second derivative
+ */
+void PhyloTree::computeLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    // swap the two ends so that a leaf, if any, ends up as dad
+    if (node->isLeaf()) {
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    // NOTE(review): this calls the Eigen routine directly, whereas
+    // computeLikelihoodBranchEigen goes through computePartialLikelihood()
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodEigen(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodEigen(node_branch, node);
+        
+    size_t nstates = aln->num_states;
+    size_t ncat = site_rate->getNRate();
+
+    size_t block = ncat * nstates;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+	assert(theta_all);
+	if (!theta_computed) {
+		// precompute theta for fast branch length optimization
+
+	    if (dad->isLeaf()) {
+	    	// special treatment for TIP-INTERNAL NODE case
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i) schedule(static)
+#endif
+	    	for (ptn = 0; ptn < nptn; ptn++) {
+				double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+				double *theta = theta_all + ptn*block;
+				double *lh_tip = tip_partial_lh + ((int)((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] :  model_factory->unobserved_ptns[ptn-orig_nptn]))*nstates;
+				// the same tip vector is reused for every rate category
+				for (i = 0; i < block; i++) {
+					theta[i] = lh_tip[i%nstates] * partial_lh_dad[i];
+				}
+
+			}
+			// ascertainment bias correction
+	    } else {
+	    	// both dad and node are internal nodes
+
+//	    	size_t all_entries = nptn*block;
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i) schedule(static)
+#endif
+	    	for (ptn = 0; ptn < nptn; ptn++) {
+				double *theta = theta_all + ptn*block;
+			    double *partial_lh_node = node_branch->partial_lh + ptn*block;
+			    double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+//			    double theta_max = 0.0;
+	    		for (i = 0; i < block; i++) {
+	    			theta[i] = partial_lh_node[i] * partial_lh_dad[i];
+//	    			theta_max = max(theta_max, fabs(theta[i]));
+	    		}
+//	    		if (theta_max <= 0) {
+//	    			// numerical underflow, recompute theta
+//	    			for (i = 0; i < block; i++) {
+//	    				partial_lh_node[i] *= SCALING_THRESHOLD_INVER;
+//		    			theta[i] = partial_lh_node[i] * partial_lh_dad[i];
+//	    			}
+//	    			node_branch->lh_scale_factor += LOG_SCALING_THRESHOLD*ptn_freq[ptn];
+//	    			node_branch->scale_num[ptn] += 1;
+//	    		}
+			}
+	    }
+		theta_computed = true;
+	}
+
+    // val0 = prop * exp(cof*length), val1 and val2 are its first and second
+    // derivatives with respect to the branch length (cof = eigenvalue * rate)
+    double *val0 = new double[block];
+    double *val1 = new double[block];
+    double *val2 = new double[block];
+	for (c = 0; c < ncat; c++) {
+		double prop = site_rate->getProp(c);
+		for (i = 0; i < nstates; i++) {
+			double cof = eval[i]*site_rate->getRate(c);
+			double val = exp(cof*dad_branch->length) * prop;
+			double val1_ = cof*val;
+			val0[c*nstates+i] = val;
+			val1[c*nstates+i] = val1_;
+			val2[c*nstates+i] = cof*val1_;
+		}
+	}
+
+
+    double my_df = 0.0, my_ddf = 0.0, prob_const = 0.0, df_const = 0.0, ddf_const = 0.0;
+//    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: my_df, my_ddf, prob_const, df_const, ddf_const) private(ptn, i) schedule(static)
+#endif
+    for (ptn = 0; ptn < nptn; ptn++) {
+		double lh_ptn = ptn_invar[ptn], df_ptn = 0.0, ddf_ptn = 0.0;
+		double *theta = theta_all + ptn*block;
+		for (i = 0; i < block; i++) {
+			lh_ptn += val0[i] * theta[i];
+			df_ptn += val1[i] * theta[i];
+			ddf_ptn += val2[i] * theta[i];
+		}
+
+//        assert(lh_ptn > 0.0);
+        lh_ptn = fabs(lh_ptn);
+        
+        if (ptn < orig_nptn) {
+			// observed pattern: derivatives of freq * log(lh_ptn)
+			double df_frac = df_ptn / lh_ptn;
+			double ddf_frac = ddf_ptn / lh_ptn;
+			double freq = ptn_freq[ptn];
+			double tmp1 = df_frac * freq;
+			double tmp2 = ddf_frac * freq;
+			my_df += tmp1;
+			my_ddf += tmp2 - tmp1 * df_frac;
+		} else {
+			// ascertainment bias correction
+			prob_const += lh_ptn;
+			df_const += df_ptn;
+			ddf_const += ddf_ptn;
+		}
+    }
+	df = my_df;
+	ddf = my_ddf;
+    if (isnan(df) || isinf(df)) {
+        df = 0.0;
+        ddf = 0.0;
+//        outWarning("Numerical instability (some site-likelihood = 0)");
+    }
+
+	if (orig_nptn < nptn) {
+    	// ascertainment bias correction
+    	// derivatives of -nsites * log(1 - prob_const)
+    	prob_const = 1.0 - prob_const;
+    	double df_frac = df_const / prob_const;
+    	double ddf_frac = ddf_const / prob_const;
+    	int nsites = aln->getNSite();
+    	df += nsites * df_frac;
+    	ddf += nsites *(ddf_frac + df_frac*df_frac);
+    }
+
+
+    delete [] val2;
+    delete [] val1;
+    delete [] val0;
+}
+
+//template <const int nstates>
+/**
+ * Compute the log-likelihood of the tree on the branch between dad and
+ * dad_branch->node.
+ *
+ * Side effects: fills _pattern_lh with the per-pattern log-likelihood and
+ * _pattern_lh_cat with the per-pattern, per-category likelihood. Partial
+ * likelihoods on both sides of the branch are computed on demand.
+ *
+ * If the sum is NaN or infinite, a warning listing up to ten affected sites
+ * is printed, the offending patterns are given LOG_SCALING_THRESHOLD*4, and
+ * the sum is rebuilt. When unobserved patterns are present, the
+ * ascertainment bias correction is applied to _pattern_lh and to the result.
+ *
+ * @param dad_branch neighbor of dad describing the branch
+ * @param dad one end node of the branch
+ * @return the tree log-likelihood
+ */
+double PhyloTree::computeLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    // swap the two ends so that a leaf, if any, ends up as dad
+    if (node->isLeaf()) {
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihoodEigen(dad_branch, dad);
+        computePartialLikelihood(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+//        computePartialLikelihoodEigen(node_branch, node);
+        computePartialLikelihood(node_branch, node);
+    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+    size_t nstates = aln->num_states;
+    size_t ncat = site_rate->getNRate();
+
+    size_t block = ncat * nstates;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+    // val[c][i] = category proportion * exp(eigenvalue i * rate c * branch length)
+    double *val = new double[block];
+	for (c = 0; c < ncat; c++) {
+		double len = site_rate->getRate(c)*dad_branch->length;
+		double prop = site_rate->getProp(c);
+		for (i = 0; i < nstates; i++)
+			val[c*nstates+i] = exp(eval[i]*len) * prop;
+	}
+
+	double prob_const = 0.0;
+	memset(_pattern_lh_cat, 0, nptn*ncat*sizeof(double));
+
+    if (dad->isLeaf()) {
+    	// special treatment for TIP-INTERNAL NODE case
+    	// NOTE(review): the table is filled only for the states in
+    	// aln->seq_states[dad->id] plus STATE_UNKNOWN, yet it is also indexed
+    	// with the states of unobserved patterns -- confirm those are covered.
+    	double *partial_lh_node = new double[(aln->STATE_UNKNOWN+1)*block];
+    	IntVector states_dad = aln->seq_states[dad->id];
+    	states_dad.push_back(aln->STATE_UNKNOWN);
+    	// precompute information from one tip
+    	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
+    		double *lh_node = partial_lh_node +(*it)*block;
+    		double *lh_tip = tip_partial_lh + (*it)*nstates;
+    		double *val_tmp = val;
+			for (c = 0; c < ncat; c++) {
+				for (i = 0; i < nstates; i++) {
+					  lh_node[i] = val_tmp[i] * lh_tip[i];
+				}
+				lh_node += nstates;
+				val_tmp += nstates;
+			}
+    	}
+
+    	// now do the real computation
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c) schedule(static)
+#endif
+    	for (ptn = 0; ptn < nptn; ptn++) {
+			double lh_ptn = ptn_invar[ptn];
+			double *lh_cat = _pattern_lh_cat + ptn*ncat;
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			int state_dad = (ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
+			double *lh_node = partial_lh_node + state_dad*block;
+			for (c = 0; c < ncat; c++) {
+				for (i = 0; i < nstates; i++) {
+					*lh_cat += lh_node[i] * partial_lh_dad[i];
+				}
+				lh_node += nstates;
+				partial_lh_dad += nstates;
+				lh_ptn += *lh_cat;
+				lh_cat++;
+			}
+//			assert(lh_ptn > -1e-10);
+			if (ptn < orig_nptn) {
+				lh_ptn = log(fabs(lh_ptn));
+				_pattern_lh[ptn] = lh_ptn;
+				tree_lh += lh_ptn * ptn_freq[ptn];
+			} else {
+				// unobserved pattern: accumulate its probability for the
+				// ascertainment bias correction
+				prob_const += lh_ptn;
+			}
+		}
+		delete [] partial_lh_node;
+    } else {
+    	// both dad and node are internal nodes
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c) schedule(static)
+#endif
+    	for (ptn = 0; ptn < nptn; ptn++) {
+			double lh_ptn = ptn_invar[ptn];
+			double *lh_cat = _pattern_lh_cat + ptn*ncat;
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_node = node_branch->partial_lh + ptn*block;
+			double *val_tmp = val;
+			for (c = 0; c < ncat; c++) {
+				for (i = 0; i < nstates; i++) {
+					*lh_cat +=  val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
+				}
+				lh_ptn += *lh_cat;
+				partial_lh_node += nstates;
+				partial_lh_dad += nstates;
+				val_tmp += nstates;
+				lh_cat++;
+			}
+
+//			assert(lh_ptn > 0.0);
+            if (ptn < orig_nptn) {
+				lh_ptn = log(fabs(lh_ptn));
+				_pattern_lh[ptn] = lh_ptn;
+				tree_lh += lh_ptn * ptn_freq[ptn];
+			} else {
+				prob_const += lh_ptn;
+			}
+		}
+    }
+
+    if (isnan(tree_lh) || isinf(tree_lh)) {
+        cout << "WARNING: Numerical underflow caused by alignment sites";
+        i = aln->getNSite();
+        int j;
+        // list at most ten affected sites (c counts the ones printed)
+        for (j = 0, c = 0; j < i; j++) {
+            ptn = aln->getPatternID(j);
+            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                cout << " " << j+1;
+                c++;
+                if (c >= 10) {
+                    cout << " ...";
+                    break;
+                }
+            }
+        }
+        cout << endl;
+        // rebuild the sum from the scale factors of the branch being evaluated
+        // (the same two neighbors used for the initial value of tree_lh), not
+        // from current_it/current_it_back, which may refer to another branch
+        tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+        for (ptn = 0; ptn < orig_nptn; ptn++) {
+            if (isnan(_pattern_lh[ptn]) || isinf(_pattern_lh[ptn])) {
+                _pattern_lh[ptn] = LOG_SCALING_THRESHOLD*4; // log(2^(-1024))
+            }
+            tree_lh += _pattern_lh[ptn] * ptn_freq[ptn];
+        }
+    }
+
+    if (orig_nptn < nptn) {
+    	// ascertainment bias correction
+        assert(prob_const < 1.0);
+    	prob_const = log(1.0 - prob_const);
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+    	tree_lh -= aln->getNSite()*prob_const;
+		assert(!isnan(tree_lh) && !isinf(tree_lh));
+    }
+
+	assert(!isnan(tree_lh) && !isinf(tree_lh));
+
+    delete [] val;
+    return tree_lh;
+}
+
+
+/************************************************************************************************
+ *
+ *   SSE vectorized functions of the Naive implementation
+ *
+ *************************************************************************************************/
+
+/**
+ * Compute the log-likelihood of the tree on the branch between dad and
+ * dad_branch->node using the Eigen-library vectorised version of the naive
+ * kernel, specialised for NSTATES states.
+ *
+ * Side effects: fills _pattern_lh with the per-pattern log-likelihood.
+ * Partial likelihoods on both sides of the branch are computed on demand.
+ * When unobserved patterns are present, the ascertainment bias correction
+ * is applied to _pattern_lh and to the result.
+ *
+ * @param dad_branch neighbor of dad describing the branch
+ * @param dad one end node of the branch
+ * @return the tree log-likelihood
+ */
+template<const int NSTATES>
+inline double PhyloTree::computeLikelihoodBranchSSE(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node; // Node A
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad); // Node B
+    assert(node_branch);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    // swap node and dad if dad is a leaf
+    if (node->isLeaf()) {
+        PhyloNode *tmp_node = dad;
+        dad = node;
+        node = tmp_node;
+        PhyloNeighbor *tmp_nei = dad_branch;
+        dad_branch = node_branch;
+        node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodSSE<NSTATES>(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodSSE<NSTATES>(node_branch, node);
+
+    // now combine likelihood at the branch
+    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;
+    int ptn, cat, state1, state2;
+    double p_invar = site_rate->getPInvar();
+    int numCat = site_rate->getNRate();
+    int numStates = model->num_states;
+    int tranSize = numStates * numStates;
+    int alnSize = aln->size() + model_factory->unobserved_ptns.size();
+    int orig_alnSize = aln->size();
+    int block = numStates * numCat;
+
+    double p_var_cat = (1.0 - p_invar) / (double) numCat;
+
+    // one spare double so the working pointer can be shifted onto a
+    // 16-byte boundary when the allocation is not already aligned
+    EIGEN_ALIGN16 double *trans_mat_orig = new double[numCat * tranSize + 1];
+    double *trans_mat = trans_mat_orig;
+    if (((intptr_t) trans_mat) % 16 != 0)
+        trans_mat = trans_mat + 1;
+    EIGEN_ALIGN16 double state_freq[NSTATES];
+    model->getStateFrequency(state_freq);
+    // per-category transition matrix with each row scaled by its state frequency
+    for (cat = 0; cat < numCat; cat++) {
+        double *trans_cat = trans_mat + (cat * tranSize);
+        model_factory->computeTransMatrix(dad_branch->length * site_rate->getRate(cat), trans_cat);
+        for (state1 = 0; state1 < NSTATES; state1++) {
+            double *trans_mat_state = trans_cat + (state1 * NSTATES);
+            for (state2 = 0; state2 < NSTATES; state2++)
+                trans_mat_state[state2] *= state_freq[state1];
+        }
+    }
+
+    double prob_const = 0.0; // probability of unobserved const patterns
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, cat) schedule(static)
+#endif
+    for (ptn = 0; ptn < alnSize; ++ptn) {
+        double lh_ptn = 0.0; // likelihood of the pattern
+        for (cat = 0; cat < numCat; cat++) {
+            // declared inside the loop body so each thread has its own copy;
+            // at function scope they would be shared across the OpenMP threads
+            double *partial_lh_site = node_branch->partial_lh + (ptn * block + cat * NSTATES);
+            double *partial_lh_child = dad_branch->partial_lh + (ptn * block + cat * NSTATES);
+            double *trans_state = trans_mat + cat * tranSize;
+            Map<Matrix<double, 1, NSTATES>, Aligned> eigen_partial_lh_child(&partial_lh_child[0]);
+            Map<Matrix<double, 1, NSTATES>, Aligned> eigen_partial_lh_site(&partial_lh_site[0]);
+            Map<Matrix<double, NSTATES, NSTATES>, Aligned> eigen_trans_state(&trans_state[0]);
+            lh_ptn += (eigen_partial_lh_child * eigen_trans_state).dot(eigen_partial_lh_site);
+        }
+        if (ptn < orig_alnSize) {
+			lh_ptn *= p_var_cat;
+			// add the invariant-site contribution for constant patterns
+			if ((*aln)[ptn].const_char == NSTATES)
+				lh_ptn += p_invar;
+			else if ((*aln)[ptn].const_char < NSTATES) {
+				lh_ptn += p_invar * state_freq[(int) (*aln)[ptn].const_char];
+			}
+			lh_ptn = log(lh_ptn);
+			tree_lh += lh_ptn * (aln->at(ptn).frequency);
+			_pattern_lh[ptn] = lh_ptn;
+			// BQM: pattern_lh contains the LOG-likelihood, not likelihood
+        } else {
+			// unobserved pattern: accumulate its probability for the
+			// ascertainment bias correction
+			lh_ptn = lh_ptn*p_var_cat + p_invar*state_freq[(int)model_factory->unobserved_ptns[ptn-orig_alnSize]];
+			prob_const += lh_ptn;
+
+        }
+    }
+    if (orig_alnSize < alnSize) {
+    	// ascertainment bias correction
+    	prob_const = log(1.0 - prob_const);
+    	for (ptn = 0; ptn < orig_alnSize; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+    	tree_lh -= aln->getNSite()*prob_const;
+    }
+
+    delete[] trans_mat_orig;
+    return tree_lh;
+}
+/**
+ * Compute the partial likelihood vector stored on dad_branch (the subtree
+ * reached through dad_branch when coming from dad) and set bit 0 of
+ * dad_branch->partial_lh_computed. Returns at once when that bit is already
+ * set.
+ *
+ * Layout of dad_branch->partial_lh: for each pattern, numCat consecutive
+ * groups of NSTATES doubles (block = numStates * numCat doubles per pattern).
+ * Patterns are the aln->size() alignment patterns followed by the entries of
+ * model_factory->unobserved_ptns.
+ *
+ * Also resets and accumulates dad_branch->lh_scale_factor and fills
+ * dad_branch->scale_num.
+ *
+ * @tparam NSTATES state count used for the fixed-size Eigen maps. The code
+ *         mixes it with model->num_states when indexing, so the two are
+ *         presumably equal -- TODO confirm against the callers.
+ * @param dad_branch neighbor record whose buffers are written
+ * @param dad node the traversal comes from; when it is NULL a leaf is handled
+ *        by the internal-node branch
+ */
+template<int NSTATES>
+void PhyloTree::computePartialLikelihoodSSE(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    // don't recompute the likelihood
+    if (dad_branch->partial_lh_computed & 1)
+        return;
+    Node *node = dad_branch->node;
+    int ptn, cat;
+    //double *trans_state;
+    double *partial_lh_site;
+    double *partial_lh_child;
+    dad_branch->lh_scale_factor = 0.0;
+
+    int numCat = site_rate->getNRate();
+    int numStates = model->num_states;
+    int tranSize = numStates * numStates;
+    int alnSize = aln->size() + model_factory->unobserved_ptns.size();
+    int orig_alnSize = aln->size();
+    int block = numStates * numCat; // doubles per pattern
+    size_t lh_size = alnSize * block; // NOTE(review): the product is evaluated in int before widening to size_t
+    memset(dad_branch->scale_num, 0, alnSize * sizeof(UBYTE)); // every pattern starts with a scale count of zero
+
+    if (node->isLeaf() && dad) {
+        // external node: build the tip vectors directly from the observed states
+        memset(dad_branch->partial_lh, 0, lh_size * sizeof(double));
+        for (ptn = 0; ptn < alnSize; ++ptn) {
+            char state;
+            partial_lh_site = dad_branch->partial_lh + (ptn * block);
+
+            if (node->name == ROOT_NAME) {
+                state = aln->STATE_UNKNOWN; // a leaf named ROOT_NAME carries no data
+            } else if (ptn < orig_alnSize){
+                state = (aln->at(ptn))[node->id];
+            } else {
+            	state = model_factory->unobserved_ptns[ptn-orig_alnSize];
+            }
+
+            if (state == aln->STATE_UNKNOWN) {
+#ifndef KEEP_GAP_LH
+                dad_branch->scale_num[ptn] = -1; // negative = "unknown here", tested with < 0 below; NOTE(review): only works if UBYTE is a signed type -- confirm
+#endif
+                for (int state2 = 0; state2 < block; state2++) {
+                    partial_lh_site[state2] = 1.0;
+                }
+            } else if (state < NSTATES) {
+                // unambiguous state: set its entry to 1 in every category
+                double *_par_lh_site = partial_lh_site + state;
+                for (cat = 0; cat < numCat; cat++) {
+                    *_par_lh_site = 1.0;
+                    _par_lh_site += NSTATES;
+                }
+            } else if (aln->seq_type == SEQ_DNA) {
+                // ambiguous character, for DNA, RNA: (state - (NSTATES - 1)) is read as a bit mask over the states
+                state = state - (NSTATES - 1);
+                for (int state2 = 0; state2 < NSTATES; state2++)
+                    if (state & (1 << state2)) {
+                        for (cat = 0; cat < numCat; cat++)
+                            partial_lh_site[cat * NSTATES + state2] = 1.0;
+                    }
+            } else if (aln->seq_type == SEQ_PROTEIN) {
+                // ambiguous character, for protein: three extra codes, each standing for two states
+                state = state - (NSTATES);
+                assert(state < 3);
+                int state_map[] = {4+8,32+64,512+1024}; // bit masks selecting state indices {2,3}, {5,6}, {9,10}
+                for (int state2 = 0; state2 < 11; state2++) // 11 covers bit 10, the highest one used in state_map
+                    if (state_map[(int)state] & (1 << state2)) {
+                        for (cat = 0; cat < numCat; cat++)
+                            partial_lh_site[cat * NSTATES + state2] = 1.0;
+                    }
+            } else {
+            	outError("Internal error ", __func__);
+            }
+        }
+    } else {
+        // internal node: multiply in the contribution of every child
+        EIGEN_ALIGN16 double *trans_mat_orig = new double[numCat * tranSize + 2]; // spare doubles leave room for the shift below
+        double *trans_mat = trans_mat_orig;
+        if (((intptr_t) trans_mat) % 16 != 0) // step one double forward when the address is not a multiple of 16
+            trans_mat = trans_mat + 1;
+        for (ptn = 0; ptn < lh_size; ++ptn)
+            dad_branch->partial_lh[ptn] = 1.0; // neutral element for the products below
+#ifndef KEEP_GAP_LH
+        for (ptn = 0; ptn < alnSize; ptn++)
+            dad_branch->scale_num[ptn] = -1; // stays negative until a child with data for the pattern is seen
+#endif
+        FOR_NEIGHBOR_IT(node, dad, it)if ((*it)->node->name != ROOT_NAME) { // neighbors yielded by the macro, except one named ROOT_NAME
+            computePartialLikelihoodSSE<NSTATES > ((PhyloNeighbor*) (*it), (PhyloNode*) node);
+            dad_branch->lh_scale_factor += ((PhyloNeighbor*) (*it))->lh_scale_factor;
+            for (cat = 0; cat < numCat; cat++) {
+                model_factory->computeTransMatrix((*it)->length * site_rate->getRate(cat), &trans_mat[cat * tranSize]);
+            }
+            partial_lh_site = dad_branch->partial_lh;
+            partial_lh_child = ((PhyloNeighbor*) (*it))->partial_lh;
+            double sum_scale = 0.0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, cat, partial_lh_site, partial_lh_child) schedule(static)
+#endif
+            for (ptn = 0; ptn < alnSize; ++ptn)
+#ifndef KEEP_GAP_LH
+            if (((PhyloNeighbor*) (*it))->scale_num[ptn] < 0) { // child has no data for this pattern: leave the product untouched
+#ifndef _OPENMP
+                partial_lh_site += NSTATES * numCat; // sequential build: keep the running pointers in step
+                partial_lh_child += NSTATES * numCat;
+#endif
+            } else
+#endif
+            {
+#ifndef KEEP_GAP_LH
+                if (dad_branch->scale_num[ptn] < 0)
+                dad_branch->scale_num[ptn] = 0;
+#endif
+#ifdef _OPENMP
+                int lh_offset = ptn*block; // parallel build: derive the pointers from ptn instead of walking them
+                partial_lh_site = dad_branch->partial_lh + lh_offset;
+                partial_lh_child = ((PhyloNeighbor*) (*it))->partial_lh + lh_offset;
+#endif
+                dad_branch->scale_num[ptn] += ((PhyloNeighbor*) (*it))->scale_num[ptn];
+                double *partial_lh_block = partial_lh_site; // start of this pattern's block, kept for the scaling test
+                double *trans_state = trans_mat;
+                bool do_scale = true;
+                for (cat = 0; cat < numCat; cat++)
+                {
+                    MappedRowVec(NSTATES) ei_partial_lh_child(partial_lh_child);
+                    MappedRowVec(NSTATES) ei_partial_lh_site(partial_lh_site);
+                    MappedMat(NSTATES) ei_trans_state(trans_state);
+                    ei_partial_lh_site.array() *= (ei_partial_lh_child * ei_trans_state).array(); // element-wise product with (child vector * transition matrix)
+                    partial_lh_site += NSTATES;
+                    partial_lh_child += NSTATES;
+                    trans_state += tranSize;
+                }
+                for (cat = 0; cat < block; cat++) // cat is reused as a plain index over the whole block
+                if (partial_lh_block[cat] > SCALING_THRESHOLD) {
+                    do_scale = false;
+                    break;
+                }
+                if (do_scale) {
+                    // unobserved const pattern will never have underflow
+                    Map<VectorXd, Aligned> ei_lh_block(partial_lh_block, block);
+                    ei_lh_block *= SCALING_THRESHOLD_INVER;
+                    sum_scale += LOG_SCALING_THRESHOLD *  (*aln)[ptn].frequency; // NOTE(review): ptn can be >= aln->size() here; safe only if the comment above always holds
+                    dad_branch->scale_num[ptn] += 1;
+                }
+            }
+            dad_branch->lh_scale_factor += sum_scale;
+        }
+        delete[] trans_mat_orig;
+    }
+
+    dad_branch->partial_lh_computed |= 1;
+}
+
+/****************************************************************************
+ computing derivatives of likelihood function
+
+ computeLikelihoodDervSSE: for the branch given by dad_branch/dad, accumulate
+ over all alignment patterns freq * (L'/L) into df and
+ freq * (L''/L - (L'/L)^2) into ddf. L, L' and L'' are built from the three
+ matrices filled by model_factory->computeTransDervFreq (presumably the
+ transition matrix and its first and second derivative with respect to the
+ branch length -- TODO confirm). When model_factory->unobserved_ptns is not
+ empty, an ascertainment bias term is added to both outputs.
+
+ dad_branch, dad : branch to evaluate; swapped internally when the node
+                   behind dad_branch is a leaf, so that the leaf becomes dad
+ df, ddf         : outputs, overwritten
+
+ Missing partial likelihoods on either side of the branch are computed first.
+ ****************************************************************************/
+template<int NSTATES>
+inline void PhyloTree::computeLikelihoodDervSSE(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    //assert(node_branch);
+    // swap node and dad if node is a leaf
+    if (node->isLeaf()) {
+        PhyloNode *tmp_node = dad;
+        dad = node;
+        node = tmp_node;
+        PhyloNeighbor *tmp_nei = dad_branch;
+        dad_branch = node_branch;
+        node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodSSE<NSTATES>(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihoodSSE<NSTATES>(node_branch, node);
+    df = ddf = 0.0;
+    int cat = 0;
+    double *partial_lh_site = node_branch->partial_lh;
+    double *partial_lh_child = dad_branch->partial_lh;
+    double lh_ptn; // likelihood of the pattern
+    double lh_ptn_derv1; // same sum as lh_ptn, taken with the first-derivative matrices
+    double lh_ptn_derv2; // same sum as lh_ptn, taken with the second-derivative matrices
+    double derv1_frac;
+    double derv2_frac;
+    double *trans_state;
+    double *derv1_state;
+    double *derv2_state;
+    double p_invar = site_rate->getPInvar();
+
+    int numCat = site_rate->getNRate();
+    int numStates = model->num_states;
+    int tranSize = numStates * numStates;
+    int alnSize = aln->size() + model_factory->unobserved_ptns.size();
+    int orig_alnSize = aln->size();
+
+    double p_var_cat = (1.0 - p_invar) / (double) numCat;
+    double state_freq[NSTATES];
+    model->getStateFrequency(state_freq);
+    double *trans_mat_orig  = new double[numCat * tranSize + 1]; // one spare double for the alignment shift below
+    double *trans_derv1_orig  = new double[numCat * tranSize + 1];
+    double *trans_derv2_orig  = new double[numCat * tranSize + 1];
+    // make alignment 16: step one double forward when new[] returned an address that is not a multiple of 16
+    double *trans_mat = trans_mat_orig, *trans_derv1 = trans_derv1_orig, *trans_derv2 = trans_derv2_orig;
+    if (((intptr_t) trans_mat) % 16 != 0)
+        trans_mat = trans_mat + 1;
+    if (((intptr_t) trans_derv1) % 16 != 0)
+        trans_derv1 = trans_derv1 + 1;
+    if (((intptr_t) trans_derv2) % 16 != 0)
+        trans_derv2 = trans_derv2 + 1;
+
+    int discrete_cat = site_rate->getNDiscreteRate();
+    if (!site_rate->isSiteSpecificRate()) // NOTE(review): matrices are filled for discrete_cat categories only in this case, but read for numCat categories below in every case -- confirm
+        for (cat = 0; cat < discrete_cat; cat++) {
+            double *trans_cat = trans_mat + (cat * tranSize);
+            double *derv1_cat = trans_derv1 + (cat * tranSize);
+            double *derv2_cat = trans_derv2 + (cat * tranSize);
+            double rate_val = site_rate->getRate(cat);
+            model_factory->computeTransDervFreq(dad_branch->length, rate_val, state_freq, trans_cat, derv1_cat,
+                    derv2_cat);
+        }
+    int dad_state = aln->STATE_UNKNOWN;
+    double my_df = 0.0;
+    double my_ddf = 0.0;
+    double prob_const = 0.0, prob_const_derv1 = 0.0, prob_const_derv2 = 0.0; // sums over the unobserved patterns
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: my_df, my_ddf,prob_const, prob_const_derv1, prob_const_derv2) \
+	private(cat, partial_lh_child, partial_lh_site,\
+	lh_ptn, lh_ptn_derv1, lh_ptn_derv2, derv1_frac, derv2_frac, dad_state, trans_state, derv1_state, derv2_state) schedule(static)
+#endif
+    for (int ptn = 0; ptn < alnSize; ++ptn) {
+#ifdef _OPENMP
+        int lh_offset = ptn*numCat*numStates; // parallel build: derive the pointers from ptn; the sequential build walks them instead
+        partial_lh_site = node_branch->partial_lh + lh_offset;
+        partial_lh_child = dad_branch->partial_lh + lh_offset;
+#endif
+        lh_ptn = 0.0;
+        lh_ptn_derv1 = 0.0;
+        lh_ptn_derv2 = 0.0;
+        int padding = 0;
+        dad_state = aln->STATE_UNKNOWN; // FOR TUNG: This is missing in your codes!
+        if (dad->isLeaf()) {
+        	if (ptn < orig_alnSize)
+        		dad_state = (*aln)[ptn][dad->id];
+        	else
+        		dad_state = model_factory->unobserved_ptns[ptn-orig_alnSize];
+        }
+        padding = dad_state * NSTATES; // offset of row dad_state inside one category's matrix
+        if (dad_state < NSTATES) {
+            // external node with an unambiguous state: only row dad_state of each matrix is used
+            trans_state = trans_mat + padding;
+            derv1_state = trans_derv1 + padding;
+            derv2_state = trans_derv2 + padding;
+            for (cat = 0; cat < numCat; cat++) {
+                MappedVec(NSTATES)ei_partial_lh_child(partial_lh_child);
+                MappedVec(NSTATES) ei_trans_state(trans_state);
+                MappedVec(NSTATES) ei_derv1_state(derv1_state);
+                MappedVec(NSTATES) ei_derv2_state(derv2_state);
+                lh_ptn += ei_partial_lh_child.dot(ei_trans_state);
+                lh_ptn_derv1 += ei_partial_lh_child.dot(ei_derv1_state);
+                lh_ptn_derv2 += ei_partial_lh_child.dot(ei_derv2_state);
+                partial_lh_child += NSTATES;
+                partial_lh_site += NSTATES; // not read in this branch; advanced to stay in step with partial_lh_child
+                trans_state += tranSize;
+                derv1_state += tranSize;
+                derv2_state += tranSize;
+            }
+        } else {
+            // internal node, or external node but ambiguous character
+            trans_state = trans_mat;
+            derv1_state = trans_derv1;
+            derv2_state = trans_derv2;
+            for (cat = 0; cat < numCat; cat++) {
+                MappedRowVec(NSTATES) ei_partial_lh_site(partial_lh_site);
+                MappedRowVec(NSTATES) ei_partial_lh_child(partial_lh_child);
+                MappedMat(NSTATES) ei_trans_state(trans_state);
+                MappedMat(NSTATES) ei_derv1_state(derv1_state);
+                MappedMat(NSTATES) ei_derv2_state(derv2_state);
+                lh_ptn += (ei_partial_lh_child * ei_trans_state).dot(ei_partial_lh_site);
+                lh_ptn_derv1 += (ei_partial_lh_child * ei_derv1_state).dot(ei_partial_lh_site);
+                lh_ptn_derv2 += (ei_partial_lh_child * ei_derv2_state).dot(ei_partial_lh_site);
+                partial_lh_site += NSTATES;
+                partial_lh_child += NSTATES;
+                trans_state += tranSize;
+                derv1_state += tranSize;
+                derv2_state += tranSize;
+            }
+        }
+        if (ptn < orig_alnSize) { // pattern taken from the alignment
+			lh_ptn = lh_ptn * p_var_cat;
+			if ((*aln)[ptn].const_char == NSTATES)
+				lh_ptn += p_invar;
+			else if ((*aln)[ptn].const_char < NSTATES) {
+				lh_ptn += p_invar * state_freq[(int) (*aln)[ptn].const_char];
+			}
+			double pad = p_var_cat / lh_ptn; // common factor p_var_cat / L applied to both derivative sums
+			if (std::isinf(pad)) { // quotient overflowed: scale the sums first, then divide
+				lh_ptn_derv1 *= p_var_cat;
+				lh_ptn_derv2 *= p_var_cat;
+				derv1_frac = lh_ptn_derv1 / lh_ptn;
+				derv2_frac = lh_ptn_derv2 / lh_ptn;
+			} else {
+				derv1_frac = lh_ptn_derv1 * pad;
+				derv2_frac = lh_ptn_derv2 * pad;
+			}
+	        double freq = aln->at(ptn).frequency;
+			double tmp1 = derv1_frac * freq;
+			double tmp2 = derv2_frac * freq;
+			my_df += tmp1;
+			my_ddf += tmp2 - tmp1 * derv1_frac; // freq * (L''/L - (L'/L)^2)
+        } else { // entry of unobserved_ptns: only feeds the ascertainment sums
+        	lh_ptn = lh_ptn*p_var_cat + p_invar*state_freq[(int)model_factory->unobserved_ptns[ptn-orig_alnSize]];
+        	prob_const += lh_ptn;
+        	prob_const_derv1 += lh_ptn_derv1 * p_var_cat;
+        	prob_const_derv2 += lh_ptn_derv2 * p_var_cat;
+        }
+    }
+    if (orig_alnSize < alnSize) {
+    	// ascertainment bias correction
+    	prob_const = 1.0 - prob_const; // from here on: one minus the summed likelihood of the unobserved patterns
+    	derv1_frac = prob_const_derv1 / prob_const;
+    	derv2_frac = prob_const_derv2 / prob_const;
+    	int nsites = aln->getNSite();
+    	my_df += nsites * derv1_frac;
+    	my_ddf += nsites *(derv2_frac + derv1_frac*derv1_frac);
+    }
+
+    delete[] trans_derv2_orig;
+    delete[] trans_derv1_orig;
+    delete[] trans_mat_orig;
+    df = my_df;
+    ddf = my_ddf;
+}
+
+
+/************************************************************************************************
+ *
+ *   non-vectorized fused mixture and rate likelihood functions
+ *
+ *************************************************************************************************/
+/**
+ * Compute the partial likelihood vector for dad_branch and mark it as
+ * computed (bit 0 of partial_lh_computed). Does nothing when the bit is
+ * already set.
+ *
+ * Every rate category c uses its own slice of the model's eigen system
+ * (eval at c*nstates, evec and inv_evec at c*nstates*nstates); the assert
+ * below requires site_rate->getNRate() == model->getNMixtures().
+ *
+ * For a leaf only lh_scale_factor is reset and computeTipPartialLikelihood()
+ * is called if tip_partial_lh_computed is false. For an internal node
+ * (degree 3 is asserted) each child vector is multiplied by
+ * evec * exp(eval * rate * length), the two results are multiplied state by
+ * state, and the product is multiplied by inv_evec before it is stored in
+ * dad_branch->partial_lh. Leaf children use the tip_partial_lh table instead
+ * of a stored vector.
+ *
+ * @param dad_branch neighbor record to fill (partial_lh, scale_num,
+ *        lh_scale_factor)
+ * @param dad node the traversal comes from; asserted to be non-NULL
+ */
+//template <const int nstates>
+void PhyloTree::computeMixratePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    // don't recompute the likelihood
+	assert(dad);
+    if (dad_branch->partial_lh_computed & 1)
+        return;
+    dad_branch->partial_lh_computed |= 1; // flag is set up front; the buffers are filled below
+
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+	if (node->isLeaf()) {
+	    dad_branch->lh_scale_factor = 0.0;
+
+		if (!tip_partial_lh_computed)
+			computeTipPartialLikelihood();
+		return;
+	}
+
+    size_t nstates = aln->num_states;
+    size_t ptn, c;
+    size_t orig_ntn = aln->size();
+    size_t ncat = site_rate->getNRate();
+    assert(ncat == model->getNMixtures());
+    const size_t nstatesqr=nstates*nstates;
+    size_t i, x;
+    size_t block = nstates * ncat; // doubles per pattern
+
+	double *evec = model->getEigenvectors();
+	double *inv_evec = model->getInverseEigenvectors();
+	assert(inv_evec && evec);
+	double *eval = model->getEigenvalues();
+
+    dad_branch->lh_scale_factor = 0.0;
+
+	// internal node
+	assert(node->degree() == 3); // it works only for strictly bifurcating tree
+	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+	}
+
+	if (!left->node->isLeaf() && right->node->isLeaf()) { // if exactly one child is a leaf, make it the left one
+		PhyloNeighbor *tmp = left;
+		left = right;
+		right = tmp;
+	}
+	if ((left->partial_lh_computed & 1) == 0)
+		computeMixratePartialLikelihoodEigen(left, node);
+	if ((right->partial_lh_computed & 1) == 0)
+		computeMixratePartialLikelihoodEigen(right, node);
+        
+    if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
+        // re-orient partial_lh: this neighbor has no buffer, so take over the one held by a back-neighbor
+        bool done = false;
+        FOR_NEIGHBOR_IT(node, dad, it2) {
+            PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));
+            if (backnei->partial_lh) {
+                dad_branch->partial_lh = backnei->partial_lh;
+                dad_branch->scale_num = backnei->scale_num;
+                backnei->partial_lh = NULL;
+                backnei->scale_num = NULL;
+                backnei->partial_lh_computed &= ~1; // clear bit
+                done = true;
+                break;
+            }
+        }
+        assert(done && "partial_lh is not re-oriented");
+    }        
+        
+	dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
+	double *eleft = new double[block*nstates], *eright = new double[block*nstates]; // ncat matrices of nstates*nstates doubles each
+
+	// precompute information buffer: entry (x,i) of evec times exp(eval[i] * rate * branch length), per category
+	for (c = 0; c < ncat; c++) {
+		double *expleft = new double[nstates]; // NOTE(review): allocated and freed once per category
+		double *expright = new double[nstates];
+		double len_left = site_rate->getRate(c) * left->length;
+		double len_right = site_rate->getRate(c) * right->length;
+		for (i = 0; i < nstates; i++) {
+			expleft[i] = exp(eval[c*nstates+i]*len_left);
+			expright[i] = exp(eval[c*nstates+i]*len_right);
+		}
+		for (x = 0; x < nstates; x++)
+			for (i = 0; i < nstates; i++) {
+				eleft[c*nstatesqr+x*nstates+i] = evec[c*nstatesqr+x*nstates+i] * expleft[i];
+				eright[c*nstatesqr+x*nstates+i] = evec[c*nstatesqr+x*nstates+i] * expright[i];
+			}
+		delete [] expright;
+		delete [] expleft;
+	}
+
+	if (left->node->isLeaf() && right->node->isLeaf()) {
+		// special treatment for TIP-TIP (cherry) case
+
+		// pre compute information for both tips
+		double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block]; // one row of block doubles per state code, STATE_UNKNOWN included
+		double *partial_lh_right = new double[(aln->STATE_UNKNOWN+1)*block];
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) { // only rows for states listed in seq_states are filled
+			int state = (*it);
+			for (c = 0; c < ncat; c++)
+			for (x = 0; x < nstates; x++) {
+				double vleft = 0.0;
+				for (i = 0; i < nstates; i++) {
+					vleft += eleft[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
+				}
+				partial_lh_left[state*block+c*nstates+x] = vleft;
+			}
+		}
+
+		for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
+			int state = (*it);
+			for (c = 0; c < ncat; c++)
+			for (x = 0; x < nstates; x++) {
+				double vright = 0.0;
+				for (i = 0; i < nstates; i++) {
+					vright += eright[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
+				}
+				partial_lh_right[state*block+c*nstates+x] = vright;
+			}
+		}
+
+		for (x = 0; x < block; x++) { // the STATE_UNKNOWN row is all ones
+			size_t addr = aln->STATE_UNKNOWN * block;
+			partial_lh_left[addr+x] = 1.0;
+			partial_lh_right[addr+x] = 1.0;
+		}
+
+
+		// scale number must be ZERO
+	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+#ifdef _OPENMP
+//#pragma omp parallel for private(ptn, c, x, i, partial_lh_tmp)
+#pragma omp parallel for private(ptn, c, x, i)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+			double partial_lh_tmp[nstates]; // runtime-sized array: nstates is not a compile-time constant here
+			double *partial_lh = dad_branch->partial_lh + ptn*block;
+			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn]; // NOTE(review): for unobserved patterns the row is filled only if that state also occurs in seq_states -- confirm
+			int state_right = (ptn < orig_ntn) ? (aln->at(ptn))[right->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				double *left = partial_lh_left + (state_left*block+c*nstates); // shadows the PhyloNeighbor *left of the enclosing scope
+				double *right = partial_lh_right + (state_right*block+c*nstates); // shadows the PhyloNeighbor *right of the enclosing scope
+				for (x = 0; x < nstates; x++) {
+					partial_lh_tmp[x] = left[x] * right[x];
+				}
+
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i++) {
+					double res = 0.0;
+					for (x = 0; x < nstates; x++) {
+						res += partial_lh_tmp[x]*inv_evec[c*nstatesqr+i*nstates+x];
+					}
+					partial_lh[c*nstates+i] = res;
+				}
+			}
+		}
+		delete [] partial_lh_right;
+		delete [] partial_lh_left;
+	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
+		// special treatment to TIP-INTERNAL NODE case
+		// only take scale_num from the right subtree
+		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+
+		// pre compute information for left tip
+		double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			for (c = 0; c < ncat; c++)
+			for (x = 0; x < nstates; x++) {
+				double vleft = 0.0;
+				for (i = 0; i < nstates; i++) {
+					vleft += eleft[c*nstatesqr+x*nstates+i] * tip_partial_lh[state*block+c*nstates+i];
+				}
+				partial_lh_left[state*block+c*nstates+x] = vleft;
+			}
+		}
+		for (x = 0; x < block; x++) {
+			size_t addr = aln->STATE_UNKNOWN * block;
+			partial_lh_left[addr+x] = 1.0;
+		}
+
+
+		double sum_scale = 0.0;
+#ifdef _OPENMP
+//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, partial_lh_tmp)
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+			double partial_lh_tmp[nstates];
+			double *partial_lh = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_right = right->partial_lh + ptn*block;
+			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+            double lh_max = 0.0; // largest absolute entry written for this pattern
+
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				for (x = 0; x < nstates; x++) {
+					double vleft = 0.0, vright = 0.0;
+					size_t addr = c*nstatesqr+x*nstates;
+					vleft = partial_lh_left[state_left*block+c*nstates+x]; // tip side: looked up from the per-state table
+					for (i = 0; i < nstates; i++) {
+						vright += eright[addr+i] * partial_lh_right[c*nstates+i];
+					}
+					partial_lh_tmp[x] = vleft * (vright);
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i++) {
+					double res = 0.0;
+					for (x = 0; x < nstates; x++) {
+						res += partial_lh_tmp[x]*inv_evec[c*nstatesqr+i*nstates+x];
+					}
+					partial_lh[c*nstates+i] = res;
+                    lh_max = max(fabs(res), lh_max);
+				}
+			}
+            if (lh_max < SCALING_THRESHOLD) {
+				// now do the likelihood scaling
+				for (i = 0; i < block; i++) {
+					partial_lh[i] *= SCALING_THRESHOLD_INVER;
+				}
+				// unobserved const pattern will never have underflow
+				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1;
+            }
+
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+		delete [] partial_lh_left;
+
+	} else {
+		// both left and right are internal node
+
+		double sum_scale = 0.0;
+#ifdef _OPENMP
+//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, partial_lh_tmp)
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+			double partial_lh_tmp[nstates];
+			double *partial_lh = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_left = left->partial_lh + ptn*block;
+			double *partial_lh_right = right->partial_lh + ptn*block;
+            double lh_max = 0.0;
+			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];
+
+			for (c = 0; c < ncat; c++) {
+				// compute real partial likelihood vector
+				for (x = 0; x < nstates; x++) {
+					double vleft = 0.0, vright = 0.0;
+					size_t addr = c*nstatesqr+x*nstates;
+					for (i = 0; i < nstates; i++) {
+						vleft += eleft[addr+i] * partial_lh_left[c*nstates+i];
+						vright += eright[addr+i] * partial_lh_right[c*nstates+i];
+					}
+					partial_lh_tmp[x] = vleft*vright;
+				}
+				// compute dot-product with inv_eigenvector
+				for (i = 0; i < nstates; i++) {
+					double res = 0.0;
+					for (x = 0; x < nstates; x++) {
+						res += partial_lh_tmp[x]*inv_evec[c*nstatesqr+i*nstates+x];
+					}
+					partial_lh[c*nstates+i] = res;
+                    lh_max = max(lh_max, fabs(res));
+				}
+			}
+            if (lh_max < SCALING_THRESHOLD) {
+				// now do the likelihood scaling
+				for (i = 0; i < block; i++) {
+                    partial_lh[i] *= SCALING_THRESHOLD_INVER;
+				}
+				// unobserved const pattern will never have underflow
+                sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1;
+            }
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+
+	}
+
+	delete [] eright;
+	delete [] eleft;
+}
+/**
+ * Compute the derivative terms for the branch given by dad_branch/dad and
+ * return them through df and ddf.
+ *
+ * Per pattern, with theta the cached product of the two partial vectors:
+ *   L   = ptn_invar + sum_i val0[i] * theta[i]
+ *   L'  = sum_i val1[i] * theta[i]
+ *   L'' = sum_i val2[i] * theta[i]
+ * where val0 = exp(eval * rate * length) * prop, val1 = (eval * rate) * val0
+ * and val2 = (eval * rate) * val1, i.e. val1 and val2 are the first and
+ * second derivative of val0 with respect to dad_branch->length.
+ * df accumulates freq * L'/L, ddf accumulates freq * (L''/L - (L'/L)^2).
+ * Entries of model_factory->unobserved_ptns feed an ascertainment bias term
+ * instead.
+ *
+ * theta_all is filled only while theta_computed is false and is reused
+ * otherwise; the flag is set to true here and is not cleared in this function.
+ *
+ * @param dad_branch,dad branch to evaluate; swapped internally when the node
+ *        behind dad_branch is a leaf, so that the leaf becomes dad
+ * @param df  output, first-derivative sum
+ * @param ddf output, second-derivative sum
+ */
+//template <const int nstates>
+void PhyloTree::computeMixrateLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    if (node->isLeaf()) { // swap so that the leaf end becomes dad
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computeMixratePartialLikelihoodEigen(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computeMixratePartialLikelihoodEigen(node_branch, node);
+    size_t nstates = aln->num_states;
+    size_t ncat = site_rate->getNRate();
+
+    size_t block = ncat * nstates; // doubles per pattern
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    double *eval = model->getEigenvalues();
+    assert(eval);
+
+	assert(theta_all);
+	if (!theta_computed) {
+		// precompute theta for fast branch length optimization
+
+	    if (dad->isLeaf()) {
+	    	// special treatment for TIP-INTERNAL NODE case
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i)
+#endif
+	    	for (ptn = 0; ptn < nptn; ptn++) {
+				double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+				double *theta = theta_all + ptn*block;
+				double *lh_tip = tip_partial_lh + ((int)((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] :  model_factory->unobserved_ptns[ptn-orig_nptn]))*nstates*ncat; // row of tip_partial_lh for the leaf's state at this pattern
+				for (i = 0; i < block; i++) {
+					theta[i] = lh_tip[i] * partial_lh_dad[i];
+				}
+
+			}
+			// ascertainment bias correction
+	    } else {
+	    	// both dad and node are internal nodes
+		    double *partial_lh_node = node_branch->partial_lh;
+		    double *partial_lh_dad = dad_branch->partial_lh;
+
+	    	size_t all_entries = nptn*block;
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+	    	for (i = 0; i < all_entries; i++) {
+				theta_all[i] = partial_lh_node[i] * partial_lh_dad[i];
+			}
+	    }
+		theta_computed = true;
+	}
+
+    double *val0 = new double[block]; // weights for L
+    double *val1 = new double[block]; // weights for L'
+    double *val2 = new double[block]; // weights for L''
+	for (c = 0; c < ncat; c++) {
+		double prop = site_rate->getProp(c);
+		for (i = 0; i < nstates; i++) {
+			double cof = eval[c*nstates+i]*site_rate->getRate(c);
+			double val = exp(cof*dad_branch->length) * prop;
+			double val1_ = cof*val;
+			val0[c*nstates+i] = val;
+			val1[c*nstates+i] = val1_;
+			val2[c*nstates+i] = cof*val1_;
+		}
+	}
+
+
+    double my_df = 0.0, my_ddf = 0.0, prob_const = 0.0, df_const = 0.0, ddf_const = 0.0;
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: my_df, my_ddf, prob_const, df_const, ddf_const) private(ptn, i)
+#endif
+    for (ptn = 0; ptn < nptn; ptn++) {
+		double lh_ptn = ptn_invar[ptn], df_ptn = 0.0, ddf_ptn = 0.0;
+		double *theta = theta_all + ptn*block;
+		for (i = 0; i < block; i++) {
+			lh_ptn += val0[i] * theta[i];
+			df_ptn += val1[i] * theta[i];
+			ddf_ptn += val2[i] * theta[i];
+		}
+
+//        assert(lh_ptn > 0.0);
+        lh_ptn = fabs(lh_ptn); // the sign is dropped instead of asserting positivity (see the disabled assert above)
+
+        if (ptn < orig_nptn) {
+			double df_frac = df_ptn / lh_ptn;
+			double ddf_frac = ddf_ptn / lh_ptn;
+			double freq = ptn_freq[ptn];
+			double tmp1 = df_frac * freq;
+			double tmp2 = ddf_frac * freq;
+			my_df += tmp1;
+			my_ddf += tmp2 - tmp1 * df_frac; // freq * (L''/L - (L'/L)^2)
+		} else {
+			// ascertainment bias correction
+			prob_const += lh_ptn;
+			df_const += df_ptn;
+			ddf_const += ddf_ptn;
+		}
+    }
+	df = my_df;
+	ddf = my_ddf;
+    if (isnan(df) || isinf(df)) { // only df is tested; both outputs are zeroed, and this happens before the ascertainment term is added
+        df = 0.0;
+        ddf = 0.0;
+//        outWarning("Numerical instability (some site-likelihood = 0)");
+    }
+
+	if (orig_nptn < nptn) {
+    	// ascertainment bias correction
+    	prob_const = 1.0 - prob_const; // from here on: one minus the summed likelihood of the unobserved patterns
+    	double df_frac = df_const / prob_const;
+    	double ddf_frac = ddf_const / prob_const;
+    	int nsites = aln->getNSite();
+    	df += nsites * df_frac;
+    	ddf += nsites *(ddf_frac + df_frac*df_frac);
+    }
+
+
+    delete [] val2;
+    delete [] val1;
+    delete [] val0;
+}
+
+// Log-likelihood of the tree evaluated across the branch joining dad and dad_branch->node ("mixrate" kernel); also fills _pattern_lh and _pattern_lh_cat. nstates is read at run time: the "template <const int nstates>" form is disabled.
+double PhyloTree::computeMixrateLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    if (node->isLeaf()) {  // swap both ends so that a leaf endpoint ends up in the role of dad
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)  // bit 0 unset: this partial likelihood still has to be computed
+//        computeMixratePartialLikelihoodEigen(dad_branch, dad);
+        computePartialLikelihood(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+//        computeMixratePartialLikelihoodEigen(node_branch, node);
+        computePartialLikelihood(node_branch, node);
+    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;  // start from the lh_scale_factor of both directions
+    size_t nstates = aln->num_states;
+    size_t ncat = site_rate->getNRate();
+    // block: number of doubles per pattern in partial_lh (nstates entries for each of the ncat categories)
+    size_t block = ncat * nstates;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();  // alignment patterns followed by the unobserved patterns used for ascertainment bias correction
+    double *eval = model->getEigenvalues();
+    assert(eval);
+    // val[c*nstates+i] = exp(eval[c*nstates+i] * rate(c) * branch length) * prop(c)
+    double *val = new double[block];
+	for (c = 0; c < ncat; c++) {
+		double len = site_rate->getRate(c)*dad_branch->length;
+		double prop = site_rate->getProp(c);
+		for (i = 0; i < nstates; i++)
+			val[c*nstates+i] = exp(eval[c*nstates+i]*len) * prop;
+	}
+	// prob_const sums the likelihoods of the unobserved patterns (ptn >= orig_nptn)
+	double prob_const = 0.0;
+	memset(_pattern_lh_cat, 0, nptn*ncat*sizeof(double));  // zeroed because the loops below accumulate into it with +=
+
+    if (dad->isLeaf()) {
+    	// special treatment for TIP-INTERNAL NODE case
+    	double *partial_lh_node = new double[(aln->STATE_UNKNOWN+1)*block];
+    	IntVector states_dad = aln->seq_states[dad->id];
+    	states_dad.push_back(aln->STATE_UNKNOWN);  // also prepare the entry for STATE_UNKNOWN
+    	// precompute information from one tip
+    	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
+    		double *lh_node = partial_lh_node +(*it)*block;
+    		double *lh_tip = tip_partial_lh + (*it)*block;
+    		for (i = 0; i < block; i++)
+    			lh_node[i] = val[i]*lh_tip[i];  // fold the branch factor val into the tip vector once per state
+//    		double *val_tmp = val;
+//			for (c = 0; c < ncat; c++) {
+//				for (i = 0; i < nstates; i++) {
+//					  lh_node[i] = val_tmp[i] * lh_tip[i];
+//				}
+//				lh_node += nstates;
+//				val_tmp += nstates;
+//			}
+    	}
+
+    	// now do the real computation
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c)
+#endif
+    	for (ptn = 0; ptn < nptn; ptn++) {
+			double lh_ptn = ptn_invar[ptn];  // starts from ptn_invar[ptn] (presumably the invariable-site term -- confirm)
+			double *lh_cat = _pattern_lh_cat + ptn*ncat;
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			int state_dad = (ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn];  // state at the tip dad for this pattern, or the state of the unobserved pattern
+			double *lh_node = partial_lh_node + state_dad*block;
+			for (c = 0; c < ncat; c++) {
+				for (i = 0; i < nstates; i++) {
+					*lh_cat += lh_node[i] * partial_lh_dad[i];
+				}
+				lh_node += nstates;
+				partial_lh_dad += nstates;
+				lh_ptn += *lh_cat;  // *lh_cat now holds the contribution of category c to this pattern
+				lh_cat++;
+			}
+//			assert(lh_ptn > 0.0);
+			if (ptn < orig_nptn) {
+				lh_ptn = log(fabs(lh_ptn));  // fabs() keeps log() defined should lh_ptn come out negative
+				_pattern_lh[ptn] = lh_ptn;
+				tree_lh += lh_ptn * ptn_freq[ptn];
+			} else {
+				prob_const += lh_ptn;
+			}
+		}
+		delete [] partial_lh_node;
+    } else {
+    	// both dad and node are internal nodes
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c)
+#endif
+    	for (ptn = 0; ptn < nptn; ptn++) {
+			double lh_ptn = ptn_invar[ptn];
+			double *lh_cat = _pattern_lh_cat + ptn*ncat;
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_node = node_branch->partial_lh + ptn*block;
+			double *val_tmp = val;
+			for (c = 0; c < ncat; c++) {
+				for (i = 0; i < nstates; i++) {
+					*lh_cat +=  val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
+				}
+				lh_ptn += *lh_cat;
+				partial_lh_node += nstates;
+				partial_lh_dad += nstates;
+				val_tmp += nstates;
+				lh_cat++;
+			}
+
+			assert(lh_ptn > 0.0);  // unlike the tip case above, a non-positive value aborts here (when asserts are enabled)
+            if (ptn < orig_nptn) {
+				lh_ptn = log(lh_ptn);
+				_pattern_lh[ptn] = lh_ptn;
+				tree_lh += lh_ptn * ptn_freq[ptn];
+			} else {
+				prob_const += lh_ptn;
+			}
+		}
+    }
+
+    // only when unobserved patterns exist: subtract log(1 - prob_const) from every pattern and from the total
+    if (orig_nptn < nptn) {
+    	// ascertainment bias correction
+    	prob_const = log(1.0 - prob_const);
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+    	tree_lh -= aln->getNSite()*prob_const;  // applied once per alignment site
+		assert(!isnan(tree_lh) && !isinf(tree_lh));
+    }
+
+	assert(!isnan(tree_lh) && !isinf(tree_lh));
+
+    delete [] val;
+    return tree_lh;
+}
+
+
+/*******************************************************
+ *
+ * non-vectorized likelihood functions for mixture models
+ *
+ ******************************************************/
+
+// Fills dad_branch->partial_lh, dad_branch->scale_num and dad_branch->lh_scale_factor for the subtree at dad_branch->node (seen from dad) under a mixture model; each vector is stored after multiplication with the inverse eigenvectors. The "template <const int nstates>" form is disabled.
+void PhyloTree::computeMixturePartialLikelihoodEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    // don't recompute the likelihood
+	assert(dad);
+    if (dad_branch->partial_lh_computed & 1)
+        return;
+    dad_branch->partial_lh_computed |= 1;  // set bit 0: marked as computed before the work is done
+
+    size_t nstates = aln->num_states;
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    PhyloNode *node = (PhyloNode*)(dad_branch->node);
+
+	if (node->isLeaf()) {  // a leaf writes nothing to dad_branch->partial_lh: only the shared tip table is ensured
+	    dad_branch->lh_scale_factor = 0.0;
+
+		if (!tip_partial_lh_computed)
+			computeTipPartialLikelihood();
+		return;
+	}
+
+    size_t ptn, c;
+    size_t orig_ntn = aln->size();
+    size_t ncat = site_rate->getNRate(), nmixture = model->getNMixtures();
+    const size_t nstatesqr=nstates*nstates;
+    size_t i, x, m;
+    size_t statecat = nstates * ncat;
+//    size_t statemix = nstates * nmixture;
+    size_t block = nstates * ncat * nmixture;  // doubles per pattern: nstates for every (mixture class, rate category) pair
+
+	double *evec = model->getEigenvectors();
+	double *inv_evec = model->getInverseEigenvectors();
+	assert(inv_evec && evec);
+	double *eval = model->getEigenvalues();
+
+    dad_branch->lh_scale_factor = 0.0;
+
+	// internal node
+	assert(node->degree() == 3); // it works only for strictly bifurcating tree
+	PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
+	FOR_NEIGHBOR_IT(node, dad, it) {
+		if (!left) left = (PhyloNeighbor*)(*it); else right = (PhyloNeighbor*)(*it);
+	}
+
+	if (!left->node->isLeaf() && right->node->isLeaf()) {  // when exactly one child is a leaf, make it the left one
+		PhyloNeighbor *tmp = left;
+		left = right;
+		right = tmp;
+	}
+	if ((left->partial_lh_computed & 1) == 0)
+		computeMixturePartialLikelihoodEigen(left, node);
+	if ((right->partial_lh_computed & 1) == 0)
+		computeMixturePartialLikelihoodEigen(right, node);
+    // LM_PER_NODE mode and no buffer yet: take over the buffers of a reverse neighbor and invalidate that neighbor
+    if (params->lh_mem_save == LM_PER_NODE && !dad_branch->partial_lh) {
+        // re-orient partial_lh
+        bool done = false;
+        FOR_NEIGHBOR_IT(node, dad, it2) {
+            PhyloNeighbor *backnei = ((PhyloNeighbor*)(*it2)->node->findNeighbor(node));  // neighbor record pointing from the child back to node
+            if (backnei->partial_lh) {
+                dad_branch->partial_lh = backnei->partial_lh;
+                dad_branch->scale_num = backnei->scale_num;
+                backnei->partial_lh = NULL;
+                backnei->scale_num = NULL;
+                backnei->partial_lh_computed &= ~1; // clear bit
+                done = true;
+                break;
+            }
+        }
+        assert(done && "partial_lh is not re-oriented");
+    }
+
+	// the scale factors accumulated in both subtrees carry over to this branch
+	dad_branch->lh_scale_factor = left->lh_scale_factor + right->lh_scale_factor;
+//	double partial_lh_tmp[nstates];
+	double *eleft = new double[block*nstates], *eright = new double[block*nstates];  // [(m*ncat+c)*nstatesqr + x*nstates + i] = evec of class m at (x,i) * exp(eval of class m at i * rate(c) * child branch length)
+
+	// precompute information buffer
+	for (c = 0; c < ncat; c++) {
+		double *expleft = new double[nstates];
+		double *expright = new double[nstates];
+		double len_left = site_rate->getRate(c) * left->length;
+		double len_right = site_rate->getRate(c) * right->length;
+		for (m = 0; m < nmixture; m++) {
+			for (i = 0; i < nstates; i++) {
+				expleft[i] = exp(eval[m*nstates+i]*len_left);
+				expright[i] = exp(eval[m*nstates+i]*len_right);
+			}
+			for (x = 0; x < nstates; x++)
+				for (i = 0; i < nstates; i++) {
+					eleft[(m*ncat+c)*nstatesqr+x*nstates+i] = evec[m*nstatesqr+x*nstates+i] * expleft[i];
+					eright[(m*ncat+c)*nstatesqr+x*nstates+i] = evec[m*nstatesqr+x*nstates+i] * expright[i];
+				}
+		}
+		delete [] expright;
+		delete [] expleft;
+	}
+
+	if (left->node->isLeaf() && right->node->isLeaf()) {
+		// special treatment for TIP-TIP (cherry) case
+
+		// pre compute information for both tips
+		double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];  // one block per state code 0..STATE_UNKNOWN; only the codes listed in seq_states (plus STATE_UNKNOWN) are filled
+		double *partial_lh_right = new double[(aln->STATE_UNKNOWN+1)*block];
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			for (m = 0; m < nmixture; m++) {
+				double *this_eleft = &eleft[m*nstatesqr*ncat];
+				double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
+				double *this_partial_lh_left = &partial_lh_left[state*block+m*statecat];
+
+//				for (c = 0; c < ncat; c++)
+//					for (x = 0; x < nstates; x++) {
+//						double vleft = 0.0;
+//						for (i = 0; i < nstates; i++) {
+//							vleft += this_eleft[(c*nstates+x)*nstates+i] * this_tip_partial_lh[i];
+//						}
+//						this_partial_lh_left[c*nstates+x] = vleft;
+//					}
+
+				for (x = 0; x < statecat; x++) {  // x runs over (category, state) pairs; same result as the commented-out nested loops above
+					double vleft = 0.0;
+					for (i = 0; i < nstates; i++) {
+						vleft += this_eleft[x*nstates+i] * this_tip_partial_lh[i];
+					}
+					this_partial_lh_left[x] = vleft;
+				}
+			}
+		}
+
+		for (it = aln->seq_states[right->node->id].begin(); it != aln->seq_states[right->node->id].end(); it++) {
+			int state = (*it);
+			for (m = 0; m < nmixture; m++) {
+				double *this_eright = &eright[m*nstatesqr*ncat];
+				double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
+				double *this_partial_lh_right = &partial_lh_right[state*block+m*statecat];
+				for (x = 0; x < statecat; x++) {
+					double vright = 0.0;
+					for (i = 0; i < nstates; i++) {
+						vright += this_eright[x*nstates+i] * this_tip_partial_lh[i];
+					}
+					this_partial_lh_right[x] = vright;
+				}
+			}
+		}
+
+		size_t addr = aln->STATE_UNKNOWN * block;  // the STATE_UNKNOWN block is set to 1.0 throughout
+		for (x = 0; x < block; x++) {
+			partial_lh_left[addr+x] = 1.0;
+			partial_lh_right[addr+x] = 1.0;
+		}
+
+
+		// scale number must be ZERO
+	    memset(dad_branch->scale_num, 0, nptn * sizeof(UBYTE));
+#ifdef _OPENMP
+//#pragma omp parallel for private(ptn, c, x, i, m)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+			double partial_lh_tmp[nstates];  // run-time sized array (a compiler extension in C++)
+			double *partial_lh = dad_branch->partial_lh + ptn*block;
+			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+			int state_right = (ptn < orig_ntn) ? (aln->at(ptn))[right->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+			for (m = 0; m < nmixture; m++) {
+				for (c = 0; c < ncat; c++) {
+					// compute real partial likelihood vector
+					double *left = partial_lh_left + (state_left*block+m*statecat+c*nstates);  // NOTE: these locals shadow the neighbor pointers left/right inside this scope
+					double *right = partial_lh_right + (state_right*block+m*statecat+c*nstates);
+					for (x = 0; x < nstates; x++) {
+						partial_lh_tmp[x] = left[x] * right[x];
+					}
+
+					// compute dot-product with inv_eigenvector
+					for (i = 0; i < nstates; i++) {
+						double res = 0.0;
+						for (x = 0; x < nstates; x++) {
+							res += partial_lh_tmp[x]*inv_evec[m*nstatesqr+i*nstates+x];
+						}
+						partial_lh[m*statecat+c*nstates+i] = res;
+					}
+				}
+			}
+		}
+		delete [] partial_lh_right;
+		delete [] partial_lh_left;
+	} else if (left->node->isLeaf() && !right->node->isLeaf()) {
+		// special treatment to TIP-INTERNAL NODE case
+		// only take scale_num from the right subtree
+		memcpy(dad_branch->scale_num, right->scale_num, nptn * sizeof(UBYTE));
+
+		// pre compute information for left tip
+		double *partial_lh_left = new double[(aln->STATE_UNKNOWN+1)*block];
+
+		vector<int>::iterator it;
+		for (it = aln->seq_states[left->node->id].begin(); it != aln->seq_states[left->node->id].end(); it++) {
+			int state = (*it);
+			for (m = 0; m < nmixture; m++) {
+				double *this_eleft = &eleft[m*nstatesqr*ncat];
+				double *this_tip_partial_lh = &tip_partial_lh[state*nstates*nmixture + m*nstates];
+				double *this_partial_lh_left = &partial_lh_left[state*block+m*statecat];
+				for (x = 0; x < statecat; x++) {
+					double vleft = 0.0;
+					for (i = 0; i < nstates; i++) {
+						vleft += this_eleft[x*nstates+i] * this_tip_partial_lh[i];
+					}
+					this_partial_lh_left[x] = vleft;
+				}
+			}
+		}
+		size_t addr = aln->STATE_UNKNOWN * block;
+		for (x = 0; x < block; x++) {
+			partial_lh_left[addr+x] = 1.0;
+		}
+
+		double sum_scale = 0.0;  // reduction variable: total of LOG_SCALING_THRESHOLD * ptn_freq over rescaled patterns
+#ifdef _OPENMP
+//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m, partial_lh_tmp)
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+			double partial_lh_tmp[nstates];
+			double *partial_lh = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_right = right->partial_lh + ptn*block;
+			int state_left = (ptn < orig_ntn) ? (aln->at(ptn))[left->node->id] : model_factory->unobserved_ptns[ptn-orig_ntn];
+            double lh_max = 0.0;  // largest absolute entry written for this pattern
+
+            for (m = 0; m < nmixture; m++) {
+				for (c = 0; c < ncat; c++) {
+					// compute real partial likelihood vector
+					for (x = 0; x < nstates; x++) {
+						double vleft = 0.0, vright = 0.0;
+						size_t addr = (m*ncat+c)*nstatesqr+x*nstates;
+						vleft = partial_lh_left[state_left*block+m*statecat+c*nstates+x];  // tip side: looked up from the per-state table
+						for (i = 0; i < nstates; i++) {
+							vright += eright[addr+i] * partial_lh_right[m*statecat+c*nstates+i];
+						}
+						partial_lh_tmp[x] = vleft * (vright);
+					}
+					// compute dot-product with inv_eigenvector
+					for (i = 0; i < nstates; i++) {
+						double res = 0.0;
+						for (x = 0; x < nstates; x++) {
+							res += partial_lh_tmp[x]*inv_evec[m*nstatesqr+i*nstates+x];
+						}
+						partial_lh[m*statecat+c*nstates+i] = res;
+						lh_max = max(fabs(res), lh_max);
+					}
+				}
+            }
+            if (lh_max < SCALING_THRESHOLD) {  // every entry of this pattern is below the threshold: rescale the whole block
+				// now do the likelihood scaling
+				for (i = 0; i < block; i++) {
+					partial_lh[i] *= SCALING_THRESHOLD_INVER;
+				}
+				// unobserved const pattern will never have underflow
+				sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1;
+            }
+
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+		delete [] partial_lh_left;
+
+	} else {
+		// both left and right are internal node
+
+		double sum_scale = 0.0;
+#ifdef _OPENMP
+//#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m, partial_lh_tmp)
+#pragma omp parallel for reduction(+: sum_scale) private(ptn, c, x, i, m)
+#endif
+		for (ptn = 0; ptn < nptn; ptn++) {
+			double partial_lh_tmp[nstates];
+			double *partial_lh = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_left = left->partial_lh + ptn*block;
+			double *partial_lh_right = right->partial_lh + ptn*block;
+            double lh_max = 0.0;
+			dad_branch->scale_num[ptn] = left->scale_num[ptn] + right->scale_num[ptn];  // scaling counts of both subtrees add up
+
+			for (m = 0; m < nmixture; m++) {
+				for (c = 0; c < ncat; c++) {
+					// compute real partial likelihood vector
+					for (x = 0; x < nstates; x++) {
+						double vleft = 0.0, vright = 0.0;
+						size_t addr = (m*ncat+c)*nstatesqr+x*nstates;
+						for (i = 0; i < nstates; i++) {
+							vleft += eleft[addr+i] * partial_lh_left[m*statecat+c*nstates+i];
+							vright += eright[addr+i] * partial_lh_right[m*statecat+c*nstates+i];
+						}
+						partial_lh_tmp[x] = vleft*vright;
+					}
+					// compute dot-product with inv_eigenvector
+					for (i = 0; i < nstates; i++) {
+						double res = 0.0;
+						for (x = 0; x < nstates; x++) {
+							res += partial_lh_tmp[x]*inv_evec[m*nstatesqr+i*nstates+x];
+						}
+						partial_lh[m*statecat+c*nstates+i] = res;
+						lh_max = max(lh_max, fabs(res));
+					}
+				}
+			}
+            if (lh_max < SCALING_THRESHOLD) {
+				// now do the likelihood scaling
+				for (i = 0; i < block; i++) {
+                    partial_lh[i] *= SCALING_THRESHOLD_INVER;
+				}
+				// unobserved const pattern will never have underflow
+                sum_scale += LOG_SCALING_THRESHOLD * ptn_freq[ptn];
+				dad_branch->scale_num[ptn] += 1;
+            }
+
+		}
+		dad_branch->lh_scale_factor += sum_scale;
+
+	}
+
+	delete [] eright;
+	delete [] eleft;
+}
+
+// Writes to df and ddf the first and second derivative of the log-likelihood with respect to the length of the branch joining dad and dad_branch->node, for mixture models. The "template <const int nstates>" form is disabled.
+void PhyloTree::computeMixtureLikelihoodDervEigen(PhyloNeighbor *dad_branch, PhyloNode *dad, double &df, double &ddf) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    if (node->isLeaf()) {  // swap both ends so that a leaf endpoint ends up in the role of dad
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+        computeMixturePartialLikelihoodEigen(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computeMixturePartialLikelihoodEigen(node_branch, node);
+    size_t nstates = aln->num_states;
+    size_t ncat = site_rate->getNRate();
+    size_t nmixture = model->getNMixtures();
+    // block: doubles per pattern, nstates for every (mixture class, rate category) pair
+    size_t block = ncat * nstates * nmixture;
+    size_t statemix = nstates * nmixture;
+    size_t statecat = nstates * ncat;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, m;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    double *eval = model->getEigenvalues();
+    assert(eval);
+    // theta_all holds, per pattern, the element-wise product of the vectors from both ends; its computation below does not read the branch length
+	assert(theta_all);
+	if (!theta_computed) {
+		// precompute theta for fast branch length optimization
+
+	    if (dad->isLeaf()) {
+	    	// special treatment for TIP-INTERNAL NODE case
+#ifdef _OPENMP
+#pragma omp parallel for private(ptn, i, m)
+#endif
+	    	for (ptn = 0; ptn < nptn; ptn++) {
+				double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+				double *theta = theta_all + ptn*block;
+				double *lh_tip = tip_partial_lh +
+						((int)((ptn < orig_nptn) ? (aln->at(ptn))[dad->id] :  model_factory->unobserved_ptns[ptn-orig_nptn]))*statemix;
+				for (m = 0; m < nmixture; m++) {
+					for (i = 0; i < statecat; i++) {
+						theta[m*statecat+i] = lh_tip[m*nstates + i%nstates] * partial_lh_dad[m*statecat+i];  // i%nstates: one tip vector per mixture class, reused for every rate category
+					}
+				}
+
+			}
+			// ascertainment bias correction
+	    } else {
+	    	// both dad and node are internal nodes
+		    double *partial_lh_node = node_branch->partial_lh;
+		    double *partial_lh_dad = dad_branch->partial_lh;
+
+	    	size_t all_entries = nptn*block;
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+	    	for (i = 0; i < all_entries; i++) {
+				theta_all[i] = partial_lh_node[i] * partial_lh_dad[i];
+			}
+	    }
+		theta_computed = true;
+	}
+	// val0, val1, val2: the weighted branch factor and its first and second derivative with respect to the branch length
+    double *val0 = new double[block];
+    double *val1 = new double[block];
+    double *val2 = new double[block];
+	for (c = 0; c < ncat; c++) {
+		double prop = site_rate->getProp(c);
+		for (m = 0; m < nmixture; m++) {
+			for (i = 0; i < nstates; i++) {
+				double cof = eval[m*nstates+i]*site_rate->getRate(c);
+				double val = exp(cof*dad_branch->length) * prop * ((ModelMixture*)model)->prop[m];  // weighted by category proportion and mixture class weight
+				double val1_ = cof*val;
+				val0[(m*ncat+c)*nstates+i] = val;
+				val1[(m*ncat+c)*nstates+i] = val1_;
+				val2[(m*ncat+c)*nstates+i] = cof*val1_;
+			}
+		}
+	}
+
+	// the *_const sums collect the unobserved patterns (ptn >= orig_nptn) for the correction applied after the loop
+    double my_df = 0.0, my_ddf = 0.0, prob_const = 0.0, df_const = 0.0, ddf_const = 0.0;
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: my_df, my_ddf, prob_const, df_const, ddf_const) private(ptn, i)
+#endif
+    for (ptn = 0; ptn < nptn; ptn++) {
+		double lh_ptn = ptn_invar[ptn], df_ptn = 0.0, ddf_ptn = 0.0;
+		double *theta = theta_all + ptn*block;
+		for (i = 0; i < block; i++) {
+			lh_ptn += val0[i] * theta[i];
+			df_ptn += val1[i] * theta[i];
+			ddf_ptn += val2[i] * theta[i];
+		}
+
+//        assert(lh_ptn > 0.0);
+        lh_ptn = fabs(lh_ptn);  // used as a divisor below; the sign is dropped instead of asserting positivity
+
+        if (ptn < orig_nptn) {
+			double df_frac = df_ptn / lh_ptn;
+			double ddf_frac = ddf_ptn / lh_ptn;
+			double freq = ptn_freq[ptn];
+			double tmp1 = df_frac * freq;
+			double tmp2 = ddf_frac * freq;
+			my_df += tmp1;
+			my_ddf += tmp2 - tmp1 * df_frac;  // freq * (L''/L - (L'/L)^2), the second derivative of log L
+		} else {
+			// ascertainment bias correction
+			prob_const += lh_ptn;
+			df_const += df_ptn;
+			ddf_const += ddf_ptn;
+		}
+    }
+	df = my_df;
+	ddf = my_ddf;
+    if (isnan(df) || isinf(df)) {  // non-finite first derivative: both outputs are reset to zero, silently
+        df = 0.0;
+        ddf = 0.0;
+//        outWarning("Numerical instability (some site-likelihood = 0)");
+    }
+
+	if (orig_nptn < nptn) {
+    	// ascertainment bias correction
+    	prob_const = 1.0 - prob_const;  // one minus the summed likelihood of the unobserved patterns
+    	double df_frac = df_const / prob_const;
+    	double ddf_frac = ddf_const / prob_const;
+    	int nsites = aln->getNSite();
+    	df += nsites * df_frac;
+    	ddf += nsites *(ddf_frac + df_frac*df_frac);
+    }
+
+
+    delete [] val2;
+    delete [] val1;
+    delete [] val0;
+}
+
+// Log-likelihood of the tree evaluated across the branch joining dad and dad_branch->node for mixture models; also fills _pattern_lh and, per mixture class, _pattern_lh_cat. The "template <const int nstates>" form is disabled.
+double PhyloTree::computeMixtureLikelihoodBranchEigen(PhyloNeighbor *dad_branch, PhyloNode *dad) {
+    PhyloNode *node = (PhyloNode*) dad_branch->node;
+    PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
+    if (!central_partial_lh)
+        initializeAllPartialLh();
+    if (node->isLeaf()) {  // swap both ends so that a leaf endpoint ends up in the role of dad
+    	PhyloNode *tmp_node = dad;
+    	dad = node;
+    	node = tmp_node;
+    	PhyloNeighbor *tmp_nei = dad_branch;
+    	dad_branch = node_branch;
+    	node_branch = tmp_nei;
+    }
+    if ((dad_branch->partial_lh_computed & 1) == 0)
+//        computeMixturePartialLikelihoodEigen(dad_branch, dad);
+        computePartialLikelihood(dad_branch, dad);
+    if ((node_branch->partial_lh_computed & 1) == 0)
+        computePartialLikelihood(node_branch, node);
+    double tree_lh = node_branch->lh_scale_factor + dad_branch->lh_scale_factor;  // start from the lh_scale_factor of both directions
+    size_t nstates = aln->num_states;
+    size_t ncat = site_rate->getNRate();
+    size_t nmixture = model->getNMixtures();
+    // block: doubles per pattern, nstates for every (mixture class, rate category) pair
+    size_t block = ncat * nstates * nmixture;
+    size_t statemix = nstates * nmixture;
+    size_t cat_states = ncat * nstates;
+    size_t ptn; // for big data size > 4GB memory required
+    size_t c, i, m;
+    size_t orig_nptn = aln->size();
+    size_t nptn = aln->size()+model_factory->unobserved_ptns.size();
+    double *eval = model->getEigenvalues();
+    assert(eval);
+    // val[(m*ncat+c)*nstates+i] = exp(eval[m*nstates+i] * rate(c) * branch length) * prop(c) * weight of mixture class m
+    double *val = new double[block];
+	for (c = 0; c < ncat; c++) {
+		double len = site_rate->getRate(c)*dad_branch->length;
+		double prop = site_rate->getProp(c);
+		for (m = 0; m < nmixture; m++)
+			for (i = 0; i < nstates; i++)
+				val[(m*ncat+c)*nstates+i] = exp(eval[m*nstates+i]*len) * prop * ((ModelMixture*)model)->prop[m];
+	}
+	// prob_const sums the likelihoods of the unobserved patterns (ptn >= orig_nptn)
+	double prob_const = 0.0;
+    // 2018-08-14: _pattern_lh_cat now only stores mixture likelihoods
+	memset(_pattern_lh_cat, 0, nptn*nmixture*sizeof(double));
+
+    if (dad->isLeaf()) {
+    	// special treatment for TIP-INTERNAL NODE case
+    	double *partial_lh_node = new double[(aln->STATE_UNKNOWN+1)*block];
+    	IntVector states_dad = aln->seq_states[dad->id];
+    	states_dad.push_back(aln->STATE_UNKNOWN);  // also prepare the entry for STATE_UNKNOWN
+    	// precompute information from one tip
+    	for (IntVector::iterator it = states_dad.begin(); it != states_dad.end(); it++) {
+    		double *lh_node = partial_lh_node +(*it)*block;
+    		double *lh_tip = tip_partial_lh + (*it)*statemix;
+    		double *val_tmp = val;
+			for (m = 0; m < nmixture; m++) {
+				for (c = 0; c < ncat; c++) {
+					for (i = 0; i < nstates; i++) {
+						  lh_node[i] = val_tmp[i] * lh_tip[m*nstates+i];  // the tip vector of class m is reused for every rate category
+					}
+					lh_node += nstates;
+					val_tmp += nstates;
+				}
+			}
+    	}
+
+    	// now do the real computation
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c, m)
+#endif
+    	for (ptn = 0; ptn < nptn; ptn++) {
+			double lh_ptn = ptn_invar[ptn];  // starts from ptn_invar[ptn] (presumably the invariable-site term -- confirm)
+			double *lh_cat = _pattern_lh_cat + ptn*nmixture;
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			int state_dad = (ptn < orig_nptn) ? (aln->at(ptn))[dad->id] : model_factory->unobserved_ptns[ptn-orig_nptn];
+			double *lh_node = partial_lh_node + state_dad*block;
+			for (m = 0; m < nmixture; m++) {
+                for (i = 0; i < cat_states; i++)  // one sum per mixture class, taken over all rate categories and states
+                    *lh_cat += lh_node[i] * partial_lh_dad[i];
+                lh_ptn += *lh_cat;
+                lh_node += cat_states;
+                partial_lh_dad += cat_states;
+                lh_cat++;
+//				for (c = 0; c < ncat; c++) {
+//					for (i = 0; i < nstates; i++) {
+//						*lh_cat += lh_node[i] * partial_lh_dad[i];
+//					}
+//					lh_node += nstates;
+//					partial_lh_dad += nstates;
+//                    lh_ptn += *lh_cat;
+//					lh_cat++;
+//				}
+                
+			}
+//			assert(lh_ptn > 0.0);
+			if (ptn < orig_nptn) {
+				lh_ptn = log(fabs(lh_ptn));  // fabs() keeps log() defined should lh_ptn come out negative
+				_pattern_lh[ptn] = lh_ptn;
+				tree_lh += lh_ptn * ptn_freq[ptn];
+			} else {
+				prob_const += lh_ptn;
+			}
+		}
+		delete [] partial_lh_node;
+    } else {
+    	// both dad and node are internal nodes
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: tree_lh, prob_const) private(ptn, i, c, m)
+#endif
+    	for (ptn = 0; ptn < nptn; ptn++) {
+			double lh_ptn = ptn_invar[ptn];
+			double *lh_cat = _pattern_lh_cat + ptn*nmixture;
+			double *partial_lh_dad = dad_branch->partial_lh + ptn*block;
+			double *partial_lh_node = node_branch->partial_lh + ptn*block;
+			double *val_tmp = val;
+			for (m = 0; m < nmixture; m++) {
+                for (i = 0; i < cat_states; i++)
+                    *lh_cat += val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
+                lh_ptn += *lh_cat;
+                partial_lh_dad += cat_states;
+                partial_lh_node += cat_states;
+                val_tmp += cat_states;
+                lh_cat++;
+//				for (c = 0; c < ncat; c++) {
+//					for (i = 0; i < nstates; i++) {
+//						*lh_cat +=  val_tmp[i] * partial_lh_node[i] * partial_lh_dad[i];
+//					}
+//					lh_ptn += *lh_cat;
+//					partial_lh_node += nstates;
+//					partial_lh_dad += nstates;
+//					val_tmp += nstates;
+//					lh_cat++;
+//				}
+			}
+
+			assert(lh_ptn > 0.0);  // unlike the tip case above, a non-positive value aborts here (when asserts are enabled)
+            if (ptn < orig_nptn) {
+				lh_ptn = log(lh_ptn);
+				_pattern_lh[ptn] = lh_ptn;
+				tree_lh += lh_ptn * ptn_freq[ptn];
+			} else {
+				prob_const += lh_ptn;
+			}
+		}
+    }
+
+    // only when unobserved patterns exist: subtract log(1 - prob_const) from every pattern and from the total
+    if (orig_nptn < nptn) {
+    	// ascertainment bias correction
+    	prob_const = log(1.0 - prob_const);
+    	for (ptn = 0; ptn < orig_nptn; ptn++)
+    		_pattern_lh[ptn] -= prob_const;
+    	tree_lh -= aln->getNSite()*prob_const;  // applied once per alignment site
+		assert(!isnan(tree_lh) && !isinf(tree_lh));
+    }
+
+	assert(!isnan(tree_lh) && !isinf(tree_lh));
+
+    delete [] val;
+    return tree_lh;
+}
diff --git a/pllnni.cpp b/pllnni.cpp
new file mode 100755
index 0000000..16ac6bc
--- /dev/null
+++ b/pllnni.cpp
@@ -0,0 +1,1094 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#define GLOBAL_VARIABLES_DEFINITION
+
+#if !defined WIN32 && !defined _WIN32 && !defined __WIN32__
+#include <sys/resource.h>
+#endif
+
+#include "phylotree.h"
+#include "pllnni.h"
+#include "alignment.h"
+
+/* program options */
+int nni0;
+int nni5;
+extern VerboseMode verbose_mode;
+int NNI_MAX_NR_STEP = 10;
+
+/* parameters and alignment objects defined in another translation unit */
+extern Params *globalParam;
+extern Alignment *globalAlignment;
+
+/**
+ * map from newick tree string to frequencies that a tree is revisited during tree search
+ */
+StringIntMap pllTreeCounter;
+
+
+/*
+ * ****************************************************************************
+ * pllUFBoot area
+ * ****************************************************************************
+ */
+
+pllUFBootData * pllUFBootDataPtr = NULL;
+
+
+int compareDouble(const void * a, const void * b) {
+	// qsort-style comparator: orders the two pointed-to doubles ascending.
+	const double lhs = *(const double*) a;
+	const double rhs = *(const double*) b;
+	if (lhs > rhs)
+		return 1;
+	return (lhs < rhs) ? -1 : 0;
+}
+
+pllNNIMove *getNNIList(pllInstance* tr) {
+	// Shared buffer, sized on the first call only: 2 * (mxtips - 3) moves.
+	static pllNNIMove* buffer = NULL;
+	if (!buffer)
+		buffer = (pllNNIMove*) malloc(2 * (tr->mxtips - 3) * sizeof(pllNNIMove));
+	assert(buffer != NULL);
+	return buffer;
+}
+
+pllNNIMove *getNonConflictNNIList(pllInstance* tr) {
+	// Shared buffer, sized on the first call only: (mxtips - 3) moves.
+	static pllNNIMove* buffer = NULL;
+	if (!buffer)
+		buffer = (pllNNIMove*) malloc((tr->mxtips - 3) * sizeof(pllNNIMove));
+	assert(buffer != NULL);
+	return buffer;
+}
+
+double pllDoRandomNNIs(pllInstance *tr, partitionList *pr, int numNNI) {
+	// Applies numNNI random topology-only NNIs, re-evaluates and optimises the tree once, returns tr->likelihood.
+	int numInBrans = tr->mxtips - 3;
+	// Per-round quota: a fifth of the internal branches, never 0 (a quota of 0 left cnt1 stuck: endless loop).
+	int numNNIinStep = (numInBrans >= 5) ? numInBrans / 5 : 1;
+	// divided into multiple rounds; the branches picked within one round share no node
+	int cnt1 = 0;
+	unordered_set<int> selectedNodes;
+	vector<nodeptr> selectedBrans, brans;
+	// a tree without internal branches (numInBrans <= 0) has nothing to swap, so the rounds are skipped
+	while (numInBrans > 0 && cnt1 < numNNI) {
+		int cnt2 = 0;
+		selectedNodes.clear();
+		selectedBrans.clear();
+		brans.clear();
+		pllGetAllInBran(tr, brans);
+		assert(brans.size() == numInBrans);
+		while(cnt2 < numNNIinStep && cnt2 < numNNI) {
+			int branIndex = random_int(numInBrans);
+			if (selectedNodes.find(brans[branIndex]->number) == selectedNodes.end() &&
+					selectedNodes.find(brans[branIndex]->back->number) == selectedNodes.end()) {
+				selectedNodes.insert(brans[branIndex]->number);
+				selectedNodes.insert(brans[branIndex]->back->number);
+				selectedBrans.push_back(brans[branIndex]);
+				cnt2++;
+			}
+		}
+		for (vector<nodeptr>::iterator it = selectedBrans.begin(); it != selectedBrans.end(); ++it) {
+			int nniType = random_int(2);
+			doOneNNI(tr, pr, (*it), nniType, TOPO_ONLY);
+		}
+		cnt1 += selectedBrans.size();
+		if (numNNI - cnt1 < numNNIinStep) {
+			numNNIinStep = numNNI - cnt1;
+		}
+	}
+	pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+	pllOptimizeBranchLengths(tr, pr, 1);
+	return tr->likelihood;
+}
+
+void pllGetAllInBran(pllInstance *tr, vector<nodeptr> &branlist) {
+	// Starts at the node behind tr->start and descends through each of its
+	// other ring slots, appending what the subtree walk finds to branlist.
+	nodeptr root = tr->start->back;
+	for (nodeptr cur = root->next; cur != root; cur = cur->next) {
+		pllGetAllInBranForSubtree(tr, cur->back, branlist);
+	}
+}
+
+void pllGetAllInBranForSubtree(pllInstance *tr, nodeptr p, vector<nodeptr> &branlist) {
+	// A branch that touches a tip is neither recorded nor descended through.
+	if (isTip(p->number, tr->mxtips) || isTip(p->back->number, tr->mxtips))
+		return;
+	// Record p, then continue through the other ring slots of its node.
+	branlist.push_back(p);
+	for (nodeptr cur = p->next; cur != p; cur = cur->next) {
+		pllGetAllInBranForSubtree(tr, cur->back, branlist);
+	}
+}
+
+double pllPerturbTree(pllInstance *tr, partitionList *pr, vector<pllNNIMove> &nnis) {
+	// Apply each listed move topologically, then update the branch lengths for it.
+	for (size_t idx = 0; idx < nnis.size(); idx++) {
+		printf("Do pertubing NNI (%d - %d) with logl = %10.4f \n", nnis[idx].p->number, nnis[idx].p->back->number, nnis[idx].likelihood);
+		doOneNNI(tr, pr, nnis[idx].p, nnis[idx].nniType, TOPO_ONLY);
+		updateBranchLengthForNNI(tr, pr, nnis[idx]);
+	}
+	// Re-evaluate the whole tree and optimise branch lengths once before reporting.
+	pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+	pllOptimizeBranchLengths(tr, pr, 1);
+	return tr->likelihood;
+}
+
+void quicksort_nni(pllNNIMove* arr,int left, int right) {
+    // Sorts arr[left..right] (inclusive bounds) in place by ascending likelihood.
+    // Fewer than two elements: nothing to do, and no pivot is read from an empty range.
+    if (left >= right)
+        return;
+    int i = left, j = right;
+    const pllNNIMove pivot = arr[left + (right - left) / 2]; // midpoint formed without computing left + right
+    while (i <= j) { /* partition */
+        while (arr[i].likelihood < pivot.likelihood)
+            i++;
+        while (pivot.likelihood < arr[j].likelihood)
+            j--;
+        if (i <= j) {
+            pllNNIMove tmp = arr[i];
+            arr[i] = arr[j];
+            arr[j] = tmp;
+            i++;
+            j--;
+        }
+    }
+    /* recursion */
+    if (left < j)
+        quicksort_nni(arr, left, j);
+    if (i < right)
+        quicksort_nni(arr, i, right);
+}
+
+//TODO: Workaround for memory leak problem when calling setupTopol within doNNISearch
+topol *_setupTopol(pllInstance* tr) {
+	// Built once from the first caller's mxtips; every later call gets the same object.
+	static topol* cached = NULL;
+	if (!cached) cached = setupTopol(tr->mxtips);
+	return cached;
+}
+
+vector<string> getAffectedBranches(pllInstance* tr, nodeptr p) {
+	// Collects the string keys (see getBranString) of the branch p--p->back,
+	// of every other branch attached to its two end nodes and, where such a
+	// neighbouring branch leads to a non-tip node, of the two branches beyond it.
+	vector<string> res;
+	res.push_back(getBranString(p));
+
+	// the same walk is done around p first, then around p->back
+	nodeptr ends[2] = { p, p->back };
+	for (int side = 0; side < 2; side++) {
+		nodeptr hub = ends[side];
+		for (nodeptr nei = hub->next; nei != hub; nei = nei->next) {
+			// branch attached at this ring slot
+			res.push_back(getBranString(nei));
+			if (isTip(nei->back->number, tr->mxtips))
+				continue;
+			// far end is not a tip: add its two other branches
+			res.push_back(getBranString(nei->back->next));
+			res.push_back(getBranString(nei->back->next->next));
+		}
+	}
+
+	return res;
+}
+
+string getBranString(nodeptr p) {
+	// Key of the branch p--p->back: "<lower number>-<higher number>", the same from either end.
+	nodeptr first = p, second = p->back;
+	if (!(p->number < p->back->number)) {
+		first = p->back; second = p;
+	}
+	stringstream label;
+	label << first->number << "-" << second->number;
+	return label.str();
+}
+
+/*
+ * Collect the node numbers around the branch p <-> p->back: both end nodes,
+ * every node adjacent to them and, for each adjacent inner node, the two
+ * nodes behind it.
+ */
+set<int> getAffectedNodes(pllInstance* tr, nodeptr p) {
+	set<int> numbers;
+	nodeptr ends[2] = { p, p->back };
+	numbers.insert(ends[0]->number);
+	numbers.insert(ends[1]->number);
+
+	for (int side = 0; side < 2; side++) {
+		nodeptr anchor = ends[side];
+		for (nodeptr cur = anchor->next; cur != anchor; cur = cur->next) {
+			nodeptr adjacent = cur->back;
+			const bool adjacentIsTip = isTip(adjacent->number, tr->mxtips);
+			numbers.insert(adjacent->number);
+			if (!adjacentIsTip) {
+				numbers.insert(adjacent->next->back->number);
+				numbers.insert(adjacent->next->next->back->number);
+			}
+		}
+	}
+	return numbers;
+}
+
+/*
+ * Evaluate the NNIs of the whole tree by descending from tr->start->back into
+ * each of its subtrees with evalNNIForSubtree. With online bootstrap enabled
+ * and gbo_replicates > 0 the current tree is evaluated with per-site
+ * likelihoods and saved first.
+ */
+void pllEvalAllNNIs(pllInstance *tr, partitionList *pr, SearchInfo &searchinfo) {
+    const bool saveStartTree = (globalParam->online_bootstrap == PLL_TRUE)
+            && (globalParam->gbo_replicates > 0);
+    if (saveStartTree) {
+        /* DTH: mimic IQTREE::optimizeNNI 's first call to IQTREE::saveCurrentTree */
+        tr->fastScaling = PLL_FALSE;
+        pllEvaluateLikelihood(tr, pr, tr->start, PLL_FALSE, PLL_TRUE);
+        pllSaveCurrentTree(tr, pr, tr->start);
+    }
+
+    nodeptr hub = tr->start->back;
+    for (nodeptr cur = hub->next; cur != hub; cur = cur->next) {
+        evalNNIForSubtree(tr, pr, cur->back, searchinfo);
+    }
+}
+
+/*
+void pllSaveQuartetForSubTree(pllInstance *tr, nodeptr p, SearchInfo &searchinfo) {
+	if (!isTip(p->number, tr->mxtips) && !isTip(p->back->number, tr->mxtips)) {
+			evalNNIForBran(tr, pr, p, searchinfo);
+		nodeptr q = p->next;
+		while (q != p) {
+			pllSaveQuartetForSubTree(tr, q->back, searchinfo);
+			q = q->next;
+		}
+	}
+}
+
+void pllSaveAllQuartet(pllInstance *tr, SearchInfo &searchinfo) {
+	nodeptr p = tr->start->back;
+	nodeptr q = p->next;
+	while (q != p) {
+		pllSaveQuartetForSubTree(tr, q->back, searchinfo);
+	}
+}
+*/
+
+/**
+ * Perform one NNI step on the current tree.
+ *
+ * All NNIs are evaluated (pllEvalAllNNIs); the improving moves collected in
+ * searchinfo.posNNIList are sorted by likelihood and, going from the best
+ * downwards, every move whose central branch shares no node with an already
+ * selected move is picked. The picked moves are applied together, followed by
+ * one round of branch-length optimisation. If the resulting likelihood is
+ * lower than the likelihood computed for the best single NNI, the tree is
+ * rolled back to the saved start topology and only that best NNI is applied.
+ *
+ * @param tr PLL instance; tr->likelihood is taken as the starting logl
+ * @param pr partition list
+ * @param searchinfo search state; posNNIList is sorted in place, aBranches
+ *        (when speednni is set) and curNumAppliedNNIs are written
+ * @return log-likelihood after the step, or the starting logl when there was
+ *         no improving NNI
+ */
+double pllDoNNISearch(pllInstance* tr, partitionList *pr, SearchInfo &searchinfo) {
+	double initLH = tr->likelihood;
+	double finalLH = initLH;
+	vector<pllNNIMove> selectedNNIs; /* filled best-first: front() is the best NNI */
+	unordered_set<int> selectedNodes;
+	int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+	/* data structure to store the initial tree topology + branch length */
+	topol* curTree = _setupTopol(tr);
+	saveTree(tr, curTree, numBranches);
+
+	/* evaluate NNIs */
+	pllEvalAllNNIs(tr, pr, searchinfo);
+
+	if (searchinfo.speednni) {
+		searchinfo.aBranches.clear();
+	}
+
+	if (searchinfo.posNNIList.empty()) {
+		searchinfo.curNumAppliedNNIs = 0;
+		return finalLH;
+	}
+
+	/* ascending by likelihood, so the best move is at the end of the list */
+	sort(searchinfo.posNNIList.begin(), searchinfo.posNNIList.end(), comparePLLNNIMove);
+	if (verbose_mode >= VB_DEBUG) {
+		cout << "curScore: "  << searchinfo.curLogl << endl;
+		for (size_t i = 0; i < searchinfo.posNNIList.size(); i++) {
+			cout << "Logl of positive NNI " << i << " : " << searchinfo.posNNIList[i].likelihood << endl;
+		}
+	}
+
+	/* pick non-conflicting positive NNIs, best first */
+	for (vector<pllNNIMove>::reverse_iterator rit = searchinfo.posNNIList.rbegin(); rit != searchinfo.posNNIList.rend(); ++rit) {
+		const int pNum = (*rit).p->number;
+		const int qNum = (*rit).p->back->number;
+		if (selectedNodes.find(pNum) != selectedNodes.end() || selectedNodes.find(qNum) != selectedNodes.end()) {
+			continue;
+		}
+		selectedNNIs.push_back((*rit));
+		selectedNodes.insert(pNum);
+		selectedNodes.insert(qNum);
+	}
+
+	/* Applying all independent NNI moves */
+	searchinfo.curNumAppliedNNIs = selectedNNIs.size();
+	for (vector<pllNNIMove>::iterator it = selectedNNIs.begin(); it != selectedNNIs.end(); it++) {
+		/* do the topological change */
+		doOneNNI(tr, pr, (*it).p, (*it).nniType, TOPO_ONLY);
+		if (searchinfo.speednni) {
+			vector<string> aBranches = getAffectedBranches(tr, (*it).p);
+			searchinfo.aBranches.insert(aBranches.begin(), aBranches.end());
+		}
+		updateBranchLengthForNNI(tr, pr, (*it));
+		if (numBranches > 1 && !tr->useRecom) {
+			pllUpdatePartials(tr, pr, (*it).p, PLL_TRUE);
+			pllUpdatePartials(tr, pr, (*it).p->back, PLL_TRUE);
+		} else {
+			pllUpdatePartials(tr, pr, (*it).p, PLL_FALSE);
+			pllUpdatePartials(tr, pr, (*it).p->back, PLL_FALSE);
+		}
+	}
+	if (selectedNNIs.empty()) {
+		return finalLH;
+	}
+
+	//pllEvaluateLikelihood(tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+	pllOptimizeBranchLengths(tr, pr, 1);
+	if (globalParam->count_trees) {
+		countDistinctTrees(tr, pr);
+	}
+
+	/* selectedNNIs was filled from the best move downwards, so the best NNI
+	 * is the FIRST element (back() would be the weakest selected move) */
+	pllNNIMove &bestNNI = selectedNNIs.front();
+	int numNNI = selectedNNIs.size();
+	/* new tree likelihood should not be smaller the likelihood of the computed best NNI */
+	while (tr->likelihood < bestNNI.likelihood) {
+		if (numNNI == 1) {
+			printf("ERROR: new logl=%10.4f after applying only the best NNI < best NNI logl=%10.4f\n",
+					tr->likelihood, bestNNI.likelihood);
+			assert(0);
+			/* with NDEBUG the assert is a no-op: leave the loop instead of spinning forever */
+			break;
+		}
+		cout << "Best logl: " << bestNNI.likelihood << " / " << "NNI step " << searchinfo.curNumNNISteps<< " / Applying " << numNNI << " NNIs give logl: " << tr->likelihood << " (worse than best)";
+		cout << " / Roll back tree ... " << endl;
+		if (!restoreTree(curTree, tr, pr)) {
+			printf("ERROR: failed to roll back tree \n");
+			assert(0);
+		}
+		// If tree log-likelihood decreases only apply the best NNI
+		numNNI = 1;
+		doOneNNI(tr, pr, bestNNI.p, bestNNI.nniType, TOPO_ONLY);
+		updateBranchLengthForNNI(tr, pr, bestNNI);
+		if (numBranches > 1 && !tr->useRecom) {
+			pllUpdatePartials(tr, pr, bestNNI.p, PLL_TRUE);
+			pllUpdatePartials(tr, pr, bestNNI.p->back, PLL_TRUE);
+		} else {
+			pllUpdatePartials(tr, pr, bestNNI.p, PLL_FALSE);
+			pllUpdatePartials(tr, pr, bestNNI.p->back, PLL_FALSE);
+		}
+		//pllEvaluateLikelihood(tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+		pllOptimizeBranchLengths(tr, pr, 1);
+
+		/* Only apply the best NNI after the tree has been rolled back */
+		searchinfo.curNumAppliedNNIs = numNNI;
+	}
+	/* an improvement below 0.1 logl units is reported as "no NNI applied" */
+	if (tr->likelihood - initLH < 0.1) {
+		searchinfo.curNumAppliedNNIs = 0;
+	}
+	finalLH = tr->likelihood;
+
+	/* curTree comes from _setupTopol, which hands out one cached buffer, so it is not freed here */
+	return finalLH;
+}
+
+
+/*
+ * Write the five branch lengths stored in an NNI move (z0..z4) back onto the
+ * tree: the central branch at nni.p, the two other branches at nni.p and the
+ * two other branches at nni.p->back. Both node records of every branch are
+ * set, for each of the numBranches length slots. Partial likelihood vectors
+ * are not recomputed here.
+ */
+void updateBranchLengthForNNI(pllInstance* tr, partitionList *pr, pllNNIMove &nni) {
+	const int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+	nodeptr p = nni.p;
+	nodeptr q = p->back;
+
+	/* one end of each of the five branches, paired with its stored lengths */
+	nodeptr branchEnd[5] = { p, p->next, p->next->next, q->next, q->next->next };
+	const double *stored[5] = { nni.z0, nni.z1, nni.z2, nni.z3, nni.z4 };
+
+	for (int b = 0; b < 5; b++) {
+		nodeptr otherEnd = branchEnd[b]->back;
+		for (int i = 0; i < numBranches; i++) {
+			branchEnd[b]->z[i] = stored[b][i];
+			otherEnd->z[i] = stored[b][i];
+		}
+	}
+}
+
+/*
+ * Optimise the length of the single branch p <-> p->back with
+ * makenewzGeneric, starting from the current length. For every length slot
+ * whose partition is not flagged in tr->partitionConverged the new length is
+ * written to both ends of the branch, and tr->partitionSmoothed is cleared
+ * when the length moved by more than PLL_DELTAZ.
+ *
+ * When built with _DEBUG_UPDATE the likelihood is evaluated before and after
+ * and a drop of more than 0.01 triggers an assertion.
+ */
+void pllOptimizeOneBranch(pllInstance *tr, partitionList *pr, nodeptr p) {
+    nodeptr  q = p->back;
+    double   z[PLL_NUM_BRANCHES], z0[PLL_NUM_BRANCHES];
+    int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+    #ifdef _DEBUG_UPDATE
+      /* same 5-argument form as every other pllEvaluateLikelihood call in this file */
+      pllEvaluateLikelihood(tr, pr, p, PLL_FALSE, PLL_FALSE);
+      double startLH = tr->likelihood;
+    #endif
+
+    for (int i = 0; i < numBranches; i++)
+      z0[i] = q->z[i];
+
+    /* the last argument is PLL_TRUE only with more than one length slot */
+    makenewzGeneric(tr, pr, p, q, z0, PLL_ITERATIONS, z, (numBranches > 1) ? PLL_TRUE : PLL_FALSE);
+
+    for (int i = 0; i < numBranches; i++)
+    {
+      if (tr->partitionConverged[i])
+        continue;
+
+      if (PLL_ABS(z[i] - z0[i]) > PLL_DELTAZ)
+        tr->partitionSmoothed[i] = PLL_FALSE;
+
+      p->z[i] = q->z[i] = z[i];
+    }
+
+    #ifdef _DEBUG_UPDATE
+      pllEvaluateLikelihood(tr, pr, p, PLL_FALSE, PLL_FALSE);
+
+      if (tr->likelihood <= startLH && fabs(tr->likelihood - startLH) > 0.01)
+        {
+          printf("%f %f\n", startLH, tr->likelihood);
+          assert(0);
+        }
+    #endif
+}
+
+/* Recompute the partial likelihoods at `node`; PLL_TRUE is passed as the last
+ * argument of pllUpdatePartials only when there is more than one branch-length
+ * slot and tr->useRecom is off. */
+static void nniRefreshPartials(pllInstance *tr, partitionList *pr, nodeptr node, int numBranches) {
+	if (numBranches > 1 && !tr->useRecom) {
+		pllUpdatePartials(tr, pr, node, PLL_TRUE);
+	} else {
+		pllUpdatePartials(tr, pr, node, PLL_FALSE);
+	}
+}
+
+/* Evaluate the likelihood at `node`. With online bootstrap enabled and
+ * gbo_replicates > 0 the evaluation also requests per-site likelihoods
+ * (fast scaling switched off) and the tree is handed to pllSaveCurrentTree. */
+static void nniEvaluateLikelihood(pllInstance *tr, partitionList *pr, nodeptr node) {
+	if ((globalParam->online_bootstrap == PLL_TRUE) && (globalParam->gbo_replicates > 0)) {
+		tr->fastScaling = PLL_FALSE;
+		pllEvaluateLikelihood(tr, pr, node, PLL_FALSE, PLL_TRUE); // DTH: modified the last arg
+		pllSaveCurrentTree(tr, pr, node);
+	} else {
+		pllEvaluateLikelihood(tr, pr, node, PLL_FALSE, PLL_FALSE);
+	}
+}
+
+/**
+ * Apply one NNI around the inner branch p <-> p->back and, unless only the
+ * topology is wanted, optimise branch lengths and evaluate the result.
+ *
+ * @param tr PLL instance
+ * @param pr partition list
+ * @param p one end of the central branch; neither end may be a tip
+ * @param swap 1 exchanges the subtree at p->next with the one at q->next,
+ *        0 exchanges the subtree at p->next->next with the one at q->next
+ *        (q = p->back)
+ * @param nni_type TOPO_ONLY: change the topology and return 0.0 at once;
+ *        NNI5: additionally optimise the four branches around the central
+ *        one; any other value: optimise the central branch only
+ * @param searchinfo supplies curLogl for the early exit after the central
+ *        branch was optimised; may be NULL, in which case there is no early
+ *        exit (a NULL used to be dereferenced for non-TOPO_ONLY calls)
+ * @return 0.0 for TOPO_ONLY, otherwise tr->likelihood after the evaluation
+ */
+double doOneNNI(pllInstance *tr, partitionList *pr, nodeptr p, int swap, NNI_Type nni_type, SearchInfo *searchinfo) {
+	assert(swap == 0 || swap == 1);
+	nodeptr q = p->back;
+	assert(!isTip(q->number, tr->mxtips));
+	assert(!isTip(p->number, tr->mxtips));
+	int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+	/* exchange the subtree behind q->next with one of the two subtrees at p;
+	 * each subtree carries its branch length along */
+	nodeptr pSlot = (swap == 1) ? p->next : p->next->next;
+	nodeptr moved = pSlot->back;
+	hookup(pSlot, q->next->back, q->next->z, numBranches);
+	hookup(q->next, moved, moved->z, numBranches);
+
+	if (nni_type == TOPO_ONLY) {
+		return 0.0;
+	}
+
+	nniRefreshPartials(tr, pr, p, numBranches);
+	nniRefreshPartials(tr, pr, q, numBranches);
+
+	// Optimize the central branch
+	pllOptimizeOneBranch(tr, pr, p);
+	nniEvaluateLikelihood(tr, pr, p);
+
+	// if after optimizing the central branch we already obtain better logl
+	// then there is no need for optimizing other branches
+	if (searchinfo != NULL && tr->likelihood > searchinfo->curLogl) {
+		return tr->likelihood;
+	}
+
+	// Optimize 4 other branches
+	if (nni_type == NNI5) {
+		nodeptr r;
+		nniRefreshPartials(tr, pr, q, numBranches);
+
+		// the 2 branches at node p
+		r = p->next;
+		nniRefreshPartials(tr, pr, r, numBranches);
+		pllOptimizeOneBranch(tr, pr, r);
+
+		r = p->next->next;
+		nniRefreshPartials(tr, pr, r, numBranches);
+		pllOptimizeOneBranch(tr, pr, r);
+
+		// the central branch once more
+		nniRefreshPartials(tr, pr, p, numBranches);
+		pllOptimizeOneBranch(tr, pr, p);
+
+		// optimize 2 branches at node q
+		r = q->next;
+		nniRefreshPartials(tr, pr, r, numBranches);
+		pllOptimizeOneBranch(tr, pr, r);
+
+		r = q->next->next;
+		nniRefreshPartials(tr, pr, r, numBranches);
+		pllOptimizeOneBranch(tr, pr, r);
+
+		nniEvaluateLikelihood(tr, pr, r);
+	}
+	return tr->likelihood;
+}
+
+/*
+ * Return the k-th largest value stored in searchinfo->deltaLogl, where
+ * k = floor(5% of the number of stored values). When k is 0 (fewer than 20
+ * values) the result is 0.0.
+ */
+double estBestLoglImp(SearchInfo* searchinfo) {
+    const set<double> &deltas = searchinfo->deltaLogl;
+    const int rank = floor(deltas.size() * 5 / 100);
+    if (rank < 1)
+        return 0.0;
+
+    /* walk down from the largest value to the rank-th largest */
+    set<double>::const_reverse_iterator pos = deltas.rbegin();
+    for (int seen = 1; seen < rank && pos != deltas.rend(); ++seen)
+        ++pos;
+
+    return (pos != deltas.rend()) ? (*pos) : 0.0;
+}
+
+/*
+ * Encode the quartet around the branch p <-> p->back as
+ * "<pn1>-<pn2>-<a>-<b>-<qn1>-<qn2>": the numbers of the two nodes behind p,
+ * the two end nodes of the branch, and the two nodes behind p->back. Within
+ * each of the three pairs the smaller number comes first.
+ */
+string convertQuartet2String(nodeptr p) {
+	nodeptr q = p->back;
+	/* the three pairs in output order: neighbours of p, branch ends, neighbours of q */
+	int pairs[3][2] = {
+		{ p->next->back->number, p->next->next->back->number },
+		{ p->number, q->number },
+		{ q->next->back->number, q->next->next->back->number }
+	};
+
+	stringstream res;
+	for (int k = 0; k < 3; k++) {
+		const int lo = (pairs[k][0] < pairs[k][1]) ? pairs[k][0] : pairs[k][1];
+		const int hi = (pairs[k][0] < pairs[k][1]) ? pairs[k][1] : pairs[k][0];
+		if (k > 0)
+			res << "-";
+		res << lo << "-" << hi;
+	}
+	return res.str();
+}
+
+/*
+ * Count how often each tree topology is visited: the current PLL tree is
+ * written to pllInst->tree_string, re-read as an unrooted PhyloTree whose root
+ * is the node named after alignment sequence 0, printed with
+ * WT_TAXON_ID | WT_SORT_TAXA, and that string is used as the key whose
+ * counter in pllTreeCounter is set to 1 or incremented.
+ */
+void countDistinctTrees(pllInstance* pllInst, partitionList *pllPartitions) {
+	pllTreeToNewick(pllInst->tree_string, pllInst, pllPartitions, pllInst->start->back, PLL_FALSE,
+			PLL_TRUE, PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE);
+
+	PhyloTree mtree;
+	mtree.rooted = false;
+	mtree.aln = globalAlignment;
+	mtree.readTreeString(string(pllInst->tree_string));
+	mtree.root = mtree.findNodeName(globalAlignment->getSeqName(0));
+
+	ostringstream printed;
+	mtree.printTree(printed, WT_TAXON_ID | WT_SORT_TAXA);
+	string key = printed.str();
+
+	if (pllTreeCounter.find(key) != pllTreeCounter.end()) {
+		// seen before
+		pllTreeCounter[key]++;
+	} else {
+		// first visit
+		pllTreeCounter[key] = 1;
+	}
+}
+
+/* Copy the lengths of the five branches around p <-> p->back (central branch,
+ * two at p, two at p->back) from the tree into nni.z0 .. nni.z4. */
+static void nniSnapshotBranchLengths(nodeptr p, pllNNIMove &nni) {
+	nodeptr q = p->back;
+	for (int i = 0; i < PLL_NUM_BRANCHES; i++) {
+		nni.z0[i] = p->z[i];
+		nni.z1[i] = p->next->z[i];
+		nni.z2[i] = p->next->next->z[i];
+		nni.z3[i] = q->next->z[i];
+		nni.z4[i] = q->next->next->z[i];
+	}
+}
+
+/* Write nni.z0 .. nni.z4 back to both ends of the five branches around
+ * p <-> p->back. */
+static void nniWriteBackBranchLengths(nodeptr p, const pllNNIMove &nni) {
+	nodeptr q = p->back;
+	for (int i = 0; i < PLL_NUM_BRANCHES; i++) {
+		p->z[i] = nni.z0[i];
+		q->z[i] = nni.z0[i];
+		p->next->z[i] = nni.z1[i];
+		p->next->back->z[i] = nni.z1[i];
+		p->next->next->z[i] = nni.z2[i];
+		p->next->next->back->z[i] = nni.z2[i];
+		q->next->z[i] = nni.z3[i];
+		q->next->back->z[i] = nni.z3[i];
+		q->next->next->z[i] = nni.z4[i];
+		q->next->next->back->z[i] = nni.z4[i];
+	}
+}
+
+/**
+ * Evaluate both NNIs of the inner branch p <-> p->back and leave the tree as
+ * it was found.
+ *
+ * Each of the two swaps is applied with doOneNNI (using searchinfo.nni_type),
+ * its likelihood and optimised branch lengths are recorded, and the topology
+ * and original branch lengths are put back. The better of the two moves is
+ * appended to searchinfo.posNNIList if its likelihood exceeds
+ * searchinfo.curLogl by more than 1e-6. Finally the partial likelihoods at
+ * both ends of the branch are recomputed.
+ *
+ * @param tr PLL instance
+ * @param pr partition list
+ * @param p one end of the branch; neither end may be a tip
+ * @param searchinfo search state (curLogl, nni_type read; posNNIList written)
+ * @return number of moves appended to posNNIList (0 or 1)
+ */
+int evalNNIForBran(pllInstance* tr, partitionList *pr, nodeptr p, SearchInfo &searchinfo) {
+	assert(!isTip(p->number, tr->mxtips));
+	assert(!isTip(p->back->number, tr->mxtips));
+	int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+	int numPosNNI = 0;
+	const double baseLogl = searchinfo.curLogl;
+
+	/* dummy NNI holding the branch lengths to restore; every field is
+	 * initialised so that no indeterminate value is ever copied around */
+	pllNNIMove backup;
+	backup.p = p;
+	backup.nniType = 0;
+	backup.idString = NULL;
+	backup.likelihood = baseLogl;
+	backup.loglDelta = 0.0;
+	backup.negLoglDelta = 0.0;
+	nniSnapshotBranchLengths(p, backup);
+
+	pllNNIMove candidate[2];
+	for (int nniType = 0; nniType < 2; nniType++) {
+		/* apply and score the move */
+		double lh = doOneNNI(tr, pr, p, nniType, searchinfo.nni_type, &searchinfo);
+		if (globalParam->count_trees)
+			countDistinctTrees(tr, pr);
+
+		/* remember the move together with its optimised branch lengths */
+		pllNNIMove &move = candidate[nniType];
+		move.p = p;
+		move.nniType = nniType;
+		move.idString = NULL;
+		nniSnapshotBranchLengths(p, move);
+		move.likelihood = lh;
+		move.loglDelta = lh - baseLogl;
+		move.negLoglDelta = -move.loglDelta;
+
+		/* the same swap applied again undoes the topology change;
+		 * then restore the old branch lengths */
+		doOneNNI(tr, pr, p, nniType, TOPO_ONLY);
+		nniWriteBackBranchLengths(p, backup);
+	}
+
+	/* on a tie the first move wins */
+	const pllNNIMove &bestNNI =
+			(candidate[1].likelihood > candidate[0].likelihood) ? candidate[1] : candidate[0];
+	if (bestNNI.likelihood > searchinfo.curLogl + 1e-6) {
+		numPosNNI++;
+		searchinfo.posNNIList.push_back(bestNNI);
+	}
+
+	// Re-compute the partial likelihood vectors
+	// TODO: One could save these instead of recomputation
+	if (numBranches > 1 && !tr->useRecom) {
+		pllUpdatePartials(tr, pr, p, PLL_TRUE);
+		pllUpdatePartials(tr, pr, p->back, PLL_TRUE);
+	} else {
+		pllUpdatePartials(tr, pr, p, PLL_FALSE);
+		pllUpdatePartials(tr, pr, p->back, PLL_FALSE);
+	}
+
+	return numPosNNI;
+}
+
+//void recomputePartial(pllInstance tr, partitionList pr, nodeptr p) {
+//    if (numBranches > 1 && !tr->useRecom) {
+//        pllUpdatePartials(tr, pr, p, PLL_TRUE);
+//        pllUpdatePartials(tr, pr, p->back, PLL_TRUE);
+//    } else {
+//        pllUpdatePartials(tr, pr, p, PLL_FALSE);
+//        pllUpdatePartials(tr, pr, p->back, PLL_FALSE);
+//    }
+//}
+
+/* True when the label of the branch p <-> p->back is in searchinfo.aBranches. */
+bool isAffectedBranch(nodeptr p, SearchInfo &searchinfo) {
+	return searchinfo.aBranches.find(getBranString(p)) != searchinfo.aBranches.end();
+}
+
+/*
+ * Evaluate the NNIs of the branch p <-> p->back and then recurse into the
+ * subtrees hanging off p. Nothing happens when either end of the branch is a
+ * tip. With speednni set and curNumNNISteps != 1, only branches recorded in
+ * searchinfo.aBranches are evaluated (the recursion still visits the rest).
+ */
+void evalNNIForSubtree(pllInstance* tr, partitionList *pr, nodeptr p, SearchInfo &searchinfo) {
+	if (isTip(p->number, tr->mxtips) || isTip(p->back->number, tr->mxtips))
+		return;
+
+	const bool affectedOnly = searchinfo.speednni && searchinfo.curNumNNISteps != 1;
+	if (!affectedOnly || isAffectedBranch(p, searchinfo)) {
+		evalNNIForBran(tr, pr, p, searchinfo);
+	}
+
+	for (nodeptr child = p->next; child != p; child = child->next) {
+		evalNNIForSubtree(tr, pr, child->back, searchinfo);
+	}
+}
+
+/**
+* DTH:
+* The PLL version of saveCurrentTree function.
+* Records the current tree in the global pllUFBootDataPtr tables: its topology
+* string (from pllTree2StringREC), its log-likelihood (tr->likelihood) and its
+* per-pattern log-likelihoods. When boot_samples is set, the per-replicate
+* bookkeeping (boot_logl / boot_counts / boot_trees) is updated as well.
+* @param tr: the tree (a pointer to a pllInstance)
+* @param pr: pointer to a partitionList (only forwarded to pllTree2StringREC)
+* @param p: not referenced in the body; the tree string is always built from
+*           tr->start->back
+* NOTE(review): every free() of item_ptr, item_ptr->data and tree_str is
+* commented out, so each return path appears to leak at least one of them --
+* confirm what pllHashAdd / pllHashSearch take ownership of before changing it.
+*/
+void pllSaveCurrentTree(pllInstance* tr, partitionList *pr, nodeptr p){
+    double cur_logl = tr->likelihood;
+
+    struct pllHashItem * item_ptr = (struct pllHashItem *) malloc(sizeof(struct pllHashItem)); // scratch item; the malloc result is not checked
+    item_ptr->data = (int *) malloc(sizeof(int)); // holds the tree index handed to the hash table
+    item_ptr->next = NULL;
+    item_ptr->str = NULL;
+
+    unsigned int tree_index = -1; // wraps to UINT_MAX; reassigned on every path before it is used as an index
+    char * tree_str = NULL;
+    pllTree2StringREC(tr->tree_string, tr, pr, tr->start->back, PLL_FALSE,
+            PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_TRUE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE); // printBranchLengths and printNames are PLL_FALSE: topology with numeric ids only
+    tree_str = (char *) malloc (strlen(tr->tree_string) + 1); // private copy used as hash key
+    strcpy(tree_str, tr->tree_string);
+
+    pllBoolean is_stored = PLL_FALSE;
+
+    if(globalParam->store_candidate_trees){ // the duplicate lookup happens only when candidate trees are stored
+        is_stored = pllHashSearch(pllUFBootDataPtr->treels, tree_str, &(item_ptr->data)); // NOTE(review): item_ptr->data is the output slot, so the int allocated above presumably becomes unreachable on a hit -- confirm
+    }
+
+    if(is_stored){ // if found the tree_str
+        pllUFBootDataPtr->duplication_counter++;
+        tree_index = *((int *)item_ptr->data);
+        if (cur_logl <= pllUFBootDataPtr->treels_logl[tree_index] + 1e-4) { // not better than the stored logl: nothing to update
+            if (cur_logl < pllUFBootDataPtr->treels_logl[tree_index] - 5.0)
+                if (verbose_mode >= VB_MED)
+                    printf("Current lh %f is much worse than expected %f\n",
+                            cur_logl, pllUFBootDataPtr->treels_logl[tree_index]);
+/*            free(tree_str);
+            free(item_ptr->data);
+            free(item_ptr);*/
+            return;
+        }
+        if (verbose_mode >= VB_MAX)
+            printf("Updated logl %f to %f\n", pllUFBootDataPtr->treels_logl[tree_index], cur_logl);
+        pllUFBootDataPtr->treels_logl[tree_index] = cur_logl;
+
+        if (pllUFBootDataPtr->save_all_br_lens) {
+            pllTree2StringREC(tr->tree_string, tr, pr, tr->start->back, PLL_TRUE,
+                    PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_TRUE, PLL_SUMMARIZE_LENGTH, PLL_FALSE, PLL_FALSE); // this time with branch lengths
+            char * tree_str_br_lens = (char *) malloc (strlen(tr->tree_string) + 1);
+            strcpy(tree_str_br_lens, tr->tree_string);
+            pllUFBootDataPtr->treels_newick[tree_index] = tree_str_br_lens; // NOTE(review): a previously stored string is overwritten without free -- confirm
+        }
+        if (pllUFBootDataPtr->boot_samples == NULL) { // no bootstrap samples: refresh the stored pattern likelihoods and stop
+            (pllUFBootDataPtr->treels_ptnlh)[tree_index] =
+                    (double *) malloc(pllUFBootDataPtr->n_patterns * sizeof(double)); // NOTE(review): any earlier buffer in this slot is overwritten without free -- confirm
+            pllComputePatternLikelihood(tr, (pllUFBootDataPtr->treels_ptnlh)[tree_index], &cur_logl);
+/*            free(tree_str);
+            free(item_ptr->data);
+            free(item_ptr);*/
+            return;
+        }
+        if (verbose_mode >= VB_MAX)
+            printf("Update treels_logl[%d] := %f\n", tree_index, cur_logl);
+
+    } else {
+		if (pllUFBootDataPtr->logl_cutoff != 0.0 && cur_logl <= pllUFBootDataPtr->logl_cutoff + 1e-4){ // skip trees at or below the cutoff; a cutoff of exactly 0.0 disables the check
+		/*            free(tree_str);
+			free(item_ptr->data);
+			free(item_ptr);*/
+			return;
+		}
+
+		if(pllUFBootDataPtr->treels_size == pllUFBootDataPtr->candidate_trees_count) // tables are full: grow them first
+			pllResizeUFBootData();
+
+		tree_index = pllUFBootDataPtr->candidate_trees_count; // the new tree takes the next free slot
+		pllUFBootDataPtr->candidate_trees_count++;
+		if (globalParam->store_candidate_trees){
+			*((int *)item_ptr->data) = tree_index;
+			item_ptr->str = tree_str;
+			pllHashAdd(pllUFBootDataPtr->treels, pllHashString(tree_str, pllUFBootDataPtr->treels->size), tree_str, item_ptr->data);
+		}
+		pllUFBootDataPtr->treels_logl[tree_index] = cur_logl;
+
+		if (verbose_mode >= VB_MAX)
+			printf("Add    treels_logl[%d] := %f\n", tree_index, cur_logl);
+   }
+
+    //if (write_intermediate_trees)
+    //        printTree(out_treels, WT_NEWLINE | WT_BR_LEN);
+
+    double *pattern_lh = (double *) malloc(pllUFBootDataPtr->n_patterns * sizeof(double));
+    if(!pattern_lh) outError("Not enough dynamic memory!");
+    pllComputePatternLikelihood(tr, pattern_lh, &cur_logl); // also overwrites cur_logl with the weighted sum of the pattern values
+
+    if (pllUFBootDataPtr->boot_samples == NULL) {
+        // for runGuidedBootstrap
+        pllUFBootDataPtr->treels_ptnlh[tree_index] = pattern_lh; // the table keeps the buffer
+    } else {
+        // online bootstrap
+        int nptn = pllUFBootDataPtr->n_patterns;
+        int updated = 0;
+        int nsamples = globalParam->gbo_replicates;
+        for (int sample = 0; sample < nsamples; sample++) {
+            double rell = 0.0; // logl of this tree under the resampled pattern weights
+            for (int ptn = 0; ptn < nptn; ptn++)
+                rell += pattern_lh[ptn] * pllUFBootDataPtr->boot_samples[sample][ptn];
+
+            if (rell > pllUFBootDataPtr->boot_logl[sample] + globalParam->ufboot_epsilon ||
+                (rell > pllUFBootDataPtr->boot_logl[sample] - globalParam->ufboot_epsilon &&
+                    random_double() <= 1.0/(pllUFBootDataPtr->boot_counts[sample]+1))) { // accept when better by more than epsilon, or within epsilon with probability 1/(boot_counts+1)
+                if (!globalParam->store_candidate_trees){ // trees were not hashed above, so register this one now
+                    is_stored = pllHashSearch(pllUFBootDataPtr->treels, tree_str, &(item_ptr->data));
+                    if(is_stored)
+                        tree_index = *((int *)item_ptr->data);
+                    else{
+                        *((int *)item_ptr->data) = tree_index = pllUFBootDataPtr->candidate_trees_count - 1;
+                        item_ptr->str = tree_str;
+                        pllHashAdd(pllUFBootDataPtr->treels, pllHashString(tree_str, pllUFBootDataPtr->treels->size), tree_str, item_ptr->data);
+//                        pllHashAdd(pllUFBootDataPtr->treels, tree_str, item_ptr->data);
+                    }
+                }
+                if (rell <= pllUFBootDataPtr->boot_logl[sample] +
+                        globalParam->ufboot_epsilon) {
+                    pllUFBootDataPtr->boot_counts[sample]++; // a tie within epsilon: one more tree at this level
+                } else {
+                    pllUFBootDataPtr->boot_counts[sample] = 1; // a clearly better tree restarts the count
+                }
+                if(rell > pllUFBootDataPtr->boot_logl[sample])
+                    pllUFBootDataPtr->boot_logl[sample] = rell;
+                pllUFBootDataPtr->boot_trees[sample] = tree_index;
+                updated++;
+            }
+        }
+/*        if (updated && verbose_mode >= VB_MAX)
+         printf("%d boot trees updated\n", updated);*/
+    }
+    if (pllUFBootDataPtr->save_all_br_lens) {
+        pllTree2StringREC(tr->tree_string, tr, pr, tr->start->back, PLL_TRUE,
+                PLL_FALSE, PLL_FALSE, PLL_FALSE, PLL_TRUE, PLL_SUMMARIZE_LH, PLL_FALSE, PLL_FALSE);
+        char * s = (char *) malloc (strlen(tr->tree_string) + 1);
+        strcpy(s, tr->tree_string);
+        pllUFBootDataPtr->treels_newick[tree_index] = s;
+    }
+
+//    if(!globalParam->store_candidate_trees){
+//        free(tree_str);
+//        free(item_ptr->data);
+//        free(item_ptr);
+//    }
+    if (pllUFBootDataPtr->boot_samples){ // online bootstrap: the pattern likelihoods are not kept
+        free(pattern_lh);
+        pllUFBootDataPtr->treels_ptnlh[tree_index] = NULL;
+    }
+
+//    printf("Done freeing: max = %d, count = %d, size = %d\n",
+//            pllUFBootDataPtr->max_candidate_trees,
+//            pllUFBootDataPtr->candidate_trees_count,
+//            pllUFBootDataPtr->treels_size);
+}
+
+/**
+* DTH:
+* Copy the first n_patterns entries of tr->lhs into ptnlh and store their sum,
+* each entry weighted by the matching tr->aliaswgt entry, in *cur_logl.
+* @param tr: the tree (pointer to a pllInstance); lhs and aliaswgt are read
+* @param ptnlh: output array with room for pllUFBootDataPtr->n_patterns values
+* @param cur_logl: output, receives the weighted sum
+*/
+void pllComputePatternLikelihood(pllInstance* tr, double * ptnlh, double * cur_logl){
+    double weightedSum = 0;
+    int ptn = 0;
+    while (ptn < pllUFBootDataPtr->n_patterns) {
+        ptnlh[ptn] = tr->lhs[ptn];
+        weightedSum += tr->lhs[ptn] * tr->aliaswgt[ptn];
+        ++ptn;
+    }
+    *cur_logl = weightedSum;
+}
+
+/**
+* DTH:
+* Grow the per-tree arrays of UFBootData (treels_logl, treels_newick,
+* treels_ptnlh) to twice the current candidate_trees_count and update
+* treels_size accordingly. With a count of 0 the arrays grow to one slot, so
+* the caller can always store the next tree. The first candidate_trees_count
+* entries of each array are preserved; new treels_newick / treels_ptnlh slots
+* are NULL, new treels_logl slots are left uninitialised.
+* All three arrays are allocated before any of them is replaced, so a failed
+* allocation leaves the existing tables untouched when outError is called.
+*/
+void pllResizeUFBootData(){
+    const int count = pllUFBootDataPtr->candidate_trees_count;
+    /* doubling an empty table would leave no room for the tree about to be added */
+    const int newSize = (count > 0) ? 2 * count : 1;
+    const size_t used = (count > 0) ? (size_t) count : 0;
+    const size_t capacity = (size_t) newSize;
+
+    double * rtreels_logl = (double *) malloc(capacity * sizeof(double));
+    char ** rtreels_newick = (char **) malloc(capacity * sizeof(char *));
+    double ** rtreels_ptnlh = (double **) malloc(capacity * sizeof(double *));
+    if(!rtreels_logl || !rtreels_newick || !rtreels_ptnlh){
+        free(rtreels_logl);
+        free(rtreels_newick);
+        free(rtreels_ptnlh);
+        outError("Not enough dynamic memory!");
+        return;
+    }
+
+    if(used > 0)
+        memcpy(rtreels_logl, pllUFBootDataPtr->treels_logl, used * sizeof(double));
+    free(pllUFBootDataPtr->treels_logl);
+    pllUFBootDataPtr->treels_logl = rtreels_logl;
+
+    memset(rtreels_newick, 0, capacity * sizeof(char *));
+    if(used > 0)
+        memcpy(rtreels_newick, pllUFBootDataPtr->treels_newick, used * sizeof(char *));
+    free(pllUFBootDataPtr->treels_newick);
+    pllUFBootDataPtr->treels_newick = rtreels_newick;
+
+    memset(rtreels_ptnlh, 0, capacity * sizeof(double *));
+    if(used > 0)
+        memcpy(rtreels_ptnlh, pllUFBootDataPtr->treels_ptnlh, used * sizeof(double *));
+    free(pllUFBootDataPtr->treels_ptnlh);
+    pllUFBootDataPtr->treels_ptnlh = rtreels_ptnlh;
+
+    pllUFBootDataPtr->treels_size = newSize;
+}
+
+
+/**
+* DTH:
+* (Based on function Tree2StringREC of PLL)
+* Recursively write the subtree at p into treestr in Newick form. Unless
+* printNames is set, tips are written as IQTree taxa ID (starts at 0), i.e.
+* p->number - 1, instead of PLL taxa ID (starts at 1). The node
+* tr->start->back is treated as the root: it gets a third child (p->back) and
+* the terminating ";", and the "\n" written after it is stripped again.
+* @param treestr: output position; there is no size parameter, so the buffer
+*                 must be large enough for the whole tree
+* @param printLikelihood, finalPrint, perGene: only passed on to the recursive
+*                 calls, never read otherwise
+* @param others: same as in PLL's
+* @return pointer to the terminating '\0' of the text written so far
+* 2014.4.23: REPLACE getBranchLength(tr, pr, perGene, p) BY pllGetBranchLength(tr, p, pr->numberOfPartitions)
+* BECAUSE OF LIB NEW DECLARATION: pllGetBranchLength (pllInstance *tr, nodeptr p, int partition_id);
+* NOTE(review): the third argument passed is the partition COUNT while the
+* declaration quoted above names it partition_id -- confirm this is intended.
+*/
+static char *pllTree2StringREC(char *treestr, pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean  printBranchLengths, pllBoolean  printNames,
+		pllBoolean  printLikelihood, pllBoolean  rellTree, pllBoolean  finalPrint, int perGene, pllBoolean  branchLabelSupport, pllBoolean  printSHSupport)
+{
+	char * result = treestr; // DTH: added this var to be able to remove the '\n' at the end
+	char  *nameptr;
+
+	if(isTip(p->number, tr->mxtips)){
+		if(printNames){
+			nameptr = tr->nameList[p->number];
+			sprintf(treestr, "%s", nameptr);
+		}else
+			sprintf(treestr, "%d", p->number - 1); // 0-based taxon id
+
+		while (*treestr) treestr++; // advance to the end of what was just written
+	}else{
+		*treestr++ = '(';
+		treestr = pllTree2StringREC(treestr, tr, pr, p->next->back, printBranchLengths, printNames, printLikelihood, rellTree,
+				finalPrint, perGene, branchLabelSupport, printSHSupport);
+		*treestr++ = ',';
+		treestr = pllTree2StringREC(treestr, tr, pr, p->next->next->back, printBranchLengths, printNames, printLikelihood, rellTree,
+				finalPrint, perGene, branchLabelSupport, printSHSupport);
+		if(p == tr->start->back){ // the root node also prints the subtree behind itself as a third child
+			*treestr++ = ',';
+			treestr = pllTree2StringREC(treestr, tr, pr, p->back, printBranchLengths, printNames, printLikelihood, rellTree,
+				finalPrint, perGene, branchLabelSupport, printSHSupport);
+		}
+		*treestr++ = ')';
+	}
+
+	if(p == tr->start->back){ // root: terminate the tree
+		if(printBranchLengths && !rellTree)
+			sprintf(treestr, ":0.0;\n");
+		else
+			sprintf(treestr, ";\n");
+	}else{
+		if(rellTree || branchLabelSupport || printSHSupport){ // each sprintf below writes at the same position, so when several flags are set the last one wins
+			if(( !isTip(p->number, tr->mxtips)) &&
+					( !isTip(p->back->number, tr->mxtips)))
+			{
+				assert(p->bInf != (branchInfo *)NULL);
+				if(rellTree)
+					sprintf(treestr, "%d:%8.20f", p->bInf->support, p->z[0]);
+				if(branchLabelSupport)
+					sprintf(treestr, ":%8.20f[%d]", p->z[0], p->bInf->support);
+				if(printSHSupport)
+					sprintf(treestr, ":%8.20f[%d]", pllGetBranchLength(tr, p, pr->numberOfPartitions), p->bInf->support);
+			}else{
+				if(rellTree || branchLabelSupport)
+					sprintf(treestr, ":%8.20f", p->z[0]);
+				if(printSHSupport)
+					sprintf(treestr, ":%8.20f", pllGetBranchLength(tr, p, pr->numberOfPartitions));
+			}
+		}else{
+			if(printBranchLengths)
+				sprintf(treestr, ":%8.20f", pllGetBranchLength(tr, p, pr->numberOfPartitions));
+			else
+				sprintf(treestr, "%s", "\0"); // writes an empty string, i.e. just terminates the output here
+		}
+	}
+
+	if(result[strlen(result) - 1] == '\n') result[strlen(result) - 1] = '\0'; // result is never empty here: at least one character was written above
+	while (*treestr) treestr++;
+	return  treestr;
+}
+
+
diff --git a/pllnni.h b/pllnni.h
new file mode 100644
index 0000000..a0597fe
--- /dev/null
+++ b/pllnni.h
@@ -0,0 +1,254 @@
+#ifndef NNISEARCH_H
+#define NNISEARCH_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "tools.h"
+#include <string>
+#include <sstream>
+#include <set>
+#include <algorithm>
+#include <map>
+#include <vector>
+//#include <unordered_set>
+extern "C" {
+#include "pllrepo/src/pllInternal.h"
+}
+
+typedef struct { /* one candidate NNI move plus the values recorded for it */
+	nodeptr p; // presumably the node representing the branch the NNI is applied to -- TODO confirm
+	int nniType;
+	char* idString;
+    double z0[PLL_NUM_BRANCHES]; // values for p (presumably its branch lengths, p->z -- confirm)
+    double z1[PLL_NUM_BRANCHES]; // same, for p->next
+    double z2[PLL_NUM_BRANCHES]; // same, for p->next->next
+    double z3[PLL_NUM_BRANCHES]; // same, for q->next (q is presumably p->back -- confirm)
+    double z4[PLL_NUM_BRANCHES]; // same, for q->next->next
+	double likelihood; // key compared by comparePLLNNIMove (ascending order)
+	double loglDelta;
+	double negLoglDelta;
+} pllNNIMove;
+
+typedef struct {
+    // FOR GENERAL TREE SEARCH
+	bool speednni;
+	vector<pllNNIMove> posNNIList; // positive NNIs
+	unordered_set<string> aBranches; // Set of branches that are affected by the previous NNIs. NOTE(review): <unordered_set> is commented out in this header's includes; presumably tools.h provides unordered_set -- confirm
+	double curLogl; // Current tree log-likelihood
+	int curIter; // Current iteration number
+	double curPerStrength; // Current perturbation strength
+
+	// FOR NNI SEARCH
+	NNI_Type nni_type;
+	int numAppliedNNIs; // total number of applied NNIs so far
+	int curNumAppliedNNIs; // number of applied NNIs at the current step
+	int curNumNNISteps;
+	int maxNNISteps;
+	set<double> deltaLogl; // logl differences between nni1 and nni5
+} SearchInfo;
+
+
+/* This is the info you need to copy the vector*/
+typedef struct
+{
+  int node_number;
+  int num_partitions;
+  size_t *partition_sizes;
+  double **lh_values;
+  int **expVector;
+} LH_VECTOR;
+
+inline bool comparePLLNNIMove(const pllNNIMove &a, const pllNNIMove &b) // "less than" predicate on pllNNIMove
+{
+    return a.likelihood < b.likelihood; // true when a has the strictly lower likelihood
+}
+
+void countDistinctTrees(pllInstance* pllInst, partitionList *pllPartitions);
+
+//static int cmp_nni(const void* nni1, const void* nni2);
+
+int compareDouble(const void * a, const void * b);
+
+pllNNIMove *getNonConflictNNIList(pllInstance* tr);
+
+void _update(pllInstance *tr, partitionList *pr, nodeptr p);
+
+#define MAX_NUM_DELTA 10000
+
+typedef struct {
+	double delta[MAX_NUM_DELTA]; // fixed-capacity array of MAX_NUM_DELTA entries
+	int num_delta; // presumably the number of entries of delta in use -- TODO confirm
+	double delta_min;
+	int doNNICut;
+} NNICUT;
+
+/**
+ * get all the nodes affected by the NNI
+ * @param p
+ * @return
+ */
+set<int> getAffectedNodes(pllInstance* tr, nodeptr p);
+
+string getBranString(nodeptr p);
+
+bool containsAffectedNodes(nodeptr p, SearchInfo &searchinfo);
+
+void updateBranchLengthForNNI(pllInstance* tr, partitionList *pr, pllNNIMove &nni);
+
+void pllEvalAllNNIs(pllInstance *tr, partitionList *pr, SearchInfo &searchinfo);
+
+double pllDoRandomNNIs(pllInstance *tr, partitionList *pr, int numNNI);
+
+/**
+ *  Compute the possible best logl improvement of NNI5 over NNI1
+ *  @param searchinfo contains the logl improvements collected in previous iterations
+ *  @return a logl delta that is larger than 95% of the collected values
+ */
+double estBestLoglImp(SearchInfo* searchinfo);
+
+/**
+ *  Evaluate NNI moves for the current internal branch
+ *  @param tr the current tree data structure
+ *  @param pr partition data structure
+ *  @param p the node representing the current branch
+ *  @return number of positive NNIs found
+ */
+int evalNNIForBran(pllInstance* tr, partitionList *pr, nodeptr p, SearchInfo &searchinfo);
+
+/**
+ * Perturb the best tree
+ *
+ * Given the best tree, apply some NNIs to escape a local optimum
+ * @param tr the tree data structure
+ * @param pr partition data structure
+ * @param nnis list of all NNIs to apply (a vector, so no separate
+ *        size argument is taken)
+ * @return a double; presumably the tree log-likelihood after the NNIs are applied -- TODO confirm
+ */
+double pllPerturbTree(pllInstance *tr, partitionList *pr, vector<pllNNIMove> &nnis);
+
+/**
+ * 	do 1 round of fastNNI
+ *  return new tree log-likelihood if found improving NNI otherwise -1.0
+ *
+ *  @param[in] tr the tree data structure
+ *  @param[in] pr partition data structure
+ *  @param searchinfo search state, passed by non-const reference (see SearchInfo)
+ *  NOTE(review): this comment previously listed nniList (the 2(n-3) evaluated NNIs),
+ *  tabuNNIs (tabu list), nni_count (number of applied NNIs) and deltaNNI (average
+ *  improvement per NNI); none are parameters here -- presumably SearchInfo carries them now, confirm.
+ */
+double pllDoNNISearch(pllInstance* tr, partitionList *pr, SearchInfo &searchinfo);
+
+void pllUpdateTabuList(pllInstance *tr, SearchInfo &searchinfo);
+
+void pllSaveQuartetForSubTree(pllInstance* tr, nodeptr p, SearchInfo &searchinfo);
+
+
+/**
+ *  @brief Do 1 NNI move.
+ *  @param[in] tr: the tree data structure
+ *  @param[in] pr partition data structure
+ *  @param[in] p, swap: p is a node pointer (presumably the branch to swap around -- confirm); swap selects one of the 2 NNI moves, either 0 or 1
+ *  @param[in] nni_type, searchinfo: nni_type is the NNI_Type to use; searchinfo is optional and defaults to NULL
+ */
+double doOneNNI(pllInstance * tr, partitionList *pr, nodeptr p, int swap, NNI_Type nni_type, SearchInfo *searchinfo = NULL);
+
+void pllGetAllInBran(pllInstance *tr, vector<nodeptr> &branlist);
+
+void pllGetAllInBranForSubtree(pllInstance *tr, nodeptr p, vector<nodeptr> &branlist);
+
+
+string convertQuartet2String(nodeptr p);
+/**
+ *  Go through all internal branches of the tree and evaluate all possible
+ *  NNI moves (2 per branch, i.e. 2(n-3) moves in total)
+ */
+void evalAllNNI(pllInstance* tr);
+
+/**
+ * 	@brief evaluate all NNIs within the subtree specified by node p and
+ * 	populate the list containing all possible NNI moves
+ * 	@param[in] tr: the tree data structure
+ * 	@param[in] pr partition data structure
+ * 	@param[in] p node pointer that specifies the subtree
+ * 	@param searchinfo search state passed by non-const reference; presumably where the NNI list is stored -- TODO confirm
+ */
+void evalNNIForSubtree(pllInstance* tr, partitionList *pr, nodeptr p, SearchInfo &searchinfo);
+
+/*
+ *  @brief return the array which can be used to store evaluated NNI moves
+ *
+ *  @param[in] tr: the tree data structure
+ */
+pllNNIMove *getNNIList(pllInstance* tr);
+
+
+
+/*
+ * ****************************************************************************
+ * pllUFBoot area
+ * ****************************************************************************
+ */
+
+/**
+ * DTH:
+ * pllUFBootData struct
+ * This one keeps all info necessary to run UFBoot in PLL mode
+ */
+typedef struct{
+    int max_candidate_trees;
+    int treels_size; // tracked size of the treels_logl / treels_newick / treels_ptnlh arrays below
+    int save_all_trees;
+    pllBoolean save_all_br_lens;
+    double logl_cutoff;
+    int duplication_counter;
+    int n_patterns;
+    struct pllHashTable * treels;
+    unsigned int candidate_trees_count; /* counter of trees in pllHashTable */
+    double * treels_logl; // maintain size == treels_size
+    char ** treels_newick; // maintain size == treels_size
+    double ** treels_ptnlh; // maintain size == treels_size
+    int ** boot_samples;
+    double * boot_logl;
+    int * boot_counts;
+    int * boot_trees;
+} pllUFBootData;
+
+/**
+ * DTH:
+ * The PLL version of saveCurrentTree function
+ * @param tr: the tree (a pointer to a pllInstance)
+ * @param pr: pointer to a partitionList (this one keeps tons of tree info)
+ * @param p: root?
+ */
+void pllSaveCurrentTree(pllInstance* tr, partitionList *pr, nodeptr p);
+
+/**
+ * DTH:
+ * Extract the array of site log likelihood to be kept in ptnlh
+ * And update *cur_logl
+ * @param tr: the tree (pointer to a pllInstance)
+ * @param ptnlh: to-be-kept array of site log likelihood
+ * @param cur_logl: pointer to current tree log likelihood
+ */
+void pllComputePatternLikelihood(pllInstance* tr, double * ptnlh, double * cur_logl);
+
+/**
+ * DTH:
+ * Resize some of the arrays in UFBootData if they're full
+ * and update treels_size (which tracks the size of these arrays)
+ */
+void pllResizeUFBootData();
+
+/**
+ * DTH:
+ * (Based on function Tree2StringREC of PLL)
+ * Print out the tree topology with IQTree taxa ID (starts at 0) instead of PLL taxa ID (starts at 1)
+ * @param All are the same as in PLL's. NOTE(review): this prototype is static in a header, so each including file gets its own internal-linkage declaration -- confirm intended
+ */
+static char *pllTree2StringREC(char *treestr, pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean printBranchLengths, pllBoolean printNames,
+		pllBoolean printLikelihood, pllBoolean rellTree, pllBoolean finalPrint, int perGene, pllBoolean branchLabelSupport, pllBoolean printSHSupport);
+
+#endif
+
diff --git a/pllrepo/AUTHORS b/pllrepo/AUTHORS
new file mode 100644
index 0000000..e69de29
diff --git a/pllrepo/COPYING b/pllrepo/COPYING
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/pllrepo/COPYING
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/pllrepo/ChangeLog b/pllrepo/ChangeLog
new file mode 100644
index 0000000..e69de29
diff --git a/pllrepo/Doxyfile b/pllrepo/Doxyfile
new file mode 100644
index 0000000..af84260
--- /dev/null
+++ b/pllrepo/Doxyfile
@@ -0,0 +1,2299 @@
+# Doxyfile 1.8.5
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "Phylogenetic Likelihood Library"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         = 1.0.0
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = "A software library for phylogenetic inference"
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is included in
+# the documentation. The maximum height of the logo should not exceed 55 pixels
+# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
+# to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = /var/www/test
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-
+# Traditional, Croatian, Czech, Danish, Dutch, English, Esperanto, Farsi,
+# Finnish, French, German, Greek, Hungarian, Italian, Japanese, Japanese-en,
+# Korean, Korean-en, Latvian, Norwegian, Macedonian, Persian, Polish,
+# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish,
+# Turkish, Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
+# description of a member or function before the detailed description.
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
+# before file names in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used.
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
+# new page for each member. If set to NO, the documentation of a member will be
+# part of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word
+# or globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespaces
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO these classes will be included in the various overviews. This option has
+# no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
+# todo list. This list is created by putting \todo commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
+# test list. This list is created by putting \test commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES the list
+# will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. Do not use file names with spaces, bibtex cannot handle them. See
+# also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO doxygen will only warn about wrong or incomplete parameter
+# documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces.
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = src
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank the
+# following patterns are tested: *.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
+# *.qsf, *.as and *.js.
+
+FILE_PATTERNS          =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# ANamespace::AClass, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             = dox
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER ) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# compiled with the --with-libclang option.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file must include any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            = dox/header2.html
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
+# defined cascading style sheet that is included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet file to the output directory. For an example
+# see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  = dox/pll.css
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the stylesheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = YES
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls whether a separate .chi index file is
+# generated (YES) or whether it is included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = NO
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavours of web server based searching depending on the
+# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for
+# searching and an index file used by the script. When EXTERNAL_SEARCH is
+# enabled the indexing and searching needs to be provided by external tools. See
+# the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps each id
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. To get the times font for
+# instance you can specify
+# EXTRA_PACKAGES=times
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber. Doxygen will
+# replace them by respectively the title of the page, the current date and time,
+# only the current date, the version number of doxygen, the project name (see
+# PROJECT_NAME), or the project number (see PROJECT_NUMBER).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements,
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = YES
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, which can be used by
+# a validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD, which can be used by a
+# validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
+# Definitions (see http://autogen.sf.net) file that captures the structure of
+# the code including all documentation. Note that this feature is still
+# experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = NO
+
+# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
+# in the source code. If set to NO only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have an
+# all uppercase name, and do not end with a semicolon. Such function macros are
+# typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have an unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed in the
+# class index. If set to NO only the inherited external classes will be listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
+# the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO.
+# The default value is: NO.
+
+HAVE_DOT               = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = YES
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = YES
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot.
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif and svg.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lie
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff --git a/pllrepo/INSTALL b/pllrepo/INSTALL
new file mode 100644
index 0000000..a1e89e1
--- /dev/null
+++ b/pllrepo/INSTALL
@@ -0,0 +1,370 @@
+Installation Instructions
+*************************
+
+Copyright (C) 1994-1996, 1999-2002, 2004-2011 Free Software Foundation,
+Inc.
+
+   Copying and distribution of this file, with or without modification,
+are permitted in any medium without royalty provided the copyright
+notice and this notice are preserved.  This file is offered as-is,
+without warranty of any kind.
+
+Basic Installation
+==================
+
+   Briefly, the shell commands `./configure; make; make install' should
+configure, build, and install this package.  The following
+more-detailed instructions are generic; see the `README' file for
+instructions specific to this package.  Some packages provide this
+`INSTALL' file but do not implement all of the features documented
+below.  The lack of an optional feature in a given package is not
+necessarily a bug.  More recommendations for GNU packages can be found
+in *note Makefile Conventions: (standards)Makefile Conventions.
+
+   The `configure' shell script attempts to guess correct values for
+various system-dependent variables used during compilation.  It uses
+those values to create a `Makefile' in each directory of the package.
+It may also create one or more `.h' files containing system-dependent
+definitions.  Finally, it creates a shell script `config.status' that
+you can run in the future to recreate the current configuration, and a
+file `config.log' containing compiler output (useful mainly for
+debugging `configure').
+
+   It can also use an optional file (typically called `config.cache'
+and enabled with `--cache-file=config.cache' or simply `-C') that saves
+the results of its tests to speed up reconfiguring.  Caching is
+disabled by default to prevent problems with accidental use of stale
+cache files.
+
+   If you need to do unusual things to compile the package, please try
+to figure out how `configure' could check whether to do them, and mail
+diffs or instructions to the address given in the `README' so they can
+be considered for the next release.  If you are using the cache, and at
+some point `config.cache' contains results you don't want to keep, you
+may remove or edit it.
+
+   The file `configure.ac' (or `configure.in') is used to create
+`configure' by a program called `autoconf'.  You need `configure.ac' if
+you want to change it or regenerate `configure' using a newer version
+of `autoconf'.
+
+   The simplest way to compile this package is:
+
+  1. `cd' to the directory containing the package's source code and type
+     `./configure' to configure the package for your system.
+
+     Running `configure' might take a while.  While running, it prints
+     some messages telling which features it is checking for.
+
+  2. Type `make' to compile the package.
+
+  3. Optionally, type `make check' to run any self-tests that come with
+     the package, generally using the just-built uninstalled binaries.
+
+  4. Type `make install' to install the programs and any data files and
+     documentation.  When installing into a prefix owned by root, it is
+     recommended that the package be configured and built as a regular
+     user, and only the `make install' phase executed with root
+     privileges.
+
+  5. Optionally, type `make installcheck' to repeat any self-tests, but
+     this time using the binaries in their final installed location.
+     This target does not install anything.  Running this target as a
+     regular user, particularly if the prior `make install' required
+     root privileges, verifies that the installation completed
+     correctly.
+
+  6. You can remove the program binaries and object files from the
+     source code directory by typing `make clean'.  To also remove the
+     files that `configure' created (so you can compile the package for
+     a different kind of computer), type `make distclean'.  There is
+     also a `make maintainer-clean' target, but that is intended mainly
+     for the package's developers.  If you use it, you may have to get
+     all sorts of other programs in order to regenerate files that came
+     with the distribution.
+
+  7. Often, you can also type `make uninstall' to remove the installed
+     files again.  In practice, not all packages have tested that
+     uninstallation works correctly, even though it is required by the
+     GNU Coding Standards.
+
+  8. Some packages, particularly those that use Automake, provide `make
+     distcheck', which can be used by developers to test that all other
+     targets like `make install' and `make uninstall' work correctly.
+     This target is generally not run by end users.
+
+Compilers and Options
+=====================
+
+   Some systems require unusual options for compilation or linking that
+the `configure' script does not know about.  Run `./configure --help'
+for details on some of the pertinent environment variables.
+
+   You can give `configure' initial values for configuration parameters
+by setting variables in the command line or in the environment.  Here
+is an example:
+
+     ./configure CC=c99 CFLAGS=-g LIBS=-lposix
+
+   *Note Defining Variables::, for more details.
+
+Compiling For Multiple Architectures
+====================================
+
+   You can compile the package for more than one kind of computer at the
+same time, by placing the object files for each architecture in their
+own directory.  To do this, you can use GNU `make'.  `cd' to the
+directory where you want the object files and executables to go and run
+the `configure' script.  `configure' automatically checks for the
+source code in the directory that `configure' is in and in `..'.  This
+is known as a "VPATH" build.
+
+   With a non-GNU `make', it is safer to compile the package for one
+architecture at a time in the source code directory.  After you have
+installed the package for one architecture, use `make distclean' before
+reconfiguring for another architecture.
+
+   On MacOS X 10.5 and later systems, you can create libraries and
+executables that work on multiple system types--known as "fat" or
+"universal" binaries--by specifying multiple `-arch' options to the
+compiler but only a single `-arch' option to the preprocessor.  Like
+this:
+
+     ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
+                 CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
+                 CPP="gcc -E" CXXCPP="g++ -E"
+
+   This is not guaranteed to produce working output in all cases; you
+may have to build one architecture at a time and combine the results
+using the `lipo' tool if you have problems.
+
+Installation Names
+==================
+
+   By default, `make install' installs the package's commands under
+`/usr/local/bin', include files under `/usr/local/include', etc.  You
+can specify an installation prefix other than `/usr/local' by giving
+`configure' the option `--prefix=PREFIX', where PREFIX must be an
+absolute file name.
+
+   You can specify separate installation prefixes for
+architecture-specific files and architecture-independent files.  If you
+pass the option `--exec-prefix=PREFIX' to `configure', the package uses
+PREFIX as the prefix for installing programs and libraries.
+Documentation and other data files still use the regular prefix.
+
+   In addition, if you use an unusual directory layout you can give
+options like `--bindir=DIR' to specify different values for particular
+kinds of files.  Run `configure --help' for a list of the directories
+you can set and what kinds of files go in them.  In general, the
+default for these options is expressed in terms of `${prefix}', so that
+specifying just `--prefix' will affect all of the other directory
+specifications that were not explicitly provided.
+
+   The most portable way to affect installation locations is to pass the
+correct locations to `configure'; however, many packages provide one or
+both of the following shortcuts of passing variable assignments to the
+`make install' command line to change installation locations without
+having to reconfigure or recompile.
+
+   The first method involves providing an override variable for each
+affected directory.  For example, `make install
+prefix=/alternate/directory' will choose an alternate location for all
+directory configuration variables that were expressed in terms of
+`${prefix}'.  Any directories that were specified during `configure',
+but not in terms of `${prefix}', must each be overridden at install
+time for the entire installation to be relocated.  The approach of
+makefile variable overrides for each directory variable is required by
+the GNU Coding Standards, and ideally causes no recompilation.
+However, some platforms have known limitations with the semantics of
+shared libraries that end up requiring recompilation when using this
+method, particularly noticeable in packages that use GNU Libtool.
+
+   The second method involves providing the `DESTDIR' variable.  For
+example, `make install DESTDIR=/alternate/directory' will prepend
+`/alternate/directory' before all installation names.  The approach of
+`DESTDIR' overrides is not required by the GNU Coding Standards, and
+does not work on platforms that have drive letters.  On the other hand,
+it does better at avoiding recompilation issues, and works well even
+when some directory options were not specified in terms of `${prefix}'
+at `configure' time.
+
+Optional Features
+=================
+
+   If the package supports it, you can cause programs to be installed
+with an extra prefix or suffix on their names by giving `configure' the
+option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
+
+   Some packages pay attention to `--enable-FEATURE' options to
+`configure', where FEATURE indicates an optional part of the package.
+They may also pay attention to `--with-PACKAGE' options, where PACKAGE
+is something like `gnu-as' or `x' (for the X Window System).  The
+`README' should mention any `--enable-' and `--with-' options that the
+package recognizes.
+
+   For packages that use the X Window System, `configure' can usually
+find the X include and library files automatically, but if it doesn't,
+you can use the `configure' options `--x-includes=DIR' and
+`--x-libraries=DIR' to specify their locations.
+
+   Some packages offer the ability to configure how verbose the
+execution of `make' will be.  For these packages, running `./configure
+--enable-silent-rules' sets the default to minimal output, which can be
+overridden with `make V=1'; while running `./configure
+--disable-silent-rules' sets the default to verbose, which can be
+overridden with `make V=0'.
+
+Particular systems
+==================
+
+   On HP-UX, the default C compiler is not ANSI C compatible.  If GNU
+CC is not installed, it is recommended to use the following options in
+order to use an ANSI C compiler:
+
+     ./configure CC="cc -Ae -D_XOPEN_SOURCE=500"
+
+and if that doesn't work, install pre-built binaries of GCC for HP-UX.
+
+   HP-UX `make' updates targets which have the same time stamps as
+their prerequisites, which makes it generally unusable when shipped
+generated files such as `configure' are involved.  Use GNU `make'
+instead.
+
+   On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
+parse its `<wchar.h>' header file.  The option `-nodtk' can be used as
+a workaround.  If GNU CC is not installed, it is therefore recommended
+to try
+
+     ./configure CC="cc"
+
+and if that doesn't work, try
+
+     ./configure CC="cc -nodtk"
+
+   On Solaris, don't put `/usr/ucb' early in your `PATH'.  This
+directory contains several dysfunctional programs; working variants of
+these programs are available in `/usr/bin'.  So, if you need `/usr/ucb'
+in your `PATH', put it _after_ `/usr/bin'.
+
+   On Haiku, software installed for all users goes in `/boot/common',
+not `/usr/local'.  It is recommended to use the following options:
+
+     ./configure --prefix=/boot/common
+
+Specifying the System Type
+==========================
+
+   There may be some features `configure' cannot figure out
+automatically, but needs to determine by the type of machine the package
+will run on.  Usually, assuming the package is built to be run on the
+_same_ architectures, `configure' can figure that out, but if it prints
+a message saying it cannot guess the machine type, give it the
+`--build=TYPE' option.  TYPE can either be a short name for the system
+type, such as `sun4', or a canonical name which has the form:
+
+     CPU-COMPANY-SYSTEM
+
+where SYSTEM can have one of these forms:
+
+     OS
+     KERNEL-OS
+
+   See the file `config.sub' for the possible values of each field.  If
+`config.sub' isn't included in this package, then this package doesn't
+need to know the machine type.
+
+   If you are _building_ compiler tools for cross-compiling, you should
+use the option `--target=TYPE' to select the type of system they will
+produce code for.
+
+   If you want to _use_ a cross compiler, that generates code for a
+platform different from the build platform, you should specify the
+"host" platform (i.e., that on which the generated programs will
+eventually be run) with `--host=TYPE'.
+
+Sharing Defaults
+================
+
+   If you want to set default values for `configure' scripts to share,
+you can create a site shell script called `config.site' that gives
+default values for variables like `CC', `cache_file', and `prefix'.
+`configure' looks for `PREFIX/share/config.site' if it exists, then
+`PREFIX/etc/config.site' if it exists.  Or, you can set the
+`CONFIG_SITE' environment variable to the location of the site script.
+A warning: not all `configure' scripts look for a site script.
+
+Defining Variables
+==================
+
+   Variables not defined in a site shell script can be set in the
+environment passed to `configure'.  However, some packages may run
+configure again during the build, and the customized values of these
+variables may be lost.  In order to avoid this problem, you should set
+them in the `configure' command line, using `VAR=value'.  For example:
+
+     ./configure CC=/usr/local2/bin/gcc
+
+causes the specified `gcc' to be used as the C compiler (unless it is
+overridden in the site shell script).
+
+Unfortunately, this technique does not work for `CONFIG_SHELL' due to
+an Autoconf bug.  Until the bug is fixed you can use this workaround:
+
+     CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
+
+`configure' Invocation
+======================
+
+   `configure' recognizes the following options to control how it
+operates.
+
+`--help'
+`-h'
+     Print a summary of all of the options to `configure', and exit.
+
+`--help=short'
+`--help=recursive'
+     Print a summary of the options unique to this package's
+     `configure', and exit.  The `short' variant lists options used
+     only in the top level, while the `recursive' variant lists options
+     also present in any nested packages.
+
+`--version'
+`-V'
+     Print the version of Autoconf used to generate the `configure'
+     script, and exit.
+
+`--cache-file=FILE'
+     Enable the cache: use and save the results of the tests in FILE,
+     traditionally `config.cache'.  FILE defaults to `/dev/null' to
+     disable caching.
+
+`--config-cache'
+`-C'
+     Alias for `--cache-file=config.cache'.
+
+`--quiet'
+`--silent'
+`-q'
+     Do not print messages saying which checks are being made.  To
+     suppress all normal output, redirect it to `/dev/null' (any error
+     messages will still be shown).
+
+`--srcdir=DIR'
+     Look for the package's source code in directory DIR.  Usually
+     `configure' can determine that directory automatically.
+
+`--prefix=DIR'
+     Use DIR as the installation prefix.  *note Installation Names::
+     for more details, including other options available for fine-tuning
+     the installation locations.
+
+`--no-create'
+`-n'
+     Run the configure checks, but stop before creating any output
+     files.
+
+`configure' also accepts some other, not widely useful, options.  Run
+`configure --help' for more details.
+
diff --git a/pllrepo/Makefile.am b/pllrepo/Makefile.am
new file mode 100644
index 0000000..381c18f
--- /dev/null
+++ b/pllrepo/Makefile.am
@@ -0,0 +1,7 @@
+ACLOCAL_AMFLAGS = -I m4 --install
+# MPI/ is visited first, and only when the BUILD_MPI conditional holds.
+if BUILD_MPI
+MAYBE_MPI = MPI
+endif
+SUBDIRS = $(MAYBE_MPI) src man examples
+
diff --git a/pllrepo/NEWS b/pllrepo/NEWS
new file mode 100644
index 0000000..e69de29
diff --git a/pllrepo/README b/pllrepo/README
new file mode 100644
index 0000000..e69de29
diff --git a/pllrepo/configure.ac b/pllrepo/configure.ac
new file mode 100644
index 0000000..5e6dd2f
--- /dev/null
+++ b/pllrepo/configure.ac
@@ -0,0 +1,123 @@
+#                                               -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ([2.68])
+AC_INIT([libpll], [1.0.0], [Tomas.Flouri at h-its.org])
+AC_CONFIG_MACRO_DIR([m4])
+AM_INIT_AUTOMAKE([subdir-objects])
+
+# AM_MAINTAINER_MODE
+
+LIBPLL_VERSION=1.0.0
+LIBPLL_MAJOR=1
+LIBPLL_MINOR=0
+LIBPLL_REV=0
+
+AC_CONFIG_SRCDIR([src/pll.h])
+AC_CONFIG_HEADERS([config.h])
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_CC
+AC_PROG_MAKE_SET
+AC_PROG_RANLIB
+AC_PROG_CPP
+LT_INIT
+
+AM_PROG_CC_C_O
+# Checks for libraries.
+# The math library is probed through log().
+AC_CHECK_LIB([m], [log])
+
+# Checks for header files.
+AC_CHECK_HEADERS([fcntl.h float.h limits.h malloc.h stddef.h stdint.h stdlib.h string.h strings.h sys/time.h unistd.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_HEADER_STDBOOL
+AC_C_INLINE
+AC_TYPE_INT64_T
+AC_TYPE_SIZE_T
+AC_TYPE_SSIZE_T
+AC_TYPE_UINT32_T
+AC_TYPE_UINT64_T
+AC_TYPE_UINT8_T
+AC_CHECK_TYPES([ptrdiff_t])
+
+# Checks for library functions.
+AC_FUNC_ERROR_AT_LINE
+AC_FUNC_MALLOC
+AC_FUNC_MMAP
+AC_FUNC_REALLOC
+AC_CHECK_FUNCS([atexit clock_gettime getcwd getpagesize gettimeofday memmove memset munmap pow sqrt strcasecmp strchr strdup strndup strstr])
+
+have_pthreads=no
+AC_SEARCH_LIBS([pthread_create], [pthread], [have_pthreads=yes])
+# Finding the library is not sufficient: pthread.h must be usable as well.
+if test "x${have_pthreads}" = xyes; then
+  AC_CHECK_HEADERS([pthread.h], [], [have_pthreads=no])
+fi
+
+# Build variants: static defaults first, then CPU detection, then user overrides.
+have_generic=yes
+have_avx=no
+have_avx_pthreads=no
+have_avx_mpi=no
+have_sse3=no
+have_sse3_pthreads=no
+have_sse3_mpi=no
+
+AX_EXT
+# The operands of `=' must be separate words, otherwise test only sees one non-empty string.
+if test "x${ax_cv_have_avx_ext}" = "xyes"; then
+  have_avx=yes
+  if test "x${have_pthreads}" = "xyes"; then
+    have_avx_pthreads=yes
+  fi
+fi
+
+if test "x${ax_cv_have_sse3_ext}" = "xyes"; then
+  have_sse3=yes
+  if test "x${have_pthreads}" = "xyes"; then
+    have_sse3_pthreads=yes
+  fi
+fi
+
+# User overrides.  --enable-X and --disable-X run the same handler; $enableval (yes/no) tells them apart.
+AC_ARG_ENABLE([generic], [AS_HELP_STRING([--enable-generic], [build generic version of the library])], [have_generic=$enableval])
+# AVX variant.
+AC_ARG_ENABLE([avx], [AS_HELP_STRING([--enable-avx], [build AVX version of the library])], [have_avx=$enableval])
+# AVX + pthreads variant.
+AC_ARG_ENABLE([avx-pthreads], [AS_HELP_STRING([--enable-avx-pthreads], [build pthreads AVX version of the library])], [have_avx_pthreads=$enableval])
+# AVX + MPI variant.
+AC_ARG_ENABLE([avx-mpi], [AS_HELP_STRING([--enable-avx-mpi], [build MPI AVX version of the library])], [have_avx_mpi=$enableval])
+# SSE3 variant.
+AC_ARG_ENABLE([sse3], [AS_HELP_STRING([--enable-sse3], [build SSE3 version of the library])], [have_sse3=$enableval])
+# SSE3 + pthreads variant.
+AC_ARG_ENABLE([sse3-pthreads], [AS_HELP_STRING([--enable-sse3-pthreads], [build pthreads SSE3 version of the library])], [have_sse3_pthreads=$enableval])
+# SSE3 + MPI variant.
+AC_ARG_ENABLE([sse3-mpi], [AS_HELP_STRING([--enable-sse3-mpi], [build MPI SSE3 version of the library])], [have_sse3_mpi=$enableval])
+
+AX_MPI([have_mpi=yes],[have_mpi=no])
+
+# Automake is initialised exactly once, near the top of this file, with subdir-objects.
+AM_CONDITIONAL(BUILD_GENERIC, test "x${have_generic}" = "xyes")
+AM_CONDITIONAL(BUILD_AVX, test "x${have_avx}" = "xyes")
+AM_CONDITIONAL(BUILD_AVX_PTHREADS, test "x${have_avx_pthreads}" = "xyes")
+AM_CONDITIONAL(BUILD_SSE3, test "x${have_sse3}" = "xyes")
+AM_CONDITIONAL(BUILD_SSE3_PTHREADS, test "x${have_sse3_pthreads}" = "xyes")
+AM_CONDITIONAL(BUILD_MPI, test "x${have_mpi}" = "xyes")
+
+AC_SUBST(LIBPLL_VERSION)
+AC_SUBST(LIBPLL_MAJOR)
+AC_SUBST(LIBPLL_MINOR)
+AC_SUBST(LIBPLL_REV)
+AC_SUBST(MPICC)
+
+
+
+AC_CONFIG_FILES([Makefile
+                 MPI/Makefile
+                 src/Makefile
+                 man/Makefile
+                 examples/Makefile])
+AC_OUTPUT
diff --git a/pllrepo/sources.am b/pllrepo/sources.am
new file mode 100644
index 0000000..22a89c4
--- /dev/null
+++ b/pllrepo/sources.am
@@ -0,0 +1,2 @@
+ALL_SOURCES = ../src/genericParallelization.c ../src/hash.c ../src/stack.c ../src/ssort.c ../src/queue.c ../src/utils.c ../src/randomTree.c ../src/optimizeModel.c ../src/trash.c ../src/searchAlgo.c ../src/topologies.c ../src/fastDNAparsimony.c ../src/treeIO.c ../src/models.c ../src/evaluatePartialGenericSpecial.c ../src/evaluateGenericSpecial.c ../src/newviewGenericSpecial.c ../src/makenewzGenericSpecial.c ../src/bipartitionList.c ../src/restartHashTable.c ../src/recom.c ../src/lexer.c . [...]
+AVX_SOURCES = ../src/avxLikelihood.c $(ALL_SOURCES) 
diff --git a/pllrepo/src/CMakeLists.txt b/pllrepo/src/CMakeLists.txt
new file mode 100644
index 0000000..fd99063
--- /dev/null
+++ b/pllrepo/src/CMakeLists.txt
@@ -0,0 +1,46 @@
+#set( CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -Wall -msse3 -DRAXML_USE_LLALLOC -D_USE_PTHREADS -D_OPTIMIZED_FUNCTIONS -D__SIM_SSE3 -fno-builtin" )
+
+#add_executable( raxml_light axml.c  optimizeModel.c trash.c searchAlgo.c topologies.c treeIO.c models.c evaluatePartialGenericSpecial.c evaluateGenericSpecial.c newviewGenericSpecial.c makenewzGenericSpecial.c bipartitionList.c restartHashTable.c fastDNAparsimony.c randomTree.c lockless_allocator/ll_alloc.c mem_alloc.c recom.c)
+
+#target_link_libraries( raxml_light m pthread )
+
+# The AVX kernels form their own library; it is left out when BINARY32 is
+# set or IQTREE_FLAGS contains "novx".
+if (NOT BINARY32 AND NOT IQTREE_FLAGS MATCHES "novx")
+  add_library(pllavx avxLikelihood.c)
+endif()
+
+# Only builds whose IQTREE_FLAGS contain "omp" compile the generic
+# parallelization layer; every other source file is common to both cases.
+set(PLL_PARALLEL_SOURCES "")
+if (IQTREE_FLAGS MATCHES "omp")
+  set(PLL_PARALLEL_SOURCES genericParallelization.c)
+endif()
+
+add_library(pll
+  alignment.c
+  bipartitionList.c
+  evaluateGenericSpecial.c
+  evaluatePartialGenericSpecial.c
+  fastDNAparsimony.c
+  hardware.c
+  hash.c
+  lexer.c
+  makenewzGenericSpecial.c
+  models.c
+  newick.c
+  newviewGenericSpecial.c
+  ${PLL_PARALLEL_SOURCES}
+  optimizeModel.c
+  parsePartition.c
+  queue.c
+  randomTree.c
+  recom.c
+  restartHashTable.c
+  searchAlgo.c
+  ssort.c
+  stack.c
+  topologies.c
+  trash.c
+  treeIO.c
+  utils.c)
\ No newline at end of file
diff --git a/pllrepo/src/Makefile.ALL b/pllrepo/src/Makefile.ALL
new file mode 100644
index 0000000..573521e
--- /dev/null
+++ b/pllrepo/src/Makefile.ALL
@@ -0,0 +1,54 @@
+# Builds every libpll variant through its own Makefile.<ARCH>; $(MAKE) is the value supplied by make itself.
+RM = rm -f
+TARGET = libpll
+VERSION = 1.0.0
+ARCH1 = AVX
+ARCH2 = SSE3
+ARCH3 = AVX-PTHREADS
+ARCH4 = SSE3-PTHREADS
+ARCH5 = AVX-MPI
+ARCH6 = SSE3-MPI
+ARCH7 = ARM
+STATICLIB1 = $(TARGET)-avx.a.$(VERSION)
+STATICLIB2 = $(TARGET)-$(ARCH2).a.$(VERSION)
+STATICLIB3 = $(TARGET)-$(ARCH3).a.$(VERSION)
+STATICLIB4 = $(TARGET)-$(ARCH4).a.$(VERSION)
+STATICLIB5 = $(TARGET)-$(ARCH5).a.$(VERSION)
+STATICLIB6 = $(TARGET)-$(ARCH6).a.$(VERSION)
+STATICLIB7 = $(TARGET)-$(ARCH7).a.$(VERSION)
+# STATICLIB1 is spelled with a lower-case tag because Makefile.AVX sets ARCH = avx.
+
+all: $(STATICLIB1) $(STATICLIB2) $(STATICLIB3) $(STATICLIB4) $(STATICLIB5) $(STATICLIB6) $(STATICLIB7)
+
+$(STATICLIB1): Makefile.$(ARCH1)
+	-$(RM) *.o
+	$(MAKE) -f $<
+
+$(STATICLIB2): Makefile.$(ARCH2)
+	-$(RM) *.o
+	$(MAKE) -f $<
+
+$(STATICLIB3): Makefile.$(ARCH3)
+	-$(RM) *.o
+	$(MAKE) -f $<
+
+$(STATICLIB4): Makefile.$(ARCH4)
+	-$(RM) *.o
+	$(MAKE) -f $<
+
+$(STATICLIB5): Makefile.$(ARCH5)
+	-$(RM) *.o
+	$(MAKE) -f $<
+
+$(STATICLIB6): Makefile.$(ARCH6)
+	-$(RM) *.o
+	$(MAKE) -f $<
+
+$(STATICLIB7): Makefile.$(ARCH7)
+	-$(RM) *.o
+	$(MAKE) -f $<
+
+clean:
+	-$(RM) *.a $(STATICLIB1) $(STATICLIB2) $(STATICLIB3) $(STATICLIB4) $(STATICLIB5) $(STATICLIB6) $(STATICLIB7)
+
+.PHONY: all clean
diff --git a/pllrepo/src/Makefile.ARM b/pllrepo/src/Makefile.ARM
new file mode 100644
index 0000000..340fb5c
--- /dev/null
+++ b/pllrepo/src/Makefile.ARM
@@ -0,0 +1,51 @@
+CC = gcc 
+AR = ar
+CFLAGS =  -c -D_GNU_SOURCE -O2 -g -fomit-frame-pointer -funroll-loops -Wall -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat [...]
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = ARM
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+OBJ = hash.o stack.o ssort.o queue.o utils.o randomTree.o optimizeModel.o trash.o searchAlgo.o topologies.o fastDNAparsimony.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o recom.o lexer.o alignment.o newick.o parsePartition.o
+
+all: $(STATICLIB)
+
+$(STATICLIB): $(OBJ)
+	@echo "==> Building PLL Library ($(STATICLIB))"
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	ln -sf $(STATICLIB) $(TARGET)-$(ARCH).a
+# Header dependencies only: every object in OBJ is rebuilt when $(GLOBAL_DEPS) changes; no recipe is given here.
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+models.o : models.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+fastDNAparsimony.o : fastDNAparsimony.c $(GLOBAL_DEPS)
+recom.o : recom.c $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+alignment.o: alignment.c $(GLOBAL_DEPS)
+newick.o: newick.c $(GLOBAL_DEPS)
+parsePartition.o: parsePartition.c $(GLOBAL_DEPS)
+ssort.o : ssort.c $(GLOBAL_DEPS)
+
+clean : 
+	-$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.AVX b/pllrepo/src/Makefile.AVX
new file mode 100644
index 0000000..cb48dca
--- /dev/null
+++ b/pllrepo/src/Makefile.AVX
@@ -0,0 +1,60 @@
+CC = gcc
+AR = ar
+CFLAGS = -g -c -O2 -D_GNU_SOURCE -D__SSE3 -D__AVX -msse3 -fomit-frame-pointer -funroll-loops -Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes  -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-parameter - [...]
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = avx
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+OBJ = avxLikelihood.o evaluateGenericSpecial.o hash.o models.o queue.o restartHashTable.o stack.o treeIO.o evaluatePartialGenericSpecial.o makenewzGenericSpecial.o newviewGenericSpecial.o randomTree.o searchAlgo.o topologies.o utils.o bipartitionList.o fastDNAparsimony.o optimizeModel.o recom.o trash.o lexer.o alignment.o ssort.o newick.o parsePartition.o parsimony.o
+
+all: $(STATICLIB)
+
+$(STATICLIB): $(OBJ)
+	@echo "==> Building PLL Library ($(STATICLIB))"
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	ln -sf $(STATICLIB) $(TARGET)-$(ARCH).a
+
+avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+fastDNAparsimony.o : fastDNAparsimony.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+parsimony.o : parsimony.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+# Header dependencies only: these objects are rebuilt when $(GLOBAL_DEPS) changes; no recipe and no -mavx here.
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+models.o : models.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+recom.o : recom.c $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+alignment.o: alignment.c $(GLOBAL_DEPS)
+newick.o: newick.c $(GLOBAL_DEPS)
+parsePartition.o: parsePartition.c $(GLOBAL_DEPS)
+ssort.o : ssort.c $(GLOBAL_DEPS)
+
+clean : 
+	-$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.AVX-MPI b/pllrepo/src/Makefile.AVX-MPI
new file mode 100644
index 0000000..c928256
--- /dev/null
+++ b/pllrepo/src/Makefile.AVX-MPI
@@ -0,0 +1,59 @@
+CC = mpicc
+CFLAGS = -g -c -O2 -D_GNU_SOURCE -D__AVX -D_FINE_GRAIN_MPI -D__SSE3 -msse3 -O2 -fomit-frame-pointer -funroll-loops -Wall -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-paramet [...]
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = AVX-MPI
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+OBJ = hash.o stack.o ssort.o queue.o utils.o randomTree.o optimizeModel.o trash.o searchAlgo.o topologies.o fastDNAparsimony.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o recom.o genericParallelization.o avxLikelihood.o lexer.o alignment.o newick.o parsePartition.o parsimony.o
+
+all : $(STATICLIB)
+
+$(STATICLIB) : $(OBJ)
+	@echo "==> Building PLL Library ($(STATICLIB))"
+	$(AR) $(ARFLAGS) $@ $+
+	ln -sf $(STATICLIB) $(TARGET)-$(ARCH).a
+
+fastDNAparsimony.o : fastDNAparsimony.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+parsimony.o : parsimony.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+# Header dependencies only: every remaining object in OBJ is rebuilt when $(GLOBAL_DEPS) changes; no recipe here.
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+models.o : models.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+recom.o : recom.c $(GLOBAL_DEPS)
+genericParallelization.o : genericParallelization.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+alignment.o: alignment.c $(GLOBAL_DEPS)
+newick.o: newick.c $(GLOBAL_DEPS)
+parsePartition.o: parsePartition.c $(GLOBAL_DEPS)
+ssort.o : ssort.c $(GLOBAL_DEPS)
+
+clean : 
+	$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.AVX-PTHREADS b/pllrepo/src/Makefile.AVX-PTHREADS
new file mode 100644
index 0000000..c72e486
--- /dev/null
+++ b/pllrepo/src/Makefile.AVX-PTHREADS
@@ -0,0 +1,61 @@
+CC = gcc 
+AR = ar
+CFLAGS = -g -c -O2 -D_GNU_SOURCE -D_USE_PTHREADS -D__SSE3 -msse3 -D__AVX -fomit-frame-pointer -funroll-loops -Wall -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wu [...]
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = AVX-PTHREADS
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+OBJ = hash.o stack.o ssort.o queue.o utils.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o avxLikelihood.o fastDNAparsimony.o randomTree.o lexer.o recom.o genericParallelization.o alignment.o newick.o parsePartition.o parsimony.o
+
+
+all: $(STATICLIB)
+
+$(STATICLIB) : $(OBJ)
+	@echo "==> Building PLL Library ($(STATICLIB))"
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	ln -sf $(STATICLIB) $(TARGET)-$(ARCH).a
+
+avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+fastDNAparsimony.o : fastDNAparsimony.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+parsimony.o : parsimony.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+# Header dependencies only: every remaining object in OBJ is rebuilt when $(GLOBAL_DEPS) changes; no recipe here.
+genericParallelization.o : genericParallelization.c $(GLOBAL_DEPS)
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+models.o : models.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+recom.o : recom.c $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+alignment.o: alignment.c $(GLOBAL_DEPS)
+newick.o: newick.c $(GLOBAL_DEPS)
+parsePartition.o: parsePartition.c $(GLOBAL_DEPS)
+ssort.o : ssort.c $(GLOBAL_DEPS)
+
+clean : 
+	-$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.AVX.clang b/pllrepo/src/Makefile.AVX.clang
new file mode 100644
index 0000000..71ab72b
--- /dev/null
+++ b/pllrepo/src/Makefile.AVX.clang
@@ -0,0 +1,57 @@
+CC = clang
+AR = ar
+CFLAGS = -g -c -O2 -D__SSE3 -D__AVX -msse3 -fomit-frame-pointer -funroll-loops -Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes  -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-parameter -Wunused-value  [...]
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = avx
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+OBJ = avxLikelihood.o evaluateGenericSpecial.o hash.o models.o queue.o restartHashTable.o stack.o treeIO.o evaluatePartialGenericSpecial.o makenewzGenericSpecial.o newviewGenericSpecial.o randomTree.o searchAlgo.o topologies.o utils.o bipartitionList.o fastDNAparsimony.o optimizeModel.o recom.o trash.o lexer.o alignment.o ssort.o newick.o parsePartition.o
+
+all: $(STATICLIB)
+
+$(STATICLIB): $(OBJ)
+	@echo "==> Building PLL Library ($(STATICLIB))"
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	ln -sf $(STATICLIB) $(TARGET)-$(ARCH).a
+
+avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+fastDNAparsimony.o : fastDNAparsimony.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+# Header dependencies only: these objects are rebuilt when $(GLOBAL_DEPS) changes; no recipe and no -mavx here.
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+models.o : models.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+recom.o : recom.c $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+alignment.o: alignment.c $(GLOBAL_DEPS)
+newick.o: newick.c $(GLOBAL_DEPS)
+parsePartition.o: parsePartition.c $(GLOBAL_DEPS)
+ssort.o : ssort.c $(GLOBAL_DEPS)
+
+clean : 
+	-$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.AVX.shared b/pllrepo/src/Makefile.AVX.shared
new file mode 100644
index 0000000..3752291
--- /dev/null
+++ b/pllrepo/src/Makefile.AVX.shared
@@ -0,0 +1,68 @@
+CC = gcc
+AR = ar
+CFLAGS = -fPIC -g -c -O2 -D__SSE3 -D__AVX -msse3 -fomit-frame-pointer -funroll-loops -Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes  -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-parameter -Wunused- [...]
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = AVX
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+OBJ = avxLikelihood.o evaluateGenericSpecial.o hash.o models.o queue.o restartHashTable.o stack.o treeIO.o evaluatePartialGenericSpecial.o makenewzGenericSpecial.o newviewGenericSpecial.o randomTree.o searchAlgo.o topologies.o utils.o bipartitionList.o fastDNAparsimony.o optimizeModel.o recom.o trash.o lexer.o common.o alignment.o fasta.o phylip.o ssort.o newick.o part.o
+
+all: $(SHAREDOBJ)
+
+$(SHAREDOBJ): $(OBJ)
+	@echo "==> Building PLL Library ($@)"
+	$(CC) -shared -Wl,-soname,$@ -o $@ $(OBJ)
+	ln -sf $(SHAREDOBJ) $(TARGET)-$(ARCH).so
+
+avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+fastDNAparsimony.o : fastDNAparsimony.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -mavx -c -o $@ $<
+
+# Rules without a recipe only add header dependencies; the parser/ sources further down need explicit recipes because they live in subdirectories.
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+pll.o : pll.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+models.o : models.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+recom.o : recom.c $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+common.o: parser/common.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -o $@ $<
+alignment.o: parser/alignment/alignment.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -o $@ $<
+phylip.o: parser/alignment/phylip.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -o $@ $<
+fasta.o: parser/alignment/fasta.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -o $@ $<
+newick.o: parser/newick/newick.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -o $@ $<
+part.o: parser/partition/part.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -o $@ $<
+ssort.o : parser/ssort.c $(GLOBAL_DEPS)
+	$(CC) $(CFLAGS) -o $@ $<
+
+clean : 
+	-$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.MIC-PTHREADS b/pllrepo/src/Makefile.MIC-PTHREADS
new file mode 100644
index 0000000..8f2d701
--- /dev/null
+++ b/pllrepo/src/Makefile.MIC-PTHREADS
@@ -0,0 +1,62 @@
+CC = icc 
+AR = ar
+MICFLAGS = -mmic -std=c99 -D__MIC_NATIVE -opt-streaming-cache-evict=0 # -D_DEBUG_MSG
+COMMON_FLAGS = $(MICFLAGS) -c -D_GNU_SOURCE -D_USE_PTHREADS -fomit-frame-pointer -funroll-loops -Wall #-Wredundant-decls  -Wreturn-type  -Wswitch-default -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  [...]
+OPT1_FLAGS = -O1
+OPT2_FLAGS = -O2
+CFLAGS = $(COMMON_FLAGS) $(OPT2_FLAGS)
+
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = MIC-PTHREADS
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+OBJ = hash.o stack.o ssort.o queue.o utils.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o mic_native_dna.o mic_native_aa.o fastDNAparsimony.o randomTree.o lexer.o recom.o genericParallelization.o alignment.o newick.o parsePartition.o
+
+
+all: $(STATICLIB)
+
+$(STATICLIB) : $(OBJ)
+	@echo "==> Building PLL Library ($(STATICLIB))"
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	ln -sf $(STATICLIB) $(TARGET)-$(ARCH).a
+# models.c is compiled with OPT1_FLAGS (-O1) instead of the OPT2_FLAGS that CFLAGS carries.
+models.o : models.c $(GLOBAL_DEPS)
+	$(CC) $(COMMON_FLAGS) $(OPT1_FLAGS) -c -o models.o models.c
+# Header dependencies only: every remaining object in OBJ is rebuilt when $(GLOBAL_DEPS) changes; no recipe here.
+fastDNAparsimony.o : fastDNAparsimony.c $(GLOBAL_DEPS)
+genericParallelization.o : genericParallelization.c $(GLOBAL_DEPS)
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+recom.o : recom.c $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+alignment.o: alignment.c $(GLOBAL_DEPS)
+newick.o: newick.c $(GLOBAL_DEPS)
+parsePartition.o: parsePartition.c $(GLOBAL_DEPS)
+ssort.o : ssort.c $(GLOBAL_DEPS)
+mic_native_dna.o : mic_native_dna.c $(GLOBAL_DEPS)
+mic_native_aa.o : mic_native_aa.c $(GLOBAL_DEPS)
+
+clean : 
+	-$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.SSE3 b/pllrepo/src/Makefile.SSE3
new file mode 100644
index 0000000..2afbe71
--- /dev/null
+++ b/pllrepo/src/Makefile.SSE3
@@ -0,0 +1,52 @@
+CC = gcc 
+AR = ar
+CFLAGS = -g -c -O2 -D_GNU_SOURCE -D__SSE3 -msse3 -fomit-frame-pointer -funroll-loops -Wall -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-va [...]
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = SSE3
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+OBJ = hash.o stack.o ssort.o queue.o utils.o randomTree.o optimizeModel.o trash.o searchAlgo.o topologies.o fastDNAparsimony.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o recom.o lexer.o alignment.o newick.o parsePartition.o parsimony.o
+
+all: $(STATICLIB)
+
+$(STATICLIB): $(OBJ)
+	@echo "==> Building PLL Library ($(STATICLIB))"
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	ln -sf $(STATICLIB) $(TARGET)-$(ARCH).a
+# Header dependencies only: every object in OBJ is rebuilt when $(GLOBAL_DEPS) changes; no recipe is given here.
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+models.o : models.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+fastDNAparsimony.o : fastDNAparsimony.c $(GLOBAL_DEPS)
+parsimony.o : parsimony.c $(GLOBAL_DEPS)
+recom.o : recom.c $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+alignment.o: alignment.c $(GLOBAL_DEPS)
+newick.o: newick.c $(GLOBAL_DEPS)
+parsePartition.o: parsePartition.c $(GLOBAL_DEPS)
+ssort.o : ssort.c $(GLOBAL_DEPS)
+
+clean : 
+	-$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.SSE3-MPI b/pllrepo/src/Makefile.SSE3-MPI
new file mode 100644
index 0000000..ecf8023
--- /dev/null
+++ b/pllrepo/src/Makefile.SSE3-MPI
@@ -0,0 +1,50 @@
+CC = mpicc
+AR = ar
+CFLAGS = -g -c -O2 -D_GNU_SOURCE -D_FINE_GRAIN_MPI -D__SSE3 -msse3 -O2 -fomit-frame-pointer -funroll-loops -Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wred [...]
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = SSE3-MPI
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+# Objects archived into the static library; each object must be listed exactly once.
+OBJ = hash.o stack.o ssort.o queue.o utils.o randomTree.o optimizeModel.o trash.o searchAlgo.o topologies.o fastDNAparsimony.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o recom.o genericParallelization.o lexer.o alignment.o newick.o parsePartition.o parsimony.o
+
+all : $(STATICLIB)
+
+$(STATICLIB) : $(OBJ)
+	@echo "==> Building PLL Library ($(STATICLIB))"
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	ln -sf $(STATICLIB) $(TARGET)-$(ARCH).a
+
+# Header dependencies for the objects in OBJ; the compile step itself is left to make's implicit rules.
+fastDNAparsimony.o : fastDNAparsimony.c $(GLOBAL_DEPS)
+parsimony.o : parsimony.c $(GLOBAL_DEPS)
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+models.o : models.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+recom.o : recom.c $(GLOBAL_DEPS)
+genericParallelization.o : genericParallelization.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+alignment.o: alignment.c $(GLOBAL_DEPS)
+newick.o: newick.c $(GLOBAL_DEPS)
+parsePartition.o: parsePartition.c $(GLOBAL_DEPS)
+ssort.o : ssort.c $(GLOBAL_DEPS)
+
+clean : 
+	-$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.SSE3-PTHREADS b/pllrepo/src/Makefile.SSE3-PTHREADS
new file mode 100644
index 0000000..fac2010
--- /dev/null
+++ b/pllrepo/src/Makefile.SSE3-PTHREADS
@@ -0,0 +1,52 @@
+CC = gcc 
+AR = ar
+CFLAGS = -g -c -O2 -D_GNU_SOURCE -D_USE_PTHREADS -D__SSE3 -msse3 -fomit-frame-pointer -funroll-loops -Wall -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-va [...]
+ARFLAGS = rvs
+TARGET = libpll
+ARCH = SSE3-PTHREADS
+VERSION = 1.0.0
+STATICLIB = $(TARGET)-$(ARCH).a.$(VERSION)	# static library
+SHAREDOBJ = $(TARGET)-$(ARCH).so.$(VERSION)	# shared object
+GLOBAL_DEPS = pll.h globalVariables.h
+RM = rm -f
+
+OBJ = hash.o stack.o ssort.o queue.o utils.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o fastDNAparsimony.o randomTree.o lexer.o recom.o genericParallelization.o alignment.o newick.o parsePartition.o parsimony.o
+
+all: $(STATICLIB)
+
+$(STATICLIB) : $(OBJ)
+	@echo "==> Building PLL Library ($(STATICLIB))"
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	ln -sf $(STATICLIB) $(TARGET)-$(ARCH).a
+
+# Header dependencies for the objects in OBJ; the compile step itself is left to make's implicit rules.
+bipartitionList.o : bipartitionList.c $(GLOBAL_DEPS)
+genericParallelization.o : genericParallelization.c $(GLOBAL_DEPS)
+utils.o : utils.c $(GLOBAL_DEPS)
+optimizeModel.o : optimizeModel.c $(GLOBAL_DEPS)
+trash.o : trash.c $(GLOBAL_DEPS)
+searchAlgo.o : searchAlgo.c $(GLOBAL_DEPS)
+topologies.o : topologies.c $(GLOBAL_DEPS)
+treeIO.o : treeIO.c $(GLOBAL_DEPS)
+models.o : models.c $(GLOBAL_DEPS)
+evaluatePartialGenericSpecial.o : evaluatePartialGenericSpecial.c $(GLOBAL_DEPS)
+evaluateGenericSpecial.o : evaluateGenericSpecial.c $(GLOBAL_DEPS)
+newviewGenericSpecial.o : newviewGenericSpecial.c $(GLOBAL_DEPS)
+makenewzGenericSpecial.o : makenewzGenericSpecial.c $(GLOBAL_DEPS)
+restartHashTable.o : restartHashTable.c $(GLOBAL_DEPS)
+randomTree.o : randomTree.c $(GLOBAL_DEPS)
+fastDNAparsimony.o : fastDNAparsimony.c  $(GLOBAL_DEPS)
+parsimony.o : parsimony.c  $(GLOBAL_DEPS)
+recom.o : recom.c  $(GLOBAL_DEPS)
+queue.o : queue.c $(GLOBAL_DEPS)
+stack.o : stack.c $(GLOBAL_DEPS)
+hash.o : hash.c $(GLOBAL_DEPS)
+lexer.o : lexer.c $(GLOBAL_DEPS)
+alignment.o: alignment.c $(GLOBAL_DEPS)
+newick.o: newick.c $(GLOBAL_DEPS)
+parsePartition.o: parsePartition.c $(GLOBAL_DEPS)
+ssort.o : ssort.c $(GLOBAL_DEPS)
+
+clean:
+	-$(RM) *.o $(STATICLIB) $(SHAREDOBJ)
+
+.PHONY: all clean
+.INTERMEDIATE: $(OBJ)
diff --git a/pllrepo/src/Makefile.am b/pllrepo/src/Makefile.am
new file mode 100644
index 0000000..0748b09
--- /dev/null
+++ b/pllrepo/src/Makefile.am
@@ -0,0 +1,53 @@
+#lib_LTLIBRARIES = libpll-generic.la
+lib_LTLIBRARIES = 
+#lib_LIBRARIES = libpll-generic.a
+lib_LIBRARIES = 
+libpll_generic_la_SOURCES = hash.c stack.c ssort.c queue.c utils.c randomTree.c optimizeModel.c trash.c searchAlgo.c topologies.c fastDNAparsimony.c treeIO.c models.c evaluatePartialGenericSpecial.c evaluateGenericSpecial.c newviewGenericSpecial.c makenewzGenericSpecial.c bipartitionList.c restartHashTable.c recom.c lexer.c alignment.c newick.c parsePartition.c parsimony.c
+libpll_generic_la_CFLAGS = -c -D_GNU_SOURCE -O2 -fomit-frame-pointer -funroll-loops -Wall -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-var [...]
+libpll_generic_la_LDFLAGS = -version-info @LIBPLL_MAJOR@:@LIBPLL_MINOR@:@LIBPLL_REV@
+libpll_generic_a_SOURCES = $(libpll_generic_la_SOURCES)
+libpll_generic_a_CFLAGS = $(libpll_generic_la_CFLAGS)
+
+# SSE3 variant: the generic source list compiled with -D__SSE3 -msse3,
+# built both as a libtool library and as a plain static archive.
+if BUILD_SSE3
+lib_LTLIBRARIES += libpll-sse3.la
+libpll_sse3_la_SOURCES = $(libpll_generic_la_SOURCES)
+libpll_sse3_la_CFLAGS = -D__SSE3 -msse3 $(libpll_generic_la_CFLAGS)
+libpll_sse3_la_LDFLAGS = -version-info @LIBPLL_MAJOR@:@LIBPLL_MINOR@:@LIBPLL_REV@
+lib_LIBRARIES += libpll-sse3.a
+libpll_sse3_a_SOURCES = $(libpll_sse3_la_SOURCES)
+libpll_sse3_a_CFLAGS = $(libpll_sse3_la_CFLAGS)
+endif
+
+
+# AVX variant: adds avxLikelihood.c to the generic source list and enables
+# both the AVX and the SSE3 defines/flags.
+if BUILD_AVX
+lib_LTLIBRARIES += libpll-avx.la
+libpll_avx_la_SOURCES = avxLikelihood.c $(libpll_generic_la_SOURCES)
+libpll_avx_la_CFLAGS = -D__SSE3 -D__AVX -mavx -msse3 $(libpll_generic_la_CFLAGS)
+libpll_avx_la_LDFLAGS = -version-info @LIBPLL_MAJOR@:@LIBPLL_MINOR@:@LIBPLL_REV@
+lib_LIBRARIES += libpll-avx.a
+libpll_avx_a_SOURCES = $(libpll_avx_la_SOURCES)
+libpll_avx_a_CFLAGS = $(libpll_avx_la_CFLAGS)
+endif
+
+# SSE3 + Pthreads variant: adds genericParallelization.c and -D_USE_PTHREADS
+# on top of the SSE3 settings.
+if BUILD_SSE3_PTHREADS
+lib_LTLIBRARIES += libpll-sse3-pthreads.la
+libpll_sse3_pthreads_la_SOURCES = genericParallelization.c $(libpll_generic_la_SOURCES)
+libpll_sse3_pthreads_la_CFLAGS = -D_USE_PTHREADS -D__SSE3 -msse3 $(libpll_generic_la_CFLAGS)
+libpll_sse3_pthreads_la_LDFLAGS = -version-info @LIBPLL_MAJOR@:@LIBPLL_MINOR@:@LIBPLL_REV@
+lib_LIBRARIES += libpll-sse3-pthreads.a
+libpll_sse3_pthreads_a_SOURCES = $(libpll_sse3_pthreads_la_SOURCES)
+libpll_sse3_pthreads_a_CFLAGS = $(libpll_sse3_pthreads_la_CFLAGS)
+endif
+
+# AVX + Pthreads variant: adds both avxLikelihood.c and genericParallelization.c
+# and combines the AVX, SSE3 and Pthreads defines/flags.
+if BUILD_AVX_PTHREADS
+lib_LTLIBRARIES += libpll-avx-pthreads.la
+libpll_avx_pthreads_la_SOURCES = avxLikelihood.c genericParallelization.c $(libpll_generic_la_SOURCES)
+libpll_avx_pthreads_la_CFLAGS = -D_USE_PTHREADS -D__AVX -mavx -D__SSE3 -msse3 $(libpll_generic_la_CFLAGS)
+libpll_avx_pthreads_la_LDFLAGS = -version-info @LIBPLL_MAJOR@:@LIBPLL_MINOR@:@LIBPLL_REV@
+lib_LIBRARIES += libpll-avx-pthreads.a
+libpll_avx_pthreads_a_SOURCES = $(libpll_avx_pthreads_la_SOURCES)
+libpll_avx_pthreads_a_CFLAGS = $(libpll_avx_pthreads_la_CFLAGS)
+endif
+
+pkgincludedir=$(includedir)/pll
+pkginclude_HEADERS = pll.h newick.h stack.h hash.h errcodes.h globalVariables.h lexer.h genericParallelization.h treeIO.h queue.h parsePartition.h mem_alloc.h cycle.h
diff --git a/pllrepo/src/alignment.c b/pllrepo/src/alignment.c
new file mode 100644
index 0000000..d50f6db
--- /dev/null
+++ b/pllrepo/src/alignment.c
@@ -0,0 +1,754 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file alignment.c
+ *
+ * @brief Collection of routines for reading alignments
+ *
+ * Auxiliary functions for storing alignments read from predefined file formats
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+/** @defgroup alignmentGroup Reading and parsing multiple sequence alignments
+    
+    This set of functions handles the reading and parsing of several file formats that describe multiple sequence alignments. They are also responsible for storing the alignment in an internal structure
+*/
+static pllAlignmentData * pllParsePHYLIP (const char * filename);
+static pllAlignmentData * pllParseFASTA (const char * filename);
+static int read_phylip_header (int * inp, int * sequenceCount, int * sequenceLength);
+static __inline int parsedOk (int * actLen, int sequenceCount, int sequenceLength);
+static int parse_phylip (pllAlignmentData * alignmentData, int input);
+static int getFastaAlignmentInfo (int * inp, int * seqCount, int * seqLen);
+static int parseFastaAlignment (pllAlignmentData * alignmentData, int input);
+
+#ifdef __PLL_DEBUG_PARSER
+/* Debug helper: prints one line per token produced from the lexer input.
+   Returns 0 if the dump stopped at an unknown token, otherwise 1. */
+static int
+printTokens (int input)
+{
+  pllLexToken token;
+
+  /* keep dumping until the input is exhausted or an unknown symbol shows up */
+  do
+   {
+     NEXT_TOKEN
+
+     if (token.tokenType == PLL_TOKEN_NUMBER)
+      {
+        printf ("PLL_TOKEN_NUMBER (%.*s, %d)\n", token.len, token.lexeme, token.len);
+      }
+     else if (token.tokenType == PLL_TOKEN_STRING)
+      {
+        printf ("PLL_TOKEN_STRING (%.*s, %d)\n", token.len, token.lexeme, token.len);
+      }
+     else if (token.tokenType == PLL_TOKEN_EOF)
+      {
+        printf ("PLL_TOKEN_EOF\n");
+      }
+     else if (token.tokenType == PLL_TOKEN_WHITESPACE)
+      {
+        printf ("PLL_TOKEN_WHITESPACE\n");
+      }
+     else if (token.tokenType == PLL_TOKEN_NEWLINE)
+      {
+        printf ("PLL_TOKEN_NEWLINE\n");
+      }
+     else if (token.tokenType == PLL_TOKEN_UNKNOWN)
+      {
+        printf ("PLL_TOKEN_UNKNOWN (%.*s, %d)\n", token.len, token.lexeme, token.len);
+      }
+     /* any other token type is skipped silently */
+   }
+  while (!(token.tokenType == PLL_TOKEN_EOF || token.tokenType == PLL_TOKEN_UNKNOWN));
+
+  return ((token.tokenType == PLL_TOKEN_UNKNOWN) ? 0 : 1);
+}
+#endif
+
+/** @ingroup alignmentGroup
+    @brief Initialize alignment structure fields
+
+    Allocates memory for the data structure that will hold the alignment and
+    initializes it. It requires the number of sequences \a sequenceCount and
+    the length of sequences \a sequenceLength. Sequences and labels are indexed
+    from 1; slot 0 of the sequence array is set to \b NULL. All sequences share
+    one contiguous buffer that starts at sequence 1, and every sequence is
+    zero-terminated. Labels start out as \b NULL and site weights are not
+    allocated here.
+
+    @param sequenceCount
+      Number of sequences in the alignment
+    
+    @param sequenceLength
+      Length of the sequences
+
+    @return
+      Initialized alignment data structure, or \b NULL if an argument is
+      negative or memory could not be allocated
+*/
+pllAlignmentData *
+pllInitAlignmentData (int sequenceCount, int sequenceLength)
+ {
+   int i;
+   size_t rowSize;
+   unsigned char * mem = NULL;
+   pllAlignmentData * alignmentData;
+
+   if (sequenceCount < 0 || sequenceLength < 0) return (NULL);
+
+   /* every row holds sequenceLength characters plus a terminating zero */
+   rowSize = (size_t) sequenceLength + 1;
+
+   /* refuse sizes whose product does not fit in size_t */
+   if (sequenceCount > 0 && rowSize > ((size_t) -1) / (size_t) sequenceCount) return (NULL);
+
+   alignmentData = (pllAlignmentData *) rax_malloc (sizeof (pllAlignmentData));
+   if (!alignmentData) return (NULL);
+
+   alignmentData->sequenceData   = (unsigned char **) rax_malloc (((size_t) sequenceCount + 1) * sizeof (unsigned char *));
+   alignmentData->sequenceLabels = (char **) rax_calloc ((size_t) sequenceCount + 1, sizeof (char *));
+
+   /* one contiguous buffer backs all rows; nothing to allocate for an empty alignment */
+   if (sequenceCount > 0)
+     mem = (unsigned char *) rax_malloc (rowSize * (size_t) sequenceCount);
+
+   if (!alignmentData->sequenceData || !alignmentData->sequenceLabels || (sequenceCount > 0 && !mem))
+    {
+      rax_free (mem);
+      rax_free (alignmentData->sequenceLabels);
+      rax_free (alignmentData->sequenceData);
+      rax_free (alignmentData);
+      return (NULL);
+    }
+
+   alignmentData->sequenceData[0] = NULL;
+   for (i = 1; i <= sequenceCount; ++i)
+    {
+      alignmentData->sequenceData[i]                 = mem + (size_t) (i - 1) * rowSize;
+      alignmentData->sequenceData[i][sequenceLength] = 0;
+    }
+
+   alignmentData->sequenceCount  = sequenceCount;
+   alignmentData->sequenceLength = sequenceLength;
+   alignmentData->originalSeqLength = sequenceLength;
+
+   /** TODO: remove siteWeights from alignment */
+   alignmentData->siteWeights    = NULL;
+
+   return (alignmentData);
+ }
+
+/** @ingroup alignmentGroup
+    @brief Deallocates the memory associated with the alignment data structure
+    
+    Deallocates the memory associated with the alignment data structure \a alignmentData:
+    every sequence label, the label array, the shared sequence buffer, the sequence
+    pointer array, the site weights and finally the structure itself.
+
+    @param alignmentData
+      The alignment data structure. Passing \b NULL is a no-op.
+*/
+void
+pllAlignmentDataDestroy (pllAlignmentData * alignmentData)
+{
+  int i;
+
+  if (!alignmentData) return;
+
+  for (i = 1; i <= alignmentData->sequenceCount; ++ i)
+   {
+     rax_free (alignmentData->sequenceLabels[i]);
+   }
+  rax_free (alignmentData->sequenceLabels);
+
+  /* all sequences share one buffer that starts at sequence 1; an empty
+     alignment has no slot 1, so reading it would be out of bounds */
+  if (alignmentData->sequenceCount > 0)
+    rax_free (alignmentData->sequenceData[1]);
+  rax_free (alignmentData->sequenceData);
+  rax_free (alignmentData->siteWeights);
+  rax_free (alignmentData);
+}
+
+
+/** @ingroup alignmentGroup
+    @brief Prints the alignment to the console
+
+    Writes a header line with the number of sequences and the sequence length
+    to standard output, followed by one "label sequence" line per sequence.
+
+    @param alignmentData
+      The alignment data structure
+*/
+void 
+pllAlignmentDataDumpConsole (pllAlignmentData * alignmentData)
+ {
+   const int rows = alignmentData->sequenceCount;
+   int row = 1;
+
+   printf ("%d %d\n", rows, alignmentData->sequenceLength);
+
+   /* sequences and labels are stored starting at index 1 */
+   while (row <= rows)
+    {
+      printf ("%s %s\n", alignmentData->sequenceLabels[row], alignmentData->sequenceData[row]);
+      ++ row;
+    }
+ }
+
+
+
+/* Writes every sequence to fp as a FASTA record: a ">label" line followed by the sequence line. */
+static void dump_fasta_content(FILE * fp, pllAlignmentData * alignmentData)
+{
+  int row = 1;
+
+  /* sequences and labels are stored starting at index 1 */
+  while (row <= alignmentData->sequenceCount)
+   {
+     fprintf (fp, ">%s\n%s\n", alignmentData->sequenceLabels[row], alignmentData->sequenceData[row]);
+     ++row;
+   }
+}
+
+/* Writes every sequence to fp as one "label sequence" line (PHYLIP body, without the header). */
+static void dump_phylip_content(FILE * fp, pllAlignmentData * alignmentData)
+{
+  int row = 1;
+
+  /* sequences and labels are stored starting at index 1 */
+  while (row <= alignmentData->sequenceCount)
+   {
+     fprintf (fp, "%s %s\n", alignmentData->sequenceLabels[row], alignmentData->sequenceData[row]);
+     ++row;
+   }
+}
+
+/** @ingroup alignmentGroup
+    @brief Dump the alignment to a file of format \a fileFormat
+
+    Dumps the alignment contained in \a alignmentData to file \a filename of type \a fileFormat.
+
+    @note If \a filename exists, all contents will be erased
+
+    @param alignmentData
+      Alignment data structure
+
+    @param fileFormat
+      Format of output file. Can take the value \b PLL_FORMAT_PHYLIP or \b PLL_FORMAT_FASTA
+
+    @param filename
+      Output filename
+
+    @return
+      Returns \b PLL_TRUE on success, otherwise \b PLL_FALSE. Failure covers an
+      unsupported \a fileFormat, a file that cannot be opened, and any error
+      reported by the stream while writing or closing.
+*/
+int
+pllAlignmentDataDumpFile (pllAlignmentData * alignmentData, int fileFormat, const char * filename)
+{
+  FILE * fp;
+  int writeFailed;
+  void (*outfun)(FILE *, pllAlignmentData *);
+  
+  if (fileFormat != PLL_FORMAT_PHYLIP && fileFormat != PLL_FORMAT_FASTA) return (PLL_FALSE);
+
+  outfun = (fileFormat == PLL_FORMAT_PHYLIP) ? dump_phylip_content : dump_fasta_content;
+
+  fp = fopen (filename,"wb");
+  if (!fp) return (PLL_FALSE);
+  
+  /* if PHYLIP print the silly header at the beginning */
+  if (fileFormat == PLL_FORMAT_PHYLIP)
+   {
+     fprintf (fp, "%d %d\n", alignmentData->sequenceCount, alignmentData->sequenceLength);
+   }
+  
+  outfun(fp, alignmentData);
+
+  /* buffered output may only fail when flushed, so check both the stream
+     error flag and the result of closing the file */
+  writeFailed = ferror (fp);
+  if (fclose (fp) != 0) writeFailed = 1;
+
+  return (writeFailed ? PLL_FALSE : PLL_TRUE);
+}
+
+
+
+/* ROUTINES FOR PHYLIP PARSING */
+
+/* Converts the decimal number at the start of text into *value. Succeeds (returns 1)
+   only for values in the range 1 .. INT_MAX - 1, so that value + 1 still fits in an
+   int; otherwise stores 0 in *value and returns 0. errno is left as it was found. */
+static int
+phylip_header_number (const char * text, int * value)
+{
+  char * end;
+  long   parsed;
+  int    savedErrno;
+  int    outOfRange;
+
+  savedErrno = errno;
+  errno      = 0;
+  parsed     = strtol (text, &end, 10);
+  outOfRange = (errno == ERANGE);
+  errno      = savedErrno;
+
+  if (end == text || outOfRange || parsed <= 0 || parsed >= INT_MAX)
+   {
+     *value = 0;
+     return (0);
+   }
+
+  *value = (int) parsed;
+  return (1);
+}
+
+/** @ingroup alignmentGroup
+    @brief Parse the PHYLIP file header
+
+    Reads two number tokens, skipping whitespace and newlines in front of each:
+    the number of sequences and the sequence length.
+
+    @param inp
+      Current lexer symbol; updated once both numbers have been read
+
+    @param sequenceCount
+      Receives the number of sequences (0 if the value is not a positive int)
+
+    @param sequenceLength
+      Receives the sequence length (0 if the value is not a positive int)
+
+    @return
+      Non-zero if both values are present and positive, otherwise 0
+*/
+static int
+read_phylip_header (int * inp, int * sequenceCount, int * sequenceLength)
+{
+  pllLexToken token;
+  int input;
+  int countOk, lengthOk;
+
+  input = *inp;
+
+
+  NEXT_TOKEN
+  CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+  if (token.tokenType != PLL_TOKEN_NUMBER) return (0);
+
+  /* strtol-based conversion: unlike atoi, overflow is detected instead of being undefined */
+  countOk = phylip_header_number (token.lexeme, sequenceCount);
+
+  NEXT_TOKEN
+  CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+  if (token.tokenType != PLL_TOKEN_NUMBER) return (0);
+
+  lengthOk = phylip_header_number (token.lexeme, sequenceLength);
+
+  *inp = input;
+
+  return (countOk && lengthOk);
+}
+
+/* Returns 1 when entries 1..sequenceCount of actLen all equal sequenceLength, otherwise 0.
+   Entry 0 is never inspected. */
+static __inline int
+parsedOk (int * actLen, int sequenceCount, int sequenceLength)
+{
+  int remaining = sequenceCount;
+
+  /* walk the 1-based entries from the last one down to the first */
+  while (remaining >= 1)
+   {
+     if (actLen[remaining] != sequenceLength) return (0);
+     -- remaining;
+   }
+
+  return (1);
+}
+
+
+/** @ingroup alignmentGroup
+    @brief Parse the PHYLIP file body
+
+    Fills the labels and sequence data of \a alignmentData from the token
+    stream. The first \a sequenceCount records each start with a label; every
+    record then contributes one or more string tokens that are appended to the
+    sequence selected by the record number modulo \a sequenceCount (presumably
+    this is what lets interleaved files continue sequences in later blocks --
+    confirm against the lexer and format documentation).
+
+    NOTE(review): NEXT_TOKEN and CONSUME are macros defined elsewhere; they are
+    assumed to advance the local \a token / \a input variables -- confirm.
+
+    @param alignmentData
+      Alignment structure with \a sequenceCount and \a sequenceLength already set
+
+    @param input
+      Current lexer symbol
+
+    @return
+      1 if, at end of input, every sequence has exactly \a sequenceLength
+      characters; 0 on any unexpected token or on a sequence that is too long
+*/
+static int
+parse_phylip (pllAlignmentData * alignmentData, int input)
+{
+  int i,j;
+  pllLexToken token;
+  int * sequenceLength;
+  int rc;
+
+  /* per-sequence count of characters stored so far (1-based, zero-initialized) */
+  sequenceLength = (int *) rax_calloc (alignmentData->sequenceCount + 1, sizeof (int));
+
+  NEXT_TOKEN
+  for (i = 0; ; ++i)
+  {
+    /* record i feeds sequence j + 1 */
+    j = i % alignmentData->sequenceCount;
+    /* only the first sequenceCount records carry a label */
+    if (i < alignmentData->sequenceCount) 
+     {
+       /* end of input: success depends on all sequences being complete */
+       if (token.tokenType == PLL_TOKEN_EOF)
+        {
+          rc = parsedOk (sequenceLength, alignmentData->sequenceCount, alignmentData->sequenceLength);
+          rax_free (sequenceLength);
+          return (rc);
+        }
+
+       if (token.tokenType == PLL_TOKEN_UNKNOWN)
+        {
+          rax_free (sequenceLength);
+          return (0);
+        }
+
+       CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+
+       /* a label may be lexed as a string, a number or a float */
+       if (token.tokenType != PLL_TOKEN_STRING && token.tokenType != PLL_TOKEN_NUMBER && token.tokenType != PLL_TOKEN_FLOAT)
+        {
+          rax_free (sequenceLength);
+          return (0);
+        }
+       alignmentData->sequenceLabels[i + 1] = my_strndup (token.lexeme, token.len);
+       NEXT_TOKEN
+       CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+     }
+    
+    /* append string tokens to sequence j + 1 until the line ends */
+    while (1)
+     {
+       if (token.tokenType == PLL_TOKEN_EOF)
+        {
+          rc = parsedOk (sequenceLength, alignmentData->sequenceCount, alignmentData->sequenceLength);
+          rax_free (sequenceLength);
+          return (rc);
+        }
+
+       if (token.tokenType == PLL_TOKEN_UNKNOWN)
+        {
+         rax_free (sequenceLength);
+         return (0);
+        }
+       
+       if (token.tokenType == PLL_TOKEN_NEWLINE) break;
+
+       if (token.tokenType != PLL_TOKEN_STRING)
+        {
+          rax_free (sequenceLength);
+          return (0);
+        }
+
+       /* never write past the sequenceLength characters reserved for this sequence */
+       if (sequenceLength[j + 1] + token.len > alignmentData->sequenceLength) 
+        {
+          fprintf (stderr, "Sequence %d is larger than specified\n", j + 1);
+          rax_free (sequenceLength);
+          return (0);
+        }
+       memmove (alignmentData->sequenceData[j + 1] + sequenceLength[j + 1], token.lexeme, token.len);
+       sequenceLength[j + 1] += token.len;
+
+       NEXT_TOKEN
+       CONSUME (PLL_TOKEN_WHITESPACE)
+     }
+    /* skip blank space between records */
+    CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE);
+  }
+}
+
+/* Phylip parsers. Use the following attributed grammar 
+ * 
+ *        S -> HEADER ENDL DATA
+ *   HEADER -> PLL_TOKEN_NUMBER PLL_TOKEN_WHITESPACE PLL_TOKEN_NUMBER ENDL |
+ *             PLL_TOKEN_WHITESPACE PLL_TOKEN_NUMBER PLL_TOKEN_WHITESPACE PLL_TOKEN_NUMBER ENDL
+ *     ENDL -> PLL_TOKEN_WHITESPACE PLL_TOKEN_NEWLINE | PLL_TOKEN_NEWLINE
+ *     DATA -> PLL_TOKEN_STRING PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING ENDL DATA |
+ *             PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING ENDL DATA | 
+ *             PLL_TOKEN_STRING PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING PLL_TOKEN_EOF |
+ *             PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING PLL_TOKEN_WHITESPACE PLL_TOKEN_STRING PLL_TOKEN_EOF
+ */
+
+/** @ingroup alignmentGroup
+    @brief Parse a PHYLIP alignment held in memory
+
+    Parses the \a filesize bytes of PHYLIP data at \a rawdata and returns a
+    ::pllAlignmentData structure with the alignment. The buffer is only read;
+    it stays owned by the caller. All site weights are initialized to 1.
+
+    @param rawdata
+      Buffer holding the PHYLIP data
+
+    @param filesize
+      Number of bytes in \a rawdata
+
+    @return
+      Returns a structure of type ::pllAlignmentData that contains the alignment, or \b NULL
+      in case of failure. On a syntax error \a errno is set to
+      \b PLL_ERROR_PHYLIP_HEADER_SYNTAX or \b PLL_ERROR_PHYLIP_BODY_SYNTAX.
+*/
+pllAlignmentData *
+pllParsePHYLIPString (const char *rawdata, long filesize)
+{
+  int
+    i, input, sequenceCount, sequenceLength;
+  pllAlignmentData * alignmentData;
+
+  init_lexan (rawdata, filesize);
+  input = get_next_symbol();
+
+  /* parse the header to obtain the number of taxa and sequence length */
+  if (!read_phylip_header (&input, &sequenceCount, &sequenceLength))
+   {
+     fprintf (stderr, "Error while parsing PHYLIP header (number of taxa and sequence length)\n");
+     errno = PLL_ERROR_PHYLIP_HEADER_SYNTAX;
+     return (NULL);
+   }
+
+  lex_table_amend_phylip();
+
+  /* allocate alignment structure */
+  alignmentData = pllInitAlignmentData (sequenceCount, sequenceLength);
+  if (!alignmentData)
+   {
+     lex_table_restore();
+     return (NULL);
+   }
+
+  if (! parse_phylip (alignmentData, input))
+   {
+     errno = PLL_ERROR_PHYLIP_BODY_SYNTAX;
+     pllAlignmentDataDestroy (alignmentData);
+     lex_table_restore();
+     return (NULL);
+   }
+
+  lex_table_restore();
+
+  alignmentData->siteWeights  = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
+  if (!alignmentData->siteWeights)
+   {
+     pllAlignmentDataDestroy (alignmentData);
+     return (NULL);
+   }
+  for (i = 0; i < alignmentData->sequenceLength; ++ i)
+    alignmentData->siteWeights[i] = 1;
+
+  return (alignmentData);
+}
+
+/** @ingroup alignmentGroup
+    @brief Parse a PHYLIP file
+
+    Parses the PHYLIP file \a filename and returns a ::pllAlignmentData structure
+    with the alignment. The file is read into memory and handed to
+    pllParsePHYLIPString, so both entry points share one parser.
+
+    @param filename
+      Name of file to be parsed
+
+    @return
+      Returns a structure of type ::pllAlignmentData that contains the alignment, or \b NULL
+      in case of failure. \a errno is set to \b PLL_ERROR_FILE_OPEN if the file
+      cannot be read.
+*/
+static pllAlignmentData *
+pllParsePHYLIP (const char * filename)
+{
+  char * rawdata;
+  long filesize;
+  int savedErrno;
+  pllAlignmentData * alignmentData;
+
+  rawdata = pllReadFile (filename, &filesize);
+  if (!rawdata)
+   {
+     errno = PLL_ERROR_FILE_OPEN;
+     return (NULL);
+   }
+
+  alignmentData = pllParsePHYLIPString (rawdata, filesize);
+
+  /* releasing the buffer must not disturb the error code set by the parser */
+  savedErrno = errno;
+  rax_free (rawdata);
+  errno = savedErrno;
+
+  return (alignmentData);
+}
+
+/* FASTA routines */
+/** @ingroup alignmentGroup
+    @brief Validate a FASTA alignment and measure its dimensions
+
+    Walks over the token stream as alternating (label, sequence) pairs. A label
+    must be at least two characters long and begin with '>'; all sequences
+    must have the same length. Nothing is stored besides the two counters.
+
+    @param inp
+      Current lexer symbol; it is read but never written back
+
+    @param seqCount
+      Receives the number of (label, sequence) pairs seen
+
+    @param seqLen
+      Receives the length of the sequences
+
+    @return
+      Returns \b PLL_TRUE if the alignment is valid, otherwise \b PLL_FALSE (or 0)
+*/
+static int
+getFastaAlignmentInfo (int * inp, int * seqCount, int * seqLen)
+{
+  pllLexToken token;
+  int input = *inp;
+
+  *seqLen   = 0;
+  *seqCount = 0;
+
+  NEXT_TOKEN
+  CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+  /* the input has to open with something that can be a label */
+  if (!(token.tokenType == PLL_TOKEN_NUMBER || token.tokenType == PLL_TOKEN_STRING)) return (PLL_FALSE);
+
+  for (;;)
+   {
+     /* label position: a clean end of input is fine here */
+     if (token.tokenType == PLL_TOKEN_EOF) return (PLL_TRUE);
+     if (token.tokenType != PLL_TOKEN_NUMBER && token.tokenType != PLL_TOKEN_STRING) return (PLL_FALSE);
+     if (token.len < 2 || token.lexeme[0] != '>') return (0);
+
+     NEXT_TOKEN
+     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+     /* sequence position: end of input here would leave a label without data */
+     if (token.tokenType != PLL_TOKEN_NUMBER && token.tokenType != PLL_TOKEN_STRING) return (PLL_FALSE);
+
+     /* the first sequence fixes the length that all others must match */
+     if (*seqLen == 0) *seqLen = token.len;
+     if (*seqLen != token.len) return (0);
+
+     NEXT_TOKEN
+     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+     ++ (*seqCount);
+   }
+}
+
+/** @ingroup alignmentGroup
+    @brief Store the labels and sequences of a FASTA alignment
+
+    Reads the token stream as alternating (label, sequence) pairs and copies
+    them into \a alignmentData, starting at index 1. The label is stored
+    without its leading '>' character. The structure must already be sized
+    for the alignment; pairs that do not fit are rejected instead of being
+    written out of bounds.
+
+    @param alignmentData
+      Alignment structure with \a sequenceCount and \a sequenceLength already set
+
+    @param input
+      Current lexer symbol
+
+    @return
+      1 if exactly \a sequenceCount pairs of the expected length were stored,
+      otherwise 0
+*/
+static int
+parseFastaAlignment (pllAlignmentData * alignmentData, int input)
+{
+  pllLexToken token;
+  int i;
+
+  NEXT_TOKEN
+  CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+  if (token.tokenType != PLL_TOKEN_NUMBER && token.tokenType != PLL_TOKEN_STRING) return (0);
+
+  i = 1;
+  while (1)
+   {
+     /* first parse the sequence label */
+     switch (token.tokenType)
+      {
+        case PLL_TOKEN_EOF:
+          /* success only if every slot of the alignment has been filled */
+          return (i - 1 == alignmentData->sequenceCount);
+
+        case PLL_TOKEN_NUMBER:
+        case PLL_TOKEN_STRING:
+          /* more records than the structure was allocated for */
+          if (i > alignmentData->sequenceCount) return (0);
+          /* a label is '>' followed by at least one character */
+          if (token.len < 2 || token.lexeme[0] != '>') return (0);
+          alignmentData->sequenceLabels[i] = my_strndup (token.lexeme + 1, token.len - 1);
+          break;
+        default:
+          return (0);
+      }
+     
+     NEXT_TOKEN
+     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+     /* now parse the sequence itself */
+     switch (token.tokenType)
+      {
+        case PLL_TOKEN_EOF:
+          return (0);
+
+        case PLL_TOKEN_NUMBER:
+        case PLL_TOKEN_STRING:
+          /* each row has room for exactly sequenceLength characters */
+          if (token.len != alignmentData->sequenceLength) return (0);
+          memmove (alignmentData->sequenceData[i], token.lexeme, token.len);
+          break;
+        default:
+          return (0);
+      }
+     NEXT_TOKEN
+     CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+     ++ i;
+   }
+}
+
+
+/** @ingroup alignmentGroup
+    @brief Parse a FASTA file
+    
+    Parses the FASTA file \a filename and returns a ::pllAlignmentData structure
+    with the alignment. The file is scanned twice: a first pass validates it and
+    determines the number and length of the sequences, a second pass stores the
+    labels and sequences. All site weights are initialized to 1.
+
+    @param filename
+      Name of file to be parsed
+
+    @return
+      Returns a structure of type ::pllAlignmentData that contains the alignment, or \b NULL
+      in case of failure. \a errno is set to \b PLL_ERROR_FILE_OPEN if the file
+      cannot be read and to \b PLL_ERROR_FASTA_SYNTAX on malformed content.
+*/
+static pllAlignmentData *
+pllParseFASTA (const char * filename)
+{
+  int
+    i,
+    seqLen,
+    seqCount,
+    input;
+  long filesize;
+
+  char * rawdata;
+  pllAlignmentData * alignmentData;
+
+  rawdata = pllReadFile (filename, &filesize);
+  if (!rawdata)
+   {
+     errno = PLL_ERROR_FILE_OPEN;
+     return (NULL);
+   }
+
+  lex_table_amend_fasta ();
+  
+  /* first pass: validate the file and measure the alignment */
+  init_lexan (rawdata, filesize);
+  input = get_next_symbol ();
+
+  if (!getFastaAlignmentInfo (&input, &seqCount, &seqLen))
+   {
+     errno = PLL_ERROR_FASTA_SYNTAX;
+     lex_table_restore ();
+     rax_free (rawdata);
+     return (NULL);
+   }
+  
+  /* allocate alignment structure */
+  alignmentData = pllInitAlignmentData (seqCount, seqLen);
+  if (!alignmentData)
+   {
+     lex_table_restore ();
+     rax_free (rawdata);
+     return (NULL);
+   }
+
+  /* second pass: rewind the lexer and store labels and sequences */
+  init_lexan (rawdata, filesize);
+  input = get_next_symbol ();
+
+  if (!parseFastaAlignment (alignmentData, input))
+   {
+     errno = PLL_ERROR_FASTA_SYNTAX;
+     pllAlignmentDataDestroy (alignmentData);
+     lex_table_restore();
+     rax_free(rawdata);
+     return (NULL);
+   }
+
+  lex_table_restore ();
+  rax_free (rawdata);
+
+  alignmentData->siteWeights = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
+  if (!alignmentData->siteWeights)
+   {
+     pllAlignmentDataDestroy (alignmentData);
+     return (NULL);
+   }
+  for (i = 0; i < alignmentData->sequenceLength; ++ i)
+    alignmentData->siteWeights[i] = 1;
+
+  return (alignmentData);
+}
+
+
+
+/** @ingroup alignmentGroup
+    @brief Parse a file that contains a multiple sequence alignment
+
+    Dispatches on \a fileType to the matching parser for \a filename. The
+    supported file types are the sequential and interleaved versions of PHYLIP
+    format, and the FASTA format. The parsed alignment is returned as a pointer
+    to a structure of type ::pllAlignmentData
+
+    @param fileType
+      Type of file to parse. Can be either \b PLL_FORMAT_PHYLIP or \b PLL_FORMAT_FASTA
+
+    @param filename
+      Name of file to parse
+
+    @return
+      Returns a structure of type ::pllAlignmentData that contains the multiple sequence alignment,
+      otherwise returns \b NULL in case of failure. For an unsupported \a fileType,
+      \a errno is set to \b PLL_ERROR_INVALID_FILETYPE.
+*/
+pllAlignmentData *
+pllParseAlignmentFile (int fileType, const char * filename)
+{
+  if (fileType == PLL_FORMAT_PHYLIP)
+    return (pllParsePHYLIP (filename));
+
+  if (fileType == PLL_FORMAT_FASTA)
+    return (pllParseFASTA (filename));
+
+  /* neither of the supported formats was requested */
+  errno = PLL_ERROR_INVALID_FILETYPE;
+  return (NULL);
+}
diff --git a/pllrepo/src/avxLikelihood.c b/pllrepo/src/avxLikelihood.c
new file mode 100644
index 0000000..5202883
--- /dev/null
+++ b/pllrepo/src/avxLikelihood.c
@@ -0,0 +1,4111 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file avxLikelihood.c
+ *
+ * @brief AVX versions of the likelihood functions
+ *
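+ * Contains the AVX (256-bit, double precision) implementations of the newview
+ * kernels, e.g. newviewGTRGAMMA_AVX and newviewGTRCAT_AVX together with their
+ * GAPPED_SAVE variants.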
+ */
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <limits.h>
+#include <stdint.h>
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include <immintrin.h>
+#include <assert.h>
+
+#ifdef _FMA
+#include <x86intrin.h>
+#define FMAMACC(a,b,c) _mm256_fmadd_pd(b,c,a)
+#endif
+
+#include "pll.h"
+#include "pllInternal.h"
+
+extern const unsigned int mask32[32];
+
+PLL_ALIGN_BEGIN const union PLL_ALIGN_END
+{
+  uint64_t i[4];
+  __m256d m;
+  
+} absMask_AVX = {{0x7fffffffffffffffULL, 0x7fffffffffffffffULL, 0x7fffffffffffffffULL, 0x7fffffffffffffffULL}};
+
+
+
+/* Returns a vector whose four lanes all hold (sum of the lanes of v) * (sum of the
+   lanes of u). */
+static __inline __m256d hadd4(__m256d v, __m256d u)
+{
+  /* pairwise sums inside each 128-bit half */
+  const __m256d vPairs = _mm256_hadd_pd(v, v);
+  const __m256d uPairs = _mm256_hadd_pd(u, u);
+
+  /* add the half-swapped copy so that every lane carries the full sum */
+  const __m256d vSum = _mm256_add_pd(_mm256_permute2f128_pd(vPairs, vPairs, 1), vPairs);
+  const __m256d uSum = _mm256_add_pd(_mm256_permute2f128_pd(uPairs, uPairs, 1), uPairs);
+
+  return _mm256_mul_pd(vSum, uSum);
+}
+
+/* Returns a vector whose four lanes all hold the sum of the four lanes of v. */
+static __inline __m256d hadd3(__m256d v)
+{
+  /* pairwise sums inside each 128-bit half */
+  const __m256d pairSums = _mm256_hadd_pd(v, v);
+
+  /* same values with the two 128-bit halves exchanged */
+  const __m256d swapped = _mm256_permute2f128_pd(pairSums, pairSums, 1);
+
+  return _mm256_add_pd(swapped, pairSums);
+}
+
+/** @brief AVX kernel that computes 16 doubles per site of \a x3 from two child inputs
+ *
+ *  Every site is processed as 4 blocks (index k) of 4 doubles. For each block the code
+ *  accumulates, over l = 0..3, (left term) * (right term) * extEV[4l .. 4l+3], where the
+ *  terms are horizontal sums of element-wise products with 4-double rows of \a left and
+ *  \a right (see hadd3() and hadd4()). For tip children those sums are looked up in
+ *  tables that are built once per call from \a tipVector.
+ *  NOTE(review): the 4 blocks presumably correspond to the GAMMA rate categories and the
+ *  4 doubles to the DNA states -- confirm against the caller.
+ *
+ *  In the PLL_TIP_INNER and PLL_INNER_INNER cases a site whose 16 results all have an
+ *  absolute value below PLL_MINLIKELIHOOD is multiplied by PLL_TWOTOTHE256 and counted.
+ *  The PLL_TIP_TIP case never rescales.
+ *
+ *  All vector accesses use _mm256_load_pd / _mm256_store_pd (the aligned variants).
+ *
+ *  @param tipCase  PLL_TIP_TIP, PLL_TIP_INNER or PLL_INNER_INNER; any other value reaches assert(0)
+ *  @param x1  first child input, 16 doubles per site; read only for PLL_INNER_INNER
+ *  @param x2  second child input, 16 doubles per site; read for PLL_TIP_INNER and PLL_INNER_INNER
+ *  @param x3  output, 16 doubles written per site
+ *  @param extEV  16 doubles, read as four 4-double vectors
+ *  @param tipVector  read at offsets 4..63 (codes 1..15, 4 doubles each) to build the tip tables
+ *  @param ex3  per-site counter, incremented for a rescaled site when \a useFastScaling is false
+ *  @param tipX1  per-site code of the first child; read for PLL_TIP_TIP and PLL_TIP_INNER
+ *  @param tipX2  per-site code of the second child; read only for PLL_TIP_TIP
+ *  @param n  number of sites
+ *  @param left  64 doubles, read as sixteen 4-double vectors (index j * 16 + k * 4)
+ *  @param right  same layout as \a left, applied to the second child
+ *  @param wgt  per-site weight, added to the scaling sum when \a useFastScaling is true
+ *  @param scalerIncrement  receives the scaling sum; written only when \a useFastScaling is true
+ *  @param useFastScaling  selects the weighted sum (true) or the per-site \a ex3 counters (false)
+ */
+void  newviewGTRGAMMA_AVX(int tipCase,
+			 double *x1, double *x2, double *x3,
+			 double *extEV, double *tipVector,
+			 int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+			 const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling
+			 )
+{
+ 
+  int  
+    i, 
+    k, 
+    scale, 
+    addScale = 0; /* sum of wgt[i] over rescaled sites (fast scaling only) */
+ 
+  __m256d 
+    minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD),
+    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+ 
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	double 
+	  *uX1, *uX2;
+	PLL_ALIGN_BEGIN double
+	  umpX1[1024] PLL_ALIGN_END, /* 16 codes x 64 doubles; the slice of code 0 is never written */
+	  umpX2[1024] PLL_ALIGN_END;
+
+	for (i = 1; i < 16; i++) /* build the lookup tables for codes 1..15 */
+	  {
+	    __m256d 
+	      tv = _mm256_load_pd(&(tipVector[i * 4]));
+
+	    int 
+	      j;
+	    
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1); /* horizontal sum, replicated in all 4 lanes */
+		  		  		  
+		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		}
+	  
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
+		}	    
+	  }   	
+	  
+
+	for(i = 0; i < n; i++) /* per-site loop; no scaling check in the tip/tip case */
+	  {	    		 	    
+	    uX1 = &umpX1[64 * tipX1[i]];
+	    uX2 = &umpX2[64 * tipX2[i]];		  
+	    
+	    for(k = 0; k < 4; k++)
+	      {
+		__m256d	   
+		  xv = _mm256_setzero_pd();
+	       
+		int 
+		  l;
+		
+		for(l = 0; l < 4; l++)
+		  {	       	     				      	      																	   
+		    __m256d
+		      x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+		
+		    __m256d 
+		      evv = _mm256_load_pd(&extEV[l * 4]);
+#ifdef _FMA
+		    xv = FMAMACC(xv,x1v,evv);
+#else						  
+		    xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+#endif
+		  }
+		
+		_mm256_store_pd(&x3[16 * i + 4 * k], xv);
+	      }	         	   	    
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+	double 
+	  *uX1;
+	PLL_ALIGN_BEGIN double
+	  umpX1[1024] PLL_ALIGN_END;
+
+	for (i = 1; i < 16; i++) /* lookup table for the tip child only */
+	  {
+	    __m256d 
+	      tv = _mm256_load_pd(&(tipVector[i*4]));
+
+	    int 
+	      j;
+	    
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		}	 	   
+	  }   	
+	
+	for(i = 0; i < n; i++)
+	  { 
+	    __m256d
+	      xv[4];	    	   
+	    
+	    scale = 1; /* cleared once a block has a lane that is not below the threshold */
+	    uX1 = &umpX1[64 * tipX1[i]];
+
+	    for(k = 0; k < 4; k++)
+	      {
+		__m256d	   		 
+		  xvr = _mm256_load_pd(&(x2[i * 16 + k * 4]));
+
+		int 
+		  l;
+
+		xv[k]  = _mm256_setzero_pd();
+		  
+		for(l = 0; l < 4; l++)
+		  {	       	     				      	      															
+		    __m256d  
+		      x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+		      x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+		    x2v = hadd3(x2v);
+		    x1v = _mm256_mul_pd(x1v, x2v);			
+		
+		    __m256d 
+		      evv = _mm256_load_pd(&extEV[l * 4]);
+			
+#ifdef _FMA
+		    xv[k] = FMAMACC(xv[k],x1v,evv);
+#else			  
+		    xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+#endif
+		  }
+		    
+		if(scale)
+		  {
+		    __m256d 	     
+		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m); /* clear the sign bits */
+
+		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		    if(_mm256_movemask_pd( v1 ) != 15) /* not all 4 lanes compared as less-than */
+		      scale = 0;
+		  }
+	      }	    
+
+	    if(scale) /* all 16 values of this site were below PLL_MINLIKELIHOOD in magnitude */
+	      {
+		xv[0] = _mm256_mul_pd(xv[0], twoto);
+		xv[1] = _mm256_mul_pd(xv[1], twoto);
+		xv[2] = _mm256_mul_pd(xv[2], twoto);
+		xv[3] = _mm256_mul_pd(xv[3], twoto);
+
+		if(useFastScaling)
+		  addScale += wgt[i];
+		else
+		  ex3[i] += 1;
+	      }
+
+	    _mm256_store_pd(&x3[16 * i],      xv[0]);
+	    _mm256_store_pd(&x3[16 * i + 4],  xv[1]);
+	    _mm256_store_pd(&x3[16 * i + 8],  xv[2]);
+	    _mm256_store_pd(&x3[16 * i + 12], xv[3]);
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:
+      {
+	for(i = 0; i < n; i++)
+	  {	
+	    __m256d
+	      xv[4];
+	    
+	    scale = 1;
+
+	    for(k = 0; k < 4; k++)
+	      {
+		__m256d	   
+		 
+		  xvl = _mm256_load_pd(&(x1[i * 16 + k * 4])),
+		  xvr = _mm256_load_pd(&(x2[i * 16 + k * 4]));
+
+		int 
+		  l;
+
+		xv[k] = _mm256_setzero_pd();
+
+		for(l = 0; l < 4; l++)
+		  {	       	     				      	      															
+		    __m256d 
+		      x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
+		      x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+		    x1v = hadd4(x1v, x2v); /* (lane sum of x1v) * (lane sum of x2v) */
+		
+		    __m256d 
+		      evv = _mm256_load_pd(&extEV[l * 4]);
+						  
+		    xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv)); /* no FMAMACC path here, unlike the other two cases */
+		  }
+		
+		if(scale)
+		  {
+		    __m256d 	     
+		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+
+		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		    if(_mm256_movemask_pd( v1 ) != 15)
+		      scale = 0;
+		  }
+	      }
+
+	     if(scale)
+	      {
+		xv[0] = _mm256_mul_pd(xv[0], twoto);
+		xv[1] = _mm256_mul_pd(xv[1], twoto);
+		xv[2] = _mm256_mul_pd(xv[2], twoto);
+		xv[3] = _mm256_mul_pd(xv[3], twoto);
+
+		if(useFastScaling)
+		  addScale += wgt[i];
+		else
+		  ex3[i] += 1;		
+	      }
+		
+	    _mm256_store_pd(&x3[16 * i],      xv[0]);
+	    _mm256_store_pd(&x3[16 * i + 4],  xv[1]);
+	    _mm256_store_pd(&x3[16 * i + 8],  xv[2]);
+	    _mm256_store_pd(&x3[16 * i + 12], xv[3]);
+	  }
+      }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+  
+}
+/** @brief Variant of newviewGTRGAMMA_AVX that stores no data for sites flagged in \a x3_gap
+ *
+ *  A site i counts as flagged in a bit vector g when (g[i / 32] & mask32[i % 32]) is
+ *  non-zero. The kernel works in two steps:
+ *
+ *  1. It computes the 16 doubles of the gap column once and stores them in
+ *     \a x3_gapColumn. Tip children contribute the table slice at offset 960 (= 64 * 15,
+ *     i.e. code 15); inner children contribute \a x1_gapColumn / \a x2_gapColumn.
+ *     Except in the PLL_TIP_TIP case the gap column is rescaled like a normal site and
+ *     the outcome is remembered in scaleGap.
+ *  2. It walks the \a n sites. Sites flagged in \a x3_gap write nothing to \a x3_start;
+ *     in the two non-TIP_TIP cases they are counted as rescaled when scaleGap is set.
+ *     All other sites are computed and written contiguously (16 doubles each) to
+ *     \a x3_start. An inner child flagged in \a x1_gap / \a x2_gap is read from its gap
+ *     column; otherwise it is read from a running pointer into \a x1_start / \a x2_start
+ *     that advances by 16 doubles per consumed site.
+ *
+ *  @param tipCase  PLL_TIP_TIP, PLL_TIP_INNER or PLL_INNER_INNER; any other value reaches assert(0)
+ *  @param x1_start  compact data of the first child; read only for PLL_INNER_INNER
+ *  @param x2_start  compact data of the second child; read for PLL_TIP_INNER and PLL_INNER_INNER
+ *  @param x3_start  compact output, 16 doubles per site that is not flagged in \a x3_gap
+ *  @param extEV  16 doubles, read as four 4-double vectors
+ *  @param tipVector  read at offsets 4..63 to build the tip lookup tables
+ *  @param ex3  per-site counter, incremented for a rescaled site when \a useFastScaling is false
+ *  @param tipX1  per-site code of the first child; read for PLL_TIP_TIP and PLL_TIP_INNER
+ *  @param tipX2  per-site code of the second child; read only for PLL_TIP_TIP
+ *  @param n  number of sites
+ *  @param left  64 doubles, read as sixteen 4-double vectors
+ *  @param right  same layout as \a left, applied to the second child
+ *  @param wgt  per-site weight, added to the scaling sum when \a useFastScaling is true
+ *  @param scalerIncrement  receives the scaling sum; written only when \a useFastScaling is true
+ *  @param useFastScaling  selects the weighted sum (true) or the per-site \a ex3 counters (false)
+ *  @param x1_gap  gap bit vector of the first child; read only for PLL_INNER_INNER
+ *  @param x2_gap  gap bit vector of the second child; read for PLL_TIP_INNER and PLL_INNER_INNER
+ *  @param x3_gap  gap bit vector of the output
+ *  @param x1_gapColumn  16 doubles of the first child's gap column; read only for PLL_INNER_INNER
+ *  @param x2_gapColumn  16 doubles of the second child's gap column; read for PLL_TIP_INNER and PLL_INNER_INNER
+ *  @param x3_gapColumn  receives the 16 doubles of the output gap column
+ */
+void  newviewGTRGAMMA_AVX_GAPPED_SAVE(int tipCase,
+				      double *x1_start, double *x2_start, double *x3_start,
+				      double *extEV, double *tipVector,
+				      int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+				      const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+				      unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
+				      double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
+				      )
+{
+ 
+  int  
+    i, 
+    k, 
+    scale,
+    scaleGap, /* set in the TIP_INNER and INNER_INNER cases only; not read in TIP_TIP */
+    addScale = 0;
+ 
+  __m256d 
+    minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD ),
+    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+ 
+  double
+    *x1,
+    *x2,
+    *x3,
+    *x1_ptr = x1_start, /* running read positions in the compact child arrays */
+    *x2_ptr = x2_start;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	double 
+	  *uX1, *uX2;
+	PLL_ALIGN_BEGIN double
+	  umpX1[1024] PLL_ALIGN_END, /* 16 codes x 64 doubles; the slice of code 0 is never written */
+	  umpX2[1024] PLL_ALIGN_END;
+
+	for (i = 1; i < 16; i++)
+	  {
+	    __m256d 
+	      tv = _mm256_load_pd(&(tipVector[i * 4]));
+
+	    int 
+	      j;
+	    
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		}
+	  
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&right[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX2[i * 64 + j * 16 + k * 4], left1);
+		}	    
+	  }   	
+	  
+	x3 = x3_gapColumn; /* step 1: gap column of the output */
+
+	{
+	  uX1 = &umpX1[960]; /* 960 = 64 * 15: table slice of code 15 */
+	  uX2 = &umpX2[960];		  
+	  
+	  for(k = 0; k < 4; k++)
+	    {
+	      __m256d	   
+		xv = _mm256_setzero_pd();
+	      
+	      int 
+		l;
+	      
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      																	   
+		  __m256d
+		    x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&extEV[l * 4]);
+#ifdef _FMA
+		  xv = FMAMACC(xv,x1v,evv);
+#else						  
+		  xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+#endif
+		}
+		    
+	      _mm256_store_pd(&x3[4 * k], xv);
+	    }
+	}
+	
+	x3 = x3_start; /* step 2: compact output for the sites not flagged in x3_gap */
+
+	for(i = 0; i < n; i++)
+	  {		    	    	
+	    if(!(x3_gap[i / 32] & mask32[i % 32]))	     
+	      {
+		uX1 = &umpX1[64 * tipX1[i]];
+		uX2 = &umpX2[64 * tipX2[i]];		  
+	    
+		for(k = 0; k < 4; k++)
+		  {
+		    __m256d	   
+		      xv = _mm256_setzero_pd();
+	       
+		    int 
+		      l;
+		
+		    for(l = 0; l < 4; l++)
+		      {	       	     				      	      																	   
+			__m256d
+			  x1v =  _mm256_mul_pd(_mm256_load_pd(&uX1[k * 16 + l * 4]), _mm256_load_pd(&uX2[k * 16 + l * 4]));
+			
+			__m256d 
+			  evv = _mm256_load_pd(&extEV[l * 4]);
+#ifdef _FMA
+			xv = FMAMACC(xv,x1v,evv);
+#else						  
+			xv = _mm256_add_pd(xv, _mm256_mul_pd(x1v, evv));
+#endif
+		      }
+		    
+		    _mm256_store_pd(&x3[4 * k], xv);
+		  }
+
+		x3 += 16; /* advance only for sites that were actually stored */
+	      }
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+	double 
+	  *uX1;
+	PLL_ALIGN_BEGIN double
+	  umpX1[1024] PLL_ALIGN_END;
+       
+	for (i = 1; i < 16; i++)
+	  {
+	    __m256d 
+	      tv = _mm256_load_pd(&(tipVector[i*4]));
+
+	    int 
+	      j;
+	    
+	    for (j = 0; j < 4; j++)
+	      for (k = 0; k < 4; k++)
+		{		 
+		  __m256d 
+		    left1 = _mm256_load_pd(&left[j * 16 + k * 4]);		  		  		  
+
+		  left1 = _mm256_mul_pd(left1, tv);		  
+		  left1 = hadd3(left1);
+		  		  		  
+		  _mm256_store_pd(&umpX1[i * 64 + j * 16 + k * 4], left1);
+		}	 	   
+	  }	
+
+	{ 
+	  __m256d
+	    xv[4];
+	  
+	  scaleGap = 1; /* step 1: gap column, including its scaling decision */
+	  uX1 = &umpX1[960];
+
+	  x2 = x2_gapColumn;			 
+	  x3 = x3_gapColumn;
+
+	  for(k = 0; k < 4; k++)
+	    {
+	      __m256d	   		 
+		xvr = _mm256_load_pd(&(x2[k * 4]));
+
+	      int 
+		l;
+
+	      xv[k]  = _mm256_setzero_pd();
+		  
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d  
+		    x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+		    x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+		  x2v = hadd3(x2v);
+		  x1v = _mm256_mul_pd(x1v, x2v);			
+		
+		  __m256d 
+		    evv = _mm256_load_pd(&extEV[l * 4]);
+			
+#ifdef _FMA
+		  xv[k] = FMAMACC(xv[k],x1v,evv);
+#else			  
+		  xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+#endif
+		}
+		    
+	      if(scaleGap)
+		{
+		  __m256d 	     
+		    v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+		  
+		  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		  if(_mm256_movemask_pd( v1 ) != 15)
+		    scaleGap = 0;
+		}
+	    }
+	
+	  if(scaleGap)
+	    {
+	      xv[0] = _mm256_mul_pd(xv[0], twoto);
+	      xv[1] = _mm256_mul_pd(xv[1], twoto);
+	      xv[2] = _mm256_mul_pd(xv[2], twoto);
+	      xv[3] = _mm256_mul_pd(xv[3], twoto);	    
+	    }
+
+	  _mm256_store_pd(&x3[0],      xv[0]);
+	  _mm256_store_pd(&x3[4],  xv[1]);
+	  _mm256_store_pd(&x3[8],  xv[2]);
+	  _mm256_store_pd(&x3[12], xv[3]);
+	}
+	
+	x3 = x3_start;
+	
+	for(i = 0; i < n; i++)
+	  {
+	    if((x3_gap[i / 32] & mask32[i % 32]))
+	      {
+		if(scaleGap) /* flagged site: nothing is stored, only the scaling event of the gap column is counted */
+		  {
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i]  += 1;
+		  }
+	      }
+	    else
+	      {
+		if(x2_gap[i / 32] & mask32[i % 32])
+		  x2 = x2_gapColumn;
+		else
+		  {
+		    x2 = x2_ptr;
+		    x2_ptr += 16;
+		  }
+		
+		__m256d
+		  xv[4];	    	   
+		
+		scale = 1;
+		uX1 = &umpX1[64 * tipX1[i]];
+		
+		for(k = 0; k < 4; k++)
+		  {
+		    __m256d	   		 
+		      xvr = _mm256_load_pd(&(x2[k * 4]));
+		    
+		    int 
+		      l;
+		    
+		    xv[k]  = _mm256_setzero_pd();
+		    
+		    for(l = 0; l < 4; l++)
+		      {	       	     				      	      															
+			__m256d  
+			  x1v = _mm256_load_pd(&uX1[k * 16 + l * 4]),		     
+			  x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+			x2v = hadd3(x2v);
+			x1v = _mm256_mul_pd(x1v, x2v);			
+			
+			__m256d 
+			  evv = _mm256_load_pd(&extEV[l * 4]);
+			
+#ifdef _FMA
+			xv[k] = FMAMACC(xv[k],x1v,evv);
+#else			  
+			xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+#endif
+		      }
+		    
+		    if(scale)
+		      {
+			__m256d 	     
+			  v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+			
+			v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+			
+			if(_mm256_movemask_pd( v1 ) != 15)
+			  scale = 0;
+		      }
+		  }	    
+	      
+		if(scale)
+		  {
+		    xv[0] = _mm256_mul_pd(xv[0], twoto);
+		    xv[1] = _mm256_mul_pd(xv[1], twoto);
+		    xv[2] = _mm256_mul_pd(xv[2], twoto);
+		    xv[3] = _mm256_mul_pd(xv[3], twoto);
+
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i] += 1;		   
+		  }
+	      
+		_mm256_store_pd(&x3[0],      xv[0]);
+		_mm256_store_pd(&x3[4],  xv[1]);
+		_mm256_store_pd(&x3[8],  xv[2]);
+		_mm256_store_pd(&x3[12], xv[3]);
+	      
+		x3 += 16;
+	      }
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:
+      {          
+	{		
+	  x1 = x1_gapColumn;	     	    
+	  x2 = x2_gapColumn;	    
+	  x3 = x3_gapColumn;
+
+	  __m256d
+	    xv[4];
+	    
+	  scaleGap = 1;
+
+	  for(k = 0; k < 4; k++)
+	    {
+	      __m256d	   
+		
+		xvl = _mm256_load_pd(&(x1[k * 4])),
+		xvr = _mm256_load_pd(&(x2[k * 4]));
+
+	      int 
+		l;
+
+	      xv[k] = _mm256_setzero_pd();
+
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d 
+		    x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
+		    x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&extEV[l * 4]);
+		  
+		  xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv)); /* no FMAMACC path in the inner/inner case */
+		}
+		
+	      if(scaleGap)
+		  {
+		    __m256d 	     
+		      v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+
+		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		    if(_mm256_movemask_pd( v1 ) != 15)
+		      scaleGap = 0;
+		  }
+	    }
+
+	  if(scaleGap)
+	    {
+	      xv[0] = _mm256_mul_pd(xv[0], twoto);
+	      xv[1] = _mm256_mul_pd(xv[1], twoto);
+	      xv[2] = _mm256_mul_pd(xv[2], twoto);
+	      xv[3] = _mm256_mul_pd(xv[3], twoto);	       
+	    }
+		
+	  _mm256_store_pd(&x3[0],  xv[0]);
+	  _mm256_store_pd(&x3[4],  xv[1]);
+	  _mm256_store_pd(&x3[8],  xv[2]);
+	  _mm256_store_pd(&x3[12], xv[3]);
+	}	  
+      
+	x3 = x3_start;
+
+	for(i = 0; i < n; i++)
+	  {
+	    if(x3_gap[i / 32] & mask32[i % 32])
+	      {	     
+		if(scaleGap)
+		  {
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i]  += 1; 	       
+		  }
+	      }
+	    else
+	      {	
+		if(x1_gap[i / 32] & mask32[i % 32])
+		  x1 = x1_gapColumn;
+		else
+		  {
+		    x1 = x1_ptr;
+		    x1_ptr += 16;
+		  }
+	     
+		if(x2_gap[i / 32] & mask32[i % 32])
+		  x2 = x2_gapColumn;
+		else
+		  {
+		    x2 = x2_ptr;
+		    x2_ptr += 16;
+		  }
+
+		__m256d
+		  xv[4];
+	    
+		scale = 1;
+
+		for(k = 0; k < 4; k++)
+		  {
+		    __m256d	   
+		      
+		      xvl = _mm256_load_pd(&(x1[k * 4])),
+		      xvr = _mm256_load_pd(&(x2[k * 4]));
+		    
+		    int 
+		      l;
+		    
+		    xv[k] = _mm256_setzero_pd();
+		    
+		    for(l = 0; l < 4; l++)
+		      {	       	     				      	      															
+			__m256d 
+			  x1v = _mm256_mul_pd(xvl, _mm256_load_pd(&left[k * 16 + l * 4])),
+			  x2v = _mm256_mul_pd(xvr, _mm256_load_pd(&right[k * 16 + l * 4]));			    
+			
+			x1v = hadd4(x1v, x2v);			
+			
+			__m256d 
+			  evv = _mm256_load_pd(&extEV[l * 4]);
+			
+			xv[k] = _mm256_add_pd(xv[k], _mm256_mul_pd(x1v, evv));
+		      }
+		    
+		    if(scale)
+		      {
+			__m256d 	     
+			  v1 = _mm256_and_pd(xv[k], absMask_AVX.m);
+			
+			v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+			
+			if(_mm256_movemask_pd( v1 ) != 15)
+			  scale = 0;
+		      }
+		  }
+
+		if(scale)
+		  {
+		    xv[0] = _mm256_mul_pd(xv[0], twoto);
+		    xv[1] = _mm256_mul_pd(xv[1], twoto);
+		    xv[2] = _mm256_mul_pd(xv[2], twoto);
+		    xv[3] = _mm256_mul_pd(xv[3], twoto);
+		    
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i] += 1;
+		  }
+		
+		_mm256_store_pd(&x3[0],      xv[0]);
+		_mm256_store_pd(&x3[4],  xv[1]);
+		_mm256_store_pd(&x3[8],  xv[2]);
+		_mm256_store_pd(&x3[12], xv[3]);
+	      
+		x3 += 16;
+	      }
+	  }
+      }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+  
+}
+
+
+
+/** @brief AVX kernel that computes 4 doubles per site of \a x3_start, with a per-site category
+ *
+ *  For each site i the category cptr[i] selects a 16-double slice of \a left and of
+ *  \a right. The result vector is the sum over l = 0..3 of
+ *  hadd4(x1 * le[4l..4l+3], x2 * ri[4l..4l+3]) * EV[4l..4l+3], where x1 / x2 are the
+ *  4-double inputs of the two children. A tip child reads its 4 doubles directly from
+ *  \a tipVector at offset 4 * code; an inner child reads them at offset 4 * i of its array.
+ *
+ *  In the PLL_TIP_INNER and PLL_INNER_INNER cases a site whose 4 results all have an
+ *  absolute value below PLL_MINLIKELIHOOD is multiplied by PLL_TWOTOTHE256 and counted.
+ *  The PLL_TIP_TIP case never rescales.
+ *
+ *  All vector accesses use _mm256_load_pd / _mm256_store_pd (the aligned variants).
+ *
+ *  @param tipCase  PLL_TIP_TIP, PLL_TIP_INNER or PLL_INNER_INNER; any other value reaches assert(0)
+ *  @param EV  16 doubles, read as four 4-double vectors
+ *  @param cptr  per-site category index into \a left and \a right
+ *  @param x1_start  first child input, 4 doubles per site; read only for PLL_INNER_INNER
+ *  @param x2_start  second child input, 4 doubles per site; read for PLL_TIP_INNER and PLL_INNER_INNER
+ *  @param x3_start  output, 4 doubles written per site
+ *  @param tipVector  4 doubles per tip code
+ *  @param ex3  per-site counter, incremented for a rescaled site when \a useFastScaling is false
+ *  @param tipX1  per-site code of the first child; read for PLL_TIP_TIP and PLL_TIP_INNER
+ *  @param tipX2  per-site code of the second child; read only for PLL_TIP_TIP
+ *  @param n  number of sites
+ *  @param left  16 doubles per category, applied to the first child
+ *  @param right  16 doubles per category, applied to the second child
+ *  @param wgt  per-site weight, added to the scaling sum when \a useFastScaling is true
+ *  @param scalerIncrement  receives the scaling sum; written only when \a useFastScaling is true
+ *  @param useFastScaling  selects the weighted sum (true) or the per-site \a ex3 counters (false)
+ */
+void newviewGTRCAT_AVX(int tipCase,  double *EV,  int *cptr,
+			   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+			   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+			   int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *le,
+    *ri,
+    *x1,
+    *x2;
+    
+  int 
+    i, 
+    addScale = 0; /* sum of wgt[i] over rescaled sites (fast scaling only) */
+   
+  __m256d 
+    minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD ),
+    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+  
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:      
+      for (i = 0; i < n; i++)
+	{	 
+	  int 
+	    l;
+	  
+	  le = &left[cptr[i] * 16]; /* 16-double slices of the site's category */
+	  ri = &right[cptr[i] * 16];
+
+	  x1 = &(tipVector[4 * tipX1[i]]);
+	  x2 = &(tipVector[4 * tipX2[i]]);
+	  
+	  __m256d	   
+	    vv = _mm256_setzero_pd();
+	   	   	    
+	  for(l = 0; l < 4; l++)
+	    {	       	     				      	      															
+	      __m256d 
+		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+			
+	      x1v = hadd4(x1v, x2v); /* (lane sum of x1v) * (lane sum of x2v) */
+		
+	      __m256d 
+		evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+	      vv = FMAMACC(vv,x1v,evv);
+#else				
+	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+	    }	  		  
+
+	  _mm256_store_pd(&x3_start[4 * i], vv); /* no scaling check in the tip/tip case */
+	}
+      break;
+    case PLL_TIP_INNER:      
+      for (i = 0; i < n; i++)
+	{
+	  int 
+	    l;
+
+	  x1 = &(tipVector[4 * tipX1[i]]);
+	  x2 = &x2_start[4 * i];	 
+	  
+	  le =  &left[cptr[i] * 16];
+	  ri =  &right[cptr[i] * 16];
+
+	  __m256d	   
+	    vv = _mm256_setzero_pd();
+	  
+	  for(l = 0; l < 4; l++)
+	    {	       	     				      	      															
+	      __m256d 
+		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+			
+	      x1v = hadd4(x1v, x2v);			
+		
+	      __m256d 
+		evv = _mm256_load_pd(&EV[l * 4]);
+				
+#ifdef _FMA
+	      vv = FMAMACC(vv,x1v,evv);
+#else	      
+	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));
+#endif
+	    }	  		  
+	  
+	  
+	  __m256d 	     
+	    v1 = _mm256_and_pd(vv, absMask_AVX.m); /* clear the sign bits */
+
+	  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	    
+	  if(_mm256_movemask_pd( v1 ) == 15) /* all 4 lanes compared as less-than */
+	    {	     	      
+	      vv = _mm256_mul_pd(vv, twoto);	      
+	      
+	      if(useFastScaling)
+		addScale += wgt[i];
+	      else
+		ex3[i] += 1;	      	     
+	    }       
+	  
+	  _mm256_store_pd(&x3_start[4 * i], vv);	 	  	  
+	}
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+	{
+	  int 
+	    l;
+
+	  x1 = &x1_start[4 * i];
+	  x2 = &x2_start[4 * i];
+	  
+	  
+	  le =  &left[cptr[i] * 16];
+	  ri =  &right[cptr[i] * 16];
+
+	  __m256d	   
+	    vv = _mm256_setzero_pd();
+	  
+	  for(l = 0; l < 4; l++)
+	    {	       	     				      	      															
+	      __m256d 
+		x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+			
+	      x1v = hadd4(x1v, x2v);			
+		
+	      __m256d 
+		evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+	      vv = FMAMACC(vv,x1v,evv);
+#else						
+	      vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+	    }	  		  
+
+	 
+	  __m256d 	     
+	    v1 = _mm256_and_pd(vv, absMask_AVX.m);
+
+	  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	    
+	  if(_mm256_movemask_pd( v1 ) == 15)
+	    {	
+	      vv = _mm256_mul_pd(vv, twoto);
+	      
+	      if(useFastScaling)
+		addScale += wgt[i];
+	      else
+		ex3[i] += 1;	   
+	    }	
+
+	  _mm256_store_pd(&x3_start[4 * i], vv);
+	  	  
+	}
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
+
+/** @brief Variant of newviewGTRCAT_AVX that stores no data for sites flagged in \a x3_gap
+ *
+ *  Step 1 computes one 4-double result for the gap column from \a x1_gapColumn,
+ *  \a x2_gapColumn and the 16-double slices of \a left / \a right at index \a maxCats,
+ *  and stores it in \a x3_gapColumn. Unless \a tipCase is PLL_TIP_TIP, that result is
+ *  multiplied by PLL_TWOTOTHE256 (and scaleGap is set) when all 4 lanes are below
+ *  PLL_MINLIKELIHOOD in absolute value.
+ *
+ *  Step 2 walks the \a n sites. Sites for which isGap(x3_gap, i) holds store nothing; in
+ *  the two non-TIP_TIP cases they are counted as rescaled when scaleGap is set. All other
+ *  sites are computed and written contiguously (4 doubles each) starting at \a x3_start.
+ *  A child flagged in \a x1_gap / \a x2_gap uses the \a maxCats slice instead of the
+ *  cptr[i] slice; an inner child that is flagged is read from its gap column, otherwise
+ *  from a running pointer that advances by 4 doubles per consumed site.
+ *
+ *  NOTE(review): isGap() and noGap() are defined outside this chunk; presumably they test
+ *  the bit of site i in the given bit vector -- confirm.
+ *
+ *  @param tipCase  PLL_TIP_TIP, PLL_TIP_INNER or PLL_INNER_INNER; any other value reaches assert(0)
+ *  @param EV  16 doubles, read as four 4-double vectors
+ *  @param cptr  per-site category index into \a left and \a right
+ *  @param x1_start  compact data of the first child; read only for PLL_INNER_INNER
+ *  @param x2_start  compact data of the second child; read for PLL_TIP_INNER and PLL_INNER_INNER
+ *  @param x3_start  compact output, 4 doubles per site that is not flagged in \a x3_gap
+ *  @param tipVector  4 doubles per tip code
+ *  @param ex3  per-site counter, incremented for a rescaled site when \a useFastScaling is false
+ *  @param tipX1  per-site code of the first child; read for PLL_TIP_TIP and PLL_TIP_INNER
+ *  @param tipX2  per-site code of the second child; read only for PLL_TIP_TIP
+ *  @param n  number of sites
+ *  @param left  16 doubles per category plus the slice at index \a maxCats
+ *  @param right  same layout as \a left, applied to the second child
+ *  @param wgt  per-site weight, added to the scaling sum when \a useFastScaling is true
+ *  @param scalerIncrement  receives the scaling sum; written only when \a useFastScaling is true
+ *  @param useFastScaling  selects the weighted sum (true) or the per-site \a ex3 counters (false)
+ *  @param x1_gap  gap bit vector of the first child
+ *  @param x2_gap  gap bit vector of the second child
+ *  @param x3_gap  gap bit vector of the output
+ *  @param x1_gapColumn  4 doubles of the first child's gap column; read in step 1 for every \a tipCase
+ *  @param x2_gapColumn  4 doubles of the second child's gap column; read in step 1 for every \a tipCase
+ *  @param x3_gapColumn  receives the 4 doubles of the output gap column
+ *  @param maxCats  index of the slice of \a left / \a right that is used for gap columns
+ */
+void newviewGTRCAT_AVX_GAPPED_SAVE(int tipCase,  double *EV,  int *cptr,
+				   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+				   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+				   int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+				   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+				   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
+{
+  double
+    *le,
+    *ri,
+    *x1,
+    *x2, 
+    *x3,
+    *x1_ptr = x1_start, /* running positions in the compact arrays */
+    *x2_ptr = x2_start, 
+    *x3_ptr = x3_start;
+  
+  int 
+    i, 
+    scaleGap = 0, /* becomes 1 when the gap column itself had to be rescaled */
+    addScale = 0;
+   
+  __m256d 
+    minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD ),
+    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+  
+
+  {
+    int 
+      l;
+
+    x1 = x1_gapColumn; /* step 1: gap column, evaluated for every tipCase */
+    x2 = x2_gapColumn;
+    x3 = x3_gapColumn;    	 
+	  	  
+    le =  &left[maxCats * 16];
+    ri =  &right[maxCats * 16];
+
+    __m256d	   
+      vv = _mm256_setzero_pd();
+	  
+    for(l = 0; l < 4; l++)
+      {	       	     				      	      															
+	__m256d 
+	  x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+	  x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+	
+	x1v = hadd4(x1v, x2v);			
+	
+	__m256d 
+	  evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+	vv = FMAMACC(vv,x1v,evv);
+#else						
+	vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+      }	  		  
+
+    if(tipCase != PLL_TIP_TIP) /* the tip/tip case never rescales */
+      {
+	__m256d 	     
+	  v1 = _mm256_and_pd(vv, absMask_AVX.m);
+    
+	v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+    
+	if(_mm256_movemask_pd( v1 ) == 15)
+	  {
+	    vv = _mm256_mul_pd(vv, twoto);	      	 
+	    scaleGap = 1;
+	  }
+      }
+    
+    _mm256_store_pd(x3, vv);    
+  }
+
+  switch(tipCase) /* step 2: per-site work */
+    {
+    case PLL_TIP_TIP:      
+      for (i = 0; i < n; i++)
+	{ 
+	  if(noGap(x3_gap, i))
+	    {	 
+	      int 
+		l;
+	      
+	      x1 = &(tipVector[4 * tipX1[i]]);
+	      x2 = &(tipVector[4 * tipX2[i]]);
+
+	      x3 = x3_ptr;
+
+	      if(isGap(x1_gap, i))
+		le =  &left[maxCats * 16];
+	      else	  	  
+		le =  &left[cptr[i] * 16];	  
+	      
+	      if(isGap(x2_gap, i))
+		ri =  &right[maxCats * 16];
+	      else	 	  
+		ri =  &right[cptr[i] * 16];
+	  	  
+	      __m256d	   
+		vv = _mm256_setzero_pd();
+	      
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d 
+		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+		  vv = FMAMACC(vv,x1v,evv);
+#else				
+		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+		}	  		  
+
+	      _mm256_store_pd(x3, vv);	 
+	      
+	      x3_ptr += 4; /* advance only for sites that were actually stored */
+	    }
+	}
+      break;
+    case PLL_TIP_INNER:      
+      for (i = 0; i < n; i++)
+	{ 
+	  if(isGap(x3_gap, i))
+	    {
+	      if(scaleGap) /* nothing is stored; only the scaling event of the gap column is counted */
+		{
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;		   		    
+		}	       
+	    }
+	  else
+	    {
+	      int 
+		l;
+
+	      x1 = &(tipVector[4 * tipX1[i]]);    
+	      x3 = x3_ptr;
+
+	      if(isGap(x1_gap, i))
+		le =  &left[maxCats * 16];
+	      else
+		le =  &left[cptr[i] * 16];
+	  
+	      if(isGap(x2_gap, i))
+		{		 
+		  ri =  &right[maxCats * 16];
+		  x2 = x2_gapColumn;
+		}
+	      else
+		{
+		  ri =  &right[cptr[i] * 16];
+		  x2 = x2_ptr;
+		  x2_ptr += 4;
+		}	  	 
+
+	      __m256d	   
+		vv = _mm256_setzero_pd();
+	      
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d 
+		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&EV[l * 4]);
+		  
+#ifdef _FMA
+		  vv = FMAMACC(vv,x1v,evv);
+#else	      
+		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));
+#endif
+		}	  		  
+	  
+	  
+	      __m256d 	     
+		v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	      
+	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      
+	      if(_mm256_movemask_pd( v1 ) == 15)
+		{	     	      
+		  vv = _mm256_mul_pd(vv, twoto);	      
+		  
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;		 
+		}       
+	  
+	      _mm256_store_pd(x3, vv);	 	  	  
+
+	      x3_ptr += 4;
+	    }
+	}
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+	{
+	  if(isGap(x3_gap, i))
+	    {
+	      if(scaleGap)		   		    
+		{
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;
+		}	      
+	    }
+	  else
+	    {
+	      int 
+		l;
+	      
+	      x3 = x3_ptr;
+	      
+	      if(isGap(x1_gap, i))
+		{
+		  x1 = x1_gapColumn;
+		  le =  &left[maxCats * 16];
+		}
+	      else
+		{
+		  le =  &left[cptr[i] * 16];
+		  x1 = x1_ptr;
+		  x1_ptr += 4;
+		}
+
+	      if(isGap(x2_gap, i))	
+		{
+		  x2 = x2_gapColumn;
+		  ri =  &right[maxCats * 16];	    
+		}
+	      else
+		{
+		  ri =  &right[cptr[i] * 16];
+		  x2 = x2_ptr;
+		  x2_ptr += 4;
+		}	 	  	  	  
+	  
+	      __m256d	   
+		vv = _mm256_setzero_pd();
+	      
+	      for(l = 0; l < 4; l++)
+		{	       	     				      	      															
+		  __m256d 
+		    x1v = _mm256_mul_pd(_mm256_load_pd(x1), _mm256_load_pd(&le[l * 4])),
+		    x2v = _mm256_mul_pd(_mm256_load_pd(x2), _mm256_load_pd(&ri[l * 4]));			    
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv = _mm256_load_pd(&EV[l * 4]);
+#ifdef _FMA
+		  vv = FMAMACC(vv,x1v,evv);
+#else						
+		  vv = _mm256_add_pd(vv, _mm256_mul_pd(x1v, evv));						      	
+#endif
+		}	  		  
+	      
+	      
+	      __m256d 	     
+		v1 = _mm256_and_pd(vv, absMask_AVX.m);
+	      
+	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      
+	      if(_mm256_movemask_pd( v1 ) == 15)
+		{	
+		  vv = _mm256_mul_pd(vv, twoto);	      
+		  
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;		
+		}	
+	      
+	      _mm256_store_pd(x3, vv);
+	      
+	      x3_ptr += 4;
+	    }	  	  
+	}
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
+
+void newviewGTRCATPROT_AVX(int tipCase, double *extEV,
+			       int *cptr,
+			       double *x1, double *x2, double *x3, double *tipVector,
+			       int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+			       int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *le, *ri, *v, *vl, *vr;
+
+  int i, l, scale, addScale = 0;
+
+#ifdef _FMA
+  int k;
+#endif
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	for (i = 0; i < n; i++)
+	  {	   
+	    le = &left[cptr[i] * 400];
+	    ri = &right[cptr[i] * 400];
+
+	    vl = &(tipVector[20 * tipX1[i]]);
+	    vr = &(tipVector[20 * tipX2[i]]);
+	    v  = &x3[20 * i];	    	    	   	    
+
+	    __m256d vv[5];
+	    
+	    vv[0] = _mm256_setzero_pd();
+	    vv[1] = _mm256_setzero_pd();
+	    vv[2] = _mm256_setzero_pd();
+	    vv[3] = _mm256_setzero_pd();
+	    vv[4] = _mm256_setzero_pd();	   	    
+
+	    for(l = 0; l < 20; l++)
+	      {	       
+		__m256d 
+		  x1v = _mm256_setzero_pd(),
+		  x2v = _mm256_setzero_pd();	
+				
+		double 
+		  *ev = &extEV[l * 20],
+		  *lv = &le[l * 20],
+		  *rv = &ri[l * 20];														
+
+#ifdef _FMA		
+		for(k = 0; k < 20; k += 4) 
+		  {
+		    __m256d vlv = _mm256_load_pd(&vl[k]);
+		    __m256d lvv = _mm256_load_pd(&lv[k]);
+		    x1v = FMAMACC(x1v,vlv,lvv);
+		    __m256d vrv = _mm256_load_pd(&vr[k]);
+		    __m256d rvv = _mm256_load_pd(&rv[k]);
+		    x2v = FMAMACC(x2v,vrv,rvv);
+		  }
+#else		
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+		x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+		x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));	
+#endif
+
+		x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+		for(k = 0; k < 5; k++) 
+		  {
+		    __m256d evv = _mm256_load_pd(&ev[k*4]);
+		    vv[k] = FMAMACC(vv[k],x1v,evv);
+		  }	  
+#else		
+		__m256d 
+		  evv[5];
+	    	
+		evv[0] = _mm256_load_pd(&ev[0]);
+		evv[1] = _mm256_load_pd(&ev[4]);
+		evv[2] = _mm256_load_pd(&ev[8]);
+		evv[3] = _mm256_load_pd(&ev[12]);
+		evv[4] = _mm256_load_pd(&ev[16]);		
+		
+		vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+		vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+		vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+		vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+		vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      		      	  
+#endif
+	      }
+	    _mm256_store_pd(&v[0], vv[0]);
+	    _mm256_store_pd(&v[4], vv[1]);
+	    _mm256_store_pd(&v[8], vv[2]);
+	    _mm256_store_pd(&v[12], vv[3]);
+	    _mm256_store_pd(&v[16], vv[4]);
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:      	
+      for (i = 0; i < n; i++)
+	{
+	  le = &left[cptr[i] * 400];
+	  ri = &right[cptr[i] * 400];
+	  
+	  vl = &(tipVector[20 * tipX1[i]]);
+	  vr = &x2[20 * i];
+	  v  = &x3[20 * i];	   
+	  
+	  __m256d vv[5];
+	  
+	  vv[0] = _mm256_setzero_pd();
+	  vv[1] = _mm256_setzero_pd();
+	  vv[2] = _mm256_setzero_pd();
+	  vv[3] = _mm256_setzero_pd();
+	  vv[4] = _mm256_setzero_pd();
+	  
+	 
+
+	  for(l = 0; l < 20; l++)
+	    {	       
+	      __m256d 
+		x1v = _mm256_setzero_pd(),
+		x2v = _mm256_setzero_pd();	
+	      
+	      double 
+		*ev = &extEV[l * 20],
+		*lv = &le[l * 20],
+		*rv = &ri[l * 20];														
+#ifdef _FMA
+	      for(k = 0; k < 20; k += 4) 
+		{
+		  __m256d vlv = _mm256_load_pd(&vl[k]);
+		  __m256d lvv = _mm256_load_pd(&lv[k]);
+		  x1v = FMAMACC(x1v,vlv,lvv);
+		  __m256d vrv = _mm256_load_pd(&vr[k]);
+		  __m256d rvv = _mm256_load_pd(&rv[k]);
+		  x2v = FMAMACC(x2v,vrv,rvv);
+		}
+#else	      
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+	      
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+#endif
+
+	      x1v = hadd4(x1v, x2v);			
+	      
+	      __m256d 
+		evv[5];
+	      
+	      evv[0] = _mm256_load_pd(&ev[0]);
+	      evv[1] = _mm256_load_pd(&ev[4]);
+	      evv[2] = _mm256_load_pd(&ev[8]);
+	      evv[3] = _mm256_load_pd(&ev[12]);
+	      evv[4] = _mm256_load_pd(&ev[16]);		
+
+#ifdef _FMA
+	      for(k = 0; k < 5; k++)
+		vv[k] = FMAMACC(vv[k],x1v,evv[k]);		 
+#else	      
+	      vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+	      vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+	      vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+	      vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+	      vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+	    }	  
+
+	   	     
+	  __m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+	  
+	  scale = 1;
+	  
+	  for(l = 0; scale && (l < 20); l += 4)
+	    {	       
+	      __m256d 
+		v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      
+	      if(_mm256_movemask_pd( v1 ) != 15)
+		scale = 0;
+	    }	    	  	  
+	 
+
+	  if(scale)
+	    {
+	      __m256d 
+		twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+	      
+	      for(l = 0; l < 20; l += 4)
+		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+	  
+	      if(useFastScaling)
+		addScale += wgt[i];
+	      else
+		ex3[i]  += 1;	      
+	    }
+
+	  _mm256_store_pd(&v[0], vv[0]);
+	  _mm256_store_pd(&v[4], vv[1]);
+	  _mm256_store_pd(&v[8], vv[2]);
+	  _mm256_store_pd(&v[12], vv[3]);
+	  _mm256_store_pd(&v[16], vv[4]);	       
+	}
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  le = &left[cptr[i] * 400];
+	  ri = &right[cptr[i] * 400];
+
+	  vl = &x1[20 * i];
+	  vr = &x2[20 * i];
+	  v = &x3[20 * i];
+
+	  __m256d vv[5];
+	  
+	  vv[0] = _mm256_setzero_pd();
+	  vv[1] = _mm256_setzero_pd();
+	  vv[2] = _mm256_setzero_pd();
+	  vv[3] = _mm256_setzero_pd();
+	  vv[4] = _mm256_setzero_pd();
+	  
+	  for(l = 0; l < 20; l++)
+	    {	       
+	      __m256d 
+		x1v = _mm256_setzero_pd(),
+		x2v = _mm256_setzero_pd();	
+	      
+	      double 
+		*ev = &extEV[l * 20],
+		*lv = &le[l * 20],
+		*rv = &ri[l * 20];														
+	      
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+	      x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+	      
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+	      x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+
+	      x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+	       for(k = 0; k < 5; k++) 
+		 {
+		   __m256d evv = _mm256_load_pd(&ev[k*4]);
+		   vv[k] = FMAMACC(vv[k],x1v,evv);
+		 }
+#else	      
+	      __m256d 
+		evv[5];
+	      
+	      evv[0] = _mm256_load_pd(&ev[0]);
+	      evv[1] = _mm256_load_pd(&ev[4]);
+	      evv[2] = _mm256_load_pd(&ev[8]);
+	      evv[3] = _mm256_load_pd(&ev[12]);
+	      evv[4] = _mm256_load_pd(&ev[16]);		
+	      
+	      vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+	      vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+	      vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+	      vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+	      vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+	    }	  
+
+	   	     
+	  __m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+	  
+	  scale = 1;
+	  
+	  for(l = 0; scale && (l < 20); l += 4)
+	    {	       
+	      __m256d 
+		v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	      v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	      
+	      if(_mm256_movemask_pd( v1 ) != 15)
+		scale = 0;
+	    }	    	  	  
+
+	  if(scale)
+	    {
+	      __m256d 
+		twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+	      
+	      for(l = 0; l < 20; l += 4)
+		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+	  
+	      if(useFastScaling)
+		addScale += wgt[i];
+	      else
+		ex3[i]  += 1;	      
+	    }
+
+	  _mm256_store_pd(&v[0], vv[0]);
+	  _mm256_store_pd(&v[4], vv[1]);
+	  _mm256_store_pd(&v[8], vv[2]);
+	  _mm256_store_pd(&v[12], vv[3]);
+	  _mm256_store_pd(&v[16], vv[4]);
+	 
+	}
+      break;
+    default:
+      assert(0);
+    }
+  
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
+/* AVX kernel that writes one 20-double vector per site into x3 from the two
+   child vectors of that site, storing x3 "gapped": sites flagged in x3_gap
+   get no slot in x3 and are represented by the single vector x3_gapColumn,
+   which is computed once up front.
+
+   Every vector is produced by the same arithmetic: for each row l of the
+   400-double blocks le/ri, the child vectors vl/vr are multiplied
+   element-wise with row l and accumulated, the two accumulators are passed
+   through hadd4(), and the result scales row l of extEV into the output.
+
+   tipCase          PLL_TIP_TIP, PLL_TIP_INNER or PLL_INNER_INNER; selects
+                    where the child vectors come from (any other value hits
+                    assert(0))
+   extEV            20 rows of 20 doubles
+   cptr             per-site index of the 400-double block to use inside
+                    left/right (presumably the rate category -- TODO confirm)
+   x1, x2           packed child vectors, 20 doubles per consumed site; x1 is
+                    only read for PLL_INNER_INNER, x2 for PLL_TIP_INNER and
+                    PLL_INNER_INNER
+   x3               packed output, 20 doubles per non-gap site
+   tipVector        20 doubles per tip code, indexed by tipX1[i] / tipX2[i]
+   ex3              per-site scaling counters, incremented only when
+                    useFastScaling is false
+   tipX1, tipX2     per-site tip codes
+   n                number of sites
+   left, right      arrays of 400-double blocks; block cptr[i] is used for
+                    ordinary children and block maxCats for gap children
+   wgt              per-site weights, added to the scaling total when
+                    useFastScaling is true
+   scalerIncrement  out: receives the scaling total; written only when
+                    useFastScaling is true
+   x1_gap, x2_gap, x3_gap
+                    gap flags, queried through isGap()/noGap() (both are
+                    defined elsewhere)
+   x1_gapColumn, x2_gapColumn, x3_gapColumn
+                    20-double gap vectors of the children (read) and of this
+                    node (written)
+   maxCats          index of the gap block inside left/right
+
+   All vector accesses use _mm256_load_pd/_mm256_store_pd, i.e. the aligned
+   forms, so the buffers are expected to be 32-byte aligned.  */
+void newviewGTRCATPROT_AVX_GAPPED_SAVE(int tipCase, double *extEV,
+				       int *cptr,
+				       double *x1, double *x2, double *x3, double *tipVector,
+				       int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+				       int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+				       unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+				       double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
+{
+  double
+    *le, 
+    *ri, 
+    *v, 
+    *vl, 
+    *vr,
+    *x1_ptr = x1,
+    *x2_ptr = x2, 
+    *x3_ptr = x3; /* cursors: advance by 20 per packed vector read/written */
+  
+  int 
+    i, 
+    l, 
+    scale, 
+    addScale = 0, /* scaling total reported through *scalerIncrement */
+    scaleGap = 0; /* set to 1 when the shared gap vector had to be rescaled */
+  /* k only exists in _FMA builds; it indexes the FMAMACC loops. */
+#ifdef _FMA
+  int k;
+#endif
+  /* Stage 1: build this node's shared gap vector (x3_gapColumn) from the
+     children's gap vectors, using block maxCats of left/right. */
+  {
+    le = &left[maxCats * 400];
+    ri = &right[maxCats * 400];
+    
+    vl = x1_gapColumn;
+    vr = x2_gapColumn;
+    v  = x3_gapColumn;
+
+    __m256d vv[5];
+    
+    vv[0] = _mm256_setzero_pd();
+    vv[1] = _mm256_setzero_pd();
+    vv[2] = _mm256_setzero_pd();
+    vv[3] = _mm256_setzero_pd();
+    vv[4] = _mm256_setzero_pd();
+    
+    for(l = 0; l < 20; l++)
+      {	       
+	__m256d 
+	  x1v = _mm256_setzero_pd(),
+	  x2v = _mm256_setzero_pd();	
+	
+	double 
+	  *ev = &extEV[l * 20],
+	  *lv = &le[l * 20],
+	  *rv = &ri[l * 20];														
+	/* Lane-wise partial sums of vl[k]*lv[k] and vr[k]*rv[k]; this copy uses
+	   separate mul/add even when _FMA is defined. */
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+	x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+	
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+	x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+	/* hadd4() is defined elsewhere; presumably it reduces both accumulators
+	   and returns their product in every lane -- TODO confirm. */
+	x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+	for(k = 0; k < 5; k++) 
+	  {
+	    __m256d evv = _mm256_load_pd(&ev[k*4]);
+	    vv[k] = FMAMACC(vv[k],x1v,evv);
+	  }
+#else	      
+	__m256d 
+	  evv[5];
+	
+	evv[0] = _mm256_load_pd(&ev[0]);
+	evv[1] = _mm256_load_pd(&ev[4]);
+	evv[2] = _mm256_load_pd(&ev[8]);
+	evv[3] = _mm256_load_pd(&ev[12]);
+	evv[4] = _mm256_load_pd(&ev[16]);		
+	
+	vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+	vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+	vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+	vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+	vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+      }	  
+
+     /* The tip/tip case never rescales.  Otherwise, if all 20 entries, masked
+        with absMask_AVX (presumably |x|), are below PLL_MINLIKELIHOOD, the
+        vector is multiplied by PLL_TWOTOTHE256 and scaleGap records that. */
+     if(tipCase != PLL_TIP_TIP)
+       {
+	 __m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+	  
+	 scale = 1;
+	  
+	 for(l = 0; scale && (l < 20); l += 4)
+	   {	       
+	     __m256d 
+	       v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+	     v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+	     /* 15 = all four lanes passed the < test */
+	     if(_mm256_movemask_pd( v1 ) != 15)
+	       scale = 0;
+	   }	    	  	  
+
+	 if(scale)
+	   {
+	      __m256d 
+		twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+	      
+	      for(l = 0; l < 20; l += 4)
+		vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 	      	     	      
+	   
+	      scaleGap = 1;
+	   }
+       }
+
+     _mm256_store_pd(&v[0], vv[0]);
+     _mm256_store_pd(&v[4], vv[1]);
+     _mm256_store_pd(&v[8], vv[2]);
+     _mm256_store_pd(&v[12], vv[3]);
+     _mm256_store_pd(&v[16], vv[4]);     
+  }
+
+  /* Stage 2: per-site vectors; sites flagged in x3_gap are not stored. */
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    if(noGap(x3_gap, i))	   /* gap sites get no slot in x3 */
+	      {	    
+		vl = &(tipVector[20 * tipX1[i]]);
+		vr = &(tipVector[20 * tipX2[i]]);
+		v  = x3_ptr;	    	    	   	    
+		/* a child flagged as gap uses block maxCats instead of block cptr[i] */
+		if(isGap(x1_gap, i))
+		  le =  &left[maxCats * 400];
+		else	  	  
+		  le =  &left[cptr[i] * 400];	  
+		
+		if(isGap(x2_gap, i))
+		  ri =  &right[maxCats * 400];
+		else	 	  
+		  ri =  &right[cptr[i] * 400];
+
+		__m256d vv[5];
+		
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();	   	    
+		
+		for(l = 0; l < 20; l++)
+		  {	       
+		    __m256d 
+		      x1v = _mm256_setzero_pd(),
+		      x2v = _mm256_setzero_pd();	
+		    
+		    double 
+		      *ev = &extEV[l * 20],
+		      *lv = &le[l * 20],
+		      *rv = &ri[l * 20];														
+		    
+#ifdef _FMA		
+		    for(k = 0; k < 20; k += 4) 
+		      {
+			__m256d vlv = _mm256_load_pd(&vl[k]);
+			__m256d lvv = _mm256_load_pd(&lv[k]);
+			x1v = FMAMACC(x1v,vlv,lvv);
+			__m256d vrv = _mm256_load_pd(&vr[k]);
+			__m256d rvv = _mm256_load_pd(&rv[k]);
+			x2v = FMAMACC(x2v,vrv,rvv);
+		      }
+#else		
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+		    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));	
+#endif
+		    
+		    x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+		    for(k = 0; k < 5; k++) 
+		      {
+			__m256d evv = _mm256_load_pd(&ev[k*4]);
+			vv[k] = FMAMACC(vv[k],x1v,evv);
+		      }	  
+#else		
+		    __m256d 
+		      evv[5];
+		    
+		    evv[0] = _mm256_load_pd(&ev[0]);
+		    evv[1] = _mm256_load_pd(&ev[4]);
+		    evv[2] = _mm256_load_pd(&ev[8]);
+		    evv[3] = _mm256_load_pd(&ev[12]);
+		    evv[4] = _mm256_load_pd(&ev[16]);		
+		    
+		    vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+		    vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+		    vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+		    vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+		    vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      		      	  
+#endif
+		  }
+		
+		_mm256_store_pd(&v[0], vv[0]);
+		_mm256_store_pd(&v[4], vv[1]);
+		_mm256_store_pd(&v[8], vv[2]);
+		_mm256_store_pd(&v[12], vv[3]);
+		_mm256_store_pd(&v[16], vv[4]);
+		/* x3 is packed: advance only after a vector was written */
+		x3_ptr += 20;
+	      }
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:      	
+      for (i = 0; i < n; i++)
+	{
+	  if(isGap(x3_gap, i))
+	    {
+	      if(scaleGap) /* the shared gap vector was rescaled: count it for site i */
+		{
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i] += 1;		   		    
+		}	     
+	    }
+	  else
+	    {
+	      vl = &(tipVector[20 * tipX1[i]]);
+
+	      vr = x2_ptr; /* reassigned in both branches of the x2_gap test below */
+	      v = x3_ptr;
+	      
+	      if(isGap(x1_gap, i))
+		le =  &left[maxCats * 400];
+	      else
+		le =  &left[cptr[i] * 400];
+	      
+	      if(isGap(x2_gap, i))
+		{		 
+		  ri =  &right[maxCats * 400];
+		  vr = x2_gapColumn;
+		}
+	      else
+		{
+		  ri =  &right[cptr[i] * 400];
+		  vr = x2_ptr;
+		  x2_ptr += 20; /* x2 is packed as well: consume one vector */
+		}	  	  
+	  
+	      __m256d vv[5];
+	      
+	      vv[0] = _mm256_setzero_pd();
+	      vv[1] = _mm256_setzero_pd();
+	      vv[2] = _mm256_setzero_pd();
+	      vv[3] = _mm256_setzero_pd();
+	      vv[4] = _mm256_setzero_pd();
+	      	      	      
+	      for(l = 0; l < 20; l++)
+		{	       
+		  __m256d 
+		    x1v = _mm256_setzero_pd(),
+		    x2v = _mm256_setzero_pd();	
+		  
+		  double 
+		    *ev = &extEV[l * 20],
+		    *lv = &le[l * 20],
+		    *rv = &ri[l * 20];														
+#ifdef _FMA
+		  for(k = 0; k < 20; k += 4) 
+		    {
+		      __m256d vlv = _mm256_load_pd(&vl[k]);
+		      __m256d lvv = _mm256_load_pd(&lv[k]);
+		      x1v = FMAMACC(x1v,vlv,lvv);
+		      __m256d vrv = _mm256_load_pd(&vr[k]);
+		      __m256d rvv = _mm256_load_pd(&rv[k]);
+		      x2v = FMAMACC(x2v,vrv,rvv);
+		    }
+#else	      
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+		  x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+		  
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+		  x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+#endif
+		  
+		  x1v = hadd4(x1v, x2v);			
+		  
+		  __m256d 
+		    evv[5];
+		  
+		  evv[0] = _mm256_load_pd(&ev[0]);
+		  evv[1] = _mm256_load_pd(&ev[4]);
+		  evv[2] = _mm256_load_pd(&ev[8]);
+		  evv[3] = _mm256_load_pd(&ev[12]);
+		  evv[4] = _mm256_load_pd(&ev[16]);		
+		  
+#ifdef _FMA
+		  for(k = 0; k < 5; k++)
+		    vv[k] = FMAMACC(vv[k],x1v,evv[k]);		 
+#else	      
+		  vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+		  vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+		  vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+		  vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+		  vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+		}	  
+
+	      /* If all 20 entries, masked with absMask_AVX (presumably |x|), are
+		 below PLL_MINLIKELIHOOD, multiply the vector by PLL_TWOTOTHE256
+		 and count one scaling event for site i. */
+	      __m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+	  
+	      scale = 1;
+	      
+	      for(l = 0; scale && (l < 20); l += 4)
+		{	       
+		  __m256d 
+		    v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+		  v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		  
+		  if(_mm256_movemask_pd( v1 ) != 15)
+		    scale = 0;
+		}	    	  	  
+	 
+	      if(scale)
+		{
+		  __m256d 
+		    twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+		  
+		  for(l = 0; l < 20; l += 4)
+		    vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		  
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i]  += 1;	      
+		}
+
+	      _mm256_store_pd(&v[0], vv[0]);
+	      _mm256_store_pd(&v[4], vv[1]);
+	      _mm256_store_pd(&v[8], vv[2]);
+	      _mm256_store_pd(&v[12], vv[3]);
+	      _mm256_store_pd(&v[16], vv[4]);	       
+	      
+	      x3_ptr += 20;
+	    }
+	}    
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	   if(isGap(x3_gap, i))
+	     {
+	       if(scaleGap)		   		    /* same bookkeeping as in PLL_TIP_INNER */
+		 {
+		   if(useFastScaling)
+		     addScale += wgt[i];
+		   else
+		     ex3[i] += 1;
+		 }		 	       
+	     }
+	   else
+	     {
+		/* Each child supplies either its gap vector together with block
+		   maxCats, or its next packed vector together with block cptr[i]. */
+	        v = x3_ptr;
+
+		if(isGap(x1_gap, i))
+		  {
+		    vl = x1_gapColumn;
+		    le =  &left[maxCats * 400];
+		  }
+		else
+		  {
+		    le =  &left[cptr[i] * 400];
+		    vl = x1_ptr;
+		    x1_ptr += 20;
+		  }
+		
+		if(isGap(x2_gap, i))	
+		  {
+		    vr = x2_gapColumn;
+		    ri =  &right[maxCats * 400];	    
+		  }
+		else
+		  {
+		    ri =  &right[cptr[i] * 400];
+		    vr = x2_ptr;
+		    x2_ptr += 20;
+		  }	 	  	 
+		
+		__m256d vv[5];
+		
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l++)
+		  {	       
+		    __m256d 
+		      x1v = _mm256_setzero_pd(),
+		      x2v = _mm256_setzero_pd();	
+		    
+		    double 
+		      *ev = &extEV[l * 20],
+		      *lv = &le[l * 20],
+		      *rv = &ri[l * 20];														
+		    /* separate mul/add here even when _FMA is defined, as in stage 1 */
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[0]), _mm256_load_pd(&lv[0])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[4]), _mm256_load_pd(&lv[4])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[8]), _mm256_load_pd(&lv[8])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[12]), _mm256_load_pd(&lv[12])));
+		    x1v = _mm256_add_pd(x1v, _mm256_mul_pd(_mm256_load_pd(&vl[16]), _mm256_load_pd(&lv[16])));
+		    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[0]), _mm256_load_pd(&rv[0])));			    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[4]), _mm256_load_pd(&rv[4])));				    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[8]), _mm256_load_pd(&rv[8])));			    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[12]), _mm256_load_pd(&rv[12])));				    
+		    x2v = _mm256_add_pd(x2v,  _mm256_mul_pd(_mm256_load_pd(&vr[16]), _mm256_load_pd(&rv[16])));
+		    
+		    x1v = hadd4(x1v, x2v);			
+#ifdef _FMA
+		    for(k = 0; k < 5; k++) 
+		      {
+			__m256d evv = _mm256_load_pd(&ev[k*4]);
+			vv[k] = FMAMACC(vv[k],x1v,evv);
+		      }
+#else	      
+		    __m256d 
+		      evv[5];
+		    
+		    evv[0] = _mm256_load_pd(&ev[0]);
+		    evv[1] = _mm256_load_pd(&ev[4]);
+		    evv[2] = _mm256_load_pd(&ev[8]);
+		    evv[3] = _mm256_load_pd(&ev[12]);
+		    evv[4] = _mm256_load_pd(&ev[16]);		
+		    
+		    vv[0] = _mm256_add_pd(vv[0], _mm256_mul_pd(x1v, evv[0]));
+		    vv[1] = _mm256_add_pd(vv[1], _mm256_mul_pd(x1v, evv[1]));
+		    vv[2] = _mm256_add_pd(vv[2], _mm256_mul_pd(x1v, evv[2]));
+		    vv[3] = _mm256_add_pd(vv[3], _mm256_mul_pd(x1v, evv[3]));
+		    vv[4] = _mm256_add_pd(vv[4], _mm256_mul_pd(x1v, evv[4]));				      	
+#endif
+		  }	  
+
+		/* Same underflow test and rescaling as in PLL_TIP_INNER. */
+		__m256d minlikelihood_avx = _mm256_set1_pd( PLL_MINLIKELIHOOD );
+		
+		scale = 1;
+		
+		for(l = 0; scale && (l < 20); l += 4)
+		  {	       
+		    __m256d 
+		      v1 = _mm256_and_pd(vv[l / 4], absMask_AVX.m);
+		    v1 = _mm256_cmp_pd(v1,  minlikelihood_avx, _CMP_LT_OS);
+		    
+		    if(_mm256_movemask_pd( v1 ) != 15)
+		      scale = 0;
+		  }	    	  	  
+		
+		if(scale)
+		  {
+		    __m256d 
+		      twoto = _mm256_set1_pd(PLL_TWOTOTHE256);
+		    
+		    for(l = 0; l < 20; l += 4)
+		      vv[l / 4] = _mm256_mul_pd(vv[l / 4] , twoto);		    		 
+		    
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i]  += 1;	      
+		  }
+
+		_mm256_store_pd(&v[0], vv[0]);
+		_mm256_store_pd(&v[4], vv[1]);
+		_mm256_store_pd(&v[8], vv[2]);
+		_mm256_store_pd(&v[12], vv[3]);
+		_mm256_store_pd(&v[16], vv[4]);
+
+		 x3_ptr += 20;
+	     }
+	}   
+      break;
+    default:
+      assert(0);
+    }
+  /* Only the fast-scaling mode reports through scalerIncrement. */
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
+
+
+
+void newviewGTRGAMMAPROT_AVX_LG4(int tipCase,
+				 double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+				 int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, 
+				 double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling) 
+{
+  double	
+    *uX1, 
+    *uX2, 
+    *v, 
+    x1px2, 
+    *vl, 
+    *vr;
+  
+  int	
+    i, 
+    j, 
+    l, 
+    k, 
+    scale, 
+    addScale = 0;
+
+ 
+#ifndef GCC_VERSION
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+
+#if GCC_VERSION < 40500 && defined (__GNUC__)
+   __m256d
+    bitmask = _mm256_set_pd(0,0,0,-1);
+#else
+  __m256i
+    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+#endif 
+  
+  switch(tipCase) 
+    {
+    case PLL_TIP_TIP: 
+      {
+       
+    PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  umpX2[1840] PLL_ALIGN_END;
+
+	
+	for(i = 0; i < 23; i++) 
+	  {	    	    
+	    for(k = 0; k < 80; k++) 
+	      {
+		double 
+		  *ll =  &left[k * 20],
+		  *rr =  &right[k * 20];
+		
+		__m256d 
+		  umpX1v = _mm256_setzero_pd(),
+		  umpX2v = _mm256_setzero_pd();
+		
+		v = &(tipVector[k / 20][20 * i]);
+
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+#ifdef _FMA
+		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    umpX1v = FMAMACC(umpX1v,vv,llv);
+		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    umpX2v = FMAMACC(umpX2v,vv,rrv);
+#else		    
+		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
+		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+#endif
+		  }
+		
+		umpX1v = hadd3(umpX1v);
+		umpX2v = hadd3(umpX2v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+	      } 
+	  }
+
+	for(i = 0; i < n; i++) 
+	  {	    
+	    uX1 = &umpX1[80 * tipX1[i]];
+	    uX2 = &umpX2[80 * tipX2[i]];
+	   
+	    for(j = 0; j < 4; j++) 
+	      {     	
+		__m256d vv[5];  
+
+		v = &x3[i * 80 + j * 20];
+			
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+
+		for(k = 0; k < 20; k++) 
+		  {			 
+		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+
+		    __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		    
+		    __m256d extEvv = _mm256_load_pd(&extEV[j][20 * k]);
+#ifdef _FMA
+		    vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
+#else
+		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[0],vv[0]);
+		    
+		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 4]);
+#ifdef _FMA
+		    vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
+#else
+		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[4],vv[1]);
+
+		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 8]);
+#ifdef _FMA
+		    vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
+#else
+		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[8],vv[2]);
+
+		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 12]);
+#ifdef _FMA
+		    vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
+#else
+		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[12],vv[3]);
+
+		    extEvv = _mm256_load_pd(&extEV[j][20 * k + 16]);
+#ifdef _FMA
+		    vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
+#else
+		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		    _mm256_store_pd(&v[16],vv[4]);
+		  } 
+	      } 
+	  } 
+      } 
+      break;
+    case PLL_TIP_INNER: 
+      {
+
+    	  PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  ump_x2[20] PLL_ALIGN_END;
+
+	for(i = 0; i < 23; i++) 
+	  {	   
+	    for(k = 0; k < 80; k++) 
+	      {
+		__m256d umpX1v = _mm256_setzero_pd();
+		
+		 v = &(tipVector[k / 20][20 * i]);
+
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
+#ifdef _FMA
+		   
+		    umpX1v = FMAMACC(umpX1v, vv, leftv);
+#else
+		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+#endif
+		  }
+		umpX1v = hadd3(umpX1v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+	      } 
+	  }
+	
+	for (i = 0; i < n; i++) 
+	  {	   
+	    uX1 = &umpX1[80 * tipX1[i]];
+	   	    
+	    for(k = 0; k < 4; k++) 
+	      {
+		v = &(x2[80 * i + k * 20]);
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    __m256d ump_x2v = _mm256_setzero_pd();
+		    		  
+		    __m256d vv = _mm256_load_pd(&v[0]);
+		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    vv = _mm256_load_pd(&v[4]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[8]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[12]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[16]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    ump_x2v = hadd3(ump_x2v);
+		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		  }
+		
+		v = &(x3[80 * i + 20 * k]);
+	
+
+		__m256d vv[5]; 
+
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
+		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+	    		 
+#ifdef _FMA
+		    __m256d ev = _mm256_load_pd(&extEV[k][l * 20 + 0]);
+		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
+#else
+		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 0])));
+#endif
+		    _mm256_store_pd(&v[0],vv[0]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[k][l * 20 + 4]);
+		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
+#else
+		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 4])));
+#endif
+		    _mm256_store_pd(&v[4],vv[1]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[k][l * 20 + 8]);
+		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
+#else
+		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 8])));
+#endif
+		    _mm256_store_pd(&v[8],vv[2]);
+		    
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[k][l * 20 + 12]);
+		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
+#else
+		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 12])));
+#endif
+		    _mm256_store_pd(&v[12],vv[3]);
+
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[k][l * 20 + 16]);
+		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
+#else
+		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[k][l * 20 + 16])));
+#endif
+		    _mm256_store_pd(&v[16],vv[4]);
+
+		  } 
+	      }
+	   
+	    v = &x3[80 * i];
+	    __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);
+	    scale = 1;
+	    for(l = 0; scale && (l < 80); l += 4) 
+	      {
+		__m256d vv = _mm256_load_pd(&v[l]);
+		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+		if(_mm256_movemask_pd(vv_abs) != 15)
+		  scale = 0;
+	      }
+	    
+	    if(scale) 
+	      {		
+		__m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+		for(l = 0; l < 80; l += 4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		  }
+		if(useFastScaling)
+		  addScale += wgt[i];				
+		else
+		  ex3[i] += 1;
+	      } 
+	  } 
+      } 
+      break;
+    case PLL_INNER_INNER:      
+      for(i = 0; i < n; i++) 
+	{ 
+	  scale = 1;
+	  
+	  for(k = 0; k < 4; k++) 
+	    {
+	      vl = &(x1[80 * i + 20 * k]);
+	      vr = &(x2[80 * i + 20 * k]);
+	      v  = &(x3[80 * i + 20 * k]);	      	   
+
+	      __m256d vv[5]; 
+	      
+	      vv[0] = _mm256_setzero_pd();
+	      vv[1] = _mm256_setzero_pd();
+	      vv[2] = _mm256_setzero_pd();
+	      vv[3] = _mm256_setzero_pd();
+	      vv[4] = _mm256_setzero_pd();
+	      
+	      for(l = 0; l < 20; l++) 
+		{		  
+		  __m256d al = _mm256_setzero_pd();
+		  __m256d ar = _mm256_setzero_pd();
+       		  
+		  __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		  __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		  __m256d vlv = _mm256_load_pd(&vl[0]);
+		  __m256d vrv = _mm256_load_pd(&vr[0]);
+		  
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		  vlv = _mm256_load_pd(&vl[4]);
+		  vrv = _mm256_load_pd(&vr[4]);
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		  vlv = _mm256_load_pd(&vl[8]);
+		  vrv = _mm256_load_pd(&vr[8]);
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		  vlv = _mm256_load_pd(&vl[12]);
+		  vrv = _mm256_load_pd(&vr[12]);
+#ifdef _FMA
+		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		  rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		  vlv = _mm256_load_pd(&vl[16]);
+		  vrv = _mm256_load_pd(&vr[16]);
+
+#ifdef _FMA		    
+		  al = FMAMACC(al, vlv, leftv);
+		  ar = FMAMACC(ar, vrv, rightv);
+#else
+		  al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		  ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+
+		  /**************************************************************************************************************/
+
+		  al = hadd3(al);
+		  ar = hadd3(ar);
+		  al = _mm256_mul_pd(ar,al);
+		  
+		  /************************************************************************************************************/
+#ifdef _FMA		    
+		  __m256d ev =  _mm256_load_pd(&extEV[k][20 * l + 0]);
+		  vv[0] = FMAMACC(vv[0], al, ev);		 
+#else
+		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 0])));			  		 		  
+#endif
+		  _mm256_store_pd(&v[0],vv[0]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[k][20 * l + 4]);
+		  vv[1] = FMAMACC(vv[1], al, ev);		 
+#else
+		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 4])));		  		 
+#endif
+		  _mm256_store_pd(&v[4],vv[1]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[k][20 * l + 8]);
+		  vv[2] = FMAMACC(vv[2], al, ev);		 
+#else
+		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 8])));		  		 
+#endif
+		  _mm256_store_pd(&v[8],vv[2]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[k][20 * l + 12]);
+		  vv[3] = FMAMACC(vv[3], al, ev);		 
+#else
+		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 12])));		  		 
+#endif
+		  _mm256_store_pd(&v[12],vv[3]);
+
+#ifdef _FMA		    
+		  ev =  _mm256_load_pd(&extEV[k][20 * l + 16]);
+		  vv[4] = FMAMACC(vv[4], al, ev);		 
+#else
+		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[k][20 * l + 16])));			 	  
+#endif
+		  _mm256_store_pd(&v[16],vv[4]);		 
+		} 
+	    }
+	  v = &(x3[80 * i]);
+	  scale = 1;
+	  __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);	 
+
+	  for(l = 0; scale && (l < 80); l += 4) 
+	    {
+	      __m256d vv = _mm256_load_pd(&v[l]);
+	      __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+	      vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+	      if(_mm256_movemask_pd(vv_abs) != 15)
+		scale = 0;	     
+	    }
+
+	  if(scale) 
+	    {		     	      
+	      __m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+	      for(l = 0; l < 80; l += 4) 
+		{
+		  __m256d vv = _mm256_load_pd(&v[l]);
+		  _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		}
+	      if(useFastScaling)
+		addScale += wgt[i];					
+	      else
+		ex3[i] += 1;
+	    } 
+	}
+      break;
+    default:
+      assert(0);
+    }
+ 
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
+ 
+
+/**
+ * AVX kernel that fills the per-site vectors of x3 from the two child inputs.
+ *
+ * Every site occupies 80 consecutive doubles, handled as 4 blocks of 20
+ * (presumably 4 GAMMA rate categories x 20 amino-acid states, going by the
+ * function name -- confirm against the caller).
+ *
+ * All arrays touched through _mm256_load_pd/_mm256_store_pd (x1, x2, x3,
+ * extEV, tipVector, left, right) must be 32-byte aligned.
+ *
+ * @param tipCase   PLL_TIP_TIP, PLL_TIP_INNER or PLL_INNER_INNER; any other
+ *                  value ends in assert(0)
+ * @param x1        first child vector, read only in the PLL_INNER_INNER case
+ * @param x2        second child vector, read in the PLL_TIP_INNER and
+ *                  PLL_INNER_INNER cases
+ * @param x3        output, 80 doubles per site
+ * @param extEV     read as 20 rows of 20 doubles
+ * @param tipVector read as 23 rows of 20 doubles, one row per tip code
+ * @param ex3       per-site counter, incremented on rescaling when
+ *                  useFastScaling is false
+ * @param tipX1     per-site tip code of the first child (selects one of the
+ *                  23 precomputed rows)
+ * @param tipX2     per-site tip code of the second child (PLL_TIP_TIP only)
+ * @param n         number of sites
+ * @param left      read as 80 rows of 20 doubles (4 blocks of 400)
+ * @param right     read as 80 rows of 20 doubles (4 blocks of 400)
+ * @param wgt       per-site weight, summed on rescaling when useFastScaling
+ *                  is true
+ * @param scalerIncrement receives the accumulated weight sum; written only
+ *                  when useFastScaling is true
+ * @param useFastScaling  selects between the wgt sum and the ex3 counters
+ */
+void newviewGTRGAMMAPROT_AVX(int tipCase,
+			     double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+			     int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, 
+			     double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling) 
+{
+  double
+    *tipLeft,
+    *tipRight,
+    *dst,
+    *lhs,
+    *rhs;
+
+  int
+    site,
+    code,
+    cat,
+    row,
+    col,
+    chunk,
+    needScale,
+    scaleSum = 0;
+
+#ifndef GCC_VERSION
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+  /* Mask for _mm256_maskstore_pd that selects only the lowest double lane;
+     the mask argument type depends on the compiler version. */
+#if GCC_VERSION < 40500 && defined(__GNUC__)
+  __m256d
+    bitmask = _mm256_set_pd(0,0,0,-1);
+#else
+  __m256i
+    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+#endif
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+        PLL_ALIGN_BEGIN double
+          umpX1[1840] PLL_ALIGN_END,
+          umpX2[1840] PLL_ALIGN_END;
+
+        /* Lookup tables: for each of the 23 tip codes, the 80 dot products of
+           its tipVector row with the rows of left (umpX1) and right (umpX2). */
+        for(code = 0; code < 23; code++)
+          {
+            const double *tipRow = &tipVector[20 * code];
+
+            for(row = 0; row < 80; row++)
+              {
+                __m256d sumLeft  = _mm256_setzero_pd();
+                __m256d sumRight = _mm256_setzero_pd();
+
+                for(col = 0; col < 20; col += 4)
+                  {
+                    __m256d tipv = _mm256_load_pd(&tipRow[col]);
+                    __m256d lv   = _mm256_load_pd(&left[20 * row + col]);
+                    __m256d rv   = _mm256_load_pd(&right[20 * row + col]);
+#ifdef _FMA
+                    sumLeft  = FMAMACC(sumLeft, tipv, lv);
+                    sumRight = FMAMACC(sumRight, tipv, rv);
+#else
+                    sumLeft  = _mm256_add_pd(sumLeft, _mm256_mul_pd(tipv, lv));
+                    sumRight = _mm256_add_pd(sumRight, _mm256_mul_pd(tipv, rv));
+#endif
+                  }
+
+                sumLeft  = hadd3(sumLeft);
+                sumRight = hadd3(sumRight);
+                _mm256_maskstore_pd(&umpX1[80 * code + row], bitmask, sumLeft);
+                _mm256_maskstore_pd(&umpX2[80 * code + row], bitmask, sumRight);
+              }
+          }
+
+        /* No rescaling is done in this case. */
+        for(site = 0; site < n; site++)
+          {
+            tipLeft  = &umpX1[80 * tipX1[site]];
+            tipRight = &umpX2[80 * tipX2[site]];
+
+            for(cat = 0; cat < 4; cat++)
+              {
+                __m256d acc[5];
+
+                dst = &x3[80 * site + 20 * cat];
+
+                for(chunk = 0; chunk < 5; chunk++)
+                  acc[chunk] = _mm256_setzero_pd();
+
+                for(row = 0; row < 20; row++)
+                  {
+                    const double prod  = tipLeft[20 * cat + row] * tipRight[20 * cat + row];
+                    const __m256d prodv = _mm256_set1_pd(prod);
+
+                    /* acc += prod * extEV[row], 4 doubles at a time; the
+                       running sums are written to x3 after every update. */
+                    for(chunk = 0; chunk < 5; chunk++)
+                      {
+                        __m256d ev = _mm256_load_pd(&extEV[20 * row + 4 * chunk]);
+#ifdef _FMA
+                        acc[chunk] = FMAMACC(acc[chunk], prodv, ev);
+#else
+                        acc[chunk] = _mm256_add_pd(acc[chunk], _mm256_mul_pd(prodv, ev));
+#endif
+                        _mm256_store_pd(&dst[4 * chunk], acc[chunk]);
+                      }
+                  }
+              }
+          }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        PLL_ALIGN_BEGIN double
+          umpX1[1840] PLL_ALIGN_END,
+          ump_x2[20] PLL_ALIGN_END;
+
+        /* Lookup table for the tip child: 23 tip codes x 80 dot products of
+           the tipVector row with the rows of left. */
+        for(code = 0; code < 23; code++)
+          {
+            const double *tipRow = &tipVector[20 * code];
+
+            for(row = 0; row < 80; row++)
+              {
+                __m256d sumLeft = _mm256_setzero_pd();
+
+                for(col = 0; col < 20; col += 4)
+                  {
+                    __m256d tipv = _mm256_load_pd(&tipRow[col]);
+                    __m256d lv   = _mm256_load_pd(&left[20 * row + col]);
+#ifdef _FMA
+                    sumLeft = FMAMACC(sumLeft, tipv, lv);
+#else
+                    sumLeft = _mm256_add_pd(sumLeft, _mm256_mul_pd(tipv, lv));
+#endif
+                  }
+
+                sumLeft = hadd3(sumLeft);
+                _mm256_maskstore_pd(&umpX1[80 * code + row], bitmask, sumLeft);
+              }
+          }
+
+        for(site = 0; site < n; site++)
+          {
+            const __m256d tiny = _mm256_set1_pd(PLL_MINLIKELIHOOD);
+
+            tipLeft = &umpX1[80 * tipX1[site]];
+
+            for(cat = 0; cat < 4; cat++)
+              {
+                __m256d acc[5];
+
+                rhs = &x2[80 * site + 20 * cat];
+
+                /* ump_x2[row] = dot product of the inner child's block with
+                   row `row` of this block of right. */
+                for(row = 0; row < 20; row++)
+                  {
+                    __m256d sumRight = _mm256_setzero_pd();
+
+                    for(chunk = 0; chunk < 5; chunk++)
+                      {
+                        __m256d childv = _mm256_load_pd(&rhs[4 * chunk]);
+                        __m256d rv     = _mm256_load_pd(&right[400 * cat + 20 * row + 4 * chunk]);
+#ifdef _FMA
+                        sumRight = FMAMACC(sumRight, childv, rv);
+#else
+                        sumRight = _mm256_add_pd(sumRight, _mm256_mul_pd(childv, rv));
+#endif
+                      }
+
+                    sumRight = hadd3(sumRight);
+                    _mm256_maskstore_pd(&ump_x2[row], bitmask, sumRight);
+                  }
+
+                dst = &x3[80 * site + 20 * cat];
+
+                for(chunk = 0; chunk < 5; chunk++)
+                  acc[chunk] = _mm256_setzero_pd();
+
+                for(row = 0; row < 20; row++)
+                  {
+                    const double prod  = tipLeft[20 * cat + row] * ump_x2[row];
+                    const __m256d prodv = _mm256_set1_pd(prod);
+
+                    for(chunk = 0; chunk < 5; chunk++)
+                      {
+                        __m256d ev = _mm256_load_pd(&extEV[20 * row + 4 * chunk]);
+#ifdef _FMA
+                        acc[chunk] = FMAMACC(acc[chunk], prodv, ev);
+#else
+                        acc[chunk] = _mm256_add_pd(acc[chunk], _mm256_mul_pd(prodv, ev));
+#endif
+                        _mm256_store_pd(&dst[4 * chunk], acc[chunk]);
+                      }
+                  }
+              }
+
+            /* Rescale the site only if all 80 entries are below
+               PLL_MINLIKELIHOOD in absolute value; stop at the first group
+               of four that is not. */
+            dst = &x3[80 * site];
+            needScale = 1;
+
+            for(col = 0; needScale && (col < 80); col += 4)
+              {
+                __m256d magnitude = _mm256_and_pd(_mm256_load_pd(&dst[col]), absMask_AVX.m);
+                __m256d isTiny    = _mm256_cmp_pd(magnitude, tiny, _CMP_LT_OS);
+
+                if(_mm256_movemask_pd(isTiny) != 15)
+                  needScale = 0;
+              }
+
+            if(needScale)
+              {
+                const __m256d boost = _mm256_set1_pd(PLL_TWOTOTHE256);
+
+                for(col = 0; col < 80; col += 4)
+                  _mm256_store_pd(&dst[col], _mm256_mul_pd(_mm256_load_pd(&dst[col]), boost));
+
+                if(useFastScaling)
+                  scaleSum += wgt[site];
+                else
+                  ex3[site] += 1;
+              }
+          }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for(site = 0; site < n; site++)
+        {
+          const __m256d tiny = _mm256_set1_pd(PLL_MINLIKELIHOOD);
+
+          for(cat = 0; cat < 4; cat++)
+            {
+              __m256d acc[5];
+
+              lhs = &x1[80 * site + 20 * cat];
+              rhs = &x2[80 * site + 20 * cat];
+              dst = &x3[80 * site + 20 * cat];
+
+              for(chunk = 0; chunk < 5; chunk++)
+                acc[chunk] = _mm256_setzero_pd();
+
+              for(row = 0; row < 20; row++)
+                {
+                  __m256d al = _mm256_setzero_pd();
+                  __m256d ar = _mm256_setzero_pd();
+
+                  /* Dot products of both children's blocks with row `row` of
+                     the matching block of left and right. */
+                  for(chunk = 0; chunk < 5; chunk++)
+                    {
+                      __m256d leftv  = _mm256_load_pd(&left[400 * cat + 20 * row + 4 * chunk]);
+                      __m256d rightv = _mm256_load_pd(&right[400 * cat + 20 * row + 4 * chunk]);
+                      __m256d vlv    = _mm256_load_pd(&lhs[4 * chunk]);
+                      __m256d vrv    = _mm256_load_pd(&rhs[4 * chunk]);
+#ifdef _FMA
+                      al = FMAMACC(al, vlv, leftv);
+                      ar = FMAMACC(ar, vrv, rightv);
+#else
+                      al = _mm256_add_pd(al, _mm256_mul_pd(vlv, leftv));
+                      ar = _mm256_add_pd(ar, _mm256_mul_pd(vrv, rightv));
+#endif
+                    }
+
+                  al = hadd3(al);
+                  ar = hadd3(ar);
+                  al = _mm256_mul_pd(ar, al);
+
+                  /* acc += al * extEV[row]; running sums go to x3 each time. */
+                  for(chunk = 0; chunk < 5; chunk++)
+                    {
+                      __m256d ev = _mm256_load_pd(&extEV[20 * row + 4 * chunk]);
+#ifdef _FMA
+                      acc[chunk] = FMAMACC(acc[chunk], al, ev);
+#else
+                      acc[chunk] = _mm256_add_pd(acc[chunk], _mm256_mul_pd(al, ev));
+#endif
+                      _mm256_store_pd(&dst[4 * chunk], acc[chunk]);
+                    }
+                }
+            }
+
+          /* Same rescaling rule as in the PLL_TIP_INNER case. */
+          dst = &x3[80 * site];
+          needScale = 1;
+
+          for(col = 0; needScale && (col < 80); col += 4)
+            {
+              __m256d magnitude = _mm256_and_pd(_mm256_load_pd(&dst[col]), absMask_AVX.m);
+              __m256d isTiny    = _mm256_cmp_pd(magnitude, tiny, _CMP_LT_OS);
+
+              if(_mm256_movemask_pd(isTiny) != 15)
+                needScale = 0;
+            }
+
+          if(needScale)
+            {
+              const __m256d boost = _mm256_set1_pd(PLL_TWOTOTHE256);
+
+              for(col = 0; col < 80; col += 4)
+                _mm256_store_pd(&dst[col], _mm256_mul_pd(_mm256_load_pd(&dst[col]), boost));
+
+              if(useFastScaling)
+                scaleSum += wgt[site];
+              else
+                ex3[site] += 1;
+            }
+        }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = scaleSum;
+}
+
+
+
+void newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(int tipCase,
+					 double *x1_start, double *x2_start, double *x3_start, double *extEV, double *tipVector,
+					 int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, 
+					 double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+					 unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
+					 double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn) 
+{
+  double	
+    *x1 = x1_start,
+    *x2 = x2_start,
+    *x3_ptr = x3_start,
+    *x2_ptr = x2_start,
+    *x1_ptr = x1_start,
+    *uX1, 
+    *uX2, 
+    *v, 
+    x1px2, 
+    *vl, 
+    *vr;
+  
+  int	
+    i, 
+    j, 
+    l, 
+    k, 
+    gapScaling = 0,
+    scale, 
+    addScale = 0;
+
+ 
+#ifndef GCC_VERSION
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+
+#if GCC_VERSION < 40500 && defined(__GNUC__)
+   __m256d
+    bitmask = _mm256_set_pd(0,0,0,-1);
+#else
+  __m256i
+    bitmask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+#endif 
+  
+  switch(tipCase) 
+    {
+    case PLL_TIP_TIP: 
+      {       
+    	  PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  umpX2[1840] PLL_ALIGN_END;
+
+
+
+	for(i = 0; i < 23; i++) 
+	  {
+	    v = &(tipVector[20 * i]);
+	    
+	    for(k = 0; k < 80; k++) 
+	      {
+		double 
+		  *ll =  &left[k * 20],
+		  *rr =  &right[k * 20];
+		
+		__m256d 
+		  umpX1v = _mm256_setzero_pd(),
+		  umpX2v = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+#ifdef _FMA
+		    __m256d llv = _mm256_load_pd(&ll[l]);
+		    umpX1v = FMAMACC(umpX1v,vv,llv);
+		    __m256d rrv = _mm256_load_pd(&rr[l]);
+		    umpX2v = FMAMACC(umpX2v,vv,rrv);
+#else		    
+		    umpX1v = _mm256_add_pd(umpX1v,_mm256_mul_pd(vv,_mm256_load_pd(&ll[l])));
+		    umpX2v = _mm256_add_pd(umpX2v,_mm256_mul_pd(vv,_mm256_load_pd(&rr[l])));
+#endif
+		  }
+		
+		umpX1v = hadd3(umpX1v);
+		umpX2v = hadd3(umpX2v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+		_mm256_maskstore_pd(&umpX2[80 * i + k], bitmask, umpX2v);
+	      } 
+	  }
+
+	
+	{	    
+	  uX1 = &umpX1[1760];
+	  uX2 = &umpX2[1760];
+	  
+	  for(j = 0; j < 4; j++) 
+	    {     	
+	      __m256d vv[5];  
+	      
+	      v = &x3_gapColumn[j * 20];
+	      
+	      vv[0] = _mm256_setzero_pd();
+	      vv[1] = _mm256_setzero_pd();
+	      vv[2] = _mm256_setzero_pd();
+	      vv[3] = _mm256_setzero_pd();
+	      vv[4] = _mm256_setzero_pd();
+	      
+	      for(k = 0; k < 20; k++) 
+		{			 
+		  x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+		  
+		  __m256d x1px2v = _mm256_set1_pd(x1px2);		    
+		  
+		  __m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
+#ifdef _FMA
+		  vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
+#else
+		  vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[0],vv[0]);
+		  
+		  extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
+#ifdef _FMA
+		  vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
+#else
+		  vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[4],vv[1]);
+		  
+		  extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
+#ifdef _FMA
+		  vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
+#else
+		  vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[8],vv[2]);
+		  
+		  extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
+#ifdef _FMA
+		  vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
+#else
+		  vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[12],vv[3]);
+		  
+		  extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
+#ifdef _FMA
+		  vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
+#else
+		  vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+		  _mm256_store_pd(&v[16],vv[4]);
+		} 
+	    } 
+	}
+
+	
+	for(i = 0; i < n; i++) 
+	  {
+	    if(!(x3_gap[i / 32] & mask32[i % 32]))
+	      {	    
+		uX1 = &umpX1[80 * tipX1[i]];
+		uX2 = &umpX2[80 * tipX2[i]];
+	   
+		for(j = 0; j < 4; j++) 
+		  {     	
+		    __m256d vv[5];  
+		    
+		    v = &x3_ptr[j * 20];
+			
+		    vv[0] = _mm256_setzero_pd();
+		    vv[1] = _mm256_setzero_pd();
+		    vv[2] = _mm256_setzero_pd();
+		    vv[3] = _mm256_setzero_pd();
+		    vv[4] = _mm256_setzero_pd();
+
+		    for(k = 0; k < 20; k++) 
+		      {			 
+			x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+			
+			__m256d x1px2v = _mm256_set1_pd(x1px2);		    
+			
+			__m256d extEvv = _mm256_load_pd(&extEV[20 * k]);
+#ifdef _FMA
+			vv[0] = FMAMACC(vv[0],x1px2v,extEvv);
+#else
+			vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[0],vv[0]);
+			
+			extEvv = _mm256_load_pd(&extEV[20 * k + 4]);
+#ifdef _FMA
+			vv[1] = FMAMACC(vv[1],x1px2v,extEvv);
+#else
+			vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[4],vv[1]);
+			
+			extEvv = _mm256_load_pd(&extEV[20 * k + 8]);
+#ifdef _FMA
+			vv[2] = FMAMACC(vv[2],x1px2v,extEvv);
+#else
+			vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[8],vv[2]);
+			
+			extEvv = _mm256_load_pd(&extEV[20 * k + 12]);
+#ifdef _FMA
+			vv[3] = FMAMACC(vv[3],x1px2v,extEvv);
+#else
+			vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[12],vv[3]);
+			
+			extEvv = _mm256_load_pd(&extEV[20 * k + 16]);
+#ifdef _FMA
+			vv[4] = FMAMACC(vv[4],x1px2v,extEvv);
+#else
+			vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v,extEvv));
+#endif
+			_mm256_store_pd(&v[16],vv[4]);
+		      } 
+		  }
+		x3_ptr += 80;		  
+	      }
+	  }
+      }
+      break;
+    case PLL_TIP_INNER: 
+      {
+    	  PLL_ALIGN_BEGIN double
+	  umpX1[1840] PLL_ALIGN_END,
+	  ump_x2[20] PLL_ALIGN_END;
+
+
+
+	for(i = 0; i < 23; i++) 
+	  {
+	    v = &(tipVector[20 * i]);
+
+	    for(k = 0; k < 80; k++) 
+	      {
+		__m256d umpX1v = _mm256_setzero_pd();
+		for(l = 0; l < 20; l+=4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    __m256d leftv = _mm256_load_pd(&left[k * 20 + l]);
+#ifdef _FMA
+		   
+		    umpX1v = FMAMACC(umpX1v, vv, leftv);
+#else
+		    umpX1v = _mm256_add_pd(umpX1v, _mm256_mul_pd(vv, leftv));
+#endif
+		  }
+		umpX1v = hadd3(umpX1v);
+		_mm256_maskstore_pd(&umpX1[80 * i + k], bitmask, umpX1v);
+	      } 
+	  }
+
+	{	   
+	  uX1 = &umpX1[1760];
+	   	    
+	  for(k = 0; k < 4; k++) 
+	    {
+	      v = &(x2_gapColumn[k * 20]);
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    __m256d ump_x2v = _mm256_setzero_pd();
+		    		  
+		    __m256d vv = _mm256_load_pd(&v[0]);
+		    __m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    vv = _mm256_load_pd(&v[4]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[8]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[12]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+
+		    vv = _mm256_load_pd(&v[16]);
+		    rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef _FMA
+		    ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+		    ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+		    
+		    ump_x2v = hadd3(ump_x2v);
+		    _mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		  }
+		
+		v = &x3_gapColumn[20 * k];
+	
+		__m256d vv[5]; 
+
+		vv[0] = _mm256_setzero_pd();
+		vv[1] = _mm256_setzero_pd();
+		vv[2] = _mm256_setzero_pd();
+		vv[3] = _mm256_setzero_pd();
+		vv[4] = _mm256_setzero_pd();
+		
+		for(l = 0; l < 20; l++) 
+		  {
+		    x1px2 = uX1[k * 20 + l]	* ump_x2[l];
+		    __m256d x1px2v = _mm256_set1_pd(x1px2);	
+	    		 
+#ifdef _FMA
+		    __m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+		    vv[0] = FMAMACC(vv[0],x1px2v, ev);
+#else
+		    vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+#endif
+		    _mm256_store_pd(&v[0],vv[0]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+		    vv[1] = FMAMACC(vv[1],x1px2v, ev);
+#else
+		    vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+#endif
+		    _mm256_store_pd(&v[4],vv[1]);
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+		    vv[2] = FMAMACC(vv[2],x1px2v, ev);
+#else
+		    vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+#endif
+		    _mm256_store_pd(&v[8],vv[2]);
+		    
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+		    vv[3] = FMAMACC(vv[3],x1px2v, ev);
+#else
+		    vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+#endif
+		    _mm256_store_pd(&v[12],vv[3]);
+
+
+#ifdef _FMA
+		    ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+		    vv[4] = FMAMACC(vv[4],x1px2v, ev);
+#else
+		    vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+#endif
+		    _mm256_store_pd(&v[16],vv[4]);
+
+		  } 
+	      }
+	   
+	    v = x3_gapColumn;
+	    __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);
+	    scale = 1;
+	    for(l = 0; scale && (l < 80); l += 4) 
+	      {
+		__m256d vv = _mm256_load_pd(&v[l]);
+		__m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+		vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+		if(_mm256_movemask_pd(vv_abs) != 15)
+		  scale = 0;
+	      }
+	    
+	    if(scale) 
+	      {		
+		__m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+		gapScaling = 1;
+
+		for(l = 0; l < 80; l += 4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		  }	
+	      } 
+	}       
+	
+	for (i = 0; i < n; i++) 
+	  {	   
+	    if((x3_gap[i / 32] & mask32[i % 32]))
+	      {	       
+		if(gapScaling)
+		  {
+		    if(useFastScaling)
+		      addScale += wgt[i];
+		    else
+		      ex3[i]  += 1;
+		  }
+	      }
+	    else
+	      {		
+		uX1 = &umpX1[80 * tipX1[i]];
+		
+		if(x2_gap[i / 32] & mask32[i % 32])
+		  x2 = x2_gapColumn;
+		else
+		  {
+		    x2 = x2_ptr;
+		    x2_ptr += 80;
+		  }	      
+	    
+		for(k = 0; k < 4; k++) 
+		  {
+		    v = &(x2[k * 20]);
+		    
+		    for(l = 0; l < 20; l++) 
+		      {
+			__m256d ump_x2v = _mm256_setzero_pd();
+		    	
+			__m256d vv = _mm256_load_pd(&v[0]);
+			__m256d rightv = _mm256_load_pd(&right[k*400+l*20+0]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			vv = _mm256_load_pd(&v[4]);
+			rightv = _mm256_load_pd(&right[k*400+l*20+4]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			vv = _mm256_load_pd(&v[8]);
+			rightv = _mm256_load_pd(&right[k*400+l*20+8]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			vv = _mm256_load_pd(&v[12]);
+			rightv = _mm256_load_pd(&right[k*400+l*20+12]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			vv = _mm256_load_pd(&v[16]);
+			rightv = _mm256_load_pd(&right[k*400+l*20+16]);
+#ifdef _FMA
+			ump_x2v = FMAMACC(ump_x2v,vv,rightv);
+#else
+			ump_x2v = _mm256_add_pd(ump_x2v, _mm256_mul_pd(vv, rightv));
+#endif
+			
+			ump_x2v = hadd3(ump_x2v);
+			_mm256_maskstore_pd(&ump_x2[l], bitmask, ump_x2v);
+		      }
+		  
+		    
+		    v = &x3_ptr[k * 20];
+		    
+		    __m256d vv[5]; 
+		    
+		    vv[0] = _mm256_setzero_pd();
+		    vv[1] = _mm256_setzero_pd();
+		    vv[2] = _mm256_setzero_pd();
+		    vv[3] = _mm256_setzero_pd();
+		    vv[4] = _mm256_setzero_pd();
+		    
+		    for(l = 0; l < 20; l++) 
+		      {
+			x1px2 = uX1[k * 20 + l]	* ump_x2[l];
+			__m256d x1px2v = _mm256_set1_pd(x1px2);	
+			
+#ifdef _FMA
+			__m256d ev = _mm256_load_pd(&extEV[l * 20 + 0]);
+			vv[0] = FMAMACC(vv[0],x1px2v, ev);
+#else
+			vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 0])));
+#endif
+			_mm256_store_pd(&v[0],vv[0]);
+			
+#ifdef _FMA
+			ev = _mm256_load_pd(&extEV[l * 20 + 4]);
+			vv[1] = FMAMACC(vv[1],x1px2v, ev);
+#else
+			vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 4])));
+#endif
+			_mm256_store_pd(&v[4],vv[1]);
+			
+#ifdef _FMA
+			ev = _mm256_load_pd(&extEV[l * 20 + 8]);
+			vv[2] = FMAMACC(vv[2],x1px2v, ev);
+#else
+			vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 8])));
+#endif
+			_mm256_store_pd(&v[8],vv[2]);
+			
+#ifdef _FMA
+			ev = _mm256_load_pd(&extEV[l * 20 + 12]);
+			vv[3] = FMAMACC(vv[3],x1px2v, ev);
+#else
+			vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 12])));
+#endif
+			_mm256_store_pd(&v[12],vv[3]);
+			
+			
+#ifdef _FMA
+			ev = _mm256_load_pd(&extEV[l * 20 + 16]);
+			vv[4] = FMAMACC(vv[4],x1px2v, ev);
+#else
+			vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(x1px2v, _mm256_load_pd(&extEV[l * 20 + 16])));
+#endif
+			_mm256_store_pd(&v[16],vv[4]);
+			
+		      } 
+		  }
+		
+		v = x3_ptr;
+		__m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);
+		scale = 1;
+		for(l = 0; scale && (l < 80); l += 4) 
+		  {
+		    __m256d vv = _mm256_load_pd(&v[l]);
+		    __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+		    vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+		    if(_mm256_movemask_pd(vv_abs) != 15)
+		      scale = 0;
+		  }
+	    
+		if(scale) 
+		  {		
+		    __m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+		    for(l = 0; l < 80; l += 4) 
+		      {
+			__m256d vv = _mm256_load_pd(&v[l]);
+			_mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		      }
+		    if(useFastScaling)
+		      addScale += wgt[i];				
+		    else
+		      ex3[i] += 1;
+		  }	      
+		x3_ptr += 80;
+	      }
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:    	  
+      for(k = 0; k < 4; k++) 
+	{
+	  vl = &(x1_gapColumn[20 * k]);
+	  vr = &(x2_gapColumn[20 * k]);
+	  v  = &(x3_gapColumn[20 * k]);	      	   
+
+	  __m256d vv[5]; 
+	  
+	  vv[0] = _mm256_setzero_pd();
+	  vv[1] = _mm256_setzero_pd();
+	  vv[2] = _mm256_setzero_pd();
+	  vv[3] = _mm256_setzero_pd();
+	  vv[4] = _mm256_setzero_pd();
+	  
+	  for(l = 0; l < 20; l++) 
+	    {		  
+	      __m256d al = _mm256_setzero_pd();
+	      __m256d ar = _mm256_setzero_pd();
+	      
+	      __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+	      __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+	      __m256d vlv = _mm256_load_pd(&vl[0]);
+	      __m256d vrv = _mm256_load_pd(&vr[0]);
+	      
+#ifdef _FMA
+	      
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+#endif
+	      
+	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+	      vlv = _mm256_load_pd(&vl[4]);
+	      vrv = _mm256_load_pd(&vr[4]);
+#ifdef _FMA
+	      
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+	      
+	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+	      vlv = _mm256_load_pd(&vl[8]);
+	      vrv = _mm256_load_pd(&vr[8]);
+#ifdef _FMA
+	      
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+	      
+	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+	      vlv = _mm256_load_pd(&vl[12]);
+	      vrv = _mm256_load_pd(&vr[12]);
+#ifdef _FMA
+	      
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+	      
+	      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+	      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+	      vlv = _mm256_load_pd(&vl[16]);
+	      vrv = _mm256_load_pd(&vr[16]);
+	      
+#ifdef _FMA		    
+	      al = FMAMACC(al, vlv, leftv);
+	      ar = FMAMACC(ar, vrv, rightv);
+#else
+	      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+	      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+	      
+	      /**************************************************************************************************************/
+	      
+	      al = hadd3(al);
+	      ar = hadd3(ar);
+	      al = _mm256_mul_pd(ar,al);
+	      
+	      /************************************************************************************************************/
+#ifdef _FMA		    
+	      __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+	      vv[0] = FMAMACC(vv[0], al, ev);		 
+#else
+	      vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+#endif
+	      _mm256_store_pd(&v[0],vv[0]);
+	      
+#ifdef _FMA		    
+	      ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+	      vv[1] = FMAMACC(vv[1], al, ev);		 
+#else
+	      vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+#endif
+	      _mm256_store_pd(&v[4],vv[1]);
+	      
+#ifdef _FMA		    
+	      ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+	      vv[2] = FMAMACC(vv[2], al, ev);		 
+#else
+	      vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+#endif
+	      _mm256_store_pd(&v[8],vv[2]);
+	      
+#ifdef _FMA		    
+	      ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+	      vv[3] = FMAMACC(vv[3], al, ev);		 
+#else
+	      vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+#endif
+	      _mm256_store_pd(&v[12],vv[3]);
+	      
+#ifdef _FMA		    
+	      ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+	      vv[4] = FMAMACC(vv[4], al, ev);		 
+#else
+	      vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+#endif
+	      _mm256_store_pd(&v[16],vv[4]);		 
+	    } 
+	}
+	
+      v = x3_gapColumn;
+      scale = 1;
+      __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);	 
+      
+      for(l = 0; scale && (l < 80); l += 4) 
+	{
+	  __m256d vv = _mm256_load_pd(&v[l]);
+	  __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+	  vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+	  if(_mm256_movemask_pd(vv_abs) != 15)
+	    scale = 0;	     
+	}
+
+      if(scale) 
+	{		     	      
+	  __m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+	  gapScaling = 1;
+
+	  for(l = 0; l < 80; l += 4) 
+	    {
+	      __m256d vv = _mm256_load_pd(&v[l]);
+	      _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+	    }
+	  
+	} 
+   
+     
+
+      for(i = 0; i < n; i++) 
+	{   
+	  
+	  if(x3_gap[i / 32] & mask32[i % 32])
+	    {	     
+	      if(gapScaling)
+		{
+		  if(useFastScaling)
+		    addScale += wgt[i];
+		  else
+		    ex3[i]  += 1; 	       
+		}
+	    }
+	  else
+	    {
+	      if(x1_gap[i / 32] & mask32[i % 32])
+		x1 = x1_gapColumn;
+	      else
+		{
+		  x1 = x1_ptr;
+		  x1_ptr += 80;
+		}
+
+	      if(x2_gap[i / 32] & mask32[i % 32])
+		x2 = x2_gapColumn;
+	      else
+		{
+		  x2 = x2_ptr;
+		  x2_ptr += 80;
+		}	   
+	  
+	      for(k = 0; k < 4; k++) 
+		{
+		  vl = &(x1[20 * k]);
+		  vr = &(x2[20 * k]);
+		  v  = &(x3_ptr[20 * k]);	      	   
+		  
+		  __m256d vv[5]; 
+		  
+		  vv[0] = _mm256_setzero_pd();
+		  vv[1] = _mm256_setzero_pd();
+		  vv[2] = _mm256_setzero_pd();
+		  vv[3] = _mm256_setzero_pd();
+		  vv[4] = _mm256_setzero_pd();
+		  
+		  for(l = 0; l < 20; l++) 
+		    {		  
+		      __m256d al = _mm256_setzero_pd();
+		      __m256d ar = _mm256_setzero_pd();
+		      
+		      __m256d leftv  = _mm256_load_pd(&left[k * 400 + l * 20 + 0]);
+		      __m256d rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 0]);
+		      __m256d vlv = _mm256_load_pd(&vl[0]);
+		      __m256d vrv = _mm256_load_pd(&vr[0]);
+		      
+#ifdef _FMA
+		      
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));		  
+#endif
+		      
+		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 4]);
+		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 4]);
+		      vlv = _mm256_load_pd(&vl[4]);
+		      vrv = _mm256_load_pd(&vr[4]);
+#ifdef _FMA
+		      
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+		      
+		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 8]);
+		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 8]);
+		      vlv = _mm256_load_pd(&vl[8]);
+		      vrv = _mm256_load_pd(&vr[8]);
+#ifdef _FMA
+		      
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+		      
+		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 12]);
+		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 12]);
+		      vlv = _mm256_load_pd(&vl[12]);
+		      vrv = _mm256_load_pd(&vr[12]);
+#ifdef _FMA
+		      
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+		      
+		      leftv = _mm256_load_pd(&left[k * 400 + l * 20 + 16]);
+		      rightv = _mm256_load_pd(&right[k * 400 + l * 20 + 16]);
+		      vlv = _mm256_load_pd(&vl[16]);
+		      vrv = _mm256_load_pd(&vr[16]);
+		      
+#ifdef _FMA		    
+		      al = FMAMACC(al, vlv, leftv);
+		      ar = FMAMACC(ar, vrv, rightv);
+#else
+		      al = _mm256_add_pd(al,_mm256_mul_pd(vlv,leftv));
+		      ar = _mm256_add_pd(ar,_mm256_mul_pd(vrv,rightv));
+#endif
+		      
+		      /**************************************************************************************************************/
+		      
+		      al = hadd3(al);
+		      ar = hadd3(ar);
+		      al = _mm256_mul_pd(ar,al);
+		      
+		      /************************************************************************************************************/
+#ifdef _FMA		    
+		      __m256d ev =  _mm256_load_pd(&extEV[20 * l + 0]);
+		      vv[0] = FMAMACC(vv[0], al, ev);		 
+#else
+		      vv[0] = _mm256_add_pd(vv[0],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 0])));			  		 		  
+#endif
+		      _mm256_store_pd(&v[0],vv[0]);
+		      
+#ifdef _FMA		    
+		      ev =  _mm256_load_pd(&extEV[20 * l + 4]);
+		      vv[1] = FMAMACC(vv[1], al, ev);		 
+#else
+		      vv[1] = _mm256_add_pd(vv[1],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 4])));		  		 
+#endif
+		      _mm256_store_pd(&v[4],vv[1]);
+		      
+#ifdef _FMA		    
+		      ev =  _mm256_load_pd(&extEV[20 * l + 8]);
+		      vv[2] = FMAMACC(vv[2], al, ev);		 
+#else
+		      vv[2] = _mm256_add_pd(vv[2],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 8])));		  		 
+#endif
+		      _mm256_store_pd(&v[8],vv[2]);
+		      
+#ifdef _FMA		    
+		      ev =  _mm256_load_pd(&extEV[20 * l + 12]);
+		      vv[3] = FMAMACC(vv[3], al, ev);		 
+#else
+		      vv[3] = _mm256_add_pd(vv[3],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 12])));		  		 
+#endif
+		      _mm256_store_pd(&v[12],vv[3]);
+		      
+#ifdef _FMA		    
+		      ev =  _mm256_load_pd(&extEV[20 * l + 16]);
+		      vv[4] = FMAMACC(vv[4], al, ev);		 
+#else
+		      vv[4] = _mm256_add_pd(vv[4],_mm256_mul_pd(al, _mm256_load_pd(&extEV[20 * l + 16])));			 	  
+#endif
+		      _mm256_store_pd(&v[16],vv[4]);		 
+		    }
+		}
+	      
+	      v = x3_ptr;
+	      scale = 1;
+	      
+	      __m256d minlikelihood_avx = _mm256_set1_pd(PLL_MINLIKELIHOOD);	 
+	      
+	      for(l = 0; scale && (l < 80); l += 4) 
+		{
+		  __m256d vv = _mm256_load_pd(&v[l]);
+		  __m256d vv_abs = _mm256_and_pd(vv,absMask_AVX.m);
+		  vv_abs = _mm256_cmp_pd(vv_abs,minlikelihood_avx,_CMP_LT_OS);
+		  if(_mm256_movemask_pd(vv_abs) != 15)
+		    scale = 0;	     
+		}
+	      
+	      if(scale) 
+		{		     	      
+		  __m256d PLL_TWOTOTHE256v = _mm256_set_pd(PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256,PLL_TWOTOTHE256);
+		  for(l = 0; l < 80; l += 4) 
+		    {
+		      __m256d vv = _mm256_load_pd(&v[l]);
+		      _mm256_store_pd(&v[l],_mm256_mul_pd(vv,PLL_TWOTOTHE256v));
+		    }
+		  if(useFastScaling)
+		    addScale += wgt[i];					
+		  else
+		    ex3[i] += 1;
+		}  
+	      x3_ptr += 80;
+	    }
+	}
+      break;
+    default:
+      assert(0);
+    }
+ 
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+}
diff --git a/pllrepo/src/bipartitionList.c b/pllrepo/src/bipartitionList.c
new file mode 100644
index 0000000..44c6888
--- /dev/null
+++ b/pllrepo/src/bipartitionList.c
@@ -0,0 +1,434 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file bipartitionList.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32  
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>  
+#endif
+
+#include <limits.h>
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+
+/* Forward declarations of the file-local helpers defined further down. */
+static pllBipartitionEntry *initEntry(void);
+static void getxnodeBips (nodeptr p);
+static void newviewBipartitions(unsigned int **bitVectors, 
+                                nodeptr p, 
+                                int numsp, 
+                                unsigned int vectorLength, 
+                                int processID);
+
+static void insertHashRF(unsigned int *bitVector, 
+                         pllHashTable *h, 
+                         unsigned int vectorLength, 
+                         int treeNumber, 
+                         int treeVectorLength, 
+                         hashNumberType position, 
+                         int support, 
+                         pllBoolean computeWRF);
+
+/* Table of 32 bit masks defined in another translation unit; it is indexed
+   below with (x % PLL_MASK_LENGTH) to address a single bit of a word. */
+extern const unsigned int mask32[32];
+
+
+/**
+ * Move the xBips mark onto node p.
+ *
+ * The two nodes reachable from p through ->next are inspected in order; the
+ * first one whose xBips is set hands its value over to p and is cleared.
+ * On return p->xBips is asserted to be set.
+ *
+ * @param p  node that has to own the xBips mark afterwards
+ */
+static void getxnodeBips (nodeptr p)
+{
+  nodeptr holder = p->next;
+
+  /* first neighbour does not carry the mark: fall through to the second */
+  if(!holder->xBips)
+    holder = holder->next;
+
+  if(holder->xBips)
+    {
+      p->xBips      = holder->xBips;
+      holder->xBips = 0;
+    }
+
+  assert(p->xBips);
+}
+
+
+/**
+ * Allocate a fresh bipartition entry.
+ *
+ * The entry is obtained from rax_malloc; its three vectors and its next
+ * pointer are set to NULL and its counters to 0.  The allocation result is
+ * asserted before it is written to, in line with the allocation checks done
+ * in initBitVector().
+ *
+ * @return pointer to the new entry; the caller owns it
+ */
+static pllBipartitionEntry *initEntry(void)
+{
+  pllBipartitionEntry * e = (pllBipartitionEntry *)rax_malloc(sizeof(pllBipartitionEntry));
+
+  /* fail loudly instead of writing through a NULL pointer */
+  assert(e);
+
+  e->bitVector     = (unsigned int*)NULL;
+  e->treeVector    = (unsigned int*)NULL;
+  e->supportVector = (int*)NULL;
+  e->bipNumber  = 0;
+  e->bipNumber2 = 0;
+  e->supportFromTreeset[0] = 0;
+  e->supportFromTreeset[1] = 0;
+  e->next       = (pllBipartitionEntry *)NULL;
+
+  return e;
+}
+
+/**
+ * Prune a bipartition hash table.
+ *
+ * For every entry, treeVector[0] is reduced to a single mask: mask 2 when
+ * state is 0, mask 1 when state is 1.  Entries whose treeVector[0] becomes 0
+ * are unlinked from their bucket and released together with their vectors
+ * and their hash item.  h->entries is decreased by the number of removed
+ * entries.
+ *
+ * @param h      hash table whose items carry pllBipartitionEntry data
+ * @param state  0 or 1 (asserted); selects which mask survives
+ */
+void cleanupHashTable(pllHashTable *h, int state)
+{
+  unsigned int
+    bucket,
+    visited = 0,
+    dropped = 0;
+
+  assert(state == 1 || state == 0);
+
+  for(bucket = 0; bucket < h->size; bucket++)
+    {
+      /* link addresses the pointer through which the current item is reached,
+         so unlinking works the same for the bucket head and for inner items */
+      pllHashItem **link = &(h->Items[bucket]);
+
+      while(*link)
+        {
+          pllHashItem         *item  = *link;
+          pllBipartitionEntry *entry = (pllBipartitionEntry *)(item->data);
+
+          if(state == 0)
+            {
+              entry->treeVector[0] = entry->treeVector[0] & 2;
+              assert(!(entry->treeVector[0] & 1));
+            }
+          else
+            {
+              entry->treeVector[0] = entry->treeVector[0] & 1;
+              assert(!(entry->treeVector[0] & 2));
+            }
+
+          visited++;
+
+          if(entry->treeVector[0] != 0)
+            {
+              /* entry survives: advance to the link stored inside it */
+              link = &(item->next);
+              continue;
+            }
+
+          /* entry is empty now: bypass it and release everything it owns */
+          *link = item->next;
+          dropped++;
+
+          if(entry->bitVector)     rax_free(entry->bitVector);
+          if(entry->treeVector)    rax_free(entry->treeVector);
+          if(entry->supportVector) rax_free(entry->supportVector);
+          rax_free(entry);
+          rax_free(item);
+        }
+    }
+
+  assert(visited ==  h->entries);
+  h->entries-= dropped;
+}
+
+
+
+
+
+
+
+
+
+
+
+/**
+ * Allocate the table of per-node bit vectors.
+ *
+ * The table has 2 * mxtips slots.  Slot 0 is not used by the loops below and
+ * is set to NULL.  Slots 1..mxtips get a zeroed vector with exactly one bit
+ * set (bit i-1 for slot i); slots mxtips+1..2*mxtips-1 get an uninitialised
+ * vector of the same length.
+ *
+ * @param mxtips        number of tips
+ * @param vectorLength  out: number of unsigned ints per vector, i.e. mxtips
+ *                      divided by PLL_MASK_LENGTH, rounded up
+ * @return the table; release the vectors with freeBitVectors() (which does
+ *         not release the table itself)
+ */
+unsigned int **initBitVector(int mxtips, unsigned int *vectorLength)
+{
+  unsigned int 
+    **bitVectors = (unsigned int **)rax_malloc(sizeof(unsigned int*) * 2 * (size_t)mxtips);
+  
+  int 
+    i;
+
+  /* the table is only dereferenced when it has at least one slot */
+  assert(mxtips <= 0 || bitVectors);
+
+  if(mxtips > 0)
+    bitVectors[0] = (unsigned int *)NULL;
+
+  if(mxtips % PLL_MASK_LENGTH == 0)
+    *vectorLength = mxtips / PLL_MASK_LENGTH;
+  else
+    *vectorLength = 1 + (mxtips / PLL_MASK_LENGTH); 
+  
+  for(i = 1; i <= mxtips; i++)
+    {
+      bitVectors[i] = (unsigned int *)rax_calloc((size_t)(*vectorLength), sizeof(unsigned int));
+      assert(bitVectors[i]);
+      bitVectors[i][(i - 1) / PLL_MASK_LENGTH] |= mask32[(i - 1) % PLL_MASK_LENGTH];
+    }
+  
+  for(i = mxtips + 1; i < 2 * mxtips; i++) 
+    {
+      bitVectors[i] = (unsigned int *)rax_malloc(sizeof(unsigned int) * (size_t)(*vectorLength));
+      assert(bitVectors[i]);
+    }
+
+  return bitVectors;
+}
+
+/**
+ * Release the vectors stored in slots 1..n-1 of a bit vector table.
+ * Slot 0 and the table v itself are left untouched.
+ *
+ * @param v  table of bit vectors
+ * @param n  number of slots in the table
+ */
+void freeBitVectors(unsigned int **v, int n)
+{
+  int slot = 1;
+
+  while(slot < n)
+    {
+      rax_free(v[slot]);
+      slot++;
+    }
+}
+
+
+/**
+ * Compute the bit vector and hash of node p from its two children.
+ *
+ * Nothing is done when p is a tip.  Otherwise the children are
+ * q = p->next->back and r = p->next->next->back; p obtains the xBips mark,
+ * p->hash becomes q->hash ^ r->hash, every non-tip child whose xBips is not
+ * set is computed recursively, and finally the vector of p is the word-wise
+ * OR of the vectors of both children.
+ *
+ * @param bitVectors    table of bit vectors indexed by node number
+ * @param p             node to compute
+ * @param numsp         forwarded to isTip() as the tip count
+ * @param vectorLength  number of unsigned ints per bit vector
+ * @param processID     asserted to be 0
+ */
+static void newviewBipartitions(unsigned int **bitVectors, 
+                                nodeptr p, 
+                                int numsp, 
+                                unsigned int vectorLength, 
+                                int processID)
+{
+  if(isTip(p->number, numsp))
+    return;
+  {
+    nodeptr 
+      q = p->next->back, 
+      r = p->next->next->back;
+
+    /* left/right are bound before q and r may be swapped below; this is
+       harmless because the OR that combines them is symmetric */
+    unsigned int
+      *vector = bitVectors[p->number],
+      *left   = bitVectors[q->number],
+      *right  = bitVectors[r->number],
+      i;
+
+    assert(processID == 0);
+
+    while(!p->xBips)
+      getxnodeBips(p);
+
+    p->hash = q->hash ^ r->hash;
+
+    /* two tips need no preparation; otherwise bring the inner children up to date */
+    if(!(isTip(q->number, numsp) && isTip(r->number, numsp)))
+      {
+        if(isTip(q->number, numsp) || isTip(r->number, numsp))
+          {
+            /* exactly one child is a tip: let r denote the inner one */
+            if(isTip(r->number, numsp))
+              {
+                nodeptr tmp = r;
+                r = q;
+                q = tmp;
+              }
+
+            while(!r->xBips)
+              newviewBipartitions(bitVectors, r, numsp, vectorLength, processID);
+          }
+        else
+          {
+            while((!r->xBips) || (!q->xBips))
+              {
+                if(!q->xBips)
+                  newviewBipartitions(bitVectors, q, numsp, vectorLength, processID);
+                if(!r->xBips)
+                  newviewBipartitions(bitVectors, r, numsp, vectorLength, processID);
+              }
+          }
+      }
+
+    for(i = 0; i < vectorLength; i++)
+      vector[i] = left[i] | right[i];
+  }
+}
+
+
+
+
+/**
+ * Record that tree treeNumber contains the bipartition bitVector.
+ *
+ * The bucket h->Items[position] is searched for an entry with an identical
+ * bit vector.  If one exists, the bit of treeNumber is set in its treeVector
+ * (and, when computeWRF is set, its supportVector slot is overwritten with
+ * support).  Otherwise a new entry holding a copy of bitVector is created
+ * and added to the table with pllHashAdd().
+ *
+ * @param bitVector         bipartition to insert (vectorLength words)
+ * @param h                 hash table
+ * @param vectorLength      number of unsigned ints in bitVector
+ * @param treeNumber        index of the tree the bipartition belongs to
+ * @param treeVectorLength  number of unsigned ints in a treeVector
+ * @param position          bucket index inside h->Items
+ * @param support           value stored in supportVector when computeWRF
+ * @param computeWRF        whether a supportVector is kept per entry
+ */
+static void insertHashRF(unsigned int *bitVector, 
+                         pllHashTable *h, 
+                         unsigned int vectorLength, 
+                         int treeNumber, 
+                         int treeVectorLength, 
+                         hashNumberType position, 
+                         int support, 
+                         pllBoolean computeWRF)
+{
+  pllBipartitionEntry * e;
+  pllHashItem * hitem;
+
+  /* validate the index before any supportVector slot is written */
+  assert(!computeWRF || (0 <= treeNumber && treeNumber < treeVectorLength * PLL_MASK_LENGTH));
+
+  /* an empty bucket simply yields zero iterations */
+  for (hitem = h->Items[position]; hitem; hitem = hitem->next)
+    { 
+      e = (pllBipartitionEntry *)(hitem->data);
+      
+      if (!memcmp(bitVector, e->bitVector, vectorLength * sizeof(unsigned int)))
+        {
+          e->treeVector[treeNumber / PLL_MASK_LENGTH] |= mask32[treeNumber % PLL_MASK_LENGTH];
+          if(computeWRF)
+            e->supportVector[treeNumber] = support;
+          return;
+        }
+    }
+
+  e = initEntry(); 
+       
+  /* initEntry() leaves bitVector NULL, so a failed allocation is detectable here */
+  rax_posix_memalign ((void **)&(e->bitVector), PLL_BYTE_ALIGNMENT, (size_t)vectorLength * sizeof(unsigned int));
+  assert(e->bitVector || vectorLength == 0);
+
+  /* the copy fills the whole buffer, no prior zeroing is required */
+  memcpy(e->bitVector, bitVector, sizeof(unsigned int) * vectorLength);
+
+  e->treeVector = (unsigned int*)rax_calloc((size_t)treeVectorLength, sizeof(unsigned int));
+  assert(e->treeVector);
+  e->treeVector[treeNumber / PLL_MASK_LENGTH] |= mask32[treeNumber % PLL_MASK_LENGTH];
+
+  if(computeWRF)
+    {
+      e->supportVector = (int*)rax_calloc((size_t)treeVectorLength * PLL_MASK_LENGTH, sizeof(int));
+      assert(e->supportVector);
+      e->supportVector[treeNumber] = support;
+    }
+  
+  pllHashAdd (h, position, NULL, (void *)e);
+}
+
+
+
+/**
+ * Post-order traversal that inserts the bipartitions below p into h.
+ *
+ * Tips are ignored.  For an inner node the subtrees behind all other nodes
+ * of its ring are visited first, then the bit vector of p is computed with
+ * newviewBipartitions().  If p->back is not a tip, the vector is inserted
+ * into bucket p->hash % h->size and *countBranches is incremented.
+ *
+ * traverseOnly and computeWRF are asserted to be false, and function must be
+ * PLL_BIPARTITIONS_RF (anything else hits assert(0)).  bInf is only passed
+ * along to the recursive calls.
+ */
+void bitVectorInitravSpecial(unsigned int **bitVectors, nodeptr p, int numsp, unsigned int vectorLength, pllHashTable *h, int treeNumber, int function, branchInfo *bInf, 
+                             int *countBranches, int treeVectorLength, pllBoolean traverseOnly, pllBoolean computeWRF, int processID)
+{
+  nodeptr
+    child;
+
+  unsigned int
+    *toInsert;
+
+  hashNumberType
+    position;
+
+  if(isTip(p->number, numsp))
+    return;
+
+  /* descend into every subtree hanging off the ring of p */
+  child = p->next;
+  do
+    {
+      bitVectorInitravSpecial(bitVectors, child->back, numsp, vectorLength, h, treeNumber, function, bInf, countBranches, treeVectorLength, traverseOnly, computeWRF, processID);
+      child = child->next;
+    }
+  while(child != p);
+
+  newviewBipartitions(bitVectors, p, numsp, vectorLength, processID);
+
+  assert(p->xBips);
+
+  assert(!traverseOnly);
+
+  /* nothing is inserted for a branch that leads to a tip */
+  if(isTip(p->back->number, numsp))
+    return;
+
+  toInsert = bitVectors[p->number];
+  position = p->hash % h->size;
+
+  assert(!(toInsert[0] & 1));
+  assert(!computeWRF);
+
+  if(function == PLL_BIPARTITIONS_RF)
+    {
+      insertHashRF(toInsert, h, vectorLength, treeNumber, treeVectorLength, position, 0, computeWRF);
+      *countBranches =  *countBranches + 1;
+    }
+  else
+    {
+      assert(0);
+    }
+}
+
+/**
+ * Fraction of bipartitions that occur in exactly one of two trees.
+ *
+ * Every entry of h is inspected; an entry is counted when exactly one of
+ * the masks 1 and 2 is set in treeVector[0].  The count is divided by
+ * 2 * (mxtips - 3).  Presumably this is the relative Robinson-Foulds
+ * distance between tree 0 and tree 1 -- confirm against the callers.
+ *
+ * NOTE(review): the divisor is 0 for mxtips == 3; callers are expected to
+ * pass more than 3 tips.
+ *
+ * @param h       hash table whose items carry pllBipartitionEntry data
+ * @param mxtips  number of tips
+ * @return the count described above divided by 2 * (mxtips - 3)
+ */
+double convergenceCriterion(pllHashTable *h, int mxtips)
+{
+  int      
+    rf = 0; 
+
+  unsigned int 
+    k = 0, 
+    entryCount = 0;
+  
+  double    
+    rrf;  
+
+  pllHashItem * hitem;
+
+  for(k = 0, entryCount = 0; k < h->size; k++)          
+    {      
+      /* the chain is walked through hitem->next only; e is local to one iteration */
+      for (hitem = h->Items[k]; hitem; hitem = hitem->next)
+       {
+         pllBipartitionEntry *e = hitem->data;
+         unsigned int *vector = e->treeVector;          
+
+         if(((vector[0] & 1) > 0) + ((vector[0] & 2) > 0) == 1)
+           rf++;        
+          
+         entryCount++;
+       }
+    }
+
+  assert(entryCount == h->entries);  
+  rrf = (double)rf/((double)(2 * (mxtips - 3)));  
+  return rrf;
+}
diff --git a/pllrepo/src/cycle.h b/pllrepo/src/cycle.h
new file mode 100644
index 0000000..889932a
--- /dev/null
+++ b/pllrepo/src/cycle.h
@@ -0,0 +1,516 @@
+/*
+ * Copyright (c) 2003, 2007-8 Matteo Frigo
+ * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+
+/* machine-dependent cycle counters code. Needs to be inlined. */
+
+/***************************************************************************/
+/* To use the cycle counters in your code, simply #include "cycle.h" (this
+   file), and then use the functions/macros:
+
+                 ticks getticks(void);
+
+   ticks is an opaque typedef defined below, representing the current time.
+   You extract the elapsed time between two calls to getticks() via:
+
+                 double elapsed(ticks t1, ticks t0);
+
+   which returns a double-precision variable in arbitrary units.  You
+   are not expected to convert this into human units like seconds; it
+   is intended only for *comparisons* of time intervals.
+
+   (In order to use some of the OS-dependent timer routines like
+   Solaris' gethrtime, you need to paste the autoconf snippet below
+   into your configure.ac file and #include "config.h" before cycle.h,
+   or define the relevant macros manually if you are not using autoconf.)
+*/
+
+/***************************************************************************/
+/* This file uses macros like HAVE_GETHRTIME that are assumed to be
+   defined according to whether the corresponding function/type/header
+   is available on your system.  The necessary macros are most
+   conveniently defined if you are using GNU autoconf, via the tests:
+   
+   dnl ---------------------------------------------------------------------
+
+   AC_C_INLINE
+   AC_HEADER_TIME
+   AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
+
+   AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif])
+
+   AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time])
+
+   dnl Cray UNICOS _rtc() (real-time clock) intrinsic
+   AC_MSG_CHECKING([for _rtc intrinsic])
+   rtc_ok=yes
+   AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
+#include <intrinsics.h>
+#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
+   AC_MSG_RESULT($rtc_ok)
+
+   dnl ---------------------------------------------------------------------
+*/
+
+/***************************************************************************/
+
+#ifdef TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# ifdef HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+
+
+
+/* Expands to the definition of elapsed(t1, t0) for platforms whose `ticks`
+   type converts to double: the result is (double)t1 - (double)t0.  INL is
+   the inline keyword spelling to place after `static`. */
+#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
+{									  \
+     return (double)t1 - (double)t0;					  \
+}
+
+/*----------------------------------------------------------------*/
+/* Solaris */
+#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
+/* ticks is the hrtime_t value produced by gethrtime() */
+typedef hrtime_t ticks;
+
+/* getticks() is a plain alias for gethrtime() */
+#define getticks gethrtime
+
+INLINE_ELAPSED(inline)
+
+/* keeps every later section of this header from defining another counter */
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* AIX v. 4+ routines to read the real-time clock or time-base register */
+#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
+/* ticks is the raw structure filled in by read_real_time() */
+typedef timebasestruct_t ticks;
+
+/* Sample the clock into a timebasestruct_t and return it by value. */
+static __inline ticks getticks(void)
+{
+     ticks t;
+     read_real_time(&t, TIMEBASE_SZ);
+     return t;
+}
+
+/* Both samples are first passed through time_base_to_time(); the difference
+   of the tb_high fields is scaled by 1e9 and added to the difference of the
+   tb_low fields.  t1 and t0 are by-value copies, so the callers' samples are
+   not modified. */
+static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
+{
+     time_base_to_time(&t1, TIMEBASE_SZ);
+     time_base_to_time(&t0, TIMEBASE_SZ);
+     return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + 
+	     ((double)t1.tb_low - (double)t0.tb_low));
+}
+
+/* keeps every later section of this header from defining another counter */
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PowerPC ``cycle'' counter using the time base register.
+ */
+#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__))))  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+/* Assemble a 64-bit value from the two 32-bit halves read with mftbu (upper)
+   and mftb (lower).  The upper half is read before and after the lower half
+   and the sequence is repeated until both upper reads agree -- presumably to
+   rule out a carry into the upper half between the reads. */
+static __inline__ ticks getticks(void)
+{
+     unsigned int tbl, tbu0, tbu1;
+
+     do {
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+	  __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+     } while (tbu0 != tbu1);
+
+     return (((unsigned long long)tbu0) << 32) | tbl;
+}
+
+INLINE_ELAPSED(__inline__)
+
+/* keeps every later section of this header from defining another counter */
+#define HAVE_TICK_COUNTER
+#endif
+
+/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime,
+   from Carbon, requires no additional libraries to be linked). */
+#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER)
+#include <mach/mach_time.h>
+/* NOTE(review): uint64_t is expected to be declared by the headers included
+   so far; this section does not include <stdint.h> itself. */
+typedef uint64_t ticks;
+/* getticks() is a plain alias for mach_absolute_time() */
+#define getticks mach_absolute_time
+INLINE_ELAPSED(__inline__)
+/* keeps every later section of this header from defining another counter */
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * Pentium cycle counter 
+ */
+#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+/* Execute rdtsc and return its result.  The "=A" output constraint binds
+   the 64-bit variable to the register pair the instruction writes on
+   32-bit x86. */
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("rdtsc": "=A" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+/* keeps every later section of this header from defining another counter */
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/* Visual C++ -- thanks to Morten Nissov for his help with this */
+#if defined(_MSC_VER) && _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER)
+#include <windows.h>
+/* ticks is a LARGE_INTEGER so that the two 32-bit halves can be stored
+   separately and read back as one QuadPart */
+typedef LARGE_INTEGER ticks;
+/* emits the two opcode bytes 0x0f 0x31 directly instead of using a mnemonic */
+#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
+
+/* Run the instruction emitted by RDTSC, then store edx into HighPart and
+   eax into LowPart of the returned value. */
+static __inline ticks getticks(void)
+{
+     ticks retval;
+
+     __asm {
+	  RDTSC
+	  mov retval.HighPart, edx
+	  mov retval.LowPart, eax
+     }
+     return retval;
+}
+
+/* Difference of the two 64-bit QuadPart values, each converted to double
+   before subtracting. */
+static __inline double elapsed(ticks t1, ticks t0)
+{  
+     return (double)t1.QuadPart - (double)t0.QuadPart;
+}  
+
+/* keeps every later section of this header from defining another counter */
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * X86-64 cycle counter
+ *
+ * Selected for GCC, ICC or Sun C on __x86_64__ when no earlier section
+ * has defined HAVE_TICK_COUNTER.
+ */
+#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+/* Execute rdtsc and assemble its two 32-bit halves into one 64-bit value. */
+static __inline__ ticks getticks(void)
+{
+     /* a receives eax (low half), d receives edx (high half) */
+     unsigned a, d; 
+     __asm volatile("rdtsc" : "=a" (a), "=d" (d)); 
+     /* widen before shifting so the high half is not lost */
+     return ((ticks)a) | (((ticks)d) << 32); 
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori.
+   NOTE: this code will fail to link unless you use the -Masmkeyword compiler
+   option (grrr). */
+#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) 
+typedef unsigned long long ticks;
+/* Run rdtsc and merge edx:eax into rax inside the asm string itself.
+   NOTE(review): the function has no return statement; it apparently relies
+   on the value left in %rax being taken as the return value -- confirm that
+   the PGI compiler guarantees this. */
+static ticks getticks(void)
+{
+    /* shift the high half up, zero-extend eax, then OR both halves into rax */
+    asm(" rdtsc; shl    $0x20,%rdx; mov    %eax,%eax; or     %rdx,%rax;    ");
+}
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Visual C++, courtesy of Dirk Michaelis */
+/* Selected for MSVC with _MSC_VER >= 1400 targeting x64 (_M_AMD64 or _M_X64)
+   when no earlier section has defined HAVE_TICK_COUNTER. */
+#if defined(_MSC_VER) && _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER)
+
+#include <intrin.h>
+#pragma intrinsic(__rdtsc)
+typedef unsigned __int64 ticks;
+/* no wrapper function: getticks() expands to a direct __rdtsc() call */
+#define getticks __rdtsc
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * IA64 cycle counter
+ */
+
+/* intel's icc/ecc compiler */
+/* Selected when __EDG_VERSION or __ECC is defined together with __ia64__
+   and no earlier section has defined HAVE_TICK_COUNTER. */
+#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+#include <ia64intrin.h>
+
+/* Return the register selected by _IA64_REG_AR_ITC, read through the
+   __getReg intrinsic. */
+static __inline__ ticks getticks(void)
+{
+     return __getReg(_IA64_REG_AR_ITC);
+}
+ 
+INLINE_ELAPSED(__inline__)
+ 
+#define HAVE_TICK_COUNTER
+#endif
+
+/* gcc */
+/* Selected for GCC on __ia64__ when no earlier section has defined
+   HAVE_TICK_COUNTER. */
+#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+/* Copy the ar.itc application register into a general register and return it. */
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     /* volatile keeps the compiler from dropping or merging repeated reads */
+     __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */
+/* Selected when __hpux and __ia64 are defined and no earlier section has
+   defined HAVE_TICK_COUNTER. */
+#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER)
+#include <machine/sys/inline.h>
+typedef unsigned long ticks;
+
+/* Return the register selected by _AREG_ITC, read through the
+   _Asm_mov_from_ar compiler intrinsic. */
+static __inline ticks getticks(void)
+{
+     ticks ret;
+
+     ret = _Asm_mov_from_ar (_AREG_ITC);
+     return ret;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Microsoft Visual C++ */
+/* Selected for MSVC targeting IA64 (_M_IA64) when no earlier section has
+   defined HAVE_TICK_COUNTER. */
+#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned __int64 ticks;
+
+/* declare the intrinsic by hand, with C linkage when compiled as C++ */
+#  ifdef __cplusplus
+extern "C"
+#  endif
+ticks __getReg(int whichReg);
+#pragma intrinsic(__getReg)
+
+/* Return the value of register number 3116 as read by __getReg.
+   NOTE(review): 3116 is presumably the numeric id of the interval time
+   counter (ar.itc) -- confirm against the compiler's IA64 register table. */
+static __inline ticks getticks(void)
+{
+     volatile ticks temp;
+     temp = __getReg(3116);
+     return temp;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PA-RISC cycle counter 
+ *
+ * The two spellings of the architecture macro are grouped in parentheses
+ * so that !defined(HAVE_TICK_COUNTER) applies to both of them.  Without
+ * the parentheses, && binds tighter than ||, and a compiler defining
+ * __hppa__ would enter this section even when an earlier section had
+ * already provided ticks/getticks, redefining them.
+ */
+#if (defined(__hppa__) || defined(__hppa)) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+#  ifdef __GNUC__
+/* GCC flavour: read control register 16 with the mfctl instruction. */
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("mfctl 16, %0": "=r" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+#  else
+#  include <machine/inline.h>
+/* Non-GCC flavour: same register, read through the _MFCTL macro/intrinsic
+   from <machine/inline.h>. */
+static __inline unsigned long getticks(void)
+{
+     register ticks ret;
+     _MFCTL(16, ret);
+     return ret;
+}
+#  endif
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* S390, courtesy of James Treacy */
+/* Selected for GCC on __s390__ when no earlier section has defined
+   HAVE_TICK_COUNTER. */
+#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+/* Have the stck instruction write its result to the address of a local
+   variable, then return that variable. */
+static __inline__ ticks getticks(void)
+{
+     ticks cycles;
+     /* the result arrives through memory, not through an output operand:
+        the address is passed as an input and "memory" is listed as clobbered */
+     __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
+     return cycles;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+/* Selected for GCC on __alpha__ when no earlier section has defined
+   HAVE_TICK_COUNTER. */
+#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
+/*
+ * The 32-bit cycle counter on alpha overflows pretty quickly, 
+ * unfortunately.  A 1GHz machine overflows in 4 seconds.
+ */
+typedef unsigned int ticks;
+
+/* Execute rpcc and return only the low 32 bits of its result. */
+static __inline__ ticks getticks(void)
+{
+     unsigned long cc;
+     __asm__ __volatile__ ("rpcc %0" : "=r"(cc));
+     /* the upper bits of the register are masked off before returning */
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* Selected for GCC on __sparc_v9__ when no earlier section has defined
+   HAVE_TICK_COUNTER. */
+#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+/* Read the %tick register with the rd instruction and return its value. */
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+     /* "%%" is the escape for a literal '%' inside an extended-asm template */
+     __asm__ __volatile__("rd %%tick, %0" : "=r" (ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* Selected for the DEC C / C++ compilers (__DECC or __DECCXX) on __alpha
+   when <c_asm.h> is available (HAVE_C_ASM_H) and no earlier section has
+   defined HAVE_TICK_COUNTER. */
+#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
+#  include <c_asm.h>
+typedef unsigned int ticks;
+
+/* Execute rpcc through the compiler's asm() intrinsic and return only the
+   low 32 bits of its result. */
+static __inline ticks getticks(void)
+{
+     unsigned long cc;
+     cc = asm("rpcc %v0");
+     /* the upper bits are masked off before returning */
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+/* SGI/Irix */
+/* Selected when HAVE_CLOCK_GETTIME and CLOCK_SGI_CYCLE are defined and no
+   earlier section has defined HAVE_TICK_COUNTER.
+   NOTE(review): struct timespec / clock_gettime need <time.h>, which is not
+   included in this section -- presumably included earlier; confirm. */
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
+typedef struct timespec ticks;
+
+/* Return the current CLOCK_SGI_CYCLE reading as a struct timespec.
+   The return value of clock_gettime() is not checked. */
+static __inline ticks getticks(void)
+{
+     struct timespec t;
+     clock_gettime(CLOCK_SGI_CYCLE, &t);
+     return t;
+}
+
+/* ticks is a struct here, so elapsed() is written out by hand:
+   it returns t1 - t0 in nanoseconds as a double. */
+static __inline double elapsed(ticks t1, ticks t0)
+{
+     return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 +
+	  ((double)t1.tv_nsec - (double)t0.tv_nsec);
+}
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* Cray UNICOS _rtc() intrinsic function */
+/* Selected when the build defines HAVE__RTC and no earlier section has
+   defined HAVE_TICK_COUNTER. */
+#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
+#ifdef HAVE_INTRINSICS_H
+#  include <intrinsics.h>
+#endif
+
+typedef long long ticks;
+
+/* no wrapper function: getticks() expands to a direct _rtc() call */
+#define getticks _rtc
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* MIPS ZBus */
+/* Selected when the build defines HAVE_MIPS_ZBUS_TIMER, the target is
+   __mips__, and no earlier section has defined HAVE_TICK_COUNTER. */
+#ifdef HAVE_MIPS_ZBUS_TIMER
+#if defined(__mips__) && !defined(HAVE_TICK_COUNTER)
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+typedef uint64_t ticks;
+
+/* Return the 64-bit value found at physical address 0x10030000, which is
+   mapped read-only from /dev/mem on the first successful call and cached in
+   a function-local static afterwards.
+
+   On failure (open or mmap), an error is reported with perror() and 0 is
+   returned.  The cached pointer is left unset in that case, so the next
+   call retries the mapping instead of dereferencing a failed mapping. */
+static __inline ticks getticks(void)
+{
+  static uint64_t* addr = 0;
+
+  if (addr == 0)
+  {
+    uint32_t rq_addr = 0x10030000;
+    void *map;
+    int fd;
+    int pgsize;
+
+    pgsize = getpagesize();
+    fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0);
+    if (fd < 0) {
+      perror("open");
+      return 0;
+    }
+    map = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr);
+    /* the mapping, if any, stays valid after the descriptor is closed */
+    close(fd);
+    if (map == MAP_FAILED) {
+      perror("mmap");
+      return 0;
+    }
+    /* publish the pointer only once the mapping is known to be good */
+    addr = (uint64_t *)map;
+  }
+
+  /* the location is a device register, so force a real load on every call */
+  return *(volatile uint64_t *)addr;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+#endif /* HAVE_MIPS_ZBUS_TIMER */
diff --git a/pllrepo/src/errcodes.h b/pllrepo/src/errcodes.h
new file mode 100644
index 0000000..ce81e68
--- /dev/null
+++ b/pllrepo/src/errcodes.h
@@ -0,0 +1,69 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file errcodes.h
+ */
+#ifndef ERRCODES_H
+#define ERRCODES_H
+
+/* Every replacement list that contains an operator is wrapped in
+   parentheses so that the macro expands to the same value regardless of
+   the operators surrounding it (e.g. PLL_NNI_Q_TIP + 1 or ~PLL_NNI_Q_TIP).
+   The numeric values themselves are unchanged. */
+
+#define PLL_ERROR_FILE_OPEN             1               /**< Error while opening file */
+#define PLL_ERROR_INVALID_FILETYPE      2               /**< Invalid fileType given at pllParseAlignmentFile */
+
+#define  PLL_NNI_P_TIP                  (1 << 0)        /**< Node p is a tip */
+#define  PLL_NNI_Q_TIP                  (1 << 1)        /**< Node p->back is a tip */
+
+#define  PLL_PARTITION_OUT_OF_BOUNDS    (1 << 0)    /**< Trying to access a partition index that is out of bounds */
+#define  PLL_BASE_FREQUENCIES_DO_NOT_SUM_TO_1 (1 << 1)    /**< base frequencies don't sum to 1.0 */
+
+#define PLL_LINKAGE_LIST_OUT_OF_BOUNDS (1 << 0)    /**< trying to link a partition index that is out of bounds */
+
+#define PLL_SUBSTITUTION_RATE_OUT_OF_BOUNDS (1 << 0) /**< trying to set a substitution rate to a value that is out of bounds */
+#define PLL_INVALID_Q_MATRIX_SYMMETRY       (1 << 1) /**< specifying an invalid parameter symmetry in the Q matrix */
+#define PLL_Q_MATRIX_SYMMETRY_OUT_OF_BOUNDS (1 << 2) /**< specifying a Q matrix symmetry that is out of bounds */
+
+#define PLL_UNKNOWN_MOLECULAR_DATA_TYPE (1 << 0) /**< PLL is trying to do something for an unknown data type */
+
+#define PLL_INCONSISTENT_SUBST_RATE_OPTIMIZATION_SETTING (1 << 0) /**< PLL detected an inconsistent setting for the Q matrix rate optimization */
+#define PLL_INCONSISTENT_Q_MATRIX_SYMMETRIES_ACROSS_LINKED_PARTITIONS (1 << 1) /**< Q matrix symmetry vector is not identical for linked partitions */
+#define PLL_INCONSISTENT_Q_MATRIX_ENTRIES_ACROSS_LINKED_PARTITIONS (1 << 2) /**< Q matrix entries are not identical for linked partitions */
+#define PLL_INCONSISTENT_ALPHA_STATES_ACROSS_LINKED_PARTITIONS (1 << 3) /**< alpha states are not identical across linked partitions */
+#define PLL_INCONSISTENT_ALPHA_VALUES_ACROSS_LINKED_PARTITIONS (1 << 4) /**< alpha values are not identical across linked partitions */
+#define PLL_INCONSISTENT_FREQUENCY_STATES_ACROSS_LINKED_PARTITIONS (1 << 5) /**< frequency states are not identical across linked partitions */
+#define PLL_INCONSISTENT_FREQUENCY_VALUES_ACROSS_LINKED_PARTITIONS (1 << 6) /**< frequency values are not identical across linked partitions */
+
+#define PLL_NEWICK_ROOTED_TREE          (1 << 0)        /**< @brief Binary root detected */
+#define PLL_NEWICK_BAD_STRUCTURE        (1 << 1)        /**< @brief Erroneous tree detected */
+
+
+
+#define PLL_ERROR_PHYLIP_HEADER_SYNTAX         5
+#define PLL_ERROR_PHYLIP_BODY_SYNTAX           6
+#define PLL_ERROR_FASTA_SYNTAX                 7
+
+
+
+
+#endif
diff --git a/pllrepo/src/evaluateGenericSpecial.c b/pllrepo/src/evaluateGenericSpecial.c
new file mode 100644
index 0000000..9a0dfc8
--- /dev/null
+++ b/pllrepo/src/evaluateGenericSpecial.c
@@ -0,0 +1,3321 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file evaluateGenericSpecial.c
+ *   
+ * @brief Functions for computing the log likelihood at a given branch of the tree (i.e. a virtual root that is placed at this branch)
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32 
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#ifdef __MIC_NATIVE
+#include "mic_native.h"
+#endif
+
+/* the set of functions in here computes the log likelihood at a given branch (the virtual root of a tree) */
+
+/* includes for using SSE3 intrinsics */
+
+#ifdef __SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+/*#include <tmmintrin.h>*/
+#endif
+
+
+/** @defgroup evaluateLikelihoodGroup Likelihood evaluation
+    
+    This set of functions deals with the evaluation of likelihood for the current topology
+*/
+
+
+
+
+
+
+
+/* below are the function headers for hard-to-read, highly optimized versions of the above functions 
+   for DNA and protein data that also use SSE3 intrinsics and implement some memory saving tricks.
+   The actual functions can be found at the end of this source file. 
+   All other likelihood function implementation files:
+
+   newviewGenericSpecial.c
+   makenewzSpecial.c
+   evaluatePartialGenericSpecial.c
+
+   are also structured like this.
+
+   To decide which set of function implementations to use you will have to undefine or define _OPTIMIZED_FUNCTIONS 
+   in the Makefile 
+   */
+#if (defined(__SSE3) || defined(__AVX))
+
+static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr,
+                                       double *x1, double *x2,  
+                                       double *tipVector[4], 
+                                       unsigned char *tipX1, int n, double *diagptable, const pllBoolean fastScaling,
+                                       double * lg4_weights);
+
+/* GAMMA for proteins with memory saving */
+
+static double evaluateGTRGAMMAPROT_GAPPED_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                                double *x1, double *x2,  
+                                                double *tipVector, 
+                                                unsigned char *tipX1, int n, double *diagptable, 
+                                                double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+
+/* GAMMA for proteins */
+
+static double evaluateGTRGAMMAPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                    double *x1, double *x2,  
+                                    double *tipVector, 
+                                    unsigned char *tipX1, int n, double *diagptable);
+
+/* CAT for proteins */
+
+static double evaluateGTRCATPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                  double *x1, double *x2, double *tipVector,
+                                  unsigned char *tipX1, int n, double *diagptable_start);
+
+
+/* CAT for proteins with memory saving */
+
+static double evaluateGTRCATPROT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                       double *x1, double *x2, double *tipVector,
+                                       unsigned char *tipX1, int n, double *diagptable_start, 
+                                       double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+/* analogous DNA functions */
+
+static double evaluateGTRCAT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                   double *x1_start, double *x2_start, double *tipVector,                     
+                                   unsigned char *tipX1, int n, double *diagptable_start,
+                                   double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static double evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                           double *x1_start, double *x2_start, 
+                                           double *tipVector, 
+                                           unsigned char *tipX1, const int n, double *diagptable,
+                                           double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static double evaluateGTRGAMMA(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                               double *x1_start, double *x2_start, 
+                               double *tipVector, 
+                               unsigned char *tipX1, const int n, double *diagptable);
+
+
+static double evaluateGTRCAT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                              double *x1_start, double *x2_start, double *tipVector,                  
+                              unsigned char *tipX1, int n, double *diagptable_start);
+
+
+#endif
+
+#if (defined(__AVX) || defined(__SSE3))
+static double evaluateGTRGAMMA_BINARY(int *ex1, int *ex2, int *wptr,
+                                      double *x1_start, double *x2_start, 
+                                      double *tipVector, 
+                                      unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling);
+
+static double evaluateGTRCAT_BINARY (int *ex1, int *ex2, int *cptr, int *wptr,
+                                     double *x1_start, double *x2_start, double *tipVector,                   
+                                     unsigned char *tipX1, int n, double *diagptable_start, const pllBoolean fastScaling);
+#endif
+
+
+/* 
+   global variables of pthreads version, reductionBuffer is the global array 
+   that is used for implementing deterministic reduction operations, that is,
+   the total log likelihood over the partial log likelihoods for the sites that each thread has computed 
+
+   NumberOfThreads is just the number of threads.
+
+   Note the volatile modifier here, that guarantees that the compiler will not do weird optimizations 
+   or rearrangements of the code accessing those variables, because it does not know that several concurrent threads 
+   will access those variables simultaneously 
+
+   UPDATE: reductionBuffer is now merged with globalResult
+   */
+
+
+/* a pre-computed 32-bit integer mask */
+
+extern const unsigned int mask32[32];
+
+/* the function below computes the P matrix from the decomposition of the Q matrix and the respective rate categories for a single partition */
+
+/** @brief Compute the diagonal of P matrix for a specific edge
+
+    This function computes the diagonal of P matrix for a branch of length \a z
+    from the decomposition of the Q matrix specified in \a EIGN and the respective
+    rate categories \a rptr for a single partition. The diagonal is then stored in
+    \a diagptable, one row of \a states entries per rate category:
+
+      diagptable[i * states + 0] = 1.0
+      diagptable[i * states + l] = exp(rptr[i] * EIGN[l] * log(z))   for l >= 1
+
+    where \a z is replaced by PLL_ZMIN when it is smaller than that bound.
+    The function performs no dynamic memory allocation, so it cannot fail.
+
+    @param z                  Length of edge
+    @param states             Number of states
+    @param numberOfCategories Number of categories in the rate heterogeneity rate arrays
+    @param rptr               Rate heterogeneity rate arrays (at least \a numberOfCategories entries are read)
+    @param EIGN               Eigenvalues (entries 1 .. \a states - 1 are read, entry 0 is not)
+    @param diagptable         Where to store the resulting P matrix (\a numberOfCategories * \a states entries are written)
+*/
+static void calcDiagptable(const double z, const int states, const int numberOfCategories, const double *rptr, const double *EIGN, double *diagptable)
+{
+  int 
+    i, 
+    l;
+
+  /* transform the root branch length to the log and check if it is not too small */
+
+  const double 
+    lz = (z < PLL_ZMIN) ? log(PLL_ZMIN) : log(z);
+
+  /* loop over the number of per-site or discrete gamma rate categories */
+
+  for(i = 0; i < numberOfCategories; i++)
+  {
+    double 
+      *row = &diagptable[i * states];
+
+    /* the first entry of every row is always 1.0 */
+    row[0] = 1.0;
+
+    /* the product EIGN[l] * lz is formed first, exactly as the former
+       scratch array did, so the rounding of the result is unchanged; the
+       handful of extra multiplications is cheaper than allocating and
+       freeing that array on every call, and an unchecked allocation
+       result can no longer be dereferenced */
+
+    for(l = 1; l < states; l++)
+      row[l] = exp(rptr[i] * (EIGN[l] * lz));
+  }
+}
+
+/** @brief Compute the diagonal of P matrix for a specific edge for the LG4 model
+
+    Fills \a diagptable with one row of \a numStates entries per rate category.
+    Row \a i uses its own eigenvalue array \a EIGN[i]:
+
+      diagptable[i * numStates + 0] = 1.0
+      diagptable[i * numStates + l] = exp(rptr[i] * EIGN[i][l] * log(z))   for l >= 1
+
+    where \a z is replaced by PLL_ZMIN when it is smaller than that bound.
+
+    @param z
+      Length of edge
+
+    @param numberOfCategories
+      Number of rate categories; \a EIGN[i] is read for every i below this value
+
+    @param rptr
+      Rate heterogeneity rate arrays
+
+    @param EIGN
+      Eigenvalues of the 4 Q matrices, one array per rate category
+
+    @param diagptable
+      Where to store the resulting P matrix
+
+    @param numStates
+      Number of states; must not exceed 64 (checked with assert)
+*/
+static void calcDiagptableFlex_LG4(double z, int numberOfCategories, double *rptr, double *EIGN[4], double *diagptable, const int numStates)
+{
+  int 
+    category, 
+    state;
+  
+  double 
+    logBranch;
+  
+  assert(numStates <= 64);
+  
+  /* log of the branch length, clamped from below at PLL_ZMIN */
+  logBranch = (z < PLL_ZMIN) ? log(PLL_ZMIN) : log(z);
+
+  for(category = 0; category < numberOfCategories; category++)
+    {
+      double
+        *row = diagptable + category * numStates;
+
+      /* the leading entry of every row is fixed */
+      row[0] = 1.0;
+
+      for(state = 1; state < numStates; state++)
+        row[state] = exp(rptr[category] * EIGN[category][state] * logBranch);
+    }
+}
+
+/* Fill the first numStates entries of tip with one code per state.
+
+   For 2 and 4 states the code of state k is the single-bit value 1 << k
+   (1, 2 and 1, 2, 4, 8 respectively); for every other state count the code
+   of state k is k itself.  Entries at index numStates and above are left
+   untouched.  numStates must lie in 2..32 (checked with assert). */
+static void ascertainmentBiasSequence(unsigned char tip[32], int numStates)
+{ 
+  int 
+    state;
+
+  assert(numStates <= 32 && numStates > 1);
+
+  if(numStates == 2 || numStates == 4)
+    {
+      for(state = 0; state < numStates; state++)
+        tip[state] = (unsigned char)(1 << state);
+    }
+  else
+    {
+      for(state = 0; state < numStates; state++)
+        tip[state] = (unsigned char)state;
+    }
+}
+
+/* Sum, over n sites, of |term| * PLL_MINLIKELIHOOD^(scaling count), where
+   term is the dot product over the states of the left vector, the right
+   vector and the first numStates entries of diagptable.
+
+   The right vector of site i is always x2[i * numStates ..].  If tipX1 is
+   non-NULL the left vector is the tipVector row selected by the i-th code
+   produced by ascertainmentBiasSequence() and the scaling count is ex2[i];
+   x1 and ex1 are not read.  Otherwise the left vector is
+   x1[i * numStates ..] and the scaling count is ex1[i] + ex2[i].
+   tipX1 is only tested against NULL; its contents are never read.
+
+   The local code table has 32 entries of which numStates are initialised,
+   so the tip case presumably expects n <= numStates -- TODO confirm with
+   the callers. */
+static double evaluateCatAsc(int *ex1, int *ex2,
+			     double *x1, double *x2,  
+			     double *tipVector, 
+			     unsigned char *tipX1, int n, double *diagptable, const int numStates)
+{
+  double
+    total = 0.0;
+
+  int
+    site,
+    state;
+
+  unsigned char
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+
+  for(site = 0; site < n; site++)
+    {
+      const double
+        *lhs = tipX1 ? &(tipVector[numStates * tip[site]]) : &(x1[site * numStates]),
+        *rhs = &(x2[site * numStates]);
+
+      /* how many times the vectors entering this site were scaled */
+      const int
+        scalings = tipX1 ? ex2[site] : (ex1[site] + ex2[site]);
+
+      double
+        dot = 0.0;
+
+      for(state = 0; state < numStates; state++)
+        dot += lhs[state] * rhs[state] * diagptable[state];
+
+#ifdef _DEBUG_ASC
+      if(tipX1)
+        {
+          if(ex2[site] > 0)
+            {
+              printf("s %d\n", ex2[site]);
+              assert(0);
+            }
+        }
+      else
+        {
+          if(ex2[site] > 0 || ex1[site] > 0)
+            {
+              printf("s %d %d\n", ex1[site], ex2[site]);
+              assert(0);
+            }
+        }
+#endif
+
+      /* assumes that pow behaves as expected/specified for underflows
+         from the man page:
+           If result underflows, and is not representable,
+           a range error occurs and 0.0 is returned.
+      */
+      total += fabs(dot) * pow(PLL_MINLIKELIHOOD, (double)scalings);
+    }
+
+  return total;
+}
+
+
+/* Sum, over n sites, of |term| * PLL_MINLIKELIHOOD^(scaling count), where
+   term accumulates, over 4 rate categories j and all states l, the product
+   left[l] * right[l] * diagptable[j * numStates + l].
+
+   Each site occupies 4 * numStates consecutive doubles in x2 (and in x1),
+   one run of numStates values per category.  If tipX1 is non-NULL the left
+   vector is the same tipVector row for all 4 categories, selected by the
+   i-th code produced by ascertainmentBiasSequence(), and the scaling count
+   is ex2[i]; x1 and ex1 are not read.  Otherwise the left vector is the
+   matching category run of x1 and the scaling count is ex1[i] + ex2[i].
+   tipX1 is only tested against NULL; its contents are never read.
+
+   The local code table has 32 entries of which numStates are initialised,
+   so the tip case presumably expects n <= numStates -- TODO confirm with
+   the callers. */
+static double evaluateGammaAsc(int *ex1, int *ex2,
+				double *x1, double *x2,  
+				double *tipVector, 
+				unsigned char *tipX1, int n, double *diagptable, const int numStates)
+{
+  double
+    total = 0.0;
+
+  int
+    site,
+    category,
+    state;
+
+  /* distance, in doubles, between the data of two consecutive sites */
+  const int
+    siteStride = numStates * 4;
+
+  unsigned char
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+
+  for(site = 0; site < n; site++)
+    {
+      const int
+        scalings = tipX1 ? ex2[site] : (ex1[site] + ex2[site]);
+
+      double
+        dot = 0.0;
+
+      for(category = 0; category < 4; category++)
+        {
+          const int
+            offset = siteStride * site + numStates * category;
+
+          const double
+            *lhs  = tipX1 ? &(tipVector[numStates * tip[site]]) : &(x1[offset]),
+            *rhs  = &(x2[offset]),
+            *diag = &(diagptable[category * numStates]);
+
+          for(state = 0; state < numStates; state++)
+            dot += lhs[state] * rhs[state] * diag[state];
+        }
+
+#ifdef _DEBUG_ASC
+      if(tipX1)
+        {
+          if(ex2[site] > 0)
+            {
+              printf("s %d\n", ex2[site]);
+              assert(0);
+            }
+        }
+      else
+        {
+          if(ex2[site] > 0 || ex1[site] > 0)
+            {
+              printf("s %d %d\n", ex1[site], ex2[site]);
+              assert(0);
+            }
+        }
+#endif
+
+      /* assumes that pow behaves as expected/specified for underflows
+         from the man page:
+           If result underflows, and is not representable,
+           a range error occurs and 0.0 is returned.
+      */
+      total += fabs(dot) * pow(PLL_MINLIKELIHOOD, (double)scalings);
+    }
+
+  return total;
+}
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the GAMMA model of rate heterogeneity
+
+    Computes the log likelihood of one partition at a virtual root placed on the
+    edge whose two end-points have the conditional likelihood vectors \a x1_start
+    and \a x2_start. The number of GAMMA rate categories is hard-coded as 4 and
+    every category contributes with weight 0.25. For each site the function forms
+
+      term = sum over categories j and states k of
+             left[..] * x2[j * states + k] * diagptable[j * states + k]
+
+    and uses \a log(0.25 * |term|) as the per-site log likelihood. The return
+    value is the sum of these values, each multiplied by its pattern weight
+    \a wptr[i].
+
+    @param fastScaling
+      If \b PLL_FALSE, the per-site log likelihood is additionally increased by
+      \a log(PLL_MINLIKELIHOOD) times the number of scaling events of the site
+      (\a ex2[i] when a tip is present, \a ex1[i] + \a ex2[i] otherwise).
+      If \b PLL_TRUE, neither \a ex1 nor \a ex2 is read.
+
+    @param ex1
+      Per-site scaling counts of the first end-point. Read only when
+      \a fastScaling is \b PLL_FALSE and \a tipX1 is NULL.
+
+    @param ex2
+      Per-site scaling counts of the second end-point. Read only when
+      \a fastScaling is \b PLL_FALSE.
+
+    @param wptr
+      Array holding the weight for each site in the compressed partition alignment
+
+    @param x1_start
+      Conditional likelihood vectors of the first end-point, 4 * \a states values
+      per site. Not read when \a tipX1 is non-NULL.
+
+    @param x2_start
+      Conditional likelihood vectors of the second end-point, 4 * \a states values
+      per site
+
+    @param tipVector
+      Lookup table with \a states values per row; row \a tipX1[i] replaces the
+      first end-point's vector for site \a i (in all 4 categories) when a tip is present
+
+    @param tipX1
+      If one of the two end-points of the edge is a tip, the per-site row indices
+      into \a tipVector for that tip; NULL if both end-points are inner nodes
+
+    @param n
+      Number of sites for which we are doing the evaluation
+
+    @param diagptable
+      Start of the array that contains the P-Matrix diagonal of the edge, 
+      \a states values for each of the 4 categories
+
+    @param states
+      Number of states
+
+    @param perSiteLikelihoods
+      Array to store per-site log likelihoods if \a getPerSiteLikelihoods is set to \b PLL_TRUE.
+      The stored values are not multiplied by the pattern weights.
+
+    @param getPerSiteLikelihoods
+      If set to \b PLL_TRUE then per-site log likelihoods are also stored in \a perSiteLikelihoods
+
+    @return
+      The weighted sum of the per-site log likelihoods
+*/
+static double evaluateGAMMA_FLEX(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                 double *x1_start, double *x2_start, 
+                                 double *tipVector, 
+                                 unsigned char *tipX1, const int n, double *diagptable, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods)
+{
+  double
+    total = 0.0;
+
+  int
+    site,
+    category,
+    state;
+
+  /* offset that leads from the values of one site to those of the next
+     site inside an inner likelihood array */
+
+  const int
+    span = states * 4;
+
+  /* a tip row is shared by all 4 categories, an inner vector has one run
+     of states values per category */
+
+  const int
+    leftStep = tipX1 ? 0 : states;
+
+  for(site = 0; site < n; site++)
+    {
+      /* left operand: the pre-computed tip row if there is a tip,
+         otherwise the inner vector of the first end-point */
+
+      const double
+        *left  = tipX1 ? &(tipVector[states * tipX1[site]]) : &(x1_start[span * site]),
+        *right = &(x2_start[span * site]);
+
+      double
+        term = 0.0,
+        siteLogLh;
+
+      for(category = 0; category < 4; category++)
+        {
+          const double
+            *l = left + category * leftStep,
+            *r = right + category * states,
+            *d = diagptable + category * states;
+
+          for(state = 0; state < states; state++)
+            term += l[state] * r[state] * d[state];
+        }
+
+      /* the 4 discrete GAMMA rates all have the same probability of 0.25 */
+
+      siteLogLh = log(0.25 * fabs(term));
+
+      if(!fastScaling)
+        {
+          const int
+            scalings = tipX1 ? ex2[site] : (ex1[site] + ex2[site]);
+
+          siteLogLh = siteLogLh + (scalings * log(PLL_MINLIKELIHOOD));
+        }
+
+      /* the stored value is the plain per-site log likelihood,
+         not multiplied with the pattern weight */
+
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[site] = siteLogLh;
+
+      total += wptr[site] * siteLogLh;
+    }
+
+  return total;
+} 
+
+#if (defined(__SSE3) || defined(__AVX))
+/** @ingroup evaluateLikelihoodGroup
+    @brief Memory saving variant of the generic (and slow) GAMMA log likelihood evaluation
+
+    Computes the log likelihood of one partition at a virtual root placed on the
+    edge whose two end-points have the conditional likelihood vectors \a x1_start
+    and \a x2_start, using 4 discrete GAMMA rate categories of equal weight.
+    In contrast to ::evaluateGAMMA_FLEX the inner vectors are stored compactly:
+    a site whose bit is set in the node's gap bitvector has no entry of its own
+    in \a x1_start / \a x2_start and is read from the node's gap column instead,
+    while the remaining sites are stored back to back.
+    If \a getPerSiteLikelihoods is set, the unweighted log likelihood of every
+    site is additionally written to \a perSiteLikelihoods.
+
+    @param fastScaling
+      If \b PLL_FALSE, the per-site scaling counts in \a ex1 / \a ex2 multiplied by
+      \a log(PLL_MINLIKELIHOOD) are added to the log likelihood of each site
+
+    @param ex1
+      Per-site scaling counts of the first end-point. Only read if \a fastScaling
+      is \b PLL_FALSE and \a tipX1 is \b NULL.
+
+    @param ex2
+      Per-site scaling counts of the second end-point. Only read if \a fastScaling
+      is \b PLL_FALSE.
+
+    @param wptr
+      Array holding the weight for each site in the compressed partition alignment
+
+    @param x1_start
+      Compact conditional likelihood vectors (non-gap sites only) of the first
+      end-point. Only read if \a tipX1 is \b NULL.
+
+    @param x2_start
+      Compact conditional likelihood vectors (non-gap sites only) of the second end-point
+
+    @param tipVector
+      Precomputed table with one row of \a states entries per possible character of
+      the current data type (i.e. 16 for DNA and 23 for AA). Only read if \a tipX1
+      is not \b NULL.
+
+    @param tipX1
+      If the first end-point is a tip, the encoded sequence of that tip (one
+      \a tipVector row index per site), otherwise \b NULL
+
+    @param n
+      Number of sites to evaluate. For the single-thread version this is the
+      number of sites in the current partition, for multi-threads this is the
+      number of sites assigned to the running thread from the current partition.
+
+    @param diagptable
+      P-Matrix diagonal of the edge, \a states entries for each of the 4 GAMMA categories
+
+    @param states
+      Number of states (4 for DNA, 20 for AA)
+
+    @param perSiteLikelihoods
+      Output array for the per-site log likelihoods, written only if
+      \a getPerSiteLikelihoods is set
+
+    @param getPerSiteLikelihoods
+      If set to \b PLL_TRUE the per-site log likelihoods are stored in \a perSiteLikelihoods
+
+    @param x1_gapColumn
+      Vector of 4 * \a states entries used for the first end-point at every site
+      flagged in \a x1_gap
+
+    @param x2_gapColumn
+      Vector of 4 * \a states entries used for the second end-point at every site
+      flagged in \a x2_gap
+
+    @param x1_gap
+      Gap bitvector of the first end-point (one bit per site, 32 sites per word)
+
+    @param x2_gap
+      Gap bitvector of the second end-point (one bit per site, 32 sites per word)
+
+    @return
+      The evaluated log likelihood of the tree topology
+
+    @todo
+      Add a brief description of how the gap columns and gap bitvectors are produced
+*/
+static double evaluateGAMMA_FLEX_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                      double *x1_start, double *x2_start, 
+                                      double *tipVector, 
+                                      unsigned char *tipX1, const int n, double *diagptable, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods,
+                                      double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  /* Number of doubles a non-gap site occupies in a compact inner vector. */
+  const int
+    perSite = 4 * states;
+
+  /* Read cursors into the compact inner vectors; a cursor only moves forward
+     when a non-gap site has been consumed. */
+  double
+    *nextLeft  = x1_start,
+    *nextRight = x2_start,
+    logLh      = 0.0;
+
+  int
+    site;
+
+  if(tipX1)
+    {
+      /* Tip / inner-node case: the first end-point comes from the tip lookup
+         table, so only the second end-point needs the gap handling. */
+      for(site = 0; site < n; site++)
+        {
+          double
+            *tipRow = &(tipVector[states * tipX1[site]]),
+            *right,
+            acc = 0.0,
+            siteLogLh;
+
+          int
+            cat,
+            s;
+
+          if(x2_gap[site / 32] & mask32[site % 32])
+            right = x2_gapColumn;
+          else
+            {
+              right = nextRight;
+              nextRight += perSite;
+            }
+
+          /* accumulate over the 4 rate categories and all states */
+          for(cat = 0; cat < 4; cat++)
+            {
+              const int
+                base = cat * states;
+
+              for(s = 0; s < states; s++)
+                acc += tipRow[s] * right[base + s] * diagptable[base + s];
+            }
+
+          /* 0.25 is the weight of each of the 4 rate categories */
+          if(fastScaling)
+            siteLogLh = log(0.25 * fabs(acc));
+          else
+            siteLogLh = log(0.25 * fabs(acc)) + (ex2[site] * log(PLL_MINLIKELIHOOD));
+
+          /* the stored per-site value is NOT multiplied by the pattern weight */
+          if(getPerSiteLikelihoods)
+            perSiteLikelihoods[site] = siteLogLh;
+
+          logLh += wptr[site] * siteLogLh;
+        }
+
+      return logLh;
+    }
+
+  /* Inner / inner case: both end-points are compact vectors with their own
+     gap bitvector and gap column. */
+  for(site = 0; site < n; site++)
+    {
+      double
+        *left,
+        *right,
+        acc = 0.0,
+        siteLogLh;
+
+      int
+        cat,
+        s;
+
+      if(x1_gap[site / 32] & mask32[site % 32])
+        left = x1_gapColumn;
+      else
+        {
+          left = nextLeft;
+          nextLeft += perSite;
+        }
+
+      if(x2_gap[site / 32] & mask32[site % 32])
+        right = x2_gapColumn;
+      else
+        {
+          right = nextRight;
+          nextRight += perSite;
+        }
+
+      for(cat = 0; cat < 4; cat++)
+        {
+          const int
+            base = cat * states;
+
+          for(s = 0; s < states; s++)
+            acc += left[base + s] * right[base + s] * diagptable[base + s];
+        }
+
+      if(fastScaling)
+        siteLogLh = log(0.25 * fabs(acc));
+      else
+        siteLogLh = log(0.25 * fabs(acc)) + ((ex1[site] + ex2[site])*log(PLL_MINLIKELIHOOD));
+
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[site] = siteLogLh;
+
+      logLh += wptr[site] * siteLogLh;
+    }
+
+  return logLh;
+} 
+#endif
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the CAT model of rate heterogeneity
+
+    Computes the log likelihood of one partition at a virtual root placed on the
+    edge whose two end-points have the conditional likelihood vectors \a x1 and
+    \a x2. Under CAT every site uses exactly one rate category, selected through
+    \a cptr, so each site stores \a states values per inner node.
+    If \a getPerSiteLikelihoods is set, the unweighted log likelihood of every
+    site is additionally written to \a perSiteLikelihoods.
+
+    @param fastScaling
+      If \b PLL_FALSE, the per-site scaling counts in \a ex1 / \a ex2 multiplied by
+      \a log(PLL_MINLIKELIHOOD) are added to the log likelihood of each site
+
+    @param ex1
+      Per-site scaling counts of the first end-point. Only read if \a fastScaling
+      is \b PLL_FALSE and \a tipX1 is \b NULL.
+
+    @param ex2
+      Per-site scaling counts of the second end-point. Only read if \a fastScaling
+      is \b PLL_FALSE.
+
+    @param cptr
+      Array holding the rate category index for each site in the compressed partition alignment
+
+    @param wptr
+      Array holding the weight for each site in the compressed partition alignment
+
+    @param x1
+      Conditional likelihood vectors of the first end-point. Only read if \a tipX1 is \b NULL.
+
+    @param x2
+      Conditional likelihood vectors of the second end-point
+
+    @param tipVector
+      Precomputed table with one row of \a states entries per possible character of
+      the current data type (i.e. 16 for DNA and 23 for AA). Only read if \a tipX1
+      is not \b NULL.
+
+    @param tipX1
+      If the first end-point is a tip, the encoded sequence of that tip (one
+      \a tipVector row index per site), otherwise \b NULL
+
+    @param n
+      Number of sites to evaluate. For the single-thread version this is the number of sites in the
+      current partition, for multi-threads this is the number of sites assigned to the running thread from the current partition.
+
+    @param diagptable_start
+      P-Matrix diagonals of the edge, \a states entries for each category of the CAT model
+
+    @param states
+      Number of states (4 for DNA, 20 for AA)
+
+    @param perSiteLikelihoods
+      Output array for the per-site log likelihoods, written only if
+      \a getPerSiteLikelihoods is set
+
+    @param getPerSiteLikelihoods
+      If set to \b PLL_TRUE the per-site log likelihoods are stored in \a perSiteLikelihoods
+
+    @return
+      The evaluated log likelihood of the tree topology
+*/
+static double evaluateCAT_FLEX (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                double *x1, double *x2, double *tipVector,
+                                unsigned char *tipX1, int n, double *diagptable_start, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods)
+{
+  double
+    logLh = 0.0;
+
+  int
+    site;
+
+  if(tipX1)
+    {
+      /* Tip / inner-node case: the first end-point is looked up in tipVector,
+         x1 and ex1 are not touched on this path. */
+      for(site = 0; site < n; site++)
+        {
+          /* each site has a single P-matrix diagonal, picked by its category */
+          double
+            *tipRow  = &(tipVector[states * tipX1[site]]),
+            *inner   = &(x2[states * site]),
+            *pDiag   = &diagptable_start[states * cptr[site]],
+            acc      = 0.0,
+            siteLogLh;
+
+          int
+            s;
+
+          /* no summation over rate categories here, only over the states */
+          for(s = 0; s < states; s++)
+            acc += tipRow[s] * inner[s] * pDiag[s];
+
+          if(fastScaling)
+            siteLogLh = log(fabs(acc));
+          else
+            siteLogLh = log(fabs(acc)) + (ex2[site] * log(PLL_MINLIKELIHOOD));
+
+          /* the stored per-site value is NOT multiplied by the pattern weight */
+          if(getPerSiteLikelihoods)
+            perSiteLikelihoods[site] = siteLogLh;
+
+          /* a site pattern may stand for several identical alignment columns
+             of the same partition, hence the weighting */
+          logLh += wptr[site] * siteLogLh;
+        }
+
+      return logLh;
+    }
+
+  /* Inner / inner case: both end-points are per-site likelihood vectors. */
+  for(site = 0; site < n; site++)
+    {
+      double
+        *lhs   = &x1[states * site],
+        *rhs   = &x2[states * site],
+        *pDiag = &diagptable_start[states * cptr[site]],
+        acc    = 0.0,
+        siteLogLh;
+
+      int
+        s;
+
+      for(s = 0; s < states; s++)
+        acc += lhs[s] * rhs[s] * pDiag[s];
+
+      if(fastScaling)
+        siteLogLh = log(fabs(acc));
+      else
+        siteLogLh = log(fabs(acc)) + ((ex1[site] + ex2[site]) * log(PLL_MINLIKELIHOOD));
+
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[site] = siteLogLh;
+
+      logLh += wptr[site] * siteLogLh;
+    }
+
+  return logLh;
+} 
+
+#if (defined(__SSE3) || defined(__AVX))
+/** @ingroup evaluateLikelihoodGroup
+    @brief A generic (and slow) implementation of log likelihood evaluation of a tree using the CAT model of rate heterogeneity with memory saving
+
+    Same computation as ::evaluateCAT_FLEX, but the inner vectors \a x1 and \a x2
+    are stored compactly: a site flagged by ::isGap in the node's gap bitvector
+    has no entry of its own and is read from the node's gap column instead, while
+    the remaining sites are stored back to back with \a states entries each.
+    Please check ::evaluateCAT_FLEX for a description of the common input parameters.
+
+    @param x1_gapColumn
+      Vector of \a states entries used for the first end-point at every site
+      flagged in \a x1_gap
+
+    @param x2_gapColumn
+      Vector of \a states entries used for the second end-point at every site
+      flagged in \a x2_gap
+
+    @param x1_gap
+      Gap bitvector of the first end-point. Only read if \a tipX1 is \b NULL.
+
+    @param x2_gap
+      Gap bitvector of the second end-point
+
+    @return
+      The evaluated log likelihood of the tree topology
+
+    @todo
+      Describe how the gap columns and gap bitvectors are produced
+*/
+static double evaluateCAT_FLEX_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                     double *x1, double *x2, double *tipVector,
+                                     unsigned char *tipX1, int n, double *diagptable_start, const int states, double *perSiteLikelihoods, pllBoolean getPerSiteLikelihoods,
+                                     double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  /* Read cursors into the compact inner vectors; a cursor only moves forward
+     when a non-gap site has been consumed. */
+  double
+    *nextLeft  = x1,
+    *nextRight = x2,
+    logLh      = 0.0;
+
+  int
+    site;
+
+  if(tipX1)
+    {
+      /* Tip / inner-node case: the first end-point comes from the tip lookup
+         table, so only the second end-point needs the gap handling. */
+      for(site = 0; site < n; site++)
+        {
+          double
+            *tipRow = &(tipVector[states * tipX1[site]]),
+            *rhs,
+            *pDiag,
+            acc = 0.0,
+            siteLogLh;
+
+          int
+            s;
+
+          if(isGap(x2_gap, site))
+            rhs = x2_gapColumn;
+          else
+            {
+              rhs = nextRight;
+              nextRight += states;
+            }
+
+          /* each site has a single P-matrix diagonal, picked by its category */
+          pDiag = &diagptable_start[states * cptr[site]];
+
+          /* no summation over rate categories here, only over the states */
+          for(s = 0; s < states; s++)
+            acc += tipRow[s] * rhs[s] * pDiag[s];
+
+          if(fastScaling)
+            siteLogLh = log(fabs(acc));
+          else
+            siteLogLh = log(fabs(acc)) + (ex2[site] * log(PLL_MINLIKELIHOOD));
+
+          /* the stored per-site value is NOT multiplied by the pattern weight */
+          if(getPerSiteLikelihoods)
+            perSiteLikelihoods[site] = siteLogLh;
+
+          /* a site pattern may stand for several identical alignment columns
+             of the same partition, hence the weighting */
+          logLh += wptr[site] * siteLogLh;
+        }
+
+      return logLh;
+    }
+
+  /* Inner / inner case: both end-points are compact vectors with their own
+     gap bitvector and gap column. */
+  for(site = 0; site < n; site++)
+    {
+      double
+        *lhs,
+        *rhs,
+        *pDiag,
+        acc = 0.0,
+        siteLogLh;
+
+      int
+        s;
+
+      if(isGap(x1_gap, site))
+        lhs = x1_gapColumn;
+      else
+        {
+          lhs = nextLeft;
+          nextLeft += states;
+        }
+
+      if(isGap(x2_gap, site))
+        rhs = x2_gapColumn;
+      else
+        {
+          rhs = nextRight;
+          nextRight += states;
+        }
+
+      pDiag = &diagptable_start[states * cptr[site]];
+
+      for(s = 0; s < states; s++)
+        acc += lhs[s] * rhs[s] * pDiag[s];
+
+      if(fastScaling)
+        siteLogLh = log(fabs(acc));
+      else
+        siteLogLh = log(fabs(acc)) + ((ex1[site] + ex2[site]) * log(PLL_MINLIKELIHOOD));
+
+      if(getPerSiteLikelihoods)
+        perSiteLikelihoods[site] = siteLogLh;
+
+      logLh += wptr[site] * siteLogLh;
+    }
+
+  return logLh;
+} 
+#endif
+
+
+/* This is the core function for computing the log likelihood at a branch */
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluate the log likelihood of a specific branch of the topology
+    
+    Evaluates the likelihood of the tree topology assuming a virtual root is
+    placed at the edge whose end-points are node with number \a pNumber and \a
+    qNumber in the first slot of the traversal descriptor. The function first
+    computes the conditional likelihoods for all necessary nodes (the ones in
+    the traversal descriptor list) by calling the function \a pllNewviewIterative
+    and then evaluates the likelihood at the root. In addition, if \a
+    getPerSiteLikelihoods is set to \b PLL_TRUE, the per-site likelihoods are
+    stored in \a tr->lhs.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param getPerSiteLikelihoods
+      If set to \b PLL_TRUE, compute the log likelihood for each site. 
+
+    @note
+      This is an internal function and should not be called by the user. It assumes
+      that a valid traversal descriptor has already been computed. The edge given by
+      the first entry of the traversal descriptor may either lead to a tip (p or q is
+      a tip) or connect two inner nodes; both cases are handled.
+*/
+void pllEvaluateIterative(pllInstance *tr, partitionList *pr, pllBoolean getPerSiteLikelihoods)
+{
+  /* the branch lengths and node indices of the virtual root branch are always the first one that 
+     are stored in the very important traversal array data structure that describes a partial or full tree traversal */
+
+  /* get the branch length at the root */
+  double 
+    *pz = tr->td[0].ti[0].qz;   
+
+  /* get the node number of the node to the left and right of the branch that defines the virtual rooting */
+
+  int    
+    pNumber = tr->td[0].ti[0].pNumber, 
+    qNumber = tr->td[0].ti[0].qNumber, 
+    p_slot,
+    q_slot,
+    model;
+  
+  pllBoolean
+    fastScaling = tr->fastScaling;
+
+  /* the slots are the entries in xVector where the LH vector is available */
+  if(tr->useRecom)
+    {
+      p_slot = tr->td[0].ti[0].slot_p;
+      q_slot = tr->td[0].ti[0].slot_q;
+    }
+  else
+    {
+      p_slot = pNumber - tr->mxtips - 1;
+      q_slot = qNumber - tr->mxtips - 1;
+    }
+  
+  /* before we can compute the likelihood at the virtual root, we need to do a partial or full tree traversal to compute 
+     the conditional likelihoods of the vectors as specified in the traversal descriptor. Maintaining this tarversal descriptor consistent 
+     will unfortunately be the responsibility of users. This is tricky, if as planned for here, we use a rooted view (described somewhere in Felsenstein's book)
+     for the conditional vectors with respect to the tree
+     */
+
+  /* iterate over all valid entries in the traversal descriptor */
+
+  pllNewviewIterative(tr, pr, 1);
+
+  /* after the above call we are sure that we have properly and consistently computed the 
+     conditionals to the right and left of the virtual root and we can now invoke the 
+     the log likelihood computation */
+
+  /* we need to loop over all partitions. Note that we may have a mix of DNA, protein binary data etc partitions */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {    
+      /* whats' the number of sites of this partition (at the current thread) */
+      int           
+        width = pr->partitionData[model]->width;
+      
+      /* 
+         Important part of the tarversal descriptor: 
+         figure out if we need to recalculate the likelihood of this 
+         partition: 
+         
+         The reasons why this is important in terms of performance are given in this paper 
+         here which you should actually read:
+         
+         A. Stamatakis, M. Ott: "Load Balance in the Phylogenetic Likelihood Kernel". Proceedings of ICPP 2009, accepted for publication, Vienna, Austria, September 2009
+         
+         The width > 0 check is for checking if under the cyclic data distribution of per-partition sites to threads this thread does indeed have a site 
+         of the current partition.
+         
+      */
+
+      if(tr->td[0].executeModel[model] && width > 0)
+        {       
+          int 
+#if (defined(__SSE3) || defined(__AVX))
+            rateHet = (int)discreteRateCategories(tr->rateHetModel),
+#endif
+            categories,
+            ascWidth = pr->partitionData[model]->states,
+            
+            /* get the number of states in the partition, e.g.: 4 = DNA, 20 = Protein */
+            
+            states = pr->partitionData[model]->states,
+            *ex1 = NULL,
+            *ex2 = NULL,
+            *ex1_asc = NULL,
+            *ex2_asc = NULL;
+          
+          double 
+            *rateCategories = (double*)NULL,
+            z, 
+            partitionLikelihood = 0.0,
+            *x1_start           = NULL,
+            *x2_start           = NULL,
+            *diagptable         = NULL,
+            *x1_start_asc       = NULL,
+            *x2_start_asc       = NULL;
+
+#if (defined(__SSE3) || defined(__AVX))
+          double
+            *x1_gapColumn = (double*)NULL,
+            *x2_gapColumn = (double*)NULL;
+#endif
+          
+#if (defined(__SSE3) || defined(__AVX))
+          unsigned int
+            *x1_gap = (unsigned int*)NULL,
+            *x2_gap = (unsigned int*)NULL;       
+#endif
+          
+          unsigned char 
+            *tip = (unsigned char*)NULL;          
+          
+          /* 
+             figure out if we are using the CAT or GAMMA model of rate heterogeneity 
+             and set pointers to the rate heterogeneity rate arrays and also set the 
+             number of distinct rate categories appropriately.
+             
+             Under GAMMA this is constant and hard-coded as 4, weheras under CAT 
+             the number of site-wise rate categories can vary in the course of computations 
+             up to a user defined maximum value of site categories (default: 25)
+          */
+
+          if(tr->rateHetModel == PLL_CAT)
+            {        
+              rateCategories = pr->partitionData[model]->perSiteRates;
+              categories = pr->partitionData[model]->numberOfCategories;
+            }
+          else  /* GAMMA */
+            {        
+              rateCategories = pr->partitionData[model]->gammaRates;
+              categories = 4;
+            }
+          
+          /* set this pointer to the memory area where space has been reserved a priori for storing the 
+             P matrix at the root */
+          
+          diagptable = pr->partitionData[model]->left;
+          
+          /* figure out if we need to address tip vectors (a char array that indexes into a precomputed tip likelihood 
+             value array) or if we need to address inner vectors */
+          
+          /* either node p or node q is a tip */
+          
+          if(isTip(pNumber, tr->mxtips) || isTip(qNumber, tr->mxtips))
+            {                       
+              /* q is a tip */
+              
+              if(isTip(qNumber, tr->mxtips))
+                {       
+                  /* get the start address of the inner likelihood vector x2 for partition model,
+                     note that inner nodes are enumerated/indexed starting at 0 to save allocating some 
+                     space for additional pointers */
+
+                  x2_start = pr->partitionData[model]->xVector[p_slot];
+                  
+                  /* get the corresponding tip vector */
+                  
+                  tip      = pr->partitionData[model]->yVector[qNumber];
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+                  if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
+#else
+                  if (pr->partitionData[model]->ascBias)
+#endif
+                   {
+                     x2_start_asc  = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                     ex2_asc       = &pr->partitionData[model]->ascExpVector[(pNumber - tr->mxtips - 1) * ascWidth];
+                   }
+
+                  
+                  /* memory saving stuff, let's deal with this later or ask Fernando ;-) */
+                  
+#if (defined(__SSE3) || defined(__AVX))
+                  if(tr->saveMemory)
+                    {
+                      x2_gap         = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
+                      x2_gapColumn   = &(pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet]);
+                    }
+#endif
+                  /* per site likelihood scaling */
+
+                  if(!fastScaling)                  
+                    ex2 = pr->partitionData[model]->expVector[p_slot];              
+                }           
+              else
+                {       
+                  /* p is a tip, same as above */
+                  
+                  x2_start = pr->partitionData[model]->xVector[q_slot];
+                  tip = pr->partitionData[model]->yVector[pNumber];
+
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+                  if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
+#else
+                  if (pr->partitionData[model]->ascBias)
+#endif
+                   {
+                     x2_start_asc  = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                     ex2_asc       = &pr->partitionData[model]->ascExpVector[(qNumber - tr->mxtips - 1) * ascWidth];
+                   }
+                  
+#if (defined(__SSE3) || defined(__AVX))
+                  if(tr->saveMemory)
+                    {
+                      x2_gap         = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
+                      x2_gapColumn   = &(pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet]);
+                    }
+#endif
+
+                  /* per site likelihood scaling */
+
+                  if(!fastScaling)                  
+                    ex2 = pr->partitionData[model]->expVector[q_slot];             
+                }
+            }
+          else
+            {  
+              
+              assert(p_slot != q_slot);
+              /* neither p nor q are tips, hence we need to get the addresses of two inner vectors */
+              
+              x1_start = pr->partitionData[model]->xVector[p_slot];
+              x2_start = pr->partitionData[model]->xVector[q_slot];
+              
+              /* memory saving option */
+              
+#if (defined(__SSE3) || defined(__AVX))
+              if(tr->saveMemory)
+                {
+                  x1_gap = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
+                  x2_gap = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
+                  x1_gapColumn   = &pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet];
+                  x2_gapColumn   = &pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet];
+                }
+#endif
+                      
+              /* per site likelihood scaling */
+
+              if(!fastScaling)
+                {
+                  ex1      = pr->partitionData[model]->expVector[p_slot];
+                  ex2      = pr->partitionData[model]->expVector[q_slot];     
+                }
+              
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+              if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
+#else
+              if (pr->partitionData[model]->ascBias)
+#endif
+               {
+                 x1_start_asc  = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                 x2_start_asc  = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+
+                 ex1_asc       = &pr->partitionData[model]->ascExpVector[(pNumber - tr->mxtips - 1) * ascWidth];
+                 ex2_asc       = &pr->partitionData[model]->ascExpVector[(qNumber - tr->mxtips - 1) * ascWidth];
+               }
+
+
+
+            }
+          
+          
+          /* if we are using a per-partition branch length estimate, the branch has an index, otherwise, for a joint branch length
+             estimate over all partitions we just use the branch length value with index 0 */
+          
+          if(pr->perGeneBranchLengths)
+            z = pz[model];
+          else
+            z = pz[0];
+          
+          /* calc P-Matrix at root for branch z connecting nodes p and q */
+          
+          if(pr->partitionData[model]->dataType == PLL_AA_DATA && (pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X))
+            calcDiagptableFlex_LG4(z, 4, pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN_LG4, diagptable, 20);
+          else
+            calcDiagptable(z, states, categories, rateCategories, pr->partitionData[model]->EIGN, diagptable);
+          
+#if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
+          
+          /* generic slow functions, memory saving option is not implemented for these */
+          
+          assert(!tr->saveMemory);
+          
+          /* decide whether CAT or GAMMA is used and compute the log likelihood */
+          if(tr->rateHetModel == PLL_CAT)
+            partitionLikelihood = evaluateCAT_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt, 
+                                                x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, getPerSiteLikelihoods);
+          else
+            partitionLikelihood = evaluateGAMMA_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, getPerSiteLikelihoods);
+#else
+   
+          /* if we want to compute the per-site likelihoods, we use the generic evaluate function implementations 
+             for this, because the slowdown is not that dramatic */
+
+          if(getPerSiteLikelihoods)
+            {         
+#ifdef __MIC_NATIVE
+                          // not supported on MIC!
+                          assert(0 && "Per-site LH calculations is not implemented on Intel MIC");
+#else
+               if(tr->rateHetModel == PLL_CAT)
+                {
+                   if(tr->saveMemory)
+                     partitionLikelihood = evaluateCAT_FLEX_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                 x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                 tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE,
+                                                                 x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+                   else
+                     partitionLikelihood = evaluateCAT_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                            x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                            tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE);
+                }
+              else
+                {
+                  if(tr->saveMemory)
+                    partitionLikelihood = evaluateGAMMA_FLEX_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                  x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                  tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE, 
+                                                                  x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);              
+                  else
+                    partitionLikelihood = evaluateGAMMA_FLEX(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                             x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                             tip, width, diagptable, states, pr->partitionData[model]->perSiteLikelihoods, PLL_TRUE);
+                }
+#endif
+            }
+          else
+            {
+              /* for the optimized functions we have a dedicated, optimized function implementation 
+                 for each rate heterogeneity and data type combination, we switch over the number of states 
+                 and the rate heterogeneity model */
+              
+              switch(states)
+                {         
+                case 2: /* binary */
+                  assert (!tr->saveMemory);
+                  if (tr->rateHetModel == PLL_CAT)
+                   {
+                     partitionLikelihood =  evaluateGTRCAT_BINARY(ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                  x1_start, x2_start, pr->partitionData[model]->tipVector, 
+                                                                  tip, width, diagptable, fastScaling);
+                   }
+                  else
+                   {
+                     partitionLikelihood = evaluateGTRGAMMA_BINARY(ex1, ex2, pr->partitionData[model]->wgt,
+                                                                   x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                   tip, width, diagptable, fastScaling);                 
+                   }
+                  break;
+                case 4: /* DNA */
+                  {
+
+#ifdef __MIC_NATIVE
+
+                  /* CAT & memory saving are not supported on MIC */
+
+                  assert(!tr->saveMemory);
+                  assert(tr->rateHetModel == PLL_GAMMA);
+
+                  partitionLikelihood =  evaluateGTRGAMMA_MIC(ex1, ex2, pr->partitionData[model]->wgt,
+                                              x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                              tip, width, diagptable, fastScaling);
+#else
+                    if(tr->rateHetModel == PLL_CAT)
+                      {                           
+                        if(tr->saveMemory)
+                          partitionLikelihood =  evaluateGTRCAT_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                     x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                     tip, width, diagptable, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+                        else
+                          partitionLikelihood =  evaluateGTRCAT(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                tip, width, diagptable);
+                      }
+                    else
+                      {         
+                        if(tr->saveMemory)                 
+                          partitionLikelihood =  evaluateGTRGAMMA_GAPPED_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                              x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                              tip, width, diagptable,
+                                                                              x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);                  
+                        else
+                          partitionLikelihood =  evaluateGTRGAMMA(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                  x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                  tip, width, diagptable);                                
+                      }
+#endif
+                  }
+                  break;                                   
+                case 20: /* proteins */
+                  {
+
+#ifdef __MIC_NATIVE
+
+                  /* CAT & memory saving are not supported on MIC */
+
+                  assert(!tr->saveMemory);
+                  assert(tr->rateHetModel == PLL_GAMMA);
+
+                  if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                    partitionLikelihood =  evaluateGTRGAMMAPROT_LG4_MIC(pr->partitionData[model]->wgt,
+                                                                    x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
+                                                                    tip, width, diagptable, pr->partitionData[model]->lg4x_weights);
+                  else
+                        partitionLikelihood =  evaluateGTRGAMMAPROT_MIC(ex1, ex2, pr->partitionData[model]->wgt,
+                                              x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                              tip, width, diagptable, fastScaling);
+
+//                  printf("tip: %p, width: %d,  lh: %f\n", tip, width, partitionLikelihood);
+//                  int g;
+//                  if (x1_start)
+//                                        for (g = 0; g < 20; ++g)
+//                                                printf("%f \t", x1_start[g]);
+//                  printf("\n");
+//                  if (x2_start)
+//                                        for (g = 0; g < 20; ++g)
+//                                                printf("%f \t", x2_start[g]);
+#else
+
+                      if(tr->rateHetModel == PLL_CAT)
+                      {                           
+                        if(tr->saveMemory)
+                          partitionLikelihood = evaluateGTRCATPROT_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                        x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                        tip, width, diagptable,  x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+                        else
+                          partitionLikelihood = evaluateGTRCATPROT(fastScaling, ex1, ex2, pr->partitionData[model]->rateCategory, pr->partitionData[model]->wgt,
+                                                                   x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                   tip, width, diagptable);               
+                      }
+                    else
+                      {                                               
+                        if(tr->saveMemory)
+                          partitionLikelihood = evaluateGTRGAMMAPROT_GAPPED_SAVE(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                                 x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                                 tip, width, diagptable,
+                                                                                 x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+                        else
+                      {
+                        if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                          partitionLikelihood =  evaluateGTRGAMMAPROT_LG4((int *)NULL, (int *)NULL, pr->partitionData[model]->wgt,
+                                                                          x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
+                                                                          tip, width, diagptable, PLL_TRUE, pr->partitionData[model]->lg4x_weights);
+                        else
+                          partitionLikelihood = evaluateGTRGAMMAPROT(fastScaling, ex1, ex2, pr->partitionData[model]->wgt,
+                                                                     x1_start, x2_start, pr->partitionData[model]->tipVector,
+                                                                     tip, width, diagptable);           
+                      }
+                      }
+#endif
+                  }
+                  break;                            
+                default:
+                  assert(0);        
+                }
+            }
+#endif
+              
+          /* check that there was no major numerical screw-up, the log likelihood should be < 0.0 always */
+          
+          assert(partitionLikelihood < 0.0);
+          
+          /* now here is a nasty part, for each partition and each node we maintain an integer counter to count how often 
+             how many entries per node were scaled by a constant factor. Here we use this information generated during Felsenstein's 
+             pruning algorithm by the newview() functions to undo the preceding scaling multiplications at the root, for mathematical details 
+             you should actually read:
+             
+             A. Stamatakis: "Orchestrating the Phylogenetic Likelihood Function on Emerging Parallel Architectures". 
+             In B. Schmidt, editor, Bioinformatics: High Performance Parallel Computer Architectures, 85-115, CRC Press, Taylor & Francis, 2010.
+             
+             There's a copy of this book in my office 
+          */
+          
+          if(fastScaling)
+            partitionLikelihood += (pr->partitionData[model]->globalScaler[pNumber] + pr->partitionData[model]->globalScaler[qNumber]) * log(PLL_MINLIKELIHOOD);
+          
+          /* now we have the correct log likelihood for the current partition after undoing scaling multiplications */           
+          
+          /* finally, we also store the per partition log likelihood which is important for optimizing the alpha parameter 
+             of this partition for example */
+
+          /* asc bias stuff */
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+          if (tr->threadID == 0 && pr->partitionData[model]->ascBias)
+#else
+          if (pr->partitionData[model]->ascBias)
+#endif
+           {
+             size_t
+               i;
+             
+             int        
+               w = 0;
+             
+             double                                
+               correction;
+
+             switch(tr->rateHetModel)
+               {
+               case PLL_CAT:
+                 {
+                   double 
+                     rates = 1.0;
+                   
+                   //need to re-calculate P-matrix for the correction here assuming a rate of 1.0 
+                   calcDiagptable(z, states, 1, &rates, pr->partitionData[model]->EIGN, diagptable);
+                   
+                   
+                   correction = evaluateCatAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector,
+                                               tip, ascWidth, diagptable, ascWidth);
+                 }
+                 break;
+               case PLL_GAMMA:                       
+                 correction = evaluateGammaAsc(ex1_asc, ex2_asc, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector,
+                                               tip, ascWidth, diagptable, ascWidth);
+                 break;
+               default:
+                 assert(0);
+               }
+             
+             
+             
+             for(i = (size_t)pr->partitionData[model]->lower; i < (size_t)pr->partitionData[model]->upper; i++)
+               w += tr->aliaswgt[i];
+
+             partitionLikelihood = partitionLikelihood - (double)w * log(1.0 - correction);                  
+              
+           }
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+          if(!(pr->partitionData[model]->ascBias && tr->threadID == 0))
+           {
+#endif
+             if(partitionLikelihood >= 0.0)
+               {
+                 printf("positive log like: %f for partition %d\n", partitionLikelihood, model);
+                 assert(0);
+               }
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+           }
+#endif
+
+          
+          pr->partitionData[model]->partitionLH = partitionLikelihood;
+        }
+      else
+        {
+          /* if the current thread does not have a single site of this partition
+             it is important to set the per partition log like to 0.0 because 
+             of the reduction operation that will take place later-on.
+             That is, the values of tr->perPartitionLH across all threads 
+             need to be in a consistent state, always !
+          */
+          
+          if(width == 0)            
+            pr->partitionData[model]->partitionLH = 0.0;
+        }
+    }
+
+
+#ifdef DEBUG_PERSITE_LNL
+  /* per persite-stuff */
+  {
+    int model = 0; 
+    for(model = 0; model < pr->numberOfPartitions ; ++model)
+      {
+        int j= 0; 
+        pInfo *partition  =  pr->partitionData[model]; 
+        for(j = 0;  j < partition->width; ++j)
+          printf("[%d] lnl[%d]=%f\n", tr->threadID, j, partition->perSiteLikelihoods[j]); 
+
+      }
+  }
+
+#endif
+}
+
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluate the log likelihood of the tree topology
+
+    Evaluate the log likelihood of the tree topology of instance \a tr by
+    assuming a virtual root between nodes \a p and \a p->back. If
+    \a fullTraversal is set to \b PLL_TRUE then the log likelihood vectors for
+    each node are recomputed from scratch.
+
+    The function returns nothing: the sum of the per-partition values
+    (\a pr->partitionData[model]->partitionLH) is stored in \a tr->likelihood.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Specifies the virtual root, which is assumed to be a (virtual node) connecting \a p and \a p->back
+
+    @param fullTraversal
+      If set to \b PLL_TRUE, then the likelihood vectors at all nodes are recomputed, otherwise only the
+      necessary vectors (those that are not oriented in the right direction) are recomputed.
+
+    @param getPerSiteLikelihoods
+      Also compute and store (in \a tr->lhs) the log likelihood of each site of the (compressed) alignment
+
+    @note
+      If \a getPerSiteLikelihoods is set to \b PLL_TRUE, then make sure that \a tr->fastScaling is set to
+      \b PLL_FALSE, otherwise an assertion will fail.
+
+    @note
+      If \a fullTraversal is set to \b PLL_TRUE, an assertion requires the node \a p->back->back
+      to be a tip.
+
+    @note
+      If \a getPerSiteLikelihoods is set to \b PLL_TRUE, an assertion checks that the weighted sum of
+      \a tr->lhs agrees with \a tr->likelihood up to an absolute difference of 0.00001.
+*/
+void pllEvaluateLikelihood (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean fullTraversal, pllBoolean getPerSiteLikelihoods)
+{
+  /* now this may be the entry point of the library to compute 
+     the log like at a branch defined by p and p->back == q */
+
+  volatile double 
+    result = 0.0;
+
+  nodeptr 
+    q = p->back; 
+  
+
+  pllBoolean
+        p_recom = PLL_FALSE, /* if one of the two vectors was missing, we will need to force recomputation */
+        q_recom = PLL_FALSE;
+
+  /* numBranches is the number of partitions when per-partition branch lengths
+     are in use, and 1 otherwise */
+  int
+    i,
+    model,
+    numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions : 1;
+
+  /* if evaluate shall return the per-site log likelihoods 
+     fastScaling needs to be disabled, otherwise this will 
+     not work */
+
+  if(getPerSiteLikelihoods)          
+    assert(!(tr->fastScaling)); 
+
+  /* set the first entry of the traversal descriptor to contain the indices
+     of nodes p and q */
+
+  tr->td[0].ti[0].pNumber = p->number;
+  tr->td[0].ti[0].qNumber = q->number;          
+
+  /* copy the branch lengths of the tree into the first entry of the traversal descriptor.
+     if per-partition branch lengths are not used, numBranches is 1 */
+
+  for(i = 0; i < numBranches; i++)
+    tr->td[0].ti[0].qz[i] =  q->z[i];
+
+  /* recom part: for every inner node among p and q, obtain its slot via getxVector()
+     and record it in the traversal descriptor; the value returned by getxVector()
+     is kept in p_recom/q_recom and forces a traversal from that node further below */
+  if(tr->useRecom)
+  {
+    int slot = -1;
+    if(!isTip(q->number, tr->mxtips))
+    {
+      q_recom = getxVector(tr->rvec, q->number, &slot, tr->mxtips);
+      tr->td[0].ti[0].slot_q = slot;
+    }
+    if(!isTip(p->number, tr->mxtips))
+    {
+      p_recom = getxVector(tr->rvec, p->number, &slot, tr->mxtips);
+      tr->td[0].ti[0].slot_p = slot;
+    }
+    /* two inner nodes must never share a slot */
+    if(!isTip(p->number, tr->mxtips) &&  !isTip(q->number, tr->mxtips))
+      assert(tr->td[0].ti[0].slot_q != tr->td[0].ti[0].slot_p);
+  }
+
+
+  /* now compute how many conditionals must be re-computed/re-oriented by newview
+     to be able to calculate the likelihood at the root defined by p and q.
+     */
+
+  /* one entry in the traversal descriptor is already used, hence set the traversal length counter to 1 */
+  tr->td[0].count = 1;
+
+  if(fullTraversal)
+  {
+    /* a full traversal is only started from q, and q->back is required to be a tip */
+    assert(isTip(q->back->number, tr->mxtips));
+    computeTraversal(tr, q, PLL_FALSE, numBranches);
+  }
+  else
+  {
+    /* partial traversal: only descend from a node if its vector was missing
+       (p_recom/q_recom) or needsRecomp() asks for it */
+    if(p_recom || needsRecomp(tr->useRecom, tr->rvec, p, tr->mxtips))
+      computeTraversal(tr, p, PLL_TRUE, numBranches);
+
+    if(q_recom || needsRecomp(tr->useRecom, tr->rvec, q, tr->mxtips))
+      computeTraversal(tr, q, PLL_TRUE, numBranches);
+  }
+
+
+  /* now we copy this partition execute mask into the traversal descriptor which must come from the 
+     calling program, the logic of this should not form part of the library */
+
+  storeExecuteMaskInTraversalDescriptor(tr, pr);
+
+  /* also store in the traversal descriptor that something has changed i.e., in the parallel case that the 
+     traversal descriptor list of nodes needs to be broadcast once again */
+
+  tr->td[0].traversalHasChanged = PLL_TRUE;
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+  /* now here we enter the fork-join region for Pthreads */
+
+
+  /* start the parallel region and tell all threads to compute the log likelihood for 
+     their fraction of the data. This call is implemented in the case switch of execFunction in axml.c
+     */
+  if(getPerSiteLikelihoods)
+    {
+      /* tr->lhs is zeroed before the per-site job is dispatched to the threads */
+      memset(tr->lhs, 0, sizeof(double) * tr->originalCrunchedLength); 
+      pllMasterBarrier(tr, pr, PLL_THREAD_EVALUATE_PER_SITE_LIKES);
+    }
+  else
+    pllMasterBarrier (tr, pr, PLL_THREAD_EVALUATE);
+
+  /* and now here we explicitly do the reduction operation , that is add over the 
+     per-thread and per-partition log likelihoods to obtain the overall log like 
+     over all sites and partitions */
+
+ 
+  /* 
+     for unpartitioned data that's easy, we just sum over the log likes computed 
+     by each thread, thread 0 stores his results in reductionBuffer[0] thread 1 in 
+     reductionBuffer[1] and so on 
+     */
+
+  /* This reduction for the partitioned case is more complicated because each thread 
+     needs to store the partial log like of each partition and we then need to collect 
+     and add everything */
+
+#else
+  /* and here is just the sequential case, we directly call pllEvaluateIterative() above 
+     without having to tell the threads/processes that they need to compute this function now */
+
+  pllEvaluateIterative(tr, pr, getPerSiteLikelihoods); //PLL_TRUE
+
+  /*
+    if we want to obtain per-site log likelihoods they have initially been stored 
+     in arrays that are associated to the partition, now we 
+     copy them into the vector tr->lhs[].
+     We may also choose that the user needs to provide an array, but this can be decided later-on.
+  */
+
+  if(getPerSiteLikelihoods) //PLL_TRUE
+    {
+      /* each partition's block of width doubles goes to offset lower of tr->lhs */
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        memcpy(&(tr->lhs[pr->partitionData[model]->lower]), pr->partitionData[model]->perSiteLikelihoods, pr->partitionData[model]->width  * sizeof(double));
+    }
+
+#endif
+
+  /* in both build variants the overall log likelihood is the sum of the per-partition values */
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    result += pr->partitionData[model]->partitionLH;
+
+  /* set the tree data structure likelihood value to the total likelihood */
+
+  tr->likelihood = result;    
+
+  /* the code below is mainly for testing if the per-site log 
+     likelihoods we have stored in tr->lhs yield the same 
+     likelihood as the likelihood we computed. 
+     For numerical reasons we need to make a dirty PLL_ABS(difference) < epsilon
+     comparison */
+     
+  if(getPerSiteLikelihoods) //PLL_TRUE
+    {
+      double 
+        likelihood = 0;
+      int i; /* NOTE(review): shadows the function-level i declared above */
+
+      /* note that in tr->lhs, we just store the likelihood of 
+         one representative of a potentially compressed pattern,
+         hence, we need to multiply the elements with the pattern 
+         weight vector */
+
+
+      for(i = 0; i < tr->originalCrunchedLength; i++)
+        {
+//          printf("lhs[%d]=%f * %d\n", i, tr->lhs[i], tr->aliaswgt[i]); 
+          likelihood += (tr->lhs[i]   * tr->aliaswgt[i] );
+        }
+         
+      /* NOTE(review): this branch is empty because its diagnostic printf is commented out;
+         the assert right after it performs the actual check */
+      if( PLL_ABS(tr->likelihood - likelihood) > 0.00001)
+        {
+  //        printf("likelihood was %f\t summed/weighted per-site-lnl was %f\n", tr->likelihood, likelihood); 
+        }
+
+        assert(PLL_ABS(tr->likelihood - likelihood) < 0.00001);
+    }
+
+
+  /* release p and q in the recomputation vector again; presumably this undoes the
+     pinning performed by getxVector() above -- TODO confirm against unpinNode() */
+  if(tr->useRecom)
+  {
+    unpinNode(tr->rvec, p->number, tr->mxtips);
+    unpinNode(tr->rvec, q->number, tr->mxtips);
+  }
+
+  /* do some bookkeeping to have traversalHasChanged in a consistent state */
+
+  tr->td[0].traversalHasChanged = PLL_FALSE;
+}
+
+
+/** @brief Compute the log likelihood of every site and store it in a caller-provided buffer
+
+    The tree is first evaluated with a full traversal through pllEvaluateLikelihood()
+    (rooted at \a tr->start, without per-site output), which updates \a tr->likelihood
+    and the traversal descriptor. Afterwards the per-site values are computed:
+
+    - threaded/MPI builds: a parallel region is started with
+      \b PLL_THREAD_PER_SITE_LIKELIHOODS and \a tr->originalCrunchedLength doubles
+      are copied from \a tr->lhs into \a logLikelihoods;
+    - sequential builds: evaluatePartialGeneric() is called for every site index
+      between \a lower (inclusive) and \a upper (exclusive) of each partition, and
+      an assertion checks that the sum of all values agrees with \a tr->likelihood
+      up to an absolute difference of 0.00001.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param logLikelihoods
+      Output buffer; entry \a i receives the value computed for site index \a i.
+      It must be large enough for all site indices of all partitions
+      (\a tr->originalCrunchedLength doubles in threaded/MPI builds).
+*/
+void perSiteLogLikelihoods(pllInstance *tr, partitionList *pr, double *logLikelihoods)
+{
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+  double 
+    accumulatedPerSiteLikelihood = 0.0;
+
+  size_t
+    localCount,
+    i,
+    lower,
+    upper;
+  int model;
+#endif
+  /* compute the likelihood of the tree with the standard function to:
+     1. obtain the current score for error checking
+     2. store a full tree traversal in the traversal descriptor that 
+     will then be used for calculating per-site log likelihoods 
+     for each site individually and independently */
+
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+  /* now compute per-site log likelihoods using the respective functions */
+
+#if (defined( _USE_PTHREADS ) || defined(_FINE_GRAIN_MPI))
+  /* here we need a barrier to invoke a parallel region that calls 
+     function 
+     perSiteLogLikelihoodsPthreads(tree *tr, partitionList *pr, double *lhs, int n, int tid)
+     defined above and subsequently collects the per-site log likelihoods 
+     computed by the threads and stored in local per-thread memory 
+     and stores them in buffer tr->lhs.
+     This corresponds to a gather operation in MPI.
+     */
+
+  pllMasterBarrier (tr, pr, PLL_THREAD_PER_SITE_LIKELIHOODS);
+
+  /* 
+     when the parallel region has terminated, the per-site log likelihoods 
+     are stored in array tr->lhs of the master thread which we copy to the result buffer
+  */
+  
+  memcpy(logLikelihoods, tr->lhs, sizeof(double) * tr->originalCrunchedLength);
+
+#else
+
+  /* sequential case: just loop over all partitions and compute per site log likelihoods */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+  {
+    lower = pr->partitionData[model]->lower;
+    upper = pr->partitionData[model]->upper;
+
+    /* i is the global site index, localCount the index relative to the partition start */
+    for(i = lower, localCount = 0; i < upper; i++, localCount++)
+    {
+      /* initialized so that a build with assertions disabled (NDEBUG) stores a
+         defined value if the default branch below is ever reached, instead of
+         reading an uninitialized variable */
+      double 
+        l = 0.0;
+
+      /* 
+         we need to switch over rate heterogeneity implementations here.
+         when we have PSR we actually need to provide the per-site rate 
+         to the function evaluatePartialGeneric() that computes the 
+         per-site log likelihood.
+         Under GAMMA, the rate will just be ignored, here we just set it to 1.0
+         */
+
+      switch(tr->rateHetModel)
+      {
+        case PLL_CAT:
+          l = evaluatePartialGeneric (tr, pr, i, pr->partitionData[model]->perSiteRates[pr->partitionData[model]->rateCategory[localCount]], model);
+          break;
+        case PLL_GAMMA:
+          l = evaluatePartialGeneric (tr, pr, i, 1.0, model);
+          break;
+        default:
+          assert(0);
+      }
+
+      /* store value in result array and add the likelihood of this site to the overall likelihood */
+
+      logLikelihoods[i] = l;
+      accumulatedPerSiteLikelihood += l;
+    } 
+  }
+
+
+  /* error checking. We need a dirty PLL_ABS() < epsilon here, because the implementations 
+     (standard versus per-site) are pretty different and hence slight numerical 
+     deviations are expected */
+
+  assert(PLL_ABS(tr->likelihood - accumulatedPerSiteLikelihood) < 0.00001);
+  
+#endif
+}
+
+#if (defined(__SSE3) || defined(__AVX))
+/** @brief Weighted sum of per-site log likelihoods for 2-state (binary) data under the CAT model
+
+    SSE3/AVX-only implementation: the function is compiled solely inside the
+    enclosing <tt>\#if (defined(__SSE3) || defined(__AVX))</tt> region, so no scalar
+    fallback is kept here.
+
+    For every site \a i the two entries of the left vector, the right vector and
+    the selected \a diagptable_start slice are multiplied element-wise and added up;
+    the logarithm of the absolute value of that sum, multiplied by \a wptr[i], is
+    accumulated. When \a fastScaling is false, the per-site scaling exponents times
+    log(PLL_MINLIKELIHOOD) are added to the site term before weighting.
+
+    All vectors are read with the aligned load \a _mm_load_pd, hence the addressed
+    2-double entries must be 16-byte aligned.
+
+    @param ex1
+      Per-site scaling exponents of the left vector; only read when \a tipX1 is NULL
+      and \a fastScaling is false
+
+    @param ex2
+      Per-site scaling exponents of the right vector; only read when \a fastScaling is false
+
+    @param cptr
+      Per-site index selecting the 2-double slice of \a diagptable_start
+
+    @param wptr
+      Per-site weights
+
+    @param x1_start
+      Left vector, 2 doubles per site; only read when \a tipX1 is NULL
+
+    @param x2_start
+      Right vector, 2 doubles per site
+
+    @param tipVector
+      Table with 2 doubles per tip code, indexed by \a tipX1[i]
+
+    @param tipX1
+      Per-site tip codes of the left node, or NULL if the left node is not a tip
+
+    @param n
+      Number of sites to process
+
+    @param diagptable_start
+      Table of 2-double slices indexed by \a cptr[i]
+
+    @param fastScaling
+      If true, no per-site scaling correction is added in this function
+
+    @return
+      Sum over all sites of \a wptr[i] times the site term described above
+*/
+static double evaluateGTRCAT_BINARY (int *ex1, int *ex2, int *cptr, int *wptr,
+                                     double *x1_start, double *x2_start, double *tipVector,                   
+                                     unsigned char *tipX1, int n, double *diagptable_start, const pllBoolean fastScaling)
+{
+  /* loop invariant, hoisted out of both site loops */
+  const double
+    logMinLikelihood = log(PLL_MINLIKELIHOOD);
+
+  double
+    sum = 0.0,
+    term;
+
+  double
+    *diagptable,
+    *x1,
+    *x2;
+
+  int
+    i;
+
+  if(tipX1)
+    {
+      /* left node is a tip: its vector comes from the tip lookup table and
+         only the right vector carries scaling exponents */
+      for(i = 0; i < n; i++)
+        {
+          PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+
+          x1 = &(tipVector[2 * tipX1[i]]);
+          x2 = &(x2_start[2 * i]);
+
+          diagptable = &(diagptable_start[2 * cptr[i]]);
+
+          _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
+
+          if(fastScaling)
+            term = log(fabs(t[0] + t[1]));
+          else
+            term = log(fabs(t[0] + t[1])) + (ex2[i] * logMinLikelihood);
+
+          sum += wptr[i] * term;
+        }
+    }
+  else
+    {
+      /* both nodes are inner nodes: both vectors carry scaling exponents */
+      for(i = 0; i < n; i++)
+        {
+          PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+
+          x1 = &x1_start[2 * i];
+          x2 = &x2_start[2 * i];
+
+          diagptable = &diagptable_start[2 * cptr[i]];
+
+          _mm_store_pd(t, _mm_mul_pd(_mm_load_pd(x1), _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(diagptable))));
+
+          if(fastScaling)
+            term = log(fabs(t[0] + t[1]));
+          else
+            term = log(fabs(t[0] + t[1])) + ((ex1[i] + ex2[i]) * logMinLikelihood);
+
+          sum += wptr[i] * term;
+        }
+    }
+
+  return  sum;
+} 
+/* Sums wptr[i] * log(0.25 * |s_i|) over the n sites; s_i adds x1 * x2 * diagptable over 8 doubles.       */
+/* Unless fastScaling is set, ex2[i] (tip case) or ex1[i] + ex2[i] times log(PLL_MINLIKELIHOOD) is added. */
+static double evaluateGTRGAMMA_BINARY(int *ex1, int *ex2, int *wptr,
+                                      double *x1_start, double *x2_start, 
+                                      double *tipVector, 
+                                      unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
+{
+  double   sum = 0.0, term;    
+  int     i, j;
+#if (!defined(__SSE3) && !defined(__AVX))
+  int k;
+#endif 
+  double  *x1, *x2;             
+  /* tipX1 != NULL: the x1 side of site i is the 2-double entry tipVector[2 * tipX1[i]] */
+  if(tipX1)
+    {          
+      for (i = 0; i < n; i++)
+        {
+#if (defined(__SSE3) || defined(__AVX))
+		  PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+          __m128d termv, x1v, x2v, dv;
+#endif
+          x1 = &(tipVector[2 * tipX1[i]]);       
+          x2 = &x2_start[8 * i];                                
+#if (defined(__SSE3) || defined(__AVX))
+          termv = _mm_set1_pd(0.0);                
+          /* the same tip entry x1[0..1] is paired with each of the 4 two-double blocks of x2 */
+          for(j = 0; j < 4; j++)
+            {
+              x1v = _mm_load_pd(&x1[0]);
+              x2v = _mm_load_pd(&x2[j * 2]);
+              dv   = _mm_load_pd(&diagptable[j * 2]);
+              
+              x1v = _mm_mul_pd(x1v, x2v);
+              x1v = _mm_mul_pd(x1v, dv);
+              
+              termv = _mm_add_pd(termv, x1v);                 
+            }
+          /* spill the two accumulator lanes; t[0] + t[1] is the site sum */
+          _mm_store_pd(t, termv);               
+          
+          if(fastScaling)
+            term = log(0.25 * (fabs(t[0] + t[1])));
+          else
+            term = log(0.25 * (fabs(t[0] + t[1]))) + (ex2[i] * log(PLL_MINLIKELIHOOD));       
+#else
+          for(j = 0, term = 0.0; j < 4; j++)
+            for(k = 0; k < 2; k++)
+              term += x1[k] * x2[j * 2 + k] * diagptable[j * 2 + k];                                                
+          
+          if(fastScaling)
+            term = log(0.25 * fabs(term));
+          else
+            term = log(0.25 * fabs(term)) + ex2[i] * log(PLL_MINLIKELIHOOD);
+#endif   
+          /* each site enters the total weighted by wptr[i] */
+          sum += wptr[i] * term;
+        }         
+    }
+  else
+    {         
+      for (i = 0; i < n; i++) 
+        {
+#if (defined(__SSE3) || defined(__AVX))
+		  PLL_ALIGN_BEGIN double t[2] PLL_ALIGN_END;
+          __m128d termv, x1v, x2v, dv;
+#endif                            
+          x1 = &x1_start[8 * i];
+          x2 = &x2_start[8 * i];
+          /* no tip: both sides are read as 8 doubles per site */
+#if (defined(__SSE3) || defined(__AVX))
+          termv = _mm_set1_pd(0.0);                
+          
+          for(j = 0; j < 4; j++)
+            {
+              x1v = _mm_load_pd(&x1[j * 2]);
+              x2v = _mm_load_pd(&x2[j * 2]);
+              dv   = _mm_load_pd(&diagptable[j * 2]);
+              
+              x1v = _mm_mul_pd(x1v, x2v);
+              x1v = _mm_mul_pd(x1v, dv);
+              
+              termv = _mm_add_pd(termv, x1v);                 
+            }
+          
+          _mm_store_pd(t, termv);
+          
+          /* here the multipliers of both sides, ex1[i] + ex2[i], enter the correction */
+          if(fastScaling)
+            term = log(0.25 * (fabs(t[0] + t[1])));
+          else
+            term = log(0.25 * (fabs(t[0] + t[1]))) + ((ex1[i] +ex2[i]) * log(PLL_MINLIKELIHOOD));     
+#else     
+          for(j = 0, term = 0.0; j < 4; j++)
+            for(k = 0; k < 2; k++)
+              term += x1[j * 2 + k] * x2[j * 2 + k] * diagptable[j * 2 + k];                                          
+
+          if(fastScaling)
+            term = log(0.25 * fabs(term));
+          else
+            term = log(0.25 * fabs(term)) + (ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD);
+#endif
+
+          sum += wptr[i] * term;
+        }                       
+    }
+
+  return sum;
+} 
+#endif
+
+
+
+/* below are the optimized function versions with geeky intrinsics */
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree under the GAMMA model of rate heterogeneity and LG4 model of evolution
+    
+    This is the same as ::evaluateGAMMA_FLEX but for the LG4 model. It contains two implementations,
+    a generic one and one optimized with SSE3 instructions, selected by preprocessor macros.
+    The difference from ::evaluateGAMMA_FLEX is that we have 4 different tipVectors computed from the
+    4 different Q matrix decompositions, and that the sum of each of the 4 blocks of 20 entries of a
+    site is weighted by the matching entry of \a lg4_weights.
+    Please check ::evaluateGAMMA_FLEX for more information and a description of the common
+    input parameters.
+*/
+static double evaluateGTRGAMMAPROT_LG4(int *ex1, int *ex2, int *wptr,
+                                       double *x1, double *x2,  
+                                       double *tipVector[4], 
+                                       unsigned char *tipX1, int n, double *diagptable, const pllBoolean fastScaling,
+                                       double * lg4_weights)
+{
+  double   sum = 0.0, term;        
+  int     i, j, l;   
+  double  *left, *right;              
+  /* tipX1 != NULL: the left vector of block j is looked up in tipVector[j] by tipX1[i] */
+  if(tipX1)
+    {               
+      for (i = 0; i < n; i++) 
+        {
+#if (defined(__SSE3) || defined(__AVX))
+          __m128d tv = _mm_setzero_pd();
+                                  
+          for(j = 0, term = 0.0; j < 4; j++)
+            {
+              double *d = &diagptable[j * 20];
+
+              __m128d
+              	  t = _mm_setzero_pd(),
+              	  w = _mm_set1_pd(lg4_weights[j]);
+
+              left = &(tipVector[j][20 * tipX1[i]]);
+              right = &(x2[80 * i + 20 * j]);
+              for(l = 0; l < 20; l+=2)
+                {
+                  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+                  t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
+                }
+              tv = _mm_add_pd(tv, _mm_mul_pd(t, w));
+            }
+          /* horizontal add: term = lane 0 + lane 1 of the weighted accumulator */
+          tv = _mm_hadd_pd(tv, tv);
+          _mm_storel_pd(&term, tv);
+          
+
+#else                             
+          for(j = 0, term = 0.0; j < 4; j++)
+            {
+        	  double t = 0.0;
+
+              left = &(tipVector[j][20 * tipX1[i]]);
+              right = &(x2[80 * i + 20 * j]);
+
+              for(l = 0; l < 20; l++)
+                t += left[l] * right[l] * diagptable[j * 20 + l];
+
+              term += lg4_weights[j] * t;
+            }     
+#endif
+          /* unless fastScaling is set, ex2[i] * log(PLL_MINLIKELIHOOD) is added to the site's log */
+          if(fastScaling)
+            term = log(fabs(term));
+          else
+            term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+
+          sum += wptr[i] * term;
+
+        }               
+    }              
+  else
+    {
+      for (i = 0; i < n; i++) 
+        {                                    
+#if (defined(__SSE3) || defined(__AVX))
+          __m128d tv = _mm_setzero_pd();                          
+              
+          for(j = 0, term = 0.0; j < 4; j++)
+            {
+              double *d = &diagptable[j * 20];
+
+              __m128d
+              t = _mm_setzero_pd(),
+              w = _mm_set1_pd(lg4_weights[j]);
+
+              left  = &(x1[80 * i + 20 * j]);
+              right = &(x2[80 * i + 20 * j]);
+              
+              for(l = 0; l < 20; l+=2)
+                {
+                  __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l]));
+                  t = _mm_add_pd(t, _mm_mul_pd(mul, _mm_load_pd(&d[l])));
+                }
+              tv = _mm_add_pd(tv, _mm_mul_pd(t, w));
+            }
+          tv = _mm_hadd_pd(tv, tv);
+          _mm_storel_pd(&term, tv);       
+#else
+          for(j = 0, term = 0.0; j < 4; j++)
+            {
+        	  double t = 0.0;
+
+              left  = &(x1[80 * i + 20 * j]);
+              right = &(x2[80 * i + 20 * j]);       
+              
+              for(l = 0; l < 20; l++)
+                t += left[l] * right[l] * diagptable[j * 20 + l];
+
+              term += lg4_weights[j] * t;
+            }
+#endif
+          /* no tip: the multipliers of both sides, ex1[i] + ex2[i], enter the correction */
+          if(fastScaling)
+            term = log(fabs(term));
+          else
+            term = log(fabs(term)) + ((ex1[i] + ex2[i])*log(PLL_MINLIKELIHOOD));
+          
+          sum += wptr[i] * term;
+        }         
+    }
+
+  return  sum;
+}
+
+#if (defined(__SSE3) || defined(__AVX))
+/** @ingroup evaluateLikelihoodGroup
+    @brief Log likelihood at one edge using the \b GAMMA model of rate heterogeneity
+    and the memory saving technique (SSE3 kernel for AA data)
+ 
+    SSE3 counterpart of ::evaluateGAMMA_FLEX_SAVE. Sites whose bit is set in \a x1_gap or
+    \a x2_gap read the shared \a x1_gapColumn / \a x2_gapColumn; only the remaining sites
+    consume an 80-double block of \a x1 / \a x2. Please check ::evaluateGAMMA_FLEX_SAVE
+    for more information and a description of the input parameters
+*/
+static double evaluateGTRGAMMAPROT_GAPPED_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                                double *x1, double *x2,  
+                                                double *tipVector, 
+                                                unsigned char *tipX1, int n, double *diagptable, 
+                                                double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)                                    
+{
+  double   logl = 0.0;      /* weighted sum of the per-site log likelihoods */
+  double   siteLh;          /* likelihood, then log likelihood, of one site */
+  int      site, cat, k;
+  double  *lhs;
+  double  *rhs;
+  double  *nextX1 = x1;     /* next unread 80-double block of x1 */
+  double  *nextX2 = x2;     /* next unread 80-double block of x2 */
+  double  *col1;
+  double  *col2;
+  __m128d  acc;             /* declared up front: no declarations after statements */
+
+  if(tipX1)
+  {
+    /* the left vector is looked up in tipVector, so only x2 can be gapped */
+    for (site = 0; site < n; site++)
+    {
+      if(x2_gap[site / 32] & mask32[site % 32])
+        col2 = x2_gapColumn;
+      else
+      {
+        col2 = nextX2;
+        nextX2 += 80;
+      }
+
+      lhs = &(tipVector[20 * tipX1[site]]);
+      acc = _mm_setzero_pd();
+
+      for(cat = 0; cat < 4; cat++)
+      {
+        double *diag = &diagptable[cat * 20];
+
+        rhs = &(col2[20 * cat]);
+
+        for(k = 0; k < 20; k += 2)
+        {
+          __m128d prod = _mm_mul_pd(_mm_load_pd(&lhs[k]), _mm_load_pd(&rhs[k]));
+          acc = _mm_add_pd(acc, _mm_mul_pd(prod, _mm_load_pd(&diag[k])));
+        }
+      }
+
+      /* fold the two lanes into the scalar site likelihood */
+      acc = _mm_hadd_pd(acc, acc);
+      _mm_storel_pd(&siteLh, acc);
+
+      siteLh = fastScaling
+             ? log(0.25 * fabs(siteLh))
+             : log(0.25 * fabs(siteLh)) + (ex2[site] * log(PLL_MINLIKELIHOOD));
+
+      logl += wptr[site] * siteLh;
+    }
+  }
+  else
+  {
+    /* no tip: each side independently takes its gap column or its next block */
+    for (site = 0; site < n; site++)
+    {
+      if(x1_gap[site / 32] & mask32[site % 32])
+        col1 = x1_gapColumn;
+      else
+      {
+        col1 = nextX1;
+        nextX1 += 80;
+      }
+
+      if(x2_gap[site / 32] & mask32[site % 32])
+        col2 = x2_gapColumn;
+      else
+      {
+        col2 = nextX2;
+        nextX2 += 80;
+      }
+
+      acc = _mm_setzero_pd();
+
+      for(cat = 0; cat < 4; cat++)
+      {
+        double *diag = &diagptable[cat * 20];
+
+        lhs = &(col1[20 * cat]);
+        rhs = &(col2[20 * cat]);
+
+        for(k = 0; k < 20; k += 2)
+        {
+          __m128d prod = _mm_mul_pd(_mm_load_pd(&lhs[k]), _mm_load_pd(&rhs[k]));
+          acc = _mm_add_pd(acc, _mm_mul_pd(prod, _mm_load_pd(&diag[k])));
+        }
+      }
+
+      /* fold the two lanes into the scalar site likelihood */
+      acc = _mm_hadd_pd(acc, acc);
+      _mm_storel_pd(&siteLh, acc);
+
+      siteLh = fastScaling
+             ? log(0.25 * fabs(siteLh))
+             : log(0.25 * fabs(siteLh)) + ((ex1[site] + ex2[site]) * log(PLL_MINLIKELIHOOD));
+
+      logl += wptr[site] * siteLh;
+    }
+  }
+
+  return  logl;
+}
+
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Log likelihood at one edge using the \b GAMMA model of rate heterogeneity
+    (SSE3 kernel for AA data)
+ 
+    SSE3 counterpart of ::evaluateGAMMA_FLEX for an edge whose end-points carry the
+    conditional likelihood vectors \a x1 and \a x2, each read as 80 doubles per site
+    (4 blocks of 20). Please check ::evaluateGAMMA_FLEX for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRGAMMAPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                    double *x1, double *x2,  
+                                    double *tipVector, 
+                                    unsigned char *tipX1, int n, double *diagptable)
+{
+  double   logl = 0.0, siteLh;
+  int      site, cat, k;
+  double  *lhs, *rhs;
+
+  if(tipX1)
+  {
+    /* the left vector is the 20-double entry of tipVector selected by tipX1[site] */
+    for (site = 0; site < n; site++)
+    {
+      __m128d acc = _mm_setzero_pd();
+
+      lhs = &(tipVector[20 * tipX1[site]]);
+
+      for(cat = 0; cat < 4; cat++)
+      {
+        double *diag = &diagptable[cat * 20];
+        rhs = &(x2[80 * site + 20 * cat]);
+
+        for(k = 0; k < 20; k += 2)
+        {
+          __m128d prod = _mm_mul_pd(_mm_load_pd(&lhs[k]), _mm_load_pd(&rhs[k]));
+          acc = _mm_add_pd(acc, _mm_mul_pd(prod, _mm_load_pd(&diag[k])));
+        }
+      }
+
+      acc = _mm_hadd_pd(acc, acc);
+      _mm_storel_pd(&siteLh, acc);
+
+      siteLh = fastScaling
+             ? log(0.25 * fabs(siteLh))
+             : log(0.25 * fabs(siteLh)) + (ex2[site] * log(PLL_MINLIKELIHOOD));
+
+      logl += wptr[site] * siteLh;
+    }
+  }
+  else
+  {
+    /* no tip: both sides are indexed as 80 doubles per site */
+    for (site = 0; site < n; site++)
+    {
+      __m128d acc = _mm_setzero_pd();
+
+      for(cat = 0; cat < 4; cat++)
+      {
+        double *diag = &diagptable[cat * 20];
+
+        lhs = &(x1[80 * site + 20 * cat]);
+        rhs = &(x2[80 * site + 20 * cat]);
+
+        for(k = 0; k < 20; k += 2)
+        {
+          __m128d prod = _mm_mul_pd(_mm_load_pd(&lhs[k]), _mm_load_pd(&rhs[k]));
+          acc = _mm_add_pd(acc, _mm_mul_pd(prod, _mm_load_pd(&diag[k])));
+        }
+      }
+
+      acc = _mm_hadd_pd(acc, acc);
+      _mm_storel_pd(&siteLh, acc);
+
+      siteLh = fastScaling
+             ? log(0.25 * fabs(siteLh))
+             : log(0.25 * fabs(siteLh)) + ((ex1[site] + ex2[site]) * log(PLL_MINLIKELIHOOD));
+
+      logl += wptr[site] * siteLh;
+    }
+  }
+
+  return  logl;
+}
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity 
+    (Optimized SSE3 version for AA data)
+ 
+    SSE3 version of ::evaluateCAT_FLEX: for every site i the 20 products
+    left[l] * right[l] * diagptable[l] are summed (diagptable being the block selected by
+    \a cptr[i]) and wptr[i] times the log of that sum is accumulated. See ::evaluateCAT_FLEX
+    for more information and a description of the common input parameters
+*/
+static double evaluateGTRCATPROT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                  double *x1, double *x2, double *tipVector,
+                                  unsigned char *tipX1, int n, double *diagptable_start)
+{
+  double   sum = 0.0, term;
+  double  *diagptable,  *left, *right;
+  int     i, l;                           
+  __m128d tv;   /* accumulator shared by both branches below */
+
+  if(tipX1)
+  {                 
+    for (i = 0; i < n; i++) 
+    {           
+      left = &(tipVector[20 * tipX1[i]]);
+      right = &(x2[20 * i]);
+
+      diagptable = &diagptable_start[20 * cptr[i]];                      
+
+      /* tv is declared at function scope because C89 compilers reject a */
+      /* declaration after executable statements; it is only reset here  */
+      tv = _mm_setzero_pd();
+
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d lv = _mm_load_pd(&left[l]);
+        __m128d rv = _mm_load_pd(&right[l]);
+        __m128d mul = _mm_mul_pd(lv, rv);
+        __m128d dv = _mm_load_pd(&diagptable[l]);
+
+        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));                  
+      }                         
+
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+      if(!fastScaling)
+        term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));
+
+      sum += wptr[i] * term;
+    }      
+  }    
+  else
+  {
+    /* no tip: both vectors are read from x1 / x2, 20 doubles per site */
+    for (i = 0; i < n; i++) 
+    {                                 
+      left  = &x1[20 * i];
+      right = &x2[20 * i];
+
+      diagptable = &diagptable_start[20 * cptr[i]];             
+
+      tv = _mm_setzero_pd();        
+
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d lv = _mm_load_pd(&left[l]);
+        __m128d rv = _mm_load_pd(&right[l]);
+        __m128d mul = _mm_mul_pd(lv, rv);
+        __m128d dv = _mm_load_pd(&diagptable[l]);
+
+        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));                  
+      }                         
+
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+      if(!fastScaling)
+        term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));  
+
+      sum += wptr[i] * term;      
+    }
+  }
+
+  return  sum;         
+} 
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Evaluation of log likelihood of a tree using the \b CAT model of rate heterogeneity with memory saving 
+    (Optimized SSE3 version for AA data)
+ 
+    SSE3 version of ::evaluateCAT_FLEX_SAVE. Sites flagged by isGap() in \a x1_gap or
+    \a x2_gap read the shared \a x1_gapColumn / \a x2_gapColumn; only the remaining sites
+    consume 20 doubles of \a x1 / \a x2. Please check ::evaluateCAT_FLEX_SAVE for more
+    information and a description of the common input parameters
+*/
+static double evaluateGTRCATPROT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                       double *x1, double *x2, double *tipVector,
+                                       unsigned char *tipX1, int n, double *diagptable_start, 
+                                       double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double   
+    sum = 0.0, 
+        term,
+        *diagptable,  
+        *left, 
+        *right,
+        *left_ptr = x1,
+        *right_ptr = x2;
+
+  int     i, l;
+  __m128d tv;   /* declared here: C89 compilers reject declarations after statements */
+
+  /* tipX1 != NULL: the left vector is looked up in tipVector, only x2 can be gapped */
+  if(tipX1)
+  {                 
+    for (i = 0; i < n; i++) 
+    {           
+      left = &(tipVector[20 * tipX1[i]]);
+
+      if(isGap(x2_gap, i))
+        right = x2_gapColumn;
+      else
+      {
+        right = right_ptr;
+        right_ptr += 20;
+      }          
+
+      diagptable = &diagptable_start[20 * cptr[i]];                      
+
+      tv = _mm_setzero_pd();        
+
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d lv = _mm_load_pd(&left[l]);
+        __m128d rv = _mm_load_pd(&right[l]);
+        __m128d mul = _mm_mul_pd(lv, rv);
+        __m128d dv = _mm_load_pd(&diagptable[l]);
+
+        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));                  
+      }                         
+
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+      if(!fastScaling)
+        term = log(fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));
+
+      sum += wptr[i] * term;
+    }      
+  }    
+  else
+  {
+    /* no tip: each side independently takes its gap column or its next 20 doubles */
+    for (i = 0; i < n; i++) 
+    {                                     
+      if(isGap(x1_gap, i))
+        left = x1_gapColumn;
+      else
+      {
+        left = left_ptr;
+        left_ptr += 20;
+      }
+
+      if(isGap(x2_gap, i))
+        right = x2_gapColumn;
+      else
+      {
+        right = right_ptr;
+        right_ptr += 20;
+      }
+
+      diagptable = &diagptable_start[20 * cptr[i]];             
+
+      tv = _mm_setzero_pd();        
+
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d lv = _mm_load_pd(&left[l]);
+        __m128d rv = _mm_load_pd(&right[l]);
+        __m128d mul = _mm_mul_pd(lv, rv);
+        __m128d dv = _mm_load_pd(&diagptable[l]);
+
+        tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv));                  
+      }                         
+
+      tv = _mm_hadd_pd(tv, tv);
+      _mm_storel_pd(&term, tv);
+
+      if(!fastScaling)
+        term = log(fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+      else
+        term = log(fabs(term));  
+
+      sum += wptr[i] * term;      
+    }
+  }
+
+  return  sum;         
+} 
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Log likelihood at one edge using the \b CAT model of rate heterogeneity with memory saving
+    (SSE3 kernel for DNA data)
+ 
+    SSE3 counterpart of ::evaluateCAT_FLEX_SAVE. Sites flagged by isGap() in \a x1_gap or
+    \a x2_gap read the shared gap column; only the remaining sites consume 4 doubles of
+    \a x1_start / \a x2_start. Please check ::evaluateCAT_FLEX_SAVE for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRCAT_SAVE (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                                   double *x1_start, double *x2_start, double *tipVector,                     
+                                   unsigned char *tipX1, int n, double *diagptable_start,
+                                   double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double  logl = 0.0, siteLh;
+  int     site;
+
+  double  *diag,
+          *lv,
+          *rv,
+          *nextX1 = x1_start,
+          *nextX2 = x2_start;
+
+  if(tipX1)
+  {
+    for (site = 0; site < n; site++)
+    {
+      PLL_ALIGN_BEGIN double lanes[2] PLL_ALIGN_END;
+      __m128d lo1, hi1, lo2, hi2, loD, hiD;
+
+      lv = &(tipVector[4 * tipX1[site]]);
+      /* a gap site reads the shared column and does not advance nextX2 */
+      if(isGap(x2_gap, site))
+        rv = x2_gapColumn;
+      else
+      {
+        rv = nextX2;
+        nextX2 += 4;
+      }
+
+      diag = &diagptable_start[4 * cptr[site]];
+
+      lo1 = _mm_load_pd(&lv[0]);
+      hi1 = _mm_load_pd(&lv[2]);
+      lo2 = _mm_load_pd(&rv[0]);
+      hi2 = _mm_load_pd(&rv[2]);
+      loD = _mm_load_pd(&diag[0]);
+      hiD = _mm_load_pd(&diag[2]);
+      /* entries 0-1: (x1 * x2) * diag */
+      lo1 = _mm_mul_pd(lo1, lo2);
+      lo1 = _mm_mul_pd(lo1, loD);
+      /* entries 2-3: same product */
+      hi1 = _mm_mul_pd(hi1, hi2);
+      hi1 = _mm_mul_pd(hi1, hiD);
+      /* lane-wise sum of both halves; the lanes are added in scalar code below */
+      lo1 = _mm_add_pd(lo1, hi1);
+
+      _mm_store_pd(lanes, lo1);
+
+      if(fastScaling)
+        siteLh = log(fabs(lanes[0] + lanes[1]));
+      else
+        siteLh = log(fabs(lanes[0] + lanes[1])) + (ex2[site] * log(PLL_MINLIKELIHOOD));
+
+
+      /* each site enters the total weighted by wptr[site] */
+      logl += wptr[site] * siteLh;
+    }
+  }
+  else
+  {
+    for (site = 0; site < n; site++)
+    {
+      PLL_ALIGN_BEGIN double lanes[2] PLL_ALIGN_END;
+      __m128d lo1, hi1, lo2, hi2, loD, hiD;
+
+      if(isGap(x1_gap, site))
+        lv = x1_gapColumn;
+      else
+      {
+        lv = nextX1;
+        nextX1 += 4;
+      }
+
+      if(isGap(x2_gap, site))
+        rv = x2_gapColumn;
+      else
+      {
+        rv = nextX2;
+        nextX2 += 4;
+      }
+
+      diag = &diagptable_start[4 * cptr[site]];
+
+      lo1 = _mm_load_pd(&lv[0]);
+      hi1 = _mm_load_pd(&lv[2]);
+      lo2 = _mm_load_pd(&rv[0]);
+      hi2 = _mm_load_pd(&rv[2]);
+      loD = _mm_load_pd(&diag[0]);
+      hiD = _mm_load_pd(&diag[2]);
+      /* entries 0-1: (x1 * x2) * diag */
+      lo1 = _mm_mul_pd(lo1, lo2);
+      lo1 = _mm_mul_pd(lo1, loD);
+      /* entries 2-3: same product */
+      hi1 = _mm_mul_pd(hi1, hi2);
+      hi1 = _mm_mul_pd(hi1, hiD);
+      /* lane-wise sum of both halves; the lanes are added in scalar code below */
+      lo1 = _mm_add_pd(lo1, hi1);
+
+      _mm_store_pd(lanes, lo1);
+
+      /* no tip: the multipliers of both sides enter the correction */
+      if(fastScaling)
+        siteLh = log(fabs(lanes[0] + lanes[1]));
+      else
+        siteLh = log(fabs(lanes[0] + lanes[1])) + ((ex1[site] + ex2[site]) * log(PLL_MINLIKELIHOOD));
+
+      logl += wptr[site] * siteLh;
+    }
+  }
+
+  return  logl;
+}
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Log likelihood at one edge using the \b GAMMA model of rate heterogeneity with memory saving
+    (SSE3 kernel for DNA data)
+ 
+    SSE3 counterpart of ::evaluateGAMMA_FLEX_SAVE. Sites whose bit is set in \a x1_gap or
+    \a x2_gap read the shared gap column; only the remaining sites consume 16 doubles of
+    \a x1_start / \a x2_start. Please check ::evaluateGAMMA_FLEX_SAVE for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRGAMMA_GAPPED_SAVE(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                                           double *x1_start, double *x2_start, 
+                                           double *tipVector, 
+                                           unsigned char *tipX1, const int n, double *diagptable,
+                                           double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double   logl = 0.0, siteLh;
+  int      site, cat;
+  double
+    *lv,
+    *rv,
+    *nextX1 = x1_start,
+    *nextX2 = x2_start;
+
+  /* tipX1 != NULL: the left vector is the 4-double entry of tipVector */
+  /* selected by tipX1[site], so only the x2 side can be gapped        */
+  if(tipX1)
+  {
+
+
+    for (site = 0; site < n; site++)
+    {
+      PLL_ALIGN_BEGIN double lanes[2] PLL_ALIGN_END;
+      __m128d acc, lhsV, rhsV, diagV;
+
+      lv = &(tipVector[4 * tipX1[site]]);
+      if(x2_gap[site / 32] & mask32[site % 32])
+        rv = x2_gapColumn;
+      else
+      {
+        rv = nextX2;
+        nextX2 += 16;
+      }
+
+
+      acc = _mm_setzero_pd();
+
+      for(cat = 0; cat < 4; cat++)
+      {
+        lhsV  = _mm_load_pd(&lv[0]);
+        rhsV  = _mm_load_pd(&rv[cat * 4]);
+        diagV = _mm_load_pd(&diagptable[cat * 4]);
+
+        lhsV = _mm_mul_pd(lhsV, rhsV);
+        lhsV = _mm_mul_pd(lhsV, diagV);
+
+        acc = _mm_add_pd(acc, lhsV);
+        /* second half (entries 2-3) of the same 4-double block */
+        lhsV  = _mm_load_pd(&lv[2]);
+        rhsV  = _mm_load_pd(&rv[cat * 4 + 2]);
+        diagV = _mm_load_pd(&diagptable[cat * 4 + 2]);
+
+        lhsV = _mm_mul_pd(lhsV, rhsV);
+        lhsV = _mm_mul_pd(lhsV, diagV);
+
+        acc = _mm_add_pd(acc, lhsV);
+      }
+
+      _mm_store_pd(lanes, acc);
+
+      if(fastScaling)
+        siteLh = log(0.25 * fabs(lanes[0] + lanes[1]));
+      else
+        siteLh = log(0.25 * fabs(lanes[0] + lanes[1])) + (ex2[site] * log(PLL_MINLIKELIHOOD));
+
+
+      logl += wptr[site] * siteLh;
+    }
+  }
+  else
+  {
+    /* no tip: each side independently takes its gap column or its next 16 doubles */
+    for (site = 0; site < n; site++)
+    {
+      PLL_ALIGN_BEGIN double lanes[2] PLL_ALIGN_END;
+      __m128d acc, lhsV, rhsV, diagV;
+
+      if(x1_gap[site / 32] & mask32[site % 32])
+        lv = x1_gapColumn;
+      else
+      {
+        lv = nextX1;
+        nextX1 += 16;
+      }
+
+      if(x2_gap[site / 32] & mask32[site % 32])
+        rv = x2_gapColumn;
+      else
+      {
+        rv = nextX2;
+        nextX2 += 16;
+      }
+
+      acc = _mm_setzero_pd();
+
+      for(cat = 0; cat < 4; cat++)
+      {
+        lhsV  = _mm_load_pd(&lv[cat * 4]);
+        rhsV  = _mm_load_pd(&rv[cat * 4]);
+        diagV = _mm_load_pd(&diagptable[cat * 4]);
+
+        lhsV = _mm_mul_pd(lhsV, rhsV);
+        lhsV = _mm_mul_pd(lhsV, diagV);
+
+        acc = _mm_add_pd(acc, lhsV);
+        /* second half (entries 2-3) of the same 4-double block */
+        lhsV  = _mm_load_pd(&lv[cat * 4 + 2]);
+        rhsV  = _mm_load_pd(&rv[cat * 4 + 2]);
+        diagV = _mm_load_pd(&diagptable[cat * 4 + 2]);
+
+        lhsV = _mm_mul_pd(lhsV, rhsV);
+        lhsV = _mm_mul_pd(lhsV, diagV);
+
+        acc = _mm_add_pd(acc, lhsV);
+      }
+
+      _mm_store_pd(lanes, acc);
+
+      if(fastScaling)
+        siteLh = log(0.25 * fabs(lanes[0] + lanes[1]));
+      else
+        siteLh = log(0.25 * fabs(lanes[0] + lanes[1])) + ((ex1[site] + ex2[site]) * log(PLL_MINLIKELIHOOD));
+
+
+      logl += wptr[site] * siteLh;
+    }
+  }
+
+  return logl;
+}
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Log likelihood at one edge using the \b GAMMA model of rate heterogeneity (SSE3 kernel for DNA data)
+ 
+    SSE3 counterpart of ::evaluateGAMMA_FLEX for an edge whose end-points carry the
+    conditional likelihood vectors \a x1_start and \a x2_start, each read as 16 doubles
+    per site (4 blocks of 4). Please check ::evaluateGAMMA_FLEX for more information and
+    a description of the common input parameters
+*/
+static double evaluateGTRGAMMA(const pllBoolean fastScaling, int *ex1, int *ex2, int *wptr,
+                               double *x1_start, double *x2_start, 
+                               double *tipVector, 
+                               unsigned char *tipX1, const int n, double *diagptable)
+{
+  double   logl = 0.0, siteLh;
+  int      site, cat;
+
+  double  *lv, *rv;
+
+  /* tipX1 != NULL: the left vector is the 4-double entry of tipVector */
+  /* selected by tipX1[site] and is reused for all four blocks of x2   */
+  if(tipX1)
+  {
+    for (site = 0; site < n; site++)
+    {
+      PLL_ALIGN_BEGIN double lanes[2] PLL_ALIGN_END;
+      __m128d acc, lhsV, rhsV, diagV;
+
+      lv = &(tipVector[4 * tipX1[site]]);
+      rv = &x2_start[16 * site];
+
+
+      acc = _mm_setzero_pd();
+
+      for(cat = 0; cat < 4; cat++)
+      {
+        lhsV  = _mm_load_pd(&lv[0]);
+        rhsV  = _mm_load_pd(&rv[cat * 4]);
+        diagV = _mm_load_pd(&diagptable[cat * 4]);
+
+        lhsV = _mm_mul_pd(lhsV, rhsV);
+        lhsV = _mm_mul_pd(lhsV, diagV);
+
+        acc = _mm_add_pd(acc, lhsV);
+        /* second half (entries 2-3) of the same 4-double block */
+        lhsV  = _mm_load_pd(&lv[2]);
+        rhsV  = _mm_load_pd(&rv[cat * 4 + 2]);
+        diagV = _mm_load_pd(&diagptable[cat * 4 + 2]);
+
+        lhsV = _mm_mul_pd(lhsV, rhsV);
+        lhsV = _mm_mul_pd(lhsV, diagV);
+
+        acc = _mm_add_pd(acc, lhsV);
+      }
+
+      _mm_store_pd(lanes, acc);
+
+
+      if(fastScaling)
+        siteLh = log(0.25 * fabs(lanes[0] + lanes[1]));
+      else
+        siteLh = log(0.25 * fabs(lanes[0] + lanes[1])) + (ex2[site] * log(PLL_MINLIKELIHOOD));
+
+
+      /* each site enters the total weighted by wptr[site] */
+      logl += wptr[site] * siteLh;
+    }
+  }
+  else
+  {
+    for (site = 0; site < n; site++)
+    {
+      PLL_ALIGN_BEGIN double lanes[2] PLL_ALIGN_END;
+      __m128d acc, lhsV, rhsV, diagV;
+
+      /* no tip: both sides are indexed as 16 doubles per site */
+      lv = &x1_start[16 * site];
+      rv = &x2_start[16 * site];
+
+
+      acc = _mm_setzero_pd();
+
+      for(cat = 0; cat < 4; cat++)
+      {
+        lhsV  = _mm_load_pd(&lv[cat * 4]);
+        rhsV  = _mm_load_pd(&rv[cat * 4]);
+        diagV = _mm_load_pd(&diagptable[cat * 4]);
+
+        lhsV = _mm_mul_pd(lhsV, rhsV);
+        lhsV = _mm_mul_pd(lhsV, diagV);
+
+        acc = _mm_add_pd(acc, lhsV);
+        /* second half (entries 2-3) of the same 4-double block */
+        lhsV  = _mm_load_pd(&lv[cat * 4 + 2]);
+        rhsV  = _mm_load_pd(&rv[cat * 4 + 2]);
+        diagV = _mm_load_pd(&diagptable[cat * 4 + 2]);
+
+        lhsV = _mm_mul_pd(lhsV, rhsV);
+        lhsV = _mm_mul_pd(lhsV, diagV);
+
+        acc = _mm_add_pd(acc, lhsV);
+      }
+
+      _mm_store_pd(lanes, acc);
+
+      if(fastScaling)
+        siteLh = log(0.25 * fabs(lanes[0] + lanes[1]));
+      else
+        siteLh = log(0.25 * fabs(lanes[0] + lanes[1])) + ((ex1[site] + ex2[site]) * log(PLL_MINLIKELIHOOD));
+
+
+      /* each site enters the total weighted by wptr[site] */
+      logl += wptr[site] * siteLh;
+    }
+  }
+
+  return logl;
+}
+
+
+/** @ingroup evaluateLikelihoodGroup
+    @brief Log likelihood at one edge using the \b CAT model of rate heterogeneity (SSE3 kernel for DNA data)
+ 
+    SSE3 counterpart of ::evaluateCAT_FLEX for an edge whose end-points carry the
+    conditional likelihood vectors \a x1_start and \a x2_start, each read as 4 doubles
+    per site; \a cptr[i] selects the 4-double block of \a diagptable_start used for site i.
+    Please check ::evaluateCAT_FLEX for a description of the common input parameters
+*/
+static double evaluateGTRCAT (const pllBoolean fastScaling, int *ex1, int *ex2, int *cptr, int *wptr,
+                              double *x1_start, double *x2_start, double *tipVector,                  
+                              unsigned char *tipX1, int n, double *diagptable_start)
+{
+  double  logl = 0.0, siteLh;
+  int     site;
+
+  double  *diag, *lv, *rv;
+
+  if(tipX1)
+  {
+    for (site = 0; site < n; site++)
+    {
+      PLL_ALIGN_BEGIN double lanes[2] PLL_ALIGN_END;
+      __m128d lo1, hi1, lo2, hi2, loD, hiD;
+      /* the left vector is the 4-double entry of tipVector selected by tipX1[site] */
+      lv = &(tipVector[4 * tipX1[site]]);
+      rv = &x2_start[4 * site];
+
+      diag = &diagptable_start[4 * cptr[site]];
+
+
+      lo1 = _mm_load_pd(&lv[0]);
+      hi1 = _mm_load_pd(&lv[2]);
+      lo2 = _mm_load_pd(&rv[0]);
+      hi2 = _mm_load_pd(&rv[2]);
+      loD = _mm_load_pd(&diag[0]);
+      hiD = _mm_load_pd(&diag[2]);
+      /* entries 0-1: (x1 * x2) * diag */
+      lo1 = _mm_mul_pd(lo1, lo2);
+      lo1 = _mm_mul_pd(lo1, loD);
+      /* entries 2-3: same product */
+      hi1 = _mm_mul_pd(hi1, hi2);
+      hi1 = _mm_mul_pd(hi1, hiD);
+      /* lane-wise sum of both halves; the lanes are added in scalar code below */
+      lo1 = _mm_add_pd(lo1, hi1);
+
+      _mm_store_pd(lanes, lo1);
+
+      if(fastScaling)
+        siteLh = log(fabs(lanes[0] + lanes[1]));
+      else
+        siteLh = log(fabs(lanes[0] + lanes[1])) + (ex2[site] * log(PLL_MINLIKELIHOOD));
+
+
+      logl += wptr[site] * siteLh;
+    }
+  }
+  else
+  {
+    for (site = 0; site < n; site++)
+    {
+      PLL_ALIGN_BEGIN double lanes[2] PLL_ALIGN_END;
+      __m128d lo1, hi1, lo2, hi2, loD, hiD;
+      /* no tip: both sides are indexed as 4 doubles per site */
+      lv = &x1_start[4 * site];
+      rv = &x2_start[4 * site];
+
+      diag = &diagptable_start[4 * cptr[site]];
+
+
+      lo1 = _mm_load_pd(&lv[0]);
+      hi1 = _mm_load_pd(&lv[2]);
+      lo2 = _mm_load_pd(&rv[0]);
+      hi2 = _mm_load_pd(&rv[2]);
+      loD = _mm_load_pd(&diag[0]);
+      hiD = _mm_load_pd(&diag[2]);
+      /* entries 0-1: (x1 * x2) * diag */
+      lo1 = _mm_mul_pd(lo1, lo2);
+      lo1 = _mm_mul_pd(lo1, loD);
+      /* entries 2-3: same product */
+      hi1 = _mm_mul_pd(hi1, hi2);
+      hi1 = _mm_mul_pd(hi1, hiD);
+      /* lane-wise sum of both halves; the lanes are added in scalar code below */
+      lo1 = _mm_add_pd(lo1, hi1);
+
+      _mm_store_pd(lanes, lo1);
+
+      if(fastScaling)
+        siteLh = log(fabs(lanes[0] + lanes[1]));
+      else
+        siteLh = log(fabs(lanes[0] + lanes[1])) + ((ex1[site] + ex2[site]) * log(PLL_MINLIKELIHOOD));
+
+
+      logl += wptr[site] * siteLh;
+    }
+  }
+
+  return  logl;
+}
+
+
+
+
+
+#endif
diff --git a/pllrepo/src/evaluatePartialGenericSpecial.c b/pllrepo/src/evaluatePartialGenericSpecial.c
new file mode 100644
index 0000000..4d461a5
--- /dev/null
+++ b/pllrepo/src/evaluatePartialGenericSpecial.c
@@ -0,0 +1,1378 @@
+/*  RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees 
+ *  Copyright August 2006 by Alexandros Stamatakis
+ *
+ *  Partially derived from
+ *  fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
+ *  
+ *  and 
+ *
+ *  Programs of the PHYLIP package by Joe Felsenstein.
+ 
+ *  This program is free software; you may redistribute it and/or modify its
+ *  under the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ * 
+ *
+ *  For any other enquiries send an Email to Alexandros Stamatakis
+ *  Alexandros.Stamatakis at epfl.ch
+ *
+ *  When publishing work that is based on the results from RAxML-VI-HPC please cite:
+ *
+ *  Alexandros Stamatakis:"RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models". 
+ *  Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
+ */
+
+#include "mem_alloc.h"
+
+#ifndef WIN32 
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+#include "pll.h"
+#include "pllInternal.h"
+
+#ifdef __SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#endif
+
+
+/* optimized implementation for computing per-site log likelihoods under CAT and GAMMA for DNA and protein data */
+
+#if (defined(__SSE3) || defined(__AVX)) /* forward declarations of the data-type specific per-site kernels (SSE3/AVX builds only) */
+static __inline void computeVectorGTRCATPROT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+					   traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					   unsigned  char **yVector, int mxtips);
+
+static double evaluatePartialGTRCATPROT(int i, double ki, int counter,  traversalInfo *ti, double qz,
+					int w, double *EIGN, double *EI, double *EV,
+					double *tipVector, unsigned char **yVector, 
+					int branchReference, int mxtips); /* selected by evaluatePartialGeneric() for 20 states under PLL_CAT */
+
+static __inline void computeVectorGTRGAMMAPROT(double *lVector, int *eVector, double *gammaRates, int i, double qz, double rz,
+					     traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					     unsigned  char **yVector, int mxtips);
+
+static double evaluatePartialGTRGAMMAPROT(int i, int counter,  traversalInfo *ti, double qz,
+					  int w, double *EIGN, double *EI, double *EV,
+					  double *tipVector, unsigned char **yVector, 
+					  double *gammaRates,
+					  int branchReference, int mxtips); /* selected for 20 states when the rate model is not PLL_CAT */
+
+static __inline void computeVectorGTRCAT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+				       traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+				       unsigned char **yVector, int mxtips);
+
+static double evaluatePartialGTRCAT(int i, double ki, int counter,  traversalInfo *ti, double qz,
+				    int w, double *EIGN, double *EI, double *EV,
+				    double *tipVector, unsigned  char **yVector, 
+				    int branchReference, int mxtips); /* selected for 4 states (DNA) under PLL_CAT */
+
+static __inline void computeVectorGTRCAT_BINARY(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+					      traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+					      unsigned char **yVector, int mxtips);
+
+static double evaluatePartialGTRCAT_BINARY(int i, double ki, int counter,  traversalInfo *ti, double qz,
+					   int w, double *EIGN, double *EI, double *EV,
+					   double *tipVector, unsigned  char **yVector, 
+					   int branchReference, int mxtips); /* selected for 2 states (binary) under PLL_CAT */
+
+static double evaluatePartialGTRGAMMA(int i, int counter,  traversalInfo *ti, double qz,
+				      int w, double *EIGN, double *EI, double *EV,
+				      double *tipVector, unsigned char **yVector, 
+				      double *gammaRates,
+				      int branchReference, int mxtips); /* selected for 4 states (DNA) when the rate model is not PLL_CAT */
+#endif
+
+/* the next two functions are generic non-optimized versions of the per-site log likelihood calculations,
+   but only under the CAT model. There are no generic implementations available for GAMMA yet, since 
+   these functions were not needed in RAxML. However there exist optimized functions for GAMMA further below.
+   The only use of the CAT functions was to optimize per-site rates based on their likelihood for the CAT 
+   model of rate heterogeneity. */
+
+
+static __inline void computeVectorCAT_FLEX(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+                                           traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector,
+                                           unsigned char **yVector, int mxtips, const int states)
+{
+  /* Computes the ancestral probability vector of one inner node for the single
+     site i under CAT, for an arbitrary number of states.
+
+     lVector    per-site scratch buffer; the vector of inner node n starts at
+                offset states * (n - mxtips)
+     eVector    scaling counter, incremented by one when the result is rescaled
+     ki         evolutionary rate multiplied onto both branch lengths
+     qz, rz     branch lengths (already log-transformed by the caller) towards
+                the two children
+     ti         traversal descriptor entry naming parent (pNumber) and the two
+                children (qNumber, rNumber) as well as the tip case
+     EIGN/EI/EV eigenvalues and the two matrices of the eigen-decomposition
+     tipVector  lookup table of tip probability vectors, indexed by the
+                character code stored in yVector[node][i] */
+
+  /* temporary work space, one entry per state */
+  double *leftDiag  = (double *)rax_malloc(sizeof(double) * states);
+  double *rightDiag = (double *)rax_malloc(sizeof(double) * states);
+  double *product   = (double *)rax_malloc(sizeof(double) * states);
+
+  double *left;
+  double *right;
+  double *parent;
+  double leftLen;
+  double rightLen;
+
+  int needScaling;
+  int row;
+  int col;
+
+  const int pNumber = ti->pNumber;
+  const int rNumber = ti->rNumber;
+  const int qNumber = ti->qNumber;
+
+  /* slot of the parent vector inside the per-site buffer */
+  parent = &lVector[states * (pNumber - mxtips)];
+
+  /* locate the two child vectors: tips come from the lookup table,
+     inner nodes from the per-site buffer */
+  if(ti->tipCase == PLL_TIP_TIP)
+    {
+      left  = &(tipVector[states * yVector[qNumber][i]]);
+      right = &(tipVector[states * yVector[rNumber][i]]);
+    }
+  else if(ti->tipCase == PLL_TIP_INNER)
+    {
+      left  = &(tipVector[states * yVector[qNumber][i]]);
+      right = &(lVector[states * (rNumber - mxtips)]);
+    }
+  else if(ti->tipCase == PLL_INNER_INNER)
+    {
+      left  = &(lVector[states * (qNumber - mxtips)]);
+      right = &(lVector[states * (rNumber - mxtips)]);
+    }
+  else
+    {
+      assert(0);
+    }
+
+  /* scale both branch lengths by the per-site rate */
+  leftLen  = qz * ki;
+  rightLen = rz * ki;
+
+  /* apply the exponentiated eigenvalues; entry 0 is copied unchanged */
+  leftDiag[0]  = left[0];
+  rightDiag[0] = right[0];
+
+  for(col = 1; col < states; col++)
+    {
+      leftDiag[col]  = left[col]  * exp(EIGN[col] * leftLen);
+      rightDiag[col] = right[col] * exp(EIGN[col] * rightLen);
+    }
+
+  /* for every state: project both children through EI and multiply */
+  for(row = 0; row < states; row++)
+    {
+      double accLeft  = 0.0;
+      double accRight = 0.0;
+
+      for(col = 0; col < states; col++)
+        {
+          accLeft  += leftDiag[col]  * EI[row * states + col];
+          accRight += rightDiag[col] * EI[row * states + col];
+        }
+
+      product[row] = accLeft * accRight;
+    }
+
+  /* clear the parent vector, then accumulate the product through EV */
+  for(col = 0; col < states; col++)
+    parent[col] = 0.0;
+
+  for(row = 0; row < states; row++)
+    for(col = 0; col < states; col++)
+      parent[col] += product[row] * EV[states * row + col];
+
+  /* rescaling is needed only if every entry lies strictly inside
+     (PLL_MINUSMINLIKELIHOOD, PLL_MINLIKELIHOOD) */
+  needScaling = 1;
+  for(col = 0; col < states; col++)
+    {
+      if(!((parent[col] < PLL_MINLIKELIHOOD) && (parent[col] > PLL_MINUSMINLIKELIHOOD)))
+        {
+          needScaling = 0;
+          break;
+        }
+    }
+
+  /* multiply all entries by PLL_TWOTOTHE256 and count the event so that the
+     caller can undo it when forming the log likelihood */
+  if(needScaling)
+    {
+      for(col = 0; col < states; col++)
+        parent[col] *= PLL_TWOTOTHE256;
+      *eVector += 1;
+    }
+
+  rax_free(leftDiag);
+  rax_free(rightDiag);
+  rax_free(product);
+}
+
+
+/* The following function computes the per-site log likelihood of a given site i at the virtual root of the tree.
+   As input it takes the index i of the site, the evolutionary rate ki (for computing Q^(rt) where r = ki),
+   the traversal descriptor defining the full tree traversal (Felsenstein pruning algorithm),
+   the branch length at the root qz, the weight of the site pattern w, i.e., how many identical sites have been compressed 
+   into the current site pattern, the eigenvalues etc. (EIGN, EI, EV) associated with the eigenvector/eigenvalue decomposition 
+   of the given instantaneous substitution matrix Q, the tipVector lookup table for obtaining tip probability vectors, 
+   a pointer to the raw sequence data at the tips, a branch index (to get the correct branch length/index into the correct branch 
+   if -M is used, i.e., a per-partition branch length estimate is deployed), and finally the maximum number of tips in the comprehensive tree 
+   as well as the number of states in the current model. */
+
+#if (!defined(__SSE3) && !defined(__AVX))
+static double evaluatePartialCAT_FLEX(int i, double ki, int counter,  traversalInfo *ti, double qz,
+				      int w, double *EIGN, double *EI, double *EV,
+				      double *tipVector, unsigned  char **yVector, 
+				      int branchReference, int mxtips, const int states)
+{
+  /* Returns the weighted log likelihood of site i under CAT for an arbitrary
+     number of states.
+
+     i                site index used to look up the tip characters
+     ki               evolutionary rate of the site
+     counter          number of entries in the traversal descriptor ti
+     ti               traversal descriptor; ti[0] describes the root branch and
+                      ti[0].pNumber must be a tip
+     qz               branch length at the virtual root (not log-transformed)
+     w                site pattern weight the result is multiplied with
+     EIGN, EI, EV     eigen-decomposition of the substitution matrix
+     tipVector        lookup table of tip probability vectors
+     yVector          per-node character codes
+     branchReference  index into the per-branch length arrays of ti[k]
+     mxtips           number of tips
+     states           number of states of the data type */
+
+  int 
+    scale = 0, 
+    k;
+  
+  double 
+    /* lVector is a temporary buffer to store the ancestral probability vectors of 
+       a single site, thus we allocate states * mxtips space for storing probability values.
+       (states * (mxtips - 2)) entries would presumably suffice; the larger size is kept
+       from the original code. */
+    * lVector = NULL,   
+    * d = NULL,
+    lz, 
+    term, 
+    *x1, 
+    *x2; 
+
+  traversalInfo 
+    *trav = &ti[0];
+ 
+  rax_posix_memalign ((void **)&lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * states * mxtips);
+  rax_posix_memalign ((void **)&d,       PLL_BYTE_ALIGNMENT, sizeof(double) * states);
+
+  /* both pointers were initialised to NULL above, so an allocation that did
+     not set them is caught here instead of being dereferenced below */
+
+  assert(lVector != NULL && d != NULL);
+
+  /* make sure that at one end of the branch into which we have placed the virtual root 
+     there actually is a tip! */
+
+  assert(isTip(trav->pNumber, mxtips));
+     
+  /* for the tip we already have the data, so just set the left probability vector to the 
+     corresponding address in the pre-computed tipVector[] lookup table */
+
+  x1 = &(tipVector[states *  yVector[trav->pNumber][i]]);   
+
+  /* now iterate over the traversal descriptor that contains the nodes of the tree in the order required 
+     by the Felsenstein pruning algorithm */
+
+  for(k = 1; k < counter; k++)    
+    {
+      /* obtain the branch lengths, clamp them at PLL_ZMIN and take the logarithms;
+         distinct names are used so that the root branch length qz is not shadowed */
+      
+      double 
+	childQz = ti[k].qz[branchReference],
+	childRz = ti[k].rz[branchReference];
+      
+      childQz = (childQz > PLL_ZMIN) ? log(childQz) : log(PLL_ZMIN);
+      childRz = (childRz > PLL_ZMIN) ? log(childRz) : log(PLL_ZMIN);
+
+      /* invoke essentially a newview() for one site on the entry k of the traversal descriptor */
+
+      computeVectorCAT_FLEX(lVector, &scale, ki, i, childQz, childRz, &ti[k], 
+			    EIGN, EI, EV, 
+			    tipVector, yVector, mxtips, states);       
+    }
+   
+  /* the ancestral probability values for site i at the node to the right of the virtual root 
+     are now available, so point x2 at the corresponding entry */
+
+  x2 = &lVector[states * (trav->qNumber - mxtips)]; 
+
+  /* a paranoid assertion */
+
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+ 
+  /* now just compute the log likelihood score of this site.
+     The root branch length is clamped at PLL_ZMIN before taking the logarithm, exactly as
+     for the inner branches above; previously the clamped value was overwritten by log(qz),
+     so a branch length below PLL_ZMIN was never clamped. */
+      
+  lz  = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+  lz *= ki;  
+  
+  d[0] = 1.0; 
+
+  for(k = 1; k < states; k++)
+    d[k] = exp (EIGN[k] * lz);
+  
+  term = 0.0;
+
+  for(k = 0; k < states; k++) 
+    term += x1[k] * x2[k] * d[k];       
+
+  /* note the "scale * log(PLL_MINLIKELIHOOD)" term here which we use to undo/revert the scaling multiplications 
+     such that we obtain a correct log likelihood score. The integer variable scale contains the number of times 
+     we had to scale (multiply by 2^256) for site i only during a full tree traversal using Felsenstein's algorithm */
+
+  term = log(fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  /* multiply with the site pattern weight (site pattern compression factor) */
+
+  term = term * w;
+
+  /* free the memory space used for likelihood computations on this site */
+
+  rax_free(lVector);  
+  rax_free(d);
+
+  return  term;
+}
+#endif
+
+/* This is the top-level function that can be called from other parts of the code.
+   As input it takes the tree data structure, the partition list, the site index, the evolutionary rate ki, 
+   and the model index (partition index). It will return the 
+   log likelihood of site i. 
+   An important pre-condition is that the tree traversal descriptor must contain 
+   a full tree traversal starting at a tip!
+
+   Note that, if you want to obtain per-site log likelihoods for other altered model parameters such 
+   as the Q matrix, you will have to re-invoke the eigenvalue/eigenvector decomposition prior 
+   to calling the function below.
+*/
+
+double evaluatePartialGeneric (pllInstance *tr, partitionList *pr, int i, double ki, int _model)
+{
+  /* Dispatches the per-site log likelihood computation of site i in partition
+     _model to the implementation matching the build (generic versus SSE3/AVX),
+     the number of states and the rate heterogeneity model. */
+
+  /* number of states of the data type in this partition */
+  const int states = pr->partitionData[_model]->states;
+
+  /* traversal descriptor of the tree and the number of entries it holds */
+  traversalInfo *trav = tr->td[0].ti;
+  const int nodes = tr->td[0].count;
+
+  int branchReference;
+  double result;
+
+  /* SOS ATTENTION: note the different indexing used for the parallel and sequential versions ! */
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  const int index = i;
+#else
+  const int index = i - pr->partitionData[_model]->lower;
+#endif
+
+  /* with per-partition branch lengths and more than one partition each partition
+     uses its own branch length slot, otherwise all of them share slot 0 */
+
+  branchReference = (pr->perGeneBranchLengths && pr->numberOfPartitions>1) ? _model : 0;
+
+#if (!defined(__SSE3) && !defined(__AVX))
+
+  /* the generic build only offers the CAT implementation */
+
+  if(tr->rateHetModel == PLL_CAT)
+    {
+      result = evaluatePartialCAT_FLEX(index, ki, nodes, trav, trav[0].qz[branchReference],
+                                       pr->partitionData[_model]->wgt[index],
+                                       pr->partitionData[_model]->EIGN,
+                                       pr->partitionData[_model]->EI,
+                                       pr->partitionData[_model]->EV,
+                                       pr->partitionData[_model]->tipVector,
+                                       pr->partitionData[_model]->yVector, branchReference, tr->mxtips, states);
+    }
+  else
+    {
+      /* the per-site likelihood function should only be called for the CAT model here */
+      assert(0);
+    }
+
+#else
+
+  /* pick the kernel by the number of states of the partition */
+
+  switch(states)
+    {
+    case 2:   /* BINARY */
+      assert(!tr->saveMemory);
+      assert(tr->rateHetModel == PLL_CAT);
+
+      result = evaluatePartialGTRCAT_BINARY(index, ki, nodes, trav,
+                                            trav[0].qz[branchReference],
+                                            pr->partitionData[_model]->wgt[index],
+                                            pr->partitionData[_model]->EIGN,
+                                            pr->partitionData[_model]->EI,
+                                            pr->partitionData[_model]->EV,
+                                            pr->partitionData[_model]->tipVector,
+                                            pr->partitionData[_model]->yVector,
+                                            branchReference,
+                                            tr->mxtips);
+      break;
+
+    case 4:   /* DNA */
+      if(tr->rateHetModel == PLL_CAT)
+        {
+          result = evaluatePartialGTRCAT(index, ki, nodes, trav, trav[0].qz[branchReference],
+                                         pr->partitionData[_model]->wgt[index],
+                                         pr->partitionData[_model]->EIGN,
+                                         pr->partitionData[_model]->EI,
+                                         pr->partitionData[_model]->EV,
+                                         pr->partitionData[_model]->tipVector,
+                                         pr->partitionData[_model]->yVector, branchReference, tr->mxtips);
+        }
+      else
+        {
+          result = evaluatePartialGTRGAMMA(index, nodes, trav, trav[0].qz[branchReference],
+                                           pr->partitionData[_model]->wgt[index],
+                                           pr->partitionData[_model]->EIGN,
+                                           pr->partitionData[_model]->EI,
+                                           pr->partitionData[_model]->EV,
+                                           pr->partitionData[_model]->tipVector,
+                                           pr->partitionData[_model]->yVector,
+                                           pr->partitionData[_model]->gammaRates,
+                                           branchReference, tr->mxtips);
+        }
+      break;
+
+    case 20: /* proteins */
+      if(tr->rateHetModel == PLL_CAT)
+        {
+          result = evaluatePartialGTRCATPROT(index, ki, nodes, trav, trav[0].qz[branchReference],
+                                             pr->partitionData[_model]->wgt[index],
+                                             pr->partitionData[_model]->EIGN,
+                                             pr->partitionData[_model]->EI,
+                                             pr->partitionData[_model]->EV,
+                                             pr->partitionData[_model]->tipVector,
+                                             pr->partitionData[_model]->yVector, branchReference, tr->mxtips);
+        }
+      else
+        {
+          result = evaluatePartialGTRGAMMAPROT(index, nodes, trav, trav[0].qz[branchReference],
+                                               pr->partitionData[_model]->wgt[index],
+                                               pr->partitionData[_model]->EIGN,
+                                               pr->partitionData[_model]->EI,
+                                               pr->partitionData[_model]->EV,
+                                               pr->partitionData[_model]->tipVector,
+                                               pr->partitionData[_model]->yVector,
+                                               pr->partitionData[_model]->gammaRates,
+                                               branchReference, tr->mxtips);
+        }
+      break;
+
+    default:
+      assert(0);
+    }
+
+#endif
+
+  return result;
+}
+
+#if (defined(__SSE3) || defined(__AVX))
+/* Optimized function implementations for computing per-site log likelihoods under CAT and GAMMA for binary, 
+   DNA and protein data. 
+   The structure is analogous to the generic implementation above, with some data- and model-specific optimizations and vectorizations.
+*/
+
+static __inline void computeVectorGTRCAT_BINARY(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+                                                traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector,
+                                                unsigned char **yVector, int mxtips)
+{
+  /* Two-state version of the single-site ancestral vector computation under CAT.
+     The vector of inner node n lives at lVector[2 * (n - mxtips)]; *eVector is
+     incremented by one whenever the result has to be rescaled. */
+
+  const int pNumber = ti->pNumber;
+  const int rNumber = ti->rNumber;
+  const int qNumber = ti->qNumber;
+
+  double *left;
+  double *right;
+  double *parent = &lVector[2 * (pNumber - mxtips)];
+
+  double leftTerm, rightTerm;   /* second child entry times the exponentiated eigenvalue */
+  double product[2];
+  int row, col;
+
+  /* tips are read from the lookup table, inner nodes from the per-site buffer */
+  if(ti->tipCase == PLL_TIP_TIP)
+    {
+      left  = &(tipVector[2 * yVector[qNumber][i]]);
+      right = &(tipVector[2 * yVector[rNumber][i]]);
+    }
+  else if(ti->tipCase == PLL_TIP_INNER)
+    {
+      left  = &(tipVector[2 * yVector[qNumber][i]]);
+      right = &lVector[2 * (rNumber - mxtips)];
+    }
+  else if(ti->tipCase == PLL_INNER_INNER)
+    {
+      left  = &lVector[2 * (qNumber - mxtips)];
+      right = &lVector[2 * (rNumber - mxtips)];
+    }
+  else
+    {
+      assert(0);
+    }
+
+  /* branch lengths are scaled by the per-site rate ki */
+  leftTerm  = left[1]  * exp(EIGN[1] * (qz * ki));
+  rightTerm = right[1] * exp(EIGN[1] * (rz * ki));
+
+  for(row = 0; row < 2; row++)
+    {
+      double accLeft  = left[0];
+      double accRight = right[0];
+
+      accLeft  += leftTerm  * EI[row * 2 + 1];
+      accRight += rightTerm * EI[row * 2 + 1];
+
+      product[row] = accLeft * accRight;
+    }
+
+  parent[0] = 0.0;
+  parent[1] = 0.0;
+
+  for(row = 0; row < 2; row++)
+    for(col = 0; col < 2; col++)
+      parent[col] += product[row] * EV[2 * row + col];
+
+  /* rescale only when both entries lie strictly inside
+     (PLL_MINUSMINLIKELIHOOD, PLL_MINLIKELIHOOD) */
+  if((parent[0] < PLL_MINLIKELIHOOD) && (parent[0] > PLL_MINUSMINLIKELIHOOD) &&
+     (parent[1] < PLL_MINLIKELIHOOD) && (parent[1] > PLL_MINUSMINLIKELIHOOD))
+    {
+      parent[0] *= PLL_TWOTOTHE256;
+      parent[1] *= PLL_TWOTOTHE256;
+      *eVector += 1;
+    }
+}
+
+static double evaluatePartialGTRCAT_BINARY(int i, double ki, int counter,  traversalInfo *ti, double qz,
+					   int w, double *EIGN, double *EI, double *EV,
+					   double *tipVector, unsigned  char **yVector, 
+					   int branchReference, int mxtips)
+{
+  /* Returns the weighted log likelihood of site i for two-state data under CAT.
+     ti[0] describes the root branch (its pNumber must be a tip), ti[1..counter-1]
+     are processed in order to build the ancestral vectors; qz is the root branch
+     length (not log-transformed), ki the per-site rate and w the pattern weight. */
+
+  double lz, term;       
+  double  d;
+  double   *x1, *x2; 
+  int scale = 0, k;
+  /* per-site scratch buffer: two entries per node slot */
+  double *lVector = malloc(sizeof *lVector * 2 * mxtips);  
+  traversalInfo *trav = &ti[0];
+ 
+  /* do not dereference a failed allocation */
+  assert(lVector != NULL);
+
+  assert(isTip(trav->pNumber, mxtips));
+     
+  x1 = &(tipVector[2 *  yVector[trav->pNumber][i]]);   
+
+  for(k = 1; k < counter; k++)  
+    {
+      /* clamp at PLL_ZMIN and take logarithms; named so that the root branch
+         length qz is not shadowed */
+      double 
+	childQz = ti[k].qz[branchReference],
+	childRz = ti[k].rz[branchReference];
+      
+      childQz = (childQz > PLL_ZMIN) ? log(childQz) : log(PLL_ZMIN);
+      childRz = (childRz > PLL_ZMIN) ? log(childRz) : log(PLL_ZMIN);
+
+      computeVectorGTRCAT_BINARY(lVector, &scale, ki, i, childQz, childRz, &ti[k], 
+				 EIGN, EI, EV, 
+				 tipVector, yVector, mxtips);       
+    }
+   
+  x2 = &lVector[2 * (trav->qNumber - mxtips)];
+     
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+       
+  /* clamp the root branch length at PLL_ZMIN before taking the logarithm, as is
+     done for the inner branches; previously the clamp was overwritten by log(qz) */
+  lz  = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+  lz *= ki;  
+  
+  d = exp(EIGN[1] * lz);
+  
+  term =  x1[0] * x2[0];
+  term += x1[1] * x2[1] * d; 
+
+  /* scale * log(PLL_MINLIKELIHOOD) undoes the rescaling multiplications counted in scale */
+  term = log(fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  term = term * w;
+
+  free(lVector);
+  
+  return  term;
+}
+
+
+
+static __inline void computeVectorGTRGAMMAPROT(double *lVector, int *eVector, double *gammaRates, int i, double qz, double rz,
+                                               traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector,
+                                               unsigned  char **yVector, int mxtips)
+{
+  /* Single-site ancestral vector computation for 20-state data with four rate
+     categories (gammaRates[0..3]). Inner node n occupies 80 doubles starting at
+     lVector[80 * (n - mxtips)] (20 per category); a tip contributes the same
+     20-entry tipVector row to every category. *eVector is incremented by one
+     when the 80 result entries are rescaled. */
+
+  const int pNumber = ti->pNumber;
+  const int rNumber = ti->rNumber;
+  const int qNumber = ti->qNumber;
+
+  double *left;
+  double *right;
+  double *parent = &(lVector[80 * (pNumber - mxtips)]);
+
+  /* distance in doubles between consecutive categories of a child:
+     0 for a tip (shared row), 20 for an inner node */
+  int leftStep;
+  int rightStep;
+
+  int cat, row, col, idx, needScaling;
+
+  PLL_ALIGN_BEGIN double expLeft[20]  PLL_ALIGN_END;
+  PLL_ALIGN_BEGIN double expRight[20] PLL_ALIGN_END;
+  PLL_ALIGN_BEGIN double dLeft[20]    PLL_ALIGN_END;
+  PLL_ALIGN_BEGIN double dRight[20]   PLL_ALIGN_END;
+
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:
+      left      = &(tipVector[20 * yVector[qNumber][i]]);
+      right     = &(tipVector[20 * yVector[rNumber][i]]);
+      leftStep  = 0;
+      rightStep = 0;
+      break;
+    case PLL_TIP_INNER:
+      left      = &(tipVector[20 * yVector[qNumber][i]]);
+      right     = &(lVector[80 * (rNumber - mxtips)]);
+      leftStep  = 0;
+      rightStep = 20;
+      break;
+    case PLL_INNER_INNER:
+      left      = &(lVector[80 * (qNumber - mxtips)]);
+      right     = &(lVector[80 * (rNumber - mxtips)]);
+      leftStep  = 20;
+      rightStep = 20;
+      break;
+    default:
+      assert(0);
+    }
+
+  for(cat = 0; cat < 4; cat++)
+    {
+      const double leftLen  = qz * gammaRates[cat];
+      const double rightLen = rz * gammaRates[cat];
+
+      double *leftCat  = &left[leftStep * cat];
+      double *rightCat = &right[rightStep * cat];
+      double *out      = &parent[cat * 20];
+
+      __m128d zero;
+
+      /* exponentiated eigenvalues for this category; entry 0 is fixed to 1 */
+      expLeft[0]  = 1.0;
+      expRight[0] = 1.0;
+
+      for(col = 1; col < 20; col++)
+        {
+          expLeft[col]  = exp(EIGN[col] * leftLen);
+          expRight[col] = exp(EIGN[col] * rightLen);
+        }
+
+      /* child vectors multiplied element-wise by the exponentials */
+      for(col = 0; col < 20; col += 2)
+        {
+          _mm_store_pd(&dLeft[col],  _mm_mul_pd(_mm_load_pd(&leftCat[col]),  _mm_load_pd(&expLeft[col])));
+          _mm_store_pd(&dRight[col], _mm_mul_pd(_mm_load_pd(&rightCat[col]), _mm_load_pd(&expRight[col])));
+        }
+
+      zero = _mm_setzero_pd();
+
+      for(col = 0; col < 20; col += 2)
+        _mm_store_pd(&out[col], zero);
+
+      for(row = 0; row < 20; row++)
+        {
+          double *eiRow = &EI[20 * row];
+          double *evRow = &EV[row * 20];
+          __m128d sumLeft  = _mm_setzero_pd();
+          __m128d sumRight = _mm_setzero_pd();
+          __m128d both;
+
+          /* dot products of row `row` of EI with both children, two lanes at a time */
+          for(col = 0; col < 20; col += 2)
+            {
+              __m128d eiv = _mm_load_pd(&eiRow[col]);
+
+              sumLeft  = _mm_add_pd(sumLeft,  _mm_mul_pd(_mm_load_pd(&dLeft[col]),  eiv));
+              sumRight = _mm_add_pd(sumRight, _mm_mul_pd(_mm_load_pd(&dRight[col]), eiv));
+            }
+
+          /* horizontal add leaves the full dot product in both lanes */
+          sumLeft  = _mm_hadd_pd(sumLeft,  sumLeft);
+          sumRight = _mm_hadd_pd(sumRight, sumRight);
+
+          both = _mm_mul_pd(sumLeft, sumRight);
+
+          /* accumulate the product times row `row` of EV into the parent */
+          for(col = 0; col < 20; col += 2)
+            {
+              __m128d acc = _mm_load_pd(&out[col]);
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(both, _mm_load_pd(&evRow[col])));
+              _mm_store_pd(&out[col], acc);
+            }
+        }
+    }
+
+  /* rescale only if all 80 entries lie strictly inside
+     (PLL_MINUSMINLIKELIHOOD, PLL_MINLIKELIHOOD) */
+  needScaling = 1;
+  for(idx = 0; idx < 80; idx++)
+    {
+      if(!((parent[idx] < PLL_MINLIKELIHOOD) && (parent[idx] > PLL_MINUSMINLIKELIHOOD)))
+        {
+          needScaling = 0;
+          break;
+        }
+    }
+
+  if(needScaling)
+    {
+      __m128d factor = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+      for(idx = 0; idx < 80; idx += 2)
+        _mm_store_pd(&parent[idx], _mm_mul_pd(_mm_load_pd(&parent[idx]), factor));
+
+      *eVector += 1;
+    }
+}
+
+static  void computeVectorGTRGAMMA(double *lVector, int *eVector, double *gammaRates, int i, double qz, double rz,
+                                   traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector,
+                                   unsigned  char **yVector, int mxtips)
+{
+  /* Single-site ancestral vector computation for 4-state data with four rate
+     categories (gammaRates[0..3]). Inner node n occupies 16 doubles starting at
+     lVector[16 * (n - mxtips)] (4 per category); a tip contributes the same
+     4-entry tipVector row to every category. *eVector is incremented by one
+     when the 16 result entries are rescaled. */
+
+  const int pNumber = ti->pNumber;
+  const int rNumber = ti->rNumber;
+  const int qNumber = ti->qNumber;
+
+  double *left;
+  double *right;
+  double *parent = &(lVector[16 * (pNumber - mxtips)]);
+
+  /* distance in doubles between consecutive categories of a child:
+     0 for a tip (shared row), 4 for an inner node */
+  int leftStep;
+  int rightStep;
+
+  int cat, row, col, idx, needScaling;
+
+  /* only the first four entries of each buffer are used */
+  PLL_ALIGN_BEGIN double expLeft[20]  PLL_ALIGN_END;
+  PLL_ALIGN_BEGIN double expRight[20] PLL_ALIGN_END;
+  PLL_ALIGN_BEGIN double dLeft[20]    PLL_ALIGN_END;
+  PLL_ALIGN_BEGIN double dRight[20]   PLL_ALIGN_END;
+
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:
+      left      = &(tipVector[4 * yVector[qNumber][i]]);
+      right     = &(tipVector[4 * yVector[rNumber][i]]);
+      leftStep  = 0;
+      rightStep = 0;
+      break;
+    case PLL_TIP_INNER:
+      left      = &(tipVector[4 * yVector[qNumber][i]]);
+      right     = &(lVector[16 * (rNumber - mxtips)]);
+      leftStep  = 0;
+      rightStep = 4;
+      break;
+    case PLL_INNER_INNER:
+      left      = &(lVector[16 * (qNumber - mxtips)]);
+      right     = &(lVector[16 * (rNumber - mxtips)]);
+      leftStep  = 4;
+      rightStep = 4;
+      break;
+    default:
+      assert(0);
+    }
+
+  for(cat = 0; cat < 4; cat++)
+    {
+      const double leftLen  = qz * gammaRates[cat];
+      const double rightLen = rz * gammaRates[cat];
+
+      double *leftCat  = &left[leftStep * cat];
+      double *rightCat = &right[rightStep * cat];
+      double *out      = &parent[cat * 4];
+
+      __m128d zero;
+
+      /* exponentiated eigenvalues for this category; entry 0 is fixed to 1 */
+      expLeft[0]  = 1.0;
+      expRight[0] = 1.0;
+
+      for(col = 1; col < 4; col++)
+        {
+          expLeft[col]  = exp(EIGN[col] * leftLen);
+          expRight[col] = exp(EIGN[col] * rightLen);
+        }
+
+      /* child vectors multiplied element-wise by the exponentials */
+      for(col = 0; col < 4; col += 2)
+        {
+          _mm_store_pd(&dLeft[col],  _mm_mul_pd(_mm_load_pd(&leftCat[col]),  _mm_load_pd(&expLeft[col])));
+          _mm_store_pd(&dRight[col], _mm_mul_pd(_mm_load_pd(&rightCat[col]), _mm_load_pd(&expRight[col])));
+        }
+
+      zero = _mm_setzero_pd();
+
+      for(col = 0; col < 4; col += 2)
+        _mm_store_pd(&out[col], zero);
+
+      for(row = 0; row < 4; row++)
+        {
+          double *eiRow = &EI[4 * row];
+          double *evRow = &EV[row * 4];
+          __m128d sumLeft  = _mm_setzero_pd();
+          __m128d sumRight = _mm_setzero_pd();
+          __m128d both;
+
+          /* dot products of row `row` of EI with both children, two lanes at a time */
+          for(col = 0; col < 4; col += 2)
+            {
+              __m128d eiv = _mm_load_pd(&eiRow[col]);
+
+              sumLeft  = _mm_add_pd(sumLeft,  _mm_mul_pd(_mm_load_pd(&dLeft[col]),  eiv));
+              sumRight = _mm_add_pd(sumRight, _mm_mul_pd(_mm_load_pd(&dRight[col]), eiv));
+            }
+
+          /* horizontal add leaves the full dot product in both lanes */
+          sumLeft  = _mm_hadd_pd(sumLeft,  sumLeft);
+          sumRight = _mm_hadd_pd(sumRight, sumRight);
+
+          both = _mm_mul_pd(sumLeft, sumRight);
+
+          /* accumulate the product times row `row` of EV into the parent */
+          for(col = 0; col < 4; col += 2)
+            {
+              __m128d acc = _mm_load_pd(&out[col]);
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(both, _mm_load_pd(&evRow[col])));
+              _mm_store_pd(&out[col], acc);
+            }
+        }
+    }
+
+  /* rescale only if the magnitude of all 16 entries is below PLL_MINLIKELIHOOD */
+  needScaling = 1;
+  for(idx = 0; idx < 16; idx++)
+    {
+      if(!(PLL_ABS(parent[idx]) < PLL_MINLIKELIHOOD))
+        {
+          needScaling = 0;
+          break;
+        }
+    }
+
+  if(needScaling)
+    {
+      __m128d factor = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+      for(idx = 0; idx < 16; idx += 2)
+        _mm_store_pd(&parent[idx], _mm_mul_pd(_mm_load_pd(&parent[idx]), factor));
+
+      *eVector += 1;
+    }
+}
+
+
+static double evaluatePartialGTRGAMMAPROT(int i, int counter,  traversalInfo *ti, double qz,
+					  int w, double *EIGN, double *EI, double *EV,
+					  double *tipVector, unsigned char **yVector, 
+					  double *gammaRates,
+					  int branchReference, int mxtips)
+{
+  /* Returns the weighted log likelihood of site i for 20-state data with four
+     rate categories (gammaRates[0..3]). ti[0] describes the root branch (its
+     pNumber must be a tip), ti[1..counter-1] are processed in order to build
+     the ancestral vectors; qz is the root branch length (not log-transformed)
+     and w the site pattern weight. */
+
+  double lz, term;       
+  double  d[80];
+  double   *x1, *x2; 
+  int scale = 0, k, l, j;
+
+  /* per-site scratch buffer: 80 doubles (4 categories x 20 states) per node slot */
+  double 
+	  *lVector = NULL;
+  /* aligned private copy of EI, handed to the SSE kernel which uses aligned loads on it */
+  PLL_ALIGN_BEGIN double
+	  myEI[400]  PLL_ALIGN_END;
+
+  traversalInfo 
+    *trav = &ti[0];
+
+  rax_posix_memalign ((void **)&lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * 80 * mxtips);
+
+  /* lVector was initialised to NULL, so an allocation that did not set it is caught here */
+  assert(lVector != NULL);
+
+  memcpy(myEI, EI, sizeof(myEI));
+
+  assert(isTip(trav->pNumber, mxtips));
+     
+  x1 = &(tipVector[20 *  yVector[trav->pNumber][i]]);   
+
+  for(k = 1; k < counter; k++)                
+    {
+      /* clamp at PLL_ZMIN and take logarithms; named so that the root branch
+         length qz is not shadowed */
+      double 
+	childQz = ti[k].qz[branchReference],
+	childRz = ti[k].rz[branchReference];
+      
+      childQz = (childQz > PLL_ZMIN) ? log(childQz) : log(PLL_ZMIN);
+      childRz = (childRz > PLL_ZMIN) ? log(childRz) : log(PLL_ZMIN);
+
+      computeVectorGTRGAMMAPROT(lVector, &scale, gammaRates, i, childQz, childRz, 
+				&ti[k], EIGN, myEI, EV, 
+				tipVector, yVector, mxtips);
+    }
+   
+  x2 = &lVector[80 * (trav->qNumber - mxtips)];       
+
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+  
+  /* clamp the root branch length at PLL_ZMIN before taking the logarithm, as is
+     done for the inner branches; previously the clamp was overwritten by log(qz) */
+  lz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+  
+  /* exponentiated eigenvalues per rate category; entry 0 of each category is 1 */
+  for(j = 0; j < 4; j++)
+    {
+      d[20 * j] = 1.0;
+      for(l = 1; l < 20; l++)
+	d[20 * j + l] = exp(EIGN[l] * lz * gammaRates[j]);
+    }
+
+  /* the tip vector x1 is shared by all four categories */
+  for(j = 0, term = 0.0; j < 4; j++)
+    {
+      for(l = 0; l < 20; l++)
+	term += x1[l] * x2[20 * j + l] * d[j * 20 + l];	      
+    }
+  
+  /* 0.25 averages over the four categories; scale * log(PLL_MINLIKELIHOOD)
+     undoes the rescaling multiplications counted in scale */
+  term = log(0.25 * fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  term = term * w;
+
+  rax_free(lVector);
+  
+  return  term;
+}
+
+static double evaluatePartialGTRGAMMA(int i, int counter,  traversalInfo *ti, double qz,
+				      int w, double *EIGN, double *EI, double *EV,
+				      double *tipVector, unsigned char **yVector, 
+				      double *gammaRates,
+				      int branchReference, int mxtips)
+{
+  /* Returns the weighted log likelihood of site i for 4-state data with four
+     rate categories (gammaRates[0..3]). ti[0] describes the root branch (its
+     pNumber must be a tip), ti[1..counter-1] are processed in order to build
+     the ancestral vectors; qz is the root branch length (not log-transformed)
+     and w the site pattern weight. */
+
+  double lz, term;       
+  double  d[16];
+  double   *x1, *x2; 
+  int scale = 0, k, l, j;
+
+  /* per-site scratch buffer: 16 doubles (4 categories x 4 states) per node slot */
+  double 
+	  *lVector = NULL;
+  /* aligned private copy of EI, handed to the SSE kernel which uses aligned loads on it */
+  PLL_ALIGN_BEGIN double
+	  myEI[16]  PLL_ALIGN_END;
+
+  traversalInfo 
+    *trav = &ti[0];
+
+  rax_posix_memalign ((void **)&lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * 16 * mxtips);
+
+  /* lVector was initialised to NULL, so an allocation that did not set it is caught here */
+  assert(lVector != NULL);
+
+  memcpy(myEI, EI, sizeof(myEI));
+
+  assert(isTip(trav->pNumber, mxtips));
+     
+  x1 = &(tipVector[4 *  yVector[trav->pNumber][i]]);   
+
+  for(k = 1; k < counter; k++)                
+    {
+      /* clamp at PLL_ZMIN and take logarithms; named so that the root branch
+         length qz is not shadowed */
+      double 
+	childQz = ti[k].qz[branchReference],
+	childRz = ti[k].rz[branchReference];
+      
+      childQz = (childQz > PLL_ZMIN) ? log(childQz) : log(PLL_ZMIN);
+      childRz = (childRz > PLL_ZMIN) ? log(childRz) : log(PLL_ZMIN);
+
+      computeVectorGTRGAMMA(lVector, &scale, gammaRates, i, childQz, childRz, 
+				&ti[k], EIGN, myEI, EV, 
+				tipVector, yVector, mxtips);
+    }
+   
+  x2 = &lVector[16 * (trav->qNumber - mxtips)];       
+
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
+  
+  /* clamp the root branch length at PLL_ZMIN before taking the logarithm, as is
+     done for the inner branches; previously the clamp was overwritten by log(qz) */
+  lz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+  
+  /* exponentiated eigenvalues per rate category; entry 0 of each category is 1 */
+  for(j = 0; j < 4; j++)
+    {
+      d[4 * j] = 1.0;
+      for(l = 1; l < 4; l++)
+	d[4 * j + l] = exp(EIGN[l] * lz * gammaRates[j]);
+    }
+
+  /* the tip vector x1 is shared by all four categories */
+  for(j = 0, term = 0.0; j < 4; j++)
+    {
+      for(l = 0; l < 4; l++)
+	term += x1[l] * x2[4 * j + l] * d[j * 4 + l];	      
+    }
+
+  /* 0.25 averages over the four categories; scale * log(PLL_MINLIKELIHOOD)
+     undoes the rescaling multiplications counted in scale */
+  term = log(0.25 * fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));   
+
+  term = term * w;
+
+  rax_free(lVector);
+  
+  return  term;
+}
+
+
+
+
+/* Computes the 4-entry vector of node ti->pNumber at site i from the vectors
+ * of ti->qNumber and ti->rNumber and stores it in lVector.  The callers in
+ * this file pass log-transformed branch values as qz and rz; both are
+ * multiplied by ki.  When all four results lie strictly between
+ * PLL_MINUSMINLIKELIHOOD and PLL_MINLIKELIHOOD they are multiplied by
+ * PLL_TWOTOTHE256 and *eVector is incremented. */
+static __inline void computeVectorGTRCAT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+				       traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+				       unsigned char **yVector, int mxtips)
+{       
+  double  d1[3], d2[3],  ump_x1, ump_x2, x1px2[4], lz1, lz2; 
+  double *x1, *x2, *x3;
+  int j, k,
+    pNumber = ti->pNumber,
+    rNumber = ti->rNumber,
+    qNumber = ti->qNumber;
+  x3  = &lVector[4 * (pNumber  - mxtips)];  
+
+  /* each child is read from tipVector (via yVector) or from lVector */
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:     
+      x1 = &(tipVector[4 * yVector[qNumber][i]]);
+      x2 = &(tipVector[4 * yVector[rNumber][i]]);    
+      break;
+    case PLL_TIP_INNER:     
+      x1 = &(tipVector[4 * yVector[qNumber][i]]);
+      x2 = &lVector[4 * (rNumber - mxtips)];           
+      break;
+    case PLL_INNER_INNER:            
+      x1 = &lVector[4 * (qNumber - mxtips)];
+      x2 = &lVector[4 * (rNumber - mxtips)];     
+      break;
+    default:
+      assert(0);
+    }
+     
+  lz1 = qz * ki;  
+  lz2 = rz * ki;
+  /* d1, d2: entries 1..3 of each child multiplied by exp(EIGN[..] * lz) */
+  for(j = 0; j < 3; j++)
+    {
+      d1[j] = 
+	x1[j + 1] * 
+	exp(EIGN[j + 1] * lz1);
+      d2[j] = x2[j + 1] * exp(EIGN[j + 1] * lz2);	    
+    }
+  /* per index j: combine each child with row j of EI, then multiply the sums */
+  for(j = 0; j < 4; j++)
+    {     
+      ump_x1 = x1[0];
+      ump_x2 = x2[0];
+      for(k = 0; k < 3; k++)
+	{
+	  ump_x1 += d1[k] * EI[j * 4 + k + 1];
+	  ump_x2 += d2[k] * EI[j * 4 + k + 1];
+	}
+      x1px2[j] = ump_x1 * ump_x2;
+    }
+  
+  for(j = 0; j < 4; j++)
+    x3[j] = 0.0;
+  for(j = 0; j < 4; j++)          
+    for(k = 0; k < 4; k++)	
+      x3[k] +=  x1px2[j] *  EV[4 * j + k];	   
+  if (x3[0] < PLL_MINLIKELIHOOD && x3[0] > PLL_MINUSMINLIKELIHOOD &&
+      x3[1] < PLL_MINLIKELIHOOD && x3[1] > PLL_MINUSMINLIKELIHOOD &&
+      x3[2] < PLL_MINLIKELIHOOD && x3[2] > PLL_MINUSMINLIKELIHOOD &&
+      x3[3] < PLL_MINLIKELIHOOD && x3[3] > PLL_MINUSMINLIKELIHOOD)
+    {	     
+      x3[0]   *= PLL_TWOTOTHE256;
+      x3[1]   *= PLL_TWOTOTHE256;
+      x3[2]   *= PLL_TWOTOTHE256;     
+      x3[3]   *= PLL_TWOTOTHE256;     
+      *eVector = *eVector + 1;
+    }	              
+  return;
+}
+
+
+
+
+
+
+
+
+/* Log-likelihood contribution of site i for 4-state data with a single rate
+ * factor ki, evaluated on the branch described by ti[0] (pNumber must be a
+ * tip, qNumber an inner node).  Entries ti[1..counter-1] are first applied via
+ * computeVectorGTRCAT into a scratch buffer allocated and freed per call.
+ * Returns w * (log(|sum|) + scale * log(PLL_MINLIKELIHOOD)).
+ * NOTE(review): the outcome of rax_posix_memalign is not checked here. */
+static double evaluatePartialGTRCAT(int i, double ki, int counter,  traversalInfo *ti, double qz,
+				    int w, double *EIGN, double *EI, double *EV,
+				    double *tipVector, unsigned  char **yVector, 
+				    int branchReference, int mxtips)
+{
+  double lz, term;
+  double  d[3];
+  double   *x1, *x2, *lVector = NULL;
+  int scale = 0, k;
+  traversalInfo *trav = &ti[0];
+
+  rax_posix_memalign ((void **) &lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * 4 * mxtips);
+
+  assert(isTip(trav->pNumber, mxtips));
+  x1 = &(tipVector[4 *  yVector[trav->pNumber][i]]);
+
+  /* ti[0] is the evaluated branch itself; the node updates start at index 1 */
+  for(k = 1; k < counter; k++)
+    {
+      double
+	lqz = ti[k].qz[branchReference],
+	lrz = ti[k].rz[branchReference];
+      lqz = (lqz > PLL_ZMIN) ? log(lqz) : log(PLL_ZMIN);
+      lrz = (lrz > PLL_ZMIN) ? log(lrz) : log(PLL_ZMIN);
+      computeVectorGTRCAT(lVector, &scale, ki, i, lqz, lrz, &ti[k],
+			  EIGN, EI, EV,
+			  tipVector, yVector, mxtips);
+    }
+
+  x2 = &lVector[4 * (trav->qNumber - mxtips)];
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);
+
+  /* Clamp at PLL_ZMIN before taking the logarithm, exactly as for the inner
+   * branches above, so that a tiny or zero qz cannot produce -inf or NaN. */
+  lz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+  lz *= ki;
+
+  d[0] = exp (EIGN[1] * lz);
+  d[1] = exp (EIGN[2] * lz);
+  d[2] = exp (EIGN[3] * lz);
+
+  term =  x1[0] * x2[0];
+  term += x1[1] * x2[1] * d[0];
+  term += x1[2] * x2[2] * d[1];
+  term += x1[3] * x2[3] * d[2];
+
+  term = log(fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));
+  term = term * w;
+  rax_free(lVector);
+  return  term;
+}
+
+/**********************************************************************************/
+
+/* Computes the 20-entry vector of node ti->pNumber at site i from the vectors
+ * of ti->qNumber and ti->rNumber and stores it in lVector.  The callers in
+ * this file pass log-transformed branch values as qz and rz; both are
+ * multiplied by ki.  When all twenty results lie strictly between
+ * PLL_MINUSMINLIKELIHOOD and PLL_MINLIKELIHOOD they are multiplied by
+ * PLL_TWOTOTHE256 and *eVector is incremented. */
+static __inline void computeVectorGTRCATPROT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
+				       traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
+				       unsigned char **yVector, int mxtips)
+{       
+  double  d1[20], d2[20],  ump_x1, ump_x2, x1px2[20], lz1, lz2; 
+  double *x1, *x2, *x3;
+  int j, k,
+    scale = 1,
+    pNumber = ti->pNumber,
+    rNumber = ti->rNumber,
+    qNumber = ti->qNumber;
+ 
+  x3  = &lVector[20 * (pNumber  - mxtips)];  
+  /* each child is read from tipVector (via yVector) or from lVector */
+  switch(ti->tipCase)
+    {
+    case PLL_TIP_TIP:     
+      x1 = &(tipVector[20 * yVector[qNumber][i]]);
+      x2 = &(tipVector[20 * yVector[rNumber][i]]);    
+      break;
+    case PLL_TIP_INNER:     
+      x1 = &(tipVector[20 * yVector[qNumber][i]]);
+      x2 = &lVector[20 * (rNumber - mxtips)];           
+      break;
+    case PLL_INNER_INNER:            
+      x1 = &lVector[20 * (qNumber - mxtips)];
+      x2 = &lVector[20 * (rNumber - mxtips)];     
+      break;
+    default:
+      assert(0);
+    }
+     
+  lz1 = qz * ki;  
+  lz2 = rz * ki;
+  
+   d1[0] = x1[0];
+   d2[0] = x2[0];
+
+  for(j = 1; j < 20; j++)
+    {
+      d1[j] = x1[j] * exp(EIGN[j] * lz1);
+      d2[j] = x2[j] * exp(EIGN[j] * lz2);	    
+    }
+  /* per index j: combine each child with row j of EI, then multiply the sums */
+  for(j = 0; j < 20; j++)
+    {        
+      ump_x1 = 0;
+      ump_x2 = 0;
+      for(k = 0; k < 20; k++)
+	{
+	  ump_x1 += d1[k] * EI[j * 20 + k];
+	  ump_x2 += d2[k] * EI[j * 20 + k];
+	}
+      x1px2[j] = ump_x1 * ump_x2;
+    }
+  
+  for(j = 0; j < 20; j++)
+    x3[j] = 0.0;
+
+  for(j = 0; j < 20; j++)          
+    for(k = 0; k < 20; k++)	
+      x3[k] +=  x1px2[j] *  EV[20 * j + k];	   
+  /* scale stays 1 only while every entry lies strictly between the bounds */
+  scale = 1;
+  for(k = 0; (k < 20) && scale; k++)    
+    scale = ((x3[k] < PLL_MINLIKELIHOOD) && (x3[k] > PLL_MINUSMINLIKELIHOOD));    
+
+  if(scale)
+    {	        
+      for(k = 0; k < 20; k++)
+	x3[k]   *= PLL_TWOTOTHE256;
+         
+      *eVector = *eVector + 1;
+    }	              
+  return;
+}
+
+
+
+
+
+
+
+
+/* Log-likelihood contribution of site i for 20-state data with a single rate
+ * factor ki, evaluated on the branch described by ti[0] (pNumber must be a
+ * tip, qNumber an inner node).  Entries ti[1..counter-1] are first applied via
+ * computeVectorGTRCATPROT into a scratch buffer allocated and freed per call.
+ * Returns w * (log(|sum|) + scale * log(PLL_MINLIKELIHOOD)).
+ * NOTE(review): the outcome of rax_posix_memalign is not checked here. */
+static double evaluatePartialGTRCATPROT(int i, double ki, int counter,  traversalInfo *ti, double qz,
+				    int w, double *EIGN, double *EI, double *EV,
+				    double *tipVector, unsigned  char **yVector, 
+				    int branchReference, int mxtips)
+{
+  double lz, term;
+  double  d[20];
+  double   *x1, *x2, *lVector = NULL;
+  int scale = 0, k;
+  traversalInfo *trav = &ti[0];
+
+  rax_posix_memalign ((void **)&lVector, PLL_BYTE_ALIGNMENT, sizeof(double) * 20 * mxtips);
+
+  assert(isTip(trav->pNumber, mxtips));
+  x1 = &(tipVector[20 *  yVector[trav->pNumber][i]]);
+
+  /* ti[0] is the evaluated branch itself; the node updates start at index 1 */
+  for(k = 1; k < counter; k++)
+    {
+      double
+	lqz = ti[k].qz[branchReference],
+	lrz = ti[k].rz[branchReference];
+
+      lqz = (lqz > PLL_ZMIN) ? log(lqz) : log(PLL_ZMIN);
+      lrz = (lrz > PLL_ZMIN) ? log(lrz) : log(PLL_ZMIN);
+
+      computeVectorGTRCATPROT(lVector, &scale, ki, i, lqz, lrz, &ti[k],
+			  EIGN, EI, EV,
+			  tipVector, yVector, mxtips);
+    }
+
+  x2 = &lVector[20 * (trav->qNumber - mxtips)];
+  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);
+
+  /* Clamp at PLL_ZMIN before taking the logarithm, exactly as for the inner
+   * branches above, so that a tiny or zero qz cannot produce -inf or NaN. */
+  lz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);
+  lz *= ki;
+
+  d[0] = 1.0;
+  for(k = 1; k < 20; k++)
+    d[k] =  exp (EIGN[k] * lz);
+
+  term =  0.0;
+  for(k = 0; k < 20; k++)
+    term += x1[k] * x2[k] * d[k];
+
+  term = log(fabs(term)) + (scale * log(PLL_MINLIKELIHOOD));
+  term = term * w;
+
+  rax_free(lVector);
+  return  term;
+}
+
+/******************************************/
+
+
+
+#endif
diff --git a/pllrepo/src/fastDNAparsimony.c b/pllrepo/src/fastDNAparsimony.c
new file mode 100644
index 0000000..72900a6
--- /dev/null
+++ b/pllrepo/src/fastDNAparsimony.c
@@ -0,0 +1,1941 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file fastDNAparsimony.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>  
+#endif
+
+#include <limits.h>
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+
+#if defined(__MIC_NATIVE)
+/* 512-bit integer registers (__m512i), 16 ints per vector */
+#include <immintrin.h>
+
+#define INTS_PER_VECTOR 16
+/* longs per 64-byte vector; evaluates to 8 with a 64-bit long */
+#define LONG_INTS_PER_VECTOR (64/sizeof(long))
+#define INT_TYPE __m512i
+#define CAST double*
+#define SET_ALL_BITS_ONE _mm512_set1_epi32(0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO _mm512_setzero_epi32()
+#define VECTOR_LOAD _mm512_load_epi32
+#define VECTOR_STORE  _mm512_store_epi32
+#define VECTOR_BIT_AND _mm512_and_epi32
+#define VECTOR_BIT_OR  _mm512_or_epi32
+#define VECTOR_AND_NOT _mm512_andnot_epi32
+
+#elif defined(__AVX)
+/* 256-bit registers typed as __m256d; bit operations use the _pd intrinsics */
+#include <xmmintrin.h>
+#include <immintrin.h>
+#include <pmmintrin.h>
+
+#define INTS_PER_VECTOR 8
+/* longs per 32-byte vector; evaluates to 4 with a 64-bit long */
+#define LONG_INTS_PER_VECTOR (32/sizeof(long))
+#define INT_TYPE __m256d
+#define CAST double*
+/* The all-ones / all-zero constants are assembled as integer vectors and
+   reinterpreted as __m256d through _mm256_castsi256_pd. */
+#define SET_ALL_BITS_ONE _mm256_castsi256_pd(_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
+#define SET_ALL_BITS_ZERO _mm256_castsi256_pd(_mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000))
+#define VECTOR_LOAD _mm256_load_pd
+#define VECTOR_BIT_AND _mm256_and_pd
+#define VECTOR_BIT_OR  _mm256_or_pd
+#define VECTOR_STORE  _mm256_store_pd
+#define VECTOR_AND_NOT _mm256_andnot_pd
+
+#elif (defined(__SSE3))
+/* 128-bit integer registers (__m128i), 4 ints per vector */
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+  
+#define INTS_PER_VECTOR 4
+#ifdef __i386__
+/* evaluates to 4 with a 32-bit long; both branches use the same expression */
+#define LONG_INTS_PER_VECTOR (16/sizeof(long))
+#else
+/* evaluates to 2 with a 64-bit long */
+#define LONG_INTS_PER_VECTOR (16/sizeof(long))
+#endif
+#define INT_TYPE __m128i
+#define CAST __m128i*
+#define SET_ALL_BITS_ONE _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000)
+#define VECTOR_LOAD _mm_load_si128
+#define VECTOR_BIT_AND _mm_and_si128
+#define VECTOR_BIT_OR  _mm_or_si128
+#define VECTOR_STORE  _mm_store_si128
+#define VECTOR_AND_NOT _mm_andnot_si128
+
+#endif
+
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#if defined (_MSC_VER) /* supply the GCC-style popcount names used below */
+#	if defined ( __SSE4_2__ ) || defined (__AVX__)
+#		include <nmmintrin.h>
+#		define __builtin_popcount _mm_popcnt_u32
+#		define __builtin_popcountl _mm_popcnt_u64
+#	else
+#		include <intrin.h>
+	static __inline uint32_t __builtin_popcount (uint32_t a) {
+		// popcnt instruction not available: count the bits arithmetically
+		uint32_t b = a - ((a >> 1) & 0x55555555);                /* counts per 2-bit field */
+		uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333); /* counts per 4-bit field */
+		uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;                /* counts per byte */
+		uint32_t e = d * 0x01010101;                             /* byte counts summed into the top byte */
+		return   e >> 24;
+	}
+//#		define __builtin_popcount __popcnt
+#		define __builtin_popcountl __popcnt64
+#	endif
+#endif
+
+static pllBoolean tipHomogeneityCheckerPars(pllInstance *tr, nodeptr p, int grouping);
+
+extern const unsigned int mask32[32]; 
+/* vector-specific stuff */
+
+
+extern double masterTime;
+
+/************************************************ pop count stuff ***********************************************/
+/* Returns the number of 1-bits in i, obtained from the compiler's popcount builtin. */
+unsigned int bitcount_32_bit(unsigned int i)
+{
+  const unsigned int ones = (unsigned int) __builtin_popcount(i);
+  return ones;
+}
+/* bit count for 64 bit integers */
+
+//__inline unsigned int bitcount_64_bit(uint64_t i)
+//{
+//  return ((unsigned int) __builtin_popcountl(i));
+//}
+
+/* bit count for 128 bit SSE3 and 256 bit AVX registers */
+
+#if (defined(__SSE3) || defined(__AVX))
+/* vectorPopcount: total number of 1-bits held in one SIMD register.  The
+ * register is stored to an aligned array whose elements are counted one by
+ * one. */
+#ifdef _WIN32
+/* Windows build: count in 32-bit pieces */
+static __inline unsigned int vectorPopcount(INT_TYPE v)
+{
+  PLL_ALIGN_BEGIN unsigned int lanes[INTS_PER_VECTOR] PLL_ALIGN_END;
+  int lane, total = 0;
+
+  VECTOR_STORE((CAST)lanes, v);
+  lane = 0;
+  while(lane < INTS_PER_VECTOR)
+    {
+      total += __builtin_popcount(lanes[lane]);
+      lane++;
+    }
+  return ((unsigned int)total);
+}
+#else
+/* elsewhere: count in pieces of sizeof(long) bytes */
+static __inline unsigned int vectorPopcount(INT_TYPE v)
+{
+  unsigned long
+    lanes[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
+  int lane, total = 0;
+
+  VECTOR_STORE((CAST)lanes, v);
+  lane = 0;
+  while(lane < LONG_INTS_PER_VECTOR)
+    {
+      total += __builtin_popcountl(lanes[lane]);
+      lane++;
+    }
+  return ((unsigned int)total);
+}
+#endif
+
+#endif
+
+
+
+/********************************DNA FUNCTIONS *****************************************************************/
+
+
+/* Returns a constraint label for the subtree at p.  The entry of
+ * tr->constraintVector for p itself is returned when p is a tip or when that
+ * entry differs from -9.  Otherwise the subtree behind p->next is asked
+ * first and the one behind p->next->next only if the first reported -9;
+ * -9 is returned when neither yields another value. */
+static int checkerPars(pllInstance *tr, nodeptr p)
+{
+  const int unlabelled = -9;
+  int label = tr->constraintVector[p->number];
+
+  /* tips, and nodes whose own entry is set, answer for themselves */
+  if(isTip(p->number, tr->mxtips) || label != unlabelled)
+    return label;
+
+  /* first descendant */
+  label = checkerPars(tr, p->next->back);
+
+  /* second descendant, consulted only when the first gave no label */
+  if(label == unlabelled)
+    {
+      label = checkerPars(tr, p->next->next->back);
+    }
+
+  return label;
+}
+
+/* Reports whether every tip reached from p (through p->next->back and
+ * p->next->next->back, recursively) has the entry `grouping` in
+ * tr->constraintVector. */
+static pllBoolean tipHomogeneityCheckerPars(pllInstance *tr, nodeptr p, int grouping)
+{
+  if(!isTip(p->number, tr->mxtips))
+    {
+      /* && short-circuits: the second subtree is skipped once the first fails */
+      return  (tipHomogeneityCheckerPars(tr, p->next->back, grouping) && tipHomogeneityCheckerPars(tr, p->next->next->back,grouping));
+    }
+
+  /* a tip matches only when its own entry equals grouping */
+  return (tr->constraintVector[p->number] == grouping) ? PLL_TRUE : PLL_FALSE;
+}
+
+/* Moves a set xPars value from p->next (checked first) or p->next->next onto
+ * p and clears it on the record that held it. */
+static void getxnodeLocal (nodeptr p)
+{
+  nodeptr holder = p->next;
+  if(!holder->xPars) holder = holder->next;
+  if(holder->xPars)
+    {
+      p->xPars = holder->xPars;
+      holder->xPars = 0;
+    }
+  assert(p->next->xPars || p->next->next->xPars || p->xPars);
+}
+
+/* Appends the update records needed for p to ti, descendants first.  A record
+ * occupies four ints, of which three are written: the numbers of p and of the
+ * nodes behind p->next and p->next->next.  With `full` set every descendant
+ * numbered above maxTips is revisited; otherwise only those whose xPars is
+ * unset.  *counter is the write position in ti and advances by four. */
+static void computeTraversalInfoParsimony(nodeptr p, int *ti, int *counter, int maxTips, pllBoolean full)
+{
+  nodeptr
+    child[2];
+  int
+    c,
+    *slot;
+
+  child[0] = p->next->back;
+  child[1] = p->next->next->back;
+
+  if(! p->xPars)
+    getxnodeLocal(p);
+
+  /* children numbered above maxTips are descended into before p is recorded */
+  for(c = 0; c < 2; c++)
+    {
+      if(child[c]->number > maxTips && (full || !child[c]->xPars))
+        computeTraversalInfoParsimony(child[c], ti, counter, maxTips, full);
+    }
+
+  slot = &ti[*counter];
+  slot[0] = p->number;
+  slot[1] = child[0]->number;
+  slot[2] = child[1]->number;
+  *counter = *counter + 4;
+}
+
+
+
+
+
+
+
+#if (defined(__SSE3) || defined(__AVX))
+
+/* SIMD variant: recomputes the parsimony state vectors and scores of the nodes
+ * listed in tr->ti.  ti[0] holds the number of used ints; records of four ints
+ * (p, q, r, one unused) start at index 4.  For each record and partition the
+ * state vectors of q and r are combined into those of p: bits set in both are
+ * kept, and at bit positions where no state is shared the union is stored and
+ * counted.  parsimonyScore[p] becomes that count plus the scores of q and r.
+ * Relies on VECTOR_AND_NOT(a, b) computing (~a) & b. */
+static void newviewParsimonyIterativeFast(pllInstance *tr, partitionList *pr)
+{    
+  INT_TYPE
+    allOne = SET_ALL_BITS_ONE;
+  int 
+    model,
+    *ti = tr->ti,
+    count = ti[0],
+    index; 
+  for(index = 4; index < count; index += 4)
+    {      
+      unsigned int
+        totalScore = 0;
+      size_t
+        pNumber = (size_t)ti[index],
+        qNumber = (size_t)ti[index + 1],
+        rNumber = (size_t)ti[index + 2];
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          size_t
+            k,
+            states = pr->partitionData[model]->states,
+            width = pr->partitionData[model]->parsimonyLength;
+          unsigned int  
+            i;      
+          switch(states)
+            {
+            case 2:       
+              {
+                parsimonyNumber
+                  *left[2],
+                  *right[2],
+                  *this[2];
+
+                for(k = 0; k < 2; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+                  }
+                /* one vector of INTS_PER_VECTOR words per iteration */
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    INT_TYPE
+                      s_r, s_l, v_N,
+                      l_A, l_C,
+                      v_A, v_C;          
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[0][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[0][i]));
+                    l_A = VECTOR_BIT_AND(s_l, s_r);
+                    v_A = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[1][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[1][i]));
+                    l_C = VECTOR_BIT_AND(s_l, s_r);
+                    v_C = VECTOR_BIT_OR(s_l, s_r);                                                                
+                    
+                    v_N = VECTOR_BIT_OR(l_A, l_C);
+                    /* keep shared states; store the union where none is shared */
+                    VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
+                    VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));                                                                    
+                    /* bits left set mark positions without a shared state */
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);            
+                  }
+              }
+              break;
+            case 4: /* same scheme with four state vectors */
+              {
+                parsimonyNumber
+                  *left[4],
+                  *right[4],
+                  *this[4];
+
+                for(k = 0; k < 4; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    INT_TYPE
+                      s_r, s_l, v_N,
+                      l_A, l_C, l_G, l_T,
+                      v_A, v_C, v_G, v_T;                
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[0][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[0][i]));
+                    l_A = VECTOR_BIT_AND(s_l, s_r);
+                    v_A = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[1][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[1][i]));
+                    l_C = VECTOR_BIT_AND(s_l, s_r);
+                    v_C = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[2][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[2][i]));
+                    l_G = VECTOR_BIT_AND(s_l, s_r);
+                    v_G = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[3][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[3][i]));
+                    l_T = VECTOR_BIT_AND(s_l, s_r);
+                    v_T = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));                                
+                    
+                    VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
+                    VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));
+                    VECTOR_STORE((CAST)(&this[2][i]), VECTOR_BIT_OR(l_G, VECTOR_AND_NOT(v_N, v_G)));
+                    VECTOR_STORE((CAST)(&this[3][i]), VECTOR_BIT_OR(l_T, VECTOR_AND_NOT(v_N, v_T)));                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);  
+                  }
+              }
+              break;
+            case 20: /* same scheme, looping over twenty state vectors */
+              {
+                parsimonyNumber
+                  *left[20],
+                  *right[20],
+                  *this[20];
+
+                for(k = 0; k < 20; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    size_t j;
+                    
+                    INT_TYPE
+                      s_r, s_l, 
+                      v_N = SET_ALL_BITS_ZERO,
+                      l_A[20], 
+                      v_A[20];           
+                    
+                    for(j = 0; j < 20; j++)
+                      {
+                        s_l = VECTOR_LOAD((CAST)(&left[j][i]));
+                        s_r = VECTOR_LOAD((CAST)(&right[j][i]));
+                        l_A[j] = VECTOR_BIT_AND(s_l, s_r);
+                        v_A[j] = VECTOR_BIT_OR(s_l, s_r);
+                        
+                        v_N = VECTOR_BIT_OR(v_N, l_A[j]);
+                      }
+                    
+                    for(j = 0; j < 20; j++)                 
+                      VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);
+                  }
+              }
+              break;
+            default: /* any other state count, limited to 32 by the assert */
+              {
+                parsimonyNumber
+                  *left[32], 
+                  *right[32],
+                  *this[32];
+
+                assert(states <= 32);
+                
+                for(k = 0; k < states; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * states * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    size_t j;
+                    
+                    INT_TYPE
+                      s_r, s_l, 
+                      v_N = SET_ALL_BITS_ZERO,
+                      l_A[32], 
+                      v_A[32];           
+                    
+                    for(j = 0; j < states; j++)
+                      {
+                        s_l = VECTOR_LOAD((CAST)(&left[j][i]));
+                        s_r = VECTOR_LOAD((CAST)(&right[j][i]));
+                        l_A[j] = VECTOR_BIT_AND(s_l, s_r);
+                        v_A[j] = VECTOR_BIT_OR(s_l, s_r);
+                        
+                        v_N = VECTOR_BIT_OR(v_N, l_A[j]);
+                      }
+                    
+                    for(j = 0; j < states; j++)             
+                      VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);
+                  }                             
+              }
+            }            
+        }
+      tr->parsimonyScore[pNumber] = totalScore + tr->parsimonyScore[rNumber] + tr->parsimonyScore[qNumber];      
+    }
+}
+
+
+
+/* SIMD variant: parsimony score across the branch between the nodes tr->ti[1]
+ * and tr->ti[2].  Pending node updates are applied first when tr->ti[0] > 4.
+ * Starting from the two nodes' stored scores, one is added per bit position
+ * at which their state vectors share no state.  Returns early, with the
+ * partial sum, as soon as that sum reaches tr->bestParsimony. */
+static unsigned int evaluateParsimonyIterativeFast(pllInstance *tr, partitionList *pr)
+{
+  INT_TYPE 
+    allOne = SET_ALL_BITS_ONE;
+  size_t 
+    pNumber = (size_t)tr->ti[1],
+    qNumber = (size_t)tr->ti[2];
+  int
+    model;
+  unsigned int 
+    bestScore = tr->bestParsimony,    
+    sum;
+  if(tr->ti[0] > 4)
+    newviewParsimonyIterativeFast(tr, pr);
+
+  sum = tr->parsimonyScore[pNumber] + tr->parsimonyScore[qNumber];
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      size_t
+        k,
+        states = pr->partitionData[model]->states,
+        width  = pr->partitionData[model]->parsimonyLength,
+        i;
+       switch(states)
+         {
+         case 2:
+           {
+             parsimonyNumber
+               *left[2],
+               *right[2];
+             
+             for(k = 0; k < 2; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+               }     
+             /* one vector of INTS_PER_VECTOR words per iteration */
+             for(i = 0; i < width; i += INTS_PER_VECTOR)
+               {                                               
+                 INT_TYPE      
+                   l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
+                   l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),            
+                   v_N = VECTOR_BIT_OR(l_A, l_C);
+                 /* bits left set mark positions without a shared state */
+                 v_N = VECTOR_AND_NOT(v_N, allOne);
+                 
+                 sum += vectorPopcount(v_N);
+                 /* stop once the score can no longer beat the best one */
+                 if(sum >= bestScore)
+                   return sum;                         
+               }
+           }
+           break;
+         case 4: /* same scheme with four state vectors */
+           {
+             parsimonyNumber
+               *left[4],
+               *right[4];
+      
+             for(k = 0; k < 4; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+               }        
+
+             for(i = 0; i < width; i += INTS_PER_VECTOR)
+               {                                                
+                 INT_TYPE      
+                   l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
+                   l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),
+                   l_G = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[2][i])), VECTOR_LOAD((CAST)(&right[2][i]))),
+                   l_T = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[3][i])), VECTOR_LOAD((CAST)(&right[3][i]))),
+                   v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));     
+                 
+                 v_N = VECTOR_AND_NOT(v_N, allOne);
+                 
+                 sum += vectorPopcount(v_N);
+                 
+                 if(sum >= bestScore)            
+                   return sum;          
+               }                 
+           }
+           break;
+         case 20: /* same scheme, looping over twenty state vectors */
+           {
+             parsimonyNumber
+               *left[20],
+               *right[20];
+             
+              for(k = 0; k < 20; k++)
+                {
+                  left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                  right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                }  
+           
+              for(i = 0; i < width; i += INTS_PER_VECTOR)
+                {                              
+                  int 
+                    j;
+                  
+                  INT_TYPE      
+                    l_A,
+                    v_N = SET_ALL_BITS_ZERO;     
+                  
+                  for(j = 0; j < 20; j++)
+                    {
+                      l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
+                      v_N = VECTOR_BIT_OR(l_A, v_N);
+                    }
+                  
+                  v_N = VECTOR_AND_NOT(v_N, allOne);
+                  
+                  sum += vectorPopcount(v_N);          
+                  
+                  if(sum >= bestScore)      
+                    return sum;                        
+                }
+           }
+           break;
+         default: /* any other state count, limited to 32 by the assert */
+           {
+             parsimonyNumber
+               *left[32],  
+               *right[32]; 
+
+             assert(states <= 32);
+
+             for(k = 0; k < states; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+               }  
+           
+             for(i = 0; i < width; i += INTS_PER_VECTOR)
+               {                               
+                 size_t
+                   j;
+                 
+                 INT_TYPE      
+                   l_A,
+                   v_N = SET_ALL_BITS_ZERO;     
+                 
+                 for(j = 0; j < states; j++)
+                   {
+                     l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
+                     v_N = VECTOR_BIT_OR(l_A, v_N);
+                   }
+                 
+                 v_N = VECTOR_AND_NOT(v_N, allOne);
+                 
+                 sum += vectorPopcount(v_N);           
+                 
+                 if(sum >= bestScore)         
+                   return sum;                 
+               }
+           }
+         }
+    }
+  
+  return sum;
+}
+
+
+#else
+static void newviewParsimonyIterativeFast(pllInstance *tr, partitionList * pr)
+{    /*
+      * Word-at-a-time variant of the parsimony vector update (it lives in the
+      * #else branch that follows the VECTOR_* based implementation).
+      *
+      * Walks the traversal descriptor tr->ti: ti[0] holds the number of used
+      * ints, entries start at index 4 and occupy 4 ints each.  Of every entry
+      * only three ints are read here: ti[index] is the node whose vector is
+      * written (p), ti[index + 1] and ti[index + 2] are the two nodes it is
+      * combined from (q and r).
+      *
+      * parsVect layout, as implied by the index arithmetic below: each node
+      * owns states * width words, stored as one row of width words per state.
+      *
+      * For every word and every state k the code forms the intersection
+      * (left & right) and the union (left | right).  Bits at which no state
+      * is shared (t_N) receive the union, all other bits the intersection,
+      * and every t_N bit adds one to the score.  Finally
+      * tr->parsimonyScore[p] = newly counted bits + score[q] + score[r].
+      */
+  int 
+    model,
+    *ti = tr->ti,
+    count = ti[0],
+    index; 
+
+  for(index = 4; index < count; index += 4)
+    {      
+      unsigned int
+        totalScore = 0;
+
+      size_t
+        pNumber = (size_t)ti[index],
+        qNumber = (size_t)ti[index + 1],
+        rNumber = (size_t)ti[index + 2];
+      
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          size_t
+            k,
+            states = pr->partitionData[model]->states,
+            width = pr->partitionData[model]->parsimonyLength;    
+            
+          unsigned int  
+            i;      
+                 
+          switch(states) /* 2, 4 and 20 states get fixed-size code paths; anything else (<= 32) uses the generic default */
+            {
+            case 2:       
+              {
+                parsimonyNumber
+                  *left[2],
+                  *right[2],
+                  *this[2];
+                
+                parsimonyNumber
+                   o_A,
+                   o_C,
+                   t_A,
+                   t_C, 
+                   t_N;
+                
+                for(k = 0; k < 2; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i++)
+                  {               
+                    t_A = left[0][i] & right[0][i];
+                    t_C = left[1][i] & right[1][i];                
+
+                    o_A = left[0][i] | right[0][i];
+                    o_C = left[1][i] | right[1][i];
+                  
+                    t_N = ~(t_A | t_C);   /* bits where the two inputs share no state */
+
+                    this[0][i] = t_A | (t_N & o_A);
+                    this[1][i] = t_C | (t_N & o_C);                
+                    
+                    totalScore += ((unsigned int) __builtin_popcount(t_N));
+                  }
+              }
+              break;
+            case 4:
+              {
+                parsimonyNumber
+                  *left[4],
+                  *right[4],
+                  *this[4];
+
+                for(k = 0; k < 4; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+                  }
+
+                parsimonyNumber
+                   o_A,
+                   o_C,
+                   o_G,
+                   o_T,
+                   t_A,
+                   t_C,
+                   t_G,
+                   t_T, 
+                   t_N;
+
+                for(i = 0; i < width; i++)
+                  {               
+                    t_A = left[0][i] & right[0][i];
+                    t_C = left[1][i] & right[1][i];
+                    t_G = left[2][i] & right[2][i];       
+                    t_T = left[3][i] & right[3][i];
+
+                    o_A = left[0][i] | right[0][i];
+                    o_C = left[1][i] | right[1][i];
+                    o_G = left[2][i] | right[2][i];       
+                    o_T = left[3][i] | right[3][i];
+
+                    t_N = ~(t_A | t_C | t_G | t_T);       
+
+                    this[0][i] = t_A | (t_N & o_A);
+                    this[1][i] = t_C | (t_N & o_C);
+                    this[2][i] = t_G | (t_N & o_G);
+                    this[3][i] = t_T | (t_N & o_T); 
+                    
+                    totalScore += ((unsigned int) __builtin_popcount(t_N));
+                  }
+              }
+              break;
+            case 20:
+              {
+                parsimonyNumber
+                  *left[20],
+                  *right[20],
+                  *this[20];
+
+                parsimonyNumber
+                  o_A[20],
+                  t_A[20],        
+                  t_N;
+
+                for(k = 0; k < 20; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i++)
+                  {               
+                    size_t k; /* shadows the outer k for the duration of this loop body */
+                    
+                    t_N = 0;
+
+                    for(k = 0; k < 20; k++)
+                      {
+                        t_A[k] = left[k][i] & right[k][i];
+                        o_A[k] = left[k][i] | right[k][i];
+                        t_N = t_N | t_A[k];
+                      }
+                    
+                    t_N = ~t_N;
+
+                    for(k = 0; k < 20; k++)                   
+                      this[k][i] = t_A[k] | (t_N & o_A[k]);                
+                    
+                    totalScore += ((unsigned int) __builtin_popcount(t_N));
+                  }
+              }
+              break;
+            default:
+              {         
+                parsimonyNumber
+                  *left[32],
+                  *right[32],
+                  *this[32];
+                
+                parsimonyNumber
+                  o_A[32],
+                  t_A[32],        
+                  t_N;
+                
+                assert(states <= 32); /* the local arrays above hold at most 32 states */
+                
+                for(k = 0; k < states; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * states * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+                  }
+                
+                for(i = 0; i < width; i++)
+                  {               
+                    t_N = 0;
+                    
+                    for(k = 0; k < states; k++)
+                      {
+                        t_A[k] = left[k][i] & right[k][i];
+                        o_A[k] = left[k][i] | right[k][i];
+                        t_N = t_N | t_A[k];
+                      }
+                    
+                    t_N = ~t_N;
+                    
+                    for(k = 0; k < states; k++)               
+                      this[k][i] = t_A[k] | (t_N & o_A[k]);                
+                    
+                    totalScore += ((unsigned int) __builtin_popcount(t_N));
+                  }
+              }                       
+            } 
+        }
+
+      tr->parsimonyScore[pNumber] = totalScore + tr->parsimonyScore[rNumber] + tr->parsimonyScore[qNumber];      
+    }
+}
+
+
+static unsigned int evaluateParsimonyIterativeFast(pllInstance *tr, partitionList * pr)
+{ /*
+   * Word-at-a-time scoring of the branch between the two nodes named in
+   * tr->ti[1] (p) and tr->ti[2] (q).
+   *
+   * If the traversal descriptor holds entries (ti[0] > 4) the affected
+   * vectors are refreshed first through newviewParsimonyIterativeFast().
+   * The score starts as parsimonyScore[p] + parsimonyScore[q]; for every
+   * partition and every word one is added per bit at which the two vectors
+   * share no state.
+   *
+   * Returns the accumulated score.  The scan stops early and returns the
+   * partial sum as soon as it reaches tr->bestParsimony, so a return value
+   * >= tr->bestParsimony is only a lower bound of the real score.
+   */
+  size_t 
+    pNumber = (size_t)tr->ti[1],
+    qNumber = (size_t)tr->ti[2];
+
+  int
+    model;
+
+  unsigned int 
+    bestScore = tr->bestParsimony,    
+    sum;
+
+  if(tr->ti[0] > 4)
+    newviewParsimonyIterativeFast(tr, pr); 
+
+  sum = tr->parsimonyScore[pNumber] + tr->parsimonyScore[qNumber];
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      size_t
+        k,
+        states = pr->partitionData[model]->states,
+        width  = pr->partitionData[model]->parsimonyLength, 
+        i;
+
+       switch(states)
+         {
+         case 2:
+           {
+             parsimonyNumber 
+               t_A,
+               t_C,           
+               t_N,
+               *left[2],
+               *right[2];
+             
+             for(k = 0; k < 2; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+               }     
+             
+             for(i = 0; i < width; i++)
+               {                                               
+                 t_A = left[0][i] & right[0][i];
+                 t_C = left[1][i] & right[1][i];
+                 
+                  t_N = ~(t_A | t_C);
+
+                  sum += ((unsigned int) __builtin_popcount(t_N));
+                 
+                 if(sum >= bestScore) /* cannot beat the current best any more: give up early */
+                   return sum;                         
+               }
+           }
+           break;
+         case 4:
+           {
+             parsimonyNumber
+               t_A,
+               t_C,
+               t_G,
+               t_T,
+               t_N,
+               *left[4],
+               *right[4];
+      
+             for(k = 0; k < 4; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+               }        
+
+             for(i = 0; i < width; i++)
+               {                                                
+                  t_A = left[0][i] & right[0][i];
+                  t_C = left[1][i] & right[1][i];
+                  t_G = left[2][i] & right[2][i];         
+                  t_T = left[3][i] & right[3][i];
+
+                  t_N = ~(t_A | t_C | t_G | t_T);
+
+                  sum += ((unsigned int) __builtin_popcount(t_N));
+                 
+                 if(sum >= bestScore)            
+                   return sum;          
+               }                 
+           }
+           break;
+         case 20:
+           {
+             parsimonyNumber
+               t_A,
+               t_N,
+               *left[20],
+               *right[20];
+             
+              for(k = 0; k < 20; k++)
+                {
+                  left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                  right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                }  
+           
+              for(i = 0; i < width; i++)
+                { 
+                  t_N = 0;
+                  
+                  for(k = 0; k < 20; k++)
+                    {
+                      t_A = left[k][i] & right[k][i];
+                      t_N = t_N | t_A;
+                    }
+               
+                  t_N = ~t_N;
+
+                  sum += ((unsigned int) __builtin_popcount(t_N));
+                  
+                  if(sum >= bestScore)      
+                    return sum;                        
+                }
+           }
+           break;
+         default:
+           {
+             parsimonyNumber
+               t_A,
+               t_N,
+               *left[32], 
+               *right[32];  
+
+             assert(states <= 32); /* the local pointer arrays hold at most 32 states */
+
+             for(k = 0; k < states; k++)
+               {
+                 left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                 right[k] = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+               }  
+           
+             for(i = 0; i < width; i++)
+               {                               
+                 t_N = 0;
+                  
+                 for(k = 0; k < states; k++)
+                   {
+                     t_A = left[k][i] & right[k][i];
+                     t_N = t_N | t_A;
+                   }
+               
+                  t_N = ~t_N;
+
+                  sum += ((unsigned int) __builtin_popcount(t_N));
+                                                 
+                 if(sum >= bestScore)                     
+                   return sum;                     
+               }                     
+           }
+         }
+    }
+  
+  return sum;
+}
+
+#endif
+
+
+
+
+
+
+static unsigned int evaluateParsimony(pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean full)
+{ /*
+   * Scores the branch between p and p->back.
+   *
+   * Stores both node numbers in tr->ti[1] / tr->ti[2], lets
+   * computeTraversalInfoParsimony() append traversal entries (starting at
+   * slot 4) for each end that is an inner node (number > tr->mxtips), and
+   * hands over to evaluateParsimonyIterativeFast().
+   *
+   * full: when set, the traversal is requested for both inner ends
+   *       unconditionally; otherwise only for an end whose xPars flag is
+   *       not set.
+   *
+   * Returns whatever evaluateParsimonyIterativeFast() returns.
+   */
+  volatile unsigned int result;
+  nodeptr q = p->back;
+  int
+    *ti = tr->ti,
+    counter = 4;
+  
+  ti[1] = p->number;
+  ti[2] = q->number;
+
+  if(full)
+    {
+      if(p->number > tr->mxtips)
+        computeTraversalInfoParsimony(p, ti, &counter, tr->mxtips, full);
+      if(q->number > tr->mxtips)
+        computeTraversalInfoParsimony(q, ti, &counter, tr->mxtips, full); 
+    }
+  else
+    {
+      if(p->number > tr->mxtips && !p->xPars)
+        computeTraversalInfoParsimony(p, ti, &counter, tr->mxtips, full);
+      if(q->number > tr->mxtips && !q->xPars)
+        computeTraversalInfoParsimony(q, ti, &counter, tr->mxtips, full); 
+    }
+
+  ti[0] = counter; /* ti[0] tells the iterative routines how many ints are in use */
+
+  result = evaluateParsimonyIterativeFast(tr, pr);
+
+  return result;
+}
+
+
+static void newviewParsimony(pllInstance *tr, partitionList *pr, nodeptr  p)
+{     /*
+       * Recomputes the parsimony vector seen from inner node p.
+       * Does nothing when p is a tip (p->number <= tr->mxtips).  Otherwise a
+       * traversal descriptor is built in tr->ti (entries start at slot 4,
+       * ti[0] receives the final counter) and executed by
+       * newviewParsimonyIterativeFast().
+       */
+  if(p->number <= tr->mxtips)
+    return;
+
+  {
+    int 
+      counter = 4;     
+           
+    computeTraversalInfoParsimony(p, tr->ti, &counter, tr->mxtips, PLL_FALSE);              
+    tr->ti[0] = counter;            
+    
+    newviewParsimonyIterativeFast(tr, pr);
+  }
+}
+
+
+
+
+
+/****************************************************************************************************************************************/
+
+static void insertParsimony (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{ /*
+   * Inserts node p into the branch between q and q->back: p->next is hooked
+   * to q, p->next->next to the former q->back, and the parsimony vector at
+   * p is refreshed with newviewParsimony().
+   */
+  nodeptr  r;
+  
+  r = q->back;
+  
+  hookupDefault(p->next,       q);
+  hookupDefault(p->next->next, r);
+   
+  newviewParsimony(tr, pr, p);
+} 
+
+
+
+static nodeptr buildNewTip (pllInstance *tr, nodeptr p)
+{ /*
+   * Takes the next unused node q from tr->nodep (advancing tr->nextnode),
+   * hooks it to p and clears the back pointers of q's other two ring
+   * members.  Returns q.
+   */
+  nodeptr  q;
+
+  q = tr->nodep[(tr->nextnode)++];
+  hookupDefault(p, q);
+  q->next->back = (nodeptr)NULL;
+  q->next->next->back = (nodeptr)NULL;
+ 
+  return  q;
+} 
+
+static void buildSimpleTree (pllInstance *tr, partitionList *pr, int ip, int iq, int ir)
+{    /*
+      * Builds the initial three-tip tree from the nodes numbered ip, iq, ir.
+      * tr->start becomes the node with the smallest of the three numbers and
+      * tr->ntips is set to 3.  Tips ip and iq are hooked together, tip ir is
+      * attached to a fresh inner node (buildNewTip), and that inner node is
+      * inserted into the ip--iq branch.
+      */
+  nodeptr  p, s;
+  int  i;
+  
+  i = PLL_MIN(ip, iq);
+  if (ir < i)  i = ir; 
+  tr->start = tr->nodep[i];
+  tr->ntips = 3;
+  p = tr->nodep[ip];
+  hookupDefault(p, tr->nodep[iq]);
+  s = buildNewTip(tr, tr->nodep[ir]);
+  insertParsimony(tr, pr, s, p);
+}
+
+
+static void testInsertParsimony (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, pllBoolean saveBranches)
+{ /*
+   * Trial insertion: puts subtree p into the branch (q, q->back), scores
+   * the result and undoes the insertion again.  When the score is lower
+   * than tr->bestParsimony, the score and the pair (q, p) are remembered in
+   * tr->bestParsimony, tr->insertNode and tr->removeNode.
+   *
+   * saveBranches: when set, q->z[] is copied before the trial and handed
+   *               back to hookup() when q and r are re-linked; otherwise
+   *               hookupDefault() is used.
+   *
+   * With tr->grouped set the trial only runs if p's group cannot be
+   * resolved or matches the group of q or r.
+   */
+  unsigned int 
+    mp;
+ 
+  nodeptr  
+    r = q->back;   
+
+  pllBoolean
+    doIt = PLL_TRUE;
+
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  if(tr->grouped)
+    {
+      int 
+        rNumber = tr->constraintVector[r->number],
+        qNumber = tr->constraintVector[q->number],
+        pNumber = tr->constraintVector[p->number];
+
+      doIt = PLL_FALSE;
+     
+      /* NOTE(review): -9 looks like the "no group assigned" marker that checkerPars() tries to resolve -- confirm */
+      if(pNumber == -9)
+        pNumber = checkerPars(tr, p->back);
+      if(pNumber == -9)
+        doIt = PLL_TRUE;
+      else
+        {
+          if(qNumber == -9)
+            qNumber = checkerPars(tr, q);
+
+          if(rNumber == -9)
+            rNumber = checkerPars(tr, r);
+
+          if(pNumber == rNumber || pNumber == qNumber)
+            doIt = PLL_TRUE;       
+        }
+    }
+
+  if(doIt)
+    {
+      double 
+        z[PLL_NUM_BRANCHES]; /* assumes numBranches <= PLL_NUM_BRANCHES -- TODO confirm */
+      
+      if(saveBranches)
+        {
+          int i;
+          
+          for(i = 0; i < numBranches; i++)
+            z[i] = q->z[i];
+        }
+
+      insertParsimony(tr, pr, p, q);
+  
+      mp = evaluateParsimony(tr, pr, p->next->next, PLL_FALSE);
+
+      if(mp < tr->bestParsimony)
+        {
+          tr->bestParsimony = mp;
+          tr->insertNode = q;
+          tr->removeNode = p;
+        }
+      
+      /* undo the trial: re-link q and r directly and detach p again */
+      if(saveBranches)
+        hookup(q, r, z, numBranches);
+      else
+        hookupDefault(q, r);
+      
+      p->next->next->back = p->next->back = (nodeptr) NULL;
+    }
+       
+  return;
+} 
+
+
+static void restoreTreeParsimony(pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{ /*
+   * Permanently inserts p into the branch (q, q->back) and refreshes the
+   * parsimony vectors along the traversal computed from p.  Unlike
+   * newviewParsimony() there is no tip check here: the traversal is always
+   * built and executed.
+   */
+  nodeptr
+    r = q->back;
+  
+  int counter = 4;
+  
+  hookupDefault(p->next,       q);
+  hookupDefault(p->next->next, r);
+  
+  computeTraversalInfoParsimony(p, tr->ti, &counter, tr->mxtips, PLL_FALSE);              
+  tr->ti[0] = counter;
+    
+  newviewParsimonyIterativeFast(tr, pr);
+}
+
+
+static void addTraverseParsimony (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav, pllBoolean doAll, pllBoolean saveBranches)
+{        /*
+          * Recursively tries p at branch q and at the branches below q.
+          * mintrav and maxtrav are decremented on the way down: a trial
+          * insertion happens once mintrav has dropped to 0 or below, and the
+          * descent into q's two subtrees continues only while q is an inner
+          * node and maxtrav is still positive after the decrement.  doAll
+          * overrides both distance limits.
+          */
+  if (doAll || (--mintrav <= 0))               
+    testInsertParsimony(tr, pr, p, q, saveBranches);
+
+  if (((q->number > tr->mxtips)) && ((--maxtrav > 0) || doAll))
+    {         
+      addTraverseParsimony(tr, pr, p, q->next->back, mintrav, maxtrav, doAll, saveBranches);
+      addTraverseParsimony(tr, pr, p, q->next->next->back, mintrav, maxtrav, doAll, saveBranches);
+    }
+}
+
+
+
+
+
+static void makePermutationFast(int *perm, int n, pllInstance *tr)
+{    /*
+      * Fills perm[1..n] with the values 1..n and shuffles them in place;
+      * perm[0] is never touched, so perm must provide n + 1 slots.
+      * Position i is swapped with position i + k, where k is drawn from
+      * randum() seeded by tr->randomNumberSeed.
+      */
+  int  
+    i, 
+    j, 
+    k;
+
+  for (i = 1; i <= n; i++)    
+    perm[i] = i;               
+
+  for (i = 1; i <= n; i++) 
+    {      
+      double d =  randum(&tr->randomNumberSeed);
+
+      k =  (int)((double)(n + 1 - i) * d); /* assumes randum() yields a value in [0, 1) so that i + k <= n -- TODO confirm */
+      
+      j        = perm[i];
+
+      perm[i]     = perm[i + k];
+      perm[i + k] = j; 
+    }
+}
+
+//static nodeptr  removeNodeParsimony (nodeptr p, tree *tr)
+static nodeptr  removeNodeParsimony (nodeptr p)
+{ /*
+   * Detaches inner node p from the tree: its two neighbours reached through
+   * p->next and p->next->next are hooked to each other and both of those
+   * back pointers of p are cleared.  p->back is left alone.
+   * Returns the neighbour that was attached to p->next.
+   */
+  nodeptr  q, r;         
+
+  q = p->next->back;
+  r = p->next->next->back;   
+    
+  hookupDefault(q, r);
+
+  p->next->next->back = p->next->back = (node *) NULL;
+  
+  return  q;
+}
+
+static int rearrangeParsimony(pllInstance *tr, partitionList *pr, nodeptr p, int mintrav, int maxtrav, pllBoolean doAll)
+{   /*
+     * Tries subtree re-insertions around the branch (p, p->back).
+     *
+     * First the node p is pruned and tested (addTraverseParsimony) in the
+     * branches below its former neighbours p1 and p2, then the same is
+     * done for q = p->back with a minimum distance of 2.  After each pass
+     * the pruned node is hooked back to its original neighbours and its
+     * vector is recomputed, so the topology is unchanged on return; the
+     * best move found is left in tr->bestParsimony / tr->insertNode /
+     * tr->removeNode by testInsertParsimony().
+     *
+     * maxtrav is clamped to tr->ntips - 3; mintrav must be 1 (asserted).
+     *
+     * Returns 0 when nothing was attempted because maxtrav < mintrav or
+     * because both constraint checks failed; returns 1 otherwise.
+     */
+  nodeptr  
+    p1, 
+    p2, 
+    q, 
+    q1, 
+    q2;
+  
+  int      
+    mintrav2; 
+
+  pllBoolean
+    doP = PLL_TRUE,
+    doQ = PLL_TRUE;
+           
+  if (maxtrav > tr->ntips - 3)  
+    maxtrav = tr->ntips - 3; 
+
+  assert(mintrav == 1);
+
+  if(maxtrav < mintrav)
+    return 0;
+
+  q = p->back;
+
+  if(tr->constrained)
+    {    
+      if(! tipHomogeneityCheckerPars(tr, p->back, 0))
+        doP = PLL_FALSE;
+        
+      if(! tipHomogeneityCheckerPars(tr, q->back, 0))
+        doQ = PLL_FALSE;
+                        
+      if(doQ == PLL_FALSE && doP == PLL_FALSE)
+        return 0;
+    }  
+
+  if((p->number > tr->mxtips) && doP) 
+    {     
+      p1 = p->next->back;
+      p2 = p->next->next->back;
+      
+      if ((p1->number > tr->mxtips) || (p2->number > tr->mxtips)) /* at least one neighbour must be an inner node to have somewhere to move to */
+        {                 
+          //removeNodeParsimony(p, tr);          
+          removeNodeParsimony(p);                
+
+          if ((p1->number > tr->mxtips)) 
+            {
+              addTraverseParsimony(tr, pr, p, p1->next->back, mintrav, maxtrav, doAll, PLL_FALSE);
+              addTraverseParsimony(tr, pr, p, p1->next->next->back, mintrav, maxtrav, doAll, PLL_FALSE);
+            }
+         
+          if ((p2->number > tr->mxtips)) 
+            {
+              addTraverseParsimony(tr, pr, p, p2->next->back, mintrav, maxtrav, doAll, PLL_FALSE);
+              addTraverseParsimony(tr, pr, p, p2->next->next->back, mintrav, maxtrav, doAll, PLL_FALSE);
+            }
+            
+           
+          hookupDefault(p->next,       p1);
+          hookupDefault(p->next->next, p2);
+
+          newviewParsimony(tr, pr, p);
+        }
+    }  
+       
+  if ((q->number > tr->mxtips) && (maxtrav > 0) && doQ) 
+    {
+      q1 = q->next->back;
+      q2 = q->next->next->back;
+
+      if (
+          (
+           (q1->number > tr->mxtips) && 
+           ((q1->next->back->number > tr->mxtips) || (q1->next->next->back->number > tr->mxtips))
+           )
+          ||
+          (
+           (q2->number > tr->mxtips) && 
+           ((q2->next->back->number > tr->mxtips) || (q2->next->next->back->number > tr->mxtips))
+           )
+          )
+        {          
+
+          //removeNodeParsimony(q, tr);
+          removeNodeParsimony(q);
+          
+          mintrav2 = mintrav > 2 ? mintrav : 2; /* on the q side insertions start at distance 2 */
+          
+          if ((q1->number > tr->mxtips)) 
+            {
+              addTraverseParsimony(tr, pr, q, q1->next->back, mintrav2 , maxtrav, doAll, PLL_FALSE);
+              addTraverseParsimony(tr, pr, q, q1->next->next->back, mintrav2 , maxtrav, doAll, PLL_FALSE);
+            }
+         
+          if ((q2->number > tr->mxtips)) 
+            {
+              addTraverseParsimony(tr, pr, q, q2->next->back, mintrav2 , maxtrav, doAll, PLL_FALSE);
+              addTraverseParsimony(tr, pr, q, q2->next->next->back, mintrav2 , maxtrav, doAll, PLL_FALSE);
+            }      
+           
+          hookupDefault(q->next,       q1);
+          hookupDefault(q->next->next, q2);
+           
+          newviewParsimony(tr, pr, q);
+        }
+    }
+
+  return 1;
+} 
+
+
+static void restoreTreeRearrangeParsimony(pllInstance *tr, partitionList *pr)
+{    /*
+      * Applies the move recorded in tr->removeNode / tr->insertNode: the
+      * node is pruned from its current position and re-inserted at the
+      * recorded branch, with the parsimony vectors refreshed.
+      */
+  removeNodeParsimony(tr->removeNode);  
+  //removeNodeParsimony(tr->removeNode, tr);  
+  restoreTreeParsimony(tr, pr, tr->removeNode, tr->insertNode);
+}
+
+/*
+static pllBoolean isInformative2(pllInstance *tr, int site)
+{
+  int
+    informativeCounter = 0,
+    check[256],   
+    j,   
+    undetermined = 15;
+
+  unsigned char
+    nucleotide,
+    target = 0;
+        
+  for(j = 0; j < 256; j++)
+    check[j] = 0;
+  
+  for(j = 1; j <= tr->mxtips; j++)
+    {      
+      nucleotide = tr->yVector[j][site];            
+      check[nucleotide] =  check[nucleotide] + 1;                  
+    }
+  
+  
+  if(check[1] > 1)
+    {
+      informativeCounter++;    
+      target = target | 1;
+    }
+  if(check[2] > 1)
+    {
+      informativeCounter++; 
+      target = target | 2;
+    }
+  if(check[4] > 1)
+    {
+      informativeCounter++; 
+      target = target | 4;
+    }
+  if(check[8] > 1)
+    {
+      informativeCounter++; 
+      target = target | 8;
+    }
+          
+  if(informativeCounter >= 2)
+    return PLL_TRUE;    
+  else
+    {        
+      for(j = 0; j < undetermined; j++)
+        {
+          if(j == 3 || j == 5 || j == 6 || j == 7 || j == 9 || j == 10 || j == 11 || 
+             j == 12 || j == 13 || j == 14)
+            {
+              if(check[j] > 1)
+                {
+                  if(!(target & j))
+                    return PLL_TRUE;
+                }
+            }
+        } 
+    }
+     
+  return PLL_FALSE;          
+}
+*/
+
+static pllBoolean isInformative(pllInstance *tr, int dataType, int site)
+{ /*
+   * Decides whether alignment column `site` is kept for parsimony.
+   *
+   * The character codes of tips 1..tr->mxtips at this site are counted in
+   * check[].  Only codes below getUndetermined(dataType) are considered
+   * afterwards.  The site is reported as informative (PLL_TRUE) when at
+   * least two different such codes occur and at least one of them occurs
+   * more than once; otherwise PLL_FALSE is returned.
+   */
+  int
+    informativeCounter = 0,
+    check[256],   
+    j,   
+    undetermined = getUndetermined(dataType);
+
+  const unsigned int
+    *bitVector = getBitVector(dataType);
+
+  unsigned char
+    nucleotide;
+  
+        
+  for(j = 0; j < 256; j++)
+    check[j] = 0;
+  
+  for(j = 1; j <= tr->mxtips; j++)
+    {      
+      nucleotide = tr->yVector[j][site];            
+      check[nucleotide] =  check[nucleotide] + 1;
+      assert(bitVector[nucleotide] > 0); /* every code that occurs must map to a non-empty state set */
+    }
+  
+  for(j = 0; j < undetermined; j++)
+    {
+      if(check[j] > 0)
+        informativeCounter++;    
+    } 
+          
+  if(informativeCounter <= 1)
+    return PLL_FALSE;    
+  else
+    {        
+      for(j = 0; j < undetermined; j++)
+        {
+          if(check[j] > 1)
+            return PLL_TRUE;
+        } 
+    }
+     
+  return PLL_FALSE;          
+}
+
+
+static void determineUninformativeSites(pllInstance *tr, partitionList *pr, int *informative)
+{ /*
+   * Fills informative[i] with 1 or 0 for every site i in the range
+   * [lower, upper) of every partition, according to isInformative().
+   * `number` counts the sites marked 0; it is only consumed by the
+   * commented-out printf at the end.
+   */
+  int 
+    model,
+    number = 0,
+    i;
+
+  /* 
+     Not all characters are useful in constructing a parsimony tree. 
+     Invariant characters, those that have the same state in all taxa, 
+     are obviously useless and are ignored by the method. Characters in 
+     which a state occurs in only one taxon are also ignored. 
+     All these characters are called parsimony uninformative.
+
+     Alternative definition: informative columns contain at least two types
+     of nucleotides, and each nucleotide must appear at least twice in each 
+     column. Kind of a pain if we intend to check for this when using, e.g.,
+     ambiguous DNA encoding.
+  */
+
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      for(i = pr->partitionData[model]->lower; i < pr->partitionData[model]->upper; i++)
+        {
+           if(isInformative(tr, pr->partitionData[model]->dataType, i))
+             informative[i] = 1;
+           else
+             {
+               informative[i] = 0;
+               number++;
+             }  
+        }      
+    }
+
+ 
+  /* printf("Uninformative Patterns: %d\n", number); */
+}
+
+
+static void reorderNodes(pllInstance *tr, nodeptr *np, nodeptr p, int *count)
+{ /*
+   * Depth-first renumbering helper for nodeRectifierPars().
+   *
+   * For an inner node p the snapshot np[] is searched for the node ring
+   * (np[i], ->next, ->next->next) that contains p.  The ring member equal
+   * to p is stored in tr->nodep[*count + tr->mxtips + 1] and *count is
+   * incremented, so inner nodes end up in tr->nodep in visiting order with
+   * the visited ring member as representative.  The walk then continues
+   * into the two subtrees below p.  Tips end the recursion.
+   */
+  int i, found = 0;
+
+  if((p->number <= tr->mxtips))    
+    return;
+  else
+    {              
+      for(i = tr->mxtips + 1; (i <= (tr->mxtips + tr->mxtips - 1)) && (found == 0); i++)
+        {
+          if (p == np[i] || p == np[i]->next || p == np[i]->next->next)
+            {
+              if(p == np[i])                           
+                tr->nodep[*count + tr->mxtips + 1] = np[i];                             
+              else
+                {
+                  if(p == np[i]->next)            
+                    tr->nodep[*count + tr->mxtips + 1] = np[i]->next;                      
+                  else             
+                    tr->nodep[*count + tr->mxtips + 1] = np[i]->next->next;                                 
+                }
+
+              found = 1;                     
+              *count = *count + 1;
+            }
+        }            
+     
+      assert(found != 0); /* every inner node reached must be part of the snapshot */
+
+      reorderNodes(tr, np, p->next->back, count);     
+      reorderNodes(tr, np, p->next->next->back, count);                
+    }
+}
+
+
+
+static void nodeRectifierPars(pllInstance *tr)
+{ /*
+   * Re-orders the inner-node slots of tr->nodep to follow a depth-first
+   * walk that starts at tip 1.  A temporary copy of the slots
+   * tr->mxtips + 1 .. 2 * tr->mxtips - 1 is taken first because
+   * reorderNodes() overwrites tr->nodep while it searches the copy.
+   * Also sets tr->start to tr->nodep[1] and tr->rooted to PLL_FALSE.
+   */
+  nodeptr *np = (nodeptr *)rax_malloc(2 * tr->mxtips * sizeof(nodeptr)); /* NOTE(review): result is used without a NULL check */
+  int i;
+  int count = 0;
+  
+  tr->start       = tr->nodep[1];
+  tr->rooted      = PLL_FALSE;
+
+  /* TODO why is tr->rooted set to PLL_FALSE here ?*/
+  
+  for(i = tr->mxtips + 1; i <= (tr->mxtips + tr->mxtips - 1); i++)
+    np[i] = tr->nodep[i];           
+  
+  reorderNodes(tr, np, tr->start->back, &count); 
+
+ 
+  rax_free(np);
+}
+
+
+  
+static void compressDNA(pllInstance *tr, partitionList *pr, int *informative)
+{ /*
+   * Builds the bit-packed parsimony vectors for all tips.
+   *
+   * For every partition:
+   *  - the informative sites in [lower, upper) are counted, each weighted
+   *    by tr->aliaswgt[], and the number of words needed is derived
+   *    (PLL_PCF site bits per word, rounded up; under __SSE3 / __AVX the
+   *    count is additionally rounded up to a multiple of INTS_PER_VECTOR);
+   *  - parsVect is allocated for 2 * tr->mxtips nodes and zeroed;
+   *  - for every tip, one bit row per state is filled: bit position
+   *    compressedCounter of row k is set when state k is part of the tip's
+   *    state set at that site; a site is repeated aliaswgt times;
+   *  - unused bit positions at the end are set in all state rows;
+   *  - parsimonyLength receives the (padded) number of words.
+   *
+   * Finally tr->parsimonyScore is allocated for 2 * tr->mxtips nodes and
+   * zeroed.
+   *
+   * informative: per-site 0/1 flags as written by
+   *              determineUninformativeSites().
+   */
+  size_t
+    totalNodes,
+    i,
+    model;
+   
+  totalNodes = 2 * (size_t)tr->mxtips;
+
+ 
+
+  for(model = 0; model < (size_t) pr->numberOfPartitions; model++)
+    {
+      size_t
+        k,
+        states = (size_t)pr->partitionData[model]->states,
+        compressedEntries,
+        compressedEntriesPadded,
+        entries = 0, 
+        lower = pr->partitionData[model]->lower,
+        upper = pr->partitionData[model]->upper;
+
+      parsimonyNumber 
+        **compressedTips = (parsimonyNumber **)rax_malloc(states * sizeof(parsimonyNumber*)),
+        *compressedValues = (parsimonyNumber *)rax_malloc(states * sizeof(parsimonyNumber));
+      
+      for(i = lower; i < upper; i++)    
+        if(informative[i])
+          entries += (size_t)tr->aliaswgt[i];     
+  
+      compressedEntries = entries / PLL_PCF;
+
+      if(entries % PLL_PCF != 0)
+        compressedEntries++;
+
+#if (defined(__SSE3) || defined(__AVX))
+      if(compressedEntries % INTS_PER_VECTOR != 0)
+        compressedEntriesPadded = compressedEntries + (INTS_PER_VECTOR - (compressedEntries % INTS_PER_VECTOR));
+      else
+        compressedEntriesPadded = compressedEntries;
+#else
+      compressedEntriesPadded = compressedEntries;
+#endif     
+
+      
+      rax_posix_memalign ((void **) &(pr->partitionData[model]->parsVect), PLL_BYTE_ALIGNMENT, (size_t)compressedEntriesPadded * states * totalNodes * sizeof(parsimonyNumber));
+     
+      for(i = 0; i < compressedEntriesPadded * states * totalNodes; i++)      
+        pr->partitionData[model]->parsVect[i] = 0;
+
+      for(i = 0; i < (size_t)tr->mxtips; i++)
+        {
+          size_t
+            w = 0,
+            compressedIndex = 0,
+            compressedCounter = 0,
+            index = 0;
+
+          for(k = 0; k < states; k++)
+            {
+              compressedTips[k] = &(pr->partitionData[model]->parsVect[(compressedEntriesPadded * states * (i + 1)) + (compressedEntriesPadded * k)]); /* tip i is stored as node number i + 1 */
+              compressedValues[k] = 0;
+            }                
+              
+          for(index = lower; index < (size_t)upper; index++)
+            {
+              if(informative[index])
+                {
+                  const unsigned int 
+                    *bitValue = getBitVector(pr->partitionData[model]->dataType);
+
+                  parsimonyNumber 
+                    value = bitValue[tr->yVector[i + 1][index]];          
+              
+                  for(w = 0; w < (size_t)tr->aliaswgt[index]; w++)
+                    {      
+                      for(k = 0; k < states; k++)
+                        {
+                          if(value & mask32[k])
+                            compressedValues[k] |= mask32[compressedCounter];
+                        }
+                     
+                      compressedCounter++;
+                  
+                      if(compressedCounter == PLL_PCF) /* word is full: flush it to every state row */
+                        {
+                          for(k = 0; k < states; k++)
+                            {
+                              compressedTips[k][compressedIndex] = compressedValues[k];
+                              compressedValues[k] = 0;
+                            }                    
+                          
+                          compressedCounter = 0;
+                          compressedIndex++;
+                        }
+                    }
+                }
+            }
+                           
+          /* fill the remaining bit positions with "all states set", so that padding always intersects and never adds to a score */
+          for(;compressedIndex < compressedEntriesPadded; compressedIndex++)
+            {   
+              for(;compressedCounter < PLL_PCF; compressedCounter++)              
+                for(k = 0; k < states; k++)
+                  compressedValues[k] |= mask32[compressedCounter];               
+          
+              for(k = 0; k < states; k++)
+                {
+                  compressedTips[k][compressedIndex] = compressedValues[k];
+                  compressedValues[k] = 0;
+                }                     
+              
+              compressedCounter = 0;
+            }           
+        }               
+  
+      pr->partitionData[model]->parsimonyLength = compressedEntriesPadded;
+
+      rax_free(compressedTips);
+      rax_free(compressedValues);
+    }
+  
+  rax_posix_memalign ((void **) &(tr->parsimonyScore), PLL_BYTE_ALIGNMENT, sizeof(unsigned int) * totalNodes);  
+          
+  for(i = 0; i < totalNodes; i++) 
+    tr->parsimonyScore[i] = 0;
+}
+
+
+
+static void stepwiseAddition(pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{            /*
+              * Recursively evaluates the insertion of p into the branch
+              * (q, q->back) and into the branches below q.
+              *
+              * The four back pointers are linked by hand for the trial, the
+              * branch (p, p->back) is scored, and tr->bestParsimony /
+              * tr->insertNode are updated when the score improves.
+              * Afterwards only q and r are linked to each other again;
+              * p->next->back and p->next->next->back are not reset here.
+              * The descent continues into both subtrees of q when q is an
+              * inner node with a non-zero tr->parsimonyScore entry.
+              */
+  nodeptr 
+    r = q->back;
+
+  unsigned int 
+    mp;
+  
+  int 
+    counter = 4;
+  
+  p->next->back = q;
+  q->back = p->next;
+
+  p->next->next->back = r;
+  r->back = p->next->next;
+   
+  computeTraversalInfoParsimony(p, tr->ti, &counter, tr->mxtips, PLL_FALSE);              
+  tr->ti[0] = counter;
+  tr->ti[1] = p->number;
+  tr->ti[2] = p->back->number;
+    
+  mp = evaluateParsimonyIterativeFast(tr, pr);
+  
+  if(mp < tr->bestParsimony)
+    {    
+      tr->bestParsimony = mp;
+      tr->insertNode = q;     
+    }
+ 
+  q->back = r;
+  r->back = q;
+   
+  if(q->number > tr->mxtips && tr->parsimonyScore[q->number] > 0)
+    {         
+      stepwiseAddition(tr, pr, p, q->next->back);
+      stepwiseAddition(tr, pr, p, q->next->next->back);
+    }
+}
+
+
+
+void allocateParsimonyDataStructures(pllInstance *tr, partitionList *pr)
+{ /*
+   * Sets up everything the fast parsimony code needs:
+   *  - flags the informative sites and packs the tip data (compressDNA,
+   *    which also allocates parsVect per partition and tr->parsimonyScore);
+   *  - initialises the xPars flags of every inner node ring: 1 on the
+   *    tr->nodep[] representative, 0 on the other two members;
+   *  - allocates the traversal buffer tr->ti with 4 * tr->mxtips ints.
+   * The allocations are released by pllFreeParsimonyDataStructures().
+   */
+  int 
+    i,
+    *informative = (int *)rax_malloc(sizeof(int) * (size_t)tr->originalCrunchedLength);
+ 
+  determineUninformativeSites(tr, pr, informative);
+
+  compressDNA(tr, pr, informative);
+
+  for(i = tr->mxtips + 1; i <= tr->mxtips + tr->mxtips - 1; i++)
+    {
+      nodeptr 
+        p = tr->nodep[i];
+
+      p->xPars = 1;
+      p->next->xPars = 0;
+      p->next->next->xPars = 0;
+    }
+
+  tr->ti = (int*)rax_malloc(sizeof(int) * 4 * (size_t)tr->mxtips);  
+
+  rax_free(informative); 
+}
+
+void pllFreeParsimonyDataStructures(pllInstance *tr, partitionList *pr)
+{ /*
+   * Releases tr->parsimonyScore, the parsVect buffer of every partition
+   * and the traversal buffer tr->ti.  The pointers are not reset.
+   */
+  size_t 
+    model;
+
+  rax_free(tr->parsimonyScore);
+  
+  for(model = 0; model < (size_t) pr->numberOfPartitions; ++model)
+    rax_free(pr->partitionData[model]->parsVect);
+  
+  rax_free(tr->ti);
+}
+
+
+void pllMakeParsimonyTreeFast(pllInstance *tr, partitionList *pr, int sprDist)
+{   /*
+     * Builds a parsimony tree by randomized stepwise addition followed by
+     * subtree rearrangements.
+     *
+     *  1. The tips are put into random order (makePermutationFast).
+     *  2. A three-tip tree is built from the first three tips.
+     *  3. Every further tip is attached to a fresh inner node,
+     *     stepwiseAddition() searches the best branch (left in
+     *     tr->insertNode), and the node is inserted there for good.
+     *  4. rearrangeParsimony() is run over the nodes 1 .. 2 * mxtips - 2
+     *     with maximum distance sprDist; every improving move is applied
+     *     immediately.  Rounds are repeated until one of them brings no
+     *     improvement.
+     *
+     * Constrained trees are not supported (asserted).
+     * NOTE(review): tr->ti, tr->parsimonyScore and parsVect are used but not
+     * allocated here, so allocateParsimonyDataStructures() presumably has
+     * to be called beforehand -- confirm.
+     */
+  nodeptr  
+    p, 
+    f;    
+
+  int 
+    i, 
+    nextsp,
+    *perm        = (int *)rax_malloc((size_t)(tr->mxtips + 1) * sizeof(int));  
+
+  unsigned int 
+    randomMP, 
+    startMP;         
+  
+  assert(!tr->constrained);
+
+  makePermutationFast(perm, tr->mxtips, tr);
+  
+  tr->ntips = 0;    
+  
+  tr->nextnode = tr->mxtips + 1;       
+  
+  buildSimpleTree(tr, pr, perm[1], perm[2], perm[3]);
+  
+  f = tr->start;       
+  
+  while(tr->ntips < tr->mxtips) 
+    {   
+      nodeptr q;
+      
+      tr->bestParsimony = INT_MAX; /* reset so that the first trial of this tip is always accepted */
+      nextsp = ++(tr->ntips);             
+      p = tr->nodep[perm[nextsp]];                 
+      q = tr->nodep[(tr->nextnode)++];
+      p->back = q;
+      q->back = p;
+        
+      if(tr->grouped)
+        {
+          int 
+            number = p->back->number;            
+
+          tr->constraintVector[number] = -9;
+        }
+          
+      stepwiseAddition(tr, pr, q, f->back);
+      
+      {
+        nodeptr   
+          r = tr->insertNode->back;
+        
+        int counter = 4;
+        
+        hookupDefault(q->next,       tr->insertNode);
+        hookupDefault(q->next->next, r);
+        
+        computeTraversalInfoParsimony(q, tr->ti, &counter, tr->mxtips, PLL_FALSE);              
+        tr->ti[0] = counter;
+        
+        newviewParsimonyIterativeFast(tr, pr);
+      }
+    }    
+  
+  nodeRectifierPars(tr);
+  
+  randomMP = tr->bestParsimony;        
+  
+  do
+    {
+      startMP = randomMP;
+      nodeRectifierPars(tr);
+      for(i = 1; i <= tr->mxtips + tr->mxtips - 2; i++)
+        {
+          rearrangeParsimony(tr, pr, tr->nodep[i], 1, sprDist, PLL_FALSE);
+          if(tr->bestParsimony < randomMP)
+            {           
+              restoreTreeRearrangeParsimony(tr, pr);
+              randomMP = tr->bestParsimony;
+            }
+        }                          
+    }
+  while(randomMP < startMP);
+  
+  rax_free(perm);
+} 
diff --git a/pllrepo/src/genericParallelization.c b/pllrepo/src/genericParallelization.c
new file mode 100644
index 0000000..1454b5e
--- /dev/null
+++ b/pllrepo/src/genericParallelization.c
@@ -0,0 +1,2283 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file genericParallelization.c
+ */
+#include "mem_alloc.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <limits.h>
+
+#ifdef MEASURE_TIME_PARALLEL
+#include <time.h>
+#endif
+
+#include <assert.h>
+
+#include "genericParallelization.h"
+#include "pllInternal.h"
+#include "pll.h"
+
+/** @file genericParallelization.c
+    
+    @brief Generic master-worker parallelization with either pthreads or MPI. 
+    
+    Worker threads/processes mostly work on a local
+    tree. Implementation-wise, MPI operations are abstracted as far as
+    possible via defines (that translate to no-ops or memcpy calls in
+    the pthreads version).
+
+    @todo the code still contains many memory copy operations that
+    could be executed more efficiently in-place  
+*/
+
+
+
+void perSiteLogLikelihoodsPthreads(pllInstance *tr, partitionList *pr, double *lhs, int n, int tid);
+void broadcastAfterRateOpt(pllInstance *tr, pllInstance *localTree, partitionList *pr, int n, int tid);
+void branchLength_parallelReduce(pllInstance *tr, double *dlnLdlz,  double *d2lnLdlz2, int numBranches );
+void pllMasterPostBarrier(pllInstance *tr, partitionList *pr, int jobType);
+static void distributeYVectors(pllInstance *localTree, pllInstance *tr, partitionList *localPr);
+static void distributeWeights(pllInstance *localTree, pllInstance *tr, partitionList *localPr);
+static pllBoolean execFunction(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n);
+
+static void *likelihoodThread(void *tData); 
+
+static void multiprocessorScheduling(pllInstance * tr, partitionList *pr, int tid);
+
+static void computeFraction(partitionList *localPr, int tid, int n);
+static void computeFractionMany(partitionList *localPr, int tid);
+static void initializePartitionsMaster(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n);
+
+#ifdef _FINE_GRAIN_MPI
+static char* addBytes(char *buf, void *toAdd, size_t numBytes); 
+static char* popBytes(char *buf, void *result, size_t numBytes); 
+static void defineTraversalInfoMPI(void);
+static pllBoolean pllWorkerTrap(pllInstance *tr, partitionList *pr);
+#endif
+
+#ifdef _USE_PTHREADS
+static pthread_t *threads;
+static threadData *tData;
+#endif
+
+extern volatile int jobCycle; 
+extern volatile int threadJob;          /**< current job to be done by worker threads/processes */
+extern pllBoolean treeIsInitialized; 
+
+#ifdef MEASURE_TIME_PARALLEL
+extern double masterTimePerPhase; 
+double timeBuffer[NUM_PAR_JOBS]; 
+double timePerRegion[NUM_PAR_JOBS]; 
+#endif
+
+extern char* getJobName(int tmp); 
+
+//extern double *globalResult; 
+extern volatile char *barrierBuffer;
+
+
+#ifdef _FINE_GRAIN_MPI
+extern MPI_Datatype TRAVERSAL_MPI; 
+
+/** @brief Helper for appending bytes to a communication buffer (compiled only with _FINE_GRAIN_MPI).
+
+    Copies \a numBytes bytes from \a toAdd to \a buf with \a memcpy.
+
+    @param buf
+      Where to place the bytes
+
+    @param toAdd
+      Where to copy them from
+
+    @param numBytes
+      How many bytes to copy
+
+    @return
+      Pointer just past the placed data in the communication buffer (first free slot)
+ */ 
+static char* addBytes(char *buf, void *toAdd, size_t numBytes)
+{
+  memcpy(buf, toAdd, numBytes);  
+  return buf + numBytes;  
+}
+
+/** @brief Helper for reading bytes back out of a communication buffer (compiled only with _FINE_GRAIN_MPI).
+    
+    Copies \a numBytes bytes from the communication buffer \a buf to the local buffer \a result
+
+    @param buf
+      Communication buffer to copy from
+
+    @param result
+      Where to store the bytes
+
+    @param numBytes
+      How many bytes to copy
+    
+    @return
+      Pointer just past the data that was read from the communication buffer (next unread byte)
+ */ 
+static char* popBytes(char *buf, void *result, size_t numBytes)
+{
+  memcpy(result, buf, numBytes); 
+  return buf + numBytes;   
+}
+
+/** @brief Park MPI worker processes until the master announces the partition count
+
+    On the master process this function does nothing. Every other
+    process receives the number of partitions through a broadcast
+    rooted at rank 0, allocates (via \a rax_calloc) a partition list that
+    carries this count and enters \a pllWorkerTrap. Once the trap
+    returns, the process passes a barrier, finalizes MPI and exits.
+
+    @param tr
+      PLL instance
+    
+    @todo
+      This function should not be called by the user. It is called
+      at \a pllCreateInstance. Probably this function should be removed
+      and inline code be placed in \a pllCreateInstance.
+*/
+void pllLockMPI (pllInstance * tr)
+{
+  if (!MASTER_P) 
+   {
+     int partitionCount;
+     partitionList * workerPartitions;
+
+     /* blocks until the root (rank 0) has broadcast the partition count */
+     MPI_Bcast (&partitionCount, 1, MPI_INT, 0, MPI_COMM_WORLD);
+     workerPartitions = (partitionList *) rax_calloc (1, sizeof (*workerPartitions));
+     workerPartitions->numberOfPartitions = partitionCount;
+
+     pllWorkerTrap (tr, workerPartitions);
+     MPI_Barrier (MPI_COMM_WORLD);
+     MPI_Finalize ();
+     exit(0);
+   }
+}
+
+/** @brief Finalize MPI run
+
+    Synchronizes all processes (master + slaves) with a barrier on
+    \a MPI_COMM_WORLD and then calls \a MPI_Finalize (). The function
+    itself releases no PLL data structures.
+
+    @todo
+      Similarly as with the \a pllLockMPI function, this should not be called
+      by the user, but it is called implicitly at the end of \a pllDestroyInstance.
+      Probably this function should be removed and inline code be placed in
+      \a pllDestroyInstance.
+*/
+void pllFinalizeMPI (void)
+{
+  MPI_Barrier (MPI_COMM_WORLD);
+  MPI_Finalize ();
+}
+
+/**
+   @brief Sets up the MPI environment.  
+
+   Calls \a MPI_Init, stores the rank of the process in \a processID and the
+   size of \a MPI_COMM_WORLD in \a processes, and then waits on a barrier.
+   
+   @note this should be the first call that is executed in your main
+   method.
+   
+   @param argc   
+     Address of argc from main
+   @param argv   
+     Address of argv from main
+ */
+void pllInitMPI(int * argc, char **argv[])
+{  
+  MPI_Init(argc, argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &processID);
+  MPI_Comm_size(MPI_COMM_WORLD, &processes);
+
+  /* no process continues before every process has stored its rank */
+  /* and the total number of processes */
+  MPI_Barrier(MPI_COMM_WORLD);
+
+}
+
+
+/**
+   @brief Traps worker MPI processes in \a likelihoodThread.    
+   
+   @note  Within this file the function is called from \a pllLockMPI
+
+   @param tr 
+     PLL instance 
+
+   @param pr
+     List of partitions
+
+   @return
+     Returns \b PLL_FALSE if the caller was the master thread/process, otherwise \b PLL_TRUE
+ */ 
+static pllBoolean pllWorkerTrap(pllInstance *tr, partitionList *pr)
+{
+  /// @note the broadcasting code needs to know whether the tree structure has already been initialized 
+  treeIsInitialized = PLL_FALSE; 
+
+  if(NOT MASTER_P) 
+    {
+      threadData tData; 
+      tData.tr = tr; 
+      tData.threadNumber = processID;
+      tData.pr = pr;
+      
+      likelihoodThread(&tData);
+
+      /* notice: the next call MUST be the return call from the main method */
+      return PLL_TRUE; 
+    }
+  return PLL_FALSE; 
+}
+
+
+#define ELEMS_IN_TRAV_INFO  9
+/** @brief Create and commit the MPI datatype for sending the traversal descriptor.
+    
+    @note Member displacements are measured on a real \a traversalInfo object and
+   the extent is fixed with \a MPI_Type_create_resized, so struct padding is
+   respected. The \a MPI_UB marker is not used: it was removed in MPI-3.0.
+ */ 
+static void defineTraversalInfoMPI (void)
+{
+  MPI_Datatype *result  = &TRAVERSAL_MPI; 
+  MPI_Datatype unpadded;
+  int i ; 
+  MPI_Aint base; 
+  int blocklen[ELEMS_IN_TRAV_INFO] = {1, 1, 1, 1, PLL_NUM_BRANCHES, PLL_NUM_BRANCHES, 1,1,1}; 
+  MPI_Aint disp[ELEMS_IN_TRAV_INFO];
+  MPI_Datatype type[ELEMS_IN_TRAV_INFO] = {MPI_INT, MPI_INT, MPI_INT, MPI_INT, MPI_DOUBLE, MPI_DOUBLE, MPI_INT, MPI_INT, MPI_INT}; 
+  traversalInfo desc; 
+
+  MPI_Get_address( &desc, disp);
+  MPI_Get_address( &(desc.pNumber), disp + 1 );
+  MPI_Get_address( &(desc.qNumber), disp + 2 );  
+  MPI_Get_address( &(desc.rNumber), disp + 3); 
+  MPI_Get_address( desc.qz, disp + 4 );
+  MPI_Get_address( desc.rz, disp + 5 );
+  MPI_Get_address( &(desc.slot_p), disp + 6);
+  MPI_Get_address( &(desc.slot_q), disp + 7);
+  MPI_Get_address( &(desc.slot_r), disp + 8);
+
+  base = disp[0]; 
+  for(i = 0; i < ELEMS_IN_TRAV_INFO; ++i)
+    disp[i] -= base;
+
+  MPI_Type_create_struct( ELEMS_IN_TRAV_INFO , blocklen, disp, type, &unpadded);
+  MPI_Type_create_resized( unpadded, 0, (MPI_Aint)sizeof(traversalInfo), result); /* extent = one array element */
+  MPI_Type_free(&unpadded);
+  MPI_Type_commit(result);
+}
+
+
+#endif
+
+
+/********************/
+/* PTHREAD-SPECIFIC */
+/********************/
+#ifdef _USE_PTHREADS
+
+#ifndef _PORTABLE_PTHREADS
+/** @brief Pins the calling thread to a core (for efficiency). 
+
+    This is a non-portable function (it relies on \a pthread_setaffinity_np). It binds the
+    calling thread to the core numbered by a static counter that is incremented on every call;
+    the aim is that performance is not degraded by thread migration. Failure triggers assert(0).
+
+    @note It is only compiled if \a _PORTABLE_PTHREADS is not defined
+
+    @param tid the thread id. NOTE(review): currently unused; the counter is also updated
+      without synchronization -- confirm callers never run this concurrently.
+ */ 
+void pinToCore(int tid)
+{
+  static int nextCore = 0;
+
+  cpu_set_t cpuset;
+
+  CPU_ZERO(&cpuset);    
+  CPU_SET(nextCore++, &cpuset);
+
+  if(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0)
+    {
+      assert(0);
+    }
+}
+#endif
+
+/**  @brief Start PThreads
+
+     Creates JOINABLE worker threads 1 .. numberOfThreads - 1 with
+     \a pthread_create; each runs \a likelihoodThread on its own \a threadData.
+     Also resets the job counters and allocates the barrier buffer.
+     NOTE(review): the MEASURE_TIME_PARALLEL block requires _USE_PTHREADS to be
+     undefined, yet sits inside the _USE_PTHREADS section -- looks dead, confirm.
+
+     @param tr  PLL instance (numberOfThreads is read from it)
+     @param pr  List of partitions, handed to every worker
+
+     @todo
+       This function should never be called by the user. It is called
+       implicitly at \a pllInitModel. Perhaps we should add a check
+       or inline the code
+ */ 
+void pllStartPthreads (pllInstance *tr, partitionList *pr)
+{
+  pthread_attr_t attr;
+  int rc, t;
+  treeIsInitialized = PLL_FALSE; 
+
+  jobCycle        = 0;
+  threadJob       = 0;
+
+  /* printf("\nThis is the RAxML Master Pthread\n");   */
+
+#if (NOT defined(_USE_PTHREADS) && defined( MEASURE_TIME_PARALLEL))
+  timeBuffer = rax_calloc(NUM_PAR_JOBS * tr->numberOfThreads, sizeof(double)); 
+#endif
+
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+  threads    = (pthread_t *)rax_malloc((size_t)tr->numberOfThreads * sizeof(pthread_t));
+  tData      = (threadData *)rax_malloc((size_t)tr->numberOfThreads * sizeof(threadData));
+
+  barrierBuffer            = (volatile char *)  rax_malloc(sizeof(volatile char)   *  (size_t)tr->numberOfThreads);
+
+  for(t = 0; t < tr->numberOfThreads; t++)
+    barrierBuffer[t] = 0;
+
+  for(t = 1; t < tr->numberOfThreads; t++)
+    {
+      tData[t].tr  = tr;
+      tData[t].pr  = pr;
+      tData[t].threadNumber = t;
+      rc = pthread_create(&threads[t], &attr, likelihoodThread, (void *)(&tData[t]));
+      if(rc)
+	{
+	  printf("ERROR; return code from pthread_create() is %d\n", rc);
+	  exit(-1);
+	}
+    }
+  pthread_attr_destroy (&attr);
+}
+
+/** @brief Stop PThreads
+    
+    Joins worker threads 1 .. numberOfThreads - 1 with \a pthread_join and then
+    frees the thread handles, the thread data, the barrier buffer and \a globalResult.
+    @param  tr
+      PLL instance
+
+    @todo
+      This function should never be called by the user. It is implicitly called
+      at \a pllPartitionsDestroy. We should inline the code
+*/
+void pllStopPthreads (pllInstance * tr)
+{
+  int i;
+
+  for (i = 1; i < tr->numberOfThreads; ++ i)
+   {
+     pthread_join (threads[i], NULL);
+   }
+ 
+  rax_free (threads);
+  rax_free (tData);
+  rax_free ((void *)barrierBuffer);
+  rax_free (globalResult);
+
+}
+#endif
+
+
+/** @brief Compute per-site log likelihoods (PThreads version) 
+
+    Each worker evaluates the sites it is responsible for and stores the
+    results in \a lhs, indexed by the global site position.
+
+    @param tr 
+      Tree instance
+    @param pr
+      List of partitions
+    @param lhs
+      Likelihood array
+    @param n
+      Number of threads
+    @param tid
+      Thread id
+ */ 
+void perSiteLogLikelihoodsPthreads(pllInstance *tr, partitionList *pr, double *lhs, int n, int tid)
+{
+  size_t 
+    model, 
+    i;
+
+  for(model = 0; model < (size_t)pr->numberOfPartitions; model++)
+    {      
+      size_t 
+	localIndex = 0;
+
+      /* decide if this partition is handled by the thread when -Q is activated 
+	 or when -Q is not activated figure out which sites have been assigned to the 
+	 current thread */
+
+      pllBoolean 
+	execute = ((tr->manyPartitions && isThisMyPartition(pr, tid, model)) || (!tr->manyPartitions));
+
+      /* if the entire partition has been assigned to this thread (-Q) or if -Q is not activated 
+	 we need to compute some per-site log likelihoods with thread tid for this partition */
+
+      if(execute)
+	for(i = (size_t)(pr->partitionData[model]->lower);  i < (size_t)(pr->partitionData[model]->upper); i++)
+	  {
+	    /* if -Q is active we compute all per-site log likelihoods for the partition,
+	       otherwise we only compute those that have been assigned to thread tid 
+	       using the cyclic distribution scheme */
+
+	    if(tr->manyPartitions || (i % n == (size_t)tid))
+	      {
+		double 
+		  l;
+
+		/* now compute the per-site log likelihood at the current site */
+
+		switch(tr->rateHetModel)
+		  {
+		  case PLL_CAT:
+		    l = evaluatePartialGeneric (tr, pr, localIndex, pr->partitionData[model]->perSiteRates[pr->partitionData[model]->rateCategory[localIndex]], model);
+		    break;
+		  case PLL_GAMMA:
+		    l = evaluatePartialGeneric (tr, pr, localIndex, 1.0, model);
+		    break;
+		  default:
+		    assert(0);
+		  }
+
+		/* store it at the global site index i; localIndex counts only the sites
+		   of this partition handled by this thread. See collectDouble() in this
+		   file for how such per-thread values are gathered */
+
+		lhs[i] = l;
+
+		localIndex++;
+	      }
+	  }
+    }
+}
+
+/** @brief Check whether a partition is assigned to a thread/process.
+
+    Compares the \a partitionAssignment stored for partition \a model in
+    the partition list \a localPr with the id \a tid.
+
+    @param localPr
+      Local list of partitions
+
+    @param tid 
+      Thread/Process id
+
+    @param model
+      Partition number
+
+    @return
+      \b PLL_TRUE if the assignment equals \a tid, otherwise \b PLL_FALSE
+ */ 
+pllBoolean isThisMyPartition(partitionList *localPr, int tid, int model)
+{ 
+  return (localPr->partitionData[model]->partitionAssignment == tid) ? PLL_TRUE : PLL_FALSE;
+}
+
+/** @brief Computes partition size for all partitions (in case full partitions are assigned to workers). 
+
+    A partition assigned to \a tid (see \a isThisMyPartition) gets
+    width = upper - lower; every other partition gets width 0.
+    Nothing else is computed or returned.
+
+    @param localPr the local partitions instance
+    
+    @param tid thread id    
+ */ 
+static void computeFractionMany(partitionList *localPr, int tid)
+{
+  int   
+    model;
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    {
+      pInfo
+	*part = localPr->partitionData[model];
+
+      /* tid handles either the whole partition or none of it */
+      if(isThisMyPartition(localPr, tid, model))
+	part->width = part->upper - part->lower;
+      else
+	part->width = 0;
+    }
+}
+
+
+/** @brief Computes partition size for all partitions (for cyclic distribution of sites)
+
+    For every partition the sites lower .. upper - 1 are scanned and those
+    with site % n == tid are counted; the count is stored as the width.
+    
+    @param localPr the local partitions instance
+    @param tid thread id
+    @param n number of workers
+ */ 
+static void computeFraction(partitionList *localPr, int tid, int n)
+{
+  int model;
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    {
+      pInfo *part = localPr->partitionData[model];
+      int site, mine = 0;
+
+      for(site = part->lower; site < part->upper; site++)
+	mine += (site % n == tid);
+      part->width = mine;
+    }
+}
+
+
+
+/** @brief qsort comparator that orders partitions by decreasing length.
+    @param p1 pointer to a \a partitionType
+    @param p2 pointer to another \a partitionType
+    @return -1 if \a p1 is longer than \a p2, 1 if it is shorter, 0 if both are equal
+ */ 
+static int partCompare(const void *p1, const void *p2)
+{
+  const partitionType 
+    *rc1 = (const partitionType *)p1,
+    *rc2 = (const partitionType *)p2;
+  int 
+    i = rc1->partitionLength,
+    j = rc2->partitionLength;
+
+  if (i > j)
+    return (-1);
+  if (i < j)
+    return (1);
+  return (0);
+}
+
+
+/** @brief Top-level function for the multi processor scheduling
+    scheme (assigns full partitions to workers).
+    
+   tr->manyPartitions is set to PLL_TRUE if the user has indicated via -Q
+   that there are substantially more partitions than threads/cores
+   available. In that case we do not distribute sites from each
+   partition in a cyclic fashion to the cores, but distribute entire
+   partitions to cores.  Achieving a good balance of alignment sites
+   to cores boils down to the multi-processor scheduling problem known
+   from theoretical comp. sci., which is NP-complete.  The heuristic used
+   here handles each state count (4, 20) separately: partitions are sorted
+   by decreasing length and each one is given to the worker with the
+   fewest sites assigned so far.
+   
+   @param tr PLL instance (supplies the number of threads in the non-MPI build)
+   @param pr 
+     List of partitions; \a partitionAssignment of every partition is set
+   @param tid
+     Id of current process/thread (not used by the function body)
+*/
+static void multiprocessorScheduling(pllInstance * tr, partitionList *pr, int tid)
+{
+  int 
+    s,
+    model,
+    modelStates[2] = {4, 20},
+    numberOfPartitions[2] = {0 , 0},
+      arrayLength = sizeof(modelStates) / sizeof(int);
+
+      /* check that we have not added any new models for data types with a different number of states
+	 and forgot to update modelStates */
+
+      for(model = 0; model < pr->numberOfPartitions; model++)
+	{        
+	  pllBoolean 
+	    exists = PLL_FALSE;
+
+	  for(s = 0; s < arrayLength; s++)
+	    {
+	      exists = exists || (pr->partitionData[model]->states == modelStates[s]);
+	      if(pr->partitionData[model]->states == modelStates[s])
+		numberOfPartitions[s] += 1;
+	    }
+
+	  assert(exists);
+	}
+
+      for(s = 0; s < arrayLength; s++)
+	{
+	  if(numberOfPartitions[s] > 0)
+	    {
+	      size_t   
+		checkSum = 0,
+		sum = 0;
+
+	      int    
+		i,
+		k,
+#ifndef _FINE_GRAIN_MPI
+		n = tr->numberOfThreads,
+#else
+		n = processes,
+#endif
+		p = numberOfPartitions[s],    
+		*assignments = (int *)rax_calloc((size_t)n, sizeof(int));  
+
+	      partitionType 
+		*pt = (partitionType *)rax_malloc(sizeof(partitionType) * (size_t)p);
+
+
+
+	      for(i = 0, k = 0; i < pr->numberOfPartitions; i++)
+		{
+		  if(pr->partitionData[i]->states == modelStates[s])
+		    {
+		      pt[k].partitionNumber = i;
+		      pt[k].partitionLength = pr->partitionData[i]->upper - pr->partitionData[i]->lower;
+		      sum += (size_t)pt[k].partitionLength;
+		      k++;
+		    }
+		}
+
+	      assert(k == p);
+
+	      qsort(pt, p, sizeof(partitionType), partCompare);    
+
+	      for(i = 0; i < p; i++)
+		{
+		  int 
+		    k, 
+		    min = INT_MAX,
+		    minIndex = -1;
+
+		  for(k = 0; k < n; k++)	
+		    if(assignments[k] < min)
+		      {
+			min = assignments[k];
+			minIndex = k;
+		      }
+
+		  assert(minIndex >= 0);
+
+		  assignments[minIndex] +=  pt[i].partitionLength;
+		  assert(pt[i].partitionNumber >= 0 && pt[i].partitionNumber < pr->numberOfPartitions);
+		  pr->partitionData[pt[i].partitionNumber]->partitionAssignment = minIndex;
+		}
+
+              
+              /* Process i gets assignments[i] sites for modelStates[s] state model */
+
+	      for(i = 0; i < n; i++)
+		checkSum += (size_t)assignments[i];
+
+	      assert(sum == checkSum);
+
+	      rax_free(assignments);
+	      rax_free(pt);
+	    }
+	}
+}
+
+
+
+/** @brief Reduce the first and second derivative of the likelihood
+    function.
+    
+    With _REPRODUCIBLE_MPI_OR_PTHREADS the per-thread first and second
+    derivatives stored in \a globalResult (2 * numBranches values per
+    thread, first derivatives first) are summed up by the master here.
+    Otherwise the first numBranches values of \a globalResult are copied to
+    \a dlnLdlz and the following numBranches values to \a d2lnLdlz2.
+
+   @warning operates on global reduction buffers \a globalResult
+   @param tr tree 
+   @param dlnLdlz first derivative (output, one value per branch)
+   @param d2lnLdlz2 second derivative (output, one value per branch)
+   @param numBranches number of branches, i.e. of values written to each output
+*/
+void branchLength_parallelReduce(pllInstance *tr, double *dlnLdlz,  double *d2lnLdlz2, int numBranches )
+{
+#ifdef _REPRODUCIBLE_MPI_OR_PTHREADS
+
+  /* only the master executes this  */
+  assert(tr->threadID == 0); 
+  
+  int b; 
+  int t; 
+  for(b = 0; b < numBranches; ++b)
+    {
+      dlnLdlz[b] = 0; 
+      d2lnLdlz2[b] = 0; 
+
+      for(t = 0; t < tr->numberOfThreads; ++t)
+	{
+	  dlnLdlz[b] += globalResult[t * numBranches * 2 + b ];
+	  d2lnLdlz2[b] += globalResult[t * numBranches * 2 + numBranches + b];
+	}
+    }
+#else 
+  memcpy(dlnLdlz, globalResult, sizeof(double) * numBranches);
+  memcpy(d2lnLdlz2, globalResult + numBranches, sizeof(double) * numBranches);
+#endif
+}
+
+
+
+/** @brief Copy the doubles owned by one worker between a packed buffer and
+    a per-site array. Returns the number of buffer elements covered.
+
+    The sites owned by worker \a tid are visited partition by partition in
+    increasing site order, and consecutive elements of \a buf correspond to
+    consecutive owned sites. Ownership is decided as follows:
+    - if \a tr->manyPartitions is set, a worker owns all sites of the
+      partitions for which \a isThisMyPartition holds;
+    - otherwise it owns the sites i with i % n == tid.
+
+    If \a read is set to \b PLL_TRUE, then the contents of \a srcTar are
+    copied to \a buf. Otherwise, the contents of \a buf are copied to
+    \a srcTar.
+   
+   @param buf 
+     Packed buffer
+
+   @param srcTar 
+     Pointer to either source or destination array (indexed by site)
+
+   @param tr
+     PLL instance
+
+   @param pr
+     List of partitions
+
+   @param n number of workers
+
+   @param tid process id
+
+   @param read 
+     If read-mode then set to \b PLL_TRUE
+
+   @param countOnly
+     if \b PLL_TRUE, nothing is copied and only the number of elements is returned
+   @return number of elements of \a buf that were (or would have been) used
+*/
+static int doublesToBuffer(double *buf, double *srcTar, pllInstance *tr, partitionList *pr, int n, int tid, pllBoolean read, pllBoolean countOnly)
+{
+  int 
+    model,
+    site;
+  double 
+    *cursor = buf; 
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      /* whole-partition mode: partitions of other workers contribute nothing */
+      if(tr->manyPartitions && !isThisMyPartition(pr, tid, model))
+	continue;
+
+      for(site = pr->partitionData[model]->lower; site < pr->partitionData[model]->upper; site++)
+	{
+	  /* cyclic mode: skip the sites that belong to other workers */
+	  if(!tr->manyPartitions && site % n != tid)
+	    continue;
+
+	  if(NOT countOnly)
+	    {
+	      if(read)
+		*cursor = srcTar[site]; 
+	      else 
+		srcTar[site] = *cursor; 
+	    }
+	  cursor++;
+	}
+    }
+  
+  return cursor - buf; 
+}
+
+
+
+
+/** @brief broadcast rates after rate optimization. 
+    
+    For every worker i the entries of \a patrat, \a patratStored and \a lhs
+    that belong to i are packed from \a localTree into one buffer, passed
+    through BCAST_BUF with root i and written into the same arrays of \a tr.
+    With _USE_PTHREADS each thread only handles i == tid. The buffer is
+    allocated on the heap, because three arrays of originalCrunchedLength
+    doubles can exceed the stack size for long alignments.
+
+    @param tr Library instance
+    @param localTree local library instance 
+    @param pr list of partitions
+    @param n number of workers 
+    @param tid worker id 
+    @todo mpi_alltoallv/w may be more efficient, but it is a hell to set up
+ */ 
+void broadcastAfterRateOpt(pllInstance *tr, pllInstance *localTree, partitionList *pr, int n, int tid)
+{				  
+  int
+    num1 = 0,
+    num2 = 0,
+    num3 = 0, 
+    i ; 
+  /* one buffer for all three arrays: each contributes at most originalCrunchedLength values */
+  double
+    *allBuf = (double *)rax_malloc((size_t)tr->originalCrunchedLength * 3 * sizeof(double));
+
+  if(allBuf == NULL)
+    {
+      printf("ERROR: out of memory in broadcastAfterRateOpt()\n");
+      exit(-1);
+    }
+    
+  for(i = 0; i < n; ++i)
+    {
+      int
+	numDouble,
+	assertCtr = 0; 
+
+#ifdef _USE_PTHREADS
+      if(i != tid)
+	continue; 
+#endif
+
+      /* extract doubles: worker i packs its values back to back, all others only count them */
+      num1 = doublesToBuffer(allBuf, localTree->patrat, tr, pr, n,i, PLL_TRUE, i!= tid);
+      num2 = doublesToBuffer(allBuf + num1, localTree->patratStored, tr, pr, n,i, PLL_TRUE, i!= tid);
+      num3 = doublesToBuffer(allBuf + (num1 + num2), localTree->lhs, tr, pr, n,i, PLL_TRUE, i!= tid);
+      numDouble = num1 + num2 + num3; 
+
+      BCAST_BUF(allBuf, numDouble, MPI_DOUBLE, i); 
+
+      /* re-insert doubles  */
+      assertCtr += doublesToBuffer(allBuf, tr->patrat, tr, pr, n,i,PLL_FALSE, PLL_FALSE);
+      assertCtr += doublesToBuffer(allBuf + num1, tr->patratStored, tr, pr, n,i,PLL_FALSE, PLL_FALSE);
+      assertCtr += doublesToBuffer(allBuf + (num1 + num2), tr->lhs, tr, pr, n,i,PLL_FALSE, PLL_FALSE);
+
+      assert(assertCtr == numDouble); 
+    }
+
+  rax_free(allBuf);
+}
+
+
+/** @brief Collect doubles from workers to master.
+    MPI build: each process packs its entries of \a src and MPI_Gatherv sends them to rank 0, which
+    unpacks them into \a dst (index arrays are sized by \a n). Other builds pack straight into \a dst.
+    @param dst destination array
+    @param src source array
+    @param tr library instance 
+    @param pr list of partitions
+    @param n number of workers 
+    @param tid worker id 
+ */
+static void collectDouble(double *dst, double *src, pllInstance *tr, partitionList *pr, int n, int tid)
+{
+#ifdef _FINE_GRAIN_MPI    
+  int
+    assertNum = 0,
+    i, 
+    displacements[n];
+  double 
+    buf[tr->originalCrunchedLength],
+    resultBuf[tr->originalCrunchedLength]; 
+
+  /* NOTE: This was moved here because it was an additional unnecessary move for the PTHREADS version. I didnt
+  have time to check the MPI version, have to get back to this and remove it */
+  /* gather own persite log likelihood values into local buffer  */
+  int numberCollected = doublesToBuffer(buf, src, tr, pr,n,tid,PLL_TRUE, PLL_FALSE);
+
+  /* this communicates all the values to the master */
+  
+  int numberPerWorker[n];     
+  if(MASTER_P)			/* master counts number to receive, receives and writes back */
+    {
+      for(i = 0; i < n; ++i)
+	{
+	  numberPerWorker[i] = doublesToBuffer(buf,src,tr,pr,n,i,PLL_FALSE, PLL_TRUE);
+	  displacements[i] = i == 0 ? 0 : displacements[i-1] + numberPerWorker[i-1]; 
+	}
+      
+      MPI_Gatherv(buf, numberCollected, MPI_DOUBLE,
+		  resultBuf, numberPerWorker, displacements,  MPI_DOUBLE,
+		  0, MPI_COMM_WORLD); 
+
+      double *bufPtr = resultBuf; 
+      for(i = 0 ; i < n; ++i)
+	{
+	  int numberWritten = doublesToBuffer(bufPtr, dst,tr,pr,n,i, PLL_FALSE, PLL_FALSE);
+	  bufPtr += numberWritten; 
+	  assertNum += numberWritten; 
+	}    
+      
+      assert(assertNum == tr->originalCrunchedLength);
+    }
+  else 				/* workers only send their buffer   */
+    MPI_Gatherv(buf, numberCollected, MPI_DOUBLE, resultBuf, numberPerWorker, displacements, MPI_DOUBLE, 0, MPI_COMM_WORLD);   
+#else 
+  /* pthread version only writes to global space  */  
+
+  /* read mode: the entries of src owned by tid are packed, in site order, */
+  doublesToBuffer (dst, src, tr, pr, n, tid, PLL_TRUE, PLL_FALSE);
+  /* into consecutive elements starting at dst */
+#endif
+}
+
+
+
+/** @brief Broadcast the four gamma rates of every partition (new alpha for the GAMMA model)
+    @param localPr local list of partitions (first argument of ASSIGN_BUF_DBL)
+    @param pr list of partitions paired with \a localPr (second argument of ASSIGN_BUF_DBL)
+    @note RECV_BUF, ASSIGN_BUF_DBL and SEND_BUF are defined elsewhere; presumably they copy pr -> localPr -- confirm
+ */
+static void broadCastAlpha(partitionList *localPr, partitionList *pr)
+{
+  int  i, 
+    model; 
+
+#ifdef _FINE_GRAIN_MPI
+    int bufSize = localPr->numberOfPartitions * 4 * sizeof(double);
+  char bufDbl[bufSize]; 
+  char *bufPtrDbl = bufDbl;   
+#endif
+
+  RECV_BUF(bufDbl, bufSize, MPI_BYTE); 
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    for(i = 0; i < 4; ++i)
+      ASSIGN_BUF_DBL(localPr->partitionData[model]->gammaRates[i], pr->partitionData[model]->gammaRates[i]);
+  
+  SEND_BUF(bufDbl, bufSize, MPI_BYTE);  
+}
+
+/** @brief Broadcast the four LG4X weights of every partition
+    @param localPr local list of partitions (first argument of ASSIGN_BUF_DBL)
+    @param pr list of partitions paired with \a localPr (second argument of ASSIGN_BUF_DBL)
+    @note RECV_BUF, ASSIGN_BUF_DBL and SEND_BUF are defined elsewhere; presumably they copy pr -> localPr -- confirm
+ */
+static void broadCastLg4xWeights(partitionList *localPr, partitionList *pr)
+{
+  int  i,
+    model;
+
+#ifdef _FINE_GRAIN_MPI
+    int bufSize = localPr->numberOfPartitions * 4 * sizeof(double);
+  char bufDbl[bufSize];
+  char *bufPtrDbl = bufDbl;
+#endif
+
+  RECV_BUF(bufDbl, bufSize, MPI_BYTE);
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    for(i = 0; i < 4; ++i)
+      ASSIGN_BUF_DBL(localPr->partitionData[model]->lg4x_weights[i], pr->partitionData[model]->lg4x_weights[i]);
+
+  SEND_BUF(bufDbl, bufSize, MPI_BYTE);
+}
+
+/** @brief Pass the LG4M/LG4X model arrays of all partitions through ASSIGN_BUF_DBL.
+
+    For every partition whose \a protModels is PLL_LG4M or PLL_LG4X the four
+    sets of EIGN, EV, EI, substitution rates, frequencies and tip vectors are
+    handled. The communication buffer exists only in the _FINE_GRAIN_MPI
+    build (as in \a broadCastAlpha); declared elsewhere it would have length zero.
+
+    @param localPr local list of partitions (first argument of ASSIGN_BUF_DBL)
+    @param pr list of partitions paired with \a localPr (second argument)
+ */
+static void copyLG4(partitionList *localPr, partitionList *pr)
+{
+    int model, i, k;
+
+#ifdef _FINE_GRAIN_MPI
+    /* determine size of buffer needed first */
+    int bufSize = 0;
+
+    for(model = 0; model < localPr->numberOfPartitions; ++model )
+      {
+        const partitionLengths *pl = getPartitionLengths(pr->partitionData[model]);
+        bufSize += 4*(pl->eignLength + pl->evLength + pl->eiLength + pl->tipVectorLength + pl->substRatesLength + pl->frequenciesLength) * sizeof(double) ;
+      }
+
+    char
+      bufDbl[bufSize];
+    char *bufPtrDbl = bufDbl;
+#endif
+
+    RECV_BUF(bufDbl, bufSize, MPI_BYTE);
+
+    for (model = 0; model < localPr->numberOfPartitions; model++)
+    {
+        pInfo * localInfo = localPr->partitionData[model];
+        pInfo * info = pr->partitionData[model];
+
+        if (info->protModels == PLL_LG4M || info->protModels == PLL_LG4X)
+        {
+            for (k = 0; k < 4; k++)
+            {
+                const partitionLengths *pl = getPartitionLengths(info);
+
+                for (i = 0; i < pl->eignLength; ++i)
+                    ASSIGN_BUF_DBL(localInfo->EIGN_LG4[k][i], info->EIGN_LG4[k][i]);
+                for (i = 0; i < pl->evLength; ++i)
+                    ASSIGN_BUF_DBL(localInfo->EV_LG4[k][i], info->EV_LG4[k][i]);
+                for (i = 0; i < pl->eiLength; ++i)
+                    ASSIGN_BUF_DBL(localInfo->EI_LG4[k][i], info->EI_LG4[k][i]);
+                for (i = 0; i < pl->substRatesLength; ++i)
+                    ASSIGN_BUF_DBL(localInfo->substRates_LG4[k][i], info->substRates_LG4[k][i]);
+                for (i = 0; i < pl->frequenciesLength; ++i)
+                    ASSIGN_BUF_DBL(localInfo->frequencies_LG4[k][i], info->frequencies_LG4[k][i]);
+                for (i = 0; i < pl->tipVectorLength; ++i)
+                    ASSIGN_BUF_DBL(localInfo->tipVector_LG4[k][i], info->tipVector_LG4[k][i]);
+            }
+        }
+    }
+    SEND_BUF(bufDbl, bufSize, MPI_BYTE); /*  */
+}
+
+/** @brief Master broadcasts rates (EIGN, EV, EI, tip vectors), then calls \a copyLG4.
+    
+    @param localPr local list of partitions (first argument of ASSIGN_BUF_DBL)
+    @param pr list of partitions paired with \a localPr -- presumably the master's; confirm
+    @note the buffer exists only for _FINE_GRAIN_MPI (as in \a broadCastAlpha); elsewhere it would have length zero
+ */ 
+static void broadCastRates(partitionList *localPr, partitionList *pr)
+{
+  int 
+    i,
+    model;
+
+#ifdef _FINE_GRAIN_MPI
+  /* determine size of buffer needed first */
+  int bufSize = 0;
+  for(model = 0; model < localPr->numberOfPartitions; ++model )
+    {	  
+      const partitionLengths *pl = getPartitionLengths(pr->partitionData[model]); /* this is constant, isnt it?  */
+      bufSize += (pl->eignLength + pl->evLength + pl->eiLength + pl->tipVectorLength) * sizeof(double) ;
+    }
+
+  char
+    bufDbl[bufSize];
+  char *bufPtrDbl = bufDbl;
+#endif
+
+  RECV_BUF(bufDbl, bufSize, MPI_BYTE);
+
+  for(model = 0; model < localPr->numberOfPartitions; model++)
+    {
+      const partitionLengths *pl = getPartitionLengths(pr->partitionData[model]); /* this is constant, isnt it?  */
+
+      for(i = 0; i < pl->eignLength; ++i)
+	ASSIGN_BUF_DBL(localPr->partitionData[model]->EIGN[i], pr->partitionData[model]->EIGN[i]);
+      for(i = 0; i < pl->evLength; ++i)
+	ASSIGN_BUF_DBL(localPr->partitionData[model]->EV[i],pr->partitionData[model]->EV[i]);
+      for(i = 0; i  < pl->eiLength; ++i)
+	ASSIGN_BUF_DBL(localPr->partitionData[model]->EI[i], pr->partitionData[model]->EI[i]);
+      for(i = 0; i < pl->tipVectorLength; ++i)
+	ASSIGN_BUF_DBL(localPr->partitionData[model]->tipVector[i],   pr->partitionData[model]->tipVector[i]);
+    }
+  SEND_BUF(bufDbl, bufSize, MPI_BYTE); /*  */
+
+  copyLG4(localPr, pr);
+}
+
+/** @brief Evaluate the likelihood of this topology (PThreads/MPI implementation)
+
+    Evaluate the likelihood of the topology described in the PLL instance. First
+    every thread calls \a pllEvaluateIterative where it computes the log likelihoods
+    for the portion of each assigned partition. The results (for all partitions) are stored
+    as elements of a local buffer array (\a buf). This is done by all threads. Subsequently, 
+    an \a MPI_Reduce operation sums the contents of corresponding elements of the local
+    buffer arrays into another array (\a targetBuf) which are the log likelihoods of
+    each (complete) partition. Finally, the last array is copied to the master thread/process.
+    If _REPRODUCIBLE_MPI_OR_PTHREADS is defined, \a buf is handed to ASSIGN_GATHER instead.
+    In addition, if \a getPerSiteLikelihoods is enabled the log likelihoods for each site
+    in the (compressed) alignment are stored in the array \a tr->lhs.
+
+    @param tr
+      PLL instance
+    @param localTree
+      Local (thread/process) PLL instance
+
+    @param localPr
+      Local (thread/process) list of partitions
+
+    @param tid
+      Thread/Process ID
+
+    @param getPerSiteLikelihoods 
+      If set to \b PLL_TRUE, compute the log likelihood for each site. 
+ */ 
+static void reduceEvaluateIterative(pllInstance *tr, pllInstance *localTree, partitionList *localPr, int tid, pllBoolean getPerSiteLikelihoods)
+{
+  int model;
+
+  pllEvaluateIterative(localTree, localPr, getPerSiteLikelihoods);
+
+  /* when this is done we need to hand the per-thread log likelihoods to the
+     master. In the reproducible build they are passed, together with tid,
+     to ASSIGN_GATHER with globalResult as the target.
+
+     the actual sum over the gathered entries is then computed by the master
+     (see pllMasterPostBarrier), which ensures that the sum is deterministic */
+
+  
+  /* if (getPerSiteLikelihoods == PLL_TRUE) store per-site likelihoods in array tr->lhs */
+  if(getPerSiteLikelihoods)
+    {    
+#ifdef _FINE_GRAIN_MPI
+      int n = processes; 
+#else 
+      int n = tr->numberOfThreads; 
+#endif
+
+      /* rearrange per site likelihoods into single local array for gathering */
+      int i ; 
+      for(model = 0; model < localPr->numberOfPartitions; ++model)
+	{
+	  pInfo *partition = localPr->partitionData[model]; 
+	  pllBoolean isMyPartition  = isThisMyPartition(localPr, tid, model);
+
+	  int ctr = 0; 	/* index into this worker's perSiteLikelihoods */
+	  for(i = partition->lower; i < partition->upper; ++i)
+	    {
+	      /* manyPartitions: every site of an owned partition is copied;
+	         otherwise only the sites with i % n == tid are copied */
+	      if(tr->manyPartitions && isMyPartition)
+		localTree->lhs[i] = partition->perSiteLikelihoods[ ctr++]; 
+	      else if(NOT tr->manyPartitions && (i % n) == tid)
+		localTree->lhs[i] = partition->perSiteLikelihoods[ctr++];
+	    }
+	}
+      
+      /* gather all the double into the global array */
+      collectDouble(tr->lhs, localTree->lhs, localTree, localPr,  n, tid); 
+    }
+
+  /* from here on: hand the per-partition log likelihoods to the master */
+#ifdef _REPRODUCIBLE_MPI_OR_PTHREADS
+  /* 
+     aberer: I implemented this as a mpi_gather operation into this buffer, 
+     pthreads version emulates this gather; 
+     master takes care of the reduction; 
+  */
+
+  double 
+    buf[localPr->numberOfPartitions];
+
+  for(model = 0; model < localPr->numberOfPartitions; ++model)
+    buf[model] = localPr->partitionData[model]->partitionLH;
+
+  /* either make reproducible or efficient */
+  ASSIGN_GATHER(globalResult, buf, localPr->numberOfPartitions, PLL_DOUBLE, tid);
+
+  /* the summation itself is left to the master */
+#else 
+  /* the efficient mpi version: a proper reduce  */
+  double 
+    buf[localPr->numberOfPartitions];
+  
+  for(model = 0; model < localPr->numberOfPartitions; ++model)
+    buf[model] = localPr->partitionData[model]->partitionLH;
+
+  double 
+    targetBuf[localPr->numberOfPartitions];
+  
+  memset(targetBuf, 0, sizeof(double) * localPr->numberOfPartitions);
+
+  MPI_Reduce(buf, targetBuf, localPr->numberOfPartitions, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+  
+  /* only the root (rank 0) of the reduce stores the summed values */
+  if(MASTER_P) 
+    {
+      for(model = 0; model < localPr->numberOfPartitions; ++model) {
+	localPr->partitionData[model]->partitionLH = targetBuf[model];
+      }
+    }
+#endif
+}
+
+
+
+/** @brief Broadcast the traversal descriptor to worker threads. 
+
+  The Pthreads branch copies the descriptor fields from \a tr to
+  \a localTree; the other branch packs them into a byte buffer in
+  preparation for the fine-grained MPI BlueGene version.
+
+  @param localTree local library instance
+  @param tr library instance
+  @param localPr local list of partitions; only numberOfPartitions is read here
+*/
+/* TODO: we should reset this at some point, the explicit copy is just done for testing */
+__inline static void broadcastTraversalInfo(pllInstance *localTree, pllInstance *tr, partitionList *localPr)
+{
+  /* @todo these two regions could be joined */
+#ifdef _USE_PTHREADS
+  /* memcpy -> memmove (see ticket #43). This function is sometimes called with localTree == tr,
+   * in which case some memcpy implementations can corrupt the buffers.
+   */
+  
+  localTree->td[0].functionType =            tr->td[0].functionType;
+  localTree->td[0].count =                   tr->td[0].count ;
+  localTree->td[0].traversalHasChanged =     tr->td[0].traversalHasChanged;
+
+  memmove(localTree->td[0].executeModel,    tr->td[0].executeModel,    sizeof(pllBoolean) * localPr->numberOfPartitions);
+  memmove(localTree->td[0].parameterValues, tr->td[0].parameterValues, sizeof(double) * localPr->numberOfPartitions);
+  
+  /* the traversal entries are only copied when the traversal has changed */
+  if(localTree->td[0].traversalHasChanged)
+    memmove(localTree->td[0].ti, tr->td[0].ti, localTree->td[0].count * sizeof(traversalInfo));
+
+#else
+  /* MPI */
+  /* like in raxml-light: first we send a small message, if the
+     traversalDescriptor is longer, then resend */
+  
+  int length = treeIsInitialized ? localPr->numberOfPartitions : 0;
+  char broadCastBuffer[messageSize(length)]; 
+  char *bufPtr = broadCastBuffer; 
+  int i; 
+
+  RECV_BUF(broadCastBuffer, messageSize(length), MPI_BYTE); 
+
+  ASSIGN_BUF(localTree->td[0].functionType, tr->td[0].functionType , int);   
+  ASSIGN_BUF(localTree->td[0].count,  tr->td[0].count , int); 
+  ASSIGN_BUF(localTree->td[0].traversalHasChanged, tr->td[0].traversalHasChanged , int); 
+
+  /* per-partition values and the first TRAVERSAL_LENGTH entries are only
+     part of the message once the tree is initialized */
+  if(treeIsInitialized)  
+    { 
+      for(i = 0; i < localPr->numberOfPartitions; ++i)
+	{
+	  ASSIGN_BUF(localTree->td[0].executeModel[i],      tr->td[0].executeModel[i], int); 
+	  ASSIGN_BUF(localTree->td[0].parameterValues[i],	 tr->td[0].parameterValues[i], double); 
+	}      
+
+      for(i = 0; i < TRAVERSAL_LENGTH; ++i )
+	ASSIGN_BUF(localTree->td[0].ti[i], tr->td[0].ti[i], traversalInfo); 
+    }
+    
+  SEND_BUF(broadCastBuffer, messageSize(length), MPI_BYTE); 
+
+  /* now we send the second part of the traversal descriptor, if we
+     exceed the pre-set number of elements */
+  if(treeIsInitialized && localTree->td[0].count > TRAVERSAL_LENGTH) 
+    {
+      /* lets use the MPI_Datatype for this thing, from what I've read it's
+	 supposed to be more secure and efficient */
+      MPI_Bcast(localTree->td[0].ti + TRAVERSAL_LENGTH, localTree->td[0].count - TRAVERSAL_LENGTH, TRAVERSAL_MPI, 0, MPI_COMM_WORLD );
+    }
+#endif
+}
+
+
+/** @brief helper that yields a string representation of a parallel region. 
+    
+    @param type type of parallel region (one of the PLL_THREAD_* job constants)
+
+    @return the name of the constant as a string literal; the caller must
+      neither modify nor free it. For an unknown \a type the function asserts
+      and, if assertions are compiled out, returns "UNKNOWN".
+ */ 
+char* getJobName(int type)
+{
+  switch(type)  
+    {
+    case  PLL_THREAD_NEWVIEW:       
+      return "PLL_THREAD_NEWVIEW";
+    case PLL_THREAD_EVALUATE: 
+      return "PLL_THREAD_EVALUATE";
+    case PLL_THREAD_MAKENEWZ: 
+      return "PLL_THREAD_MAKENEWZ";
+    case PLL_THREAD_MAKENEWZ_FIRST: 
+      return "PLL_THREAD_MAKENEWZ_FIRST";
+    case PLL_THREAD_RATE_CATS: 
+      return "PLL_THREAD_RATE_CATS";
+    case PLL_THREAD_COPY_RATE_CATS: 
+      return "PLL_THREAD_COPY_RATE_CATS";
+    case PLL_THREAD_COPY_INIT_MODEL: 
+      return "PLL_THREAD_COPY_INIT_MODEL";
+    case PLL_THREAD_INIT_PARTITION: 
+      return "PLL_THREAD_INIT_PARTITION";
+    case PLL_THREAD_OPT_ALPHA: 
+      return "PLL_THREAD_OPT_ALPHA";
+    case PLL_THREAD_OPT_RATE: 
+      return "PLL_THREAD_OPT_RATE";
+    case PLL_THREAD_COPY_ALPHA: 
+      return "PLL_THREAD_COPY_ALPHA";
+    case PLL_THREAD_COPY_RATES: 
+      return "PLL_THREAD_COPY_RATES";
+    /* the two LG4X jobs are dispatched by execFunction and therefore need
+       a name as well; without them a DEBUG_PARALLEL build hit the assert */
+    case PLL_THREAD_COPY_LG4X_RATES: 
+      return "PLL_THREAD_COPY_LG4X_RATES";
+    case PLL_THREAD_OPT_LG4X_RATE: 
+      return "PLL_THREAD_OPT_LG4X_RATE";
+    case PLL_THREAD_PER_SITE_LIKELIHOODS: 
+      return "PLL_THREAD_PER_SITE_LIKELIHOODS";
+    case PLL_THREAD_NEWVIEW_ANCESTRAL: 
+      return "PLL_THREAD_NEWVIEW_ANCESTRAL";
+    case PLL_THREAD_GATHER_ANCESTRAL: 
+      return "PLL_THREAD_GATHER_ANCESTRAL";
+    case PLL_THREAD_EXIT_GRACEFULLY: 
+      return "PLL_THREAD_EXIT_GRACEFULLY";
+    case PLL_THREAD_EVALUATE_PER_SITE_LIKES:
+      return "PLL_THREAD_EVALUATE_PER_SITE_LIKES";
+    default: 
+      assert(0); 
+      break;
+    }
+
+  /* only reached for an unknown job when assert() is disabled (NDEBUG);
+     never fall off the end of a function whose result is used */
+  return "UNKNOWN";
+}
+
+/**
+   @brief Generic entry point for parallel regions (mostly broadcasts
+   traversal descriptor first).
+
+   This function here handles all parallel regions in the Pthreads
+   version, when we enter this function pllMasterBarrier() has been called
+   by the master thread from within the sequential part of the
+   program, tr is the library instance (tree) at the master thread, 
+   localTree is the library instance (tree) at the worker threads
+
+   While this is not necessary, address spaces of threads are indeed
+   separated for easier transition to a distributed memory paradigm
+   
+   @param tr library instance
+   @param localTree local library instance 
+   @param pr list of partitions that belongs to \a tr
+   @param localPr local list of partitions that belongs to \a localTree
+   @param tid worker id 
+   @param n number of workers 
+
+   @return PLL_FALSE after a PLL_THREAD_EXIT_GRACEFULLY job, PLL_TRUE otherwise
+*/
+static pllBoolean execFunction(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n)
+{
+  int
+    i,
+    model,
+    localCounter;
+
+#ifdef MEASURE_TIME_PARALLEL
+  double timeForParallelRegion = gettime();
+#endif
+
+
+#ifdef _USE_PTHREADS
+  /* the job type is stored in the bits above the lowest 16 of threadJob (see pllMasterBarrier) */
+  int currentJob = threadJob >> 16;
+#endif
+
+  /* here the master sends and all threads/processes receive the traversal descriptor */
+  broadcastTraversalInfo(localTree, tr, localPr);
+
+#ifdef _USE_PTHREADS
+  /* make sure that nothing is going wrong */
+  assert(currentJob == localTree->td[0].functionType);
+#else   
+  localTree = tr; 
+  int currentJob = localTree->td[0].functionType; 
+#endif
+
+#ifdef DEBUG_PARALLEL
+  printf("[%d] working on %s\n", tid, getJobName(currentJob)); 
+#endif  
+
+  switch(currentJob)
+    { 
+    case PLL_THREAD_NEWVIEW: 
+      /* just a newview on the fraction of sites that have been assigned to this thread */
+
+      pllNewviewIterative(localTree, localPr, 0);
+      break;     
+    case PLL_THREAD_EVALUATE: 
+      reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_FALSE);
+      break;	
+    case PLL_THREAD_MAKENEWZ_FIRST:
+
+      /* this is the first call from within makenewz that requires getting the likelihood vectors to the left and 
+         right of the branch via newview and doing some precomputations.
+	 
+         For details see comments in makenewzGenericSpecial.c 
+         (deliberate fallthrough into PLL_THREAD_MAKENEWZ) */
+    case  PLL_THREAD_MAKENEWZ:
+      {	
+	double
+	  dlnLdlz[PLL_NUM_BRANCHES],
+	  d2lnLdlz2[PLL_NUM_BRANCHES]; 
+
+	if(localTree->td[0].functionType == PLL_THREAD_MAKENEWZ_FIRST)
+	  makenewzIterative(localTree, localPr);
+	execCore(localTree, localPr, dlnLdlz, d2lnLdlz2);
+
+	/* gather the first and second derivatives that have been written by each thread */
+	/* as for evaluate above, the final sum over the derivatives will be computed by the 
+	   master thread in its sequential part of the code */
+
+	int numBranches = localPr->perGeneBranchLengths?localPr->numberOfPartitions:1;
+
+#ifdef _REPRODUCIBLE_MPI_OR_PTHREADS
+	/* MPI: implemented as a gather again, pthreads: just buffer copying */	
+	double buf[ 2 * numBranches];
+	memcpy( buf, dlnLdlz, numBranches * sizeof(double) );
+	memcpy(buf + numBranches, d2lnLdlz2, numBranches * sizeof(double));
+
+	ASSIGN_GATHER(globalResult, buf,  2 * numBranches, PLL_DOUBLE, tid);
+#else 	
+	/* first derivatives go to globalResult[0..numBranches), second ones behind them */
+	double result[numBranches];
+	memset(result,0, numBranches * sizeof(double));
+	MPI_Reduce( dlnLdlz , result , numBranches, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	if(MASTER_P)
+	  memcpy(globalResult, result, sizeof(double) * numBranches);
+	
+	memset(result,0,numBranches * sizeof(double));
+	MPI_Reduce( d2lnLdlz2 , result , numBranches, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	if(MASTER_P)
+	  memcpy(globalResult + numBranches, result, sizeof(double) * numBranches);
+#endif
+      }
+
+      break;
+
+    case PLL_THREAD_INIT_PARTITION:       
+
+      /* broadcast data and initialize and allocate arrays in partitions */
+      
+      initializePartitionsMaster(tr, localTree, pr, localPr, tid, n);
+
+      break;          
+    case PLL_THREAD_COPY_ALPHA: 
+    case PLL_THREAD_OPT_ALPHA:
+      /* this is when we have changed the alpha parameter, inducing a change in the discrete gamma rate categories.
+	 this is called when we are optimizing or sampling (in the Bayesian case) alpha parameter values */
+      
+      /* distribute the new discrete gamma rates to the threads */
+      broadCastAlpha(localPr,pr);
+
+      /* compute the likelihood, note that this is always a full tree traversal ! */
+      if(localTree->td[0].functionType == PLL_THREAD_OPT_ALPHA)
+	reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_FALSE);
+
+      break;
+    case PLL_THREAD_OPT_RATE:
+    case PLL_THREAD_COPY_RATES:
+
+      /* if we are optimizing the rates in the transition matrix Q this induces recomputing the eigenvector eigenvalue 
+	 decomposition and the tipVector as well because of the special numerics in RAxML, the matrix of eigenvectors 
+	 is "rotated" into the tip lookup table.
+
+	 Hence if the sequential part of the program that steers the Q matrix rate optimization has changed a rate we
+	 need to broadcast all eigenvectors, eigenvalues etc to each thread 
+      */
+
+      broadCastRates(localPr, pr);
+
+      /* now evaluate the likelihood of the new Q matrix, this always requires a full tree traversal because the changes need
+	 to be propagated throughout the entire tree */
+
+      if(localTree->td[0].functionType == PLL_THREAD_OPT_RATE)
+	reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_FALSE);
+
+      break;
+    case PLL_THREAD_COPY_LG4X_RATES:
+
+        broadCastLg4xWeights(localPr, pr);
+        broadCastAlpha(localPr, pr);
+
+        /* sanity check: after the broadcast the first weight must be identical on both sides */
+        assert(localPr->partitionData[0]->lg4x_weights[0] == pr->partitionData[0]->lg4x_weights[0]);
+
+        break;
+    case PLL_THREAD_OPT_LG4X_RATE:
+
+        broadCastLg4xWeights(localPr, pr);
+        broadCastAlpha(localPr, pr);
+
+        /* same sanity check as for PLL_THREAD_COPY_LG4X_RATES */
+        assert(localPr->partitionData[0]->lg4x_weights[0] == pr->partitionData[0]->lg4x_weights[0]);
+
+        /* compute the likelihood, note that this is always a full tree traversal ! */
+        reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_FALSE);
+
+        break;
+    case PLL_THREAD_COPY_INIT_MODEL:
+      {
+
+	/* need to be very careful here ! PLL_THREAD_COPY_INIT_MODEL is also used when the program is restarted 
+	   it is hence not sufficient to just initialize everything by the default values ! */
+
+	broadCastRates(localPr, pr);
+	broadCastAlpha(localPr, pr); /* isnt that only executed when we are on gamma?  */
+	broadCastLg4xWeights(localPr, pr);
+
+	/*
+	  copy initial model parameters, the Q matrix and alpha are initially, when we start our likelihood search 
+	  set to default values. 
+	  Hence we need to copy all those values that are required for computing the likelihood 
+	  with newview(), evaluate() and makenewz() to the private memory of the threads 
+	*/
+
+
+	if( localTree->rateHetModel == PLL_CAT) /* TRICKY originally this should only be executed by workers  */
+	  {
+#ifdef _FINE_GRAIN_MPI
+	    /* room for patrat and patratStored: two doubles per site pattern */
+	    int bufSize = 2 * localTree->originalCrunchedLength * sizeof(double); 
+	    char bufDbl[bufSize], 
+	      *bufPtrDbl = bufDbl; 
+#endif
+
+	    RECV_BUF(bufDbl, bufSize,MPI_BYTE); 
+
+	    /* this should be local  */
+	    for(model = 0; model < localPr->numberOfPartitions; model++)
+	      localPr->partitionData[model]->numberOfCategories      = pr->partitionData[model]->numberOfCategories;
+
+
+	    /* this is only relevant for the PSR model, we can worry about this later */
+	    for(i = 0; i < localTree->originalCrunchedLength; ++i)
+	      {
+		ASSIGN_BUF_DBL(localTree->patrat[i], tr->patrat[i]);
+		ASSIGN_BUF_DBL(localTree->patratStored[i], tr->patratStored[i]); 
+	      }
+
+	    SEND_BUF(bufDbl, bufSize, MPI_BYTE); 
+	  }
+      } 
+      break;    
+    case PLL_THREAD_RATE_CATS: 
+      {
+	/* this is for optimizing per-site rate categories under PSR, let's worry about this later */
+
+	ASSIGN_DBL( localTree->lower_spacing,  tr->lower_spacing);
+	ASSIGN_DBL( localTree->upper_spacing,  tr->upper_spacing);
+
+	optRateCatPthreads(localTree, localPr, localTree->lower_spacing, localTree->upper_spacing, localTree->lhs, n, tid);
+
+	broadcastAfterRateOpt(tr, localTree, localPr, n,  tid);
+      }
+      break;
+    case PLL_THREAD_COPY_RATE_CATS:
+      {
+	/* 
+	   this is invoked when we have changed the per-site rate category assignment
+	   In essence it distributes the new per site rates to all threads 
+
+	   The pthread-version here simply assigns everything as ought to
+	   be. The MPI-version is configured to write to a buffer instead
+	   and SEND (master) or RECV (workers) it.
+
+	*/
+
+	/* 
+	   start of communication part 
+	*/
+
+	int i, 
+	  /* NOTE(review): this i shadows the function-level i declared at the top */
+	  /* dblBufSize accumulates the byte size of the second (double) message */
+	  dblBufSize = 0; 
+
+#ifdef _FINE_GRAIN_MPI
+	int bufSize = localPr->numberOfPartitions * sizeof(int); 
+	char buf[bufSize]; 
+	char *bufPtr = buf; 
+#endif
+     
+	RECV_BUF(buf, bufSize, MPI_BYTE);
+
+	/* first message: one category count per partition */
+	for( model = 0; model < localPr->numberOfPartitions; ++model)
+	  {
+	    ASSIGN_BUF(localPr->partitionData[model]->numberOfCategories, pr->partitionData[model]->numberOfCategories, int);
+	    dblBufSize += localPr->partitionData[model]->numberOfCategories * sizeof(double);
+	  }
+
+	SEND_BUF(buf, bufSize, MPI_BYTE); 
+
+
+	dblBufSize += 2 * localTree->originalCrunchedLength * sizeof(double); 
+
+#ifdef _FINE_GRAIN_MPI
+	char bufDbl[dblBufSize],
+	  *bufPtrDbl = bufDbl;
+#endif
+
+	RECV_BUF(bufDbl, dblBufSize, MPI_BYTE); 
+
+	/* second message: patrat/patratStored per site, then the per-site rates of each partition */
+	for(i = 0; i < localTree->originalCrunchedLength; ++i)
+	  {	 
+	    ASSIGN_BUF_DBL(localTree->patrat[i], tr->patrat[i]); 
+	    ASSIGN_BUF_DBL(localTree->patratStored[i], tr->patratStored[i]); 
+	  }
+
+	for( model = 0; model < localPr->numberOfPartitions; ++model)
+	  for(i = 0; i < localPr->partitionData[model]->numberOfCategories; i++)
+	    ASSIGN_BUF_DBL(localPr->partitionData[model]->perSiteRates[i], pr->partitionData[model]->perSiteRates[i]);
+
+	SEND_BUF(bufDbl, dblBufSize, MPI_BYTE); 
+
+
+	/* lets test, if it is a good idea to send around the basic categories  */
+#ifdef _FINE_GRAIN_MPI
+	/* TODO this is inefficient, but it seems to have a small impact on performance */
+	MPI_Bcast(tr->rateCategory, tr->originalCrunchedLength, MPI_INT, 0, MPI_COMM_WORLD); 
+#endif
+
+
+	/* 
+	   now re-assign values: copy this worker's share of tr->rateCategory
+	   into the per-partition rateCategory arrays
+	*/
+	for(model = 0; model < localPr->numberOfPartitions; model++)
+	  {
+	    if(localTree->manyPartitions)
+	      {
+		/* whole partitions are owned by a worker: copy every site of an owned partition */
+		if(isThisMyPartition(localPr, tid, model))
+		  for(localCounter = 0, i = localPr->partitionData[model]->lower;  i < localPr->partitionData[model]->upper; i++, localCounter++)
+		    {	     
+		      localPr->partitionData[model]->rateCategory[localCounter] = tr->rateCategory[i];
+		    } 
+	      }
+	    else	  
+	      {
+		/* sites are dealt out cyclically: this worker takes the sites with i % n == tid */
+		for(localCounter = 0, i = localPr->partitionData[model]->lower;  i < localPr->partitionData[model]->upper; i++)
+		  {
+		    if(i % n == tid)
+		      {		 
+			localPr->partitionData[model]->rateCategory[localCounter] = tr->rateCategory[i];
+
+			localCounter++;
+		      }
+		  }
+	      }
+	  }
+      }
+      break;
+    case PLL_THREAD_PER_SITE_LIKELIHOODS:      
+      {
+
+	/* compute per-site log likelihoods for the sites/partitions 
+	   that are handled by this thread */
+	perSiteLogLikelihoodsPthreads(localTree, localPr, localTree->lhs, n, tid);
+
+	/* do a parallel gather operation, the threads will write their results 
+	   into the global buffer tr->lhs that will then contain all per-site log likelihoods
+	   in the proper order 
+	*/
+
+	collectDouble(tr->lhs,                localTree->lhs,                  localTree, localPr, n, tid);
+
+      }
+      break;
+      /* the two ancestral jobs are not handled here: reaching them is an error */
+    case PLL_THREAD_NEWVIEW_ANCESTRAL:       
+      assert(0);
+      break; 
+    case PLL_THREAD_GATHER_ANCESTRAL:
+      assert(0); 
+      break; 
+    case PLL_THREAD_EXIT_GRACEFULLY: 
+      {
+	/* cleans up the workers memory */
+
+#ifdef _USE_PTHREADS
+	/* TODO destroying the tree does not work yet in a highly
+	   generic manner. */
+
+	if(NOT MASTER_P)
+	  {
+	    pllPartitionsDestroy (localTree, &localPr);
+	    /* pllTreeDestroy (localTree); */
+	  }
+	else 
+	  {
+	    //pllPartitionsDestroy (tr, &pr);
+	    /* pllTreeDestroy (tr); */
+	  }
+
+#else 
+	//pllPartitionsDestroy (tr, &pr);
+	/* pllTreeDestroy (tr); */
+	
+	//MPI_Finalize();
+	//exit(0); 
+#endif	
+	/* PLL_FALSE tells the calling loop in likelihoodThread to stop */
+	return PLL_FALSE; 
+      }
+      break; 
+    case PLL_THREAD_EVALUATE_PER_SITE_LIKES: 
+      {
+	reduceEvaluateIterative(tr, localTree, localPr, tid, PLL_TRUE);
+      }
+      break;
+    default:
+      printf("Job %d\n", currentJob);
+      assert(0);
+    }
+
+  return PLL_TRUE; 
+}
+
+
+
+
+/**  Target function where the threads/processes are trapped
+
+     The threads/processes spend all of their time in this function
+     running operations on the data (computing likelihoods). The loop ends
+     as soon as \a execFunction returns PLL_FALSE; afterwards the buffers
+     of the local instance are released.
+
+     @param tData
+       Structure that contains the vital information for the thread/process, 
+       i.e. PLL instance, list of partitions and thread ID
+
+     @return always NULL
+
+     @note
+       The data in \a tData are different for pthreads and MPI. 
+       Expand this section.
+ */ 
+static void *likelihoodThread(void *tData)
+{
+  threadData *td = (threadData*)tData;
+  pllInstance 
+    *tr = td->tr;
+  partitionList *pr = td->pr;
+
+#ifdef _USE_PTHREADS
+  /* every worker thread gets its own, zero-initialized instance and partition list */
+  pllInstance *localTree = rax_calloc(1,sizeof(pllInstance )); 
+  partitionList *localPr = rax_calloc(1,sizeof(partitionList));
+
+  int
+    myCycle = 0,
+    localTrap = 1;
+
+  const int 
+    n = td->tr->numberOfThreads,
+    tid = td->threadNumber;
+
+#ifndef _PORTABLE_PTHREADS
+  pinToCore(tid);
+#endif
+
+  /* main worker loop: wait for a job, execute it, signal completion */
+
+  while(localTrap)
+    {
+
+      while (myCycle == threadJob);	/* busy-wait until the master publishes a new threadJob value */
+      myCycle = threadJob;
+
+      /* for every job except PLL_THREAD_INIT_PARTITION, refresh these two fields from the master's list first */
+      if ((threadJob >> 16) != PLL_THREAD_INIT_PARTITION) {
+    	  localPr->perGeneBranchLengths = pr->perGeneBranchLengths;
+      	  localPr->numberOfPartitions = pr->numberOfPartitions;
+      }
+      localTrap = execFunction(tr, localTree, pr, localPr, tid, n);
+
+      barrierBuffer[tid] = 1;     	/* tell the master (pllMasterBarrier) that this worker is done */
+    }
+    /* NOTE(review): localPr itself is not released here - confirm that pllPartitionsDestroy takes care of it */
+    rax_free (localTree->td[0].executeModel); //localTree->td[0].executeModel = NULL;
+    rax_free (localTree->td[0].parameterValues); //localTree->td[0].parameterValues = NULL;
+    rax_free (localTree->rateCategory); //localTree->rateCategory = NULL;
+    rax_free (localTree->lhs); //localTree->lhs = NULL;
+    rax_free (localTree->patrat); //localTree->patrat = NULL;
+    rax_free (localTree->patratStored); //localTree->patratStored = NULL;
+    rax_free (localTree->td[0].ti); //localTree->td[0].ti = NULL;
+    rax_free (localTree);
+#else 
+  const int
+    n = processes, 
+    tid = td->threadNumber;
+  int i;
+
+  /* MPI: the instance passed in doubles as the local instance */
+
+  while(execFunction(tr, tr, pr, pr, tid,n));
+
+  /* the loop has ended: release the instance and the partition list */
+  rax_free (tr->lhs);
+  rax_free (tr->td[0].ti);
+  rax_free (tr->td[0].executeModel);
+  rax_free (tr->td[0].parameterValues);
+  rax_free (tr->patrat);
+  rax_free (tr->patratStored);
+  rax_free (tr->aliaswgt);
+  rax_free (tr->y_ptr);
+  for (i = 0; i < pr->numberOfPartitions; ++ i)
+    rax_free (pr->partitionData[i]);
+  rax_free (pr->partitionData);
+  rax_free (pr);
+  rax_free (tr);
+#endif
+
+  return (void*)NULL;
+}
+
+
+/**
+   @brief Master-only follow-up work after a parallel region has finished.
+
+   Called by the master once the barrier has been passed. For the
+   likelihood-evaluating jobs of the reproducible build it adds up, per
+   partition, the values the workers stored in \a globalResult and writes
+   the sum to the partition's \a partitionLH. For
+   PLL_THREAD_PER_SITE_LIKELIHOODS it checks that the per-site values in
+   \a tr->lhs add up to \a tr->likelihood. All other jobs need no
+   post-processing.
+
+   @param tr
+     PLL instance (must be the master, i.e. threadID 0)
+
+   @param pr
+     List of partitions
+
+   @param jobType
+     Job that has just been executed
+*/
+void pllMasterPostBarrier(pllInstance *tr, partitionList *pr, int jobType)
+{
+  assert(tr->threadID == 0);
+
+  switch(jobType)
+    {
+    case PLL_THREAD_EVALUATE:
+    case PLL_THREAD_OPT_RATE:
+    case PLL_THREAD_OPT_ALPHA:
+    case PLL_THREAD_OPT_LG4X_RATE:
+    case PLL_THREAD_EVALUATE_PER_SITE_LIKES:
+      {
+#ifdef _REPRODUCIBLE_MPI_OR_PTHREADS
+	int worker, part;
+	volatile double partitionResult;
+
+	/* globalResult holds one row of numberOfPartitions values per worker;
+	   the rows are always added in ascending worker order */
+	for(part = 0; part < pr->numberOfPartitions; ++part)
+	  {
+	    partitionResult = 0.0;
+
+	    for(worker = 0; worker < tr->numberOfThreads; ++worker)
+	      partitionResult += globalResult[worker * pr->numberOfPartitions + part];
+
+	    pr->partitionData[part]->partitionLH = partitionResult;
+	  }
+#endif
+
+	break;
+      }
+    case PLL_THREAD_PER_SITE_LIKELIHOODS:
+      {
+	/* error check only: the per-site log likelihoods have to add up
+	   to the overall likelihood */
+	double accumulatedPerSiteLikelihood = 0.;
+	int site = 0;
+
+	while(site < tr->originalCrunchedLength)
+	  {
+	    accumulatedPerSiteLikelihood += tr->lhs[site];
+	    ++site;
+	  }
+
+	assert(PLL_ABS(tr->likelihood - accumulatedPerSiteLikelihood) < 0.00001);
+
+	break;
+      }
+    default:
+      /* nothing to do for the remaining jobs */
+      break;
+    }
+}
+
+/**
+   @brief A generic master barrier for executing parallel parts of the code
+
+   A generic master barrier through which the master thread/process controls
+   the work job execution. Through the parameter \a jobType the master instructs
+   the slaves of what type of work they must conduct. The master executes its
+   own share via \a execFunction and finally calls \a pllMasterPostBarrier.
+
+   @param tr
+     PLL instance
+
+   @param pr
+     List of partitions
+
+   @param jobType 
+     Type of job to be conducted
+ */ 
+void pllMasterBarrier(pllInstance *tr, partitionList *pr, int jobType)
+{
+
+#ifdef MEASURE_TIME_PARALLEL
+  /* the time since the last phase change is booked on the extra slot NUM_PAR_JOBS */
+  assert(jobType < NUM_PAR_JOBS); 
+  timePerRegion[NUM_PAR_JOBS]  += gettime()- masterTimePerPhase ; 
+  masterTimePerPhase = gettime();
+#endif
+
+#ifdef _USE_PTHREADS
+  const int 
+    n = tr->numberOfThreads;
+
+  tr->td[0].functionType = jobType;
+
+  /* publish the job: job type shifted left by 16, plus the toggled jobCycle flag
+     so that the value differs from the previous one */
+  jobCycle = !jobCycle;
+  threadJob = (jobType << 16) + jobCycle;
+
+  execFunction(tr, tr, pr, pr, 0, n);	/* the master does its share as worker 0 */
+
+  int 
+    i, 
+    sum;
+
+  /* busy-wait until every worker 1..n-1 has set its barrierBuffer entry;
+     sum starts at 1 to account for the master itself */
+  do
+    {
+      for(i = 1, sum = 1; i < n; i++)
+	sum += barrierBuffer[i];
+    }
+  while(sum < n);  
+
+  /* reset the flags for the next parallel region */
+  for(i = 1; i < n; i++)
+    barrierBuffer[i] = 0;
+#else 
+  tr->td[0].functionType = jobType; 
+  execFunction(tr,tr,pr,pr,0,processes);
+#endif
+
+  /* code executed by the master, once the barrier is crossed */
+  pllMasterPostBarrier(tr, pr, jobType);
+
+#ifdef MEASURE_TIME_PARALLEL
+  timePerRegion[jobType] += gettime() - masterTimePerPhase; 
+  masterTimePerPhase = gettime();
+#endif
+}
+
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+/** @brief Initialize structures for slave process/threads
+ 
+    Allocate all memory structures required by slave threads/processes and
+    copy the basic settings of \a tr and \a pr (flags, lengths, and for
+    every partition its category count, state counts, data type, protein
+    model settings, bounds and ascBias) to the local structures.
+
+    @param localTree 
+      A local PLL instance for the slave process/thread which is initialized in this function based on \a tr
+
+    @param tr 
+      PLL Instance
+
+    @param localPr
+      A local list of partitions for the slave process/thread which will be initialized based on \a pr 
+
+    @param pr
+      List of partitions
+
+    @param tid
+      The slave process/thread ID; in the MPI build it is overwritten with \a processID
+
+    @note
+      This function should never be called by the master thread, but is called by master process in MPI implementation.
+ */ 
+static void assignAndInitPart1(pllInstance *localTree, pllInstance *tr, partitionList *localPr, partitionList *pr, int *tid)
+{
+  size_t
+    model; 
+  int
+    totalLength = 0; 
+
+#ifdef _USE_PTHREADS
+  localTree->threadID = *tid; 
+  /* printf("my id is %d\n", *tid);  */
+  assert(localTree != tr);
+  localTree->numberOfThreads = tr->numberOfThreads;
+#else  /* => MPI */
+  *tid = processID; 
+  localTree->threadID = processID; 
+  tr->numberOfThreads = processes;
+
+  /* The buffer has to hold every value passed to ASSIGN_BUF below:
+     8 int + 1 pllBoolean for the instance/list settings and
+     8 int + 1 pllBoolean (ascBias) for each partition. The former size
+     of (9 + numberOfPartitions * 8) ints left no room for ascBias. */
+  int bufSize = (8 + pr->numberOfPartitions * 8) * sizeof(int)
+    + (1 + pr->numberOfPartitions) * sizeof(pllBoolean);
+  char buf[bufSize], 
+    *bufPtr = buf;  
+#endif
+
+  RECV_BUF(buf, bufSize, MPI_BYTE); 
+
+  ASSIGN_BUF( localTree->useRecom,                  tr->useRecom, int);
+  ASSIGN_BUF( localTree->rateHetModel,              tr->rateHetModel, int);
+  ASSIGN_BUF( localTree->useMedian,                 tr->useMedian, int); 
+  ASSIGN_BUF( localTree->saveMemory,                tr->saveMemory, int);
+  ASSIGN_BUF( localTree->maxCategories,             tr->maxCategories, int);
+  ASSIGN_BUF( localTree->originalCrunchedLength,    tr->originalCrunchedLength, int);
+  ASSIGN_BUF( localTree->mxtips,                    tr->mxtips, int);
+  ASSIGN_BUF( localPr->numberOfPartitions,          pr->numberOfPartitions, int);
+  ASSIGN_BUF( localPr->perGeneBranchLengths,        pr->perGeneBranchLengths, pllBoolean);
+
+  localTree->td[0].count = 0; 
+
+  /* only the slaves allocate; the sizes rely on the values received above */
+  if(NOT MASTER_P)
+    {
+      localTree->lhs                     = (double*)rax_calloc((size_t)localTree->originalCrunchedLength, sizeof(double));     
+      /* NOTE(review): PLL_NUM_BRANCHES slots are allocated here and for the two
+         td[0] arrays below, but numberOfPartitions entries are used - this
+         assumes numberOfPartitions <= PLL_NUM_BRANCHES; confirm */
+      localPr->partitionData           = (pInfo**)rax_calloc(PLL_NUM_BRANCHES,sizeof(pInfo*));
+      for(model = 0; model < (size_t)localPr->numberOfPartitions; model++) {
+    	localPr->partitionData[model] = (pInfo*)rax_calloc(1,sizeof(pInfo));
+      }
+      localTree->td[0].ti              = (traversalInfo *)rax_malloc(sizeof(traversalInfo) * (size_t)localTree->mxtips);
+      localTree->td[0].executeModel    = (pllBoolean *)rax_malloc(sizeof(pllBoolean) * PLL_NUM_BRANCHES);
+      localTree->td[0].parameterValues = (double *)rax_malloc(sizeof(double) * PLL_NUM_BRANCHES);
+      localTree->patrat       = (double*)rax_malloc(sizeof(double) * (size_t)localTree->originalCrunchedLength);
+      localTree->patratStored = (double*)rax_malloc(sizeof(double) * (size_t)localTree->originalCrunchedLength);            
+    }
+  
+  for(model = 0; model < (size_t)localPr->numberOfPartitions; model++)
+    {
+      ASSIGN_BUF(localPr->partitionData[model]->numberOfCategories,     pr->partitionData[model]->numberOfCategories, int);
+      ASSIGN_BUF(localPr->partitionData[model]->states,                 pr->partitionData[model]->states, int);
+      ASSIGN_BUF(localPr->partitionData[model]->maxTipStates ,          pr->partitionData[model]->maxTipStates, int);
+      ASSIGN_BUF(localPr->partitionData[model]->dataType ,              pr->partitionData[model]->dataType, int);
+      ASSIGN_BUF(localPr->partitionData[model]->protModels ,            pr->partitionData[model]->protModels, int);
+      ASSIGN_BUF(localPr->partitionData[model]->protUseEmpiricalFreqs , pr->partitionData[model]->protUseEmpiricalFreqs, int);
+      ASSIGN_BUF(localPr->partitionData[model]->lower ,                 pr->partitionData[model]->lower, int);
+      ASSIGN_BUF(localPr->partitionData[model]->upper ,                 pr->partitionData[model]->upper, int);
+      ASSIGN_BUF(localPr->partitionData[model]->ascBias,                pr->partitionData[model]->ascBias, pllBoolean);
+
+      localPr->partitionData[model]->partitionLH = 0.0;      
+
+      totalLength += (localPr->partitionData[model]->upper -  localPr->partitionData[model]->lower);
+    }
+
+  SEND_BUF(buf, bufSize, MPI_BYTE); 
+
+  /* the partitions have to cover the whole (compressed) alignment */
+  assert(totalLength == localTree->originalCrunchedLength);
+
+  ASSIGN_DBL(localTree->vectorRecomFraction, tr->vectorRecomFraction); 
+}
+#endif
+
+
+/** @brief Distribute y-vectors during initialization. 
+
+    Distribute the alignment data to the slave process/threads. Each slave
+    copies the data (alignment) from its assigned partition to its local 
+    partition structure.
+
+    With \b _FINE_GRAIN_MPI the sequence of every taxon is first broadcast by
+    the master into a staging buffer. That buffer is allocated once on the heap
+    and reused for all taxa; a variable-length array on the stack could
+    overflow the stack for long alignments.
+
+    If \a tr->manyPartitions is set, whole partitions are copied by the thread
+    that owns them (see \a isThisMyPartition). Otherwise sites are distributed
+    cyclically: site \a i goes to the thread for which i % n == tid.
+
+    @param tr 
+      PLL instance
+    
+    @param localTree 
+      Local library instance for the current thread
+
+    @param localPr
+      Local list of partitions structure for the current thread
+ */ 
+static void distributeYVectors(pllInstance *localTree, pllInstance *tr, partitionList *localPr)
+{
+  size_t 
+    i,
+    n = localTree->numberOfThreads,
+    globalCounter = 0,
+    localCounter = 0,
+    model = 0, 
+    j; 
+  int tid = localTree->threadID; 
+#ifdef _FINE_GRAIN_MPI
+  /* staging buffer for one taxon; at least one byte so that the allocation is never empty */
+  size_t yBufLength = (size_t)tr->originalCrunchedLength;
+  unsigned char *yBuf = (unsigned char *)rax_malloc((yBufLength > 0 ? yBufLength : 1) * sizeof(unsigned char));
+
+  assert(yBuf != NULL);
+#endif
+
+  /* distribute the y-vectors */
+  for(j = 1 ; j <= (size_t)localTree->mxtips; j++)
+    {
+#ifdef _FINE_GRAIN_MPI
+      if(MASTER_P)
+	memcpy(yBuf, tr->yVector[j], yBufLength * sizeof(unsigned char));
+      MPI_Bcast(  yBuf, tr->originalCrunchedLength, MPI_UNSIGNED_CHAR,0,MPI_COMM_WORLD); 
+#endif
+
+      /* globalCounter walks over all sites of taxon j, partition after partition */
+      for(model = 0, globalCounter = 0; model < (size_t)localPr->numberOfPartitions; model++)
+	{
+	  if(tr->manyPartitions)
+	    {
+	      if(isThisMyPartition(localPr, tid, model))
+		{
+		  assert(localPr->partitionData[model]->upper - localPr->partitionData[model]->lower == localPr->partitionData[model]->width);
+		  for(localCounter = 0, i = (size_t)localPr->partitionData[model]->lower;  i < (size_t)localPr->partitionData[model]->upper; i++, localCounter++, globalCounter++)
+		    {
+#ifdef _USE_PTHREADS
+		      localPr->partitionData[model]->yVector[j][localCounter] = tr->yVector[j][globalCounter];
+#else 
+		      localPr->partitionData[model]->yVector[j][localCounter] = yBuf[globalCounter];
+#endif
+		    }
+		}
+	      else
+		/* not our partition: skip its sites */
+		globalCounter += (localPr->partitionData[model]->upper - localPr->partitionData[model]->lower);
+	    }
+	  else 
+	    {
+	      for(localCounter = 0, i = (size_t)localPr->partitionData[model]->lower;  i < (size_t)localPr->partitionData[model]->upper; i++, globalCounter++)
+		{
+		  if(i % (size_t)n == (size_t)tid)
+		    {
+#ifdef _USE_PTHREADS
+		      localPr->partitionData[model]->yVector[j][localCounter] = tr->yVector[j][globalCounter];
+#else 
+		      localPr->partitionData[model]->yVector[j][localCounter] = yBuf[globalCounter];
+#endif
+		      ++localCounter; 
+		    }
+		}	   
+	    }
+	}
+    }
+
+#ifdef _FINE_GRAIN_MPI
+  rax_free(yBuf);
+#endif
+}
+
+/** @brief Hand each process/thread its share of the alignment pattern weights
+
+    Copies entries of the global weight vector \a tr->aliaswgt into the \a wgt
+    array of every local partition. With \b _FINE_GRAIN_MPI the non-master
+    processes first allocate \a tr->aliaswgt and receive its contents through
+    a broadcast.
+
+    If \a tr->manyPartitions is set, a partition is copied as a whole by the
+    thread for which \a isThisMyPartition holds; otherwise site \a i of a
+    partition is taken by the thread for which i % n == tid.
+
+    @param tr 
+      PLL instance
+    
+    @param localTree 
+      Local library instance for the current process/thread
+
+    @param localPr
+      Local list of partitions for the current process/thread
+
+    @todo
+      The alignment weights should go to the partitions structure rather than the tree structure
+ */ 
+static void distributeWeights(pllInstance *localTree, pllInstance *tr, partitionList *localPr)
+{
+  int tid = localTree->threadID; 
+  int n = localTree->numberOfThreads; 
+
+  size_t srcPos = 0;   /* read position in tr->aliaswgt */
+  size_t model;
+
+  /* distribute the weights  */
+#ifdef _FINE_GRAIN_MPI 		/* need to broadcast a few things first */
+  if(NOT MASTER_P)
+    tr->aliaswgt = rax_malloc(sizeof(int) * tr->originalCrunchedLength); 
+  MPI_Bcast(tr->aliaswgt, tr->originalCrunchedLength, MPI_INT, 0, MPI_COMM_WORLD);      
+#endif
+
+  for(model = 0; model < (size_t)localPr->numberOfPartitions; model++)
+    {
+      size_t dstPos = 0;   /* write position in the local wgt array */
+      size_t site;
+      size_t first;
+      size_t last;
+
+      if(tr->manyPartitions)
+	{
+	  if(NOT isThisMyPartition(localPr, tid, model))
+	    {
+	      /* someone else's partition: just step over its sites */
+	      srcPos += (localPr->partitionData[model]->upper - localPr->partitionData[model]->lower);
+	      continue;
+	    }
+
+	  assert(localPr->partitionData[model]->upper - localPr->partitionData[model]->lower == localPr->partitionData[model]->width);
+
+	  first = (size_t)localPr->partitionData[model]->lower;
+	  last  = (size_t)localPr->partitionData[model]->upper;
+
+	  site = first;
+	  while(site < last)
+	    {
+	      localPr->partitionData[model]->wgt[dstPos] = tr->aliaswgt[srcPos];
+	      site++;
+	      dstPos++;
+	      srcPos++;
+	    }
+	}
+      else
+	{
+	  first = (size_t)localPr->partitionData[model]->lower;
+	  last  = (size_t)localPr->partitionData[model]->upper;
+
+	  site = first;
+	  while(site < last)
+	    {
+	      /* cyclic distribution: keep only the sites that map to this thread */
+	      if(site % (size_t)n == (size_t)tid)
+		{
+		  localPr->partitionData[model]->wgt[dstPos] = tr->aliaswgt[srcPos];
+		  dstPos++;
+		}
+	      site++;
+	      srcPos++;
+	    }
+	}
+    }
+}
+
+
+/** @brief Initialize the partitioning scheme (master function) in parallel environment.
+    
+    Initialize the partition scheme in all processes/threads. This is a wrapper function
+    that calls all necessary functions for allocating the local structures for slave threads
+    and for distributing all necessary data from the master threads, such as alignment data,
+    and weight vectors.
+
+    The steps, in order, are: mark the tree as initialized, copy/broadcast the
+    basic counters, allocate \a globalResult, compute the local width of every
+    partition, allocate the per-partition data, allocate one contiguous block
+    (\a localTree->y_ptr) for the local sequence data and point the per-taxon
+    \a yVector entries into it, and finally copy weights and sequences.
+
+    @param tr 
+      PLL instance
+
+    @param localTree 
+      Local PLL instance for the slave process/thread
+
+    @param pr
+      List of partitions
+
+    @param localPr
+      Local partition structure for the slave process/thread
+
+    @param tid
+      Process/thread id
+
+    @param n 
+      Number of processes/threads
+*/ 
+static void initializePartitionsMaster(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n)
+{ 
+  size_t
+    model;
+
+  treeIsInitialized = PLL_TRUE; 
+
+  /* ASSIGN_INT is a plain assignment with pthreads and an MPI broadcast with MPI */
+  ASSIGN_INT(localTree->manyPartitions, tr->manyPartitions);
+  ASSIGN_INT(localTree->numberOfThreads, tr->numberOfThreads);
+  ASSIGN_INT(localPr->numberOfPartitions, pr->numberOfPartitions);
+
+#ifdef _USE_PTHREADS
+  /* pthreads: only the master allocates the shared, zero-initialized result
+     array (2 doubles per thread and partition); the other threads set up
+     their local structures instead */
+  if(MASTER_P)
+    globalResult = rax_calloc((size_t) tr->numberOfThreads * (size_t)pr->numberOfPartitions* 2 ,sizeof(double));
+  else 
+    assignAndInitPart1(localTree, tr, localPr, pr, &tid);
+#else 
+  /* non-pthreads build: every process allocates its own result array and runs the set-up */
+  globalResult = rax_calloc((size_t) tr->numberOfThreads * (size_t)pr->numberOfPartitions* 2 ,sizeof(double));
+  assignAndInitPart1(localTree, tr, localPr, pr, &tid);
+  defineTraversalInfoMPI();
+#endif
+
+  /* reset the local widths before they are recomputed below */
+  for(model = 0; model < (size_t)localPr->numberOfPartitions; model++)
+    localPr->partitionData[model]->width        = 0;
+
+  if(tr->manyPartitions)    
+    {
+      multiprocessorScheduling(localTree, localPr, tid);
+      computeFractionMany(localPr, tid);
+    }
+  else
+    computeFraction(localPr, tid, n);
+
+  initializePartitionData(localTree, localPr);
+
+  {
+    /* NOTE: this 'model' shadows the one declared at function scope */
+    size_t 
+      model,  
+      i,      
+      countOffset,
+      myLength = 0;
+
+    /* myLength = total number of sites handled locally, summed over all partitions */
+    for(model = 0; model < (size_t)localPr->numberOfPartitions; model++)
+      myLength += localPr->partitionData[model]->width;
+
+    /* assign local memory for storing sequence data:
+       one block of myLength bytes per taxon, for mxtips taxa */
+    
+    localTree->y_ptr = (unsigned char *)rax_malloc(myLength * (size_t)(localTree->mxtips) * sizeof(unsigned char));
+    assert(localTree->y_ptr != NULL);
+
+    for(i = 0; i < (size_t)localTree->mxtips; i++)
+      {
+	for(model = 0, countOffset = 0; model < (size_t)localPr->numberOfPartitions; model++)
+	  {	    
+	    /* yVector is indexed from 1; the row of taxon i+1 starts at i * myLength,
+	       and each partition gets a slice of 'width' bytes inside that row */
+	    localPr->partitionData[model]->yVector[i+1]   = &localTree->y_ptr[i * myLength + countOffset];
+	    countOffset +=  localPr->partitionData[model]->width;
+	  }
+	assert(countOffset == myLength);
+      }
+
+    /* figure in data */
+
+    distributeWeights(localTree, tr, localPr);
+
+    distributeYVectors(localTree, tr, localPr);
+
+  }
+
+  initMemorySavingAndRecom(localTree, localPr);
+}
diff --git a/pllrepo/src/genericParallelization.h b/pllrepo/src/genericParallelization.h
new file mode 100644
index 0000000..576f8e9
--- /dev/null
+++ b/pllrepo/src/genericParallelization.h
@@ -0,0 +1,127 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file genericParallelization.h
+ */
+#ifndef _GENERIC_PARALL_H 
+#define _GENERIC_PARALL_H 
+
+
+extern double *globalResult; 
+
+
+/**********/
+/* CONFIG */
+/**********/
+
+/* #define MEASURE_TIME_PARALLEL */
+#define _PORTABLE_PTHREADS
+/* #define DEBUG_PARALLEL */ 
+/* #define DEBUG_MPI_EACH_SEND */
+/* #define _REPRODUCIBLE_MPI_OR_PTHREADS */
+#ifdef _USE_PTHREADS
+#ifndef _PORTABLE_PTHREADS
+void pinToCore(int tid);
+#endif
+#endif
+
+
+#define NOT ! 
+
+/* IS_PARALLEL is 1 when a parallel back end (pthreads or MPI) is selected and
+   0 otherwise. It is computed here instead of expanding to an expression that
+   contains 'defined': a 'defined' token produced by macro expansion inside
+   #if has undefined behavior in standard C. */
+#if defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI)
+#define IS_PARALLEL 1
+#else
+#define IS_PARALLEL 0
+#endif
+
+
+
+#ifdef MEASURE_TIME_PARALLEL
+#define NUM_PAR_JOBS 16
+extern double masterTimePerPhase; 
+#endif
+
+
+/******************/
+/* MPI SPECIFIC   */
+/******************/
+#ifdef _FINE_GRAIN_MPI
+#include <mpi.h>
+#ifdef DEBUG_MPI_EACH_SEND
+#define DEBUG_PRINT(text, elem) printf(text, elem)
+#else 
+#define DEBUG_PRINT(text, elem) NULL
+#endif
+
+/* for the broadcast of traversal descriptor */
+#define TRAVERSAL_LENGTH 5
+#define traversalSize sizeof(traversalInfo)
+#define messageSize(x)   (3 * sizeof(int) +  (x) * (sizeof(int)+ sizeof(double)) + TRAVERSAL_LENGTH * traversalSize)
+
+#define VOLATILE_PAR 
+#define MASTER_P (processID == 0)
+/* master appends 'elem' to the byte buffer, every other process reads it back from the buffer */
+#define POP_OR_PUT_BYTES(bufPtr, elem, type) (MASTER_P ? (bufPtr = addBytes((bufPtr), &(elem), sizeof(type))) : (bufPtr = popBytes((bufPtr), &(elem), sizeof(type))))
+
+/* In the MPI variants only the second argument (y) is used; x exists so that
+   the same call compiles to a plain assignment in the pthreads build.
+   All macro arguments are parenthesized so that expressions can be passed safely. */
+#define ASSIGN_INT(x,y) (MPI_Bcast(&(y),1,MPI_INT,0,MPI_COMM_WORLD),DEBUG_PRINT("\tSEND/RECV %d\n", (y))) 
+#define ASSIGN_BUF(x,y,type) (POP_OR_PUT_BYTES(bufPtr, y,type))
+#define ASSIGN_BUF_DBL(x,y) (POP_OR_PUT_BYTES(bufPtrDbl,y, double))
+#define ASSIGN_DBL(x,y) (MPI_Bcast(&(y),1,MPI_DOUBLE, 0, MPI_COMM_WORLD), DEBUG_PRINT("\tSEND/RECV %f\n", (y))) 
+#define ASSIGN_DBLS(tar,src,length) MPI_Bcast((tar), (length), MPI_DOUBLE, 0, MPI_COMM_WORLD)
+#define PLL_DOUBLE MPI_DOUBLE
+#define ASSIGN_GATHER(tar,src,length,type,tid) MPI_Gather((src),(length),type,(tar),(length),type,0, MPI_COMM_WORLD)
+/* SEND_BUF / RECV_BUF are the two halves of one broadcast: the master executes
+   it through SEND_BUF, everybody else through RECV_BUF. They are wrapped in
+   do { } while(0) so that they behave like a single statement, e.g. in front
+   of an 'else'. */
+#define SEND_BUF(buf, bufSize,type) do { if(MASTER_P) MPI_Bcast((buf), (bufSize), type, 0, MPI_COMM_WORLD); } while(0)
+#define RECV_BUF(buf, bufSize,type) do { if(NOT MASTER_P) MPI_Bcast((buf), (bufSize), type, 0, MPI_COMM_WORLD); } while(0)
+#define BCAST_BUF(buf, bufSize,type,who)  MPI_Bcast((buf), (bufSize), type, (who),MPI_COMM_WORLD )
+
+
+
+extern int processes; 
+extern int processID; 
+#endif 
+
+/*********************/
+/* PTHREAD SPECIFIC  */
+/*********************/
+#ifdef _USE_PTHREADS
+#if defined (_MSC_VER)
+#include "pthread.h"
+#else
+#include <pthread.h>
+#endif
+#define _REPRODUCIBLE_MPI_OR_PTHREADS
+#define VOLATILE_PAR volatile 
+/* NOTE: MASTER_P relies on a variable named 'tid' being in scope at the point of use */
+#define MASTER_P (tid == 0)
+/* With pthreads all threads share one address space, so "communication" is a
+   plain assignment or memmove. Arguments are parenthesized so that expressions
+   such as 'a + b' can be passed as length without changing the byte count. */
+#define ASSIGN_INT(x,y) ((x) = (y))
+#define ASSIGN_BUF(x,y,type) ((x) = (y))
+#define ASSIGN_BUF_DBL(x,y) ((x) = (y))
+#define ASSIGN_DBL(x,y) ((x) = (y))
+#define ASSIGN_DBLS(tar,src,length) memmove((tar), (src), (length) * sizeof(double))
+#define PLL_DOUBLE double 	/* just redefining that to make the source code less confusing */
+/* copy 'length' elements of 'type' from src into slot number 'tid' of tar */
+#define ASSIGN_GATHER(tar,src,length,type,tid) (memmove((tar) + (tid) * (length) ,(src), (length) * sizeof(type)))
+/* no explicit buffer transfer is needed between threads */
+#define SEND_BUF(buf, bufSize, type) 
+#define RECV_BUF(buf, bufSize, type) 
+#define BCAST_BUF(buf, bufSize,type,who)  
+#define TRAVERSAL_LENGTH 5
+#define messageSize(x) 0
+#endif
+
+
+#endif	/* end include guard  */
diff --git a/pllrepo/src/globalVariables.h b/pllrepo/src/globalVariables.h
new file mode 100644
index 0000000..1c76da8
--- /dev/null
+++ b/pllrepo/src/globalVariables.h
@@ -0,0 +1,170 @@
+/*  RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees 
+ *  Copyright August 2006 by Alexandros Stamatakis
+ *
+ *  Partially derived from
+ *  fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
+ *  
+ *  and 
+ *
+ *  Programs of the PHYLIP package by Joe Felsenstein.
+ *
+ *  This program is free software; you may redistribute it and/or modify its
+ *  under the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  for more details.
+ * 
+ *
+ *  For any other enquiries send an Email to Alexandros Stamatakis
+ *  Alexandros.Stamatakis at epfl.ch
+ *
+ *  When publishing work that is based on the results from RAxML-VI-HPC please cite:
+ *
+ *  Alexandros Stamatakis:"RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models". 
+ *  Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
+ */
+
+#ifdef GLOBAL_VARIABLES_DEFINITION
+
+
+/* Names of the protein substitution models (PLL_NUM_PROT_MODELS entries). */
+const char *protModels[PLL_NUM_PROT_MODELS] = {"DAYHOFF", "DCMUT", "JTT", "MTREV", "WAG", "RTREV", "CPREV", "VT", "BLOSUM62", "MTMAM", "LG", "MTART", "MTZOA", "PMB", 
+					   "HIVB", "HIVW", "JTTDCMUT", "FLU", "AUTO", "LG4M", "LG4X", "GTR"};
+
+/* Characters of the individual states for binary, DNA and protein data. */
+const char binaryStateNames[2]   = {'0', '1'};  
+
+const char dnaStateNames[4]      = {'A', 'C', 'G', 'T'};
+
+const char protStateNames[20]    = {'A','R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 
+				    'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 
+				    'Y', 'V'};
+
+/* Tables translating an internal code back to the character printed for it.
+   For DNA the 16 entries follow the IUPAC ambiguity codes when index i is read
+   as a bit mask with A=1, C=2, G=4, T=8 (e.g. index 3 is 'M' = A or C and
+   index 15 is the gap '-'). The protein table lists the 20 amino acids in the
+   order of protStateNames followed by 'B', 'Z' and '-', parallel to
+   bitVectorAA below. NOTE(review): how callers index these tables is not
+   visible here -- confirm before relying on it. */
+const char inverseMeaningBINARY[4] = {'_', '0', '1', '-'};
+const char inverseMeaningDNA[16]   = {'_', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', '-'};
+const char inverseMeaningPROT[23]  = {'A','R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 
+			       'T', 'W', 'Y', 'V', 'B', 'Z', '-'};
+/* 32 state characters plus the gap; the "64" table currently has the same 33 entries */
+const char inverseMeaningGeneric32[33] = {'0', '1', '2', '3', '4', '5', '6', '7', 
+				    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+				    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
+				    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+				    '-'};
+const char inverseMeaningGeneric64[33] = {'0', '1', '2', '3', '4', '5', '6', '7', 
+				    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
+				    'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
+				    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+				    '-'};
+
+/* Identity mapping: entry i has the value i. */
+const unsigned int bitVectorIdentity[256] = {0 ,1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 ,24 ,25 ,26 ,
+					     27 ,28 ,29 ,30 ,31 ,32 ,33 ,34 ,35 ,36 ,37 ,38 ,39 ,40 ,41 ,42 ,43 ,44 ,45 ,46 ,47 ,48 ,49 ,50 ,51 ,
+					     52 ,53 ,54 ,55 ,56 ,57 ,58 ,59 ,60 ,61 ,62 ,63 ,64 ,65 ,66 ,67 ,68 ,69 ,70 ,71 ,72 ,73 ,74 ,75 ,76 ,
+					     77 ,78 ,79 ,80 ,81 ,82 ,83 ,84 ,85 ,86 ,87 ,88 ,89 ,90 ,91 ,92 ,93 ,94 ,95 ,96 ,97 ,98 ,99 ,100 ,101 ,
+					     102 ,103 ,104 ,105 ,106 ,107 ,108 ,109 ,110 ,111 ,112 ,113 ,114 ,115 ,116 ,117 ,118 ,119 ,120 ,121 ,122 ,
+					     123 ,124 ,125 ,126 ,127 ,128 ,129 ,130 ,131 ,132 ,133 ,134 ,135 ,136 ,137 ,138 ,139 ,140 ,141 ,142 ,143 ,
+					     144 ,145 ,146 ,147 ,148 ,149 ,150 ,151 ,152 ,153 ,154 ,155 ,156 ,157 ,158 ,159 ,160 ,161 ,162 ,163 ,164 ,
+					     165 ,166 ,167 ,168 ,169 ,170 ,171 ,172 ,173 ,174 ,175 ,176 ,177 ,178 ,179 ,180 ,181 ,182 ,183 ,184 ,185 ,
+					     186 ,187 ,188 ,189 ,190 ,191 ,192 ,193 ,194 ,195 ,196 ,197 ,198 ,199 ,200 ,201 ,202 ,203 ,204 ,205 ,206 ,
+					     207 ,208 ,209 ,210 ,211 ,212 ,213 ,214 ,215 ,216 ,217 ,218 ,219 ,220 ,221 ,222 ,223 ,224 ,225 ,226 ,227 ,
+					     228 ,229 ,230 ,231 ,232 ,233 ,234 ,235 ,236 ,237 ,238 ,239 ,240 ,241 ,242 ,243 ,244 ,245 ,246 ,247 ,248 ,
+					     249 ,250 ,251 ,252 ,253 ,254 ,255};
+
+
+
+/* One bit per amino acid (entries 0..19), then the two ambiguity codes as the
+   OR of their members and finally the gap with all 20 bits set. */
+const unsigned int bitVectorAA[23] = {1, 2, 4, 8, 16, 32, 64, 128, 
+				      256, 512, 1024, 2048, 4096, 
+				      8192, 16384, 32768, 65536, 131072, 262144, 
+				      524288, 12 /* N | D */, 96 /*Q | E*/, 1048575 /* - */};
+
+/* Bit-vector lookup table used for secondary structure data (see the
+   SECONDARY_DATA row of pLengths). NOTE(review): the construction rule of the
+   values is not documented in this file. */
+const unsigned int bitVectorSecondary[256] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
+					      10, 11, 12, 13, 14, 15, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 
+					      208, 224, 240, 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 
+					      255, 0, 256, 512, 768, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 
+					      3584, 3840, 0, 257, 514, 771, 1028, 1285, 1542, 1799, 2056, 2313, 2570, 2827, 3084, 
+					      3341, 3598, 3855, 0, 272, 544, 816, 1088, 1360, 1632, 1904, 2176, 2448, 2720, 2992, 
+					      3264, 3536, 3808, 4080, 0, 273, 546, 819, 1092, 1365, 1638, 1911, 2184, 2457, 2730, 
+					      3003, 3276, 3549, 3822, 4095, 0, 4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768, 
+					      36864, 40960, 45056, 49152, 53248, 57344, 61440, 0, 4097, 8194, 12291, 16388, 20485, 24582, 
+					      28679, 32776, 36873, 40970, 45067, 49164, 53261, 57358, 61455, 0, 4112, 8224, 12336, 16448, 
+					      20560, 24672, 28784, 32896, 37008, 41120, 45232, 49344, 53456, 57568, 61680, 0, 4113, 8226, 
+					      12339, 16452, 20565, 24678, 28791, 32904, 37017, 41130, 45243, 49356, 53469, 57582, 61695, 
+					      0, 4352, 8704, 13056, 17408, 21760, 26112, 30464, 34816, 39168, 43520, 47872, 52224, 56576, 
+					      60928, 65280, 0, 4353, 8706, 13059, 17412, 21765, 26118, 30471, 34824, 39177, 43530, 47883, 
+					      52236, 56589, 60942, 65295, 0, 4368, 8736, 13104, 17472, 21840, 26208, 30576, 34944, 39312, 
+					      43680, 48048, 52416, 56784, 61152, 65520, 0, 4369, 8738, 13107, 17476, 21845, 26214, 30583, 
+					      34952, 39321, 43690, 48059, 52428, 56797, 61166, 65535};
+
+/* Entry i (0..31) has only bit i set; the last entry has all 32 bits set. */
+const unsigned int bitVector32[33] = {1,     2,    4,    8,   16,   32,    64,   128,
+                                      256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
+                                      65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608,
+                                      16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648u, 
+				      4294967295u};
+
+/*const unsigned int bitVector64[65] = {};*/
+/** @brief Array for setting bits 0 .. 31 in a bit vector, used in saveMemory technique for the gapVector */
+const unsigned int mask32[32] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 
+					262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 
+					268435456, 536870912, 1073741824, 2147483648U};
+
+/* Names of the 21 secondary structure substitution models. */
+const char *secondaryModelList[21] = { "S6A (GTR)", "S6B", "S6C", "S6D", "S6E", "S7A (GTR)", "S7B", "S7C", "S7D", "S7E", "S7F", "S16 (GTR)", "S16A", "S16B", "S16C", 
+				       "S16D", "S16E", "S16F", "S16I", "S16J", "S16K"};
+
+/* Per-data-type size and lookup information, one row per data type in the
+   order given by the row comments below. The meaning of the individual
+   columns is defined by the partitionLengths structure, which is declared
+   outside this file. */
+const partitionLengths pLengths[PLL_MAX_MODEL] = {
+  
+  /* BINARY */
+  {4,   4,   2,  4,  4, 1, 2,  8, 2, 2, PLL_FALSE, PLL_FALSE, 3, inverseMeaningBINARY, 2, PLL_FALSE, bitVectorIdentity},
+  
+  /* DNA */
+  {16,  16,  4, 16, 16, 6, 4, 64, 6, 4, PLL_FALSE, PLL_FALSE, 15, inverseMeaningDNA, 4, PLL_FALSE, bitVectorIdentity},
+        
+  /* AA */
+  {400, 400, 20, 400, 400, 190, 20, 460, 190, 20, PLL_FALSE, PLL_FALSE, 22, inverseMeaningPROT, 20, PLL_TRUE, bitVectorAA},
+  
+  /* SECONDARY_DATA */
+
+  {256, 256, 16, 256, 256, 120, 16, 4096, 120, 16, PLL_FALSE, PLL_FALSE, 255, (char*)NULL, 16, PLL_TRUE, bitVectorSecondary},
+
+  
+  /* SECONDARY_DATA_6 */
+  {36, 36,  6, 36, 36, 15, 6, 384, 15, 6, PLL_FALSE, PLL_FALSE, 63, (char*)NULL, 6, PLL_TRUE, bitVectorIdentity},
+
+  
+  /* SECONDARY_DATA_7 */
+  {49,   49,    7,   49, 49,  21, 7, 896, 21, 7, PLL_FALSE, PLL_FALSE, 127, (char*)NULL, 7, PLL_TRUE, bitVectorIdentity},
+
+  /* 32 states */
+  {1024, 1024, 32, 1024, 1024, 496, 32, 1056, 496, 32, PLL_FALSE, PLL_FALSE, 32, inverseMeaningGeneric32, 32, PLL_TRUE, bitVector32},
+  
+  /* 64 states */
+  /* NOTE(review): in every other row the 9th and 10th values repeat the 6th
+     and 7th (e.g. "496, 32" for 32 states); here they read "64, 2016" instead
+     of "2016, 64". This looks like a transposition -- confirm against the
+     field order of partitionLengths before changing it. This row also has no
+     inverse-meaning table and no bit vector (both NULL). */
+  {4096, 4096, 64, 4096, 4096, 2016, 64, 4160, 64, 2016, PLL_FALSE, PLL_FALSE, 64, (char*)NULL, 64, PLL_TRUE, (unsigned int*)NULL}
+};
+
+
+/* Global state shared by the parallel (pthreads or MPI) code paths. */
+#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+double *globalResult;           /* result buffer; declared extern in genericParallelization.h */
+pllBoolean treeIsInitialized;
+#ifdef MEASURE_TIME_PARALLEL
+double masterTimePerPhase; 
+#endif
+#endif
+
+/* pthreads only; NOTE(review): presumably used for job dispatch and barrier synchronization -- the users are not visible here */
+#ifdef _USE_PTHREADS
+volatile int             jobCycle = 0;
+volatile int             threadJob = 0;
+volatile char            *barrierBuffer;
+#endif
+
+/* MPI only: 'processID == 0' identifies the master process (see MASTER_P) */
+#ifdef _FINE_GRAIN_MPI
+int processes;
+int processID; 
+MPI_Datatype TRAVERSAL_MPI; 
+#endif
+
+#else
+/* Declarations for translation units that do not define
+   GLOBAL_VARIABLES_DEFINITION. Each declaration must have exactly the type of
+   the corresponding definition above. */
+extern const partitionLengths pLengths[PLL_MAX_MODEL];
+extern const char * protModels[PLL_NUM_PROT_MODELS];
+extern const char * secondaryModelList[21];   /* defined as 'const char *', so it is declared that way too */
+//extern const unsigned int * mask32;
+
+#endif
diff --git a/pllrepo/src/hardware.c b/pllrepo/src/hardware.c
new file mode 100644
index 0000000..3607568
--- /dev/null
+++ b/pllrepo/src/hardware.c
@@ -0,0 +1,165 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <string.h>
+#include "hardware.h"
+
+/* Evaluates to 1 if all bits of mask y are set in x, otherwise 0 */
+#define PLL_FEAT_AVAIL(x,y) (((x) & (y)) == (y))
+/* Directory that pll_probe_cpu scans for cpu<N> entries */
+#define PLL_SYS_CPU_DIR_PATH "/sys/devices/system/cpu/"
+
+//#ifdef _MSC_VER
+//#define inline __inline
+//#endif
+
+/* Execute the CPUID instruction.
+ *
+ * op     value loaded into EAX (the CPUID leaf)
+ * count  value loaded into ECX (the sub-leaf) before the instruction is
+ *        executed; used only by the non-WIN32 branch
+ * eax, ebx, ecx, edx
+ *        receive the contents of the four registers after the instruction
+ *
+ * NOTE(review): the WIN32 branch calls __cpuid with 'op' only and never passes
+ * 'count'; all callers in this file pass count == 0.
+ */
+static __inline void cpuid(unsigned int op, int count,
+                         unsigned int *eax, unsigned int *ebx,
+                         unsigned int *ecx, unsigned int *edx)
+{
+#ifdef WIN32
+	__int32 regs[4];
+	__cpuid((int*)regs, (int)op);
+	*eax = regs[0];
+	*ebx = regs[1];
+	*ecx = regs[2];
+	*edx = regs[3];
+#else
+	*eax = op;
+  *ecx = count;
+  /* outputs: a/b/c/d registers; inputs: operand 0 (EAX) and operand 2 (ECX) */
+  asm volatile("cpuid"
+        : "=a" (*eax),
+          "=b" (*ebx),
+          "=c" (*ecx),
+          "=d" (*edx)
+
+        : "0" (*eax), "2" (*ecx)
+        : "memory");
+#endif
+}
+
+
+/* Print the detected CPU features, the number of cores and the number of CPU
+   sockets stored in hw to stdout, one "name: value" pair per line. */
+void show_hardware_info(pllHardwareInfo * hw)
+{
+  /* instruction set extensions */
+  printf ("MMX.........: %d\n", hw->has_mmx);
+  printf ("SSE.........: %d\n", hw->has_sse);
+  printf ("SSE2........: %d\n", hw->has_sse2);
+  printf ("SSE3........: %d\n", hw->has_sse3);
+  printf ("SSSE3.......: %d\n", hw->has_ssse3);
+  printf ("FMA.........: %d\n", hw->has_fma);
+  printf ("SSE4.1......: %d\n", hw->has_sse41);
+  printf ("SSE4.2......: %d\n", hw->has_sse42);
+  printf ("AVX.........: %d\n", hw->has_avx);
+  printf ("AVX2........: %d\n", hw->has_avx2);
+  printf ("SSE4A.......: %d\n", hw->has_sse4a);
+  printf ("FMA4........: %d\n\n", hw->has_fma4);
+
+  /* topology */
+  printf ("Core(s).....: %d\n", hw->cores);
+  printf ("CPU Sockets.: %d\n", hw->cpu_sockets);
+}
+
+/** @brief Count processors and CPU sockets by scanning the sys cpu directory
+
+    Looks for the entries cpu0, cpu1, ... below \b PLL_SYS_CPU_DIR_PATH and
+    stops at the first index that does not exist. For every processor found,
+    its topology/physical_package_id file is read if it can be opened; the
+    number of sockets is the highest package id seen plus one (0 if no id
+    could be read).
+
+    @param hw  Structure whose \a cores and \a cpu_sockets fields are filled in
+    @return    1 on success, 0 if the sys cpu directory does not exist or is
+               not a directory (in that case \a hw is left untouched)
+*/
+static int pll_probe_cpu (pllHardwareInfo * hw)
+{
+  struct stat cpustat;
+  char cpupath[100];
+  int i, id, len, max_physical_id = -1;
+  const char * physical_id_path = "/topology/physical_package_id";
+  FILE * fd;
+
+  /* check whether the sys cpu dir exists */
+  if (stat(PLL_SYS_CPU_DIR_PATH, &cpustat)) return (0);
+  
+  /* and also check whether it is a dir */
+  if (!S_ISDIR(cpustat.st_mode)) return (0);
+
+  /* detect number of processors */
+  for (i = 0; ; ++i)
+   {
+     /* bounded formatting: never write past the end of cpupath */
+     len = snprintf (cpupath, sizeof (cpupath), "%scpu%d", PLL_SYS_CPU_DIR_PATH, i);
+     if (len < 0 || (size_t)len >= sizeof (cpupath)) break;
+     if (stat(cpupath, &cpustat)) break;
+
+     len = snprintf (cpupath, sizeof (cpupath), "%scpu%d%s", PLL_SYS_CPU_DIR_PATH, i, physical_id_path);
+     if (len < 0 || (size_t)len >= sizeof (cpupath)) continue;
+
+     /* the id file may be missing or unreadable; the processor is still counted */
+     fd = fopen (cpupath, "r");
+     if (!fd) continue;
+
+     /* use the id only if a number was actually parsed */
+     if (fscanf (fd, "%d", &id) == 1)
+      {
+        /* printf ("Detected processor %d belonging to package %d\n", i, id); */
+        if (id > max_physical_id) max_physical_id = id;
+      }
+     fclose (fd);
+   }
+  
+  hw->cores       = i;
+  hw->cpu_sockets = max_physical_id + 1;
+
+  return (1);
+}
+
+/** @brief Query the CPU vendor string and instruction set extensions via CPUID
+
+    Fills \a hw->vendor and the \a has_* flags of \a hw. The vendor string is
+    also printed to stdout. Leaf 7 and extended leaf 0x80000001 are only
+    queried if the processor reports that they exist; otherwise the flags
+    taken from them (AVX2, SSE4A, FMA4) are set to 0.
+
+    @param hw  Structure to be filled in
+*/
+static void pll_probe_hardware (pllHardwareInfo * hw)
+{
+  unsigned int a, b, c, d;
+  unsigned int max_leaf;
+
+  /* leaf 0: EAX = highest supported basic leaf, EBX/EDX/ECX = vendor string */
+  cpuid(0,0,&a,&b,&c,&d);
+  max_leaf = a;
+
+  /* copy the register bytes with memcpy instead of storing through a casted
+     pointer, which would break aliasing and alignment rules */
+  memcpy (hw->vendor,     &b, sizeof (b));
+  memcpy (hw->vendor + 4, &d, sizeof (d));
+  memcpy (hw->vendor + 8, &c, sizeof (c));
+  hw->vendor[12] = 0;
+
+  printf ("%s\n", hw->vendor);
+
+  cpuid(1,0,&a,&b,&c,&d);
+
+  hw->has_mmx   = PLL_FEAT_AVAIL(d,PLL_HAS_MMX); 
+  hw->has_sse   = PLL_FEAT_AVAIL(d,PLL_HAS_SSE);
+  hw->has_sse2  = PLL_FEAT_AVAIL(d,PLL_HAS_SSE2);
+
+  hw->has_sse3  = PLL_FEAT_AVAIL(c,PLL_HAS_SSE3);
+  hw->has_ssse3 = PLL_FEAT_AVAIL(c,PLL_HAS_SSSE3);
+  hw->has_fma   = PLL_FEAT_AVAIL(c,PLL_HAS_FMA);
+  hw->has_sse41 = PLL_FEAT_AVAIL(c,PLL_HAS_SSE41);
+  hw->has_sse42 = PLL_FEAT_AVAIL(c,PLL_HAS_SSE42);
+  hw->has_avx   = PLL_FEAT_AVAIL(c,PLL_HAS_AVX);
+
+  /* leaf 7 is valid only if the highest basic leaf is at least 7 */
+  hw->has_avx2  = 0;
+  if (max_leaf >= 7)
+   {
+     cpuid(7,0,&a,&b,&c,&d);
+     hw->has_avx2  = PLL_FEAT_AVAIL(b,PLL_HAS_AVX2);
+   }
+
+  /* leaf 0x80000000 returns the highest supported extended leaf in EAX */
+  hw->has_sse4a = 0;
+  hw->has_fma4  = 0;
+  cpuid(0x80000000,0,&a,&b,&c,&d);
+  if (a >= 0x80000001)
+   {
+     cpuid(0x80000001,0,&a,&b,&c,&d);
+
+     hw->has_sse4a = PLL_FEAT_AVAIL(c,PLL_HAS_SSE4A);
+     hw->has_fma4  = PLL_FEAT_AVAIL(c,PLL_HAS_FMA4);
+   }
+}
+
+/** @brief Collect CPU feature and topology information
+
+    Fills \a hw with the results of \a pll_probe_hardware (vendor string and
+    instruction set flags) and \a pll_probe_cpu (number of cores and sockets).
+    If the topology cannot be determined, \a cores and \a cpu_sockets are 0
+    rather than indeterminate.
+
+    @param hw  Structure to be filled in
+    @return    Currently always 1
+*/
+int pllGetHardwareInfo (pllHardwareInfo * hw)
+{
+  /* pll_probe_cpu leaves hw untouched when it fails, so give the topology
+     fields a defined "unknown" value first */
+  hw->cores       = 0;
+  hw->cpu_sockets = 0;
+
+  pll_probe_hardware (hw);
+  pll_probe_cpu (hw);
+
+  /* TODO: finish failure checks in probe_hardware and probe_cpu */
+  return (1);
+
+}
+
+/* TODO: Remove after testing */
+/* 
+int main (int argc, char * argv[])
+{ 
+  pllHardwareInfo hw;
+
+  pll_probe_hardware(&hw);
+  pll_probe_cpu(&hw);
+
+  show_hardware_info(&hw);
+  return (EXIT_SUCCESS);
+}
+*/
diff --git a/pllrepo/src/hardware.h b/pllrepo/src/hardware.h
new file mode 100644
index 0000000..d1bfa33
--- /dev/null
+++ b/pllrepo/src/hardware.h
@@ -0,0 +1,48 @@
+#ifndef PLL_HARDWARE
+#define PLL_HARDWARE
+
+/* Bit masks for the CPUID feature flags, grouped by the leaf and the register
+   in which they are reported. Every mask is parenthesized so that it can be
+   used inside larger expressions (e.g. with +, ==, ~ or as a macro argument)
+   without operator precedence changing its value. */
+
+/* leaf 1 */
+/* edx */
+#define PLL_HAS_MMX             (1 << 23)
+#define PLL_HAS_SSE             (1 << 25)
+#define PLL_HAS_SSE2            (1 << 26)
+
+/* ecx */
+#define PLL_HAS_SSE3            (1)
+#define PLL_HAS_SSSE3           (1 <<  9)
+#define PLL_HAS_FMA             (1 << 12)
+#define PLL_HAS_SSE41           (1 << 19)
+#define PLL_HAS_SSE42           (1 << 20)
+#define PLL_HAS_AVX             (1 << 28)
+
+
+/* leaf 7 */
+/* ebx */
+#define PLL_HAS_AVX2            (1 <<  5)
+
+/* leaf 0x80000001 */
+/* ecx*/
+#define PLL_HAS_SSE4A           (1 <<  6)
+#define PLL_HAS_FMA4            (1 << 16)
+
+/* Result of the hardware probe: one 0/1 flag per instruction set extension,
+   the number of CPU sockets and cores, and the NUL-terminated 12 character
+   CPUID vendor string. */
+typedef struct
+{
+  int has_mmx;
+  int has_sse;
+  int has_sse2;
+  int has_sse3;
+  int has_ssse3;
+  int has_sse41;
+  int has_sse42;
+  int has_sse4a;
+  int has_avx;
+  int has_avx2;
+  int has_fma;
+  int has_fma4;
+  int cpu_sockets;
+  int cores;
+  char vendor[13];
+
+} pllHardwareInfo;
+
+#endif
diff --git a/pllrepo/src/hash.c b/pllrepo/src/hash.c
new file mode 100644
index 0000000..4a68225
--- /dev/null
+++ b/pllrepo/src/hash.c
@@ -0,0 +1,219 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file hash.c
+ */
+#include <stdio.h>
+#include <string.h>
+#include "pll.h"
+#include "mem_alloc.h"
+
+/* Candidate hash table sizes in ascending order, each roughly twice the
+   previous one. pllHashInit picks its table size from this list; its
+   documentation describes the entries as prime numbers. */
+static const unsigned int initTable[] = 
+  {
+    53,         97,         193,       389,       769,    
+    1543,       3079,       6151,      12289,     24593, 
+    49157,      98317,      196613,    393241,    786433, 
+    1572869,    3145739,    6291469,   12582917,  25165843, 
+    50331653,   100663319,  201326611, 402653189, 805306457, 
+    1610612741, 3221225473, 4294967291
+  };
+       
+/** @brief Compute the hash table index of a string
+
+    Folds the characters of \a s into an unsigned accumulator (multiply by 31,
+    add the character, with the usual unsigned wrap-around) and reduces the
+    result modulo \a size.
+
+    @param s     NUL-terminated string to be hashed
+    @param size  Size of the hash table; must not be 0
+    @return      Index of \a s in a hash table with \a size slots
+*/
+unsigned int pllHashString (const char * s, unsigned int size)
+{
+  unsigned int acc = 0;
+  const char * p = s;
+
+  while (*p != '\0')
+   {
+     /* acc * 31 is the same value as (acc << 5) - acc in unsigned arithmetic */
+     acc = acc * 31u + (unsigned int) *p;
+     ++p;
+   }
+
+  return (acc % size);
+}
+
+/** @brief Add a string and its data to a hashtable
+    
+    Add an \a item and possibly a string \a s to hashtable \a hTable at position
+    \a hash, where \a hash must be a value between 0 and \a hTable->size - 1. If
+    string \a s is given and another record with the same computed hash and the
+    same associated string exists in the hash table, then the new record will \b not be added and the
+    value \b PLL_FALSE is returned. Otherwise, the new item is added at the
+    beginning of the corresponding linked list and the value \b PLL_TRUE is
+    returned. The string \a s is copied; the table does not keep a reference to
+    the caller's buffer.
+
+    If memory for the new record cannot be allocated, the hash table is left
+    unchanged and \b PLL_FALSE is returned.
+
+    @param hTable Hashtable
+    @param hash   Position where to store in hash table
+    @param s      String
+    @param item   Data associated with \a s
+    @return       Returns \b PLL_TRUE if added with success, otherwise \b PLL_FALSE
+*/
+int pllHashAdd  (pllHashTable * hTable, unsigned int hash, const char * s, void * item)
+{
+  pllHashItem * hItem;
+
+  /* If a string was given, check whether the record already exists */
+  if (s)
+   {
+     for (hItem = hTable->Items[hash]; hItem; hItem = hItem->next)
+      {
+        if (hItem->str && !strcmp (s, hItem->str)) return (PLL_FALSE);
+      }
+   }
+
+  hItem = (pllHashItem *) rax_malloc (sizeof (pllHashItem));
+  if (!hItem) return (PLL_FALSE);
+
+  /* store the string together with the element if given */
+  if (s)
+   {
+     hItem->str = (char *) rax_malloc ((strlen(s) + 1) * sizeof (char));
+     if (!hItem->str)
+      {
+        /* do not leak the half-built record */
+        rax_free (hItem);
+        return (PLL_FALSE);
+      }
+     strcpy (hItem->str, s);
+   }
+  else
+   hItem->str = NULL;
+
+  hItem->data = item;
+
+  /* link the new record in front of the bucket's chain */
+  hItem->next = hTable->Items[hash];
+  hTable->Items[hash] = hItem;
+  hTable->entries += 1;
+
+  return (PLL_TRUE);
+}
+
+       
+/** @brief Initialize hash table
+    
+    Create a hash table of size at least \a n. The size of the hash table will
+    be the smallest entry of the built-in table of sizes (\a initTable) that is
+    higher or equal to \a n. If \a n is larger than every entry of that table,
+    the largest entry is used.
+
+    @param n  Minimum size of hash table
+    @return   In case of success, returns a pointer to the created hash table, otherwise returns \b NULL
+*/
+pllHashTable * pllHashInit (unsigned int n)
+{ 
+  pllHashTable * hTable;
+  unsigned int i;
+  unsigned int primeTableLength;
+       
+  hTable = (pllHashTable *) rax_malloc (sizeof (pllHashTable));
+  if (!hTable) return (NULL);
+  
+  primeTableLength = sizeof (initTable) / sizeof(initTable[0]);
+
+  i = 0;
+ 
+  /* test the index before reading the table, and stop at the last entry so
+     that initTable[i] is always a valid element */
+  while (i < primeTableLength - 1 && initTable[i] < n) ++ i;
+ 
+  n = initTable[i];  
+ 
+  hTable->Items = (pllHashItem **) rax_calloc (n, sizeof (pllHashItem *));
+  if (!hTable->Items)
+   {
+     rax_free (hTable);
+     return (NULL);
+   }
+  hTable->size    = n;
+  hTable->entries = 0;
+ 
+  return (hTable);
+}
+
+/** @brief Look up the data stored under a string
+
+    Hashes \a s with \a pllHashString and walks the chain of the resulting
+    bucket of \a hTable. When a record with an identical string is found, its
+    data pointer is written to \a item and \b PLL_TRUE is returned. If no such
+    record exists, or if \a s is \b NULL, \b PLL_FALSE is returned and \a item
+    is not modified.
+
+    @param hTable   Hash table to be searched
+    @param s        String to look for
+    @param item     Where to store the retrieved data
+    @return         Returns \b PLL_TRUE if the string was found, otherwise \b PLL_FALSE
+*/
+int pllHashSearch (pllHashTable * hTable, char * s, void ** item)
+{
+  pllHashItem * cur;
+
+  if (s == NULL) return (PLL_FALSE);
+
+  cur = hTable->Items[pllHashString (s, hTable->size)];
+
+  while (cur != NULL)
+   {
+     /* records stored without a string can never match */
+     if (cur->str != NULL && strcmp (s, cur->str) == 0)
+      {
+        *item = cur->data;
+        return (PLL_TRUE);
+      }
+     cur = cur->next;
+   }
+
+  return (PLL_FALSE);
+}
+
+/** @brief Deallocate a hash table
+
+    Deallocates the hash table. A callback function may be specified as \a
+    cbDealloc which will be executed upon all \a data elements of the hash
+    table, for deallocating custom data. If no deallocation is required for the
+    custom data, then \a cbDealloc must be set to \b NULL. The strings
+    associated with each hash element are deallocated.
+
+    @param hTable    Hash table to be deallocated; \a *hTable is set to \b NULL afterwards
+    @param cbDealloc Callback function to perform deallocation of each data element of the hash table
+    @note
+      The data associated with the indexed strings are deallocated only
+      through \a cbDealloc; if it is \b NULL they are left untouched.
+      Calling the function with \a hTable or \a *hTable equal to \b NULL does nothing.
+*/
+void pllHashDestroy (pllHashTable ** hTable, void (*cbDealloc)(void *))
+{
+  unsigned int i;
+  pllHashItem * hItem;
+  pllHashItem * tmp;
+
+  /* nothing to do for a missing or already destroyed table */
+  if (!hTable || !*hTable) return;
+
+  for (i = 0; i < (*hTable)->size; ++ i)
+  {
+    hItem = (*hTable)->Items[i];
+    while (hItem)
+     {
+       /* remember the successor before the record is freed */
+       tmp   = hItem;
+       hItem = hItem->next;
+       if (tmp->str)  rax_free (tmp->str);
+       if (cbDealloc) cbDealloc (tmp->data);
+       rax_free (tmp);
+     }
+  }
+  rax_free ((*hTable)->Items);
+  rax_free (*hTable);
+  *hTable = NULL;
+}
diff --git a/pllrepo/src/hash.h b/pllrepo/src/hash.h
new file mode 100644
index 0000000..a550f38
--- /dev/null
+++ b/pllrepo/src/hash.h
@@ -0,0 +1,50 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file hash.h
+ */
+#ifndef __pll_HASH__
+#define __pll_HASH__
+
+/* One entry of a bucket chain of the hash table */
+struct pllHashItem
+{
+  void * data;                 /* payload handed back by pllHashSearch() through its item argument */
+  char * str;                  /* key string the entry is compared against; freed by pllHashDestroy() */
+  struct pllHashItem * next;   /* next entry of the same bucket, NULL at the end of the chain */
+};
+
+/* Hash table with separate chaining */
+struct pllHashTable
+{
+  unsigned int size;             /* number of buckets in Items; also the range passed to pllHashString() */
+  struct pllHashItem ** Items;   /* array of bucket heads, each the start of a chain of pllHashItem */
+};
+
+/* Interface of the string-keyed hash table */
+unsigned int pllHashString (const char * s, unsigned int size);
+int pllHashAdd  (struct pllHashTable * hTable, const char * s, void * item);
+struct pllHashTable * pllHashInit (unsigned int n);
+int pllHashSearch (struct pllHashTable * hTable, char * s, void ** item);
+/* cbDealloc is executed on the data element of every item; pass NULL when the
+   data must not be deallocated. The second parameter is a callback, matching
+   the definition of pllHashDestroy (it was previously declared as int). */
+void pllHashDestroy (struct pllHashTable ** hTable, void (*cbDealloc)(void *));
+#endif
diff --git a/pllrepo/src/lexer.c b/pllrepo/src/lexer.c
new file mode 100644
index 0000000..1cbf614
--- /dev/null
+++ b/pllrepo/src/lexer.c
@@ -0,0 +1,299 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file lexer.c
+ */
+#include <stdio.h>
+#include "lexer.h"
+
+/* Lexer state shared by all functions of this file; (re)set by init_lexan() */
+static const char * rawtext;   /* input buffer being tokenized */
+static long rawtext_size;      /* number of bytes of rawtext that may be read */
+static long pos = 0;           /* index of the next byte get_next_byte() will return */
+
+/* Symbol class of every 7-bit byte value; get_next_symbol() indexes the table
+   with the byte it has just read. Each row below covers four consecutive
+   codes, the leading comment showing the printable characters of that row.
+
+   NOTE: the CR/LF names are swapped with respect to ASCII: code 10 ('\n',
+   line feed) is classified as PLL_SYM_CR and code 13 ('\r', carriage return)
+   as PLL_SYM_LF. get_next_symbol() relies on exactly this mapping when it
+   folds the byte pair 13,10 into PLL_SYM_LFCR.
+
+   The table is deliberately writable: lex_table_amend_phylip(),
+   lex_table_amend_fasta() and lex_table_restore() patch the entries of
+   '-', '.' and '>' at run time. */
+int lex_table[PLL_ASCII_SIZE] = {
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN,     PLL_SYM_TAB,      PLL_SYM_CR,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN,      PLL_SYM_LF, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*      */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/*  !"# */   PLL_SYM_SPACE, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/* $%&' */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN,
+/* ()*+ */  PLL_SYM_OPAREN,  PLL_SYM_CPAREN, PLL_SYM_UNKNOWN,      PLL_SYM_PLUS,
+/* ,-./ */   PLL_SYM_COMMA,    PLL_SYM_DASH,     PLL_SYM_DOT,     PLL_SYM_SLASH,
+/* 0123 */   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,     PLL_SYM_DIGIT,
+/* 4567 */   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,     PLL_SYM_DIGIT,
+/* 89:; */   PLL_SYM_DIGIT,   PLL_SYM_DIGIT,   PLL_SYM_COLON, PLL_SYM_SEMICOLON,
+/* <=>? */ PLL_SYM_UNKNOWN,   PLL_SYM_EQUAL, PLL_SYM_UNKNOWN,      PLL_SYM_CHAR,
+/* @ABC */ PLL_SYM_UNKNOWN,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* DEFG */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* HIJK */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* LMNO */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* PQRS */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* TUVW */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* XYZ[ */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,   PLL_SYM_UNKNOWN,
+/* \]^_ */ PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,      PLL_SYM_CHAR,
+/* `abc */ PLL_SYM_UNKNOWN,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* defg */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* hijk */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* lmno */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* pqrs */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* tuvw */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,      PLL_SYM_CHAR,
+/* xyz{ */    PLL_SYM_CHAR,    PLL_SYM_CHAR,    PLL_SYM_CHAR,   PLL_SYM_UNKNOWN,
+/* |}~  */    PLL_SYM_CHAR, PLL_SYM_UNKNOWN, PLL_SYM_UNKNOWN,   PLL_SYM_UNKNOWN
+ };
+
+/** @brief Read the next byte of the input buffer
+
+    Returns the byte at the current read position and advances the position
+    by one. When the buffer is exhausted \b PLL_EOS is returned instead; the
+    position is advanced in that case as well, so that a caller which only
+    peeked can step back by one regardless of what was returned.
+
+    @return
+      The next byte as a non-negative value (converted through
+      \a unsigned \a char), or \b PLL_EOS at the end of the input
+*/
+int 
+get_next_byte (void)
+{
+  /* >= instead of ==: after the first PLL_EOS pos is already beyond
+     rawtext_size, and a further call must not index past the buffer */
+  if (pos >= rawtext_size) 
+   {
+     ++pos;
+     return (PLL_EOS);
+   }
+
+  /* plain char may be signed; bytes >= 0x80 must not turn into negative
+     values, which callers would use as (out of range) table indices */
+  return ((unsigned char) rawtext[pos++]);
+}
+
+/** @brief Read the next byte and classify it
+
+    Maps the next input byte to its symbol class through \a lex_table. The
+    two-byte sequence 13,10 is folded into the single symbol \b PLL_SYM_LFCR
+    (note that \a lex_table classifies byte 13 as \b PLL_SYM_LF).
+
+    @return
+      \b PLL_SYM_EOF at the end of the input, \b PLL_SYM_UNKNOWN for any byte
+      outside the range of \a lex_table, otherwise the class stored in the table
+*/
+int
+get_next_symbol (void)
+{
+  int ch, sym;
+
+  ch = get_next_byte ();
+
+  if (ch == PLL_EOS) return (PLL_SYM_EOF);
+  /* reject both ends of the range: a byte >= 0x80 read through a signed
+     char arrives here as a negative value and must not index the table */
+  if (ch < 0 || ch >= PLL_ASCII_SIZE) return (PLL_SYM_UNKNOWN);
+
+  sym = lex_table[ch];
+
+  if (sym == PLL_SYM_LF)
+   {
+     /* peek at the following byte to recognize the two-byte line ending */
+     if (get_next_byte() == '\n')
+      {
+        sym = PLL_SYM_LFCR;
+      }
+     else
+      {
+        /* not part of the line ending: put the peeked byte back */
+        --pos;
+      }
+   }
+
+  return sym;
+}
+
+/** @brief Assemble the next token from the symbol stream
+
+    On entry \a *input must hold the symbol most recently returned by
+    get_next_symbol(), i.e. the first symbol of the token to be built. The
+    function consumes the remaining symbols of the token and leaves the first
+    symbol that follows it in \a *input (one symbol of look-ahead).
+
+    @param input
+      In: first symbol of the token. Out: the look-ahead symbol. For
+      \b PLL_SYM_EOF and unknown symbols nothing is consumed and \a *input is
+      left unchanged.
+
+    @return
+      The token. \a lexeme points into the buffer given to init_lexan().
+      \a len is the lexeme length in bytes for whitespace, number, float and
+      string tokens and 0 for every other token type.
+*/
+pllLexToken
+get_token (int * input)
+{
+  pllLexToken token;
+  long
+    start_pos;                  /* same type as pos: no narrowing on large inputs */
+  int
+    isFloating = 0;
+
+  /* the first symbol of the token has already been read, hence the -1 */
+  token.lexeme = rawtext + pos - 1;
+  /* defined value for the token kinds that do not compute a length */
+  token.len    = 0;
+  start_pos    = pos;
+
+  switch (*input)
+   {
+     case PLL_SYM_SLASH:
+       token.tokenType = PLL_TOKEN_SLASH;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_DASH:
+       token.tokenType = PLL_TOKEN_DASH;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_EQUAL:
+       token.tokenType = PLL_TOKEN_EQUAL;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_SEMICOLON:
+       token.tokenType = PLL_TOKEN_SEMICOLON;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_COMMA:
+       token.tokenType = PLL_TOKEN_COMMA;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_COLON:
+       token.tokenType = PLL_TOKEN_COLON;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_OPAREN:
+       token.tokenType = PLL_TOKEN_OPAREN;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_CPAREN:
+       token.tokenType = PLL_TOKEN_CPAREN;
+       *input = get_next_symbol();
+       break;
+
+     case PLL_SYM_SPACE:
+     case PLL_SYM_TAB:
+       /* a run of blanks and tabs forms one whitespace token */
+       do
+        {
+          *input = get_next_symbol();
+        } while (*input == PLL_SYM_SPACE || *input == PLL_SYM_TAB);
+       token.len   = (int)(pos - start_pos);
+       token.tokenType = PLL_TOKEN_WHITESPACE; 
+       /* the two-byte look-ahead advanced pos by one byte too many */
+       if (*input == PLL_SYM_LFCR) --token.len;
+       break;
+       
+     case PLL_SYM_DIGIT:
+       /* integer part */
+       do
+        {
+          *input = get_next_symbol();   
+        } while (*input == PLL_SYM_DIGIT);
+
+       /* optional fractional part */
+       if (*input == PLL_SYM_DOT)
+        {
+          isFloating = 1;
+          do
+           {
+             *input = get_next_symbol ();
+           } while (*input == PLL_SYM_DIGIT);
+        }
+
+       if (*input != PLL_SYM_CHAR)
+        {
+          token.len   = (int)(pos - start_pos);
+          if (!isFloating)
+            token.tokenType = PLL_TOKEN_NUMBER;
+          else
+            token.tokenType = PLL_TOKEN_FLOAT;
+        }
+       else
+        {
+          /* check for E notation */
+          if (rawtext[pos - 1] == 'E' || rawtext[pos - 1] == 'e')
+           {
+             *input = get_next_symbol ();
+
+             if (*input == PLL_SYM_PLUS || *input == PLL_SYM_DASH || *input == PLL_SYM_DIGIT)
+              {
+                do
+                 {
+                   *input = get_next_symbol ();
+                 } while (*input == PLL_SYM_DIGIT);
+
+                if (*input != PLL_SYM_CHAR)
+                 {
+                   token.len = (int)(pos - start_pos);
+                   token.tokenType = PLL_TOKEN_FLOAT;
+                 }
+              }
+             else
+              {
+                token.len = (int)(pos - start_pos);
+                token.tokenType = PLL_TOKEN_STRING;
+              }
+           }
+
+          /* digits followed by letters: the whole run is a string */
+          if (*input == PLL_SYM_CHAR)
+           {
+             do {
+               *input = get_next_symbol();
+             } while (*input == PLL_SYM_CHAR || *input == PLL_SYM_DIGIT || *input == PLL_SYM_DOT);
+             token.len   = (int)(pos - start_pos);
+             token.tokenType = PLL_TOKEN_STRING;
+           }
+        }
+
+       if (*input == PLL_SYM_LFCR) --token.len;
+       break;
+
+     case PLL_SYM_CHAR:
+       do
+        {
+          *input = get_next_symbol();
+        } 
+       while (*input == PLL_SYM_CHAR  || 
+              *input == PLL_SYM_DIGIT || 
+              *input == PLL_SYM_DASH  ||
+              *input == PLL_SYM_DOT);
+       token.len   = (int)(pos - start_pos);
+       token.tokenType = PLL_TOKEN_STRING;
+       if (*input == PLL_SYM_LFCR) --token.len;
+       break;
+       
+     case PLL_SYM_EOF:
+       /* nothing is consumed: *input stays PLL_SYM_EOF */
+       token.tokenType = PLL_TOKEN_EOF;
+       break;
+
+     case PLL_SYM_CR:
+     case PLL_SYM_LF:
+     case PLL_SYM_LFCR:
+       /* consecutive line endings are collapsed into one newline token */
+       do
+        {
+          *input = get_next_symbol();
+        } while (*input == PLL_SYM_CR || *input == PLL_SYM_LFCR || *input == PLL_SYM_LF);
+       token.tokenType = PLL_TOKEN_NEWLINE;
+       break;
+     case PLL_SYM_UNKNOWN:
+     default:
+       /* nothing is consumed: the caller decides how to proceed */
+       token.tokenType = PLL_TOKEN_UNKNOWN;
+       break;
+   }
+
+  return (token);
+}
+
+/* Classify '-' and '.' as ordinary characters, so that get_token() keeps
+   them inside string tokens (presumably for PHYLIP input, going by the name) */
+void
+lex_table_amend_phylip (void)
+{
+  lex_table['.'] = PLL_SYM_CHAR;
+  lex_table['-'] = PLL_SYM_CHAR;
+}
+
+/* Classify '-', '.' and '>' as ordinary characters, so that get_token()
+   keeps them inside string tokens (presumably for FASTA input, going by the
+   name) */
+void
+lex_table_amend_fasta (void)
+{
+  lex_table['>'] = PLL_SYM_CHAR;
+  lex_table['.'] = PLL_SYM_CHAR;
+  lex_table['-'] = PLL_SYM_CHAR;
+}
+
+/* Undo lex_table_amend_phylip() / lex_table_amend_fasta(): give '-', '.'
+   and '>' back the classes they have in the initializer of lex_table */
+void
+lex_table_restore (void)
+{
+  lex_table['>'] = PLL_SYM_UNKNOWN;
+  lex_table['.'] = PLL_SYM_DOT;
+  lex_table['-'] = PLL_SYM_DASH;
+}
+
+/* Attach the lexer to a new input buffer of n bytes and rewind the read
+   position. The buffer is not copied: text must stay valid while tokens are
+   being read, since token lexemes point into it. */
+void
+init_lexan (const char * text, long n)
+{
+  pos          = 0;
+  rawtext_size = n;
+  rawtext      = text;
+}
diff --git a/pllrepo/src/lexer.h b/pllrepo/src/lexer.h
new file mode 100644
index 0000000..6924259
--- /dev/null
+++ b/pllrepo/src/lexer.h
@@ -0,0 +1,88 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file lexer.h
+ */
+#ifndef __pll_LEXER__
+#define __pll_LEXER__
+
+/* number of entries of the symbol lookup table (7-bit byte values) */
+#define  PLL_ASCII_SIZE                128
+/* end-of-input marker returned by get_next_byte(); lies outside the 0..255 byte range */
+#define  PLL_EOS                       0x00000200
+
+/* Symbol classes produced by get_next_symbol(). Every class is a distinct
+   bit. The replacement lists are parenthesized so that the constants expand
+   correctly next to any operator (e.g. ~PLL_SYM_CR or PLL_SYM_CR + 1). */
+#define  PLL_SYM_CR                    (1 << 0)
+#define  PLL_SYM_LF                    (1 << 1)
+#define  PLL_SYM_LFCR                  (1 << 2)
+#define  PLL_SYM_DIGIT                 (1 << 3)
+#define  PLL_SYM_CHAR                  (1 << 4)
+#define  PLL_SYM_SPACE                 (1 << 5)
+#define  PLL_SYM_TAB                   (1 << 6)
+#define  PLL_SYM_EOF                   (1 << 7)
+#define  PLL_SYM_UNKNOWN               (1 << 8)
+#define  PLL_SYM_DOT                   (1 << 9)
+#define  PLL_SYM_COLON                 (1 << 10)
+#define  PLL_SYM_OPAREN                (1 << 11)
+#define  PLL_SYM_CPAREN                (1 << 12)
+#define  PLL_SYM_COMMA                 (1 << 13)
+#define  PLL_SYM_SEMICOLON             (1 << 14)
+#define  PLL_SYM_EQUAL                 (1 << 15)
+#define  PLL_SYM_DASH                  (1 << 16)
+#define  PLL_SYM_SLASH                 (1 << 17)
+#define  PLL_SYM_PLUS                  (1 << 18)
+
+/* Token types stored in pllLexToken.tokenType. Every type is a distinct bit,
+   so several types can be OR-ed into one mask (see CONSUME). The replacement
+   lists are parenthesized so that the constants expand correctly next to any
+   operator (e.g. ~PLL_TOKEN_EOF). */
+#define  PLL_TOKEN_NUMBER              (1 << 0)
+#define  PLL_TOKEN_STRING              (1 << 1)
+#define  PLL_TOKEN_EOF                 (1 << 2)
+#define  PLL_TOKEN_WHITESPACE          (1 << 3)
+#define  PLL_TOKEN_NEWLINE             (1 << 4)
+#define  PLL_TOKEN_UNKNOWN             (1 << 5)
+#define  PLL_TOKEN_COLON               (1 << 6)
+#define  PLL_TOKEN_OPAREN              (1 << 7)
+#define  PLL_TOKEN_CPAREN              (1 << 8)
+#define  PLL_TOKEN_FLOAT               (1 << 9)
+#define  PLL_TOKEN_COMMA               (1 << 10)
+#define  PLL_TOKEN_SEMICOLON           (1 << 11)
+#define  PLL_TOKEN_EQUAL               (1 << 12)
+#define  PLL_TOKEN_DASH                (1 << 13)
+#define  PLL_TOKEN_SLASH               (1 << 14)
+
+/* Convenience macros for code that drives the lexer. Both expect variables
+   named `token` (pllLexToken) and `input` (int) to be visible at the point of
+   use, and both expansions already end in a semicolon.
+   CONSUME(x)  skips tokens for as long as the current token type has a bit
+               in common with the mask x.
+   NEXT_TOKEN  replaces the current token with the next one. */
+#define CONSUME(x)         while (token.tokenType & (x)) token = get_token (&input);
+#define NEXT_TOKEN         token = get_token (&input);
+
+/* A token as returned by get_token() */
+typedef struct
+ {
+   int 	        tokenType;   /* one of the PLL_TOKEN_* bits */
+   const char * lexeme;      /* start of the token text inside the buffer given to init_lexan() */
+   int          len;         /* length of the token text in bytes; get_token() does not compute it for every token type */
+ } pllLexToken;
+
+/* Lexer interface. The lexer keeps a single global state, so only one input
+   buffer can be tokenized at a time. */
+int get_next_byte (void);                     /* next raw byte, or PLL_EOS at the end of the input */
+int get_next_symbol (void);                   /* PLL_SYM_* class of the next byte */
+pllLexToken get_token (int * input);          /* build a token; *input carries one symbol of look-ahead */
+void init_lexan (const char * text, long n);  /* start tokenizing the n bytes at text */
+void lex_table_amend_phylip (void);           /* treat '-' and '.' as ordinary characters */
+void lex_table_amend_fasta (void);            /* treat '-', '.' and '>' as ordinary characters */
+void lex_table_restore (void);                /* undo the two amendments above */
+#endif
diff --git a/pllrepo/src/makenewzGenericSpecial.c b/pllrepo/src/makenewzGenericSpecial.c
new file mode 100644
index 0000000..b2b114a
--- /dev/null
+++ b/pllrepo/src/makenewzGenericSpecial.c
@@ -0,0 +1,3145 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file makenewzGenericSpecial.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#ifdef __SSE3
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+/*#include <tmmintrin.h>*/
+#endif
+
+#ifdef __MIC_NATIVE
+#include "mic_native.h"
+#endif
+
+
+/** @file makenewzGenericSpecial.c
+ *  
+ *  @brief Branch length optimization
+ */
+
+
+
+/* pointers to reduction buffers for storing and gathering the first and second derivative 
+   of the likelihood in Pthreads and MPI */
+
+/* Only declared here for the parallel builds; the definition is not in this
+   part of the file. Going by the signature it reduces the first and second
+   derivative over numBranches branches -- NOTE(review): confirm against the
+   definition. */
+#if IS_PARALLEL
+void branchLength_parallelReduce(pllInstance *tr, double *dlnLdlz,  double *d2lnLdlz2, int numBranches ) ;
+//extern double *globalResult;
+#endif
+
+
+/* 32-entry table of unsigned masks, defined in another translation unit */
+extern const unsigned int mask32[32];
+
+/* Forward declarations of the kernels for binary data; they exist only in
+   the SSE3/AVX builds and are defined further down in this file. */
+#if (defined(__SSE3) || defined(__AVX))
+static void sumGAMMA_BINARY(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+                            unsigned char *tipX1, unsigned char *tipX2, int n);
+static void coreGTRGAMMA_BINARY(const int upper, double *sumtable,
+                                volatile double *d1,   volatile double *d2, double *EIGN, double *gammaRates, double lz, int *wrptr);
+static void coreGTRCAT_BINARY(int upper, int numberOfCategories, double *sum,
+                              volatile double *d1, volatile double *d2, 
+                              double *rptr, double *EIGN, int *cptr, double lz, int *wgt);
+static void sumCAT_BINARY(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+                          unsigned char *tipX1, unsigned char *tipX2, int n);
+#endif
+
+/*******************/
+
+
+/* generic function to get the required pointers to the data associated with the left and right node that define a branch */
+
+static void getVects(pllInstance *tr, 
+                     partitionList *pr, 
+                     unsigned char **tipX1, unsigned char **tipX2, 
+                     double **x1_start, double **x2_start, 
+                     int *tipCase, 
+                     int model, 
+                     double **x1_gapColumn, double **x2_gapColumn, 
+                     unsigned int **x1_gap, unsigned int **x2_gap,
+                     double ** x1_start_asc,
+                     double ** x2_start_asc)
+{
+  /* the two nodes that delimit the branch we want to optimize */
+  const int
+    pNumber = tr->td[0].ti[0].pNumber,
+    qNumber = tr->td[0].ti[0].qNumber;
+
+  const int
+    states  = pr->partitionData[model]->states,
+    rateHet = (int)discreteRateCategories(tr->rateHetModel);
+
+  /* position of each node's ancestral vector inside xVector: taken from the
+     traversal descriptor when useRecom is set, derived from the node number
+     otherwise */
+  const int
+    p_slot = tr->useRecom ? tr->td[0].ti[0].slot_p : pNumber - tr->mxtips - 1,
+    q_slot = tr->useRecom ? tr->td[0].ti[0].slot_q : qNumber - tr->mxtips - 1;
+
+  /* whether the ascertainment bias vectors have to be handed out; in the
+     threaded / fine-grained MPI builds only thread 0 gets them */
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  const int wantAsc = (pr->partitionData[model]->ascBias && tr->threadID == 0) ? 1 : 0;
+#else
+  const int wantAsc = (pr->partitionData[model]->ascBias) ? 1 : 0;
+#endif
+
+  /* start from a clean slate: everything that is not set below stays NULL */
+  *x1_start = (double*)NULL;
+  *x2_start = (double*)NULL;
+
+  *tipX1 = (unsigned char*)NULL;
+  *tipX2 = (unsigned char*)NULL;
+
+  *x1_start_asc = NULL;
+  *x2_start_asc = NULL;
+
+  const int
+    pIsTip = isTip(pNumber, tr->mxtips) ? 1 : 0,
+    qIsTip = isTip(qNumber, tr->mxtips) ? 1 : 0;
+
+  if(pIsTip && qIsTip)
+  {
+    /* note that tip tip should normally not occur since this means that we
+       are trying to optimize a branch in a two-taxon tree. However, this has
+       been inherited by some RAxML function that optimized pair-wise
+       distances between all taxa in a tree */
+
+    *tipCase = PLL_TIP_TIP;
+    *tipX1 = pr->partitionData[model]->yVector[pNumber];
+    *tipX2 = pr->partitionData[model]->yVector[qNumber];
+    return;
+  }
+
+  if(pIsTip || qIsTip)
+  {
+    /* exactly one tip: the tip is always reported through tipX1 and the
+       inner node through the x2_* outputs, whichever of p and q it is */
+    const int
+      tipNumber   = qIsTip ? qNumber : pNumber,
+      innerNumber = qIsTip ? pNumber : qNumber,
+      innerSlot   = qIsTip ? p_slot  : q_slot;
+
+    *tipCase = PLL_TIP_INNER;
+    *tipX1 = pr->partitionData[model]->yVector[tipNumber];
+    *x2_start = pr->partitionData[model]->xVector[innerSlot];
+
+    if(wantAsc)
+    {
+      *x2_start_asc = &pr->partitionData[model]->ascVector[(innerNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+    }
+
+    if(tr->saveMemory)
+    {
+      *x2_gap = &(pr->partitionData[model]->gapVector[innerNumber * pr->partitionData[model]->gapVectorLength]);
+      *x2_gapColumn   = &pr->partitionData[model]->gapColumn[(innerNumber - tr->mxtips - 1) * states * rateHet];
+    }
+    return;
+  }
+
+  /* both nodes are inner nodes */
+  *tipCase = PLL_INNER_INNER;
+
+  *x1_start = pr->partitionData[model]->xVector[p_slot];
+  *x2_start = pr->partitionData[model]->xVector[q_slot];
+
+  if(wantAsc)
+  {
+    *x1_start_asc = &pr->partitionData[model]->ascVector[(pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+    *x2_start_asc = &pr->partitionData[model]->ascVector[(qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+  }
+
+  if(tr->saveMemory)
+  {
+    *x1_gap = &(pr->partitionData[model]->gapVector[pNumber * pr->partitionData[model]->gapVectorLength]);
+    *x1_gapColumn   = &pr->partitionData[model]->gapColumn[(pNumber - tr->mxtips - 1) * states * rateHet];
+
+    *x2_gap = &(pr->partitionData[model]->gapVector[qNumber * pr->partitionData[model]->gapVectorLength]);
+    *x2_gapColumn   = &pr->partitionData[model]->gapColumn[(qNumber - tr->mxtips - 1) * states * rateHet];
+  }
+}
+
+
+/* this is actually a pre-computation and storage of values that remain constant while we change the value of the branch length 
+   we want to adapt. the target pointer sumtable is a single pre-allocated array that has the same 
+   size as a conditional likelihood vector at an inner node.
+
+   So if we want to do a Newton-Raphson optimization we only execute this function once in the beginning for each new branch we are considering !
+   */
+
+#if (!defined(__SSE3) && !defined(__AVX))
+static void sumCAT_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, const int states)
+{
+  /* Which of the two operands is read from the tip lookup table depends on
+     the configuration of the nodes p and q defining the branch:
+       PLL_TIP_TIP     both come from tipVector
+       PLL_TIP_INNER   left from tipVector, right from x2
+       PLL_INNER_INNER left from x1, right from x2 */
+  const int
+    leftIsTip  = (tipCase == PLL_TIP_TIP || tipCase == PLL_TIP_INNER),
+    rightIsTip = (tipCase == PLL_TIP_TIP);
+
+  int
+    site,
+    s;
+
+  if(!leftIsTip && tipCase != PLL_INNER_INNER)
+  {
+    /* unknown configuration */
+    assert(0);
+    return;
+  }
+
+  /* For every site store the element-wise product of the two vectors. The
+     product stays constant while the branch length changes, so it only has
+     to be computed once per branch. No scaling is handled here. */
+  for(site = 0; site < n; site++)
+  {
+    const double
+      *left  = leftIsTip  ? &(tipVector[states * tipX1[site]]) : &x1[states * site],
+      *right = rightIsTip ? &(tipVector[states * tipX2[site]]) : &x2[states * site];
+
+    double
+      *sum = &sumtable[states * site];
+
+    for(s = 0; s < states; s++)
+      sum[s] = left[s] * right[s];
+  }
+}
+#endif
+
+
+
+#if (!defined(__SSE3) && !defined(__AVX))
+
+/* same thing for GAMMA models. The only noteworthy thing here is that we have an additional inner loop over the 
+   number of discrete gamma rates. The data access pattern is also different since for tip vector accesses through our 
+   lookup table, we do not distinguish between rates 
+
+   Note the different access pattern in PLL_TIP_INNER:
+
+   left = &(tipVector[states * tipX1[i]]);        
+   right = &(x2[span * i + l * states]);
+
+*/
+
+static void sumGAMMA_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, const int states)
+{
+  /* four discrete rate categories, stored one after the other per site */
+  const int
+    span       = 4 * states,
+    leftIsTip  = (tipCase == PLL_TIP_TIP || tipCase == PLL_TIP_INNER),
+    rightIsTip = (tipCase == PLL_TIP_TIP);
+
+  int
+    site,
+    rate,
+    s;
+
+  if(!leftIsTip && tipCase != PLL_INNER_INNER)
+  {
+    /* unknown configuration */
+    assert(0);
+    return;
+  }
+
+  for(site = 0; site < n; site++)
+  {
+    for(rate = 0; rate < 4; rate++)
+    {
+      /* entries read from the tip lookup table are the same for all four
+         rates; inner vectors carry one block of `states` values per rate */
+      const double
+        *left  = leftIsTip  ? &(tipVector[states * tipX1[site]]) : &(x1[span * site + rate * states]),
+        *right = rightIsTip ? &(tipVector[states * tipX2[site]]) : &(x2[span * site + rate * states]);
+
+      double
+        *sum = &sumtable[site * span + rate * states];
+
+      for(s = 0; s < states; s++)
+        sum[s] = left[s] * right[s];
+    }
+  }
+}
+#endif
+
+/* optimized functions for branch length optimization */
+
+
+/* Forward declarations of the vectorized kernels, which exist only in the
+   SSE3/AVX builds. The sum* functions take the same leading arguments as the
+   generic sum*_FLEX functions; the *_SAVE variants additionally take the gap
+   column and gap bit vector of both nodes. */
+#if (defined(__SSE3) || defined(__AVX))
+
+static void sumCAT_SAVE(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static void sumGAMMA_GAPPED_SAVE(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static void sumGAMMA(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumCAT(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumGAMMAPROT_GAPPED_SAVE(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+static void sumGAMMAPROT_LG4(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector[4],
+                             unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumGAMMAPROT(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumGTRCATPROT(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+static void sumGTRCATPROT_SAVE(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap);
+
+/* core* functions: judging by their parameters these write the first and
+   second derivative for a proposed branch length lz through the two volatile
+   output pointers -- NOTE(review): confirm against the definitions */
+static void coreGTRGAMMAPROT_LG4(double *gammaRates, double *EIGN[4], double *sumtable, int upper, int *wrptr,
+                                 volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz,
+                                 double * lg4_weights);
+
+static void coreGTRGAMMA(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wrptr);
+
+static void coreGTRCAT(int upper, int numberOfCategories, double *sum,
+    volatile double *d1, volatile double *d2, int *wgt, 
+    double *rptr, double *EIGN, int *cptr, double lz);
+
+
+static void coreGTRGAMMAPROT(double *gammaRates, double *EIGN, double *sumtable, int upper, int *wrptr,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz);
+
+static void coreGTRCATPROT(double *EIGN, double lz, int numberOfCategories, double *rptr, int *cptr, int upper,
+    int *wgt, volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *sumtable);
+
+#endif
+
+
+/* now this is the core function of the newton-Raphson based branch length optimization that actually computes 
+   the first and second derivative of the likelihood given a new proposed branch length lz */
+
+/* Fill tip[0 .. numStates-1] with one code per state. For 2 and 4 states the
+   code of state i is the single bit 1 << i (1, 2 and 1, 2, 4, 8); for every
+   other number of states the code is the state index itself. Entries from
+   numStates onwards are left untouched. */
+static void ascertainmentBiasSequence(unsigned char tip[32], int numStates)
+{ 
+  const int
+    oneBitPerState = (numStates == 2 || numStates == 4);
+
+  int
+    state;
+
+  assert(numStates <= 32 && numStates > 1);
+
+  for(state = 0; state < numStates; state++)
+    {
+      if(oneBitPerState)
+        tip[state] = (unsigned char)(1u << state);
+      else
+        tip[state] = (unsigned char)state;
+    }
+}
+
+/* Likelihood and branch-length derivatives over the `upper` entries of
+   sumtable (numStates values each), every entry weighted by its ascScaler
+   value, for the proposed branch length lz and a single rate of 1.0.
+
+   Returns the weighted sum lh of the absolute per-entry values and stores
+     *ext_dlnLdlz   = d1 / (lh - 1)
+     *ext_d2lnLdlz2 = ((lh - 1) * d2 - d1 * d1) / (lh - 1)^2
+   where d1 and d2 are the weighted sums of the per-entry first and second
+   derivative terms. */
+static double coreCatAsc(double *EIGN, double *sumtable, int upper,
+			 volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz, const int numStates,
+			 double *ascScaler)
+{
+  /* per-state factors; entry 0 is never used because state 0 contributes
+     sum[0] unchanged */
+  double
+    expTerm[256],
+    firstTerm[256],
+    secondTerm[256];
+
+  const double
+    rate   = 1.0,
+    rateSq = 1.0;
+
+  double
+    lh = 0.0,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0,
+    lhMinusOne;
+
+  int
+    site,
+    state;
+
+  for(state = 1; state < numStates; state++)
+    {
+      const double
+        eigenvalue = EIGN[state - 1];
+
+      expTerm[state]    = exp(eigenvalue * rate * lz);
+      firstTerm[state]  = eigenvalue * rate;
+      secondTerm[state] = eigenvalue * eigenvalue * rateSq;
+    }
+
+  for(site = 0; site < upper; site++)
+    {
+      const double
+        *sum = &sumtable[site * numStates];
+
+      double
+        siteLh = 0.0,
+        siteD1 = 0.0,
+        siteD2 = 0.0;
+
+      siteLh += sum[0];
+
+      for(state = 1; state < numStates; state++)
+        {
+          const double
+            term = expTerm[state] * sum[state];
+
+          siteLh += term;
+          siteD1 += term * firstTerm[state];
+          siteD2 += term * secondTerm[state];
+        }
+
+      siteLh = fabs(siteLh);
+
+      lh        += siteLh * ascScaler[site];
+      dlnLdlz   += siteD1 * ascScaler[site];
+      d2lnLdlz2 += siteD2 * ascScaler[site];
+    }
+
+  lhMinusOne = lh - 1.0;
+
+  *ext_dlnLdlz   = (dlnLdlz / lhMinusOne);
+  *ext_d2lnLdlz2 = ((lhMinusOne * (d2lnLdlz2) - (dlnLdlz * dlnLdlz)) / (lhMinusOne * lhMinusOne));
+
+  return lh;
+}
+
+/* Ascertainment bias term for four discrete rates taken from gammaRates[0..3].
+
+   sumtable holds, per entry i, 4 consecutive groups of numStates doubles (one group per rate).
+   For each of the `upper` entries the function sums, over the 4 rates j,
+       sum[j * numStates] + SUM_{l=1..numStates-1} exp(EIGN[l-1] * gammaRates[j] * lz) * sum[j * numStates + l]
+   as well as the same terms weighted by EIGN[l-1] * gammaRates[j] and by its square. The three
+   per-entry sums are multiplied by 0.25 (the absolute value is taken of the first one only) and by
+   ascScaler[i], and accumulated into lh, dlnLdlz and d2lnLdlz2.
+
+   Outputs:
+     *ext_dlnLdlz   = dlnLdlz / (lh - 1)
+     *ext_d2lnLdlz2 = ((lh - 1) * d2lnLdlz2 - dlnLdlz^2) / (lh - 1)^2
+   Return value: the accumulated lh.
+
+   diagptable layout: rate j starts at j * 4 * numStates; within a rate, state l uses a stride of
+   4 doubles (slots 0..2 used).
+   NOTE(review): diagptable has a fixed size of 1024 doubles and numStates is not checked in this
+   function; the largest index written is 16 * numStates - 2, so it stays in bounds only for
+   numStates <= 64.
+   NOTE(review): no guard against lh == 1.0, which would divide by zero - confirm callers rule
+   this out. */
+static double coreGammaAsc(double *gammaRates, double *EIGN, double *sumtable, int upper,
+			   volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz, const int numStates,
+			   double *ascScaler)
+{
+  double  
+    diagptable[1024], 
+    lh = 0.0,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0,
+    ki, 
+    kisqr;
+
+  int     
+    i, 
+    j, 
+    l;  
+
+  const int 
+    gammaStates = 4 * numStates;
+
+  for(i = 0; i < 4; i++)   /* pre-compute the per-rate, per-state factors */
+    {
+      ki = gammaRates[i];
+      kisqr = ki * ki;
+
+      for(l = 1; l < numStates; l++)
+	{
+	  diagptable[i * gammaStates + l * 4]     = exp(EIGN[l-1] * ki * lz);
+	  diagptable[i * gammaStates + l * 4 + 1] = EIGN[l-1] * ki;
+	  diagptable[i * gammaStates + l * 4 + 2] = EIGN[l-1] * EIGN[l-1] * kisqr;
+	}
+    }
+
+  for (i = 0; i < upper; i++)
+    {
+      double
+	*sum = &sumtable[i * gammaStates],
+	tmp,
+	inv_Li   = 0.0,
+	dlnLidlz = 0.0,
+	d2lnLidlz2 = 0.0;
+
+      for(j = 0; j < 4; j++)
+	{
+	  inv_Li += sum[j * numStates];   /* state 0 of rate j contributes without an exponential factor */
+
+	  for(l = 1; l < numStates; l++)
+	    {
+	      inv_Li     += (tmp = diagptable[j * gammaStates + l * 4] * sum[j * numStates + l]);
+	      dlnLidlz   += tmp * diagptable[j * gammaStates + l * 4 + 1];
+	      d2lnLidlz2 += tmp * diagptable[j * gammaStates + l * 4 + 2];
+	    }	  
+	}    
+            
+      inv_Li = 0.25 * fabs(inv_Li);         /* average over the 4 rates */
+      dlnLidlz *= 0.25;
+      d2lnLidlz2 *= 0.25;
+       
+      lh        += inv_Li * ascScaler[i];
+      dlnLdlz   += dlnLidlz * ascScaler[i];
+      d2lnLdlz2 += d2lnLidlz2 * ascScaler[i];
+    } 
+
+  *ext_dlnLdlz   = (dlnLdlz / (lh - 1.0));
+  *ext_d2lnLdlz2 = (((lh - 1.0) * (d2lnLdlz2) - (dlnLdlz * dlnLdlz)) / ((lh - 1.0) * (lh - 1.0)));  
+
+  return lh;
+}
+/* Builds the ascertainment-bias sum table for the single-rate case.
+
+   For every entry i < n, sumtable[i * numStates + k] (k < numStates) receives the element-wise
+   product of a "left" and a "right" vector of numStates doubles, chosen by tipCase:
+     PLL_TIP_TIP     : both vectors are the tipVector row selected by tip[i] (x1 and x2 unused)
+     PLL_TIP_INNER   : left is the tipVector row selected by tip[i], right is entry i of x2
+     PLL_INNER_INNER : left is entry i of x1, right is entry i of x2
+   Any other tipCase triggers assert(0).
+
+   tip[] is filled by ascertainmentBiasSequence() for positions < numStates only, so n must not
+   exceed numStates (the caller in this file passes n == numStates). */
+static void sumCatAsc(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+			int n, const int numStates)
+{
+  int i, k;
+  double *left, *right, *sum;
+
+  unsigned char 
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for(i = 0; i < n; i++)
+	{
+	  left  = &(tipVector[numStates * tip[i]]);
+	  right = &(tipVector[numStates * tip[i]]);   /* same row as left */
+
+	  
+	  sum = &sumtable[i * numStates];
+	  
+	  for(k = 0; k < numStates; k++)
+	    sum[k] = left[k] * right[k];	  
+	}
+      break;
+    case PLL_TIP_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  left = &(tipVector[numStates * tip[i]]);
+
+	  
+	  right = &(x2[i * numStates]);   /* the non-tip side is always read from x2 */
+	  sum = &sumtable[i * numStates];
+
+	  for(k = 0; k < numStates; k++)
+	    sum[k] = left[k] * right[k];	 
+	}
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  left  = &(x1[i * numStates]);
+	  right = &(x2[i * numStates]);
+	  sum   = &(sumtable[i * numStates]);
+
+	  for(k = 0; k < numStates; k++)
+	    sum[k] = left[k] * right[k];	 
+	}
+      break;
+    default:
+      assert(0);
+    }
+}
+/* Builds the ascertainment-bias sum table for the four-rate case.
+
+   Every entry i < n of sumtable consists of 4 groups (one per rate l) of numStates doubles, i.e.
+   4 * numStates doubles per entry. Group l of entry i receives the element-wise product of a
+   "left" and a "right" vector, chosen by tipCase:
+     PLL_TIP_TIP     : both vectors are the tipVector row selected by tip[i], identical for all
+                       4 rates (x1 and x2 unused)
+     PLL_TIP_INNER   : left is the tipVector row selected by tip[i], right is group l of entry i
+                       of x2
+     PLL_INNER_INNER : left / right are group l of entry i of x1 / x2
+   Any other tipCase triggers assert(0).
+
+   tip[] is filled by ascertainmentBiasSequence() for positions < numStates only, so n must not
+   exceed numStates (the caller in this file passes n == numStates). */
+static void sumGammaAsc(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+			int n, const int numStates)
+{
+  int i, l, k;
+  double *left, *right, *sum;
+
+  const int gammaStates = numStates * 4;
+
+  unsigned char 
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for(i = 0; i < n; i++)
+	{
+	  left  = &(tipVector[numStates * tip[i]]);
+	  right = &(tipVector[numStates * tip[i]]);   /* same row as left */
+
+	  for(l = 0; l < 4; l++)
+	    {
+	      sum = &sumtable[i * gammaStates + l * numStates];
+	      for(k = 0; k < numStates; k++)
+		sum[k] = left[k] * right[k];
+	    }
+	}
+      break;
+    case PLL_TIP_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  left = &(tipVector[numStates * tip[i]]);
+
+	  for(l = 0; l < 4; l++)
+	    {
+	      right = &(x2[gammaStates * i + l * numStates]);   /* the non-tip side is always read from x2 */
+	      sum = &sumtable[i * gammaStates + l * numStates];
+
+	      for(k = 0; k < numStates; k++)
+		sum[k] = left[k] * right[k];
+	    }
+	}
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  for(l = 0; l < 4; l++)
+	    {
+	      left  = &(x1[gammaStates * i + l * numStates]);
+	      right = &(x2[gammaStates * i + l * numStates]);
+	      sum   = &(sumtable[i * gammaStates + l * numStates]);
+
+	      for(k = 0; k < numStates; k++)
+		sum[k] = left[k] * right[k];
+	    }
+	}
+      break;
+    default:
+      assert(0);
+    }
+}
+
+
+
+
+#if (!defined(__AVX) && !defined(__SSE3))
+/** @brief Generic (any number of states) first and second derivative of the log likelihood
+ *  with respect to the branch length under per-site rate categories
+ *
+ * @param upper               number of sites to process
+ * @param numberOfCategories  number of entries in \a rptr
+ * @param sum                 pre-computed sum table, \a states doubles per site
+ * @param d1                  output: accumulated first derivative
+ * @param d2                  output: accumulated second derivative
+ * @param wgt                 per-site weights
+ * @param rptr                per-category rates (perSiteRates pointer)
+ * @param EIGN                eigenvalues; entries 1 .. states-1 are read
+ * @param cptr                per-site category index into \a rptr (rateCategory pointer)
+ * @param lz                  branch length value the exponentials are evaluated at
+ * @param states              number of states
+ *
+ * @note Four temporary arrays are allocated with rax_posix_memalign and released with rax_free
+ *       before returning; the allocation results are not checked.
+ *       NOTE(review): this function reads EIGN[l] whereas coreCatAsc/coreGammaAsc read EIGN[l-1]
+ *       for the same l - confirm which indexing matches the layout of EIGN.
+ */
+static void coreCAT_FLEX(int upper, int numberOfCategories, double *sum,
+    volatile double *d1, volatile double *d2, int *wgt,
+    double *rptr, double *EIGN, int *cptr, double lz, const int states)
+    /* rptr perSiteRates pointer, cptr rateCategory pointer */
+{
+  int 
+    i, 
+    l;
+
+  double 
+    *d, 
+
+    /* arrays to store stuff we can pre-compute */
+    *d_start = NULL,
+    *e = NULL,
+    *s = NULL,
+    *dd = NULL,
+    inv_Li, 
+    dlnLidlz, 
+    d2lnLidlz2,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0;
+
+  rax_posix_memalign ((void **) &d_start, PLL_BYTE_ALIGNMENT, numberOfCategories * states * sizeof(double));
+  rax_posix_memalign ((void **) &e,       PLL_BYTE_ALIGNMENT, (states * sizeof(double)));
+  rax_posix_memalign ((void **) &s,       PLL_BYTE_ALIGNMENT, states * sizeof(double));
+  /* this call used to end in ',' which silently chained it to the assignment below
+     via the comma operator; it is now a statement of its own */
+  rax_posix_memalign ((void **) &dd,      PLL_BYTE_ALIGNMENT, states * sizeof(double));
+  d = d_start;
+
+  e[0] = 0.0;
+  s[0] = 0.0; 
+  dd[0] = 0.0;
+
+
+  /* we are pre-computing values for computing the first and second derivative of P(lz);
+     since this requires an exponential, that is the only thing we really have to derive here */
+
+  for(l = 1; l < states; l++)
+  { 
+    s[l]  = EIGN[l];
+    e[l]  = EIGN[l] * EIGN[l];     
+    dd[l] = s[l] * lz;
+  }
+
+  /* compute the P matrices and their derivatives for 
+     all per-site rate categories */
+
+  for(i = 0; i < numberOfCategories; i++)
+  {      
+    d[states * i] = 1.0;
+    for(l = 1; l < states; l++)
+      d[states * i + l] = exp(dd[l] * rptr[i]);
+  }
+
+
+  /* now loop over the sites in this partition to obtain the per-site 1st and 2nd derivatives */
+
+  for (i = 0; i < upper; i++)
+  {    
+    /* get the correct p matrix for the rate at the current site i */
+
+    d = &d_start[states * cptr[i]];      
+
+    /* this is the likelihood at site i, NOT the log likelihood, we don't need the log 
+       likelihood to compute derivatives ! */
+
+    inv_Li     = sum[states * i]; 
+
+    /* those are for storing the first and second derivative of the Likelihood at site i */
+
+    dlnLidlz   = 0.0;
+    d2lnLidlz2 = 0.0;
+
+    /* now multiply the likelihood and the first and second derivative with the 
+       appropriate derivatives of P(lz) */
+
+    for(l = 1; l < states; l++)
+    {
+      double
+        tmpv = d[l] * sum[states * i + l];
+
+      inv_Li     += tmpv;                 
+      dlnLidlz   += tmpv * s[l];       
+      d2lnLidlz2 += tmpv * e[l];
+    }     
+
+    /* below we are implementing the other mathematical operations that are required 
+       to obtain the derivatives */
+
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+
+    /* compute the accumulated first and second derivatives of this site;
+       the per-site rate enters once for the first and squared for the second derivative */
+
+    dlnLdlz  += wgt[i] * rptr[cptr[i]] * dlnLidlz;
+    d2lnLdlz2 += wgt[i] * rptr[cptr[i]] * rptr[cptr[i]] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  /* 
+     set the result values, i.e., the sum of the per-site first and second derivatives of the likelihood function 
+     for this partition. 
+     */
+
+  *d1  = dlnLdlz;
+  *d2 = d2lnLdlz2;
+
+  /* free the temporary arrays */
+
+  rax_free(d_start);
+  rax_free(e);
+  rax_free(s);
+  rax_free(dd);
+}
+/* Generic (any number of states) first and second derivative of the log likelihood with respect
+   to the branch length for four discrete rates taken from gammaRates[0..3].
+
+   sumtable holds 4 * states doubles per site (4 groups of `states` doubles, one per rate).
+   For each of the `upper` sites the per-site likelihood and its two derivative sums are built
+   from the pre-computed factors in diagptable, normalised by 1 / |likelihood|, and accumulated
+   weighted by wrptr[i]. The totals are written to *ext_dlnLdlz and *ext_d2lnLdlz2.
+
+   diagptable layout: rate j starts at j * 4 * states; within a rate, state l uses a stride of
+   4 doubles (slots 0..2 used).
+   NOTE(review): diagptable has a fixed size of 1024 doubles and `states` is not checked; the
+   largest index written is 16 * states - 2, so it stays in bounds only for states <= 64.
+   NOTE(review): this function reads EIGN[l] whereas coreCatAsc/coreGammaAsc read EIGN[l-1] for
+   the same l - confirm which indexing matches the layout of EIGN. */
+static void coreGAMMA_FLEX(int upper, double *sumtable, volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, 
+    double *EIGN, double *gammaRates, double lz, int *wrptr, const int states)
+{
+  double  
+    *sum, 
+    diagptable[1024], /* TODO make this dynamic */
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0,
+    ki, 
+    kisqr,
+    tmp,
+    inv_Li, 
+    dlnLidlz, 
+    d2lnLidlz2;
+
+  int     
+    i, 
+    j, 
+    l;  
+
+  const int 
+    gammaStates = 4 * states;
+
+  /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
+
+  for(i = 0; i < 4; i++)
+  {
+    ki = gammaRates[i];
+    kisqr = ki * ki;
+
+    for(l = 1; l < states; l++)
+    {
+      diagptable[i * gammaStates + l * 4]     = exp(EIGN[l] * ki * lz);
+      diagptable[i * gammaStates + l * 4 + 1] = EIGN[l] * ki;
+      diagptable[i * gammaStates + l * 4 + 2] = EIGN[l] * EIGN[l] * kisqr;
+    }
+  }
+
+  /* loop over sites in this partition */
+
+  for (i = 0; i < upper; i++)
+  {
+    /* access the array with pre-computed values */
+    sum = &sumtable[i * gammaStates];
+
+    /* initial per-site likelihood and 1st and 2nd derivatives */
+
+    inv_Li   = 0.0;
+    dlnLidlz = 0.0;
+    d2lnLidlz2 = 0.0;
+
+    /* loop over discrete GAMMA rates */
+
+    for(j = 0; j < 4; j++)
+    {
+      inv_Li += sum[j * states];   /* state 0 of rate j contributes without an exponential factor */
+
+      for(l = 1; l < states; l++)
+      {
+        inv_Li     += (tmp = diagptable[j * gammaStates + l * 4] * sum[j * states + l]);
+        dlnLidlz   +=  tmp * diagptable[j * gammaStates + l * 4 + 1];
+        d2lnLidlz2 +=  tmp * diagptable[j * gammaStates + l * 4 + 2];
+      }
+    }
+
+    /* finalize derivative computation */
+    /* note that wrptr[] here, unlike in CAT above, is the 
+       integer weight vector of the current site. 
+
+       The operations:
+
+       EIGN[l] * ki;
+       EIGN[l] * EIGN[l] * kisqr;
+
+       that are hidden in CAT in wrptr (at least the * ki and * ki * ki part of them) 
+       are done explicitly here. 
+
+*/
+
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+
+    dlnLdlz   += wrptr[i] * dlnLidlz;
+    d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2;
+
+}
+#endif
+
+//void sumGAMMA_FLEX_reorder(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+//    unsigned char *tipX1, unsigned char *tipX2, int n, const int states);
+
+/** @brief Precompute values (sumtable) from the 2 likelihood vectors of a given branch
+ *
+ * @warning These precomputations are stored in \a pr->partitionData[model]->sumBuffer (and, for
+ * partitions with \a ascBias set, in \a ascSumBuffer and \a ascScaler), which are used by function \a execCore
+ *
+ * @param tr
+ *   Library instance
+ *
+ * @param pr
+ *   List of partitions
+ *
+ * @warning the given branch is implicitly defined in \a tr by these nodes:
+ * pNumber = tr->td[0].ti[0].pNumber;
+ * qNumber = tr->td[0].ti[0].qNumber;
+ *
+ *
+ * @note This function should be called only once at the very beginning of each Newton-Raphson procedure for optimizing branch lengths. It initially invokes an iterative newview call to get a consistent pair of vectors at the left and the right end of the branch and thereafter invokes the one-time only precomputation of values (sumtable) that can be re-used in each Newton-Raphson iteration. Once this function has been called we can execute the actual NR procedure
+ *
+ *
+ */
+void makenewzIterative(pllInstance *tr, partitionList * pr)
+{
+  int 
+    model, 
+    tipCase;
+
+  double
+    *x1_start     = NULL,
+    *x2_start     = NULL,
+    *x1_start_asc = NULL,
+    *x2_start_asc = NULL;
+
+
+  unsigned char
+    *tipX1,
+    *tipX2;
+
+  double
+    *x1_gapColumn = (double*)NULL,
+    *x2_gapColumn = (double*)NULL;
+
+  unsigned int
+    *x1_gap = (unsigned int*)NULL,
+    *x2_gap = (unsigned int*)NULL;                            
+
+  /* call pllNewviewIterative to get the likelihood arrays to the left and right of the branch */
+
+  pllNewviewIterative(tr, pr, 1);
+
+
+  /* 
+     loop over all partitions to do the precomputation of the sumTable buffer 
+     This is analogous to the pllNewviewIterative() and pllEvaluateIterative() 
+     implementations.
+     */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+  { 
+    int 
+      width = pr->partitionData[model]->width;
+
+    if(tr->td[0].executeModel[model] && width > 0)
+    {
+      int          
+        states = pr->partitionData[model]->states;
+
+
+      getVects(tr, pr, &tipX1, &tipX2, &x1_start, &x2_start, &tipCase, model, &x1_gapColumn, &x2_gapColumn, &x1_gap, &x2_gap, &x1_start_asc, &x2_start_asc);
+
+#if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
+      assert(!tr->saveMemory);
+      if(tr->rateHetModel == PLL_CAT)
+        sumCAT_FLEX(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+            width, states);
+      else
+        //sumGAMMA_FLEX_reorder(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+          sumGAMMA_FLEX(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+            width, states);
+#else
+      switch(states)
+      {
+      case 2: /* BINARY */
+          assert(!tr->saveMemory);
+          if (tr->rateHetModel == PLL_CAT)
+            sumCAT_BINARY(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                          width);
+          else
+            sumGAMMA_BINARY(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                            width);
+          break;
+      case 4: /* DNA */
+#ifdef __MIC_NATIVE
+      assert(!tr->saveMemory);
+      assert(tr->rateHetModel == PLL_GAMMA);
+
+      sumGTRGAMMA_MIC(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+          width);
+#else
+          if(tr->rateHetModel == PLL_CAT)
+          {
+            if(tr->saveMemory)
+              sumCAT_SAVE(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+            else
+              sumCAT(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width);
+          }
+          else
+          {
+            if(tr->saveMemory)
+              sumGAMMA_GAPPED_SAVE(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+            else
+              sumGAMMA(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width);
+          }
+#endif
+          break;                
+        case 20: /* proteins */
+#ifdef __MIC_NATIVE
+          assert(!tr->saveMemory);
+          assert(tr->rateHetModel == PLL_GAMMA);
+
+              if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                          sumGTRGAMMAPROT_LG4_MIC(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector_LG4, tipX1, tipX2,
+                                  width);
+              else
+                          sumGTRGAMMAPROT_MIC(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                                  width);
+#else
+
+            if(tr->rateHetModel == PLL_CAT)
+          {
+            if(tr->saveMemory)
+              sumGTRCATPROT_SAVE(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector,
+                  tipX1, tipX2, width, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+            else                      
+              sumGTRCATPROT(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector,
+                  tipX1, tipX2, width);
+          }
+          else
+          {
+
+            if(tr->saveMemory)
+              sumGAMMAPROT_GAPPED_SAVE(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector, tipX1, tipX2,
+                  width, x1_gapColumn, x2_gapColumn, x1_gap, x2_gap);
+              else
+                    {
+                      if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                        sumGAMMAPROT_LG4(tipCase,  pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector_LG4,
+                                         tipX1, tipX2, width);
+            else
+              sumGAMMAPROT(tipCase, pr->partitionData[model]->sumBuffer, x1_start, x2_start, pr->partitionData[model]->tipVector,
+                  tipX1, tipX2, width);
+                    }
+          }
+#endif
+          break;                
+        default:
+          assert(0);
+      }
+#endif
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+      if (pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+      if (pr->partitionData[model]->ascBias)
+#endif
+       {   /* ascertainment bias part; in the threaded/MPI builds only the instance with
+              threadID == 0 executes it.
+              ex1_asc / ex2_asc point into ascExpVector at the block of `states` ints belonging to
+              node pNumber / qNumber (offset (number - mxtips - 1) * states).
+              NOTE(review): both pointers are formed unconditionally, although only the one(s)
+              belonging to a non-tip node are dereferenced below - presumably the offset is
+              negative for a tip; confirm this is acceptable. */
+            int pNumber = tr->td[0].ti[0].pNumber, qNumber =
+                    tr->td[0].ti[0].qNumber, i, *ex1_asc =
+                    &pr->partitionData[model]->ascExpVector[(pNumber
+                            - tr->mxtips - 1) * states], *ex2_asc =
+                    &pr->partitionData[model]->ascExpVector[(qNumber
+                            - tr->mxtips - 1) * states];
+            switch (tipCase)   /* ascScaler[i] = PLL_MINLIKELIHOOD raised to the ascExpVector entries of the non-tip node(s) */
+            {
+            case PLL_TIP_TIP:
+                assert(0);
+                break;
+            case PLL_TIP_INNER:
+                if (isTip(pNumber, tr->mxtips))
+                {
+                    for (i = 0; i < states; i++)
+                        pr->partitionData[model]->ascScaler[i] = pow(
+                                PLL_MINLIKELIHOOD, (double) ex2_asc[i]);
+                }
+                else
+                {
+                    for (i = 0; i < states; i++)
+                        pr->partitionData[model]->ascScaler[i] = pow(
+                                PLL_MINLIKELIHOOD, (double) ex1_asc[i]);
+                }
+                break;
+            case PLL_INNER_INNER:
+                for (i = 0; i < states; i++)
+                    pr->partitionData[model]->ascScaler[i] = pow(
+                            PLL_MINLIKELIHOOD,
+                            (double) (ex1_asc[i] + ex2_asc[i]));
+                break;
+            default:
+                assert(0);
+            }
+         if (tr->rateHetModel == PLL_CAT)   /* the ascertainment sum table has `states` entries, hence n == states */
+           sumCatAsc  (tipCase, pr->partitionData[model]->ascSumBuffer, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector, states, states);
+         else
+           sumGammaAsc(tipCase, pr->partitionData[model]->ascSumBuffer, x1_start_asc, x2_start_asc, pr->partitionData[model]->tipVector, states, states);
+       }
+    }
+  }
+}
+
+
+/** @brief Compute first and second derivatives of the likelihood with respect to a given branch length 
+ *
+ * @param tr
+ *   library instance
+ *
+ * @param pr
+ *   List of partitions
+ *
+ * @param _dlnLdlz 
+ *   First derivative dl/dlz; one entry per partition when \a pr->perGeneBranchLengths is set, otherwise only entry 0 is used
+ *
+ * @param _d2lnLdlz2
+ *   Second derivative d(dl/dlz)/dlz; indexed like \a _dlnLdlz
+ *
+ * @warning \a makenewzIterative should have been called to precompute \a pr->partitionData[model]->sumBuffer at the given branch
+ *
+ * @note  this function actually computes the first and second derivatives of the likelihood for a given branch; the branch length values are read from \a tr->td[0].parameterValues. Note that in the parallel case coreLZ must always be broadcasted together with the traversal descriptor, at least for optimizing branch lengths 
+ *
+ */
+void execCore(pllInstance *tr, partitionList *pr, volatile double *_dlnLdlz, volatile double *_d2lnLdlz2)
+{
+  int model, branchIndex;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  double lz;
+
+  _dlnLdlz[0]   = 0.0;
+  _d2lnLdlz2[0] = 0.0;
+
+  /* loop over partitions */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+  {
+    int 
+      width = pr->partitionData[model]->width;
+
+    /* check if we (the present thread for instance) needs to compute something at 
+       all for the present partition */
+
+    if(tr->td[0].executeModel[model] && width > 0)
+    {
+      int           
+        states = pr->partitionData[model]->states;
+
+      double 
+        *sumBuffer       = (double*)NULL;
+
+
+      volatile double
+        dlnLdlz   = 0.0,
+                  d2lnLdlz2 = 0.0;
+
+      /* set a pointer to the part of the pre-computed sumBuffer we are going to access */
+
+      sumBuffer = pr->partitionData[model]->sumBuffer;
+
+      /* figure out if we are optimizing branch lengths individually per partition or jointly across 
+         all partitions. If we do this on a per partition basis, we also need to compute and store 
+         the per-partition derivatives of the likelihood separately, otherwise not */
+
+      if(numBranches > 1)
+      {
+        branchIndex = model;          
+        lz = tr->td[0].parameterValues[model];
+        _dlnLdlz[model]   = 0.0;
+        _d2lnLdlz2[model] = 0.0;
+      }
+      else
+      {
+        branchIndex = 0;              
+        lz = tr->td[0].parameterValues[0];
+      }
+
+#if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
+      /* compute first and second derivatives with the slow generic functions */
+
+      if(tr->rateHetModel == PLL_CAT)
+        coreCAT_FLEX(width, pr->partitionData[model]->numberOfCategories, sumBuffer,
+            &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->wgt,
+            pr->partitionData[model]->perSiteRates, pr->partitionData[model]->EIGN,  pr->partitionData[model]->rateCategory, lz, states);
+      else
+        coreGAMMA_FLEX(width, sumBuffer,
+            &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->EIGN, pr->partitionData[model]->gammaRates, lz,
+            pr->partitionData[model]->wgt, states);
+#else
+      switch(states)
+       {    
+         case 2: /* BINARY */
+           if (tr->rateHetModel == PLL_CAT)
+              coreGTRCAT_BINARY(width, 
+                                pr->partitionData[model]->numberOfCategories, 
+                                sumBuffer,
+                                &dlnLdlz, 
+                                &d2lnLdlz2, 
+                                pr->partitionData[model]->perSiteRates, 
+                                pr->partitionData[model]->EIGN,  
+                                pr->partitionData[model]->rateCategory, 
+                                lz, 
+                                pr->partitionData[model]->wgt);
+           else
+              coreGTRGAMMA_BINARY(width, 
+                                   sumBuffer,
+                                   &dlnLdlz, 
+                                   &d2lnLdlz2, 
+                                   pr->partitionData[model]->EIGN,
+                                   pr->partitionData[model]->gammaRates, 
+                                   lz,
+                                   pr->partitionData[model]->wgt);
+           break;
+         case 4: /* DNA */
+#ifdef __MIC_NATIVE
+           assert(tr->rateHetModel == PLL_GAMMA);
+
+           coreGTRGAMMA_MIC(width, 
+                            sumBuffer,
+                            &dlnLdlz, 
+                            &d2lnLdlz2, 
+                            pr->partitionData[model]->EIGN, 
+                            pr->partitionData[model]->gammaRates, 
+                            lz,
+                            pr->partitionData[model]->wgt);
+#else
+          if(tr->rateHetModel == PLL_CAT)
+            coreGTRCAT(width, pr->partitionData[model]->numberOfCategories, sumBuffer,
+                &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->wgt,
+                pr->partitionData[model]->perSiteRates, pr->partitionData[model]->EIGN,  pr->partitionData[model]->rateCategory, lz);
+          else 
+            coreGTRGAMMA(width, sumBuffer,
+                &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->EIGN, pr->partitionData[model]->gammaRates, lz,
+                pr->partitionData[model]->wgt);
+
+#endif
+          break;                    
+        case 20: /* proteins */
+
+#ifdef __MIC_NATIVE
+      assert(tr->rateHetModel == PLL_GAMMA);
+
+          if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                  coreGTRGAMMAPROT_LG4_MIC(width, sumBuffer,
+                          &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->EIGN_LG4, pr->partitionData[model]->gammaRates, lz,
+                          pr->partitionData[model]->wgt, pr->partitionData[model]->lg4x_weights);
+          else
+                  coreGTRGAMMAPROT_MIC(width, sumBuffer,
+                          &dlnLdlz, &d2lnLdlz2, pr->partitionData[model]->EIGN, pr->partitionData[model]->gammaRates, lz,
+                          pr->partitionData[model]->wgt);
+#else
+
+          if(tr->rateHetModel == PLL_CAT)
+            coreGTRCATPROT(pr->partitionData[model]->EIGN, lz, pr->partitionData[model]->numberOfCategories,  pr->partitionData[model]->perSiteRates,
+                pr->partitionData[model]->rateCategory, width,
+                pr->partitionData[model]->wgt,
+                &dlnLdlz, &d2lnLdlz2,
+                sumBuffer);
+            else
+                { 
+                  if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                    coreGTRGAMMAPROT_LG4(pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN_LG4,
+                                         sumBuffer, width, pr->partitionData[model]->wgt,
+                                         &dlnLdlz, &d2lnLdlz2, lz, pr->partitionData[model]->lg4x_weights);
+          else
+
+            coreGTRGAMMAPROT(pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN,
+                sumBuffer, width, pr->partitionData[model]->wgt,
+                &dlnLdlz, &d2lnLdlz2, lz);
+            
+                }
+#endif
+          break;                   
+        default:
+          assert(0);
+      }
+#endif
+
+      /* store first and second derivative; for partitions with ascBias set the ascertainment
+         bias derivatives d1/d2, scaled by the summed weight w, are subtracted first (in the
+         threaded/MPI builds only by the instance with threadID == 0) */
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+     if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+     if(pr->partitionData[model]->ascBias)
+#endif  
+       {
+         size_t
+           i;
+
+         double 
+           correction;
+
+         int            
+           w = 0;
+         
+         volatile double 
+           d1 = 0.0,
+           d2 = 0.0;                   
+         
+         for(i = (size_t)pr->partitionData[model]->lower; i < (size_t)pr->partitionData[model]->upper; i++)
+           w += tr->aliaswgt[i];     /* w = sum of aliaswgt over the partition range [lower, upper) */
+         
+          switch(tr->rateHetModel)
+            {
+            case PLL_CAT:
+              correction = coreCatAsc(pr->partitionData[model]->EIGN, pr->partitionData[model]->ascSumBuffer, states,
+                                        &d1,  &d2, lz, states, pr->partitionData[model]->ascScaler);
+              break;
+            case PLL_GAMMA:
+              correction = coreGammaAsc(pr->partitionData[model]->gammaRates, pr->partitionData[model]->EIGN, pr->partitionData[model]->ascSumBuffer, states,
+                                        &d1,  &d2, lz, states, pr->partitionData[model]->ascScaler);
+              break;
+            default:
+              assert(0);
+            }
+        
+         correction = 1.0 - correction;   /* NOTE(review): the result is not used again in this function, and correction
+                                             is left unset in the default case when asserts are compiled out */
+     
+         /* Lewis correction */
+         _dlnLdlz[branchIndex]   =  _dlnLdlz[branchIndex] + dlnLdlz - (double)w * d1;
+         _d2lnLdlz2[branchIndex] =  _d2lnLdlz2[branchIndex] + d2lnLdlz2-  (double)w * d2;
+           
+       }  
+      else
+       {
+         _dlnLdlz[branchIndex]   = _dlnLdlz[branchIndex]   + dlnLdlz;
+         _d2lnLdlz2[branchIndex] = _d2lnLdlz2[branchIndex] + d2lnLdlz2;
+       }
+    }
+    else
+    {
+      /* set to 0 to make the reduction operation consistent */
+
+      if(width == 0 && (numBranches > 1))
+      {
+        _dlnLdlz[model]   = 0.0;
+        _d2lnLdlz2[model] = 0.0;
+      }                                    
+    }
+  }
+
+}
+
+
+/* The function below actually implements the iterative Newton-Raphson procedure.
+   It is particularly messy and hard to read because, for the case of per-partition branch length 
+   estimates, it needs to keep track of whether the Newton-Raphson procedure has 
+   converged for each partition individually. 
+
+   The rationale for doing it like this is also provided in:
+
+
+   A. Stamatakis, M. Ott: "Load Balance in the Phylogenetic Likelihood Kernel". Proceedings of ICPP 2009.
+
+*/
+
+static void topLevelMakenewz(pllInstance *tr, partitionList * pr, double *z0, int _maxiter, double *result)
+{
+  double   z[PLL_NUM_BRANCHES], zprev[PLL_NUM_BRANCHES], zstep[PLL_NUM_BRANCHES];
+  volatile double  dlnLdlz[PLL_NUM_BRANCHES], d2lnLdlz2[PLL_NUM_BRANCHES];
+  int i, maxiter[PLL_NUM_BRANCHES], model;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+  pllBoolean firstIteration = PLL_TRUE;
+  pllBoolean outerConverged[PLL_NUM_BRANCHES];
+  pllBoolean loopConverged;
+
+
+  /* figure out if this is on a per partition basis or jointly across all partitions */
+
+
+
+  /* initialize loop convergence variables etc. 
+     maxiter is the maximum number of NR iterations we are going to do before giving up */
+
+  for(i = 0; i < numBranches; i++)
+  {
+    z[i] = z0[i];
+    maxiter[i] = _maxiter;
+    outerConverged[i] = PLL_FALSE;
+    tr->curvatOK[i]       = PLL_TRUE;
+  }
+
+
+  /* nested do while loops of Newton-Raphson */
+
+  do
+  {
+
+    /* check if we ar done for partition i or if we need to adapt the branch length again */
+
+    for(i = 0; i < numBranches; i++)
+    {
+      if(outerConverged[i] == PLL_FALSE && tr->curvatOK[i] == PLL_TRUE)
+      {
+        tr->curvatOK[i] = PLL_FALSE;
+
+        zprev[i] = z[i];
+
+        zstep[i] = (1.0 - PLL_ZMAX) * z[i] + PLL_ZMIN;
+      }
+    }
+
+    for(i = 0; i < numBranches; i++)
+    {
+      /* other case, the outer loop hasn't converged but we are trying to approach 
+         the maximum from the wrong side */
+
+      if(outerConverged[i] == PLL_FALSE && tr->curvatOK[i] == PLL_FALSE)
+      {
+        double lz;
+
+        if (z[i] < PLL_ZMIN) z[i] = PLL_ZMIN;
+        else if (z[i] > PLL_ZMAX) z[i] = PLL_ZMAX;
+        lz    = log(z[i]);
+
+        tr->coreLZ[i] = lz;
+      }
+    }
+
+
+    /* set the execution mask */
+
+    if(numBranches > 1)
+    {
+      for(model = 0; model < pr->numberOfPartitions; model++)
+      {
+        if(pr->partitionData[model]->executeModel)
+          pr->partitionData[model]->executeModel = !tr->curvatOK[model];
+
+      }
+    }
+    else
+    {
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        pr->partitionData[model]->executeModel = !tr->curvatOK[0];
+    }
+
+
+    /* store it in traversal descriptor */
+
+    storeExecuteMaskInTraversalDescriptor(tr, pr);
+
+    /* store the new branch length values to be tested in traversal descriptor */
+
+    storeValuesInTraversalDescriptor(tr, pr, &(tr->coreLZ[0]));
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+    /* if this is the first iteration of NR we will need to first do this one-time call 
+       of maknewzIterative() Note that, only this call requires broadcasting the traversal descriptor,
+       subsequent calls to pllMasterBarrier(PLL_THREAD_MAKENEWZ, tr); will not require this
+       */
+
+    if(firstIteration)
+      {
+        tr->td[0].traversalHasChanged = PLL_TRUE; 
+        pllMasterBarrier (tr, pr, PLL_THREAD_MAKENEWZ_FIRST);
+        firstIteration = PLL_FALSE; 
+        tr->td[0].traversalHasChanged = PLL_FALSE; 
+      }
+    else 
+      pllMasterBarrier(tr, pr, PLL_THREAD_MAKENEWZ);
+    branchLength_parallelReduce(tr, (double*)dlnLdlz, (double*)d2lnLdlz2, numBranches);
+#else 
+    /* sequential part, if this is the first newton-raphson implementation,
+       do the precomputations as well, otherwise just execute the computation
+       of the derivatives */
+    if(firstIteration)
+      {
+        makenewzIterative(tr, pr);
+        firstIteration = PLL_FALSE;
+      }
+    execCore(tr, pr, dlnLdlz, d2lnLdlz2);
+#endif
+
+    /* do a NR step, if we are on the correct side of the maximum that's okay, otherwise 
+       shorten branch */
+
+    for(i = 0; i < numBranches; i++)
+    {
+      if(outerConverged[i] == PLL_FALSE && tr->curvatOK[i] == PLL_FALSE)
+      {
+        if ((d2lnLdlz2[i] >= 0.0) && (z[i] < PLL_ZMAX))
+          zprev[i] = z[i] = 0.37 * z[i] + 0.63;  /*  Bad curvature, shorten branch */
+        else
+          tr->curvatOK[i] = PLL_TRUE;
+      }
+    }
+
+    /* do the standard NR step to obrain the next value, depending on the state for eahc partition */
+
+    for(i = 0; i < numBranches; i++)
+    {
+      if(tr->curvatOK[i] == PLL_TRUE && outerConverged[i] == PLL_FALSE)
+      {
+        if (d2lnLdlz2[i] < 0.0)
+        {
+          double tantmp = -dlnLdlz[i] / d2lnLdlz2[i];
+          if (tantmp < 100)
+          {
+            z[i] *= exp(tantmp);
+            if (z[i] < PLL_ZMIN)
+              z[i] = PLL_ZMIN;
+
+            if (z[i] > 0.25 * zprev[i] + 0.75)
+              z[i] = 0.25 * zprev[i] + 0.75;
+          }
+          else
+            z[i] = 0.25 * zprev[i] + 0.75;
+        }
+        if (z[i] > PLL_ZMAX) z[i] = PLL_ZMAX;
+
+        /* decrement the maximum number of itarations */
+
+        maxiter[i] = maxiter[i] - 1;
+
+        /* check if the outer loop has converged */
+
+        //old code below commented out, integrated new PRELIMINARY BUG FIX !
+        //this needs further work at some point!
+
+        /*
+        if(maxiter[i] > 0 && (PLL_ABS(z[i] - zprev[i]) > zstep[i]))
+          outerConverged[i] = PLL_FALSE;
+        else
+          outerConverged[i] = PLL_TRUE;
+        */
+
+        if((PLL_ABS(z[i] - zprev[i]) > zstep[i]))
+         {
+           /* We should make a more informed decision here,
+              based on the log like improvement */
+
+           if(maxiter[i] < -20)
+            {
+              z[i] = z0[i];
+              outerConverged[i] = PLL_TRUE;
+            }
+           else
+             outerConverged[i] = PLL_FALSE;
+         }
+        else
+          outerConverged[i] = PLL_TRUE;
+      }
+    }
+
+    /* check if the loop has converged for all partitions */
+
+    loopConverged = PLL_TRUE;
+    for(i = 0; i < numBranches; i++)
+      loopConverged = loopConverged && outerConverged[i];
+  }
+  while (!loopConverged);
+
+
+  /* reset  partition execution mask */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    pr->partitionData[model]->executeModel = PLL_TRUE;
+
+  /* copy the new branches in the result array of branches.
+     if we don't do a per partition estimate of 
+     branches this will only set result[0]
+     */
+
+  for(i = 0; i < numBranches; i++)
+    result[i] = z[i];
+}
+
+
+/** @brief Optimize branch length value(s) of a given branch with the Newton-Raphson procedure 
+ *
+ * Stores the node pair and the starting branch length(s) in the first entry of the
+ * traversal descriptor, calls computeTraversal() for whichever end of the branch needs
+ * recomputation, and then runs topLevelMakenewz(). Before returning, the \a executeModel
+ * flag of the first \a numBranches partitions is set back to PLL_TRUE.
+ *
+ * @warning A given branch may have one or several branch length values (up to PLL_NUM_BRANCHES), usually the latter refers to partition-specific branch length values. Thus z0 and result represent collections rather than double values. The number of branch length values is \a pr->numberOfPartitions when \a pr->perGeneBranchLengths is set, and 1 otherwise.
+ *
+ * @param tr
+ *   Library instance
+ *
+ * @param pr
+ *   List of partitions; its \a executeModel flags are modified while the function runs
+ *
+ * @param p
+ *   One node that defines the branch (p->z)
+ *
+ * @param q
+ *   The other node side of the branch (usually p->back), but the branch length can be estimated even if p and q are
+ *   not connected, e.g. before the insertion of a subtree.
+ *
+ * @param z0 
+ *   Initial branch length value(s) for the given branch \a p->z 
+ *
+ * @param maxiter 
+ *   Maximum number of iterations in the Newton-Raphson procedure (passed on unchanged to topLevelMakenewz())
+ *
+ * @param result 
+ *   Resulting branch length value(s) for the given branch \a p->z 
+ *
+ * @param mask 
+ *   Specifies if a mask to track partition convergence (\a tr->partitionConverged) is being used.
+ *   When set, partitions flagged as converged have \a executeModel switched off for this call.
+ *
+ * @sa typical values for \a maxiter are constants \a iterations and \a PLL_NEWZPERCYCLE
+ * @note Requirement: q->z == p->z
+ */
+void makenewzGeneric(pllInstance *tr, partitionList * pr, nodeptr p, nodeptr q, double *z0, int maxiter, double *result, pllBoolean mask)
+{
+  int i;
+  //boolean originalExecute[PLL_NUM_BRANCHES];
+  /* one branch length per partition, or a single joint one */
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  pllBoolean 
+    p_recom = PLL_FALSE, /* if one of them was missing, we will need to force recomputation */
+    q_recom = PLL_FALSE;
+
+  /* the first entry of the traversal descriptor stores the node pair that defines 
+     the branch */
+
+  tr->td[0].ti[0].pNumber = p->number;
+  tr->td[0].ti[0].qNumber = q->number;
+
+  for(i = 0; i < numBranches; i++)
+  {
+    //originalExecute[i] =  pr->partitionData[i]->executeModel;
+    /* starting value(s) of the branch go into the same descriptor entry */
+    tr->td[0].ti[0].qz[i] =  z0[i];
+    if(mask)
+    {
+      /* with a mask, only partitions that have not converged yet are executed */
+      if (tr->partitionConverged[i])
+        pr->partitionData[i]->executeModel = PLL_FALSE;
+      else
+        pr->partitionData[i]->executeModel = PLL_TRUE;
+    }
+  }
+  if (tr->useRecom)
+  {
+    int
+      slot = -1;
+      //count = 0;
+
+    /* Ensure p and q get an unpinnable slot in physical memory.
+       Tips are skipped; for inner nodes the slot reported by getxVector() is
+       recorded in the descriptor and its return value is kept as the
+       "force recomputation" flag. */
+    if(!isTip(q->number, tr->mxtips))
+    {
+      q_recom = getxVector(tr->rvec, q->number, &slot, tr->mxtips);
+      tr->td[0].ti[0].slot_q = slot;
+    }
+    if(!isTip(p->number, tr->mxtips))
+    {
+      p_recom = getxVector(tr->rvec, p->number, &slot, tr->mxtips);
+      tr->td[0].ti[0].slot_p = slot;
+    }
+  }
+
+
+  /* compute the traversal descriptor of the likelihood vectors  that need to be re-computed 
+     first in makenewzIterative */
+
+  /* entry 0 (the branch itself) is already filled in above */
+  tr->td[0].count = 1;
+
+  if(p_recom || needsRecomp(tr->useRecom, tr->rvec, p, tr->mxtips))
+    computeTraversal(tr, p, PLL_TRUE, numBranches);
+
+  if(q_recom || needsRecomp(tr->useRecom, tr->rvec, q, tr->mxtips))
+    computeTraversal(tr, q, PLL_TRUE, numBranches);
+
+  /* call the Newton-Raphson procedure */
+
+  topLevelMakenewz(tr, pr, z0, maxiter, result);
+
+  /* Mark node as unpinnable */
+  if(tr->useRecom)
+  {
+    unpinNode(tr->rvec, p->number, tr->mxtips);
+    unpinNode(tr->rvec, q->number, tr->mxtips);
+  }
+
+  /* restore executeModel; this seems to be a bit redundant with topLevelMakenewz.
+     Note that only the first numBranches partitions are touched here. */ 
+
+  for(i = 0; i < numBranches; i++)
+    pr->partitionData[i]->executeModel = PLL_TRUE;
+}
+
+
+/* below are, once again, the optimized functions */
+
+#if (defined(__SSE3) || defined(__AVX))
+
+
+/* For each of the n sites, multiplies two 2-double vectors element by element
+ * and writes the product to sum[2 * site .. 2 * site + 1].
+ *
+ * Where the two operands come from depends on tipCase:
+ *   PLL_TIP_TIP     - both are rows of tipVector, selected by tipX1[site] / tipX2[site]
+ *   PLL_TIP_INNER   - left is a tipVector row, right is x2_start[2 * site ..]
+ *   PLL_INNER_INNER - left is x1_start[2 * site ..], right is x2_start[2 * site ..]
+ * Any other tipCase trips assert(0).
+ *
+ * The intrinsic path uses _mm_load_pd/_mm_store_pd, which require 16-byte
+ * aligned addresses.
+ */
+static void sumCAT_BINARY(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+                          unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  double *lhs, *rhs, *out = sum;
+  int site;
+
+  /* one 2-wide multiply; the scalar form is only compiled when neither SIMD macro is defined */
+#if (!defined(__SSE3) && !defined(__AVX))
+#define SUMCAT_BINARY_MUL2(dst, a, b) \
+  do { (dst)[0] = (a)[0] * (b)[0]; (dst)[1] = (a)[1] * (b)[1]; } while (0)
+#else
+#define SUMCAT_BINARY_MUL2(dst, a, b) \
+  _mm_store_pd((dst), _mm_mul_pd(_mm_load_pd(a), _mm_load_pd(b)))
+#endif
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for (site = 0; site < n; site++, out += 2)
+        {
+          lhs = tipVector + 2 * tipX1[site];
+          rhs = tipVector + 2 * tipX2[site];
+          SUMCAT_BINARY_MUL2(out, lhs, rhs);
+        }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0; site < n; site++, out += 2)
+        {
+          lhs = tipVector + 2 * tipX1[site];
+          rhs = x2_start + 2 * site;
+          SUMCAT_BINARY_MUL2(out, lhs, rhs);
+        }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0; site < n; site++, out += 2)
+        {
+          lhs = x1_start + 2 * site;
+          rhs = x2_start + 2 * site;
+          SUMCAT_BINARY_MUL2(out, lhs, rhs);
+        }
+      break;
+    default:
+      assert(0);
+    }
+
+#undef SUMCAT_BINARY_MUL2
+}
+
+
+/* Gap-aware per-site product of two 4-double vectors, written to sum[4 * site ..].
+ *
+ * Inner vectors are stored compactly: a site for which isGap() is true on the
+ * corresponding bit vector takes its operand from the shared gap column and does
+ * not consume an entry of x1_start / x2_start; every other site consumes the
+ * next 4 doubles. Tip operands are rows of tipVector (stride 4) selected by
+ * tipX1[site] / tipX2[site]. Any unknown tipCase trips assert(0).
+ *
+ * _mm_load_pd/_mm_store_pd require 16-byte aligned addresses.
+ */
+static void sumCAT_SAVE(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double
+    *lhs,
+    *rhs,
+    *out    = sum,
+    *nextX1 = x1_start,   /* next unread compact entry on the left  */
+    *nextX2 = x2_start;   /* next unread compact entry on the right */
+  int site;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (site = 0; site < n; site++, out += 4)
+      {
+        lhs = tipVector + 4 * tipX1[site];
+        rhs = tipVector + 4 * tipX2[site];
+
+        _mm_store_pd(out,     _mm_mul_pd(_mm_load_pd(lhs),     _mm_load_pd(rhs)));
+        _mm_store_pd(out + 2, _mm_mul_pd(_mm_load_pd(lhs + 2), _mm_load_pd(rhs + 2)));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0; site < n; site++, out += 4)
+      {
+        lhs = tipVector + 4 * tipX1[site];
+
+        if(isGap(x2_gap, site))
+          rhs = x2_gapColumn;
+        else
+        {
+          rhs = nextX2;
+          nextX2 += 4;
+        }
+
+        _mm_store_pd(out,     _mm_mul_pd(_mm_load_pd(lhs),     _mm_load_pd(rhs)));
+        _mm_store_pd(out + 2, _mm_mul_pd(_mm_load_pd(lhs + 2), _mm_load_pd(rhs + 2)));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0; site < n; site++, out += 4)
+      {
+        if(isGap(x1_gap, site))
+          lhs = x1_gapColumn;
+        else
+        {
+          lhs = nextX1;
+          nextX1 += 4;
+        }
+
+        if(isGap(x2_gap, site))
+          rhs = x2_gapColumn;
+        else
+        {
+          rhs = nextX2;
+          nextX2 += 4;
+        }
+
+        _mm_store_pd(out,     _mm_mul_pd(_mm_load_pd(lhs),     _mm_load_pd(rhs)));
+        _mm_store_pd(out + 2, _mm_mul_pd(_mm_load_pd(lhs + 2), _mm_load_pd(rhs + 2)));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+/* Fills sumtable with 8 doubles per site: four consecutive pairs, each the
+ * element-wise product of a 2-double left operand and a 2-double right operand.
+ *
+ *   PLL_TIP_TIP     - the same tipVector rows (stride 2) are used for all four pairs
+ *   PLL_TIP_INNER   - left is a tipVector row, right is pair j of x2_start[8 * site ..]
+ *   PLL_INNER_INNER - left/right are pair j of x1_start / x2_start [8 * site ..]
+ * Any other tipCase trips assert(0).
+ *
+ * NOTE(review): the scalar/intrinsic choice below is keyed on the threading
+ * macros (_USE_PTHREADS / _FINE_GRAIN_MPI), whereas sumCAT_BINARY keys it on
+ * __SSE3 / __AVX - confirm this is intended. The guards are kept as they were.
+ */
+static void sumGAMMA_BINARY(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+                            unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  double *lhs, *rhs, *out;
+  int site, pair;
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+  int s;
+#endif
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for (site = 0; site < n; site++)
+        {
+          lhs = tipVector + 2 * tipX1[site];
+          rhs = tipVector + 2 * tipX2[site];
+          out = sumtable + site * 8;
+
+          /* only the output advances: both tip rows are reused for every pair */
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+          for (pair = 0; pair < 4; pair++, out += 2)
+            for (s = 0; s < 2; s++)
+              out[s] = lhs[s] * rhs[s];
+#else
+          for (pair = 0; pair < 4; pair++, out += 2)
+            _mm_store_pd(out, _mm_mul_pd(_mm_load_pd(lhs), _mm_load_pd(rhs)));
+#endif
+        }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0; site < n; site++)
+        {
+          lhs = tipVector + 2 * tipX1[site];
+          rhs = x2_start + 8 * site;
+          out = sumtable + 8 * site;
+
+          /* the inner operand advances together with the output */
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+          for (pair = 0; pair < 4; pair++, out += 2, rhs += 2)
+            for (s = 0; s < 2; s++)
+              out[s] = lhs[s] * rhs[s];
+#else
+          for (pair = 0; pair < 4; pair++, out += 2, rhs += 2)
+            _mm_store_pd(out, _mm_mul_pd(_mm_load_pd(lhs), _mm_load_pd(rhs)));
+#endif
+        }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0; site < n; site++)
+        {
+          lhs = x1_start + 8 * site;
+          rhs = x2_start + 8 * site;
+          out = sumtable + 8 * site;
+
+#if (!defined(_USE_PTHREADS) && !defined(_FINE_GRAIN_MPI))
+          for (pair = 0; pair < 4; pair++, out += 2, lhs += 2, rhs += 2)
+            for (s = 0; s < 2; s++)
+              out[s] = lhs[s] * rhs[s];
+#else
+          for (pair = 0; pair < 4; pair++, out += 2, lhs += 2, rhs += 2)
+            _mm_store_pd(out, _mm_mul_pd(_mm_load_pd(lhs), _mm_load_pd(rhs)));
+#endif
+        }
+      break;
+    default:
+      assert(0);
+    }
+}
+
+
+/* Gap-aware variant that fills sumtable with 16 doubles per site, laid out as
+ * four rows of four. Each row is the element-wise product of a left and a
+ * right 4-double operand.
+ *
+ * Tip operands are rows of tipVector (stride 4) and are reused for all four
+ * output rows. Inner operands are 16 doubles per stored site; a site whose bit
+ * is set in x1_gap / x2_gap (tested through mask32) reads the shared gap column
+ * instead and does not advance the compact x1_start / x2_start cursor.
+ * Any unknown tipCase trips assert(0).
+ *
+ * _mm_load_pd/_mm_store_pd require 16-byte aligned addresses.
+ */
+static void sumGAMMA_GAPPED_SAVE(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  double
+    *lhs,
+    *rhs,
+    *outRow,
+    *nextX1 = x1_start,
+    *nextX2 = x2_start;
+
+  int site, row, col;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (site = 0; site < n; site++)
+      {
+        lhs    = tipVector + 4 * tipX1[site];
+        rhs    = tipVector + 4 * tipX2[site];
+        outRow = sumtable + site * 16;
+
+        for (row = 0; row < 4; row++, outRow += 4)
+          for (col = 0; col < 4; col += 2)
+            _mm_store_pd(outRow + col, _mm_mul_pd(_mm_load_pd(lhs + col), _mm_load_pd(rhs + col)));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0; site < n; site++)
+      {
+        lhs = tipVector + 4 * tipX1[site];
+
+        if(x2_gap[site / 32] & mask32[site % 32])
+          rhs = x2_gapColumn;
+        else
+        {
+          rhs = nextX2;
+          nextX2 += 16;
+        }
+
+        outRow = sumtable + 16 * site;
+
+        for (row = 0; row < 4; row++, outRow += 4, rhs += 4)
+          for (col = 0; col < 4; col += 2)
+            _mm_store_pd(outRow + col, _mm_mul_pd(_mm_load_pd(lhs + col), _mm_load_pd(rhs + col)));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0; site < n; site++)
+      {
+        if(x1_gap[site / 32] & mask32[site % 32])
+          lhs = x1_gapColumn;
+        else
+        {
+          lhs = nextX1;
+          nextX1 += 16;
+        }
+
+        if(x2_gap[site / 32] & mask32[site % 32])
+          rhs = x2_gapColumn;
+        else
+        {
+          rhs = nextX2;
+          nextX2 += 16;
+        }
+
+        outRow = sumtable + 16 * site;
+
+        for (row = 0; row < 4; row++, outRow += 4, lhs += 4, rhs += 4)
+          for (col = 0; col < 4; col += 2)
+            _mm_store_pd(outRow + col, _mm_mul_pd(_mm_load_pd(lhs + col), _mm_load_pd(rhs + col)));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+
+
+/* Fills sumtable with 16 doubles per site (four rows of four), each row being
+ * the element-wise product of a left and a right 4-double operand.
+ *
+ *   PLL_TIP_TIP     - both operands are tipVector rows (stride 4), reused for every row
+ *   PLL_TIP_INNER   - left is a tipVector row, right is x2_start[16 * site ..]
+ *   PLL_INNER_INNER - left/right are x1_start / x2_start [16 * site ..]
+ * Any other tipCase trips assert(0).
+ *
+ * The 16 outputs are produced by one flat loop over even offsets 0, 2, ..., 14.
+ * For a 4-entry tip row the matching offset is (off & 3), i.e. 0, 2, 0, 2, ...
+ *
+ * _mm_load_pd/_mm_store_pd require 16-byte aligned addresses.
+ */
+static void sumGAMMA(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  double *lhs, *rhs, *out;
+  int site, off;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (site = 0; site < n; site++)
+      {
+        lhs = tipVector + 4 * tipX1[site];
+        rhs = tipVector + 4 * tipX2[site];
+        out = sumtable + site * 16;
+
+        for (off = 0; off < 16; off += 2)
+          _mm_store_pd(out + off,
+                       _mm_mul_pd(_mm_load_pd(lhs + (off & 3)), _mm_load_pd(rhs + (off & 3))));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0; site < n; site++)
+      {
+        lhs = tipVector + 4 * tipX1[site];
+        rhs = x2_start + 16 * site;
+        out = sumtable + 16 * site;
+
+        for (off = 0; off < 16; off += 2)
+          _mm_store_pd(out + off,
+                       _mm_mul_pd(_mm_load_pd(lhs + (off & 3)), _mm_load_pd(rhs + off)));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0; site < n; site++)
+      {
+        lhs = x1_start + 16 * site;
+        rhs = x2_start + 16 * site;
+        out = sumtable + 16 * site;
+
+        for (off = 0; off < 16; off += 2)
+          _mm_store_pd(out + off,
+                       _mm_mul_pd(_mm_load_pd(lhs + off), _mm_load_pd(rhs + off)));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+/* Per-site element-wise product of two 4-double vectors, written to
+ * sum[4 * site .. 4 * site + 3].
+ *
+ *   PLL_TIP_TIP     - both operands are tipVector rows (stride 4) selected by tipX1 / tipX2
+ *   PLL_TIP_INNER   - left is a tipVector row, right is x2_start[4 * site ..]
+ *   PLL_INNER_INNER - left/right are x1_start / x2_start [4 * site ..]
+ * Any other tipCase trips assert(0).
+ *
+ * _mm_load_pd/_mm_store_pd require 16-byte aligned addresses.
+ */
+static void sumCAT(int tipCase, double *sum, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  double *lhs, *rhs, *out;
+  int site;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (site = 0, out = sum; site < n; site++, out += 4)
+      {
+        lhs = tipVector + 4 * tipX1[site];
+        rhs = tipVector + 4 * tipX2[site];
+
+        _mm_store_pd(out,     _mm_mul_pd(_mm_load_pd(lhs),     _mm_load_pd(rhs)));
+        _mm_store_pd(out + 2, _mm_mul_pd(_mm_load_pd(lhs + 2), _mm_load_pd(rhs + 2)));
+      }
+      break;
+    case PLL_TIP_INNER:
+      /* the inner operand is walked with a running pointer */
+      for (site = 0, out = sum, rhs = x2_start; site < n; site++, out += 4, rhs += 4)
+      {
+        lhs = tipVector + 4 * tipX1[site];
+
+        _mm_store_pd(out,     _mm_mul_pd(_mm_load_pd(lhs),     _mm_load_pd(rhs)));
+        _mm_store_pd(out + 2, _mm_mul_pd(_mm_load_pd(lhs + 2), _mm_load_pd(rhs + 2)));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0, out = sum, lhs = x1_start, rhs = x2_start;
+           site < n;
+           site++, out += 4, lhs += 4, rhs += 4)
+      {
+        _mm_store_pd(out,     _mm_mul_pd(_mm_load_pd(lhs),     _mm_load_pd(rhs)));
+        _mm_store_pd(out + 2, _mm_mul_pd(_mm_load_pd(lhs + 2), _mm_load_pd(rhs + 2)));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+/* Gap-aware variant that fills sumtable with 80 doubles per site, laid out as
+ * four blocks of 20. Each block is the element-wise product of a left and a
+ * right 20-double operand.
+ *
+ * Tip operands are rows of tipVector (stride 20) and are reused for all four
+ * blocks. Inner operands are 80 doubles per stored site; a site whose bit is
+ * set in x1_gap / x2_gap (tested through mask32) reads the shared gap column
+ * instead and does not advance the compact x1 / x2 cursor.
+ * Any unknown tipCase trips assert(0).
+ *
+ * _mm_load_pd/_mm_store_pd require 16-byte aligned addresses.
+ */
+static void sumGAMMAPROT_GAPPED_SAVE(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  int site, block, s;
+  double
+    *lhs,
+    *rhs,
+    *out,
+    *nextX1 = x1,
+    *nextX2 = x2;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (site = 0; site < n; site++)
+      {
+        lhs = tipVector + 20 * tipX1[site];
+        rhs = tipVector + 20 * tipX2[site];
+        out = sumtable + site * 80;
+
+        for (block = 0; block < 4; block++, out += 20)
+          for (s = 0; s < 20; s += 2)
+            _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0; site < n; site++)
+      {
+        lhs = tipVector + 20 * tipX1[site];
+
+        if(x2_gap[site / 32] & mask32[site % 32])
+          rhs = x2_gapColumn;
+        else
+        {
+          rhs = nextX2;
+          nextX2 += 80;
+        }
+
+        out = sumtable + site * 80;
+
+        for (block = 0; block < 4; block++, out += 20, rhs += 20)
+          for (s = 0; s < 20; s += 2)
+            _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0; site < n; site++)
+      {
+        if(x1_gap[site / 32] & mask32[site % 32])
+          lhs = x1_gapColumn;
+        else
+        {
+          lhs = nextX1;
+          nextX1 += 80;
+        }
+
+        if(x2_gap[site / 32] & mask32[site % 32])
+          rhs = x2_gapColumn;
+        else
+        {
+          rhs = nextX2;
+          nextX2 += 80;
+        }
+
+        out = sumtable + site * 80;
+
+        for (block = 0; block < 4; block++, out += 20, lhs += 20, rhs += 20)
+          for (s = 0; s < 20; s += 2)
+            _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+/* Fills sumtable with 80 doubles per site (four blocks of 20), each block being
+ * the element-wise product of a left and a right 20-double operand.
+ *
+ * Unlike sumGAMMAPROT, there is one tip table per block: block l reads its tip
+ * rows from tipVector[l] (stride 20).
+ *
+ *   PLL_TIP_TIP     - both operands are rows of tipVector[l]
+ *   PLL_TIP_INNER   - left is a row of tipVector[l], right is x2[80 * site + 20 * l ..]
+ *   PLL_INNER_INNER - left/right are x1 / x2 [80 * site + 20 * l ..]
+ * Any other tipCase trips assert(0).
+ *
+ * With __SSE3 defined the product is done two doubles at a time through
+ * _mm_load_pd/_mm_store_pd (16-byte aligned addresses required); otherwise a
+ * plain scalar loop is used.
+ */
+static void sumGAMMAPROT_LG4(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector[4],
+                             unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  int site, block, s;
+  double *lhs, *rhs, *out;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for (site = 0; site < n; site++)
+        {
+          out = sumtable + site * 80;
+
+          for (block = 0; block < 4; block++, out += 20)
+            {
+              lhs = tipVector[block] + 20 * tipX1[site];
+              rhs = tipVector[block] + 20 * tipX2[site];
+#ifdef __SSE3
+              for (s = 0; s < 20; s += 2)
+                _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+#else
+              for (s = 0; s < 20; s++)
+                out[s] = lhs[s] * rhs[s];
+#endif
+            }
+        }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0; site < n; site++)
+        {
+          rhs = x2 + 80 * site;
+          out = sumtable + site * 80;
+
+          for (block = 0; block < 4; block++, out += 20, rhs += 20)
+            {
+              lhs = tipVector[block] + 20 * tipX1[site];
+#ifdef __SSE3
+              for (s = 0; s < 20; s += 2)
+                _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+#else
+              for (s = 0; s < 20; s++)
+                out[s] = lhs[s] * rhs[s];
+#endif
+            }
+        }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0; site < n; site++)
+        {
+          lhs = x1 + 80 * site;
+          rhs = x2 + 80 * site;
+          out = sumtable + site * 80;
+
+          for (block = 0; block < 4; block++, out += 20, lhs += 20, rhs += 20)
+            {
+#ifdef __SSE3
+              for (s = 0; s < 20; s += 2)
+                _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+#else
+              for (s = 0; s < 20; s++)
+                out[s] = lhs[s] * rhs[s];
+#endif
+            }
+        }
+      break;
+    default:
+      assert(0);
+    }
+}
+
+
+/* Fills sumtable with 80 doubles per site (four blocks of 20), each block being
+ * the element-wise product of a left and a right 20-double operand.
+ *
+ *   PLL_TIP_TIP     - both operands are tipVector rows (stride 20), reused for every block
+ *   PLL_TIP_INNER   - left is a tipVector row, right is x2[80 * site + 20 * block ..]
+ *   PLL_INNER_INNER - left/right are x1 / x2 [80 * site + 20 * block ..]
+ * Any other tipCase trips assert(0).
+ *
+ * _mm_load_pd/_mm_store_pd require 16-byte aligned addresses.
+ */
+static void sumGAMMAPROT(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  int site, block, s;
+  double *lhs, *rhs, *out;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (site = 0; site < n; site++)
+      {
+        lhs = tipVector + 20 * tipX1[site];
+        rhs = tipVector + 20 * tipX2[site];
+        out = sumtable + site * 80;
+
+        for (block = 0; block < 4; block++, out += 20)
+          for (s = 0; s < 20; s += 2)
+            _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0; site < n; site++)
+      {
+        lhs = tipVector + 20 * tipX1[site];
+        rhs = x2 + 80 * site;
+        out = sumtable + site * 80;
+
+        for (block = 0; block < 4; block++, out += 20, rhs += 20)
+          for (s = 0; s < 20; s += 2)
+            _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0; site < n; site++)
+      {
+        lhs = x1 + 80 * site;
+        rhs = x2 + 80 * site;
+        out = sumtable + site * 80;
+
+        for (block = 0; block < 4; block++, out += 20, lhs += 20, rhs += 20)
+          for (s = 0; s < 20; s += 2)
+            _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+/* Per-site element-wise product of two 20-double vectors, written to
+ * sumtable[20 * site .. 20 * site + 19].
+ *
+ *   PLL_TIP_TIP     - both operands are tipVector rows (stride 20) selected by tipX1 / tipX2
+ *   PLL_TIP_INNER   - left is a tipVector row, right is x2[20 * site ..]
+ *   PLL_INNER_INNER - left/right are x1 / x2 [20 * site ..]
+ * Any other tipCase trips assert(0).
+ *
+ * _mm_load_pd/_mm_store_pd require 16-byte aligned addresses.
+ */
+static void sumGTRCATPROT(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+  int site, s;
+  double *out, *lhs, *rhs;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (site = 0, out = sumtable; site < n; site++, out += 20)
+      {
+        lhs = tipVector + 20 * tipX1[site];
+        rhs = tipVector + 20 * tipX2[site];
+
+        for (s = 0; s < 20; s += 2)
+          _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0, out = sumtable, rhs = x2; site < n; site++, out += 20, rhs += 20)
+      {
+        lhs = tipVector + 20 * tipX1[site];
+
+        for (s = 0; s < 20; s += 2)
+          _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0, out = sumtable, lhs = x1, rhs = x2;
+           site < n;
+           site++, out += 20, lhs += 20, rhs += 20)
+      {
+        for (s = 0; s < 20; s += 2)
+          _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+/* Gap-aware per-site product of two 20-double vectors, written to
+ * sumtable[20 * site ..].
+ *
+ * Inner vectors are stored compactly: a site for which isGap() is true on the
+ * corresponding bit vector takes its operand from the shared gap column and does
+ * not consume an entry of x1 / x2; every other site consumes the next 20
+ * doubles. Tip operands are rows of tipVector (stride 20) selected by
+ * tipX1[site] / tipX2[site]. Any unknown tipCase trips assert(0).
+ *
+ * _mm_load_pd/_mm_store_pd require 16-byte aligned addresses.
+ */
+static void sumGTRCATPROT_SAVE(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n, 
+    double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap)
+{
+  int site, s;
+
+  double
+    *out = sumtable,
+    *lhs,
+    *rhs,
+    *nextX1 = x1,   /* next unread compact entry on the left  */
+    *nextX2 = x2;   /* next unread compact entry on the right */
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      for (site = 0; site < n; site++, out += 20)
+      {
+        lhs = tipVector + 20 * tipX1[site];
+        rhs = tipVector + 20 * tipX2[site];
+
+        for (s = 0; s < 20; s += 2)
+          _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    case PLL_TIP_INNER:
+      for (site = 0; site < n; site++, out += 20)
+      {
+        lhs = tipVector + 20 * tipX1[site];
+
+        if(isGap(x2_gap, site))
+          rhs = x2_gapColumn;
+        else
+        {
+          rhs = nextX2;
+          nextX2 += 20;
+        }
+
+        for (s = 0; s < 20; s += 2)
+          _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (site = 0; site < n; site++, out += 20)
+      {
+        if(isGap(x1_gap, site))
+          lhs = x1_gapColumn;
+        else
+        {
+          lhs = nextX1;
+          nextX1 += 20;
+        }
+
+        if(isGap(x2_gap, site))
+          rhs = x2_gapColumn;
+        else
+        {
+          rhs = nextX2;
+          nextX2 += 20;
+        }
+
+        for (s = 0; s < 20; s += 2)
+          _mm_store_pd(out + s, _mm_mul_pd(_mm_load_pd(lhs + s), _mm_load_pd(rhs + s)));
+      }
+      break;
+    default:
+      assert(0);
+  }
+}
+
+/* Accumulates, over the first `upper` sites of sumtable (16 doubles per site),
+ * the two weighted sums that the code names dlnLdlz and d2lnLdlz2, and writes
+ * them to *ext_dlnLdlz and *ext_d2lnLdlz2.
+ *
+ * upper          number of sites to process
+ * sumtable       16 doubles per site, read as four rows of four
+ * ext_dlnLdlz    receives the accumulated first-derivative term
+ * ext_d2lnLdlz2  receives the accumulated second-derivative term
+ * EIGN           entries 1..3 are read (entry 0 is not used)
+ * gammaRates     four rate values, one per row
+ * lz             value multiplied into the exponent of the first table
+ * wrptr          per-site integer weight
+ *
+ * sumtable rows are read with _mm_load_pd, which requires 16-byte aligned
+ * addresses.
+ */
+static void coreGTRGAMMA(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wrptr)
+{
+  double 
+    dlnLdlz = 0.0,
+            d2lnLdlz2 = 0.0,
+            ki, 
+            kisqr,  
+            inv_Li, 
+            dlnLidlz, 
+            d2lnLidlz2,  
+		*sum;
+	PLL_ALIGN_BEGIN double
+            diagptable0[16] PLL_ALIGN_END,
+            diagptable1[16] PLL_ALIGN_END,
+            diagptable2[16] PLL_ALIGN_END;
+
+  int     
+    i, 
+    j, 
+    l;
+
+  /* Build three 4x4 tables, one row per gamma rate ki:
+       table0[l] = exp(EIGN[l] * ki * lz)
+       table1[l] = EIGN[l] * ki
+       table2[l] = (EIGN[l] * ki)^2
+     Column 0 is fixed to 1, 0, 0 instead of being computed from EIGN[0]. */
+  for(i = 0; i < 4; i++)
+  {
+    ki = gammaRates[i];
+    kisqr = ki * ki;
+
+    diagptable0[i * 4] = 1.0;
+    diagptable1[i * 4] = 0.0;
+    diagptable2[i * 4] = 0.0;
+
+    for(l = 1; l < 4; l++)
+    {
+      diagptable0[i * 4 + l] = exp(EIGN[l] * ki * lz);
+      diagptable1[i * 4 + l] = EIGN[l] * ki;
+      diagptable2[i * 4 + l] = EIGN[l] * EIGN[l] * kisqr;
+    }
+  }
+
+  for (i = 0; i < upper; i++)
+  { 
+    /* two-lane partial sums over the 16 entries of this site */
+    __m128d a0 = _mm_setzero_pd();
+    __m128d a1 = _mm_setzero_pd();
+    __m128d a2 = _mm_setzero_pd();
+
+    sum = &sumtable[i * 16];         
+
+    for(j = 0; j < 4; j++)
+    {                   
+      double       
+        *d0 = &diagptable0[j * 4],
+        *d1 = &diagptable1[j * 4],
+        *d2 = &diagptable2[j * 4];
+
+      for(l = 0; l < 4; l+=2)
+      {
+        /* tmpv = table0 * sum; a1/a2 additionally weight it by table1/table2 */
+        __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 4 + l]));
+        a0 = _mm_add_pd(a0, tmpv);
+        a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
+        a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+      }           
+    }
+
+    /* horizontal add folds the two lanes; the low lane then holds the total */
+    a0 = _mm_hadd_pd(a0, a0);
+    a1 = _mm_hadd_pd(a1, a1);
+    a2 = _mm_hadd_pd(a2, a2);
+
+    _mm_storel_pd(&inv_Li, a0);     
+    _mm_storel_pd(&dlnLidlz, a1);
+    _mm_storel_pd(&d2lnLidlz2, a2); 
+
+    /* reciprocal of the absolute value of the a0 total; no guard against zero */
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;     
+
+    /* weighted accumulation: first term as is, second term minus the squared first term */
+    dlnLdlz   += wrptr[i] * dlnLidlz;
+    d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2; 
+}
+
+/* Accumulates, over the first `upper` sites of `sum` (2 doubles per site), the
+ * two weighted sums that the code names dlnLdlz and d2lnLdlz2, and writes them
+ * to *d1 and *d2.
+ *
+ * upper               number of sites to process
+ * numberOfCategories  number of entries read from rptr
+ * sum                 2 doubles per site
+ * d1, d2              receive the accumulated first / second derivative term
+ * rptr                per-category rate
+ * EIGN                only EIGN[0] is read
+ * cptr                per-site category index into rptr
+ * lz                  value multiplied into the exponent
+ * wgt                 per-site integer weight
+ *
+ * A temporary table of exp(EIGN[0] * lz * rptr[c]) is allocated with
+ * rax_malloc() and released before returning. The allocation result is now
+ * asserted on, so an out-of-memory condition is reported at the allocation
+ * site (in builds with assertions enabled) rather than surfacing as a NULL
+ * dereference in the fill loop.
+ */
+static void coreGTRCAT_BINARY(int upper, int numberOfCategories, double *sum,
+                              volatile double *d1, volatile double *d2, 
+                              double *rptr, double *EIGN, int *cptr, double lz, int *wgt)
+{
+  int i;
+  double
+    *d, *d_start = NULL,
+    tmp_0, inv_Li, dlnLidlz, d2lnLidlz2,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0;
+  double e[2];
+  double dd1;
+
+  e[0] = EIGN[0];
+  e[1] = EIGN[0] * EIGN[0];
+
+
+  d = d_start = (double *)rax_malloc(numberOfCategories * sizeof(double));
+
+  /* a NULL result is only acceptable when there is nothing to store */
+  assert(d_start != NULL || numberOfCategories <= 0);
+
+  dd1 = e[0] * lz;
+
+  /* one exponential per category, looked up per site below */
+  for(i = 0; i < numberOfCategories; i++)
+    d[i] = exp(dd1 * rptr[i]);
+
+  for (i = 0; i < upper; i++)
+    {
+      double
+        r = rptr[cptr[i]],
+        wr1 = r * wgt[i],
+        wr2 = r * r * wgt[i];
+      
+      d = &d_start[cptr[i]];
+
+      /* site total = sum[2i] + exp-term * sum[2i + 1]; tmp_0 keeps the second summand */
+      inv_Li = sum[2 * i];
+      inv_Li += (tmp_0 = d[0] * sum[2 * i + 1]);
+
+      /* reciprocal of the absolute site total; no guard against zero */
+      inv_Li = 1.0/fabs(inv_Li);
+
+      dlnLidlz   = tmp_0 * e[0];
+      d2lnLidlz2 = tmp_0 * e[1];
+
+      dlnLidlz   *= inv_Li;
+      d2lnLidlz2 *= inv_Li;
+
+      /* weights carry the rate once for the first term and squared for the second */
+      dlnLdlz   += wr1 * dlnLidlz;
+      d2lnLdlz2 += wr2 * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+    }
+
+  *d1 = dlnLdlz;
+  *d2 = d2lnLdlz2;
+
+  rax_free(d_start);
+}
+
+
+/* Accumulates, over the first `upper` sites of `sum` (4 doubles per site), the
+ * two weighted sums that the code names dlnLdlz and d2lnLdlz2, and writes them
+ * to *d1 and *d2.
+ *
+ * upper               number of sites to process
+ * numberOfCategories  number of entries read from rptr
+ * sum                 4 doubles per site, read with _mm_load_pd (16-byte aligned)
+ * d1, d2              receive the accumulated first / second derivative term
+ * wgt                 per-site integer weight
+ * rptr                per-category rate
+ * EIGN                entries 1..3 are read (entry 0 is not used)
+ * cptr                per-site category index into rptr
+ * lz                  value multiplied into the exponents
+ *
+ * A temporary table of 4 doubles per category is obtained from
+ * rax_posix_memalign() and released before returning. The buffer pointer is
+ * now asserted on after the call, so a failed allocation is reported there
+ * (in builds with assertions enabled) rather than surfacing as a NULL
+ * dereference in the fill loop. The per-site rate lookup is also read once
+ * into a local; the order of the floating-point operations is unchanged.
+ */
+static void coreGTRCAT(int upper, int numberOfCategories, double *sum,
+    volatile double *d1, volatile double *d2, int *wgt,
+    double *rptr, double *EIGN, int *cptr, double lz)
+{
+  int i;
+  double
+    *d, *d_start = NULL,
+    inv_Li, dlnLidlz, d2lnLidlz2,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0;
+
+  PLL_ALIGN_BEGIN double e1[4] PLL_ALIGN_END;
+  PLL_ALIGN_BEGIN double e2[4] PLL_ALIGN_END;
+  double dd1, dd2, dd3;
+
+  __m128d
+    e1v[2],
+    e2v[2];
+
+  /* e1 = (0, EIGN[1..3]), e2 = element-wise square of e1 */
+  e1[0] = 0.0;
+  e2[0] = 0.0;
+  e1[1] = EIGN[1];
+  e2[1] = EIGN[1] * EIGN[1];
+  e1[2] = EIGN[2];
+  e2[2] = EIGN[2] * EIGN[2];
+  e1[3] = EIGN[3];
+  e2[3] = EIGN[3] * EIGN[3];
+
+  e1v[0]= _mm_load_pd(&e1[0]);
+  e1v[1]= _mm_load_pd(&e1[2]);
+
+  e2v[0]= _mm_load_pd(&e2[0]);
+  e2v[1]= _mm_load_pd(&e2[2]);
+
+  rax_posix_memalign ((void **) &d_start, PLL_BYTE_ALIGNMENT, numberOfCategories * 4 * sizeof(double));
+
+  /* d_start was initialised to NULL; a NULL result is only acceptable when
+     there is nothing to store */
+  assert(d_start != NULL || numberOfCategories <= 0);
+
+  d = d_start;
+
+  dd1 = EIGN[1] * lz;
+  dd2 = EIGN[2] * lz;
+  dd3 = EIGN[3] * lz;
+
+  /* per category: (1, exp(EIGN[1]*lz*r), exp(EIGN[2]*lz*r), exp(EIGN[3]*lz*r)) */
+  for(i = 0; i < numberOfCategories; i++)
+  {
+    d[i * 4 + 0] = 1.0;
+    d[i * 4 + 1] = exp(dd1 * rptr[i]);
+    d[i * 4 + 2] = exp(dd2 * rptr[i]);
+    d[i * 4 + 3] = exp(dd3 * rptr[i]);
+  }
+
+  for (i = 0; i < upper; i++)
+  {
+    const double r = rptr[cptr[i]];   /* rate of this site's category */
+    double *s = &sum[4 * i];
+    d = &d_start[4 * cptr[i]];  
+
+    __m128d tmp_0v =_mm_mul_pd(_mm_load_pd(&d[0]),_mm_load_pd(&s[0]));
+    __m128d tmp_1v =_mm_mul_pd(_mm_load_pd(&d[2]),_mm_load_pd(&s[2]));
+
+    __m128d inv_Liv    = _mm_add_pd(tmp_0v, tmp_1v);      
+
+    __m128d dlnLidlzv   = _mm_add_pd(_mm_mul_pd(tmp_0v, e1v[0]), _mm_mul_pd(tmp_1v, e1v[1]));     
+    __m128d d2lnLidlz2v = _mm_add_pd(_mm_mul_pd(tmp_0v, e2v[0]), _mm_mul_pd(tmp_1v, e2v[1]));
+
+
+    /* horizontal add folds the two lanes; the low lane then holds the total */
+    inv_Liv   = _mm_hadd_pd(inv_Liv, inv_Liv);
+    dlnLidlzv = _mm_hadd_pd(dlnLidlzv, dlnLidlzv);
+    d2lnLidlz2v = _mm_hadd_pd(d2lnLidlz2v, d2lnLidlz2v);                 
+
+    _mm_storel_pd(&inv_Li, inv_Liv);     
+    _mm_storel_pd(&dlnLidlz, dlnLidlzv);                 
+    _mm_storel_pd(&d2lnLidlz2, d2lnLidlz2v);      
+
+    /* reciprocal of the absolute site total; no guard against zero */
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+
+    /* weights carry the rate once for the first term and squared for the second */
+    dlnLdlz  += wgt[i] * r * dlnLidlz;
+    d2lnLdlz2 += wgt[i] * r * r * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  *d1 = dlnLdlz;
+  *d2 = d2lnLdlz2;
+
+  rax_free(d_start);
+}
+/* Scalar coreGTRGAMMA_BINARY: compiled only when neither __SSE3 nor __AVX is defined. */
+#if (!defined(__SSE3) && !defined(__AVX))
+static void coreGTRGAMMA_BINARY(const int upper, double *sumtable,
+                                volatile double *d1,   volatile double *d2, double *EIGN, double *gammaRates, double lz, int *wrptr)
+{
+  int i, j;
+  double
+    *diagptable, *diagptable_start, *sum,
+    tmp_1, inv_Li, dlnLidlz, d2lnLidlz2, ki, kisqr,
+    dlnLdlz = 0.0,
+    d2lnLdlz2 = 0.0;
+  /* NOTE(review): the rax_malloc() result is used without a NULL check. */
+  diagptable = diagptable_start = (double *)rax_malloc(sizeof(double) * 12);
+  /* NOTE(review): this variant reads EIGN[1]; the #else variant of this function reads EIGN[0] -- confirm which index is intended. */
+  for(i = 0; i < 4; i++)
+    {
+      ki = gammaRates[i];
+      kisqr = ki * ki;
+
+      diagptable[i * 3]     = exp (EIGN[1] * ki * lz);
+      diagptable[i * 3 + 1] = EIGN[1] * ki;
+      diagptable[i * 3 + 2] = EIGN[1] * EIGN[1] * kisqr;
+    }
+  /* sumtable holds 8 doubles per site: one pair for each of the 4 rates */
+  for (i = 0; i < upper; i++)
+    {
+      diagptable = diagptable_start;
+      sum = &(sumtable[i * 8]);
+
+      inv_Li      = 0.0;
+      dlnLidlz    = 0.0;
+      d2lnLidlz2  = 0.0;
+
+      for(j = 0; j < 4; j++)
+        {
+          inv_Li += sum[2 * j];
+          /* second element of the pair is scaled by the rate's exponential before it is added */
+          tmp_1      =  diagptable[3 * j] * sum[2 * j + 1];
+          inv_Li     += tmp_1;
+          dlnLidlz   += tmp_1 * diagptable[3 * j + 1];
+          d2lnLidlz2 += tmp_1 * diagptable[3 * j + 2];
+        }
+      /* inv_Li: accumulated site value -> 1 / |site value| */
+      inv_Li = 1.0 / fabs(inv_Li);
+
+      dlnLidlz   *= inv_Li;
+      d2lnLidlz2 *= inv_Li;
+
+      /* wrptr[i] is the per-site weight applied to both running totals */
+      dlnLdlz  += wrptr[i] * dlnLidlz;
+      d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+    }
+
+  *d1 = dlnLdlz;
+  *d2 = d2lnLdlz2;
+
+  rax_free(diagptable_start);
+}
+#else /* vectorised coreGTRGAMMA_BINARY built on the SSE _mm_* intrinsics */
+static void coreGTRGAMMA_BINARY(const int upper, double *sumtable,
+                                volatile double *d1,   volatile double *d2, double *EIGN, double *gammaRates, double lz, int *wrptr)
+{
+	double
+		dlnLdlz = 0.0,
+		d2lnLdlz2 = 0.0,
+		ki,
+		kisqr,
+		inv_Li,
+		dlnLidlz,
+		d2lnLidlz2,
+		*sum;
+	PLL_ALIGN_BEGIN double
+		diagptable0[8] PLL_ALIGN_END,
+		diagptable1[8] PLL_ALIGN_END,
+		diagptable2[8] PLL_ALIGN_END;
+    /* three tables of 4 pairs (one pair per rate): value, first-order factor, second-order factor */
+  int     
+    i, 
+    j;
+  /* Writes the two weighted totals over `upper` sites to *d1 and *d2; sumtable holds 8 doubles per site. */
+  for(i = 0; i < 4; i++)
+    {
+      ki = gammaRates[i];
+      kisqr = ki * ki;
+      /* first element of each pair is the constant component: 1, 0, 0 */
+      diagptable0[i * 2] = 1.0;
+      diagptable1[i * 2] = 0.0;
+      diagptable2[i * 2] = 0.0;
+      /* second element depends on EIGN[0] and the rate ki */
+      diagptable0[i * 2 + 1] = exp(EIGN[0] * ki * lz);
+      diagptable1[i * 2 + 1] = EIGN[0] * ki;
+      diagptable2[i * 2 + 1] = EIGN[0] * EIGN[0] * kisqr;    
+    }
+
+  for (i = 0; i < upper; i++)
+    { 
+      __m128d a0 = _mm_setzero_pd();
+      __m128d a1 = _mm_setzero_pd();
+      __m128d a2 = _mm_setzero_pd();
+
+      sum = &sumtable[i * 8];         
+
+      for(j = 0; j < 4; j++)
+        {                       
+          double           
+            *t0 = &diagptable0[j * 2],
+            *t1 = &diagptable1[j * 2],
+            *t2 = &diagptable2[j * 2];
+          /* table pointers are named t0..t2 so they no longer shadow the d1/d2 output parameters */
+          __m128d tmpv = _mm_mul_pd(_mm_load_pd(t0), _mm_load_pd(&sum[j * 2]));
+          a0 = _mm_add_pd(a0, tmpv);
+          a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(t1)));
+          a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(t2)));
+                          
+        }
+      /* fold the two lanes of each accumulator, then store the low lane into the scalar */
+      a0 = _mm_hadd_pd(a0, a0);
+      a1 = _mm_hadd_pd(a1, a1);
+      a2 = _mm_hadd_pd(a2, a2);
+
+      _mm_storel_pd(&inv_Li, a0);     
+      _mm_storel_pd(&dlnLidlz, a1);
+      _mm_storel_pd(&d2lnLidlz2, a2); 
+      /* inv_Li: site value -> 1 / |site value| */
+      inv_Li = 1.0 / fabs(inv_Li);
+     
+      dlnLidlz   *= inv_Li;
+      d2lnLidlz2 *= inv_Li;     
+      /* wrptr[i] is the per-site weight */
+      dlnLdlz   += wrptr[i] * dlnLidlz;
+      d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+    }
+
+ 
+  *d1   = dlnLdlz;
+  *d2 = d2lnLdlz2; 
+}
+
+
+#endif
+/* coreGTRGAMMAPROT_LG4: 20-state, 4-rate accumulation where each rate i has its own eigenvalue array EIGN[i] and weight lg4_weights[i]. */
+static void coreGTRGAMMAPROT_LG4(double *gammaRates, double *EIGN[4], double *sumtable, int upper, int *wrptr,
+                                 volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz,
+                                 double * lg4_weights)
+{
+	double  *sum;
+	PLL_ALIGN_BEGIN double
+    diagptable0[80] PLL_ALIGN_END,
+    diagptable1[80] PLL_ALIGN_END,
+    diagptable2[80] PLL_ALIGN_END;    
+  int     i, j, l;
+  double  dlnLdlz = 0;
+  double d2lnLdlz2 = 0;
+  double ki, kisqr; 
+  /* build three 4 x 20 tables: exp(EIGN*ki*lz), EIGN*ki and (EIGN*ki)^2; entry 0 of each row is 1, 0, 0 */
+  for(i = 0; i < 4; i++)
+    {
+      ki = gammaRates[i];
+      kisqr = ki * ki;
+      
+      diagptable0[i * 20] = 1.0;
+      diagptable1[i * 20] = 0.0;
+      diagptable2[i * 20] = 0.0;
+
+      for(l = 1; l < 20; l++)
+        {
+          diagptable0[i * 20 + l] = exp(EIGN[i][l] * ki * lz);
+          diagptable1[i * 20 + l] = EIGN[i][l] * ki;
+          diagptable2[i * 20 + l] = EIGN[i][l] * EIGN[i][l] * kisqr;
+        }
+    }
+  /* sumtable holds 80 doubles per site (4 rates x 20 states) */
+  for (i = 0; i < upper; i++)
+    { 
+
+      double
+      	  inv_Li = 0.0,
+      	  dlnLidlz = 0.0,
+      	  d2lnLidlz2 = 0.0;
+
+      sum = &sumtable[i * 80];         
+
+      for(j = 0; j < 4; j++)
+        {                       
+          double
+          	l0,
+          	l1,
+          	l2,
+            *d0 = &diagptable0[j * 20],
+            *d1 = &diagptable1[j * 20],
+            *d2 = &diagptable2[j * 20];
+          /* accumulators are reset per rate so each rate's three dot products can be weighted separately */
+          __m128d a0 = _mm_setzero_pd();
+          __m128d a1 = _mm_setzero_pd();
+          __m128d a2 = _mm_setzero_pd();
+
+          for(l = 0; l < 20; l+=2)
+            {
+              __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 20 +l]));
+              a0 = _mm_add_pd(a0, tmpv);
+              a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
+              a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+            }             
+          /* fold the two lanes and pull the result out as a scalar */
+          a0 = _mm_hadd_pd(a0, a0);
+      	  a1 = _mm_hadd_pd(a1, a1);
+      	  a2 = _mm_hadd_pd(a2, a2);
+
+      	 _mm_storel_pd(&l0, a0);
+      	 _mm_storel_pd(&l1, a1);
+      	 _mm_storel_pd(&l2, a2);
+         /* mix the per-rate results using the LG4 weights */
+      	 inv_Li     += lg4_weights[j] * l0;
+      	 dlnLidlz   += lg4_weights[j] * l1;
+     	 d2lnLidlz2 += lg4_weights[j] * l2;
+      }
+      /* inv_Li: weighted site value -> 1 / |site value| */
+      inv_Li = 1.0 / fabs (inv_Li);
+
+      dlnLidlz   *= inv_Li;
+      d2lnLidlz2 *= inv_Li;
+      /* wrptr[i] is the per-site weight */
+      dlnLdlz   += wrptr[i] * dlnLidlz;
+      d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+    }
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2;
+}
+/* coreGTRGAMMAPROT: 20-state, 4-rate accumulation with one shared eigenvalue array EIGN. */
+/* Writes the two weighted totals over `upper` sites to *ext_dlnLdlz and *ext_d2lnLdlz2. */
+/* sumtable holds 80 doubles per site (4 rates x 20 states); wrptr[i] is the per-site weight. */
+static void coreGTRGAMMAPROT(double *gammaRates, double *EIGN, double *sumtable, int upper, int *wrptr,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double lz)
+{
+	double  *sum;
+	PLL_ALIGN_BEGIN double
+		diagptable0[80] PLL_ALIGN_END,
+		diagptable1[80] PLL_ALIGN_END,
+		diagptable2[80] PLL_ALIGN_END;
+
+  int     i, j, l;
+  double  dlnLdlz = 0;
+  double d2lnLdlz2 = 0;
+  double ki, kisqr; 
+  double inv_Li, dlnLidlz, d2lnLidlz2;
+  /* three 4 x 20 tables: exp(EIGN*ki*lz), EIGN*ki and (EIGN*ki)^2; EIGN[0] is never read */
+  for(i = 0; i < 4; i++)
+  {
+    ki = gammaRates[i];
+    kisqr = ki * ki;
+    /* entry 0 of each row is the constant component: 1, 0, 0 */
+    diagptable0[i * 20] = 1.0;
+    diagptable1[i * 20] = 0.0;
+    diagptable2[i * 20] = 0.0;
+
+    for(l = 1; l < 20; l++)
+    {
+      diagptable0[i * 20 + l] = exp(EIGN[l] * ki * lz);
+      diagptable1[i * 20 + l] = EIGN[l] * ki;
+      diagptable2[i * 20 + l] = EIGN[l] * EIGN[l] * kisqr;
+    }
+  }
+
+  for (i = 0; i < upper; i++)
+  { 
+    __m128d a0 = _mm_setzero_pd();
+    __m128d a1 = _mm_setzero_pd();
+    __m128d a2 = _mm_setzero_pd();
+    /* accumulators span all 4 rates here, i.e. the rates are summed unweighted */
+    sum = &sumtable[i * 80];         
+
+    for(j = 0; j < 4; j++)
+    {                   
+      double       
+        *d0 = &diagptable0[j * 20],
+        *d1 = &diagptable1[j * 20],
+        *d2 = &diagptable2[j * 20];
+      /* two states per iteration; aligned loads are used on the tables and on sum */
+      for(l = 0; l < 20; l+=2)
+      {
+        __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d0[l]), _mm_load_pd(&sum[j * 20 +l]));
+        a0 = _mm_add_pd(a0, tmpv);
+        a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, _mm_load_pd(&d1[l])));
+        a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, _mm_load_pd(&d2[l])));
+      }           
+    }
+    /* fold the two lanes, then store the low lane into the scalar */
+    a0 = _mm_hadd_pd(a0, a0);
+    a1 = _mm_hadd_pd(a1, a1);
+    a2 = _mm_hadd_pd(a2, a2);
+
+    _mm_storel_pd(&inv_Li, a0);
+    _mm_storel_pd(&dlnLidlz, a1);
+    _mm_storel_pd(&d2lnLidlz2, a2);
+    /* inv_Li: site value -> 1 / |site value| */
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+
+    dlnLdlz   += wrptr[i] * dlnLidlz;
+    d2lnLdlz2 += wrptr[i] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2;
+}
+/* coreGTRCATPROT: 20-state accumulation with one rate category per site (cptr[i] indexes rptr). */
+/* Writes the two weighted totals over `upper` sites to *ext_dlnLdlz and *ext_d2lnLdlz2. */
+/* sumtable holds 20 doubles per site; wgt[i] is the per-site weight. */
+static void coreGTRCATPROT(double *EIGN, double lz, int numberOfCategories, double *rptr, int *cptr, int upper,
+    int *wgt, volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *sumtable)
+{
+  int i, l;
+  double *d1, *d_start = NULL, *sum;
+  PLL_ALIGN_BEGIN double 
+    e[20] PLL_ALIGN_END, 
+    s[20] PLL_ALIGN_END, 
+    dd[20] PLL_ALIGN_END;
+  double inv_Li, dlnLidlz, d2lnLidlz2;
+  double  dlnLdlz = 0.0;
+  double  d2lnLdlz2 = 0.0;
+  /* NOTE(review): the rax_posix_memalign() result is ignored, so a failed allocation goes unnoticed. */
+  rax_posix_memalign ((void **)&d_start, PLL_BYTE_ALIGNMENT, numberOfCategories * 20 * sizeof(double));
+  d1 = d_start; 
+  /* s[] holds the eigenvalues, e[] their squares, dd[] = s * lz; dd[0] is never written and never read */
+  e[0] = 0.0;
+  s[0] = 0.0; 
+
+  for(l = 1; l < 20; l++)
+  {
+    e[l]  = EIGN[l] * EIGN[l];
+    s[l]  = EIGN[l];
+    dd[l] = s[l] * lz;
+  }
+  /* per category: d1[20c] = 1.0 and d1[20c + l] = exp(EIGN[l] * lz * rptr[c]) for l = 1..19 */
+  for(i = 0; i < numberOfCategories; i++)
+  {      
+    d1[20 * i] = 1.0;
+    for(l = 1; l < 20; l++)
+      d1[20 * i + l] = exp(dd[l] * rptr[i]);
+  }
+
+  for (i = 0; i < upper; i++)
+  {
+    __m128d a0 = _mm_setzero_pd();
+    __m128d a1 = _mm_setzero_pd();
+    __m128d a2 = _mm_setzero_pd();
+    /* select the exponential row of this site's category */
+    d1 = &d_start[20 * cptr[i]];
+    sum = &sumtable[20 * i];
+    /* two states per iteration: a0 sums tmp, a1 sums tmp * s, a2 sums tmp * e */
+    for(l = 0; l < 20; l+=2)
+    {     
+      __m128d tmpv = _mm_mul_pd(_mm_load_pd(&d1[l]), _mm_load_pd(&sum[l]));
+
+      a0 = _mm_add_pd(a0, tmpv);
+      __m128d sv = _mm_load_pd(&s[l]);    
+
+      a1 = _mm_add_pd(a1, _mm_mul_pd(tmpv, sv));
+      __m128d ev = _mm_load_pd(&e[l]);    
+
+      a2 = _mm_add_pd(a2, _mm_mul_pd(tmpv, ev));
+    }
+    /* fold the two lanes, then store the low lane into the scalar */
+    a0 = _mm_hadd_pd(a0, a0);
+    a1 = _mm_hadd_pd(a1, a1);
+    a2 = _mm_hadd_pd(a2, a2);
+
+    _mm_storel_pd(&inv_Li, a0);     
+    _mm_storel_pd(&dlnLidlz, a1);                 
+    _mm_storel_pd(&d2lnLidlz2, a2);
+    /* inv_Li: site value -> 1 / |site value| */
+    inv_Li = 1.0 / fabs (inv_Li);
+
+    dlnLidlz   *= inv_Li;
+    d2lnLidlz2 *= inv_Li;
+    /* weighted by wgt[i] * rate and wgt[i] * rate * rate of the site's category */
+    dlnLdlz  += wgt[i] * rptr[cptr[i]] * dlnLidlz;
+    d2lnLdlz2 += wgt[i] * rptr[cptr[i]] * rptr[cptr[i]] * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
+  }
+
+  *ext_dlnLdlz   = dlnLdlz;
+  *ext_d2lnLdlz2 = d2lnLdlz2;
+
+  rax_free(d_start);
+}
+
+
+
+
+#endif
+
+
+
diff --git a/pllrepo/src/mem_alloc.c b/pllrepo/src/mem_alloc.c
new file mode 100644
index 0000000..68e928d
--- /dev/null
+++ b/pllrepo/src/mem_alloc.c
@@ -0,0 +1,228 @@
+
+#define MEM_ALLOC_NO_GUARDS 1
+
+#include "mem_alloc.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifndef __APPLE__
+#include <malloc.h>             // this is probably not necessary
+#endif
+
+#ifdef RAXML_USE_LLALLOC
+
+// the llalloc library implementation in lockless_alloc/ll_alloc.c exports the allocation functions prefixed
+// with 'llalloc'. The following are the forward declarations of the llalloc* functions 
+
+#define PREFIX(X)   llalloc##X
+
+void *PREFIX(memalign)(size_t align, size_t size);
+void *PREFIX(malloc)(size_t size);
+void *PREFIX(realloc)(void *p, size_t size);
+int PREFIX(posix_memalign)(void **p, size_t align, size_t size);
+void *PREFIX(calloc)(size_t n, size_t size);
+void PREFIX(free)(void *p);
+
+
+// wrappers that forward the rax_* functions to the corresponding llalloc* functions
+
+// NOTE(review): mem_alloc.h in this same patch #defines rax_malloc, rax_calloc, rax_free and rax_posix_memalign on non-Windows builds; those macros would rewrite the names defined below -- confirm this file is still compiled.
+void *rax_memalign(size_t align, size_t size) {
+  return PREFIX(memalign)(align, size);
+}
+// every wrapper in this branch passes its arguments unchanged to the llalloc-prefixed function
+void *rax_malloc( size_t size ) {
+  return PREFIX(malloc)(size);
+}
+void *rax_realloc( void *p, size_t size ) {
+  return PREFIX(realloc)(p, size);
+}
+
+
+void rax_free(void *p) {
+  PREFIX(free)(p);
+}
+
+int rax_posix_memalign(void **p, size_t align, size_t size) {
+  return PREFIX(posix_memalign)(p, align, size);
+}
+void *rax_calloc(size_t n, size_t size) {
+  return PREFIX(calloc)(n,size);
+}
+// returns `size` bytes aligned to 32 bytes, obtained through rax_memalign
+void *rax_malloc_aligned(size_t size) 
+{
+  const size_t PLL_BYTE_ALIGNMENT = 32; // NOTE(review): mem_alloc.h uses PLL_BYTE_ALIGNMENT as if pll.h provides it; if that is a macro this declaration cannot compile -- confirm
+  return rax_memalign(PLL_BYTE_ALIGNMENT, size);
+  
+}
+
+#else // RAXML_USE_LLALLOC
+// if llalloc should not be used, forward the rax_* functions to the corresponding standard function
+// NOTE(review): mem_alloc.h in this same patch #defines rax_malloc, rax_calloc, rax_free and rax_posix_memalign on non-Windows builds; with those active e.g. rax_malloc below would expand to a definition of malloc that calls itself -- confirm this file is still compiled.
+void *rax_memalign(size_t align, size_t size) {
+#if defined (__APPLE__)
+    void * mem;
+    if (posix_memalign (&mem, align, size))
+      return (NULL);
+    else
+      return (mem);
+#else
+    return memalign(align, size);
+#endif
+    // Apple builds go through posix_memalign and map a non-zero status to NULL; all others call memalign directly
+}
+// the remaining wrappers forward unchanged to the C library function of the same name
+void *rax_malloc( size_t size ) {
+  return malloc(size);
+}
+void *rax_realloc( void *p, size_t size ) {
+  return realloc(p, size);
+}
+
+
+void rax_free(void *p) {
+  free(p);
+}
+
+int rax_posix_memalign(void **p, size_t align, size_t size) {
+  return posix_memalign(p, align, size);
+}
+void *rax_calloc(size_t n, size_t size) {
+  return calloc(n,size);
+}
+// returns `size` bytes aligned to 32 bytes, obtained through rax_memalign
+void *rax_malloc_aligned(size_t size) 
+{
+  const size_t PLL_BYTE_ALIGNMENT = 32; // NOTE(review): mem_alloc.h uses PLL_BYTE_ALIGNMENT as if pll.h provides it; if that is a macro this declaration cannot compile -- confirm
+  return rax_memalign(PLL_BYTE_ALIGNMENT, size);
+  
+}
+
+#endif
+
+
+
+#if 0
+//
+// two test cases to check if the default malloc plays along well with lockless malloc. Normally there should not be a
+// problem as long as everyone handles 'foreign' sbrk calls gracefully (as lockless and glibc seem to do). 
+// WARNING: there is a slightly worrying comment in glibc malloc, which seems to assume that magically no foreign sbrks
+// happen between two consecutive sbrk calls while re-establishing page alignment in some obscure special case. IMHO, this
+// is clearly an error (race) in multithreaded programs, as there is no way how a foreign sbrk user can properly lock anything.
+// see: http://sourceware.org/git/?p=glibc.git;a=blob;f=malloc/malloc.c;h=0f1796c9134ffef289ec31fb1cd538f3a9490ae1;hb=HEAD#l2581
+//
+// If all threads consistently only use the rax_* wrappers this is not a problem, but as this is a library, we can not be sure 
+// that no other thread uses default malloc... note that lockless malloc only uses sbrk for the slab (=small block) area, while 
+// raxml heavily uses malloc/free only on much larger blocks...
+// If anything ever goes wrong while using mixed glibc/lockless malloc, this should be investigated.
+//
+// TODO: the potential race seems to be related to handling the case where a 'foreign sbrk' adjusted the break to a non page-boundary.
+// check if lockless malloc actually ever adjusts to non page-boundaries.
+
+// check_block: aborts the process unless every byte of p[0..size) equals its own index truncated to char (the pattern fill_block writes)
+void check_block( void *p, size_t size ) {
+    size_t i;
+    char *cp = (char*)p;
+    
+    for( i = 0; i < size; ++i ) {
+        
+        if( cp[i] != (char)i ) {
+            printf( "MEEEEEEEEEEEEEEEEEEEEP\n" );
+            abort();
+        }
+    }
+    
+}
+// fill_block: writes the byte pattern cp[i] = (char)i over p[0..size)
+// NOTE(review): p is dereferenced without a NULL check whenever size > 0
+void fill_block( void *p, size_t size ) {
+    size_t i;
+    char *cp = (char*)p;
+    
+    for( i = 0; i < size; ++i ) {
+        cp[i] = (char)i;
+    }
+}
+// malloc_stress: endless loop that randomly allocates, pattern-fills, verifies and frees blocks in two slot tables
+// (blocks1 uses malloc/free, blocks2 uses rax_malloc/rax_free); the function never returns
+void malloc_stress() {
+    const int n_slots = 100000;
+    // four arrays of n_slots entries each are placed on the stack
+    void *blocks1[n_slots];
+    size_t sizes1[n_slots];
+    void *blocks2[n_slots];
+    size_t sizes2[n_slots];
+    
+    memset( blocks1, 0, sizeof( void * ) * n_slots ); 
+    memset( blocks2, 0, sizeof( void * ) * n_slots ); 
+    
+    memset( sizes1, 0, sizeof( size_t ) * n_slots );
+    memset( sizes2, 0, sizeof( size_t ) * n_slots );
+    
+    
+    
+    while( 1 ) {
+        int r = rand() % n_slots;
+        
+        void *bs;
+        
+        // size is either below 32*16 bytes or a multiple of 128 bytes below 128*128
+        int size;
+        if( rand() % 2 == 0 ) {
+            size = rand() % (32 * 16); // hit slab
+        } else {
+            size = (rand() % 128) * 128; // not slab
+        }
+            
+        // NOTE(review): the leading "1 ||" makes this condition always true, so the rax_malloc branch below is never taken
+        if( 1 || rand() % 2 == 0 ) {
+            if( blocks1[r] == 0 ) {
+                blocks1[r] = malloc( size );
+                sizes1[r] = size;
+                fill_block( blocks1[r], sizes1[r] );
+            } else {
+                check_block( blocks1[r], sizes1[r] );
+                free( blocks1[r] );
+                blocks1[r] = 0;
+            }
+        } else {
+            if( blocks2[r] == 0 ) {
+                blocks2[r] = rax_malloc( size );
+                sizes2[r] = size;
+                fill_block( blocks2[r], sizes2[r] );
+            } else {
+                check_block( blocks2[r], sizes2[r] );
+                
+                rax_free( blocks2[r] );
+                blocks2[r] = 0;
+            }
+        }
+            
+       
+        
+    }
+    
+}
+// malloc_stress2: allocates 1000 blocks of (rand() % 32) KiB with malloc, moves the program break by 10 bytes via sbrk, then frees every block
+// NOTE(review): no header providing sbrk is included in this file's visible include list -- confirm before enabling this code
+void malloc_stress2() {
+    const size_t n_slots = 1000;
+    
+    void *blocks[n_slots];
+    size_t i;
+    for( i = 0; i < n_slots; ++i ) {
+        blocks[i] = malloc( (rand() % 32) * 1024 ); 
+        
+    }
+    sbrk( 10 );
+    for( i = 0; i < n_slots; ++i ) {
+        free(blocks[i]);
+        
+    }
+    
+    
+    
+}
+#endif
+
diff --git a/pllrepo/src/mem_alloc.h b/pllrepo/src/mem_alloc.h
new file mode 100644
index 0000000..29553c7
--- /dev/null
+++ b/pllrepo/src/mem_alloc.h
@@ -0,0 +1,70 @@
+#ifndef __mem_alloc_h
+#define __mem_alloc_h
+
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+#include <stdlib.h>
+//#include <intrin.h>
+#include <malloc.h>
+//#include <windows.h>
+#endif
+
+#include <stddef.h>
+#include <stdlib.h>
+#ifdef __linux__
+#include <malloc.h>
+#endif
+#include "pll.h"
+#include <string.h>
+
+//#define rax_memalign memalign
+//#define rax_malloc malloc
+//#define rax_calloc calloc
+//#define rax_realloc realloc
+/* Allocation shims: on Windows the rax_* names map to the _aligned_* family, elsewhere directly to the C library. */
+/* NOTE(review): the Windows rax_posix_memalign is an assignment expression yielding the pointer, whereas posix_memalign returns an int status -- results are not interchangeable. */
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+#define rax_posix_memalign(ptr,alignment,size) *(ptr) = _aligned_malloc((size),(alignment)) /* expansion is not wrapped in parentheses */
+#define rax_malloc(size) _aligned_malloc((size), PLL_BYTE_ALIGNMENT) /* every Windows allocation is aligned so that _aligned_free can release it */
+void *rax_calloc(size_t count, size_t size);
+#define rax_free _aligned_free
+#else
+#define rax_posix_memalign posix_memalign
+#define rax_malloc malloc
+#define rax_calloc calloc
+#define rax_free free
+#endif
+
+//#define rax_malloc_aligned(x) memalign(PLL_BYTE_ALIGNMENT,x)
+
+//void *rax_memalign(size_t align, size_t size);
+//void *rax_malloc(size_t size);
+//void *rax_realloc(void *p, size_t size);
+//void rax_free(void *p);
+//int rax_posix_memalign(void **p, size_t align, size_t size);
+//void *rax_calloc(size_t n, size_t size);
+//
+//void *rax_malloc_aligned(size_t size);
+
+/* strndup replacement: copies at most n bytes of s into a fresh NUL-terminated buffer of n+1 bytes; returns NULL when the allocation fails; release with rax_free */
+static __inline char *my_strndup(const char *s, size_t n) {
+	char *ret = (char *) rax_malloc(n+1);
+	if (ret == NULL) return NULL;	/* out of memory: do not hand NULL to strncpy */
+	strncpy(ret, s, n);	/* zero-pads the rest when s is shorter than n */
+	ret[n] = 0;
+	return ret;
+}
+
+#if 0
+// using the following contraption to trigger a compile-time error does not work on some gcc versions. It will trigger a confusing linker error in the best case, so it is deactivated.
+
+#if defined(RAXML_USE_LLALLOC) && !defined(MEM_ALLOC_NO_GUARDS)
+#define malloc(x) XXX_DONT_USE_MALLOC_WITHOUT_RAX_PREFIX_XXX
+#define free(x) XXX_DONT_USE_FREE_WITHOUT_RAX_PREFIX_XXX
+#define calloc(x,y) XXX_DONT_USE_CALLOC_WITHOUT_RAX_PREFIX_XXX
+#define realloc(x,y) XXX_DONT_USE_REALLOC_WITHOUT_RAX_PREFIX_XXX
+#define malloc_aligned(x) XXX_DONT_USE_MALLOC_ALIGNED_WITHOUT_RAX_PREFIX_XXX
+#define posix_memalign(x,y,z) XXX_DONT_USE_POSIX_MEMALIGN_ALIGNED_WITHOUT_RAX_PREFIX_XXX
+#endif
+#endif
+
+#endif
diff --git a/pllrepo/src/mic_native.h b/pllrepo/src/mic_native.h
new file mode 100644
index 0000000..38b24a3
--- /dev/null
+++ b/pllrepo/src/mic_native.h
@@ -0,0 +1,56 @@
+#ifndef MIC_NATIVE_H_
+#define MIC_NATIVE_H_
+
+void newviewGTRGAMMA_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+double evaluateGTRGAMMA_MIC(int *ex1, int *ex2, int *wptr,
+                 double *x1_start, double *x2_start,
+                 double *tipVector,
+                 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling);
+
+void sumGTRGAMMA_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+void coreGTRGAMMA_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wrptr);
+
+// protein data
+void newviewGTRGAMMAPROT_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+double evaluateGTRGAMMAPROT_MIC(int *ex1, int *ex2, int *wptr,
+                 double *x1_start, double *x2_start,
+                 double *tipVector,
+                 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling);
+
+void sumGTRGAMMAPROT_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+void coreGTRGAMMAPROT_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wrptr);
+
+// protein data - LG4
+
+void newviewGTRGAMMAPROT_LG4_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                  unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement);
+
+double evaluateGTRGAMMAPROT_LG4_MIC(int *wptr,
+                 double *x1_start, double *x2_start,
+                 double *tipVector[4],
+                 unsigned char *tipX1, const int n, double *diagptable);
+
+void sumGTRGAMMAPROT_LG4_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector[4],
+    unsigned char *tipX1, unsigned char *tipX2, int n);
+
+void coreGTRGAMMAPROT_LG4_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN[4], double *gammaRates, double lz, int *wrptr);
+
+
+#endif /* MIC_NATIVE_H_ */
diff --git a/pllrepo/src/mic_native_aa.c b/pllrepo/src/mic_native_aa.c
new file mode 100644
index 0000000..2cfd2b1
--- /dev/null
+++ b/pllrepo/src/mic_native_aa.c
@@ -0,0 +1,1254 @@
+#include <omp.h>
+#include <immintrin.h>
+#include <string.h>
+#include <math.h>
+
+#include "pll.h"
+#include "mic_native.h"
+
+static const int states = 20;
+static const int statesSquare = 20 * 20;
+static const int span = 20 * 4;
+static const int maxStateValue = 23;
+/* mic_fma4x80: outv[c*20 + s] += inv[c*20] * mulv[c*20 + s] for c = 0..3, s = 0..19, done as ten 8-lane fused multiply-adds. */
+__inline void mic_fma4x80(const double* inv, double* outv, double* mulv)
+{
+    __mmask8 k1 = _mm512_int2mask(0x0F);
+    __mmask8 k2 = _mm512_int2mask(0xF0);
+    for(int l = 0; l < 80; l += 40)
+    {
+        __m512d t = _mm512_setzero_pd();
+        /* NOTE(review): the zero initialiser above looks redundant -- t is overwritten by the broadcast load right below. */
+        t = _mm512_extload_pd(&inv[l], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        __m512d m = _mm512_load_pd(&mulv[l]);
+        __m512d acc = _mm512_load_pd(&outv[l]);
+        __m512d r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l], r);
+        /* entries l+8 .. l+15 still belong to the first 20-entry group, so the same broadcast t is reused */
+        m = _mm512_load_pd(&mulv[l + 8]);
+        acc = _mm512_load_pd(&outv[l + 8]);
+        r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l + 8], r);
+        /* the next vector straddles two groups: mask 0x0F loads inv[l], mask 0xF0 loads inv[l+20] */
+        t = _mm512_mask_extload_pd(t, k1, &inv[l], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        t = _mm512_mask_extload_pd(t, k2, &inv[l+20], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        /* aligned load/store intrinsics are used throughout -- assumes mulv and outv are suitably aligned; TODO confirm at the call sites */
+        m = _mm512_load_pd(&mulv[l + 16]);
+        acc = _mm512_load_pd(&outv[l + 16]);
+        r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l + 16], r);
+        /* the remaining 16 entries of the second group use inv[l+20] in every lane */
+        t = _mm512_extload_pd(&inv[l+20], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        m = _mm512_load_pd(&mulv[l + 24]);
+        acc = _mm512_load_pd(&outv[l + 24]);
+        r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l + 24], r);
+
+        m = _mm512_load_pd(&mulv[l + 32]);
+        acc = _mm512_load_pd(&outv[l + 32]);
+        r = _mm512_fmadd_pd(t, m, acc);
+        _mm512_store_pd(&outv[l + 32], r);
+    }
+}
+
+
+void newviewGTRGAMMAPROT_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{
+  __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
+  __m512d twotothe256_MIC = _mm512_set1_pd(PLL_TWOTOTHE256);
+  __m512i absMask_MIC = _mm512_set1_epi64(0x7fffffffffffffffULL);
+
+  int addScale = 0;
+
+  double aEV[1600] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+  #pragma ivdep
+  for (int l = 0; l < 1600; ++l)
+  {
+      aEV[l] = extEV[(l / span) * states + (l % states)];
+  }
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        /* multiply all possible tip state vectors with the respective P-matrices
+        */
+
+        double umpX1[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double umpX2[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        for(int i = 0; i < maxStateValue; ++i)
+        {
+          for(int k = 0; k < span; ++k)
+          {
+              umpX1[i * span + k] = 0.0;
+              umpX2[i * span + k] = 0.0;
+
+              #pragma ivdep
+              for(int l = 0; l < states; ++l)
+              {
+                  umpX1[i * span + k] +=  tipVector[i * states + l] *  left[k * states + l];
+                  umpX2[i * span + k] +=  tipVector[i * states + l] * right[k * states + l];
+              }
+          }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            const double *uX1 = &umpX1[span * tipX1[i]];
+            const double *uX2 = &umpX2[span * tipX2[i]];
+
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double* v3 = &x3[i * span];
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
+            // init scaling counter for the site
+            if (!fastScaling)
+                ex3[i] = 0;
+
+        } // sites loop
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        /* we do analogous pre-computations as above, with the only difference that we now do them
+        only for one tip vector */
+
+          double umpX1[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        /* precompute P and left tip vector product */
+
+        for(int i = 0; i < maxStateValue; ++i)
+        {
+          for(int k = 0; k < span; ++k)
+          {
+              umpX1[i * span + k] = 0.0;
+
+              #pragma ivdep
+              for(int l = 0; l < states; ++l)
+              {
+                  umpX1[i * span + k] +=  tipVector[i * states + l] *  left[k * states + l];
+              }
+          }
+        }
+
+        // re-arrange right matrix for better memory layout
+        double aRight[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int k = 0; k < states; k++)
+            {
+                for(int l = 0; l < states; l++)
+                {
+                    aRight[k * span + j * states + l] = right[j * statesSquare +  l * states + k];
+                }
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            #pragma unroll(10)
+            for (int j = 0; j < span; j += 8)
+            {
+                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+            }
+
+            /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
+            double* uX1 = &umpX1[span * tipX1[i]];
+            double uX2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            double* v3 = &(x3[span * i]);
+
+            const double* v2 = &(x2[span * i]);
+
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX2[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
+            }
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax = _mm512_reduce_gmax_pd(t1);
+            double mx[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            for (int l = 8; l < span; l += 8)
+            {
+                __m512d t = _mm512_load_pd(&v3[l]);
+                t = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t), absMask_MIC));
+                double vmax2 = _mm512_reduce_gmax_pd(t);
+                vmax = PLL_MAX(vmax, vmax2);
+            }
+
+            if (vmax < PLL_MINLIKELIHOOD)
+            {
+                #pragma vector aligned nontemporal
+                for(int l = 0; l < span; l++)
+                  v3[l] *= PLL_TWOTOTHE256;
+
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+            }
+        } // site loop
+
+      }
+      break;
+    case PLL_INNER_INNER:
+    {
+      /* same as above, without pre-computations */
+
+
+        // re-arrange right matrix for better memory layout
+        double aLeft[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double aRight[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int k = 0; k < states; k++)
+            {
+                for(int l = 0; l < states; l++)
+                {
+                    aLeft[k * span + j * states + l] = left[j * statesSquare + l * states + k];
+                    aRight[k * span + j * states + l] = right[j * statesSquare + l * states + k];
+                }
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+
+            #pragma unroll(10)
+            for (int j = 0; j < span; j += 8)
+            {
+                _mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T1);
+                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+            }
+
+
+            double uX1[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            double* v3 = &(x3[span * i]);
+
+            const double* v1 = &(x1[span * i]);
+            const double* v2 = &(x2[span * i]);
+
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX1[l] = 0.;
+                uX2[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                    _mm_prefetch((const char *)&aLeft[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&v1[k], uX1, &aLeft[k * span]);
+                mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
+            }
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax = _mm512_reduce_gmax_pd(t1);
+            double mx[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            for (int l = 8; l < span; l += 8)
+            {
+                __m512d t = _mm512_load_pd(&v3[l]);
+                t = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t), absMask_MIC));
+                double vmax2 = _mm512_reduce_gmax_pd(t);
+                vmax = PLL_MAX(vmax, vmax2);
+            }
+
+            if (vmax < PLL_MINLIKELIHOOD)
+            {
+                #pragma vector aligned nontemporal
+                for(int l = 0; l < span; l++)
+                  v3[l] *= PLL_TWOTOTHE256;
+
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+            }
+        }
+    } break;
+    default:
+//      assert(0);
+      break;
+  }
+
+  *scalerIncrement = addScale;
+
+}
+// Weighted log-likelihood sum over n sites: for each site i this forms
+// term = sum over j < span of x1[j] * x2[j] * diagptable[j] and accumulates wgt[i] * log(0.25 * |term|).
+// x1 is a row of a tip lookup table selected by tipX1[i] when tipX1 is non-NULL, otherwise a slice of x1_start.
+double evaluateGTRGAMMAPROT_MIC(int *ex1, int *ex2, int *wgt, double *x1_start, double *x2_start, double *tipVector,
+                 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
+{
+    double sum = 0.0;
+
+    /* the left node is a tip */
+    if(tipX1)
+    {
+        double aTipVec[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int k = 0; k < maxStateValue; k++) // row k of the table belongs to tip state code k
+        {
+            for(int l = 0; l < states; l++)
+            {
+                aTipVec[k*span + l] = aTipVec[k*span + states + l] = aTipVec[k*span + 2*states + l] = aTipVec[k*span + 3*states + l] = tipVector[k*states + l];
+            }
+        }
+
+        /* loop over the sites of this partition */
+        for (int i = 0; i < n; i++)
+        {
+          /* access pre-computed tip vector values via a lookup table */
+          const double *x1 = &(aTipVec[span * tipX1[i]]);
+          /* access the other(inner) node at the other end of the branch */
+          const double *x2 = &(x2_start[span * i]);
+
+          double term = 0.;
+
+          #pragma ivdep
+          #pragma vector aligned
+          for(int j = 0; j < span; j++) {
+              term += x1[j] * x2[j] * diagptable[j];
+          }
+          /* without fastScaling the term is shifted by ex2[i] * log(PLL_MINLIKELIHOOD); ex2 presumably counts scaling events -- TODO confirm */
+          if(!fastScaling)
+              term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));
+          else
+              term = log(0.25 * fabs(term));
+
+          sum += wgt[i] * term;
+        }
+    }
+    else
+    {
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char *) &x1_start[span*(i+8)], _MM_HINT_T1); // T1 hints: site 8 ahead
+            _mm_prefetch((const char *) &x1_start[span*(i+8) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x2_start[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x2_start[span*(i+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char *) &x1_start[span*(i+1)], _MM_HINT_T0); // T0 hints: next site
+            _mm_prefetch((const char *) &x1_start[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x2_start[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x2_start[span*(i+1) + 8], _MM_HINT_T0);
+
+          const double *x1 = &(x1_start[span * i]);
+          const double *x2 = &(x2_start[span * i]);
+
+          double term = 0.;
+
+          #pragma ivdep
+          #pragma vector aligned
+          for(int j = 0; j < span; j++)
+              term += x1[j] * x2[j] * diagptable[j];
+          /* both ends are inner nodes here, so the shift uses ex1[i] + ex2[i] */
+          if(!fastScaling)
+              term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));
+          else
+              term = log(0.25 * fabs(term));
+
+          sum += wgt[i] * term;
+        }
+    }
+
+    return sum;
+}
+// Fills sumtable[i * span + l] (i < n, l < span) with the element-wise product of the two per-site vectors selected by tipCase.
+void sumGTRGAMMAPROT_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+    double aTipVec[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    for(int k = 0; k < maxStateValue; k++) // row k: tipVector entry k copied into all four states-wide slots
+    {
+        for(int l = 0; l < states; l++)
+        {
+            aTipVec[k*span + l] = aTipVec[k*span + states + l] = aTipVec[k*span + 2*states + l] = aTipVec[k*span + 3*states + l] = tipVector[k*states + l];
+        }
+    }
+    // NOTE(review): the table above is built for every tipCase, although PLL_INNER_INNER never reads it.
+    switch(tipCase)
+    {
+      case PLL_TIP_TIP:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            const double *left  = &(aTipVec[span * tipX1[i]]); // both operands come from the tip table
+            const double *right = &(aTipVec[span * tipX2[i]]);
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+                sumtable[i * span + l] = left[l] * right[l];
+            }
+        }
+      } break;
+      case PLL_TIP_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+          _mm_prefetch((const char *) &x2_start[span*(i+16)], _MM_HINT_T1); // T1 hints: 16 sites ahead
+          _mm_prefetch((const char *) &x2_start[span*(i+16) + 8], _MM_HINT_T1);
+
+          _mm_prefetch((const char *) &x2_start[span*(i+2)], _MM_HINT_T0); // T0 hints: 2 sites ahead
+          _mm_prefetch((const char *) &x2_start[span*(i+2) + 8], _MM_HINT_T0);
+
+          const double *left = &(aTipVec[span * tipX1[i]]); // tip side: table lookup
+          const double *right = &(x2_start[span * i]); // inner side: slice of x2_start
+
+          #pragma ivdep
+          #pragma vector aligned nontemporal
+          for(int l = 0; l < span; l++)
+          {
+              sumtable[i * span + l] = left[l] * right[l];
+          }
+        }
+      } break;
+      case PLL_INNER_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char *) &x1_start[span*(i+16)], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x1_start[span*(i+16) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x2_start[span*(i+16)], _MM_HINT_T1);
+            _mm_prefetch((const char *) &x2_start[span*(i+16) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char *) &x1_start[span*(i+2)], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x1_start[span*(i+2) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x2_start[span*(i+2)], _MM_HINT_T0);
+            _mm_prefetch((const char *) &x2_start[span*(i+2) + 8], _MM_HINT_T0);
+
+            const double *left  = &(x1_start[span * i]); // both operands are slices of the inner vectors
+            const double *right = &(x2_start[span * i]);
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+                sumtable[i * span + l] = left[l] * right[l];
+            }
+        }
+      } break;
+  //    default: (any other tipCase value leaves sumtable untouched)
+  //      assert(0);
+    }
+}
+// Accumulates d(lnL)/d(lz) and d2(lnL)/d(lz)^2 over all sites and writes them to *ext_dlnLdlz and *ext_d2lnLdlz2.
+void coreGTRGAMMAPROT_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wgt)
+{
+    double diagptable0[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable1[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable01[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable02[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    // Inputs read below: sumtable (span doubles per site), wgt (one weight per site), gammaRates[0..3], EIGN[1..states-1].
+    /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
+    // diagptable0 = exp(EIGN * rate * lz), diagptable1 = EIGN * rate, diagptable2 = (EIGN * rate)^2; slot 0 of each rate is 1 / 0 / 0.
+    for(int i = 0; i < 4; i++)
+    {
+        const double ki = gammaRates[i];
+        const double kisqr = ki * ki;
+
+        diagptable0[i*states] = 1.;
+        diagptable1[i*states] = 0.;
+        diagptable2[i*states] = 0.;
+
+        for(int l = 1; l < states; l++)
+        {
+          diagptable0[i * states + l]  = exp(EIGN[l] * ki * lz);
+          diagptable1[i * states + l] = EIGN[l] * ki;
+          diagptable2[i * states + l] = EIGN[l] * EIGN[l] * kisqr;
+        }
+    }
+
+    #pragma ivdep
+    for(int i = 0; i < span; i++)
+    {
+        diagptable01[i] = diagptable0[i] * diagptable1[i];
+        diagptable02[i] = diagptable0[i] * diagptable2[i];
+    }
+
+    /* loop over sites in this partition */
+    // Sites are handled in blocks of PLL_VECTOR_WIDTH. NOTE(review): the last block reads sumtable/wgt past 'upper' when upper is not a multiple -- assumes padded buffers, confirm with caller.
+    const int aligned_width = upper % PLL_VECTOR_WIDTH == 0 ? upper / PLL_VECTOR_WIDTH : upper / PLL_VECTOR_WIDTH + 1;
+
+    double dlnLdlz = 0.;
+    double d2lnLdlz2 = 0.;
+
+    // the write mask for each site's output lane is derived from 'mask' inside the block loop
+
+    for (int i = 0; i < aligned_width; i++)
+    {
+        _mm_prefetch((const char *) &sumtable[i * span * 8], _MM_HINT_T0);
+        _mm_prefetch((const char *) &sumtable[i * span * 8 + 8], _MM_HINT_T0);
+
+        /* access the array with pre-computed values */
+        const double *sum = &sumtable[i * span * PLL_VECTOR_WIDTH];
+
+        /* initial per-site likelihood and 1st and 2nd derivatives */
+
+        double invBuf[PLL_VECTOR_WIDTH] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double d1Buf[PLL_VECTOR_WIDTH] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double d2Buf[PLL_VECTOR_WIDTH] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        // zero-initialised so the first masked move below never reads an indeterminate register
+        __m512d invVec = _mm512_setzero_pd();
+        __m512d d1Vec = _mm512_setzero_pd();
+        __m512d d2Vec = _mm512_setzero_pd();
+        int mask = 0x01; // one-hot lane selector; shifted left once per site
+
+        #pragma noprefetch sum
+        #pragma unroll(8)
+        for(int j = 0; j < PLL_VECTOR_WIDTH; j++)
+        {
+            _mm_prefetch((const char *) &sum[span*(j+8)], _MM_HINT_T1);
+            _mm_prefetch((const char *) &sum[span*(j+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char *) &sum[span*(j+1)], _MM_HINT_T0);
+            _mm_prefetch((const char *) &sum[span*(j+1) + 8], _MM_HINT_T0);
+
+            __m512d inv_1 = _mm512_setzero_pd();
+            __m512d d1_1 = _mm512_setzero_pd();
+            __m512d d2_1 = _mm512_setzero_pd();
+
+            for (int offset = 0; offset < span; offset += 8)
+            {
+                __m512d d0_1 = _mm512_load_pd(&diagptable0[offset]);
+                __m512d d01_1 = _mm512_load_pd(&diagptable01[offset]);
+                __m512d d02_1 = _mm512_load_pd(&diagptable02[offset]);
+                __m512d s_1 = _mm512_load_pd(&sum[j*span + offset]);
+
+                inv_1 = _mm512_fmadd_pd(d0_1, s_1, inv_1);
+                d1_1 = _mm512_fmadd_pd(d01_1, s_1, d1_1);
+                d2_1 = _mm512_fmadd_pd(d02_1, s_1, d2_1);
+            }
+
+            __mmask8 laneMask = _mm512_int2mask(mask);
+            mask <<= 1;
+
+            // reduce: add up the 8 lanes (total ends up in every lane), then keep it in lane j of the accumulator
+            inv_1 = _mm512_add_pd (inv_1, _mm512_swizzle_pd(inv_1, _MM_SWIZ_REG_CDAB));
+            inv_1 = _mm512_add_pd (inv_1, _mm512_swizzle_pd(inv_1, _MM_SWIZ_REG_BADC));
+            inv_1 = _mm512_add_pd (inv_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(inv_1), _MM_PERM_BADC)));
+            invVec = _mm512_mask_mov_pd(invVec, laneMask, inv_1);
+
+            d1_1 = _mm512_add_pd (d1_1, _mm512_swizzle_pd(d1_1, _MM_SWIZ_REG_CDAB));
+            d1_1 = _mm512_add_pd (d1_1, _mm512_swizzle_pd(d1_1, _MM_SWIZ_REG_BADC));
+            d1_1 = _mm512_add_pd (d1_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d1_1), _MM_PERM_BADC)));
+            d1Vec = _mm512_mask_mov_pd(d1Vec, laneMask, d1_1);
+
+            d2_1 = _mm512_add_pd (d2_1, _mm512_swizzle_pd(d2_1, _MM_SWIZ_REG_CDAB));
+            d2_1 = _mm512_add_pd (d2_1, _mm512_swizzle_pd(d2_1, _MM_SWIZ_REG_BADC));
+            d2_1 = _mm512_add_pd (d2_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d2_1), _MM_PERM_BADC)));
+            d2Vec = _mm512_mask_mov_pd(d2Vec, laneMask, d2_1);
+        }
+
+        _mm512_store_pd(&invBuf[0], invVec);
+        _mm512_store_pd(&d1Buf[0], d1Vec);
+        _mm512_store_pd(&d2Buf[0], d2Vec);
+
+        #pragma ivdep
+        #pragma vector aligned
+        for (int j = 0; j < PLL_VECTOR_WIDTH; ++j)
+        {
+            const double inv_Li = 1.0 / invBuf[j];
+
+            const double d1 = d1Buf[j] * inv_Li;
+            const double d2 = d2Buf[j] * inv_Li;
+
+            dlnLdlz += wgt[i * PLL_VECTOR_WIDTH + j] * d1;
+            d2lnLdlz2 += wgt[i * PLL_VECTOR_WIDTH + j] * (d2 - d1 * d1);
+        }
+    } // site loop
+
+    *ext_dlnLdlz   = dlnLdlz;
+    *ext_d2lnLdlz2 = d2lnLdlz2;
+}
+
+
+/****
+ *       PROTEIN - LG4
+ */
+// LG4 newview: fills x3 (span doubles per site, n sites) from the two child vectors chosen by tipCase; *scalerIncrement receives the sum of wgt[i] over all rescaled sites.
+void newviewGTRGAMMAPROT_LG4_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                  unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement)
+{
+
+  // extEV[c] and tipVector[c] are selected per slot: c = (position within a span-wide row) / states.
+  // absMask_MIC clears the sign bit of every double so that |v3| can be formed for the scaling check.
+  __m512i absMask_MIC = _mm512_set1_epi64(0x7fffffffffffffffULL);
+
+  int addScale = 0;
+
+  double aEV[1600] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+  #pragma ivdep
+  for (int l = 0; l < 1600; ++l)
+  {
+      aEV[l] = extEV[(l % span) / states][(l / span) * states + (l % states)];
+  }
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        /* multiply all possible tip state vectors with the respective P-matrices
+        */
+
+        double umpX1[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double umpX2[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        for(int i = 0; i < maxStateValue; ++i)
+        {
+          for(int k = 0; k < span; ++k)
+          {
+              umpX1[i * span + k] = 0.0;
+              umpX2[i * span + k] = 0.0;
+              double *tipv = &(tipVector[k / states][i * states]);
+
+
+              #pragma ivdep
+              for(int l = 0; l < states; ++l)
+              {
+                  umpX1[i * span + k] +=  tipv[l] *  left[k * states + l];
+                  umpX2[i * span + k] +=  tipv[l] * right[k * states + l];
+              }
+          }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            const double *uX1 = &umpX1[span * tipX1[i]];
+            const double *uX2 = &umpX2[span * tipX2[i]];
+
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double* v3 = &x3[i * span];
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+            // NOTE(review): mic_fma4x80 is defined earlier in the file (not visible here); presumably it accumulates row k of aEV, scaled by entries of uX, into v3.
+            for(int k = 0; k < states; ++k)
+            {
+                for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+            // no scaling check is performed in the tip/tip case
+        } // sites loop
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        /* we do analogous pre-computations as above, with the only difference that we now do them
+        only for one tip vector */
+
+          double umpX1[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        /* precompute P and left tip vector product */
+
+        for(int i = 0; i < maxStateValue; ++i)
+        {
+          for(int k = 0; k < span; ++k)
+          {
+              umpX1[i * span + k] = 0.0;
+              double *tipv = &(tipVector[k / states][i * states]);
+
+              #pragma ivdep
+              for(int l = 0; l < states; ++l)
+              {
+                  umpX1[i * span + k] +=  tipv[l] *  left[k * states + l];
+              }
+          }
+        }
+
+        // re-arrange right matrix for better memory layout
+        double aRight[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int k = 0; k < states; k++)
+            {
+                for(int l = 0; l < states; l++)
+                {
+                    aRight[k * span + j * states + l] = right[j * statesSquare +  l * states + k];
+                }
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            #pragma unroll(10)
+            for (int j = 0; j < span; j += 8)
+            {
+                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+            }
+
+            /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
+            double* uX1 = &umpX1[span * tipX1[i]];
+            double uX2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            double* v3 = &(x3[span * i]);
+
+            const double* v2 = &(x2[span * i]);
+
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX2[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+				#pragma unroll(10)
+            	for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
+            }
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+				#pragma unroll(10)
+            	for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+
+            // vmax = largest absolute value in v3, taken 8 doubles at a time
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax = _mm512_reduce_gmax_pd(t1);
+            // remaining 8-wide chunks of v3
+            for (int l = 8; l < span; l += 8)
+            {
+                __m512d t = _mm512_load_pd(&v3[l]);
+                t = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t), absMask_MIC));
+                double vmax2 = _mm512_reduce_gmax_pd(t);
+                vmax = PLL_MAX(vmax, vmax2);
+            }
+            // rescale the whole site vector when every entry is below PLL_MINLIKELIHOOD and record the site weight
+            if (vmax < PLL_MINLIKELIHOOD)
+            {
+                #pragma vector aligned nontemporal
+                for(int l = 0; l < span; l++)
+                  v3[l] *= PLL_TWOTOTHE256;
+
+                addScale += wgt[i];
+            }
+        } // site loop
+
+      }
+      break;
+    case PLL_INNER_INNER:
+    {
+      /* same as above, without pre-computations */
+
+        // re-arrange right matrix for better memory layout
+        double aLeft[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double aRight[4 * statesSquare] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int k = 0; k < states; k++)
+            {
+                for(int l = 0; l < states; l++)
+                {
+                    aLeft[k * span + j * states + l] = left[j * statesSquare + l * states + k];
+                    aRight[k * span + j * states + l] = right[j * statesSquare + l * states + k];
+                }
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+
+            #pragma unroll(10)
+            for (int j = 0; j < span; j += 8)
+            {
+                _mm_prefetch((const char *)&x1[span*(i+1) + j], _MM_HINT_T1);
+                _mm_prefetch((const char *)&x2[span*(i+1) + j], _MM_HINT_T1);
+            }
+
+
+            double uX1[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            double* v3 = &(x3[span * i]);
+
+            const double* v1 = &(x1[span * i]);
+            const double* v2 = &(x2[span * i]);
+
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX1[l] = 0.;
+                uX2[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+				#pragma unroll(10)
+            	for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aRight[span*(k+1) + j], _MM_HINT_T0);
+                    _mm_prefetch((const char *)&aLeft[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&v1[k], uX1, &aLeft[k * span]);
+                mic_fma4x80(&v2[k], uX2, &aRight[k * span]);
+            }
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < span; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            for(int k = 0; k < states; ++k)
+            {
+				#pragma unroll(10)
+            	for (int j = 0; j < span; j += 8)
+                {
+                    _mm_prefetch((const char *)&aEV[span*(k+1) + j], _MM_HINT_T0);
+                }
+
+                mic_fma4x80(&uX[k], v3, &aEV[k * span]);
+            }
+            // vmax = largest absolute value in v3, taken 8 doubles at a time
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax = _mm512_reduce_gmax_pd(t1);
+            // remaining 8-wide chunks of v3
+            for (int l = 8; l < span; l += 8)
+            {
+                __m512d t = _mm512_load_pd(&v3[l]);
+                t = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t), absMask_MIC));
+                double vmax2 = _mm512_reduce_gmax_pd(t);
+                vmax = PLL_MAX(vmax, vmax2);
+            }
+            // rescale the whole site vector when every entry is below PLL_MINLIKELIHOOD and record the site weight
+            if (vmax < PLL_MINLIKELIHOOD)
+            {
+                #pragma vector aligned nontemporal
+                for(int l = 0; l < span; l++)
+                  v3[l] *= PLL_TWOTOTHE256;
+
+                addScale += wgt[i];
+            }
+        }
+    } break;
+    default:
+//      assert(0);
+      break;
+  }
+
+  *scalerIncrement = addScale;
+
+}
+// LG4 log-likelihood sum over n sites: for each site i this forms
+// term = sum over j < span of x1[j] * x2[j] * diagptable[j] and accumulates wgt[i] * log(0.25 * |term|).
+// x1 is a row of a tip lookup table selected by tipX1[i] when tipX1 is non-NULL, otherwise a slice of x1_start.
+double evaluateGTRGAMMAPROT_LG4_MIC(int *wgt, double *x1_start, double *x2_start, double *tipVector[4],
+                 unsigned char *tipX1, const int n, double *diagptable)
+{
+    double sum = 0.0;
+
+    /* the left node is a tip */
+    if(tipX1)
+    {
+        double aTipVec[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int k = 0; k < maxStateValue; k++) // same bound as the table build in sumGTRGAMMAPROT_LG4_MIC
+        {
+            for(int j = 0; j < 4; j++) // slot j of row k is filled from tipVector[j]
+            {
+				for(int l = 0; l < states; l++)
+				{
+					aTipVec[k*span + j*states + l] = tipVector[j][k*states + l];
+				}
+            }
+        }
+
+        /* loop over the sites of this partition */
+        for (int i = 0; i < n; i++)
+        {
+			/* access pre-computed tip vector values via a lookup table */
+			const double *x1 = &(aTipVec[span * tipX1[i]]);
+			/* access the other(inner) node at the other end of the branch */
+			const double *x2 = &(x2_start[span * i]);
+
+			#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+			}
+
+			double term = 0.;
+
+			#pragma ivdep
+			#pragma vector aligned
+			#pragma noprefetch x2
+			for(int j = 0; j < span; j++) {
+			  term += x1[j] * x2[j] * diagptable[j];
+			}
+
+			term = log(0.25 * fabs(term));
+
+			sum += wgt[i] * term;
+        }
+    }
+    else
+    {
+        for (int i = 0; i < n; i++)
+        {
+			#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
+
+				_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+			}
+
+			const double *x1 = &(x1_start[span * i]);
+			const double *x2 = &(x2_start[span * i]);
+
+			double term = 0.;
+
+			#pragma ivdep
+			#pragma vector aligned
+			#pragma noprefetch x1 x2
+			for(int j = 0; j < span; j++)
+			  term += x1[j] * x2[j] * diagptable[j];
+
+			term = log(0.25 * fabs(term));
+
+			sum += wgt[i] * term;
+        }
+    }
+
+    return sum;
+}
+// LG4 variant: fills sumtable[i * span + l] (i < n, l < span) with the element-wise product of the two per-site vectors selected by tipCase.
+void sumGTRGAMMAPROT_LG4_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector[4],
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{
+    double aTipVec[1840] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    for(int k = 0; k < maxStateValue; k++) // NOTE(review): built for every tipCase, although PLL_INNER_INNER never reads it
+    {
+        for(int j = 0; j < 4; j++) // slot j of row k is filled from tipVector[j]
+        {
+			for(int l = 0; l < states; l++)
+			{
+				aTipVec[k*span + j*states + l] = tipVector[j][k*states + l];
+			}
+        }
+    }
+
+    switch(tipCase)
+    {
+      case PLL_TIP_TIP:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            const double *left  = &(aTipVec[span * tipX1[i]]); // both operands come from the tip table
+            const double *right = &(aTipVec[span * tipX2[i]]);
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+                sumtable[i * span + l] = left[l] * right[l];
+            }
+        }
+      } break;
+      case PLL_TIP_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+			#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+			}
+
+          const double *left = &(aTipVec[span * tipX1[i]]); // tip side: table lookup
+          const double *right = &(x2_start[span * i]); // inner side: slice of x2_start
+
+          #pragma ivdep
+          #pragma vector aligned nontemporal
+		  #pragma noprefetch right
+          for(int l = 0; l < span; l++)
+          {
+              sumtable[i * span + l] = left[l] * right[l];
+          }
+        }
+      } break;
+      case PLL_INNER_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+			#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &x1_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x1_start[span*(i+1) + k], _MM_HINT_T0);
+
+				_mm_prefetch((const char *) &x2_start[span*(i+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &x2_start[span*(i+1) + k], _MM_HINT_T0);
+			}
+
+            const double *left  = &(x1_start[span * i]); // both operands are slices of the inner vectors
+            const double *right = &(x2_start[span * i]);
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+			#pragma noprefetch left right
+            for(int l = 0; l < span; l++)
+            {
+                sumtable[i * span + l] = left[l] * right[l];
+            }
+        }
+      } break;
+  //    default: (any other tipCase value leaves sumtable untouched)
+  //      assert(0);
+    }
+}
+// LG4 variant: accumulates d(lnL)/d(lz) and d2(lnL)/d(lz)^2 over all sites and writes them to *ext_dlnLdlz and *ext_d2lnLdlz2.
+void coreGTRGAMMAPROT_LG4_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN[4], double *gammaRates, double lz, int *wgt)
+{
+    double diagptable0[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable1[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable2[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable01[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable02[span] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    // Inputs read below: sumtable (span doubles per site), wgt (one weight per site), gammaRates[0..3], EIGN[i][1..states-1] for rate i.
+    /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
+    // diagptable0 = exp(EIGN[i] * rate * lz), diagptable1 = EIGN[i] * rate, diagptable2 = (EIGN[i] * rate)^2; slot 0 of each rate is 1 / 0 / 0.
+    for(int i = 0; i < 4; i++)
+    {
+        const double ki = gammaRates[i];
+        const double kisqr = ki * ki;
+
+        diagptable0[i*states] = 1.;
+        diagptable1[i*states] = 0.;
+        diagptable2[i*states] = 0.;
+
+        for(int l = 1; l < states; l++)
+        {
+          diagptable0[i * states + l]  = exp(EIGN[i][l] * ki * lz);
+          diagptable1[i * states + l] = EIGN[i][l] * ki;
+          diagptable2[i * states + l] = EIGN[i][l] * EIGN[i][l] * kisqr;
+        }
+    }
+
+    #pragma ivdep
+    for(int i = 0; i < span; i++)
+    {
+        diagptable01[i] = diagptable0[i] * diagptable1[i];
+        diagptable02[i] = diagptable0[i] * diagptable2[i];
+    }
+
+    /* loop over sites in this partition */
+    // Sites are handled in blocks of 8. NOTE(review): the last block reads sumtable/wgt past 'upper' when upper is not a multiple of 8 -- assumes padded buffers, confirm with caller.
+    const int aligned_width = upper % 8 == 0 ? upper / 8 : upper / 8 + 1;
+
+    double dlnLdlz = 0.;
+    double d2lnLdlz2 = 0.;
+
+    // the write mask for each site's output lane is derived from 'mask' inside the block loop
+
+    for (int i = 0; i < aligned_width; i++)
+    {
+        /* access the array with pre-computed values */
+        const double *sum = &sumtable[i * span * 8];
+
+        /* initial per-site likelihood and 1st and 2nd derivatives */
+
+        double invBuf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double d1Buf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double d2Buf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        // zero-initialised so the first masked move below never reads an indeterminate register
+        __m512d invVec = _mm512_setzero_pd();
+        __m512d d1Vec = _mm512_setzero_pd();
+        __m512d d2Vec = _mm512_setzero_pd();
+        int mask = 0x01; // one-hot lane selector; shifted left once per site
+
+        #pragma noprefetch sum
+        #pragma unroll(8)
+        for(int j = 0; j < 8; j++)
+        {
+
+        	#pragma unroll(10)
+			for (int k = 0; k < span; k += 8)
+			{
+				_mm_prefetch((const char *) &sum[span*(j+2) + k], _MM_HINT_T1);
+				_mm_prefetch((const char *) &sum[span*(j+1) + k], _MM_HINT_T0);
+			}
+
+            __m512d inv_1 = _mm512_setzero_pd();
+            __m512d d1_1 = _mm512_setzero_pd();
+            __m512d d2_1 = _mm512_setzero_pd();
+
+            for (int offset = 0; offset < span; offset += 8)
+            {
+                __m512d d0_1 = _mm512_load_pd(&diagptable0[offset]);
+                __m512d d01_1 = _mm512_load_pd(&diagptable01[offset]);
+                __m512d d02_1 = _mm512_load_pd(&diagptable02[offset]);
+                __m512d s_1 = _mm512_load_pd(&sum[j*span + offset]);
+
+                inv_1 = _mm512_fmadd_pd(d0_1, s_1, inv_1);
+                d1_1 = _mm512_fmadd_pd(d01_1, s_1, d1_1);
+                d2_1 = _mm512_fmadd_pd(d02_1, s_1, d2_1);
+            }
+
+            __mmask8 laneMask = _mm512_int2mask(mask);
+            mask <<= 1;
+
+            // reduce: add up the 8 lanes (total ends up in every lane), then keep it in lane j of the accumulator
+            inv_1 = _mm512_add_pd (inv_1, _mm512_swizzle_pd(inv_1, _MM_SWIZ_REG_CDAB));
+            inv_1 = _mm512_add_pd (inv_1, _mm512_swizzle_pd(inv_1, _MM_SWIZ_REG_BADC));
+            inv_1 = _mm512_add_pd (inv_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(inv_1), _MM_PERM_BADC)));
+            invVec = _mm512_mask_mov_pd(invVec, laneMask, inv_1);
+
+            d1_1 = _mm512_add_pd (d1_1, _mm512_swizzle_pd(d1_1, _MM_SWIZ_REG_CDAB));
+            d1_1 = _mm512_add_pd (d1_1, _mm512_swizzle_pd(d1_1, _MM_SWIZ_REG_BADC));
+            d1_1 = _mm512_add_pd (d1_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d1_1), _MM_PERM_BADC)));
+            d1Vec = _mm512_mask_mov_pd(d1Vec, laneMask, d1_1);
+
+            d2_1 = _mm512_add_pd (d2_1, _mm512_swizzle_pd(d2_1, _MM_SWIZ_REG_CDAB));
+            d2_1 = _mm512_add_pd (d2_1, _mm512_swizzle_pd(d2_1, _MM_SWIZ_REG_BADC));
+            d2_1 = _mm512_add_pd (d2_1, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d2_1), _MM_PERM_BADC)));
+            d2Vec = _mm512_mask_mov_pd(d2Vec, laneMask, d2_1);
+        }
+
+        _mm512_store_pd(&invBuf[0], invVec);
+        _mm512_store_pd(&d1Buf[0], d1Vec);
+        _mm512_store_pd(&d2Buf[0], d2Vec);
+
+        #pragma ivdep
+        #pragma vector aligned
+        for (int j = 0; j < 8; ++j)
+        {
+            const double inv_Li = 1.0 / invBuf[j];
+
+            const double d1 = d1Buf[j] * inv_Li;
+            const double d2 = d2Buf[j] * inv_Li;
+
+            dlnLdlz += wgt[i * 8 + j] * d1;
+            d2lnLdlz2 += wgt[i * 8 + j] * (d2 - d1 * d1);
+        }
+    } // site loop
+
+    *ext_dlnLdlz   = dlnLdlz;
+    *ext_d2lnLdlz2 = d2lnLdlz2;
+}
+
diff --git a/pllrepo/src/mic_native_dna.c b/pllrepo/src/mic_native_dna.c
new file mode 100644
index 0000000..6dd6631
--- /dev/null
+++ b/pllrepo/src/mic_native_dna.c
@@ -0,0 +1,676 @@
+#include <omp.h>
+#include <immintrin.h>
+#include <string.h>
+#include <math.h>
+
+#include "pll.h"
+#include "mic_native.h"
+
+static const int states = 4;
+static const int statesSquare = 16;
+static const int span = 4 * 4;
+static const int maxStateValue = 16;
+
+__inline void mic_broadcast16x64(const double* inv, double* outv)
+{
+    /* Expand 16 doubles to 64: outv[4*m .. 4*m+3] all receive inv[(m%4)*4 + m/4]; two values of m are written per 512-bit store. */
+    const __mmask8 lowHalf = _mm512_int2mask(0x0F), highHalf = _mm512_int2mask(0xF0);
+    for(int even = 0; even < 16; even += 2)
+    {
+        const int odd = even + 1;
+        __m512d lanes = _mm512_mask_extload_pd(_mm512_setzero_pd(), lowHalf, &inv[(even%4)*4 + even/4], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        lanes = _mm512_mask_extload_pd(lanes, highHalf, &inv[(odd%4)*4 + odd/4], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
+        /* mask 0x0F filled the low four lanes with the 'even' element, mask 0xF0 the high four with the 'odd' one */
+        _mm512_store_pd(&outv[even*4], lanes);
+    }
+}
+
+void newviewGTRGAMMA_MIC(int tipCase,
+                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{   /* Fills x3 (16 doubles per site, n sites) from the two children; tipCase says which of them are tips (tipX1/tipX2) or inner vectors (x1/x2). */
+    /* PLL_TIP_INNER and PLL_INNER_INNER multiply a site by PLL_TWOTOTHE256 when all 16 results are below PLL_MINLIKELIHOOD in magnitude. */
+    __m512d twotothe256_MIC = _mm512_set1_pd(PLL_TWOTOTHE256);
+    __m512i absMask_MIC = _mm512_set1_epi64(0x7fffffffffffffffULL);  /* every bit except the sign bit of a double */
+    /* rescaling events: counted per site in ex3[i] when !fastScaling, otherwise summed (weighted by wgt[i]) and stored to *scalerIncrement */
+    int addScale = 0;
+
+    double aEV[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    /* aEV slice j (16 entries) repeats extEV[4*j .. 4*j+3] four times, matching the layout produced by mic_broadcast16x64 */
+    #pragma ivdep
+    for (int l = 0; l < 64; ++l)
+    {
+        aEV[l] = extEV[(l / 16) * 4 + (l % 4)];
+    }
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        /* multiply all possible tip state vectors with the respective P-matrices
+        */
+        /* umpX1/umpX2: one row of 16 doubles per tip code (maxStateValue codes), indexed later by tipX1[i]/tipX2[i] */
+            double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double umpX2[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            for(int k = 0; k < 256; ++k)
+            {
+                umpX1[k] = 0.0;
+                umpX2[k] = 0.0;
+            }
+
+            for(int i = 0; i < maxStateValue; ++i)
+            {
+              for(int l = 0; l < states; ++l)
+              {
+                  #pragma ivdep
+                  for(int k = 0; k < span; ++k)
+                  {
+                      umpX1[16 * i + k] +=  tipVector[i * 4 + l] *  left[k * 4 + l];
+                      umpX2[16 * i + k] +=  tipVector[i * 4 + l] * right[k * 4 + l];
+                  }
+              }
+            }
+
+        double auX[64] __attribute__((align(64)));
+
+        for(int i = 0; i < n; ++i)
+        {
+            _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);   /* prefetch the output 8 sites and 1 site ahead */
+            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
+
+            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
+            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);
+
+            const double *uX1 = &umpX1[16 * tipX1[i]];
+            const double *uX2 = &umpX2[16 * tipX2[i]];
+
+            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double* v = &x3[i * 16];
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < 16; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v[l] = 0.;
+            }
+
+            mic_broadcast16x64(uX, auX);
+            /* v[k] = sum over the four slices j of auX[j*16 + k] * aEV[j*16 + k] */
+            for (int j = 0; j < 4; ++j)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                #pragma vector nontemporal
+                for(int k = 0; k < 16; ++k)
+                {
+                    v[k] += auX[j*16 + k] * aEV[j*16 + k];
+                }
+            }
+
+            // init scaling counter for the site (no magnitude check is done in the tip/tip case)
+            if (!fastScaling)
+                ex3[i] = 0;
+
+        } // sites loop
+
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        /* we do analogous pre-computations as above, with the only difference that we now do them
+        only for one tip vector */
+
+          double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+        /* precompute P and left tip vector product */
+
+        for(int k = 0; k < 256; ++k)
+        {
+            umpX1[k] = 0.0;
+        }
+
+        for(int i = 0; i < 16; ++i)
+        {
+          for(int l = 0; l < 4; ++l)
+          {
+              #pragma ivdep
+              for(int k = 0; k < 16; ++k)
+              {
+                  umpX1[16 * i + k] +=  tipVector[i * 4 + l] *  left[k * 4 + l];
+              }
+          }
+        }
+
+        // re-arrange right matrix for better memory layout: aRight[j*16 + l] = right[l*4 + j]
+        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int l = 0; l < 16; l++)
+            {
+                aRight[j*16 + l] = right[l*4 + j];
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x2[span*(i+16)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2[span*(i+16) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x3[span*(i+16)], _MM_HINT_ET1);
+            _mm_prefetch((const char*) &x3[span*(i+16) + 8], _MM_HINT_ET1);
+
+            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
+            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);
+
+            /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
+            double* uX1 = &umpX1[span * tipX1[i]];
+            double uX2[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+            #pragma vector aligned
+            for(int l = 0; l < 16; ++l)
+            {
+                uX2[l] = 0.;
+            }
+
+            double aV2[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            const double* v2 = &(x2[16 * i]);
+
+            mic_broadcast16x64(v2, aV2);
+            /* uX2[l] = sum over j of aV2[j*16 + l] * aRight[j*16 + l] (inner child times re-arranged right matrix) */
+            for(int j = 0; j < 4; j++)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                for(int l = 0; l < 16; l++)
+                {
+                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
+                }
+            }
+
+            double* v3 = &(x3[span * i]);
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < 16; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            double auX[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+            mic_broadcast16x64(uX, auX);
+
+            for (int j = 0; j < 4; ++j)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                for(int k = 0; k < 16; ++k)
+                {
+                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
+                }
+            }
+            /* magnitude test: largest |v3[k]| of each 8-element half */
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax1 = _mm512_reduce_gmax_pd(t1);
+            __m512d t2 = _mm512_load_pd(&v3[8]);
+            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
+            double vmax2 = _mm512_reduce_gmax_pd(t2);
+
+            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
+            {   /* t1/t2 only hold magnitudes (sign bit masked off above): rescale the signed values reloaded from v3 */
+                t1 = _mm512_mul_pd(_mm512_load_pd(&v3[0]), twotothe256_MIC);
+                _mm512_store_pd(&v3[0], t1);
+                t2 = _mm512_mul_pd(_mm512_load_pd(&v3[8]), twotothe256_MIC);
+                _mm512_store_pd(&v3[8], t2);
+                /* NOTE(review): ex3[i] is only incremented in this case, presumably the caller initialises it - confirm */
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+            }
+        } // site loop
+      }
+      break;
+    case PLL_INNER_INNER:
+    {
+      /* same as above, without pre-computations */
+
+        // re-arrange left and right matrices for better memory layout
+        double aLeft[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int j = 0; j < 4; j++)
+        {
+            for(int l = 0; l < 16; l++)
+            {
+                aLeft[j*16 + l] = left[l*4 + j];
+                aRight[j*16 + l] = right[l*4 + j];
+            }
+        }
+
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x1[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x1[span*(i+8) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2[span*(i+8) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
+            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
+
+            _mm_prefetch((const char*) &x1[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x1[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
+            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);
+
+            double uX1[16] __attribute__((align(64)));
+            double uX2[16] __attribute__((align(64)));
+            double uX[16] __attribute__((align(64)));
+
+            for(int l = 0; l < 16; l++)
+            {
+              uX1[l] = 0.;
+              uX2[l] = 0.;
+            }
+
+            double aV1[64] __attribute__((align(64)));
+            double aV2[64] __attribute__((align(64)));
+
+            const double* v1 = &(x1[span * i]);
+            const double* v2 = &(x2[span * i]);
+
+            mic_broadcast16x64(v1, aV1);
+
+            mic_broadcast16x64(v2, aV2);
+            /* both children are inner vectors: each is multiplied with its re-arranged matrix */
+            for(int j = 0; j < 4; j++)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                for(int l = 0; l < 16; l++)
+                {
+                    uX1[l] += aV1[j*16 + l] * aLeft[j*16 + l];
+                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
+                }
+            }
+
+            double* v3 =  &(x3[span * i]);
+
+            #pragma ivdep
+            #pragma vector aligned
+            for(int l = 0; l < 16; ++l)
+            {
+                uX[l] = uX1[l] * uX2[l];
+                v3[l] = 0.;
+            }
+
+            double auX[64] __attribute__((align(64)));
+            mic_broadcast16x64(uX, auX);
+
+            for(int j = 0; j < 4; ++j)
+            {
+                #pragma ivdep
+                #pragma vector aligned
+                for(int k = 0; k < 16; ++k)
+                {
+                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
+                }
+            }
+
+            /* magnitude test: largest |v3[k]| of each 8-element half */
+            __m512d t1 = _mm512_load_pd(&v3[0]);
+            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
+            double vmax1 = _mm512_reduce_gmax_pd(t1);
+            __m512d t2 = _mm512_load_pd(&v3[8]);
+            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
+            double vmax2 = _mm512_reduce_gmax_pd(t2);
+
+            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
+            {   /* t1/t2 only hold magnitudes (sign bit masked off above): rescale the signed values reloaded from v3 */
+                t1 = _mm512_mul_pd(_mm512_load_pd(&v3[0]), twotothe256_MIC);
+                _mm512_store_pd(&v3[0], t1);
+                t2 = _mm512_mul_pd(_mm512_load_pd(&v3[8]), twotothe256_MIC);
+                _mm512_store_pd(&v3[8], t2);
+                /* NOTE(review): ex3[i] is only incremented in this case, presumably the caller initialises it - confirm */
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+            }
+        }
+    } break;
+    default:
+//      assert(0);
+      break;
+  }
+
+  /* with fastScaling, report the wgt-weighted number of scaling multiplications carried out
+     for this likelihood array; *scalerIncrement is overwritten, not added to */
+
+  if (fastScaling)
+  {
+      *scalerIncrement = addScale;
+  }
+
+}
+
+double evaluateGTRGAMMA_MIC(int *ex1, int *ex2, int *wgt,
+                 double *x1_start, double *x2_start,
+                 double *tipVector,
+                 unsigned char *tipX1, const int n, double *diagptable, const pllBoolean fastScaling)
+{   /* Returns sum over sites i < n of wgt[i] * log(0.25 * |term_i|), term_i = sum_{j < span} x1[j] * x2[j] * diagptable[j]. */
+    double sum = 0.0;   /* when !fastScaling each site additionally gets (its scaling count) * log(PLL_MINLIKELIHOOD) */
+
+    /* the left node is a tip */
+    if(tipX1)
+    {
+        /* lookup table: for each of the 16 tip codes, its 4 tipVector entries repeated 4 times (16 doubles per code) */
+        double aTipVec[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+        for(int k = 0; k < 16; k++)
+        {
+            for(int l = 0; l < 4; l++)
+            {
+                aTipVec[k*16 + l] = aTipVec[k*16 + 4 + l] = aTipVec[k*16 + 8 + l] = aTipVec[k*16 + 12 + l] = tipVector[k*4 + l];
+            }
+        }
+
+        /* loop over the sites of this partition */
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x2_start[span*(i+8)], _MM_HINT_T1);   /* prefetch 8 sites and 1 site ahead */
+            _mm_prefetch((const char*) &x2_start[span*(i+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char*) &x2_start[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+1) + 8], _MM_HINT_T0);
+
+          /* access pre-computed tip vector values via a lookup table */
+          const double *x1 = &(aTipVec[16 * tipX1[i]]);
+          /* access the other(inner) node at the other end of the branch */
+          const double *x2 = &(x2_start[span * i]);
+
+          double term = 0.;
+
+          #pragma ivdep
+          #pragma vector aligned
+          for(int j = 0; j < span; j++)
+              term += x1[j] * x2[j] * diagptable[j];
+          /* fabs() keeps log() defined for a negative term; same guard in every branch of this function */
+          if(!fastScaling)
+              term = log(0.25 * fabs(term)) + (ex2[i] * log(PLL_MINLIKELIHOOD));   /* only the inner node carries a scaling count here */
+          else
+              term = log(0.25 * fabs(term));
+
+          sum += wgt[i] * term;
+        }
+    }
+    else
+    {   /* both ends of the branch are inner nodes: read x1_start and x2_start directly */
+        for (int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x1_start[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x1_start[span*(i+8) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2_start[span*(i+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2_start[span*(i+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char*) &x1_start[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x1_start[span*(i+1) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+1) + 8], _MM_HINT_T0);
+
+          const double *x1 = &(x1_start[span * i]);
+          const double *x2 = &(x2_start[span * i]);
+
+          double term = 0.;
+
+          #pragma ivdep
+          #pragma vector aligned
+          for(int j = 0; j < span; j++)
+              term += x1[j] * x2[j] * diagptable[j];
+
+          if(!fastScaling)
+              term = log(0.25 * fabs(term)) + ((ex1[i] + ex2[i]) * log(PLL_MINLIKELIHOOD));   /* scaling counts of both ends */
+          else
+              term = log(0.25 * fabs(term));
+
+          sum += wgt[i] * term;
+        }
+    }
+
+    return sum;
+}
+
+void sumGTRGAMMA_MIC(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
+    unsigned char *tipX1, unsigned char *tipX2, int n)
+{   /* For each site i < n, stores the element-wise product of the two branch ends: sumtable[i*span + l] = left[l] * right[l]. */
+	double aTipVec[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));   /* per tip code k: 16 doubles = tipVector[4k .. 4k+3] repeated 4 times */
+    for(int k = 0; k < 16; k++)
+    {
+        for(int l = 0; l < 4; l++)
+        {
+            aTipVec[k*16 + l] = aTipVec[k*16 + 4 + l] = aTipVec[k*16 + 8 + l] = aTipVec[k*16 + 12 + l] = tipVector[k*4 + l];
+        }
+    }
+    /* tipCase decides where each end comes from: an aTipVec row indexed by tipX1[i]/tipX2[i] for a tip, x1_start/x2_start for an inner node */
+    switch(tipCase)
+    {
+      case PLL_TIP_TIP:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            const double *left  = &(aTipVec[16 * tipX1[i]]);
+            const double *right = &(aTipVec[16 * tipX2[i]]);
+            double* sum = &sumtable[i * span];
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+              sum[l] = left[l] * right[l];
+            }
+        }
+      } break;
+      case PLL_TIP_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+          _mm_prefetch((const char*) &x2_start[span*(i+32)], _MM_HINT_T1);   /* prefetch the inner vector 32 sites ahead ... */
+          _mm_prefetch((const char*) &x2_start[span*(i+32) + 8], _MM_HINT_T1);
+
+          _mm_prefetch((const char*) &x2_start[span*(i+4)], _MM_HINT_T0);    /* ... and 4 sites ahead */
+          _mm_prefetch((const char*) &x2_start[span*(i+4) + 8], _MM_HINT_T0);
+
+          const double *left = &(aTipVec[16 * tipX1[i]]);   /* left end is the tip */
+          const double *right = &(x2_start[span * i]);      /* right end is the inner node */
+          double* sum = &sumtable[i * span];
+
+          #pragma ivdep
+          #pragma vector aligned nontemporal
+          for(int l = 0; l < span; l++)
+          {
+              sum[l] = left[l] * right[l];
+          }
+        }
+      } break;
+      case PLL_INNER_INNER:
+      {
+        for(int i = 0; i < n; i++)
+        {
+            _mm_prefetch((const char*) &x1_start[span*(i+32)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x1_start[span*(i+32) + 8], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2_start[span*(i+32)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &x2_start[span*(i+32) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char*) &x1_start[span*(i+4)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x1_start[span*(i+4) + 8], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+4)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &x2_start[span*(i+4) + 8], _MM_HINT_T0);
+
+            const double *left  = &(x1_start[span * i]);
+            const double *right = &(x2_start[span * i]);
+            double* sum = &sumtable[i * span];
+
+            #pragma ivdep
+            #pragma vector aligned nontemporal
+            for(int l = 0; l < span; l++)
+            {
+                sum[l] = left[l] * right[l];
+            }
+        }
+      } break;
+  //    default: (disabled) any other tipCase value falls through the switch and leaves sumtable untouched
+  //      assert(0);
+    }
+}
+
+void coreGTRGAMMA_MIC(const int upper, double *sumtable,
+    volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *EIGN, double *gammaRates, double lz, int *wgt)
+{   /* Sums wgt-weighted per-site terms d1 and (d2 - d1*d1) over the sites in sumtable and stores them to *ext_dlnLdlz / *ext_d2lnLdlz2. */
+    double diagptable0[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable1[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable2[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable01[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double diagptable02[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+
+    /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
+    /* per rate i and state l: table0 = exp(EIGN[l]*ki*lz), table1 = EIGN[l]*ki, table2 = (EIGN[l]*ki)^2; state 0 is fixed to 1, 0, 0 */
+    for(int i = 0; i < 4; i++)
+    {
+        const double ki = gammaRates[i];
+        const double kisqr = ki * ki;
+
+        diagptable0[i*4] = 1.;
+        diagptable1[i*4] = 0.;
+        diagptable2[i*4] = 0.;
+
+        for(int l = 1; l < states; l++)
+        {
+          diagptable0[i * 4 + l]  = exp(EIGN[l] * ki * lz);
+          diagptable1[i * 4 + l] = EIGN[l] * ki;
+          diagptable2[i * 4 + l] = EIGN[l] * EIGN[l] * kisqr;
+        }
+    }
+    /* products used for the first (table01) and second (table02) derivative sums */
+    #pragma ivdep
+    for(int i = 0; i < 16; i++)
+    {
+        diagptable01[i] = diagptable0[i] * diagptable1[i];
+        diagptable02[i] = diagptable0[i] * diagptable2[i];
+    }
+
+    /* loop over sites in this partition */
+    /* sites are handled in blocks of 8; aligned_width = ceil(upper / 8) blocks */
+    const int aligned_width = upper % 8 == 0 ? upper / 8 : upper / 8 + 1;
+    /* NOTE(review): sumtable and wgt are read for aligned_width*8 sites, i.e. past 'upper' unless it is a multiple of 8 - relies on caller-side padding, confirm */
+    double dlnLBuf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    double d2lnLBuf[8] __attribute__((align(PLL_BYTE_ALIGNMENT)));
+    for (int j = 0; j < 8; ++j)
+    {
+        dlnLBuf[j] = 0.;
+        d2lnLBuf[j] = 0.;
+    }
+
+    /* dlnLBuf[j] / d2lnLBuf[j] accumulate the contributions of the j-th site of every block; they are summed at the end */
+
+    for (int i = 0; i < aligned_width; i++)
+    {
+        _mm_prefetch((const char*) &sumtable[i * span * 8], _MM_HINT_T0);
+        _mm_prefetch((const char*) &sumtable[i * span * 8 + 8], _MM_HINT_T0);
+
+        /* access the array with pre-computed values */
+        const double *sum = &sumtable[i * span * 8];
+
+        /* initial per-site likelihood and 1st and 2nd derivatives */
+
+        double invBuf[8] __attribute__((align(64)));
+        double d1Buf[8] __attribute__((align(64)));
+        double d2Buf[8] __attribute__((align(64)));
+        /* zero-initialised because _mm512_mask_mov_pd below reads its first operand; lane j is filled in iteration j */
+        __m512d invVec = _mm512_setzero_pd();
+        __m512d d1Vec = _mm512_setzero_pd();
+        __m512d d2Vec = _mm512_setzero_pd();
+        int mask = 0x01;
+
+        #pragma noprefetch sum
+        #pragma unroll(8)
+        for(int j = 0; j < 8; j++)
+        {
+            _mm_prefetch((const char*) &sum[span*(j+8)], _MM_HINT_T1);
+            _mm_prefetch((const char*) &sum[span*(j+8) + 8], _MM_HINT_T1);
+
+            _mm_prefetch((const char*) &sum[span*(j+1)], _MM_HINT_T0);
+            _mm_prefetch((const char*) &sum[span*(j+1) + 8], _MM_HINT_T0);
+
+            __m512d d0_1 = _mm512_load_pd(&diagptable0[0]);
+            __m512d d0_2 = _mm512_load_pd(&diagptable0[8]);
+
+            __m512d d01_1 = _mm512_load_pd(&diagptable01[0]);
+            __m512d d01_2 = _mm512_load_pd(&diagptable01[8]);
+
+            __m512d d02_1 = _mm512_load_pd(&diagptable02[0]);
+            __m512d d02_2 = _mm512_load_pd(&diagptable02[8]);
+            /* element-wise products of the site's 16 sumtable entries with each table, folded into 8 lanes */
+            __m512d s_1 = _mm512_load_pd(&sum[j*16]);
+            __m512d s_2 = _mm512_load_pd(&sum[j*16 + 8]);
+            __m512d inv_1 = _mm512_mul_pd(d0_1, s_1);
+            __m512d d1_1 = _mm512_mul_pd(d01_1, s_1);
+            __m512d d2_1 = _mm512_mul_pd(d02_1, s_1);
+
+            __m512d inv_2 = _mm512_fmadd_pd(d0_2, s_2, inv_1);
+            __m512d d1_2 = _mm512_fmadd_pd(d01_2, s_2, d1_1);
+            __m512d d2_2 = _mm512_fmadd_pd(d02_2, s_2, d2_1);
+
+            __mmask8 k1 = _mm512_int2mask(mask);   /* single bit j: selects the lane that receives this site's result */
+            mask <<= 1;
+
+            // reduce: three swap-and-add steps intended to leave the sum of all 8 lanes in every lane
+            inv_2 = _mm512_add_pd (inv_2, _mm512_swizzle_pd(inv_2, _MM_SWIZ_REG_CDAB));
+            inv_2 = _mm512_add_pd (inv_2, _mm512_swizzle_pd(inv_2, _MM_SWIZ_REG_BADC));
+            inv_2 = _mm512_add_pd (inv_2, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(inv_2), _MM_PERM_BADC)));
+            invVec = _mm512_mask_mov_pd(invVec, k1, inv_2);
+
+            d1_2 = _mm512_add_pd (d1_2, _mm512_swizzle_pd(d1_2, _MM_SWIZ_REG_CDAB));
+            d1_2 = _mm512_add_pd (d1_2, _mm512_swizzle_pd(d1_2, _MM_SWIZ_REG_BADC));
+            d1_2 = _mm512_add_pd (d1_2, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d1_2), _MM_PERM_BADC)));
+            d1Vec = _mm512_mask_mov_pd(d1Vec, k1, d1_2);
+
+            d2_2 = _mm512_add_pd (d2_2, _mm512_swizzle_pd(d2_2, _MM_SWIZ_REG_CDAB));
+            d2_2 = _mm512_add_pd (d2_2, _mm512_swizzle_pd(d2_2, _MM_SWIZ_REG_BADC));
+            d2_2 = _mm512_add_pd (d2_2, _mm512_castsi512_pd(_mm512_permute4f128_epi32(_mm512_castpd_si512(d2_2), _MM_PERM_BADC)));
+            d2Vec = _mm512_mask_mov_pd(d2Vec, k1, d2_2);
+        }
+
+        _mm512_store_pd(&invBuf[0], invVec);
+        _mm512_store_pd(&d1Buf[0], d1Vec);
+        _mm512_store_pd(&d2Buf[0], d2Vec);
+        /* per site: normalise both derivative sums by the site value invBuf[j], then accumulate weighted by wgt */
+        #pragma ivdep
+        #pragma vector aligned
+        for (int j = 0; j < 8; ++j)
+        {
+            const double inv_Li = 1.0 / invBuf[j];
+
+            const double d1 = d1Buf[j] * inv_Li;
+            const double d2 = d2Buf[j] * inv_Li;
+
+            dlnLBuf[j] += wgt[i * 8 + j] * d1;
+            d2lnLBuf[j] += wgt[i * 8 + j] * (d2 - d1 * d1);
+        }
+    } // site loop
+
+    double dlnLdlz = 0.;
+    double d2lnLdlz2 = 0.;
+    for (int j = 0; j < 8; ++j)
+    {
+        dlnLdlz += dlnLBuf[j];
+        d2lnLdlz2 += d2lnLBuf[j];
+    }
+
+    *ext_dlnLdlz   = dlnLdlz;
+    *ext_d2lnLdlz2 = d2lnLdlz2;
+}
diff --git a/pllrepo/src/models.c b/pllrepo/src/models.c
new file mode 100644
index 0000000..7bc24ef
--- /dev/null
+++ b/pllrepo/src/models.c
@@ -0,0 +1,4377 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file models.c
+ *  
+ * @brief Model related code
+ *
+ * Detailed description to appear soon.
+ */ 
+
+
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+
+extern const unsigned int bitVectorSecondary[256];
+extern const unsigned int bitVector32[33];
+extern const unsigned int bitVectorAA[23];
+extern const unsigned int bitVectorIdentity[256];
+
+extern const partitionLengths pLengths[PLL_MAX_MODEL];
+
+
+
+extern FILE *byteFile;
+
+
+
+
+
+
+
+
+
+/** @brief Export the built-in WAG substitution rates
+ *
+ *  Writes the 190 pairwise rates of the 20-state WAG table to
+ *  \a ext_initialRates in the order (0,1), (0,2), ..., (0,19), (1,2), ...,
+ *  (18,19).  Every rate is multiplied by the reciprocal of the (18,19) rate,
+ *  so the last element written is that rate times its own reciprocal.
+ *
+ *  @param ext_initialRates
+ *    Output array; elements 0..189 are overwritten
+ */
+void putWAG(double *ext_initialRates)
+{
+  /* Strict lower triangle of the symmetric 20x20 table, stored row by row:
+     the value for the state pair (r,c) with r > c sits at index
+     r*(r-1)/2 + c. */
+  static const double
+    wagLower[190] =
+    {
+      /* row  1 */
+       55.15710,
+
+      /* row  2 */
+       50.98480,  63.53460,
+
+      /* row  3 */
+       73.89980,  14.73040, 542.94200,
+
+      /* row  4 */
+      102.70400,  52.81910,  26.52560,
+        3.02949,
+
+      /* row  5 */
+       90.85980, 303.55000, 154.36400,
+       61.67830,   9.88179,
+
+      /* row  6 */
+      158.28500,  43.91570,  94.71980,
+      617.41600,   2.13520, 546.94700,
+
+      /* row  7 */
+      141.67200,  58.46650, 112.55600,
+       86.55840,  30.66740,  33.00520,
+       56.77170,
+
+      /* row  8 */
+       31.69540, 213.71500, 395.62900,
+       93.06760,  24.89720, 429.41100,
+       57.00250,  24.94100,
+
+      /* row  9 */
+       19.33350,  18.69790,  55.42360,
+        3.94370,  17.01350,  11.39170,
+       12.73950,   3.04501,  13.81900,
+
+      /* row 10 */
+       39.79150,  49.76710,  13.15280,
+        8.48047,  38.42870,  86.94890,
+       15.42630,   6.13037,  49.94620,
+      317.09700,
+
+      /* row 11 */
+       90.62650, 535.14200, 301.20100,
+       47.98550,   7.40339, 389.49000,
+      258.44300,  37.35580,  89.04320,
+       32.38320,  25.75550,
+
+      /* row 12 */
+       89.34960,  68.31620,  19.82210,
+       10.37540,  39.04820, 154.52600,
+       31.51240,  17.41000,  40.41410,
+      425.74600, 485.40200,  93.42760,
+
+      /* row 13 */
+       21.04940,  10.27110,   9.61621,
+        4.67304,  39.80200,   9.99208,
+        8.11339,   4.99310,  67.93710,
+      105.94700, 211.51700,   8.88360,
+      119.06300,
+
+      /* row 14 */
+      143.85500,  67.94890,  19.50810,
+       42.39840,  10.94040,  93.33720,
+       68.23550,  24.35700,  69.61980,
+        9.99288,  41.58440,  55.68960,
+       17.13290,  16.14440,
+
+      /* row 15 */
+      337.07900, 122.41900, 397.42300,
+      107.17600, 140.76600, 102.88700,
+       70.49390, 134.18200,  74.01690,
+       31.94400,  34.47390,  96.71300,
+       49.39050,  54.59310, 161.32800,
+
+      /* row 16 */
+      212.11100,  55.44130, 203.00600,
+       37.48660,  51.29840,  85.79280,
+       82.27650,  22.58330,  47.33070,
+      145.81600,  32.66220, 138.69800,
+      151.61200,  17.19030,  79.53840,
+      437.80200,
+
+      /* row 17 */
+       11.31330, 116.39200,   7.19167,
+       12.97670,  71.70700,  21.57370,
+       15.65570,  33.69830,  26.25690,
+       21.24830,  66.53090,  13.75050,
+       51.57060, 152.96400,  13.94050,
+       52.37420,  11.08640,
+
+      /* row 18 */
+       24.07350,  38.15330, 108.60000,
+       32.57110,  54.38330,  22.77100,
+       19.63030,  10.36040, 387.34400,
+       42.01700,  39.86180,  13.32640,
+       42.84370, 645.42800,  21.60460,
+       78.69930,  29.11480, 248.53900,
+
+      /* row 19 */
+      200.60100,  25.18490,  19.62460,
+       15.23350, 100.21400,  30.12810,
+       58.87310,  18.72470,  11.83580,
+      782.13000, 180.03400,  30.54340,
+      205.84500,  64.98920,  31.48870,
+       23.27390, 138.82300,  36.53690,
+       31.47300
+    };
+
+  /* the final table entry, pair (19,18), is the normalisation reference */
+  const double
+    scaler = 1.0 / wagLower[19 * 18 / 2 + 18];
+
+  int
+    first,
+    second,
+    r = 0;
+
+  /* walk the upper triangle (first < second) row by row; by symmetry each
+     value is read from the lower-triangle slot of the pair (second, first) */
+  for(first = 0; first < 19; first++)
+    for(second = first + 1; second < 20; second++)
+      ext_initialRates[r++] = wagLower[second * (second - 1) / 2 + first] * scaler;
+
+}
+
+
+
+/** @brief Initialize protein substitution rates matrix 
+  * 
+  * Initialize the array pointed to by \a ext_initialRates with the substitution
+  * rates of the corresponding protein model and set f to the appropriate
+  * stationary frequencies
+  *
+  * @param f
+  *   Array in which to store the stationary frequencies
+  *
+  * @param proteinMatrix
+  *   Which protein matrix to use  
+  *
+  * @param ext_initialRates
+  *   Where to store the retrieved substitution rates
+  *
+  * @param lg4_index
+  *   When filling the substitution rate matrix of an LG4 model, this index
+  *   specifies which of the four matrices to use
+  *
+*/
+static void initProtMat(double f[20], int proteinMatrix, double *ext_initialRates, int lg4_index)
+{ 
+  double q[20][20];
+  double daa[400], max, temp;
+  int i, j, r;
+  double *initialRates = ext_initialRates;
+  double scaler;
+
+  {
+      switch(proteinMatrix)
+        {
+        case PLL_DAYHOFF:
+          {     
+            daa[ 1*20+ 0] =   27.00; daa[ 2*20+ 0] =   98.00; daa[ 2*20+ 1] =   32.00; daa[ 3*20+ 0] =  120.00;
+            daa[ 3*20+ 1] =    0.00; daa[ 3*20+ 2] =  905.00; daa[ 4*20+ 0] =   36.00; daa[ 4*20+ 1] =   23.00;
+            daa[ 4*20+ 2] =    0.00; daa[ 4*20+ 3] =    0.00; daa[ 5*20+ 0] =   89.00; daa[ 5*20+ 1] =  246.00;
+            daa[ 5*20+ 2] =  103.00; daa[ 5*20+ 3] =  134.00; daa[ 5*20+ 4] =    0.00; daa[ 6*20+ 0] =  198.00;
+            daa[ 6*20+ 1] =    1.00; daa[ 6*20+ 2] =  148.00; daa[ 6*20+ 3] = 1153.00; daa[ 6*20+ 4] =    0.00;
+            daa[ 6*20+ 5] =  716.00; daa[ 7*20+ 0] =  240.00; daa[ 7*20+ 1] =    9.00; daa[ 7*20+ 2] =  139.00;
+            daa[ 7*20+ 3] =  125.00; daa[ 7*20+ 4] =   11.00; daa[ 7*20+ 5] =   28.00; daa[ 7*20+ 6] =   81.00;
+            daa[ 8*20+ 0] =   23.00; daa[ 8*20+ 1] =  240.00; daa[ 8*20+ 2] =  535.00; daa[ 8*20+ 3] =   86.00;
+            daa[ 8*20+ 4] =   28.00; daa[ 8*20+ 5] =  606.00; daa[ 8*20+ 6] =   43.00; daa[ 8*20+ 7] =   10.00;
+            daa[ 9*20+ 0] =   65.00; daa[ 9*20+ 1] =   64.00; daa[ 9*20+ 2] =   77.00; daa[ 9*20+ 3] =   24.00;
+            daa[ 9*20+ 4] =   44.00; daa[ 9*20+ 5] =   18.00; daa[ 9*20+ 6] =   61.00; daa[ 9*20+ 7] =    0.00;
+            daa[ 9*20+ 8] =    7.00; daa[10*20+ 0] =   41.00; daa[10*20+ 1] =   15.00; daa[10*20+ 2] =   34.00;
+            daa[10*20+ 3] =    0.00; daa[10*20+ 4] =    0.00; daa[10*20+ 5] =   73.00; daa[10*20+ 6] =   11.00;
+            daa[10*20+ 7] =    7.00; daa[10*20+ 8] =   44.00; daa[10*20+ 9] =  257.00; daa[11*20+ 0] =   26.00;
+            daa[11*20+ 1] =  464.00; daa[11*20+ 2] =  318.00; daa[11*20+ 3] =   71.00; daa[11*20+ 4] =    0.00;
+            daa[11*20+ 5] =  153.00; daa[11*20+ 6] =   83.00; daa[11*20+ 7] =   27.00; daa[11*20+ 8] =   26.00;
+            daa[11*20+ 9] =   46.00; daa[11*20+10] =   18.00; daa[12*20+ 0] =   72.00; daa[12*20+ 1] =   90.00;
+            daa[12*20+ 2] =    1.00; daa[12*20+ 3] =    0.00; daa[12*20+ 4] =    0.00; daa[12*20+ 5] =  114.00;
+            daa[12*20+ 6] =   30.00; daa[12*20+ 7] =   17.00; daa[12*20+ 8] =    0.00; daa[12*20+ 9] =  336.00;
+            daa[12*20+10] =  527.00; daa[12*20+11] =  243.00; daa[13*20+ 0] =   18.00; daa[13*20+ 1] =   14.00;
+            daa[13*20+ 2] =   14.00; daa[13*20+ 3] =    0.00; daa[13*20+ 4] =    0.00; daa[13*20+ 5] =    0.00;
+            daa[13*20+ 6] =    0.00; daa[13*20+ 7] =   15.00; daa[13*20+ 8] =   48.00; daa[13*20+ 9] =  196.00;
+            daa[13*20+10] =  157.00; daa[13*20+11] =    0.00; daa[13*20+12] =   92.00; daa[14*20+ 0] =  250.00;
+            daa[14*20+ 1] =  103.00; daa[14*20+ 2] =   42.00; daa[14*20+ 3] =   13.00; daa[14*20+ 4] =   19.00;
+            daa[14*20+ 5] =  153.00; daa[14*20+ 6] =   51.00; daa[14*20+ 7] =   34.00; daa[14*20+ 8] =   94.00;
+            daa[14*20+ 9] =   12.00; daa[14*20+10] =   32.00; daa[14*20+11] =   33.00; daa[14*20+12] =   17.00;
+            daa[14*20+13] =   11.00; daa[15*20+ 0] =  409.00; daa[15*20+ 1] =  154.00; daa[15*20+ 2] =  495.00;
+            daa[15*20+ 3] =   95.00; daa[15*20+ 4] =  161.00; daa[15*20+ 5] =   56.00; daa[15*20+ 6] =   79.00;
+            daa[15*20+ 7] =  234.00; daa[15*20+ 8] =   35.00; daa[15*20+ 9] =   24.00; daa[15*20+10] =   17.00;
+            daa[15*20+11] =   96.00; daa[15*20+12] =   62.00; daa[15*20+13] =   46.00; daa[15*20+14] =  245.00;
+            daa[16*20+ 0] =  371.00; daa[16*20+ 1] =   26.00; daa[16*20+ 2] =  229.00; daa[16*20+ 3] =   66.00;
+            daa[16*20+ 4] =   16.00; daa[16*20+ 5] =   53.00; daa[16*20+ 6] =   34.00; daa[16*20+ 7] =   30.00;
+            daa[16*20+ 8] =   22.00; daa[16*20+ 9] =  192.00; daa[16*20+10] =   33.00; daa[16*20+11] =  136.00;
+            daa[16*20+12] =  104.00; daa[16*20+13] =   13.00; daa[16*20+14] =   78.00; daa[16*20+15] =  550.00;
+            daa[17*20+ 0] =    0.00; daa[17*20+ 1] =  201.00; daa[17*20+ 2] =   23.00; daa[17*20+ 3] =    0.00;
+            daa[17*20+ 4] =    0.00; daa[17*20+ 5] =    0.00; daa[17*20+ 6] =    0.00; daa[17*20+ 7] =    0.00;
+            daa[17*20+ 8] =   27.00; daa[17*20+ 9] =    0.00; daa[17*20+10] =   46.00; daa[17*20+11] =    0.00;
+            daa[17*20+12] =    0.00; daa[17*20+13] =   76.00; daa[17*20+14] =    0.00; daa[17*20+15] =   75.00;
+            daa[17*20+16] =    0.00; daa[18*20+ 0] =   24.00; daa[18*20+ 1] =    8.00; daa[18*20+ 2] =   95.00;
+            daa[18*20+ 3] =    0.00; daa[18*20+ 4] =   96.00; daa[18*20+ 5] =    0.00; daa[18*20+ 6] =   22.00;
+            daa[18*20+ 7] =    0.00; daa[18*20+ 8] =  127.00; daa[18*20+ 9] =   37.00; daa[18*20+10] =   28.00;
+            daa[18*20+11] =   13.00; daa[18*20+12] =    0.00; daa[18*20+13] =  698.00; daa[18*20+14] =    0.00;
+            daa[18*20+15] =   34.00; daa[18*20+16] =   42.00; daa[18*20+17] =   61.00; daa[19*20+ 0] =  208.00;
+            daa[19*20+ 1] =   24.00; daa[19*20+ 2] =   15.00; daa[19*20+ 3] =   18.00; daa[19*20+ 4] =   49.00;
+            daa[19*20+ 5] =   35.00; daa[19*20+ 6] =   37.00; daa[19*20+ 7] =   54.00; daa[19*20+ 8] =   44.00;
+            daa[19*20+ 9] =  889.00; daa[19*20+10] =  175.00; daa[19*20+11] =   10.00; daa[19*20+12] =  258.00;
+            daa[19*20+13] =   12.00; daa[19*20+14] =   48.00; daa[19*20+15] =   30.00; daa[19*20+16] =  157.00;
+            daa[19*20+17] =    0.00; daa[19*20+18] =   28.00;               
+
+	    f[ 0] = 0.087127; f[ 1] = 0.040904; f[ 2] = 0.040432; f[ 3] = 0.046872;
+	    f[ 4] = 0.033474; f[ 5] = 0.038255; f[ 6] = 0.049530; f[ 7] = 0.088612;
+	    f[ 8] = 0.033618; f[ 9] = 0.036886; f[10] = 0.085357; f[11] = 0.080482;
+	    f[12] = 0.014753; f[13] = 0.039772; f[14] = 0.050680; f[15] = 0.069577;
+	    f[16] = 0.058542; f[17] = 0.010494; f[18] = 0.029916; f[19] = 0.064717;
+          }
+          break;
+        case PLL_DCMUT:
+          {     
+            daa[ 1*20+ 0] =   26.78280; daa[ 2*20+ 0] =   98.44740; daa[ 2*20+ 1] =   32.70590; daa[ 3*20+ 0] =  119.98050; 
+            daa[ 3*20+ 1] =    0.00000; daa[ 3*20+ 2] =  893.15150; daa[ 4*20+ 0] =   36.00160; daa[ 4*20+ 1] =   23.23740; 
+            daa[ 4*20+ 2] =    0.00000; daa[ 4*20+ 3] =    0.00000; daa[ 5*20+ 0] =   88.77530; daa[ 5*20+ 1] =  243.99390; 
+            daa[ 5*20+ 2] =  102.85090; daa[ 5*20+ 3] =  134.85510; daa[ 5*20+ 4] =    0.00000; daa[ 6*20+ 0] =  196.11670; 
+            daa[ 6*20+ 1] =    0.00000; daa[ 6*20+ 2] =  149.34090; daa[ 6*20+ 3] = 1138.86590; daa[ 6*20+ 4] =    0.00000; 
+            daa[ 6*20+ 5] =  708.60220; daa[ 7*20+ 0] =  238.61110; daa[ 7*20+ 1] =    8.77910; daa[ 7*20+ 2] =  138.53520; 
+            daa[ 7*20+ 3] =  124.09810; daa[ 7*20+ 4] =   10.72780; daa[ 7*20+ 5] =   28.15810; daa[ 7*20+ 6] =   81.19070; 
+            daa[ 8*20+ 0] =   22.81160; daa[ 8*20+ 1] =  238.31480; daa[ 8*20+ 2] =  529.00240; daa[ 8*20+ 3] =   86.82410; 
+            daa[ 8*20+ 4] =   28.27290; daa[ 8*20+ 5] =  601.16130; daa[ 8*20+ 6] =   43.94690; daa[ 8*20+ 7] =   10.68020; 
+            daa[ 9*20+ 0] =   65.34160; daa[ 9*20+ 1] =   63.26290; daa[ 9*20+ 2] =   76.80240; daa[ 9*20+ 3] =   23.92480; 
+            daa[ 9*20+ 4] =   43.80740; daa[ 9*20+ 5] =   18.03930; daa[ 9*20+ 6] =   60.95260; daa[ 9*20+ 7] =    0.00000; 
+            daa[ 9*20+ 8] =    7.69810; daa[10*20+ 0] =   40.64310; daa[10*20+ 1] =   15.49240; daa[10*20+ 2] =   34.11130; 
+            daa[10*20+ 3] =    0.00000; daa[10*20+ 4] =    0.00000; daa[10*20+ 5] =   73.07720; daa[10*20+ 6] =   11.28800; 
+            daa[10*20+ 7] =    7.15140; daa[10*20+ 8] =   44.35040; daa[10*20+ 9] =  255.66850; daa[11*20+ 0] =   25.86350; 
+            daa[11*20+ 1] =  461.01240; daa[11*20+ 2] =  314.83710; daa[11*20+ 3] =   71.69130; daa[11*20+ 4] =    0.00000; 
+            daa[11*20+ 5] =  151.90780; daa[11*20+ 6] =   83.00780; daa[11*20+ 7] =   26.76830; daa[11*20+ 8] =   27.04750; 
+            daa[11*20+ 9] =   46.08570; daa[11*20+10] =   18.06290; daa[12*20+ 0] =   71.78400; daa[12*20+ 1] =   89.63210; 
+            daa[12*20+ 2] =    0.00000; daa[12*20+ 3] =    0.00000; daa[12*20+ 4] =    0.00000; daa[12*20+ 5] =  112.74990; 
+            daa[12*20+ 6] =   30.48030; daa[12*20+ 7] =   17.03720; daa[12*20+ 8] =    0.00000; daa[12*20+ 9] =  333.27320; 
+            daa[12*20+10] =  523.01150; daa[12*20+11] =  241.17390; daa[13*20+ 0] =   18.36410; daa[13*20+ 1] =   13.69060; 
+            daa[13*20+ 2] =   13.85030; daa[13*20+ 3] =    0.00000; daa[13*20+ 4] =    0.00000; daa[13*20+ 5] =    0.00000; 
+            daa[13*20+ 6] =    0.00000; daa[13*20+ 7] =   15.34780; daa[13*20+ 8] =   47.59270; daa[13*20+ 9] =  195.19510; 
+            daa[13*20+10] =  156.51600; daa[13*20+11] =    0.00000; daa[13*20+12] =   92.18600; daa[14*20+ 0] =  248.59200; 
+            daa[14*20+ 1] =  102.83130; daa[14*20+ 2] =   41.92440; daa[14*20+ 3] =   13.39400; daa[14*20+ 4] =   18.75500; 
+            daa[14*20+ 5] =  152.61880; daa[14*20+ 6] =   50.70030; daa[14*20+ 7] =   34.71530; daa[14*20+ 8] =   93.37090; 
+            daa[14*20+ 9] =   11.91520; daa[14*20+10] =   31.62580; daa[14*20+11] =   33.54190; daa[14*20+12] =   17.02050; 
+            daa[14*20+13] =   11.05060; daa[15*20+ 0] =  405.18700; daa[15*20+ 1] =  153.15900; daa[15*20+ 2] =  488.58920; 
+            daa[15*20+ 3] =   95.60970; daa[15*20+ 4] =  159.83560; daa[15*20+ 5] =   56.18280; daa[15*20+ 6] =   79.39990; 
+            daa[15*20+ 7] =  232.22430; daa[15*20+ 8] =   35.36430; daa[15*20+ 9] =   24.79550; daa[15*20+10] =   17.14320; 
+            daa[15*20+11] =   95.45570; daa[15*20+12] =   61.99510; daa[15*20+13] =   45.99010; daa[15*20+14] =  242.72020; 
+            daa[16*20+ 0] =  368.03650; daa[16*20+ 1] =   26.57450; daa[16*20+ 2] =  227.16970; daa[16*20+ 3] =   66.09300; 
+            daa[16*20+ 4] =   16.23660; daa[16*20+ 5] =   52.56510; daa[16*20+ 6] =   34.01560; daa[16*20+ 7] =   30.66620; 
+            daa[16*20+ 8] =   22.63330; daa[16*20+ 9] =  190.07390; daa[16*20+10] =   33.10900; daa[16*20+11] =  135.05990; 
+            daa[16*20+12] =  103.15340; daa[16*20+13] =   13.66550; daa[16*20+14] =   78.28570; daa[16*20+15] =  543.66740; 
+            daa[17*20+ 0] =    0.00000; daa[17*20+ 1] =  200.13750; daa[17*20+ 2] =   22.49680; daa[17*20+ 3] =    0.00000; 
+            daa[17*20+ 4] =    0.00000; daa[17*20+ 5] =    0.00000; daa[17*20+ 6] =    0.00000; daa[17*20+ 7] =    0.00000; 
+            daa[17*20+ 8] =   27.05640; daa[17*20+ 9] =    0.00000; daa[17*20+10] =   46.17760; daa[17*20+11] =    0.00000; 
+            daa[17*20+12] =    0.00000; daa[17*20+13] =   76.23540; daa[17*20+14] =    0.00000; daa[17*20+15] =   74.08190; 
+            daa[17*20+16] =    0.00000; daa[18*20+ 0] =   24.41390; daa[18*20+ 1] =    7.80120; daa[18*20+ 2] =   94.69400; 
+            daa[18*20+ 3] =    0.00000; daa[18*20+ 4] =   95.31640; daa[18*20+ 5] =    0.00000; daa[18*20+ 6] =   21.47170; 
+            daa[18*20+ 7] =    0.00000; daa[18*20+ 8] =  126.54000; daa[18*20+ 9] =   37.48340; daa[18*20+10] =   28.65720; 
+            daa[18*20+11] =   13.21420; daa[18*20+12] =    0.00000; daa[18*20+13] =  695.26290; daa[18*20+14] =    0.00000; 
+            daa[18*20+15] =   33.62890; daa[18*20+16] =   41.78390; daa[18*20+17] =   60.80700; daa[19*20+ 0] =  205.95640; 
+            daa[19*20+ 1] =   24.03680; daa[19*20+ 2] =   15.80670; daa[19*20+ 3] =   17.83160; daa[19*20+ 4] =   48.46780; 
+            daa[19*20+ 5] =   34.69830; daa[19*20+ 6] =   36.72500; daa[19*20+ 7] =   53.81650; daa[19*20+ 8] =   43.87150; 
+            daa[19*20+ 9] =  881.00380; daa[19*20+10] =  174.51560; daa[19*20+11] =   10.38500; daa[19*20+12] =  256.59550; 
+            daa[19*20+13] =   12.36060; daa[19*20+14] =   48.50260; daa[19*20+15] =   30.38360; daa[19*20+16] =  156.19970; 
+            daa[19*20+17] =    0.00000; daa[19*20+18] =   27.93790;                
+
+	    f[ 0] = 0.087127; f[ 1] = 0.040904; f[ 2] = 0.040432; f[ 3] = 0.046872;
+	    f[ 4] = 0.033474; f[ 5] = 0.038255; f[ 6] = 0.049530; f[ 7] = 0.088612;
+	    f[ 8] = 0.033619; f[ 9] = 0.036886; f[10] = 0.085357; f[11] = 0.080481;
+	    f[12] = 0.014753; f[13] = 0.039772; f[14] = 0.050680; f[15] = 0.069577;
+	    f[16] = 0.058542; f[17] = 0.010494; f[18] = 0.029916; f[19] = 0.064717;
+
+	    f[ 0] = 0.087127; f[ 1] = 0.040904; f[ 2] = 0.040432; f[ 3] = 0.046872;
+	    f[ 4] = 0.033474; f[ 5] = 0.038255; f[ 6] = 0.049530; f[ 7] = 0.088612;
+	    f[ 8] = 0.033619; f[ 9] = 0.036886; f[10] = 0.085357; f[11] = 0.080481;
+	    f[12] = 0.014753; f[13] = 0.039772; f[14] = 0.050680; f[15] = 0.069577;
+	    f[16] = 0.058542; f[17] = 0.010494; f[18] = 0.029916; f[19] = 0.064717;
+
+          }
+          break;
+        case PLL_JTT:
+          {
+            daa[ 1*20+ 0] =   58.00; daa[ 2*20+ 0] =   54.00; daa[ 2*20+ 1] =   45.00; daa[ 3*20+ 0] =   81.00;
+            daa[ 3*20+ 1] =   16.00; daa[ 3*20+ 2] =  528.00; daa[ 4*20+ 0] =   56.00; daa[ 4*20+ 1] =  113.00;
+            daa[ 4*20+ 2] =   34.00; daa[ 4*20+ 3] =   10.00; daa[ 5*20+ 0] =   57.00; daa[ 5*20+ 1] =  310.00;
+            daa[ 5*20+ 2] =   86.00; daa[ 5*20+ 3] =   49.00; daa[ 5*20+ 4] =    9.00; daa[ 6*20+ 0] =  105.00;
+            daa[ 6*20+ 1] =   29.00; daa[ 6*20+ 2] =   58.00; daa[ 6*20+ 3] =  767.00; daa[ 6*20+ 4] =    5.00;
+            daa[ 6*20+ 5] =  323.00; daa[ 7*20+ 0] =  179.00; daa[ 7*20+ 1] =  137.00; daa[ 7*20+ 2] =   81.00;
+            daa[ 7*20+ 3] =  130.00; daa[ 7*20+ 4] =   59.00; daa[ 7*20+ 5] =   26.00; daa[ 7*20+ 6] =  119.00;
+            daa[ 8*20+ 0] =   27.00; daa[ 8*20+ 1] =  328.00; daa[ 8*20+ 2] =  391.00; daa[ 8*20+ 3] =  112.00;
+            daa[ 8*20+ 4] =   69.00; daa[ 8*20+ 5] =  597.00; daa[ 8*20+ 6] =   26.00; daa[ 8*20+ 7] =   23.00;
+            daa[ 9*20+ 0] =   36.00; daa[ 9*20+ 1] =   22.00; daa[ 9*20+ 2] =   47.00; daa[ 9*20+ 3] =   11.00;
+            daa[ 9*20+ 4] =   17.00; daa[ 9*20+ 5] =    9.00; daa[ 9*20+ 6] =   12.00; daa[ 9*20+ 7] =    6.00;
+            daa[ 9*20+ 8] =   16.00; daa[10*20+ 0] =   30.00; daa[10*20+ 1] =   38.00; daa[10*20+ 2] =   12.00;
+            daa[10*20+ 3] =    7.00; daa[10*20+ 4] =   23.00; daa[10*20+ 5] =   72.00; daa[10*20+ 6] =    9.00;
+            daa[10*20+ 7] =    6.00; daa[10*20+ 8] =   56.00; daa[10*20+ 9] =  229.00; daa[11*20+ 0] =   35.00;
+            daa[11*20+ 1] =  646.00; daa[11*20+ 2] =  263.00; daa[11*20+ 3] =   26.00; daa[11*20+ 4] =    7.00;
+            daa[11*20+ 5] =  292.00; daa[11*20+ 6] =  181.00; daa[11*20+ 7] =   27.00; daa[11*20+ 8] =   45.00;
+            daa[11*20+ 9] =   21.00; daa[11*20+10] =   14.00; daa[12*20+ 0] =   54.00; daa[12*20+ 1] =   44.00;
+            daa[12*20+ 2] =   30.00; daa[12*20+ 3] =   15.00; daa[12*20+ 4] =   31.00; daa[12*20+ 5] =   43.00;
+            daa[12*20+ 6] =   18.00; daa[12*20+ 7] =   14.00; daa[12*20+ 8] =   33.00; daa[12*20+ 9] =  479.00;
+            daa[12*20+10] =  388.00; daa[12*20+11] =   65.00; daa[13*20+ 0] =   15.00; daa[13*20+ 1] =    5.00;
+            daa[13*20+ 2] =   10.00; daa[13*20+ 3] =    4.00; daa[13*20+ 4] =   78.00; daa[13*20+ 5] =    4.00;
+            daa[13*20+ 6] =    5.00; daa[13*20+ 7] =    5.00; daa[13*20+ 8] =   40.00; daa[13*20+ 9] =   89.00;
+            daa[13*20+10] =  248.00; daa[13*20+11] =    4.00; daa[13*20+12] =   43.00; daa[14*20+ 0] =  194.00;
+            daa[14*20+ 1] =   74.00; daa[14*20+ 2] =   15.00; daa[14*20+ 3] =   15.00; daa[14*20+ 4] =   14.00;
+            daa[14*20+ 5] =  164.00; daa[14*20+ 6] =   18.00; daa[14*20+ 7] =   24.00; daa[14*20+ 8] =  115.00;
+            daa[14*20+ 9] =   10.00; daa[14*20+10] =  102.00; daa[14*20+11] =   21.00; daa[14*20+12] =   16.00;
+            daa[14*20+13] =   17.00; daa[15*20+ 0] =  378.00; daa[15*20+ 1] =  101.00; daa[15*20+ 2] =  503.00;
+            daa[15*20+ 3] =   59.00; daa[15*20+ 4] =  223.00; daa[15*20+ 5] =   53.00; daa[15*20+ 6] =   30.00;
+            daa[15*20+ 7] =  201.00; daa[15*20+ 8] =   73.00; daa[15*20+ 9] =   40.00; daa[15*20+10] =   59.00;
+            daa[15*20+11] =   47.00; daa[15*20+12] =   29.00; daa[15*20+13] =   92.00; daa[15*20+14] =  285.00;
+            daa[16*20+ 0] =  475.00; daa[16*20+ 1] =   64.00; daa[16*20+ 2] =  232.00; daa[16*20+ 3] =   38.00;
+            daa[16*20+ 4] =   42.00; daa[16*20+ 5] =   51.00; daa[16*20+ 6] =   32.00; daa[16*20+ 7] =   33.00;
+            daa[16*20+ 8] =   46.00; daa[16*20+ 9] =  245.00; daa[16*20+10] =   25.00; daa[16*20+11] =  103.00;
+            daa[16*20+12] =  226.00; daa[16*20+13] =   12.00; daa[16*20+14] =  118.00; daa[16*20+15] =  477.00;
+            daa[17*20+ 0] =    9.00; daa[17*20+ 1] =  126.00; daa[17*20+ 2] =    8.00; daa[17*20+ 3] =    4.00;
+            daa[17*20+ 4] =  115.00; daa[17*20+ 5] =   18.00; daa[17*20+ 6] =   10.00; daa[17*20+ 7] =   55.00;
+            daa[17*20+ 8] =    8.00; daa[17*20+ 9] =    9.00; daa[17*20+10] =   52.00; daa[17*20+11] =   10.00;
+            daa[17*20+12] =   24.00; daa[17*20+13] =   53.00; daa[17*20+14] =    6.00; daa[17*20+15] =   35.00;
+            daa[17*20+16] =   12.00; daa[18*20+ 0] =   11.00; daa[18*20+ 1] =   20.00; daa[18*20+ 2] =   70.00;
+            daa[18*20+ 3] =   46.00; daa[18*20+ 4] =  209.00; daa[18*20+ 5] =   24.00; daa[18*20+ 6] =    7.00;
+            daa[18*20+ 7] =    8.00; daa[18*20+ 8] =  573.00; daa[18*20+ 9] =   32.00; daa[18*20+10] =   24.00;
+            daa[18*20+11] =    8.00; daa[18*20+12] =   18.00; daa[18*20+13] =  536.00; daa[18*20+14] =   10.00;
+            daa[18*20+15] =   63.00; daa[18*20+16] =   21.00; daa[18*20+17] =   71.00; daa[19*20+ 0] =  298.00;
+            daa[19*20+ 1] =   17.00; daa[19*20+ 2] =   16.00; daa[19*20+ 3] =   31.00; daa[19*20+ 4] =   62.00;
+            daa[19*20+ 5] =   20.00; daa[19*20+ 6] =   45.00; daa[19*20+ 7] =   47.00; daa[19*20+ 8] =   11.00;
+            daa[19*20+ 9] =  961.00; daa[19*20+10] =  180.00; daa[19*20+11] =   14.00; daa[19*20+12] =  323.00;
+            daa[19*20+13] =   62.00; daa[19*20+14] =   23.00; daa[19*20+15] =   38.00; daa[19*20+16] =  112.00;
+            daa[19*20+17] =   25.00; daa[19*20+18] =   16.00;
+                    
+	    f[ 0] = 0.076748; f[ 1] = 0.051691; f[ 2] = 0.042645; f[ 3] = 0.051544;
+	    f[ 4] = 0.019803; f[ 5] = 0.040752; f[ 6] = 0.061830; f[ 7] = 0.073152;
+	    f[ 8] = 0.022944; f[ 9] = 0.053761; f[10] = 0.091904; f[11] = 0.058676;
+	    f[12] = 0.023826; f[13] = 0.040126; f[14] = 0.050901; f[15] = 0.068765;
+	    f[16] = 0.058565; f[17] = 0.014261; f[18] = 0.032102; f[19] = 0.066004;
+          }
+          break;
+        case  PLL_MTREV:
+          {
+            daa[ 1*20+ 0] =   23.18; daa[ 2*20+ 0] =   26.95; daa[ 2*20+ 1] =   13.24; daa[ 3*20+ 0] =   17.67;
+            daa[ 3*20+ 1] =    1.90; daa[ 3*20+ 2] =  794.38; daa[ 4*20+ 0] =   59.93; daa[ 4*20+ 1] =  103.33;
+            daa[ 4*20+ 2] =   58.94; daa[ 4*20+ 3] =    1.90; daa[ 5*20+ 0] =    1.90; daa[ 5*20+ 1] =  220.99;
+            daa[ 5*20+ 2] =  173.56; daa[ 5*20+ 3] =   55.28; daa[ 5*20+ 4] =   75.24; daa[ 6*20+ 0] =    9.77;
+            daa[ 6*20+ 1] =    1.90; daa[ 6*20+ 2] =   63.05; daa[ 6*20+ 3] =  583.55; daa[ 6*20+ 4] =    1.90;
+            daa[ 6*20+ 5] =  313.56; daa[ 7*20+ 0] =  120.71; daa[ 7*20+ 1] =   23.03; daa[ 7*20+ 2] =   53.30;
+            daa[ 7*20+ 3] =   56.77; daa[ 7*20+ 4] =   30.71; daa[ 7*20+ 5] =    6.75; daa[ 7*20+ 6] =   28.28;
+            daa[ 8*20+ 0] =   13.90; daa[ 8*20+ 1] =  165.23; daa[ 8*20+ 2] =  496.13; daa[ 8*20+ 3] =  113.99;
+            daa[ 8*20+ 4] =  141.49; daa[ 8*20+ 5] =  582.40; daa[ 8*20+ 6] =   49.12; daa[ 8*20+ 7] =    1.90;
+            daa[ 9*20+ 0] =   96.49; daa[ 9*20+ 1] =    1.90; daa[ 9*20+ 2] =   27.10; daa[ 9*20+ 3] =    4.34;
+            daa[ 9*20+ 4] =   62.73; daa[ 9*20+ 5] =    8.34; daa[ 9*20+ 6] =    3.31; daa[ 9*20+ 7] =    5.98;
+            daa[ 9*20+ 8] =   12.26; daa[10*20+ 0] =   25.46; daa[10*20+ 1] =   15.58; daa[10*20+ 2] =   15.16;
+            daa[10*20+ 3] =    1.90; daa[10*20+ 4] =   25.65; daa[10*20+ 5] =   39.70; daa[10*20+ 6] =    1.90;
+            daa[10*20+ 7] =    2.41; daa[10*20+ 8] =   11.49; daa[10*20+ 9] =  329.09; daa[11*20+ 0] =    8.36;
+            daa[11*20+ 1] =  141.40; daa[11*20+ 2] =  608.70; daa[11*20+ 3] =    2.31; daa[11*20+ 4] =    1.90;
+            daa[11*20+ 5] =  465.58; daa[11*20+ 6] =  313.86; daa[11*20+ 7] =   22.73; daa[11*20+ 8] =  127.67;
+            daa[11*20+ 9] =   19.57; daa[11*20+10] =   14.88; daa[12*20+ 0] =  141.88; daa[12*20+ 1] =    1.90;
+            daa[12*20+ 2] =   65.41; daa[12*20+ 3] =    1.90; daa[12*20+ 4] =    6.18; daa[12*20+ 5] =   47.37;
+            daa[12*20+ 6] =    1.90; daa[12*20+ 7] =    1.90; daa[12*20+ 8] =   11.97; daa[12*20+ 9] =  517.98;
+            daa[12*20+10] =  537.53; daa[12*20+11] =   91.37; daa[13*20+ 0] =    6.37; daa[13*20+ 1] =    4.69;
+            daa[13*20+ 2] =   15.20; daa[13*20+ 3] =    4.98; daa[13*20+ 4] =   70.80; daa[13*20+ 5] =   19.11;
+            daa[13*20+ 6] =    2.67; daa[13*20+ 7] =    1.90; daa[13*20+ 8] =   48.16; daa[13*20+ 9] =   84.67;
+            daa[13*20+10] =  216.06; daa[13*20+11] =    6.44; daa[13*20+12] =   90.82; daa[14*20+ 0] =   54.31;
+            daa[14*20+ 1] =   23.64; daa[14*20+ 2] =   73.31; daa[14*20+ 3] =   13.43; daa[14*20+ 4] =   31.26;
+            daa[14*20+ 5] =  137.29; daa[14*20+ 6] =   12.83; daa[14*20+ 7] =    1.90; daa[14*20+ 8] =   60.97;
+            daa[14*20+ 9] =   20.63; daa[14*20+10] =   40.10; daa[14*20+11] =   50.10; daa[14*20+12] =   18.84;
+            daa[14*20+13] =   17.31; daa[15*20+ 0] =  387.86; daa[15*20+ 1] =    6.04; daa[15*20+ 2] =  494.39;
+            daa[15*20+ 3] =   69.02; daa[15*20+ 4] =  277.05; daa[15*20+ 5] =   54.11; daa[15*20+ 6] =   54.71;
+            daa[15*20+ 7] =  125.93; daa[15*20+ 8] =   77.46; daa[15*20+ 9] =   47.70; daa[15*20+10] =   73.61;
+            daa[15*20+11] =  105.79; daa[15*20+12] =  111.16; daa[15*20+13] =   64.29; daa[15*20+14] =  169.90;
+            daa[16*20+ 0] =  480.72; daa[16*20+ 1] =    2.08; daa[16*20+ 2] =  238.46; daa[16*20+ 3] =   28.01;
+            daa[16*20+ 4] =  179.97; daa[16*20+ 5] =   94.93; daa[16*20+ 6] =   14.82; daa[16*20+ 7] =   11.17;
+            daa[16*20+ 8] =   44.78; daa[16*20+ 9] =  368.43; daa[16*20+10] =  126.40; daa[16*20+11] =  136.33;
+            daa[16*20+12] =  528.17; daa[16*20+13] =   33.85; daa[16*20+14] =  128.22; daa[16*20+15] =  597.21;
+            daa[17*20+ 0] =    1.90; daa[17*20+ 1] =   21.95; daa[17*20+ 2] =   10.68; daa[17*20+ 3] =   19.86;
+            daa[17*20+ 4] =   33.60; daa[17*20+ 5] =    1.90; daa[17*20+ 6] =    1.90; daa[17*20+ 7] =   10.92;
+            daa[17*20+ 8] =    7.08; daa[17*20+ 9] =    1.90; daa[17*20+10] =   32.44; daa[17*20+11] =   24.00;
+            daa[17*20+12] =   21.71; daa[17*20+13] =    7.84; daa[17*20+14] =    4.21; daa[17*20+15] =   38.58;
+            daa[17*20+16] =    9.99; daa[18*20+ 0] =    6.48; daa[18*20+ 1] =    1.90; daa[18*20+ 2] =  191.36;
+            daa[18*20+ 3] =   21.21; daa[18*20+ 4] =  254.77; daa[18*20+ 5] =   38.82; daa[18*20+ 6] =   13.12;
+            daa[18*20+ 7] =    3.21; daa[18*20+ 8] =  670.14; daa[18*20+ 9] =   25.01; daa[18*20+10] =   44.15;
+            daa[18*20+11] =   51.17; daa[18*20+12] =   39.96; daa[18*20+13] =  465.58; daa[18*20+14] =   16.21;
+            daa[18*20+15] =   64.92; daa[18*20+16] =   38.73; daa[18*20+17] =   26.25; daa[19*20+ 0] =  195.06;
+            daa[19*20+ 1] =    7.64; daa[19*20+ 2] =    1.90; daa[19*20+ 3] =    1.90; daa[19*20+ 4] =    1.90;
+            daa[19*20+ 5] =   19.00; daa[19*20+ 6] =   21.14; daa[19*20+ 7] =    2.53; daa[19*20+ 8] =    1.90;
+            daa[19*20+ 9] = 1222.94; daa[19*20+10] =   91.67; daa[19*20+11] =    1.90; daa[19*20+12] =  387.54;
+            daa[19*20+13] =    6.35; daa[19*20+14] =    8.23; daa[19*20+15] =    1.90; daa[19*20+16] =  204.54;
+            daa[19*20+17] =    5.37; daa[19*20+18] =    1.90;
+            
+            
+            f[ 0] = 0.072000; f[ 1] = 0.019000; f[ 2] = 0.039000; f[ 3] = 0.019000;
+            f[ 4] = 0.006000; f[ 5] = 0.025000; f[ 6] = 0.024000; f[ 7] = 0.056000;
+            f[ 8] = 0.028000; f[ 9] = 0.088000; f[10] = 0.169000; f[11] = 0.023000;
+            f[12] = 0.054000; f[13] = 0.061000; f[14] = 0.054000; f[15] = 0.072000;
+            f[16] = 0.086000; f[17] = 0.029000; f[18] = 0.033000; f[19] = 0.043000;
+          }
+          break;
+        case PLL_WAG:
+          {
+            daa[ 1*20+ 0] =  55.15710; daa[ 2*20+ 0] =  50.98480; daa[ 2*20+ 1] =  63.53460; 
+            daa[ 3*20+ 0] =  73.89980; daa[ 3*20+ 1] =  14.73040; daa[ 3*20+ 2] = 542.94200; 
+            daa[ 4*20+ 0] = 102.70400; daa[ 4*20+ 1] =  52.81910; daa[ 4*20+ 2] =  26.52560; 
+            daa[ 4*20+ 3] =   3.02949; daa[ 5*20+ 0] =  90.85980; daa[ 5*20+ 1] = 303.55000; 
+            daa[ 5*20+ 2] = 154.36400; daa[ 5*20+ 3] =  61.67830; daa[ 5*20+ 4] =   9.88179; 
+            daa[ 6*20+ 0] = 158.28500; daa[ 6*20+ 1] =  43.91570; daa[ 6*20+ 2] =  94.71980; 
+            daa[ 6*20+ 3] = 617.41600; daa[ 6*20+ 4] =   2.13520; daa[ 6*20+ 5] = 546.94700; 
+            daa[ 7*20+ 0] = 141.67200; daa[ 7*20+ 1] =  58.46650; daa[ 7*20+ 2] = 112.55600; 
+            daa[ 7*20+ 3] =  86.55840; daa[ 7*20+ 4] =  30.66740; daa[ 7*20+ 5] =  33.00520; 
+            daa[ 7*20+ 6] =  56.77170; daa[ 8*20+ 0] =  31.69540; daa[ 8*20+ 1] = 213.71500; 
+            daa[ 8*20+ 2] = 395.62900; daa[ 8*20+ 3] =  93.06760; daa[ 8*20+ 4] =  24.89720; 
+            daa[ 8*20+ 5] = 429.41100; daa[ 8*20+ 6] =  57.00250; daa[ 8*20+ 7] =  24.94100; 
+            daa[ 9*20+ 0] =  19.33350; daa[ 9*20+ 1] =  18.69790; daa[ 9*20+ 2] =  55.42360; 
+            daa[ 9*20+ 3] =   3.94370; daa[ 9*20+ 4] =  17.01350; daa[ 9*20+ 5] =  11.39170; 
+            daa[ 9*20+ 6] =  12.73950; daa[ 9*20+ 7] =   3.04501; daa[ 9*20+ 8] =  13.81900; 
+            daa[10*20+ 0] =  39.79150; daa[10*20+ 1] =  49.76710; daa[10*20+ 2] =  13.15280; 
+            daa[10*20+ 3] =   8.48047; daa[10*20+ 4] =  38.42870; daa[10*20+ 5] =  86.94890; 
+            daa[10*20+ 6] =  15.42630; daa[10*20+ 7] =   6.13037; daa[10*20+ 8] =  49.94620; 
+            daa[10*20+ 9] = 317.09700; daa[11*20+ 0] =  90.62650; daa[11*20+ 1] = 535.14200; 
+            daa[11*20+ 2] = 301.20100; daa[11*20+ 3] =  47.98550; daa[11*20+ 4] =   7.40339; 
+            daa[11*20+ 5] = 389.49000; daa[11*20+ 6] = 258.44300; daa[11*20+ 7] =  37.35580; 
+            daa[11*20+ 8] =  89.04320; daa[11*20+ 9] =  32.38320; daa[11*20+10] =  25.75550; 
+            daa[12*20+ 0] =  89.34960; daa[12*20+ 1] =  68.31620; daa[12*20+ 2] =  19.82210; 
+            daa[12*20+ 3] =  10.37540; daa[12*20+ 4] =  39.04820; daa[12*20+ 5] = 154.52600; 
+            daa[12*20+ 6] =  31.51240; daa[12*20+ 7] =  17.41000; daa[12*20+ 8] =  40.41410; 
+            daa[12*20+ 9] = 425.74600; daa[12*20+10] = 485.40200; daa[12*20+11] =  93.42760; 
+            daa[13*20+ 0] =  21.04940; daa[13*20+ 1] =  10.27110; daa[13*20+ 2] =   9.61621; 
+            daa[13*20+ 3] =   4.67304; daa[13*20+ 4] =  39.80200; daa[13*20+ 5] =   9.99208; 
+            daa[13*20+ 6] =   8.11339; daa[13*20+ 7] =   4.99310; daa[13*20+ 8] =  67.93710; 
+            daa[13*20+ 9] = 105.94700; daa[13*20+10] = 211.51700; daa[13*20+11] =   8.88360; 
+            daa[13*20+12] = 119.06300; daa[14*20+ 0] = 143.85500; daa[14*20+ 1] =  67.94890; 
+            daa[14*20+ 2] =  19.50810; daa[14*20+ 3] =  42.39840; daa[14*20+ 4] =  10.94040; 
+            daa[14*20+ 5] =  93.33720; daa[14*20+ 6] =  68.23550; daa[14*20+ 7] =  24.35700; 
+            daa[14*20+ 8] =  69.61980; daa[14*20+ 9] =   9.99288; daa[14*20+10] =  41.58440; 
+            daa[14*20+11] =  55.68960; daa[14*20+12] =  17.13290; daa[14*20+13] =  16.14440; 
+            daa[15*20+ 0] = 337.07900; daa[15*20+ 1] = 122.41900; daa[15*20+ 2] = 397.42300; 
+            daa[15*20+ 3] = 107.17600; daa[15*20+ 4] = 140.76600; daa[15*20+ 5] = 102.88700; 
+            daa[15*20+ 6] =  70.49390; daa[15*20+ 7] = 134.18200; daa[15*20+ 8] =  74.01690; 
+            daa[15*20+ 9] =  31.94400; daa[15*20+10] =  34.47390; daa[15*20+11] =  96.71300; 
+            daa[15*20+12] =  49.39050; daa[15*20+13] =  54.59310; daa[15*20+14] = 161.32800; 
+            daa[16*20+ 0] = 212.11100; daa[16*20+ 1] =  55.44130; daa[16*20+ 2] = 203.00600; 
+            daa[16*20+ 3] =  37.48660; daa[16*20+ 4] =  51.29840; daa[16*20+ 5] =  85.79280; 
+            daa[16*20+ 6] =  82.27650; daa[16*20+ 7] =  22.58330; daa[16*20+ 8] =  47.33070; 
+            daa[16*20+ 9] = 145.81600; daa[16*20+10] =  32.66220; daa[16*20+11] = 138.69800; 
+            daa[16*20+12] = 151.61200; daa[16*20+13] =  17.19030; daa[16*20+14] =  79.53840; 
+            daa[16*20+15] = 437.80200; daa[17*20+ 0] =  11.31330; daa[17*20+ 1] = 116.39200; 
+            daa[17*20+ 2] =   7.19167; daa[17*20+ 3] =  12.97670; daa[17*20+ 4] =  71.70700; 
+            daa[17*20+ 5] =  21.57370; daa[17*20+ 6] =  15.65570; daa[17*20+ 7] =  33.69830; 
+            daa[17*20+ 8] =  26.25690; daa[17*20+ 9] =  21.24830; daa[17*20+10] =  66.53090; 
+            daa[17*20+11] =  13.75050; daa[17*20+12] =  51.57060; daa[17*20+13] = 152.96400; 
+            daa[17*20+14] =  13.94050; daa[17*20+15] =  52.37420; daa[17*20+16] =  11.08640; 
+            daa[18*20+ 0] =  24.07350; daa[18*20+ 1] =  38.15330; daa[18*20+ 2] = 108.60000; 
+            daa[18*20+ 3] =  32.57110; daa[18*20+ 4] =  54.38330; daa[18*20+ 5] =  22.77100; 
+            daa[18*20+ 6] =  19.63030; daa[18*20+ 7] =  10.36040; daa[18*20+ 8] = 387.34400; 
+            daa[18*20+ 9] =  42.01700; daa[18*20+10] =  39.86180; daa[18*20+11] =  13.32640; 
+            daa[18*20+12] =  42.84370; daa[18*20+13] = 645.42800; daa[18*20+14] =  21.60460; 
+            daa[18*20+15] =  78.69930; daa[18*20+16] =  29.11480; daa[18*20+17] = 248.53900; 
+            daa[19*20+ 0] = 200.60100; daa[19*20+ 1] =  25.18490; daa[19*20+ 2] =  19.62460; 
+            daa[19*20+ 3] =  15.23350; daa[19*20+ 4] = 100.21400; daa[19*20+ 5] =  30.12810; 
+            daa[19*20+ 6] =  58.87310; daa[19*20+ 7] =  18.72470; daa[19*20+ 8] =  11.83580; 
+            daa[19*20+ 9] = 782.13000; daa[19*20+10] = 180.03400; daa[19*20+11] =  30.54340; 
+            daa[19*20+12] = 205.84500; daa[19*20+13] =  64.98920; daa[19*20+14] =  31.48870; 
+            daa[19*20+15] =  23.27390; daa[19*20+16] = 138.82300; daa[19*20+17] =  36.53690; 
+            daa[19*20+18] =  31.47300; 
+                   
+	    f[0]  = 0.0866279; f[1]  = 0.043972;  f[2]  = 0.0390894; f[3]  = 0.0570451;
+	    f[4]  = 0.0193078; f[5]  = 0.0367281; f[6]  = 0.0580589; f[7]  = 0.0832518;
+	    f[8]  = 0.0244313; f[9]  = 0.048466;  f[10] = 0.086209;  f[11] = 0.0620286;
+	    f[12] = 0.0195027; f[13] = 0.0384319; f[14] = 0.0457631; f[15] = 0.0695179;
+	    f[16] = 0.0610127; f[17] = 0.0143859; f[18] = 0.0352742; f[19] = 0.0708957;   
+          }
+          break;
+        case PLL_RTREV:
+          {
+            daa[1*20+0]= 34;         daa[2*20+0]= 51;         daa[2*20+1]= 35;         daa[3*20+0]= 10;         
+            daa[3*20+1]= 30;         daa[3*20+2]= 384;        daa[4*20+0]= 439;        daa[4*20+1]= 92;         
+            daa[4*20+2]= 128;        daa[4*20+3]= 1;          daa[5*20+0]= 32;         daa[5*20+1]= 221;        
+            daa[5*20+2]= 236;        daa[5*20+3]= 78;         daa[5*20+4]= 70;         daa[6*20+0]= 81;         
+            daa[6*20+1]= 10;         daa[6*20+2]= 79;         daa[6*20+3]= 542;        daa[6*20+4]= 1;          
+            daa[6*20+5]= 372;        daa[7*20+0]= 135;        daa[7*20+1]= 41;         daa[7*20+2]= 94;         
+            daa[7*20+3]= 61;         daa[7*20+4]= 48;         daa[7*20+5]= 18;         daa[7*20+6]= 70;         
+            daa[8*20+0]= 30;         daa[8*20+1]= 90;         daa[8*20+2]= 320;        daa[8*20+3]= 91;         
+            daa[8*20+4]= 124;        daa[8*20+5]= 387;        daa[8*20+6]= 34;         daa[8*20+7]= 68;         
+            daa[9*20+0]= 1;          daa[9*20+1]= 24;         daa[9*20+2]= 35;         daa[9*20+3]= 1;          
+            daa[9*20+4]= 104;        daa[9*20+5]= 33;         daa[9*20+6]= 1;          daa[9*20+7]= 1;          
+            daa[9*20+8]= 34;         daa[10*20+0]= 45;        daa[10*20+1]= 18;        daa[10*20+2]= 15;        
+            daa[10*20+3]= 5;         daa[10*20+4]= 110;       daa[10*20+5]= 54;        daa[10*20+6]= 21;        
+            daa[10*20+7]= 3;         daa[10*20+8]= 51;        daa[10*20+9]= 385;       daa[11*20+0]= 38;        
+            daa[11*20+1]= 593;       daa[11*20+2]= 123;       daa[11*20+3]= 20;        daa[11*20+4]= 16;        
+            daa[11*20+5]= 309;       daa[11*20+6]= 141;       daa[11*20+7]= 30;        daa[11*20+8]= 76;        
+            daa[11*20+9]= 34;        daa[11*20+10]= 23;       daa[12*20+0]= 235;       daa[12*20+1]= 57;        
+            daa[12*20+2]= 1;         daa[12*20+3]= 1;         daa[12*20+4]= 156;       daa[12*20+5]= 158;       
+            daa[12*20+6]= 1;         daa[12*20+7]= 37;        daa[12*20+8]= 116;       daa[12*20+9]= 375;       
+            daa[12*20+10]= 581;      daa[12*20+11]= 134;      daa[13*20+0]= 1;         daa[13*20+1]= 7;         
+            daa[13*20+2]= 49;        daa[13*20+3]= 1;         daa[13*20+4]= 70;        daa[13*20+5]= 1;         
+            daa[13*20+6]= 1;         daa[13*20+7]= 7;         daa[13*20+8]= 141;       daa[13*20+9]= 64;        
+            daa[13*20+10]= 179;      daa[13*20+11]= 14;       daa[13*20+12]= 247;      daa[14*20+0]= 97;        
+            daa[14*20+1]= 24;        daa[14*20+2]= 33;        daa[14*20+3]= 55;        daa[14*20+4]= 1;         
+            daa[14*20+5]= 68;        daa[14*20+6]= 52;        daa[14*20+7]= 17;        daa[14*20+8]= 44;        
+            daa[14*20+9]= 10;        daa[14*20+10]= 22;       daa[14*20+11]= 43;       daa[14*20+12]= 1;        
+            daa[14*20+13]= 11;       daa[15*20+0]= 460;       daa[15*20+1]= 102;       daa[15*20+2]= 294;       
+            daa[15*20+3]= 136;       daa[15*20+4]= 75;        daa[15*20+5]= 225;       daa[15*20+6]= 95;        
+            daa[15*20+7]= 152;       daa[15*20+8]= 183;       daa[15*20+9]= 4;         daa[15*20+10]= 24;       
+            daa[15*20+11]= 77;       daa[15*20+12]= 1;        daa[15*20+13]= 20;       daa[15*20+14]= 134;      
+            daa[16*20+0]= 258;       daa[16*20+1]= 64;        daa[16*20+2]= 148;       daa[16*20+3]= 55;        
+            daa[16*20+4]= 117;       daa[16*20+5]= 146;       daa[16*20+6]= 82;        daa[16*20+7]= 7;         
+            daa[16*20+8]= 49;        daa[16*20+9]= 72;        daa[16*20+10]= 25;       daa[16*20+11]= 110;      
+            daa[16*20+12]= 131;      daa[16*20+13]= 69;       daa[16*20+14]= 62;       daa[16*20+15]= 671;      
+            daa[17*20+0]= 5;         daa[17*20+1]= 13;        daa[17*20+2]= 16;        daa[17*20+3]= 1;         
+            daa[17*20+4]= 55;        daa[17*20+5]= 10;        daa[17*20+6]= 17;        daa[17*20+7]= 23;        
+            daa[17*20+8]= 48;        daa[17*20+9]= 39;        daa[17*20+10]= 47;       daa[17*20+11]= 6;        
+            daa[17*20+12]= 111;      daa[17*20+13]= 182;      daa[17*20+14]= 9;        daa[17*20+15]= 14;       
+            daa[17*20+16]= 1;        daa[18*20+0]= 55;        daa[18*20+1]= 47;        daa[18*20+2]= 28;        
+            daa[18*20+3]= 1;         daa[18*20+4]= 131;       daa[18*20+5]= 45;        daa[18*20+6]= 1;         
+            daa[18*20+7]= 21;        daa[18*20+8]= 307;       daa[18*20+9]= 26;        daa[18*20+10]= 64;       
+            daa[18*20+11]= 1;        daa[18*20+12]= 74;       daa[18*20+13]= 1017;     daa[18*20+14]= 14;       
+            daa[18*20+15]= 31;       daa[18*20+16]= 34;       daa[18*20+17]= 176;      daa[19*20+0]= 197;       
+            daa[19*20+1]= 29;        daa[19*20+2]= 21;        daa[19*20+3]= 6;         daa[19*20+4]= 295;       
+            daa[19*20+5]= 36;        daa[19*20+6]= 35;        daa[19*20+7]= 3;         daa[19*20+8]= 1;         
+            daa[19*20+9]= 1048;      daa[19*20+10]= 112;      daa[19*20+11]= 19;       daa[19*20+12]= 236;      
+            daa[19*20+13]= 92;       daa[19*20+14]= 25;       daa[19*20+15]= 39;       daa[19*20+16]= 196;      
+            daa[19*20+17]= 26;       daa[19*20+18]= 59;       
+            
+            f[0]= 0.0646;           f[1]= 0.0453;           f[2]= 0.0376;           f[3]= 0.0422;           
+            f[4]= 0.0114;           f[5]= 0.0606;           f[6]= 0.0607;           f[7]= 0.0639;           
+            f[8]= 0.0273;           f[9]= 0.0679;           f[10]= 0.1018;          f[11]= 0.0751;          
+            f[12]= 0.015;           f[13]= 0.0287;          f[14]= 0.0681;          f[15]= 0.0488;          
+            f[16]= 0.0622;          f[17]= 0.0251;          f[18]= 0.0318;          f[19]= 0.0619;                  
+          }
+          break;
+        case PLL_CPREV:
+          {
+            daa[1*20+0]= 105;        daa[2*20+0]= 227;        daa[2*20+1]= 357;        daa[3*20+0]= 175;        
+            daa[3*20+1]= 43;         daa[3*20+2]= 4435;       daa[4*20+0]= 669;        daa[4*20+1]= 823;        
+            daa[4*20+2]= 538;        daa[4*20+3]= 10;         daa[5*20+0]= 157;        daa[5*20+1]= 1745;       
+            daa[5*20+2]= 768;        daa[5*20+3]= 400;        daa[5*20+4]= 10;         daa[6*20+0]= 499;        
+            daa[6*20+1]= 152;        daa[6*20+2]= 1055;       daa[6*20+3]= 3691;       daa[6*20+4]= 10;         
+            daa[6*20+5]= 3122;       daa[7*20+0]= 665;        daa[7*20+1]= 243;        daa[7*20+2]= 653;        
+            daa[7*20+3]= 431;        daa[7*20+4]= 303;        daa[7*20+5]= 133;        daa[7*20+6]= 379;        
+            daa[8*20+0]= 66;         daa[8*20+1]= 715;        daa[8*20+2]= 1405;       daa[8*20+3]= 331;        
+            daa[8*20+4]= 441;        daa[8*20+5]= 1269;       daa[8*20+6]= 162;        daa[8*20+7]= 19;         
+            daa[9*20+0]= 145;        daa[9*20+1]= 136;        daa[9*20+2]= 168;        daa[9*20+3]= 10;         
+            daa[9*20+4]= 280;        daa[9*20+5]= 92;         daa[9*20+6]= 148;        daa[9*20+7]= 40;         
+            daa[9*20+8]= 29;         daa[10*20+0]= 197;       daa[10*20+1]= 203;       daa[10*20+2]= 113;       
+            daa[10*20+3]= 10;        daa[10*20+4]= 396;       daa[10*20+5]= 286;       daa[10*20+6]= 82;        
+            daa[10*20+7]= 20;        daa[10*20+8]= 66;        daa[10*20+9]= 1745;      daa[11*20+0]= 236;       
+            daa[11*20+1]= 4482;      daa[11*20+2]= 2430;      daa[11*20+3]= 412;       daa[11*20+4]= 48;        
+            daa[11*20+5]= 3313;      daa[11*20+6]= 2629;      daa[11*20+7]= 263;       daa[11*20+8]= 305;       
+            daa[11*20+9]= 345;       daa[11*20+10]= 218;      daa[12*20+0]= 185;       daa[12*20+1]= 125;       
+            daa[12*20+2]= 61;        daa[12*20+3]= 47;        daa[12*20+4]= 159;       daa[12*20+5]= 202;       
+            daa[12*20+6]= 113;       daa[12*20+7]= 21;        daa[12*20+8]= 10;        daa[12*20+9]= 1772;      
+            daa[12*20+10]= 1351;     daa[12*20+11]= 193;      daa[13*20+0]= 68;        daa[13*20+1]= 53;        
+            daa[13*20+2]= 97;        daa[13*20+3]= 22;        daa[13*20+4]= 726;       daa[13*20+5]= 10;        
+            daa[13*20+6]= 145;       daa[13*20+7]= 25;        daa[13*20+8]= 127;       daa[13*20+9]= 454;       
+            daa[13*20+10]= 1268;     daa[13*20+11]= 72;       daa[13*20+12]= 327;      daa[14*20+0]= 490;       
+            daa[14*20+1]= 87;        daa[14*20+2]= 173;       daa[14*20+3]= 170;       daa[14*20+4]= 285;       
+            daa[14*20+5]= 323;       daa[14*20+6]= 185;       daa[14*20+7]= 28;        daa[14*20+8]= 152;       
+            daa[14*20+9]= 117;       daa[14*20+10]= 219;      daa[14*20+11]= 302;      daa[14*20+12]= 100;      
+            daa[14*20+13]= 43;       daa[15*20+0]= 2440;      daa[15*20+1]= 385;       daa[15*20+2]= 2085;      
+            daa[15*20+3]= 590;       daa[15*20+4]= 2331;      daa[15*20+5]= 396;       daa[15*20+6]= 568;       
+            daa[15*20+7]= 691;       daa[15*20+8]= 303;       daa[15*20+9]= 216;       daa[15*20+10]= 516;      
+            daa[15*20+11]= 868;      daa[15*20+12]= 93;       daa[15*20+13]= 487;      daa[15*20+14]= 1202;     
+            daa[16*20+0]= 1340;      daa[16*20+1]= 314;       daa[16*20+2]= 1393;      daa[16*20+3]= 266;       
+            daa[16*20+4]= 576;       daa[16*20+5]= 241;       daa[16*20+6]= 369;       daa[16*20+7]= 92;        
+            daa[16*20+8]= 32;        daa[16*20+9]= 1040;      daa[16*20+10]= 156;      daa[16*20+11]= 918;      
+            daa[16*20+12]= 645;      daa[16*20+13]= 148;      daa[16*20+14]= 260;      daa[16*20+15]= 2151;     
+            daa[17*20+0]= 14;        daa[17*20+1]= 230;       daa[17*20+2]= 40;        daa[17*20+3]= 18;        
+            daa[17*20+4]= 435;       daa[17*20+5]= 53;        daa[17*20+6]= 63;        daa[17*20+7]= 82;        
+            daa[17*20+8]= 69;        daa[17*20+9]= 42;        daa[17*20+10]= 159;      daa[17*20+11]= 10;       
+            daa[17*20+12]= 86;       daa[17*20+13]= 468;      daa[17*20+14]= 49;       daa[17*20+15]= 73;       
+            daa[17*20+16]= 29;       daa[18*20+0]= 56;        daa[18*20+1]= 323;       daa[18*20+2]= 754;       
+            daa[18*20+3]= 281;       daa[18*20+4]= 1466;      daa[18*20+5]= 391;       daa[18*20+6]= 142;       
+            daa[18*20+7]= 10;        daa[18*20+8]= 1971;      daa[18*20+9]= 89;        daa[18*20+10]= 189;      
+            daa[18*20+11]= 247;      daa[18*20+12]= 215;      daa[18*20+13]= 2370;     daa[18*20+14]= 97;       
+            daa[18*20+15]= 522;      daa[18*20+16]= 71;       daa[18*20+17]= 346;      daa[19*20+0]= 968;       
+            daa[19*20+1]= 92;        daa[19*20+2]= 83;        daa[19*20+3]= 75;        daa[19*20+4]= 592;       
+            daa[19*20+5]= 54;        daa[19*20+6]= 200;       daa[19*20+7]= 91;        daa[19*20+8]= 25;        
+            daa[19*20+9]= 4797;      daa[19*20+10]= 865;      daa[19*20+11]= 249;      daa[19*20+12]= 475;      
+            daa[19*20+13]= 317;      daa[19*20+14]= 122;      daa[19*20+15]= 167;      daa[19*20+16]= 760;      
+            daa[19*20+17]= 10;       daa[19*20+18]= 119;      
+            
+            f[0]= 0.076;            f[1]= 0.062;            f[2]= 0.041;            f[3]= 0.037;            
+            f[4]= 0.009;            f[5]= 0.038;            f[6]= 0.049;            f[7]= 0.084;            
+            f[8]= 0.025;            f[9]= 0.081;            f[10]= 0.101;           f[11]= 0.05;            
+            f[12]= 0.022;           f[13]= 0.051;           f[14]= 0.043;           f[15]= 0.062;           
+            f[16]= 0.054;           f[17]= 0.018;           f[18]= 0.031;           f[19]= 0.066; 
+          }
+          break;
+        case PLL_VT:
+          {
+            /*
+              daa[1*20+0]= 0.233108;   daa[2*20+0]= 0.199097;   daa[2*20+1]= 0.210797;   daa[3*20+0]= 0.265145;   
+              daa[3*20+1]= 0.105191;   daa[3*20+2]= 0.883422;   daa[4*20+0]= 0.227333;   daa[4*20+1]= 0.031726;   
+              daa[4*20+2]= 0.027495;   daa[4*20+3]= 0.010313;   daa[5*20+0]= 0.310084;   daa[5*20+1]= 0.493763;   
+              daa[5*20+2]= 0.2757;     daa[5*20+3]= 0.205842;   daa[5*20+4]= 0.004315;   daa[6*20+0]= 0.567957;   
+              daa[6*20+1]= 0.25524;    daa[6*20+2]= 0.270417;   daa[6*20+3]= 1.599461;   daa[6*20+4]= 0.005321;   
+              daa[6*20+5]= 0.960976;   daa[7*20+0]= 0.876213;   daa[7*20+1]= 0.156945;   daa[7*20+2]= 0.362028;   
+              daa[7*20+3]= 0.311718;   daa[7*20+4]= 0.050876;   daa[7*20+5]= 0.12866;    daa[7*20+6]= 0.250447;   
+              daa[8*20+0]= 0.078692;   daa[8*20+1]= 0.213164;   daa[8*20+2]= 0.290006;   daa[8*20+3]= 0.134252;   
+              daa[8*20+4]= 0.016695;   daa[8*20+5]= 0.315521;   daa[8*20+6]= 0.104458;   daa[8*20+7]= 0.058131;   
+              daa[9*20+0]= 0.222972;   daa[9*20+1]= 0.08151;    daa[9*20+2]= 0.087225;   daa[9*20+3]= 0.01172;    
+              daa[9*20+4]= 0.046398;   daa[9*20+5]= 0.054602;   daa[9*20+6]= 0.046589;   daa[9*20+7]= 0.051089;   
+              daa[9*20+8]= 0.020039;   daa[10*20+0]= 0.42463;   daa[10*20+1]= 0.192364;  daa[10*20+2]= 0.069245;  
+              daa[10*20+3]= 0.060863;  daa[10*20+4]= 0.091709;  daa[10*20+5]= 0.24353;   daa[10*20+6]= 0.151924;  
+              daa[10*20+7]= 0.087056;  daa[10*20+8]= 0.103552;  daa[10*20+9]= 2.08989;   daa[11*20+0]= 0.393245;  
+              daa[11*20+1]= 1.755838;  daa[11*20+2]= 0.50306;   daa[11*20+3]= 0.261101;  daa[11*20+4]= 0.004067;  
+              daa[11*20+5]= 0.738208;  daa[11*20+6]= 0.88863;   daa[11*20+7]= 0.193243;  daa[11*20+8]= 0.153323;  
+              daa[11*20+9]= 0.093181;  daa[11*20+10]= 0.201204; daa[12*20+0]= 0.21155;   daa[12*20+1]= 0.08793;   
+              daa[12*20+2]= 0.05742;   daa[12*20+3]= 0.012182;  daa[12*20+4]= 0.02369;   daa[12*20+5]= 0.120801;  
+              daa[12*20+6]= 0.058643;  daa[12*20+7]= 0.04656;   daa[12*20+8]= 0.021157;  daa[12*20+9]= 0.493845;  
+              daa[12*20+10]= 1.105667; daa[12*20+11]= 0.096474; daa[13*20+0]= 0.116646;  daa[13*20+1]= 0.042569;  
+              daa[13*20+2]= 0.039769;  daa[13*20+3]= 0.016577;  daa[13*20+4]= 0.051127;  daa[13*20+5]= 0.026235;  
+              daa[13*20+6]= 0.028168;  daa[13*20+7]= 0.050143;  daa[13*20+8]= 0.079807;  daa[13*20+9]= 0.32102;   
+              daa[13*20+10]= 0.946499; daa[13*20+11]= 0.038261; daa[13*20+12]= 0.173052; daa[14*20+0]= 0.399143;  
+              daa[14*20+1]= 0.12848;   daa[14*20+2]= 0.083956;  daa[14*20+3]= 0.160063;  daa[14*20+4]= 0.011137;  
+              daa[14*20+5]= 0.15657;   daa[14*20+6]= 0.205134;  daa[14*20+7]= 0.124492;  daa[14*20+8]= 0.078892;  
+              daa[14*20+9]= 0.054797;  daa[14*20+10]= 0.169784; daa[14*20+11]= 0.212302; daa[14*20+12]= 0.010363; 
+              daa[14*20+13]= 0.042564; daa[15*20+0]= 1.817198;  daa[15*20+1]= 0.292327;  daa[15*20+2]= 0.847049;  
+              daa[15*20+3]= 0.461519;  daa[15*20+4]= 0.17527;   daa[15*20+5]= 0.358017;  daa[15*20+6]= 0.406035;  
+              daa[15*20+7]= 0.612843;  daa[15*20+8]= 0.167406;  daa[15*20+9]= 0.081567;  daa[15*20+10]= 0.214977; 
+              daa[15*20+11]= 0.400072; daa[15*20+12]= 0.090515; daa[15*20+13]= 0.138119; daa[15*20+14]= 0.430431; 
+              daa[16*20+0]= 0.877877;  daa[16*20+1]= 0.204109;  daa[16*20+2]= 0.471268;  daa[16*20+3]= 0.178197;  
+              daa[16*20+4]= 0.079511;  daa[16*20+5]= 0.248992;  daa[16*20+6]= 0.321028;  daa[16*20+7]= 0.136266;  
+              daa[16*20+8]= 0.101117;  daa[16*20+9]= 0.376588;  daa[16*20+10]= 0.243227; daa[16*20+11]= 0.446646; 
+              daa[16*20+12]= 0.184609; daa[16*20+13]= 0.08587;  daa[16*20+14]= 0.207143; daa[16*20+15]= 1.767766; 
+              daa[17*20+0]= 0.030309;  daa[17*20+1]= 0.046417;  daa[17*20+2]= 0.010459;  daa[17*20+3]= 0.011393;  
+              daa[17*20+4]= 0.007732;  daa[17*20+5]= 0.021248;  daa[17*20+6]= 0.018844;  daa[17*20+7]= 0.02399;   
+              daa[17*20+8]= 0.020009;  daa[17*20+9]= 0.034954;  daa[17*20+10]= 0.083439; daa[17*20+11]= 0.023321; 
+              daa[17*20+12]= 0.022019; daa[17*20+13]= 0.12805;  daa[17*20+14]= 0.014584; daa[17*20+15]= 0.035933; 
+              daa[17*20+16]= 0.020437; daa[18*20+0]= 0.087061;  daa[18*20+1]= 0.09701;   daa[18*20+2]= 0.093268;  
+              daa[18*20+3]= 0.051664;  daa[18*20+4]= 0.042823;  daa[18*20+5]= 0.062544;  daa[18*20+6]= 0.0552;    
+              daa[18*20+7]= 0.037568;  daa[18*20+8]= 0.286027;  daa[18*20+9]= 0.086237;  daa[18*20+10]= 0.189842; 
+              daa[18*20+11]= 0.068689; daa[18*20+12]= 0.073223; daa[18*20+13]= 0.898663; daa[18*20+14]= 0.032043; 
+              daa[18*20+15]= 0.121979; daa[18*20+16]= 0.094617; daa[18*20+17]= 0.124746; daa[19*20+0]= 1.230985;  
+              daa[19*20+1]= 0.113146;  daa[19*20+2]= 0.049824;  daa[19*20+3]= 0.048769;  daa[19*20+4]= 0.163831;  
+              daa[19*20+5]= 0.112027;  daa[19*20+6]= 0.205868;  daa[19*20+7]= 0.082579;  daa[19*20+8]= 0.068575;  
+              daa[19*20+9]= 3.65443;   daa[19*20+10]= 1.337571; daa[19*20+11]= 0.144587; daa[19*20+12]= 0.307309; 
+              daa[19*20+13]= 0.247329; daa[19*20+14]= 0.129315; daa[19*20+15]= 0.1277;   daa[19*20+16]= 0.740372; 
+              daa[19*20+17]= 0.022134; daa[19*20+18]= 0.125733;                     
+              
+              f[0]  = 0.07900;         f[1]= 0.05100;        f[2]  = 0.04200;         f[3]= 0.05300;         
+              f[4]  = 0.01500;         f[5]= 0.03700;        f[6]  = 0.06200;         f[7]= 0.07100;         
+              f[8]  = 0.02300;         f[9]= 0.06200;        f[10] = 0.09600;        f[11]= 0.05700;        
+              f[12] = 0.02400;        f[13]= 0.04300;        f[14] = 0.04400;        f[15]= 0.06400;        
+              f[16] = 0.05600;        f[17]= 0.01300;        f[18] = 0.03500;        f[19]= 0.07300; 
+            */
+
+            daa[1*20+0]=   1.2412691067876198;
+            daa[2*20+0]=   1.2184237953498958;
+            daa[2*20+1]=   1.5720770753326880;
+            daa[3*20+0]=   1.3759368509441177;
+            daa[3*20+1]=   0.7550654439001206;
+            daa[3*20+2]=   7.8584219153689405;
+            daa[4*20+0]=   2.4731223087544874;
+            daa[4*20+1]=   1.4414262567428417;
+            daa[4*20+2]=   0.9784679122774127;
+            daa[4*20+3]=   0.2272488448121475;
+            daa[5*20+0]=   2.2155167805137470;
+            daa[5*20+1]=   5.5120819705248678;
+            daa[5*20+2]=   3.0143201670924822;
+            daa[5*20+3]=   1.6562495638176040;
+            daa[5*20+4]=   0.4587469126746136;
+            daa[6*20+0]=   2.3379911207495061;
+            daa[6*20+1]=   1.3542404860613146;
+            daa[6*20+2]=   2.0093434778398112;
+            daa[6*20+3]=   9.6883451875685065;
+            daa[6*20+4]=   0.4519167943192672;
+            daa[6*20+5]=   6.8124601839937675;
+            daa[7*20+0]=   3.3386555146457697;
+            daa[7*20+1]=   1.3121700301622004;
+            daa[7*20+2]=   2.4117632898861809;
+            daa[7*20+3]=   1.9142079025990228;
+            daa[7*20+4]=   1.1034605684472507;
+            daa[7*20+5]=   0.8776110594765502;
+            daa[7*20+6]=   1.3860121390169038;
+            daa[8*20+0]=   0.9615841926910841;
+            daa[8*20+1]=   4.9238668283945266;
+            daa[8*20+2]=   6.1974384977884114;
+            daa[8*20+3]=   2.1459640610133781;
+            daa[8*20+4]=   1.5196756759380692;
+            daa[8*20+5]=   7.9943228564946525;
+            daa[8*20+6]=   1.6360079688522375;
+            daa[8*20+7]=   0.8561248973045037;
+            daa[9*20+0]=   0.8908203061925510;
+            daa[9*20+1]=   0.4323005487925516;
+            daa[9*20+2]=   0.9179291175331520;
+            daa[9*20+3]=   0.2161660372725585;
+            daa[9*20+4]=   0.9126668032539315;
+            daa[9*20+5]=   0.4882733432879921;
+            daa[9*20+6]=   0.4035497929633328;
+            daa[9*20+7]=   0.2888075033037488;
+            daa[9*20+8]=   0.5787937115407940;
+            daa[10*20+0]=  1.0778497408764076;
+            daa[10*20+1]=  0.8386701149158265;
+            daa[10*20+2]=  0.4098311270816011;
+            daa[10*20+3]=  0.3574207468998517;
+            daa[10*20+4]=  1.4081315998413697;
+            daa[10*20+5]=  1.3318097154194044;
+            daa[10*20+6]=  0.5610717242294755;
+            daa[10*20+7]=  0.3578662395745526;
+            daa[10*20+8]=  1.0765007949562073;
+            daa[10*20+9]=  6.0019110258426362;
+            daa[11*20+0]=  1.4932055816372476;
+            daa[11*20+1]=  10.017330817366002;
+            daa[11*20+2]=  4.4034547578962568;
+            daa[11*20+3]=  1.4521790561663968;
+            daa[11*20+4]=  0.3371091785647479;
+            daa[11*20+5]=  6.0519085243118811;
+            daa[11*20+6]=  4.3290086529582830;
+            daa[11*20+7]=  0.8945563662345198;
+            daa[11*20+8]=  1.8085136096039203;
+            daa[11*20+9]=  0.6244297525127139;
+            daa[11*20+10]= 0.5642322882556321;
+            daa[12*20+0]=  1.9006455961717605;
+            daa[12*20+1]=  1.2488638689609959;
+            daa[12*20+2]=  0.9378803706165143;
+            daa[12*20+3]=  0.4075239926000898;
+            daa[12*20+4]=  1.2213054800811556;
+            daa[12*20+5]=  1.9106190827629084;
+            daa[12*20+6]=  0.7471936218068498;
+            daa[12*20+7]=  0.5954812791740037;
+            daa[12*20+8]=  1.3808291710019667;
+            daa[12*20+9]=  6.7597899772045418;
+            daa[12*20+10]= 8.0327792947421148;
+            daa[12*20+11]= 1.7129670976916258;
+            daa[13*20+0]=  0.6883439026872615;
+            daa[13*20+1]=  0.4224945197276290;
+            daa[13*20+2]=  0.5044944273324311;
+            daa[13*20+3]=  0.1675129724559251;
+            daa[13*20+4]=  1.6953951980808002;
+            daa[13*20+5]=  0.3573432522499545;
+            daa[13*20+6]=  0.2317194387691585;
+            daa[13*20+7]=  0.3693722640980460;
+            daa[13*20+8]=  1.3629765501081097;
+            daa[13*20+9]=  2.2864286949316077;
+            daa[13*20+10]= 4.3611548063555778;
+            daa[13*20+11]= 0.3910559903834828;
+            daa[13*20+12]= 2.3201373546296349;
+            daa[14*20+0]=  2.7355620089953550;
+            daa[14*20+1]=  1.3091837782420783;
+            daa[14*20+2]=  0.7103720531974738;
+            daa[14*20+3]=  1.0714605979577547;
+            daa[14*20+4]=  0.4326227078645523;
+            daa[14*20+5]=  2.3019177728300728;
+            daa[14*20+6]=  1.5132807416252063;
+            daa[14*20+7]=  0.7744933618134962;
+            daa[14*20+8]=  1.8370555852070649;
+            daa[14*20+9]=  0.4811402387911145;
+            daa[14*20+10]= 1.0084320519837335;
+            daa[14*20+11]= 1.3918935593582853;
+            daa[14*20+12]= 0.4953193808676289;
+            daa[14*20+13]= 0.3746821107962129;
+            daa[15*20+0]=  6.4208961859142883;
+            daa[15*20+1]=  1.9202994262316166;
+            daa[15*20+2]=  6.1234512396801764;
+            daa[15*20+3]=  2.2161944596741829;
+            daa[15*20+4]=  3.6366815408744255;
+            daa[15*20+5]=  2.3193703643237220;
+            daa[15*20+6]=  1.8273535587773553;
+            daa[15*20+7]=  3.0637776193717610;
+            daa[15*20+8]=  1.9699895187387506;
+            daa[15*20+9]=  0.6047491507504744;
+            daa[15*20+10]= 0.8953754669269811;
+            daa[15*20+11]= 1.9776630140912268;
+            daa[15*20+12]= 1.0657482318076852;
+            daa[15*20+13]= 1.1079144700606407;
+            daa[15*20+14]= 3.5465914843628927;
+            daa[16*20+0]=  5.2892514169776437;
+            daa[16*20+1]=  1.3363401740560601;
+            daa[16*20+2]=  3.8852506105922231;
+            daa[16*20+3]=  1.5066839872944762;
+            daa[16*20+4]=  1.7557065205837685;
+            daa[16*20+5]=  2.1576510103471440;
+            daa[16*20+6]=  1.5839981708584689;
+            daa[16*20+7]=  0.7147489676267383;
+            daa[16*20+8]=  1.6136654573285647;
+            daa[16*20+9]=  2.6344778384442731;
+            daa[16*20+10]= 1.0192004372506540;
+            daa[16*20+11]= 2.5513781312660280;
+            daa[16*20+12]= 3.3628488360462363;
+            daa[16*20+13]= 0.6882725908872254;
+            daa[16*20+14]= 1.9485376673137556;
+            daa[16*20+15]= 8.8479984061248178;
+            daa[17*20+0]=  0.5488578478106930;
+            daa[17*20+1]=  1.5170142153962840;
+            daa[17*20+2]=  0.1808525752605976;
+            daa[17*20+3]=  0.2496584188151770;
+            daa[17*20+4]=  1.6275179891253113;
+            daa[17*20+5]=  0.8959082681546182;
+            daa[17*20+6]=  0.4198391148111098;
+            daa[17*20+7]=  0.9349753595598769;
+            daa[17*20+8]=  0.6301954684360302;
+            daa[17*20+9]=  0.5604648274060783;
+            daa[17*20+10]= 1.5183114434679339;
+            daa[17*20+11]= 0.5851920879490173;
+            daa[17*20+12]= 1.4680478689711018;
+            daa[17*20+13]= 3.3448437239772266;
+            daa[17*20+14]= 0.4326058001438786;
+            daa[17*20+15]= 0.6791126595939816;
+            daa[17*20+16]= 0.4514203099376473;
+            daa[18*20+0]=  0.5411769916657778;
+            daa[18*20+1]=  0.8912614404565405;
+            daa[18*20+2]=  1.0894926581511342;
+            daa[18*20+3]=  0.7447620891784513;
+            daa[18*20+4]=  2.1579775140421025;
+            daa[18*20+5]=  0.9183596801412757;
+            daa[18*20+6]=  0.5818111331782764;
+            daa[18*20+7]=  0.3374467649724478;
+            daa[18*20+8]=  7.7587442309146040;
+            daa[18*20+9]=  0.8626796044156272;
+            daa[18*20+10]= 1.2452243224541324;
+            daa[18*20+11]= 0.7835447533710449;
+            daa[18*20+12]= 1.0899165770956820;
+            daa[18*20+13]= 10.384852333133459;
+            daa[18*20+14]= 0.4819109019647465;
+            daa[18*20+15]= 0.9547229305958682;
+            daa[18*20+16]= 0.8564314184691215;
+            daa[18*20+17]= 4.5377235790405388;
+            daa[19*20+0]=  4.6501894691803214;
+            daa[19*20+1]=  0.7807017855806767;
+            daa[19*20+2]=  0.4586061981719967;
+            daa[19*20+3]=  0.4594535241660911;
+            daa[19*20+4]=  2.2627456996290891;
+            daa[19*20+5]=  0.6366932501396869;
+            daa[19*20+6]=  0.8940572875547330;
+            daa[19*20+7]=  0.6193321034173915;
+            daa[19*20+8]=  0.5333220944030346;
+            daa[19*20+9]=  14.872933461519061;
+            daa[19*20+10]= 3.5458093276667237;
+            daa[19*20+11]= 0.7801080335991272;
+            daa[19*20+12]= 4.0584577156753401;
+            daa[19*20+13]= 1.7039730522675411;
+            daa[19*20+14]= 0.5985498912985666;
+            daa[19*20+15]= 0.9305232113028208;
+            daa[19*20+16]= 3.4242218450865543;
+            daa[19*20+17]= 0.5658969249032649;
+            daa[19*20+18]= 1.0000000000000000;
+            
+            f[0]=  0.0770764620135024;
+            f[1]=  0.0500819370772208;
+            f[2]=  0.0462377395993731;
+            f[3]=  0.0537929860758246;
+            f[4]=  0.0144533387583345;
+            f[5]=  0.0408923608974345;
+            f[6]=  0.0633579339160905;
+            f[7]=  0.0655672355884439;
+            f[8]=  0.0218802687005936;
+            f[9]=  0.0591969699027449;
+            f[10]= 0.0976461276528445;
+            f[11]= 0.0592079410822730;
+            f[12]= 0.0220695876653368;
+            f[13]= 0.0413508521834260;
+            f[14]= 0.0476871596856874;
+            f[15]= 0.0707295165111524;
+            f[16]= 0.0567759161524817;
+            f[17]= 0.0127019797647213;
+            f[18]= 0.0323746050281867;
+            f[19]= 0.0669190817443274;
+          }
+          break;
+        case PLL_BLOSUM62:
+          {
+            daa[1*20+0]= 0.735790389698;  daa[2*20+0]= 0.485391055466;  daa[2*20+1]= 1.297446705134;  
+            daa[3*20+0]= 0.543161820899;  
+            daa[3*20+1]= 0.500964408555;  daa[3*20+2]= 3.180100048216;  daa[4*20+0]= 1.45999531047;   
+            daa[4*20+1]= 0.227826574209;  
+            daa[4*20+2]= 0.397358949897;  daa[4*20+3]= 0.240836614802;  daa[5*20+0]= 1.199705704602;  
+            daa[5*20+1]= 3.020833610064;  
+            daa[5*20+2]= 1.839216146992;  daa[5*20+3]= 1.190945703396;  daa[5*20+4]= 0.32980150463;   
+            daa[6*20+0]= 1.1709490428;    
+            daa[6*20+1]= 1.36057419042;   daa[6*20+2]= 1.24048850864;   daa[6*20+3]= 3.761625208368;  
+            daa[6*20+4]= 0.140748891814;  
+            daa[6*20+5]= 5.528919177928;  daa[7*20+0]= 1.95588357496;   daa[7*20+1]= 0.418763308518;  
+            daa[7*20+2]= 1.355872344485;  
+            daa[7*20+3]= 0.798473248968;  daa[7*20+4]= 0.418203192284;  daa[7*20+5]= 0.609846305383;  
+            daa[7*20+6]= 0.423579992176;  
+            daa[8*20+0]= 0.716241444998;  daa[8*20+1]= 1.456141166336;  daa[8*20+2]= 2.414501434208;  
+            daa[8*20+3]= 0.778142664022;  
+            daa[8*20+4]= 0.354058109831;  daa[8*20+5]= 2.43534113114;   daa[8*20+6]= 1.626891056982;  
+            daa[8*20+7]= 0.539859124954;  
+            daa[9*20+0]= 0.605899003687;  daa[9*20+1]= 0.232036445142;  daa[9*20+2]= 0.283017326278;  
+            daa[9*20+3]= 0.418555732462;  
+            daa[9*20+4]= 0.774894022794;  daa[9*20+5]= 0.236202451204;  daa[9*20+6]= 0.186848046932;  
+            daa[9*20+7]= 0.189296292376;  
+            daa[9*20+8]= 0.252718447885;  daa[10*20+0]= 0.800016530518; daa[10*20+1]= 0.622711669692; 
+            daa[10*20+2]= 0.211888159615; 
+            daa[10*20+3]= 0.218131577594; daa[10*20+4]= 0.831842640142; daa[10*20+5]= 0.580737093181; 
+            daa[10*20+6]= 0.372625175087; 
+            daa[10*20+7]= 0.217721159236; daa[10*20+8]= 0.348072209797; daa[10*20+9]= 3.890963773304; 
+            daa[11*20+0]= 1.295201266783; 
+            daa[11*20+1]= 5.411115141489; daa[11*20+2]= 1.593137043457; daa[11*20+3]= 1.032447924952; 
+            daa[11*20+4]= 0.285078800906; 
+            daa[11*20+5]= 3.945277674515; daa[11*20+6]= 2.802427151679; daa[11*20+7]= 0.752042440303; 
+            daa[11*20+8]= 1.022507035889; 
+            daa[11*20+9]= 0.406193586642; daa[11*20+10]= 0.445570274261;daa[12*20+0]= 1.253758266664; 
+            daa[12*20+1]= 0.983692987457; 
+            daa[12*20+2]= 0.648441278787; daa[12*20+3]= 0.222621897958; daa[12*20+4]= 0.76768882348;  
+            daa[12*20+5]= 2.494896077113; 
+            daa[12*20+6]= 0.55541539747;  daa[12*20+7]= 0.459436173579; daa[12*20+8]= 0.984311525359; 
+            daa[12*20+9]= 3.364797763104; 
+            daa[12*20+10]= 6.030559379572;daa[12*20+11]= 1.073061184332;daa[13*20+0]= 0.492964679748; 
+            daa[13*20+1]= 0.371644693209; 
+            daa[13*20+2]= 0.354861249223; daa[13*20+3]= 0.281730694207; daa[13*20+4]= 0.441337471187; 
+            daa[13*20+5]= 0.14435695975;  
+            daa[13*20+6]= 0.291409084165; daa[13*20+7]= 0.368166464453; daa[13*20+8]= 0.714533703928; 
+            daa[13*20+9]= 1.517359325954; 
+            daa[13*20+10]= 2.064839703237;daa[13*20+11]= 0.266924750511;daa[13*20+12]= 1.77385516883; 
+            daa[14*20+0]= 1.173275900924; 
+            daa[14*20+1]= 0.448133661718; daa[14*20+2]= 0.494887043702; daa[14*20+3]= 0.730628272998; 
+            daa[14*20+4]= 0.356008498769; 
+            daa[14*20+5]= 0.858570575674; daa[14*20+6]= 0.926563934846; daa[14*20+7]= 0.504086599527; daa[14*20+8]= 0.527007339151; 
+            daa[14*20+9]= 0.388355409206; daa[14*20+10]= 0.374555687471;daa[14*20+11]= 1.047383450722;daa[14*20+12]= 0.454123625103;
+            daa[14*20+13]= 0.233597909629;daa[15*20+0]= 4.325092687057; daa[15*20+1]= 1.12278310421;  daa[15*20+2]= 2.904101656456; 
+            daa[15*20+3]= 1.582754142065; daa[15*20+4]= 1.197188415094; daa[15*20+5]= 1.934870924596; daa[15*20+6]= 1.769893238937; 
+            daa[15*20+7]= 1.509326253224; daa[15*20+8]= 1.11702976291;  daa[15*20+9]= 0.35754441246;  daa[15*20+10]= 0.352969184527;
+            daa[15*20+11]= 1.752165917819;daa[15*20+12]= 0.918723415746;daa[15*20+13]= 0.540027644824;daa[15*20+14]= 1.169129577716;
+            daa[16*20+0]= 1.729178019485; daa[16*20+1]= 0.914665954563; daa[16*20+2]= 1.898173634533; daa[16*20+3]= 0.934187509431; 
+            daa[16*20+4]= 1.119831358516; daa[16*20+5]= 1.277480294596; daa[16*20+6]= 1.071097236007; daa[16*20+7]= 0.641436011405; 
+            daa[16*20+8]= 0.585407090225; daa[16*20+9]= 1.17909119726;  daa[16*20+10]= 0.915259857694;daa[16*20+11]= 1.303875200799;
+            daa[16*20+12]= 1.488548053722;daa[16*20+13]= 0.488206118793;daa[16*20+14]= 1.005451683149;daa[16*20+15]= 5.15155629227; 
+            daa[17*20+0]= 0.465839367725; daa[17*20+1]= 0.426382310122; daa[17*20+2]= 0.191482046247; daa[17*20+3]= 0.145345046279; 
+            daa[17*20+4]= 0.527664418872; daa[17*20+5]= 0.758653808642; daa[17*20+6]= 0.407635648938; daa[17*20+7]= 0.508358924638; 
+            daa[17*20+8]= 0.30124860078;  daa[17*20+9]= 0.34198578754;  daa[17*20+10]= 0.6914746346;  daa[17*20+11]= 0.332243040634;
+            daa[17*20+12]= 0.888101098152;daa[17*20+13]= 2.074324893497;daa[17*20+14]= 0.252214830027;daa[17*20+15]= 0.387925622098;
+            daa[17*20+16]= 0.513128126891;daa[18*20+0]= 0.718206697586; daa[18*20+1]= 0.720517441216; daa[18*20+2]= 0.538222519037; 
+            daa[18*20+3]= 0.261422208965; daa[18*20+4]= 0.470237733696; daa[18*20+5]= 0.95898974285;  daa[18*20+6]= 0.596719300346; 
+            daa[18*20+7]= 0.308055737035; daa[18*20+8]= 4.218953969389; daa[18*20+9]= 0.674617093228; daa[18*20+10]= 0.811245856323;
+            daa[18*20+11]= 0.7179934869;  daa[18*20+12]= 0.951682162246;daa[18*20+13]= 6.747260430801;daa[18*20+14]= 0.369405319355;
+            daa[18*20+15]= 0.796751520761;daa[18*20+16]= 0.801010243199;daa[18*20+17]= 4.054419006558;daa[19*20+0]= 2.187774522005; 
+            daa[19*20+1]= 0.438388343772; daa[19*20+2]= 0.312858797993; daa[19*20+3]= 0.258129289418; daa[19*20+4]= 1.116352478606; 
+            daa[19*20+5]= 0.530785790125; daa[19*20+6]= 0.524253846338; daa[19*20+7]= 0.25334079019;  daa[19*20+8]= 0.20155597175;  
+            daa[19*20+9]= 8.311839405458; daa[19*20+10]= 2.231405688913;daa[19*20+11]= 0.498138475304;daa[19*20+12]= 2.575850755315;
+            daa[19*20+13]= 0.838119610178;daa[19*20+14]= 0.496908410676;daa[19*20+15]= 0.561925457442;daa[19*20+16]= 2.253074051176;
+            daa[19*20+17]= 0.266508731426;daa[19*20+18]= 1;             
+            
+            f[0]= 0.074;                 f[1]= 0.052;                 f[2]= 0.045;                 f[3]= 0.054;                 
+            f[4]= 0.025;                 f[5]= 0.034;                 f[6]= 0.054;                 f[7]= 0.074;                 
+            f[8]= 0.026;                 f[9]= 0.068;                 f[10]= 0.099;                f[11]= 0.058;                
+            f[12]= 0.025;                f[13]= 0.047;                f[14]= 0.039;                f[15]= 0.057;                
+            f[16]= 0.051;                f[17]= 0.013;                f[18]= 0.032;                f[19]= 0.073;
+          }
+          break;
+        case PLL_MTMAM: /* Literal parameter set selected by PLL_MTMAM (presumably the mtMAM mitochondrial amino-acid model -- confirm against the enum definition). */
+          { /* Assigns daa[row*20 + col] only for col < row (rows 1..19), then f[0..19]; entries with col >= row are not written in this arm. */
+            daa[1*20+0]= 32;              daa[2*20+0]= 2;    daa[2*20+1]= 4;               daa[3*20+0]= 11;
+            daa[3*20+1]= 0;               daa[3*20+2]= 864;  daa[4*20+0]= 0;               daa[4*20+1]= 186;
+            daa[4*20+2]= 0;               daa[4*20+3]= 0;    daa[5*20+0]= 0;               daa[5*20+1]= 246;
+            daa[5*20+2]= 8;               daa[5*20+3]= 49;   daa[5*20+4]= 0;               daa[6*20+0]= 0;
+            daa[6*20+1]= 0;               daa[6*20+2]= 0;    daa[6*20+3]= 569;             daa[6*20+4]= 0;
+            daa[6*20+5]= 274;             daa[7*20+0]= 78;   daa[7*20+1]= 18;              daa[7*20+2]= 47;
+            daa[7*20+3]= 79;              daa[7*20+4]= 0;    daa[7*20+5]= 0;               daa[7*20+6]= 22;
+            daa[8*20+0]= 8;               daa[8*20+1]= 232;  daa[8*20+2]= 458;             daa[8*20+3]= 11;
+            daa[8*20+4]= 305;             daa[8*20+5]= 550;  daa[8*20+6]= 22;              daa[8*20+7]= 0;
+            daa[9*20+0]= 75;              daa[9*20+1]= 0;    daa[9*20+2]= 19;              daa[9*20+3]= 0;
+            daa[9*20+4]= 41;              daa[9*20+5]= 0;    daa[9*20+6]= 0;               daa[9*20+7]= 0;
+            daa[9*20+8]= 0;               daa[10*20+0]= 21;  daa[10*20+1]= 6;              daa[10*20+2]= 0;
+            daa[10*20+3]= 0;              daa[10*20+4]= 27;  daa[10*20+5]= 20;             daa[10*20+6]= 0;
+            daa[10*20+7]= 0;              daa[10*20+8]= 26;  daa[10*20+9]= 232;            daa[11*20+0]= 0;
+            daa[11*20+1]= 50;             daa[11*20+2]= 408; daa[11*20+3]= 0;              daa[11*20+4]= 0;
+            daa[11*20+5]= 242;            daa[11*20+6]= 215; daa[11*20+7]= 0;              daa[11*20+8]= 0;
+            daa[11*20+9]= 6;              daa[11*20+10]= 4;  daa[12*20+0]= 76;             daa[12*20+1]= 0;
+            daa[12*20+2]= 21;             daa[12*20+3]= 0;   daa[12*20+4]= 0;              daa[12*20+5]= 22;
+            daa[12*20+6]= 0;              daa[12*20+7]= 0;   daa[12*20+8]= 0;              daa[12*20+9]= 378;
+            daa[12*20+10]= 609;           daa[12*20+11]= 59; daa[13*20+0]= 0;              daa[13*20+1]= 0;
+            daa[13*20+2]= 6;              daa[13*20+3]= 5;   daa[13*20+4]= 7;              daa[13*20+5]= 0;
+            daa[13*20+6]= 0;              daa[13*20+7]= 0;   daa[13*20+8]= 0;              daa[13*20+9]= 57;
+            daa[13*20+10]= 246;           daa[13*20+11]= 0;  daa[13*20+12]= 11;            daa[14*20+0]= 53;
+            daa[14*20+1]= 9;              daa[14*20+2]= 33;  daa[14*20+3]= 2;              daa[14*20+4]= 0;
+            daa[14*20+5]= 51;             daa[14*20+6]= 0;   daa[14*20+7]= 0;              daa[14*20+8]= 53;
+            daa[14*20+9]= 5;              daa[14*20+10]= 43; daa[14*20+11]= 18;            daa[14*20+12]= 0;
+            daa[14*20+13]= 17;            daa[15*20+0]= 342; daa[15*20+1]= 3;              daa[15*20+2]= 446;
+            daa[15*20+3]= 16;             daa[15*20+4]= 347; daa[15*20+5]= 30;             daa[15*20+6]= 21;
+            daa[15*20+7]= 112;            daa[15*20+8]= 20;  daa[15*20+9]= 0;              daa[15*20+10]= 74;
+            daa[15*20+11]= 65;            daa[15*20+12]= 47; daa[15*20+13]= 90;            daa[15*20+14]= 202;
+            daa[16*20+0]= 681;            daa[16*20+1]= 0;   daa[16*20+2]= 110;            daa[16*20+3]= 0;
+            daa[16*20+4]= 114;            daa[16*20+5]= 0;   daa[16*20+6]= 4;              daa[16*20+7]= 0;
+            daa[16*20+8]= 1;              daa[16*20+9]= 360; daa[16*20+10]= 34;            daa[16*20+11]= 50;
+            daa[16*20+12]= 691;           daa[16*20+13]= 8;  daa[16*20+14]= 78;            daa[16*20+15]= 614;
+            daa[17*20+0]= 5;              daa[17*20+1]= 16;  daa[17*20+2]= 6;              daa[17*20+3]= 0;
+            daa[17*20+4]= 65;             daa[17*20+5]= 0;   daa[17*20+6]= 0;              daa[17*20+7]= 0;
+            daa[17*20+8]= 0;              daa[17*20+9]= 0;   daa[17*20+10]= 12;            daa[17*20+11]= 0;
+            daa[17*20+12]= 13;            daa[17*20+13]= 0;  daa[17*20+14]= 7;             daa[17*20+15]= 17;
+            daa[17*20+16]= 0;             daa[18*20+0]= 0;   daa[18*20+1]= 0;              daa[18*20+2]= 156;
+            daa[18*20+3]= 0;              daa[18*20+4]= 530; daa[18*20+5]= 54;             daa[18*20+6]= 0;
+            daa[18*20+7]= 1;              daa[18*20+8]= 1525;daa[18*20+9]= 16;             daa[18*20+10]= 25;
+            daa[18*20+11]= 67;            daa[18*20+12]= 0;  daa[18*20+13]= 682;           daa[18*20+14]= 8;
+            daa[18*20+15]= 107;           daa[18*20+16]= 0;  daa[18*20+17]= 14;            daa[19*20+0]= 398;
+            daa[19*20+1]= 0;              daa[19*20+2]= 0;   daa[19*20+3]= 10;             daa[19*20+4]= 0;
+            daa[19*20+5]= 33;             daa[19*20+6]= 20;  daa[19*20+7]= 5;              daa[19*20+8]= 0;
+            daa[19*20+9]= 2220;           daa[19*20+10]= 100;daa[19*20+11]= 0;             daa[19*20+12]= 832;
+            daa[19*20+13]= 6;             daa[19*20+14]= 0;  daa[19*20+15]= 0;             daa[19*20+16]= 237;
+            daa[19*20+17]= 0;             daa[19*20+18]= 0;       
+            /* The 20 literals stored in f[] below add up to 1 (presumably the model's amino-acid frequencies -- TODO confirm). */
+            f[0]= 0.06920;  f[1]=  0.01840;  f[2]= 0.04000;  f[3]= 0.018600;
+            f[4]= 0.00650;  f[5]=  0.02380;  f[6]= 0.02360;  f[7]= 0.055700;
+            f[8]= 0.02770;  f[9]=  0.09050;  f[10]=0.16750;  f[11]= 0.02210;
+            f[12]=0.05610;  f[13]= 0.06110;  f[14]=0.05360;  f[15]= 0.07250;
+            f[16]=0.08700;  f[17]= 0.02930;  f[18]=0.03400;  f[19]= 0.04280;
+          }
+          break; /* no fall-through into the next case label */
+        case PLL_LG: /* Literal parameter set selected by PLL_LG (presumably the LG amino-acid replacement model -- confirm against the enum definition). */
+          { /* One group of assignments per row: daa[row*20 + col] for col < row (rows 1..19), then f[0..19]; entries with col >= row are not written in this arm. */
+            daa[1*20+0] = 0.425093;
+
+            daa[2*20+0] = 0.276818; daa[2*20+1] = 0.751878;
+
+            daa[3*20+0] = 0.395144; daa[3*20+1] = 0.123954; daa[3*20+2] = 5.076149;
+            
+            daa[4*20+0] = 2.489084; daa[4*20+1] = 0.534551; daa[4*20+2] = 0.528768; daa[4*20+3] = 0.062556;
+                                                                 
+            daa[5*20+0] = 0.969894; daa[5*20+1] = 2.807908; daa[5*20+2] = 1.695752; daa[5*20+3] = 0.523386; daa[5*20+4] = 0.084808;
+
+            daa[6*20+0] = 1.038545; daa[6*20+1] = 0.363970; daa[6*20+2] = 0.541712; daa[6*20+3] = 5.243870; daa[6*20+4] = 0.003499; daa[6*20+5] = 4.128591;
+
+            daa[7*20+0] = 2.066040; daa[7*20+1] = 0.390192; daa[7*20+2] = 1.437645; daa[7*20+3] = 0.844926; daa[7*20+4] = 0.569265; daa[7*20+5] = 0.267959; daa[7*20+6] = 0.348847;
+ 
+            daa[8*20+0] = 0.358858; daa[8*20+1] = 2.426601; daa[8*20+2] = 4.509238; daa[8*20+3] = 0.927114; daa[8*20+4] = 0.640543; daa[8*20+5] = 4.813505; daa[8*20+6] = 0.423881; 
+            daa[8*20+7] = 0.311484;
+
+            daa[9*20+0] = 0.149830; daa[9*20+1] = 0.126991; daa[9*20+2] = 0.191503; daa[9*20+3] = 0.010690; daa[9*20+4] = 0.320627; daa[9*20+5] = 0.072854; daa[9*20+6] = 0.044265; 
+            daa[9*20+7] = 0.008705; daa[9*20+8] = 0.108882; 
+
+            daa[10*20+0] = 0.395337; daa[10*20+1] = 0.301848; daa[10*20+2] = 0.068427; daa[10*20+3] = 0.015076; daa[10*20+4] = 0.594007; daa[10*20+5] = 0.582457; daa[10*20+6] = 0.069673; 
+            daa[10*20+7] = 0.044261; daa[10*20+8] = 0.366317; daa[10*20+9] = 4.145067 ;
+
+            daa[11*20+0] = 0.536518; daa[11*20+1] = 6.326067; daa[11*20+2] = 2.145078; daa[11*20+3] = 0.282959; daa[11*20+4] = 0.013266; daa[11*20+5] = 3.234294; daa[11*20+6] = 1.807177; 
+            daa[11*20+7] = 0.296636; daa[11*20+8] = 0.697264; daa[11*20+9] = 0.159069; daa[11*20+10] = 0.137500;
+
+
+            daa[12*20+0] = 1.124035; daa[12*20+1] = 0.484133; daa[12*20+2] = 0.371004; daa[12*20+3] = 0.025548; daa[12*20+4] = 0.893680; daa[12*20+5] = 1.672569; daa[12*20+6] = 0.173735; 
+            daa[12*20+7] = 0.139538; daa[12*20+8] = 0.442472; daa[12*20+9] = 4.273607; daa[12*20+10] = 6.312358; daa[12*20+11] = 0.656604;
+
+            daa[13*20+0] = 0.253701; daa[13*20+1] = 0.052722;daa[13*20+2] = 0.089525; daa[13*20+3] = 0.017416; daa[13*20+4] = 1.105251; daa[13*20+5] = 0.035855; daa[13*20+6] = 0.018811; 
+            daa[13*20+7] = 0.089586; daa[13*20+8] = 0.682139; daa[13*20+9] = 1.112727; daa[13*20+10] = 2.592692; daa[13*20+11] = 0.023918; daa[13*20+12] = 1.798853;
+
+            daa[14*20+0] = 1.177651; daa[14*20+1] = 0.332533;daa[14*20+2] = 0.161787; daa[14*20+3] = 0.394456; daa[14*20+4] = 0.075382; daa[14*20+5] = 0.624294; daa[14*20+6] = 0.419409; 
+            daa[14*20+7] = 0.196961; daa[14*20+8] = 0.508851; daa[14*20+9] = 0.078281; daa[14*20+10] = 0.249060; daa[14*20+11] = 0.390322; daa[14*20+12] = 0.099849; 
+            daa[14*20+13] = 0.094464;
+ 
+            daa[15*20+0] = 4.727182; daa[15*20+1] = 0.858151;daa[15*20+2] = 4.008358; daa[15*20+3] = 1.240275; daa[15*20+4] = 2.784478; daa[15*20+5] = 1.223828; daa[15*20+6] = 0.611973; 
+            daa[15*20+7] = 1.739990; daa[15*20+8] = 0.990012; daa[15*20+9] = 0.064105; daa[15*20+10] = 0.182287; daa[15*20+11] = 0.748683; daa[15*20+12] = 0.346960; 
+            daa[15*20+13] = 0.361819; daa[15*20+14] = 1.338132;
+ 
+            daa[16*20+0] = 2.139501; daa[16*20+1] = 0.578987;daa[16*20+2] = 2.000679; daa[16*20+3] = 0.425860; daa[16*20+4] = 1.143480; daa[16*20+5] = 1.080136; daa[16*20+6] = 0.604545; 
+            daa[16*20+7] = 0.129836; daa[16*20+8] = 0.584262; daa[16*20+9] = 1.033739; daa[16*20+10] = 0.302936; daa[16*20+11] = 1.136863; daa[16*20+12] = 2.020366; 
+            daa[16*20+13] = 0.165001; daa[16*20+14] = 0.571468; daa[16*20+15] = 6.472279;
+
+            daa[17*20+0] = 0.180717; daa[17*20+1] = 0.593607;daa[17*20+2] = 0.045376; daa[17*20+3] = 0.029890; daa[17*20+4] = 0.670128; daa[17*20+5] = 0.236199; daa[17*20+6] = 0.077852; 
+            daa[17*20+7] = 0.268491; daa[17*20+8] = 0.597054; daa[17*20+9] = 0.111660; daa[17*20+10] = 0.619632; daa[17*20+11] = 0.049906; daa[17*20+12] = 0.696175; 
+            daa[17*20+13] = 2.457121; daa[17*20+14] = 0.095131; daa[17*20+15] = 0.248862; daa[17*20+16] = 0.140825;
+
+            daa[18*20+0] = 0.218959; daa[18*20+1] = 0.314440;daa[18*20+2] = 0.612025; daa[18*20+3] = 0.135107; daa[18*20+4] = 1.165532; daa[18*20+5] = 0.257336; daa[18*20+6] = 0.120037; 
+            daa[18*20+7] = 0.054679; daa[18*20+8] = 5.306834; daa[18*20+9] = 0.232523; daa[18*20+10] = 0.299648; daa[18*20+11] = 0.131932; daa[18*20+12] = 0.481306; 
+            daa[18*20+13] = 7.803902; daa[18*20+14] = 0.089613; daa[18*20+15] = 0.400547; daa[18*20+16] = 0.245841; daa[18*20+17] = 3.151815;
+
+            daa[19*20+0] = 2.547870; daa[19*20+1] = 0.170887;daa[19*20+2] = 0.083688; daa[19*20+3] = 0.037967; daa[19*20+4] = 1.959291; daa[19*20+5] = 0.210332; daa[19*20+6] = 0.245034; 
+            daa[19*20+7] = 0.076701; daa[19*20+8] = 0.119013; daa[19*20+9] = 10.649107; daa[19*20+10] = 1.702745; daa[19*20+11] = 0.185202; daa[19*20+12] = 1.898718; 
+            daa[19*20+13] = 0.654683; daa[19*20+14] = 0.296501; daa[19*20+15] = 0.098369; daa[19*20+16] = 2.188158; daa[19*20+17] = 0.189510; daa[19*20+18] = 0.249313;
+            /* The 20 literals stored in f[] below add up to 1 (presumably the model's amino-acid frequencies -- TODO confirm). */
+            f[0]  = 0.079066; f[1]  = 0.055941; f[2]  = 0.041977; f[3]  = 0.053052;
+	    f[4]  = 0.012937; f[5]  = 0.040767; f[6]  = 0.071586; f[7]  = 0.057337;
+	    f[8]  = 0.022355; f[9]  = 0.062157; f[10] = 0.099081; f[11] = 0.064600;
+	    f[12] = 0.022951; f[13] = 0.042302; f[14] = 0.044040; f[15] = 0.061197;
+	    f[16] = 0.053287; f[17] = 0.012066; f[18] = 0.034155; f[19] = 0.069146;       
+          }       
+          break; /* no fall-through into the next case label */
+          case PLL_LG4M: /* Selects one of four built-in parameter tables via lg4_index and copies it into daa[] and f[]. */
+          { /* lg4_index is not declared in this arm; it comes from the enclosing scope (outside the visible source). */
+            double 
+              rates[4][190] = /* Four alternative tables of 190 values; each is laid out as rows of 1, 2, ..., 19 values, the same (i, j < i) order the copy loop below consumes. */
+              {
+                {
+                  0.269343
+                  , 0.254612, 0.150988
+                  , 0.236821, 0.031863, 0.659648
+                  , 2.506547, 0.938594, 0.975736, 0.175533
+                  , 0.359080, 0.348288, 0.697708, 0.086573, 0.095967
+                  , 0.304674, 0.156000, 0.377704, 0.449140, 0.064706, 4.342595
+                  , 1.692015, 0.286638, 0.565095, 0.380358, 0.617945, 0.202058, 0.264342
+                  , 0.251974, 0.921633, 1.267609, 0.309692, 0.390429, 2.344059, 0.217750, 0.104842
+                  , 1.085220, 0.325624, 0.818658, 0.037814, 1.144150, 0.534567, 0.222793, 0.062682, 0.567431
+                  , 0.676353, 0.602366, 0.217027, 0.007533, 1.595775, 0.671143, 0.158424, 0.070463, 0.764255, 8.226528
+                  , 0.179155, 0.971338, 1.343718, 0.133744, 0.122468, 0.983857, 0.994128, 0.220916, 0.410581, 0.387487, 0.181110
+                  , 1.636817, 0.515217, 0.670461, 0.071252, 1.534848, 5.288642, 0.255628, 0.094198, 0.257229, 25.667158, 6.819689, 1.591212
+                  , 0.235498, 0.123932, 0.099793, 0.030425, 0.897279, 0.112229, 0.022529, 0.047488, 0.762914, 1.344259, 0.865691, 0.038921, 2.030833
+                  , 1.265605, 0.040163, 0.173354, 0.027579, 0.259961, 0.580374, 0.088041, 0.145595, 0.143676, 0.298859, 1.020117, 0.000714, 0.190019, 0.093964
+                  , 5.368405, 0.470952, 5.267140, 0.780505, 4.986071, 0.890554, 0.377949, 1.755515, 0.786352, 0.527246, 0.667783, 0.659948, 0.731921, 0.837669, 1.355630
+                  , 1.539394, 0.326789, 1.688169, 0.283738, 1.389282, 0.329821, 0.231770, 0.117017, 0.449977, 3.531600, 0.721586, 0.497588, 2.691697, 0.152088, 0.698040, 16.321298
+                  , 0.140944, 0.375611, 0.025163, 0.002757, 0.801456, 0.257253, 0.103678, 0.132995, 0.345834, 0.377156, 0.839647, 0.176970, 0.505682, 1.670170, 0.091298, 0.210096, 0.013165
+                  , 0.199836, 0.146857, 0.806275, 0.234246, 1.436970, 0.319669, 0.010076, 0.036859, 3.503317, 0.598632, 0.738969, 0.154436, 0.579000, 4.245524, 0.074524, 0.454195, 0.232913, 1.178490
+                  , 9.435529, 0.285934, 0.395670, 0.130890, 6.097263, 0.516259, 0.503665, 0.222960, 0.149143, 13.666175, 2.988174, 0.162725, 5.973826, 0.843416, 0.597394, 0.701149, 4.680002, 0.300085, 0.416262
+                },
+                {
+                  0.133720
+                  , 0.337212, 0.749052
+                  , 0.110918, 0.105087, 4.773487
+                  , 3.993460, 0.188305, 1.590332, 0.304942
+                  , 0.412075, 2.585774, 1.906884, 0.438367, 0.242076
+                  , 0.435295, 0.198278, 0.296366, 7.470333, 0.008443, 3.295515
+                  , 7.837540, 0.164607, 0.431724, 0.153850, 1.799716, 0.269744, 0.242866
+                  , 0.203872, 2.130334, 9.374479, 1.080878, 0.152458, 12.299133, 0.279589, 0.089714
+                  , 0.039718, 0.024553, 0.135254, 0.014979, 0.147498, 0.033964, 0.005585, 0.007248, 0.022746
+                  , 0.075784, 0.080091, 0.084971, 0.014128, 0.308347, 0.500836, 0.022833, 0.022999, 0.161270, 1.511682
+                  , 0.177662, 10.373708, 1.036721, 0.038303, 0.043030, 2.181033, 0.321165, 0.103050, 0.459502, 0.021215, 0.078395
+                  , 0.420784, 0.192765, 0.329545, 0.008331, 0.883142, 1.403324, 0.168673, 0.160728, 0.612573, 1.520889, 7.763266, 0.307903
+                  , 0.071268, 0.019652, 0.088753, 0.013547, 0.566609, 0.071878, 0.020050, 0.041022, 0.625361, 0.382806, 1.763059, 0.044644, 1.551911
+                  , 0.959127, 1.496585, 0.377794, 0.332010, 0.318192, 1.386970, 0.915904, 0.224255, 2.611479, 0.029351, 0.068250, 1.542356, 0.047525, 0.182715
+                  , 11.721512, 0.359408, 2.399158, 0.219464, 9.104192, 0.767563, 0.235229, 3.621219, 0.971955, 0.033780, 0.043035, 0.236929, 0.319964, 0.124977, 0.840651
+                  , 2.847068, 0.218463, 1.855386, 0.109808, 4.347048, 0.765848, 0.164569, 0.312024, 0.231569, 0.356327, 0.159597, 0.403210, 1.135162, 0.106903, 0.269190, 9.816481
+                  , 0.030203, 0.387292, 0.118878, 0.067287, 0.190240, 0.122113, 0.007023, 0.137411, 0.585141, 0.020634, 0.228824, 0.000122, 0.474862, 3.135128, 0.030313, 0.093830, 0.119152
+                  , 0.067183, 0.130101, 0.348730, 0.061798, 0.301198, 0.095382, 0.095764, 0.044628, 2.107384, 0.046105, 0.100117, 0.017073, 0.192383, 8.367641, 0.000937, 0.137416, 0.044722, 4.179782
+                  , 0.679398, 0.041567, 0.092408, 0.023701, 1.271187, 0.115566, 0.055277, 0.086988, 0.060779, 8.235167, 0.609420, 0.061764, 0.581962, 0.184187, 0.080246, 0.098033, 1.438350, 0.023439, 0.039124
+                },          
+                {
+                  0.421017
+                  , 0.316236, 0.693340
+                  , 0.285984, 0.059926, 6.158219
+                  , 4.034031, 1.357707, 0.708088, 0.063669
+                  , 0.886972, 2.791622, 1.701830, 0.484347, 0.414286
+                  , 0.760525, 0.233051, 0.378723, 4.032667, 0.081977, 4.940411
+                  , 0.754103, 0.402894, 2.227443, 1.102689, 0.416576, 0.459376, 0.508409
+                  , 0.571422, 2.319453, 5.579973, 0.885376, 1.439275, 4.101979, 0.576745, 0.428799
+                  , 0.162152, 0.085229, 0.095692, 0.006129, 0.490937, 0.104843, 0.045514, 0.004705, 0.098934
+                  , 0.308006, 0.287051, 0.056994, 0.007102, 0.958988, 0.578990, 0.067119, 0.024403, 0.342983, 3.805528
+                  , 0.390161, 7.663209, 1.663641, 0.105129, 0.135029, 3.364474, 0.652618, 0.457702, 0.823674, 0.129858, 0.145630
+                  , 1.042298, 0.364551, 0.293222, 0.037983, 1.486520, 1.681752, 0.192414, 0.070498, 0.222626, 4.529623, 4.781730, 0.665308
+                  , 0.362476, 0.073439, 0.129245, 0.020078, 1.992483, 0.114549, 0.023272, 0.064490, 1.491794, 1.113437, 2.132006, 0.041677, 1.928654
+                  , 1.755491, 0.087050, 0.099325, 0.163817, 0.242851, 0.322939, 0.062943, 0.198698, 0.192904, 0.062948, 0.180283, 0.059655, 0.129323, 0.065778
+                  , 3.975060, 0.893398, 5.496314, 1.397313, 3.575120, 1.385297, 0.576191, 1.733288, 1.021255, 0.065131, 0.129115, 0.600308, 0.387276, 0.446001, 1.298493
+                  , 2.565079, 0.534056, 2.143993, 0.411388, 2.279084, 0.893006, 0.528209, 0.135731, 0.518741, 0.972662, 0.280700, 0.890086, 1.828755, 0.189028, 0.563778, 7.788147
+                  , 0.283631, 0.497926, 0.075454, 0.043794, 1.335322, 0.308605, 0.140137, 0.150797, 1.409726, 0.119868, 0.818331, 0.080591, 1.066017, 3.754687, 0.073415, 0.435046, 0.197272
+                  , 0.242513, 0.199157, 0.472207, 0.085937, 2.039787, 0.262751, 0.084578, 0.032247, 7.762326, 0.153966, 0.299828, 0.117255, 0.438215, 14.506235, 0.089180, 0.352766, 0.215417, 5.054245
+                  , 2.795818, 0.107130, 0.060909, 0.029724, 2.986426, 0.197267, 0.196977, 0.044327, 0.116751, 7.144311, 1.848622, 0.118020, 1.999696, 0.705747, 0.272763, 0.096935, 1.820982, 0.217007, 0.172975
+                },
+                {
+                  0.576160
+                  , 0.567606, 0.498643
+                  , 0.824359, 0.050698, 3.301401
+                  , 0.822724, 4.529235, 1.291808, 0.101930
+                  , 1.254238, 2.169809, 1.427980, 0.449474, 0.868679
+                  , 1.218615, 0.154502, 0.411471, 3.172277, 0.050239, 2.138661
+                  , 1.803443, 0.604673, 2.125496, 1.276384, 1.598679, 0.502653, 0.479490
+                  , 0.516862, 2.874265, 4.845769, 0.719673, 3.825677, 4.040275, 0.292773, 0.596643
+                  , 0.180898, 0.444586, 0.550969, 0.023542, 2.349573, 0.370160, 0.142187, 0.016618, 0.500788
+                  , 0.452099, 0.866322, 0.201033, 0.026731, 2.813990, 1.645178, 0.135556, 0.072152, 1.168817, 5.696116
+                  , 0.664186, 2.902886, 2.101971, 0.127988, 0.200218, 2.505933, 0.759509, 0.333569, 0.623100, 0.547454, 0.363656
+                  , 0.864415, 0.835049, 0.632649, 0.079201, 2.105931, 1.633544, 0.216462, 0.252419, 0.665406, 7.994105, 11.751178, 1.096842
+                  , 0.324478, 0.208947, 0.280339, 0.041683, 4.788477, 0.107022, 0.067711, 0.171320, 3.324779, 2.965328, 5.133843, 0.084856, 4.042591
+                  , 1.073043, 0.173826, 0.041985, 0.270336, 0.121299, 0.351384, 0.228565, 0.225318, 0.376089, 0.058027, 0.390354, 0.214230, 0.058954, 0.126299
+                  , 3.837562, 0.884342, 4.571911, 0.942751, 6.592827, 1.080063, 0.465397, 3.137614, 1.119667, 0.362516, 0.602355, 0.716940, 0.506796, 1.444484, 1.432558
+                  , 2.106026, 0.750016, 2.323325, 0.335915, 1.654673, 1.194017, 0.617231, 0.318671, 0.801030, 4.455842, 0.580191, 1.384210, 3.522468, 0.473128, 0.432718, 5.716300
+                  , 0.163720, 0.818102, 0.072322, 0.068275, 3.305436, 0.373790, 0.054323, 0.476587, 1.100360, 0.392946, 1.703323, 0.085720, 1.725516, 5.436253, 0.053108, 0.498594, 0.231832
+                  , 0.241167, 0.302440, 1.055095, 0.246940, 9.741942, 0.249895, 0.129973, 0.052363, 11.542498, 1.047449, 1.319667, 0.139770, 1.330225, 26.562270, 0.046986, 0.737653, 0.313460, 5.165098
+                  , 1.824586, 0.435795, 0.179086, 0.091739, 3.609570, 0.649507, 0.656681, 0.225234, 0.473437, 19.897252, 3.001995, 0.452926, 3.929598, 1.692159, 0.370204, 0.373501, 3.329822, 0.326593, 0.860743
+                }
+              };
+            
+            double
+              freqs[4][20] = /* Four alternative rows of 20 values; row lg4_index is copied into f[0..19] at the end of this arm. */
+              {{0.082276,0.055172,0.043853,0.053484,0.018957,0.028152,0.046679,0.157817,0.033297,0.028284,0.054284,0.025275,0.023665,0.041874,0.063071,0.066501,0.065424,0.023837,0.038633,0.049465},
+               {0.120900,0.036460,0.026510,0.040410,0.015980,0.021132,0.025191,0.036369,0.015884,0.111029,0.162852,0.024820,0.028023,0.074058,0.012065,0.041963,0.039072,0.012666,0.040478,0.114137},
+               {0.072639,0.051691,0.038642,0.055580,0.009829,0.031374,0.048731,0.065283,0.023791,0.086640,0.120847,0.052177,0.026728,0.032589,0.039238,0.046748,0.053361,0.008024,0.037426,0.098662},
+               {0.104843,0.078835,0.043513,0.090498,0.002924,0.066163,0.151640,0.038843,0.022556,0.018383,0.038687,0.104462,0.010166,0.009089,0.066950,0.053667,0.049486,0.004409,0.012924,0.031963}};
+            
+            int 
+              i, 
+              j, 
+              r = 0; /* running position within rates[lg4_index][] */
+            /* Copy the selected table into daa[i*20 + j] for i = 1..19 and j < i, in the order the literals are listed. */
+            for(i = 1; i < 20; i++)
+              for(j = 0; j < i; j++)
+                {
+                  daa[i * 20 + j] = rates[lg4_index][r]; /* NOTE(review): lg4_index is used unchecked; it must be 0..3 for the [4]-sized tables -- confirm the caller guarantees this. */
+                  r++;
+                }
+            
+            assert(r == 190); /* all 190 values of the selected table must have been consumed (check disappears when NDEBUG is defined) */
+            
+            for(i = 0; i < 20; i++)
+              f[i] = freqs[lg4_index][i];         
+            
+          }
+          break; /* no fall-through into the next case label */
+      	case PLL_LG4X:
+			{
+			  double
+			  rates[4][190] =
+				  {
+				  {
+				  0.295719,
+				  0.067388, 0.448317,
+				  0.253712, 0.457483, 2.358429,
+				  1.029289, 0.576016, 0.251987, 0.189008,
+				  0.107964, 1.741924, 0.216561, 0.599450, 0.029955,
+				  0.514644, 0.736017, 0.503084, 109.901504, 0.084794, 4.117654,
+				  10.868848, 0.704334, 0.435271, 1.070052, 1.862626, 0.246260, 1.202023,
+				  0.380498, 5.658311, 4.873453, 5.229858, 0.553477, 6.508329, 1.634845, 0.404968,
+				  0.084223, 0.123387, 0.090748, 0.052764, 0.151733, 0.054187, 0.060194, 0.048984, 0.204296,
+				  0.086976, 0.221777, 0.033310, 0.021407, 0.230320, 0.195703, 0.069359, 0.069963, 0.504221, 1.495537,
+				  0.188789, 93.433377, 0.746537, 0.621146, 0.096955, 1.669092, 2.448827, 0.256662, 1.991533, 0.091940, 0.122332,
+				  0.286389, 0.382175, 0.128905, 0.081091, 0.352526, 0.810168, 0.232297, 0.228519, 0.655465, 1.994320, 3.256485, 0.457430,
+				  0.155567, 0.235965, 0.127321, 0.205164, 0.590018, 0.066081, 0.064822, 0.241077, 6.799829, 0.754940, 2.261319, 0.163849, 1.559944,
+				  1.671061, 6.535048, 0.904011, 5.164456, 0.386853, 2.437439, 3.537387, 4.320442, 11.291065, 0.170343, 0.848067, 5.260446, 0.426508, 0.438856,
+				  2.132922, 0.525521, 0.939733, 0.747330, 1.559564, 0.165666, 0.435384, 3.656545, 0.961142, 0.050315, 0.064441, 0.360946, 0.132547, 0.306683, 4.586081,
+				  0.529591, 0.303537, 0.435450, 0.308078, 0.606648, 0.106333, 0.290413, 0.290216, 0.448965, 0.372166, 0.102493, 0.389413, 0.498634, 0.109129, 2.099355, 3.634276,
+				  0.115551, 0.641259, 0.046646, 0.260889, 0.587531, 0.093417, 0.280695, 0.307466, 6.227274, 0.206332, 0.459041, 0.033291, 0.559069, 18.392863, 0.411347, 0.101797, 0.034710,
+				  0.102453, 0.289466, 0.262076, 0.185083, 0.592318, 0.035149, 0.105999, 0.096556, 20.304886, 0.097050, 0.133091, 0.115301, 0.264728, 66.647302, 0.476350, 0.148995, 0.063603, 20.561407,
+				  0.916683, 0.102065, 0.043986, 0.080708, 0.885230, 0.072549, 0.206603, 0.306067, 0.205944, 5.381403, 0.561215, 0.112593, 0.693307, 0.400021, 0.584622, 0.089177, 0.755865, 0.133790, 0.154902
+				  },
+				  {
+				  0.066142,
+				  0.590377, 0.468325,
+				  0.069930, 0.013688, 2.851667,
+				  9.850951, 0.302287, 3.932151, 0.146882,
+				  1.101363, 1.353957, 8.159169, 0.249672, 0.582670,
+				  0.150375, 0.028386, 0.219934, 0.560142, 0.005035, 3.054085,
+				  0.568586, 0.037750, 0.421974, 0.046719, 0.275844, 0.129551, 0.037250,
+				  0.051668, 0.262130, 2.468752, 0.106259, 0.098208, 4.210126, 0.029788, 0.013513,
+				  0.127170, 0.016923, 0.344765, 0.003656, 0.445038, 0.165753, 0.008541, 0.002533, 0.031779,
+				  0.292429, 0.064289, 0.210724, 0.004200, 1.217010, 1.088704, 0.014768, 0.005848, 0.064558, 7.278994,
+				  0.071458, 0.855973, 1.172204, 0.014189, 0.033969, 1.889645, 0.125869, 0.031390, 0.065585, 0.029917, 0.042762,
+				  1.218562, 0.079621, 0.763553, 0.009876, 1.988516, 3.344809, 0.056702, 0.021612, 0.079927, 7.918203, 14.799537, 0.259400,
+				  0.075144, 0.011169, 0.082464, 0.002656, 0.681161, 0.111063, 0.004186, 0.004854, 0.095591, 0.450964, 1.506485, 0.009457, 1.375871,
+				  7.169085, 0.161937, 0.726566, 0.040244, 0.825960, 2.067758, 0.110993, 0.129497, 0.196886, 0.169797, 0.637893, 0.090576, 0.457399, 0.143327,
+				  30.139501, 0.276530, 11.149790, 0.267322, 18.762977, 3.547017, 0.201148, 0.976631, 0.408834, 0.104288, 0.123793, 0.292108, 0.598048, 0.328689, 3.478333,
+				  13.461692, 0.161053, 4.782635, 0.053740, 11.949233, 2.466507, 0.139705, 0.053397, 0.126088, 1.578530, 0.641351, 0.297913, 4.418398, 0.125011, 2.984862, 13.974326,
+				  0.021372, 0.081472, 0.058046, 0.006597, 0.286794, 0.188236, 0.009201, 0.019475, 0.037226, 0.015909, 0.154810, 0.017172, 0.239749, 0.562720, 0.061299, 0.154326, 0.060703,
+				  0.045779, 0.036742, 0.498072, 0.027639, 0.534219, 0.203493, 0.012095, 0.004964, 0.452302, 0.094365, 0.140750, 0.021976, 0.168432, 1.414883, 0.077470, 0.224675, 0.123480, 0.447011,
+				  4.270235, 0.030342, 0.258487, 0.012745, 4.336817, 0.281953, 0.043812, 0.015539, 0.016212, 16.179952, 3.416059, 0.032578, 2.950318, 0.227807, 1.050562, 0.112000, 5.294490, 0.033381, 0.045528
+				  },
+				  {
+				  0.733336,
+				  0.558955, 0.597671,
+				  0.503360, 0.058964, 5.581680,
+				  4.149599, 2.863355, 1.279881, 0.225860,
+				  1.415369, 2.872594, 1.335650, 0.434096, 1.043232,
+				  1.367574, 0.258365, 0.397108, 2.292917, 0.209978, 4.534772,
+				  1.263002, 0.366868, 1.840061, 1.024707, 0.823594, 0.377181, 0.496780,
+				  0.994098, 2.578946, 5.739035, 0.821921, 3.039380, 4.877840, 0.532488, 0.398817,
+				  0.517204, 0.358350, 0.284730, 0.027824, 1.463390, 0.370939, 0.232460, 0.008940, 0.349195,
+				  0.775054, 0.672023, 0.109781, 0.021443, 1.983693, 1.298542, 0.169219, 0.043707, 0.838324, 5.102837,
+				  0.763094, 5.349861, 1.612642, 0.088850, 0.397640, 3.509873, 0.755219, 0.436013, 0.888693, 0.561690, 0.401070,
+				  1.890137, 0.691594, 0.466979, 0.060820, 2.831098, 2.646440, 0.379926, 0.087640, 0.488389, 7.010411, 8.929538, 1.357738,
+				  0.540460, 0.063347, 0.141582, 0.018288, 4.102068, 0.087872, 0.020447, 0.064863, 1.385133, 3.054968, 5.525874, 0.043394, 3.135353,
+				  0.200122, 0.032875, 0.019509, 0.042687, 0.059723, 0.072299, 0.023282, 0.036426, 0.050226, 0.039318, 0.067505, 0.023126, 0.012695, 0.015631,
+				  4.972745, 0.821562, 4.670980, 1.199607, 5.901348, 1.139018, 0.503875, 1.673207, 0.962470, 0.204155, 0.273372, 0.567639, 0.570771, 0.458799, 0.233109,
+				  1.825593, 0.580847, 1.967383, 0.420710, 2.034980, 0.864479, 0.577513, 0.124068, 0.502294, 2.653232, 0.437116, 1.048288, 2.319555, 0.151684, 0.077004, 8.113282,
+				  0.450842, 0.661866, 0.088064, 0.037642, 2.600668, 0.390688, 0.109318, 0.218118, 1.065585, 0.564368, 1.927515, 0.120994, 1.856122, 4.154750, 0.011074, 0.377578, 0.222293,
+				  0.526135, 0.265730, 0.581928, 0.141233, 5.413080, 0.322761, 0.153776, 0.039217, 8.351808, 0.854294, 0.940458, 0.180650, 0.975427, 11.429924, 0.026268, 0.429221, 0.273138, 4.731579,
+				  3.839269, 0.395134, 0.145401, 0.090101, 4.193725, 0.625409, 0.696533, 0.104335, 0.377304, 15.559906, 2.508169, 0.449074, 3.404087, 1.457957, 0.052132, 0.260296, 2.903836, 0.564762, 0.681215
+				  },
+				  {
+				  0.658412,
+				  0.566269, 0.540749,
+				  0.854111, 0.058015, 3.060574,
+				  0.884454, 5.851132, 1.279257, 0.160296,
+				  1.309554, 2.294145, 1.438430, 0.482619, 0.992259,
+				  1.272639, 0.182966, 0.431464, 2.992763, 0.086318, 2.130054,
+				  1.874713, 0.684164, 2.075952, 1.296206, 2.149634, 0.571406, 0.507160,
+				  0.552007, 3.192521, 4.840271, 0.841829, 5.103188, 4.137385, 0.351381, 0.679853,
+				  0.227683, 0.528161, 0.644656, 0.031467, 3.775817, 0.437589, 0.189152, 0.025780, 0.665865,
+				  0.581512, 1.128882, 0.266076, 0.048542, 3.954021, 2.071689, 0.217780, 0.082005, 1.266791, 8.904999,
+				  0.695190, 3.010922, 2.084975, 0.132774, 0.190734, 2.498630, 0.767361, 0.326441, 0.680174, 0.652629, 0.440178,
+				  0.967985, 1.012866, 0.720060, 0.133055, 1.776095, 1.763546, 0.278392, 0.343977, 0.717301, 10.091413, 14.013035, 1.082703,
+				  0.344015, 0.227296, 0.291854, 0.056045, 4.495841, 0.116381, 0.092075, 0.195877, 4.001286, 2.671718, 5.069337, 0.091278, 4.643214,
+				  0.978992, 0.156635, 0.028961, 0.209188, 0.264277, 0.296578, 0.177263, 0.217424, 0.362942, 0.086367, 0.539010, 0.172734, 0.121821, 0.161015,
+				  3.427163, 0.878405, 4.071574, 0.925172, 7.063879, 1.033710, 0.451893, 3.057583, 1.189259, 0.359932, 0.742569, 0.693405, 0.584083, 1.531223, 1.287474,
+				  2.333253, 0.802754, 2.258357, 0.360522, 2.221150, 1.283423, 0.653836, 0.377558, 0.964545, 4.797423, 0.780580, 1.422571, 4.216178, 0.599244, 0.444362, 5.231362,
+				  0.154701, 0.830884, 0.073037, 0.094591, 3.017954, 0.312579, 0.074620, 0.401252, 1.350568, 0.336801, 1.331875, 0.068958, 1.677263, 5.832025, 0.076328, 0.548763, 0.208791,
+				  0.221089, 0.431617, 1.238426, 0.313945, 8.558815, 0.305772, 0.181992, 0.072258, 12.869737, 1.021885, 1.531589, 0.163829, 1.575754, 33.873091, 0.079916, 0.831890, 0.307846, 5.910440,
+				  2.088785, 0.456530, 0.199728, 0.118104, 4.310199, 0.681277, 0.752277, 0.241015, 0.531100, 23.029406, 4.414850, 0.481711, 5.046403, 1.914768, 0.466823, 0.382271, 3.717971, 0.282540, 0.964421
+				  }
+				  };
+			  double
+			  freqs[4][20] =
+				  {{0.147383 , 0.017579 , 0.058208 , 0.017707 , 0.026331 , 0.041582 , 0.017494 , 0.027859 , 0.011849 , 0.076971 ,
+				  0.147823 , 0.019535 , 0.037132 , 0.029940 , 0.008059 , 0.088179 , 0.089653 , 0.006477 , 0.032308 , 0.097931},
+				  {0.063139 , 0.066357 , 0.011586 , 0.066571 , 0.010800 , 0.009276 , 0.053984 , 0.146986 , 0.034214 , 0.088822 ,
+				  0.098196 , 0.032390 , 0.021263 , 0.072697 , 0.016761 , 0.020711 , 0.020797 , 0.025463 , 0.045615 , 0.094372},
+				  {0.062457 , 0.066826 , 0.049332 , 0.065270 , 0.006513 , 0.041231 , 0.058965 , 0.080852 , 0.028024 , 0.037024 ,
+				  0.075925 , 0.064131 , 0.019620 , 0.028710 , 0.104579 , 0.056388 , 0.062027 , 0.008241 , 0.033124 , 0.050760},
+				  {0.106471 , 0.074171 , 0.044513 , 0.096390 , 0.002148 , 0.066733 , 0.158908 , 0.037625 , 0.020691 , 0.014608 ,
+				  0.028797 , 0.105352 , 0.007864 , 0.007477 , 0.083595 , 0.055726 , 0.047711 , 0.003975 , 0.010088 , 0.027159}};
+			  int
+			  i,
+			  j,
+			  r = 0;
+			  for(i = 1; i < 20; i++)
+				  for(j = 0; j < i; j++)
+				  {
+					  daa[i * 20 + j] = rates[lg4_index][r];
+					  r++;
+				  }
+			  assert(r == 190);
+			  for(i = 0; i < 20; i++)
+				  f[i] = freqs[lg4_index][i];
+		  }
+		  break;
+        case PLL_MTART:
+          {
+           
+
+            daa[1*20+0]=   0.2;
+            daa[2*20+0]=   0.2;
+           daa[2*20+1]=   0.2;
+           daa[3*20+0]=   1;
+           daa[3*20+1]=   4;
+           daa[3*20+2]=   500;
+           daa[4*20+0]=   254;
+           daa[4*20+1]=   36;
+           daa[4*20+2]=   98;
+           daa[4*20+3]=   11;
+           daa[5*20+0]=   0.2;
+           daa[5*20+1]=   154;
+           daa[5*20+2]=   262;
+           daa[5*20+3]=   0.2;
+           daa[5*20+4]=   0.2;
+           daa[6*20+0]=   0.2;
+           daa[6*20+1]=   0.2;
+           daa[6*20+2]=   183;
+           daa[6*20+3]=   862;
+           daa[6*20+4]=   0.2;
+           daa[6*20+5]=   262;
+           daa[7*20+0]=   200;
+           daa[7*20+1]=   0.2;
+           daa[7*20+2]=   121;
+           daa[7*20+3]=   12;
+           daa[7*20+4]=   81;
+           daa[7*20+5]=   3;
+           daa[7*20+6]=   44;
+           daa[8*20+0]=   0.2;
+           daa[8*20+1]=   41;
+           daa[8*20+2]=   180;
+           daa[8*20+3]=   0.2;
+           daa[8*20+4]=   12;
+           daa[8*20+5]=   314;
+           daa[8*20+6]=   15;
+           daa[8*20+7]=   0.2;
+           daa[9*20+0]=   26;
+           daa[9*20+1]=   2;
+           daa[9*20+2]=   21;
+           daa[9*20+3]=   7;
+           daa[9*20+4]=   63;
+           daa[9*20+5]=   11;
+           daa[9*20+6]=   7;
+           daa[9*20+7]=   3;
+           daa[9*20+8]=   0.2;
+           daa[10*20+0]=  4;
+           daa[10*20+1]=  2;
+           daa[10*20+2]=  13;
+           daa[10*20+3]=  1;
+           daa[10*20+4]=  79;
+           daa[10*20+5]=  16;
+           daa[10*20+6]=  2;
+           daa[10*20+7]=  1;
+           daa[10*20+8]=  6;
+           daa[10*20+9]=  515;
+           daa[11*20+0]=  0.2;
+           daa[11*20+1]=  209;
+           daa[11*20+2]=  467;
+           daa[11*20+3]=  2;
+           daa[11*20+4]=  0.2;
+           daa[11*20+5]=  349;
+           daa[11*20+6]=  106;
+           daa[11*20+7]=  0.2;
+           daa[11*20+8]=  0.2;
+           daa[11*20+9]=  3;
+           daa[11*20+10]= 4;
+           daa[12*20+0]=  121;
+           daa[12*20+1]=  5;
+           daa[12*20+2]=  79;
+           daa[12*20+3]=  0.2;
+           daa[12*20+4]=  312;
+           daa[12*20+5]=  67;
+           daa[12*20+6]=  0.2;
+           daa[12*20+7]=  56;
+           daa[12*20+8]=  0.2;
+           daa[12*20+9]=  515;
+           daa[12*20+10]= 885;
+           daa[12*20+11]= 106;
+           daa[13*20+0]=  13;
+           daa[13*20+1]=  5;
+           daa[13*20+2]=  20;
+           daa[13*20+3]=  0.2;
+           daa[13*20+4]=  184;
+           daa[13*20+5]=  0.2;
+           daa[13*20+6]=  0.2;
+           daa[13*20+7]=  1;
+           daa[13*20+8]=  14;
+           daa[13*20+9]=  118;
+           daa[13*20+10]= 263;
+           daa[13*20+11]= 11;
+           daa[13*20+12]= 322;
+           daa[14*20+0]=  49;
+           daa[14*20+1]=  0.2;
+           daa[14*20+2]=  17;
+           daa[14*20+3]=  0.2;
+           daa[14*20+4]=  0.2;
+           daa[14*20+5]=  39;
+           daa[14*20+6]=  8;
+           daa[14*20+7]=  0.2;
+           daa[14*20+8]=  1;
+           daa[14*20+9]=  0.2;
+           daa[14*20+10]= 12;
+           daa[14*20+11]= 17;
+           daa[14*20+12]= 5;
+           daa[14*20+13]= 15;
+           daa[15*20+0]=  673;
+           daa[15*20+1]=  3;
+           daa[15*20+2]=  398;
+           daa[15*20+3]=  44;
+           daa[15*20+4]=  664;
+           daa[15*20+5]=  52;
+           daa[15*20+6]=  31;
+           daa[15*20+7]=  226;
+           daa[15*20+8]=  11;
+           daa[15*20+9]=  7;
+           daa[15*20+10]= 8;
+           daa[15*20+11]= 144;
+           daa[15*20+12]= 112;
+           daa[15*20+13]= 36;
+           daa[15*20+14]= 87;
+           daa[16*20+0]=  244;
+           daa[16*20+1]=  0.2;
+           daa[16*20+2]=  166;
+           daa[16*20+3]=  0.2;
+           daa[16*20+4]=  183;
+           daa[16*20+5]=  44;
+           daa[16*20+6]=  43;
+           daa[16*20+7]=  0.2;
+           daa[16*20+8]=  19;
+           daa[16*20+9]=  204;
+           daa[16*20+10]= 48;
+           daa[16*20+11]= 70;
+           daa[16*20+12]= 289;
+           daa[16*20+13]= 14;
+           daa[16*20+14]= 47;
+           daa[16*20+15]= 660;
+           daa[17*20+0]=  0.2;
+           daa[17*20+1]=  0.2;
+           daa[17*20+2]=  8;
+           daa[17*20+3]=  0.2;
+           daa[17*20+4]=  22;
+           daa[17*20+5]=  7;
+           daa[17*20+6]=  11;
+           daa[17*20+7]=  2;
+           daa[17*20+8]=  0.2;
+           daa[17*20+9]=  0.2;
+           daa[17*20+10]= 21;
+           daa[17*20+11]= 16;
+           daa[17*20+12]= 71;
+           daa[17*20+13]= 54;
+           daa[17*20+14]= 0.2;
+           daa[17*20+15]= 2;
+           daa[17*20+16]= 0.2;
+           daa[18*20+0]=  1;
+           daa[18*20+1]=  4;
+           daa[18*20+2]=  251;
+           daa[18*20+3]=  0.2;
+           daa[18*20+4]=  72;
+           daa[18*20+5]=  87;
+           daa[18*20+6]=  8;
+           daa[18*20+7]=  9;
+           daa[18*20+8]=  191;
+           daa[18*20+9]=  12;
+           daa[18*20+10]= 20;
+           daa[18*20+11]= 117;
+           daa[18*20+12]= 71;
+           daa[18*20+13]= 792;
+           daa[18*20+14]= 18;
+           daa[18*20+15]= 30;
+           daa[18*20+16]= 46;
+           daa[18*20+17]= 38;
+           daa[19*20+0]=  340;
+           daa[19*20+1]=  0.2;
+           daa[19*20+2]=  23;
+           daa[19*20+3]=  0.2;
+           daa[19*20+4]=  350;
+           daa[19*20+5]=  0.2;
+           daa[19*20+6]=  14;
+           daa[19*20+7]=  3;
+           daa[19*20+8]=  0.2;
+           daa[19*20+9]=  1855;
+           daa[19*20+10]= 85;
+           daa[19*20+11]= 26;
+           daa[19*20+12]= 281;
+           daa[19*20+13]= 52;
+           daa[19*20+14]= 32;
+           daa[19*20+15]= 61;
+           daa[19*20+16]= 544;
+           daa[19*20+17]= 0.2;
+           daa[19*20+18]= 2;
+           
+           f[0]=  0.054116;
+           f[1]=  0.018227;
+           f[2]=  0.039903;
+           f[3]=  0.020160;
+           f[4]=  0.009709;
+           f[5]=  0.018781;
+           f[6]=  0.024289;
+           f[7]=  0.068183;
+           f[8]=  0.024518;
+           f[9]=  0.092638;
+           f[10]= 0.148658;
+           f[11]= 0.021718;
+           f[12]= 0.061453;
+           f[13]= 0.088668;
+           f[14]= 0.041826;
+           f[15]= 0.091030;
+           f[16]= 0.049194;
+           f[17]= 0.029786;
+           f[18]= 0.039443;
+           f[19]= 0.057700;
+          }
+          break;
+        case PLL_MTZOA:
+          {
+           daa[1*20+0]=   3.3;
+           daa[2*20+0]=   1.7;
+           daa[2*20+1]=   33.6;
+           daa[3*20+0]=   16.1;
+           daa[3*20+1]=   3.2;
+           daa[3*20+2]=   617.0;
+           daa[4*20+0]=   272.5;
+           daa[4*20+1]=   61.1;
+           daa[4*20+2]=   94.6;
+           daa[4*20+3]=   9.5;
+           daa[5*20+0]=   7.3;
+           daa[5*20+1]=   231.0;
+           daa[5*20+2]=   190.3;
+           daa[5*20+3]=   19.3;
+           daa[5*20+4]=   49.1;
+           daa[6*20+0]=   17.1;
+           daa[6*20+1]=   6.4;
+           daa[6*20+2]=   174.0;
+           daa[6*20+3]=   883.6;
+           daa[6*20+4]=   3.4;
+           daa[6*20+5]=   349.4;
+           daa[7*20+0]=   289.3;
+           daa[7*20+1]=   7.2;
+           daa[7*20+2]=   99.3;
+           daa[7*20+3]=   26.0;
+           daa[7*20+4]=   82.4;
+           daa[7*20+5]=   8.9;
+           daa[7*20+6]=   43.1;
+           daa[8*20+0]=   2.3;
+           daa[8*20+1]=   61.7;
+           daa[8*20+2]=   228.9;
+           daa[8*20+3]=   55.6;
+           daa[8*20+4]=   37.5;
+           daa[8*20+5]=   421.8;
+           daa[8*20+6]=   14.9;
+           daa[8*20+7]=   7.4;
+           daa[9*20+0]=   33.2;
+           daa[9*20+1]=   0.2;
+           daa[9*20+2]=   24.3;
+           daa[9*20+3]=   1.5;
+           daa[9*20+4]=   48.8;
+           daa[9*20+5]=   0.2;
+           daa[9*20+6]=   7.3;
+           daa[9*20+7]=   3.4;
+           daa[9*20+8]=   1.6;
+           daa[10*20+0]=  15.6;
+           daa[10*20+1]=  4.1;
+           daa[10*20+2]=  7.9;
+           daa[10*20+3]=  0.5;
+           daa[10*20+4]=  59.7;
+           daa[10*20+5]=  23.0;
+           daa[10*20+6]=  1.0;
+           daa[10*20+7]=  3.5;
+           daa[10*20+8]=  6.6;
+           daa[10*20+9]=  425.2;
+           daa[11*20+0]=  0.2;
+           daa[11*20+1]=  292.3;
+           daa[11*20+2]=  413.4;
+           daa[11*20+3]=  0.2;
+           daa[11*20+4]=  0.2;
+           daa[11*20+5]=  334.0;
+           daa[11*20+6]=  163.2;
+           daa[11*20+7]=  10.1;
+           daa[11*20+8]=  23.9;
+           daa[11*20+9]=  8.4;
+           daa[11*20+10]= 6.7;
+           daa[12*20+0]=  136.5;
+           daa[12*20+1]=  3.8;
+           daa[12*20+2]=  73.7;
+           daa[12*20+3]=  0.2;
+           daa[12*20+4]=  264.8;
+           daa[12*20+5]=  83.9;
+           daa[12*20+6]=  0.2;
+           daa[12*20+7]=  52.2;
+           daa[12*20+8]=  7.1;
+           daa[12*20+9]=  449.7;
+           daa[12*20+10]= 636.3;
+           daa[12*20+11]= 83.0;
+           daa[13*20+0]=  26.5;
+           daa[13*20+1]=  0.2;
+           daa[13*20+2]=  12.9;
+           daa[13*20+3]=  2.0;
+           daa[13*20+4]=  167.8;
+           daa[13*20+5]=  9.5;
+           daa[13*20+6]=  0.2;
+           daa[13*20+7]=  5.8;
+           daa[13*20+8]=  13.1;
+           daa[13*20+9]=  90.3;
+           daa[13*20+10]= 234.2;
+           daa[13*20+11]= 16.3;
+           daa[13*20+12]= 215.6;
+           daa[14*20+0]=  61.8;
+           daa[14*20+1]=  7.5;
+           daa[14*20+2]=  22.6;
+           daa[14*20+3]=  0.2;
+           daa[14*20+4]=  8.1;
+           daa[14*20+5]=  52.2;
+           daa[14*20+6]=  20.6;
+           daa[14*20+7]=  1.3;
+           daa[14*20+8]=  15.6;
+           daa[14*20+9]=  2.6;
+           daa[14*20+10]= 11.4;
+           daa[14*20+11]= 24.3;
+           daa[14*20+12]= 5.4;
+           daa[14*20+13]= 10.5;
+           daa[15*20+0]=  644.9;
+           daa[15*20+1]=  11.8;
+           daa[15*20+2]=  420.2;
+           daa[15*20+3]=  51.4;
+           daa[15*20+4]=  656.3;
+           daa[15*20+5]=  96.4;
+           daa[15*20+6]=  38.4;
+           daa[15*20+7]=  257.1;
+           daa[15*20+8]=  23.1;
+           daa[15*20+9]=  7.2;
+           daa[15*20+10]= 15.2;
+           daa[15*20+11]= 144.9;
+           daa[15*20+12]= 95.3;
+           daa[15*20+13]= 32.2;
+           daa[15*20+14]= 79.7;
+           daa[16*20+0]=  378.1;
+           daa[16*20+1]=  3.2;
+           daa[16*20+2]=  184.6;
+           daa[16*20+3]=  2.3;
+           daa[16*20+4]=  199.0;
+           daa[16*20+5]=  39.4;
+           daa[16*20+6]=  34.5;
+           daa[16*20+7]=  5.2;
+           daa[16*20+8]=  19.4;
+           daa[16*20+9]=  222.3;
+           daa[16*20+10]= 50.0;
+           daa[16*20+11]= 75.5;
+           daa[16*20+12]= 305.1;
+           daa[16*20+13]= 19.3;
+           daa[16*20+14]= 56.9;
+           daa[16*20+15]= 666.3;
+           daa[17*20+0]=  3.1;
+           daa[17*20+1]=  16.9;
+           daa[17*20+2]=  6.4;
+           daa[17*20+3]=  0.2;
+           daa[17*20+4]=  36.1;
+           daa[17*20+5]=  6.1;
+           daa[17*20+6]=  3.5;
+           daa[17*20+7]=  12.3;
+           daa[17*20+8]=  4.5;
+           daa[17*20+9]=  9.7;
+           daa[17*20+10]= 27.2;
+           daa[17*20+11]= 6.6;
+           daa[17*20+12]= 48.7;
+           daa[17*20+13]= 58.2;
+           daa[17*20+14]= 1.3;
+           daa[17*20+15]= 10.3;
+           daa[17*20+16]= 3.6;
+           daa[18*20+0]=  2.1;
+           daa[18*20+1]=  13.8;
+           daa[18*20+2]=  141.6;
+           daa[18*20+3]=  13.9;
+           daa[18*20+4]=  76.7;
+           daa[18*20+5]=  52.3;
+           daa[18*20+6]=  10.0;
+           daa[18*20+7]=  4.3;
+           daa[18*20+8]=  266.5;
+           daa[18*20+9]=  13.1;
+           daa[18*20+10]= 5.7;
+           daa[18*20+11]= 45.0;
+           daa[18*20+12]= 41.4;
+           daa[18*20+13]= 590.5;
+           daa[18*20+14]= 4.2;
+           daa[18*20+15]= 29.7;
+           daa[18*20+16]= 29.0;
+           daa[18*20+17]= 79.8;
+           daa[19*20+0]=  321.9;
+           daa[19*20+1]=  5.1;
+           daa[19*20+2]=  7.1;
+           daa[19*20+3]=  3.7;
+           daa[19*20+4]=  243.8;
+           daa[19*20+5]=  9.0;
+           daa[19*20+6]=  16.3;
+           daa[19*20+7]=  23.7;
+           daa[19*20+8]=  0.3;
+           daa[19*20+9]=  1710.6;
+           daa[19*20+10]= 126.1;
+           daa[19*20+11]= 11.1;
+           daa[19*20+12]= 279.6;
+           daa[19*20+13]= 59.6;
+           daa[19*20+14]= 17.9;
+           daa[19*20+15]= 49.5;
+           daa[19*20+16]= 396.4;
+           daa[19*20+17]= 13.7;
+           daa[19*20+18]= 15.6;
+           
+           f[0]=  0.069;
+           f[1]=  0.021;
+           f[2]=  0.030;
+           f[3]=  0.020;
+           f[4]=  0.010;
+           f[5]=  0.019;
+           f[6]=  0.025;
+           f[7]=  0.072;
+           f[8]=  0.027;
+           f[9]=  0.085;
+           f[10]= 0.157;
+           f[11]= 0.019;
+           f[12]= 0.051;
+           f[13]= 0.082;
+           f[14]= 0.045;
+           f[15]= 0.081;
+           f[16]= 0.056;
+           f[17]= 0.028;
+           f[18]= 0.037;
+           f[19]= 0.066;
+          }
+          break;
+        case PLL_PMB:
+          {
+           daa[1*20+0]=   0.674995699;
+           daa[2*20+0]=   0.589645178;
+           daa[2*20+1]=   1.189067034;
+           daa[3*20+0]=   0.462499504;
+           daa[3*20+1]=   0.605460903;
+           daa[3*20+2]=   3.573373315;
+           daa[4*20+0]=   1.065445546;
+           daa[4*20+1]=   0.31444833;
+           daa[4*20+2]=   0.589852457;
+           daa[4*20+3]=   0.246951424;
+           daa[5*20+0]=   1.111766964;
+           daa[5*20+1]=   2.967840934;
+           daa[5*20+2]=   2.299755865;
+           daa[5*20+3]=   1.686058219;
+           daa[5*20+4]=   0.245163782;
+           daa[6*20+0]=   1.046334652;
+           daa[6*20+1]=   1.201770702;
+           daa[6*20+2]=   1.277836748;
+           daa[6*20+3]=   4.399995525;
+           daa[6*20+4]=   0.091071867;
+           daa[6*20+5]=   4.15967899;
+           daa[7*20+0]=   1.587964372;
+           daa[7*20+1]=   0.523770553;
+           daa[7*20+2]=   1.374854049;
+           daa[7*20+3]=   0.734992057;
+           daa[7*20+4]=   0.31706632;
+           daa[7*20+5]=   0.596789898;
+           daa[7*20+6]=   0.463812837;
+           daa[8*20+0]=   0.580830874;
+           daa[8*20+1]=   1.457127446;
+           daa[8*20+2]=   2.283037894;
+           daa[8*20+3]=   0.839348444;
+           daa[8*20+4]=   0.411543728;
+           daa[8*20+5]=   1.812173605;
+           daa[8*20+6]=   0.877842609;
+           daa[8*20+7]=   0.476331437;
+           daa[9*20+0]=   0.464590585;
+           daa[9*20+1]=   0.35964586;
+           daa[9*20+2]=   0.426069419;
+           daa[9*20+3]=   0.266775558;
+           daa[9*20+4]=   0.417547309;
+           daa[9*20+5]=   0.315256838;
+           daa[9*20+6]=   0.30421529;
+           daa[9*20+7]=   0.180198883;
+           daa[9*20+8]=   0.285186418;
+           daa[10*20+0]=  0.804404505;
+           daa[10*20+1]=  0.520701585;
+           daa[10*20+2]=  0.41009447;
+           daa[10*20+3]=  0.269124919;
+           daa[10*20+4]=  0.450795211;
+           daa[10*20+5]=  0.625792937;
+           daa[10*20+6]=  0.32078471;
+           daa[10*20+7]=  0.259854426;
+           daa[10*20+8]=  0.363981358;
+           daa[10*20+9]=  4.162454693;
+           daa[11*20+0]=  0.831998835;
+           daa[11*20+1]=  4.956476453;
+           daa[11*20+2]=  2.037575629;
+           daa[11*20+3]=  1.114178954;
+           daa[11*20+4]=  0.274163536;
+           daa[11*20+5]=  3.521346591;
+           daa[11*20+6]=  2.415974716;
+           daa[11*20+7]=  0.581001076;
+           daa[11*20+8]=  0.985885486;
+           daa[11*20+9]=  0.374784947;
+           daa[11*20+10]= 0.498011337;
+           daa[12*20+0]=  1.546725076;
+           daa[12*20+1]=  0.81346254;
+           daa[12*20+2]=  0.737846301;
+           daa[12*20+3]=  0.341932741;
+           daa[12*20+4]=  0.618614612;
+           daa[12*20+5]=  2.067388546;
+           daa[12*20+6]=  0.531773639;
+           daa[12*20+7]=  0.465349326;
+           daa[12*20+8]=  0.380925433;
+           daa[12*20+9]=  3.65807012;
+           daa[12*20+10]= 5.002338375;
+           daa[12*20+11]= 0.661095832;
+           daa[13*20+0]=  0.546169219;
+           daa[13*20+1]=  0.303437244;
+           daa[13*20+2]=  0.425193716;
+           daa[13*20+3]=  0.219005213;
+           daa[13*20+4]=  0.669206193;
+           daa[13*20+5]=  0.406042546;
+           daa[13*20+6]=  0.224154698;
+           daa[13*20+7]=  0.35402891;
+           daa[13*20+8]=  0.576231691;
+           daa[13*20+9]=  1.495264661;
+           daa[13*20+10]= 2.392638293;
+           daa[13*20+11]= 0.269496317;
+           daa[13*20+12]= 2.306919847;
+           daa[14*20+0]=  1.241586045;
+           daa[14*20+1]=  0.65577338;
+           daa[14*20+2]=  0.711495595;
+           daa[14*20+3]=  0.775624818;
+           daa[14*20+4]=  0.198679914;
+           daa[14*20+5]=  0.850116543;
+           daa[14*20+6]=  0.794584081;
+           daa[14*20+7]=  0.588254139;
+           daa[14*20+8]=  0.456058589;
+           daa[14*20+9]=  0.366232942;
+           daa[14*20+10]= 0.430073179;
+           daa[14*20+11]= 1.036079005;
+           daa[14*20+12]= 0.337502282;
+           daa[14*20+13]= 0.481144863;
+           daa[15*20+0]=  3.452308792;
+           daa[15*20+1]=  0.910144334;
+           daa[15*20+2]=  2.572577221;
+           daa[15*20+3]=  1.440896785;
+           daa[15*20+4]=  0.99870098;
+           daa[15*20+5]=  1.348272505;
+           daa[15*20+6]=  1.205509425;
+           daa[15*20+7]=  1.402122097;
+           daa[15*20+8]=  0.799966711;
+           daa[15*20+9]=  0.530641901;
+           daa[15*20+10]= 0.402471997;
+           daa[15*20+11]= 1.234648153;
+           daa[15*20+12]= 0.945453716;
+           daa[15*20+13]= 0.613230817;
+           daa[15*20+14]= 1.217683028;
+           daa[16*20+0]=  1.751412803;
+           daa[16*20+1]=  0.89517149;
+           daa[16*20+2]=  1.823161023;
+           daa[16*20+3]=  0.994227284;
+           daa[16*20+4]=  0.847312432;
+           daa[16*20+5]=  1.320626678;
+           daa[16*20+6]=  0.949599791;
+           daa[16*20+7]=  0.542185658;
+           daa[16*20+8]=  0.83039281;
+           daa[16*20+9]=  1.114132523;
+           daa[16*20+10]= 0.779827336;
+           daa[16*20+11]= 1.290709079;
+           daa[16*20+12]= 1.551488041;
+           daa[16*20+13]= 0.718895136;
+           daa[16*20+14]= 0.780913179;
+           daa[16*20+15]= 4.448982584;
+           daa[17*20+0]=  0.35011051;
+           daa[17*20+1]=  0.618778365;
+           daa[17*20+2]=  0.422407388;
+           daa[17*20+3]=  0.362495245;
+           daa[17*20+4]=  0.445669347;
+           daa[17*20+5]=  0.72038474;
+           daa[17*20+6]=  0.261258229;
+           daa[17*20+7]=  0.37874827;
+           daa[17*20+8]=  0.72436751;
+           daa[17*20+9]=  0.516260502;
+           daa[17*20+10]= 0.794797115;
+           daa[17*20+11]= 0.43340962;
+           daa[17*20+12]= 0.768395107;
+           daa[17*20+13]= 3.29519344;
+           daa[17*20+14]= 0.499869138;
+           daa[17*20+15]= 0.496334956;
+           daa[17*20+16]= 0.38372361;
+           daa[18*20+0]=  0.573154753;
+           daa[18*20+1]=  0.628599063;
+           daa[18*20+2]=  0.720013799;
+           daa[18*20+3]=  0.436220437;
+           daa[18*20+4]=  0.55626163;
+           daa[18*20+5]=  0.728970584;
+           daa[18*20+6]=  0.50720003;
+           daa[18*20+7]=  0.284727562;
+           daa[18*20+8]=  2.210952064;
+           daa[18*20+9]=  0.570562395;
+           daa[18*20+10]= 0.811019594;
+           daa[18*20+11]= 0.664884513;
+           daa[18*20+12]= 0.93253606;
+           daa[18*20+13]= 5.894735673;
+           daa[18*20+14]= 0.433748126;
+           daa[18*20+15]= 0.593795813;
+           daa[18*20+16]= 0.523549536;
+           daa[18*20+17]= 2.996248013;
+           daa[19*20+0]=  2.063050067;
+           daa[19*20+1]=  0.388680158;
+           daa[19*20+2]=  0.474418852;
+           daa[19*20+3]=  0.275658381;
+           daa[19*20+4]=  0.998911631;
+           daa[19*20+5]=  0.634408285;
+           daa[19*20+6]=  0.527640634;
+           daa[19*20+7]=  0.314700907;
+           daa[19*20+8]=  0.305792277;
+           daa[19*20+9]=  8.002789424;
+           daa[19*20+10]= 2.113077156;
+           daa[19*20+11]= 0.526184203;
+           daa[19*20+12]= 1.737356217;
+           daa[19*20+13]= 0.983844803;
+           daa[19*20+14]= 0.551333603;
+           daa[19*20+15]= 0.507506011;
+           daa[19*20+16]= 1.89965079;
+           daa[19*20+17]= 0.429570747;
+           daa[19*20+18]= 0.716795463;
+           
+           f[0]=  0.076;
+           f[1]=  0.054;
+           f[2]=  0.038;
+           f[3]=  0.045;
+           f[4]=  0.028;
+           f[5]=  0.034;
+           f[6]=  0.053;
+           f[7]=  0.078;
+           f[8]=  0.030;
+           f[9]=  0.060;
+           f[10]= 0.096;
+           f[11]= 0.052;
+           f[12]= 0.022;
+           f[13]= 0.045;
+           f[14]= 0.042;
+           f[15]= 0.068;
+           f[16]= 0.056;
+           f[17]= 0.016;
+           f[18]= 0.036;
+           f[19]= 0.071;
+          }
+          break;
+        case PLL_HIVB:
+          {
+           daa[1*20+0]=   0.30750700;
+           daa[2*20+0]=   0.00500000;
+           daa[2*20+1]=   0.29554300;
+           daa[3*20+0]=   1.45504000;
+           daa[3*20+1]=   0.00500000;
+           daa[3*20+2]=   17.66120000;
+           daa[4*20+0]=   0.12375800;
+           daa[4*20+1]=   0.35172100;
+           daa[4*20+2]=   0.08606420;
+           daa[4*20+3]=   0.00500000;
+           daa[5*20+0]=   0.05511280;
+           daa[5*20+1]=   3.42150000;
+           daa[5*20+2]=   0.67205200;
+           daa[5*20+3]=   0.00500000;
+           daa[5*20+4]=   0.00500000;
+           daa[6*20+0]=   1.48135000;
+           daa[6*20+1]=   0.07492180;
+           daa[6*20+2]=   0.07926330;
+           daa[6*20+3]=   10.58720000;
+           daa[6*20+4]=   0.00500000;
+           daa[6*20+5]=   2.56020000;
+           daa[7*20+0]=   2.13536000;
+           daa[7*20+1]=   3.65345000;
+           daa[7*20+2]=   0.32340100;
+           daa[7*20+3]=   2.83806000;
+           daa[7*20+4]=   0.89787100;
+           daa[7*20+5]=   0.06191370;
+           daa[7*20+6]=   3.92775000;
+           daa[8*20+0]=   0.08476130;
+           daa[8*20+1]=   9.04044000;
+           daa[8*20+2]=   7.64585000;
+           daa[8*20+3]=   1.91690000;
+           daa[8*20+4]=   0.24007300;
+           daa[8*20+5]=   7.05545000;
+           daa[8*20+6]=   0.11974000;
+           daa[8*20+7]=   0.00500000;
+           daa[9*20+0]=   0.00500000;
+           daa[9*20+1]=   0.67728900;
+           daa[9*20+2]=   0.68056500;
+           daa[9*20+3]=   0.01767920;
+           daa[9*20+4]=   0.00500000;
+           daa[9*20+5]=   0.00500000;
+           daa[9*20+6]=   0.00609079;
+           daa[9*20+7]=   0.00500000;
+           daa[9*20+8]=   0.10311100;
+           daa[10*20+0]=  0.21525600;
+           daa[10*20+1]=  0.70142700;
+           daa[10*20+2]=  0.00500000;
+           daa[10*20+3]=  0.00876048;
+           daa[10*20+4]=  0.12977700;
+           daa[10*20+5]=  1.49456000;
+           daa[10*20+6]=  0.00500000;
+           daa[10*20+7]=  0.00500000;
+           daa[10*20+8]=  1.74171000;
+           daa[10*20+9]=  5.95879000;
+           daa[11*20+0]=  0.00500000;
+           daa[11*20+1]=  20.45000000;
+           daa[11*20+2]=  7.90443000;
+           daa[11*20+3]=  0.00500000;
+           daa[11*20+4]=  0.00500000;
+           daa[11*20+5]=  6.54737000;
+           daa[11*20+6]=  4.61482000;
+           daa[11*20+7]=  0.52170500;
+           daa[11*20+8]=  0.00500000;
+           daa[11*20+9]=  0.32231900;
+           daa[11*20+10]= 0.08149950;
+           daa[12*20+0]=  0.01866430;
+           daa[12*20+1]=  2.51394000;
+           daa[12*20+2]=  0.00500000;
+           daa[12*20+3]=  0.00500000;
+           daa[12*20+4]=  0.00500000;
+           daa[12*20+5]=  0.30367600;
+           daa[12*20+6]=  0.17578900;
+           daa[12*20+7]=  0.00500000;
+           daa[12*20+8]=  0.00500000;
+           daa[12*20+9]=  11.20650000;
+           daa[12*20+10]= 5.31961000;
+           daa[12*20+11]= 1.28246000;
+           daa[13*20+0]=  0.01412690;
+           daa[13*20+1]=  0.00500000;
+           daa[13*20+2]=  0.00500000;
+           daa[13*20+3]=  0.00500000;
+           daa[13*20+4]=  9.29815000;
+           daa[13*20+5]=  0.00500000;
+           daa[13*20+6]=  0.00500000;
+           daa[13*20+7]=  0.29156100;
+           daa[13*20+8]=  0.14555800;
+           daa[13*20+9]=  3.39836000;
+           daa[13*20+10]= 8.52484000;
+           daa[13*20+11]= 0.03426580;
+           daa[13*20+12]= 0.18802500;
+           daa[14*20+0]=  2.12217000;
+           daa[14*20+1]=  1.28355000;
+           daa[14*20+2]=  0.00739578;
+           daa[14*20+3]=  0.03426580;
+           daa[14*20+4]=  0.00500000;
+           daa[14*20+5]=  4.47211000;
+           daa[14*20+6]=  0.01202260;
+           daa[14*20+7]=  0.00500000;
+           daa[14*20+8]=  2.45318000;
+           daa[14*20+9]=  0.04105930;
+           daa[14*20+10]= 2.07757000;
+           daa[14*20+11]= 0.03138620;
+           daa[14*20+12]= 0.00500000;
+           daa[14*20+13]= 0.00500000;
+           daa[15*20+0]=  2.46633000;
+           daa[15*20+1]=  3.47910000;
+           daa[15*20+2]=  13.14470000;
+           daa[15*20+3]=  0.52823000;
+           daa[15*20+4]=  4.69314000;
+           daa[15*20+5]=  0.11631100;
+           daa[15*20+6]=  0.00500000;
+           daa[15*20+7]=  4.38041000;
+           daa[15*20+8]=  0.38274700;
+           daa[15*20+9]=  1.21803000;
+           daa[15*20+10]= 0.92765600;
+           daa[15*20+11]= 0.50411100;
+           daa[15*20+12]= 0.00500000;
+           daa[15*20+13]= 0.95647200;
+           daa[15*20+14]= 5.37762000;
+           daa[16*20+0]=  15.91830000;
+           daa[16*20+1]=  2.86868000;
+           daa[16*20+2]=  6.88667000;
+           daa[16*20+3]=  0.27472400;
+           daa[16*20+4]=  0.73996900;
+           daa[16*20+5]=  0.24358900;
+           daa[16*20+6]=  0.28977400;
+           daa[16*20+7]=  0.36961500;
+           daa[16*20+8]=  0.71159400;
+           daa[16*20+9]=  8.61217000;
+           daa[16*20+10]= 0.04376730;
+           daa[16*20+11]= 4.67142000;
+           daa[16*20+12]= 4.94026000;
+           daa[16*20+13]= 0.01412690;
+           daa[16*20+14]= 2.01417000;
+           daa[16*20+15]= 8.93107000;
+           daa[17*20+0]=  0.00500000;
+           daa[17*20+1]=  0.99133800;
+           daa[17*20+2]=  0.00500000;
+           daa[17*20+3]=  0.00500000;
+           daa[17*20+4]=  2.63277000;
+           daa[17*20+5]=  0.02665600;
+           daa[17*20+6]=  0.00500000;
+           daa[17*20+7]=  1.21674000;
+           daa[17*20+8]=  0.06951790;
+           daa[17*20+9]=  0.00500000;
+           daa[17*20+10]= 0.74884300;
+           daa[17*20+11]= 0.00500000;
+           daa[17*20+12]= 0.08907800;
+           daa[17*20+13]= 0.82934300;
+           daa[17*20+14]= 0.04445060;
+           daa[17*20+15]= 0.02487280;
+           daa[17*20+16]= 0.00500000;
+           daa[18*20+0]=  0.00500000;
+           daa[18*20+1]=  0.00991826;
+           daa[18*20+2]=  1.76417000;
+           daa[18*20+3]=  0.67465300;
+           daa[18*20+4]=  7.57932000;
+           daa[18*20+5]=  0.11303300;
+           daa[18*20+6]=  0.07926330;
+           daa[18*20+7]=  0.00500000;
+           daa[18*20+8]=  18.69430000;
+           daa[18*20+9]=  0.14816800;
+           daa[18*20+10]= 0.11198600;
+           daa[18*20+11]= 0.00500000;
+           daa[18*20+12]= 0.00500000;
+           daa[18*20+13]= 15.34000000;
+           daa[18*20+14]= 0.03043810;
+           daa[18*20+15]= 0.64802400;
+           daa[18*20+16]= 0.10565200;
+           daa[18*20+17]= 1.28022000;
+           daa[19*20+0]=  7.61428000;
+           daa[19*20+1]=  0.08124540;
+           daa[19*20+2]=  0.02665600;
+           daa[19*20+3]=  1.04793000;
+           daa[19*20+4]=  0.42002700;
+           daa[19*20+5]=  0.02091530;
+           daa[19*20+6]=  1.02847000;
+           daa[19*20+7]=  0.95315500;
+           daa[19*20+8]=  0.00500000;
+           daa[19*20+9]=  17.73890000;
+           daa[19*20+10]= 1.41036000;
+           daa[19*20+11]= 0.26582900;
+           daa[19*20+12]= 6.85320000;
+           daa[19*20+13]= 0.72327400;
+           daa[19*20+14]= 0.00500000;
+           daa[19*20+15]= 0.07492180;
+           daa[19*20+16]= 0.70922600;
+           daa[19*20+17]= 0.00500000;
+           daa[19*20+18]= 0.04105930;
+           
+	   f[0]= 0.060490222;           f[1]= 0.066039665;           f[2]= 0.044127815;           f[3]= 0.042109048;
+           f[4]= 0.020075899;           f[5]= 0.053606488;           f[6]= 0.071567447;           f[7]= 0.072308239;
+           f[8]= 0.022293943;           f[9]= 0.069730629;           f[10]= 0.098851122;          f[11]= 0.056968211;
+           f[12]= 0.019768318;          f[13]= 0.028809447;          f[14]= 0.046025282;          f[15]= 0.05060433;
+           f[16]= 0.053636813;          f[17]= 0.033011601;          f[18]= 0.028350243;          f[19]= 0.061625237;
+          }
+          break;
+        case PLL_HIVW:
+          {
+           daa[1*20+0]=   0.0744808;
+           daa[2*20+0]=   0.6175090;
+           daa[2*20+1]=   0.1602400;
+           daa[3*20+0]=   4.4352100;
+           daa[3*20+1]=   0.0674539;
+           daa[3*20+2]=   29.4087000;
+           daa[4*20+0]=   0.1676530;
+           daa[4*20+1]=   2.8636400;
+           daa[4*20+2]=   0.0604932;
+           daa[4*20+3]=   0.0050000;
+           daa[5*20+0]=   0.0050000;
+           daa[5*20+1]=   10.6746000;
+           daa[5*20+2]=   0.3420680;
+           daa[5*20+3]=   0.0050000;
+           daa[5*20+4]=   0.0050000;
+           daa[6*20+0]=   5.5632500;
+           daa[6*20+1]=   0.0251632;
+           daa[6*20+2]=   0.2015260;
+           daa[6*20+3]=   12.1233000;
+           daa[6*20+4]=   0.0050000;
+           daa[6*20+5]=   3.2065600;
+           daa[7*20+0]=   1.8685000;
+           daa[7*20+1]=   13.4379000;
+           daa[7*20+2]=   0.0604932;
+           daa[7*20+3]=   10.3969000;
+           daa[7*20+4]=   0.0489798;
+           daa[7*20+5]=   0.0604932;
+           daa[7*20+6]=   14.7801000;
+           daa[8*20+0]=   0.0050000;
+           daa[8*20+1]=   6.8440500;
+           daa[8*20+2]=   8.5987600;
+           daa[8*20+3]=   2.3177900;
+           daa[8*20+4]=   0.0050000;
+           daa[8*20+5]=   18.5465000;
+           daa[8*20+6]=   0.0050000;
+           daa[8*20+7]=   0.0050000;
+           daa[9*20+0]=   0.0050000;
+           daa[9*20+1]=   1.3406900;
+           daa[9*20+2]=   0.9870280;
+           daa[9*20+3]=   0.1451240;
+           daa[9*20+4]=   0.0050000;
+           daa[9*20+5]=   0.0342252;
+           daa[9*20+6]=   0.0390512;
+           daa[9*20+7]=   0.0050000;
+           daa[9*20+8]=   0.0050000;
+           daa[10*20+0]=  0.1602400;
+           daa[10*20+1]=  0.5867570;
+           daa[10*20+2]=  0.0050000;
+           daa[10*20+3]=  0.0050000;
+           daa[10*20+4]=  0.0050000;
+           daa[10*20+5]=  2.8904800;
+           daa[10*20+6]=  0.1298390;
+           daa[10*20+7]=  0.0489798;
+           daa[10*20+8]=  1.7638200;
+           daa[10*20+9]=  9.1024600;
+           daa[11*20+0]=  0.5927840;
+           daa[11*20+1]=  39.8897000;
+           daa[11*20+2]=  10.6655000;
+           daa[11*20+3]=  0.8943130;
+           daa[11*20+4]=  0.0050000;
+           daa[11*20+5]=  13.0705000;
+           daa[11*20+6]=  23.9626000;
+           daa[11*20+7]=  0.2794250;
+           daa[11*20+8]=  0.2240600;
+           daa[11*20+9]=  0.8174810;
+           daa[11*20+10]= 0.0050000;
+           daa[12*20+0]=  0.0050000;
+           daa[12*20+1]=  3.2865200;
+           daa[12*20+2]=  0.2015260;
+           daa[12*20+3]=  0.0050000;
+           daa[12*20+4]=  0.0050000;
+           daa[12*20+5]=  0.0050000;
+           daa[12*20+6]=  0.0050000;
+           daa[12*20+7]=  0.0489798;
+           daa[12*20+8]=  0.0050000;
+           daa[12*20+9]=  17.3064000;
+           daa[12*20+10]= 11.3839000;
+           daa[12*20+11]= 4.0956400;
+           daa[13*20+0]=  0.5979230;
+           daa[13*20+1]=  0.0050000;
+           daa[13*20+2]=  0.0050000;
+           daa[13*20+3]=  0.0050000;
+           daa[13*20+4]=  0.3629590;
+           daa[13*20+5]=  0.0050000;
+           daa[13*20+6]=  0.0050000;
+           daa[13*20+7]=  0.0050000;
+           daa[13*20+8]=  0.0050000;
+           daa[13*20+9]=  1.4828800;
+           daa[13*20+10]= 7.4878100;
+           daa[13*20+11]= 0.0050000;
+           daa[13*20+12]= 0.0050000;
+           daa[14*20+0]=  1.0098100;
+           daa[14*20+1]=  0.4047230;
+           daa[14*20+2]=  0.3448480;
+           daa[14*20+3]=  0.0050000;
+           daa[14*20+4]=  0.0050000;
+           daa[14*20+5]=  3.0450200;
+           daa[14*20+6]=  0.0050000;
+           daa[14*20+7]=  0.0050000;
+           daa[14*20+8]=  13.9444000;
+           daa[14*20+9]=  0.0050000;
+           daa[14*20+10]= 9.8309500;
+           daa[14*20+11]= 0.1119280;
+           daa[14*20+12]= 0.0050000;
+           daa[14*20+13]= 0.0342252;
+           daa[15*20+0]=  8.5942000;
+           daa[15*20+1]=  8.3502400;
+           daa[15*20+2]=  14.5699000;
+           daa[15*20+3]=  0.4278810;
+           daa[15*20+4]=  1.1219500;
+           daa[15*20+5]=  0.1602400;
+           daa[15*20+6]=  0.0050000;
+           daa[15*20+7]=  6.2796600;
+           daa[15*20+8]=  0.7251570;
+           daa[15*20+9]=  0.7400910;
+           daa[15*20+10]= 6.1439600;
+           daa[15*20+11]= 0.0050000;
+           daa[15*20+12]= 0.3925750;
+           daa[15*20+13]= 4.2793900;
+           daa[15*20+14]= 14.2490000;
+           daa[16*20+0]=  24.1422000;
+           daa[16*20+1]=  0.9282030;
+           daa[16*20+2]=  4.5420600;
+           daa[16*20+3]=  0.6303950;
+           daa[16*20+4]=  0.0050000;
+           daa[16*20+5]=  0.2030910;
+           daa[16*20+6]=  0.4587430;
+           daa[16*20+7]=  0.0489798;
+           daa[16*20+8]=  0.9595600;
+           daa[16*20+9]=  9.3634500;
+           daa[16*20+10]= 0.0050000;
+           daa[16*20+11]= 4.0480200;
+           daa[16*20+12]= 7.4131300;
+           daa[16*20+13]= 0.1145120;
+           daa[16*20+14]= 4.3370100;
+           daa[16*20+15]= 6.3407900;
+           daa[17*20+0]=  0.0050000;
+           daa[17*20+1]=  5.9656400;
+           daa[17*20+2]=  0.0050000;
+           daa[17*20+3]=  0.0050000;
+           daa[17*20+4]=  5.4989400;
+           daa[17*20+5]=  0.0443298;
+           daa[17*20+6]=  0.0050000;
+           daa[17*20+7]=  2.8258000;
+           daa[17*20+8]=  0.0050000;
+           daa[17*20+9]=  0.0050000;
+           daa[17*20+10]= 1.3703100;
+           daa[17*20+11]= 0.0050000;
+           daa[17*20+12]= 0.0050000;
+           daa[17*20+13]= 0.0050000;
+           daa[17*20+14]= 0.0050000;
+           daa[17*20+15]= 1.1015600;
+           daa[17*20+16]= 0.0050000;
+           daa[18*20+0]=  0.0050000;
+           daa[18*20+1]=  0.0050000;
+           daa[18*20+2]=  5.0647500;
+           daa[18*20+3]=  2.2815400;
+           daa[18*20+4]=  8.3483500;
+           daa[18*20+5]=  0.0050000;
+           daa[18*20+6]=  0.0050000;
+           daa[18*20+7]=  0.0050000;
+           daa[18*20+8]=  47.4889000;
+           daa[18*20+9]=  0.1145120;
+           daa[18*20+10]= 0.0050000;
+           daa[18*20+11]= 0.0050000;
+           daa[18*20+12]= 0.5791980;
+           daa[18*20+13]= 4.1272800;
+           daa[18*20+14]= 0.0050000;
+           daa[18*20+15]= 0.9331420;
+           daa[18*20+16]= 0.4906080;
+           daa[18*20+17]= 0.0050000;
+           daa[19*20+0]=  24.8094000;
+           daa[19*20+1]=  0.2794250;
+           daa[19*20+2]=  0.0744808;
+           daa[19*20+3]=  2.9178600;
+           daa[19*20+4]=  0.0050000;
+           daa[19*20+5]=  0.0050000;
+           daa[19*20+6]=  2.1995200;
+           daa[19*20+7]=  2.7962200;
+           daa[19*20+8]=  0.8274790;
+           daa[19*20+9]=  24.8231000;
+           daa[19*20+10]= 2.9534400;
+           daa[19*20+11]= 0.1280650;
+           daa[19*20+12]= 14.7683000;
+           daa[19*20+13]= 2.2800000;
+           daa[19*20+14]= 0.0050000;
+           daa[19*20+15]= 0.8626370;
+           daa[19*20+16]= 0.0050000;
+           daa[19*20+17]= 0.0050000;
+           daa[19*20+18]= 1.3548200;
+           
+	   f[0]= 0.0377494;             f[1]= 0.057321;              f[2]= 0.0891129;             f[3]= 0.0342034;
+           f[4]= 0.0240105;             f[5]= 0.0437824;             f[6]= 0.0618606;             f[7]= 0.0838496;
+           f[8]= 0.0156076;             f[9]= 0.0983641;             f[10]= 0.0577867;            f[11]= 0.0641682;
+           f[12]= 0.0158419;            f[13]= 0.0422741;            f[14]= 0.0458601;            f[15]= 0.0550846;
+           f[16]= 0.0813774;            f[17]= 0.019597;             f[18]= 0.0205847;            f[19]= 0.0515638;
+          }
+          break;
+        case PLL_JTTDCMUT:
+          {
+           daa[1*20+0]=   0.531678;
+           daa[2*20+0]=   0.557967;
+           daa[2*20+1]=   0.451095;
+           daa[3*20+0]=   0.827445;
+           daa[3*20+1]=   0.154899;
+           daa[3*20+2]=   5.549530;
+           daa[4*20+0]=   0.574478;
+           daa[4*20+1]=   1.019843;
+           daa[4*20+2]=   0.313311;
+           daa[4*20+3]=   0.105625;
+           daa[5*20+0]=   0.556725;
+           daa[5*20+1]=   3.021995;
+           daa[5*20+2]=   0.768834;
+           daa[5*20+3]=   0.521646;
+           daa[5*20+4]=   0.091304;
+           daa[6*20+0]=   1.066681;
+           daa[6*20+1]=   0.318483;
+           daa[6*20+2]=   0.578115;
+           daa[6*20+3]=   7.766557;
+           daa[6*20+4]=   0.053907;
+           daa[6*20+5]=   3.417706;
+           daa[7*20+0]=   1.740159;
+           daa[7*20+1]=   1.359652;
+           daa[7*20+2]=   0.773313;
+           daa[7*20+3]=   1.272434;
+           daa[7*20+4]=   0.546389;
+           daa[7*20+5]=   0.231294;
+           daa[7*20+6]=   1.115632;
+           daa[8*20+0]=   0.219970;
+           daa[8*20+1]=   3.210671;
+           daa[8*20+2]=   4.025778;
+           daa[8*20+3]=   1.032342;
+           daa[8*20+4]=   0.724998;
+           daa[8*20+5]=   5.684080;
+           daa[8*20+6]=   0.243768;
+           daa[8*20+7]=   0.201696;
+           daa[9*20+0]=   0.361684;
+           daa[9*20+1]=   0.239195;
+           daa[9*20+2]=   0.491003;
+           daa[9*20+3]=   0.115968;
+           daa[9*20+4]=   0.150559;
+           daa[9*20+5]=   0.078270;
+           daa[9*20+6]=   0.111773;
+           daa[9*20+7]=   0.053769;
+           daa[9*20+8]=   0.181788;
+           daa[10*20+0]=  0.310007;
+           daa[10*20+1]=  0.372261;
+           daa[10*20+2]=  0.137289;
+           daa[10*20+3]=  0.061486;
+           daa[10*20+4]=  0.164593;
+           daa[10*20+5]=  0.709004;
+           daa[10*20+6]=  0.097485;
+           daa[10*20+7]=  0.069492;
+           daa[10*20+8]=  0.540571;
+           daa[10*20+9]=  2.335139;
+           daa[11*20+0]=  0.369437;
+           daa[11*20+1]=  6.529255;
+           daa[11*20+2]=  2.529517;
+           daa[11*20+3]=  0.282466;
+           daa[11*20+4]=  0.049009;
+           daa[11*20+5]=  2.966732;
+           daa[11*20+6]=  1.731684;
+           daa[11*20+7]=  0.269840;
+           daa[11*20+8]=  0.525096;
+           daa[11*20+9]=  0.202562;
+           daa[11*20+10]= 0.146481;
+           daa[12*20+0]=  0.469395;
+           daa[12*20+1]=  0.431045;
+           daa[12*20+2]=  0.330720;
+           daa[12*20+3]=  0.190001;
+           daa[12*20+4]=  0.409202;
+           daa[12*20+5]=  0.456901;
+           daa[12*20+6]=  0.175084;
+           daa[12*20+7]=  0.130379;
+           daa[12*20+8]=  0.329660;
+           daa[12*20+9]=  4.831666;
+           daa[12*20+10]= 3.856906;
+           daa[12*20+11]= 0.624581;
+           daa[13*20+0]=  0.138293;
+           daa[13*20+1]=  0.065314;
+           daa[13*20+2]=  0.073481;
+           daa[13*20+3]=  0.032522;
+           daa[13*20+4]=  0.678335;
+           daa[13*20+5]=  0.045683;
+           daa[13*20+6]=  0.043829;
+           daa[13*20+7]=  0.050212;
+           daa[13*20+8]=  0.453428;
+           daa[13*20+9]=  0.777090;
+           daa[13*20+10]= 2.500294;
+           daa[13*20+11]= 0.024521;
+           daa[13*20+12]= 0.436181;
+           daa[14*20+0]=  1.959599;
+           daa[14*20+1]=  0.710489;
+           daa[14*20+2]=  0.121804;
+           daa[14*20+3]=  0.127164;
+           daa[14*20+4]=  0.123653;
+           daa[14*20+5]=  1.608126;
+           daa[14*20+6]=  0.191994;
+           daa[14*20+7]=  0.208081;
+           daa[14*20+8]=  1.141961;
+           daa[14*20+9]=  0.098580;
+           daa[14*20+10]= 1.060504;
+           daa[14*20+11]= 0.216345;
+           daa[14*20+12]= 0.164215;
+           daa[14*20+13]= 0.148483;
+           daa[15*20+0]=  3.887095;
+           daa[15*20+1]=  1.001551;
+           daa[15*20+2]=  5.057964;
+           daa[15*20+3]=  0.589268;
+           daa[15*20+4]=  2.155331;
+           daa[15*20+5]=  0.548807;
+           daa[15*20+6]=  0.312449;
+           daa[15*20+7]=  1.874296;
+           daa[15*20+8]=  0.743458;
+           daa[15*20+9]=  0.405119;
+           daa[15*20+10]= 0.592511;
+           daa[15*20+11]= 0.474478;
+           daa[15*20+12]= 0.285564;
+           daa[15*20+13]= 0.943971;
+           daa[15*20+14]= 2.788406;
+           daa[16*20+0]=  4.582565;
+           daa[16*20+1]=  0.650282;
+           daa[16*20+2]=  2.351311;
+           daa[16*20+3]=  0.425159;
+           daa[16*20+4]=  0.469823;
+           daa[16*20+5]=  0.523825;
+           daa[16*20+6]=  0.331584;
+           daa[16*20+7]=  0.316862;
+           daa[16*20+8]=  0.477355;
+           daa[16*20+9]=  2.553806;
+           daa[16*20+10]= 0.272514;
+           daa[16*20+11]= 0.965641;
+           daa[16*20+12]= 2.114728;
+           daa[16*20+13]= 0.138904;
+           daa[16*20+14]= 1.176961;
+           daa[16*20+15]= 4.777647;
+           daa[17*20+0]=  0.084329;
+           daa[17*20+1]=  1.257961;
+           daa[17*20+2]=  0.027700;
+           daa[17*20+3]=  0.057466;
+           daa[17*20+4]=  1.104181;
+           daa[17*20+5]=  0.172206;
+           daa[17*20+6]=  0.114381;
+           daa[17*20+7]=  0.544180;
+           daa[17*20+8]=  0.128193;
+           daa[17*20+9]=  0.134510;
+           daa[17*20+10]= 0.530324;
+           daa[17*20+11]= 0.089134;
+           daa[17*20+12]= 0.201334;
+           daa[17*20+13]= 0.537922;
+           daa[17*20+14]= 0.069965;
+           daa[17*20+15]= 0.310927;
+           daa[17*20+16]= 0.080556;
+           daa[18*20+0]=  0.139492;
+           daa[18*20+1]=  0.235601;
+           daa[18*20+2]=  0.700693;
+           daa[18*20+3]=  0.453952;
+           daa[18*20+4]=  2.114852;
+           daa[18*20+5]=  0.254745;
+           daa[18*20+6]=  0.063452;
+           daa[18*20+7]=  0.052500;
+           daa[18*20+8]=  5.848400;
+           daa[18*20+9]=  0.303445;
+           daa[18*20+10]= 0.241094;
+           daa[18*20+11]= 0.087904;
+           daa[18*20+12]= 0.189870;
+           daa[18*20+13]= 5.484236;
+           daa[18*20+14]= 0.113850;
+           daa[18*20+15]= 0.628608;
+           daa[18*20+16]= 0.201094;
+           daa[18*20+17]= 0.747889;
+           daa[19*20+0]=  2.924161;
+           daa[19*20+1]=  0.171995;
+           daa[19*20+2]=  0.164525;
+           daa[19*20+3]=  0.315261;
+           daa[19*20+4]=  0.621323;
+           daa[19*20+5]=  0.179771;
+           daa[19*20+6]=  0.465271;
+           daa[19*20+7]=  0.470140;
+           daa[19*20+8]=  0.121827;
+           daa[19*20+9]=  9.533943;
+           daa[19*20+10]= 1.761439;
+           daa[19*20+11]= 0.124066;
+           daa[19*20+12]= 3.038533;
+           daa[19*20+13]= 0.593478;
+           daa[19*20+14]= 0.211561;
+           daa[19*20+15]= 0.408532;
+           daa[19*20+16]= 1.143980;
+           daa[19*20+17]= 0.239697;
+           daa[19*20+18]= 0.165473;
+           
+           f[0]=  0.077;
+           f[1]=  0.051;
+           f[2]=  0.043;
+           f[3]=  0.051;
+           f[4]=  0.020;
+           f[5]=  0.041;
+           f[6]=  0.062;
+           f[7]=  0.075;
+           f[8]=  0.023;
+           f[9]=  0.053;
+           f[10]= 0.091;
+           f[11]= 0.059;
+           f[12]= 0.024;
+           f[13]= 0.040;
+           f[14]= 0.051;
+           f[15]= 0.068;
+           f[16]= 0.059;
+           f[17]= 0.014;
+           f[18]= 0.032;
+           f[19]= 0.066;
+          }
+          break;
+        case PLL_FLU:
+          {
+            daa[ 1*20+ 0]       =       0.138658765     ;
+            daa[ 2*20+ 0]       =       0.053366579     ;
+            daa[ 2*20+ 1]       =       0.161000889     ;
+            daa[ 3*20+ 0]       =       0.584852306     ;
+            daa[ 3*20+ 1]       =       0.006771843     ;
+            daa[ 3*20+ 2]       =       7.737392871     ;
+            daa[ 4*20+ 0]       =       0.026447095     ;
+            daa[ 4*20+ 1]       =       0.167207008     ;
+            daa[ 4*20+ 2]       =       1.30E-05        ;
+            daa[ 4*20+ 3]       =       1.41E-02        ;
+            daa[ 5*20+ 0]       =       0.353753982     ;
+            daa[ 5*20+ 1]       =       3.292716942     ;
+            daa[ 5*20+ 2]       =       0.530642655     ;
+            daa[ 5*20+ 3]       =       0.145469388     ;
+            daa[ 5*20+ 4]       =       0.002547334     ;
+            daa[ 6*20+ 0]       =       1.484234503     ;
+            daa[ 6*20+ 1]       =       0.124897617     ;
+            daa[ 6*20+ 2]       =       0.061652192     ;
+            daa[ 6*20+ 3]       =       5.370511279     ;
+            daa[ 6*20+ 4]       =       3.91E-11        ;
+            daa[ 6*20+ 5]       =       1.195629122     ;
+            daa[ 7*20+ 0]       =       1.132313122     ;
+            daa[ 7*20+ 1]       =       1.190624465     ;
+            daa[ 7*20+ 2]       =       0.322524648     ;
+            daa[ 7*20+ 3]       =       1.934832784     ;
+            daa[ 7*20+ 4]       =       0.116941459     ;
+            daa[ 7*20+ 5]       =       0.108051341     ;
+            daa[ 7*20+ 6]       =       1.593098825     ;
+            daa[ 8*20+ 0]       =       0.214757862     ;
+            daa[ 8*20+ 1]       =       1.879569938     ;
+            daa[ 8*20+ 2]       =       1.387096032     ;
+            daa[ 8*20+ 3]       =       0.887570549     ;
+            daa[ 8*20+ 4]       =       2.18E-02        ;
+            daa[ 8*20+ 5]       =       5.330313412     ;
+            daa[ 8*20+ 6]       =       0.256491863     ;
+            daa[ 8*20+ 7]       =       0.058774527     ;
+            daa[ 9*20+ 0]       =       0.149926734     ;
+            daa[ 9*20+ 1]       =       0.246117172     ;
+            daa[ 9*20+ 2]       =       0.218571975     ;
+            daa[ 9*20+ 3]       =       0.014085917     ;
+            daa[ 9*20+ 4]       =       0.001112158     ;
+            daa[ 9*20+ 5]       =       0.02883995      ;
+            daa[ 9*20+ 6]       =       1.42E-02        ;
+            daa[ 9*20+ 7]       =       1.63E-05        ;
+            daa[ 9*20+ 8]       =       0.243190142     ;
+            daa[10*20+ 0]       =       0.023116952     ;
+            daa[10*20+ 1]       =       0.296045557     ;
+            daa[10*20+ 2]       =       8.36E-04        ;
+            daa[10*20+ 3]       =       0.005730682     ;
+            daa[10*20+ 4]       =       0.005613627     ;
+            daa[10*20+ 5]       =       1.020366955     ;
+            daa[10*20+ 6]       =       0.016499536     ;
+            daa[10*20+ 7]       =       0.006516229     ;
+            daa[10*20+ 8]       =       0.321611694     ;
+            daa[10*20+ 9]       =       3.512072282     ;
+            daa[11*20+ 0]       =       0.47433361      ;
+            daa[11*20+ 1]       =       15.30009662     ;
+            daa[11*20+ 2]       =       2.646847965     ;
+            daa[11*20+ 3]       =       0.29004298      ;
+            daa[11*20+ 4]       =       3.83E-06        ;
+            daa[11*20+ 5]       =       2.559587177     ;
+            daa[11*20+ 6]       =       3.881488809     ;
+            daa[11*20+ 7]       =       0.264148929     ;
+            daa[11*20+ 8]       =       0.347302791     ;
+            daa[11*20+ 9]       =       0.227707997     ;
+            daa[11*20+10]       =       0.129223639     ;
+            daa[12*20+ 0]       =       0.058745423     ;
+            daa[12*20+ 1]       =       0.890162346     ;
+            daa[12*20+ 2]       =       0.005251688     ;
+            daa[12*20+ 3]       =       0.041762964     ;
+            daa[12*20+ 4]       =       0.11145731      ;
+            daa[12*20+ 5]       =       0.190259181     ;
+            daa[12*20+ 6]       =       0.313974351     ;
+            daa[12*20+ 7]       =       0.001500467     ;
+            daa[12*20+ 8]       =       0.001273509     ;
+            daa[12*20+ 9]       =       9.017954203     ;
+            daa[12*20+10]       =       6.746936485     ;
+            daa[12*20+11]       =       1.331291619     ;
+            daa[13*20+ 0]       =       0.080490909     ;
+            daa[13*20+ 1]       =       1.61E-02        ;
+            daa[13*20+ 2]       =       8.36E-04        ;
+            daa[13*20+ 3]       =       1.06E-06        ;
+            daa[13*20+ 4]       =       0.104053666     ;
+            daa[13*20+ 5]       =       0.032680657     ;
+            daa[13*20+ 6]       =       0.001003501     ;
+            daa[13*20+ 7]       =       0.001236645     ;
+            daa[13*20+ 8]       =       0.119028506     ;
+            daa[13*20+ 9]       =       1.463357278     ;
+            daa[13*20+10]       =       2.986800036     ;
+            daa[13*20+11]       =       3.20E-01        ;
+            daa[13*20+12]       =       0.279910509     ;
+            daa[14*20+ 0]       =       0.659311478     ;
+            daa[14*20+ 1]       =       0.15402718      ;
+            daa[14*20+ 2]       =       3.64E-02        ;
+            daa[14*20+ 3]       =       0.188539456     ;
+            daa[14*20+ 4]       =       1.59E-13        ;
+            daa[14*20+ 5]       =       0.712769599     ;
+            daa[14*20+ 6]       =       0.319558828     ;
+            daa[14*20+ 7]       =       0.038631761     ;
+            daa[14*20+ 8]       =       0.924466914     ;
+            daa[14*20+ 9]       =       0.080543327     ;
+            daa[14*20+10]       =       0.634308521     ;
+            daa[14*20+11]       =       0.195750632     ;
+            daa[14*20+12]       =       5.69E-02        ;
+            daa[14*20+13]       =       0.00713243      ;
+            daa[15*20+ 0]       =       3.011344519     ;
+            daa[15*20+ 1]       =       0.95013841      ;
+            daa[15*20+ 2]       =       3.881310531     ;
+            daa[15*20+ 3]       =       0.338372183     ;
+            daa[15*20+ 4]       =       0.336263345     ;
+            daa[15*20+ 5]       =       0.487822499     ;
+            daa[15*20+ 6]       =       0.307140298     ;
+            daa[15*20+ 7]       =       1.585646577     ;
+            daa[15*20+ 8]       =       0.58070425      ;
+            daa[15*20+ 9]       =       0.290381075     ;
+            daa[15*20+10]       =       0.570766693     ;
+            daa[15*20+11]       =       0.283807672     ;
+            daa[15*20+12]       =       0.007026588     ;
+            daa[15*20+13]       =       0.99668567      ;
+            daa[15*20+14]       =       2.087385344     ;
+            daa[16*20+ 0]       =       5.418298175     ;
+            daa[16*20+ 1]       =       0.183076905     ;
+            daa[16*20+ 2]       =       2.140332316     ;
+            daa[16*20+ 3]       =       0.135481233     ;
+            daa[16*20+ 4]       =       0.011975266     ;
+            daa[16*20+ 5]       =       0.602340963     ;
+            daa[16*20+ 6]       =       0.280124895     ;
+            daa[16*20+ 7]       =       0.01880803      ;
+            daa[16*20+ 8]       =       0.368713573     ;
+            daa[16*20+ 9]       =       2.904052286     ;
+            daa[16*20+10]       =       0.044926357     ;
+            daa[16*20+11]       =       1.5269642       ;
+            daa[16*20+12]       =       2.031511321     ;
+            daa[16*20+13]       =       0.000134906     ;
+            daa[16*20+14]       =       0.542251094     ;
+            daa[16*20+15]       =       2.206859934     ;
+            daa[17*20+ 0]       =       1.96E-01        ;
+            daa[17*20+ 1]       =       1.369429408     ;
+            daa[17*20+ 2]       =       5.36E-04        ;
+            daa[17*20+ 3]       =       1.49E-05        ;
+            daa[17*20+ 4]       =       0.09410668      ;
+            daa[17*20+ 5]       =       4.40E-02        ;
+            daa[17*20+ 6]       =       0.155245492     ;
+            daa[17*20+ 7]       =       0.196486447     ;
+            daa[17*20+ 8]       =       2.24E-02        ;
+            daa[17*20+ 9]       =       0.03213215      ;
+            daa[17*20+10]       =       0.431277663     ;
+            daa[17*20+11]       =       4.98E-05        ;
+            daa[17*20+12]       =       0.070460039     ;
+            daa[17*20+13]       =       0.814753094     ;
+            daa[17*20+14]       =       0.000431021     ;
+            daa[17*20+15]       =       0.099835753     ;
+            daa[17*20+16]       =       0.207066206     ;
+            daa[18*20+ 0]       =       0.018289288     ;
+            daa[18*20+ 1]       =       0.099855497     ;
+            daa[18*20+ 2]       =       0.373101927     ;
+            daa[18*20+ 3]       =       0.525398543     ;
+            daa[18*20+ 4]       =       0.601692431     ;
+            daa[18*20+ 5]       =       0.072205935     ;
+            daa[18*20+ 6]       =       0.10409287      ;
+            daa[18*20+ 7]       =       0.074814997     ;
+            daa[18*20+ 8]       =       6.448954446     ;
+            daa[18*20+ 9]       =       0.273934263     ;
+            daa[18*20+10]       =       0.340058468     ;
+            daa[18*20+11]       =       0.012416222     ;
+            daa[18*20+12]       =       0.874272175     ;
+            daa[18*20+13]       =       5.393924245     ;
+            daa[18*20+14]       =       1.82E-04        ;
+            daa[18*20+15]       =       0.39255224      ;
+            daa[18*20+16]       =       0.12489802      ;
+            daa[18*20+17]       =       0.42775543      ;
+            daa[19*20+ 0]       =       3.53200527      ;
+            daa[19*20+ 1]       =       0.103964386     ;
+            daa[19*20+ 2]       =       0.010257517     ;
+            daa[19*20+ 3]       =       0.297123975     ;
+            daa[19*20+ 4]       =       0.054904564     ;
+            daa[19*20+ 5]       =       0.406697814     ;
+            daa[19*20+ 6]       =       0.285047948     ;
+            daa[19*20+ 7]       =       0.337229619     ;
+            daa[19*20+ 8]       =       0.098631355     ;
+            daa[19*20+ 9]       =       14.39405219     ;
+            daa[19*20+10]       =       0.890598579     ;
+            daa[19*20+11]       =       0.07312793      ;
+            daa[19*20+12]       =       4.904842235     ;
+            daa[19*20+13]       =       0.592587985     ;
+            daa[19*20+14]       =       0.058971975     ;
+            daa[19*20+15]       =       0.088256423     ;
+            daa[19*20+16]       =       0.654109108     ;
+            daa[19*20+17]       =       0.256900461     ;
+            daa[19*20+18]       =       0.167581647     ;
+            
+ 
+  
+            f[0]        =       0.0471  ;
+            f[1]        =       0.0509  ;
+            f[2]        =       0.0742  ;
+            f[3]        =       0.0479  ;
+            f[4]        =       0.0250  ;
+            f[5]        =       0.0333  ;
+            f[6]        =       0.0546  ;
+            f[7]        =       0.0764  ;
+            f[8]        =       0.0200  ;
+            f[9]        =       0.0671  ;
+            f[10]       =       0.0715  ;
+            f[11]       =       0.0568  ;
+            f[12]       =       0.0181  ;
+            f[13]       =       0.0305  ;
+            f[14]       =       0.0507  ;
+            f[15]       =       0.0884  ;
+            f[16]       =       0.0743  ;
+            f[17]       =       0.0185  ;
+            f[18]       =       0.0315  ;
+            f[19]       =       0.0632  ;
+          }
+          break;     
+        default: 
+          assert(0);
+        }
+    }
+
+
+  /*
+    
+  TODO review frequency sums for fixed as well as empirical base frequencies !
+  
+  NUMERICAL BUG fix, rounded AA freqs in some models, such that 
+  they actually really sum to 1.0 +/- epsilon 
+  
+  {
+    double acc = 0.0;
+  
+    for(i = 0; i < 20; i++)
+      acc += f[i];
+    
+    printf("%1.80f\n", acc);
+    assert(acc == 1.0);  
+  }
+  */
+ 
+
+
+  /* fill the upper triangle (above the diagonal) with the corresponding values
+     from the lower triangle */
+  for (i=0; i<20; i++)  
+    for (j=0; j<i; j++)               
+      daa[j*20+i] = daa[i*20+j];
+
+  
+  /*
+    for (i=0; i<20; i++)  
+    {
+    for (j=0; j<20; j++)
+    {
+    if(i == j)
+    printf("0.0 ");
+    else
+    printf("%f ", daa[i * 20 + j]);
+    }
+    printf("\n");
+    }
+    
+    for (i=0; i<20; i++) 
+    printf("%f ", f[i]);
+    printf("\n");
+  */
+  
+
+  max = 0;
+  
+  /* copy the triangle above the diagonal from daa (which is a linear block) to
+     the triangle above the diagonal of a square matrix q. Store the maximal
+     value in variable max */
+  for(i = 0; i < 19; i++)
+    for(j = i + 1; j < 20; j++)
+      {
+        q[i][j] = temp = daa[i * 20 + j];
+        if(temp > max) 
+          max = temp;
+      }
+ 
+  scaler = PLL_AA_SCALE / max;
+   
+  /* SCALING HAS BEEN RE-INTRODUCED TO RESOLVE NUMERICAL  PROBLEMS */   
+
+  /* copy and scale values to the initialRates array */
+  r = 0;
+  for(i = 0; i < 19; i++)
+    {      
+      for(j = i + 1; j < 20; j++)
+        {  
+        
+          q[i][j] *= scaler;
+          
+          
+          assert(q[i][j] <= PLL_AA_SCALE_PLUS_EPSILON);
+          
+          initialRates[r++] = q[i][j];
+        }
+    }             
+}
+
+/** @brief Recompute the partition contributions and the tree-wide \a fracchange
+  *
+  * With a single partition, \a tr->fracchange is copied from that partition and
+  * the partition's own \a fracchange is then reset to -1.0.
+  *
+  * With several partitions, the weight of a partition is the sum of
+  * \a tr->aliaswgt over its [lower, upper) range. Each partition's
+  * \a partitionContribution is set to its weight divided by the weight of all
+  * partitions, and \a tr->fracchange becomes the sum over all partitions of
+  * \a partitionContribution times \a fracchange.
+  *
+  * In both cases the \a rawFracchange fields (per partition and in \a tr) are
+  * overwritten with the current value of the matching \a fracchange field.
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @todo
+      I understand how fracchange is computed for each partition, but I dont know
+      what is it for. Also what is tr->fracchange for?
+*/
+static void updateFracChange(pllInstance *tr, partitionList *pr)
+{
+  const int partitionCount = pr->numberOfPartitions;
+  double   *weightOf;
+  double    totalWeight = 0.0;
+  int       part;
+
+  /* Single partition: no weighting needed, its value is the global one. */
+  if (partitionCount == 1)
+    {
+      assert(pr->partitionData[0]->fracchange != -1.0);
+
+      tr->fracchange = pr->partitionData[0]->fracchange;
+
+      /* -1.0 is exactly the value rejected by the assert above. */
+      pr->partitionData[0]->fracchange = -1.0;
+
+      /* NOTE(review): fracchange was reset on the previous line, so this
+         stores -1.0 in rawFracchange, unlike the multi-partition path which
+         stores the computed value -- confirm that this is intended. */
+      pr->partitionData[0]->rawFracchange = pr->partitionData[0]->fracchange;
+
+      tr->rawFracchange = tr->fracchange;
+      return;
+    }
+
+  /* One zero-initialised weight accumulator per partition.
+     NOTE(review): the result of rax_calloc is used without a NULL check. */
+  weightOf = (double *)rax_calloc((size_t)partitionCount, sizeof(double));
+
+  assert(partitionCount > 1);
+
+  tr->fracchange = 0.0;
+
+  /* Sum the site weights, both per partition and over all partitions. */
+  for (part = 0; part < partitionCount; part++)
+    {
+      const size_t end = pr->partitionData[part]->upper;
+      size_t       site;
+
+      for (site = pr->partitionData[part]->lower; site < end; site++)
+        {
+          const double w = (double)tr->aliaswgt[site];
+
+          weightOf[part] += w;
+          totalWeight    += w;
+        }
+    }
+
+  /* Turn the weights into fractions and build the weighted average. */
+  for (part = 0; part < partitionCount; part++)
+    {
+      pr->partitionData[part]->partitionContribution = weightOf[part] / totalWeight;
+
+      tr->fracchange += pr->partitionData[part]->partitionContribution
+                        * pr->partitionData[part]->fracchange;
+
+      pr->partitionData[part]->rawFracchange = pr->partitionData[part]->fracchange;
+    }
+
+  rax_free(weightOf);
+
+  tr->rawFracchange = tr->fracchange;
+}
+
+/** @brief First stage of the eigen-decomposition performed by makeEigen()
+  *
+  * Works in place on the \a n x \a n matrix \a a. The loop counters are 1-based and
+  * every array access subtracts one. The first loop runs from the last column down to
+  * the second one and fills \a d and \a e; the second loop rebuilds \a a, stores the
+  * diagonal of \a a in \a d and resets the processed row/column of \a a to that of the
+  * identity matrix.
+  *
+  * NOTE(review): the structure matches the classic Householder tridiagonalisation
+  * routine "tred2" with the two indices of \a a exchanged -- presumably \a d ends up
+  * holding the diagonal and \a e the off-diagonal of the tridiagonal matrix, and \a a
+  * the accumulated transformation; confirm against the reference implementation.
+  *
+  * @param a
+  *   Square matrix of dimension \a n, overwritten by this function
+  *
+  * @param n
+  *   Dimension of \a a and number of elements of \a d and \a e
+  *
+  * @param d
+  *   Output array of \a n elements
+  *
+  * @param e
+  *   Output array of \a n elements; \a e[0] is set to 0.0
+  */
+static void mytred2(double **a, const int n, double *d, double *e)
+{
+  int     l, k, j, i;
+  double  scale, hh, h, g, f; 
+ 
+  for (i = n; i > 1; i--)
+    {
+      l = i - 1;
+      h = 0.0;
+      scale = 0.0;
+      
+      if (l > 1)
+        {
+          /* scale = sum of absolute values of the entries above the diagonal in column i */
+          for (k = 1; k <= l; k++)
+            scale += fabs(a[k - 1][i - 1]);
+          if (scale == 0.0)
+            e[i - 1] = a[l - 1][i - 1];
+          else
+            {
+              /* normalise the column by scale and accumulate its squared norm in h */
+              for (k = 1; k <= l; k++)
+                {
+                  a[k - 1][i - 1] /= scale;
+                  h += a[k - 1][i - 1] * a[k - 1][i - 1];
+                }
+              f = a[l - 1][i - 1];
+              g = ((f > 0) ? -sqrt(h) : sqrt(h)); /* g gets the sign opposite to f */
+              e[i - 1] = scale * g;
+              h -= f * g;
+              a[l - 1][i - 1] = f - g;
+              f = 0.0;
+              for (j = 1; j <= l; j++)
+                {
+                  a[i - 1][j - 1] = a[j - 1][i - 1] / h;
+                  g = 0.0;
+                  for (k = 1; k <= j; k++)
+                    g += a[k - 1][j - 1] * a[k - 1][i - 1];
+                  for (k = j + 1; k <= l; k++)
+                    g += a[j - 1][k - 1] * a[k - 1][i - 1];
+                  e[j - 1] = g / h;
+                  f += e[j - 1] * a[j - 1][i - 1];
+                }
+              hh = f / (h + h);
+              for (j = 1; j <= l; j++)
+                {
+                  f = a[j - 1][i - 1];
+                  g = e[j - 1] - hh * f;
+                  e[j - 1] = g;
+                  for (k = 1; k <= j; k++)
+                    a[k - 1][j - 1] -= (f * e[k - 1] + g * a[k - 1][i - 1]);
+                }
+            }
+        } 
+      else
+        e[i - 1] = a[l - 1][i - 1];
+      /* d[i - 1] == 0.0 marks columns for which no transformation was built */
+      d[i - 1] = h;
+    }
+  d[0] = 0.0;
+  e[0] = 0.0;
+  
+  for (i = 1; i <= n; i++)
+    {
+      l = i - 1;
+      if (d[i - 1] != 0.0)
+        {
+          for (j = 1; j <= l; j++)
+            {
+                g = 0.0;
+                for (k = 1; k <= l; k++)
+                  g += a[k - 1][i - 1] * a[j - 1][k - 1];
+                for(k = 1; k <= l; k++)
+                  a[j - 1][k - 1] -= g * a[i - 1][k - 1];
+            }
+        }
+      /* move the diagonal element to d and reset row/column i of a to the identity */
+      d[i - 1] = a[i - 1][i - 1];
+      a[i - 1][i - 1] = 1.0;
+      for (j = 1; j <= l; j++)
+        a[i - 1][j - 1] = a[j - 1][i - 1] = 0.0;
+    }
+ 
+ 
+}
+/*#define MYSIGN(a,b) ((b)<0 ? -fabs(a) : fabs(a))*/
+
+/** @brief Second stage of the eigen-decomposition performed by makeEigen()
+  *
+  * Iteratively updates \a d, \a e and \a z in place, starting from the values that
+  * mytred2() left in them. \a e is first shifted down by one position and its last
+  * element is zeroed. For every index \a l the do/while loop repeats until the scan for
+  * a negligible off-diagonal element stops at \a l itself (m == l); each pass applies a
+  * sequence of plane rotations to \a d, \a e and to the rows of \a z.
+  *
+  * NOTE(review): the structure matches the classic implicit-shift QL routine "tqli"
+  * with the two indices of \a z exchanged -- presumably \a d holds the eigenvalues and
+  * the rows of \a z the eigenvectors on return; confirm against the reference.
+  *
+  * NOTE(review): \a iter is reset to 0 for each \a l but never incremented, so
+  * assert(iter < 30) can never fail and the iteration is effectively unbounded.
+  *
+  * @param d
+  *   Array of \a n elements, updated in place
+  *
+  * @param e
+  *   Array of \a n elements, used as work space and overwritten
+  *
+  * @param n
+  *   Dimension of the problem
+  *
+  * @param z
+  *   \a n x \a n matrix, rotated in place
+  *
+  * @return
+  *   Always 1
+  */
+static int mytqli(double *d, double *e, const int n, double **z)
+{
+  int     m, l, iter, i, k;
+  double  s, r, p, g, f, dd, c, b;
+   
+  /* renumber the off-diagonal elements so that e[0] is the first one */
+  for (i = 2; i <= n; i++)
+    e[i - 2] = e[i - 1];
+
+  e[n - 1] = 0.0;
+
+  for (l = 1; l <= n; l++)
+    {
+      iter = 0;
+      do
+        {
+          /* look for an off-diagonal element that is negligible relative to its
+             two neighbouring diagonal elements */
+          for (m = l; m <= n - 1; m++)
+            {
+              dd = fabs(d[m - 1]) + fabs(d[m]);
+              if (fabs(e[m - 1]) + dd == dd)
+                break;
+            }
+
+          if (m != l)
+           {
+             assert(iter < 30);
+             
+             g = (d[l] - d[l - 1]) / (2.0 * e[l - 1]);
+             r = sqrt((g * g) + 1.0);
+             g = d[m - 1] - d[l - 1] + e[l - 1] / (g + ((g < 0)?-fabs(r):fabs(r)));/*MYSIGN(r, g));*/
+             s = c = 1.0;
+             p = 0.0;
+
+             for (i = m - 1; i >= l; i--)
+               {
+                 f = s * e[i - 1];
+                 b = c * e[i - 1];
+                 /* divide by the larger of |f| and |g| when forming the rotation */
+                 if (fabs(f) >= fabs(g))
+                   {
+                     c = g / f;
+                     r = sqrt((c * c) + 1.0);
+                     e[i] = f * r;
+                     c *= (s = 1.0 / r);
+                   } 
+                 else
+                   {
+                     s = f / g;
+                     r = sqrt((s * s) + 1.0);
+                     e[i] = g * r;
+                     s *= (c = 1.0 / r);
+                   }
+                 g = d[i] - p;
+                 r = (d[i - 1] - g) * s + 2.0 * c * b;
+                 p = s * r;
+                 d[i] = g + p;
+                 g = c * r - b;
+                 /* apply the same rotation to rows i - 1 and i of z */
+                 for (k = 1; k <= n; k++)
+                   {
+                     f = z[i][k-1];
+                     z[i][k-1] = s * z[i - 1][k - 1] + c * f;
+                     z[i - 1][k - 1] = c * z[i - 1][k - 1] - s * f;
+                   }
+               }
+
+             d[l - 1] = d[l - 1] - p;
+             e[l - 1] = g;
+             e[m - 1] = 0.0;
+           }
+        } 
+      while (m != l);
+    }
+
+    
+ 
+    return (1);
+ }
+
+
+/** @brief Compute the eigenvectors and eigenvalues
+  *
+  * Runs mytred2() followed by mytqli() on \a _a. The return value of mytqli() is
+  * ignored. The caller initGeneric() afterwards reads \a d as the eigenvalues and
+  * \a _a as the matrix of eigenvectors.
+  *
+  * @param _a
+  *   The \a states x \a states matrix to decompose; overwritten in place
+  *
+  * @param states
+  *   Number of states
+  *
+  * @param d
+  *   Output array of \a states elements (read as eigenvalues by initGeneric())
+  * 
+  * @param e
+  *   Work array of \a states elements: written by mytred2(), consumed and overwritten by
+  *   mytqli(). Its contents are not used by initGeneric().
+  *
+  * @todo
+  *   Remove e from parameter?
+*/
+static void makeEigen(double **_a, const int states, double *d, double *e)
+{
+  mytred2(_a, states, d, e);
+  mytqli(d, e, states, _a);
+}
+
+/** @brief Generic initialization of parameters and decomposition of the Q matrix
+  *
+  * Builds a symmetric matrix from the substitution rates and the frequencies,
+  * decomposes it with makeEigen() and derives from the result the eigenvalues
+  * (\a ext_EIGN), the eigenvector matrix (\a EV), its companion matrix (\a EI) and the
+  * tip vectors (\a tipVector). Also computes \a fracchange as the sum over all state
+  * pairs of f[j] * r[j][k] * f[k].
+  *
+  * @param states
+  *  Number of states of the current model
+  *
+  * @param valueVector
+  *  Input array of bit masks, one per character code; bit j of an entry set means that
+  *  state j contributes to that code
+  *
+  * @param valueVectorLength
+  *  Number of entries of \a valueVector, i.e. number of blocks (of size \a states)
+  *  written to \a tipVector
+  *
+  * @param fracchange
+  *  Variable where the computed fracchange will be stored (a single double)
+  *
+  * @param ext_EIGN
+  *   Array where the eigenvalues will be stored (\a states elements; element 0 is set to 0.0)
+  *
+  * @param EV
+  *   Array where the eigenvectors will be stored (\a states * \a states elements)
+  *  
+  * @param EI
+  *   Array where the inverse eigenvectors will be stored (\a states * \a states elements)
+  *
+  * @param frequencies
+  *   The model frequencies (\a states elements, read only)
+  *
+  * @param ext_initialRates
+  *   The model substitution rates: the upper triangle of the rate matrix stored row by
+  *   row, i.e. states * (states - 1) / 2 elements (read only)
+  *
+  * @param tipVector
+  *   Array where the computed tipVector will be stored
+  *   (\a valueVectorLength * \a states elements)
+  *
+  * @todo
+  *   Perhaps we could change this also to the way pllOptRatesGeneric and other functions are implemented.
+  *   That is, instead of passing all these parameters, pass the partition index instead and load the
+  *   values within the code. Will make the code more readable. 
+*/
+static void initGeneric(const int states, 
+                        const unsigned int *valueVector, 
+                        int valueVectorLength,
+                        double *fracchange,
+                        double *ext_EIGN,
+                        double *EV,
+                        double *EI,
+                        double *frequencies,
+                        double *ext_initialRates,
+                        double *tipVector
+                      )
+{
+  double 
+    **r, 
+    **a, 
+    **EIGV,
+    *initialRates = ext_initialRates, 
+    *f, 
+    *e, 
+    *d, 
+    *invfreq, 
+    *EIGN,
+    *eptr; 
+  
+  int 
+    i, 
+    j, 
+    k, 
+    m, 
+    l;  
+
+  /* NOTE(review): none of the rax_malloc() results below is checked here */
+  r    = (double **)rax_malloc((size_t)states * sizeof(double *));
+  EIGV = (double **)rax_malloc((size_t)states * sizeof(double *));  
+  a    = (double **)rax_malloc((size_t)states * sizeof(double *));        
+  
+  for(i = 0; i < states; i++)
+    {
+      a[i]    = (double*)rax_malloc((size_t)states * sizeof(double));
+      EIGV[i] = (double*)rax_malloc((size_t)states * sizeof(double));
+      r[i]    = (double*)rax_malloc((size_t)states * sizeof(double));
+    }
+
+  f       = (double*)rax_malloc((size_t)states * sizeof(double));
+  e       = (double*)rax_malloc((size_t)states * sizeof(double));
+  d       = (double*)rax_malloc((size_t)states * sizeof(double));
+  invfreq = (double*)rax_malloc((size_t)states * sizeof(double));
+  EIGN    = (double*)rax_malloc((size_t)states * sizeof(double));
+  
+  /* work on a private copy of the frequencies */
+  for(l = 0; l < states; l++) 
+    f[l] = frequencies[l];      
+    
+  
+  i = 0;
+  
+  /* r: full symmetric rate matrix with a zero diagonal, expanded from the
+     packed upper triangle in initialRates */
+  for(j = 0; j < states; j++)    
+    for(k = 0; k < states; k++)
+      r[j][k] = 0.0;
+  
+  for(j = 0; j < states - 1; j++)
+    for (k = j + 1; k < states; k++)              
+      r[j][k] = initialRates[i++];         
+  
+  for (j = 0; j < states; j++) 
+    {
+      r[j][j] = 0.0;
+      for (k = 0; k < j; k++)
+        r[j][k] = r[k][j];
+    }                         
+  
+  
+
+  /* r is only used for this sum */
+  *fracchange = 0.0;
+  
+  for (j = 0; j < states; j++)
+    for (k = 0; k < states; k++)
+      *fracchange += f[j] * r[j][k] * f[k];
+  
+  m = 0;
+  
+  for(i=0; i< states; i++) 
+    a[i][i] = 0;
+  
+  /*  assert(r[states - 2][states - 1] == 1.0);*/
+  
+  /* build the symmetric matrix a: off-diagonal elements are rate * sqrt(f[i] * f[j]),
+     each diagonal element is the negative sum over all other states j of rate * f[j] */
+  for(i = 0; i < states; i++) 
+    {
+      for(j = i + 1;  j < states; j++) 
+        {
+          double factor =  initialRates[m++];
+          a[i][j] = a[j][i] = factor * sqrt( f[i] * f[j]);
+          a[i][i] -= factor * f[j];
+          a[j][j] -= factor * f[i];
+        }
+    }                           
+
+  /* after this call d is read as the eigenvalues and a as the eigenvectors */
+  makeEigen(a, states, d, e);
+  
+ 
+  
+  /* scale column j of a by sqrt(f[j]) */
+  for (i = 0; i < states; i++)     
+    for (j = 0; j < states; j++)       
+      a[i][j] *= sqrt(f[j]);
+   
+  
+  
+  /* find the first eigenvalue that is (numerically) zero and move it, together with its
+     row of a, to index 0; the row is normalised to sum to 1 only when a swap took place */
+  for (i = 0; i < states; i++)
+    {     
+      if (d[i] > -1e-8) 
+        {             
+          if (i != 0) 
+            {               
+              double tmp = d[i], sum=0;
+              d[i] = d[0];
+              d[0] = tmp;
+              for (j=0; j < states; j++) 
+                {
+                  tmp = a[i][j];
+                  a[i][j] = a[0][j];
+                  sum += (a[0][j] = tmp);
+                }
+              for (j=0; j < states; j++) 
+                a[0][j] /= sum;
+            }
+          break;
+        }
+    }
+  
+  /* EIGN: negated eigenvalues; EIGV: transpose of a; invfreq: reciprocal of the
+     first column of EIGV */
+  for (i = 0; i < states; i++) 
+    {
+      EIGN[i] = -d[i];
+      
+      for (j=0; j < states; j++)
+        EIGV[i][j] = a[j][i];
+      invfreq[i] = 1 / EIGV[i][0]; 
+    }                                    
+  
+  /* the first eigenvalue is forced to exactly 0.0, all others must be positive */
+  ext_EIGN[0] = 0.0;
+
+  for (l = 1; l < states; l++)
+    {
+      ext_EIGN[l] = EIGN[l]; 
+      assert(ext_EIGN[l] > 0.0);
+    }
+  
+  eptr = EV;
+  
+  for (i = 0; i < states; i++)            
+    for (j = 0; j < states; j++)
+      {
+        *eptr++ = EIGV[i][j];    /* flatten EIGV (the eigenvector matrix) row by row into EV */ 
+        
+      }
+  for (i = 0; i < states; i++)
+    for (j = 0; j < states; j++)
+      {
+        if(j == 0)
+          EI[i * states + j] = 1.0;
+        else
+          EI[i * states + j] = EV[i * states + j] * invfreq[i];   /* EV = Eigenvector, EI = Inverse Eigenvector,   $ u_{i,x}^{-1} = \pi_x u_{x,i} */
+      }
+  
+  /* tip vector of a character code = sum of the EIGV rows of all states whose bit is
+     set in the code's mask; a mask of 0 leaves the block at zero */
+  for (i = 0; i < valueVectorLength; i++)
+    {
+      unsigned int value = valueVector[i];
+      
+      for(j = 0; j < states; j++)
+        tipVector[i * states + j]     = 0;                  
+
+      if(value > 0)
+        {                     
+          for (j = 0; j < states; j++) 
+            {       
+              /* NOTE(review): if states exceeds the bit width of unsigned int this shift
+                 count is out of range -- confirm the callers never do that */
+              if ((value >> j) & 1) 
+                {
+                  int l;   /* shadows the function-level l */
+                  for (l = 0; l < states; l++)
+                    tipVector[i * states + l] += EIGV[j][l];
+                }                         
+            }       
+        }     
+    }
+
+  /* clamp the tip vector entries from above */
+  for (i = 0; i < valueVectorLength; i++)
+    {
+       for(j = 0; j < states; j++)
+         if(tipVector[i * states + j] > PLL_MAX_TIP_EV)
+           tipVector[i * states + j] = PLL_MAX_TIP_EV;
+    }
+
+
+  
+
+  for (i = 0; i < states; i++)
+    {
+      rax_free(EIGV[i]);
+      rax_free(a[i]);
+      rax_free(r[i]);
+    }
+
+  rax_free(r);
+  rax_free(a);
+  rax_free(EIGV);
+
+  rax_free(f);
+  rax_free(e);
+  rax_free(d);
+  rax_free(invfreq);
+  rax_free(EIGN);
+}
+
+/** @brief Initialize GTR
+  *
+  * Wrapper function for the decomposition of the substitution rates matrix
+  * into eigenvectors and eigenvalues. Selects the bit-vector table matching the data
+  * type of the partition, for protein data first loads the rates/frequencies of the
+  * selected empirical model, calls initGeneric() (four times for the LG4M/LG4X models,
+  * once otherwise) and finally refreshes the fracchange bookkeeping via
+  * updateFracChange().
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param model
+  *   Partition index
+  */
+void pllInitReversibleGTR(pllInstance * tr, partitionList * pr, int model)
+{ 
+ double   
+   *ext_EIGN         = pr->partitionData[model]->EIGN,
+   *EV               = pr->partitionData[model]->EV,
+   *EI               = pr->partitionData[model]->EI,
+   *frequencies      = pr->partitionData[model]->frequencies,
+   *empiricalFrequencies = pr->partitionData[model]->empiricalFrequencies,
+   *ext_initialRates = pr->partitionData[model]->substRates,
+   *tipVector        = pr->partitionData[model]->tipVector,
+   *fracchange       = &(pr->partitionData[model]->fracchange);
+ 
+  
+ int states = pr->partitionData[model]->states;
+
+ switch(pr->partitionData[model]->dataType)
+   { 
+   case PLL_GENERIC_32:
+   case PLL_GENERIC_64:
+   case PLL_SECONDARY_DATA_6:
+   case PLL_SECONDARY_DATA_7: 
+   case PLL_SECONDARY_DATA:
+   case PLL_DNA_DATA:
+   case PLL_BINARY_DATA:    
+     initGeneric(states, 
+                 getBitVector(pr->partitionData[model]->dataType),
+                 getUndetermined(pr->partitionData[model]->dataType) + 1,
+                 fracchange,
+                 ext_EIGN, 
+                 EV, 
+                 EI, 
+                 frequencies, 
+                 ext_initialRates,
+                 tipVector
+                );
+     break;   
+   case PLL_AA_DATA:
+     /* for every protein model except GTR the rates (and, unless the base frequencies
+        are optimized, the frequencies) come from the empirical model */
+     if(pr->partitionData[model]->protModels != PLL_GTR)
+       {
+         double f[20];
+         int l;
+
+         if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+           {
+             int 
+               i;
+             
+             for(i = 0; i < 4; i++)
+               {                 
+                 initProtMat(f, pr->partitionData[model]->protModels, &(pr->partitionData[model]->substRates_LG4[i][0]), i);
+                 
+                 if(!pr->partitionData[model]->optimizeBaseFrequencies)
+                 {
+                   if(!pr->partitionData[model]->protUseEmpiricalFreqs)
+                   {
+                     for(l = 0; l < 20; l++)            
+                       pr->partitionData[model]->frequencies_LG4[i][l] = f[l];
+                   }
+                   else
+                   {
+                     for(l = 0; l < 20; l++)            
+                       pr->partitionData[model]->frequencies_LG4[i][l] = empiricalFrequencies[l];
+                   }
+                 }
+                 else
+                 {
+                   memcpy(pr->partitionData[model]->frequencies_LG4[i], frequencies, 20 * sizeof(double));
+                 }
+               }
+           }
+         else
+           {
+             if(pr->partitionData[model]->protModels == PLL_AUTO)
+               initProtMat(f, pr->partitionData[model]->autoProtModels, ext_initialRates, 0);
+             else         
+               {
+                 initProtMat(f, pr->partitionData[model]->protModels, ext_initialRates, 0);
+               }
+
+              if (!pr->partitionData[model]->optimizeBaseFrequencies) {
+                  if(!pr->partitionData[model]->protUseEmpiricalFreqs)
+                  {                 
+                      for(l = 0; l < 20; l++)           
+                         frequencies[l] = f[l];
+                  } else {
+                      for(l = 0; l < 20; l++)           
+                         frequencies[l] = empiricalFrequencies[l];
+                  }
+              }
+           }  
+       }
+               
+     if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+       {
+         int 
+           i;
+
+         /* initGeneric() stores exactly one double through its fracchange argument, so
+            one slot per LG4 matrix is all that is needed. The earlier code allocated
+            numberOfPartitions doubles per matrix, read the never-written element [model]
+            and stored the mean at fracchange[model], i.e. outside the partition's
+            fracchange field whenever model != 0. */
+         double 
+           fracchanges_LG4[4],
+           acc = 0.0;
+
+         for(i = 0; i < 4; i++)
+           {
+             initGeneric(states, 
+                         bitVectorAA, 
+                         23, 
+                         &fracchanges_LG4[i],
+                         pr->partitionData[model]->EIGN_LG4[i], 
+                         pr->partitionData[model]->EV_LG4[i],
+                         pr->partitionData[model]->EI_LG4[i],
+                         pr->partitionData[model]->frequencies_LG4[i],
+                         pr->partitionData[model]->substRates_LG4[i],
+                         pr->partitionData[model]->tipVector_LG4[i]
+                        );   
+           }
+
+         for(i = 0; i < 4; i++)
+           acc += fracchanges_LG4[i];
+
+         /* the partition's fracchange is the mean over the four LG4 matrices */
+         *fracchange = acc / 4;
+       }
+     else
+       initGeneric(states, 
+                   bitVectorAA, 
+                   23, 
+                   fracchange,
+                   ext_EIGN, 
+                   EV, 
+                   EI, 
+                   frequencies, 
+                   ext_initialRates,
+                   tipVector
+                  );
+    break;  
+   default:
+     assert(0);
+   } 
+
+ updateFracChange(tr, pr);
+}
+
+
+/** @brief Natural logarithm of the gamma function
+  *
+  * Returns ln(gamma(alpha)) for alpha>0, accurate to 10 decimal places.
+  * Arguments below 7 are shifted upwards in steps of one while the skipped factors are
+  * collected in a product; Stirling's formula is used for the central polynomial part
+  * of the procedure.
+  *
+  * Pike MC & Hill ID (1966) Algorithm 291: Logarithm of the gamma function.
+  * Communications of the Association for Computing Machinery, 9:684
+  *
+  * @param alpha
+  *   Argument of the gamma function, expected to be positive
+  *
+  * @return
+  *   ln(gamma(alpha))
+  */
+double LnGamma (double alpha)
+{
+  double
+    arg = alpha,
+    shift = 0.0,
+    invSquare,
+    series;
+
+  if (arg < 7.0)
+    {
+      double product = 1.0;
+
+      /* start from (alpha - 1) + 1 exactly as the recurrence is defined and multiply
+         up every value that is still below 7 */
+      for (arg = (alpha - 1.0) + 1.0; arg < 7.0; arg = arg + 1.0)
+        product *= arg;
+
+      assert(product != 0.0);
+
+      shift = -log(product);
+    }
+
+  invSquare = 1 / (arg * arg);
+
+  /* asymptotic correction series in 1 / arg^2 */
+  series = (((-.000595238095238 * invSquare + .000793650793651) * invSquare - .002777777777778) * invSquare
+            + .083333333333333) / arg;
+
+  return shift + (arg - 0.5) * log(arg) - arg + .918938533204673
+         + series;
+}
+
+
+
+/* Incomplete gamma ratio I(x, alpha).
+
+   x              upper limit of the integration
+   alpha          shape parameter
+   ln_gamma_alpha ln(Gamma(alpha)), supplied by the caller
+
+   Returns 0 for x == 0, -1 for x < 0 or alpha <= 0, otherwise the value of the
+   ratio computed with one of the two methods listed in the comment inside the body. */
+double IncompleteGamma (double x, double alpha, double ln_gamma_alpha)
+{
+/* returns the incomplete gamma ratio I(x,alpha) where x is the upper 
+           limit of the integration and alpha is the shape parameter.
+   returns (-1) if in error
+   ln_gamma_alpha = ln(Gamma(alpha)), is almost redundant.
+   (1) series expansion     if (alpha>x || x<=1)
+   (2) continued fraction   otherwise
+   RATNEST FORTRAN by
+   Bhattacharjee GP (1970) The incomplete gamma integral.  Applied Statistics,
+   19: 285-287 (AS32)
+*/
+   int i;
+   double p=alpha, g=ln_gamma_alpha;
+   /* accurate: convergence threshold; overflow: magnitude at which pn[] is rescaled */
+   double accurate=1e-8, overflow=1e30;
+   double factor, gin=0, rn=0, a=0,b=0,an=0,dif=0, term=0, pn[6];
+
+
+   if (x==0) return (0);
+   if (x<0 || p<=0) return (-1);
+
+   
+   /* factor = x^p * exp(-x) / Gamma(p), evaluated in log space */
+   factor=exp(p*log(x)-x-g);   
+   if (x>1 && x>=p) goto l30;
+   /* (1) series expansion */
+   gin=1;  term=1;  rn=p;
+ l20:
+   /* add terms until the latest one drops to the accuracy threshold */
+   rn++;
+   term*=x/rn;   gin+=term;
+
+   if (term > accurate) goto l20;
+   gin*=factor/p;
+   goto l50;
+ l30:  
+   /* (2) continued fraction */
+   a=1-p;   b=a+x+1;  term=0;
+   pn[0]=1;  pn[1]=x;  pn[2]=x+1;  pn[3]=x*b;
+   gin=pn[2]/pn[3];   
+ l32:  
+   /* next pair of numerator/denominator values from the two previous pairs */
+   a++;  
+   b+=2;  
+   term++;   
+   an=a*term;
+   for (i=0; i<2; i++) 
+     pn[i+4]=b*pn[i+2]-an*pn[i];
+   if (pn[5] == 0) goto l35;
+   rn=pn[4]/pn[5];   
+   dif=fabs(gin-rn);  
+   /* stop once the change is small both absolutely and relative to rn */
+   if (dif>accurate) goto l34;
+   if (dif<=accurate*rn) goto l42;
+ l34:   
+   gin=rn;
+ l35: 
+   /* shift the window of stored values; rescale them when they grow too large */
+   for (i=0; i<4; i++) 
+     pn[i]=pn[i+2];
+   if (fabs(pn[4]) < overflow)            
+     goto l32;        
+   
+   for (i=0; i<4; i++) 
+     pn[i]/=overflow;
+
+   
+   goto l32;
+ l42:  
+   gin=1-factor*gin;
+
+ l50: 
+   return (gin);
+}
+
+
+
+
+/** @brief Percentage point of the standard normal distribution
+  *
+  * Returns z so that Prob{x<z}=prob where x ~ N(0,1) and (1e-12)<prob<1-(1e-12).
+  * Returns (-9999) if in error, i.e. when the smaller of prob and 1-prob is
+  * below 1e-20.
+  *
+  * Odeh RE & Evans JO (1974) The percentage points of the normal distribution.
+  * Applied Statistics 22: 96-97 (AS70)
+  *
+  * Newer methods:
+  *   Wichura MJ (1988) Algorithm AS 241: the percentage points of the
+  *     normal distribution.  37: 477-484.
+  *   Beasley JD & Springer SG  (1977).  Algorithm AS 111: the percentage
+  *     points of the normal distribution.  26: 118-121.
+  *
+  * @param prob
+  *   Probability whose quantile is requested
+  *
+  * @return
+  *   The quantile z, or -9999 on error
+  */
+double PointNormal (double prob)
+{
+  /* coefficients of the rational approximation, constant term first */
+  static const double
+    numer[5] = {-.322232431088, -1, -.342242088547, -.0204231210245, -.453642210148e-4},
+    denom[5] = {.0993484626060, .588581570495, .531103462366, .103537752850, .0038560700634};
+
+  const int lowerHalf = (prob < 0.5);
+  double tail, y, top, bottom, z;
+  int k;
+
+  /* work with the smaller tail and restore the sign at the end */
+  if (lowerHalf)
+    tail = prob;
+  else
+    tail = 1 - prob;
+
+  if (tail < 1e-20)
+    return (-9999);
+
+  y = sqrt (log(1 / (tail * tail)));
+
+  /* Horner evaluation of both degree-4 polynomials in y */
+  top    = numer[4];
+  bottom = denom[4];
+  for (k = 3; k >= 0; k--)
+    {
+      top    = top * y + numer[k];
+      bottom = bottom * y + denom[k];
+    }
+
+  z = y + top / bottom;
+
+  if (lowerHalf)
+    return -z;
+  return z;
+}
+
+
+/* Percentage point of the Chi2 distribution.
+
+   prob  probability, accepted only within [0.000002, 0.999998]
+   v     degrees of freedom, must be positive
+
+   Picks a starting value with one of three approximations (chosen from v and prob)
+   and then refines it at label l4 until the relative change is at most e.
+   Returns -1 for arguments out of range or when IncompleteGamma() reports an error
+   (in which case a message is also printed to stdout). */
+double PointChi2 (double prob, double v)
+{
+/* returns z so that Prob{x<z}=prob where x is Chi2 distributed with df=v
+   returns -1 if in error.   0.000002<prob<0.999998
+   RATNEST FORTRAN by
+       Best DJ & Roberts DE (1975) The percentage points of the 
+       Chi2 distribution.  Applied Statistics 24: 385-388.  (AS91)
+   Converted into C by Ziheng Yang, Oct. 1993.
+*/
+   /* e: relative convergence threshold; aa: ln(2) to ten digits */
+   double e=.5e-6, aa=.6931471805, p=prob, g;
+   double xx, c, ch, a=0,q=0,p1=0,p2=0,t=0,x=0,b=0,s1,s2,s3,s4,s5,s6;
+  
+   if (p<.000002 || p>.999998 || v<=0) return (-1);
+  
+   g = LnGamma(v/2);
+   
+   xx=v/2;   c=xx-1;
+   if (v >= -1.24*log(p)) goto l1;
+
+   /* starting value for small v relative to -log(p); returned directly when tiny */
+   ch=pow((p*xx*exp(g+xx*aa)), 1/xx);
+   if (ch-e<0) return (ch);
+   goto l4;
+l1:
+   if (v>.32) goto l3;
+   /* starting value for v <= 0.32: iterate at l2 until the change is within 1% */
+   ch=0.4;   a=log(1-p);
+l2:
+   q=ch;  p1=1+ch*(4.67+ch);  p2=ch*(6.73+ch*(6.66+ch));
+   t=-0.5+(4.67+2*ch)/p1 - (6.73+ch*(13.32+3*ch))/p2;
+   ch-=(1-exp(a+g+.5*ch+c*aa)*p2/p1)/t;
+   if (fabs(q/ch-1)-.01 <= 0) goto l4;
+   else                       goto l2;
+  
+l3:    
+   /* starting value derived from the normal quantile */
+   x=PointNormal (p);
+   p1=0.222222/v;   ch=v*pow((x*sqrt(p1)+1-p1), 3.0);
+   if (ch>2.2*v+6)  ch=-2*(log(1-p)-c*log(.5*ch)+g);
+l4:
+   /* refinement step: correct ch using the incomplete gamma ratio at ch / 2 */
+   q=ch;   p1=.5*ch;   
+   if ((t=IncompleteGamma (p1, xx, g))< 0.0) 
+     {
+       printf ("IncompleteGamma \n");      
+       return (-1);
+     }
+  
+   p2=p-t;
+   t=p2*exp(xx*aa+g+p1-c*log(ch));   
+   b=t/ch;  a=0.5*t-b*c;
+
+   s1=(210+a*(140+a*(105+a*(84+a*(70+60*a))))) / 420;
+   s2=(420+a*(735+a*(966+a*(1141+1278*a))))/2520;
+   s3=(210+a*(462+a*(707+932*a)))/2520;
+   s4=(252+a*(672+1182*a)+c*(294+a*(889+1740*a)))/5040;
+   s5=(84+264*a+c*(175+606*a))/2520;
+   s6=(120+c*(346+127*c))/5040;
+   ch+=t*(1+0.5*t*s1-b*c*(s1-b*(s2-b*(s3-b*(s4-b*(s5-b*s6))))));
+   /* repeat until the relative change of ch is at most e */
+   if (fabs(q/ch-1) > e) goto l4;
+
+   return (ch);
+}
+
+/** @brief Compute the gamma rates
+    
+    Fills \a gammaRates with \a K rate values derived from a gamma distribution whose
+    two parameters are both set to \a alpha. With \a useMedian set, each rate is the
+    quantile at the midpoint of its category, and the rates are then rescaled so that
+    they sum to \a K. Otherwise each rate is obtained from differences of
+    IncompleteGamma() values at the category boundaries, scaled by \a K.
+    A single category (\a K == 1) without median gets the rate \a K, i.e. 1.
+    Nothing is written when \a K is smaller than 1.
+
+    @param alpha
+      Alpha parameter, must be at least PLL_ALPHA_MIN
+
+    @param gammaRates
+      Array where to store the computed gamma rates (\a K elements)
+
+    @param K
+      Number of categories
+
+    @param useMedian
+      Boolean flag whether to use a median or not
+*/
+void pllMakeGammaCats(double alpha, double *gammaRates, int K, pllBoolean useMedian)
+{
+  int 
+    i;
+
+  double 
+    alfa = alpha, 
+    beta = alpha,
+    factor = alfa / beta * K;
+
+  /* Note that PLL_ALPHA_MIN setting is somewhat critical due to   */
+  /* numerical instability caused by very small rate[0] values */
+  /* induced by low alpha values around 0.01 */
+
+  assert(alfa >= PLL_ALPHA_MIN); 
+
+  /* without categories there is nothing to fill in; this also keeps the mean branch
+     from touching gammaRates[-1] */
+  if(K < 1)
+    return;
+
+  if(useMedian)
+    {
+      double  
+        middle = 1.0 / (2.0*K),
+        t = 0.0; 
+      
+      for(i = 0; i < K; i++)     
+        gammaRates[i] = PLL_POINT_GAMMA((double)(i * 2 + 1) * middle, alfa, beta);
+      
+      for (i = 0; i < K; i++) 
+        t += gammaRates[i];
+
+      /* rescale so that the rates sum to factor */
+      for( i = 0; i < K; i++)     
+        gammaRates[i] *= factor / t;
+    }
+  else if(K == 1)
+    {
+      /* one category covers the whole distribution: (1 - 0) * factor. The general
+         code below would read gammaProbs[-1] and an unset gammaProbs[0] here. */
+      gammaRates[0] = factor;
+    }
+  else
+    {
+      double
+        lnga1 = LnGamma(alfa + 1),
+        *gammaProbs = (double *)rax_malloc((size_t)K * sizeof(double));
+
+      /* quantiles at the K - 1 inner category boundaries ... */
+      for (i = 0; i < K - 1; i++)
+        gammaProbs[i] = PLL_POINT_GAMMA((i + 1.0) / K, alfa, beta);
+
+      /* ... turned into incomplete gamma ratios for shape alfa + 1 */
+      for (i = 0; i < K - 1; i++)
+        gammaProbs[i] = IncompleteGamma(gammaProbs[i] * beta, alfa + 1, lnga1);   
+
+      gammaRates[0] = gammaProbs[0] * factor;
+      
+      gammaRates[K - 1] = (1 - gammaProbs[K - 2]) * factor;
+
+      for (i= 1; i < K - 1; i++)  
+        gammaRates[i] = (gammaProbs[i] - gammaProbs[i - 1]) * factor;      
+
+      rax_free(gammaProbs);
+    }
+
+  return;  
+}
+
+
+/** @brief Set the substitution rates
+  *
+  * Sets all \a rates substitution rates, including the last one, to 1.0.
+  * Nothing is written when \a rates is zero or negative.
+  *
+  * @param r
+  *  Array of substitution rates
+  *
+  * @param rates
+  *   Number of rates to set
+  */
+static void setRates(double *r, int rates)
+{
+  int i;
+
+  //changed to 1.0 instead of 0.5 for making the 
+  //implementation of an interface function to set other models 
+  //than GTR easier 
+
+  /* one loop over all entries: the former separate store to r[rates - 1] wrote in
+     front of the array when rates was 0 */
+  for(i = 0; i < rates; i++)
+    r[i] = 1.0;
+}
+
+/** @brief Initialize the substitution rates matrix
+  *
+  * Walks over all partitions and fills their \a substRates array with start values
+  * that depend on the data type: all ones for binary, DNA and secondary structure data,
+  * a scheme selected by \a tr->multiStateModel for generic multi-state data, and the
+  * values provided by putWAG() for protein data run under the GTR model (protein
+  * partitions with any other model are left untouched). Partitions flagged as
+  * \a nonGTR additionally get their rates adjusted according to the symmetry vector.
+  *
+  * @param tr
+  *   The PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @todo
+  *   Do we need the secondary structure and binary? Will we only use GTR? If yes,
+  *   we could rename this function to initRateMatrixGTR
+  */
+void initRateMatrix(pllInstance *tr, partitionList *pr)
+{
+  int model;
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      const int
+        states = pr->partitionData[model]->states,
+        rates  = (states * states - states) / 2;
+
+      double
+        *rateVector = pr->partitionData[model]->substRates;
+
+      int
+        idx;
+
+      switch(pr->partitionData[model]->dataType)
+        {
+        case PLL_BINARY_DATA:
+        case PLL_DNA_DATA:
+        case PLL_SECONDARY_DATA:
+        case PLL_SECONDARY_DATA_6:
+        case PLL_SECONDARY_DATA_7:
+          setRates(rateVector, rates);
+          break;
+        case PLL_GENERIC_32:
+        case PLL_GENERIC_64:
+          if(tr->multiStateModel == PLL_ORDERED_MULTI_STATE)
+            {
+              /* the rate between two states is their distance in the ordering */
+              int
+                from,
+                to,
+                filled = 0;
+
+              for(from = 0; from < states; from++)
+                for(to = from + 1; to < states; to++)
+                  {
+                    rateVector[filled] = (double)(to - from);
+                    filled++;
+                  }
+
+              assert(filled == rates);
+            }
+          else if(tr->multiStateModel == PLL_MK_MULTI_STATE)
+            {
+              for(idx = 0; idx < rates; idx++)
+                rateVector[idx] = 1.0;
+            }
+          else if(tr->multiStateModel == PLL_GTR_MULTI_STATE)
+            setRates(rateVector, rates);
+          else
+            assert(0);
+          break;
+        case PLL_AA_DATA:
+          if(pr->partitionData[model]->protModels == PLL_GTR)
+            {
+              //set optimizeSubstRates to true !
+              pr->partitionData[model]->optimizeSubstitutionRates = PLL_TRUE;
+              putWAG(rateVector);
+            }
+          break;
+        default:
+          assert(0);
+        }
+
+      if(!pr->partitionData[model]->nonGTR)
+        continue;
+
+      assert(pr->partitionData[model]->dataType == PLL_SECONDARY_DATA ||
+             pr->partitionData[model]->dataType == PLL_SECONDARY_DATA_6 ||
+             pr->partitionData[model]->dataType == PLL_SECONDARY_DATA_7);
+
+      /* rates marked with -1 are switched off; rates in the same symmetry class as the
+         last rate are pinned to 1.0; everything else keeps its value */
+      for(idx = 0; idx < rates; idx++)
+        {
+          const int symClass = pr->partitionData[model]->symmetryVector[idx];
+
+          if(symClass == -1)
+            rateVector[idx] = 0.0;
+          else if(symClass == pr->partitionData[model]->symmetryVector[rates - 1])
+            rateVector[idx] = 1.0;
+        }
+    }
+}
+
+/** @brief Copy a symmetry vector and a frequency grouping into their destinations
+  *
+  * Copies the first \a sCount entries of \a s to \a sDest and the first \a fCount
+  * entries of \a f to \a fDest, in ascending index order. Used for setting secondary
+  * structure symmetries.
+  *
+  * @param s       Source symmetry vector
+  * @param sDest   Destination for the symmetry vector
+  * @param sCount  Number of entries to copy from \a s
+  * @param f       Source frequency grouping
+  * @param fDest   Destination for the frequency grouping
+  * @param fCount  Number of entries to copy from \a f
+  *
+  * @todo
+  *   Do we need this function?
+*/
+static void setSymmetry(int *s, int *sDest, const int sCount, int *f, int *fDest, const int fCount)
+{
+  int pos;
+
+  pos = 0;
+  while(pos < sCount)
+    {
+      sDest[pos] = s[pos];
+      pos++;
+    }
+
+  pos = 0;
+  while(pos < fCount)
+    {
+      fDest[pos] = f[pos];
+      pos++;
+    }
+}
+
+/** @brief Set up rate symmetries and frequency groupings of secondary structure partitions
+  *
+  * For each partition with a PLL_SECONDARY_DATA* data type, sets nonGTR and, for the constrained
+  * models, fills symmetryVector and frequencyGrouping as selected by tr->secondaryStructureModel.
+  * @todo Do we need this function? */
+static void setupSecondaryStructureSymmetries(pllInstance *tr, partitionList *partitions)
+{
+  int model;
+  int numberOfModels = partitions->numberOfPartitions;
+
+  for(model = 0; model < numberOfModels; model++)
+    {
+      if(partitions->partitionData[model]->dataType == PLL_SECONDARY_DATA ||
+                  partitions->partitionData[model]->dataType == PLL_SECONDARY_DATA_6 ||
+                  partitions->partitionData[model]->dataType == PLL_SECONDARY_DATA_7)
+        {
+          switch(tr->secondaryStructureModel)
+            {
+            case PLL_SEC_6_A:
+                partitions->partitionData[model]->nonGTR = PLL_FALSE;
+              break;
+            case PLL_SEC_6_B:
+              {
+                int f[6]  = {0, 1, 2, 3, 4, 5};
+                int s[15] = {2, 0, 1, 2, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2, 1};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 15, f, partitions->partitionData[model]->frequencyGrouping, 6);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_6_C:
+              {
+                int f[6]  = {0, 2, 2, 1, 0, 1};
+                int s[15] = {2, 0, 1, 2, 2, 2, 2, 0, 1, 1, 2, 2, 2, 2, 1};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 15, f, partitions->partitionData[model]->frequencyGrouping, 6);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_6_D:
+              {
+                int f[6]  = {0, 2, 2, 1, 0, 1};
+                int s[15] = {2, -1, 1, 2, 2, 2, 2, -1, 1, 1, 2, 2, 2, 2, 1};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 15, f, partitions->partitionData[model]->frequencyGrouping, 6);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_6_E:
+              {
+                int f[6]  = {0, 1, 2, 3, 4, 5};
+                int s[15] = {2, -1, 1, 2, 2, 2, 2, -1, 1, 1, 2, 2, 2, 2, 1};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 15, f, partitions->partitionData[model]->frequencyGrouping, 6);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_7_A:
+                partitions->partitionData[model]->nonGTR = PLL_FALSE;
+              break;
+            case PLL_SEC_7_B:
+              {
+                int f[7]  = {0, 2, 2, 1, 0, 1, 3};
+                int s[21] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            case PLL_SEC_7_C:
+              {
+                int f[7]  = {0, 1, 2, 3, 4, 5, 6};
+                int s[21] = {-1, -1, 0, -1, -1, 4, -1, -1, -1, 3, 5, 1, -1, -1, 6, -1, -1, 7, 2, 8, 9};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            case PLL_SEC_7_D:
+              {
+                int f[7]  = {0, 1, 2, 3, 4, 5, 6};
+                int s[21] = {2, 0, 1, 2, 2, 3, 2, 2, 0, 1, 3, 1, 2, 2, 3, 2, 2, 3, 1, 3, 3};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            case PLL_SEC_7_E:
+              {
+                int f[7]  = {0, 1, 2, 3, 4, 5, 6};
+                int s[21] = {-1, -1, 0, -1, -1, 1, -1, -1, -1, 0, 1, 0, -1, -1, 1, -1, -1, 1, 0, 1, 1};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            case PLL_SEC_7_F:
+              {
+                int f[7]  = {0, 2, 2, 1, 0, 1, 3};
+                int s[21] = {2, 0, 1, 2, 2, 3, 2, 2, 0, 1, 3, 1, 2, 2, 3, 2, 2, 3, 1, 3, 3};
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 21, f, partitions->partitionData[model]->frequencyGrouping, 7);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            /* unconstrained 16-state model: flag the partition being visited, like 6_A and 7_A do */
+            case PLL_SEC_16:
+                partitions->partitionData[model]->nonGTR = PLL_FALSE;
+              break;
+            case PLL_SEC_16_A:
+              {
+                int f[16]  = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+                int s[120] = {/* AA */  4,  4,  3,  4, -1, -1, -1,  4, -1, -1, -1,  3, -1, -1, -1,
+                              /* AC */  4,  3, -1,  4, -1, -1, -1,  3, -1, -1, -1,  4, -1, -1,
+                              /* AG */  3, -1, -1,  3, -1, -1, -1,  4, -1, -1, -1,  3, -1,
+                              /* AU */ -1, -1,  2,  3, -1,  0, -1,  1,  2, -1,  2,  3,
+                              /* CA */  4,  3,  4,  4, -1, -1, -1,  3, -1, -1, -1,
+                              /* CC */  3,  4, -1,  3, -1, -1, -1,  4, -1, -1,
+                              /* CG */  3, -1,  2,  3,  2,  0, -1,  1, -1,
+                              /* CU */ -1, -1, -1,  3, -1, -1, -1,  4,
+                              /* GA */  3,  4,  3,  3, -1, -1, -1,
+                              /* GC */  3,  1,  2,  3,  2, -1,
+                              /* GG */  3, -1, -1,  3, -1,
+                              /* GU */  2, -1,  2,  3,
+                              /* UA */  3,  1,  3,
+                              /* UC */  3,  4,
+                              /* UG */  3};
+
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 120, f, partitions->partitionData[model]->frequencyGrouping, 16);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+
+              }
+              break;
+            case PLL_SEC_16_B:
+              {
+                int f[16]  = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+                int s[120] = {/* AA */  0,  0,  0,  0, -1, -1, -1,  0, -1, -1, -1,  0, -1, -1, -1,
+                              /* AC */  0,  0, -1,  0, -1, -1, -1,  0, -1, -1, -1,  0, -1, -1,
+                              /* AG */  0, -1, -1,  0, -1, -1, -1,  0, -1, -1, -1,  0, -1,
+                              /* AU */ -1, -1,  0,  0, -1,  0, -1,  0,  0, -1,  0,  0,
+                              /* CA */  0,  0,  0,  0, -1, -1, -1,  0, -1, -1, -1,
+                              /* CC */  0,  0, -1,  0, -1, -1, -1,  0, -1, -1,
+                              /* CG */  0, -1,  0,  0,  0,  0, -1,  0, -1,
+                              /* CU */ -1, -1, -1,  0, -1, -1, -1,  0,
+                              /* GA */  0,  0,  0,  0, -1, -1, -1,
+                              /* GC */  0,  0,  0,  0,  0, -1,
+                              /* GG */  0, -1, -1,  0, -1,
+                              /* GU */  0, -1,  0,  0,
+                              /* UA */  0,  0,  0,
+                              /* UC */  0,  0,
+                              /* UG */  0};
+
+
+                setSymmetry(s, partitions->partitionData[model]->symmetryVector, 120, f, partitions->partitionData[model]->frequencyGrouping, 16);
+
+                partitions->partitionData[model]->nonGTR = PLL_TRUE;
+              }
+              break;
+            case PLL_SEC_16_C:
+            case PLL_SEC_16_D:
+            case PLL_SEC_16_E:
+            case PLL_SEC_16_F:
+            case PLL_SEC_16_I:
+            case PLL_SEC_16_J:
+            case PLL_SEC_16_K:
+              assert(0);   /* these models are not implemented; falls through to default */
+            default:
+              assert(0);
+            }
+        }
+
+    }
+
+}
+
+/** @brief Set the starting base frequencies of every partition
+  *
+  * Partitions flagged with \a optimizeBaseFrequencies start from the uniform
+  * value 1 / states; all others receive a copy of their row of
+  * \a empiricalFrequencies. Both the \a frequencies and the
+  * \a empiricalFrequencies array of the partition are written.
+  *
+  * @param pr                   List of partitions
+  * @param empiricalFrequencies Per-partition arrays of empirical frequencies,
+  *                             indexed by partition and then by state
+*/
+static void initializeBaseFreqs(partitionList *pr, double **empiricalFrequencies)
+{
+  size_t
+    p;
+
+  for(p = 0; p < (size_t)pr->numberOfPartitions; p++)
+    {
+      if(!pr->partitionData[p]->optimizeBaseFrequencies)
+       {
+         /* fixed frequencies: start from the empirical estimates */
+         const size_t bytes = sizeof(double) * pr->partitionData[p]->states;
+
+         memcpy(pr->partitionData[p]->frequencies,          empiricalFrequencies[p], bytes);
+         memcpy(pr->partitionData[p]->empiricalFrequencies, empiricalFrequencies[p], bytes);
+       }
+      else
+       {
+         /* set all base frequencies to identical starting values 1.0 / numberOfDataStates */
+         const int    stateCount = pr->partitionData[p]->states;
+         const double uniform    = 1.0 / ((double)stateCount);
+         int          s;
+
+         for(s = 0; s < stateCount; s++)
+          {
+            pr->partitionData[p]->frequencies[s]          = uniform;
+            pr->partitionData[p]->empiricalFrequencies[s] = uniform;
+          }
+       }
+    }
+}
+
+
+/** @brief Initialize the model parameters
+  * 
+  * Resets the per-site rate data of \a tr, sets up secondary structure
+  * symmetries, rate matrices and base frequencies, then initializes alpha,
+  * the Q matrix decomposition and the GAMMA categories of every partition.
+  *
+  * @param tr
+  *   The PLL instance
+  *
+  * @param empiricalFrequencies
+  *   Pointer to the empirical frequencies array
+  *
+  * @param partitions
+  *   Pointer to the partitions structure
+  *
+  * @todo
+  *   What is tr->optimizeRateCategoryInvocations = 1 ?
+  */
+void initModel(pllInstance *tr, double **empiricalFrequencies, partitionList * partitions)
+{
+  int model, j;
+
+  /* reset the global per-site rate bookkeeping */
+  tr->optimizeRateCategoryInvocations = 1;
+  tr->numberOfInvariableColumns = 0;
+  tr->weightOfInvariableColumns = 0;
+
+  for (j = 0; j < tr->originalCrunchedLength; j++)
+    {
+      tr->patrat[j] = 1.0;
+      tr->patratStored[j] = 1.0;
+      tr->rateCategory[j] = 0;
+    }
+
+  /* PSR (CAT) model init: one rate category with rate 1.0 per partition */
+  for(model = 0; model < partitions->numberOfPartitions; model++)
+    {
+      partitions->partitionData[model]->numberOfCategories = 1;
+      partitions->partitionData[model]->perSiteRates[0] = 1.0;
+    }
+
+  updatePerSiteRates(tr, partitions, PLL_FALSE);
+
+  setupSecondaryStructureSymmetries(tr, partitions);
+
+  initRateMatrix(tr, partitions);
+
+  initializeBaseFreqs(partitions, empiricalFrequencies);
+
+  for(model = 0; model < partitions->numberOfPartitions; model++)
+   {
+     int
+       k;
+
+     partitions->partitionData[model]->alpha = 1.0;
+     if(partitions->partitionData[model]->dataType == PLL_AA_DATA && partitions->partitionData[model]->protModels == PLL_AUTO)
+       partitions->partitionData[model]->autoProtModels = PLL_WAG; /* initialize by WAG per default */
+
+     pllInitReversibleGTR(tr, partitions, model); /* Decomposition of Q matrix */
+     /* GAMMA model init */
+     pllMakeGammaCats(partitions->partitionData[model]->alpha, partitions->partitionData[model]->gammaRates, 4, tr->useMedian);
+
+     for(k = 0; k < partitions->partitionData[model]->states; k++)
+       partitions->partitionData[model]->freqExponents[k] = 0.0;
+
+     for(k = 0; k < 4; k++)
+     {
+       partitions->partitionData[model]->lg4x_weights[k] = 0.25;
+       partitions->partitionData[model]->lg4x_weightExponents[k] = 0.0;
+     }
+
+   }
+
+  if(partitions->numberOfPartitions > 1)
+    {
+      tr->fracchange = 0;   /* with several partitions, use the mean of their fracchange values */
+      for(model = 0; model < partitions->numberOfPartitions; model++)
+        tr->fracchange += partitions->partitionData[model]->fracchange;
+
+      tr->fracchange /= ((double)partitions->numberOfPartitions);
+    }
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier(tr, partitions, PLL_THREAD_COPY_INIT_MODEL);
+#endif
+}
+
+
+
+
diff --git a/pllrepo/src/newick.c b/pllrepo/src/newick.c
new file mode 100644
index 0000000..ceb9653
--- /dev/null
+++ b/pllrepo/src/newick.c
@@ -0,0 +1,583 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file newick.c
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+
+/** @file  newick.c
+
+    @brief Collection of routines for reading and parsing newick trees
+
+    Auxiliary functions for reading and parsing newick tree formats
+*/
+
+
+/** @defgroup newickParseGroup Reading and parsing newick trees
+    
+    This set of functions handles the reading and parsing of newick tree formats
+*/
+/* Reads newick tokens and pushes one pllNewickNodeInfo per node onto *stack; returns 1 on success, 0 on a token-order error, unbalanced parentheses or an unknown token. */
+static int
+parse_newick (pllStack ** stack, int * inp)
+{
+  pllNewickNodeInfo * item = NULL;      /* node being assembled; NULL while nothing is pending */
+  int item_active = 0;                  /* non-zero when a final element must still be pushed after the loop */
+  pllLexToken token;
+  int input;
+  pllLexToken prev_token;
+  int nop = 0;          /* number of open parentheses */
+  int depth = 0;        /* current nesting level: +1 on '(' and -1 on ')' */
+
+  prev_token.tokenType = PLL_TOKEN_UNKNOWN;
+
+  input = *inp;
+
+  NEXT_TOKEN            /* macro defined elsewhere; presumably fills token from the lexer state -- TODO confirm */
+  
+  while (token.tokenType != PLL_TOKEN_EOF && token.tokenType != PLL_TOKEN_UNKNOWN)
+  {
+    switch (token.tokenType)
+     {
+       case PLL_TOKEN_OPAREN:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_OPAREN\n");
+#endif
+        ++nop;
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        ++depth;
+        break;
+
+       case PLL_TOKEN_CPAREN:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_CPAREN\n");
+#endif
+        if (prev_token.tokenType != PLL_TOKEN_CPAREN  &&
+            prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
+            prev_token.tokenType != PLL_TOKEN_STRING  &&
+            prev_token.tokenType != PLL_TOKEN_NUMBER  &&
+            prev_token.tokenType != PLL_TOKEN_FLOAT) return (0);
+        /* NOTE(review): the early returns in this function do not free a pending item -- confirm whether that leak is acceptable */
+        if (!nop) return (0);
+        --nop;
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+
+        /* push to the stack */
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); /* nothing pending since the last push: element gets default values */
+        /* an element without a name read from the input gets the placeholder "INTERNAL_NODE" */
+        if (item->name == NULL) 
+         {
+           item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
+           strcpy (item->name, "INTERNAL_NODE");
+         }
+
+        /* an element without a branch length read from the input gets "0.000000" */
+        if (item->branch == NULL) 
+         {
+           item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
+           strcpy (item->branch, "0.000000");
+         }
+        item->depth = depth;
+        pllStackPush (stack, item);
+        item_active  = 1;       /* active = 1 */
+        item = NULL;
+        --depth;
+        break;
+
+       case PLL_TOKEN_STRING:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_STRING      %.*s\n", token.len, token.lexeme);
+#endif
+        if (prev_token.tokenType != PLL_TOKEN_OPAREN &&
+            prev_token.tokenType != PLL_TOKEN_CPAREN &&
+            prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
+            prev_token.tokenType != PLL_TOKEN_COMMA) return (0);
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
+        item->name = my_strndup (token.lexeme, token.len);
+
+        item_active = 1;
+        item->depth = depth;
+        if (prev_token.tokenType == PLL_TOKEN_COMMA  ||
+            prev_token.tokenType == PLL_TOKEN_OPAREN ||
+            prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1;  /* a name directly after ')' labels an inner node instead */
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        break;
+
+       case PLL_TOKEN_FLOAT:
+       case PLL_TOKEN_NUMBER:
+#ifdef PLLDEBUG
+       if (token.tokenType == PLL_TOKEN_FLOAT) printf ("PLL_TOKEN_FLOAT\n"); else printf ("PLL_TOKEN_NUMBER\n");
+#endif
+         if  (prev_token.tokenType != PLL_TOKEN_OPAREN &&
+              prev_token.tokenType != PLL_TOKEN_CPAREN &&
+              prev_token.tokenType != PLL_TOKEN_COLON  &&
+              prev_token.tokenType != PLL_TOKEN_UNKNOWN &&
+              prev_token.tokenType != PLL_TOKEN_COMMA) return (0);
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
+        if (prev_token.tokenType == PLL_TOKEN_COLON)
+         {
+           item->branch = my_strndup (token.lexeme, token.len);   /* number after ':' is the branch length, kept as text */
+         }
+        else
+         {
+           if (prev_token.tokenType == PLL_TOKEN_COMMA  ||
+               prev_token.tokenType == PLL_TOKEN_OPAREN ||
+               prev_token.tokenType == PLL_TOKEN_UNKNOWN) item->leaf = 1;
+           /* a number that does not follow ':' is stored as the node's name */
+           item->name = my_strndup (token.lexeme, token.len);
+         }
+        item_active = 1;
+        item->depth = depth;
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        break;
+
+       case PLL_TOKEN_COLON:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_COLON\n");
+#endif
+        if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
+            prev_token.tokenType != PLL_TOKEN_STRING &&
+            prev_token.tokenType != PLL_TOKEN_FLOAT  &&
+            prev_token.tokenType != PLL_TOKEN_NUMBER) return (0);
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        break;
+
+       case PLL_TOKEN_COMMA:
+#ifdef PLLDEBUG
+       printf ("PLL_TOKEN_COMMA\n");
+#endif
+        if (prev_token.tokenType != PLL_TOKEN_CPAREN &&
+             prev_token.tokenType != PLL_TOKEN_STRING &&
+             prev_token.tokenType != PLL_TOKEN_FLOAT && 
+             prev_token.tokenType != PLL_TOKEN_NUMBER) return (0);
+        memcpy (&prev_token, &token, sizeof (pllLexToken));
+        
+        /* push to the stack */
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo)); /* nothing pending since the last push: element gets default values */
+        /* same defaults as for ')': placeholder name "INTERNAL_NODE" ... */
+        if (item->name == NULL) 
+         {
+           item->name = (char *) rax_malloc ((strlen("INTERNAL_NODE") + 1) * sizeof (char));
+           strcpy (item->name, "INTERNAL_NODE");
+         }
+        /* ... and branch length "0.000000" */
+        if (item->branch == NULL) 
+         {
+           item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
+           strcpy (item->branch, "0.000000");
+         }
+        item->depth = depth;
+        pllStackPush (stack, item);
+        item_active  = 0;
+        item = NULL;
+        break;
+
+       case PLL_TOKEN_SEMICOLON:
+#ifdef PLLDEBUG
+        printf ("PLL_TOKEN_SEMICOLON\n");
+#endif
+        /* push to the stack */
+        if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
+        /* the element pushed at ';' is named "ROOT_NODE" unless a label was read for it */
+        if (item->name == NULL) 
+         {
+           item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
+           strcpy (item->name, "ROOT_NODE");
+         }
+        /* default branch length "0.000000" when none was read */
+        if (item->branch == NULL) 
+         {
+           item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
+           strcpy (item->branch, "0.000000");
+         }
+        pllStackPush (stack, item);
+        item_active  = 0;
+        item = NULL;
+        break;
+       default:
+#ifdef __DEBUGGING_MODE
+         printf ("Unknown token: %d\n", token.tokenType);
+#endif
+       // TODO: Finish this part and add error codes
+        break;
+     }
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE);   /* macro defined elsewhere; presumably skips whitespace and newline tokens -- TODO confirm */
+  }
+  if (item_active)
+   {
+     if (!item) item = (pllNewickNodeInfo *) rax_calloc (1, sizeof (pllNewickNodeInfo));
+     /* input ended without ';' after a node was started: push the final element, named "ROOT_NODE" by default */
+     if (item->name == NULL) 
+      {
+        item->name = (char *) rax_malloc ((strlen("ROOT_NODE") + 1) * sizeof (char));
+        strcpy (item->name, "ROOT_NODE");
+      }
+     /* default branch length "0.000000" when none was read */
+     if (item->branch == NULL) 
+      {
+        item->branch = (char *) rax_malloc ((strlen("0.000000") + 1) * sizeof (char));
+        strcpy (item->branch, "0.000000");
+      }
+     pllStackPush (stack, item);
+     item_active  = 0;
+   }
+
+  if (nop || token.tokenType == PLL_TOKEN_UNKNOWN)   /* unbalanced '(' or the loop stopped on an unknown token */
+   {
+     return (0);
+   }
+
+  return (1);
+}
+
+#ifdef __DEBUGGING_MODE
+/* Debug helper: follows the next links starting at *stack and prints each
+   element as "name:branch" on its own line, preceded by one tab per
+   depth level of the element. */
+void stack_dump(pllStack ** stack)
+{
+  pllStack * cur;
+
+  for (cur = *stack; cur; cur = cur->next)
+   {
+     pllNewickNodeInfo * node = (pllNewickNodeInfo *) cur->item;
+     int tabs = node->depth;
+
+     while (tabs-- > 0)
+       printf ("\t");
+
+     printf ("%s:%s\n", node->name, node->branch);
+   }
+}
+#endif
+/* Walks the stack once, stores the element and leaf counts in *nodes / *leaves and adds to each non-leaf element's rank the number of children derived from the depth values. */
+static void
+assign_ranks (pllStack * stack, int * nodes, int * leaves)
+{
+  pllStack * head;
+  pllNewickNodeInfo * item, * tmp;
+  pllStack * preorder = NULL;
+  int children;
+  int depth;
+
+  *nodes = *leaves = 0;
+
+  /* preorder collects the non-leaf elements visited so far that have not been popped again */
+  head = stack;
+  while (head)
+  {
+    assert (head->item);
+    item = (pllNewickNodeInfo *) head->item;
+    
+    if (item->leaf)  ++ (*leaves);
+    /* while the current element is shallower than the top of preorder: pop each run of equal-depth entries and add its size to the rank of the entry below */
+    if (preorder)
+     {
+       tmp = (pllNewickNodeInfo *) preorder->item;
+       children = 0;
+       while (item->depth < tmp->depth)
+        {
+          children = 1;
+          depth = tmp->depth;
+          pllStackPop (&preorder);
+          tmp = preorder->item;
+          while (tmp->depth == depth)
+           {
+             ++ children;
+             pllStackPop (&preorder);
+             tmp = (pllNewickNodeInfo *)preorder->item;
+           }
+          tmp->rank += children;
+        }
+     }
+    
+    ++ (*nodes);
+    head = head->next;
+
+    if (item->leaf)
+     {
+       if (!preorder) return;   /* a leaf with no non-leaf element before it: nothing to rank */
+       /* pop the entries at the leaf's own depth and credit them, plus the leaf itself, to the entry below */
+       children = 1;
+       tmp = preorder->item;
+       while (tmp->depth == item->depth)
+        {
+          ++ children;
+          pllStackPop (&preorder);
+          assert (preorder);
+          tmp = (pllNewickNodeInfo *)preorder->item;
+        }
+       tmp->rank += children;
+     }
+    else
+     {
+       pllStackPush (&preorder, item);
+     }
+  }
+  /* NOTE(review): preorder and stack are dereferenced unconditionally below -- presumably never reached with an empty stack; confirm */
+  while (preorder->item != stack->item)
+  {
+    item = (pllNewickNodeInfo *)pllStackPop (&preorder);
+    tmp  = (pllNewickNodeInfo *) preorder->item;
+    children = 1;
+    /* same crediting as above for whatever is left, until only the first stack element remains on preorder */
+    while (tmp->depth == item->depth)
+     {
+       ++ children;
+       item = (pllNewickNodeInfo *) pllStackPop (&preorder);
+       tmp  = (pllNewickNodeInfo *) preorder->item;
+     }
+    tmp->rank += children;
+    children = 0;
+  }
+ assert (preorder->item == stack->item);
+ 
+ pllStackClear (&preorder);
+}
+
+/** @ingroup newickParseGroup
+    @brief Validate if a newick tree is a valid unrooted phylogenetic tree
+
+    The root must have two or three children and every other node must be
+    a leaf or binary (otherwise \b PLL_FALSE is returned, \a errno untouched).
+    Only a ternary root is accepted: the number of nodes must then be
+    \f$ 2l - 2\f$ and different from 4, where \f$l\f$ is the number of
+    leaves; if not, \a errno is set to \b PLL_NEWICK_BAD_STRUCTURE. A binary
+    root is always rejected with \a errno set to \b PLL_NEWICK_ROOTED_TREE
+    (\f$2l - 1\f$ nodes) or \b PLL_NEWICK_BAD_STRUCTURE (any other count).
+
+    @param t
+      Newick tree wrapper structure which contains the stack representation of the parsed newick tree
+
+    @return
+      \b PLL_TRUE if the tree is a valid unrooted tree, otherwise \b PLL_FALSE
+*/
+int
+pllValidateNewick (pllNewickTree * t)
+{
+  pllStack * head;
+  pllNewickNodeInfo * root;
+  pllNewickNodeInfo * item;
+
+  /* nothing to validate without at least a root element */
+  if (!t || !t->tree) return (PLL_FALSE);
+
+  /* the first stack element is the root: it must be binary or ternary */
+  root = t->tree->item;
+  if (root->rank != 2 && root->rank != 3) return (PLL_FALSE);
+
+  /* every other node is either a leaf (rank 0) or a binary inner node */
+  for (head = t->tree->next; head; head = head->next)
+  {
+    item = head->item;
+    if (item->rank != 2 && item->rank != 0) return (PLL_FALSE);
+  }
+
+  if (root->rank == 2)
+   {
+     /* a binary root is never accepted: tell the caller whether the tree
+        is a well-formed rooted tree or structurally broken */
+     if (t->nodes == 2 * t->tips - 1)
+      {
+        errno = PLL_NEWICK_ROOTED_TREE;
+      }
+     else
+      {
+        errno = PLL_NEWICK_BAD_STRUCTURE;
+      }
+     return (PLL_FALSE);
+   }
+
+  /* ternary root: 2l - 2 nodes, and more than a root with three leaves */
+  if ((t->nodes == 2 * t->tips - 2) && t->nodes != 4) return (PLL_TRUE);
+
+  errno = PLL_NEWICK_BAD_STRUCTURE;
+
+  /* a structural error must be reported as failure, matching errno */
+  return (PLL_FALSE);
+}
+
+
+/** @ingroup newickParseGroup
+    @brief Convert a binary rooted tree to a binary unrooted tree
+
+    If the root has two children, its rank is raised to 3 and the stack element right after the root is removed
+    (or, if that element is a leaf, the element following it); the removed node is freed and \a nodes is decremented.
+
+    @param t
+      Newick tree, modified in place
+
+    @return
+      Result of pllValidateNewick() on the tree: \b PLL_TRUE in case of success, otherwise \b PLL_FALSE and \a errno is set
+*/
+int
+pllNewickUnroot (pllNewickTree * t)
+{
+  pllStack ** link;
+  pllStack * tmp;
+  pllNewickNodeInfo * item;
+
+  item = t->tree->item;
+  if (item->rank == 2)
+   {
+     item->rank = 3;
+     t->nodes--;
+
+     /* drop the element right after the root, unless it is a leaf, in
+        which case the element following that leaf is dropped instead */
+     item = t->tree->next->item;
+     link = (item->rank == 0) ? &(t->tree->next->next) : &(t->tree->next);
+     tmp   = *link;
+     *link = tmp->next;
+
+     /* release everything the removed node owns, as pllNewickParseDestroy does */
+     item = tmp->item;
+     rax_free (item->name);
+     rax_free (item->branch);
+     rax_free (item);
+     rax_free (tmp);
+   }
+
+  return (pllValidateNewick (t));
+}
+
+
+/** @ingroup newickParseGroup
+    @brief Parse a newick tree string
+  
+    Parse a newick string and create a stack structure which represents the tree
+    in a preorder traversal form. Each element of the stack represents one node
+    and consists of its name, branch length, number of children and depth. The
+    stack structure is finally wrapped in a \a pllNewickTree structure which
+    also contains the number of nodes and leaves.
+
+    @param newick
+      String containing the newick tree
+
+    @return
+      Pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL (the partially built stack is freed)
+*/
+pllNewickTree *
+pllNewickParseString (const char * newick)
+{
+  int n, input, nodes, leaves;
+  pllNewickTree * t;
+  pllNewickNodeInfo * item;
+
+  t = (pllNewickTree *) rax_calloc (1, sizeof (pllNewickTree));
+  n = strlen (newick);
+  init_lexan (newick, n);
+  input = get_next_symbol();
+
+  if (!parse_newick (&(t->tree), &input))
+   {
+     /* parse error: release the elements that were pushed before it was detected */
+     while ((item = (pllNewickNodeInfo *) pllStackPop (&(t->tree))))
+      {
+        rax_free (item->name);
+        rax_free (item->branch);
+        rax_free (item);
+      }
+     rax_free (t);
+     return (NULL);
+   }
+
+  assign_ranks (t->tree, &nodes, &leaves);
+  t->nodes = nodes;
+  t->tips  = leaves;
+  return (t);
+}
+
+/** @ingroup newickParseGroup
+    @brief Deallocate newick parser stack structure
+
+    Pops every element off the tree stack, frees its name, branch string and node info,
+    then frees the wrapper structure itself and resets the caller's pointer to \b NULL.
+
+    @param t  Address of the pointer to the tree wrapper that is destroyed
+*/
+void pllNewickParseDestroy (pllNewickTree ** t)
+{
+  pllNewickTree * wrapper = *t;
+  pllNewickNodeInfo * node = (pllNewickNodeInfo *) pllStackPop (&(wrapper->tree));
+
+  for (; node; node = (pllNewickNodeInfo *) pllStackPop (&(wrapper->tree)))
+   {
+     rax_free (node->name);
+     rax_free (node->branch);
+     rax_free (node);
+   }
+  rax_free (wrapper);
+  *t = NULL;
+}
+
+/** @ingroup newickParseGroup
+    @brief Parse a newick tree file
+  
+    Reads the whole file with pllReadFile() and hands its contents to
+    pllNewickParseString(), which builds the stack representation of the tree
+    (one element per node, wrapped in a \a pllNewickTree structure together
+    with the number of nodes and leaves). The file buffer is freed before
+    returning. If the file cannot be read, a message is printed to stderr.
+
+    @param filename
+      Filename containing the newick tree
+
+    @return
+      Returns a pointer to the created \a pllNewickTree structure in case of success, otherwise \b NULL
+*/
+pllNewickTree *
+pllNewickParseFile (const char * filename)
+{
+  pllNewickTree * parsed = NULL;
+  char * contents;
+  long size;
+
+  contents = pllReadFile (filename, &size);
+
+  if (contents)
+   {
+     /* the raw text is only needed while the tree is being parsed */
+     parsed = pllNewickParseString (contents);
+     rax_free (contents);
+   }
+  else
+   {
+     fprintf (stderr, "Error while opening/reading file %s\n", filename);
+   }
+
+  return (parsed);
+}
+
diff --git a/pllrepo/src/newick.h b/pllrepo/src/newick.h
new file mode 100644
index 0000000..8810598
--- /dev/null
+++ b/pllrepo/src/newick.h
@@ -0,0 +1,61 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file newick.h
+ */
+#ifndef __pll_NEWICK__
+#define __pll_NEWICK__
+#include "stack.h"
+/** @brief Intermediate structure for storing a newick tree 
+    
+    Holds the structure of a parsed newick tree. The total number of nodes is stored in \a nodes and the number of leaves in \a tips
+*/
+typedef struct
+{
+  int nodes;                    /**< @brief Total number of nodes in the tree == 2*tips - 1 for rooted and 2*tips -2 for unrooted */
+  int tips;                     /**< @brief Number of leaves (tips) in the tree */
+  pllStack * tree;              /**< @brief Parsed tree represented as elements of a stack. Corresponds to placing the postorder traversal of a rooted tree in a pushdown store */
+} pllNewickTree;
+
+
+/** @brief Information describing the parsed newick tree nodes 
+    
+    This structure is placed in the ::pllNewickTree LIFO element pllNewickTree::tree
+    and describes each node of the parsed tree.
+
+    @todo Rename this to something more proper
+*/
+typedef struct
+{
+  int depth;                    /**< @brief Distance of node from root */
+  char * name;                  /**< @brief Name of the taxon represented by the node (in case it is a leaf), otherwise the label of the inner node */
+  char * branch;                /**< @brief Length of branch that leads to its parent, stored as a string */
+  int leaf;                     /**< @brief \b PLL_TRUE if the node is a leaf, otherwise \b PLL_FALSE */
+  int rank;                     /**< @brief Rank of the node, i.e. how many children it has */
+} pllNewickNodeInfo;
+
+
+#endif
diff --git a/pllrepo/src/newviewGenericSpecial.c b/pllrepo/src/newviewGenericSpecial.c
new file mode 100644
index 0000000..e69d7f2
--- /dev/null
+++ b/pllrepo/src/newviewGenericSpecial.c
@@ -0,0 +1,8736 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file newviewGenericSpecial.c
+ *  
+ * @brief Functions that deal (mostly) with conditional likelihood (re)computation
+ */
+
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#ifdef __MIC_NATIVE
+#include "mic_native.h"
+#endif
+
+
+#ifdef __SSE3
+#include <stdint.h>
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+#include "cycle.h"
+
+static void computeTraversalInfo(nodeptr, traversalInfo *, int *, int, int, pllBoolean, recompVectors *, pllBoolean);
+static void makeP(double z1, double z2, double *rptr, double *EI,  double *EIGN, int numberOfCategories, double *left, double *right, pllBoolean saveMem, int maxCat, const int states);
+#if (defined(__SSE3) && !defined(__AVX))
+static void newviewGTRGAMMAPROT_LG4(int tipCase,
+                                    double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                                    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                    int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase,
+                                        double *x1_start, double *x2_start, double *x3_start,
+                                        double *EV, double *tipVector,
+                                        int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                        const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                        unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
+                                        double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn);
+
+static void newviewGTRGAMMA(int tipCase,
+                            double *x1_start, double *x2_start, double *x3_start,
+                            double *EV, double *tipVector,
+                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                            const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling
+                            );
+
+static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
+                           double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                           int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                           int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+
+static void newviewGTRCAT_SAVE( int tipCase,  double *EV,  int *cptr,
+                                double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                                int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
+
+static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase,
+                                            double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                            int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                            unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,  
+                                            double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
+                                            );
+
+static void newviewGTRGAMMAPROT(int tipCase,
+                                double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                                int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+static void newviewGTRCATPROT(int tipCase, double *extEV,
+                              int *cptr,
+                              double *x1, double *x2, double *x3, double *tipVector,
+                              int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                              int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling);
+
+static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV,
+                                   int *cptr,
+                                   double *x1, double *x2, double *x3, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
+
+#endif
+#if (defined(__AVX) || defined(__SSE3))
+static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
+                                  double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
+                                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                  int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+static void newviewGTRGAMMA_BINARY(int tipCase,
+                                   double *x1_start, double *x2_start, double *x3_start,
+                                   double *EV, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+#endif
+
+/* Mask required to compute the absolute values of double precision numbers with SSE3: */
+/* both 64-bit lanes have every bit set except the topmost one (the sign bit of an IEEE-754 double). */
+PLL_ALIGN_BEGIN const union PLL_ALIGN_END
+{
+  uint64_t i[2];   /* integer view; this is the member the initializer below fills */
+  __m128d m;       /* the same 128 bits viewed as an SSE register of two doubles */
+} absMask = {{0x7fffffffffffffffULL , 0x7fffffffffffffffULL }};
+
+
+
+#endif
+
+static int pllGetTransitionMatrixNormal (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer);
+static int pllGetTransitionMatrixLG4 (partitionList * pr, nodeptr p, int model, double * outBuffer);
+
+extern const char binaryStateNames[2];  /**< @brief Alphabet of binary states */
+extern const char dnaStateNames[4];     /**< @brief DNA alphabet  */
+extern const char protStateNames[20];   /**< @brief Amino-acid alphabet */
+extern const unsigned int mask32[32];   /**< @brief Contains the first 32 powers of 2, i.e. 2^0 upto 2^31 */
+/** @brief Fill \a tip[0 .. numStates-1] with one tip code per state: 1 << i for 2 or 4 states, the index i otherwise; later entries stay untouched */
+static void ascertainmentBiasSequence(unsigned char tip[32], int numStates)
+{ 
+  assert(numStates <= 32 && numStates > 1);
+  /* the 2- and 4-state cases use one-hot codes; presumably this matches how tipVector is indexed for those data types -- TODO confirm */
+  switch(numStates)
+    {
+    case 2:     
+      tip[0] = 1;
+      tip[1] = 2;
+      break;
+    case 4:
+      tip[0] = 1;
+      tip[1] = 2;
+      tip[2] = 4;
+      tip[3] = 8;
+      break;
+    default:
+      {
+	int 
+	  i;
+	for(i = 0; i < numStates; i++)
+	  {
+	    tip[i] = i;   /* identity code: the state index itself is the tip code */
+	    //printf("%c ", inverseMeaningPROT[i]);
+	  }
+	//printf("\n");
+      }
+      break;
+    }
+}
+/** @brief Compute \a x3 (numStates values per site) for \a n sites from the two children selected by \a tipCase, using one P matrix per child; bumps \a ex3[i] whenever site i is rescaled */
+static void newviewAscCat(int tipCase,
+			  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+			  int *ex3, 
+			  const int n, double *left, double *right, 			    
+			  const int numStates)
+{
+  double
+    *le, *ri, *v, *vl, *vr,
+    ump_x1, ump_x2, x1px2;
+  
+  int 
+    i, l, j, scale;
+  /* tip[] receives the tip codes from ascertainmentBiasSequence(); only its first numStates entries are set */
+  /* NOTE(review): tip[] is indexed by the site counter i < n below, which assumes n <= numStates -- confirm with caller */
+  unsigned char 
+    tip[32];
+
+  ascertainmentBiasSequence(tip, numStates);
+  
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    le = &left[0];
+	    ri = &right[0];
+	    /* both tip children use the same code tip[i]; x1 and x2 are not read and no scaling is done in this case */
+	    vl = &(tipVector[numStates * tip[i]]);
+	    vr = &(tipVector[numStates * tip[i]]);
+	    v  = &x3[numStates * i];
+
+	    for(l = 0; l < numStates; l++)
+	      v[l] = 0.0;
+
+	    for(l = 0; l < numStates; l++)
+	      {
+		ump_x1 = 0.0;
+		ump_x2 = 0.0;
+
+		for(j = 0; j < numStates; j++)
+		  {
+		    ump_x1 += vl[j] * le[l * numStates + j];
+		    ump_x2 += vr[j] * ri[l * numStates + j];
+		  }
+
+		x1px2 = ump_x1 * ump_x2;
+
+		for(j = 0; j < numStates; j++)
+		  v[j] += x1px2 * extEV[l * numStates + j];
+	      }	    
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    le = &left[0];
+	    ri = &right[0];
+	    /* first child is a tip (code tip[i]); the second child's vector is read from x2 */
+	    vl = &(tipVector[numStates * tip[i]]);
+	    vr = &x2[numStates * i];
+	    v  = &x3[numStates * i];
+
+	    for(l = 0; l < numStates; l++)
+	      v[l] = 0.0;
+
+	    for(l = 0; l < numStates; l++)
+	      {
+		ump_x1 = 0.0;
+		ump_x2 = 0.0;
+
+		for(j = 0; j < numStates; j++)
+		  {
+		    ump_x1 += vl[j] * le[l * numStates + j];
+		    ump_x2 += vr[j] * ri[l * numStates + j];
+		  }
+
+		x1px2 = ump_x1 * ump_x2;
+
+		for(j = 0; j < numStates; j++)
+		  v[j] += x1px2 * extEV[l * numStates + j];
+	      }
+	    /* rescale only when every entry lies strictly between PLL_MINUSMINLIKELIHOOD and PLL_MINLIKELIHOOD */
+	    scale = 1;
+	    for(l = 0; scale && (l < numStates); l++)
+	      scale = ((v[l] < PLL_MINLIKELIHOOD) && (v[l] > PLL_MINUSMINLIKELIHOOD));	    
+
+	    if(scale)
+	      {
+		for(l = 0; l < numStates; l++)
+		  v[l] *= PLL_TWOTOTHE256;
+			
+		ex3[i]  += 1;	      
+	      }
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+	{
+	  le = &left[0];
+	  ri = &right[0];
+	  /* both children's vectors are read from x1 and x2; tip[] is not used in this case */
+	  vl = &x1[numStates * i];
+	  vr = &x2[numStates * i];
+	  v = &x3[numStates * i];
+
+	  for(l = 0; l < numStates; l++)
+	    v[l] = 0.0;
+
+	  for(l = 0; l < numStates; l++)
+	    {
+	      ump_x1 = 0.0;
+	      ump_x2 = 0.0;
+
+	      for(j = 0; j < numStates; j++)
+		{
+		  ump_x1 += vl[j] * le[l * numStates + j];
+		  ump_x2 += vr[j] * ri[l * numStates + j];
+		}
+
+	      x1px2 =  ump_x1 * ump_x2;
+
+	      for(j = 0; j < numStates; j++)
+		v[j] += x1px2 * extEV[l * numStates + j];
+	    }
+
+	   scale = 1;
+	   for(l = 0; scale && (l < numStates); l++)
+	     scale = ((v[l] < PLL_MINLIKELIHOOD) && (v[l] > PLL_MINUSMINLIKELIHOOD));
+	  
+	   if(scale)
+	     {
+	       for(l = 0; l < numStates; l++)
+		 v[l] *= PLL_TWOTOTHE256;
+	      
+	       ex3[i]  += 1;	     
+	     }
+	}
+      break;
+    default:
+      assert(0);
+    }
+  
+ 
+
+}
+
+/** @brief Compute \a x3 for \a n sites with 4 rate categories per site (4 * numStates values per site, one P matrix per category in \a left / \a right); bumps \a ex3[i] whenever site i is rescaled */
+static void newviewAscGamma(int tipCase,
+			    double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+			    int *ex3, 
+			    const int n, double *left, double *right, 			    
+			    const int numStates)
+{
+  
+  int  
+    i, j, l, k, scale;
+  /* the number of rate categories is hard-coded to 4 throughout this function */
+  const int 
+    statesSquare = numStates * numStates,
+    gammaStates = 4 * numStates;
+
+  double 
+    *vl, *vr, al, ar, *v, x1px2;
+
+  unsigned char 
+    tip[32];
+  /* NOTE(review): only tip[0 .. numStates-1] is set, yet tip[] is indexed by the site counter i < n below -- assumes n <= numStates; confirm with caller */
+  ascertainmentBiasSequence(tip, numStates);
+  
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+	for(i = 0; i < n; i++)
+	  {
+	    for(k = 0; k < 4; k++)
+	      {
+		vl = &(tipVector[numStates * tip[i]]);
+		vr = &(tipVector[numStates * tip[i]]);
+		v =  &(x3[gammaStates * i + numStates * k]);
+
+		for(l = 0; l < numStates; l++)
+		  v[l] = 0;
+
+		for(l = 0; l < numStates; l++)
+		  {
+		    al = 0.0;
+		    ar = 0.0;
+		    for(j = 0; j < numStates; j++)
+		      {
+			al += vl[j] * left[k * statesSquare + l * numStates + j];
+			ar += vr[j] * right[k * statesSquare + l * numStates + j];
+		      }
+
+		    x1px2 = al * ar;
+		    for(j = 0; j < numStates; j++)
+		      v[j] += x1px2 * extEV[numStates * l + j];
+		  }
+	      }	    
+	  }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+	for (i = 0; i < n; i++)
+	  {
+	    for(k = 0; k < 4; k++)
+	      {
+		vl = &(tipVector[numStates * tip[i]]);
+		vr = &(x2[gammaStates * i + numStates * k]);
+		v =  &(x3[gammaStates * i + numStates * k]);
+
+		for(l = 0; l < numStates; l++)
+		  v[l] = 0;
+
+		for(l = 0; l < numStates; l++)
+		  {
+		    al = 0.0;
+		    ar = 0.0;
+		    for(j = 0; j < numStates; j++)
+		      {
+			al += vl[j] * left[k * statesSquare + l * numStates + j];
+			ar += vr[j] * right[k * statesSquare + l * numStates + j];
+		      }
+
+		    x1px2 = al * ar;
+		    for(j = 0; j < numStates; j++)
+		      v[j] += x1px2 * extEV[numStates * l + j];
+		  }
+	      }
+	    /* the scaling decision covers all 4 categories of the site at once: every |v[l]| must be below PLL_MINLIKELIHOOD */
+	    v = &x3[gammaStates * i];
+	    scale = 1;
+	    for(l = 0; scale && (l < gammaStates); l++)
+	      scale = (PLL_ABS(v[l]) < PLL_MINLIKELIHOOD);
+
+	    if(scale)
+	      {		
+		for(l = 0; l < gammaStates; l++)
+		  v[l] *= PLL_TWOTOTHE256;
+		
+		ex3[i]  += 1;	      
+	      }
+	  }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+       {
+	 for(k = 0; k < 4; k++)
+	   {
+	     vl = &(x1[gammaStates * i + numStates * k]);
+	     vr = &(x2[gammaStates * i + numStates * k]);
+	     v =  &(x3[gammaStates * i + numStates * k]);
+
+	     for(l = 0; l < numStates; l++)
+	       v[l] = 0;
+
+	     for(l = 0; l < numStates; l++)
+	       {
+		 al = 0.0;
+		 ar = 0.0;
+		 for(j = 0; j < numStates; j++)
+		   {
+		     al += vl[j] * left[k * statesSquare + l * numStates + j];
+		     ar += vr[j] * right[k * statesSquare + l * numStates + j];
+		   }
+
+		 x1px2 = al * ar;
+		 for(j = 0; j < numStates; j++)
+		   v[j] += x1px2 * extEV[numStates * l + j];
+	       }
+	   }
+	 /* same whole-site scaling test as in the PLL_TIP_INNER case */
+	 v = &(x3[gammaStates * i]);
+	 scale = 1;
+	 for(l = 0; scale && (l < gammaStates); l++)
+	   scale = ((PLL_ABS(v[l]) <  PLL_MINLIKELIHOOD));
+
+	 if(scale)
+	   {	    
+	     for(l = 0; l < gammaStates; l++)
+	       v[l] *= PLL_TWOTOTHE256;
+	     
+	     ex3[i]  += 1;	    
+	   }
+       }
+      break;
+    default:
+      assert(0);
+    }  
+}
+
+
+/* generic function for computing the P matrices, for computing the conditional likelihood at a node p, given child nodes q and r 
+   we compute P(z1) and P(z2) here */
+
+/** @brief Computes two P matrices for two edges.
+
+    Generic function for computing the P matrices of two nodes based on their edges. This is used to 
+    (later) compute the conditional likelihood at a node p which has two descendants \a q and \a r, 
+    which in turn have the edges \a z1 and \a z2 that connect them with \a p. Given those edges, we
+    compute two P matrices \a P(z1) and \a P(z2) which are stored in the arrays \a left and \a right.
+ 
+    The following value is computed here: 
+    \f[
+     EI_{jk} \cdot \exp( r_i \cdot EIGN_k \cdot z)
+     \f]
+     to fill up entry k > 0 of row j of the P matrix for rate category i; entry 0 of every row is set to 1.0.
+     
+    @param z1    Branch length leading to left descendant node (let's call it \a q)
+    @param z2    Branch length leading to right descendant node (let's call it \a r)
+    @param rptr  Array of values for rate categories
+    @param EI    Inverse eigenvectors of Q-matrix
+    @param EIGN  Eigenvalues of Q-matrix
+    @param numberOfCategories How many rate heterogeneity categories we have, depending on GAMMA and CAT
+    @param left  Where to store the left P matrix (for node \a q)
+    @param right Where to store the right P matrix (for node \a r)
+    @param saveMem If set to \b PLL_TRUE, memory saving technique is enabled
+    @param maxCat Maximum number of rate categories; with \a saveMem set, an extra matrix is written at this category index
+    @param states Number of states for the particular data (4 for DNA or 20 for AA)
+*/
+static void 
+makeP(double z1, double z2, double *rptr, double *EI,  double *EIGN, int numberOfCategories, double *left, double *right, pllBoolean saveMem, int maxCat, const int states)
+{
+  int  i, j, k, statesSquare = states * states;
+
+  /* assign some space for pre-computing and later re-using values */
+  /* NOTE(review): the four rax_malloc() results are used without a NULL check -- confirm rax_malloc's failure policy */
+  double 
+    *lz1 = (double*)rax_malloc(sizeof(double) * states),
+    *lz2 = (double*)rax_malloc(sizeof(double) * states),
+    *d1 = (double*)rax_malloc(sizeof(double) * states),
+    *d2 = (double*)rax_malloc(sizeof(double) * states);
+
+  /* multiply branch lengths with eigenvalues */
+
+  for(i = 1; i < states; i++)
+  {
+    lz1[i] = EIGN[i] * z1;
+    lz2[i] = EIGN[i] * z2;
+  }
+
+  /* index 0 of lz1/lz2/d1/d2 is never written nor read: column 0 of every row is set to 1.0 below */
+  /* loop over the number of rate categories, this will be 4 for the GAMMA model and 
+     variable for the CAT model */
+
+  for(i = 0; i < numberOfCategories; i++)
+  {
+    /* exponentiate the rate multiplied by the branch */
+
+    for(j = 1; j < states; j++)
+    {
+      d1[j] = exp(rptr[i] * lz1[j]);
+      d2[j] = exp(rptr[i] * lz2[j]);
+
+    }
+
+    /* now fill the P matrices for the two branch length values */
+
+    for(j = 0; j < states; j++)
+    {
+      /* left and right are pre-allocated arrays */
+
+      left[statesSquare * i  + states * j] = 1.0;
+      right[statesSquare * i + states * j] = 1.0;         
+
+      for(k = 1; k < states; k++)
+      {
+        left[statesSquare * i + states * j + k]  = d1[k] * EI[states * j + k];
+        right[statesSquare * i + states * j + k] = d2[k] * EI[states * j + k];
+      }
+    }
+  }
+
+
+  /* if memory saving is enabled and we are using CAT we need to do one additional P matrix 
+     calculation for a rate of 1.0 to compute the entries of a column/tree site comprising only gaps */
+
+  /* the extra matrix goes to category index maxCat, so left/right need room for maxCat + 1 matrices in this mode */
+  if(saveMem)
+  {
+    i = maxCat;
+
+    for(j = 1; j < states; j++)
+    {
+      d1[j] = exp (lz1[j]);
+      d2[j] = exp (lz2[j]);
+    }
+
+    for(j = 0; j < states; j++)
+    {
+      left[statesSquare * i  + states * j] = 1.0;
+      right[statesSquare * i + states * j] = 1.0;
+
+      for(k = 1; k < states; k++)
+      {
+        left[statesSquare * i + states * j + k]  = d1[k] * EI[states * j + k];
+        right[statesSquare * i + states * j + k] = d2[k] * EI[states * j + k];
+      }
+    }
+  }
+
+  /* free the temporary buffers */
+
+  rax_free(lz1);
+  rax_free(lz2);
+  rax_free(d1);
+  rax_free(d2);
+}
+
+
+/** @brief Compute the transition probability matrix for a given branch
+
+    Computes the transition probability matrix for the branch \a p->z and partition \a model given the
+    PLL instance \a tr and list of partitions \a pr. The result is stored in \a outBuffer which must
+    be of sufficient size, i.e. states * states * (numberOfRateCategories + 1) * sizeof(double).
+    @param tr  PLL instance
+    @param pr  List of partitions
+    @param p  Node adjacent to the edge for which the trans. prob. matrix is computed
+    @param model  Partition index for which to take the branch length
+    @param rate  Rate category index; must be >= 0 and below the partition's numberOfCategories (PLL_CAT) or below 4 (otherwise)
+    @param outBuffer Output buffer where to store the transition probability matrix
+    @return \b PLL_FALSE if \a rate is out of range, otherwise the return value of the LG4 or the generic helper
+*/
+int pllGetTransitionMatrix (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer)
+{
+  if (tr->rateHetModel == PLL_CAT)
+   {
+     if (rate < 0 || rate >= pr->partitionData[model]->numberOfCategories) return (PLL_FALSE);
+   }
+  else
+   {
+     if (rate < 0 || rate >= 4) return (PLL_FALSE);
+   }
+
+  if (pr->partitionData[model]->dataType == PLL_AA_DATA &&
+		  (pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X))
+    return (pllGetTransitionMatrixLG4 (pr, p, model, outBuffer));
+  /* the LG4 helper above takes no rate argument and fills the matrices of all four categories; */
+  /* every other partition gets the single matrix for category rate, which is used as an array index there */
+  return (pllGetTransitionMatrixNormal (tr, pr, p, model, rate, outBuffer));
+}
+
+
+/* TODO: Fix this function according to pllGetTransitionMatrixNormal (which takes the log of the clamped branch value and multiplies EI with EV; this one does neither) */
+static int pllGetTransitionMatrixLG4 (partitionList * pr, nodeptr p, int model, double * outBuffer)
+{
+  int
+    i, j, k,
+    states = pr->partitionData[model]->states,
+    numberOfCategories = 4;
+  double
+    d[64],
+    *  rptr = pr->partitionData[model]->gammaRates,
+    ** EI   = pr->partitionData[model]->EI_LG4,
+    ** EIGN = pr->partitionData[model]->EIGN_LG4;
+  /* d[] has 64 slots; the assert below pins states to 20, so indices 1..19 are used */
+  assert (states == 20);
+  /* fills all 4 categories, i.e. 4 * states * states doubles of outBuffer; column 0 of every row is set to 1.0; always returns PLL_TRUE */
+  for (i = 0; i < numberOfCategories; ++i)
+   {
+     for (j = 1; j < states; ++j)
+      {
+        d[j] = exp(rptr[i] * EIGN[i][j] * p->z[model]);
+      }
+     for (j = 0; j < states; ++ j)
+      {
+        outBuffer[states * states * i + states * j] = 1.0;
+        for (k = 1; k < states; ++k) 
+         {
+           outBuffer[states * states * i + states * j + k] = d[k] * EI[i][states * j + k];
+         }
+      }
+   }
+  return (PLL_TRUE);
+}
+/** @brief Write the states x states matrix sum_k d[k] * EI[states*i+k] * EV[states*j+k] for rate category \a rate into \a outBuffer; always returns PLL_TRUE */
+static int pllGetTransitionMatrixNormal (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer)
+{
+  int 
+    i, j, k,
+    /* numberOfCategories, */
+    states = pr->partitionData[model]->states;
+  double
+    * d = (double *)rax_malloc(sizeof(double) * states),
+    * rptr,
+    * EI   = pr->partitionData[model]->EI,
+    * EIGN = pr->partitionData[model]->EIGN,
+    * EV = pr->partitionData[model]->EV;
+  /* log of the branch value; values not above PLL_ZMIN are replaced by PLL_ZMIN first */
+  double lz = (p->z[model] > PLL_ZMIN) ? log(p->z[model]) : log(PLL_ZMIN);                        
+  /* rate table that rate indexes below: perSiteRates under PLL_CAT, gammaRates otherwise */
+  if (tr->rateHetModel == PLL_CAT)
+   {
+     rptr               = pr->partitionData[model]->perSiteRates;
+     /* numberOfCategories = pr->partitionData[model]->numberOfCategories; */
+   }
+  else
+   {
+     rptr               = pr->partitionData[model]->gammaRates;
+     /* numberOfCategories = 4; */
+   }
+  /* only the first states * states entries of outBuffer are cleared and filled */
+  for (i = 0; i < states * states; ++ i) outBuffer[i] = 0;
+  /* d[0] is fixed to 1.0; the remaining entries are exp(rate * eigenvalue * lz) */
+  d[0] = 1.0;
+  for (j = 1; j < states; ++ j)
+   {
+     d[j] = exp(rptr[rate] * EIGN[j] * lz);
+   }
+
+  for (i = 0; i < states; ++ i)
+   {
+     for (j = 0; j < states; ++ j)
+      {
+        for (k = 0; k < states; ++ k)
+         {
+           outBuffer[states * i + j] += (d[k] * EI[states * i + k] * EV[states * j + k]);
+         }
+      }
+   }
+  /* NOTE(review): this check only runs after outBuffer has been written; the memory-saving variant below is not implemented */
+  assert (!tr->saveMemory);
+  // TODO: Fix the following snippet
+  //if (tr->saveMemory)
+  // {
+  //   i = tr->maxCategories;
+  //   
+  //   for (j = 1; j < states; ++j)
+  //    {
+  //      d[j] = EXP(EIGN[j] * p->z[model]);
+  //    }
+
+  //   for (j = 0; j < states; ++j)
+  //    {
+  //      outBuffer[states * states * i + states * j] = 1.0;
+  //      for (k = 1; k < states; ++k)
+  //       {
+  //         outBuffer[states * states * i + states * j + k] = d[k] * EI[states * j + k];
+  //       }
+  //    }
+  // }
+
+  rax_free(d);
+
+  return (PLL_TRUE);
+}
+
+
+/** @brief Compute two P matrices for two edges for the LG4 model
+    
+    Computing the P matrices of two nodes based on their edges for the LG4 model. This is used to 
+    (later) compute the conditional likelihood at a node p which has two descendants \a q and \a r, 
+    which in turn have the edges \a z1 and \a z2 that connect them with \a p. Given those edges, we
+    compute two P matrices \a P(z1) and \a P(z2) which are stored in the arrays \a left and \a right.
+
+    @param z1
+      Branch length leading to left descendant node (let's call it \a q)
+     
+    @param z2
+      Branch length leading to right descendant node (let's call it \a r)
+
+    @param rptr
+      Array of values for rate categories
+
+    @param EI
+      Inverse eigenvectors of 4 Q-matrices
+     
+    @param EIGN
+      Eigenvalues of the 4 Q-matrices
+
+    @param numberOfCategories
+      How many rate heterogeneity categories we have; category i uses its own EI[i] and EIGN[i]
+     
+    @param left
+      Where to store the left P matrix (for node \a q)
+     
+    @param right
+      Where to store the right P matrix (for node \a r)
+
+    @param numStates
+      Number of states for the particular data (4 for DNA or 20 for AA)
+
+    @todo
+      Present the maths here as in ::makeP
+
+*/
+static void makeP_FlexLG4(double z1, double z2, double *rptr, double *EI[4],  double *EIGN[4], int numberOfCategories, double *left, double *right, const int numStates)
+{
+  int 
+    i,
+    j,
+    k;
+  
+  const int
+    statesSquare = numStates * numStates;
+
+  double    
+    d1[64],  
+    d2[64];
+  /* d1/d2 are fixed 64-entry stack buffers, hence the bound asserted below; index 0 of both is never used */
+  assert(numStates <= 64);
+       
+  for(i = 0; i < numberOfCategories; i++)
+    {
+      for(j = 1; j < numStates; j++)
+        {
+          d1[j] = exp (rptr[i] * EIGN[i][j] * z1);
+          d2[j] = exp (rptr[i] * EIGN[i][j] * z2);
+        }
+
+      for(j = 0; j < numStates; j++)
+        {
+          left[statesSquare * i  + numStates * j] = 1.0;
+          right[statesSquare * i + numStates * j] = 1.0;
+
+          for(k = 1; k < numStates; k++)
+            {
+              left[statesSquare * i + numStates * j + k]  = d1[k] * EI[i][numStates * j + k];
+              right[statesSquare * i + numStates * j + k] = d2[k] * EI[i][numStates * j + k];
+            }
+        }
+    }  
+}
+
+#if (!defined(__AVX) && !defined(__SSE3))
+
+/** @brief Computation of conditional likelihood arrays for CAT
+ 
+    This is a generic, slow but readable function implementation for computing the 
+     conditional likelihood arrays at p, given child nodes q and r using the CAT
+     model of rate heterogeneity. Depending on whether \a q, resp. \a r, are tips or internal
+     nodes (indicated by \a tipCase) the conditional likelihoods are computed based on
+     \a x1 if \a q is an inner node or \a tipX1 if it is a tip, resp. \a x2 if \a r
+     is an inner node or \a tipX2 if it is a tip. Output array \a ex3 stores the
+     number of times the likelihood of each site for each internal node has been scaled.
+     The conditional likelihood vectors for any possible base-pair (which is useful when
+     \a q or \a r are tips) has been already precomputed from the eigenvalues of the Q
+     matrix in the array \a tipVector. In case the conditional likelihood for a particular
+     site is very small in terms of a floating point number, then it is multiplied by a
+     very large number (scaling), and the number of times it has been scaled (per node) is
+     stored in the array \a ex3, if \a fastScaling is set to \b PLL_FALSE. Otherwise, the
+     total number of scalings for all sites and all nodes is stored in a single variable
+     \a scalerIncrement.
+
+    @param tipCase
+      Can be either \b PLL_TIP_TIP, or \b PLL_TIP_INNER or \b PLL_INNER_INNER, and describes the
+      descendants of the node for which we currently compute the conditional likelihood
+      vector, i.e. whether they are both tips (leaves), or one is tip and the other
+      an inner node, or both are inner nodes.
+
+    @param extEV
+      Eigenvectors of Q matrix
+      
+    @param cptr
+      Array where the rate for each site in the compressed partition alignment is stored
+
+    @param x1
+      Conditional likelihood vectors of the first child node, in case it is an internal node
+
+    @param x2
+      Conditional likelihood vectors of the second child node, in case it is an internal node
+
+    @param x3
+      Pointer to where the computed conditional likelihood vector of node \a p will be stored
+
+    @param tipVector
+      Vector containing sums of left eigenvectors for likelihood computation at tips.
+
+    @param ex3
+      Pointer to an array whose elements correspond to the number of times the likelihood of
+      a particular site of a particular internal node is scaled. Those elements are incremented
+      at every scaling operation and only if \a fastScaling flag is set to \b PLL_FALSE. This 
+      array will be used later when evaluating the likelihood of the whole tree.
+
+    @param tipX1
+      Pointer to the alignment data (sequence) of first child node, in case it is a tip
+
+    @param tipX2
+      Pointer to the alignment data (sequence) of second child node, in case it is a tip
+
+    @param n
+      Number of sites for which we are doing the evaluation. For the single-thread version this is the number of sites in the
+      current partition, for multi-threads this is the number of sites assigned to the running thread from the current partition.
+
+    @param left
+      Pointer to the P matrix of the left child
+
+    @param right
+      Pointer to the P matrix of the right child
+
+    @param wgt
+      Array of weights for each site
+
+    @param scalerIncrement
+      Where to store the number of scalings carried out in case \a fastScaling is set to \b PLL_TRUE.
+
+    @param fastScaling
+      If set to \b PLL_TRUE, only the total number of scalings for all sites of the partition will be
+      stored in \a scalerIncrement, otherwise per-site scalings are stored in the array \a ex3. 
+
+    @param states
+      Number of states for the particular data (4 for DNA or 20 for AA)
+ */
+static void newviewCAT_FLEX(int tipCase, double *extEV,
+                            int *cptr,
+                            double *x1, double *x2, double *x3, double *tipVector,
+                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                            int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling, const int states)
+{
+  double
+    *le, 
+    *ri, 
+    *v, 
+    *vl, 
+    *vr,
+    ump_x1, 
+    ump_x2, 
+    x1px2;
+
+  int 
+    i, 
+    l, 
+    j, 
+    scale, 
+    addScale = 0;
+
+  const int 
+    statesSquare = states * states;
+
+
+  /* here we switch over the different cases for efficiency, but also because 
+     each case accesses different data types.
+
+     We consider three cases: either q and r are both tips, q or r are tips, and q and r are inner 
+     nodes.
+     */
+
+
+  switch(tipCase)
+  {
+
+    /* both child nodes of p weher we want to update the conditional likelihood are tips */
+    case PLL_TIP_TIP:     
+      /* loop over sites */
+      for (i = 0; i < n; i++)
+      {
+        /* set a pointer to the P-Matrices for the rate category of this site */
+        le = &left[cptr[i] * statesSquare];
+        ri = &right[cptr[i] * statesSquare];
+
+        /* pointers to the likelihood entries of the tips q (vl) and r (vr) 
+           We will do reading accesses to these values only.
+           */
+        vl = &(tipVector[states * tipX1[i]]);
+        vr = &(tipVector[states * tipX2[i]]);
+
+        /* address of the conditional likelihood array entres at site i. This is 
+           a writing access to v */
+        v  = &x3[states * i];
+
+        /* initialize v */
+        for(l = 0; l < states; l++)
+          v[l] = 0.0;
+
+        /* loop over states to compute the cond likelihoods at p (v) */
+
+        for(l = 0; l < states; l++)
+        {             
+          ump_x1 = 0.0;
+          ump_x2 = 0.0;
+
+          /* le and ri are the P-matrices */
+
+          for(j = 0; j < states; j++)
+          {
+            ump_x1 += vl[j] * le[l * states + j];
+            ump_x2 += vr[j] * ri[l * states + j];
+          }
+
+          x1px2 = ump_x1 * ump_x2;
+
+          /* multiply with matrix of eigenvectors extEV */
+
+          for(j = 0; j < states; j++)
+            v[j] += x1px2 * extEV[l * states + j];
+        }          
+      }    
+      break;
+    case PLL_TIP_INNER:      
+
+      /* same as above, only that now vl is a tip and vr is the conditional probability vector 
+         at an inner node. Note that, if we have the case that either q or r is a tip, the 
+         nodes will be flipped to ensure that tipX1 always points to the sequence at the tip.
+         */
+
+      for (i = 0; i < n; i++)
+      {
+        le = &left[cptr[i] * statesSquare];
+        ri = &right[cptr[i] * statesSquare];
+
+        /* access tip vector lookup table */
+        vl = &(tipVector[states * tipX1[i]]);
+
+        /* access conditional likelihood arrays */
+        /* again, vl and vr are reading accesses, while v is a writing access */
+        vr = &x2[states * i];
+        v  = &x3[states * i];
+
+        /* same as in the loop above */
+
+        for(l = 0; l < states; l++)
+          v[l] = 0.0;
+
+        for(l = 0; l < states; l++)
+        {
+          ump_x1 = 0.0;
+          ump_x2 = 0.0;
+
+          for(j = 0; j < states; j++)
+          {
+            ump_x1 += vl[j] * le[l * states + j];
+            ump_x2 += vr[j] * ri[l * states + j];
+          }
+
+          x1px2 = ump_x1 * ump_x2;
+
+          for(j = 0; j < states; j++)
+            v[j] += x1px2 * extEV[l * states + j];
+        }
+
+        /* now let's check for numerical scaling. 
+           The maths in RAxML are a bit non-standard to avoid/economize on arithmetic operations 
+           at the virtual root and for branch length optimization and hence values stored 
+           in the conditional likelihood vectors can become negative.
+           Below we check if all absolute values stored at position i of v are smaller 
+           than a pre-defined value in pll.h. If they are all smaller we can then safely 
+           multiply them by a large, constant number PLL_TWOTOTHE256 (without numerical overflow) 
+           that is also specified in pll.h */
+
+        scale = 1;
+        for(l = 0; scale && (l < states); l++)
+          scale = ((v[l] < PLL_MINLIKELIHOOD) && (v[l] > PLL_MINUSMINLIKELIHOOD));         
+
+        if(scale)
+        {
+          for(l = 0; l < states; l++)
+            v[l] *= PLL_TWOTOTHE256;
+
+          /* if we have scaled the entries to prevent underflow, we need to keep track of how many scaling 
+             multiplications we did per node such as to undo them at the virtual root, e.g., in 
+             evaluateGeneric() 
+             Note here that, if we scaled the site, we need to increment the scaling counter by the weight, i.e., 
+             the number of sites this potentially compressed pattern represents! */ 
+
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];   
+          
+        }
+      }   
+      break;
+    case PLL_INNER_INNER:
+
+      /* same as above, only that the two child nodes q and r are now inner nodes */
+
+      for(i = 0; i < n; i++)
+      {
+        le = &left[cptr[i] * statesSquare];
+        ri = &right[cptr[i] * statesSquare];
+
+        /* index conditional likelihood vectors of inner nodes */
+
+        vl = &x1[states * i];
+        vr = &x2[states * i];
+        v = &x3[states * i];
+
+        for(l = 0; l < states; l++)
+          v[l] = 0.0;
+
+        for(l = 0; l < states; l++)
+        {
+          ump_x1 = 0.0;
+          ump_x2 = 0.0;
+
+          for(j = 0; j < states; j++)
+          {
+            ump_x1 += vl[j] * le[l * states + j];
+            ump_x2 += vr[j] * ri[l * states + j];
+          }
+
+          x1px2 =  ump_x1 * ump_x2;
+
+          for(j = 0; j < states; j++)
+            v[j] += x1px2 * extEV[l * states + j];            
+        }
+
+        scale = 1;
+        for(l = 0; scale && (l < states); l++)
+          scale = ((v[l] < PLL_MINLIKELIHOOD) && (v[l] > PLL_MINUSMINLIKELIHOOD));
+
+        if(scale)
+        {
+          for(l = 0; l < states; l++)
+            v[l] *= PLL_TWOTOTHE256;
+          
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];    
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  /* increment the scaling counter by the additional scalings done at node p */
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+/** @brief Computation of conditional likelihood arrays for \b GAMMA
+ 
+    This is a generic, slow but readable function implementation for computing the 
+     conditional likelihood arrays at \a p, given child nodes \a q and \a r using the \b GAMMA
+     model of rate heterogeneity. Depending on whether \a q, resp. \a r, are tips or internal
+     nodes (indicated by \a tipCase) the conditional likelihoods are computed based on
+     \a x1 if \a q is an inner node or \a tipX1 if it is a tip, resp. \a x2 if \a r
+     is an inner node or \a tipX2 if it is a tip. Output array \a ex3 stores the
+     number of times the likelihood of each site for each internal node has been scaled.
+     The conditional likelihood vectors for any possible base-pair (which is useful when
+     \a q or \a r are tips) has been already precomputed from the eigenvalues of the Q
+     matrix in the array \a tipVector. In case the conditional likelihood for a particular
+     site is very small in terms of a floating point number, then it is multiplied by a
+     very large number (scaling), and then number of times it has been scaled (per node) is
+     stored in the array \a ex3, if \a fastScaling is set to \b PLL_FALSE. Otherwise, the
+     total number of scalings for all sites and all nodes is stored in a single variable
+     \a scalerIncrement.
+
+    @param tipCase
+      Can be either \b PLL_TIP_TIP, or \b PLL_TIP_INNER or \b PLL_INNER_INNER, and describes the
+      descendants of the node for which we currently compute the conditional likelihood
+      vector, i.e. whether they are both tips (leaves), or one is tip and the other
+      an inner node, or both are inner nodes.
+
+    @param x1
+      Conditional likelihood vectors of the first child node, in case it is an internal node
+
+    @param x2
+      Conditional likelihood vectors of the second child node, in case it is an internal node
+
+    @param x3
+      Pointer to where the computed conditional likelihood vector of node \a p will be stored
+
+    @param extEV
+      Eigenvectors of Q matrix
+
+    @param tipVector
+      Vector containing sums of left eigenvectors for likelihood computation at tips.
+
+    @param ex3
+      Pointer to an array whose elements correspond to the number of times the likelihood of
+      a particular site of a particular internal node is scaled. Those elements are incremented
+      at every scaling operation and only if \a fastScaling flag is set to \b PLL_FALSE. This 
+      array will be used later when evaluating the likelihood of the whole tree.
+
+    @param tipX1
+      Pointer to the alignment data (sequence) of first child node, in case it is a tip
+
+    @param tipX2
+      Pointer to the alignment data (sequence) of second child node, in case it is a tip
+
+    @param n
+      Number of sites to be processed
+
+    @param left
+      Pointer to the P matrix of the left child
+
+    @param right
+      Pointer to the P matrix of the right child
+
+    @param wgt
+      Array of weights for each site
+
+    @param scalerIncrement
+      Where to store the number of scalings carried out in case \a fastScaling is set to \b PLL_TRUE.
+
+    @param fastScaling
+      If set to \b PLL_TRUE, only the total number of scalings for all sites of the partition will be
+      stored in \a scalerIncrement, otherwise per-site scalings are stored in the array \a ex3. 
+
+    @param states
+      Number of states for the particular data (4 for DNA or 20 for AA)
+
+    @param maxStateValue
+      Number of all possible base-pairs including degenerate characters, i.e. 16 for  DNA and 23 for AA
+ */
+static void newviewGAMMA_FLEX(int tipCase,
+                              double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                              int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                              int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling, const int states, const int maxStateValue)
+{
+  /* Per-site layout: every site owns `span` consecutive doubles, i.e. four
+     blocks (one per discrete GAMMA rate) of `states` entries each. The lookup
+     tables built for tips hold one such span for each of the maxStateValue
+     tip codes, hence `tableLength`. */
+  const int
+    statesSquare = states * states,
+    span         = states * 4,
+    tableLength  = maxStateValue * span;
+
+  double
+    product;
+
+  int
+    site,
+    code,
+    rate,
+    row,
+    col,
+    pos,
+    scaledTotal = 0;
+
+  if(tipCase == PLL_TIP_TIP)
+  {
+    /* Both children are tips: tabulate, for every possible tip code, the
+       product of its tip vector with all rows of the left and of the right
+       P matrices, so that the per-site work reduces to table lookups. */
+    double
+      *leftTable  = (double*)rax_malloc(sizeof(double) * tableLength),
+      *rightTable = (double*)rax_malloc(sizeof(double) * tableLength);
+
+    for(code = 0; code < maxStateValue; code++)
+    {
+      const double
+        *tip = &(tipVector[states * code]);
+      double
+        *outL = &leftTable[span * code],
+        *outR = &rightTable[span * code];
+
+      for(row = 0; row < span; row++)
+      {
+        outL[row] = 0.0;
+        outR[row] = 0.0;
+
+        for(col = 0; col < states; col++)
+        {
+          outL[row] += tip[col] * left[row * states + col];
+          outR[row] += tip[col] * right[row * states + col];
+        }
+      }
+    }
+
+    for(site = 0; site < n; site++)
+    {
+      /* the raw tip characters select the table rows for this site */
+      const double
+        *lookupL = &leftTable[span * tipX1[site]],
+        *lookupR = &rightTable[span * tipX2[site]];
+
+      /* one block of `states` entries per discrete GAMMA rate */
+      for(rate = 0; rate < 4; rate++)
+      {
+        double
+          *out = &x3[site * span + rate * states];
+
+        for(col = 0; col < states; col++)
+          out[col] = 0.0;
+
+        for(row = 0; row < states; row++)
+        {
+          product = lookupL[rate * states + row] * lookupR[rate * states + row];
+
+          /* accumulate the product weighted by row `row` of extEV */
+          for(col = 0; col < states; col++)
+            out[col] += product * extEV[states * row + col];
+        }
+      }
+    }
+
+    /* no scaling is performed in the tip/tip case */
+
+    rax_free(leftTable);
+    rax_free(rightTable);
+  }
+  else if(tipCase == PLL_TIP_INNER)
+  {
+    /* The first child is a tip, the second one an inner node: only the left
+       side can be tabulated; the right side needs a scratch vector of
+       `states` entries that is refilled for every site and rate. */
+    double
+      *leftTable = (double*)rax_malloc(sizeof(double) * tableLength),
+      *rightSums = (double*)rax_malloc(sizeof(double) * states);
+
+    for(code = 0; code < maxStateValue; code++)
+    {
+      const double
+        *tip = &(tipVector[states * code]);
+      double
+        *outL = &leftTable[span * code];
+
+      for(row = 0; row < span; row++)
+      {
+        outL[row] = 0.0;
+
+        for(col = 0; col < states; col++)
+          outL[row] += tip[col] * left[row * states + col];
+      }
+    }
+
+    for(site = 0; site < n; site++)
+    {
+      const double
+        *lookupL = &leftTable[span * tipX1[site]];
+      double
+        *siteOut = &x3[span * site];
+
+      for(rate = 0; rate < 4; rate++)
+      {
+        const double
+          *child  = &(x2[span * site + rate * states]),
+          *pRight = &right[rate * statesSquare];
+        double
+          *out = &siteOut[states * rate];
+
+        /* right P matrix of this rate times the inner child's vector */
+        for(row = 0; row < states; row++)
+        {
+          rightSums[row] = 0.0;
+
+          for(col = 0; col < states; col++)
+            rightSums[row] += child[col] * pRight[row * states + col];
+        }
+
+        for(col = 0; col < states; col++)
+          out[col] = 0;
+
+        for(row = 0; row < states; row++)
+        {
+          product = lookupL[rate * states + row] * rightSums[row];
+
+          for(col = 0; col < states; col++)
+            out[col] += product * extEV[row * states + col];
+        }
+      }
+
+      /* Numerical scaling: the site is scaled only if ALL `span` entries
+         (all four rates) are below PLL_MINLIKELIHOOD in absolute value.
+         The scan stops at the first entry that is not. */
+      pos = 0;
+      while((pos < span) && (PLL_ABS(siteOut[pos]) < PLL_MINLIKELIHOOD))
+        pos++;
+
+      if(pos == span)
+      {
+        for(pos = 0; pos < span; pos++)
+          siteOut[pos] *= PLL_TWOTOTHE256;
+
+        /* record the scaling either globally (weighted by the site's
+           weight) or per site */
+        if(fastScaling)
+          scaledTotal += wgt[site];
+        else
+          ex3[site] += 1;
+      }
+    }
+
+    rax_free(leftTable);
+    rax_free(rightSums);
+  }
+  else if(tipCase == PLL_INNER_INNER)
+  {
+    /* Both children are inner nodes: no tables, both sides are computed
+       from the children's conditional likelihood vectors. */
+    for(site = 0; site < n; site++)
+    {
+      double
+        *siteOut = &(x3[span * site]);
+
+      for(rate = 0; rate < 4; rate++)
+      {
+        const double
+          *childL = &(x1[span * site + states * rate]),
+          *childR = &(x2[span * site + states * rate]),
+          *pLeft  = &left[rate * statesSquare],
+          *pRight = &right[rate * statesSquare];
+        double
+          *out = &siteOut[states * rate];
+
+        for(col = 0; col < states; col++)
+          out[col] = 0;
+
+        for(row = 0; row < states; row++)
+        {
+          double
+            sumL = 0.0,
+            sumR = 0.0;
+
+          for(col = 0; col < states; col++)
+          {
+            sumL += childL[col] * pLeft[row * states + col];
+            sumR += childR[col] * pRight[row * states + col];
+          }
+
+          product = sumL * sumR;
+
+          for(col = 0; col < states; col++)
+            out[col] += product * extEV[states * row + col];
+        }
+      }
+
+      /* same all-entries-small scaling test as in the tip/inner case */
+      pos = 0;
+      while((pos < span) && (PLL_ABS(siteOut[pos]) < PLL_MINLIKELIHOOD))
+        pos++;
+
+      if(pos == span)
+      {
+        for(pos = 0; pos < span; pos++)
+          siteOut[pos] *= PLL_TWOTOTHE256;
+
+        if(fastScaling)
+          scaledTotal += wgt[site];
+        else
+          ex3[site] += 1;
+      }
+    }
+  }
+  else
+    assert(0);
+
+  /* with fast scaling the caller receives the weighted number of scaling
+     multiplications carried out at this node */
+
+  if(fastScaling)
+    *scalerIncrement = scaledTotal;
+}
+
+
+/* Candidate for deletion */
+/*
+static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
+                           double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
+                           int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                           int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *le,
+    *ri,
+    *x1, *x2, *x3;
+  double
+    ump_x1, ump_x2, x1px2[4];
+  int i, j, k, scale, addScale = 0;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+        for (i = 0; i < n; i++)
+          {
+            x1 = &(tipVector[4 * tipX1[i]]);
+            x2 = &(tipVector[4 * tipX2[i]]);
+            x3 = &x3_start[4 * i];
+
+            le =  &left[cptr[i] * 16];
+            ri =  &right[cptr[i] * 16];
+
+            for(j = 0; j < 4; j++)
+              {
+                ump_x1 = 0.0;
+                ump_x2 = 0.0;
+                for(k = 0; k < 4; k++)
+                  {
+                    ump_x1 += x1[k] * le[j * 4 + k];
+                    ump_x2 += x2[k] * ri[j * 4 + k];
+                  }
+                x1px2[j] = ump_x1 * ump_x2;
+              }
+
+            for(j = 0; j < 4; j++)
+              x3[j] = 0.0;
+
+            for(j = 0; j < 4; j++)
+              for(k = 0; k < 4; k++)
+                x3[k] += x1px2[j] * EV[j * 4 + k];          
+          }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        for (i = 0; i < n; i++)
+          {
+            x1 = &(tipVector[4 * tipX1[i]]);
+            x2 = &x2_start[4 * i];
+            x3 = &x3_start[4 * i];
+
+            le =  &left[cptr[i] * 16];
+            ri =  &right[cptr[i] * 16];
+
+            for(j = 0; j < 4; j++)
+              {
+                ump_x1 = 0.0;
+                ump_x2 = 0.0;
+                for(k = 0; k < 4; k++)
+                  {
+                    ump_x1 += x1[k] * le[j * 4 + k];
+                    ump_x2 += x2[k] * ri[j * 4 + k];
+                  }
+                x1px2[j] = ump_x1 * ump_x2;
+              }
+
+            for(j = 0; j < 4; j++)
+              x3[j] = 0.0;
+
+            for(j = 0; j < 4; j++)
+              for(k = 0; k < 4; k++)
+                x3[k] +=  x1px2[j] *  EV[4 * j + k];       
+
+            scale = 1;
+            for(j = 0; j < 4 && scale; j++)
+              scale = (x3[j] < PLL_MINLIKELIHOOD && x3[j] > PLL_MINUSMINLIKELIHOOD);               
+                    
+            if(scale)
+              {             
+                for(j = 0; j < 4; j++)
+                  x3[j] *= PLL_TWOTOTHE256;
+                
+                if(useFastScaling)
+                  addScale += wgt[i];
+                else
+                  ex3[i]  += 1;         
+              }      
+          }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+        {
+          x1 = &x1_start[4 * i];
+          x2 = &x2_start[4 * i];
+          x3 = &x3_start[4 * i];
+
+          le = &left[cptr[i] * 16];
+          ri = &right[cptr[i] * 16];
+
+          for(j = 0; j < 4; j++)
+            {
+              ump_x1 = 0.0;
+              ump_x2 = 0.0;
+              for(k = 0; k < 4; k++)
+                {
+                  ump_x1 += x1[k] * le[j * 4 + k];
+                  ump_x2 += x2[k] * ri[j * 4 + k];
+                }
+              x1px2[j] = ump_x1 * ump_x2;
+            }
+
+          for(j = 0; j < 4; j++)
+            x3[j] = 0.0;
+
+          for(j = 0; j < 4; j++)
+            for(k = 0; k < 4; k++)
+              x3[k] +=  x1px2[j] *  EV[4 * j + k];
+        
+          scale = 1;
+          for(j = 0; j < 4 && scale; j++)
+            scale = (x3[j] < PLL_MINLIKELIHOOD && x3[j] > PLL_MINUSMINLIKELIHOOD);
+
+          if(scale)
+            {               
+              for(j = 0; j < 4; j++)
+                x3[j] *= PLL_TWOTOTHE256;
+              
+              if(useFastScaling)
+                addScale += wgt[i];
+              else
+                ex3[i]  += 1;           
+            }     
+        }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+
+}
+*/
+#if 0
+/* Generic (non-vectorized) conditional likelihood computation for two-state
+   data with four discrete GAMMA rates: every site occupies 8 doubles
+   (4 rates x 2 states). This definition sits inside an "#if 0" region. */
+static void newviewGTRGAMMA_BINARY(int tipCase,
+                                   double *x1_start, double *x2_start, double *x3_start,
+                                   double *EV, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling
+                                   )
+{
+  double
+    *leftIn,
+    *rightIn,
+    *out,
+    sumL,
+    sumR,
+    product[2];
+
+  int
+    site,
+    rate,
+    row,
+    col,
+    pos,
+    scaledTotal = 0;
+
+  if(tipCase == PLL_TIP_TIP)
+  {
+    /* both children are tips: both inputs come from the tip vector table */
+    for(site = 0; site < n; site++)
+    {
+      leftIn  = &(tipVector[2 * tipX1[site]]);
+      rightIn = &(tipVector[2 * tipX2[site]]);
+      out     = &x3_start[site * 8];
+
+      for(pos = 0; pos < 8; pos++)
+        out[pos] = 0.0;
+
+      for(rate = 0; rate < 4; rate++)
+      {
+        /* each rate has its own 2x2 P matrix (4 doubles) */
+        for(row = 0; row < 2; row++)
+        {
+          sumL = 0.0;
+          sumR = 0.0;
+
+          for(col = 0; col < 2; col++)
+          {
+            sumL += leftIn[col]  * left[rate * 4 + row * 2 + col];
+            sumR += rightIn[col] * right[rate * 4 + row * 2 + col];
+          }
+
+          product[row] = sumL * sumR;
+        }
+
+        for(row = 0; row < 2; row++)
+          for(col = 0; col < 2; col++)
+            out[rate * 2 + col] += product[row] * EV[2 * row + col];
+      }
+    }
+  }
+  else if(tipCase == PLL_TIP_INNER)
+  {
+    /* the first child is a tip, the second one is read from x2_start */
+    for(site = 0; site < n; site++)
+    {
+      leftIn  = &(tipVector[2 * tipX1[site]]);
+      rightIn = &x2_start[site * 8];
+      out     = &x3_start[site * 8];
+
+      for(pos = 0; pos < 8; pos++)
+        out[pos] = 0.0;
+
+      for(rate = 0; rate < 4; rate++)
+      {
+        for(row = 0; row < 2; row++)
+        {
+          sumL = 0.0;
+          sumR = 0.0;
+
+          for(col = 0; col < 2; col++)
+          {
+            sumL += leftIn[col]             * left[rate * 4 + row * 2 + col];
+            sumR += rightIn[rate * 2 + col] * right[rate * 4 + row * 2 + col];
+          }
+
+          product[row] = sumL * sumR;
+        }
+
+        for(row = 0; row < 2; row++)
+          for(col = 0; col < 2; col++)
+            out[rate * 2 + col] += product[row] * EV[2 * row + col];
+      }
+
+      /* scale only if all 8 entries are small in absolute value */
+      pos = 0;
+      while((pos < 8) && (PLL_ABS(out[pos]) < PLL_MINLIKELIHOOD))
+        pos++;
+
+      if(pos == 8)
+      {
+        for(pos = 0; pos < 8; pos++)
+          out[pos] *= PLL_TWOTOTHE256;
+
+        if(useFastScaling)
+          scaledTotal += wgt[site];
+        else
+          ex3[site] += 1;
+      }
+    }
+  }
+  else if(tipCase == PLL_INNER_INNER)
+  {
+    /* both children are inner nodes: no tip lookups at all */
+    for(site = 0; site < n; site++)
+    {
+      leftIn  = &x1_start[site * 8];
+      rightIn = &x2_start[site * 8];
+      out     = &x3_start[site * 8];
+
+      for(pos = 0; pos < 8; pos++)
+        out[pos] = 0.0;
+
+      for(rate = 0; rate < 4; rate++)
+      {
+        for(row = 0; row < 2; row++)
+        {
+          sumL = 0.0;
+          sumR = 0.0;
+
+          for(col = 0; col < 2; col++)
+          {
+            sumL += leftIn[rate * 2 + col]  * left[rate * 4 + row * 2 + col];
+            sumR += rightIn[rate * 2 + col] * right[rate * 4 + row * 2 + col];
+          }
+
+          product[row] = sumL * sumR;
+        }
+
+        for(row = 0; row < 2; row++)
+          for(col = 0; col < 2; col++)
+            out[rate * 2 + col] += product[row] * EV[2 * row + col];
+      }
+
+      pos = 0;
+      while((pos < 8) && (PLL_ABS(out[pos]) < PLL_MINLIKELIHOOD))
+        pos++;
+
+      if(pos == 8)
+      {
+        for(pos = 0; pos < 8; pos++)
+          out[pos] *= PLL_TWOTOTHE256;
+
+        if(useFastScaling)
+          scaledTotal += wgt[site];
+        else
+          ex3[site] += 1;
+      }
+    }
+  }
+  else
+    assert(0);
+
+  /* report the weighted scaling count only in fast-scaling mode */
+  if(useFastScaling)
+    *scalerIncrement = scaledTotal;
+
+}
+
+/* Generic (non-vectorized) conditional likelihood computation for two-state
+   data where every site selects its own 2x2 P matrices through cptr[].
+   Each site occupies 2 doubles. This definition sits inside an "#if 0"
+   region. */
+static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
+                                  double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
+                                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                  int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *pLeft,
+    *pRight,
+    *leftIn,
+    *rightIn,
+    *out,
+    sumL,
+    sumR,
+    product[2];
+
+  int
+    site,
+    row,
+    col,
+    pos,
+    scaledTotal = 0;
+
+  if(tipCase == PLL_TIP_TIP)
+  {
+    /* both children are tips: inputs come from the tip vector table and
+       no scaling is done */
+    for(site = 0; site < n; site++)
+    {
+      leftIn  = &(tipVector[2 * tipX1[site]]);
+      rightIn = &(tipVector[2 * tipX2[site]]);
+      out     = &x3_start[2 * site];
+
+      pLeft  = &left[cptr[site] * 4];
+      pRight = &right[cptr[site] * 4];
+
+      /* all products are formed before the output is cleared */
+      for(row = 0; row < 2; row++)
+      {
+        sumL = 0.0;
+        sumR = 0.0;
+
+        for(col = 0; col < 2; col++)
+        {
+          sumL += leftIn[col]  * pLeft[row * 2 + col];
+          sumR += rightIn[col] * pRight[row * 2 + col];
+        }
+
+        product[row] = sumL * sumR;
+      }
+
+      for(col = 0; col < 2; col++)
+        out[col] = 0.0;
+
+      for(row = 0; row < 2; row++)
+        for(col = 0; col < 2; col++)
+          out[col] += product[row] * EV[row * 2 + col];
+    }
+  }
+  else if(tipCase == PLL_TIP_INNER)
+  {
+    /* the first child is a tip, the second one is read from x2_start */
+    for(site = 0; site < n; site++)
+    {
+      leftIn  = &(tipVector[2 * tipX1[site]]);
+      rightIn = &x2_start[2 * site];
+      out     = &x3_start[2 * site];
+
+      pLeft  = &left[cptr[site] * 4];
+      pRight = &right[cptr[site] * 4];
+
+      for(row = 0; row < 2; row++)
+      {
+        sumL = 0.0;
+        sumR = 0.0;
+
+        for(col = 0; col < 2; col++)
+        {
+          sumL += leftIn[col]  * pLeft[row * 2 + col];
+          sumR += rightIn[col] * pRight[row * 2 + col];
+        }
+
+        product[row] = sumL * sumR;
+      }
+
+      for(col = 0; col < 2; col++)
+        out[col] = 0.0;
+
+      for(row = 0; row < 2; row++)
+        for(col = 0; col < 2; col++)
+          out[col] += product[row] * EV[2 * row + col];
+
+      /* scale only if both entries lie strictly between
+         PLL_MINUSMINLIKELIHOOD and PLL_MINLIKELIHOOD */
+      pos = 0;
+      while((pos < 2) && (out[pos] < PLL_MINLIKELIHOOD && out[pos] > PLL_MINUSMINLIKELIHOOD))
+        pos++;
+
+      if(pos == 2)
+      {
+        for(pos = 0; pos < 2; pos++)
+          out[pos] *= PLL_TWOTOTHE256;
+
+        if(useFastScaling)
+          scaledTotal += wgt[site];
+        else
+          ex3[site] += 1;
+      }
+    }
+  }
+  else if(tipCase == PLL_INNER_INNER)
+  {
+    /* both children are inner nodes */
+    for(site = 0; site < n; site++)
+    {
+      leftIn  = &x1_start[2 * site];
+      rightIn = &x2_start[2 * site];
+      out     = &x3_start[2 * site];
+
+      pLeft  = &left[cptr[site] * 4];
+      pRight = &right[cptr[site] * 4];
+
+      for(row = 0; row < 2; row++)
+      {
+        sumL = 0.0;
+        sumR = 0.0;
+
+        for(col = 0; col < 2; col++)
+        {
+          sumL += leftIn[col]  * pLeft[row * 2 + col];
+          sumR += rightIn[col] * pRight[row * 2 + col];
+        }
+
+        product[row] = sumL * sumR;
+      }
+
+      for(col = 0; col < 2; col++)
+        out[col] = 0.0;
+
+      for(row = 0; row < 2; row++)
+        for(col = 0; col < 2; col++)
+          out[col] += product[row] * EV[2 * row + col];
+
+      pos = 0;
+      while((pos < 2) && (out[pos] < PLL_MINLIKELIHOOD && out[pos] > PLL_MINUSMINLIKELIHOOD))
+        pos++;
+
+      if(pos == 2)
+      {
+        for(pos = 0; pos < 2; pos++)
+          out[pos] *= PLL_TWOTOTHE256;
+
+        if(useFastScaling)
+          scaledTotal += wgt[site];
+        else
+          ex3[site] += 1;
+      }
+    }
+  }
+  else
+    assert(0);
+
+  /* report the weighted scaling count only in fast-scaling mode */
+  if(useFastScaling)
+    *scalerIncrement = scaledTotal;
+
+}
+#endif    /* end if 0 */
+#endif
+
+#if (defined(__AVX) || defined(__SSE3))
+/* Two-state, single-site kernel shared by all three tip cases of
+   newviewGTRCAT_BINARY: x3 is cleared and then, for each of the two rows,
+   receives (x1 . le[row]) * (x2 . ri[row]) times row `row` of EV.
+   All pointers are accessed with aligned 128-bit loads/stores. */
+static void binaryCatSiteKernel(double *x1, double *x2, double *x3, double *le, double *ri, double *EV)
+{
+  int
+    row;
+
+  _mm_store_pd(x3, _mm_setzero_pd());
+
+  for(row = 0; row < 2; row++)
+  {
+    __m128d
+      sumL = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(&le[row * 2])),
+      sumR = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(&ri[row * 2])),
+      acc,
+      evRow;
+
+    /* horizontal adds leave the dot product in both lanes */
+    sumL = _mm_hadd_pd(sumL, sumL);
+    sumR = _mm_hadd_pd(sumR, sumR);
+
+    sumL = _mm_mul_pd(sumL, sumR);
+
+    acc   = _mm_load_pd(x3);
+    evRow = _mm_load_pd(&EV[2 * row]);
+
+    _mm_store_pd(x3, _mm_add_pd(acc, _mm_mul_pd(sumL, evRow)));
+  }
+}
+
+/* Scaling step for one two-state site: if both entries of x3 are smaller than
+   PLL_MINLIKELIHOOD in absolute value (sign bits masked off with absMask),
+   multiply them by PLL_TWOTOTHE256 and return 1; otherwise leave x3 untouched
+   and return 0. */
+static int binaryCatSiteRescale(double *x3)
+{
+  __m128d
+    magnitude = _mm_and_pd(_mm_load_pd(x3), absMask.m);
+
+  /* movemask yields 3 only when the comparison holds in both lanes */
+  if(_mm_movemask_pd(_mm_cmplt_pd(magnitude, _mm_set1_pd(PLL_MINLIKELIHOOD))) != 3)
+    return 0;
+
+  _mm_store_pd(x3, _mm_mul_pd(_mm_load_pd(x3), _mm_set1_pd(PLL_TWOTOTHE256)));
+
+  return 1;
+}
+
+/* SSE3 conditional likelihood computation for two-state data where every site
+   selects its own 2x2 P matrices through cptr[]. Depending on tipCase the
+   children's vectors are looked up in tipVector (via tipX1/tipX2) or read from
+   x1_start/x2_start; results go to x3_start. Scaling is only attempted when at
+   least one child is an inner node; scaled sites are counted per site in ex3
+   or, with useFastScaling, summed by weight into *scalerIncrement. */
+static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
+                                  double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
+                                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                  int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double
+    *out;
+
+  int
+    site,
+    scaledTotal = 0;
+
+  if(tipCase == PLL_TIP_TIP)
+  {
+    for(site = 0; site < n; site++)
+      binaryCatSiteKernel(&(tipVector[2 * tipX1[site]]),
+                          &(tipVector[2 * tipX2[site]]),
+                          &x3_start[2 * site],
+                          &left[cptr[site] * 4],
+                          &right[cptr[site] * 4],
+                          EV);
+  }
+  else if(tipCase == PLL_TIP_INNER)
+  {
+    for(site = 0; site < n; site++)
+    {
+      out = &x3_start[2 * site];
+
+      binaryCatSiteKernel(&(tipVector[2 * tipX1[site]]),
+                          &x2_start[2 * site],
+                          out,
+                          &left[cptr[site] * 4],
+                          &right[cptr[site] * 4],
+                          EV);
+
+      if(binaryCatSiteRescale(out))
+      {
+        if(useFastScaling)
+          scaledTotal += wgt[site];
+        else
+          ex3[site] += 1;
+      }
+    }
+  }
+  else if(tipCase == PLL_INNER_INNER)
+  {
+    for(site = 0; site < n; site++)
+    {
+      out = &x3_start[2 * site];
+
+      binaryCatSiteKernel(&x1_start[2 * site],
+                          &x2_start[2 * site],
+                          out,
+                          &left[cptr[site] * 4],
+                          &right[cptr[site] * 4],
+                          EV);
+
+      if(binaryCatSiteRescale(out))
+      {
+        if(useFastScaling)
+          scaledTotal += wgt[site];
+        else
+          ex3[site] += 1;
+      }
+    }
+  }
+  else
+    assert(0);
+
+  if(useFastScaling)
+    *scalerIncrement = scaledTotal;
+
+}
+
+/* Fills the two conditional likelihood entries at x3 for one rate category of
+   a two-state (binary) site:
+
+     x3[0..1] = sum over term in {0,1} of (x1 . pLeft_term) * (x2 . pRight_term) * EV_term
+
+   where pLeft_term, pRight_term and EV_term are consecutive pairs of doubles.
+   x3 is zeroed first and then updated through memory after every term.
+   Every pointer is accessed with aligned 128-bit loads/stores. */
+static void newviewGTRGAMMA_BINARY_category(double *x1, double *x2, double *x3,
+                                            double *pLeft, double *pRight, double *EV)
+{
+  int term;
+
+  _mm_store_pd(x3, _mm_setzero_pd());
+
+  for(term = 0; term < 2; term++)
+    {
+      __m128d dotLeft  = _mm_mul_pd(_mm_load_pd(x1), _mm_load_pd(pLeft  + 2 * term));
+      __m128d dotRight = _mm_mul_pd(_mm_load_pd(x2), _mm_load_pd(pRight + 2 * term));
+
+      /* horizontal add leaves the dot product in both lanes */
+      dotLeft  = _mm_hadd_pd(dotLeft,  dotLeft);
+      dotRight = _mm_hadd_pd(dotRight, dotRight);
+
+      __m128d product = _mm_mul_pd(dotLeft, dotRight);
+      __m128d sum     = _mm_load_pd(x3);
+      __m128d evPair  = _mm_load_pd(EV + 2 * term);
+
+      _mm_store_pd(x3, _mm_add_pd(sum, _mm_mul_pd(product, evPair)));
+    }
+}
+
+/* Inspects the 8 entries (4 categories x 2 states) stored at x3. If each of
+   them, after masking with absMask.m (presumably clearing the sign bit -
+   absMask is defined elsewhere in this file), compares below
+   PLL_MINLIKELIHOOD, all 8 entries are multiplied by PLL_TWOTOTHE256 and 1 is
+   returned. Otherwise x3 is left untouched and 0 is returned. */
+static int newviewGTRGAMMA_BINARY_rescale(double *x3)
+{
+  const __m128d threshold = _mm_set1_pd(PLL_MINLIKELIHOOD);
+  const __m128d factor    = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+  int pos;
+
+  for(pos = 0; pos < 8; pos += 2)
+    {
+      __m128d magnitude = _mm_and_pd(_mm_load_pd(x3 + pos), absMask.m);
+
+      /* both lanes must be below the threshold, i.e. mask bits 0 and 1 set */
+      if(_mm_movemask_pd(_mm_cmplt_pd(magnitude, threshold)) != 3)
+        return 0;
+    }
+
+  for(pos = 0; pos < 8; pos += 2)
+    _mm_store_pd(x3 + pos, _mm_mul_pd(_mm_load_pd(x3 + pos), factor));
+
+  return 1;
+}
+
+/* SSE3 newview kernel for two-state data with 4 rate categories.
+
+   For each of the n sites the 8 doubles at x3_start[8 * site] are computed
+   from the two child vectors and the per-category matrices in left/right
+   (4 doubles per category) combined with EV. Depending on tipCase a child
+   vector is either looked up in tipVector through tipX1/tipX2 (tips) or read
+   from x1_start/x2_start (inner nodes).
+
+   For PLL_TIP_INNER and PLL_INNER_INNER a site whose 8 entries all fall below
+   PLL_MINLIKELIHOOD is multiplied by PLL_TWOTOTHE256; the event is recorded by
+   adding wgt[site] to the running total (useFastScaling) or by incrementing
+   ex3[site]. With useFastScaling the total is written to *scalerIncrement.
+   PLL_TIP_TIP performs no scaling. Any other tipCase trips assert(0). */
+static void newviewGTRGAMMA_BINARY(int tipCase,
+                                   double *x1_start, double *x2_start, double *x3_start,
+                                   double *EV, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling
+                                   )
+{
+  int site, cat, scaledWeight = 0;
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      for(site = 0; site < n; site++)
+        {
+          double *tipLeft  = &tipVector[2 * tipX1[site]];
+          double *tipRight = &tipVector[2 * tipX2[site]];
+
+          for(cat = 0; cat < 4; cat++)
+            newviewGTRGAMMA_BINARY_category(tipLeft, tipRight,
+                                            &x3_start[8 * site + 2 * cat],
+                                            &left[4 * cat], &right[4 * cat], EV);
+        }
+      break;
+
+    case PLL_TIP_INNER:
+      for(site = 0; site < n; site++)
+        {
+          double *tipLeft = &tipVector[2 * tipX1[site]];
+
+          for(cat = 0; cat < 4; cat++)
+            newviewGTRGAMMA_BINARY_category(tipLeft,
+                                            &x2_start[8 * site + 2 * cat],
+                                            &x3_start[8 * site + 2 * cat],
+                                            &left[4 * cat], &right[4 * cat], EV);
+
+          if(newviewGTRGAMMA_BINARY_rescale(&x3_start[8 * site]))
+            {
+              if(useFastScaling)
+                scaledWeight += wgt[site];
+              else
+                ex3[site] += 1;
+            }
+        }
+      break;
+
+    case PLL_INNER_INNER:
+      for(site = 0; site < n; site++)
+        {
+          for(cat = 0; cat < 4; cat++)
+            newviewGTRGAMMA_BINARY_category(&x1_start[8 * site + 2 * cat],
+                                            &x2_start[8 * site + 2 * cat],
+                                            &x3_start[8 * site + 2 * cat],
+                                            &left[4 * cat], &right[4 * cat], EV);
+
+          if(newviewGTRGAMMA_BINARY_rescale(&x3_start[8 * site]))
+            {
+              if(useFastScaling)
+                scaledWeight += wgt[site];
+              else
+                ex3[site] += 1;
+            }
+        }
+      break;
+
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = scaledWeight;
+
+}
+
+
+#endif
+
+
+
+
+/* The function below computes partial traversals only down to the point/node in the tree where the 
+   conditional likelihood vector summarizing a subtree is already oriented in the correct direction */
+
+
+/** @brief Compute a partial or full traversal descriptor for a subtree of the topology
+
+   If \a partialTraversal is \b PLL_TRUE, compute a partial traversal descriptor that only descends
+   to the point/node in the tree where the conditional likelihood vector representing a subtree is
+   already oriented in the correct direction; otherwise every inner node of the subtree is visited.
+   The elements of the traversal descriptor are appended to \a ti with children stored before their
+   parent, and \a counter keeps track of the number of elements.
+
+   @param p
+     Root of the  subtree for which we want to compute the traversal descriptor. The two descendents are \a p->next->back and \a p->next->next->back
+
+   @param ti
+     Traversal descriptor element structure
+
+   @param counter
+     Number of elements in the traversal descriptor. Updated when an element is added
+
+   @param maxTips
+     Number of tips in the tree structure
+
+   @param numBranches
+     Number of branches
+   
+   @param partialTraversal
+     If \b PLL_TRUE, a partial traversal descriptor is computed, otherwise a full
+
+   @param rvec
+     Bookkeeping structure for ancestral state recomputation. It is always handed to
+     \a needsRecomp; when \a useRecom is set it is additionally used to look up subtree
+     lengths (\a stlen), to obtain vector slots (\a getxVector) and to pin/unpin nodes
+     (\a protectNode / \a unpinNode)
+
+   @param useRecom
+     If \b PLL_TRUE, then ancestral state recomputation is enabled.
+   
+   @todo Fill in the ancestral recomputation parameter information 
+ */
+static void computeTraversalInfo(nodeptr p, traversalInfo *ti, int *counter, int maxTips, int numBranches, pllBoolean partialTraversal, recompVectors *rvec, pllBoolean useRecom)
+{
+  /* if it's a tip we don't do anything */
+
+  if(isTip(p->number, maxTips))
+    return;
+
+  {
+    int 
+      i;
+
+    /* recom default values */
+    int slot = -1,
+        unpin1 = -1, 
+        unpin2 = -1;
+    /* get the left and right descendants */
+
+    nodeptr 
+      q = p->next->back,
+        r = p->next->next->back;   
+
+    /* if the left and right children are tips there is not that much to do */
+    if(isTip(r->number, maxTips) && isTip(q->number, maxTips))
+    {
+      /* fix the orientation of p->x */
+
+      if (! p->x)
+        getxnode(p);    
+      
+      assert(p->x);
+
+      /* add the current node triplet p,q,r to the traversal descriptor */
+      ti[*counter].tipCase = PLL_TIP_TIP;
+      ti[*counter].pNumber = p->number;
+      ti[*counter].qNumber = q->number;
+      ti[*counter].rNumber = r->number;
+
+
+      /* copy branches to traversal descriptor */
+      for(i = 0; i < numBranches; i++)
+      {     
+        ti[*counter].qz[i] = q->z[i];
+        ti[*counter].rz[i] = r->z[i];
+      }
+
+      /* recom - add the slot to the traversal descriptor; tips have no slot (-1) */
+      if(useRecom)
+      {
+        getxVector(rvec, p->number, &slot, maxTips);
+        ti[*counter].slot_p = slot;
+        ti[*counter].slot_q = -1;
+        ti[*counter].slot_r = -1;
+      }
+
+      /* increment length counter */
+
+      *counter = *counter + 1;
+    }
+    else
+    {
+      /* if either r or q are tips, flip them to make sure that the tip data is stored 
+         for q */
+      if(isTip(r->number, maxTips) || isTip(q->number, maxTips))
+      {     
+        if(isTip(r->number, maxTips))
+        {
+          nodeptr 
+            tmp = r;
+          r = q;
+          q = tmp;
+        }
+
+
+        /* if the orientation of the likelihood vector at r is not correct we need to re-compute it 
+           and descend into its subtree to figure out if there are more vectors in there to re-compute and 
+           re-orient */
+
+        if(needsRecomp(useRecom, rvec, r, maxTips) || !partialTraversal) 
+          computeTraversalInfo(r, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+        else
+          {
+            if(useRecom)
+              /* the node is available,  now make sure it will not be unpinned until it is read */
+              protectNode(rvec, r->number, maxTips);
+          }
+        /* Now that r is oriented, we can safely set the orientation of p */
+        if(! p->x)
+          getxnode(p);   
+
+        /* make sure that everything is consistent now */
+
+        assert(p->x && r->x);
+
+        /* store data for p, q, r in the traversal descriptor */
+
+        ti[*counter].tipCase = PLL_TIP_INNER;
+        ti[*counter].pNumber = p->number;
+        ti[*counter].qNumber = q->number;
+        ti[*counter].rNumber = r->number;
+
+        for(i = 0; i < numBranches; i++)
+        {       
+          ti[*counter].qz[i] = q->z[i];
+          ti[*counter].rz[i] = r->z[i];
+        }
+
+        if(useRecom)
+        {
+          getxVector(rvec, r->number, &slot, maxTips);
+          ti[*counter].slot_r = slot;
+
+          getxVector(rvec, p->number, &slot, maxTips);
+          ti[*counter].slot_p = slot;
+
+          ti[*counter].slot_q = -1;
+
+          unpin2 = r->number; /* when PLL_TIP_INNER finishes, the INNER input vector r can be unpinned*/
+        }
+
+        *counter = *counter + 1;
+      }
+      else
+      {
+        /* same as above, only now q and r are inner nodes. Hence if they are not 
+           oriented correctly they will need to be recomputed and we need to descend into the 
+           respective subtrees to check if everything is consistent in there, potentially expanding 
+           the traversal descriptor */
+        if(( useRecom && (!partialTraversal) ) || 
+            ( useRecom && needsRecomp(useRecom, rvec, q, maxTips) && needsRecomp(useRecom, rvec, r, maxTips) ))
+        {
+          /* PLL_INNER_INNER and recomputation implies that the order we descend q and r matters, 
+           * if we are in a partial traversal, this is only relevant if both require recomputation
+           * see TODOFER add ref. */
+
+          /* stlen is indexed by inner node number shifted by maxTips + 1; each child
+             must be looked up with its OWN number so the comparison below is meaningful */
+          int q_stlen = rvec->stlen[q->number - maxTips - 1],
+              r_stlen = rvec->stlen[r->number - maxTips - 1];
+          assert(q_stlen >= 2 && q_stlen <= maxTips - 1);
+          assert(r_stlen >= 2 && r_stlen <= maxTips - 1);
+
+          /* descend first into the child with the larger stlen value */
+          if(q_stlen > r_stlen)
+          {
+            computeTraversalInfo(q, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+            computeTraversalInfo(r, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+          }
+          else
+          {
+            computeTraversalInfo(r, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+            computeTraversalInfo(q, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+          }
+        }
+        else
+        {
+          /* Now the order does not matter */
+          /* If we are in a recomputation and partial, only either q or r will be descended */
+
+          if(!partialTraversal || needsRecomp(useRecom, rvec, q, maxTips))
+            computeTraversalInfo(q, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+          else
+          {
+            if(useRecom)
+              /* the node is available,  now make sure it will not be unpinned until it is read */
+              protectNode(rvec, q->number, maxTips);
+          }
+
+          if(!partialTraversal || needsRecomp(useRecom, rvec, r, maxTips))
+            computeTraversalInfo(r, ti, counter, maxTips, numBranches, partialTraversal, rvec, useRecom);
+          else
+          {
+            if(useRecom)
+              /* same protection as for q above */
+              protectNode(rvec, r->number, maxTips);
+          }
+        }
+
+
+        if(! p->x)
+          getxnode(p);
+
+        /* check that the vector orientations are consistent now */
+
+        assert(p->x && r->x && q->x);
+
+        ti[*counter].tipCase = PLL_INNER_INNER;
+        ti[*counter].pNumber = p->number;
+        ti[*counter].qNumber = q->number;
+        ti[*counter].rNumber = r->number;
+
+        if(useRecom)
+        {
+          /* We check that the strategy cannot re-use slots */
+          getxVector(rvec, q->number, &slot, maxTips);
+          ti[*counter].slot_q = slot;
+
+          getxVector(rvec, r->number, &slot, maxTips);
+          ti[*counter].slot_r = slot;
+          assert(slot != ti[*counter].slot_q);
+
+          getxVector(rvec, p->number, &slot, maxTips);
+          ti[*counter].slot_p = slot;
+          assert(slot != ti[*counter].slot_q);
+          assert(slot != ti[*counter].slot_r);
+
+          /* And at this point both INNER inputs can be marked as unpinned */
+          unpin2 = r->number;
+          unpin1 = q->number;
+        }
+
+        for(i = 0; i < numBranches; i++)
+        {       
+          ti[*counter].qz[i] = q->z[i];
+          ti[*counter].rz[i] = r->z[i];
+        }
+
+        *counter = *counter + 1;
+      }
+    }
+    if(useRecom)
+    {
+      /* Mark the nodes as unpinnable(will be unpinned while executing the replacement strategy only if required).
+         unpin1/unpin2 keep their default of -1 when the corresponding child is a tip. */
+      unpinNode(rvec, unpin1, maxTips);
+      unpinNode(rvec, unpin2, maxTips);
+    }
+  }
+}
+
+/* below are the optimized, unrolled, and vectorized versions of the above generic functions 
+   for computing the conditional likelihood at p given child nodes q and r. The actual implementation is located at the end/bottom of this 
+   file.
+   */
+/* now this is the function that just iterates over the length of the traversal descriptor and 
+   just computes the conditional likelihood arrays in the order given by the descriptor.
+   So in a sense, this function has no clue that there is any tree-like structure 
+   in the traversal descriptor, it just operates on an array of structs of given length */ 
+
+
+/** @brief Compute the conditional likelihood for each entry (node) of the traversal descriptor
+
+    Computes the conditional likelihood vectors for each entry (node) in the already computed
+    traversal descriptor, starting from the \a startIndex entry.
+     
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param startIndex
+      From which node to start computing the conditional likelihood vectors in the traversal
+      descriptor
+     
+    @note This function just iterates over the length of the traversal descriptor and 
+      computes the conditional likelihood arrays in the order given by the descriptor.
+      So in a sense, this function has no clue that there is any tree-like structure 
+      in the traversal descriptor, it just operates on an array of structs of given length.
+ */
+void pllNewviewIterative (pllInstance *tr, partitionList *pr, int startIndex)
+{
+  traversalInfo 
+    *ti   = tr->td[0].ti;
+
+  int 
+    i, 
+    model;
+
+  int 
+    p_slot = -1, 
+    q_slot = -1, 
+    r_slot = -1;
+
+#ifdef _DEBUG_RECOMPUTATION
+  /* recom */
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+#else
+  countTraversal(tr);
+#endif
+  /* E recom */
+#endif
+
+  /* loop over traversal descriptor length. Note that on average we only re-compute the conditionals on 3 -4 
+     nodes in RAxML */
+
+  for(i = startIndex; i < tr->td[0].count; i++)
+  {
+
+    traversalInfo 
+      *tInfo = &ti[i];
+    
+    /* Note that the slots refer to different things if recomputation is applied */
+    if(tr->useRecom)
+      {
+        /* a slot has been assigned while computing the traversal descriptor  */
+        p_slot = tInfo->slot_p;
+        q_slot = tInfo->slot_q;
+        r_slot = tInfo->slot_r;
+      }
+    else
+      {
+        /* a fixed slot is always given for each inner node, we only need an offset to get the right index */
+        p_slot = tInfo->pNumber - tr->mxtips - 1;
+        q_slot = tInfo->qNumber - tr->mxtips - 1;
+        r_slot = tInfo->rNumber - tr->mxtips - 1;
+      }
+
+    /* now loop over all partitions for nodes p, q, and r of the current traversal vector entry */
+
+    for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      /* number of sites in this partition */
+      size_t            
+        width  = (size_t)pr->partitionData[model]->width;
+
+      /* this conditional statement is exactly identical to what we do in pllEvaluateIterative */
+
+      if(tr->td[0].executeModel[model] && width > 0)
+      {       
+        double
+          *x1_start = (double*)NULL,
+          *x2_start = (double*)NULL,
+          *x3_start = pr->partitionData[model]->xVector[p_slot],
+          *left     = (double*)NULL,
+          *right    = (double*)NULL,            
+#if (defined(__SSE3) || defined(__AVX))
+          *x1_gapColumn = (double*)NULL,
+          *x2_gapColumn = (double*)NULL,
+          *x3_gapColumn = (double*)NULL,
+#endif
+          *rateCategories = (double*)NULL,
+          *x1_ascColumn = NULL,
+          *x2_ascColumn = NULL,
+          *x3_ascColumn = NULL;
+
+        int
+          categories,
+          scalerIncrement = 0,
+
+          /* integer wieght vector with pattern compression weights */
+
+          *wgt = pr->partitionData[model]->wgt;
+
+        /* pointers for per-site scaling array at node p */
+        
+        int      
+          *ex3     = NULL,
+          *ex3_asc = NULL;
+
+        /* select fastScaling or per-site scaling of conidtional likelihood entries */
+
+        pllBoolean
+          fastScaling = tr->fastScaling;
+
+#if (defined(__SSE3) || defined(__AVX))
+        unsigned int
+          *x1_gap = (unsigned int*)NULL,
+          *x2_gap = (unsigned int*)NULL,
+          *x3_gap = (unsigned int*)NULL;
+#endif
+
+        unsigned char
+          *tipX1 = (unsigned char *)NULL,
+          *tipX2 = (unsigned char *)NULL;
+
+        double 
+          qz, 
+          rz;        
+
+        size_t
+#if (defined(__SSE3) || defined(__AVX))
+          gapOffset = 0,
+#endif
+          rateHet = discreteRateCategories(tr->rateHetModel),
+          ascWidth = (size_t)pr->partitionData[model]->states,
+
+          /* get the number of states in the data stored in partition model */
+          
+          states = (size_t)pr->partitionData[model]->states,
+          
+          /* get the length of the current likelihood array stored at node p. This is 
+             important mainly for the SEV-based memory saving option described in here:
+             
+             F. Izquierdo-Carrasco, S.A. Smith, A. Stamatakis: "Algorithms, Data Structures, and Numerics for Likelihood-based Phylogenetic Inference of Huge Trees".
+             
+             So pr->partitionData[model]->xSpaceVector[i] provides the length of the allocated conditional array of partition model
+             and node i 
+          */
+          
+          availableLength = pr->partitionData[model]->xSpaceVector[p_slot],
+          requiredLength = 0;        
+        
+        /* figure out what kind of rate heterogeneity approach we are using */
+
+        if(tr->rateHetModel == PLL_CAT)
+          {              
+            rateCategories = pr->partitionData[model]->perSiteRates;
+            categories = pr->partitionData[model]->numberOfCategories;
+          }
+        else
+          {                              
+            rateCategories = pr->partitionData[model]->gammaRates;
+            categories = 4;
+          }
+
+        /* memory saving stuff, not important right now, but if you are interested ask Fernando */
+
+#if (defined(__SSE3) || defined(__AVX))
+        if(tr->saveMemory)
+          {
+            size_t
+              j,
+              setBits = 0;                
+            
+            gapOffset = states * (size_t)getUndetermined(pr->partitionData[model]->dataType);
+            
+            x1_gap = &(pr->partitionData[model]->gapVector[tInfo->qNumber * pr->partitionData[model]->gapVectorLength]);
+            x2_gap = &(pr->partitionData[model]->gapVector[tInfo->rNumber * pr->partitionData[model]->gapVectorLength]);
+            x3_gap = &(pr->partitionData[model]->gapVector[tInfo->pNumber * pr->partitionData[model]->gapVectorLength]);
+            
+            for(j = 0; j < (size_t)pr->partitionData[model]->gapVectorLength; j++)
+              {              
+                x3_gap[j] = x1_gap[j] & x2_gap[j];
+                setBits += (size_t)(bitcount_32_bit(x3_gap[j])); 
+              }
+            
+            requiredLength = (width - setBits)  * rateHet * states * sizeof(double);            
+          }
+        else
+#endif
+          {
+            /* if we are not trying to save memory the space required to store an inner likelihood array 
+               is the number of sites in the partition times the number of states of the data type in the partition 
+               times the number of discrete GAMMA rates (1 for CAT essentially) times 8 bytes */
+            requiredLength  =  virtual_width( width ) * rateHet * states * sizeof(double);
+            
+            //                   printf( "req: %d %d %d %d\n", requiredLength, width, virtual_width(width), model );
+          }
+        
+        /* Initially, even when not using memory saving no space is allocated for inner likelihood arrats hence 
+           availableLength will be zero at the very first time we traverse the tree.
+           Hence we need to allocate something here */
+
+        if(requiredLength != availableLength)
+          {               
+            /* if there is a vector of incorrect length assigned here i.e., x3 != NULL we must free 
+               it first */
+            if(x3_start)
+              rax_free(x3_start);
+            
+            /* allocate memory: note that here we use a byte-boundary aligned malloc, because we need the vectors
+               to be aligned at 16 BYTE (SSE3) or 32 BYTE (AVX) boundaries! */
+            
+            rax_posix_memalign ((void **)&x3_start, PLL_BYTE_ALIGNMENT, requiredLength);              
+            
+            /* update the data structures for consistent bookkeeping */
+            pr->partitionData[model]->xVector[p_slot]      = x3_start;
+            pr->partitionData[model]->xSpaceVector[p_slot] = requiredLength;
+          }
+        
+
+        /* 
+           if we are not using fast scaling, we need to assign memory for storing 
+           integer vectors at each inner node that are as long as the sites of the 
+           partition. IMPORTANT: while this looks as if this might be a memory saving trick 
+           it is not. The ex3 vectors will be allocated once during the very first tree 
+           traversal and then never again because they will always have the required length!
+        */
+
+        if(!fastScaling)
+          {
+            size_t
+              availableExpLength = pr->partitionData[model]->expSpaceVector[p_slot],
+              requiredExpLength  = width * sizeof(int);
+            
+            ex3 = pr->partitionData[model]->expVector[p_slot];
+            
+            if(requiredExpLength != availableExpLength)
+              {
+                if(ex3)
+                  rax_free(ex3);
+                
+                rax_posix_memalign ((void **)&ex3, PLL_BYTE_ALIGNMENT, requiredExpLength);               
+                
+                pr->partitionData[model]->expVector[p_slot] = ex3;
+                
+                pr->partitionData[model]->expSpaceVector[p_slot] = requiredExpLength;
+              }
+          }
+
+        /* now just set the pointers for data accesses in the newview() implementations above to the corresponding values 
+           according to the tip case */
+        
+        switch(tInfo->tipCase)
+          {
+          case PLL_TIP_TIP:           
+            tipX1    = pr->partitionData[model]->yVector[tInfo->qNumber];
+            tipX2    = pr->partitionData[model]->yVector[tInfo->rNumber];
+
+#if (defined(__SSE3) || defined(__AVX))
+            if(tr->saveMemory)
+              {
+                x1_gapColumn   = &(pr->partitionData[model]->tipVector[gapOffset]);
+                x2_gapColumn   = &(pr->partitionData[model]->tipVector[gapOffset]);
+                x3_gapColumn   = &(pr->partitionData[model]->gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet]);
+              }
+#endif            
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+            if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+            if(pr->partitionData[model]->ascBias)
+#endif
+             {
+              size_t
+                k;
+              
+              x3_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+              ex3_asc      = &pr->partitionData[model]->ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth];
+
+              for(k = 0; k < ascWidth; k++)
+                ex3_asc[k] = 0;               
+             }
+            /* if we do per-site log likelihood scaling, and both child nodes are tips,
+               just initialize the vector with zeros, i.e., no scaling events */
+
+            if(!fastScaling)
+              {
+                size_t
+                  k;                                 
+
+                for(k = 0; k < width; k++)
+                  ex3[k] = 0;
+              }
+            break;
+          case PLL_TIP_INNER:                
+            tipX1    =  pr->partitionData[model]->yVector[tInfo->qNumber];
+            x2_start = pr->partitionData[model]->xVector[r_slot];
+            assert(r_slot != p_slot);
+            
+#if (defined(__SSE3) || defined(__AVX))
+            if(tr->saveMemory)
+              { 
+                x1_gapColumn   = &(pr->partitionData[model]->tipVector[gapOffset]);
+                x2_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->rNumber - tr->mxtips - 1) * states * rateHet];
+                x3_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet];
+              }
+#endif
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+            if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+              if(pr->partitionData[model]->ascBias)
+#endif      
+              {   
+                size_t
+                  k;
+
+                int 
+                  *ex2_asc;
+                
+                x2_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->rNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                x3_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                
+                ex2_asc = &pr->partitionData[model]->ascExpVector[(tInfo->rNumber - tr->mxtips - 1) * ascWidth];
+                ex3_asc = &pr->partitionData[model]->ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth];
+
+                for(k = 0; k < ascWidth; k++)
+                  ex3_asc[k] = ex2_asc[k];
+              }
+            
+            /* if one child node is not a tip, just copy the values from there, coudl also be done with memcpy of course 
+               the elements of ex3[] will then potentially be further incremented in the actual newview() if scaling events 
+               take place */
+
+            if(!fastScaling)
+              {
+                size_t 
+                  k;
+                int
+                  *ex2 = pr->partitionData[model]->expVector[r_slot];                
+                      
+                for(k = 0; k < width; k++)
+                  ex3[k] = ex2[k];
+              }
+            break;
+          case PLL_INNER_INNER:                              
+            x1_start       = pr->partitionData[model]->xVector[q_slot];
+            x2_start       = pr->partitionData[model]->xVector[r_slot];
+            assert(r_slot != p_slot);
+            assert(q_slot != p_slot);
+            assert(q_slot != r_slot);
+            
+#if (defined(__SSE3) || defined(__AVX))
+            if(tr->saveMemory)
+              {
+                x1_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->qNumber - tr->mxtips - 1) * states * rateHet];
+                x2_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->rNumber - tr->mxtips - 1) * states * rateHet];
+                x3_gapColumn   = &pr->partitionData[model]->gapColumn[(tInfo->pNumber - tr->mxtips - 1) * states * rateHet];
+              }
+#endif
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+              if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+              if(pr->partitionData[model]->ascBias)
+#endif          
+               {                
+                 size_t
+                   k;
+
+                 int 
+                   *ex1_asc,
+                   *ex2_asc;
+                 
+                 x1_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->qNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                 x2_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->rNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                 x3_ascColumn = &pr->partitionData[model]->ascVector[(tInfo->pNumber - tr->mxtips - 1) * pr->partitionData[model]->ascOffset];
+                 
+                 ex1_asc = &pr->partitionData[model]->ascExpVector[(tInfo->qNumber - tr->mxtips - 1) * ascWidth];
+                 ex2_asc = &pr->partitionData[model]->ascExpVector[(tInfo->rNumber - tr->mxtips - 1) * ascWidth];
+                 ex3_asc = &pr->partitionData[model]->ascExpVector[(tInfo->pNumber - tr->mxtips - 1) * ascWidth];
+
+                 for(k = 0; k < ascWidth; k++)
+                   ex3_asc[k] = ex1_asc[k] + ex2_asc[k];
+               }
+            /* both child nodes are inner nodes, thus the initial value of the scaling vector 
+               ex3 is the sum of the scaling values of the left and right child node */
+
+            if(!fastScaling)
+              {
+                size_t
+                  k;
+                      
+                int            
+                  *ex1      = pr->partitionData[model]->expVector[q_slot],
+                  *ex2      = pr->partitionData[model]->expVector[r_slot];                    
+                      
+                  for(k = 0; k < width; k++)
+                    ex3[k] = ex1[k] + ex2[k];
+              }
+            break;
+          default:
+            assert(0);
+          }
+
+        /* set the pointers to the left and right P matrices to the pre-allocated memory space for storing them */
+
+        left  = pr->partitionData[model]->left;
+        right = pr->partitionData[model]->right;
+
+        /* if we use per-partition branch length optimization 
+           get the branch length of partition model and take the log otherwise 
+           use the joint branch length among all partitions that is always stored 
+           at index [0] */
+
+        if(pr->perGeneBranchLengths)
+        {
+          qz = tInfo->qz[model];                                    
+          rz = tInfo->rz[model];                  
+        }
+        else
+        {
+          qz = tInfo->qz[0];
+          rz = tInfo->rz[0];
+        }
+
+        qz = (qz > PLL_ZMIN) ? log(qz) : log(PLL_ZMIN);                        
+        rz = (rz > PLL_ZMIN) ? log(rz) : log(PLL_ZMIN);                       
+
+        /* compute the left and right P matrices */
+
+        if(pr->partitionData[model]->dataType == PLL_AA_DATA &&
+        		(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X))
+                makeP_FlexLG4(qz, rz, pr->partitionData[model]->gammaRates,
+                              pr->partitionData[model]->EI_LG4,
+                              pr->partitionData[model]->EIGN_LG4,
+                              4, left, right, 20);
+        else
+        makeP(qz, rz, rateCategories,   pr->partitionData[model]->EI,
+              pr->partitionData[model]->EIGN, categories,
+              left, right, tr->saveMemory, tr->maxCategories, states);
+
+
+#if (!defined(__SSE3) && !defined(__AVX) && !defined(__MIC_NATIVE))
+        assert(!tr->saveMemory);
+
+        /* figure out if we need to compute the CAT or GAMMA model of rate heterogeneity */
+
+        if(tr->rateHetModel == PLL_CAT)
+         {
+
+           newviewCAT_FLEX(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                           x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                           ex3, tipX1, tipX2,
+                           width, left, right, wgt, &scalerIncrement, fastScaling, states);
+         }
+        else 
+         {
+            newviewGAMMA_FLEX(tInfo->tipCase,
+                 x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                 0, tipX1, tipX2,
+                 width, left, right, wgt, &scalerIncrement, fastScaling, states, getUndetermined(pr->partitionData[model]->dataType) + 1);
+         }
+#else
+        /* dedicated highly optimized functions. Analogously to the functions in evaluateGeneric() 
+           we also switch over the state number */
+
+        switch(states)
+        {               
+        case 2:
+          assert (!tr->saveMemory);
+          if (tr->rateHetModel == PLL_CAT)
+           {
+             newviewGTRCAT_BINARY(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                  x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                  ex3, tipX1, tipX2,
+                                  width, left, right, wgt, &scalerIncrement, fastScaling);
+           }
+          else
+           {
+             newviewGTRGAMMA_BINARY(tInfo->tipCase,
+                                    x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                    ex3, tipX1, tipX2,
+                                    width, left, right, wgt, &scalerIncrement, fastScaling);                  
+           }
+          break;
+
+        case 4: /* DNA */
+#ifdef __MIC_NATIVE
+
+              /* CAT & memory saving are not supported on MIC */
+
+              assert(!tr->saveMemory);
+              assert(tr->rateHetModel == PLL_GAMMA);
+
+              newviewGTRGAMMA_MIC(tInfo->tipCase,
+                                x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                ex3, tipX1, tipX2,
+                                width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+          if(tr->rateHetModel == PLL_CAT)
+            {                                
+              
+              if(tr->saveMemory)
+#ifdef __AVX
+                newviewGTRCAT_AVX_GAPPED_SAVE(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                              x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                              ex3, tipX1, tipX2,
+                                              width, left, right, wgt, &scalerIncrement, fastScaling, x1_gap, x2_gap, x3_gap,
+                                              x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
+#else
+                newviewGTRCAT_SAVE(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                   x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                   ex3, tipX1, tipX2,
+                                   width, left, right, wgt, &scalerIncrement, fastScaling, x1_gap, x2_gap, x3_gap,
+                                   x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
+#endif
+              else
+#ifdef __AVX
+                newviewGTRCAT_AVX(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                  x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                  ex3, tipX1, tipX2,
+                                  width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+              newviewGTRCAT(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                            x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                            ex3, tipX1, tipX2,
+                            width, left, right, wgt, &scalerIncrement, fastScaling);
+#endif
+            }
+          else
+            {
+              
+              if(tr->saveMemory)
+#ifdef __AVX
+                newviewGTRGAMMA_AVX_GAPPED_SAVE(tInfo->tipCase,
+                                                x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                                ex3, tipX1, tipX2,
+                                                width, left, right, wgt, &scalerIncrement, fastScaling,
+                                                x1_gap, x2_gap, x3_gap, 
+                                                x1_gapColumn, x2_gapColumn, x3_gapColumn);
+
+#else
+              newviewGTRGAMMA_GAPPED_SAVE(tInfo->tipCase,
+                                          x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                          ex3, tipX1, tipX2,
+                                          width, left, right, wgt, &scalerIncrement, fastScaling,
+                                          x1_gap, x2_gap, x3_gap, 
+                                          x1_gapColumn, x2_gapColumn, x3_gapColumn);
+#endif
+              else
+#ifdef __AVX
+                newviewGTRGAMMA_AVX(tInfo->tipCase,
+                                    x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                    ex3, tipX1, tipX2,
+                                    width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+              newviewGTRGAMMA(tInfo->tipCase,
+                              x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                              ex3,tipX1, tipX2,
+                              width, left, right, wgt, &scalerIncrement, fastScaling);
+#endif
+            }
+#endif
+
+            break;                  
+          case 20: /* proteins */
+
+#ifdef __MIC_NATIVE
+
+                        /* CAT & memory saving are not supported on MIC */
+
+                        assert(!tr->saveMemory);
+                        assert(tr->rateHetModel == PLL_GAMMA);
+
+                        if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                        {
+                                  newviewGTRGAMMAPROT_LG4_MIC(tInfo->tipCase,
+                            x1_start, x2_start, x3_start, pr->partitionData[model]->EV_LG4, pr->partitionData[model]->tipVector_LG4,
+                            tipX1, tipX2,
+                            width, left, right, wgt, &scalerIncrement);
+                        }
+                        else
+                        {
+                                  newviewGTRGAMMAPROT_MIC(tInfo->tipCase,
+                                                x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                                ex3, tipX1, tipX2,
+                                                width, left, right, wgt, &scalerIncrement, fastScaling);
+                        }
+#else
+
+            if(tr->rateHetModel == PLL_CAT)
+            {
+
+
+              if(tr->saveMemory)
+#ifdef __AVX
+                newviewGTRCATPROT_AVX_GAPPED_SAVE(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                                  x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                                  ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, fastScaling, 
+                                                  x1_gap, x2_gap, x3_gap,
+                                                  x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
+#else
+              newviewGTRCATPROT_SAVE(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                     x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                     ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, fastScaling, x1_gap, x2_gap, x3_gap,
+                                     x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
+#endif
+              else
+#ifdef __AVX
+                newviewGTRCATPROT_AVX(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                      x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                      ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+              newviewGTRCATPROT(tInfo->tipCase,  pr->partitionData[model]->EV, pr->partitionData[model]->rateCategory,
+                                x1_start, x2_start, x3_start, pr->partitionData[model]->tipVector,
+                                ex3, tipX1, tipX2, width, left, right, wgt, &scalerIncrement, fastScaling);                     
+#endif
+            }
+            else
+            {
+
+              
+
+              if(tr->saveMemory)
+#ifdef __AVX
+                newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(tInfo->tipCase,
+                                                    x1_start, x2_start, x3_start,
+                                                    pr->partitionData[model]->EV,
+                                                    pr->partitionData[model]->tipVector,
+                                                    ex3, tipX1, tipX2,
+                                                    width, left, right, wgt, &scalerIncrement, fastScaling,
+                                                    x1_gap, x2_gap, x3_gap,
+                                                    x1_gapColumn, x2_gapColumn, x3_gapColumn);
+#else
+                newviewGTRGAMMAPROT_GAPPED_SAVE(tInfo->tipCase,
+                                                x1_start, x2_start, x3_start,
+                                                pr->partitionData[model]->EV,
+                                                pr->partitionData[model]->tipVector,
+                                                ex3, tipX1, tipX2,
+                                                width, left, right, wgt, &scalerIncrement, fastScaling,
+                                                x1_gap, x2_gap, x3_gap,
+                                                x1_gapColumn, x2_gapColumn, x3_gapColumn);
+#endif
+            
+             else
+                        {
+                          if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+                            {
+#ifdef __AVX 
+                              newviewGTRGAMMAPROT_AVX_LG4(tInfo->tipCase,
+                                                          x1_start, x2_start, x3_start,
+                                                          pr->partitionData[model]->EV_LG4,
+                                                          pr->partitionData[model]->tipVector_LG4,
+                                                          (int*)NULL, tipX1, tipX2,
+                                                          width, left, right, wgt, &scalerIncrement, PLL_TRUE);
+#else
+                              newviewGTRGAMMAPROT_LG4(tInfo->tipCase,
+                                                      x1_start, x2_start, x3_start,
+                                                      pr->partitionData[model]->EV_LG4,
+                                                      pr->partitionData[model]->tipVector_LG4,
+                                                      (int*)NULL, tipX1, tipX2,
+                                                      width, left, right, 
+                                                      wgt, &scalerIncrement, PLL_TRUE);
+#endif                      
+                            }
+              else
+#ifdef __AVX
+                newviewGTRGAMMAPROT_AVX(tInfo->tipCase,
+                                        x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                        ex3, tipX1, tipX2,
+                                        width, left, right, wgt, &scalerIncrement, fastScaling);
+#else
+              newviewGTRGAMMAPROT(tInfo->tipCase,
+                                  x1_start, x2_start, x3_start, pr->partitionData[model]->EV, pr->partitionData[model]->tipVector,
+                                  ex3, tipX1, tipX2,
+                                  width, left, right, wgt, &scalerIncrement, fastScaling);
+#endif                 
+            }   
+        }
+#endif
+            
+            break;      
+          default:
+            assert(0);
+        }
+#endif
+
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+       if(pr->partitionData[model]->ascBias && tr->threadID == 0)
+#else
+       if(pr->partitionData[model]->ascBias)
+#endif         
+         {
+           switch(tr->rateHetModel)
+             {
+             case PLL_CAT:
+               {
+                 double 
+                   rates = 1.0;
+                 
+                 //need to re-calculate transition probabilities assuming a rate of 1.0 
+                 makeP(qz, rz, 
+                       &rates,  
+                       pr->partitionData[model]->EI,
+                       pr->partitionData[model]->EIGN,
+                       1, 
+                       left, right, 
+                       tr->saveMemory,
+                       tr->maxCategories,
+                       states);
+                 
+                 newviewAscCat(tInfo->tipCase,
+                               x1_ascColumn, x2_ascColumn, x3_ascColumn,
+                               pr->partitionData[model]->EV,
+                               pr->partitionData[model]->tipVector,
+                               ex3_asc,
+                               states, left, right, states);
+               }
+               break;
+             case PLL_GAMMA:
+               newviewAscGamma(tInfo->tipCase,
+                               x1_ascColumn, x2_ascColumn, x3_ascColumn,
+                               pr->partitionData[model]->EV,
+                               pr->partitionData[model]->tipVector,
+                               ex3_asc,
+                               states, left, right, states);                        
+               break;
+             default:
+               assert(0);
+             }
+         }
+
+
+        /* important step, here we essentially recursively compute the number of scaling multiplications
+           at node p: it's the sum of the number of scaling multiplications already conducted 
+           for computing nodes q and r plus the scaling multiplications done at node p */
+
+        if(fastScaling)
+          {
+            pr->partitionData[model]->globalScaler[tInfo->pNumber] =
+              pr->partitionData[model]->globalScaler[tInfo->qNumber] +
+              pr->partitionData[model]->globalScaler[tInfo->rNumber] +
+              (unsigned int)scalerIncrement;
+            
+            /* check that we are not getting an integer overflow ! */
+
+            assert(pr->partitionData[model]->globalScaler[tInfo->pNumber] < INT_MAX);
+          }
+        
+        /* show the output vector */
+      } 
+    }
+  }
+}
+
+/** @brief Compute the traversal descriptor of the subtree rooted at \a p.
+
+    Fills the traversal descriptor \a tr->td[0] for the subtree obtained by
+    rooting the unrooted topology at node \a p. The walk itself is delegated
+    to computeTraversalInfo(); this function only selects the preparatory step.
+    If \a partialTraversal is set to \b PLL_TRUE then subtrees which are oriented
+    correctly (i.e. if root node \a r of a subtree has \a r->x == 1) are not
+    included in the traversal descriptor.
+
+    When \a tr->useRecom is set, the subtree-length helper matching the kind
+    of traversal is run before the descriptor is computed.
+
+    @note \a tr->td[0].count is handed to computeTraversalInfo() as it is; it
+      is not reset here.
+
+    @param tr
+      PLL instance
+
+    @param p
+      Node assumed to be the root
+
+    @param partialTraversal
+      If set to \b PLL_TRUE, then a partial traversal descriptor is computed.
+
+    @param numBranches
+      Number of branches (either per-partition branch or joint branch estimate)
+*/
+void computeTraversal(pllInstance *tr, nodeptr p, pllBoolean partialTraversal, int numBranches)
+{
+  /* the subtree lengths only have to be refreshed when recomputation is in use */
+  if(tr->useRecom && partialTraversal)
+    {
+      int
+        visited = 0;   /* scratch counter for the helper, its final value is not read here */
+
+      computeTraversalInfoStlen(p, tr->mxtips, tr->rvec, &visited);
+    }
+  else if(tr->useRecom)
+    {
+      computeFullTraversalInfoStlen(p, tr->mxtips, tr->rvec);
+    }
+
+  computeTraversalInfo(p, tr->td[0].ti, &tr->td[0].count,
+                       tr->mxtips, numBranches, partialTraversal,
+                       tr->rvec, tr->useRecom);
+}
+
+
+/** @brief Computes the conditional likelihood vectors of all nodes in the subtree rooted at \a p
+
+    Compute the conditional likelihood vectors of all nodes in the subtree rooted at node \a p. The
+    conditional likelihood vector at node \a p is recomputed regardless of whether the orientation (i.e. \a p->x)
+    is correct or not, and, recursively, the likelihoods at each node in the subtree as needed and if necessary.
+    In case \a masked is set to \b PLL_TRUE, the computation will not take place at partitions for which the
+    conditional likelihood has converged (for example as a result of previous branch length optimization).
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Root of the subtree for which we want to recompute the conditional likelihood vectors
+
+    @param masked
+      If set to \b PLL_TRUE, then likelihood vectors of partitions that are converged are
+      not recomputed.
+ */
+void pllUpdatePartials (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean masked)
+{
+  int
+    model;
+
+  /* a tip has no conditional likelihood vector to update */
+
+  if(isTip(p->number, tr->mxtips))
+    return;
+
+  /* the entry counter of the traversal descriptor is reset to zero before
+     computeTraversal() is asked to fill it.
+     NOTE(review): an older comment at this spot said that filling starts at
+     index one because entry zero is reserved for evaluate / branch length
+     optimization calls; the code resets the counter to 0 -- confirm which
+     of the two is intended */
+
+  tr->td[0].count = 0;
+
+  /* partial traversal: collect the nodes below p that need an update; one branch
+     per partition when branch lengths are estimated per partition, otherwise one */
+
+  computeTraversal(tr, p, PLL_TRUE, pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+
+  /* the descriptor has just been rebuilt, so flag it as changed for the duration of
+     this call -- not sure if it really always changes, something to optimize in the future */
+
+  tr->td[0].traversalHasChanged = PLL_TRUE;
+
+  /* We do a masked newview, i.e., do not execute newviews for each partition, when for example
+     doing a branch length optimization on the entire tree when branches are estimated on a per partition basis.
+
+     You may imagine that for partition 5 the branch length optimization has already converged whereas
+     for partition 6 we still need to go over the tree again.
+
+     This is explained in more detail in:
+
+     A. Stamatakis, M. Ott: "Load Balance in the Phylogenetic Likelihood Kernel". Proceedings of ICPP 2009
+
+     The external pllBoolean array tr->partitionConverged[] contains exactly that information and is copied
+     to executeModel and subsequently to the executeMask of the traversal descriptor */
+
+  if(masked)
+    for(model = 0; model < pr->numberOfPartitions; model++)
+      pr->partitionData[model]->executeModel = tr->partitionConverged[model] ? PLL_FALSE : PLL_TRUE;
+
+  /* only do work if the descriptor actually contains nodes to re-compute */
+
+  if(tr->td[0].count > 0)
+    {
+      /* store execute mask in traversal descriptor */
+
+      storeExecuteMaskInTraversalDescriptor(tr, pr);
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+      /* do the parallel fork-join for the threaded / MPI build;
+         note that we do not need a reduction operation here, but just a barrier to make
+         sure that all threads are done with their partition */
+
+      pllMasterBarrier(tr, pr, PLL_THREAD_NEWVIEW);
+#else
+      /* in the sequential case we now simply call pllNewviewIterative() */
+
+      pllNewviewIterative(tr, pr, 0);
+#endif
+    }
+
+  /* clean up: re-enable every partition after a masked run */
+
+  if(masked)
+    for(model = 0; model < pr->numberOfPartitions; model++)
+      pr->partitionData[model]->executeModel = PLL_TRUE;
+
+  tr->td[0].traversalHasChanged = PLL_FALSE;
+}
+
+/** @brief Compute the marginal ancestral probability vector for CAT/PSR model
+
+    Computes the marginal ancestral probability vector at a node for the CAT/PSR model, given the
+    conditional likelihood vector \a x3 of that node and a zero branch length P matrix \a diagptable.
+    For every site the conditional likelihood entries are multiplied with the P matrix block of the
+    site's rate category and the resulting \a numStates values are normalized by their sum.
+
+    The per-site values are accumulated directly in \a ancestralBuffer, so no temporary
+    buffer is allocated (and no allocation can fail) inside this function.
+
+    @note \a x3 and \a ancestralBuffer must not overlap: the output entries of a site are
+      written while the input entries of that site are still being read.
+
+    @note If the values of a site sum to zero the division yields non-finite entries;
+      no special handling is done for that case.
+
+    @param x3
+      Conditional likelihood of the node for which we are computing the ancestral vector
+      (\a numStates entries per site)
+
+    @param ancestralBuffer
+      Buffer where to store the marginal ancestral probability vector
+      (\a numStates entries per site)
+
+    @param diagptable
+      A zero branch length P matrix, one \a numStates x \a numStates block per rate category
+
+    @param n
+      Number of sites in the partition to process (in the case of MPI/PTHREADS, the number of sites in the partition assigned to the current thread/process)
+
+    @param numStates
+      Number of states
+
+    @param cptr
+      Array where the rate for each site in the compressed partition alignment is stored;
+      entry \a i selects the block of \a diagptable used for site \a i
+
+ */
+static void ancestralCat(double *x3, double *ancestralBuffer, double *diagptable, const int n, const int numStates, int *cptr)
+{
+  const int
+    statesSquare = numStates * numStates;
+
+  int
+    i;
+
+  for(i = 0; i < n; i++)
+    {
+      const double
+        *v = &x3[numStates * i],
+        *d = &diagptable[cptr[i] * statesSquare];
+
+      double
+        sum = 0.0,
+        *ancestral = &ancestralBuffer[numStates * i];
+
+      int
+        l,
+        j;
+
+      /* un-normalized value of every state, stored straight into the output slot */
+      for(l = 0; l < numStates; l++)
+        {
+          double
+            ump_x1 = 0.0;
+
+          for(j = 0; j < numStates; j++)
+            ump_x1 += v[j] * d[l * numStates + j];
+
+          sum += ump_x1;
+          ancestral[l] = ump_x1;
+        }
+
+      /* turn the values of this site into probabilities */
+      for(l = 0; l < numStates; l++)
+        ancestral[l] /= sum;
+    }
+}
+
+
+/* compute marginal ancestral states for GAMMA models,
+   for the equation to obtain marginal ancestral states
+   see Ziheng Yang's book */
+
+/** @brief Compute the marginal ancestral probability vector for GAMMA model
+
+    Computes the marginal ancestral probability vector for the GAMMA model, given the conditional likelihood
+    vector \a x3 of some node, and a zero branch length P matrix \a diagptable.
+    For every site the contributions of all rate categories are summed per state and the
+    resulting \a numStates values are normalized by their total.
+
+    The number of rate categories is derived as \a gammaStates / \a numStates instead of being
+    fixed to 4, so the function agrees with whatever layout the caller describes; for the
+    usual 4-category call the behaviour is unchanged. The per-site sums are accumulated
+    directly in \a ancestralBuffer, so no temporary buffer is allocated.
+
+    @note \a x3 and \a ancestralBuffer must not overlap: the output entries of a site are
+      written while the input entries of that site are still being read.
+
+    @note If the values of a site sum to zero the division yields non-finite entries;
+      no special handling is done for that case.
+
+    @param x3
+      Conditional likelihood of the node for which we are computing the ancestral vector
+      (\a gammaStates entries per site, grouped by rate category)
+
+    @param ancestralBuffer
+      Buffer where to store the marginal ancestral probability vector
+      (\a numStates entries per site)
+
+    @param diagptable
+      A zero branch length P matrix, one \a numStates x \a numStates block per rate category
+
+    @param n
+      Number of sites in the partition to process (in the case of MPI/PTHREADS, the number of sites in the partition assigned to the current thread/process)
+
+    @param numStates
+      Number of states
+
+    @param gammaStates
+      Number of GAMMA categories times number of states
+
+ */
+static void ancestralGamma(double *x3, double *ancestralBuffer, double *diagptable, const int n, const int numStates, const int gammaStates)
+{
+  const int
+    statesSquare = numStates * numStates,
+    /* guard the division; with no states there is nothing to iterate over anyway */
+    categories   = (numStates > 0) ? gammaStates / numStates : 0;
+
+  int
+    i;
+
+  for(i = 0; i < n; i++)
+    {
+      const double
+        *_v = &x3[gammaStates * i];
+
+      double
+        sum = 0.0,
+        *ancestral = &ancestralBuffer[numStates * i];
+
+      int
+        k,
+        j,
+        l;
+
+      for(l = 0; l < numStates; l++)
+        ancestral[l] = 0.0;
+
+      for(k = 0; k < categories; k++)
+        {
+          const double
+            *v = &_v[numStates * k],
+            *p = &diagptable[k * statesSquare];
+
+          for(l = 0; l < numStates; l++)
+            {
+              double
+                al = 0.0;
+
+              for(j = 0; j < numStates; j++)
+                al += v[j] * p[l * numStates + j];
+
+              /* per-state sum over the categories, plus the site total used for normalization */
+              ancestral[l] += al;
+              sum += al;
+            }
+        }
+
+      for(l = 0; l < numStates; l++)
+        ancestral[l] /= sum;
+    }
+}
+
+/** @brief Compute a dedicated zero branch length P matrix
+
+    Computes a P matrix by assuming a branch length of zero. This is used
+    for the marginal ancestral probabilities recomputation.
+
+    For every rate category a block of \a numStates x \a numStates values is written
+    to \a left: the first entry of each row is set to 1.0, the remaining entries are
+    the matching entries of \a EI scaled by the exponential term of the category.
+    At most 64 states are supported because the scratch arrays are fixed-size.
+
+    @param rptr
+      Array of values for rate categories
+
+    @param EI
+      Inverse eigenvector of Q matrix
+
+    @param EIGN
+      Eigenvalues of Q matrix
+
+    @param numberOfCategories
+      Number of rate categories
+
+    @param left
+      Where to store the resulting P matrix
+
+    @param numStates
+      Number of states
+ */
+static void calc_diagp_Ancestral(double *rptr, double *EI,  double *EIGN, int numberOfCategories, double *left, const int numStates)
+{
+  const double
+    branch = 0.0;                 /* the dedicated branch length of zero */
+
+  const int
+    blockSize = numStates * numStates;
+
+  double
+    scaledEigen[64],
+    expTerm[64];
+
+  int
+    cat,
+    row,
+    col;
+
+  assert(numStates <= 64);
+
+  /* eigenvalues multiplied with the branch length, shared by all categories */
+  for(col = 0; col < numStates; col++)
+    scaledEigen[col] = EIGN[col] * branch;
+
+  for(cat = 0; cat < numberOfCategories; cat++)
+    {
+      double
+        *block = left + blockSize * cat;
+
+      /* exponential terms of this rate category, entry zero is fixed to one */
+      expTerm[0] = 1.0;
+
+      col = 1;
+      while(col < numStates)
+        {
+          expTerm[col] = exp(rptr[cat] * scaledEigen[col]);
+          col++;
+        }
+
+      for(row = 0; row < numStates; row++)
+        {
+          double
+            *out = block + numStates * row,
+            *ei  = EI + numStates * row;
+
+          out[0] = 1.0;
+
+          for(col = 1; col < numStates; col++)
+            out[col] = expTerm[col] * ei[col];
+        }
+    }
+}
+
+/** @brief Compute the marginal ancestral probabilities at the single node of the traversal descriptor
+ *
+ *  A very simple iterative function, we only access the conditional likelihood vector at node \a p,
+ *  i.e. the first entry of \a tr->td[0], which must contain exactly one entry. For every
+ *  partition that is marked for execution and has at least one site, a P matrix for a branch
+ *  length of zero is built and the marginal ancestral probabilities are written to the
+ *  \a ancestralBuffer of that partition.
+ *
+ *  The memory saving mode (\a tr->saveMemory) is not supported here.
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ */
+void newviewAncestralIterative(pllInstance *tr, partitionList *pr)
+{
+  traversalInfo
+    *ti    = tr->td[0].ti,
+    *tInfo = &ti[0];
+
+  int
+    model,
+    p_slot = -1;
+
+  /* make sure that the traversal descriptor has length 1 */
+
+  assert(tr->td[0].count == 1);
+  assert(!tr->saveMemory);
+
+  /* get the index to the conditional likelihood vector depending on whether recomputation is used or not */
+
+  if(tr->useRecom)
+    p_slot = tInfo->slot_p;
+  else
+    p_slot = tInfo->pNumber - tr->mxtips - 1;
+
+  /* now loop over all partitions for nodes p of the current traversal vector entry */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      /* number of sites in this partition */
+      size_t
+        width  = (size_t)pr->partitionData[model]->width;
+
+      /* this conditional statement is exactly identical to what we do in pllEvaluateIterative */
+
+      if(tr->td[0].executeModel[model] && width > 0)
+        {
+          double
+            *x3_start = pr->partitionData[model]->xVector[p_slot],
+            *rateCategories = (double*)NULL,
+            *diagptable = (double*)NULL;
+
+          int
+            categories;
+
+          size_t
+            states = (size_t)pr->partitionData[model]->states,
+            availableLength = pr->partitionData[model]->xSpaceVector[p_slot],
+            requiredLength = 0,
+            rateHet = discreteRateCategories(tr->rateHetModel);
+
+          /* figure out what kind of rate heterogeneity approach we are using */
+
+          if(tr->rateHetModel == PLL_CAT)
+            {
+              rateCategories = pr->partitionData[model]->perSiteRates;
+              categories     = pr->partitionData[model]->numberOfCategories;
+            }
+          else
+            {
+              rateCategories = pr->partitionData[model]->gammaRates;
+              categories     = 4;
+            }
+
+          /* allocate some space for a special P matrix with a branch length of 0 into which we mingle
+             the eigenvalues. This will allow us to obtain real probabilities from the internal RAxML
+             representation */
+
+          rax_posix_memalign ((void **)&diagptable, PLL_BYTE_ALIGNMENT, categories * states * states * sizeof(double));
+
+          /* the buffer is written to right below, so stop here with a clear diagnostic if the
+             allocation did not hand back a usable pointer (diagptable starts out as NULL) */
+
+          assert(diagptable != (double*)NULL);
+
+          requiredLength  =  virtual_width( width ) * rateHet * states * sizeof(double);
+
+          /* make sure that this vector had already been allocated. This must be PLL_TRUE since we first invoked a standard newview() on this */
+
+          assert(requiredLength == availableLength);
+
+          /* now compute the special P matrix */
+
+          calc_diagp_Ancestral(rateCategories, pr->partitionData[model]->EI,  pr->partitionData[model]->EIGN, categories, diagptable, states);
+
+          /* switch over the rate heterogeneity model
+             and call generic functions that compute the marginal ancestral states and
+             store them in pr->partitionData[model]->ancestralBuffer
+          */
+
+          if(tr->rateHetModel == PLL_CAT)
+            ancestralCat(x3_start, pr->partitionData[model]->ancestralBuffer, diagptable, width, states, pr->partitionData[model]->rateCategory);
+          else
+            ancestralGamma(x3_start, pr->partitionData[model]->ancestralBuffer, diagptable, width, states, categories * states);
+
+          rax_free(diagptable);
+        }
+    }
+}
+
+/** @brief Computes the Conditional Likelihood Vector (CLV) for each rate of some internal node.
+
+    Computes the conditional likelihood vectors of node \a p for each rate, given the partition
+    index \a partition. The result is placed in the array \a outProbs, which must be pre-allocated
+    by the caller, and must be of size \a sites * categories * states * sizeof(double). The structure of
+    the resulting array is the following:
+    For each site we have \a categories * states cells of size \a double. Those cells are divided per rate
+    category, i.e. first \a states cells are the probabilities for the states of rate 1 (ordered alphabetically
+    by base name), next \a states cells for rate 2 and so on.
+    Within each rate category the values are divided by their sum, i.e. they add up to 1.
+
+    @param tr   PLL instance
+    @param pr     List of partitions
+    @param p Node for which we want to compute the CLV
+    @param partition   Index of the partition for which to compute the CLV
+    @param outProbs    Pre-allocated array where the result will be stored
+
+    @returns Returns \b PLL_TRUE on success, \b PLL_FALSE on failure (the rate heterogeneity
+             model is not \b PLL_GAMMA, or a temporary buffer could not be allocated)
+
+    @note The likelihood vector stored for \a p is read as is; presumably the caller has to make
+          sure it is up to date before calling this function -- TODO confirm.
+
+    @todo       Fix to work with CAT
+*/
+int pllGetCLV (pllInstance * tr, partitionList * pr, nodeptr p, int partition, double * outProbs)
+{
+  size_t i, j, k, l;
+
+  if (tr->rateHetModel != PLL_GAMMA) return (PLL_FALSE);
+
+  int p_slot;
+  size_t states = (size_t)pr->partitionData[partition]->states;
+
+  /* scratch space for the un-normalized values of one rate category */
+  double
+    *term = (double*)rax_malloc(sizeof(double) * states);
+
+  if (!term) return (PLL_FALSE);
+
+  if(tr->useRecom)
+    p_slot = p->number;
+  else
+    p_slot = p->number - tr->mxtips - 1;
+
+  size_t width = (size_t) pr->partitionData[partition]->width;
+  double * diagptable = NULL;
+  double * rateCategories = pr->partitionData[partition]->gammaRates;
+  double * x3 = pr->partitionData[partition]->xVector[p_slot];
+  size_t categories = 4;
+
+  rax_posix_memalign ((void **)&diagptable, PLL_BYTE_ALIGNMENT, categories * states * states * sizeof (double));
+
+  /* diagptable was initialized to NULL, so it is still NULL if nothing was allocated */
+  if (!diagptable)
+   {
+     rax_free(term);
+     return (PLL_FALSE);
+   }
+
+  calc_diagp_Ancestral(rateCategories, pr->partitionData[partition]->EI,  pr->partitionData[partition]->EIGN, categories, diagptable, states);
+
+  for (i = 0; i < width; ++ i)
+   {
+     /* input and output use the same layout: categories * states values per site */
+     double
+       *_v  = &x3[categories * states * i],
+       *clv = &outProbs[categories * states * i];
+
+     for (k = 0; k < categories; ++ k)
+      {
+        double
+         sum = 0.0,
+         *v = &(_v[states * k]);
+
+        for (l = 0; l < states; ++ l)
+         {
+           double al = 0.0;
+
+           for (j = 0; j < states; ++ j)
+             al += v[j] * diagptable[k * states * states + l * states + j];
+
+           term[l] = al;
+           sum += al;
+         }
+
+        /* each rate category occupies 'states' consecutive cells, hence the stride
+           must be 'states' and not 'categories' (the two only coincide for DNA) */
+        for (l = 0; l < states; ++ l)
+           clv[k * states + l] = term[l] / sum;
+      }
+   }
+
+  rax_free(term);
+  rax_free(diagptable);
+
+  return (PLL_TRUE);
+}
+
+/* this is very similar to pllUpdatePartials, except that it also computes the marginal ancestral probabilities 
+   at node p. To simplify the code I am re-using newview() here to first get the likelihood vector p->x at p
+   and then I deploy newviewAncestralIterative(tr); that should always only have a traversal descriptor of length 1,
+   to do some mathematical transformations that are required to obtain the marginal ancestral probabilities from 
+   the conditional likelihood array at p.
+
+   Note that the marginal ancestral probability vector summarizes the subtree rooted at p! */
+
+/** @brief Computes the conditional likelihood vectors of all nodes in the subtree rooted at \a p
+    and the marginal ancestral probabilities at node \a p
+
+    Compute the conditional likelihood vectors of all nodes in the subtree rooted at node \a p. The
+    conditional likelihood vector at node \a p is recomputed regardless of whether the orientation (i.e. \a p->x)
+    is correct or not, and, recursively, the likelihoods at each node in the subtree as needed and if necessary.
+    In addition, the marginal ancestral probability vector for node \a p is also computed.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node for which we want to compute the ancestral vector
+
+    @note
+      This function is not implemented with the saveMemory technique. If \a tr->saveMemory is set,
+      a message is printed and the function returns without computing anything.
+
+    @note
+      Calling this function on a tip is an error: a message is printed and an assertion is triggered.
+      If assertions are compiled out, the function returns without computing anything.
+*/
+void pllUpdatePartialsAncestral(pllInstance *tr, partitionList *pr, nodeptr p)
+{
+  /* error check, we don't need to compute anything for tips */
+  
+  if(isTip(p->number, tr->mxtips))
+    {
+      printf("You are trying to compute the ancestral states on a tip node of the tree\n");
+      assert(0);
+
+      /* only reached when assertions are disabled (NDEBUG): do not fall through and 
+         run the inner-node computation on a tip */
+
+      return;
+    }
+
+  /* doesn't work yet in conjunction with SEVs, can be implemented though at some point 
+     if urgently required */
+
+  if(tr->saveMemory)
+    {
+      printf("ancestral state implementation will not work with memory saving (SEVs) enabled!\n");
+      printf("returning without computing anything ... \n");
+      return;
+    }
+
+  /* first call pllUpdatePartials() with mask set to PLL_FALSE such that the likelihood vector is there ! */
+
+  pllUpdatePartials(tr, pr, p, PLL_FALSE);
+
+  /* now let's compute the ancestral states using this vector ! */
+  
+  /* to make things easy and reduce code size, let's re-compute a standard traversal descriptor for node p,
+     hence we need to set the count to 0 */
+
+  tr->td[0].count = 0;
+
+  computeTraversalInfo(p, &(tr->td[0].ti[0]), &(tr->td[0].count), tr->mxtips, pr->perGeneBranchLengths?pr->numberOfPartitions : 1, PLL_TRUE, tr->rvec, tr->useRecom);
+
+  tr->td[0].traversalHasChanged = PLL_TRUE;
+
+  /* here we actually assert, that the traversal descriptor only contains one node triplet p, p->next->back, p->next->next->back
+     this must be PLL_TRUE because we have already invoked the standard pllUpdatePartials() on p.
+  */ 
+
+  assert(tr->td[0].count == 1);  
+  
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  /* use the pthreads barrier to invoke newviewAncestralIterative() on a per-thread basis */
+
+  pllMasterBarrier (tr, pr, PLL_THREAD_NEWVIEW_ANCESTRAL);
+#else
+  /* now call the dedicated function that does the mathematical transformation of the 
+     conditional likelihood vector at p to obtain the marginal ancestral states */
+
+  newviewAncestralIterative(tr, pr);
+#endif
+
+  tr->td[0].traversalHasChanged = PLL_FALSE;
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  /* invoke another parallel region to gather the marginal ancestral probabilities 
+     from the threads/MPI processes */
+
+  pllMasterBarrier (tr, pr, PLL_THREAD_GATHER_ANCESTRAL);
+#endif
+
+  
+}
+
+/* returns the character representation of an enumerated DNA or AA state */
+
+/** @brief Get the character representation of an enumerated binary, DNA or AA state
+    
+    Returns the character representation of the enumerated binary, DNA or AA state,
+    from the constant arrays \a binaryStateNames (for binary data), \a dnaStateNames (for DNA)
+    or \a protStateNames (for proteins).
+
+    @param dataType
+      Type of data, i.e. \b PLL_BINARY_DATA, \b PLL_DNA_DATA or \b PLL_AA_DATA
+
+    @param state
+      The number which we want to decode to a letter. It is used directly as an index
+      into the state name array, no range check is performed.
+
+    @return
+      Returns the decoded character. For any other \a dataType an assertion is triggered;
+      if assertions are compiled out, '?' is returned.
+ */
+static char getStateCharacter(int dataType, int state)
+{
+  /* defined fallback so that the default branch never returns an indeterminate 
+     value when assert() is disabled (NDEBUG) */
+
+  char 
+    result = '?';
+
+  switch(dataType)
+    {    
+    case PLL_BINARY_DATA:
+      result = binaryStateNames[state];
+      break;
+    case PLL_DNA_DATA:
+      result = dnaStateNames[state];
+      break;
+    case PLL_AA_DATA:
+      result = protStateNames[state];
+      break;    
+    default:
+      assert(0);
+      break;
+    }
+
+  return  result;
+}
+
+/** @brief Prints the ancestral state information for a node \a p to the terminal
+
+    Reads the marginal ancestral probabilities of all partitions, stores them per site
+    in a temporary array of \a ancestralState records and prints them to standard output.
+    The probabilities are printed if \a printProbs is set to \b PLL_TRUE, the discrete
+    ancestral state sequence is printed if \a printStates is set to \b PLL_TRUE.
+    In the sequence a site is shown as '?' when all of its state probabilities are
+    approximately equal, otherwise as the character of its most probable state.
+
+    @param p
+      The node for which to print the ancestral state sequence (only its number is printed here)
+
+    @param printStates
+      If set to \b PLL_TRUE then the ancestral state sequence is printed
+
+    @param printProbs
+      If set to \b PLL_TRUE then the marginal ancestral state probabilities are printed
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @note  Here one can see how to store the ancestral probabilities in a dedicated data structure
+ */
+void printAncestralState(nodeptr p, pllBoolean printStates, pllBoolean printProbs, pllInstance *tr, partitionList *pr)
+{
+  int
+    part,
+    site,
+    s,
+    siteCount = 0;
+
+#ifdef _USE_PTHREADS
+  /* start of the current partition inside tr->ancestralVector */
+  size_t
+    vectorOffset = 0;
+#endif
+
+  /* one record per site: number of states, probabilities and discrete character */
+  ancestralState
+    *records = (ancestralState *)rax_malloc(sizeof(ancestralState) * tr->originalCrunchedLength);
+
+  /* collect the data of every site of every partition */
+  for(part = 0; part < pr->numberOfPartitions; part++)
+    {
+      int
+        i,
+        nSites  = pr->partitionData[part]->upper - pr->partitionData[part]->lower,
+        nStates = pr->partitionData[part]->states;
+
+      /* where the ancestral probabilities of this partition are stored */
+#ifdef _USE_PTHREADS
+      double
+        *ancestral = &tr->ancestralVector[vectorOffset];
+#else
+      double
+        *ancestral = pr->partitionData[part]->ancestralBuffer;
+#endif
+
+      for(i = 0; i < nSites; i++)
+        {
+          ancestralState
+            *rec = &records[siteCount];
+
+          /* probability each state would have if all states were equally likely */
+          double
+            uniform = 1.0 / (double)nStates,
+            best = -1.0;
+
+          int
+            bestState = -1;
+
+          pllBoolean
+            allEqual = PLL_TRUE;
+
+          rec->states = nStates;
+          rec->probs  = (double *)rax_malloc(sizeof(double) * nStates);
+
+          /* copy the probabilities, tracking the maximum and whether all of them
+             are close to the uniform value */
+          for(s = 0; s < nStates; s++)
+            {
+              double
+                value = ancestral[nStates * i + s];
+
+              if(value > best)
+                {
+                  best = value;
+                  bestState = s;
+                }
+
+              if(allEqual && !(PLL_ABS(uniform - value) < 0.000001))
+                allEqual = PLL_FALSE;
+
+              rec->probs[s] = value;
+            }
+
+          /* an undecidable site becomes '?', otherwise take the most probable state */
+          if(allEqual)
+            rec->c = '?';
+          else
+            rec->c = getStateCharacter(pr->partitionData[part]->dataType, bestState);
+
+          siteCount++;
+        }
+
+#ifdef _USE_PTHREADS
+      vectorOffset += nSites * nStates;
+#endif
+    }
+
+  /* marginal ancestral probabilities: node number, then one line per site */
+  if(printProbs)
+    {
+      printf("%d\n", p->number);
+
+      for(site = 0; site < tr->originalCrunchedLength; site++)
+        {
+          for(s = 0; s < records[site].states; s++)
+            printf("%f ", records[site].probs[s]);
+          printf("\n");
+        }
+
+      printf("\n");
+    }
+
+  /* discrete ancestral state sequence: node number followed by one character per site */
+  if(printStates)
+    {
+      printf("%d ", p->number);
+
+      for(site = 0; site < tr->originalCrunchedLength; site++)
+        printf("%c", records[site].c);
+
+      printf("\n");
+    }
+
+  /* release the per-site probability arrays, then the record array itself */
+  for(site = 0; site < tr->originalCrunchedLength; site++)
+    rax_free(records[site].probs);
+
+  rax_free(records);
+}
+
+/** @brief Computes the ancestral states of node \a p and copies them into caller supplied buffers
+
+    Calls ::pllUpdatePartialsAncestral for node \a p and then copies, for every site of every
+    partition, the marginal ancestral probabilities to \a outProbs and the discrete ancestral
+    state character to \a outSequence. The character of a site is '?' if all state probabilities
+    of that site are approximately equal, otherwise it is the character of the most probable state.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node for which the ancestral states are computed
+
+    @param outProbs
+      Pre-allocated output array. The probabilities of the site with global index \a k are
+      stored at \a outProbs[k * states + l], where \a states is the number of states of the
+      partition the site belongs to.
+
+    @param outSequence
+      Pre-allocated output string. One character per site is written and a terminating 0 is
+      placed at index \a tr->originalCrunchedLength, hence the buffer must hold at least
+      \a tr->originalCrunchedLength + 1 characters.
+
+    @note
+      ::pllUpdatePartialsAncestral returns without computing anything when \a tr->saveMemory
+      is set; the buffers are read and copied regardless.
+
+    @note
+      NOTE(review): with partitions of different state counts the \a k * states layout of
+      \a outProbs is not densely packed and the ranges of different sites may overlap --
+      confirm the intended layout with the callers before relying on it for mixed data.
+*/
+void pllGetAncestralState(pllInstance *tr, partitionList *pr, nodeptr p, double * outProbs, char * outSequence)
+{
+#ifdef _USE_PTHREADS
+  /* start of the current partition inside tr->ancestralVector */
+  size_t 
+    accumulatedOffset = 0;
+#endif
+
+  int
+    model,
+    globalIndex = 0;
+     
+  pllUpdatePartialsAncestral(tr, pr, p);
+  
+  /* loop over partitions; results are written straight into the output buffers, 
+     no temporary per-site storage is required */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      int            
+        i,
+        width = pr->partitionData[model]->upper - pr->partitionData[model]->lower,
+        states = pr->partitionData[model]->states;
+      
+      /* set pointer to ancestral probability vector */
+
+#ifdef _USE_PTHREADS
+      double
+        *ancestral = &tr->ancestralVector[accumulatedOffset];
+#else
+      double 
+        *ancestral = pr->partitionData[model]->ancestralBuffer;
+#endif        
+      
+      /* loop over the sites of the partition */
+
+      for(i = 0; i < width; i++, globalIndex++)
+        {
+          double
+            equal = 1.0 / (double)states,
+            max = -1.0,
+            *siteProbs = &outProbs[(size_t)globalIndex * (size_t)states];
+            
+          pllBoolean
+            approximatelyEqual = PLL_TRUE;
+
+          int
+            max_l = -1,
+            l;
+          
+          /* loop over states to store probabilities and find the maximum */
+
+          for(l = 0; l < states; l++)
+            {
+              double 
+                value = ancestral[states * i + l];
+
+              if(value > max)
+                {
+                  max = value;
+                  max_l = l;
+                }
+              
+              /* this is used for discretizing the ancestral state sequence, if all marginal ancestral 
+                 probabilities are approximately equal we output a ? */
+
+              approximatelyEqual = approximatelyEqual && (PLL_ABS(equal - value) < 0.000001);
+              
+              siteProbs[l] = value;                
+            }
+          
+          /* figure out the discrete ancestral state character */
+
+          if(approximatelyEqual)
+            outSequence[globalIndex] = '?';      
+          else
+            outSequence[globalIndex] = getStateCharacter(pr->partitionData[model]->dataType, max_l);
+        }
+
+#ifdef _USE_PTHREADS
+      accumulatedOffset += width * states;
+#endif            
+    }
+
+  /* terminate the ancestral state sequence */
+
+  outSequence[tr->originalCrunchedLength] = 0;
+}
+/* optimized function implementations */
+
+
+/**
+ *  @defgroup group1 Optimized functions
+ *  This is the optimized functions group
+ */
+
+#if (!defined(__AVX) && defined(__SSE3))
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR GAMMA with memory saving (Optimized SSE3 version for DNA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity. The memory saving technique is incorporated.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase,
+                                        double *x1_start, double *x2_start, double *x3_start,
+                                        double *EV, double *tipVector,
+                                        int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                        const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                        unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
+                                        double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn)
+{
+  int     
+    i, 
+    j, 
+    k, 
+    l,
+    addScale = 0, 
+    scaleGap = 0;
+
+  double
+    *x1,
+    *x2,
+    *x3,
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start,       
+    max;
+  PLL_ALIGN_BEGIN double
+    maxima[2] PLL_ALIGN_END,
+    EV_t[16] PLL_ALIGN_END;
+
+  __m128d 
+    values[8],
+    EVV[8];  
+
+  for(k = 0; k < 4; k++)
+    for (l=0; l < 4; l++)
+      EV_t[4 * l + k] = EV[4 * k + l];
+
+  for(k = 0; k < 8; k++)
+    EVV[k] = _mm_load_pd(&EV_t[k * 2]);      
+
+
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        double *uX1, *uX2;
+        PLL_ALIGN_BEGIN double umpX1[256] PLL_ALIGN_END, umpX2[256] PLL_ALIGN_END;
+
+
+        for (i = 1; i < 16; i++)
+        {           
+          __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
+          __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));       
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {                            
+              __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
+            }
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {
+              __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
+
+            }
+        }                 
+
+        uX1 = &umpX1[240];
+        uX2 = &umpX2[240];                          
+
+        for (j = 0; j < 4; j++)
+        {                                                                                  
+          __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+          __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+          __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
+          __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
+
+          __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
+          __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );                                                 
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6]; 
+          __m128d EV_t_l3_k2 = EVV[7];
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+          _mm_store_pd( &x3_gapColumn[j * 4 + 0], EV_t_l0_k0 );
+          _mm_store_pd( &x3_gapColumn[j * 4 + 2], EV_t_l2_k0 );    
+        }  
+
+
+        x3 = x3_start;
+
+        for (i = 0; i < n; i++)
+        {           
+          if(!(x3_gap[i / 32] & mask32[i % 32]))             
+          {
+            uX1 = &umpX1[16 * tipX1[i]];
+            uX2 = &umpX2[16 * tipX2[i]];                                        
+
+            for (j = 0; j < 4; j++)
+            {                                                                              
+              __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+              __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+
+              __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
+              __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
+
+
+              //
+              // multiply left * right
+              //
+
+              __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
+              __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
+
+
+              //
+              // multiply with EV matrix (!?)
+              //
+
+              __m128d EV_t_l0_k0 = EVV[0];
+              __m128d EV_t_l0_k2 = EVV[1];
+              __m128d EV_t_l1_k0 = EVV[2];
+              __m128d EV_t_l1_k2 = EVV[3];
+              __m128d EV_t_l2_k0 = EVV[4];
+              __m128d EV_t_l2_k2 = EVV[5];
+              __m128d EV_t_l3_k0 = EVV[6]; 
+              __m128d EV_t_l3_k2 = EVV[7];
+
+              EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+              EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+              EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+              EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+              EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+              EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+              EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+              EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+              EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+              EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+              EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+              EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+              EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+              EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+              _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
+              _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
+            }
+
+            x3 += 16;
+          }
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      { 
+        double 
+          *uX1;
+        PLL_ALIGN_BEGIN double
+          umpX1[256] PLL_ALIGN_END;
+
+        for (i = 1; i < 16; i++)
+        {
+          __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
+          __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));       
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {            
+              __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);                
+            }
+        }
+
+        {
+          __m128d maxv =_mm_setzero_pd();
+
+          scaleGap = 0;
+
+          x2 = x2_gapColumn;                     
+          x3 = x3_gapColumn;
+
+          uX1 = &umpX1[240];         
+
+          for (j = 0; j < 4; j++)
+          {                                
+            double *x2_p = &x2[j*4];
+            double *right_k0_p = &right[j*16];
+            double *right_k1_p = &right[j*16 + 1*4];
+            double *right_k2_p = &right[j*16 + 2*4];
+            double *right_k3_p = &right[j*16 + 3*4];
+            __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+            __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+            __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+            __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+            __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+            __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+            __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+            __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+            __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+            __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+            right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+            right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+            right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+            right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+            right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+            right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+            right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+            right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+            right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+            right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
+
+            __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+            __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+            __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
+            __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
+
+            __m128d EV_t_l0_k0 = EVV[0];
+            __m128d EV_t_l0_k2 = EVV[1];
+            __m128d EV_t_l1_k0 = EVV[2];
+            __m128d EV_t_l1_k2 = EVV[3];
+            __m128d EV_t_l2_k0 = EVV[4];
+            __m128d EV_t_l2_k2 = EVV[5];
+            __m128d EV_t_l3_k0 = EVV[6]; 
+            __m128d EV_t_l3_k2 = EVV[7];
+
+            EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+            EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+            EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+            EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+            EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+            EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+            EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+            EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+            EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+            EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+            values[j * 2]     = EV_t_l0_k0;
+            values[j * 2 + 1] = EV_t_l2_k0;                                
+
+            maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+            maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));                                    
+          }
+
+
+          _mm_store_pd(maxima, maxv);
+
+          max = PLL_MAX(maxima[0], maxima[1]);
+
+          if(max < PLL_MINLIKELIHOOD)
+          {
+            scaleGap = 1;
+
+            __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+            _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));       
+            _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+            _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+            _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+            _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));       
+            _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+            _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+            _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));                        
+          }
+          else
+          {
+            _mm_store_pd(&x3[0], values[0]);       
+            _mm_store_pd(&x3[2], values[1]);
+            _mm_store_pd(&x3[4], values[2]);
+            _mm_store_pd(&x3[6], values[3]);
+            _mm_store_pd(&x3[8], values[4]);       
+            _mm_store_pd(&x3[10], values[5]);
+            _mm_store_pd(&x3[12], values[6]);
+            _mm_store_pd(&x3[14], values[7]);
+          }
+        }                       
+
+        x3 = x3_start;
+
+        for (i = 0; i < n; i++)
+        {
+          if((x3_gap[i / 32] & mask32[i % 32]))
+          {            
+            if(scaleGap)
+            {   
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];                  
+            }
+          }
+          else
+          {                              
+            __m128d maxv =_mm_setzero_pd();              
+
+            if(x2_gap[i / 32] & mask32[i % 32])
+              x2 = x2_gapColumn;
+            else
+            {
+              x2 = x2_ptr;
+              x2_ptr += 16;
+            }
+
+            uX1 = &umpX1[16 * tipX1[i]];             
+
+
+            for (j = 0; j < 4; j++)
+            {                              
+              double *x2_p = &x2[j*4];
+              double *right_k0_p = &right[j*16];
+              double *right_k1_p = &right[j*16 + 1*4];
+              double *right_k2_p = &right[j*16 + 2*4];
+              double *right_k3_p = &right[j*16 + 3*4];
+              __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+              __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+              __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+              __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+              __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+              __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+              __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+              __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+              __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+              __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+
+              right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+              right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+              right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+              right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+              right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+              right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+              right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+
+              right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+              right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+
+              right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+              right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+              right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+              right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+              right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
+
+              {
+                //
+                // load left side from tip vector
+                //
+
+                __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+                __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+
+                //
+                // multiply left * right
+                //
+
+                __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
+                __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
+
+
+                //
+                // multiply with EV matrix (!?)
+                //                                
+
+                __m128d EV_t_l0_k0 = EVV[0];
+                __m128d EV_t_l0_k2 = EVV[1];
+                __m128d EV_t_l1_k0 = EVV[2];
+                __m128d EV_t_l1_k2 = EVV[3];
+                __m128d EV_t_l2_k0 = EVV[4];
+                __m128d EV_t_l2_k2 = EVV[5];
+                __m128d EV_t_l3_k0 = EVV[6]; 
+                __m128d EV_t_l3_k2 = EVV[7];
+
+
+                EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+                EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+                EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+                EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+                EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+                EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+                EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+                EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+                EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+                EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+                EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+                EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+                EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+                EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+                values[j * 2]     = EV_t_l0_k0;
+                values[j * 2 + 1] = EV_t_l2_k0;                            
+
+                maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+                maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));                
+              }            
+            }
+
+
+            _mm_store_pd(maxima, maxv);
+
+            max = PLL_MAX(maxima[0], maxima[1]);
+
+            if(max < PLL_MINLIKELIHOOD)
+            {
+              __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+              _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));     
+              _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+              _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+              _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+              _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));     
+              _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+              _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+              _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));      
+
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];
+
+            }
+            else
+            {
+              _mm_store_pd(&x3[0], values[0]);     
+              _mm_store_pd(&x3[2], values[1]);
+              _mm_store_pd(&x3[4], values[2]);
+              _mm_store_pd(&x3[6], values[3]);
+              _mm_store_pd(&x3[8], values[4]);     
+              _mm_store_pd(&x3[10], values[5]);
+              _mm_store_pd(&x3[12], values[6]);
+              _mm_store_pd(&x3[14], values[7]);
+            }            
+
+            x3 += 16;
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:         
+      {
+        __m128d maxv =_mm_setzero_pd();
+
+        scaleGap = 0;
+
+        x1 = x1_gapColumn;                  
+        x2 = x2_gapColumn;          
+        x3 = x3_gapColumn;
+
+        for (j = 0; j < 4; j++)
+        {
+
+          double *x1_p = &x1[j*4];
+          double *left_k0_p = &left[j*16];
+          double *left_k1_p = &left[j*16 + 1*4];
+          double *left_k2_p = &left[j*16 + 2*4];
+          double *left_k3_p = &left[j*16 + 3*4];
+
+          __m128d x1_0 = _mm_load_pd( &x1_p[0] );
+          __m128d x1_2 = _mm_load_pd( &x1_p[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
+          __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
+          __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
+          __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
+          __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
+          __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
+          __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
+          __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+
+          double *x2_p = &x2[j*4];
+          double *right_k0_p = &right[j*16];
+          double *right_k1_p = &right[j*16 + 1*4];
+          double *right_k2_p = &right[j*16 + 2*4];
+          double *right_k3_p = &right[j*16 + 3*4];
+          __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+          __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+          __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+          __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+          __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+          __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+          __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+          __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+          __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);                                    
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );                                          
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6]; 
+          __m128d EV_t_l3_k2 = EVV[7];
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+
+          values[j * 2] = EV_t_l0_k0;
+          values[j * 2 + 1] = EV_t_l2_k0;                           
+
+          maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+          maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
+        }
+
+        _mm_store_pd(maxima, maxv);
+
+        max = PLL_MAX(maxima[0], maxima[1]);
+
+        if(max < PLL_MINLIKELIHOOD)
+        {
+          __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+          scaleGap = 1;
+
+          _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));         
+          _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+          _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+          _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+          _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));         
+          _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+          _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+          _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));                      
+        }
+        else
+        {
+          _mm_store_pd(&x3[0], values[0]);         
+          _mm_store_pd(&x3[2], values[1]);
+          _mm_store_pd(&x3[4], values[2]);
+          _mm_store_pd(&x3[6], values[3]);
+          _mm_store_pd(&x3[8], values[4]);         
+          _mm_store_pd(&x3[10], values[5]);
+          _mm_store_pd(&x3[12], values[6]);
+          _mm_store_pd(&x3[14], values[7]);
+        }
+      }
+
+
+      x3 = x3_start;
+
+      for (i = 0; i < n; i++)
+      { 
+        if(x3_gap[i / 32] & mask32[i % 32])
+        {            
+          if(scaleGap)
+          {     
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];                              
+          }
+        }
+        else
+        {
+          __m128d maxv =_mm_setzero_pd();                   
+
+          if(x1_gap[i / 32] & mask32[i % 32])
+            x1 = x1_gapColumn;
+          else
+          {
+            x1 = x1_ptr;
+            x1_ptr += 16;
+          }
+
+          if(x2_gap[i / 32] & mask32[i % 32])
+            x2 = x2_gapColumn;
+          else
+          {
+            x2 = x2_ptr;
+            x2_ptr += 16;
+          }
+
+
+          for (j = 0; j < 4; j++)
+          {
+
+            double *x1_p = &x1[j*4];
+            double *left_k0_p = &left[j*16];
+            double *left_k1_p = &left[j*16 + 1*4];
+            double *left_k2_p = &left[j*16 + 2*4];
+            double *left_k3_p = &left[j*16 + 3*4];
+
+            __m128d x1_0 = _mm_load_pd( &x1_p[0] );
+            __m128d x1_2 = _mm_load_pd( &x1_p[2] );
+
+            __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
+            __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
+            __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
+            __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
+            __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
+            __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
+            __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
+            __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
+
+            left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+            left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+            left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+            left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+            left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+            left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+            left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+            left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+            left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+            left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+            left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+            left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+            left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+            left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+
+            //
+            // multiply/add right side
+            //
+            double *x2_p = &x2[j*4];
+            double *right_k0_p = &right[j*16];
+            double *right_k1_p = &right[j*16 + 1*4];
+            double *right_k2_p = &right[j*16 + 2*4];
+            double *right_k3_p = &right[j*16 + 3*4];
+            __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+            __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+            __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+            __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+            __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+            __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+            __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+            __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+            __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+            __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+            right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+            right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+            right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+            right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+            right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+            right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+            right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+
+            right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+            right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+            right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);     
+
+            //
+            // multiply left * right
+            //
+
+            __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+            __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+
+            //
+            // multiply with EV matrix (!?)
+            //       
+
+            __m128d EV_t_l0_k0 = EVV[0];
+            __m128d EV_t_l0_k2 = EVV[1];
+            __m128d EV_t_l1_k0 = EVV[2];
+            __m128d EV_t_l1_k2 = EVV[3];
+            __m128d EV_t_l2_k0 = EVV[4];
+            __m128d EV_t_l2_k2 = EVV[5];
+            __m128d EV_t_l3_k0 = EVV[6]; 
+            __m128d EV_t_l3_k2 = EVV[7];
+
+
+            EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+            EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+            EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+            EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+            EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+            EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+            EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+
+            EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+            EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+            EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+
+            values[j * 2] = EV_t_l0_k0;
+            values[j * 2 + 1] = EV_t_l2_k0;                         
+
+            maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+            maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
+          }
+
+
+          _mm_store_pd(maxima, maxv);
+
+          max = PLL_MAX(maxima[0], maxima[1]);
+
+          if(max < PLL_MINLIKELIHOOD)
+          {
+            __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+            _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));       
+            _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+            _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+            _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+            _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));       
+            _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+            _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+            _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));        
+
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];
+
+          }
+          else
+          {
+            _mm_store_pd(&x3[0], values[0]);       
+            _mm_store_pd(&x3[2], values[1]);
+            _mm_store_pd(&x3[4], values[2]);
+            _mm_store_pd(&x3[6], values[3]);
+            _mm_store_pd(&x3[8], values[4]);       
+            _mm_store_pd(&x3[10], values[5]);
+            _mm_store_pd(&x3[12], values[6]);
+            _mm_store_pd(&x3[14], values[7]);
+          }      
+
+
+
+          x3 += 16;
+
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR GAMMA (Optimized SSE3 version for DNA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMA(int tipCase,
+                            double *x1_start, double *x2_start, double *x3_start,
+                            double *EV, double *tipVector,
+                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                            const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling
+                            )
+{
+  int 
+    i, 
+    j, 
+    k, 
+    l,
+    addScale = 0;
+
+  //int scaling = 0;
+
+  double
+    *x1,
+    *x2,
+    *x3,
+    max;
+  PLL_ALIGN_BEGIN double
+    maxima[2] PLL_ALIGN_END,
+    EV_t[16] PLL_ALIGN_END;
+
+  __m128d 
+    values[8],
+    EVV[8];  
+
+  for(k = 0; k < 4; k++)
+    for (l=0; l < 4; l++)
+      EV_t[4 * l + k] = EV[4 * k + l];
+
+  for(k = 0; k < 8; k++)
+    EVV[k] = _mm_load_pd(&EV_t[k * 2]);
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        double *uX1, *uX2;
+        PLL_ALIGN_BEGIN double umpX1[256] PLL_ALIGN_END, umpX2[256] PLL_ALIGN_END;
+
+
+        for (i = 1; i < 16; i++)
+        {
+          __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
+          __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));       
+
+          for (j = 0; j < 4; j++)
+
+            for (k = 0; k < 4; k++) {
+              __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
+            }
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {
+              __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
+
+            }
+        }       
+
+        for (i = 0; i < n; i++)
+        {
+          x3 = &x3_start[i * 16];
+
+
+          uX1 = &umpX1[16 * tipX1[i]];
+          uX2 = &umpX2[16 * tipX2[i]];                      
+
+          for (j = 0; j < 4; j++)
+          {                                                                                
+            __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+            __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+
+            __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
+            __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
+
+
+            //
+            // multiply left * right
+            //
+
+            __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
+            __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
+
+
+            //
+            // multiply with EV matrix (!?)
+            //
+
+            __m128d EV_t_l0_k0 = EVV[0];
+            __m128d EV_t_l0_k2 = EVV[1];
+            __m128d EV_t_l1_k0 = EVV[2];
+            __m128d EV_t_l1_k2 = EVV[3];
+            __m128d EV_t_l2_k0 = EVV[4];
+            __m128d EV_t_l2_k2 = EVV[5];
+            __m128d EV_t_l3_k0 = EVV[6]; 
+            __m128d EV_t_l3_k2 = EVV[7];
+
+            EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+            EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+            EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+            EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+            EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+            EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+            EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+            EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+            EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+            EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+            EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+            _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
+            _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
+          }
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      { 
+        double *uX1;
+        PLL_ALIGN_BEGIN double umpX1[256] PLL_ALIGN_END;
+
+
+        for (i = 1; i < 16; i++)
+        {
+          __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
+          __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));       
+
+          for (j = 0; j < 4; j++)
+            for (k = 0; k < 4; k++)
+            {            
+              __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
+              __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
+
+              __m128d acc = _mm_setzero_pd();
+
+              acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
+              acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
+
+              acc = _mm_hadd_pd(acc, acc);
+              _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);                
+            }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+          __m128d maxv =_mm_setzero_pd();
+
+          x2 = &x2_start[i * 16];
+          x3 = &x3_start[i * 16];
+
+          uX1 = &umpX1[16 * tipX1[i]];       
+
+          for (j = 0; j < 4; j++)
+          {
+
+            //
+            // multiply/add right side
+            //
+            double *x2_p = &x2[j*4];
+            double *right_k0_p = &right[j*16];
+            double *right_k1_p = &right[j*16 + 1*4];
+            double *right_k2_p = &right[j*16 + 2*4];
+            double *right_k3_p = &right[j*16 + 3*4];
+            __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+            __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+            __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+            __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+            __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+            __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+            __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+            __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+            __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+            __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+
+
+            right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+            right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+            right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+            right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+            right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+            right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+
+            right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+            right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+
+            right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+            right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+            right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+            right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
+
+            {
+              //
+              // load left side from tip vector
+              //
+
+              __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
+              __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
+
+
+              //
+              // multiply left * right
+              //
+
+              __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
+              __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
+
+
+              //
+              // multiply with EV matrix (!?)
+              //                                  
+
+              __m128d EV_t_l0_k0 = EVV[0];
+              __m128d EV_t_l0_k2 = EVV[1];
+              __m128d EV_t_l1_k0 = EVV[2];
+              __m128d EV_t_l1_k2 = EVV[3];
+              __m128d EV_t_l2_k0 = EVV[4];
+              __m128d EV_t_l2_k2 = EVV[5];
+              __m128d EV_t_l3_k0 = EVV[6]; 
+              __m128d EV_t_l3_k2 = EVV[7];
+
+
+              EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+              EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+              EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+              EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+              EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+              EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+              EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+              EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+              EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+              EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+              EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+              EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+              EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+              EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+              values[j * 2]     = EV_t_l0_k0;
+              values[j * 2 + 1] = EV_t_l2_k0;                              
+
+              maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+              maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));                  
+            }
+          }
+
+
+          _mm_store_pd(maxima, maxv);
+
+          max = PLL_MAX(maxima[0], maxima[1]);
+
+          if(max < PLL_MINLIKELIHOOD)
+          {
+            __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+            _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));       
+            _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+            _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+            _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+            _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));       
+            _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+            _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+            _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));        
+
+             if(!fastScaling)
+               ex3[i] += 1;
+             else
+               addScale += wgt[i];
+
+          }
+          else
+          {
+            _mm_store_pd(&x3[0], values[0]);       
+            _mm_store_pd(&x3[2], values[1]);
+            _mm_store_pd(&x3[4], values[2]);
+            _mm_store_pd(&x3[6], values[3]);
+            _mm_store_pd(&x3[8], values[4]);       
+            _mm_store_pd(&x3[10], values[5]);
+            _mm_store_pd(&x3[12], values[6]);
+            _mm_store_pd(&x3[14], values[7]);
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+
+      for (i = 0; i < n; i++)
+      {
+        __m128d maxv =_mm_setzero_pd();
+
+
+        x1 = &x1_start[i * 16];
+        x2 = &x2_start[i * 16];
+        x3 = &x3_start[i * 16];
+
+        for (j = 0; j < 4; j++)
+        {
+
+          double *x1_p = &x1[j*4];
+          double *left_k0_p = &left[j*16];
+          double *left_k1_p = &left[j*16 + 1*4];
+          double *left_k2_p = &left[j*16 + 2*4];
+          double *left_k3_p = &left[j*16 + 3*4];
+
+          __m128d x1_0 = _mm_load_pd( &x1_p[0] );
+          __m128d x1_2 = _mm_load_pd( &x1_p[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
+          __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
+          __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
+          __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
+          __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
+          __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
+          __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
+          __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+
+          //
+          // multiply/add right side
+          //
+          double *x2_p = &x2[j*4];
+          double *right_k0_p = &right[j*16];
+          double *right_k1_p = &right[j*16 + 1*4];
+          double *right_k2_p = &right[j*16 + 2*4];
+          double *right_k3_p = &right[j*16 + 3*4];
+          __m128d x2_0 = _mm_load_pd( &x2_p[0] );
+          __m128d x2_2 = _mm_load_pd( &x2_p[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
+          __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
+          __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
+          __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
+          __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
+          __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
+          __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
+          __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);       
+
+          //
+          // multiply left * right
+          //
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+
+          //
+          // multiply with EV matrix (!?)
+          //         
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6]; 
+          __m128d EV_t_l3_k2 = EVV[7];
+
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
+
+
+          values[j * 2] = EV_t_l0_k0;
+          values[j * 2 + 1] = EV_t_l2_k0;                           
+
+          maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
+          maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
+        }
+
+
+        _mm_store_pd(maxima, maxv);
+
+        max = PLL_MAX(maxima[0], maxima[1]);
+
+        if(max < PLL_MINLIKELIHOOD)
+        {
+          __m128d sv = _mm_set1_pd(PLL_TWOTOTHE256);
+
+          _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));         
+          _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
+          _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
+          _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
+          _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));         
+          _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
+          _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
+          _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));          
+
+           if(!fastScaling)
+             ex3[i] += 1;
+           else
+             addScale += wgt[i];        
+        }
+        else
+        {
+          _mm_store_pd(&x3[0], values[0]);         
+          _mm_store_pd(&x3[2], values[1]);
+          _mm_store_pd(&x3[4], values[2]);
+          _mm_store_pd(&x3[6], values[3]);
+          _mm_store_pd(&x3[8], values[4]);         
+          _mm_store_pd(&x3[10], values[5]);
+          _mm_store_pd(&x3[12], values[6]);
+          _mm_store_pd(&x3[14], values[7]);
+        }        
+      }
+
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR CAT (Optimized SSE3 version for DNA data)
+
+    This is the SSE3 optimized version of ::newviewCAT_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b CAT
+    model of rate heterogeneity.
+
+    @note
+    For more details and function argument description check the function ::newviewCAT_FLEX
+*/
+static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
+                           double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                           int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                           int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{
+  /* Work done for every site i, identically in all three tip cases:
+       - le / ri are the 16-double blocks of left / right selected by cptr[i],
+         read as four groups of four doubles;
+       - each group is multiplied element-wise with x1 (resp. x2) and summed,
+         which yields four values per child;
+       - the four left values are multiplied element-wise with the four right
+         values (x1px2_k0 / x1px2_k2);
+       - that product is combined with the transposed copy of EV (EVV) to give
+         the four doubles written to x3.
+     Only the origin of x1 / x2 and the rescaling step differ between cases. */
+  double
+    *le,
+    *ri,
+    *x1,
+    *x2, 
+    *x3;
+  PLL_ALIGN_BEGIN double
+    EV_t[16] PLL_ALIGN_END;
+
+  int 
+    i, 
+    j, 
+    scale, 
+    addScale = 0;
+
+  /* minlikelihood_sse and sc hold PLL_MINLIKELIHOOD and PLL_TWOTOTHE256 in both
+     lanes; EVV receives EV_t as eight two-double registers */
+  __m128d
+    minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD ),
+                      sc = _mm_set1_pd(PLL_TWOTOTHE256),
+                      EVV[8];  
+
+  /* EV_t = EV read as a 4x4 array and transposed */
+  for(i = 0; i < 4; i++)
+    for (j=0; j < 4; j++)
+      EV_t[4 * j + i] = EV[4 * i + j];
+
+  /* EVV[i] = { EV_t[2i], EV_t[2i+1] } */
+  for(i = 0; i < 8; i++)
+    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:      
+      /* x1 and x2 are both tipVector entries, selected by tipX1[i] and tipX2[i];
+         the result is stored as computed, no rescaling check in this case */
+      for (i = 0; i < n; i++)
+      {  
+        x1 = &(tipVector[4 * tipX1[i]]);
+        x2 = &(tipVector[4 * tipX2[i]]);
+
+        x3 = &x3_start[i * 4];
+
+        le =  &left[cptr[i] * 16];
+        ri =  &right[cptr[i] * 16];
+
+        __m128d x1_0 = _mm_load_pd( &x1[0] );
+        __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+        __m128d left_k0_0 = _mm_load_pd( &le[0] );
+        __m128d left_k0_2 = _mm_load_pd( &le[2] );
+        __m128d left_k1_0 = _mm_load_pd( &le[4] );
+        __m128d left_k1_2 = _mm_load_pd( &le[6] );
+        __m128d left_k2_0 = _mm_load_pd( &le[8] );
+        __m128d left_k2_2 = _mm_load_pd( &le[10] );
+        __m128d left_k3_0 = _mm_load_pd( &le[12] );
+        __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+        left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+        left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+        left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+        left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+        /* two levels of horizontal adds leave
+           left_k0_0 = { sum x1[0..3]*le[0..3], sum x1[0..3]*le[4..7] } */
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+        left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+        left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+        left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+        left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+        left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+        /* likewise left_k2_0 = { sum x1[0..3]*le[8..11], sum x1[0..3]*le[12..15] } */
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+        left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+        /* same reduction for the right child: x2 against ri */
+        __m128d x2_0 = _mm_load_pd( &x2[0] );
+        __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+        __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+        __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+        __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+        __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+        __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+        __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+        __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+        __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+        right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+        right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+        right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+        right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+        right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+        right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+        right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+        right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+        right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+        right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);         
+
+        /* element-wise product of the four left and four right values */
+        __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+        __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );           
+
+        /* fresh copies of EVV: the registers are overwritten below */
+        __m128d EV_t_l0_k0 = EVV[0];
+        __m128d EV_t_l0_k2 = EVV[1];
+        __m128d EV_t_l1_k0 = EVV[2];
+        __m128d EV_t_l1_k2 = EVV[3];
+        __m128d EV_t_l2_k0 = EVV[4];
+        __m128d EV_t_l2_k2 = EVV[5];
+        __m128d EV_t_l3_k0 = EVV[6];
+        __m128d EV_t_l3_k2 = EVV[7];
+
+        /* after the reductions, EV_t_l0_k0 holds output entries 0 and 1 and
+           EV_t_l2_k0 holds entries 2 and 3; entry l is the sum over k of
+           (product value k) * EV_t[4*l + k] */
+        EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+        EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+        EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+        EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+        EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+        EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+        EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+        EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+        EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+        EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );      
+
+        _mm_store_pd(x3, EV_t_l0_k0);
+        _mm_store_pd(&x3[2], EV_t_l2_k0);                                   
+      }
+      break;
+    case PLL_TIP_INNER:      
+      /* x1 is the tipVector entry selected by tipX1[i]; x2 is read from
+         x2_start.  The arithmetic is the same as in the PLL_TIP_TIP case,
+         followed by a rescaling check. */
+      for (i = 0; i < n; i++)
+      {
+        x1 = &(tipVector[4 * tipX1[i]]);
+        x2 = &x2_start[4 * i];
+        x3 = &x3_start[4 * i];
+
+        le =  &left[cptr[i] * 16];
+        ri =  &right[cptr[i] * 16];
+
+        __m128d x1_0 = _mm_load_pd( &x1[0] );
+        __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+        __m128d left_k0_0 = _mm_load_pd( &le[0] );
+        __m128d left_k0_2 = _mm_load_pd( &le[2] );
+        __m128d left_k1_0 = _mm_load_pd( &le[4] );
+        __m128d left_k1_2 = _mm_load_pd( &le[6] );
+        __m128d left_k2_0 = _mm_load_pd( &le[8] );
+        __m128d left_k2_2 = _mm_load_pd( &le[10] );
+        __m128d left_k3_0 = _mm_load_pd( &le[12] );
+        __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+        left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+        left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+        left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+        left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+        left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+        left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+        left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+        left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+        left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+        left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+        __m128d x2_0 = _mm_load_pd( &x2[0] );
+        __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+        __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+        __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+        __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+        __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+        __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+        __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+        __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+        __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+        right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+        right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+        right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+        right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+        right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+        right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+        right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+        right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+        right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+        right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);         
+
+        __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+        __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+        __m128d EV_t_l0_k0 = EVV[0];
+        __m128d EV_t_l0_k2 = EVV[1];
+        __m128d EV_t_l1_k0 = EVV[2];
+        __m128d EV_t_l1_k2 = EVV[3];
+        __m128d EV_t_l2_k0 = EVV[4];
+        __m128d EV_t_l2_k2 = EVV[5];
+        __m128d EV_t_l3_k0 = EVV[6];
+        __m128d EV_t_l3_k2 = EVV[7];
+
+
+        EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+        EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+        EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+        EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+        EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+        EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+        EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+        EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+        EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+        EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                       
+
+        /* scale stays 1 only when both lanes of both result registers, after
+           being AND-ed with absMask.m, compare below PLL_MINLIKELIHOOD
+           (movemask == 3 means both lanes passed).
+           NOTE(review): absMask is defined outside this function; presumably
+           it clears the sign bit so the test is on absolute values -- confirm. */
+        scale = 1;
+
+        __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+        v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+        if(_mm_movemask_pd( v1 ) != 3)
+          scale = 0;
+        else
+        {
+          v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(_mm_movemask_pd( v1 ) != 3)
+            scale = 0;
+        }
+
+        if(scale)
+        {                     
+          /* store the result multiplied by PLL_TWOTOTHE256 and record the
+             event: per site in ex3[i], or as wgt[i] added to addScale when
+             fastScaling is set */
+          _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+          _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                   
+
+           if(!fastScaling)
+             ex3[i] += 1;
+           else
+             addScale += wgt[i];          
+        }       
+        else
+        {
+          _mm_store_pd(x3, EV_t_l0_k0);
+          _mm_store_pd(&x3[2], EV_t_l2_k0);
+        }
+
+
+      }
+      break;
+    case PLL_INNER_INNER:
+      /* x1 and x2 are read from x1_start and x2_start; arithmetic and
+         rescaling check are the same as in the PLL_TIP_INNER case */
+      for (i = 0; i < n; i++)
+      {
+        x1 = &x1_start[4 * i];
+        x2 = &x2_start[4 * i];
+        x3 = &x3_start[4 * i];
+
+        le =  &left[cptr[i] * 16];
+        ri =  &right[cptr[i] * 16];
+
+        __m128d x1_0 = _mm_load_pd( &x1[0] );
+        __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+        __m128d left_k0_0 = _mm_load_pd( &le[0] );
+        __m128d left_k0_2 = _mm_load_pd( &le[2] );
+        __m128d left_k1_0 = _mm_load_pd( &le[4] );
+        __m128d left_k1_2 = _mm_load_pd( &le[6] );
+        __m128d left_k2_0 = _mm_load_pd( &le[8] );
+        __m128d left_k2_2 = _mm_load_pd( &le[10] );
+        __m128d left_k3_0 = _mm_load_pd( &le[12] );
+        __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+        left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+        left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+        left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+        left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+        left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+        left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+        left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+        left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+        left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+        left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+        left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+        left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+        __m128d x2_0 = _mm_load_pd( &x2[0] );
+        __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+        __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+        __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+        __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+        __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+        __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+        __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+        __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+        __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+        right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+        right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+        right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+        right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+        right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+        right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+        right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+        right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+        right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+        right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+        right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+        right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);         
+
+        __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+        __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+        __m128d EV_t_l0_k0 = EVV[0];
+        __m128d EV_t_l0_k2 = EVV[1];
+        __m128d EV_t_l1_k0 = EVV[2];
+        __m128d EV_t_l1_k2 = EVV[3];
+        __m128d EV_t_l2_k0 = EVV[4];
+        __m128d EV_t_l2_k2 = EVV[5];
+        __m128d EV_t_l3_k0 = EVV[6];
+        __m128d EV_t_l3_k2 = EVV[7];
+
+
+        EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+        EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+        EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+        EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+        EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+        EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+        EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+        EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+        EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+        EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+        EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+        EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                              
+
+        /* rescale only if all four masked result values are below
+           PLL_MINLIKELIHOOD */
+        scale = 1;
+
+        __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+        v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+        if(_mm_movemask_pd( v1 ) != 3)
+          scale = 0;
+        else
+        {
+          v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(_mm_movemask_pd( v1 ) != 3)
+            scale = 0;
+        }
+
+        if(scale)
+        {                     
+          _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+          _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                   
+
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];   
+        }       
+        else
+        {
+          _mm_store_pd(x3, EV_t_l0_k0);
+          _mm_store_pd(&x3[2], EV_t_l2_k0);
+        }
+
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  /* with fastScaling the weighted count of rescaled sites is handed back
+     through scalerIncrement; otherwise the per-site counts live in ex3 */
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+#endif
+
+/** @brief Check whether the position \a pos in bitvector \a x is a gap
+    
+    @param x
+      A bitvector represented by unsigned integers
+
+    @param pos
+      Position to check in \a x if it is set (i.e. it is a gap) 
+
+    @return
+      Returns the value of the bit vector (\b 1 if set, \b 0 if not)
+*/
+//#ifndef __clang__
+//__inline
+//#endif
+pllBoolean isGap(unsigned int *x, int pos)
+{
+  /* pos / 32 selects the element of x, mask32[pos % 32] the bit inside it
+     (mask32 is a lookup table defined outside this function). */
+  const unsigned int hit = x[pos / 32] & mask32[pos % 32];
+
+  /* Collapse the masked word to exactly 1 or 0, as the documentation above
+     promises.  Returning the raw masked value would hand callers an arbitrary
+     non-zero number, which breaks comparisons against a TRUE constant and
+     relies on an implementation-defined conversion when the top bit is set. */
+  return (hit != 0);
+}
+
+/** @brief Check whether the position \a pos in bitvector \a x is \b NOT a gap
+    
+    @param x
+      A bitvector represented by unsigned integers
+
+    @param pos
+      Position to check in \a x if it is \b NOT set (i.e. it is \b NOT a gap) 
+
+    @return
+      Returns \b 1 if the bit at \a pos is \b NOT set (no gap), \b 0 if it is set
+*/
+//#ifndef __clang__
+//__inline
+//#endif
+pllBoolean noGap(unsigned int *x, int pos)
+{
+  /* element of x that holds position pos, and the mask32 entry for its bit */
+  const unsigned int word = x[pos / 32];
+  const unsigned int bit  = mask32[pos % 32];
+
+  /* 1 when the selected bit is clear, 0 when it is set */
+  return ((word & bit) == 0);
+}
+
+#if (!defined(__AVX) && defined(__SSE3))
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR CAT with memory saving (Optimized SSE3 version for DNA data)
+
+    This is the SSE3 optimized version of ::newviewCAT_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b CAT
+    model of rate heterogeneity. The memory saving technique is incorporated.
+
+    @note
+    For more details and function argument description check the function ::newviewCAT_FLEX
+*/
+static void newviewGTRCAT_SAVE( int tipCase,  double *EV,  int *cptr,
+                                double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                                int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
+{
+  double
+    *le,
+    *ri,
+    *x1,
+    *x2,
+    *x3,
+    *x1_ptr = x1_start,
+    *x2_ptr = x2_start, 
+    *x3_ptr = x3_start;
+  PLL_ALIGN_BEGIN double
+    EV_t[16] PLL_ALIGN_END;
+
+  int 
+    i, 
+    j, 
+    scale, 
+    scaleGap = 0,
+    addScale = 0;
+
+  __m128d
+    minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD ),
+                      sc = _mm_set1_pd(PLL_TWOTOTHE256),
+                      EVV[8];  
+
+  for(i = 0; i < 4; i++)
+    for (j=0; j < 4; j++)
+      EV_t[4 * j + i] = EV[4 * i + j];
+
+  for(i = 0; i < 8; i++)
+    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
+
+  {
+    x1 = x1_gapColumn;        
+    x2 = x2_gapColumn;
+    x3 = x3_gapColumn;
+
+    le =  &left[maxCats * 16];           
+    ri =  &right[maxCats * 16];                                                  
+
+    __m128d x1_0 = _mm_load_pd( &x1[0] );
+    __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+    __m128d left_k0_0 = _mm_load_pd( &le[0] );
+    __m128d left_k0_2 = _mm_load_pd( &le[2] );
+    __m128d left_k1_0 = _mm_load_pd( &le[4] );
+    __m128d left_k1_2 = _mm_load_pd( &le[6] );
+    __m128d left_k2_0 = _mm_load_pd( &le[8] );
+    __m128d left_k2_2 = _mm_load_pd( &le[10] );
+    __m128d left_k3_0 = _mm_load_pd( &le[12] );
+    __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+    left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+    left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+    left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+    left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+    left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+    left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+    left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+    left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+    left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+    left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+    __m128d x2_0 = _mm_load_pd( &x2[0] );
+    __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+    __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+    __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+    __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+    __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+    __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+    __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+    __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+    __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+    right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+    right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+    right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+    right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+    right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+    right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+    right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+    right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+    right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+    right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);     
+
+    __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+    __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+    __m128d EV_t_l0_k0 = EVV[0];
+    __m128d EV_t_l0_k2 = EVV[1];
+    __m128d EV_t_l1_k0 = EVV[2];
+    __m128d EV_t_l1_k2 = EVV[3];
+    __m128d EV_t_l2_k0 = EVV[4];
+    __m128d EV_t_l2_k2 = EVV[5];
+    __m128d EV_t_l3_k0 = EVV[6];
+    __m128d EV_t_l3_k2 = EVV[7];
+
+    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                   
+
+    if(tipCase != PLL_TIP_TIP)
+    {    
+      scale = 1;
+
+      __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+      if(_mm_movemask_pd( v1 ) != 3)
+        scale = 0;
+      else
+      {
+        v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+        v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+        if(_mm_movemask_pd( v1 ) != 3)
+          scale = 0;
+      }
+
+      if(scale)
+      {               
+        _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+        _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                     
+
+        scaleGap = PLL_TRUE;       
+      } 
+      else
+      {
+        _mm_store_pd(x3, EV_t_l0_k0);
+        _mm_store_pd(&x3[2], EV_t_l2_k0);
+      }
+    }
+    else
+    {
+      _mm_store_pd(x3, EV_t_l0_k0);
+      _mm_store_pd(&x3[2], EV_t_l2_k0);
+    }
+  }
+
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:      
+      for (i = 0; i < n; i++)
+      {
+        if(noGap(x3_gap, i))
+        {
+          x1 = &(tipVector[4 * tipX1[i]]);
+          x2 = &(tipVector[4 * tipX2[i]]);
+
+          x3 = x3_ptr;
+
+          if(isGap(x1_gap, i))
+            le =  &left[maxCats * 16];
+          else            
+            le =  &left[cptr[i] * 16];    
+
+          if(isGap(x2_gap, i))
+            ri =  &right[maxCats * 16];
+          else            
+            ri =  &right[cptr[i] * 16];
+
+          __m128d x1_0 = _mm_load_pd( &x1[0] );
+          __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &le[0] );
+          __m128d left_k0_2 = _mm_load_pd( &le[2] );
+          __m128d left_k1_0 = _mm_load_pd( &le[4] );
+          __m128d left_k1_2 = _mm_load_pd( &le[6] );
+          __m128d left_k2_0 = _mm_load_pd( &le[8] );
+          __m128d left_k2_2 = _mm_load_pd( &le[10] );
+          __m128d left_k3_0 = _mm_load_pd( &le[12] );
+          __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+          __m128d x2_0 = _mm_load_pd( &x2[0] );
+          __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+          __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+          __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+          __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+          __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+          __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+          __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+          __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);       
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );                 
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6];
+          __m128d EV_t_l3_k2 = EVV[7];
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );    
+
+          _mm_store_pd(x3, EV_t_l0_k0);
+          _mm_store_pd(&x3[2], EV_t_l2_k0);                                 
+
+          x3_ptr += 4;
+        }
+      }
+      break;
+    case PLL_TIP_INNER:      
+      for (i = 0; i < n; i++)
+      { 
+        if(isGap(x3_gap, i))
+        {
+          if(scaleGap)
+            {
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];
+            }
+        }
+        else
+        {             
+          x1 = &(tipVector[4 * tipX1[i]]);
+
+          x2 = x2_ptr;
+          x3 = x3_ptr;
+
+          if(isGap(x1_gap, i))
+            le =  &left[maxCats * 16];
+          else
+            le =  &left[cptr[i] * 16];
+
+          if(isGap(x2_gap, i))
+          {              
+            ri =  &right[maxCats * 16];
+            x2 = x2_gapColumn;
+          }
+          else
+          {
+            ri =  &right[cptr[i] * 16];
+            x2 = x2_ptr;
+            x2_ptr += 4;
+          }                               
+
+          __m128d x1_0 = _mm_load_pd( &x1[0] );
+          __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &le[0] );
+          __m128d left_k0_2 = _mm_load_pd( &le[2] );
+          __m128d left_k1_0 = _mm_load_pd( &le[4] );
+          __m128d left_k1_2 = _mm_load_pd( &le[6] );
+          __m128d left_k2_0 = _mm_load_pd( &le[8] );
+          __m128d left_k2_2 = _mm_load_pd( &le[10] );
+          __m128d left_k3_0 = _mm_load_pd( &le[12] );
+          __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+          __m128d x2_0 = _mm_load_pd( &x2[0] );
+          __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+          __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+          __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+          __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+          __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+          __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+          __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+          __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);       
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6];
+          __m128d EV_t_l3_k2 = EVV[7];
+
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                     
+
+          scale = 1;
+
+          __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(_mm_movemask_pd( v1 ) != 3)
+            scale = 0;
+          else
+          {
+            v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;
+          }
+
+          if(scale)
+          {                   
+            _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+            _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                 
+            
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];         
+          }     
+          else
+          {
+            _mm_store_pd(x3, EV_t_l0_k0);
+            _mm_store_pd(&x3[2], EV_t_l2_k0);
+          }
+
+          x3_ptr += 4;
+        }
+
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      { 
+        if(isGap(x3_gap, i))
+        {
+          if(scaleGap)
+            {
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];
+            }
+        }
+        else
+        {            
+          x3 = x3_ptr;
+
+          if(isGap(x1_gap, i))
+          {
+            x1 = x1_gapColumn;
+            le =  &left[maxCats * 16];
+          }
+          else
+          {
+            le =  &left[cptr[i] * 16];
+            x1 = x1_ptr;
+            x1_ptr += 4;
+          }
+
+          if(isGap(x2_gap, i))  
+          {
+            x2 = x2_gapColumn;
+            ri =  &right[maxCats * 16];     
+          }
+          else
+          {
+            ri =  &right[cptr[i] * 16];
+            x2 = x2_ptr;
+            x2_ptr += 4;
+          }                               
+
+          __m128d x1_0 = _mm_load_pd( &x1[0] );
+          __m128d x1_2 = _mm_load_pd( &x1[2] );
+
+          __m128d left_k0_0 = _mm_load_pd( &le[0] );
+          __m128d left_k0_2 = _mm_load_pd( &le[2] );
+          __m128d left_k1_0 = _mm_load_pd( &le[4] );
+          __m128d left_k1_2 = _mm_load_pd( &le[6] );
+          __m128d left_k2_0 = _mm_load_pd( &le[8] );
+          __m128d left_k2_2 = _mm_load_pd( &le[10] );
+          __m128d left_k3_0 = _mm_load_pd( &le[12] );
+          __m128d left_k3_2 = _mm_load_pd( &le[14] );
+
+          left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
+          left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
+
+          left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
+          left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
+
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
+          left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
+          left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
+
+          left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
+          left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
+
+          left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
+          left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
+
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
+          left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
+          left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
+
+          __m128d x2_0 = _mm_load_pd( &x2[0] );
+          __m128d x2_2 = _mm_load_pd( &x2[2] );
+
+          __m128d right_k0_0 = _mm_load_pd( &ri[0] );
+          __m128d right_k0_2 = _mm_load_pd( &ri[2] );
+          __m128d right_k1_0 = _mm_load_pd( &ri[4] );
+          __m128d right_k1_2 = _mm_load_pd( &ri[6] );
+          __m128d right_k2_0 = _mm_load_pd( &ri[8] );
+          __m128d right_k2_2 = _mm_load_pd( &ri[10] );
+          __m128d right_k3_0 = _mm_load_pd( &ri[12] );
+          __m128d right_k3_2 = _mm_load_pd( &ri[14] );
+
+          right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
+          right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
+
+          right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
+          right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
+
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
+          right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
+          right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
+
+          right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
+          right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
+
+          right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
+          right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
+
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
+          right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
+          right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);       
+
+          __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
+          __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
+
+          __m128d EV_t_l0_k0 = EVV[0];
+          __m128d EV_t_l0_k2 = EVV[1];
+          __m128d EV_t_l1_k0 = EVV[2];
+          __m128d EV_t_l1_k2 = EVV[3];
+          __m128d EV_t_l2_k0 = EVV[4];
+          __m128d EV_t_l2_k2 = EVV[5];
+          __m128d EV_t_l3_k0 = EVV[6];
+          __m128d EV_t_l3_k2 = EVV[7];
+
+
+          EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
+          EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
+
+          EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
+          EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
+
+          EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
+          EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
+
+          EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
+          EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
+
+          EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
+          EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
+          EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
+
+          EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );                                            
+
+          scale = 1;
+
+          __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
+          v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+          if(_mm_movemask_pd( v1 ) != 3)
+            scale = 0;
+          else
+          {
+            v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;
+          }
+
+          if(scale)
+          {                   
+            _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
+            _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));                 
+
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];         
+          }     
+          else
+          {
+            _mm_store_pd(x3, EV_t_l0_k0);
+            _mm_store_pd(&x3[2], EV_t_l2_k0);
+          }
+
+          x3_ptr += 4;
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+/* ---------------------------------------------------------------------------
+   Private helpers of newviewGTRGAMMAPROT_GAPPED_SAVE.
+
+   Layout constants used throughout (taken from the indexing in the code):
+     20   entries per state vector / per matrix row
+     80   entries per site (4 blocks of 20)
+     400  entries per 20x20 matrix block (one block per k = 0..3)
+     23   rows of tipVector that are pre-multiplied
+   All vector loads/stores on v, matrix rows, tipVector and extEV use the
+   aligned intrinsics, so those pointers must be 16-byte aligned.
+   --------------------------------------------------------------------------- */
+
+/* Fills ump[80 * i + k] (i = 0..22, k = 0..79) with the dot product of the
+   20-entry row i of tipVector and the 20-entry row k of matrix. */
+static void protGapSave_tipProducts(const double *tipVector, const double *matrix, double *ump)
+{
+  int i, k, l;
+
+  for(i = 0; i < 23; i++)
+  {
+    const double *tip = &tipVector[20 * i];
+
+    for(k = 0; k < 80; k++)
+    {
+      const double *row = &matrix[k * 20];
+      __m128d acc = _mm_setzero_pd();
+
+      /* even and odd entries are accumulated in separate lanes ... */
+      for(l = 0; l < 20; l += 2)
+        acc = _mm_add_pd(acc, _mm_mul_pd(_mm_load_pd(&tip[l]), _mm_load_pd(&row[l])));
+
+      /* ... and summed horizontally at the end */
+      acc = _mm_hadd_pd(acc, acc);
+
+      /* ump is only written with storel, hence needs no special alignment */
+      _mm_storel_pd(&ump[80 * i + k], acc);
+    }
+  }
+}
+
+/* out[l] = dot product of v[0..19] with row l of the 20x20 block 'matrix', l = 0..19. */
+static void protGapSave_rowDots(const double *v, const double *matrix, double *out)
+{
+  int l, j;
+
+  for(l = 0; l < 20; l++)
+  {
+    const double *row = &matrix[l * 20];
+    __m128d acc = _mm_setzero_pd();
+
+    for(j = 0; j < 20; j += 2)
+      acc = _mm_add_pd(acc, _mm_mul_pd(_mm_load_pd(&v[j]), _mm_load_pd(&row[j])));
+
+    acc = _mm_hadd_pd(acc, acc);
+
+    _mm_storel_pd(&out[l], acc);
+  }
+}
+
+/* v[0..19] = sum over k = 0..19 of (a[k] * b[k]) * extEV[k * 20 .. k * 20 + 19].
+   v is zeroed first and updated in memory after every k, as in the original code. */
+static void protGapSave_project(double *v, const double *extEV, const double *a, const double *b)
+{
+  int k, l;
+  const __m128d zero = _mm_setzero_pd();
+
+  for(l = 0; l < 20; l += 2)
+    _mm_store_pd(&v[l], zero);
+
+  for(k = 0; k < 20; k++)
+  {
+    const double *eev = &extEV[k * 20];
+    const double x1px2 = a[k] * b[k];
+    const __m128d x1px2v = _mm_set1_pd(x1px2);
+
+    for(l = 0; l < 20; l += 2)
+    {
+      __m128d vv = _mm_load_pd(&v[l]);
+      __m128d ee = _mm_load_pd(&eev[l]);
+
+      vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v, ee));
+
+      _mm_store_pd(&v[l], vv);
+    }
+  }
+}
+
+/* One 80-entry site with two tip children: uX1 / uX2 point to the 80 pre-multiplied
+   values (see protGapSave_tipProducts) selected for the left / right tip. */
+static void protGapSave_tipTipSite(double *x3v, const double *uX1, const double *uX2, const double *extEV)
+{
+  int j;
+
+  for(j = 0; j < 4; j++)
+    protGapSave_project(&x3v[j * 20], extEV, &uX1[j * 20], &uX2[j * 20]);
+}
+
+/* One 80-entry site with a tip child (pre-multiplied values uX1) and an inner child
+   (vector x2v, multiplied here with the four 20x20 blocks of right). */
+static void protGapSave_tipInnerSite(double *x3v, const double *uX1, const double *x2v,
+                                     const double *right, const double *extEV)
+{
+  double ump_x2[20];
+  int k;
+
+  for(k = 0; k < 4; k++)
+  {
+    protGapSave_rowDots(&x2v[k * 20], &right[k * 400], ump_x2);
+    protGapSave_project(&x3v[20 * k], extEV, &uX1[k * 20], ump_x2);
+  }
+}
+
+/* One 80-entry site with two inner children x1v and x2v. For every block k and row l
+   the two dot products (x1v with left, x2v with right) are multiplied and the product
+   scales row l of extEV, which is accumulated into the output block. */
+static void protGapSave_innerInnerSite(double *x3v, const double *x1v, const double *x2v,
+                                       const double *left, const double *right, const double *extEV)
+{
+  int k, l, j;
+
+  for(k = 0; k < 4; k++)
+  {
+    const double *vl = &x1v[20 * k];
+    const double *vr = &x2v[20 * k];
+    double *v = &x3v[20 * k];
+    const __m128d zero = _mm_setzero_pd();
+
+    for(l = 0; l < 20; l += 2)
+      _mm_store_pd(&v[l], zero);
+
+    for(l = 0; l < 20; l++)
+    {
+      __m128d al = _mm_setzero_pd();
+      __m128d ar = _mm_setzero_pd();
+
+      const double *ll   = &left[k * 400 + l * 20];
+      const double *rr   = &right[k * 400 + l * 20];
+      const double *EVEV = &extEV[20 * l];
+
+      for(j = 0; j < 20; j += 2)
+      {
+        __m128d lv  = _mm_load_pd(&ll[j]);
+        __m128d rv  = _mm_load_pd(&rr[j]);
+        __m128d vll = _mm_load_pd(&vl[j]);
+        __m128d vrr = _mm_load_pd(&vr[j]);
+
+        al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
+        ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+      }
+
+      /* after hadd both lanes hold the full dot product */
+      al = _mm_hadd_pd(al, al);
+      ar = _mm_hadd_pd(ar, ar);
+
+      al = _mm_mul_pd(al, ar);
+
+      for(j = 0; j < 20; j += 2)
+      {
+        __m128d vv  = _mm_load_pd(&v[j]);
+        __m128d EVV = _mm_load_pd(&EVEV[j]);
+
+        vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+
+        _mm_store_pd(&v[j], vv);
+      }
+    }
+  }
+}
+
+/* If the absolute value of all 80 entries of v is below PLL_MINLIKELIHOOD, multiplies
+   every entry by PLL_TWOTOTHE256 and returns 1; otherwise leaves v untouched and returns 0. */
+static int protGapSave_rescale(double *v)
+{
+  const __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+  const __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+  int l;
+
+  for(l = 0; l < 80; l += 2)
+  {
+    __m128d v1 = _mm_and_pd(_mm_load_pd(&v[l]), absMask.m);
+
+    v1 = _mm_cmplt_pd(v1, minlikelihood_sse);
+
+    /* movemask == 3 means both lanes compared "less than"; anything else ends the check */
+    if(_mm_movemask_pd( v1 ) != 3)
+      return 0;
+  }
+
+  for(l = 0; l < 80; l += 2)
+    _mm_store_pd(&v[l], _mm_mul_pd(_mm_load_pd(&v[l]), twoto));
+
+  return 1;
+}
+
+/* Records one scaling event for site i: per-site counter ex3[i] when fastScaling is
+   off, otherwise the site weight wgt[i] is added to the running total *addScale. */
+static void protGapSave_countScaling(const pllBoolean fastScaling, int *ex3, const int *wgt, int i, int *addScale)
+{
+  if(!fastScaling)
+    ex3[i] += 1;
+  else
+    *addScale += wgt[i];
+}
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR GAMMA with memory saving (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity. The memory saving technique is incorporated.
+
+    Memory saving as implemented here: a site \a i whose bit is set in \a x3_gap
+    (tested as <tt>x3_gap[i / 32] & mask32[i % 32]</tt>) gets no entry in \a x3; its
+    80 values are computed once per call into \a x3_gapColumn. Likewise an inner child
+    whose bit is set in \a x1_gap / \a x2_gap is read from \a x1_gapColumn /
+    \a x2_gapColumn and does not consume 80 doubles from \a x1 / \a x2. For tip
+    children the gap column uses the pre-multiplied values at offset 1760 (= 22 * 80).
+
+    Scaling: in the \b PLL_TIP_INNER and \b PLL_INNER_INNER cases every computed site
+    (and the gap column) is multiplied by \b PLL_TWOTOTHE256 when all of its 80 entries
+    are below \b PLL_MINLIKELIHOOD in absolute value. Each such event increments
+    \a ex3[i] (if \a fastScaling is off) or adds \a wgt[i] to the value finally stored
+    in \a *scalerIncrement (if \a fastScaling is on). A scaled gap column counts once
+    for every site flagged in \a x3_gap. No scaling is done for \b PLL_TIP_TIP.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase,
+                                            double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                                            int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                            int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                            unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,  
+                                            double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
+                                            )
+{
+  int i, addScale = 0, gapScaling = 0;
+  double
+    *x1v, *x2v,
+    *x1_ptr = x1,
+    *x2_ptr = x2,
+    *x3_ptr = x3;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        double umpX1[1840], umpX2[1840];
+
+        protGapSave_tipProducts(tipVector, left,  umpX1);
+        protGapSave_tipProducts(tipVector, right, umpX2);
+
+        /* shared column for all sites flagged in x3_gap */
+        protGapSave_tipTipSite(x3_gapColumn, &umpX1[1760], &umpX2[1760], extEV);
+
+        for(i = 0; i < n; i++)
+        {
+          if(!(x3_gap[i / 32] & mask32[i % 32]))
+          {
+            protGapSave_tipTipSite(x3_ptr, &umpX1[80 * tipX1[i]], &umpX2[80 * tipX2[i]], extEV);
+            x3_ptr += 80;
+          }
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        double umpX1[1840];
+
+        protGapSave_tipProducts(tipVector, left, umpX1);
+
+        /* shared column for all sites flagged in x3_gap */
+        protGapSave_tipInnerSite(x3_gapColumn, &umpX1[1760], x2_gapColumn, right, extEV);
+        gapScaling = protGapSave_rescale(x3_gapColumn);
+
+        for(i = 0; i < n; i++)
+        {
+          if(x3_gap[i / 32] & mask32[i % 32])
+          {
+            if(gapScaling)
+              protGapSave_countScaling(fastScaling, ex3, wgt, i, &addScale);
+          }
+          else
+          {
+            if(x2_gap[i / 32] & mask32[i % 32])
+              x2v = x2_gapColumn;
+            else
+            {
+              x2v = x2_ptr;
+              x2_ptr += 80;
+            }
+
+            protGapSave_tipInnerSite(x3_ptr, &umpX1[80 * tipX1[i]], x2v, right, extEV);
+
+            if(protGapSave_rescale(x3_ptr))
+              protGapSave_countScaling(fastScaling, ex3, wgt, i, &addScale);
+
+            x3_ptr += 80;
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      /* shared column for all sites flagged in x3_gap */
+      protGapSave_innerInnerSite(x3_gapColumn, x1_gapColumn, x2_gapColumn, left, right, extEV);
+      gapScaling = protGapSave_rescale(x3_gapColumn);
+
+      for(i = 0; i < n; i++)
+      {
+        if(x3_gap[i / 32] & mask32[i % 32])
+        {
+          if(gapScaling)
+            protGapSave_countScaling(fastScaling, ex3, wgt, i, &addScale);
+        }
+        else
+        {
+          if(x1_gap[i / 32] & mask32[i % 32])
+            x1v = x1_gapColumn;
+          else
+          {
+            x1v = x1_ptr;
+            x1_ptr += 80;
+          }
+
+          if(x2_gap[i / 32] & mask32[i % 32])
+            x2v = x2_gapColumn;
+          else
+          {
+            x2v = x2_ptr;
+            x2_ptr += 80;
+          }
+
+          protGapSave_innerInnerSite(x3_ptr, x1v, x2v, left, right, extEV);
+
+          if(protGapSave_rescale(x3_ptr))
+            protGapSave_countScaling(fastScaling, ex3, wgt, i, &addScale);
+
+          x3_ptr += 80;
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR GAMMA (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMAPROT(int tipCase,
+                                double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                                int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{
+  double  *uX1, *uX2, *v;
+  double x1px2;
+  int  i, j, l, k, scale, addScale = 0;
+  double *vl, *vr;
+
+
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        double umpX1[1840], umpX2[1840];
+
+        for(i = 0; i < 23; i++)
+        {
+          v = &(tipVector[20 * i]);
+
+          for(k = 0; k < 80; k++)
+          {
+            double *ll =  &left[k * 20];
+            double *rr =  &right[k * 20];
+
+            __m128d umpX1v = _mm_setzero_pd();
+            __m128d umpX2v = _mm_setzero_pd();
+
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
+              umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));                                 
+            }
+
+            umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
+            umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
+
+            _mm_storel_pd(&umpX1[80 * i + k], umpX1v);
+            _mm_storel_pd(&umpX2[80 * i + k], umpX2v);
+
+          }
+        }
+
+        for(i = 0; i < n; i++)
+        {
+          uX1 = &umpX1[80 * tipX1[i]];
+          uX2 = &umpX2[80 * tipX2[i]];
+
+          for(j = 0; j < 4; j++)
+          {
+            v = &x3[i * 80 + j * 20];
+
+
+            __m128d zero =  _mm_setzero_pd();
+            for(k = 0; k < 20; k+=2)                                
+              _mm_store_pd(&v[k], zero);
+
+            for(k = 0; k < 20; k++)
+            { 
+              double *eev = &extEV[k * 20];
+              x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+              __m128d x1px2v = _mm_set1_pd(x1px2);
+
+              for(l = 0; l < 20; l+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[l]);
+                __m128d ee = _mm_load_pd(&eev[l]);
+
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+
+                _mm_store_pd(&v[l], vv);
+              }
+            }
+
+
+          }        
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        double umpX1[1840], ump_x2[20];
+
+
+        for(i = 0; i < 23; i++)
+        {
+          v = &(tipVector[20 * i]);
+
+          for(k = 0; k < 80; k++)
+          {
+            double *ll =  &left[k * 20];
+
+            __m128d umpX1v = _mm_setzero_pd();
+
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));                                                 
+            }
+
+            umpX1v = _mm_hadd_pd(umpX1v, umpX1v);                               
+            _mm_storel_pd(&umpX1[80 * i + k], umpX1v);          
+
+
+          }
+        }
+
+        for (i = 0; i < n; i++)
+        {
+          uX1 = &umpX1[80 * tipX1[i]];
+
+          for(k = 0; k < 4; k++)
+          {
+            v = &(x2[80 * i + k * 20]);
+
+            for(l = 0; l < 20; l++)
+            {              
+              double *r =  &right[k * 400 + l * 20];
+              __m128d ump_x2v = _mm_setzero_pd();           
+
+              for(j = 0; j < 20; j+= 2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                __m128d rr = _mm_load_pd(&r[j]);
+                ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+              }
+
+              ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+
+              _mm_storel_pd(&ump_x2[l], ump_x2v);                                    
+            }
+
+            v = &(x3[80 * i + 20 * k]);
+
+            __m128d zero =  _mm_setzero_pd();
+            for(l = 0; l < 20; l+=2)                                
+              _mm_store_pd(&v[l], zero);
+
+            for(l = 0; l < 20; l++)
+            {
+              double *eev = &extEV[l * 20];
+              x1px2 = uX1[k * 20 + l]  * ump_x2[l];
+              __m128d x1px2v = _mm_set1_pd(x1px2);
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                __m128d ee = _mm_load_pd(&eev[j]);
+
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+
+                _mm_store_pd(&v[j], vv);
+              }                             
+            }                   
+
+          }
+
+
+          { 
+            v = &(x3[80 * i]);
+            __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+            scale = 1;
+            for(l = 0; scale && (l < 80); l += 2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              __m128d v1 = _mm_and_pd(vv, absMask.m);
+              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(_mm_movemask_pd( v1 ) != 3)
+                scale = 0;
+            }             
+          }
+
+
+          if (scale)
+          {
+
+            __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+            for(l = 0; l < 80; l+=2)
+            {
+              __m128d ex3v = _mm_load_pd(&v[l]);                  
+              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));      
+            }                             
+
+
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];
+
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+      {
+        for(k = 0; k < 4; k++)
+        {
+          vl = &(x1[80 * i + 20 * k]);
+          vr = &(x2[80 * i + 20 * k]);
+          v =  &(x3[80 * i + 20 * k]);
+
+
+          __m128d zero =  _mm_setzero_pd();
+          for(l = 0; l < 20; l+=2)                                  
+            _mm_store_pd(&v[l], zero);
+
+
+          for(l = 0; l < 20; l++)
+          {              
+
+            {
+              __m128d al = _mm_setzero_pd();
+              __m128d ar = _mm_setzero_pd();
+
+              double *ll   = &left[k * 400 + l * 20];
+              double *rr   = &right[k * 400 + l * 20];
+              double *EVEV = &extEV[20 * l];
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d lv  = _mm_load_pd(&ll[j]);
+                __m128d rv  = _mm_load_pd(&rr[j]);
+                __m128d vll = _mm_load_pd(&vl[j]);
+                __m128d vrr = _mm_load_pd(&vr[j]);
+
+                al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
+                ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+              }                  
+
+              al = _mm_hadd_pd(al, al);
+              ar = _mm_hadd_pd(ar, ar);
+
+              al = _mm_mul_pd(al, ar);
+
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv  = _mm_load_pd(&v[j]);
+                __m128d EVV = _mm_load_pd(&EVEV[j]);
+
+                vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+
+                _mm_store_pd(&v[j], vv);
+              }                                           
+            }            
+
+          }
+        }
+
+
+
+        { 
+          v = &(x3[80 * i]);
+          __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+          scale = 1;
+          for(l = 0; scale && (l < 80); l += 2)
+          {
+            __m128d vv = _mm_load_pd(&v[l]);
+            __m128d v1 = _mm_and_pd(vv, absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;
+          }               
+        }
+
+
+        if (scale)
+        {
+
+          __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+          for(l = 0; l < 80; l+=2)
+          {
+            __m128d ex3v = _mm_load_pd(&v[l]);            
+            _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));        
+          }                               
+
+
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR CAT (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewCAT_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b CAT
+    model of rate heterogeneity.
+
+    For every site \a i in [0, n) a 20-entry vector is written to \a x3 at offset 20 * i.
+    The \a left and \a right arrays are indexed in blocks of 400 doubles (20 rows of 20),
+    the block being selected by \a cptr[i]. Child vectors are taken from \a tipVector
+    (indexed by \a tipX1 / \a tipX2) for tip children and from \a x1 / \a x2 for inner children.
+
+    In the PLL_TIP_INNER and PLL_INNER_INNER cases, a site whose 20 entries all have a masked
+    value below PLL_MINLIKELIHOOD is multiplied by PLL_TWOTOTHE256; the event is recorded in
+    \a ex3[i] when \a fastScaling is false, or accumulated (weighted by \a wgt[i]) and written
+    to \a scalerIncrement when \a fastScaling is true. No scaling is done for PLL_TIP_TIP.
+
+    All vector accesses use the aligned intrinsics _mm_load_pd / _mm_store_pd, so every
+    address read or written here must be 16-byte aligned.
+
+    @note
+    For more details and function argument description check the function ::newviewCAT_FLEX
+*/
+static void newviewGTRCATPROT(int tipCase, double *extEV,
+                              int *cptr,
+                              double *x1, double *x2, double *x3, double *tipVector,
+                              int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                              int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
+{
+  double
+    *le, *ri, *v, *vl, *vr;
+
+  /* addScale accumulates wgt[i] of every rescaled site; it is only reported when fastScaling is set */
+  int i, l, j, scale, addScale = 0;
+
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        /* both children are tips: x1 and x2 are not read in this case */
+        for (i = 0; i < n; i++)
+        {
+          /* 20x20 blocks of left/right selected by this site's cptr entry */
+          le = &left[cptr[i] * 400];
+          ri = &right[cptr[i] * 400];
+
+          /* child vectors are looked up in tipVector by the tip codes of this site */
+          vl = &(tipVector[20 * tipX1[i]]);
+          vr = &(tipVector[20 * tipX2[i]]);
+          v  = &x3[20 * i];
+
+          /* clear the output vector; it is accumulated into below */
+          for(l = 0; l < 20; l+=2)
+            _mm_store_pd(&v[l], _mm_setzero_pd());                      
+
+
+          for(l = 0; l < 20; l++)
+          {
+            __m128d x1v = _mm_setzero_pd();
+            __m128d x2v = _mm_setzero_pd();      
+            double 
+              *ev = &extEV[l * 20],
+              *lv = &le[l * 20],
+              *rv = &ri[l * 20];
+
+            /* two-lane partial dot products: vl . (row l of le) and vr . (row l of ri) */
+            for(j = 0; j < 20; j+=2)
+            {
+              x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                  
+              x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+            }
+
+            /* horizontal add leaves the complete dot product in both lanes */
+            x1v = _mm_hadd_pd(x1v, x1v);
+            x2v = _mm_hadd_pd(x2v, x2v);
+
+            /* product of the left and right dot products, replicated in both lanes */
+            x1v = _mm_mul_pd(x1v, x2v);
+
+            /* v += product * (row l of extEV) */
+            for(j = 0; j < 20; j+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[j]);
+              vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+              _mm_store_pd(&v[j], vv);
+            }               
+
+          }        
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        /* left child is a tip (tipX1), right child is an inner vector read from x2 */
+        for (i = 0; i < n; i++)
+        {
+          le = &left[cptr[i] * 400];
+          ri = &right[cptr[i] * 400];
+
+          vl = &(tipVector[20 * tipX1[i]]);
+          vr = &x2[20 * i];
+          v  = &x3[20 * i];
+
+          /* clear the output vector; it is accumulated into below */
+          for(l = 0; l < 20; l+=2)
+            _mm_store_pd(&v[l], _mm_setzero_pd());                      
+
+
+
+          for(l = 0; l < 20; l++)
+          {
+
+            __m128d x1v = _mm_setzero_pd();
+            __m128d x2v = _mm_setzero_pd();     
+            double 
+              *ev = &extEV[l * 20],
+              *lv = &le[l * 20],
+              *rv = &ri[l * 20];
+
+            /* two-lane partial dot products: vl . (row l of le) and vr . (row l of ri) */
+            for(j = 0; j < 20; j+=2)
+            {
+              x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                  
+              x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+            }
+
+            /* horizontal add leaves the complete dot product in both lanes */
+            x1v = _mm_hadd_pd(x1v, x1v);
+            x2v = _mm_hadd_pd(x2v, x2v);
+
+            x1v = _mm_mul_pd(x1v, x2v);
+
+            /* v += product * (row l of extEV) */
+            for(j = 0; j < 20; j+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[j]);
+              vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+              _mm_store_pd(&v[j], vv);
+            }               
+
+          }
+
+          {         
+            __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+            /* scale stays 1 only if every pair of entries compares below the threshold;
+               the loop stops at the first pair that does not.
+               NOTE(review): absMask is defined outside this block - presumably it clears
+               the sign bit so the comparison is on absolute values; confirm. */
+            scale = 1;
+            for(l = 0; scale && (l < 20); l += 2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              __m128d v1 = _mm_and_pd(vv, absMask.m);
+              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+              /* movemask == 3 means both lanes satisfied the less-than comparison */
+              if(_mm_movemask_pd( v1 ) != 3)
+                scale = 0;
+            }             
+          }
+
+
+          if(scale)
+          {
+
+            __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+            /* rescale the whole site vector by PLL_TWOTOTHE256 */
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d ex3v = _mm_load_pd(&v[l]);
+              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));                  
+            }
+
+            /* record the scaling event per site, or as a weighted total */
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];         
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      /* both children are inner vectors read from x1 and x2; tipVector is not read */
+      for(i = 0; i < n; i++)
+      {
+        le = &left[cptr[i] * 400];
+        ri = &right[cptr[i] * 400];
+
+        vl = &x1[20 * i];
+        vr = &x2[20 * i];
+        v = &x3[20 * i];
+
+
+        /* clear the output vector; it is accumulated into below */
+        for(l = 0; l < 20; l+=2)
+          _mm_store_pd(&v[l], _mm_setzero_pd());                        
+
+
+        for(l = 0; l < 20; l++)
+        {
+
+          __m128d x1v = _mm_setzero_pd();
+          __m128d x2v = _mm_setzero_pd();
+          double 
+            *ev = &extEV[l * 20],
+            *lv = &le[l * 20],
+            *rv = &ri[l * 20];
+
+
+          /* two-lane partial dot products: vl . (row l of le) and vr . (row l of ri) */
+          for(j = 0; j < 20; j+=2)
+          {
+            x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                    
+            x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+          }
+
+          /* horizontal add leaves the complete dot product in both lanes */
+          x1v = _mm_hadd_pd(x1v, x1v);
+          x2v = _mm_hadd_pd(x2v, x2v);
+
+          x1v = _mm_mul_pd(x1v, x2v);
+
+          /* v += product * (row l of extEV) */
+          for(j = 0; j < 20; j+=2)
+          {
+            __m128d vv = _mm_load_pd(&v[j]);
+            vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+            _mm_store_pd(&v[j], vv);
+          }                 
+
+        }
+
+        {           
+          __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+          /* same scaling test as in the PLL_TIP_INNER case: all 20 entries must
+             compare below PLL_MINLIKELIHOOD after masking with absMask */
+          scale = 1;
+          for(l = 0; scale && (l < 20); l += 2)
+          {
+            __m128d vv = _mm_load_pd(&v[l]);
+            __m128d v1 = _mm_and_pd(vv, absMask.m);
+            v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+            if(_mm_movemask_pd( v1 ) != 3)
+              scale = 0;
+          }               
+        }
+
+
+        if(scale)
+        {
+
+          __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+          /* rescale the whole site vector by PLL_TWOTOTHE256 */
+          for(l = 0; l < 20; l+=2)
+          {
+            __m128d ex3v = _mm_load_pd(&v[l]);            
+            _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));        
+          }                               
+
+
+          /* record the scaling event per site, or as a weighted total */
+          if(!fastScaling)
+            ex3[i] += 1;
+          else
+            addScale += wgt[i];    
+        }
+      }
+      break;
+    default:
+      /* any other tipCase value is a programming error */
+      assert(0);
+  }
+
+  /* the weighted total is only written back in fast-scaling mode */
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for GTR CAT with memory saving (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewCAT_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b CAT
+    model of rate heterogeneity.
+
+    Compared to ::newviewGTRCATPROT, this variant first computes one shared 20-entry vector
+    \a x3_gapColumn from \a x1_gapColumn and \a x2_gapColumn, using the 400-double blocks of
+    \a left and \a right at index \a maxCats. Sites flagged by isGap(x3_gap, i) are then not
+    computed individually and occupy no space in \a x3: the output pointer only advances by
+    20 doubles for sites that are actually computed. Likewise \a x1 and \a x2 are consumed
+    sequentially, advancing only for sites not flagged in \a x1_gap / \a x2_gap; flagged child
+    sites use the corresponding gap column and the \a maxCats block instead.
+
+    Scaling is handled as in ::newviewGTRCATPROT (none for PLL_TIP_TIP). If the shared gap
+    column itself had to be rescaled, every site flagged in \a x3_gap is accounted for as one
+    scaling event as well.
+
+    All vector accesses use the aligned intrinsics _mm_load_pd / _mm_store_pd, so every
+    address read or written here must be 16-byte aligned.
+
+    @note
+    For more details and function argument description check the function ::newviewCAT_FLEX
+*/
+static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV,
+                                   int *cptr,
+                                   double *x1, double *x2, double *x3, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling,
+                                   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
+{
+  /* x1_ptr / x2_ptr / x3_ptr are running cursors into the compactly stored arrays */
+  double
+    *le, 
+    *ri, 
+    *v, 
+    *vl, 
+    *vr,
+    *x1_ptr = x1,
+    *x2_ptr = x2, 
+    *x3_ptr = x3;
+
+  /* scaleGap is set when the shared gap column was rescaled below */
+  int 
+    i, 
+    l, 
+    j, 
+    scale, 
+    scaleGap = 0,
+    addScale = 0;
+
+  /* Step 1: compute the shared output gap column once, before the per-site loop */
+  {
+    vl = x1_gapColumn;        
+    vr = x2_gapColumn;
+    v = x3_gapColumn;
+
+    /* the block at index maxCats is the one used for gap columns */
+    le = &left[maxCats * 400];
+    ri = &right[maxCats * 400];   
+
+    /* clear the output vector; it is accumulated into below */
+    for(l = 0; l < 20; l+=2)
+      _mm_store_pd(&v[l], _mm_setzero_pd());                    
+
+    for(l = 0; l < 20; l++)
+    {
+      __m128d x1v = _mm_setzero_pd();
+      __m128d x2v = _mm_setzero_pd();
+      double 
+        *ev = &extEV[l * 20],
+        *lv = &le[l * 20],
+        *rv = &ri[l * 20];
+
+
+      /* two-lane partial dot products: vl . (row l of le) and vr . (row l of ri) */
+      for(j = 0; j < 20; j+=2)
+      {
+        x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                
+        x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+      }
+
+      /* horizontal add leaves the complete dot product in both lanes */
+      x1v = _mm_hadd_pd(x1v, x1v);
+      x2v = _mm_hadd_pd(x2v, x2v);
+
+      x1v = _mm_mul_pd(x1v, x2v);
+
+      /* v += product * (row l of extEV) */
+      for(j = 0; j < 20; j+=2)
+      {
+        __m128d vv = _mm_load_pd(&v[j]);
+        vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+        _mm_store_pd(&v[j], vv);
+      }                 
+    }
+
+    /* the gap column is only checked for rescaling when at least one child is inner */
+    if(tipCase != PLL_TIP_TIP)
+    {       
+      __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+      /* scale stays 1 only if every pair of entries compares below the threshold.
+         NOTE(review): absMask is defined outside this block - presumably it clears
+         the sign bit so the comparison is on absolute values; confirm. */
+      scale = 1;
+      for(l = 0; scale && (l < 20); l += 2)
+      {
+        __m128d vv = _mm_load_pd(&v[l]);
+        __m128d v1 = _mm_and_pd(vv, absMask.m);
+        v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+        /* movemask == 3 means both lanes satisfied the less-than comparison */
+        if(_mm_movemask_pd( v1 ) != 3)
+          scale = 0;
+      }                 
+
+      if(scale)
+      {
+        __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+        for(l = 0; l < 20; l+=2)
+        {
+          __m128d ex3v = _mm_load_pd(&v[l]);              
+          _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));  
+        }                                 
+
+        /* remembered so that gap sites are counted as scaled in the loops below */
+        scaleGap = PLL_TRUE;       
+      }
+    }
+  }
+
+  /* Step 2: per-site computation; isGap / noGap are helpers defined outside this block */
+  switch(tipCase)
+  {
+    case PLL_TIP_TIP:
+      {
+        for (i = 0; i < n; i++)
+        {
+          /* sites for which noGap() does not hold are skipped entirely:
+             nothing is written and x3_ptr is not advanced */
+          if(noGap(x3_gap, i))
+          {             
+            vl = &(tipVector[20 * tipX1[i]]);
+            vr = &(tipVector[20 * tipX2[i]]);
+            v  = x3_ptr;
+
+            /* a child flagged as gap uses the maxCats block instead of the cptr[i] block */
+            if(isGap(x1_gap, i))
+              le =  &left[maxCats * 400];
+            else                  
+              le =  &left[cptr[i] * 400];         
+
+            if(isGap(x2_gap, i))
+              ri =  &right[maxCats * 400];
+            else                  
+              ri =  &right[cptr[i] * 400];
+
+            /* clear the output vector; it is accumulated into below */
+            for(l = 0; l < 20; l+=2)
+              _mm_store_pd(&v[l], _mm_setzero_pd());                    
+
+            for(l = 0; l < 20; l++)
+            {
+              __m128d x1v = _mm_setzero_pd();
+              __m128d x2v = _mm_setzero_pd();    
+              double 
+                *ev = &extEV[l * 20],
+                *lv = &le[l * 20],
+                *rv = &ri[l * 20];
+
+              /* two-lane partial dot products against row l of le / ri */
+              for(j = 0; j < 20; j+=2)
+              {
+                x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                
+                x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+              }
+
+              x1v = _mm_hadd_pd(x1v, x1v);
+              x2v = _mm_hadd_pd(x2v, x2v);
+
+              x1v = _mm_mul_pd(x1v, x2v);
+
+              /* v += product * (row l of extEV) */
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+                _mm_store_pd(&v[j], vv);
+              }            
+            }
+
+            /* advance the output cursor only for sites that were written */
+            x3_ptr += 20;
+
+          }   
+        }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        for (i = 0; i < n; i++)
+        {
+          if(isGap(x3_gap, i))
+          {
+            /* gap site: no vector is written; only account for the scaling
+               that was applied to the shared gap column */
+            if(scaleGap)
+              {
+                if(!fastScaling)
+                  ex3[i] += 1;
+                else
+                  addScale += wgt[i];
+              }
+          }
+          else
+          {      
+            vl = &(tipVector[20 * tipX1[i]]);
+
+            /* vr is assigned again in both branches of the x2_gap test below */
+            vr = x2_ptr;
+            v = x3_ptr;
+
+            if(isGap(x1_gap, i))
+              le =  &left[maxCats * 400];
+            else
+              le =  &left[cptr[i] * 400];
+
+            if(isGap(x2_gap, i))
+            {            
+              /* right child is a gap here: use its gap column, do not consume x2 */
+              ri =  &right[maxCats * 400];
+              vr = x2_gapColumn;
+            }
+            else
+            {
+              /* right child has its own vector: read it and advance the x2 cursor */
+              ri =  &right[cptr[i] * 400];
+              vr = x2_ptr;
+              x2_ptr += 20;
+            }                                             
+
+            /* clear the output vector; it is accumulated into below */
+            for(l = 0; l < 20; l+=2)
+              _mm_store_pd(&v[l], _mm_setzero_pd());                               
+
+            for(l = 0; l < 20; l++)
+            {
+              __m128d x1v = _mm_setzero_pd();
+              __m128d x2v = _mm_setzero_pd();   
+              double 
+                *ev = &extEV[l * 20],
+                *lv = &le[l * 20],
+                *rv = &ri[l * 20];
+
+              /* two-lane partial dot products against row l of le / ri */
+              for(j = 0; j < 20; j+=2)
+              {
+                x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                
+                x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+              }
+
+              x1v = _mm_hadd_pd(x1v, x1v);
+              x2v = _mm_hadd_pd(x2v, x2v);
+
+              x1v = _mm_mul_pd(x1v, x2v);
+
+              /* v += product * (row l of extEV) */
+              for(j = 0; j < 20; j+=2)
+              {
+                __m128d vv = _mm_load_pd(&v[j]);
+                vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+                _mm_store_pd(&v[j], vv);
+              }             
+            }
+
+            {       
+              __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+              /* all 20 entries must compare below PLL_MINLIKELIHOOD after masking */
+              scale = 1;
+              for(l = 0; scale && (l < 20); l += 2)
+              {
+                __m128d vv = _mm_load_pd(&v[l]);
+                __m128d v1 = _mm_and_pd(vv, absMask.m);
+                v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+                if(_mm_movemask_pd( v1 ) != 3)
+                  scale = 0;
+              }           
+            }
+
+
+            if(scale)
+            {
+              __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+              /* rescale the whole site vector by PLL_TWOTOTHE256 */
+              for(l = 0; l < 20; l+=2)
+              {
+                __m128d ex3v = _mm_load_pd(&v[l]);
+                _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));                
+              }
+              
+              /* record the scaling event per site, or as a weighted total */
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];       
+            }
+            /* advance the output cursor only for sites that were written */
+            x3_ptr += 20;
+          }
+        }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for(i = 0; i < n; i++)
+      { 
+        if(isGap(x3_gap, i))
+        {
+          /* gap site: no vector is written; only account for the scaling
+             that was applied to the shared gap column */
+          if(scaleGap)
+            {
+              if(!fastScaling)
+                ex3[i] += 1;
+              else
+                addScale += wgt[i];
+            }
+        }
+        else
+        {                    
+          v = x3_ptr;
+
+          if(isGap(x1_gap, i))
+          {
+            /* left child is a gap here: use its gap column, do not consume x1 */
+            vl = x1_gapColumn;
+            le =  &left[maxCats * 400];
+          }
+          else
+          {
+            /* left child has its own vector: read it and advance the x1 cursor */
+            le =  &left[cptr[i] * 400];
+            vl = x1_ptr;
+            x1_ptr += 20;
+          }
+
+          if(isGap(x2_gap, i))  
+          {
+            /* right child is a gap here: use its gap column, do not consume x2 */
+            vr = x2_gapColumn;
+            ri =  &right[maxCats * 400];            
+          }
+          else
+          {
+            /* right child has its own vector: read it and advance the x2 cursor */
+            ri =  &right[cptr[i] * 400];
+            vr = x2_ptr;
+            x2_ptr += 20;
+          }                               
+
+          /* clear the output vector; it is accumulated into below */
+          for(l = 0; l < 20; l+=2)
+            _mm_store_pd(&v[l], _mm_setzero_pd());                      
+
+          for(l = 0; l < 20; l++)
+          {
+            __m128d x1v = _mm_setzero_pd();
+            __m128d x2v = _mm_setzero_pd();
+            double 
+              *ev = &extEV[l * 20],
+              *lv = &le[l * 20],
+              *rv = &ri[l * 20];
+
+            /* two-lane partial dot products against row l of le / ri */
+            for(j = 0; j < 20; j+=2)
+            {
+              x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));                  
+              x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
+            }
+
+            x1v = _mm_hadd_pd(x1v, x1v);
+            x2v = _mm_hadd_pd(x2v, x2v);
+
+            x1v = _mm_mul_pd(x1v, x2v);
+
+            /* v += product * (row l of extEV) */
+            for(j = 0; j < 20; j+=2)
+            {
+              __m128d vv = _mm_load_pd(&v[j]);
+              vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
+              _mm_store_pd(&v[j], vv);
+            }               
+
+          }
+
+          {         
+            __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+
+            /* all 20 entries must compare below PLL_MINLIKELIHOOD after masking */
+            scale = 1;
+            for(l = 0; scale && (l < 20); l += 2)
+            {
+              __m128d vv = _mm_load_pd(&v[l]);
+              __m128d v1 = _mm_and_pd(vv, absMask.m);
+              v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+              if(_mm_movemask_pd( v1 ) != 3)
+                scale = 0;
+            }             
+          }
+
+          if(scale)
+          {
+            __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+
+            /* rescale the whole site vector by PLL_TWOTOTHE256 */
+            for(l = 0; l < 20; l+=2)
+            {
+              __m128d ex3v = _mm_load_pd(&v[l]);                  
+              _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));      
+            }                             
+
+            /* record the scaling event per site, or as a weighted total */
+            if(!fastScaling)
+              ex3[i] += 1;
+            else
+              addScale += wgt[i];          
+          }
+          /* advance the output cursor only for sites that were written */
+          x3_ptr += 20;
+        }
+      }
+      break;
+    default:
+      /* any other tipCase value is a programming error */
+      assert(0);
+  }
+
+  /* the weighted total is only written back in fast-scaling mode */
+  if(fastScaling)
+    *scalerIncrement = addScale;
+}
+
+
+/** @ingroup group1
+ *  @brief Computation of conditional likelihood array for the GTR GAMMA and for the LG4 model (Optimized SSE3 version for AA data)
+
+    This is the SSE3 optimized version of ::newviewGAMMA_FLEX for computing the conditional
+    likelihood arrays at some node \a p, given child nodes \a q and \a r using the \b GAMMA
+    model of rate heterogeneity and the LG4 model of evolution. Note that the original unoptimized
+    function does not incorporate the LG4 model.
+
+    @note
+    For more details and function argument description check the function ::newviewGAMMA_FLEX
+*/
+static void newviewGTRGAMMAPROT_LG4(int tipCase,
+                                    double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                                    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                    int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling)
+{
+  double  *uX1, *uX2, *v;
+  double x1px2;
+  int  i, j, l, k, scale, addScale = 0;
+  double *vl, *vr;
+#ifndef __SSE3
+  double al, ar;
+#endif
+
+
+
+  switch(tipCase)
+    {
+    case PLL_TIP_TIP:
+      {
+        double umpX1[1840], umpX2[1840];
+
+        for(i = 0; i < 23; i++)
+          {
+           
+
+            for(k = 0; k < 80; k++)
+              {
+                
+                v = &(tipVector[k / 20][20 * i]);
+#ifdef __SSE3
+                double *ll =  &left[k * 20];
+                double *rr =  &right[k * 20];
+                
+                __m128d umpX1v = _mm_setzero_pd();
+                __m128d umpX2v = _mm_setzero_pd();
+
+                for(l = 0; l < 20; l+=2)
+                  {
+                    __m128d vv = _mm_load_pd(&v[l]);
+                    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
+                    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));                                   
+                  }
+                
+                umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
+                umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
+                
+                _mm_storel_pd(&umpX1[80 * i + k], umpX1v);
+                _mm_storel_pd(&umpX2[80 * i + k], umpX2v);
+#else
+                umpX1[80 * i + k] = 0.0;
+                umpX2[80 * i + k] = 0.0;
+
+                for(l = 0; l < 20; l++)
+                  {
+                    umpX1[80 * i + k] +=  v[l] *  left[k * 20 + l];
+                    umpX2[80 * i + k] +=  v[l] * right[k * 20 + l];
+                  }
+#endif
+              }
+          }
+
+        for(i = 0; i < n; i++)
+          {
+            uX1 = &umpX1[80 * tipX1[i]];
+            uX2 = &umpX2[80 * tipX2[i]];
+
+            for(j = 0; j < 4; j++)
+              {
+                v = &x3[i * 80 + j * 20];
+
+#ifdef __SSE3
+                __m128d zero =  _mm_setzero_pd();
+                for(k = 0; k < 20; k+=2)                                    
+                  _mm_store_pd(&v[k], zero);
+
+                for(k = 0; k < 20; k++)
+                  { 
+                    double *eev = &extEV[j][k * 20];
+                    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+                    __m128d x1px2v = _mm_set1_pd(x1px2);
+
+                    for(l = 0; l < 20; l+=2)
+                      {
+                        __m128d vv = _mm_load_pd(&v[l]);
+                        __m128d ee = _mm_load_pd(&eev[l]);
+
+                        vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+                        
+                        _mm_store_pd(&v[l], vv);
+                      }
+                  }
+
+#else
+
+                for(k = 0; k < 20; k++)
+                  v[k] = 0.0;
+
+                for(k = 0; k < 20; k++)
+                  {                
+                    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
+                   
+                    for(l = 0; l < 20; l++)                                                     
+                      v[l] += x1px2 * extEV[j][20 * k + l];                  
+                  }
+#endif
+              }    
+          }
+      }
+      break;
+    case PLL_TIP_INNER:
+      {
+        double umpX1[1840], ump_x2[20];
+
+
+        for(i = 0; i < 23; i++)
+          {
+           
+
+            for(k = 0; k < 80; k++)
+              { 
+                v = &(tipVector[k / 20][20 * i]);
+#ifdef __SSE3
+                double *ll =  &left[k * 20];
+                                
+                __m128d umpX1v = _mm_setzero_pd();
+                
+                for(l = 0; l < 20; l+=2)
+                  {
+                    __m128d vv = _mm_load_pd(&v[l]);
+                    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));                                                   
+                  }
+                
+                umpX1v = _mm_hadd_pd(umpX1v, umpX1v);                           
+                _mm_storel_pd(&umpX1[80 * i + k], umpX1v);              
+#else       
+                umpX1[80 * i + k] = 0.0;
+
+                for(l = 0; l < 20; l++)
+                  umpX1[80 * i + k] +=  v[l] * left[k * 20 + l];
+#endif
+
+              }
+          }
+
+        for (i = 0; i < n; i++)
+          {
+            uX1 = &umpX1[80 * tipX1[i]];
+
+            for(k = 0; k < 4; k++)
+              {
+                v = &(x2[80 * i + k * 20]);
+#ifdef __SSE3              
+                for(l = 0; l < 20; l++)
+                  {                
+                    double *r =  &right[k * 400 + l * 20];
+                    __m128d ump_x2v = _mm_setzero_pd();     
+                    
+                    for(j = 0; j < 20; j+= 2)
+                      {
+                        __m128d vv = _mm_load_pd(&v[j]);
+                        __m128d rr = _mm_load_pd(&r[j]);
+                        ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
+                      }
+                     
+                    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
+                    
+                    _mm_storel_pd(&ump_x2[l], ump_x2v);                              
+                  }
+
+                v = &(x3[80 * i + 20 * k]);
+
+                __m128d zero =  _mm_setzero_pd();
+                for(l = 0; l < 20; l+=2)                                    
+                  _mm_store_pd(&v[l], zero);
+                  
+                for(l = 0; l < 20; l++)
+                  {
+                    double *eev = &extEV[k][l * 20];
+                    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
+                    __m128d x1px2v = _mm_set1_pd(x1px2);
+                  
+                    for(j = 0; j < 20; j+=2)
+                      {
+                        __m128d vv = _mm_load_pd(&v[j]);
+                        __m128d ee = _mm_load_pd(&eev[j]);
+                        
+                        vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
+                        
+                        _mm_store_pd(&v[j], vv);
+                      }                             
+                  }                     
+#else
+                for(l = 0; l < 20; l++)
+                  {
+                    ump_x2[l] = 0.0;
+
+                    for(j = 0; j < 20; j++)
+                      ump_x2[l] += v[j] * right[k * 400 + l * 20 + j];
+                  }
+
+                v = &(x3[80 * i + 20 * k]);
+
+                for(l = 0; l < 20; l++)
+                  v[l] = 0;
+
+                for(l = 0; l < 20; l++)
+                  {
+                    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
+                    for(j = 0; j < 20; j++)
+                      v[j] += x1px2 * extEV[k][l * 20  + j];
+                  }
+#endif
+              }
+           
+#ifdef __SSE3
+            { 
+              v = &(x3[80 * i]);
+              __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+              
+              scale = 1;
+              for(l = 0; scale && (l < 80); l += 2)
+                {
+                  __m128d vv = _mm_load_pd(&v[l]);
+                  __m128d v1 = _mm_and_pd(vv, absMask.m);
+                  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+                  if(_mm_movemask_pd( v1 ) != 3)
+                    scale = 0;
+                }                 
+            }
+#else
+            v = &x3[80 * i];
+            scale = 1;
+            for(l = 0; scale && (l < 80); l++)
+              scale = (PLL_ABS(v[l]) <  PLL_MINLIKELIHOOD );
+#endif
+
+            if (scale)
+              {
+#ifdef __SSE3
+               __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+               
+               for(l = 0; l < 80; l+=2)
+                 {
+                   __m128d ex3v = _mm_load_pd(&v[l]);             
+                   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); 
+                 }                                
+#else
+                for(l = 0; l < 80; l++)
+                  v[l] *= PLL_TWOTOTHE256;
+#endif
+
+                if(useFastScaling)
+                  addScale += wgt[i];
+                else
+                  ex3[i]  += 1;        
+              }
+          }
+      }
+      break;
+    case PLL_INNER_INNER:
+      for (i = 0; i < n; i++)
+       {
+         for(k = 0; k < 4; k++)
+           {
+             vl = &(x1[80 * i + 20 * k]);
+             vr = &(x2[80 * i + 20 * k]);
+             v =  &(x3[80 * i + 20 * k]);
+
+#ifdef __SSE3
+             __m128d zero =  _mm_setzero_pd();
+             for(l = 0; l < 20; l+=2)                               
+               _mm_store_pd(&v[l], zero);
+#else
+             for(l = 0; l < 20; l++)
+               v[l] = 0;
+#endif
+
+             for(l = 0; l < 20; l++)
+               {                 
+#ifdef __SSE3
+                 {
+                   __m128d al = _mm_setzero_pd();
+                   __m128d ar = _mm_setzero_pd();
+
+                   double *ll   = &left[k * 400 + l * 20];
+                   double *rr   = &right[k * 400 + l * 20];
+                   double *EVEV = &extEV[k][20 * l];
+                   
+                   for(j = 0; j < 20; j+=2)
+                     {
+                       __m128d lv  = _mm_load_pd(&ll[j]);
+                       __m128d rv  = _mm_load_pd(&rr[j]);
+                       __m128d vll = _mm_load_pd(&vl[j]);
+                       __m128d vrr = _mm_load_pd(&vr[j]);
+                       
+                       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
+                       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
+                     }                   
+                       
+                   al = _mm_hadd_pd(al, al);
+                   ar = _mm_hadd_pd(ar, ar);
+                   
+                   al = _mm_mul_pd(al, ar);
+
+                   for(j = 0; j < 20; j+=2)
+                     {
+                       __m128d vv  = _mm_load_pd(&v[j]);
+                       __m128d EVV = _mm_load_pd(&EVEV[j]);
+
+                       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
+
+                       _mm_store_pd(&v[j], vv);
+                     }                                            
+                 }               
+#else
+                 al = 0.0;
+                 ar = 0.0;
+
+                 for(j = 0; j < 20; j++)
+                   {
+                     al += vl[j] * left[k * 400 + l * 20 + j];
+                     ar += vr[j] * right[k * 400 + l * 20 + j];
+                   }
+
+                 x1px2 = al * ar;
+
+                 for(j = 0; j < 20; j++)
+                   v[j] += x1px2 * extEV[k][20 * l + j];
+#endif
+               }
+           }
+         
+
+#ifdef __SSE3
+         { 
+           v = &(x3[80 * i]);
+           __m128d minlikelihood_sse = _mm_set1_pd( PLL_MINLIKELIHOOD );
+           
+           scale = 1;
+           for(l = 0; scale && (l < 80); l += 2)
+             {
+               __m128d vv = _mm_load_pd(&v[l]);
+               __m128d v1 = _mm_and_pd(vv, absMask.m);
+               v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
+               if(_mm_movemask_pd( v1 ) != 3)
+                 scale = 0;
+             }            
+         }
+#else
+         v = &(x3[80 * i]);
+         scale = 1;
+         for(l = 0; scale && (l < 80); l++)
+           scale = ((PLL_ABS(v[l]) <  PLL_MINLIKELIHOOD ));
+#endif
+
+         if (scale)
+           {
+#ifdef __SSE3
+               __m128d twoto = _mm_set_pd(PLL_TWOTOTHE256, PLL_TWOTOTHE256);
+               
+               for(l = 0; l < 80; l+=2)
+                 {
+                   __m128d ex3v = _mm_load_pd(&v[l]);             
+                   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto)); 
+                 }                                
+#else        
+             for(l = 0; l < 80; l++)
+               v[l] *= PLL_TWOTOTHE256;
+#endif
+
+             if(useFastScaling)
+               addScale += wgt[i];
+             else
+               ex3[i]  += 1;      
+           }
+       }
+      break;
+    default:
+      assert(0);
+    }
+
+  if(useFastScaling)
+    *scalerIncrement = addScale;
+
+}
+#endif
+
+
diff --git a/pllrepo/src/optimizeModel.c b/pllrepo/src/optimizeModel.c
new file mode 100644
index 0000000..dde1b95
--- /dev/null
+++ b/pllrepo/src/optimizeModel.c
@@ -0,0 +1,3145 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file optimizeModel.c
+ *
+ * @brief Model optimization routines
+ */ 
+
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+/* Numerical constants for the one-dimensional optimizers in this file:
+   MNBRAK_* belong to the bracketing routine, BRENT_* to Brent's method. */
+static const double MNBRAK_GOLD =    1.618034;          /**< Golden ratio; factor by which brakGeneric extends the initial bracket */
+static const double MNBRAK_TINY =      1.e-20;          /**< NOTE(review): presumably guards against division by zero while bracketing -- confirm */
+static const double MNBRAK_GLIMIT =     100.0;          /**< NOTE(review): presumably the maximum magnification of a bracketing step -- confirm */
+static const double BRENT_ZEPS  =       1.e-5;          /**< Absolute term of the Brent tolerance: tol * |x| + BRENT_ZEPS */
+static const double BRENT_CGOLD =   0.3819660;          /**< Fraction of the chosen sub-interval taken by a golden-section step in brentGeneric */
+
+extern int optimizeRatesInvocations;
+extern int optimizeAlphaInvocations;
+extern int optimizeInvarInvocations;
+extern char ratesFileName[1024];
+extern char lengthFileName[1024];
+extern char lengthFileNameModel[1024];
+extern char *protModels[PLL_NUM_PROT_MODELS];
+
+static void optParamGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels, int rateNumber, double lim_inf, double lim_sup, int whichParameterType);
+// FLAG for easier debugging of model parameter optimization routines 
+
+//#define _DEBUG_MOD_OPT
+
+
+/*********************FUNCTIONS FOR EXACT MODEL OPTIMIZATION UNDER GTRGAMMA ***************************************/
+
+
+/* the following function is used to set rates in the Q matrix 
+   the data structure called symmetryVector is used to 
+   define the symmetries between rates as they are specified 
+   in some of the secondary structure substitution models that 
+   generally don't use GTR matrices but more restricted forms thereof */
+
+/** @brief Set a specific rate in the substitution matrix
+  *
+  * Sets the \a position-th substitution rate of partition \a model to \a rate.
+  *
+  * For partitions flagged as \a nonGTR the rates are tied together through
+  * the partition's \a symmetryVector: every rate that belongs to the same
+  * symmetry class as \a position is updated. If that class is also the class
+  * of the last rate, the rates are set to 1.0 instead of \a rate.
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param model
+  *   Index of partition
+  *
+  * @param rate
+  *   The new value of the substitution rate; asserted to lie in
+  *   [PLL_RATE_MIN, PLL_RATE_MAX]
+  *
+  * @param position
+  *   The index of the substitution rate to set. For DNA data the last rate
+  *   may not be addressed (asserted).
+  */
+static void setRateModel(partitionList *pr, int model, double rate, int position)
+{
+  int
+    states   = pr->partitionData[model]->states,
+    numRates = (states * states - states) / 2;
+
+  /* sanity checks on the rate index, the data type and the rate value */
+  if(pr->partitionData[model]->dataType == PLL_DNA_DATA)
+    assert(position >= 0 && position < (numRates - 1));
+  else
+    assert(position >= 0 && position < numRates);
+
+  assert(pr->partitionData[model]->dataType != PLL_BINARY_DATA);
+
+  assert(rate >= PLL_RATE_MIN && rate <= PLL_RATE_MAX);
+
+  /* unrestricted matrix: only the addressed rate changes */
+  if(!pr->partitionData[model]->nonGTR)
+    {
+      pr->partitionData[model]->substRates[position] = rate;
+      return;
+    }
+
+  {
+    /* restricted matrix: update the whole symmetry class of 'position' */
+    const int
+      symClass = pr->partitionData[model]->symmetryVector[position],
+      refClass = pr->partitionData[model]->symmetryVector[numRates - 1];
+
+    /* the class of the last rate is pinned to 1.0 */
+    const double
+      newRate = (symClass == refClass) ? 1.0 : rate;
+
+    int
+      k;
+
+    for(k = 0; k < numRates; k++)
+      {
+        if(pr->partitionData[model]->symmetryVector[k] == symClass)
+          pr->partitionData[model]->substRates[k] = newRate;
+      }
+  }
+}
+
+//LIBRARY: the only thing that we will need to do here is to 
+//replace linkList by a string and also add some error correction 
+//code
+
+/* 
+   The following three functions are used to link/unlink parameters 
+   between partitions. This should work in a generic way; however, 
+   so far it is mainly used for linking/unlinking GTR matrix parameter 
+   estimates across different protein data partitions.
+   Generally, this mechanism can also be used for linking/unlinking alpha parameters 
+   between partitions and the like.
+   However, all alpha parameter estimates for all partitions and GTR estimates for 
+   DNA partitions are unlinked by default. This is actually hard-coded 
+   in here. 
+*/
+
+/* Initialize a parameter linkage list for a certain parameter type (can be whatever).
+   The input is an integer vector that contains numberOfModels (numberOfPartitions) elements.
+
+   If we want to have all alpha parameters unlinked and have, say, 4 partitions, the input 
+   vector would look like this: {0, 1, 2, 3}. If we want to link partitions 0 and 3, the vector 
+   should look like this: {0, 1, 2, 0}.
+*/
+
+
+
+
+
+
+/* Dedicated helper function to initialize the linkage list, that is, essentially compute 
+   the integer vector int *linkList used above for linking GTR models.
+   
+   Once again, this is hard-coded in RAxML, because users cannot influence the linking.
+
+*/
+   
+
+/* free linkage list data structure */
+
+/* Selectors for the kind of model parameter that is being changed or
+   optimized (passed around as whichParameterType / whichFunction). */
+#define ALPHA_F    0   /* alpha parameter; gamma rate categories are recomputed */
+#define RATE_F     1   /* a substitution rate (see setRateModel) */
+#define FREQ_F     2   /* a frequency exponent; frequencies are renormalized */
+#define LXRATE_F   3   /* one of the four gammaRates entries, set directly */
+#define LXWEIGHT_F 4   /* one of the four LG4X weight exponents (see updateWeights) */
+
+/* Set the weight exponent number 'rate' (0..3) of partition 'model' to 'value'
+   and recompute all four lg4x_weights as the normalized exponentials of the
+   weight exponents, i.e. weight[j] = exp(exponent[j]) / sum_k exp(exponent[k]). */
+static void updateWeights(partitionList *pr, int model, int rate, double value)
+{
+    double total = 0.0;
+    int cat;
+
+    assert(rate >= 0 && rate < 4);
+
+    pr->partitionData[model]->lg4x_weightExponents[rate] = value;
+
+    /* normalization constant */
+    cat = 0;
+    while (cat < 4)
+    {
+        total += exp(pr->partitionData[model]->lg4x_weightExponents[cat]);
+        cat++;
+    }
+
+    /* normalized weights */
+    cat = 0;
+    while (cat < 4)
+    {
+        pr->partitionData[model]->lg4x_weights[cat] =
+            exp(pr->partitionData[model]->lg4x_weightExponents[cat]) / total;
+        cat++;
+    }
+}
+
+/* Optimize the four LG4X weight exponents one after the other with
+   optParamGeneric (parameter type LXWEIGHT_F, search interval
+   [-1000000.0, 200.0]). The likelihood is evaluated before and after the
+   optimization; a decrease is reported on stdout and then trips an assertion. */
+static void optimizeWeights(pllInstance *tr, partitionList *pr, double modelEpsilon, linkageList *ll,
+        int numberOfModels)
+{
+    double initialLH, finalLH;
+    int weightIndex;
+
+    /* reference likelihood before touching the weights */
+    pllEvaluateLikelihood(tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+    initialLH = tr->likelihood;
+
+    for (weightIndex = 0; weightIndex < 4; weightIndex++)
+    {
+        optParamGeneric(tr, pr, modelEpsilon, ll, numberOfModels, weightIndex,
+                -1000000.0, 200.0, LXWEIGHT_F);
+    }
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+    pllMasterBarrier(tr, pr, PLL_THREAD_COPY_LG4X_RATES);
+#endif
+
+    /* likelihood with the optimized weights */
+    pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+    finalLH = tr->likelihood;
+
+    if (finalLH < initialLH)
+    {
+        printf("Final: %f initial: %f\n", finalLH, initialLH);
+    }
+    assert(finalLH >= initialLH);
+}
+
+/** @brief Wrapper function for changing a specific model parameter to the specified value
+  *
+  * Change the \a rateNumber-th model parameter of the type specified by \a whichParameterType
+  * of partition \a index to the value \a value.
+  * This routine is usually called by model optimization routines to restore the original
+  * model parameter value when optimization leads to a worse likelihood than the original, or
+  * when testing a new parameter value during optimization.
+  *
+  * What is updated depends on the parameter type:
+  *  - \b RATE_F: the substitution rate is set and pllInitReversibleGTR is called
+  *  - \b ALPHA_F: alpha is set and the four gamma rate categories are recomputed
+  *  - \b FREQ_F: the frequency exponent is set, all frequencies are recomputed as
+  *    normalized exponentials of the exponents, and pllInitReversibleGTR is called
+  *  - \b LXRATE_F: the \a rateNumber-th entry of gammaRates is overwritten
+  *  - \b LXWEIGHT_F: the \a rateNumber-th LG4X weight exponent is set and the weights
+  *    are renormalized
+  * Any other type triggers an assertion.
+  *
+  * @param index
+  *   Index of partition
+  *
+  * @param rateNumber
+  *   The index of the model parameter (not used for \b ALPHA_F)
+  *
+  * @param value
+  *   The value to which the parameter must be changed
+  *
+  * @param whichParameterType
+  *   Type of model parameter. Can be \b RATE_F, \b ALPHA_F, \b FREQ_F, \b LXRATE_F
+  *   or \b LXWEIGHT_F
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  */   
+static void changeModelParameters(int index, int rateNumber, double value, int whichParameterType, pllInstance *tr, partitionList * pr)
+{
+  if(whichParameterType == RATE_F)
+    {
+      setRateModel(pr, index, value, rateNumber);
+      pllInitReversibleGTR(tr, pr, index);
+    }
+  else if(whichParameterType == ALPHA_F)
+    {
+      pr->partitionData[index]->alpha = value;
+      pllMakeGammaCats(pr->partitionData[index]->alpha, pr->partitionData[index]->gammaRates, 4, tr->useMedian);
+    }
+  else if(whichParameterType == FREQ_F)
+    {
+      int
+        states = pr->partitionData[index]->states,
+        s;
+
+      double
+        expSum = 0.0;
+
+      pr->partitionData[index]->freqExponents[rateNumber] = value;
+
+      /* normalization constant over all states */
+      for(s = 0; s < states; s++)
+        expSum += exp(pr->partitionData[index]->freqExponents[s]);
+
+      /* frequencies are the normalized exponentials of the exponents */
+      for(s = 0; s < states; s++)
+        pr->partitionData[index]->frequencies[s] = exp(pr->partitionData[index]->freqExponents[s]) / expSum;
+
+      pllInitReversibleGTR(tr, pr, index);
+    }
+  else if(whichParameterType == LXRATE_F)
+    {
+      pr->partitionData[index]->gammaRates[rateNumber] = value;
+    }
+  else if(whichParameterType == LXWEIGHT_F)
+    {
+      updateWeights(pr, index, rateNumber, value);
+    }
+  else
+    {
+      /* unknown parameter type */
+      assert(0);
+    }
+}
+
+/* function that evaluates the change to a parameter */
+/** @brief Evaluate the change of a parameter
+ *
+ *  For every valid, not yet converged entry \a i of the linkage list, set the
+ *  \a rateNumber-th parameter of type \a whichFunction (\b ALPHA_F, \b RATE_F,
+ *  \b FREQ_F, \b LXRATE_F or \b LXWEIGHT_F) of all partitions linked in that entry
+ *  to \a value[i] and evaluate the likelihood. Partitions of converged or invalid
+ *  entries are excluded from the evaluation by clearing their \a executeModel flag.
+ *  For \b LXRATE_F the LG4X weights are additionally re-optimized (nested
+ *  optimization). On return, \a result[i] holds the negated sum of the
+ *  \a partitionLH values of the partitions linked in valid entry \a i, and
+ *  \a executeModel is set to \b PLL_TRUE for all partitions in the linkage list.
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param rateNumber
+ *    Index of the parameter to optimize; asserted to differ from -1 for
+ *    \b RATE_F, \b LXRATE_F and \b LXWEIGHT_F
+ *
+ *  @param value
+ *    Array with one candidate parameter value per valid linkage list entry
+ *
+ *  @param result
+ *    An array where, for each valid entry \a i of the linkage list \a ll, the negated sum of the partition log likelihoods is stored after evaluating the \a i-th parameter of array \a value
+ *
+ *  @param converged
+ *    Array with one flag per valid linkage list entry; entries flagged as converged are neither changed nor re-evaluated
+ *
+ *  @param whichFunction
+ *    Type of the model parameter. Possible values are \b ALPHA_F, \b RATE_F, \b FREQ_F, \b LXRATE_F and \b LXWEIGHT_F
+ *
+ *  @param numberOfModels
+ *    Number of valid linkage list entries for which we are optimizing (asserted)
+ *
+ *  @param ll
+ *    Linkage list
+ *
+ *  @param modelEpsilon
+ *    Epsilon threshold; only forwarded to the nested weight optimization
+ */
+static void evaluateChange(pllInstance *tr, partitionList *pr, int rateNumber, double *value, double *result, pllBoolean* converged, int whichFunction, int numberOfModels, linkageList *ll, double modelEpsilon)
+{ 
+  int 
+    i, 
+    k, 
+    pos;
+
+  pllBoolean
+    atLeastOnePartition = PLL_FALSE;
+
+  /* Step 1: apply the candidate values. 'pos' counts the valid entries and
+     indexes value[] / converged[]. */
+  for(i = 0, pos = 0; i < ll->entries; i++)
+    {
+      if(ll->ld[i].valid)
+        {
+          if(converged[pos])
+            {
+              /* already converged: exclude its partitions from the evaluation */
+              for(k = 0; k < ll->ld[i].partitions; k++)
+                pr->partitionData[ll->ld[i].partitionList[k]]->executeModel = PLL_FALSE;
+            }
+          else
+            {
+              atLeastOnePartition = PLL_TRUE;
+              /* all partitions linked in this entry receive the same value */
+              for(k = 0; k < ll->ld[i].partitions; k++)
+                {
+                  int 
+                    index = ll->ld[i].partitionList[k];
+
+
+                  changeModelParameters(index, rateNumber, value[pos], whichFunction, tr, pr);
+                }
+            }
+          pos++;
+        }
+      else
+        {
+          /* entry not subject to this optimization: exclude its partitions */
+          for(k = 0; k < ll->ld[i].partitions; k++)
+            pr->partitionData[ll->ld[i].partitionList[k]]->executeModel = PLL_FALSE;
+        }      
+    }
+
+  assert(pos == numberOfModels);
+
+    //some error checks for individual model parameters
+    switch (whichFunction)
+    {
+    case RATE_F:
+        assert(rateNumber != -1);
+        break;
+    case ALPHA_F:
+        break;
+    case LXRATE_F:
+        assert(rateNumber != -1);
+        break;
+    case LXWEIGHT_F:
+        assert(rateNumber != -1);
+        break;
+    case FREQ_F:
+        break;
+    default:
+        assert(0);
+    }
+
+    /* Step 2: evaluate the likelihood. Only LXWEIGHT_F passes PLL_FALSE as the
+       fourth argument. NOTE(review): presumably the full-traversal flag, i.e.
+       weight changes do not need a full recomputation -- confirm against
+       pllEvaluateLikelihood. */
+    switch (whichFunction)
+    {
+    case RATE_F:
+    case ALPHA_F:
+    case LXRATE_F:
+    case FREQ_F:
+        pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+        break;
+    case LXWEIGHT_F:
+        pllEvaluateLikelihood(tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+        break;
+    default:
+        assert(0);
+    }
+    //nested optimization for LX4 model, now optimize the weights!
+    if (whichFunction == LXRATE_F && atLeastOnePartition)
+    {
+        /* remember the current executeModel mask so it can be restored below */
+        pllBoolean *buffer = (pllBoolean*) malloc(
+                pr->numberOfPartitions* sizeof(pllBoolean));
+
+        for (i = 0; i < pr->numberOfPartitions; i++) {
+            buffer[i] = pr->partitionData[i]->executeModel;
+            pr->partitionData[i]->executeModel = PLL_FALSE;
+        }
+
+        /* enable only the first partition of every valid entry
+           ('pos' is reset here but not used in this loop) */
+        for (i = 0, pos = 0; i < ll->entries; i++)
+        {
+            int index = ll->ld[i].partitionList[0];
+            if (ll->ld[i].valid)
+                pr->partitionData[index]->executeModel = PLL_TRUE;
+        }
+        optimizeWeights(tr, pr, modelEpsilon, ll, numberOfModels);
+
+        /* restore the mask that was active before the nested optimization */
+        for (i = 0; i < pr->numberOfPartitions; i++) {
+            pr->partitionData[i]->executeModel = buffer[i];
+        }
+
+        free(buffer);
+    }
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+   /* threaded / MPI builds: issue the master barrier job that matches the
+      parameter type (FREQ_F shares the job of RATE_F, LXWEIGHT_F that of
+      LXRATE_F) */
+   switch (whichFunction)
+    {
+      case RATE_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_RATE);
+        break;
+      case ALPHA_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_ALPHA);
+        break;
+      case FREQ_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_RATE);
+        break;
+      case LXRATE_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_LG4X_RATE);
+        break;
+      case LXWEIGHT_F:
+        pllMasterBarrier(tr, pr, PLL_THREAD_OPT_LG4X_RATE);
+        break;
+      default:
+        break;
+    }
+#else
+   //commented out evaluate below in the course of the LG4X integration
+   //pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+#endif     
+
+
+  /* Step 3: collect the results and re-enable all partitions of the list */
+  for(i = 0, pos = 0; i < ll->entries; i++)     
+    {
+      if(ll->ld[i].valid)
+        {
+          result[pos] = 0.0;
+          
+          for(k = 0; k < ll->ld[i].partitions; k++)
+            {
+              int 
+                index = ll->ld[i].partitionList[k];
+
+              /* partitionLH is asserted to be non-positive, so result[pos]
+                 ends up non-negative */
+              assert(pr->partitionData[index]->partitionLH <= 0.0);
+              result[pos] -= pr->partitionData[index]->partitionLH;
+              
+            }
+          pos++;
+        }
+      for(k = 0; k < ll->ld[i].partitions; k++)
+        {
+          int index = ll->ld[i].partitionList[k];
+          pr->partitionData[index]->executeModel = PLL_TRUE;
+        }         
+    }
+  
+  assert(pos == numberOfModels);   
+}
+
+/* generic implementation of Brent's algorithm for one-dimensional parameter optimization */
+
+/** @brief Brent's algorithm
+ *
+ *  Generic implementation of Brent's algorithm for one-dimensional parameter
+ *  optimization. \a numberOfModels independent one-dimensional minimizations
+ *  are run in lock-step: in every iteration a new trial point is computed for
+ *  each model that has not converged yet, and all trial points are evaluated
+ *  with a single call to evaluateChange. Smaller function values are better.
+ *
+ *  @param ax
+ *    One end of the bracketing interval, one entry per model
+ *
+ *  @param bx
+ *    Starting point of the search, one entry per model
+ *
+ *  @param cx
+ *    Other end of the bracketing interval, one entry per model
+ *
+ *  @param fb
+ *    Function value taken as the value at \a bx, one entry per model
+ *
+ *  @param tol
+ *    Relative tolerance of the search; also forwarded to evaluateChange as
+ *    model epsilon
+ *
+ *  @param xmin
+ *    Output: parameter value at which each model converged
+ *
+ *  @param result
+ *    Output: negated function value at \a xmin for each model
+ *
+ *  @param numberOfModels
+ *    Number of partitions for which we are optimizing 
+ *
+ *  @param whichFunction
+ *    Type of the model parameter, forwarded to evaluateChange
+ *
+ *  @param rateNumber
+ *     Index of the parameter to optimize 
+ *   
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param ll
+ *    Linkage list
+ *
+ *  @param lim_inf
+ *    Lower bound for the rate assignment (only used in assertions here)
+ *
+ *  @param lim_sup
+ *    Upper bound for the rate assignment (only used in assertions here)
+ *
+ *  @note If not all models converge within PLL_ITMAX iterations a message is
+ *     printed and an assertion fails.
+ *
+ *  @todo
+ *     Why not preallocate all memory instead of allocating
+ *     at every call? We can save a lot of time which is lost due to function calls, finding free
+ *     memory blocks by allocation strategy, and also prevent mem fragmentation.
+ */
+static void brentGeneric(double *ax, double *bx, double *cx, double *fb, double tol, double *xmin, double *result, int numberOfModels, 
+                         int whichFunction, int rateNumber, pllInstance *tr, partitionList *pr, linkageList *ll, double lim_inf, double lim_sup)
+{
+  int iter, i;
+  double 
+    *a     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *b     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *d     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *etemp = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fu    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fv    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fw    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fx    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *p     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *q     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *r     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *tol1  = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *tol2  = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *u     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *v     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *w     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *x     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *xm    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *e     = (double *)rax_malloc(sizeof(double) * numberOfModels);
+  pllBoolean *converged = (pllBoolean *)rax_malloc(sizeof(pllBoolean) * numberOfModels);
+  pllBoolean allConverged;
+
+  /* per-model start state: [a, b] is the ordered bracket, x/w/v all start at bx */
+  for(i = 0; i < numberOfModels; i++)
+    {
+      converged[i] = PLL_FALSE;
+
+      e[i] = 0.0;
+      d[i] = 0.0;
+
+      a[i] = ((ax[i] < cx[i]) ? ax[i] : cx[i]);
+      b[i] = ((ax[i] > cx[i]) ? ax[i] : cx[i]);
+      x[i] = w[i] = v[i] = bx[i];
+      fw[i] = fv[i] = fx[i] = fb[i];
+
+      assert(a[i] >= lim_inf && a[i] <= lim_sup);
+      assert(b[i] >= lim_inf && b[i] <= lim_sup);
+      assert(x[i] >= lim_inf && x[i] <= lim_sup);
+      assert(v[i] >= lim_inf && v[i] <= lim_sup);
+      assert(w[i] >= lim_inf && w[i] <= lim_sup);
+    }
+
+  for(iter = 1; iter <= PLL_ITMAX; iter++)
+    {
+      /* stop as soon as every model has converged */
+      allConverged = PLL_TRUE;
+
+      for(i = 0; i < numberOfModels && allConverged; i++)
+        allConverged = allConverged && converged[i];
+
+      if(allConverged)
+        break;
+
+      /* compute the next trial point u[i] for every unconverged model */
+      for(i = 0; i < numberOfModels; i++)
+        {
+          if(converged[i])
+            continue;
+
+          assert(a[i] >= lim_inf && a[i] <= lim_sup);
+          assert(b[i] >= lim_inf && b[i] <= lim_sup);
+          assert(x[i] >= lim_inf && x[i] <= lim_sup);
+          assert(v[i] >= lim_inf && v[i] <= lim_sup);
+          assert(w[i] >= lim_inf && w[i] <= lim_sup);
+
+          xm[i] = 0.5 * (a[i] + b[i]);
+          tol2[i] = 2.0 * (tol1[i] = tol * fabs(x[i]) + BRENT_ZEPS);
+
+          /* convergence test: report the minimum and skip the step computation */
+          if(fabs(x[i] - xm[i]) <= (tol2[i] - 0.5 * (b[i] - a[i])))
+            {
+              result[i] =  -fx[i];
+              xmin[i]   = x[i];
+              converged[i] = PLL_TRUE;
+              continue;
+            }
+
+          if(fabs(e[i]) > tol1[i])
+            {
+              /* try a parabolic fit through x, v and w */
+              r[i] = (x[i] - w[i]) * (fx[i] - fv[i]);
+              q[i] = (x[i] - v[i]) * (fx[i] - fw[i]);
+              p[i] = (x[i] - v[i]) * q[i] - (x[i] - w[i]) * r[i];
+              q[i] = 2.0 * (q[i] - r[i]);
+              if(q[i] > 0.0)
+                p[i] = -p[i];
+              q[i] = fabs(q[i]);
+              etemp[i] = e[i];
+              e[i] = d[i];
+              if((fabs(p[i]) >= fabs(0.5 * q[i] * etemp[i])) || (p[i] <= q[i] * (a[i]-x[i])) || (p[i] >= q[i] * (b[i] - x[i])))
+                {
+                  /* parabolic step rejected: golden-section step */
+                  d[i] = BRENT_CGOLD * (e[i] = (x[i] >= xm[i] ? a[i] - x[i] : b[i] - x[i]));
+                }
+              else
+                {
+                  d[i] = p[i] / q[i];
+                  u[i] = x[i] + d[i];
+                  /* do not evaluate too close to the bracket ends */
+                  if( u[i] - a[i] < tol2[i] || b[i] - u[i] < tol2[i])
+                    d[i] = PLL_SIGN(tol1[i], xm[i] - x[i]);
+                }
+            }
+          else
+            {
+              /* golden-section step */
+              d[i] = BRENT_CGOLD * (e[i] = (x[i] >= xm[i] ? a[i] - x[i]: b[i] - x[i]));
+            }
+
+          /* never step by less than tol1 */
+          u[i] = ((fabs(d[i]) >= tol1[i]) ? (x[i] + d[i]) : (x[i] + PLL_SIGN(tol1[i], d[i])));
+
+          assert(u[i] >= lim_inf && u[i] <= lim_sup);
+        }
+
+      /* one function evaluation for all trial points */
+      evaluateChange(tr, pr, rateNumber, u, fu, converged, whichFunction, numberOfModels, ll, tol);
+
+      /* update bracket and best points of every unconverged model */
+      for(i = 0; i < numberOfModels; i++)
+        {
+          if(converged[i])
+            continue;
+
+          if(fu[i] <= fx[i])
+            {
+              /* trial point is the new best point */
+              if(u[i] >= x[i])
+                a[i] = x[i];
+              else
+                b[i] = x[i];
+
+              PLL_SHFT(v[i],w[i],x[i],u[i]);
+              PLL_SHFT(fv[i],fw[i],fx[i],fu[i]);
+            }
+          else
+            {
+              /* trial point is worse: it only shrinks the bracket */
+              if(u[i] < x[i])
+                a[i] = u[i];
+              else
+                b[i] = u[i];
+
+              if(fu[i] <= fw[i] || w[i] == x[i])
+                {
+                  v[i] = w[i];
+                  w[i] = u[i];
+                  fv[i] = fw[i];
+                  fw[i] = fu[i];
+                }
+              else
+                {
+                  if(fu[i] <= fv[i] || v[i] == x[i] || v[i] == w[i])
+                    {
+                      v[i] = u[i];
+                      fv[i] = fu[i];
+                    }
+                }
+            }
+
+          assert(a[i] >= lim_inf && a[i] <= lim_sup);
+          assert(b[i] >= lim_inf && b[i] <= lim_sup);
+          assert(x[i] >= lim_inf && x[i] <= lim_sup);
+          assert(v[i] >= lim_inf && v[i] <= lim_sup);
+          assert(w[i] >= lim_inf && w[i] <= lim_sup);
+          assert(u[i] >= lim_inf && u[i] <= lim_sup);
+        }
+    }
+
+  /* single exit path: release all work arrays */
+  rax_free(converged);
+  rax_free(a);
+  rax_free(b);
+  rax_free(d);
+  rax_free(etemp);
+  rax_free(fu);
+  rax_free(fv);
+  rax_free(fw);
+  rax_free(fx);
+  rax_free(p);
+  rax_free(q);
+  rax_free(r);
+  rax_free(tol1);
+  rax_free(tol2);
+  rax_free(u);
+  rax_free(v);
+  rax_free(w);
+  rax_free(x);
+  rax_free(xm);
+  rax_free(e);
+
+  /* the loop ran to completion without all models converging */
+  if(iter > PLL_ITMAX)
+    {
+      printf("\n. Too many iterations in BRENT !");
+      assert(0);
+    }
+}
+
+/* Generic bracketing function required for Brent's algorithm. For details please see the corresponding chapter in the book Numerical Recipes in C */
+
+/** @brief Bracketing function
+ *
+ *  Generic bracketing function required for Brent's algorithm.
+ *  
+ *  @param param
+ *
+ *  @param ax
+ *
+ *  @param bx
+ *
+ *  @param cx
+ *
+ *  @param fa
+ *
+ *  @param fb
+ *
+ *  @param fc
+ *
+ *  @param lim_inf
+ *    Lower bound for the rate assignment
+ *
+ *  @param lim_sup
+ *    Upper bound for the rate assignment
+ *
+ *  @param numberOfModels
+ *    Number of partitions for which we are optimizing 
+ *
+ *  @param rateNumber
+ *     Index of the parameter to optimize 
+ *
+ *  @param whichFunction
+ *    Type of the model parameter. Possible values are \b ALPHA_F, \b RATE_F and \b FREQ_F
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param ll
+ *    Linkage list
+ *
+ *  @param modelEpsilon
+ *
+ *  @return
+ *    Fill this
+ *
+ *  @todo
+ *    Fill remaining details
+ */
+static int brakGeneric(double *param, double *ax, double *bx, double *cx, double *fa, double *fb, 
+                       double *fc, double lim_inf, double lim_sup, 
+                       int numberOfModels, int rateNumber, int whichFunction, pllInstance *tr, partitionList *pr,
+                       linkageList *ll, double modelEpsilon)
+{
+  /* Bracketing step that optParamGeneric runs before brentGeneric.  For each of the
+     numberOfModels entries it updates the triple (ax, bx, cx) and the associated
+     values (fa, fb, fc) until converged[i] is set for every entry.  All candidate
+     points are clamped to [lim_inf, lim_sup].
+     NOTE(review): the structure and the MNBRAK_* constants suggest a vectorised
+     variant of the classic "mnbrak" routine -- confirm. */
+
+  /* scratch arrays, one slot per model */
+  double 
+    *ulim = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *u    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *r    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *q    = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *fu   = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *dum  = (double *)rax_malloc(sizeof(double) * numberOfModels), 
+    *temp = (double *)rax_malloc(sizeof(double) * numberOfModels);
+  
+  /* state[i] is consumed before each evaluateChange() call in the main loop,
+     endState[i] selects how the result of that call is consumed afterwards */
+  int 
+    i,
+    *state    = (int *)rax_malloc(sizeof(int) * numberOfModels),
+    *endState = (int *)rax_malloc(sizeof(int) * numberOfModels);
+
+  pllBoolean *converged = (pllBoolean *)rax_malloc(sizeof(pllBoolean) * numberOfModels);
+  pllBoolean allConverged;
+
+  for(i = 0; i < numberOfModels; i++)
+    converged[i] = PLL_FALSE;
+
+  /* first point: clamp ax into the allowed interval and load it into param */
+  for(i = 0; i < numberOfModels; i++)
+    {
+      state[i] = 0;
+      endState[i] = 0;
+
+      u[i] = 0.0;
+
+      param[i] = ax[i];
+
+      if(param[i] > lim_sup)    
+        param[i] = ax[i] = lim_sup;
+      
+      if(param[i] < lim_inf) 
+        param[i] = ax[i] = lim_inf;
+
+      assert(param[i] >= lim_inf && param[i] <= lim_sup);
+    }
+   
+  
+  /* result goes to fa (presumably the objective value at ax for every model -- confirm) */
+  evaluateChange(tr, pr, rateNumber, param, fa, converged, whichFunction, numberOfModels, ll, modelEpsilon);
+
+
+  /* second point: same treatment for bx, result goes to fb */
+  for(i = 0; i < numberOfModels; i++)
+    {
+      param[i] = bx[i];
+      if(param[i] > lim_sup) 
+        param[i] = bx[i] = lim_sup;
+      if(param[i] < lim_inf) 
+        param[i] = bx[i] = lim_inf;
+
+      assert(param[i] >= lim_inf && param[i] <= lim_sup);
+    }
+  
+  evaluateChange(tr, pr, rateNumber, param, fb, converged, whichFunction, numberOfModels, ll, modelEpsilon);
+
+  for(i = 0; i < numberOfModels; i++)  
+    {
+      /* NOTE(review): if PLL_SHFT(a,b,c,d) expands to a=b; b=c; c=d, these two lines
+         swap ax/bx and fa/fb with dum[i] as the temporary -- confirm against the macro */
+      if (fb[i] > fa[i]) 
+        {         
+          PLL_SHFT(dum[i],ax[i],bx[i],dum[i]);
+          PLL_SHFT(dum[i],fa[i],fb[i],dum[i]);
+        }
+      
+      /* third point: step from bx away from ax, scaled by MNBRAK_GOLD, then clamp */
+      cx[i] = bx[i] + MNBRAK_GOLD * (bx[i] - ax[i]);
+      
+      param[i] = cx[i];
+      
+      if(param[i] > lim_sup) 
+        param[i] = cx[i] = lim_sup;
+      if(param[i] < lim_inf) 
+        param[i] = cx[i] = lim_inf;
+
+      assert(param[i] >= lim_inf && param[i] <= lim_sup);
+    }
+  
+ 
+  evaluateChange(tr, pr, rateNumber, param, fc, converged, whichFunction, numberOfModels,  ll, modelEpsilon);
+
+   /* main loop: the only exit is the return inside the allConverged branch */
+   while(1) 
+     {       
+       allConverged = PLL_TRUE;
+
+       for(i = 0; i < numberOfModels && allConverged; i++)
+         allConverged = allConverged && converged[i];
+
+       if(allConverged)
+         {
+           /* final clamp of all three points, release scratch memory, done */
+           for(i = 0; i < numberOfModels; i++)
+             {         
+               if(ax[i] > lim_sup) 
+                 ax[i] = lim_sup;
+               if(ax[i] < lim_inf) 
+                 ax[i] = lim_inf;
+
+               if(bx[i] > lim_sup) 
+                 bx[i] = lim_sup;
+               if(bx[i] < lim_inf) 
+                 bx[i] = lim_inf;
+               
+               if(cx[i] > lim_sup) 
+                 cx[i] = lim_sup;
+               if(cx[i] < lim_inf) 
+                 cx[i] = lim_inf;
+             }
+
+           rax_free(converged);
+           rax_free(ulim);
+           rax_free(u);
+           rax_free(r);
+           rax_free(q);
+           rax_free(fu);
+           rax_free(dum); 
+           rax_free(temp);
+           rax_free(state);   
+           rax_free(endState);
+           return 0;
+           
+         }
+
+       /* phase 1: for every unconverged model pick the next trial point u[i],
+          store it in param[i] and record in endState[i] how to interpret the result */
+       for(i = 0; i < numberOfModels; i++)
+         {
+           if(!converged[i])
+             {
+               switch(state[i])
+                 {
+                 case 0:
+                   endState[i] = 0;
+                   /* stop as soon as fb is no longer greater than fc */
+                   if(!(fb[i] > fc[i]))                  
+                     converged[i] = PLL_TRUE;                                
+                   else
+                     {
+                   
+                       if(ax[i] > lim_sup) 
+                         ax[i] = lim_sup;
+                       if(ax[i] < lim_inf) 
+                         ax[i] = lim_inf;
+                       if(bx[i] > lim_sup) 
+                         bx[i] = lim_sup;
+                       if(bx[i] < lim_inf) 
+                         bx[i] = lim_inf;
+                       if(cx[i] > lim_sup) 
+                         cx[i] = lim_sup;
+                       if(cx[i] < lim_inf) 
+                         cx[i] = lim_inf;
+                       
+                       /* trial point u from ax, bx, cx and their values; the denominator is
+                          kept at least MNBRAK_TINY in magnitude
+                          (presumably a parabolic-interpolation step -- confirm) */
+                       r[i]=(bx[i]-ax[i])*(fb[i]-fc[i]);
+                       q[i]=(bx[i]-cx[i])*(fb[i]-fa[i]);
+                       u[i]=(bx[i])-((bx[i]-cx[i])*q[i]-(bx[i]-ax[i])*r[i])/
+                         (2.0 * PLL_SIGN(PLL_MAX(fabs(q[i]-r[i]),MNBRAK_TINY),q[i]-r[i]));
+                       
+                       /* farthest point considered beyond cx in this step */
+                       ulim[i]=(bx[i])+MNBRAK_GLIMIT*(cx[i]-bx[i]);
+                       
+                       if(u[i] > lim_sup) 
+                         u[i] = lim_sup;
+                       if(u[i] < lim_inf) 
+                         u[i] = lim_inf;
+                       if(ulim[i] > lim_sup) 
+                         ulim[i] = lim_sup;
+                       if(ulim[i] < lim_inf) 
+                         ulim[i] = lim_inf;
+                       
+                       /* u strictly between bx and cx */
+                       if ((bx[i]-u[i])*(u[i]-cx[i]) > 0.0)
+                         {
+                           param[i] = u[i];
+                           if(param[i] > lim_sup)                            
+                             param[i] = u[i] = lim_sup;
+                           if(param[i] < lim_inf)
+                             param[i] = u[i] = lim_inf;
+                           endState[i] = 1;
+                         }
+                       else 
+                         {
+                           /* u strictly between cx and ulim */
+                           if ((cx[i]-u[i])*(u[i]-ulim[i]) > 0.0) 
+                             {
+                               param[i] = u[i];
+                               if(param[i] > lim_sup) 
+                                 param[i] = u[i] = lim_sup;
+                               if(param[i] < lim_inf) 
+                                 param[i] = u[i] = lim_inf;
+                               endState[i] = 2;
+                             }                         
+                           else
+                             {
+                               /* u at or beyond ulim: use ulim itself */
+                               if ((u[i]-ulim[i])*(ulim[i]-cx[i]) >= 0.0) 
+                                 {
+                                   u[i] = ulim[i];
+                                   param[i] = u[i];     
+                                   if(param[i] > lim_sup) 
+                                     param[i] = u[i] = ulim[i] = lim_sup;
+                                   if(param[i] < lim_inf) 
+                                     param[i] = u[i] = ulim[i] = lim_inf;
+                                   endState[i] = 0;
+                                 }                              
+                               else 
+                                 {                
+                                   /* otherwise take a default MNBRAK_GOLD-scaled step beyond cx */
+                                   u[i]=(cx[i])+MNBRAK_GOLD*(cx[i]-bx[i]);
+                                   param[i] = u[i];
+                                   endState[i] = 0;
+                                   if(param[i] > lim_sup) 
+                                     param[i] = u[i] = lim_sup;
+                                   if(param[i] < lim_inf) 
+                                     param[i] = u[i] = lim_inf;
+                                 }
+                             }    
+                         }
+                     }
+                   break;
+                 case 1:
+                   /* param[i] was already set at the end of the previous iteration */
+                   endState[i] = 0;
+                   break;
+                 case 2:
+                   /* NOTE(review): param[i] is not refreshed on this path, so the next
+                      evaluateChange() runs at the previous param[i] -- confirm this is intended */
+                   endState[i] = 3;
+                   break;
+                 default:
+                   assert(0);
+                 }
+               assert(param[i] >= lim_inf && param[i] <= lim_sup);
+             }
+         }
+             
+       /* evaluate all trial points at once; results land in temp */
+       evaluateChange(tr, pr, rateNumber, param, temp, converged, whichFunction, numberOfModels, ll, modelEpsilon);
+
+       /* phase 2: consume temp[i] according to endState[i] and set state[i] for the next round */
+       for(i = 0; i < numberOfModels; i++)
+         {
+           if(!converged[i])
+             {         
+               switch(endState[i])
+                 {
+                 case 0:
+                   /* plain shift of the points and their values, then start a fresh step */
+                   fu[i] = temp[i];
+                   PLL_SHFT(ax[i],bx[i],cx[i],u[i]);
+                   PLL_SHFT(fa[i],fb[i],fc[i],fu[i]);
+                   state[i] = 0;
+                   break;
+                 case 1:
+                   fu[i] = temp[i];
+                   if (fu[i] < fc[i]) 
+                     {
+                       /* (bx, u) replace (ax, bx); cx is kept; done for this model */
+                       ax[i]=(bx[i]);
+                       bx[i]=u[i];
+                       fa[i]=(fb[i]);
+                       fb[i]=fu[i]; 
+                       converged[i] = PLL_TRUE;               
+                     } 
+                   else 
+                     {
+                       if (fu[i] > fb[i]) 
+                         {
+                           /* u replaces cx; ax and bx are kept; done for this model */
+                           assert(u[i] >= lim_inf && u[i] <= lim_sup);
+                           cx[i]=u[i];
+                           fc[i]=fu[i];
+                           converged[i] = PLL_TRUE;                       
+                         }
+                       else
+                         {                 
+                           /* trial point was of no use: queue a default step beyond cx */
+                           u[i]=(cx[i])+MNBRAK_GOLD*(cx[i]-bx[i]);
+                           param[i] = u[i];
+                           if(param[i] > lim_sup) {param[i] = u[i] = lim_sup;}
+                           if(param[i] < lim_inf) {param[i] = u[i] = lim_inf;}    
+                           state[i] = 1;                 
+                         }                
+                     }
+                   break;
+                 case 2: 
+                   fu[i] = temp[i];
+                   if (fu[i] < fc[i]) 
+                     {               
+                       PLL_SHFT(bx[i],cx[i],u[i], cx[i]+MNBRAK_GOLD*(cx[i]-bx[i]));
+                       state[i] = 2;
+                     }     
+                   else
+                     {
+                       state[i] = 0;
+                       PLL_SHFT(ax[i],bx[i],cx[i],u[i]);
+                       PLL_SHFT(fa[i],fb[i],fc[i],fu[i]);
+                     }
+                   break;          
+                 case 3:                  
+                   /* follow-up of endState 2: shift the values with temp first, then shift as in case 0 */
+                   PLL_SHFT(fb[i],fc[i],fu[i], temp[i]);
+                   PLL_SHFT(ax[i],bx[i],cx[i],u[i]);
+                   PLL_SHFT(fa[i],fb[i],fc[i],fu[i]);
+                   state[i] = 0;
+                   break;
+                 default:
+                   assert(0);
+                 }
+             }
+         }
+    }
+   
+
+   /* not reachable: while(1) above is only left through the return statement */
+   assert(0);
+   rax_free(converged);
+   rax_free(ulim);
+   rax_free(u);
+   rax_free(r);
+   rax_free(q);
+   rax_free(fu);
+   rax_free(dum); 
+   rax_free(temp);
+   rax_free(state);   
+   rax_free(endState);
+
+  
+
+   return(0);
+}
+
+/*******************************************************************************************************/
+/******** LG4X ***************************************************************************************/
+
+/** @brief Optimize the LG4X rates and rescale the fracchange values
+  *
+  * Runs \a optParamGeneric with parameter type \b LXRATE_F once for each of the four
+  * rate indices, on all linkage entries whose \a valid flag is set. Afterwards the
+  * mean of the four \a gammaRates of every valid entry is used as a per-partition
+  * scaler: with more than one partition each partition's \a fracchange is set to
+  * \a rawFracchange divided by its scaler, and \a tr->fracchange is set to
+  * \a tr->rawFracchange divided by the partition-weight-averaged scaler.
+  *
+  * If the scratch buffer cannot be allocated the function returns without
+  * modifying anything.
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param modelEpsilon
+  *   Value forwarded to optParamGeneric
+  *
+  * @param ll
+  *   Linkage list; every valid entry must contain exactly one partition
+  *
+  * @param numberOfModels
+  *   Number of valid entries in \a ll, forwarded to optParamGeneric
+  */
+void pllOptLG4X(pllInstance *tr, partitionList * pr, double modelEpsilon,
+        linkageList *ll, int numberOfModels)
+{
+    int i;
+    double lg4xScaler = 0.0, wgtsum = 0.0;
+    double *lg4xScalers = (double *) calloc(pr->numberOfPartitions,
+            sizeof(double));
+
+    /* the original dereferenced this pointer unchecked; fail before touching the model */
+    if (lg4xScalers == NULL)
+    {
+        assert(lg4xScalers != NULL);
+        return;
+    }
+
+    for (i = 0; i < 4; i++)
+        optParamGeneric(tr, pr, modelEpsilon, ll, numberOfModels, i, PLL_LG4X_RATE_MIN,
+                PLL_LG4X_RATE_MAX, LXRATE_F);
+
+    /* partitions that are not part of a valid entry keep a neutral scaler */
+    for (i = 0; i < pr->numberOfPartitions; i++)
+        lg4xScalers[i] = 1.0;
+
+    for (i = 0; i < ll->entries; i++)
+    {
+        if (ll->ld[i].valid)
+        {
+            int j, index = ll->ld[i].partitionList[0];
+            double averageRate = 0.0;
+
+            assert(ll->ld[i].partitions == 1);
+
+            for (j = 0; j < 4; j++)
+                averageRate += pr->partitionData[index]->gammaRates[j];
+            averageRate /= 4.0;
+
+            lg4xScalers[index] = averageRate;
+        }
+    }
+
+    if (pr->numberOfPartitions > 1)
+    {
+        for (i = 0; i < pr->numberOfPartitions; i++)
+            pr->partitionData[i]->fracchange = pr->partitionData[i]->rawFracchange * (1.0 / lg4xScalers[i]);
+    }
+
+    /* global scaler: average of the per-partition scalers, weighted by partitionWeight */
+    for (i = 0; i < pr->numberOfPartitions; i++)
+        wgtsum += (double) pr->partitionData[i]->partitionWeight;
+
+    for (i = 0; i < pr->numberOfPartitions; i++)
+    {
+        double fraction = (double) pr->partitionData[i]->partitionWeight / wgtsum;
+        lg4xScaler += (fraction * lg4xScalers[i]);
+    }
+
+    tr->fracchange = tr->rawFracchange * (1.0 / lg4xScaler);
+    free(lg4xScalers);
+}
+
+/**********************************************************************************************************/
+/* ALPHA PARAM ********************************************************************************************/
+
+
+//this function is required for implementing the LG4X model later-on 
+
+/** @brief Optimize alpha rates
+  *
+  * Generic routine for alpha rates optimization. The linkage list is processed in
+  * two passes that use the \a valid flag of each entry as a selection mask:
+  *
+  *  -# all entries that are not LG4X protein partitions and have
+  *     \a optimizeAlphaParameter set are optimized with \a optParamGeneric (\b ALPHA_F)
+  *  -# all LG4X protein entries that have \a optimizeAlphaParameter set are
+  *     optimized with \a pllOptLG4X
+  *
+  * The data type and flags of an entry are taken from the first partition in its
+  * partition list. On return every \a valid flag is reset to \b PLL_TRUE.
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param modelEpsilon
+  *   Value forwarded to optParamGeneric / pllOptLG4X
+  *
+  * @param ll
+  *   Linkage list; a NULL list makes the function a no-op
+  */
+void pllOptAlphasGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll)
+{
+  int 
+    i,
+    non_LG4X_Partitions = 0,
+    LG4X_Partitions  = 0;
+
+  /* assumes homogeneous super-partitions, that either contain DNA or AA partitions !*/
+  /* does not check whether AA are all linked */
+
+  /* first do non-LG4X partitions; LG4X ones are only counted here */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+        case PLL_DNA_DATA:                          
+        case PLL_BINARY_DATA:
+        case PLL_SECONDARY_DATA:
+        case PLL_SECONDARY_DATA_6:
+        case PLL_SECONDARY_DATA_7:
+        case PLL_GENERIC_32:
+        case PLL_GENERIC_64:
+            if (pr->partitionData[ll->ld[i].partitionList[0]]->optimizeAlphaParameter)
+            {
+                ll->ld[i].valid = PLL_TRUE;
+                non_LG4X_Partitions++;
+            }
+            else
+                ll->ld[i].valid = PLL_FALSE;
+            break;
+        case PLL_AA_DATA:
+            if (pr->partitionData[ll->ld[i].partitionList[0]]->optimizeAlphaParameter)
+            {
+                if (pr->partitionData[ll->ld[i].partitionList[0]]->protModels == PLL_LG4X)
+                {
+                    LG4X_Partitions++;
+                    ll->ld[i].valid = PLL_FALSE;
+                }
+                else
+                {
+                    ll->ld[i].valid = PLL_TRUE;
+                    non_LG4X_Partitions++;
+                }
+            }
+            else
+                ll->ld[i].valid = PLL_FALSE;
+            break;
+        default:
+            assert(0);
+        }      
+    }   
+
+  /* rateNumber is -1 because the alpha parameter has no index */
+  if(non_LG4X_Partitions > 0)    
+    optParamGeneric(tr, pr, modelEpsilon, ll, non_LG4X_Partitions, -1, PLL_ALPHA_MIN, PLL_ALPHA_MAX, ALPHA_F);
+  
+  /* then LG4x partitions */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+        case PLL_DNA_DATA:                          
+        case PLL_BINARY_DATA:
+        case PLL_SECONDARY_DATA:
+        case PLL_SECONDARY_DATA_6:
+        case PLL_SECONDARY_DATA_7:
+        case PLL_GENERIC_32:
+        case PLL_GENERIC_64:
+          ll->ld[i].valid = PLL_FALSE;    
+          break;
+        case PLL_AA_DATA:     
+          /* The selection must match the counting rule of the first pass exactly:
+             optParamGeneric sizes its buffers from LG4X_Partitions but fills them from
+             the valid flags, so an LG4X entry with optimizeAlphaParameter unset must
+             not be marked valid. */
+          if(pr->partitionData[ll->ld[i].partitionList[0]]->protModels == PLL_LG4X &&
+             pr->partitionData[ll->ld[i].partitionList[0]]->optimizeAlphaParameter)
+            ll->ld[i].valid = PLL_TRUE;
+          else
+            ll->ld[i].valid = PLL_FALSE;                    
+          break;
+        default:
+          assert(0);
+        }      
+    }   
+  
+  if(LG4X_Partitions > 0)
+    pllOptLG4X(tr, pr, modelEpsilon, ll, LG4X_Partitions);
+
+  /* leave the mask in its neutral state for the next user */
+  for(i = 0; ll && i < ll->entries; i++)
+    ll->ld[i].valid = PLL_TRUE;
+}
+
+/** @brief Optimize model parameters
+  *
+  * Function for optimizing the \a rateNumber-th model parameter of type \a whichParameterType,
+  * i.e. alpha rate, substitution rate, base frequency rate, LG4X rate or LG4X weight,
+  * in all linkage entries with the \a valid flag set to \b PLL_TRUE.
+  *
+  * The current values are saved, an initial interval of +/- 0.1 around each value
+  * (clamped to [\a lim_inf, \a lim_sup]) is handed to \a brakGeneric and then to
+  * \a brentGeneric. For every entry the optimized value is applied unless the
+  * likelihood saved before the optimization is greater than the one reported by
+  * \a brentGeneric, in which case the saved values are restored.
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *   
+  * @param modelEpsilon
+  *    A parameter passed for Brent / Brak
+  *
+  * @param ll
+  *   Linkage list
+  * 
+  * @param numberOfModels
+  *   Number of linkage entries for which we are optimizing; must equal the number
+  *   of entries in \a ll whose \a valid flag is set, since all work arrays are
+  *   sized from it
+  *
+  * @param rateNumber
+  *  Index of the parameter to optimize; must be in 0..3 for \b LXRATE_F and \b LXWEIGHT_F
+  *
+  * @param lim_inf
+  *  Lower bound for the rate assignment
+  *
+  * @param lim_sup
+  *  Upper bound for the rate assignment
+  *
+  * @param whichParameterType
+  *  Type of the model parameter. Possible values are \b ALPHA_F, \b RATE_F, \b FREQ_F,
+  *  \b LXRATE_F and \b LXWEIGHT_F
+  *
+  * @todo
+  *    Describe the modelEpsilon parameter in detail
+  */
+static void optParamGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels, int rateNumber, double lim_inf, double lim_sup, int whichParameterType)
+{
+  int
+    l,
+    k, 
+    j, 
+    pos;
+
+  /* work arrays: one slot per model, except the three LG4X backups which hold 4 values per model */
+  double
+    *startRates     = (double *)rax_malloc(sizeof(double) * numberOfModels * 4),
+    *startWeights   = (double *)rax_malloc(sizeof(double) * numberOfModels * 4),
+    *startExponents = (double *)rax_malloc(sizeof(double) * numberOfModels * 4),
+    *startValues = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *startLH     = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *endLH       = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_a          = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_b          = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_c          = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_fa         = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_fb         = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_fc         = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_param      = (double *)rax_malloc(sizeof(double) * numberOfModels),
+    *_x          = (double *)rax_malloc(sizeof(double) * numberOfModels);
+   
+  /* NOTE(review): the likelihood is evaluated here and once more in either branch
+     below -- confirm whether the first call is still needed */
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+    if (whichParameterType == LXWEIGHT_F)
+        pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+    else
+    {
+        pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+        if (whichParameterType == LXRATE_F)
+        {
+            /* remember the per-partition likelihood; it is compared against in later LXWEIGHT_F calls */
+            int j;
+            for (j = 0; j < pr->numberOfPartitions; j++)
+                pr->partitionData[j]->lg4x_weightLikelihood = pr->partitionData[j]->partitionLH;
+        }
+    }
+  
+#ifdef  _DEBUG_MOD_OPT
+  double
+    initialLH = tr->likelihood;
+#endif
+
+  /* 
+     at this point here every worker has the traversal data it needs for the 
+     search 
+  */
+
+  /* store in startValues the values of the old parameters */
+  for(l = 0, pos = 0; ll && l < ll->entries; l++)
+    {
+      if(ll->ld[l].valid)
+        {
+          endLH[pos] = PLL_UNLIKELY;
+          startLH[pos] = 0.0;
+
+          /* startLH sums over all partitions of the entry; startValues[pos] is
+             overwritten by each of them, so the last partition's value is kept */
+          for(j = 0; j < ll->ld[l].partitions; j++)
+            {
+              int 
+                index = ll->ld[l].partitionList[j];
+              
+              startLH[pos] += pr->partitionData[index]->partitionLH;
+              
+              switch(whichParameterType)
+                {
+                case ALPHA_F:
+                  startValues[pos] = pr->partitionData[index]->alpha;
+                  break;
+                case RATE_F:
+                  startValues[pos] = pr->partitionData[index]->substRates[rateNumber];      
+                  break;
+                case FREQ_F:
+                  startValues[pos] = pr->partitionData[index]->freqExponents[rateNumber];
+                  break;
+                case LXRATE_F:
+                    /* additionally back up all four rates, weight exponents and weights */
+                    assert(rateNumber >= 0 && rateNumber < 4);
+                    startValues[pos] =
+                            pr->partitionData[index]->gammaRates[rateNumber];
+                    memcpy(&startRates[pos * 4],
+                            pr->partitionData[index]->gammaRates,
+                            4 * sizeof(double));
+                    memcpy(&startExponents[pos * 4],
+                            pr->partitionData[index]->lg4x_weightExponents,
+                            4 * sizeof(double));
+                    memcpy(&startWeights[pos * 4],
+                            pr->partitionData[index]->lg4x_weights,
+                            4 * sizeof(double));
+                    break;
+                case LXWEIGHT_F:
+                    assert(rateNumber >= 0 && rateNumber < 4);
+                    startValues[pos] =
+                            pr->partitionData[index]->lg4x_weightExponents[rateNumber];
+                    break;
+                default:
+                  assert(0);
+                }
+            }
+          pos++;
+        }
+    }  
+
+  /* NOTE(review): this check runs only after the arrays above have been written */
+  assert(pos == numberOfModels);
+   
+  /* initial interval: start value +/- 0.1, clamped to [lim_inf, lim_sup] */
+  for(k = 0, pos = 0; ll && k < ll->entries; k++)
+    {
+      if(ll->ld[k].valid)
+        {
+          _a[pos] = startValues[pos] + 0.1;
+          _b[pos] = startValues[pos] - 0.1;
+
+          if(_a[pos] < lim_inf) 
+            _a[pos] = lim_inf;
+          
+          if(_a[pos] > lim_sup) 
+            _a[pos] = lim_sup;
+              
+          if(_b[pos] < lim_inf) 
+            _b[pos] = lim_inf;
+          
+          if(_b[pos] > lim_sup) 
+            _b[pos] = lim_sup;    
+
+          pos++;
+        }
+    }                                
+
+  assert(pos == numberOfModels);
+
+  /* bracketing step: fills _c and the values _fa, _fb, _fc */
+  brakGeneric(_param, _a, _b, _c, _fa, _fb, _fc, lim_inf, lim_sup, numberOfModels, rateNumber, whichParameterType, tr, pr, ll, modelEpsilon);
+      
+  for(k = 0; k < numberOfModels; k++)
+    {
+      assert(_a[k] >= lim_inf && _a[k] <= lim_sup);
+      assert(_b[k] >= lim_inf && _b[k] <= lim_sup);       
+      assert(_c[k] >= lim_inf && _c[k] <= lim_sup);         
+    }      
+
+  /* refinement step: the code below reads the optimized values from _x and the
+     resulting likelihoods from endLH */
+  brentGeneric(_a, _b, _c, _fb, modelEpsilon, _x, endLH, numberOfModels, whichParameterType, rateNumber, tr,  pr, ll, lim_inf, lim_sup);
+        
+  for(k = 0, pos = 0; ll && k < ll->entries; k++)
+    {
+      if(ll->ld[k].valid)
+        { 
+          if(startLH[pos] > endLH[pos])
+            {
+              //if the initial likelihood was better than the likelihood after optimization, we set the values back 
+              //to their original values 
+
+              for(j = 0; j < ll->ld[k].partitions; j++)
+                {
+                  int 
+                    index = ll->ld[k].partitionList[j];
+                  
+                  if (whichParameterType == LXRATE_F)
+                    {
+                        memcpy(pr->partitionData[index]->lg4x_weights,
+                                &startWeights[pos * 4], sizeof(double) * 4);
+                        memcpy(pr->partitionData[index]->gammaRates,
+                                &startRates[pos * 4], sizeof(double) * 4);
+                        memcpy(pr->partitionData[index]->lg4x_weightExponents,
+                                &startExponents[pos * 4], 4 * sizeof(double));
+                    }
+
+                    changeModelParameters(index, rateNumber, startValues[pos], whichParameterType, tr, pr); 
+                }
+            }
+          else
+            {
+              //otherwise we set the value to the optimized value 
+              //this used to be a bug in standard RAxML, before I fixed it 
+              //I was not using _x[pos] as value that needs to be set 
+
+              for(j = 0; j < ll->ld[k].partitions; j++)
+                {
+                  int 
+                    index = ll->ld[k].partitionList[j];
+                  
+                  changeModelParameters(index, rateNumber, _x[pos], whichParameterType, tr, pr);
+
+                  if (whichParameterType == LXWEIGHT_F)
+                    {
+                        /* keep a copy of the best weights seen so far in the *Buffer fields */
+                        if (endLH[pos]
+                                > pr->partitionData[index]->lg4x_weightLikelihood)
+                        {
+                            memcpy(pr->partitionData[index]->lg4x_weightsBuffer,
+                                    pr->partitionData[index]->lg4x_weights,
+                                    sizeof(double) * 4);
+                            memcpy(
+                                    pr->partitionData[index]->lg4x_weightExponentsBuffer,
+                                    pr->partitionData[index]->lg4x_weightExponents,
+                                    sizeof(double) * 4);
+                            pr->partitionData[index]->lg4x_weightLikelihood =
+                                    endLH[pos];
+                        }
+                    }
+                    if (whichParameterType == LXRATE_F)
+                    {
+                        /* reload weights and exponents from the *Buffer fields */
+                        memcpy(pr->partitionData[index]->lg4x_weights,
+                                pr->partitionData[index]->lg4x_weightsBuffer,
+                                sizeof(double) * 4);
+                        memcpy(pr->partitionData[index]->lg4x_weightExponents,
+                                pr->partitionData[index]->lg4x_weightExponentsBuffer,
+                                sizeof(double) * 4);
+                    }
+                }
+            }
+          pos++;
+        }
+    }
+
+  /* threaded / MPI builds: issue the barrier job named after copying rates
+     (presumably it propagates the new values to the workers -- confirm) */
+  #if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+      if (whichParameterType == LXRATE_F || whichParameterType == LXWEIGHT_F) {
+        pllMasterBarrier(tr, pr, PLL_THREAD_COPY_LG4X_RATES);
+      } else {
+        pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATES);
+      }
+
+//    switch(whichParameterType)
+//      {
+//      case FREQ_F:
+//      case RATE_F:
+//          pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATES);
+//        break;
+//      case ALPHA_F:
+//          pllMasterBarrier(tr, pr, PLL_THREAD_COPY_ALPHA);
+//        break;
+//      case LXRATE_F:
+//      case LXWEIGHT_F:
+//          pllMasterBarrier(tr, pr, PLL_THREAD_COPY_LG4X_RATES);
+//        break;
+//      default:
+//        assert(0);
+//      }
+
+  #endif    
+
+    
+  assert(pos == numberOfModels);
+
+  rax_free(startLH);
+  rax_free(endLH);
+  rax_free(_a);
+  rax_free(_b);
+  rax_free(_c);
+  rax_free(_fa);
+  rax_free(_fb);
+  rax_free(_fc);
+  rax_free(_param);
+  rax_free(_x);
+  rax_free(startValues);
+  rax_free(startRates);
+  rax_free(startWeights);
+  rax_free(startExponents);
+
+  /* debug builds verify that the overall likelihood did not get worse */
+#ifdef _DEBUG_MOD_OPT
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+  if(tr->likelihood < initialLH)
+    printf("%f %f\n", tr->likelihood, initialLH);
+  assert(tr->likelihood >= initialLH);
+#endif
+}
+
+//******************** rate optimization functions ***************************************************/
+
+/** @brief Optimize the base frequency parameters one index at a time
+  *
+  * Invokes \a optParamGeneric with parameter type \b FREQ_F once for every index
+  * from 0 to \a states - 1, always using the fixed search interval
+  * [-1000000.0, 200.0].
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param modelEpsilon
+  *   Value forwarded to optParamGeneric
+  *
+  * @param ll
+  *   Linkage list
+  *
+  * @param numberOfModels
+  *   Number of linkage entries that are being optimized
+  *
+  * @param states
+  *   Number of states, i.e. number of frequency parameters to visit
+  */
+static void optFreqs(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels, int states)
+{
+  const double
+    lowerBound = -1000000.0,
+    upperBound = 200.0;
+
+  int
+    freqIndex = 0;
+
+  while(freqIndex < states)
+    {
+      optParamGeneric(tr, pr, modelEpsilon, ll, numberOfModels, freqIndex, lowerBound, upperBound, FREQ_F);
+      freqIndex++;
+    }
+}
+
+/** @brief Optimize base frequencies 
+ *  
+ *  Wrapper function for optimizing base frequencies. The linkage list is processed
+ *  in three passes (DNA, protein, binary). In each pass the \a valid flag is set
+ *  only for entries of that data type whose \a optimizeBaseFrequencies flag is set,
+ *  and \a optFreqs is run on them if there is at least one. The data type, flag and
+ *  number of states of an entry are taken from the first partition in its
+ *  partition list. On return every \a valid flag is reset to \b PLL_TRUE.
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param modelEpsilon
+ *    Value forwarded to optFreqs
+ *
+ *  @param ll
+ *    Linkage list; a NULL list makes the function a no-op
+ *
+ */
+void pllOptBaseFreqs(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll)
+{
+  int 
+    i,
+    states = -1,   /* set by each pass before it is used */
+    dnaPartitions = 0,
+    aaPartitions  = 0,
+    binPartitions = 0;
+
+  /* first do DNA */
+
+  /* Set the valid flag in linkage list to PLL_TRUE for all DNA partitions */
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+        case PLL_DNA_DATA:  
+          states = pr->partitionData[ll->ld[i].partitionList[0]]->states; 
+          if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeBaseFrequencies)
+            {
+              ll->ld[i].valid = PLL_TRUE;
+              dnaPartitions++;              
+            }
+          else
+             ll->ld[i].valid = PLL_FALSE;
+          break;       
+        /* every other known data type is skipped in this pass, exactly as in the binary pass */
+        case PLL_BINARY_DATA:
+        case PLL_AA_DATA:
+        case PLL_SECONDARY_DATA:
+        case PLL_SECONDARY_DATA_6:
+        case PLL_SECONDARY_DATA_7:
+        case PLL_GENERIC_32:
+        case PLL_GENERIC_64:
+          ll->ld[i].valid = PLL_FALSE;
+          break;
+        default:
+          assert(0);
+        }      
+    }   
+
+  /* Optimize the frequency rates of all DNA partitions */
+  if(dnaPartitions > 0)
+    optFreqs(tr, pr, modelEpsilon, ll, dnaPartitions, states);
+  
+  /* then AA */
+
+  /* find all partitions that have frequency optimization enabled */ 
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+        case PLL_AA_DATA:
+          states = pr->partitionData[ll->ld[i].partitionList[0]]->states;             
+          if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeBaseFrequencies)
+            {
+              ll->ld[i].valid = PLL_TRUE;
+              aaPartitions++;           
+            }
+          else
+            ll->ld[i].valid = PLL_FALSE; 
+          break;
+        case PLL_DNA_DATA:      
+        case PLL_BINARY_DATA:
+        case PLL_SECONDARY_DATA:
+        case PLL_SECONDARY_DATA_6:
+        case PLL_SECONDARY_DATA_7:
+        case PLL_GENERIC_32:
+        case PLL_GENERIC_64:
+          ll->ld[i].valid = PLL_FALSE;
+          break;
+        default:
+          assert(0);
+        }        
+    }
+
+  if(aaPartitions > 0)      
+    optFreqs(tr, pr, modelEpsilon, ll, aaPartitions, states);
+
+  /* then binary; guarded against a NULL list like every other loop here */
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+        case PLL_BINARY_DATA:
+          states = pr->partitionData[ll->ld[i].partitionList[0]]->states;
+          if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeBaseFrequencies)
+            {
+              ll->ld[i].valid = PLL_TRUE;
+              binPartitions++;
+            }
+          else
+            ll->ld[i].valid = PLL_FALSE;
+          break;
+        case PLL_DNA_DATA:
+        case PLL_AA_DATA:
+        case PLL_SECONDARY_DATA:
+        case PLL_SECONDARY_DATA_6:
+        case PLL_SECONDARY_DATA_7:
+        case PLL_GENERIC_32:
+        case PLL_GENERIC_64:
+          ll->ld[i].valid = PLL_FALSE;
+          break;
+        default:
+          assert(0);
+        }
+    }
+
+  if(binPartitions > 0)      
+    optFreqs(tr, pr, modelEpsilon, ll, binPartitions, states);
+
+  /* done */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    ll->ld[i].valid = PLL_TRUE;
+}
+
+
+
+/* rate optimization driven by an outer loop over the individual rates */
+/** @brief Optimize the substitution rates one index at a time
+  *
+  * Derives the number of rates to visit from \a states as
+  * (states * states - states) / 2 - 1 and invokes \a optParamGeneric with parameter
+  * type \b RATE_F and the bounds \b PLL_RATE_MIN / \b PLL_RATE_MAX for each of them.
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param modelEpsilon
+  *   Value forwarded to optParamGeneric
+  *
+  * @param ll
+  *   Linkage list
+  *
+  * @param numberOfModels
+  *   Number of linkage entries that are being optimized
+  *
+  * @param states
+  *   Number of states
+  */
+static void optRates(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels, int states)
+{
+  /* one less than the number of distinct off-diagonal pairs
+     (presumably the last rate is the fixed reference -- confirm) */
+  const int
+    rateCount = ((states * states - states) / 2) - 1;
+
+  int
+    rateIndex = 0;
+
+  while(rateIndex < rateCount)
+    {
+      optParamGeneric(tr, pr, modelEpsilon, ll, numberOfModels, rateIndex, PLL_RATE_MIN, PLL_RATE_MAX, RATE_F);
+      rateIndex++;
+    }
+}
+
+
+/* do all protein partitions use the GTR model? */
+
+/** @brief Test whether every protein partition uses the GTR model
+  *
+  * Scans all partitions and looks at those of type \b PLL_AA_DATA.
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @return
+  *   \b PLL_TRUE if at least one protein partition exists and every protein
+  *   partition has \a protModels equal to \b PLL_GTR; \b PLL_FALSE otherwise
+  */
+static pllBoolean AAisGTR(partitionList *pr)
+{
+  pllBoolean sawProtein = PLL_FALSE;
+  int p;
+
+  for(p = 0; p < pr->numberOfPartitions; p++)
+    {
+      /* only protein partitions are of interest */
+      if(pr->partitionData[p]->dataType != PLL_AA_DATA)
+        continue;
+
+      /* a single non-GTR protein partition settles the answer */
+      if(pr->partitionData[p]->protModels != PLL_GTR)
+        return PLL_FALSE;
+
+      sawProtein = PLL_TRUE;
+    }
+
+  return sawProtein;
+}
+
+
+/* generic substitution matrix (Q matrix) optimization */
+
+/** @brief Optimize substitution rates
+  *
+  * Generic routine for substitution matrix (Q matrix) optimization. The linkage
+  * list is walked twice: the first pass marks as valid the entries whose first
+  * partition holds DNA data and has \a optimizeSubstitutionRates set, the second
+  * pass does the same for AA data. After each pass \a optRates is invoked if at
+  * least one entry was marked. Finally every entry is marked valid again.
+  *
+  * Only the first partition of each linkage entry is inspected, and the state
+  * count handed to \a optRates is the one of the last entry of the matching
+  * data type, i.e. homogeneous super-partitions are assumed.
+  *
+  * @param tr
+  *   PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  * @param modelEpsilon
+  *   Forwarded unchanged to \a optRates
+  *
+  * @param ll
+  *   Linkage list; if it is \b NULL no rates are optimized
+  */
+void pllOptRatesGeneric(pllInstance *tr, partitionList *pr, double modelEpsilon, linkageList *ll)
+{
+  int 
+    i,
+    dnaPartitions = 0,
+    aaPartitions  = 0,
+    states = -1;
+
+  /* assumes homogeneous super-partitions, that either contain DNA or AA partitions !*/
+  /* does not check whether AA are all linked */
+
+  /* 
+     first optimize all rates in DNA data partition matrices. That's where we use the valid field in the 
+     linkage list data structure. 
+   */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+          case PLL_DNA_DATA:  
+            states = pr->partitionData[ll->ld[i].partitionList[0]]->states;
+            if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeSubstitutionRates)
+              {
+                ll->ld[i].valid = PLL_TRUE;
+                ++ dnaPartitions;  
+              }
+            else
+              ll->ld[i].valid = PLL_FALSE;
+            break;
+          case PLL_BINARY_DATA:
+          case PLL_AA_DATA:
+          case PLL_SECONDARY_DATA:
+          case PLL_SECONDARY_DATA_6:
+          case PLL_SECONDARY_DATA_7:
+          case PLL_GENERIC_32:
+          case PLL_GENERIC_64:
+            ll->ld[i].valid = PLL_FALSE;
+            break;
+          default:
+            assert(0);
+        }      
+    }   
+
+  /* if we have dna partitions in our dataset, let's optimize all 5 rates in their substitution matrices */
+
+  if(dnaPartitions > 0)
+    optRates(tr, pr, modelEpsilon, ll, dnaPartitions, states);
+  
+  /* AA partitions evolving under a GTR model do not need to be linked any more, this responsibility now remains 
+     with the library user !
+   */
+  
+  for(i = 0; ll && i < ll->entries; i++)
+    {
+      switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType)
+        {
+          case PLL_AA_DATA:
+            states = pr->partitionData[ll->ld[i].partitionList[0]]->states;
+            if(pr->partitionData[ll->ld[i].partitionList[0]]->optimizeSubstitutionRates)
+              {
+                ll->ld[i].valid = PLL_TRUE;
+                aaPartitions++;
+              }
+            else
+              ll->ld[i].valid = PLL_FALSE;
+            break;
+          case PLL_DNA_DATA:
+          case PLL_BINARY_DATA:
+          case PLL_SECONDARY_DATA:
+          case PLL_SECONDARY_DATA_6:
+          case PLL_SECONDARY_DATA_7:
+          /* the generic types are accepted by the DNA pass above, so they must
+             be accepted here as well instead of running into assert(0) */
+          case PLL_GENERIC_32:
+          case PLL_GENERIC_64:
+            ll->ld[i].valid = PLL_FALSE;
+            break;
+          default:
+            assert(0);
+        }    
+    }
+  
+  if(aaPartitions > 0)
+    optRates(tr, pr, modelEpsilon, ll, aaPartitions, states); 
+
+  /* done with all partitions, so we can set all entries in the linkage list to valid again :-) */
+
+  for(i = 0; ll && i < ll->entries; i++)
+    ll->ld[i].valid = PLL_TRUE;
+}
+
+
+
+
+
+/*********************FUNCTIONS FOR PSR/CAT model of rate heterogeneity ***************************************/
+
+
+
+
+
+
+/* qsort() comparator for rateCategorize records: orders them by ascending
+   accumulatedSiteLikelihood. Two records whose values compare neither smaller
+   nor greater are reported as equal. */
+static int catCompare(const void *p1, const void *p2)
+{
+  const double first  = ((const rateCategorize *)p1)->accumulatedSiteLikelihood;
+  const double second = ((const rateCategorize *)p2)->accumulatedSiteLikelihood;
+
+  if (first < second)
+    return (-1);
+
+  return (first > second) ? 1 : 0;
+}
+
+
+/* Assign every site in [lower, upper) to one of the first numberOfCategories
+   entries of rc and store the result in tr->rateCategory. A site goes to the
+   first entry whose rate equals the site rate or differs from it by less than
+   0.001; if there is no such entry it goes to the entry with the smallest
+   absolute rate difference (the earliest one on ties). Afterwards the rates of
+   those rc entries are copied into the perSiteRates array of partition model. */
+static void categorizePartition(pllInstance *tr, partitionList *pr, rateCategorize *rc, int model, int lower, int upper)
+{
+  const int categories = pr->partitionData[model]->numberOfCategories;
+
+  int
+    site,
+    cat;
+
+  for (site = lower; site < upper; site++)
+    {
+      const double siteRate = tr->patrat[site];
+
+      int
+        matched = 0,
+        nearest = 0;
+
+      double
+        distance,
+        nearestDistance;
+
+      /* phase 1: take the first category that is (almost) identical */
+      for (cat = 0; cat < categories; cat++)
+        {
+          if (siteRate == rc[cat].rate || (fabs(siteRate - rc[cat].rate) < 0.001))
+            {
+              tr->rateCategory[site] = cat;
+              matched = 1;
+              break;
+            }
+        }
+
+      if (matched)
+        continue;
+
+      /* phase 2: fall back to the category with the closest rate */
+      nearestDistance = fabs(siteRate - rc[0].rate);
+
+      for (cat = 1; cat < categories; cat++)
+        {
+          distance = fabs(siteRate - rc[cat].rate);
+
+          if (distance < nearestDistance)
+            {
+              nearestDistance = distance;
+              nearest = cat;
+            }
+        }
+
+      tr->rateCategory[site] = nearest;
+    }
+
+  for (cat = 0; cat < categories; cat++)
+    pr->partitionData[model]->perSiteRates[cat] = rc[cat].rate;
+}
+
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+
+/* Worker-side search for the best per-site rate. For every site handled by
+   worker tid the stored rate is taken as starting point, the rate is lowered in
+   steps of lower_spacing and raised in steps of upper_spacing for as long as
+   the site likelihood returned by evaluatePartialGeneric keeps improving by
+   more than a small epsilon, and the better of the two end points is kept if it
+   beats the starting point. The chosen rate is written to tr->patrat and
+   tr->patratStored, its likelihood to lhs.
+
+   If tr->manyPartitions is set, whole partitions are assigned to workers via
+   isThisMyPartition; otherwise site i is handled by the worker for which
+   i % n == tid. evaluatePartialGeneric is called with the worker-local site
+   index (the original author left a question mark on that choice). */
+void optRateCatPthreads(pllInstance *tr, partitionList *pr, double lower_spacing, double upper_spacing, double *lhs, int n, int tid)
+{
+  const double epsilon = 0.00001;
+
+  int
+    model;
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      int
+        site,
+        localIndex = 0;
+
+      pllBoolean
+        execute = (!tr->manyPartitions || isThisMyPartition(pr, tid, model));
+
+      if(execute)
+        {
+          for(site = pr->partitionData[model]->lower; site < pr->partitionData[model]->upper; site++)
+            {
+              double
+                startRate, startLH,
+                downRate, downLH,
+                upRate, upLH,
+                trial;
+
+              int
+                step;
+
+              /* cyclic site distribution: skip sites owned by other workers */
+              if(!tr->manyPartitions && (site % n != tid))
+                continue;
+
+              tr->patrat[site] = tr->patratStored[site];
+              startRate = tr->patrat[site];
+
+              startLH = evaluatePartialGeneric(tr, pr, localIndex, startRate, model); /* i is real i ??? */
+
+              downLH = upLH = startLH;
+              downRate = upRate = startRate;
+
+              /* walk towards smaller rates, never going down to 0.0001 or below */
+              for(step = 1; ; step++)
+                {
+                  if(!(startRate - step * lower_spacing > 0.0001))
+                    break;
+
+                  trial = evaluatePartialGeneric(tr, pr, localIndex, startRate - step * lower_spacing, model);
+
+                  if(!(trial > downLH) || !(fabs(downLH - trial) > epsilon))
+                    break;
+
+#ifndef WIN32
+                  if(isnan(trial))
+                    assert(0);
+#endif
+
+                  downLH = trial;
+                  downRate = startRate - step * lower_spacing;
+                }
+
+              /* walk towards larger rates */
+              for(step = 1; ; step++)
+                {
+                  trial = evaluatePartialGeneric(tr, pr, localIndex, startRate + step * upper_spacing, model);
+
+                  if(!(trial > upLH) || !(fabs(upLH - trial) > epsilon))
+                    break;
+
+#ifndef WIN32
+                  if(isnan(trial))
+                    assert(0);
+#endif
+
+                  upLH = trial;
+                  upRate = startRate + step * upper_spacing;
+                }
+
+              if(!(upLH > startLH || downLH > startLH))
+                {
+                  /* neither direction improved: keep the starting rate */
+                  lhs[site] = startLH;
+                }
+              else if(upLH > downLH)
+                {
+                  tr->patrat[site] = upRate;
+                  lhs[site] = upLH;
+                }
+              else
+                {
+                  tr->patrat[site] = downRate;
+                  lhs[site] = downLH;
+                }
+
+              tr->patratStored[site] = tr->patrat[site];
+              localIndex++;
+            }
+        }
+
+      assert(localIndex == pr->partitionData[model]->width);
+    }
+}
+
+
+
+#else
+
+/** @brief Optimize the per-site rates of one partition for the CAT model
+ *
+ *  For every site of partition \a model the stored rate is taken as starting
+ *  point. The rate is lowered in steps of \a lower_spacing (while it stays above
+ *  0.0001) and raised in steps of \a upper_spacing for as long as the site
+ *  likelihood returned by \a evaluatePartialGeneric keeps improving by more
+ *  than a small epsilon. The better of the two end points replaces the starting
+ *  rate if it has a higher likelihood.
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param model
+ *    Partition index
+ *
+ *  @param lower_spacing
+ *    Step width used when decreasing a rate
+ *
+ *  @param upper_spacing
+ *    Step width used when increasing a rate
+ *
+ *  @param lhs
+ *    Receives, per site, the likelihood of the rate that was finally chosen
+ */
+static void optRateCatModel(pllInstance *tr, partitionList *pr, int model, double lower_spacing, double upper_spacing, double *lhs)
+{
+  const double epsilon = 0.00001;
+  const int firstSite = pr->partitionData[model]->lower;
+  const int endSite = pr->partitionData[model]->upper;
+  int site;
+
+  for(site = firstSite; site < endSite; site++)
+    {
+      double
+        startRate, startLH,
+        downRate, downLH,
+        upRate, upLH,
+        trial;
+
+      int
+        step;
+
+      tr->patrat[site] = tr->patratStored[site];
+      startRate = tr->patrat[site];
+
+      startLH = evaluatePartialGeneric(tr, pr, site, startRate, model);
+
+      downLH = upLH = startLH;
+      downRate = upRate = startRate;
+
+      /* walk towards smaller rates, never going down to 0.0001 or below */
+      for(step = 1; ; step++)
+        {
+          if(!(startRate - step * lower_spacing > 0.0001))
+            break;
+
+          trial = evaluatePartialGeneric(tr, pr, site, startRate - step * lower_spacing, model);
+
+          if(!(trial > downLH) || !(fabs(downLH - trial) > epsilon))
+            break;
+
+#ifndef WIN32
+          if(isnan(trial))
+            assert(0);
+#endif
+
+          downLH = trial;
+          downRate = startRate - step * lower_spacing;
+        }
+
+      /* walk towards larger rates */
+      for(step = 1; ; step++)
+        {
+          trial = evaluatePartialGeneric(tr, pr, site, startRate + step * upper_spacing, model);
+
+          if(!(trial > upLH) || !(fabs(upLH - trial) > epsilon))
+            break;
+
+#ifndef WIN32
+          if(isnan(trial))
+            assert(0);
+#endif
+
+          upLH = trial;
+          upRate = startRate + step * upper_spacing;
+        }
+
+      if(!(upLH > startLH || downLH > startLH))
+        {
+          /* neither direction improved: keep the starting rate */
+          lhs[site] = startLH;
+        }
+      else if(upLH > downLH)
+        {
+          tr->patrat[site] = upRate;
+          lhs[site] = upLH;
+        }
+      else
+        {
+          tr->patrat[site] = downRate;
+          lhs[site] = downLH;
+        }
+
+      tr->patratStored[site] = tr->patrat[site];
+    }
+}
+
+
+#endif
+
+
+
+/* 
+   set scaleRates to PLL_FALSE everywhere such that 
+   per-site rates are not scaled to obtain an overall mean rate 
+   of 1.0
+*/
+
+/** @brief Weighted mean of the per-site rates over a range of partitions
+ *
+ *  Every site of the partitions \a firstModel .. \a endModel - 1 contributes the
+ *  rate of its category, weighted by its \a aliaswgt entry.
+ *
+ *  @param tr          PLL instance
+ *  @param pr          List of partitions
+ *  @param firstModel  Index of the first partition of the range
+ *  @param endModel    One past the index of the last partition of the range
+ *
+ *  @return Sum of weight * rate divided by the sum of weights
+ */
+static double meanPerSiteRate(pllInstance *tr, partitionList *pr, int firstModel, int endModel)
+{
+  int
+    model,
+    i,
+    accWgt = 0;
+
+  double
+    accRat = 0.0;
+
+  for(model = firstModel; model < endModel; model++)
+    {
+      int
+        lower = pr->partitionData[model]->lower,
+        upper = pr->partitionData[model]->upper;
+
+      for(i = lower; i < upper; i++)
+        {
+          int
+            w = tr->aliaswgt[i];
+
+          double
+            rate;
+
+          /* check the category before it is used as an array index */
+          assert(0 <= tr->rateCategory[i] && tr->rateCategory[i] < tr->maxCategories);
+
+          rate = pr->partitionData[model]->perSiteRates[tr->rateCategory[i]];
+
+          accWgt += w;
+
+          accRat += (w * rate);
+        }
+    }
+
+  return accRat / ((double)accWgt);
+}
+
+/** @brief Bring the weighted mean rate of a range of partitions to 1.0
+ *
+ *  If \a scaleRates is set, all category rates of the partitions in the range
+ *  are multiplied by the inverse of their joint weighted mean. In either case
+ *  it is asserted afterwards that the weighted mean is 1.0 (within 1.0E-5).
+ *
+ *  @param tr          PLL instance
+ *  @param pr          List of partitions
+ *  @param firstModel  Index of the first partition of the range
+ *  @param endModel    One past the index of the last partition of the range
+ *  @param scaleRates  Whether to rescale or only to verify
+ */
+static void normalizePerSiteRates(pllInstance *tr, partitionList *pr, int firstModel, int endModel, pllBoolean scaleRates)
+{
+  double
+    meanRate;
+
+  if(scaleRates)
+    {
+      int
+        model,
+        i;
+
+      double
+        scaler = 1.0 / meanPerSiteRate(tr, pr, firstModel, endModel);
+
+      for(model = firstModel; model < endModel; model++)
+        {
+          for(i = 0; i < pr->partitionData[model]->numberOfCategories; i++)
+            pr->partitionData[model]->perSiteRates[i] *= scaler;
+        }
+    }
+
+  meanRate = meanPerSiteRate(tr, pr, firstModel, endModel);
+
+  assert(PLL_ABS(1.0 - meanRate) < 1.0E-5);
+
+  /* keeps builds without assertions free of an unused-variable warning */
+  (void)meanRate;
+}
+
+#if NOT (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+/** @brief Copy the global rate categories of one partition into its local array
+ *
+ *  @param tr     PLL instance
+ *  @param pr     List of partitions
+ *  @param model  Partition index
+ */
+static void copyRateCategoriesToPartition(pllInstance *tr, partitionList *pr, int model)
+{
+  int
+    i,
+    localCount,
+    lower = pr->partitionData[model]->lower,
+    upper = pr->partitionData[model]->upper;
+
+  for(i = lower, localCount = 0; i < upper; i++, localCount++)
+    pr->partitionData[model]->rateCategory[localCount] = tr->rateCategory[i];
+}
+#endif
+
+/** @brief Rescale (or verify) the per-site rates and publish the rate categories
+ *
+ *  With per-partition branch lengths and more than one partition, every
+ *  partition is normalized on its own; otherwise all partitions are normalized
+ *  jointly. In the sequential build the global rate categories are then copied
+ *  into the per-partition arrays; in the threaded/MPI build this is delegated
+ *  to the workers via \b PLL_THREAD_COPY_RATE_CATS.
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param scaleRates
+ *    If set, the rates are rescaled to a weighted mean of 1.0; if not, the
+ *    rates are left untouched and only asserted to already have that mean
+ */
+void updatePerSiteRates(pllInstance *tr, partitionList *pr, pllBoolean scaleRates)
+{
+  int 
+    model;
+
+  if(pr->perGeneBranchLengths && pr->numberOfPartitions > 1)
+    {            
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          normalizePerSiteRates(tr, pr, model, model + 1, scaleRates);
+
+#if NOT (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+          copyRateCategoriesToPartition(tr, pr, model);
+#endif
+        }
+    }
+  else
+    {
+      normalizePerSiteRates(tr, pr, 0, pr->numberOfPartitions, scaleRates);
+
+#if NOT (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        copyRateCategoriesToPartition(tr, pr, model);
+#endif
+    }
+  
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATE_CATS);
+#endif               
+}
+
+/** @brief Optimize rate categories for CAT model
+ *
+ *  Re-estimates the per-site rates, groups them into at most \a _maxCategories
+ *  categories per partition and keeps the result only if the tree likelihood
+ *  did not get worse; otherwise the previous categories and rates are restored.
+ *  Nothing is done if \a _maxCategories is 1.
+ *
+ *  The step widths of the rate search shrink with the number of times this
+ *  function has been invoked (\a optimizeRateCategoryInvocations) but never
+ *  drop below 0.001.
+ *
+ *  @param tr
+ *    PLL instance
+ *
+ *  @param pr
+ *    List of partitions
+ *
+ *  @param _maxCategories
+ *    Upper limit for the number of categories per partition, must be positive
+ */
+static void optimizeRateCategories(pllInstance *tr, partitionList *pr, int _maxCategories)
+{
+  double
+    startLH,
+    stepDown,
+    stepUp,
+    *savedPatrat,
+    **savedRates;
+
+  int
+    part,
+    *savedCategory,
+    *savedCounts;
+
+  assert(_maxCategories > 0);
+
+  /* with a single category there is nothing to optimize */
+  if(_maxCategories <= 1)
+    return;
+
+  /* likelihood as it was on entry, i.e. before the re-evaluation below */
+  startLH = tr->likelihood;
+
+  savedPatrat   = (double *)rax_malloc(sizeof(double) * tr->originalCrunchedLength);
+  savedRates    = (double **)rax_malloc(sizeof(double *) * pr->numberOfPartitions);
+  savedCategory = (int *)rax_malloc(sizeof(int) * tr->originalCrunchedLength);
+  savedCounts   = (int *)rax_malloc(sizeof(int) * pr->numberOfPartitions);
+
+  assert(isTip(tr->start->number, tr->mxtips));
+
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+  /* coarse steps on the first invocation, finer ones afterwards */
+  {
+    const double invocations = (double)(tr->optimizeRateCategoryInvocations);
+    const int firstInvocation = (tr->optimizeRateCategoryInvocations == 1);
+
+    stepDown = (firstInvocation ? 0.5 : 0.05) / invocations;
+    stepUp   = (firstInvocation ? 1.0 : 0.1) / invocations;
+  }
+
+  if(stepDown < 0.001)
+    stepDown = 0.001;
+
+  if(stepUp < 0.001)
+    stepUp = 0.001;
+
+  tr->optimizeRateCategoryInvocations++;
+
+  /* snapshot of everything that has to be restored if the likelihood drops */
+  memcpy(savedCategory, tr->rateCategory, sizeof(int) * tr->originalCrunchedLength);
+  memcpy(savedPatrat,   tr->patratStored, sizeof(double) * tr->originalCrunchedLength);
+
+  for(part = 0; part < pr->numberOfPartitions; part++)
+    {
+      savedCounts[part] = pr->partitionData[part]->numberOfCategories;
+
+      savedRates[part] = (double *)rax_malloc(sizeof(double) * tr->maxCategories);
+
+      memcpy(savedRates[part], pr->partitionData[part]->perSiteRates, tr->maxCategories * sizeof(double));
+    }
+
+  /* per-site rate search, results end up in tr->patrat and tr->lhs */
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  tr->lower_spacing = stepDown;
+  tr->upper_spacing = stepUp;
+  pllMasterBarrier(tr, pr, PLL_THREAD_RATE_CATS);
+#else
+  for(part = 0; part < pr->numberOfPartitions; part++)
+    optRateCatModel(tr, pr, part, stepDown, stepUp, tr->lhs);
+#endif
+
+  for(part = 0; part < pr->numberOfPartitions; part++)
+    {
+      const int
+        firstSite = pr->partitionData[part]->lower,
+        endSite   = pr->partitionData[part]->upper,
+        sites     = endSite - firstSite;
+
+      int
+        site,
+        slot,
+        distinct = 1;
+
+      rateCategorize
+        *rc = (rateCategorize *)rax_malloc(sizeof(rateCategorize) * sites);
+
+      for(slot = 0; slot < sites; slot++)
+        {
+          rc[slot].accumulatedSiteLikelihood = 0.0;
+          rc[slot].rate = 0.0;
+        }
+
+      /* the first site opens the first group */
+      rc[0].accumulatedSiteLikelihood = tr->lhs[firstSite];
+      rc[0].rate = tr->patrat[firstSite];
+
+      tr->rateCategory[firstSite] = 0;
+
+      /* group the remaining sites: rates closer than 0.001 share a group */
+      for(site = firstSite + 1; site < endSite; site++)
+        {
+          const double siteRate = tr->patrat[site];
+
+          for(slot = 0; slot < distinct; slot++)
+            {
+              if(siteRate == rc[slot].rate || (fabs(siteRate - rc[slot].rate) < 0.001))
+                break;
+            }
+
+          if(slot == distinct)
+            {
+              /* no matching group, open a new one at index slot */
+              rc[distinct].rate = siteRate;
+              distinct++;
+            }
+
+          rc[slot].accumulatedSiteLikelihood += tr->lhs[site];
+        }
+
+      qsort(rc, distinct, sizeof(rateCategorize), catCompare);
+
+      pr->partitionData[part]->numberOfCategories = (distinct < _maxCategories) ? distinct : _maxCategories;
+      categorizePartition(tr, pr, rc, part, firstSite, endSite);
+
+      rax_free(rc);
+    }
+
+  updatePerSiteRates(tr, pr, PLL_TRUE);
+
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+  if(tr->likelihood < startLH)
+    {
+      /* the new categories made things worse: go back to the snapshot */
+      for(part = 0; part < pr->numberOfPartitions; part++)
+        {
+          pr->partitionData[part]->numberOfCategories = savedCounts[part];
+          memcpy(pr->partitionData[part]->perSiteRates, savedRates[part], tr->maxCategories * sizeof(double));
+        }
+
+      memcpy(tr->patratStored, savedPatrat, sizeof(double) * tr->originalCrunchedLength);
+      memcpy(tr->rateCategory, savedCategory, sizeof(int) * tr->originalCrunchedLength);
+
+      updatePerSiteRates(tr, pr, PLL_FALSE);
+
+      pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+      assert(startLH == tr->likelihood);
+    }
+
+  for(part = 0; part < pr->numberOfPartitions; part++)
+    rax_free(savedRates[part]);
+
+  rax_free(savedRates);
+  rax_free(savedCategory);
+  rax_free(savedPatrat);
+  rax_free(savedCounts);
+}
+  
+
+/************************* end of functions for CAT model of rate heterogeneity */
+
+
+
+
+/*****************************************************************************************************/
+
+/* reset all branch lengths in tree to default values */
+
+/** @brief Reset all branch lengths to default values
+  
+    Sets all \b PLL_NUM_BRANCHES entries of the \a z array to \b PLL_DEFAULTZ for
+    mxtips + 3 * (mxtips - 2) node records, starting at \a tr->nodep[1] and
+    advancing by pointer increment (so the records are expected to be laid out
+    consecutively in memory). For every record the ring formed by the \a next
+    pointers is reset as well.
+
+    @param tr
+      PLL instance
+  */
+void resetBranches(pllInstance *tr)
+{
+  const int records = tr->mxtips + 3 * (tr->mxtips - 2);
+  nodeptr record = tr->nodep[1];
+  int done, branch;
+
+  for(done = 0; done < records; done++, record++)
+    {
+      nodeptr member = record;
+
+      /* handle the record itself first, then follow the ring back to it */
+      do
+        {
+          for(branch = 0; branch < PLL_NUM_BRANCHES; branch++)
+            member->z[branch] = PLL_DEFAULTZ;
+
+          member = member->next;
+        }
+      while(member != record);
+    }
+}
+
+/**
+ * @brief Raise frequencies below PLL_FREQ_MIN to that minimum.
+ *
+ * Entries smaller than PLL_FREQ_MIN are set to PLL_FREQ_MIN and the remaining
+ * entries are reduced proportionally. This is repeated until no entry is below
+ * the minimum any more; more than 100 repetitions trigger an assertion.
+ *
+ * @param frequencies          Array that is adjusted in place
+ * @param numberOfFrequencies  Number of entries in \a frequencies
+ */
+static void smoothFrequencies(double *frequencies, int numberOfFrequencies) {
+	int idx, belowMinimum = 0, rounds;
+
+	for (idx = 0; idx < numberOfFrequencies; idx++) {
+		if (frequencies[idx] < PLL_FREQ_MIN)
+			belowMinimum++;
+	}
+
+	for (rounds = 0; belowMinimum > 0; rounds++) {
+		double correction = 0.0, factor = 1.0;
+
+		/* how much has to be added to lift the small entries */
+		for (idx = 0; idx < numberOfFrequencies; idx++) {
+			if (frequencies[idx] == 0.0) {
+				correction += PLL_FREQ_MIN;
+			} else if (frequencies[idx] < PLL_FREQ_MIN) {
+				correction += (PLL_FREQ_MIN - frequencies[idx]);
+				factor -= (PLL_FREQ_MIN - frequencies[idx]);
+			}
+		}
+
+		/* lift the small entries, shrink the others, and recount */
+		belowMinimum = 0;
+
+		for (idx = 0; idx < numberOfFrequencies; idx++) {
+			if (frequencies[idx] < PLL_FREQ_MIN)
+				frequencies[idx] = PLL_FREQ_MIN;
+			else
+				frequencies[idx] = frequencies[idx] - (frequencies[idx] * correction * factor);
+
+			if (frequencies[idx] < PLL_FREQ_MIN)
+				belowMinimum++;
+		}
+
+		assert(rounds < 100);
+	}
+}
+
+/**
+ * @brief Evaluate all possible protein models
+ *
+ * Every model index from 0 up to (but not including) PLL_AUTO is assigned in
+ * turn to all partitions whose protein model is PLL_AUTO. For each candidate
+ * the branch lengths are reset and re-optimized, and the best partition
+ * likelihood seen so far is recorded per partition.
+ *
+ * @param tr              PLL instance
+ * @param pr              List of partitions
+ * @param bestIndex       Receives, per partition, the index of the best model
+ *                        (-1 for partitions that are not PLL_AUTO)
+ * @param bestScores      Receives, per partition, the likelihood of that model
+ *                        (PLL_UNLIKELY for partitions that are not PLL_AUTO)
+ * @param empiricalFreqs  Whether the candidates use empirical base frequencies
+ */
+static void optimizeProteinModels(pllInstance *tr, partitionList * pr, int *bestIndex, double *bestScores, pllBoolean empiricalFreqs)
+{
+	int modelIndex, partitionIndex,
+	    numProteinModels = PLL_AUTO;
+
+	for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+		bestIndex[partitionIndex] = -1;
+		bestScores[partitionIndex] = PLL_UNLIKELY;
+	}
+
+	if (empiricalFreqs) {
+		double ** freqs = pllBaseFrequenciesInstance(tr, pr);
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			/* PLL_NUM_AA_STATES entries are smoothed and copied below, which is
+			   only meaningful for protein partitions; other data types are
+			   left untouched */
+			if (pr->partitionData[partitionIndex]->dataType != PLL_AA_DATA)
+				continue;
+
+			smoothFrequencies(freqs[partitionIndex], PLL_NUM_AA_STATES);
+			memcpy(pr->partitionData[partitionIndex]->empiricalFrequencies, freqs[partitionIndex], PLL_NUM_AA_STATES*sizeof(double));
+		}
+		/* NOTE(review): only the outer pointer array is released here. If
+		   pllBaseFrequenciesInstance allocates one array per partition, those
+		   arrays are leaked -- confirm and release them with the matching
+		   deallocator. */
+		free(freqs);
+	}
+
+	for (modelIndex = 0; modelIndex < numProteinModels; modelIndex++) {
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+
+				pr->partitionData[partitionIndex]->autoProtModels = modelIndex;
+				pr->partitionData[partitionIndex]->protUseEmpiricalFreqs =
+						empiricalFreqs;
+
+				assert(!pr->partitionData[partitionIndex]->optimizeBaseFrequencies);
+
+				pllInitReversibleGTR(tr, pr, partitionIndex);
+			}
+		}
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+		pllMasterBarrier (tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+
+		/* optimize branch lengths */
+		resetBranches(tr);
+		pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+		pllOptimizeBranchLengths(tr, pr, 16);
+
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+				if (pr->partitionData[partitionIndex]->partitionLH > bestScores[partitionIndex]) {
+					/* improved best score */
+					bestScores[partitionIndex] = pr->partitionData[partitionIndex]->partitionLH;
+					bestIndex[partitionIndex] = modelIndex;
+				}
+			}
+		}
+	}
+}
+
+/* 
+   automatically compute the best protein substitution model for the dataset at hand.
+ */
+
+/** @brief Compute the best protein substitution model
+  *
+  * Automatically compute the best protein substitution model for the dataset
+  * at hand
+  *
+  * @param tr
+  *   The PLL instance
+  *
+  * @param pr
+  *   List of partitions
+  *
+  */
+static void autoProtein(pllInstance *tr, partitionList *pr)
+{
+	int countAutos = 0, partitionIndex;
+
+	/* count the number of partitions with model set to PLL_AUTO */
+	for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++)
+		if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO)
+			countAutos++;
+
+	/* if there are partitions with model set to PLL_AUTO compute the best model */
+	if (countAutos > 0) {
+		int *bestIndex = (int*) rax_malloc(
+				sizeof(int) * pr->numberOfPartitions),
+		    *bestIndexEmpFreqs = (int*) rax_malloc(
+				sizeof(int) * pr->numberOfPartitions),
+		    *oldIndex =
+				(int*) rax_malloc(sizeof(int) * pr->numberOfPartitions);
+
+		pllBoolean *oldFreqs = (pllBoolean*) malloc(
+				sizeof(pllBoolean) * pr->numberOfPartitions);
+
+		double startLH,
+		      *bestScores = (double*) rax_malloc(
+				sizeof(double) * pr->numberOfPartitions),
+			  *bestScoresEmpFreqs = (double*) rax_malloc(
+				sizeof(double) * pr->numberOfPartitions);
+
+		topolRELL_LIST *rl = (topolRELL_LIST *) rax_malloc(
+				sizeof(topolRELL_LIST));
+
+		initTL(rl, tr, 1);
+		saveTL(rl, tr, 0);
+
+		pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+		/* store the initial likelihood of the tree with the currently assigned protein models */
+		startLH = tr->likelihood;
+
+		/* save the currently assigned protein model for each PLL_AUTO partition */
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			oldIndex[partitionIndex] = pr->partitionData[partitionIndex]->autoProtModels;
+			oldFreqs[partitionIndex] = pr->partitionData[partitionIndex]->protUseEmpiricalFreqs;
+			bestIndex[partitionIndex] = -1;
+			bestScores[partitionIndex] = PLL_UNLIKELY;
+		}
+
+		/* evaluate all models with fixed base frequencies */
+		optimizeProteinModels(tr, pr, bestIndex, bestScores, PLL_FALSE);
+		/* evaluate all models with fixed empirical frequencies */
+		optimizeProteinModels(tr, pr, bestIndexEmpFreqs, bestScoresEmpFreqs, PLL_TRUE);
+
+		/* model selection */
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+				int bestIndexFixed = bestIndex[partitionIndex],
+				    bestIndexEmp = bestIndexEmpFreqs[partitionIndex];
+
+				double bestLhFixed = bestScores[partitionIndex],
+					   bestLhEmp = bestScoresEmpFreqs[partitionIndex],
+					   samples = 0.0,
+					   freeParamsFixed = 0.0,
+					   freeParamsEmp = 0.0;
+
+				samples = pr->partitionData[partitionIndex]->partitionWeight;
+				assert(samples > 0.0 && samples >= pr->partitionData[partitionIndex]->width);
+
+				assert(tr->ntips == tr->mxtips);
+				freeParamsFixed = freeParamsEmp = (2 * tr->ntips - 3);
+				freeParamsEmp += 19.0;
+
+				switch (tr->rateHetModel) {
+				case PLL_CAT:
+					freeParamsFixed +=
+							(double) pr->partitionData[partitionIndex]->numberOfCategories;
+					freeParamsEmp +=
+							(double) pr->partitionData[partitionIndex]->numberOfCategories;
+					break;
+				case PLL_GAMMA:
+					freeParamsFixed += 1.0;
+					freeParamsEmp += 1.0;
+					break;
+				default:
+					assert(0);
+				}
+
+				switch (tr->autoProteinSelectionType) {
+				case PLL_AUTO_ML:
+					if (bestLhFixed > bestLhEmp) {
+						pr->partitionData[partitionIndex]->autoProtModels =
+								bestIndexFixed;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 0;
+					} else {
+						pr->partitionData[partitionIndex]->autoProtModels = bestIndexEmp;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 1;
+					}
+					break;
+				case PLL_AUTO_BIC: {
+					//BIC: -2 * lnL + k * ln(n)
+					double bicFixed = -2.0 * bestLhFixed
+							+ freeParamsFixed * log(samples),
+						   bicEmp = -2.0
+							* bestLhEmp + freeParamsEmp * log(samples);
+
+					if (bicFixed < bicEmp) {
+						pr->partitionData[partitionIndex]->autoProtModels =
+								bestIndexFixed;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 0;
+					} else {
+						pr->partitionData[partitionIndex]->autoProtModels = bestIndexEmp;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 1;
+					}
+				}
+					break;
+				case PLL_AUTO_AIC: {
+					//AIC: 2 * (k - lnL)
+					double aicFixed = 2.0 * (freeParamsFixed - bestLhFixed),
+							aicEmp = 2.0 * (freeParamsEmp - bestLhEmp);
+
+					if (aicFixed < aicEmp) {
+						pr->partitionData[partitionIndex]->autoProtModels =
+								bestIndexFixed;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 0;
+					} else {
+						pr->partitionData[partitionIndex]->autoProtModels = bestIndexEmp;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 1;
+					}
+				}
+					break;
+				case PLL_AUTO_AICC: {
+					//AICc: AIC + (2 * k * (k + 1))/(n - k - 1)
+					double aiccFixed, aiccEmp;
+
+					/*
+					 * Even though samples and freeParamsFixed are fp variables, they are actually integers.
+					 * That's why we are comparing with a 0.5 threshold.
+					 */
+
+					if (fabs(samples - freeParamsFixed - 1.0) < 0.5)
+						aiccFixed = 0.0;
+					else
+						aiccFixed = (2.0 * (freeParamsFixed - bestLhFixed))
+								+ ((2.0 * freeParamsFixed
+										* (freeParamsFixed + 1.0))
+										/ (samples - freeParamsFixed - 1.0));
+
+					if (fabs(samples - freeParamsEmp - 1.0) < 0.5)
+						aiccEmp = 0.0;
+					else
+						aiccEmp = (2.0 * (freeParamsEmp - bestLhEmp))
+								+ ((2.0 * freeParamsEmp * (freeParamsEmp + 1.0))
+										/ (samples - freeParamsEmp - 1.0));
+
+					if (aiccFixed < aiccEmp) {
+						pr->partitionData[partitionIndex]->autoProtModels =
+								bestIndexFixed;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 0;
+					} else {
+						pr->partitionData[partitionIndex]->autoProtModels = bestIndexEmp;
+						pr->partitionData[partitionIndex]->protUseEmpiricalFreqs = 1;
+					}
+				}
+					break;
+				default:
+					assert(0);
+				}
+
+				pllInitReversibleGTR(tr, pr, partitionIndex);
+			}
+		}
+
+		resetBranches(tr);
+		pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+		pllOptimizeBranchLengths(tr, pr, 64);
+
+		/* set the protein model of PLL_AUTO partitions to the best computed and reset model parameters */
+		for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+			if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+				pr->partitionData[partitionIndex]->autoProtModels = bestIndex[partitionIndex];
+				pllInitReversibleGTR(tr, pr, partitionIndex);
+			}
+		}
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+		pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+
+		/* compute again the likelihood of the tree */
+		resetBranches(tr);
+		pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+		pllOptimizeBranchLengths(tr, pr, 64);
+
+		/* check if the likelihood of the tree with the new protein models assigned to PLL_AUTO partitions is better than the with the old protein models */
+		if (tr->likelihood < startLH) {
+			for (partitionIndex = 0; partitionIndex < pr->numberOfPartitions; partitionIndex++) {
+				if (pr->partitionData[partitionIndex]->protModels == PLL_AUTO) {
+					pr->partitionData[partitionIndex]->autoProtModels = oldIndex[partitionIndex];
+					pllInitReversibleGTR(tr, pr, partitionIndex);
+				}
+			}
+
+			//this barrier needs to be called in the library
+			//#ifdef _USE_PTHREADS
+			//pllMasterBarrier(tr, pr, PLL_THREAD_COPY_RATES);
+			//#endif
+
+			/* Restore the topology. rl holds the topology before the optimization. However,
+			 since the topology doesn't change - only the branch lengths do - maybe we
+			 could write a new routine that will store only the branch lengths and restore them */
+			restoreTL(rl, tr, 0,
+					pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+			pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+		}
+
+		assert(tr->likelihood >= startLH);
+
+		freeTL(rl);
+		rax_free(rl);
+
+		rax_free(oldIndex);
+		rax_free(bestIndex);
+		rax_free(bestIndexEmpFreqs);
+		rax_free(bestScores);
+		rax_free(bestScoresEmpFreqs);
+	}
+}
+
+
+/* iterative procedure for optimizing all model parameters */
+
+/** @brief Optimize all model parameters
+ *
+ * Repeats rounds of rate, protein-model, base-frequency and rate-heterogeneity optimization
+ *
+ * @param tr
+ *   PLL instance
+ *
+ * @param pr
+ *   List of partitions
+ *
+ * @param likelihoodEpsilon
+ *   Rounds are repeated while one round changes the likelihood by more than \a likelihoodEpsilon
+ *
+ * @todo
+ *   Understand the TODO marked blocks.
+ */
+void modOpt(pllInstance *tr, partitionList *pr, double likelihoodEpsilon)
+{ 
+  int catOpt = 0; /* counts the rate-category optimizations, capped at 3 below */
+  double 
+    inputLikelihood,
+    currentLikelihood,
+    modelEpsilon = 0.0001;
+
+  /* linkage lists for alpha, p-invar has actually been omitted in this version of the code 
+     and the GTR subst matrices */
+
+  linkageList
+    *alphaList = pr->alphaList,
+    *rateList  = pr->rateList,
+    *freqList  = pr->freqList;
+
+  modelEpsilon = 0.0001; /* same value as the initializer above */
+
+  // disabled test code for the library: the if (0) branch only holds commented-out examples
+  if (0)
+   {
+     
+      //assuming that we have three partitions for testing here 
+
+      //alphaList = initLinkageListString("0,1,2", pr);
+      //rateList  = initLinkageListString("0,1,1", pr);
+    
+      //init_Q_MatrixSymmetries("0,1,2,3,4,5", pr, 0);
+      //init_Q_MatrixSymmetries("0,1,2,3,4,4", pr, 1);
+      //init_Q_MatrixSymmetries("0,1,1,2,3,4", pr, 2);
+      
+      //function that checks that partitions that have linked Q matrices as in our example above
+      //will not have different configurations of the Q matrix as set by the init_Q_MatrixSymmetries() function
+      //e.g., one would have HKY and one would have GTR, while the user claims that they are linked
+      //in our example, the Q matrices of partitions 1 and 2 are linked 
+      //but we set different matrix symmetries via 
+      // init_Q_MatrixSymmetries("0,1,2,3,4,4", tr, 1);
+      // and
+      // init_Q_MatrixSymmetries("0,1,1,2,3,4", tr, 2);
+      //
+      //the function just lets assertions fail for the time being .....
+
+      //checkMatrixSymnmetriesAndLinkage(pr, rateList);
+
+  /* alpha parameters and p-invar parameters are unlinked.
+     this is the point where I actually hard-coded this in RAxML */
+
+  /* call the dedicated function for linking the GTR matrix across all AA data partitions 
+     If we have only DNA data all GTR matrix estimates will be unlinked.
+     */
+   }
+  else
+   {
+     //alphaList = initLinkageList(unlinked, pr);
+     //freqList  = initLinkageList(unlinked, pr);
+     //rateList  = initLinkageListGTR(pr);
+   }
+
+  tr->start = tr->nodep[1];
+
+  /* This check is here to make sure that the likelihood 
+     computed prior to entering modOpt() is consistent 
+     with the likelihood when entering modOpt().
+     This allows us to ensure that we didn't forget to update anything prior 
+     to entering this function. The comparison below is an exact == on doubles.
+   */
+  inputLikelihood = tr->likelihood;
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+  assert (inputLikelihood == tr->likelihood);
+
+  do
+  {           
+    // likelihood at the start of this round; the round's result is compared against it
+    currentLikelihood = tr->likelihood;     
+
+#ifdef _DEBUG_MOD_OPT
+      printf ("start: %f\n", currentLikelihood);
+#endif
+
+    pllOptRatesGeneric(tr, pr, modelEpsilon, rateList);
+
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+#ifdef _DEBUG_MOD_OPT
+    printf ("after rates %f\n", tr->likelihood);
+#endif
+
+    autoProtein(tr, pr);
+
+    pllOptimizeBranchLengths(tr, pr, 2); // 0.0625 * 32 = 2.0
+
+#ifdef _DEBUG_MOD_OPT
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+    printf("after br-len 1 %f\n", tr->likelihood); 
+#endif
+
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+    pllOptBaseFreqs(tr, pr, modelEpsilon, freqList);
+    
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+    
+    pllOptimizeBranchLengths(tr, pr, 2); // 0.0625 * 32 = 2.0
+
+#ifdef _DEBUG_MOD_OPT
+    pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE); 
+    printf("after pllOptBaseFreqs 1 %f\n", tr->likelihood);
+#endif 
+
+    switch(tr->rateHetModel)
+    {
+      case PLL_GAMMA:      
+        pllOptAlphasGeneric (tr, pr, modelEpsilon, alphaList);
+        pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+#ifdef _DEBUG_MOD_OPT
+          printf("after alphas %f\n", tr->likelihood); 
+#endif
+
+        pllOptimizeBranchLengths(tr, pr, 3); // 0.1 * 32 = 3.2
+
+#ifdef _DEBUG_MOD_OPT
+          pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);  
+          printf("after br-len 2 %f\n", tr->likelihood); 
+#endif
+        break;
+      case PLL_CAT:
+        if(catOpt < 3)
+        {                            
+          pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);  
+          optimizeRateCategories(tr, pr, tr->categories);
+#ifdef _DEBUG_MOD_OPT
+            pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);  
+            printf("after cat-opt %f\n", tr->likelihood); 
+#endif
+          catOpt++;
+        }
+        break;    
+      default:
+        assert(0);
+    }                   
+
+    if(tr->likelihood < currentLikelihood)
+     {
+      printf("%.20f %.20f\n", tr->likelihood, currentLikelihood);
+      printf("Difference: %.20f\n",tr->likelihood - currentLikelihood);
+    }
+    assert (tr->likelihood - currentLikelihood > 0.000000000000001);
+    // NOTE(review): the assert above demands a strict gain per round; a round without any change aborts - confirm intent
+
+  }
+  while(fabs(currentLikelihood - tr->likelihood) > likelihoodEpsilon);  
+  /* TODO: Why do we check the computed likelihood with the currentLikelihood which is the likelihood before THIS optimization loop? Why don't we
+     rather check it with the initial likelihood (the one before calling modOpt)? Isn't it possible to have a deadlock? */
+
+  
+}
+
diff --git a/pllrepo/src/parsePartition.c b/pllrepo/src/parsePartition.c
new file mode 100644
index 0000000..1ae92af
--- /dev/null
+++ b/pllrepo/src/parsePartition.c
@@ -0,0 +1,388 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file parsePartition.c
+ * @brief Collection of routines for parsing and processing a partition (model) file
+ *
+ * @defgroup parsePartitionFileGroup Reading and parsing partition (model) files
+ * This set of functions handles the reading and parsing of partition files, i.e.
+ * files that contain alignment partition definitions and corresponding models.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+extern const char *protModels[PLL_NUM_PROT_MODELS];
+
+/* Releases the table built by init_model_names(), passing rax_free for its items. */
+static void destroy_model_names(pllHashTable * hashTable) {
+  pllHashDestroy(&hashTable, rax_free);
+}
+
+/* Build the table that maps every name in protModels[] to a heap-allocated
+   copy of its array index; undone by destroy_model_names(). */
+static pllHashTable * init_model_names (void)
+{
+  pllHashTable * table = pllHashInit (PLL_NUM_PROT_MODELS);
+  int modelIndex = 0;
+
+  while (modelIndex < PLL_NUM_PROT_MODELS)
+   {
+     const char * name = protModels[modelIndex];
+     int * boxedIndex  = (int *) rax_malloc (sizeof (int));
+     *boxedIndex = modelIndex++;
+     pllHashAdd (table, pllHashString (name, table->size), name, (void *) boxedIndex);
+   }
+  return table;
+}
+
+/** @ingroup parsePartitionFileGroup
+    @brief Destroy queue structure that contains parsed information from a partition file
+
+    Frees every partition entry (region list, name and model strings) and the queue
+    itself, then sets \a *partitions to NULL. A NULL argument or NULL queue is ignored.
+
+    @param partitions
+      Address of the queue structure with parsed info
+*/
+void pllQueuePartitionsDestroy (pllQueue ** partitions)
+{
+  pllPartitionInfo * pi;
+  pllPartitionRegion * region;
+
+  if (!partitions || !*partitions) return;
+  while (pllQueueRemove (*partitions, (void **)&pi))
+   {
+     while (pllQueueRemove (pi->regionList, (void **) &region))
+       rax_free (region);
+     rax_free (pi->regionList);
+     rax_free (pi->partitionName);
+     rax_free (pi->partitionModel);
+     rax_free (pi);
+   }
+  rax_free (*partitions);
+  *partitions = NULL;      /* do not leave the caller with a dangling queue pointer */
+}
+
+/* Parse the token stream of a partition (model) definition into a queue.
+ *
+ * Each definition has the form   MODEL, NAME = START[-END[/STRIDE]] {, ...}
+ * where MODEL may start with "ASC_" and, for protein models, end in 'X' or 'F'.
+ *
+ * inp                points to the lexer's current input symbol; it is only read
+ * proteinModelsHash  table mapping a protein model name to a pointer to its index
+ *
+ * Returns the queue of pllPartitionInfo entries, or 0 when the input is malformed
+ * (including END < START or STRIDE < 1) or names an unknown model. On failure all
+ * memory allocated so far is released, including a region not yet appended.
+ *
+ * NEXT_TOKEN and CONSUME are lexer macros defined outside this function; they
+ * presumably read the local `input` and assign the local `token`, so neither of
+ * the two locals may be renamed.
+ */
+static pllQueue * parse_partition (int * inp, pllHashTable * proteinModelsHash)
+{
+  int input, i;
+  pllLexToken token;
+  pllQueue * partitions;
+  pllPartitionInfo * pi;
+  pllPartitionRegion * region = NULL;   /* region under construction, in no queue yet */
+  int * protIndexPtr;
+  char * modelptr;
+
+  input  = *inp;
+
+  NEXT_TOKEN
+
+  pllQueueInit (&partitions);
+  while (token.tokenType != PLL_TOKEN_EOF)
+  {
+    /* the entry is queued right away so that the error exit can release it */
+    pi = (pllPartitionInfo *) rax_calloc (1, sizeof (pllPartitionInfo));
+    pllQueueInit (&(pi->regionList));
+    pllQueueAppend (partitions, (void *)pi);
+    CONSUME (PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+
+    /* read partition type; the model string is stored upper-cased */
+    if (token.tokenType != PLL_TOKEN_STRING)
+      goto fail;
+    pi->partitionModel = my_strndup (token.lexeme, token.len);
+    for (i = 0; i < token.len; ++i)
+      pi->partitionModel[i] = (char) toupper ((unsigned char) pi->partitionModel[i]);
+
+    /* defaults, refined below according to the model string */
+    pi->protModels              = -1;
+    pi->protUseEmpiricalFreqs   = PLL_FALSE;
+    pi->ascBias                 = PLL_FALSE;
+    pi->optimizeBaseFrequencies = PLL_FALSE;
+
+    /* check if the model contains Asc bias */
+    if (!strncmp(pi->partitionModel, "ASC_", 4))
+      {
+        pi->ascBias = PLL_TRUE;
+        modelptr    = pi->partitionModel + 4;
+      }
+     else
+        modelptr    = pi->partitionModel;
+
+    /* check first for BINARY */
+    if (!strcmp(modelptr, "BIN") || !strcmp(modelptr, "BINX"))
+     {
+       pi->dataType = PLL_BINARY_DATA;
+
+       if (!strcmp(modelptr, "BINX"))
+         pi->optimizeBaseFrequencies = PLL_TRUE;
+     }  /* now for DNA */
+    else if (!strcmp(modelptr, "DNA") || !strcmp(modelptr, "DNAX"))
+     {
+       pi->dataType   = PLL_DNA_DATA;
+
+       if (!strcmp(modelptr, "DNAX"))
+         pi->optimizeBaseFrequencies = PLL_TRUE;
+     }
+    else
+     {                  /* and  protein data */
+       pi->dataType  = PLL_AA_DATA;
+
+       if (pllHashSearch (proteinModelsHash, modelptr, (void **) &protIndexPtr))
+        {
+          pi->protModels = *protIndexPtr;
+        }
+       else
+        {
+          /* Not a plain model name: accept NAME + 'X' (optimize base frequencies)
+             or NAME + 'F' (empirical frequencies). The suffix is located with
+             strlen (modelptr) because modelptr is shorter than the token whenever
+             the "ASC_" prefix was skipped; indexing it with token.len would run
+             past the end of the string. */
+          size_t modelLen = strlen (modelptr);
+          char   suffix   = modelLen ? modelptr[modelLen - 1] : '\0';
+
+          if (suffix != 'X' && suffix != 'F')
+            goto fail;
+
+          /* look up the base name with the suffix temporarily cut off */
+          modelptr[modelLen - 1] = '\0';
+          if (!pllHashSearch (proteinModelsHash, modelptr, (void **) &protIndexPtr))
+            goto fail;             /* unknown model, as for names without suffix */
+          modelptr[modelLen - 1] = suffix;
+
+          pi->protModels = *protIndexPtr;
+          if (suffix == 'X')
+            pi->optimizeBaseFrequencies = PLL_TRUE;
+          else
+            pi->protUseEmpiricalFreqs   = PLL_TRUE;
+        }
+     }
+
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE)
+
+    /* a comma separates the model from the partition name */
+    if (token.tokenType != PLL_TOKEN_COMMA)
+      goto fail;
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE)
+
+    /* read partition name */
+    if (token.tokenType != PLL_TOKEN_STRING)
+      goto fail;
+    pi->partitionName = my_strndup (token.lexeme, token.len);
+
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE)
+
+    /* read equal sign */
+    if (token.tokenType != PLL_TOKEN_EQUAL)
+      goto fail;
+    NEXT_TOKEN
+    CONSUME(PLL_TOKEN_WHITESPACE)
+
+    /* read rhs: a comma-separated list of regions */
+    while (1)
+    {
+      /* allocate the region only once a number is known to follow */
+      if (token.tokenType != PLL_TOKEN_NUMBER)
+        goto fail;
+      region = (pllPartitionRegion *) rax_malloc (sizeof (pllPartitionRegion));
+      region->start  = region->end = atoi (token.lexeme);
+      region->stride = 1;
+      NEXT_TOKEN
+      CONSUME(PLL_TOKEN_WHITESPACE)
+
+      /* optional "-END", which in turn may be followed by "/STRIDE" */
+      if  (token.tokenType == PLL_TOKEN_DASH)
+       {
+         NEXT_TOKEN
+         CONSUME(PLL_TOKEN_WHITESPACE)
+         if (token.tokenType != PLL_TOKEN_NUMBER)
+           goto fail;
+         region->end = atoi (token.lexeme);
+         if (region->end < region->start)
+           goto fail;
+         NEXT_TOKEN
+         CONSUME(PLL_TOKEN_WHITESPACE)
+         if (token.tokenType == PLL_TOKEN_SLASH)
+          {
+            NEXT_TOKEN
+            CONSUME(PLL_TOKEN_WHITESPACE)
+            if (token.tokenType != PLL_TOKEN_NUMBER)
+              goto fail;
+            region->stride = atoi (token.lexeme);
+            if (region->stride < 1)
+              goto fail;           /* a stride has to advance through the range */
+            NEXT_TOKEN
+          }
+         CONSUME(PLL_TOKEN_WHITESPACE)
+       }
+      pllQueueAppend (pi->regionList, (void *)region);
+      region = NULL;                /* ownership passed to pi->regionList */
+
+      if (token.tokenType != PLL_TOKEN_COMMA) break;
+      NEXT_TOKEN
+      CONSUME(PLL_TOKEN_WHITESPACE)
+    }
+    CONSUME(PLL_TOKEN_WHITESPACE | PLL_TOKEN_NEWLINE)
+  }
+
+  return (partitions);
+
+  /* Common error exit. A region still under construction is in no queue, so it
+     is freed here; region is NULL otherwise, which this file already passes to
+     rax_free (see the NULL name/model strings in pllQueuePartitionsDestroy). */
+fail:
+  rax_free (region);
+  pllQueuePartitionsDestroy (&partitions);
+  return (0);
+}
+
+/** @ingroup parsePartitionFileGroup
+    @brief Dump a parsed partition file in the console
+
+    Prints one line per partition to standard output: "MODEL, NAME = " followed by
+    the regions separated by ", ". A region is printed as START, as START-END when
+    the two differ, and as START-END/STRIDE when additionally the stride is not 1.
+
+    @param partitions Queue structure containing parsed information
+*/
+void pllPartitionDump (pllQueue * partitions)
+{
+   struct pllQueueItem * partItem;
+   struct pllQueueItem * regItem;
+
+   for (partItem = partitions->head; partItem; partItem = partItem->next)
+    {
+      pllPartitionInfo * info = (pllPartitionInfo *) partItem->item;
+
+      printf ("%s, %s = ", info->partitionModel, info->partitionName);
+      for (regItem = info->regionList->head; regItem; regItem = regItem->next)
+       {
+         pllPartitionRegion * reg = (pllPartitionRegion *) regItem->item;
+
+         printf ("%d", reg->start);
+         if (reg->start != reg->end)
+          {
+            printf ("-%d", reg->end);
+            if (reg->stride != 1)
+              printf ("/%d", reg->stride);
+          }
+
+         /* separator between regions, none after the last one */
+         if (regItem->next)
+           printf (", ");
+       }
+      printf ("\n");
+    }
+}
+
+/** @ingroup parsePartitionFileGroup
+    @brief Parse a partition (model) file
+
+    Reads the partition file \a filename, runs the partition parser over its
+    contents and stores the information in a queue structure ::pllQueue
+
+    @param filename Name of the partition file
+    @return Queue structure with parsed information; 0 if the file could not be
+            read (a message is written to stderr) or if parse_partition() returned 0
+*/
+pllQueue * pllPartitionParse (const char * filename)
+{
+  long size;
+  int symbol;
+  pllQueue * parsed;
+  pllHashTable * modelTable;
+  char * text = pllReadFile (filename, &size);
+
+  if (text == NULL)
+   {
+     fprintf (stderr, "Error while opening/reading file %s\n", filename);
+     return (0);
+   }
+
+  /* size is overwritten: the lexer gets the length up to the first NUL */
+  size = strlen (text);
+  init_lexan (text, size);
+  symbol = get_next_symbol();
+
+  modelTable = init_model_names();
+  parsed     = parse_partition (&symbol, modelTable);
+  destroy_model_names(modelTable);
+  rax_free (text);
+  return (parsed);
+}
+
+/** @ingroup parsePartitionFileGroup
+    @brief Parse partition (model) definitions held in a string
+
+    Parses the partition information stored in string \a p and stores the
+    information in a queue structure ::pllQueue
+
+    @param p Partition information string
+    @return  Queue structure with parsed information, or 0 if parse_partition() returned 0
+*/
+pllQueue * pllPartitionParseString (const char * p)
+{
+  long length = strlen(p);
+  int symbol;
+  pllQueue * parsed;
+  pllHashTable * modelTable;
+
+  init_lexan (p, length);
+  symbol = get_next_symbol();
+
+  /* the model-name table lives only for the duration of the parse */
+  modelTable = init_model_names();
+  parsed     = parse_partition (&symbol, modelTable);
+  destroy_model_names(modelTable);
+
+  return (parsed);
+}
diff --git a/pllrepo/src/parsePartition.h b/pllrepo/src/parsePartition.h
new file mode 100644
index 0000000..47799d9
--- /dev/null
+++ b/pllrepo/src/parsePartition.h
@@ -0,0 +1,51 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file parsePartition.h
+ */
+#ifndef __pll_PART__
+#define __pll_PART__
+#include "queue.h"
+
+typedef struct   /* one "start[-end[/stride]]" region of a partition definition */
+{
+  int start;     /**< first number of the region as read by the parser */
+  int end;       /**< number after '-'; equals start when no '-' part was given */
+  int stride;    /**< number after '/'; 1 when no '/' part was given */
+} pllPartitionRegion;
+
+typedef struct   /* one partition definition as filled in by the partition parser */
+{
+  char * partitionName;         /**< copy of the name token */
+  char * partitionModel;        /**< upper-cased copy of the model token */
+  int protModels;               /**< index into protModels[] of a recognised protein model, otherwise -1 */
+  int protUseEmpiricalFreqs;    /**< PLL_TRUE when a protein model carries the 'F' suffix */
+  int dataType;                 /**< PLL_BINARY_DATA, PLL_DNA_DATA or PLL_AA_DATA */
+  int ascBias;                  /**< PLL_TRUE when the model string starts with "ASC_" */
+  int optimizeBaseFrequencies;  /**< PLL_TRUE for BINX, DNAX or a protein model with 'X' suffix */
+  pllQueue * regionList;        /**< queue of pllPartitionRegion entries */
+} pllPartitionInfo;
+#endif
diff --git a/pllrepo/src/parsimony.c b/pllrepo/src/parsimony.c
new file mode 100644
index 0000000..1fae471
--- /dev/null
+++ b/pllrepo/src/parsimony.c
@@ -0,0 +1,865 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file parsimony.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>  
+#endif
+
+#include <limits.h>
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+
+#if defined(__MIC_NATIVE)
+/* 512-bit integer vectors (_mm512_* intrinsics) */
+#include <immintrin.h>
+
+#define INTS_PER_VECTOR 16
+#define LONG_INTS_PER_VECTOR 8
+#define INT_TYPE __m512i
+#define CAST double*
+#define SET_ALL_BITS_ONE _mm512_set1_epi32(0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO _mm512_setzero_epi32()
+#define VECTOR_LOAD _mm512_load_epi32
+#define VECTOR_STORE  _mm512_store_epi32
+#define VECTOR_BIT_AND _mm512_and_epi32
+#define VECTOR_BIT_OR  _mm512_or_epi32
+#define VECTOR_AND_NOT _mm512_andnot_epi32
+/* NOTE(review): ULINT_SIZE is not defined in this branch although storePerSiteScores() uses it - confirm */
+#elif defined(__AVX)
+/* 256-bit vectors; the bitwise operations are done with the _pd (double) intrinsics */
+#include <xmmintrin.h>
+#include <immintrin.h>
+#include <pmmintrin.h>
+
+#define ULINT_SIZE 64
+#define INTS_PER_VECTOR 8
+#define LONG_INTS_PER_VECTOR 4
+#define INT_TYPE __m256d
+#define CAST double*
+#define SET_ALL_BITS_ONE (__m256d)_mm256_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO (__m256d)_mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000)
+#define VECTOR_LOAD _mm256_load_pd
+#define VECTOR_BIT_AND _mm256_and_pd
+#define VECTOR_BIT_OR  _mm256_or_pd
+#define VECTOR_STORE  _mm256_store_pd
+#define VECTOR_AND_NOT _mm256_andnot_pd
+
+#elif (defined(__SSE3))
+/* 128-bit integer vectors; ULINT_SIZE and LONG_INTS_PER_VECTOR depend on __i386__ */
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+  
+#define INTS_PER_VECTOR 4
+#ifdef __i386__
+#define ULINT_SIZE 32
+#define LONG_INTS_PER_VECTOR 4
+#else
+#define ULINT_SIZE 64
+#define LONG_INTS_PER_VECTOR 2
+#endif
+#define INT_TYPE __m128i
+#define CAST __m128i*
+#define SET_ALL_BITS_ONE _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
+#define SET_ALL_BITS_ZERO _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000)
+#define VECTOR_LOAD _mm_load_si128
+#define VECTOR_BIT_AND _mm_and_si128
+#define VECTOR_BIT_OR  _mm_or_si128
+#define VECTOR_STORE  _mm_store_si128
+#define VECTOR_AND_NOT _mm_andnot_si128
+/* NOTE(review): there is no #else branch; without one of the three macros INT_TYPE etc. stay undefined here */
+#endif
+
+#include "pll.h"
+#include "pllInternal.h"
+
+extern const unsigned int mask32[32]; 
+
+/* Number of set bits in v: the vector is stored to an aligned array of
+   LONG_INTS_PER_VECTOR unsigned longs whose popcounts are added up. */
+static __inline unsigned int vectorPopcount(INT_TYPE v)
+{
+  unsigned long
+    lanes[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
+  int total = 0;
+  int lane  = LONG_INTS_PER_VECTOR;
+
+  VECTOR_STORE((CAST)lanes, v);
+
+  while(lane-- > 0)
+    total += __builtin_popcountl(lanes[lane]);
+
+  return ((unsigned int)total);
+}
+
+/* Add every bit of v to the per-site parsimony scores of partition model:
+   bit j of word i goes to entry offset * PLL_PCF + i * ULINT_SIZE + j. */
+static __inline void storePerSiteScores (partitionList * pr, int model, INT_TYPE v, unsigned int offset)
+{
+  unsigned long
+    words[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
+  parsimonyNumber * scores = pr->partitionData[model]->perSiteParsScores;
+  int word, bit;
+
+  VECTOR_STORE((CAST)words, v);
+
+  for (word = 0; word < LONG_INTS_PER_VECTOR; ++word)
+   {
+     /* first score entry covered by this word */
+     parsimonyNumber * dst = &scores[offset * PLL_PCF + word * ULINT_SIZE];
+
+     for (bit = 0; bit < ULINT_SIZE; ++bit)
+       dst[bit] += ((words[word] >> bit) & 1);
+   }
+}
+
+/* Move the xPars flag to p from p->next or, failing that, from p->next->next (if set). */
+static void getxnodeLocal (nodeptr p)
+{
+  nodeptr holder = p->next->xPars ? p->next : p->next->next;
+
+  if(holder->xPars)
+    {
+      p->xPars = holder->xPars;
+      holder->xPars = 0;
+    }
+
+  assert(p->next->xPars || p->next->next->xPars || p->xPars);
+}
+
+/* Record the traversal entries for the subtree hanging off p. Children are
+   handled first, so their entries precede the one of p. An entry consists of
+   the numbers of p, p->next->back and p->next->next->back; *counter advances
+   by 4 per entry, i.e. the fourth slot is not written here. A child numbered
+   above maxTips is descended into always when full is set, otherwise only
+   while its xPars flag is clear. */
+static void computeTraversalInfoParsimony(nodeptr p, int *ti, int *counter, int maxTips, pllBoolean full)
+{
+  nodeptr children[2];
+  int c, slot;
+
+  children[0] = p->next->back;
+  children[1] = p->next->next->back;
+
+  if(! p->xPars)
+    getxnodeLocal(p);
+
+  for(c = 0; c < 2; c++)
+    {
+      nodeptr child = children[c];
+
+      if(child->number > maxTips && (full || !child->xPars))
+        computeTraversalInfoParsimony(child, ti, counter, maxTips, full);
+    }
+
+  /* read *counter only now: the recursive calls above may have advanced it */
+  slot = *counter;
+  ti[slot]     = p->number;
+  ti[slot + 1] = children[0]->number;
+  ti[slot + 2] = children[1]->number;
+  *counter = slot + 4;
+}
+
+/* check whether site contains at least 2 different letters, i.e.
+   whether it will generate a score
+
+   Only character codes below the value returned by getUndetermined() for
+   the data type are counted as letters; tr supplies the tip sequences
+   (yVector) and their number (mxtips). */
+static pllBoolean isInformative(pllInstance *tr, int dataType, int site)
+{
+  int
+    undetermined = getUndetermined(dataType),
+    seen[256] = {0},
+    distinct = 0,
+    tip,
+    code;
+
+  const unsigned int
+    *bitVector = getBitVector(dataType);
+
+  /* mark every character code that occurs at this site in some tip */
+  for(tip = 1; tip <= tr->mxtips; tip++)
+    {
+      unsigned char
+        letter = tr->yVector[tip][site];
+
+      seen[letter] = 1;
+      assert(bitVector[letter] > 0);
+    }
+
+  /* count the distinct codes below the undetermined one */
+  for(code = 0; code < undetermined; code++)
+    {
+      if(seen[code] > 0)
+        distinct++;
+    }
+
+  /* informative means more than one distinct letter */
+  return (distinct > 1) ? PLL_TRUE : PLL_FALSE;
+}
+
+/** @brief Write the accumulated words of all states into one slot of a tip's vectors
+ *
+ *  Stores values[s] at tips[s][slot] for every state s and resets values[s] to 0.
+ */
+static void flushCompressedValues(parsimonyNumber **tips, parsimonyNumber *values, size_t states, size_t slot)
+{
+  size_t
+    s;
+
+  for(s = 0; s < states; s++)
+    {
+      tips[s][slot] = values[s];
+      values[s] = 0;
+    }
+}
+
+/** @brief Build the bit-packed parsimony vectors of all tips from the informative sites
+ *
+ *  For every partition the informative patterns in [lower, upper) are expanded by
+ *  their weight (tr->aliaswgt) and packed PLL_PCF site copies per word, with one row
+ *  of words per state. A freshly allocated, zeroed parsVect provides room for
+ *  2 * mxtips nodes; the rows of tip number t (1-based) are filled here, all other
+ *  nodes stay zero. Bits beyond the last site copy, including whole padding words,
+ *  are set in every state row. The number of words per row is stored in
+ *  parsimonyLength. Finally tr->parsimonyScore is allocated for 2 * mxtips nodes
+ *  and zeroed.
+ *
+ *  @param tr             PLL instance (mxtips, aliaswgt, yVector; receives parsimonyScore)
+ *  @param pr             partition list (receives parsVect, parsimonyLength and,
+ *                        optionally, perSiteParsScores)
+ *  @param informative    per-pattern flags; non-zero marks a pattern to be packed
+ *  @param perSiteScores  if non-zero, a zeroed perSiteParsScores array of 'width'
+ *                        entries is allocated for every partition
+ *
+ *  NOTE(review): none of the allocations is checked for failure, and pointers stored
+ *  previously in perSiteParsScores are overwritten without being released here.
+ */
+static void compressDNA(pllInstance *tr, partitionList *pr, int *informative, int perSiteScores)
+{
+  const size_t
+    nodeSlots = 2 * (size_t)tr->mxtips;
+
+  size_t
+    part,
+    n;
+
+  for(part = 0; part < (size_t) pr->numberOfPartitions; part++)
+    {
+      const size_t
+        states = (size_t)pr->partitionData[part]->states,
+        lower  = pr->partitionData[part]->lower,
+        upper  = pr->partitionData[part]->upper;
+
+      size_t
+        s,
+        site,
+        tip,
+        weightSum = 0,
+        words,
+        paddedWords,
+        cells;
+
+      parsimonyNumber
+        **tipRows = (parsimonyNumber **)rax_malloc(states * sizeof(parsimonyNumber*)),
+        *pending  = (parsimonyNumber *)rax_malloc(states * sizeof(parsimonyNumber)),
+        *vect;
+
+      /* total number of site copies that have to be stored for this partition */
+      for(site = lower; site < upper; site++)
+        {
+          if(informative[site])
+            weightSum += (size_t)tr->aliaswgt[site];
+        }
+
+      /* words needed per state row, rounded up */
+      words = weightSum / PLL_PCF;
+      if(weightSum % PLL_PCF != 0)
+        words++;
+
+#if (defined(__SSE3) || defined(__AVX))
+      /* round the row length up to a whole number of SIMD vectors */
+      if(words % INTS_PER_VECTOR != 0)
+        paddedWords = words + (INTS_PER_VECTOR - (words % INTS_PER_VECTOR));
+      else
+        paddedWords = words;
+#else
+      paddedWords = words;
+#endif
+
+      cells = paddedWords * states * nodeSlots;
+
+      rax_posix_memalign ((void **) &(pr->partitionData[part]->parsVect), PLL_BYTE_ALIGNMENT, cells * sizeof(parsimonyNumber));
+
+      if (perSiteScores)
+        {
+          rax_posix_memalign ((void **) &(pr->partitionData[part]->perSiteParsScores), PLL_BYTE_ALIGNMENT, (size_t)pr->partitionData[part]->width* sizeof (parsimonyNumber));
+
+          for (n = 0; n < (size_t)pr->partitionData[part]->width; ++n)
+            pr->partitionData[part]->perSiteParsScores[n] = 0;
+        }
+
+      vect = pr->partitionData[part]->parsVect;
+
+      for(n = 0; n < cells; n++)
+        vect[n] = 0;
+
+      for(tip = 0; tip < (size_t)tr->mxtips; tip++)
+        {
+          size_t
+            copy,
+            slot = 0,   /* next word of the tip rows to be written */
+            bit  = 0;   /* next free bit inside the pending words  */
+
+          /* rows of node (tip + 1): one row of paddedWords words per state */
+          for(s = 0; s < states; s++)
+            {
+              tipRows[s] = &(vect[(paddedWords * states * (tip + 1)) + (paddedWords * s)]);
+              pending[s] = 0;
+            }
+
+          for(site = lower; site < upper; site++)
+            {
+              const unsigned int
+                *bitValue;
+
+              parsimonyNumber
+                value;
+
+              if(!informative[site])
+                continue;
+
+              bitValue = getBitVector(pr->partitionData[part]->dataType);
+              value    = bitValue[tr->yVector[tip + 1][site]];
+
+              /* a pattern of weight w contributes w identical bit columns */
+              for(copy = 0; copy < (size_t)tr->aliaswgt[site]; copy++)
+                {
+                  for(s = 0; s < states; s++)
+                    {
+                      if(value & mask32[s])
+                        pending[s] |= mask32[bit];
+                    }
+
+                  if(++bit == PLL_PCF)
+                    {
+                      flushCompressedValues(tipRows, pending, states, slot);
+                      bit = 0;
+                      slot++;
+                    }
+                }
+            }
+
+          /* fill the unused tail: every remaining bit is set in every state row */
+          while(slot < paddedWords)
+            {
+              for(; bit < PLL_PCF; bit++)
+                for(s = 0; s < states; s++)
+                  pending[s] |= mask32[bit];
+
+              flushCompressedValues(tipRows, pending, states, slot);
+              bit = 0;
+              slot++;
+            }
+        }
+
+      pr->partitionData[part]->parsimonyLength = paddedWords;
+
+      rax_free(tipRows);
+      rax_free(pending);
+    }
+
+  rax_posix_memalign ((void **) &(tr->parsimonyScore), PLL_BYTE_ALIGNMENT, sizeof(unsigned int) * nodeSlots);
+
+  for(n = 0; n < nodeSlots; n++)
+    tr->parsimonyScore[n] = 0;
+}
+
+/** @brief Mark every alignment pattern as parsimony informative (1) or uninformative (0)
+ *
+ *  For each partition, every pattern index in [lower, upper) is passed to
+ *  isInformative() together with the partition's data type, and the outcome is
+ *  written to @p informative at the same index.
+ *
+ *  @param tr           PLL instance, handed on to isInformative()
+ *  @param pr           partition list supplying the pattern ranges and data types
+ *  @param informative  output array indexed by pattern; the caller must provide
+ *                      room for every index covered by the partitions
+ */
+static void determineUninformativeSites(pllInstance *tr, partitionList *pr, int *informative)
+{
+  int 
+    model,
+    i;
+
+  /* 
+     Not all characters are useful in constructing a parsimony tree. 
+     Invariant characters, those that have the same state in all taxa, 
+     are obviously useless and are ignored by the method. Characters in 
+     which a state occurs in only one taxon are also ignored. 
+     All these characters are called parsimony uninformative.
+
+     Alternative definition: informative columns contain at least two types
+     of nucleotides, and each nucleotide must appear at least twice in each 
+     column. Kind of a pain if we intend to check for this when using, e.g.,
+     ambiguous DNA encoding.
+  */
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      for(i = pr->partitionData[model]->lower; i < pr->partitionData[model]->upper; i++)
+        informative[i] = isInformative(tr, pr->partitionData[model]->dataType, i) ? 1 : 0;
+    }
+}
+
+/** @brief Build (or rebuild) the parsimony data structures of an instance
+ *
+ *  Releases the parsVect of every partition and tr->parsimonyScore, determines the
+ *  informative patterns, packs them with compressDNA(), marks for every inner node
+ *  ring (node numbers mxtips + 1 ... 2 * mxtips - 1) the first ring element with
+ *  xPars = 1 and the other two with xPars = 0, and allocates the traversal
+ *  descriptor tr->ti with 4 * mxtips ints.
+ *
+ *  @param tr             PLL instance
+ *  @param pr             partition list
+ *  @param perSiteScores  forwarded to compressDNA()
+ *
+ *  NOTE(review): the pointers released at the start must be NULL or valid on the
+ *  first call -- confirm against the code that creates the instance/partitions.
+ *  NOTE(review): tr->ti is assigned without releasing an earlier allocation, and
+ *  perSiteParsScores is not released here either; looks like a leak when this
+ *  function is called more than once -- confirm against the matching free routine.
+ */
+void pllInitParsimonyStructures(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores)
+{
+  int
+    *informative = (int *)rax_malloc(sizeof(int) * (size_t)tr->originalCrunchedLength),
+    part,
+    nodeNumber;
+
+  /* drop the vectors left behind by an earlier initialisation */
+  for (part = 0; part < pr->numberOfPartitions; part++)
+    rax_free (pr->partitionData[part]->parsVect);
+
+  rax_free (tr->parsimonyScore);
+
+  /* classify the patterns, pack the informative ones, then drop the flags */
+  determineUninformativeSites(tr, pr, informative);
+  compressDNA(tr, pr, informative, perSiteScores);
+  rax_free(informative);
+
+  /* inner nodes: only the first element of each ring is flagged via xPars */
+  for(nodeNumber = tr->mxtips + 1; nodeNumber < tr->mxtips + tr->mxtips; nodeNumber++)
+    {
+      nodeptr
+        inner = tr->nodep[nodeNumber];
+
+      inner->xPars             = 1;
+      inner->next->xPars       = 0;
+      inner->next->next->xPars = 0;
+    }
+
+  tr->ti = (int*)rax_malloc(sizeof(int) * 4 * (size_t)tr->mxtips);
+}
+/** @brief Update parsimony state vectors and scores for the nodes listed in tr->ti
+ *
+ *  tr->ti is read as follows: ti[0] is the number of used ints; starting at
+ *  index 4, every group of four ints describes one update, of which the first
+ *  three are read here as node numbers p (node to update), q and r (its inputs).
+ *  For every partition the state rows of q and r are combined into the rows of p,
+ *  and tr->parsimonyScore[p] is set to the number of positions counted during
+ *  the combination plus the scores already stored for q and r.
+ *
+ *  The cases for 2, 4 and 20 states are specialised copies of the generic
+ *  default case, which handles any number of states up to 32.
+ *
+ *  @param tr             PLL instance (provides ti, parsimonyScore)
+ *  @param pr             partition list (provides parsVect, states, parsimonyLength)
+ *  @param perSiteScores  if non-zero, storePerSiteScores() is called for every
+ *                        processed vector chunk
+ */
+static void newviewParsimonyIterativeFast(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores)
+{    
+  INT_TYPE
+    allOne = SET_ALL_BITS_ONE;
+
+  int 
+    model,
+    *ti = tr->ti,
+    count = ti[0], /* number of used ints in the traversal descriptor */
+    index; 
+  /* ti[0..3] are not update entries; the first entry starts at ti[4] */
+  for(index = 4; index < count; index += 4)
+    {      
+      unsigned int
+        totalScore = 0;
+
+      size_t
+        pNumber = (size_t)ti[index],
+        qNumber = (size_t)ti[index + 1],
+        rNumber = (size_t)ti[index + 2];
+      
+      for(model = 0; model < pr->numberOfPartitions; model++)
+        {
+          size_t
+            k,
+            states = pr->partitionData[model]->states,
+            width = pr->partitionData[model]->parsimonyLength;
+            
+          unsigned int  
+            i;      
+                 
+          switch(states)
+            {
+            case 2:       
+              {
+                parsimonyNumber
+                  *left[2],
+                  *right[2],
+                  *this[2];
+
+                for(k = 0; k < 2; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+                  }
+                /* each node owns 'states' consecutive rows of 'width' words, one row per state */
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    INT_TYPE
+                      s_r, s_l, v_N,
+                      l_A, l_C,
+                      v_A, v_C;          
+                    /* per state X: l_X = left AND right, v_X = left OR right */
+                    s_l = VECTOR_LOAD((CAST)(&left[0][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[0][i]));
+                    l_A = VECTOR_BIT_AND(s_l, s_r);
+                    v_A = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[1][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[1][i]));
+                    l_C = VECTOR_BIT_AND(s_l, s_r);
+                    v_C = VECTOR_BIT_OR(s_l, s_r);                                                                
+                    /* v_N: OR over all states of the per-state ANDs */
+                    v_N = VECTOR_BIT_OR(l_A, l_C);
+                    /* NOTE(review): assuming VECTOR_AND_NOT(a, b) computes (~a) & b, the rows
+                       of p receive the AND at positions where q and r share a state and the
+                       OR at all other positions -- confirm against the macro definition */
+                    VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
+                    VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));                                                                    
+                    /* under the same assumption v_N becomes its complement, so the popcount
+                       below counts the positions at which q and r share no state */
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);            
+                    if (perSiteScores)
+                       storePerSiteScores (pr, model, v_N, i);
+                  }
+              }
+              break;
+            case 4: /* same scheme as case 2, written out for four states */
+              {
+                parsimonyNumber
+                  *left[4],
+                  *right[4],
+                  *this[4];
+
+                for(k = 0; k < 4; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+                  }
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    INT_TYPE
+                      s_r, s_l, v_N,
+                      l_A, l_C, l_G, l_T,
+                      v_A, v_C, v_G, v_T;                
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[0][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[0][i]));
+                    l_A = VECTOR_BIT_AND(s_l, s_r);
+                    v_A = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[1][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[1][i]));
+                    l_C = VECTOR_BIT_AND(s_l, s_r);
+                    v_C = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[2][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[2][i]));
+                    l_G = VECTOR_BIT_AND(s_l, s_r);
+                    v_G = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    s_l = VECTOR_LOAD((CAST)(&left[3][i]));
+                    s_r = VECTOR_LOAD((CAST)(&right[3][i]));
+                    l_T = VECTOR_BIT_AND(s_l, s_r);
+                    v_T = VECTOR_BIT_OR(s_l, s_r);
+                    
+                    v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));                                
+                    
+                    VECTOR_STORE((CAST)(&this[0][i]), VECTOR_BIT_OR(l_A, VECTOR_AND_NOT(v_N, v_A)));
+                    VECTOR_STORE((CAST)(&this[1][i]), VECTOR_BIT_OR(l_C, VECTOR_AND_NOT(v_N, v_C)));
+                    VECTOR_STORE((CAST)(&this[2][i]), VECTOR_BIT_OR(l_G, VECTOR_AND_NOT(v_N, v_G)));
+                    VECTOR_STORE((CAST)(&this[3][i]), VECTOR_BIT_OR(l_T, VECTOR_AND_NOT(v_N, v_T)));                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);  
+                    
+                    if (perSiteScores)
+                       storePerSiteScores (pr, model, v_N, i);
+                  }
+              }
+              break;
+            case 20: /* same scheme, looping over the 20 state rows */
+              {
+                parsimonyNumber
+                  *left[20],
+                  *right[20],
+                  *this[20];
+
+                for(k = 0; k < 20; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    size_t j;
+                    
+                    INT_TYPE
+                      s_r, s_l, 
+                      v_N = SET_ALL_BITS_ZERO,
+                      l_A[20], 
+                      v_A[20];           
+                    
+                    for(j = 0; j < 20; j++)
+                      {
+                        s_l = VECTOR_LOAD((CAST)(&left[j][i]));
+                        s_r = VECTOR_LOAD((CAST)(&right[j][i]));
+                        l_A[j] = VECTOR_BIT_AND(s_l, s_r);
+                        v_A[j] = VECTOR_BIT_OR(s_l, s_r);
+                        
+                        v_N = VECTOR_BIT_OR(v_N, l_A[j]);
+                      }
+                    
+                    for(j = 0; j < 20; j++)                 
+                      VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);
+
+                    if (perSiteScores)
+                       storePerSiteScores (pr, model, v_N, i);
+                  }
+              }
+              break;
+            default: /* generic version for any other number of states, at most 32 (asserted below) */
+              {
+                parsimonyNumber
+                  *left[32], 
+                  *right[32],
+                  *this[32];
+
+                assert(states <= 32);
+                
+                for(k = 0; k < states; k++)
+                  {
+                    left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                    right[k] = &(pr->partitionData[model]->parsVect[(width * states * rNumber) + width * k]);
+                    this[k]  = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+                  }
+
+                for(i = 0; i < width; i += INTS_PER_VECTOR)
+                  {               
+                    size_t j;
+                    
+                    INT_TYPE
+                      s_r, s_l, 
+                      v_N = SET_ALL_BITS_ZERO,
+                      l_A[32], 
+                      v_A[32];           
+                    
+                    for(j = 0; j < states; j++)
+                      {
+                        s_l = VECTOR_LOAD((CAST)(&left[j][i]));
+                        s_r = VECTOR_LOAD((CAST)(&right[j][i]));
+                        l_A[j] = VECTOR_BIT_AND(s_l, s_r);
+                        v_A[j] = VECTOR_BIT_OR(s_l, s_r);
+                        
+                        v_N = VECTOR_BIT_OR(v_N, l_A[j]);
+                      }
+                    
+                    for(j = 0; j < states; j++)             
+                      VECTOR_STORE((CAST)(&this[j][i]), VECTOR_BIT_OR(l_A[j], VECTOR_AND_NOT(v_N, v_A[j])));                                                                    
+                    
+                    v_N = VECTOR_AND_NOT(v_N, allOne);
+                    
+                    totalScore += vectorPopcount(v_N);
+
+                    if (perSiteScores)
+                       storePerSiteScores (pr, model, v_N, i);
+                  }                             
+              }
+            }            
+        }
+      tr->parsimonyScore[pNumber] = totalScore + tr->parsimonyScore[rNumber] + tr->parsimonyScore[qNumber];      
+    }
+}
+
+/** @brief Parsimony score across the branch whose end nodes are stored in tr->ti[1] and tr->ti[2]
+ *
+ *  If the traversal descriptor contains update entries (ti[0] > 4) they are
+ *  executed first through newviewParsimonyIterativeFast(). The result is the sum
+ *  of the scores stored for the two end nodes plus, for every partition, the
+ *  popcount obtained by combining the state rows of both nodes: for each vector
+ *  chunk the per-state ANDs of the two nodes are ORed together and the result is
+ *  passed through VECTOR_AND_NOT(..., allOne) before counting.
+ *  NOTE(review): assuming VECTOR_AND_NOT(a, b) computes (~a) & b, this counts the
+ *  positions at which the two nodes share no state -- confirm against the macro.
+ *
+ *  The cases for 2, 4 and 20 states are specialised copies of the generic
+ *  default case, which handles any number of states up to 32.
+ *
+ *  @param tr             PLL instance (provides ti, parsimonyScore)
+ *  @param pr             partition list (provides parsVect, states, parsimonyLength)
+ *  @param perSiteScores  if non-zero, storePerSiteScores() is called for every
+ *                        processed vector chunk (also forwarded to the newview step)
+ *  @return the score described above
+ */
+static unsigned int evaluateParsimonyIterativeFast(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores)
+{
+  INT_TYPE 
+    allOne = SET_ALL_BITS_ONE;
+
+  size_t 
+    pNumber = (size_t)tr->ti[1],
+    qNumber = (size_t)tr->ti[2];
+
+  int
+    model;
+
+  unsigned int 
+    sum;
+
+  /* bring the vectors of both end nodes up to date before they are compared */
+  if(tr->ti[0] > 4)
+    newviewParsimonyIterativeFast(tr, pr, perSiteScores);
+
+  sum = tr->parsimonyScore[pNumber] + tr->parsimonyScore[qNumber];
+
+  for(model = 0; model < pr->numberOfPartitions; model++)
+    {
+      size_t
+        k,
+        states = pr->partitionData[model]->states,
+        width  = pr->partitionData[model]->parsimonyLength,
+        i;
+
+      switch(states)
+        {
+        case 2:
+          {
+            parsimonyNumber
+              *left[2],
+              *right[2];
+
+            /* each node owns 'states' consecutive rows of 'width' words, one row per state */
+            for(k = 0; k < 2; k++)
+              {
+                left[k]  = &(pr->partitionData[model]->parsVect[(width * 2 * qNumber) + width * k]);
+                right[k] = &(pr->partitionData[model]->parsVect[(width * 2 * pNumber) + width * k]);
+              }
+
+            for(i = 0; i < width; i += INTS_PER_VECTOR)
+              {
+                INT_TYPE
+                  l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
+                  l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),
+                  v_N = VECTOR_BIT_OR(l_A, l_C);
+
+                v_N = VECTOR_AND_NOT(v_N, allOne);
+
+                sum += vectorPopcount(v_N);
+                if (perSiteScores)
+                  storePerSiteScores (pr, model, v_N, i);
+              }
+          }
+          break;
+        case 4:
+          {
+            parsimonyNumber
+              *left[4],
+              *right[4];
+
+            for(k = 0; k < 4; k++)
+              {
+                left[k]  = &(pr->partitionData[model]->parsVect[(width * 4 * qNumber) + width * k]);
+                right[k] = &(pr->partitionData[model]->parsVect[(width * 4 * pNumber) + width * k]);
+              }
+
+            for(i = 0; i < width; i += INTS_PER_VECTOR)
+              {
+                INT_TYPE
+                  l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[0][i])), VECTOR_LOAD((CAST)(&right[0][i]))),
+                  l_C = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[1][i])), VECTOR_LOAD((CAST)(&right[1][i]))),
+                  l_G = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[2][i])), VECTOR_LOAD((CAST)(&right[2][i]))),
+                  l_T = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[3][i])), VECTOR_LOAD((CAST)(&right[3][i]))),
+                  v_N = VECTOR_BIT_OR(VECTOR_BIT_OR(l_A, l_C), VECTOR_BIT_OR(l_G, l_T));
+
+                v_N = VECTOR_AND_NOT(v_N, allOne);
+
+                sum += vectorPopcount(v_N);
+                if (perSiteScores)
+                  storePerSiteScores (pr, model, v_N, i);
+              }
+          }
+          break;
+        case 20:
+          {
+            parsimonyNumber
+              *left[20],
+              *right[20];
+
+            for(k = 0; k < 20; k++)
+              {
+                left[k]  = &(pr->partitionData[model]->parsVect[(width * 20 * qNumber) + width * k]);
+                right[k] = &(pr->partitionData[model]->parsVect[(width * 20 * pNumber) + width * k]);
+              }
+
+            for(i = 0; i < width; i += INTS_PER_VECTOR)
+              {
+                size_t
+                  j;
+
+                INT_TYPE
+                  l_A,
+                  v_N = SET_ALL_BITS_ZERO;
+
+                for(j = 0; j < 20; j++)
+                  {
+                    l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
+                    v_N = VECTOR_BIT_OR(l_A, v_N);
+                  }
+
+                v_N = VECTOR_AND_NOT(v_N, allOne);
+
+                sum += vectorPopcount(v_N);
+                if (perSiteScores)
+                  storePerSiteScores (pr, model, v_N, i);
+              }
+          }
+          break;
+        default:
+          {
+            parsimonyNumber
+              *left[32],
+              *right[32];
+
+            /* the pointer arrays above hold at most 32 rows */
+            assert(states <= 32);
+
+            for(k = 0; k < states; k++)
+              {
+                left[k]  = &(pr->partitionData[model]->parsVect[(width * states * qNumber) + width * k]);
+                right[k] = &(pr->partitionData[model]->parsVect[(width * states * pNumber) + width * k]);
+              }
+
+            for(i = 0; i < width; i += INTS_PER_VECTOR)
+              {
+                size_t
+                  j;
+
+                INT_TYPE
+                  l_A,
+                  v_N = SET_ALL_BITS_ZERO;
+
+                for(j = 0; j < states; j++)
+                  {
+                    l_A = VECTOR_BIT_AND(VECTOR_LOAD((CAST)(&left[j][i])), VECTOR_LOAD((CAST)(&right[j][i])));
+                    v_N = VECTOR_BIT_OR(l_A, v_N);
+                  }
+
+                v_N = VECTOR_AND_NOT(v_N, allOne);
+
+                sum += vectorPopcount(v_N);
+                if (perSiteScores)
+                  storePerSiteScores (pr, model, v_N, i);
+              }
+          }
+        }
+    }
+
+  return sum;
+}
+
+/** @brief Evaluate the parsimony score across the branch between p and p->back
+ *
+ *  Fills the traversal descriptor tr->ti: ti[1] and ti[2] receive the node
+ *  numbers of p and p->back, update entries are appended from index 4 on by
+ *  computeTraversalInfoParsimony(), and ti[0] receives the resulting fill level.
+ *  The descriptor is then handed to evaluateParsimonyIterativeFast().
+ *
+ *  An end node is traversed only if its number is larger than tr->mxtips; unless
+ *  @p full is set, it is additionally skipped when its xPars flag is set.
+ *
+ *  @param tr             PLL instance
+ *  @param pr             partition list
+ *  @param p              node at one end of the branch to evaluate
+ *  @param full           if non-zero, traverse regardless of the xPars flags
+ *  @param perSiteScores  forwarded to evaluateParsimonyIterativeFast()
+ *  @return the value returned by evaluateParsimonyIterativeFast()
+ */
+unsigned int pllEvaluateParsimony(pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean full, pllBoolean perSiteScores)
+{
+  /* NOTE(review): the reason for the volatile qualifier is not visible here; kept as is */
+  volatile unsigned int result;
+  nodeptr q = p->back;
+  int
+    *ti = tr->ti,
+    counter = 4;   /* first free position for update entries */
+
+  ti[1] = p->number;
+  ti[2] = q->number;
+
+  if(p->number > tr->mxtips && (full || !p->xPars))
+    computeTraversalInfoParsimony(p, ti, &counter, tr->mxtips, full);
+
+  if(q->number > tr->mxtips && (full || !q->xPars))
+    computeTraversalInfoParsimony(q, ti, &counter, tr->mxtips, full);
+
+  ti[0] = counter;
+
+  result = evaluateParsimonyIterativeFast(tr, pr, perSiteScores);
+
+  return result;
+}
diff --git a/pllrepo/src/pll.h b/pllrepo/src/pll.h
new file mode 100644
index 0000000..065ddc5
--- /dev/null
+++ b/pllrepo/src/pll.h
@@ -0,0 +1,1692 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ *
+ * ABSTRACT
+ * 
+ * PLL is a highly optimized, parallelized software library to ease the
+ * development of new software tools dealing with phylogenetic inference. Among
+ * the functions included in PLL are 
+ *
+ * DOCUMENTATION
+ *
+ * Extensive documentation for using PLL is available online at
+ * 
+ *                 http://www.libpll.org
+ *
+ *
+ * USAGE
+ *
+ * To use PLL, 
+ *
+ * @file pll.h
+ * @brief Data structures for tree and model 
+ *
+ * @author Tomas Flouri
+ * @author Fernando Izquierdo-Carrasco
+ * @author Andre Aberer
+ * @author Alexandros Stamatakis
+ */
+
+#ifndef __pll__
+#define __pll__
+
+#include <stdint.h>
+#include <stdio.h>
+#include <errno.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __MIC_NATIVE
+#define PLL_BYTE_ALIGNMENT 64
+#define PLL_VECTOR_WIDTH 8
+#elif defined (__AVX)
+
+#include <xmmintrin.h>
+#include <immintrin.h>
+#include <pmmintrin.h>
+
+#define PLL_BYTE_ALIGNMENT 32
+#define PLL_VECTOR_WIDTH 4
+
+#elif defined (__SSE3)
+
+#include <xmmintrin.h>
+#include <pmmintrin.h>
+
+#define PLL_BYTE_ALIGNMENT 16
+#define PLL_VECTOR_WIDTH 2
+
+#else
+#define PLL_BYTE_ALIGNMENT 1
+#define PLL_VECTOR_WIDTH 1
+#endif
+
+#ifdef _MSC_VER
+	#define PLL_ALIGN_BEGIN __declspec(align(PLL_BYTE_ALIGNMENT))
+	#define PLL_ALIGN_END
+#else
+	#define PLL_ALIGN_BEGIN
+	#define PLL_ALIGN_END __attribute__((aligned(PLL_BYTE_ALIGNMENT)))
+#endif
+
+
+#include "stack.h"
+#include "newick.h"
+#include "queue.h"
+
+#define PLL_MAX_TIP_EV                          0.999999999 /* max tip vector value, sum of EVs needs to be smaller than 1.0, otherwise the numerics break down */
+#define PLL_MAX_LOCAL_SMOOTHING_ITERATIONS      32          /**< maximum number of smoothing iterations per insert */
+#define PLL_ITERATIONS                          10          /* maximum number of iterations per insert */
+#define PLL_NEWZPERCYCLE                        10           /* iterations of makenewz per tree traversal */
+#define PLL_NMLNGTH                             256         /* number of characters in species name */
+#define PLL_DELTAZ                              0.00001     /* test of net branch length change in update */
+#define PLL_DEFAULTZ                            0.9         /* value of z assigned as starting point */
+#define PLL_UNLIKELY                            -1.0E300    /* low likelihood for initialization */
+#define PLL_SUMMARIZE_LENGTH                    -3
+#define PLL_SUMMARIZE_LH                        -2
+#define PLL_NO_BRANCHES                         -1
+#define PLL_MASK_LENGTH                         32
+#define PLL_ZMIN                                1.0E-15  /* max branch prop. to -log(PLL_ZMIN) (= 34) */
+#define PLL_ZMAX                                (1.0 - 1.0E-6) /* min branch prop. to 1.0-zmax (= 1.0E-6) */
+#define PLL_TWOTOTHE256                         115792089237316195423570985008687907853269984665640564039457584007913129639936.0  /*  2**256 (exactly)  */
+#define PLL_MINLIKELIHOOD                       (1.0/PLL_TWOTOTHE256)
+#define PLL_MINUSMINLIKELIHOOD                  -PLL_MINLIKELIHOOD
+
+
+#define PLL_FORMAT_PHYLIP                       1 
+#define PLL_FORMAT_FASTA                        2
+#define PLL_FORMAT_NEWICK                       3
+
+#define PLL_NNI_P_NEXT                          1       /**< Use p->next for the NNI move */
+#define PLL_NNI_P_NEXTNEXT                      2       /**< Use p->next->next for the NNI move */
+
+#define PLL_BADREAR                             -1
+
+#define PLL_NUM_BRANCHES                        1024
+
+#define PLL_TRUE                                1
+#define PLL_FALSE                               0
+
+#define PLL_REARRANGE_SPR                       0
+#define PLL_REARRANGE_TBR                       1
+#define PLL_REARRANGE_NNI                       2
+
+#define PLL_AA_SCALE                            10.0
+#define PLL_AA_SCALE_PLUS_EPSILON               10.001
+
+/* ALPHA_MIN is critical -> numerical instability, eg for 4 discrete rate cats                    */
+/* and alpha = 0.01 the lowest rate r_0 is                                                        */
+/* 0.00000000000000000000000000000000000000000000000000000000000034878079110511010487             */
+/* which leads to numerical problems Table for alpha settings below:                              */
+/*                                                                                                */
+/* 0.010000 0.00000000000000000000000000000000000000000000000000000000000034878079110511010487    */
+/* 0.010000 yielded nasty numerical bugs in at least one case !                                   */
+/* 0.020000 0.00000000000000000000000000000044136090435925743185910935350715027016962154188875    */
+/* 0.030000 0.00000000000000000000476844846859006690412039180149775802624789852441798419292220    */
+/* 0.040000 0.00000000000000049522423236954066431210260930029681736928018820007024736185030633    */
+/* 0.050000 0.00000000000050625351310359203371872643495343928538368616365517027588794007897377    */
+/* 0.060000 0.00000000005134625283884191118711474021861409372524676086868566926568746566772461    */
+/* 0.070000 0.00000000139080650074206434685544624965062437960128249869740102440118789672851562    */
+/* 0.080000 0.00000001650681201563587066858709818343436959153791576682124286890029907226562500    */
+/* 0.090000 0.00000011301977332931251259273962858978301859735893231118097901344299316406250000    */
+/* 0.100000 0.00000052651925834844387815526344648331402709118265192955732345581054687500000000    */
+
+#define PLL_ALPHA_MIN                           0.02
+#define PLL_ALPHA_MAX                           1000.0
+
+#define PLL_RATE_MIN                            0.0000001
+#define PLL_RATE_MAX                            1000000.0
+
+#define PLL_LG4X_RATE_MIN                       0.0000001
+#define PLL_LG4X_RATE_MAX                       1000.0
+
+#define PLL_FREQ_MIN                            0.001
+
+#define PLL_NUM_AA_STATES                       20
+#define PLL_NUM_DNA_STATES                      4
+
+/* 
+   Note on PLL_FREQ_MIN (defined above): previous values ranged between 0.001 and 0.000001.
+
+   The lower bound exists TO AVOID NUMERICAL PROBLEMS WHEN FREQ == 0 IN PARTITIONED MODELS, ESPECIALLY WITH AA.
+   The previous value of FREQ_MIN was 0.000001, but this seemed to cause problems with some 
+   of the 7-state secondary structure models on some rather exotic small toy test datasets;
+   on the other hand, 0.001 caused problems with some of the 16-state secondary structure models.
+
+   For some reason the frequency settings seem to be repeatedly causing numerical problems.
+*/
+
+#define PLL_ITMAX                               100    /* max number of iterations in Brent's algorithm */
+
+/* NOTE: PLL_SHFT expands to three bare statements including the final ';'.
+   It must not be used as the unbraced body of an if/else/loop. The expansion
+   is kept unchanged because existing call sites may rely on the built-in ';'. */
+#define PLL_SHFT(a,b,c,d)                       (a)=(b);(b)=(c);(c)=(d);
+#define PLL_SIGN(a,b)                           ((b) > 0.0 ? fabs(a) : -fabs(a))   /* fabs(): <math.h> must be included where this is used */
+#define PLL_ABS(x)                              (((x)<0)   ?  (-(x)) : (x))
+#define PLL_MIN(x,y)                            (((x)<(y)) ?    (x)  : (y))
+#define PLL_MAX(x,y)                            (((x)>(y)) ?    (x)  : (y))
+/* The swap macros evaluate each argument more than once: pass plain lvalues
+   without side effects. PLL_SWAP relies on the __typeof__ compiler extension. */
+#define PLL_SWAP(x,y)                           do{ __typeof__ (x) _t = (x); (x) = (y); (y) = _t; } while(0)
+#define PLL_SWAP_PTR(x,y) do{ char* _t = (x); (x) = (y); (y) = _t; } while(0)
+#define PLL_SWAP_INT(x,y) do{ int _t = (x); (x) = (y); (y) = _t; } while(0)
+
+/* whole expansion parenthesised so that the macro can be used inside larger expressions */
+#define PLL_POINT_GAMMA(prob,alpha,beta)        (PointChi2(prob,2.0*(alpha))/(2.0*(beta)))
+
+#define PLL_LIB_NAME                            "PLL"
+#define PLL_LIB_VERSION                         "1.0.1"
+#define PLL_LIB_DATE                            "November 3 2014"
+
+/* amino acid substitution models: consecutive ids 0 .. PLL_NUM_PROT_MODELS - 1 (see pInfo::protModels) */
+#define PLL_DAYHOFF                             0
+#define PLL_DCMUT                               1
+#define PLL_JTT                                 2
+#define PLL_MTREV                               3
+#define PLL_WAG                                 4
+#define PLL_RTREV                               5
+#define PLL_CPREV                               6
+#define PLL_VT                                  7
+#define PLL_BLOSUM62                            8
+#define PLL_MTMAM                               9
+#define PLL_LG                                  10
+#define PLL_MTART                               11
+#define PLL_MTZOA                               12
+#define PLL_PMB                                 13
+#define PLL_HIVB                                14
+#define PLL_HIVW                                15
+#define PLL_JTTDCMUT                            16
+#define PLL_FLU                                 17 
+#define PLL_AUTO                                18  /* automatic selection; the chosen model is kept in pInfo::autoProtModels */
+#define PLL_LG4M                                 19
+#define PLL_LG4X                                20
+#define PLL_GTR                                 21  /* GTR always needs to be the last one */
+#define PLL_NUM_PROT_MODELS                     22  /* number of ids above, i.e. PLL_GTR + 1; keep in sync when adding a model */
+
+/* information criteria for auto protein model selection (presumably the values of autoProteinSelectionType - TODO confirm) */
+#define PLL_AUTO_ML   0
+#define PLL_AUTO_BIC  1
+#define PLL_AUTO_AIC  2
+#define PLL_AUTO_AICC 3
+
+/* bipartition stuff */
+#define PLL_BIPARTITIONS_RF                     4
+
+/* scenarios for likelihood computation; stored in traversalInfo::tipCase */
+#define PLL_TIP_TIP                             0  /* both q and r are tips */
+#define PLL_TIP_INNER                           1  /* q is a tip, r is an inner node */
+#define PLL_INNER_INNER                         2  /* both q and r are inner nodes */
+
+
+/* available data types in PLL (see pInfo::dataType) */
+#define PLL_MIN_MODEL                          -1  /* sentinel one below the first data type id */
+#define PLL_BINARY_DATA                         0
+#define PLL_DNA_DATA                            1
+#define PLL_AA_DATA                             2
+#define PLL_SECONDARY_DATA                      3
+#define PLL_SECONDARY_DATA_6                    4
+#define PLL_SECONDARY_DATA_7                    5
+#define PLL_GENERIC_32                          6
+#define PLL_GENERIC_64                          7
+#define PLL_MAX_MODEL                           8  /* sentinel one above the last data type id */
+
+#define PLL_SEC_6_A                             0  /* PLL_SEC_*: presumably ids for secondaryStructureModel - TODO confirm */
+#define PLL_SEC_6_B                             1
+#define PLL_SEC_6_C                             2
+#define PLL_SEC_6_D                             3
+#define PLL_SEC_6_E                             4
+
+#define PLL_SEC_7_A                             5
+#define PLL_SEC_7_B                             6
+#define PLL_SEC_7_C                             7
+#define PLL_SEC_7_D                             8
+#define PLL_SEC_7_E                             9
+#define PLL_SEC_7_F                             10
+
+#define PLL_SEC_16                              11
+#define PLL_SEC_16_A                            12
+#define PLL_SEC_16_B                            13
+#define PLL_SEC_16_C                            14
+#define PLL_SEC_16_D                            15
+#define PLL_SEC_16_E                            16
+#define PLL_SEC_16_F                            17
+#define PLL_SEC_16_I                            18
+#define PLL_SEC_16_J                            19
+#define PLL_SEC_16_K                            20
+
+#define PLL_ORDERED_MULTI_STATE                 0  /* PLL_*_MULTI_STATE: presumably ids for multiStateModel - TODO confirm */
+#define PLL_MK_MULTI_STATE                      1
+#define PLL_GTR_MULTI_STATE                     2
+
+
+/* available models of rate heterogeneity in PLL (values for the rateHetModel fields) */
+#define PLL_CAT                                 0
+#define PLL_GAMMA                               1
+
+/* recomp: marker values used by recompVectors */
+#define PLL_SLOT_UNUSED                        -2  /* value to mark an available vector */
+#define PLL_NODE_UNPINNED                      -3  /* marks an inner node as not available in RAM */
+#define PLL_INNER_NODE_INIT_STLEN              -1  /* initialization */
+
+#define PLL_MIN_RECOM_FRACTION     0.1 /* at least this fraction (0.1, i.e. 10%) of inner nodes will be allocated in RAM */
+#define PLL_MAX_RECOM_FRACTION     1.0 /* always 1, just there for boundary checks */
+
+
+typedef  int pllBoolean; /* boolean flag type, stored as a plain int */
+
+/** @brief PLL instance attribute structure; the tree-topology structure declares fields with the same names */
+typedef struct
+{
+  int rateHetModel;       /**< rate heterogeneity model, presumably PLL_CAT or PLL_GAMMA */
+  int fastScaling;        /**< presumably a boolean flag (the tree field of the same name is a pllBoolean) */
+  int saveMemory;         /**< presumably a boolean flag (the tree field of the same name is a pllBoolean) */
+  int useRecom;           /**< presumably a boolean flag: recomputation of ancestral vectors on/off */
+  long randomNumberSeed;  /**< seed value for random number generation */
+  int numberOfThreads;    /**< number of threads */
+} pllInstanceAttr;
+
+/** @brief Stores the recomputation state of likelihood vectors (which slot holds which inner node) */
+typedef struct
+{
+  int numVectors;      /**< Number of inner vectors allocated in RAM */
+  int *iVector;        /**< size: numVectors; each entry stores a node id or PLL_SLOT_UNUSED */
+  int *iNode;          /**< size: number of inner nodes; each entry stores a slot id or PLL_NODE_UNPINNED */
+  int *stlen;          /**< Number of tips behind the current orientation of the indexed inner node (subtree size/cost) */
+  int *unpinnable;     /**< size: numVectors; TRUE if we don't need the vector */
+  int maxVectorsUsed;  /**< NOTE(review): presumably the highest number of slots in use so far - confirm */
+  pllBoolean allSlotsBusy; /**< on if all slots contain an ancestral node (the usual case after first full traversal) */
+} recompVectors;
+/* E recomp */
+
+/** @brief Unsigned integer type for hash numbers; noderec::hash is declared with it.
+ * @todo check whether other users exist beyond noderec::hash  */
+ 
+typedef unsigned int hashNumberType;
+
+
+
+/*typedef uint_fast32_t parsimonyNumber;*/
+
+#define PLL_PCF 32
+
+/** @brief Bipartition entry; entries are chained through \a next (the table that owns them is not visible here)
+ * @todo add explanation of all hash tables  */
+typedef struct pllBipartitionEntry
+{
+  unsigned int *bitVector;             /* presumably the bipartition encoded as a bit vector - TODO confirm */
+  unsigned int *treeVector;
+  unsigned int amountTips;
+  int *supportVector;
+  unsigned int bipNumber;
+  unsigned int bipNumber2;
+  unsigned int supportFromTreeset[2];  /* two counters, presumably one per tree set - TODO confirm */
+  struct pllBipartitionEntry *next;    /* next entry in the chain */
+} pllBipartitionEntry;
+
+//typedef struct
+//{
+//  hashNumberType tableSize;
+//  entry **table;
+//  hashNumberType entryCount;
+//}
+//  hashtable;
+//struct stringEnt
+//{
+//  int nodeNumber;
+//  char *word;
+//  struct stringEnt *next;
+//};
+//
+//typedef struct stringEnt stringEntry;
+//typedef struct
+//{
+//  hashNumberType tableSize;
+//  stringEntry **table;
+//}
+//  stringHashtable;
+
+typedef struct pllHashItem /* one item of a pllHashTable chain */
+{
+  void * data;                 /* payload pointer */
+  char * str;                  /* string stored with the item, presumably the key - TODO confirm */
+  struct pllHashItem * next;   /* next item in the same chain */
+} pllHashItem;
+
+typedef struct pllHashTable /* table whose Items array holds pointers to pllHashItem chains */
+{
+  unsigned int size;             /* presumably the number of slots in Items - TODO confirm */
+  struct pllHashItem ** Items;   /* array of chain heads */
+  unsigned int entries;          /* presumably the number of stored items - TODO confirm */
+} pllHashTable;
+
+
+
+
+/** @brief Per-site rate category entry: an accumulated site likelihood paired with a rate
+  *        (presumably the CAT rate applied to that site - TODO confirm)
+  */
+typedef struct ratec
+{
+  double accumulatedSiteLikelihood;
+  double rate;
+}rateCategorize;
+
+/** @brief Traversal descriptor entry.
+  * 
+  * Contains the information required to execute an operation in a step of the tree traversal.
+  * q   r
+  *  \ /
+  *   p
+  *
+  * The entry defines 2 input/parent nodes (q and r) and one output/child node (p)
+  * qz represents the branch length(s) of the branch connecting q and p
+  * rz represents the branch length(s) of the branch connecting r and p
+  * PLL_TIP_TIP     Both q and r are tips
+  * PLL_INNER_INNER Both q and r are inner nodes
+  * @note PLL_TIP_INNER   q is a tip and r is an inner node (by convention, flip q and r if required)
+  */
+typedef struct
+{
+  int tipCase;                  /**< Type of entry, must be PLL_TIP_TIP, PLL_TIP_INNER or PLL_INNER_INNER */
+  int pNumber;                  /**< should exist in some nodeptr p->number */
+  int qNumber;                  /**< should exist in some nodeptr q->number */
+  int rNumber;                  /**< should exist in some nodeptr r->number */
+  double qz[PLL_NUM_BRANCHES];  /**< branch length(s) of the branch connecting q and p */
+  double rz[PLL_NUM_BRANCHES];  /**< branch length(s) of the branch connecting r and p */
+  /* recom */
+  int slot_p;                   /**< In recomputation mode, the RAM slot index for likelihood vector of node p, otherwise unused */
+  int slot_q;                   /**< In recomputation mode, the RAM slot index for likelihood vector of node q, otherwise unused */
+  int slot_r;                   /**< In recomputation mode, the RAM slot index for likelihood vector of node r, otherwise unused */
+  /* E recom */
+} traversalInfo;
+
+/** @brief Traversal descriptor.
+  * 
+  * Holds the list of traversal steps (\a ti, \a count) together with bookkeeping fields
+  */
+typedef struct
+{
+  traversalInfo *ti;              /**< list of traversal steps */
+  int count;                      /**< number of traversal steps */
+  int functionType;               /**< NOTE(review): meaning not documented here - TODO confirm */
+  pllBoolean traversalHasChanged; /**< flag; presumably set when \a ti was rebuilt - TODO confirm */
+  pllBoolean *executeModel;       /**< array of flags, presumably one per partition - TODO confirm */
+  double  *parameterValues;
+} traversalData;
+
+/** @brief Node record structure
+  * 
+  * Each inner node is a trifurcation in the tree represented as a circular list containing 3 node records. One node record uniquely identifies a subtree, and the orientation of the likelihood vector within a node
+  *
+  * p1 -------> p2 ----> to the next node
+  * ^           |
+  * |-----p3<---|          
+  * 
+  */
+struct noderec; /* forward declaration: branchInfo below stores pointers to it before the full definition appears */
+
+/** @brief Per-branch meta-data (no branch length is stored here; lengths live in noderec::z).
+  * 
+  * Reached through noderec::bInf; the tree-topology structure also holds a branchInfo pointer
+  */
+typedef struct
+{
+  unsigned int *vector;   /* presumably a bit vector describing the branch's bipartition - TODO confirm */
+  int support;            /* support value */
+  struct noderec *oP;     /* the two node records oP and oQ, presumably the end points of the branch */
+  struct noderec *oQ;
+} branchInfo;
+
+
+
+
+
+/** @brief Linkage of partitions.
+  * 
+  * Collected in a linkageList; partitionList keeps one such list each in alphaList, rateList and freqList
+  */
+typedef struct
+{
+  pllBoolean valid;
+  int partitions;       /* presumably the number of entries in partitionList - TODO confirm */
+  int *partitionList;   /* presumably the indices of the partitions linked together - TODO confirm */
+}
+  linkageData;
+typedef struct /* array of linkageData records */
+{
+  int entries;       /* presumably the number of elements in ld - TODO confirm */
+  linkageData* ld;   /* the linkage records */
+}
+  linkageList;
+
+
+
+  /** 
+   *
+   * the data structure below is fundamental for representing trees 
+     in the library!
+
+     Inner nodes are represented by three instances of the nodeptr data structure that is linked 
+     via a cyclic list using the next pointer.
+
+     So for building an inner node of the tree we need to allocate three nodeptr 
+     data structures and link them together, e.g.:
+
+     assuming that we have allocated space for an inner node 
+     for nodeptr pointers p1, p2, p3, 
+
+     we would then link them like this:
+
+     p1->next = p2;
+     p2->next = p3;
+     p3->next = p1;
+
+     also note that the node number that identifies the inner node 
+     needs to be set to the same value.
+
+     for n taxa, tip nodes are enumerated/indexed from 1....n,
+     and inner node indices start at n+1. Assuming that we have 10 taxa 
+     and this is our first inner node, we'd initialize the number as follows:
+
+     p1->number = 11;
+     p2->number = 11;
+     p3->number = 11;
+
+     Note that the node number is important for indexing tip sequence data as well as inner likelihood vectors 
+     and that it is this number (the index) that actually gets stored in the traversal descriptor.
+
+     Tip nodes are non-cyclic nodes that simply consist of one instance/allocation of nodeptr.
+
+     if we have allocated a tip data structure nodeptr t1, 
+     we would initialize it as follows:
+
+     t1->number = 1;
+
+     t1->next = NULL;
+
+     now let's assume that we want to build a four taxon tree with tips t1, t2, t3, t4 
+     and inner nodes (p1,p2,p3) and (q1,q2,q3).
+
+     we first build the tips:
+
+     t1->number = 1;
+     t1->next = NULL;
+     
+     t2->number = 2;
+     t2->next = NULL;
+
+     t3->number = 3;
+     t3->next = NULL;
+
+     t4->number = 4;
+     t4->next = NULL;
+     
+     now the first inner node
+
+     p1->next = p2;
+     p2->next = p3;
+     p3->next = p1;    
+
+     p1->number = 5;
+     p2->number = 5;
+     p3->number = 5;
+
+     and the second inner node.
+
+     q1->next = q2;
+     q2->next = q3;
+     q3->next = q1;    
+
+     q1->number = 6;
+     q2->number = 6;
+     q3->number = 6;
+     
+     now we need to link the nodes together such that they form a tree, let's assume we want ((t1,t2), (t3, t4));
+
+     we will have to link the nodes via the so-called back pointer,
+     i.e.:
+
+     let's connect node p with t1 and t2
+
+     t1->back = p1;
+     t2->back = p2;
+
+     and vice versa:
+
+     p1->back = t1;
+     p2->back = t2;
+
+     let's connect node p with node q:
+
+     p3->back = q3;
+
+     and vice versa:
+
+     q3->back = p3;
+
+     and now let's connect node q with tips t3 and t4:
+
+     q1->back = t3;
+     q2->back = t4;
+
+     and vice versa:
+
+     t3->back = q1;
+     t4->back = q2;
+
+     What remains to be done is to set up the branch lengths.
+     Using the data structure below, we always have to store the 
+     branch length twice for each "topological branch" unfortunately.
+
+     Assuming that we are only estimating a single branch across all partitions 
+     we'd just set the first index of the branch length array z[PLL_NUM_BRANCHES].
+
+     e.g., 
+
+     t3->z[0] = q1->z[0] = 0.9;
+
+     the above operation for connecting nodes is implemented in functions hookup() which will set 
+     the back pointers of two nodes that are to be connected as well as the branch lengths.
+
+     The branchInfo data field is a pointer to a data-structure that stores meta-data and requires 
+     the tree not to change while it is being used.
+     
+     Also, this pointer needs to be set by doing a full tree traversal on the tree.
+
+     Note that q1->bInf == t3->bInf in the above example.
+
+     The hash number is used for mapping bipartitions to a hash table as described in the following paper:
+
+     A. Aberer, N. Pattengale, A. Stamatakis: "Parallelized phylogenetic post-analysis on multi-core architectures". Journal of Computational Science 1, 107-114, 2010.
+     
+     The support data field stores the support value for the branch associated with each nodeptr structure.
+     Note that support always refers to branches. 
+
+     Thus for consistency, q3->support must be equal to p3->support;
+
+     Finally, the three char fields x, xPars and xBips are very very important!
+
+     They are used to denote the presence/absence or if you want, direction of the 
+     parsimony, bipartition, or likelihood vector at a node with respect to the virtual root.
+
+     Essentially, they are just used as single presence/absence bits and ONLY for inner nodes!
+
+     When setting up new inner nodes, one of the three pointers in the cyclic list must 
+     have x = 1 and the other two x = 0;
+
+     in the above example we could set:
+
+     p1->x = 0;
+     p2->x = 0;
+     p3->x = 1;
+
+     q1->x = 0;
+     q2->x = 0;
+     q3->x = 1;
+
+     This would mean that the virtual root is located at the inner branch of the four taxon tree ((t1,t2),(t3,t4));
+
+     When we re-root the tree at some other branch we need to update the location of the x pointer that is set to 1.
+
+     This means if we root the tree at the branch leading to t1 we would set 
+
+     p1->x = 1;
+     p2->x = 0;
+     p3->x = 0;
+
+     the values for q remain unchanged since q3 is still pointing toward the root.
+
+     When we re-locate the root to branch p1 <-> t1 the fact that we have to "rotate" the x value that is set to 1
+     to another node of the cyclic list representing the abstract topological node p, also tells us that we 
+     need to re-compute the conditional likelihood array for p. 
+
+     Note that only one likelihood or parsimony array is stored per inner node, and the location of x essentially tells us which subtree 
+     it summarizes: if p1->x == 1, it summarizes subtree (t2, (t3, t4)); if p3->x == 1, the likelihood vector associated with 
+     node p summarizes subtree (t1, t2).
+
+     @todo I think we should rename the back pointer. It's not back, it can be forward depending on the orientation. We should rename it to outer. Back is too confusing, I would assume it's the opposite of next, i.e. previous.
+
+     @struct noderec
+
+     @brief Tree node record
+
+     A node in a tree is a structure which contains a cyclic list of pointers to 3 nodes which we call a \e roundabout. The first node is the structure itself, and the other two nodes are accessed via \a noderec->next and \a noderec->next->next. To access the outer node with which each of the 3 nodes forms an edge one has to use the \a back pointer
+
+     @var noderec::next
+     @brief Next node in the roundabout
+
+     @var noderec::back
+     @brief Outer node
+
+     @var noderec::number
+     @brief Node identifier
+
+     In general, tips (i.e. leaves) are numbered from 1 to \e n where \e n is the number of taxa. Identifiers for internal nodes start from \e n + 1. Note
+     that for a given inner node, the identifier must be the same for all 3 nodes that compose it.
+
+     @var noderec::z
+     @brief The branch lengths per partition for the main node in the roundabout
+
+     @todo Append an image
+  */
+typedef  struct noderec
+{
+ 
+  branchInfo      *bInf;                  /* branch meta-data; both records of a branch point to the same entry, set by a full tree traversal */
+  double           z[PLL_NUM_BRANCHES];   /* branch length(s); stored on both records that form the branch */
+  struct noderec  *next;                  /* next record in the cyclic list of an inner node; NULL for a tip */
+  struct noderec  *back;                  /* record at the other end of this branch (the outer node) */
+  hashNumberType   hash;                  /* hash number used for mapping bipartitions to a hash table */
+  int              support;               /* support value of the branch; must be equal on both of its records */
+  int              number;                /* node id: tips 1..n, inner nodes from n+1; identical on all 3 records of an inner node */
+  char             x;                     /* presence bit of the likelihood vector: 1 on exactly one record of an inner node, 0 on the other two */
+  char             xPars;                 /* the same kind of presence bit for the parsimony vector */
+  char             xBips;                 /* the same kind of presence bit for the bipartition vector */
+}
+  node, *nodeptr;
+
+typedef unsigned int parsimonyNumber; /* element type of the parsimony vectors (pInfo::parsVect, pInfo::perSiteParsScores) */
+
+/* @brief Alignment, transition model, model of rate heterogenety and likelihood vectors for one partition.
+  * 
+  * @todo De-couple into smaller data structures
+  *
+  * ALIGNMENT DATA 
+  * This depends only on the type of data in this partition of the alignment 
+  *
+  * MODEL OF RATE HETEROGENEITY, We use either GAMMA or PSR 
+  * Rate heterogeneity: Per Site Categories (PSR) model aka CAT, 
+  * Rate of site i is given by perSiteRates[rateCategory[i]]
+  *
+  * TRANSITION MODEL: We always assume General Time Reversibility 
+  * Transition probability matrix: P(t) = exp(Qt)
+  * Branch length t is the expected number of substitutions per site 
+  * Pij(t) is the probability of going from state i to state j in a branch of length t 
+  * Relative substitution rates (Entries in the Q matrix) 
+  * In GTR we can write Q = S * D, where S is a symmetrical matrix and D a diagonal with the state frequencies 
+
+    @var protModels
+    @brief Protein models
+
+    @details Detailed protein models description
+
+    @var autoProtModels
+    @brief Auto prot models
+    @detail Detailed auto prot models
+  */
+ 
+
+
+/** @struct pInfo
+    
+    @brief Partition information structure
+
+    This data structure encapsulates all properties and auxiliary variables that together
+    constitute a partition.
+
+    @var pInfo::dataType
+    @brief Type of data this partition contains
+
+    Can be DNA (\b PLL_DNA_DATA) or AminoAcid (\b PLL_AA_DATA) data
+
+    @var pInfo::states
+    @brief Number of states
+
+    Number of states this type of data can consist of
+
+    @var pInfo::maxTipStates
+    @brief Number of undetermined states (possible states at the tips)
+
+    This is the total number of possible states that can appear in the alignment. This includes degenerate (undetermined) bases
+
+    @var pInfo::partitionName
+    @brief Name of partition
+
+    A null-terminated string describing the name of partition
+
+    @var pInfo::lower
+    @brief Position of the first site in the alignment that is part of this partition [1, tr->originalCrunchedLength]
+
+    @var pInfo::upper
+    @brief Position of the last site that is part of this partition plus one (i.e. position of the first site that is not part of this partition) 
+
+    @var pInfo::width
+    @brief Number of sites in the partition (i.e. \a upper - \a lower)
+
+    @var pInfo::wgt
+    @brief Weight of site
+
+    Number of times this particular site appeared in the partition before the duplicates were removed and replaced by this weight
+
+    @var pInfo::empiricalFrequencies
+    @brief Empirical frequency of each state in the current partition
+
+    @var pInfo::perSiteRates
+    @brief Per-site rate (PSR, aka CAT) value of each rate category
+
+    @var pInfo::rateCategory
+    @brief CAT category index for each site
+
+    @var pInfo::numberOfCategories
+    @brief CAT size of the set of possible categories
+
+    @var pInfo::alpha
+    @brief Gamma parameter to be optimized
+    
+    @var pInfo::gammaRates
+    @brief Values of the 4 gamma categories (rates) computed given an alpha
+
+    @var pInfo::substRates
+    @brief Entries of substitution matrix, e.g. 6 free parameters in DNA
+
+    In GTR we can write \f$ Q = S * D \f$, where \f$ S \f$ is a symmetrical matrix and \f$ D \f$ a diagonal with the state frequencies,
+    which is represented by the array \a frequencies. The symmetrical matrix is the array \a substRates
+
+    @var pInfo::frequencies
+    @brief State frequencies, entries in D are initialized as empiricalFrequencies
+    
+    In GTR we can write \f$ Q = S * D \f$, where \f$ S \f$ is a symmetrical matrix and \f$ D \f$ a diagonal with the state frequencies,
+    which is represented by the array \a frequencies. The symmetrical matrix is the array \a substRates
+
+    @var pInfo::freqExponents
+
+    @var pInfo::EIGN
+    @brief Eigenvalues of Q matrix
+
+    @var pInfo::EV
+    @brief Eigenvectors of Q matrix
+
+    @var pInfo::EI
+    @brief Inverse eigenvectors of Q matrix
+
+    @var pInfo::left
+    @brief P matrix for the left term of the conditional likelihood equation
+
+    @var pInfo::right
+    @brief P matrix for the right term of the conditional likelihood equation
+
+    @var pInfo::tipVector
+    @brief Precomputed (based on current P matrix) conditional likelihood vectors for every possible base 
+
+    @var pInfo::EIGN_LG4
+    @brief Eigenvalues of Q matrix for the LG4 model
+
+    @var pInfo::EV_LG4
+    @brief Eigenvectors of Q matrix for the LG4 model
+
+    @var pInfo::EI_LG4
+    @brief Inverse eigenvectors of Q matrix for the LG4 model
+    
+    @var pInfo::frequencies_LG4
+    @brief State frequencies for the LG4 model
+
+    @var pInfo::tipVector_LG4
+    @brief Precomputed (based on current P matrix) conditional likelihood vectors for every possible base for the LG4 model
+
+    @var pInfo::substRates_LG4
+    @brief Entries of substitution matrix for the LG4 model
+
+    @var pInfo::protModels
+    @brief Protein model for current partition
+
+    In case \a pInfo::dataType is set to \a PLL_AA_DATA then \a protModels indicates the index in the global array \a protModels
+    of the protein model that the current partition uses.
+
+    @var pInfo::autoProtModels
+    @brief Best fitted protein model for the \b PLL_AUTO partitions
+
+    If \a protModels is set to \b PLL_AUTO then \a autoProtModels holds the currently detected best fitting protein model for the partition
+
+    @var pInfo::protUseEmpiricalFreqs
+
+    @var pInfo::nonGTR
+
+    @var pInfo::optimizeBaseFrequencies
+
+    @var pInfo::optimizeAlphaParameter
+
+    @var pInfo::optimizeSubstitutionRates
+
+    @var pInfo::symmetryVector
+
+    @var pInfo::frequencyGrouping
+
+
+    @todo
+      Document freqExponents
+
+*/
+
+
+
+typedef struct {
+  int     dataType;                 /* data type id, e.g. PLL_DNA_DATA or PLL_AA_DATA */
+  int     states;                   /* number of states of this data type */
+  int     maxTipStates;             /* number of possible tip states, degenerate (undetermined) ones included */
+  char   *partitionName;            /* null-terminated partition name */
+  int     lower;                    /* position of the first site of this partition */
+  int     upper;                    /* position of the last site of this partition plus one */
+  int     width;                    /* number of sites, i.e. upper - lower */
+  int    *wgt;                      /* per-site weight: how often the site occurred before duplicates were removed */
+  double *empiricalFrequencies;     /* empirical frequency of each state */
+
+
+  /* MODEL OF RATE HETEROGENEITY, We use either GAMMA or PSR */
+  /* Rate heterogeneity: Per Site Categories (PSR) model aka CAT, see updatePerSiteRates() */
+  /* Rate of site i is given by perSiteRates[rateCategory[i]] */
+  double *perSiteRates;
+  int    *rateCategory;
+  int     numberOfCategories;
+  /* Rate heterogeneity: GAMMA model of rate heterogeneity */
+  double alpha;                     /* gamma parameter to be optimized */
+  double *gammaRates;               /* values of the 4 gamma categories (rates) computed for alpha */
+
+
+  /* TRANSITION MODEL: We always assume General Time Reversibility */
+  /* Transition probability matrix: P(t) = exp(Qt)*/
+  /* Branch length t is the expected number of substitutions per site */
+  /* Pij(t) is the probability of going from state i to state j in a branch of length t */
+  /* Relative substitution rates (Entries in the Q matrix) */
+  /* In GTR we can write Q = S * D, where S is a symmetrical matrix and D a diagonal with the state frequencies */
+  double *substRates;       /**< TRANSITION MODEL Entries in S, e.g. 6 free parameters in DNA */   
+  double *frequencies;      /**< State frequencies, entries in D, are initialized as empiricalFrequencies */
+  double *freqExponents;    /**< @todo document freqExponents */
+  /* Matrix decomposition: @todo map this syntax to Explanation of the mathematical background */
+  double *EIGN;             /* eigenvalues of the Q matrix */
+  double *EV;               /* eigenvectors of the Q matrix */
+  double *EI;               /* inverse eigenvectors of the Q matrix */
+  double *left;             /* P matrix for the left term of the conditional likelihood equation */
+  double *right;            /* P matrix for the right term of the conditional likelihood equation */
+  double *tipVector;        /* precomputed conditional likelihood vectors for every possible base */
+
+
+  /* asc bias (presumably ascertainment bias correction - TODO confirm) */
+  pllBoolean       ascBias;
+  int           ascOffset;
+  int         * ascExpVector;
+  double      * ascSumBuffer;
+  double      * ascVector;
+  double        ascScaler[64];
+  
+  /* LG4: one array element per LG4 component (4 of each) */
+
+  double *EIGN_LG4[4];
+  double *EV_LG4[4];
+  double *EI_LG4[4];
+
+  double *frequencies_LG4[4];
+  double *tipVector_LG4[4];
+  double *substRates_LG4[4];
+  
+  /* LG4X */
+
+  double lg4x_weights[4];
+  double lg4x_weightExponents[4];
+  double lg4x_weightsBuffer[4];
+  double lg4x_weightExponentsBuffer[4];
+  double lg4x_weightLikelihood;
+  
+  /* Protein specific */
+  int     protModels;               /**< Empirical model matrix */
+  int     autoProtModels;           /**< Model selected with "auto" protein model */
+  int     protUseEmpiricalFreqs;    /**< Whether to use empirical frequencies for protein model */
+
+  pllBoolean nonGTR;
+  pllBoolean optimizeBaseFrequencies;      /**< Whether to optimize base frequencies */
+  pllBoolean optimizeAlphaParameter;       /**< Whether to optimize alpha parameters and gamma rates */
+  pllBoolean optimizeSubstitutionRates;    /**< Whether to optimize substitution rates */
+  int    *symmetryVector;           /**< Specify linkage between substitution rate parameters */
+  int    *frequencyGrouping;
+
+  /* LIKELIHOOD VECTORS */
+
+  /* partial LH Inner vectors  ancestral vectors, we have 2*tips - 3 inner nodes */
+  double          **xVector;          /**< Conditional likelihood vectors for inner nodes */
+  unsigned char   **yVector;          /**< Tip entries (sequence) for tip nodes */
+  unsigned int     *globalScaler;     /**< Counters for scaling operations done at node i */
+
+  /* data structures for conducting per-site likelihood scaling.
+     this allows to compute the per-site log likelihood scores 
+     needed for RELL-based bootstrapping and all sorts of statistical 
+     tests for comparing trees ! */
+  int              **expVector;     /**< @brief An entry per inner node. Each element is an array of size the number of sites in the current partition and represents how many times the respective site has been scaled in the subtree rooted at the current node */
+  size_t           *expSpaceVector; /**< @brief Each entry represents an inner node and states the size of the corresponding element in \a expVector, which is the number of sites for the current partition */
+
+  /* These are for the saveMemory option (tracking gaps to skip computations and memory) */
+  size_t           *xSpaceVector;       /**< Size of conditional likelihood vectors per inner node */
+  int               gapVectorLength;    /**< Length of \a gapVector bitvector in unsigned integers assuming that \a unsigned \a int is 32bits. It is set to partition size / 32 */
+  unsigned int     *gapVector;          /**< A bit vector of size \a gapVectorLength * 32 bits. NOTE(review): the original sentence ("A bit is set to 1 if the corresponding") was cut off; presumably the bit marks a gap at the corresponding position - TODO confirm */
+  double           *gapColumn; 
+
+  /* Parsimony vectors at each node */
+  size_t parsimonyLength;
+  parsimonyNumber *parsVect; 
+  parsimonyNumber *perSiteParsScores;
+
+  /* This buffer of size width is used to store intermediate values for the branch length optimization under 
+     newton-raphson. The data in here can be re-used for all iterations irrespective of the branch length.
+   */
+  double *sumBuffer; 
+
+  /* Buffer to store the per-site log likelihoods */
+  double *perSiteLikelihoods;
+
+  /* This buffer of size width is used to store the ancestral state at a node of the tree. */
+  double *ancestralBuffer;
+
+  /* From tree */
+  pllBoolean executeModel;
+  double fracchange;
+  double rawFracchange;
+  double partitionContribution;
+  double partitionWeight;
+  double partitionLH;
+
+// #if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+  int partitionAssignment;      /* declared unconditionally: the guard around it is commented out */
+// #endif
+
+} pInfo;
+
+typedef struct /* the set of partitions together with their parameter linkage lists */
+ {
+   pInfo **partitionData;             /* array of pointers to the per-partition pInfo records */
+   int numberOfPartitions;            /* presumably the number of entries in partitionData - TODO confirm */
+   pllBoolean perGeneBranchLengths;   /* flag; presumably selects a separate branch length per partition - TODO confirm */
+   pllBoolean dirty;                  /* NOTE(review): meaning not documented here - TODO confirm */
+   linkageList *alphaList;            /* linkage list, by its name for the alpha parameters */
+   linkageList *rateList;             /* linkage list, by its name for the substitution rates */
+   linkageList *freqList;             /* linkage list, by its name for the state frequencies */
+ }  partitionList;
+
+
+
+#define PLL_REARR_SETTING 1  /* NOTE(review): presumably values for checkPointState::state - TODO confirm */
+#define PLL_FAST_SPRS     2
+#define PLL_SLOW_SPRS     3
+
+
+/** @brief Checkpointing states. 
+ * 
+ * @todo Raxml specific 
+  */
+typedef struct {
+ 
+  int state;                     /* NOTE(review): presumably one of PLL_REARR_SETTING, PLL_FAST_SPRS, PLL_SLOW_SPRS - confirm */
+
+  /*unsigned int vLength;*/
+  double accumulatedTime;  
+  int rearrangementsMax;
+  int rearrangementsMin;
+  int thoroughIterations;
+  int fastIterations;
+  int mintrav;
+  int maxtrav;
+  int bestTrav;
+  double startLH; 
+  double lh;
+  double previousLh;
+  double difference;
+  double epsilon;  
+  pllBoolean impr;
+  pllBoolean cutoff;  
+       
+  double tr_startLH;             /* the tr_* fields carry the names of tree-structure fields (startLH, endLH, likelihood, ...) */
+  double tr_endLH;
+  double tr_likelihood;
+  double tr_bestOfNode;  
+  double tr_lhCutoff;
+  double tr_lhAVG;
+  double tr_lhDEC;               /* NOTE(review): double here, while the tree field lhDEC is unsigned long */
+  int    tr_NumberOfCategories;
+  int    tr_itCount;             /* NOTE(review): int here, while the tree field itCount is unsigned long */
+  int    tr_doCutoff;
+  int    tr_thoroughInsertion;
+  int    tr_optimizeRateCategoryInvocations;
+ 
+  /* prevent users from doing stupid things (presumably compared against the current run on restart - TODO confirm) */
+ 
+  int searchConvergenceCriterion;
+  int rateHetModel;
+  int maxCategories;
+  int NumberOfModels;
+  int numBranches;
+  int originalCrunchedLength;    
+  int mxtips;
+  char seq_file[1024];           /* fixed 1024-byte buffer, presumably the alignment file name - TODO confirm */
+} checkPointState;
+
+
+
+/* recomp: counters that exist only when _DEBUG_RECOMPUTATION is defined */
+#ifdef _DEBUG_RECOMPUTATION
+typedef struct {
+  unsigned long int numTraversals;
+  unsigned long int tt;          /* tt, ti, ii: presumably counts of PLL_TIP_TIP, PLL_TIP_INNER and PLL_INNER_INNER steps - TODO confirm */
+  unsigned long int ti;
+  unsigned long int ii;
+  unsigned int *travlenFreq;     /* presumably a histogram of traversal lengths - TODO confirm */
+} traversalCounter;
+#endif
+/* E recomp */
+
+
+/** @brief Tree topology.
+ * 
+ * @todo Apart from the topology this structure contains several fields that act like global variables in raxml
+  */
+typedef  struct  {
+
+  int *ti;
+
+  /* recomp */
+  recompVectors *rvec;            /**< this data structure tracks which vectors store which nodes */
+  float maxMegabytesMemory;       /**< User says how many MB in main memory should be used */
+  float vectorRecomFraction;      /**< vectorRecomFraction ~= 0.8 * maxMegabytesMemory  */
+  pllBoolean useRecom;               /**< ON if we apply recomputation of ancestral vectors*/
+#ifdef _DEBUG_RECOMPUTATION 
+  traversalCounter *travCounter;
+  double stlenTime;
+#endif
+  /* E recomp */
+  
+  pllBoolean fastScaling;
+  pllBoolean saveMemory;
+  int              startingTree;
+  long             randomNumberSeed;
+
+  double          *lhs;         /**< Array to store per-site log likelihoods of \a originalCrunchedLength (compressed) sites */
+  double          *patrat;      /**< rates per pattern */
+  double          *patratStored; 
+  int             *rateCategory;
+  int             *aliaswgt;    /**< weight by pattern */ 
+  pllBoolean    manyPartitions;
+
+  pllBoolean grouped;              /**< No idea what this is, but is always set to PLL_FALSE */
+  pllBoolean constrained;          /**< No idea what this is, but is always set to PLL_FALSE */
+  int threadID;
+  volatile int numberOfThreads;
+
+//#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+ 
+  unsigned char *y_ptr; 
+  
+  double lower_spacing;
+  double upper_spacing; 
+
+  double *ancestralVector;
+
+//#endif
+  
+  pllHashTable     *nameHash;
+  char           ** tipNames;
+
+  char             *secondaryStructureInput;
+
+  traversalData    td[1];
+
+  int              maxCategories;
+  int              categories;
+
+  double           coreLZ[PLL_NUM_BRANCHES];
+  
+ 
+  branchInfo       *bInf;
+
+  int              multiStateModel;
+
+
+  pllBoolean curvatOK[PLL_NUM_BRANCHES];
+
+  /* the stuff below is shared among DNA and AA, span does
+     not change depending on datatype */
+
+  /* model stuff end */
+  unsigned char    **yVector;        /**< list of raw sequences (parsed from the alignment)*/
+
+  int              secondaryStructureModel;
+  int              originalCrunchedLength; /**< Length of alignment after removing duplicate sites in each partition */
+
+  int              *secondaryStructurePairs;
+
+  double            fracchange;      /**< Average substitution rate */
+  double            rawFracchange;
+  double            lhCutoff;
+  double            lhAVG;
+  unsigned long     lhDEC;
+  unsigned long     itCount;
+  int               numberOfInvariableColumns;
+  int               weightOfInvariableColumns;
+  int               rateHetModel;
+
+  double           startLH;
+  double           endLH;
+  double           likelihood;           /**< last likelihood value evaluated for the current topology */
+ 
+  node           **nodep;                /**< pointer to the list of nodes, which describe the current topology */
+  nodeptr          nodeBaseAddress;
+  node            *start;                /**< starting node by default for full traversals (must be a tip contained in the tree we are operating on) */
+  int              mxtips;  /**< Number of tips in the topology */
+
+  int              *constraintVector;   /**< @todo What is this? */
+  int              numberOfSecondaryColumns;
+  pllBoolean          searchConvergenceCriterion;
+  int              ntips;
+  int              nextnode;  
+
+  pllBoolean          bigCutoff;
+  pllBoolean          partitionSmoothed[PLL_NUM_BRANCHES];
+  pllBoolean          partitionConverged[PLL_NUM_BRANCHES];
+  pllBoolean          rooted;
+  pllBoolean          doCutoff;
+ 
+  double         gapyness;
+
+  char **nameList;     /**< list of tips names (read from the phylip file) */
+  char *tree_string;   /**< the newick representation of the topology */
+  char *tree0;
+  char *tree1;
+  int treeStringLength;
+ 
+  unsigned int bestParsimony;
+  unsigned int *parsimonyScore;
+  
+  double bestOfNode;
+  nodeptr removeNode;   /**< the node that has been removed. Together with \a insertNode represents an SPR move */
+  nodeptr insertNode;   /**< the node where insertion should take place . Together with \a removeNode represents an SPR move*/
+
+  double zqr[PLL_NUM_BRANCHES];
+  double currentZQR[PLL_NUM_BRANCHES];
+
+  double currentLZR[PLL_NUM_BRANCHES];
+  double currentLZQ[PLL_NUM_BRANCHES];
+  double currentLZS[PLL_NUM_BRANCHES];
+  double currentLZI[PLL_NUM_BRANCHES];
+  double lzs[PLL_NUM_BRANCHES];
+  double lzq[PLL_NUM_BRANCHES];
+  double lzr[PLL_NUM_BRANCHES];
+  double lzi[PLL_NUM_BRANCHES];
+
+
+  unsigned int **bitVectors;
+
+  unsigned int vLength;
+
+  pllHashTable *h;                 /**< hashtable for ML convergence criterion */
+  //hashtable *h;
+ 
+  int optimizeRateCategoryInvocations;
+
+  checkPointState ckp;
+  pllBoolean thoroughInsertion; /**< true if the neighbor branches should be optimized when a subtree is inserted (slower)*/
+  pllBoolean useMedian;
+
+  int autoProteinSelectionType;
+
+  pllStack * rearrangeHistory;
+
+
+  /* analdef defines */
+  /* TODO: Do some initialization */
+  int              bestTrav;            /**< best rearrangement radius */
+  int              max_rearrange;       /**< max. rearrangement radius */
+  int              stepwidth;           /**< step in rearrangement radius */
+  int              initial;             /**< user defined rearrangement radius which also sets bestTrav if initialSet is set */
+  pllBoolean          initialSet;          /**< set bestTrav according to initial */
+  int              mode;                /**< candidate for removal */
+  pllBoolean        perGeneBranchLengths;
+  pllBoolean        permuteTreeoptimize;   /**< randomly select subtrees for SPR moves */
+  pllBoolean        compressPatterns;
+  double         likelihoodEpsilon;
+  pllBoolean        useCheckpoint;
+
+} pllInstance;
+
+/** @brief Stores data related to a NNI move  */
+typedef struct {
+        pllInstance * tr;
+        nodeptr p;
+        int nniType;
+        double z[PLL_NUM_BRANCHES]; // optimized branch lengths
+        double z0[PLL_NUM_BRANCHES]; // unoptimized branch lengths
+        double likelihood;
+        double deltaLH;
+} nniMove;
+
+/***************************************************************/
+
+typedef struct {
+  int partitionNumber;
+  int partitionLength;
+} partitionType;
+
+typedef struct
+{
+  double z[PLL_NUM_BRANCHES];
+  nodeptr p, q;
+  int cp, cq;
+}
+  connectRELL, *connptrRELL;
+
+typedef  struct
+{
+  connectRELL     *connect; 
+  int             start;
+  double          likelihood;
+}
+  topolRELL;
+
+
+typedef  struct
+{
+  int max;
+  topolRELL **t;
+}
+  topolRELL_LIST;
+
+/**************************************************************/
+
+/** @brief Connection within a topology.
+*   */
+typedef struct conntyp {
+    double           z[PLL_NUM_BRANCHES];           /**< branch lengths, one slot per PLL_NUM_BRANCHES entry */
+    node            *p, *q;       /**< parent and child sectors */
+    void            *valptr;      /**< pointer to value of subtree */
+    int              descend;     /**< index (an int, not a pointer) of first connect of child -- presumably into topol::links; confirm */
+    int              sibling;     /**< next connect from same parent (also an int index -- presumably into topol::links; confirm) */
+    } pllConnect, *connptr;
+
+/** @brief Single Topology
+*   */
+typedef  struct {
+    double           likelihood;
+    int              initialTreeNumber;
+    pllConnect         *links;       /**< pointer to first connect (start) */
+    node            *start;
+    int              nextlink;    /**< index of next available connect */
+                                  /**< tr->start = tpl->links->p */
+    int              ntips;
+    int              nextnode;    /**< next available inner node for tree parsing */
+    int              scrNum;      /**< position in sorted list of scores */
+    int              tplNum;      /**< position in sorted list of trees */
+    } topol;
+
+/** @brief Small helper data structure for printing out/downstream use of marginal ancestral probability vectors.
+*
+* It is allocated as an array that has the same length as the input alignment and can be used to
+*   index the ancestral states for each position/site/pattern
+*   */
+typedef struct {
+  double *probs; /**< marginal ancestral states */
+  char c; /**< most likely state, i.e. the one with max(probs[i]) above */
+  int states; /**< number of states for this position */
+} ancestralState;
+
+/** @brief List of topologies
+*
+*   */
+typedef struct {
+    double           best;        /**< highest score saved */
+    double           worst;       /**< lowest score saved */
+    topol           *start;       /**< starting tree for optimization */
+    topol          **byScore;
+    topol          **byTopol;
+    int              nkeep;       /**< maximum topologies to save */
+    int              nvalid;      /**< number of topologies saved */
+    int              ninit;       /**< number of topologies initialized */
+    int              numtrees;    /**< number of alternatives tested */
+    pllBoolean          improved;
+    } bestlist;
+
+/** @brief  This is used to look up some hard-coded data for each data type 
+*   */
+typedef struct 
+{
+  int leftLength;         /**< s^2 */
+  int rightLength;/**< s^2 */
+  int eignLength;/**<  s */
+  int evLength;
+  int eiLength;
+  int substRatesLength;   /**< (s^2 - s)/2 free model parameters for matrix Q i.e. substitution rates */
+  int frequenciesLength;  /**< s frequency of each state */ 
+  int tipVectorLength;    /* ??? */
+  int symmetryVectorLength;
+  int frequencyGroupingLength;
+
+  pllBoolean nonGTR;
+  pllBoolean optimizeBaseFrequencies;
+
+  int undetermined;
+
+  const char *inverseMeaning;
+
+  int states;   /* s */
+
+  pllBoolean smoothFrequencies;
+
+  const unsigned  int *bitVector;
+
+} partitionLengths;
+
+typedef struct
+{
+  int rearrangeType;
+  double  likelihood;
+
+  union {
+    struct {
+      double * zp;
+      double * zpn;
+      double * zpnn;
+      double * zqr;
+      nodeptr pn;
+      nodeptr pnn;
+      nodeptr r;
+      nodeptr p;
+      nodeptr q;
+    } SPR;
+    struct {
+      nodeptr origin;
+      int swapType;
+      double z[PLL_NUM_BRANCHES];
+    } NNI;
+  };
+} pllRollbackInfo;
+
+
+/** @struct pllRearrangeAttr
+ 
+    @brief Structure holding attributes for searching possible tree rearrangements
+    
+    Holds the attributes for performing tree rearrangements.
+
+    @var pllRearrangeAttr::p
+      The origin node where the search should start
+
+    @var pllRearrangeAttr::mintrav
+      The minimum radius around the origin node \a p for which nodes should be tested
+
+    @var pllRearrangeAttr::maxtrav
+      The maximum radius around the origin node \a p for which nodes should be tested
+
+    @note This structure has no \a max member; the maximum number of results to be
+      returned is presumably pllRearrangeList::max_entries -- TODO confirm
+*/
+typedef struct
+ {
+   nodeptr p;
+   int mintrav;
+   int maxtrav;
+ } pllRearrangeAttr;
+
+/** @typedef pllRearrangeInfo
+    
+    @brief Tree rearrangement information structure
+
+    Holds information for conducting tree rearrangements. This structure
+    is the result of a tree rearrangement search under given search
+    attributes.
+
+    @var pllRearrangeInfo::rearrangeType
+      Type of rearrangement. Can be \b PLL_REARRANGE_SPR, \b PLL_REARRANGE_NNI or
+      \b PLL_REARRANGE_TBR
+    
+    @var pllRearrangeInfo::likelihood
+      Holds the computed likelihood for the addressed rearrangement
+
+    @var pllRearrangeInfo::SPR::removeNode
+      Node where to perform subtree pruning
+
+    @var pllRearrangeInfo::SPR::insertNode
+      Node where to place the pruned subtree
+
+    @var pllRearrangeInfo::SPR::zqr
+      Holds the computed branch lengths after the SPR
+*/
+typedef struct
+ {
+   int rearrangeType;
+   double  likelihood;
+   union {
+     struct {
+       nodeptr removeNode;
+       nodeptr insertNode;
+       double  zqr[PLL_NUM_BRANCHES];
+     } SPR;
+     struct {
+       nodeptr originNode;
+       int     swapType;
+     } NNI;
+   };
+ } pllRearrangeInfo;
+
+
+typedef struct
+ {
+   int max_entries;
+   int entries;
+   pllRearrangeInfo * rearr;
+ } pllRearrangeList;
+
+/** @brief Generic structure for storing a multiple sequence alignment */
+typedef struct
+ {
+   int              sequenceCount;      /**< @brief Number of sequences */
+   int              sequenceLength;     /**< @brief Length of sequences */
+   int              originalSeqLength;  /**< @brief Original length of sequences (not modified after removing duplicates) */
+   char          ** sequenceLabels;     /**< @brief An array where the \a i-th element is the name of the \a i-th sequence */
+   unsigned char ** sequenceData;       /**< @brief The actual sequence data */
+   int            * siteWeights;        /**< @brief An array where the \a i-th element indicates how many times site \a i appeared (prior to duplicates removal) in the alignment */
+ } pllAlignmentData;
+
+
+/******************** START OF API FUNCTION DESCRIPTIONS ********************/
+
+#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+pllBoolean isThisMyPartition(partitionList *pr, int tid, int model);
+void printParallelTimePerRegion(void); 
+#endif
+
+#ifdef _FINE_GRAIN_MPI
+extern void pllFinalizeMPI (void);
+#endif
+
+
+
+/**
+ * @brief Create the main instance of PLL
+ *
+ * Create an instance of the phylogenetic likelihood library
+ *
+ * @param pInst          Instance attributes (the only argument). The settings named in the
+ *                       original text are presumably members of \a pInst -- TODO confirm
+ *                       against pllInstanceAttr: rateHetModel (rate heterogeneity model),
+ *                       fastScaling, saveMemory, useRecom (\b PLL_TRUE enables ancestral state recomputation)
+ *
+ * @todo                 Document fastScaling, rate heterogeneity and saveMemory and useRecom
+ *
+ * @note                 Do not set \a saveMemory to \b PLL_TRUE when using \a useRecom as memory saving
+ *                       techniques are not yet implemented for ancestral state recomputation.
+ *
+ * @return               On success returns an instance to PLL, otherwise \b NULL
+ */
+extern pllInstance * pllCreateInstance (pllInstanceAttr * pInst);
+
+/**
+ *  @ingroup instanceLinkingGroup
+ *  @brief Load alignment to the PLL instance
+ *
+ *   Loads (copies) the parsed alignment \a alignmentData to the PLL instance
+ *   as a deep copy.
+ *
+ *    @param tr              The library instance
+ *    @param alignmentData   The multiple sequence alignment
+ *    @param pList           List of partitions
+ *
+ *    @return Returns 1 in case of success, 0 otherwise.
+ */
+extern int pllLoadAlignment (pllInstance * tr, 
+                             pllAlignmentData * alignmentData, 
+                             partitionList * pList);
+
+/**
+ * @brief Compute the empirical base frequencies for all partitions
+ *
+ * Compute the empirical base frequencies for all partitions in the list \a pl.
+ *
+ * @param pl                Partition list
+ * @param alignmentData     Multiple sequence alignment
+ *
+ * @return   A list of \a pl->numberOfPartitions arrays each of size
+             \a pl->partitionData[i]->states, where \a i is the \a i-th partition
+*/
+extern double ** pllBaseFrequenciesAlignment (pllAlignmentData * alignmentData, partitionList * pl);
+extern double ** pllBaseFrequenciesInstance (pllInstance * tr, partitionList * pl);
+
+/* pthreads and MPI */
+extern void pllStartPthreads (pllInstance *tr, partitionList *pr);
+extern void pllStopPthreads (pllInstance * tr);
+extern void pllLockMPI (pllInstance * tr);
+extern void pllInitMPI(int * argc, char **argv[]);
+
+
+/* handling branch lengths*/
+extern double pllGetBranchLength (pllInstance *tr, nodeptr p, int partition_id);
+extern void pllSetBranchLength (pllInstance *tr, nodeptr p, int partition_id, double bl);
+extern int pllNniSearch(pllInstance * tr, partitionList *pr, int estimateModel);
+extern void pllOptimizeBranchLengths ( pllInstance *tr, partitionList *pr, int maxSmoothIterations );
+
+
+extern void pllEvaluateLikelihood (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean fullTraversal, pllBoolean getPerSiteLikelihoods);
+extern void pllUpdatePartials (pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean masked);
+extern void pllUpdatePartialsAncestral(pllInstance *tr, partitionList *pr, nodeptr p);
+extern void pllNewviewIterative(pllInstance *tr, partitionList *pr, int startIndex);
+extern void pllEvaluateIterative(pllInstance *tr, partitionList *pr, pllBoolean getPerSiteLikelihoods);
+
+/* newick parser declarations */
+extern pllNewickTree * pllNewickParseString (const char * newick);
+extern pllNewickTree * pllNewickParseFile (const char * filename);
+extern int pllValidateNewick (pllNewickTree *);
+extern void pllNewickParseDestroy (pllNewickTree **);
+extern int pllNewickUnroot (pllNewickTree * t);
+extern char * pllTreeToNewick ( char *treestr, pllInstance *tr, partitionList *pr, nodeptr p,
+      pllBoolean printBranchLengths, pllBoolean printNames, pllBoolean printLikelihood,
+      pllBoolean rellTree, pllBoolean finalPrint, int perGene,
+      pllBoolean branchLabelSupport, pllBoolean printSHSupport);
+
+/* partition parser declarations */
+extern void  pllQueuePartitionsDestroy (pllQueue ** partitions);
+extern pllQueue * pllPartitionParse (const char * filename);
+extern pllQueue * pllPartitionParseString (const char * p);
+extern void pllPartitionDump (pllQueue * partitions);
+void pllBaseSubstitute (pllInstance * tr, partitionList * partitions);
+//void pllBaseSubstitute (pllAlignmentData * tr, partitionList * partitions);
+partitionList * pllPartitionsCommit (pllQueue * parts, pllAlignmentData * alignmentData);
+int pllPartitionsValidate (pllQueue * parts, pllAlignmentData * alignmentData);
+extern void pllAlignmentRemoveDups (pllAlignmentData * alignmentData, partitionList * pl);
+void pllPartitionsDestroy (pllInstance *, partitionList **);
+
+/* alignment data declarations */
+extern void pllAlignmentDataDestroy (pllAlignmentData *);
+extern int pllAlignmentDataDumpFile (pllAlignmentData *, int, const char *);
+extern void pllAlignmentDataDumpConsole (pllAlignmentData * alignmentData);
+extern pllAlignmentData * pllInitAlignmentData (int, int);
+extern pllAlignmentData * pllParseAlignmentFile (int fileType, const char *);
+extern pllAlignmentData *pllParsePHYLIPString (const char *rawdata, long filesize);
+
+
+/* model management */
+int pllInitModel (pllInstance *, partitionList *);
+void pllInitReversibleGTR(pllInstance * tr, partitionList * pr, int model);
+void pllMakeGammaCats(double alpha, double *gammaRates, int K, pllBoolean useMedian);
+int pllLinkAlphaParameters(char *string, partitionList *pr);
+int pllLinkFrequencies(char *string, partitionList *pr);
+int pllLinkRates(char *string, partitionList *pr);
+int pllSetSubstitutionRateMatrixSymmetries(char *string, partitionList * pr, int model);
+void pllSetFixedAlpha(double alpha, int model, partitionList * pr, pllInstance *tr);
+void pllSetFixedBaseFrequencies(double *f, int length, int model, partitionList * pr, pllInstance *tr);
+int  pllSetOptimizeBaseFrequencies(int model, partitionList * pr, pllInstance *tr);
+void pllSetSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr);
+void pllSetFixedSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr);
+int pllGetInstRateMatrix (partitionList * pr, int model, double * outBuffer);
+int pllOptimizeModelParameters(pllInstance *tr, partitionList *pr, double likelihoodEpsilon);
+double pllGetAlpha (partitionList * pr, int pid);
+void pllGetGammaRates (partitionList * pr, int pid, double * outBuffer);
+extern void pllGetBaseFrequencies(partitionList * pr, int model, double * outBuffer);
+extern void pllGetSubstitutionMatrix (partitionList * pr, int model, double * outBuffer);
+void pllEmpiricalFrequenciesDestroy (double *** empiricalFrequencies, int models);
+extern void pllOptRatesGeneric(pllInstance *tr, partitionList *pr, double modelEpsilon, linkageList *ll);
+extern void pllOptBaseFreqs(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll);
+extern void pllOptAlphasGeneric(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll);
+extern void pllOptLG4X(pllInstance *tr, partitionList * pr, double modelEpsilon, linkageList *ll, int numberOfModels);
+
+/* tree topology */
+void pllTreeInitTopologyNewick (pllInstance *, pllNewickTree *, int);
+void pllTreeInitTopologyRandom (pllInstance * tr, int tips, char ** nameList);
+void pllTreeInitTopologyForAlignment (pllInstance * tr, pllAlignmentData * alignmentData);
+extern void pllMakeRandomTree ( pllInstance *tr);
+void pllMakeParsimonyTree(pllInstance *tr);
+extern void pllMakeParsimonyTreeFast(pllInstance *tr, partitionList *pr, int sprDist);
+void pllComputeRandomizedStepwiseAdditionParsimonyTree(pllInstance * tr, partitionList * partitions, int sprDist);
+nodeptr pllGetRandomSubtree(pllInstance *);
+extern void pllFreeParsimonyDataStructures(pllInstance *tr, partitionList *pr);
+void pllDestroyInstance (pllInstance *);
+extern void pllGetAncestralState(pllInstance *tr, partitionList *pr, nodeptr p, double * outProbs, char * outSequence);
+unsigned int pllEvaluateParsimony(pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean full, pllBoolean perSiteScores);
+void pllInitParsimonyStructures(pllInstance *tr, partitionList *pr, pllBoolean perSiteScores);
+
+/* rearrange functions (NNI and SPR) */
+pllRearrangeList * pllCreateRearrangeList (int max);
+void pllDestroyRearrangeList (pllRearrangeList ** bestList);
+void pllRearrangeSearch (pllInstance * tr, partitionList * pr, int rearrangeType, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList);
+void pllRearrangeCommit (pllInstance * tr, partitionList * pr, pllRearrangeInfo * rearr, int saveRollbackInfo);
+int pllRearrangeRollback (pllInstance * tr, partitionList * pr);
+void pllClearRearrangeHistory (pllInstance * tr);
+int pllRaxmlSearchAlgorithm (pllInstance * tr, partitionList * pr, pllBoolean estimateModel);
+int pllGetTransitionMatrix (pllInstance * tr, partitionList * pr, nodeptr p, int model, int rate, double * outBuffer);
+void pllGetTransitionMatrix2 (pllInstance * tr, partitionList * pr, int model, nodeptr p, double * outBuffer);
+int pllGetCLV (pllInstance * tr, partitionList * pr, nodeptr p, int partition, double * outProbs);
+extern int pllTopologyPerformNNI(pllInstance * tr, nodeptr p, int swap);
+
+/* hash functions */
+unsigned int pllHashString (const char * s, unsigned int size);
+int pllHashAdd  (pllHashTable * hTable, unsigned int hash, const char * s, void * item);
+pllHashTable * pllHashInit (unsigned int n);
+int pllHashSearch (struct pllHashTable * hTable, char * s, void ** item);
+void pllHashDestroy (struct pllHashTable ** hTable, void (*cbDealloc)(void *));
+
+/* node specific functions */
+nodeptr pllGetOrientedNodePointer (pllInstance * pInst, nodeptr p);
+
+/* other functions */
+extern char * pllReadFile (const char *, long *);
+extern int * pllssort1main (char ** x, int n);
+extern node ** pllGetInnerBranchEndPoints (pllInstance * tr);
+
+/* ---------------- */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
diff --git a/pllrepo/src/pllInternal.h b/pllrepo/src/pllInternal.h
new file mode 100644
index 0000000..1b6e0ac
--- /dev/null
+++ b/pllrepo/src/pllInternal.h
@@ -0,0 +1,313 @@
+/*
+ * pllInternal.h
+ *
+ *  Created on: Feb 17, 2014
+ *      Author: diego
+ */
+
+#ifndef PLLINTERNAL_H_
+#define PLLINTERNAL_H_
+
+#include "pll.h"
+#include "genericParallelization.h"
+#include "errcodes.h"
+#include "lexer.h"
+#include "parsePartition.h"
+#include "mem_alloc.h"
+
+//extern int lookupWord(char *s, stringHashtable *h);
+
+extern void getDataTypeString(pllInstance *tr, pInfo *partitionInfo, char typeOfData[1024]);
+extern int countTips(nodeptr p, int numsp);
+extern unsigned int precomputed16_bitcount(unsigned int n, char *bits_in_16bits);
+
+extern size_t discreteRateCategories(int rateHetModel);
+
+extern const partitionLengths * getPartitionLengths(pInfo *p);
+extern pllBoolean getSmoothFreqs(int dataType);
+extern const unsigned int *getBitVector(int dataType);
+extern int getUndetermined(int dataType);
+extern int getStates(int dataType);
+extern char getInverseMeaning(int dataType, unsigned char state);
+extern double gettime ( void );
+extern int gettimeSrand ( void );
+extern double randum ( long *seed );
+
+extern void getxnode ( nodeptr p );
+extern void hookup ( nodeptr p, nodeptr q, double *z, int numBranches);
+extern void hookupFull ( nodeptr p, nodeptr q, double *z);
+extern void hookupDefault ( nodeptr p, nodeptr q);
+extern pllBoolean whitechar ( int ch );
+extern void printLog ( pllInstance *tr);
+extern double LnGamma ( double alpha );
+extern double IncompleteGamma ( double x, double alpha, double ln_gamma_alpha );
+extern double PointNormal ( double prob );
+extern double PointChi2 ( double prob, double v );
+extern void initModel ( pllInstance *tr, double **empiricalFrequencies, partitionList * partitions);
+
+extern void resetBranches ( pllInstance *tr );
+extern void modOpt ( pllInstance *tr, partitionList *pr, double likelihoodEpsilon);
+
+extern void initializePartitionData(pllInstance *localTree, partitionList * localPartitions);
+extern void initMemorySavingAndRecom(pllInstance *tr, partitionList *pr);
+
+extern void nodeRectifier ( pllInstance *tr );
+extern void allocateParsimonyDataStructures(pllInstance *tr, partitionList *pr);
+
+extern FILE *myfopen(const char *path, const char *mode);
+
+extern pllBoolean initrav ( pllInstance *tr, partitionList *pr, nodeptr p );
+extern void initravPartition ( pllInstance *tr, nodeptr p, int model );
+extern void update ( pllInstance *tr, partitionList *pr, nodeptr p );
+extern void smooth ( pllInstance *tr, partitionList *pr, nodeptr p );
+extern void smoothTree ( pllInstance *tr, partitionList *pr, int maxtimes );
+extern void localSmooth ( pllInstance *tr, partitionList *pr, nodeptr p, int maxtimes );
+extern pllBoolean localSmoothMulti(pllInstance *tr, nodeptr p, int maxtimes, int model);
+
+extern void smoothRegion ( pllInstance *tr, partitionList *pr, nodeptr p, int region );
+extern void regionalSmooth ( pllInstance *tr, partitionList *pr, nodeptr p, int maxtimes, int region );
+extern nodeptr removeNodeBIG ( pllInstance *tr, partitionList *pr, nodeptr p, int numBranches);
+extern nodeptr removeNodeRestoreBIG ( pllInstance *tr, partitionList *pr, nodeptr p );
+extern pllBoolean insertBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q);
+extern pllBoolean insertRestoreBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q );
+extern pllBoolean testInsertBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q );
+extern int NNI(pllInstance * tr, nodeptr p, int swap);
+extern void addTraverseBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav );
+extern int rearrangeBIG ( pllInstance *tr, partitionList *pr, nodeptr p, int mintrav, int maxtrav );
+extern void traversalOrder ( nodeptr p, int *count, nodeptr *nodeArray );
+extern pllBoolean testInsertRestoreBIG ( pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q );
+extern void restoreTreeFast ( pllInstance *tr, partitionList *pr );
+
+extern void initTL ( topolRELL_LIST *rl, pllInstance *tr, int n );
+extern void freeTL ( topolRELL_LIST *rl);
+extern void restoreTL ( topolRELL_LIST *rl, pllInstance *tr, int n, int numBranches );
+extern void resetTL ( topolRELL_LIST *rl );
+extern void saveTL ( topolRELL_LIST *rl, pllInstance *tr, int index );
+
+extern topol  *setupTopol (int maxtips);
+extern void saveTree (pllInstance *tr, topol *tpl, int numBranches);
+extern pllBoolean restoreTree (topol *tpl, pllInstance *tr, partitionList *pr);
+
+
+
+
+extern int  saveBestTree (bestlist *bt, pllInstance *tr, int numBranches);
+extern int  recallBestTree (bestlist *bt, int rank, pllInstance *tr, partitionList *pr);
+extern int initBestTree ( bestlist *bt, int newkeep, int numsp );
+extern void resetBestTree ( bestlist *bt );
+extern pllBoolean freeBestTree ( bestlist *bt );
+
+
+/* extern int treeReadLen (FILE *fp, pllInstance *tr, pllBoolean readBranches, pllBoolean readNodeLabels, pllBoolean topologyOnly);
+extern void getStartingTree (pllInstance *tr); 
+extern void treeReadTopologyString(char *treeString, pllInstance *tr);
+extern double treeLength (pllInstance *tr, int model);*/
+extern double evaluatePartialGeneric (pllInstance *, partitionList *pr, int i, double ki, int _model);
+extern void newviewAncestralIterative(pllInstance *tr, partitionList *pr);
+extern void printAncestralState(nodeptr p, pllBoolean printStates, pllBoolean printProbs, pllInstance *tr, partitionList *pr);
+extern void makenewzGeneric(pllInstance *tr, partitionList * pr, nodeptr p, nodeptr q, double *z0, int maxiter, double *result, pllBoolean mask);
+extern void makenewzGenericDistance(pllInstance *tr, int maxiter, double *z0, double *result, int taxon1, int taxon2);
+extern double evaluatePartitionGeneric (pllInstance *tr, nodeptr p, int model);
+extern void newviewPartitionGeneric (pllInstance *tr, nodeptr p, int model);
+extern double evaluateGenericVector (pllInstance *tr, nodeptr p);
+extern void categorizeGeneric (pllInstance *tr, nodeptr p);
+extern double makenewzPartitionGeneric(pllInstance *tr, nodeptr p, nodeptr q, double z0, int maxiter, int model);
+extern pllBoolean isTip(int number, int maxTips);
+
+/* recom functions */
+extern void computeTraversal(pllInstance *tr, nodeptr p, pllBoolean partialTraversal, int numBranches);
+extern void allocRecompVectorsInfo(pllInstance *tr);
+extern void allocTraversalCounter(pllInstance *tr);
+extern pllBoolean getxVector(recompVectors *rvec, int nodenum, int *slot, int mxtips);
+extern pllBoolean needsRecomp(pllBoolean recompute, recompVectors *rvec, nodeptr p, int mxtips);
+extern void unpinNode(recompVectors *v, int nodenum, int mxtips);
+extern void protectNode(recompVectors *rvec, int nodenum, int mxtips);
+
+/* Handling branch lengths*/
+extern void computeTraversalInfoStlen(nodeptr p, int maxTips, recompVectors *rvec, int *count);
+extern void computeFullTraversalInfoStlen(nodeptr p, int maxTips, recompVectors *rvec);
+extern void printTraversalInfo(pllInstance *tr);
+extern void countTraversal(pllInstance *tr);
+extern void storeExecuteMaskInTraversalDescriptor(pllInstance *tr, partitionList *pr);
+extern void storeValuesInTraversalDescriptor(pllInstance *tr, partitionList *pr, double *value);
+extern void makenewzIterative(pllInstance *, partitionList *pr);
+extern void execCore(pllInstance *, partitionList *pr, volatile double *dlnLdlz, volatile double *d2lnLdlz2);
+extern void makePermutation(int *perm, int n, pllInstance *tr);
+extern nodeptr findAnyTip(nodeptr p, int numsp);
+extern void putWAG(double *ext_initialRates);
+extern  unsigned int **initBitVector(int mxtips, unsigned int *vectorLength);
+//extern hashtable *initHashTable(unsigned int n);
+extern void cleanupHashTable(pllHashTable * h, int state);
+extern double convergenceCriterion(pllHashTable *h, int mxtips);
+extern void freeBitVectors(unsigned int **v, int n);
+//extern void freeHashTable(hashtable *h);
+//extern stringHashtable *initStringHashTable(hashNumberType n);
+//extern void addword(char *s, stringHashtable *h, int nodeNumber);
+extern void initRateMatrix(pllInstance *tr, partitionList *pr);
+extern void bitVectorInitravSpecial(unsigned int **bitVectors, nodeptr p, int numsp, unsigned int vectorLength, pllHashTable *h, int treeNumber, int function, branchInfo *bInf,
+                                    int *countBranches, int treeVectorLength, pllBoolean traverseOnly, pllBoolean computeWRF, int processID);
+extern  unsigned int bitcount_32_bit(unsigned int i);
+extern __inline unsigned int bitcount_64_bit(uint64_t i);
+extern void perSiteLogLikelihoods(pllInstance *tr, partitionList *pr, double *logLikelihoods);
+extern void updatePerSiteRates(pllInstance *tr, partitionList *pr, pllBoolean scaleRates);
+extern void restart(pllInstance *tr, partitionList *pr);
+
+//extern const unsigned int mask32[32];
+
+/** @brief Check whether the position \a pos in bitvector \a x is a gap
+
+    @param x
+      A bitvector represented by unsigned integers
+
+    @param pos
+      Position to check in \a x if it is set (i.e. it is a gap)
+
+    @return
+      Returns the value of the bit vector (\b 1 if set, \b 0 if not)
+*/
+//#ifndef __clang__
+//inline
+//#endif
+pllBoolean isGap(unsigned int *x, int pos);
+
+/** @brief Check whether the position \a pos in bitvector \a x is \b NOT a gap
+
+    @param x
+      A bitvector represented by unsigned integers
+
+    @param pos
+      Position to check in \a x if it is \b NOT set (i.e. it is \b NOT a gap)
+
+    @return
+      Presumably \b 1 if the bit at \a pos is \b NOT set (no gap), \b 0 if it is set -- the earlier wording duplicated isGap; confirm against the implementation
+*/
+//#ifndef __clang__
+//inline
+//#endif
+pllBoolean noGap(unsigned int *x, int pos);
+
+//#ifndef __clang__
+//__inline
+//#endif
+//pllBoolean isGap(unsigned int *x, int pos);
+
+//#ifndef __clang__
+//__inline
+//#endif
+//pllBoolean noGap(unsigned int *x, int pos);
+
+/* from utils.h */
+linkageList* initLinkageList(int *linkList, partitionList *pr);
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS) )
+/* work tags for parallel regions */
+
+#define PLL_THREAD_NEWVIEW                  0
+#define PLL_THREAD_EVALUATE                 1
+#define PLL_THREAD_MAKENEWZ                 2
+#define PLL_THREAD_MAKENEWZ_FIRST           3
+#define PLL_THREAD_RATE_CATS                4
+#define PLL_THREAD_COPY_RATE_CATS           5
+#define PLL_THREAD_COPY_INIT_MODEL          6
+#define PLL_THREAD_INIT_PARTITION           7
+#define PLL_THREAD_OPT_ALPHA                8
+#define PLL_THREAD_OPT_RATE                 9
+#define PLL_THREAD_OPT_LG4X_RATE            10
+#define PLL_THREAD_COPY_ALPHA               11
+#define PLL_THREAD_COPY_RATES               12
+#define PLL_THREAD_COPY_LG4X_RATES          13
+#define PLL_THREAD_PER_SITE_LIKELIHOODS     14
+#define PLL_THREAD_NEWVIEW_ANCESTRAL        15
+#define PLL_THREAD_GATHER_ANCESTRAL         16
+#define PLL_THREAD_EXIT_GRACEFULLY          17
+#define PLL_THREAD_EVALUATE_PER_SITE_LIKES  18
+
+
+typedef struct
+{
+  pllInstance *tr;
+
+  partitionList *pr;
+  int threadNumber;
+}
+  threadData;
+extern void optRateCatPthreads(pllInstance *tr, partitionList *pr, double lower_spacing, double upper_spacing, double *lhs, int n, int tid);
+extern void pllMasterBarrier(pllInstance *, partitionList *, int);
+#endif
+
+
+#ifdef __AVX
+
+extern void newviewGTRGAMMAPROT_AVX_LG4(int tipCase,
+                                        double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
+                                        int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n,
+                                        double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+
+extern void newviewGTRCAT_AVX_GAPPED_SAVE(int tipCase,  double *EV,  int *cptr,
+                                   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                   int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+                                   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
+
+extern void newviewGTRCATPROT_AVX_GAPPED_SAVE(int tipCase, double *extEV,
+                                       int *cptr,
+                                       double *x1, double *x2, double *x3, double *tipVector,
+                                       int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                       int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+                                       unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                       double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
+
+extern void  newviewGTRGAMMA_AVX_GAPPED_SAVE(int tipCase,
+                                      double *x1_start, double *x2_start, double *x3_start,
+                                      double *extEV, double *tipVector,
+                                      int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                                      const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+                                      unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                      double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
+                                      );
+
+extern void newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(int tipCase,
+                                         double *x1_start, double *x2_start, double *x3_start, double *extEV, double *tipVector,
+                                         int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n,
+                                         double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling,
+                                         unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
+                                         double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn);
+
+extern void newviewGTRCAT_AVX(int tipCase,  double *EV,  int *cptr,
+    double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
+    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+    int n,  double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+
+extern void newviewGenericCATPROT_AVX(int tipCase, double *extEV,
+    int *cptr,
+    double *x1, double *x2, double *x3, double *tipVector,
+    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+    int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+
+extern void newviewGTRGAMMA_AVX(int tipCase,
+    double *x1_start, double *x2_start, double *x3_start,
+    double *EV, double *tipVector,
+    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+    const int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+extern void newviewGTRGAMMAPROT_AVX(int tipCase,
+                             double *x1, double *x2, double *x3, double *extEV, double *tipVector,
+                             int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n,
+                             double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+extern void newviewGTRCATPROT_AVX(int tipCase, double *extEV,
+                           int *cptr,
+                           double *x1, double *x2, double *x3, double *tipVector,
+                           int *ex3, unsigned char *tipX1, unsigned char *tipX2,
+                           int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean useFastScaling);
+
+#endif
+
+extern int virtual_width( int n );
+extern void computeAllAncestralVectors(nodeptr p, pllInstance *tr, partitionList *pr);
+
+#endif /* PLLINTERNAL_H_ */
diff --git a/pllrepo/src/pthread.h b/pllrepo/src/pthread.h
new file mode 100644
index 0000000..b4072f7
--- /dev/null
+++ b/pllrepo/src/pthread.h
@@ -0,0 +1,1368 @@
+/* This is an implementation of the threads API of POSIX 1003.1-2001.
+ *
+ * --------------------------------------------------------------------------
+ *
+ *      Pthreads-win32 - POSIX Threads Library for Win32
+ *      Copyright(C) 1998 John E. Bossom
+ *      Copyright(C) 1999,2005 Pthreads-win32 contributors
+ * 
+ *      Contact Email: rpj at callisto.canberra.edu.au
+ * 
+ *      The current list of contributors is contained
+ *      in the file CONTRIBUTORS included with the source
+ *      code distribution. The list can also be seen at the
+ *      following World Wide Web location:
+ *      http://sources.redhat.com/pthreads-win32/contributors.html
+ * 
+ *      This library is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU Lesser General Public
+ *      License as published by the Free Software Foundation; either
+ *      version 2 of the License, or (at your option) any later version.
+ * 
+ *      This library is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *      Lesser General Public License for more details.
+ * 
+ *      You should have received a copy of the GNU Lesser General Public
+ *      License along with this library in the file COPYING.LIB;
+ *      if not, write to the Free Software Foundation, Inc.,
+ *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#if !defined( PTHREAD_H )
+#define PTHREAD_H
+
+/*
+ * See the README file for an explanation of the pthreads-win32 version
+ * numbering scheme and how the DLL is named etc.
+ */
+#define PTW32_VERSION 2,9,1,0
+#define PTW32_VERSION_STRING "2, 9, 1, 0\0"
+
+/* There are three implementations of cancel cleanup.
+ * Note that pthread.h is included in both application
+ * compilation units and also internally for the library.
+ * The code here and within the library aims to work
+ * for all reasonable combinations of environments.
+ *
+ * The three implementations are:
+ *
+ *   WIN32 SEH
+ *   C
+ *   C++
+ *
+ * Please note that exiting a push/pop block via
+ * "return", "exit", "break", or "continue" will
+ * lead to different behaviour amongst applications
+ * depending upon whether the library was built
+ * using SEH, C++, or C. For example, a library built
+ * with SEH will call the cleanup routine, while both
+ * C++ and C built versions will not.
+ */
+
+/*
+ * Define defaults for cleanup code.
+ * Note: Unless the build explicitly defines one of the following, then
+ * we default to standard C style cleanup. This style uses setjmp/longjmp
+ * in the cancelation and thread exit implementations and therefore won't
+ * do stack unwinding if linked to applications that have it (e.g.
+ * C++ apps). This is currently consistent with most/all commercial Unix
+ * POSIX threads implementations.
+ */
+#if !defined( __CLEANUP_SEH ) && !defined( __CLEANUP_CXX ) && !defined( __CLEANUP_C )
+# define __CLEANUP_C
+#endif
+
+#if defined( __CLEANUP_SEH ) && ( !defined( _MSC_VER ) && !defined(PTW32_RC_MSC))
+#error ERROR [__FILE__, line __LINE__]: SEH is not supported for this compiler.
+#endif
+
+/*
+ * Stop here if we are being included by the resource compiler.
+ */
+#if !defined(RC_INVOKED)
+
+#undef PTW32_LEVEL
+
+#if defined(_POSIX_SOURCE)
+#define PTW32_LEVEL 0
+/* Early POSIX */
+#endif
+
+#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309
+#undef PTW32_LEVEL
+#define PTW32_LEVEL 1
+/* Include 1b, 1c and 1d */
+#endif
+
+#if defined(INCLUDE_NP)
+#undef PTW32_LEVEL
+#define PTW32_LEVEL 2
+/* Include Non-Portable extensions */
+#endif
+
+#define PTW32_LEVEL_MAX 3
+
+#if ( defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112 )  || !defined(PTW32_LEVEL)
+#define PTW32_LEVEL PTW32_LEVEL_MAX
+/* Include everything */
+#endif
+
+#if defined(_UWIN)
+#   define HAVE_STRUCT_TIMESPEC 1
+#   define HAVE_SIGNAL_H        1
+#   undef HAVE_PTW32_CONFIG_H
+#   pragma comment(lib, "pthread")
+#endif
+
+/*
+ * -------------------------------------------------------------
+ *
+ *
+ * Module: pthread.h
+ *
+ * Purpose:
+ *      Provides an implementation of PThreads based upon the
+ *      standard:
+ *
+ *              POSIX 1003.1-2001
+ *  and
+ *    The Single Unix Specification version 3
+ *
+ *    (these two are equivalent)
+ *
+ *      in order to enhance code portability between Windows,
+ *  various commercial Unix implementations, and Linux.
+ *
+ *      See the ANNOUNCE file for a full list of conforming
+ *      routines and defined constants, and a list of missing
+ *      routines and constants not defined in this implementation.
+ *
+ * Authors:
+ *      There have been many contributors to this library.
+ *      The initial implementation was contributed by
+ *      John Bossom, and several others have provided major
+ *      sections or revisions of parts of the implementation.
+ *      Often significant effort has been contributed to
+ *      find and fix important bugs and other problems to
+ *      improve the reliability of the library, which sometimes
+ *      is not reflected in the amount of code which changed as
+ *      result.
+ *      As much as possible, the contributors are acknowledged
+ *      in the ChangeLog file in the source code distribution
+ *      where their changes are noted in detail.
+ *
+ *      Contributors are listed in the CONTRIBUTORS file.
+ *
+ *      As usual, all bouquets go to the contributors, and all
+ *      brickbats go to the project maintainer.
+ *
+ * Maintainer:
+ *      The code base for this project is coordinated and
+ *      eventually pre-tested, packaged, and made available by
+ *
+ *              Ross Johnson <rpj at callisto.canberra.edu.au>
+ *
+ * QA Testers:
+ *      Ultimately, the library is tested in the real world by
+ *      a host of competent and demanding scientists and
+ *      engineers who report bugs and/or provide solutions
+ *      which are then fixed or incorporated into subsequent
+ *      versions of the library. Each time a bug is fixed, a
+ *      test case is written to prove the fix and ensure
+ *      that later changes to the code don't reintroduce the
+ *      same error. The number of test cases is slowly growing
+ *      and therefore so is the code reliability.
+ *
+ * Compliance:
+ *      See the file ANNOUNCE for the list of implemented
+ *      and not-implemented routines and defined options.
+ *      Of course, these are all defined in this file as well.
+ *
+ * Web site:
+ *      The source code and other information about this library
+ *      are available from
+ *
+ *              http://sources.redhat.com/pthreads-win32/
+ *
+ * -------------------------------------------------------------
+ */
+
+/* Try to avoid including windows.h */
+#if (defined(__MINGW64__) || defined(__MINGW32__)) && defined(__cplusplus)
+#define PTW32_INCLUDE_WINDOWS_H
+#endif
+
+#if defined(PTW32_INCLUDE_WINDOWS_H)
+#include <windows.h>
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER < 1300 || defined(__DMC__)
+/*
+ * VC++6.0 or early compiler's header has no DWORD_PTR type.
+ */
+typedef unsigned long DWORD_PTR;
+typedef unsigned long ULONG_PTR;
+#endif
+/*
+ * -----------------
+ * autoconf switches
+ * -----------------
+ */
+
+#if defined(HAVE_PTW32_CONFIG_H)
+#include "config.h"
+#endif /* HAVE_PTW32_CONFIG_H */
+
+#if !defined(NEED_FTIME)
+#include <time.h>
+#else /* NEED_FTIME */
+/* use native WIN32 time API */
+#endif /* NEED_FTIME */
+
+#if defined(HAVE_SIGNAL_H)
+#include <signal.h>
+#endif /* HAVE_SIGNAL_H */
+
+#include <limits.h>
+
+/*
+ * Boolean values to make us independent of system includes.
+ */
+enum {
+  PTW32_FALSE = 0,
+  PTW32_TRUE = (! PTW32_FALSE)
+};
+
+/*
+ * This is a duplicate of what is in the autoconf config.h,
+ * which is only used when building the pthread-win32 libraries.
+ */
+
+#if !defined(PTW32_CONFIG_H)
+#  if defined(WINCE)
+#    define NEED_ERRNO
+#    define NEED_SEM
+#  endif
+#  if defined(__MINGW64__)
+#    define HAVE_STRUCT_TIMESPEC
+#    define HAVE_MODE_T
+#  elif defined(_UWIN) || defined(__MINGW32__)
+#    define HAVE_MODE_T
+#  endif
+#endif
+
+/*
+ *
+ */
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+#if defined(NEED_ERRNO)
+#include "need_errno.h"
+#else
+#include <errno.h>
+#endif
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+/*
+ * Several systems don't define some error numbers.
+ */
+#if !defined(ENOTSUP)
+#  define ENOTSUP 48   /* This is the value in Solaris. */
+#endif
+
+#if !defined(ETIMEDOUT)
+#  define ETIMEDOUT 10060 /* Same as WSAETIMEDOUT */
+#endif
+
+#if !defined(ENOSYS)
+#  define ENOSYS 140     /* Semi-arbitrary value */
+#endif
+
+#if !defined(EDEADLK)
+#  if defined(EDEADLOCK)
+#    define EDEADLK EDEADLOCK
+#  else
+#    define EDEADLK 36     /* This is the value in MSVC. */
+#  endif
+#endif
+
+/* POSIX 2008 - related to robust mutexes */
+#if !defined(EOWNERDEAD)
+#  define EOWNERDEAD 43
+#endif
+#if !defined(ENOTRECOVERABLE)
+#  define ENOTRECOVERABLE 44
+#endif
+
+#include <sched.h>
+
+/*
+ * To avoid including windows.h we define only those things that we
+ * actually need from it.
+ */
+#if !defined(PTW32_INCLUDE_WINDOWS_H)
+#if !defined(HANDLE)
+# define PTW32__HANDLE_DEF
+# define HANDLE void *
+#endif
+#if !defined(DWORD)
+# define PTW32__DWORD_DEF
+# define DWORD unsigned long
+#endif
+#endif
+
+#if !defined(HAVE_STRUCT_TIMESPEC)
+#define HAVE_STRUCT_TIMESPEC
+#if !defined(_TIMESPEC_DEFINED)
+#define _TIMESPEC_DEFINED
+struct timespec {
+        time_t tv_sec;
+        long tv_nsec;
+};
+#endif /* _TIMESPEC_DEFINED */
+#endif /* HAVE_STRUCT_TIMESPEC */
+
+#if !defined(SIG_BLOCK)
+#define SIG_BLOCK 0
+#endif /* SIG_BLOCK */
+
+#if !defined(SIG_UNBLOCK)
+#define SIG_UNBLOCK 1
+#endif /* SIG_UNBLOCK */
+
+#if !defined(SIG_SETMASK)
+#define SIG_SETMASK 2
+#endif /* SIG_SETMASK */
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif                          /* __cplusplus */
+
+/*
+ * -------------------------------------------------------------
+ *
+ * POSIX 1003.1-2001 Options
+ * =========================
+ *
+ * Options are normally set in <unistd.h>, which is not provided
+ * with pthreads-win32.
+ *
+ * For conformance with the Single Unix Specification (version 3), all of the
+ * options below are defined, and have a value of either -1 (not supported)
+ * or 200112L (supported). Note: the #defines below actually use 200809L.
+ *
+ * These options can neither be left undefined nor have a value of 0, because
+ * either indicates that sysconf(), which is not implemented, may be used at
+ * runtime to check the status of the option.
+ *
+ * _POSIX_THREADS (== 200112L)
+ *                      If == 200112L, you can use threads
+ *
+ * _POSIX_THREAD_ATTR_STACKSIZE (== 200112L)
+ *                      If == 200112L, you can control the size of a thread's
+ *                      stack
+ *                              pthread_attr_getstacksize
+ *                              pthread_attr_setstacksize
+ *
+ * _POSIX_THREAD_ATTR_STACKADDR (== -1)
+ *                      If == 200112L, you can allocate and control a thread's
+ *                      stack. If not supported, the following functions
+ *                      will return ENOSYS, indicating they are not
+ *                      supported:
+ *                              pthread_attr_getstackaddr
+ *                              pthread_attr_setstackaddr
+ *
+ * _POSIX_THREAD_PRIORITY_SCHEDULING (== -1)
+ *                      If == 200112L, you can use realtime scheduling.
+ *                      This option indicates that the behaviour of some
+ *                      implemented functions conforms to the additional TPS
+ *                      requirements in the standard. E.g. rwlocks favour
+ *                      writers over readers when threads have equal priority.
+ *
+ * _POSIX_THREAD_PRIO_INHERIT (== -1)
+ *                      If == 200112L, you can create priority inheritance
+ *                      mutexes.
+ *                              pthread_mutexattr_getprotocol +
+ *                              pthread_mutexattr_setprotocol +
+ *
+ * _POSIX_THREAD_PRIO_PROTECT (== -1)
+ *                      If == 200112L, you can create priority ceiling mutexes
+ *                      Indicates the availability of:
+ *                              pthread_mutex_getprioceiling
+ *                              pthread_mutex_setprioceiling
+ *                              pthread_mutexattr_getprioceiling
+ *                              pthread_mutexattr_getprotocol     +
+ *                              pthread_mutexattr_setprioceiling
+ *                              pthread_mutexattr_setprotocol     +
+ *
+ * _POSIX_THREAD_PROCESS_SHARED (== -1)
+ *                      If set, you can create mutexes and condition
+ *                      variables that can be shared with another
+ *                      process. If set, indicates the availability
+ *                      of:
+ *                              pthread_mutexattr_getpshared
+ *                              pthread_mutexattr_setpshared
+ *                              pthread_condattr_getpshared
+ *                              pthread_condattr_setpshared
+ *
+ * _POSIX_THREAD_SAFE_FUNCTIONS (== 200112L)
+ *                      If == 200112L you can use the special *_r library
+ *                      functions that provide thread-safe behaviour
+ *
+ * _POSIX_READER_WRITER_LOCKS (== 200112L)
+ *                      If == 200112L, you can use read/write locks
+ *
+ * _POSIX_SPIN_LOCKS (== 200112L)
+ *                      If == 200112L, you can use spin locks
+ *
+ * _POSIX_BARRIERS (== 200112L)
+ *                      If == 200112L, you can use barriers
+ *
+ *      + These functions provide both 'inherit' and/or
+ *        'protect' protocol, based upon these macro
+ *        settings.
+ *
+ * -------------------------------------------------------------
+ */
+
+/*
+ * POSIX Options
+ */
+#undef _POSIX_THREADS
+#define _POSIX_THREADS 200809L
+
+#undef _POSIX_READER_WRITER_LOCKS
+#define _POSIX_READER_WRITER_LOCKS 200809L
+
+#undef _POSIX_SPIN_LOCKS
+#define _POSIX_SPIN_LOCKS 200809L
+
+#undef _POSIX_BARRIERS
+#define _POSIX_BARRIERS 200809L
+
+#undef _POSIX_THREAD_SAFE_FUNCTIONS
+#define _POSIX_THREAD_SAFE_FUNCTIONS 200809L
+
+#undef _POSIX_THREAD_ATTR_STACKSIZE
+#define _POSIX_THREAD_ATTR_STACKSIZE 200809L
+
+/*
+ * The following options are not supported
+ */
+#undef _POSIX_THREAD_ATTR_STACKADDR
+#define _POSIX_THREAD_ATTR_STACKADDR -1
+
+#undef _POSIX_THREAD_PRIO_INHERIT
+#define _POSIX_THREAD_PRIO_INHERIT -1
+
+#undef _POSIX_THREAD_PRIO_PROTECT
+#define _POSIX_THREAD_PRIO_PROTECT -1
+
+/* TPS is not fully supported.  */
+#undef _POSIX_THREAD_PRIORITY_SCHEDULING
+#define _POSIX_THREAD_PRIORITY_SCHEDULING -1
+
+#undef _POSIX_THREAD_PROCESS_SHARED
+#define _POSIX_THREAD_PROCESS_SHARED -1
+
+
+/*
+ * POSIX 1003.1-2001 Limits
+ * ===========================
+ *
+ * These limits are normally set in <limits.h>, which is not provided with
+ * pthreads-win32.
+ *
+ * PTHREAD_DESTRUCTOR_ITERATIONS
+ *                      Maximum number of attempts to destroy
+ *                      a thread's thread-specific data on
+ *                      termination (must be at least 4)
+ *
+ * PTHREAD_KEYS_MAX
+ *                      Maximum number of thread-specific data keys
+ *                      available per process (must be at least 128)
+ *
+ * PTHREAD_STACK_MIN
+ *                      Minimum supported stack size for a thread
+ *
+ * PTHREAD_THREADS_MAX
+ *                      Maximum number of threads supported per
+ *                      process (must be at least 64).
+ *
+ * SEM_NSEMS_MAX
+ *                      The maximum number of semaphores a process can have.
+ *                      (must be at least 256)
+ *
+ * SEM_VALUE_MAX
+ *                      The maximum value a semaphore can have.
+ *                      (must be at least 32767)
+ *
+ */
+#undef _POSIX_THREAD_DESTRUCTOR_ITERATIONS
+#define _POSIX_THREAD_DESTRUCTOR_ITERATIONS     4
+
+#undef PTHREAD_DESTRUCTOR_ITERATIONS
+#define PTHREAD_DESTRUCTOR_ITERATIONS           _POSIX_THREAD_DESTRUCTOR_ITERATIONS
+
+#undef _POSIX_THREAD_KEYS_MAX
+#define _POSIX_THREAD_KEYS_MAX                  128
+
+#undef PTHREAD_KEYS_MAX
+#define PTHREAD_KEYS_MAX                        _POSIX_THREAD_KEYS_MAX
+
+#undef PTHREAD_STACK_MIN
+#define PTHREAD_STACK_MIN                       0
+
+#undef _POSIX_THREAD_THREADS_MAX
+#define _POSIX_THREAD_THREADS_MAX               64
+
+  /* Arbitrary value */
+#undef PTHREAD_THREADS_MAX
+#define PTHREAD_THREADS_MAX                     2019
+
+#undef _POSIX_SEM_NSEMS_MAX
+#define _POSIX_SEM_NSEMS_MAX                    256
+
+  /* Arbitrary value */
+#undef SEM_NSEMS_MAX
+#define SEM_NSEMS_MAX                           1024
+
+#undef _POSIX_SEM_VALUE_MAX
+#define _POSIX_SEM_VALUE_MAX                    32767
+
+#undef SEM_VALUE_MAX
+#define SEM_VALUE_MAX                           INT_MAX
+
+
+#if defined(__GNUC__) && !defined(__declspec)
+# error Please upgrade your GNU compiler to one that supports __declspec.
+#endif
+
+/*
+ * When building the library, you should define PTW32_BUILD so that
+ * the variables/functions are exported correctly. When using the library,
+ * do NOT define PTW32_BUILD, and then the variables/functions will
+ * be imported correctly.
+ */
+#if !defined(PTW32_STATIC_LIB)
+#  if defined(PTW32_BUILD)
+#    define PTW32_DLLPORT __declspec (dllexport)
+#  else
+#    define PTW32_DLLPORT __declspec (dllimport)
+#  endif
+#else
+#  define PTW32_DLLPORT
+#endif
+
+/*
+ * The Open Watcom C/C++ compiler uses a non-standard calling convention
+ * that passes function args in registers unless __cdecl is explicitly specified
+ * in exposed function prototypes.
+ *
+ * We force all calls to cdecl even though this could slow Watcom code down
+ * slightly. If you know that the Watcom compiler will be used to build both
+ * the DLL and application, then you can probably define this as a null string.
+ * Remember that pthread.h (this file) is used for both the DLL and application builds.
+ */
+#define PTW32_CDECL __cdecl
+
+#if defined(_UWIN) && PTW32_LEVEL >= PTW32_LEVEL_MAX
+#   include     <sys/types.h>
+#else
+/*
+ * Generic handle type - intended to extend uniqueness beyond
+ * that available with a simple pointer. It should scale for either
+ * IA-32 or IA-64.
+ */
+typedef struct {
+    void * p;                   /* Pointer to actual object */
+    unsigned int x;             /* Extra information - reuse count etc */
+} ptw32_handle_t;
+
+typedef ptw32_handle_t pthread_t;
+typedef struct pthread_attr_t_ * pthread_attr_t;
+typedef struct pthread_once_t_ pthread_once_t;
+typedef struct pthread_key_t_ * pthread_key_t;
+typedef struct pthread_mutex_t_ * pthread_mutex_t;
+typedef struct pthread_mutexattr_t_ * pthread_mutexattr_t;
+typedef struct pthread_cond_t_ * pthread_cond_t;
+typedef struct pthread_condattr_t_ * pthread_condattr_t;
+#endif
+typedef struct pthread_rwlock_t_ * pthread_rwlock_t;
+typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t;
+typedef struct pthread_spinlock_t_ * pthread_spinlock_t;
+typedef struct pthread_barrier_t_ * pthread_barrier_t;
+typedef struct pthread_barrierattr_t_ * pthread_barrierattr_t;
+
+/*
+ * ====================
+ * ====================
+ * POSIX Threads
+ * ====================
+ * ====================
+ */
+
+enum {
+/*
+ * pthread_attr_{get,set}detachstate
+ */
+  PTHREAD_CREATE_JOINABLE       = 0,  /* Default */
+  PTHREAD_CREATE_DETACHED       = 1,
+
+/*
+ * pthread_attr_{get,set}inheritsched
+ */
+  PTHREAD_INHERIT_SCHED         = 0,
+  PTHREAD_EXPLICIT_SCHED        = 1,  /* Default */
+
+/*
+ * pthread_attr_{get,set}scope
+ */
+  PTHREAD_SCOPE_PROCESS         = 0,
+  PTHREAD_SCOPE_SYSTEM          = 1,  /* Default */
+
+/*
+ * pthread_setcancelstate parameters
+ */
+  PTHREAD_CANCEL_ENABLE         = 0,  /* Default */
+  PTHREAD_CANCEL_DISABLE        = 1,
+
+/*
+ * pthread_setcanceltype parameters
+ */
+  PTHREAD_CANCEL_ASYNCHRONOUS   = 0,
+  PTHREAD_CANCEL_DEFERRED       = 1,  /* Default */
+
+/*
+ * pthread_mutexattr_{get,set}pshared
+ * pthread_condattr_{get,set}pshared
+ */
+  PTHREAD_PROCESS_PRIVATE       = 0,
+  PTHREAD_PROCESS_SHARED        = 1,
+
+/*
+ * pthread_mutexattr_{get,set}robust
+ */
+  PTHREAD_MUTEX_STALLED         = 0,  /* Default */
+  PTHREAD_MUTEX_ROBUST          = 1,
+
+/*
+ * pthread_barrier_wait
+ */
+  PTHREAD_BARRIER_SERIAL_THREAD = -1
+};
+
+/*
+ * ====================
+ * ====================
+ * Cancelation
+ * ====================
+ * ====================
+ */
+#define PTHREAD_CANCELED       ((void *)(size_t) -1)
+
+
+/*
+ * ====================
+ * ====================
+ * Once Key
+ * ====================
+ * ====================
+ */
+#define PTHREAD_ONCE_INIT       { PTW32_FALSE, 0, 0, 0}
+
+struct pthread_once_t_
+{
+  int          done;        /* indicates if user function has been executed */
+  void *       lock;        /* 0 in PTHREAD_ONCE_INIT; NOTE(review): usage is inside the library, not visible here */
+  int          reserved1;   /* 0 in PTHREAD_ONCE_INIT */
+  int          reserved2;   /* 0 in PTHREAD_ONCE_INIT */
+};
+
+
+/*
+ * ====================
+ * ====================
+ * Object initialisers
+ * ====================
+ * ====================
+ */
+#define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -1)
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -2)
+#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER ((pthread_mutex_t)(size_t) -3)
+
+/*
+ * Compatibility with LinuxThreads
+ */
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP PTHREAD_RECURSIVE_MUTEX_INITIALIZER
+#define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP PTHREAD_ERRORCHECK_MUTEX_INITIALIZER
+
+#define PTHREAD_COND_INITIALIZER ((pthread_cond_t)(size_t) -1)
+
+#define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t)(size_t) -1)
+
+#define PTHREAD_SPINLOCK_INITIALIZER ((pthread_spinlock_t)(size_t) -1)
+
+
+/*
+ * Mutex types.
+ */
+enum
+{
+  /* Compatibility with LinuxThreads */
+  PTHREAD_MUTEX_FAST_NP,
+  PTHREAD_MUTEX_RECURSIVE_NP,
+  PTHREAD_MUTEX_ERRORCHECK_NP,
+  PTHREAD_MUTEX_TIMED_NP = PTHREAD_MUTEX_FAST_NP,
+  PTHREAD_MUTEX_ADAPTIVE_NP = PTHREAD_MUTEX_FAST_NP,
+  /* For compatibility with POSIX */
+  PTHREAD_MUTEX_NORMAL = PTHREAD_MUTEX_FAST_NP,
+  PTHREAD_MUTEX_RECURSIVE = PTHREAD_MUTEX_RECURSIVE_NP,
+  PTHREAD_MUTEX_ERRORCHECK = PTHREAD_MUTEX_ERRORCHECK_NP,
+  PTHREAD_MUTEX_DEFAULT = PTHREAD_MUTEX_NORMAL
+};
+
+
+typedef struct ptw32_cleanup_t ptw32_cleanup_t;
+
+#if defined(_MSC_VER)
+/* Disable MSVC 'anachronism used' warning */
+#pragma warning( disable : 4229 )
+#endif
+
+typedef void (* PTW32_CDECL ptw32_cleanup_callback_t)(void *);
+
+#if defined(_MSC_VER)
+#pragma warning( default : 4229 )
+#endif
+
+struct ptw32_cleanup_t
+{
+  ptw32_cleanup_callback_t routine;   /* cleanup handler; the SEH pop macro calls it as routine(arg) */
+  void *arg;                          /* argument passed to routine */
+  struct ptw32_cleanup_t *prev;       /* NOTE(review): presumably the previously pushed record -- confirm */
+};
+
+#if defined(__CLEANUP_SEH)
+        /*
+         * WIN32 SEH version of cancel cleanup.
+         */
+
+#define pthread_cleanup_push( _rout, _arg ) \
+        { \
+            ptw32_cleanup_t     _cleanup; \
+            \
+        _cleanup.routine        = (ptw32_cleanup_callback_t)(_rout); \
+            _cleanup.arg        = (_arg); \
+            __try \
+              { \
+
+#define pthread_cleanup_pop( _execute ) \
+              } \
+            __finally \
+                { \
+                    if( _execute || AbnormalTermination()) \
+                      { \
+                          (*(_cleanup.routine))( _cleanup.arg ); \
+                      } \
+                } \
+        }
+
+#else /* __CLEANUP_SEH */
+
+#if defined(__CLEANUP_C)
+
+        /*
+         * C implementation of PThreads cancel cleanup
+         */
+
+#define pthread_cleanup_push( _rout, _arg ) \
+        { \
+            ptw32_cleanup_t     _cleanup; \
+            \
+            ptw32_push_cleanup( &_cleanup, (ptw32_cleanup_callback_t) (_rout), (_arg) ); \
+
+#define pthread_cleanup_pop( _execute ) \
+            (void) ptw32_pop_cleanup( _execute ); \
+        }
+
+#else /* __CLEANUP_C */
+
+#if defined(__CLEANUP_CXX)
+
+        /*
+         * C++ version of cancel cleanup.
+         * - John E. Bossom.
+         */
+
+        class PThreadCleanup {
+          /*
+           * PThreadCleanup
+           *
+           * Purpose
+           *      This class is a C++ helper class that is
+           *      used to implement pthread_cleanup_push/
+           *      pthread_cleanup_pop.
+           *      The destructor of this class automatically
+           *      pops the pushed cleanup routine regardless
+           *      of how the code exits the scope
+           *      (i.e. such as by an exception)
+           */
+      ptw32_cleanup_callback_t cleanUpRout;
+          void    *       obj;
+          int             executeIt;
+
+        public:
+          PThreadCleanup() :
+            cleanUpRout( 0 ),
+            obj( 0 ),
+            executeIt( 0 )
+            /*
+             * No cleanup performed
+             */
+            {
+            }
+
+          PThreadCleanup(
+             ptw32_cleanup_callback_t routine,
+                         void    *       arg ) :
+            cleanUpRout( routine ),
+            obj( arg ),
+            executeIt( 1 )
+            /*
+             * Registers a cleanup routine for 'arg'
+             */
+            {
+            }
+
+          ~PThreadCleanup()
+            {
+              if ( executeIt && ((void *) cleanUpRout != (void *) 0) )
+                {
+                  (void) (*cleanUpRout)( obj );
+                }
+            }
+
+          void execute( int exec )
+            {
+              executeIt = exec;
+            }
+        };
+
+        /*
+         * C++ implementation of PThreads cancel cleanup;
+         * This implementation takes advantage of a helper
+         * class whose destructor automatically calls the
+         * cleanup routine if we exit our scope weirdly
+         */
+#define pthread_cleanup_push( _rout, _arg ) \
+        { \
+            PThreadCleanup  cleanup((ptw32_cleanup_callback_t)(_rout), \
+                                    (void *) (_arg) );
+
+#define pthread_cleanup_pop( _execute ) \
+            cleanup.execute( _execute ); \
+        }
+
+#else
+
+#error ERROR [__FILE__, line __LINE__]: Cleanup type undefined.
+
+#endif /* __CLEANUP_CXX */
+
+#endif /* __CLEANUP_C */
+
+#endif /* __CLEANUP_SEH */
+
+/*
+ * ===============
+ * ===============
+ * Methods
+ * ===============
+ * ===============
+ */
+
+/*
+ * PThread Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_init (pthread_attr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_destroy (pthread_attr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getdetachstate (const pthread_attr_t * attr,
+                                         int *detachstate);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstackaddr (const pthread_attr_t * attr,
+                                       void **stackaddr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstacksize (const pthread_attr_t * attr,
+                                       size_t * stacksize);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setdetachstate (pthread_attr_t * attr,
+                                         int detachstate);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstackaddr (pthread_attr_t * attr,
+                                       void *stackaddr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstacksize (pthread_attr_t * attr,
+                                       size_t stacksize);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getschedparam (const pthread_attr_t *attr,
+                                        struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedparam (pthread_attr_t *attr,
+                                        const struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedpolicy (pthread_attr_t *,
+                                         int);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getschedpolicy (const pthread_attr_t *,
+                                         int *);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setinheritsched(pthread_attr_t * attr,
+                                         int inheritsched);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getinheritsched(const pthread_attr_t * attr,
+                                         int * inheritsched);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_setscope (pthread_attr_t *,
+                                   int);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_attr_getscope (const pthread_attr_t *,
+                                   int *);
+
+/*
+ * PThread Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_create (pthread_t * tid,
+                            const pthread_attr_t * attr,
+                            void *(PTW32_CDECL *start) (void *),
+                            void *arg);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_detach (pthread_t tid);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_equal (pthread_t t1,
+                           pthread_t t2);
+
+PTW32_DLLPORT void PTW32_CDECL pthread_exit (void *value_ptr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_join (pthread_t thread,
+                          void **value_ptr);
+
+PTW32_DLLPORT pthread_t PTW32_CDECL pthread_self (void);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cancel (pthread_t thread);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setcancelstate (int state,
+                                    int *oldstate);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setcanceltype (int type,
+                                   int *oldtype);
+
+PTW32_DLLPORT void PTW32_CDECL pthread_testcancel (void);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_once (pthread_once_t * once_control,
+                          void (PTW32_CDECL *init_routine) (void));
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+PTW32_DLLPORT ptw32_cleanup_t * PTW32_CDECL ptw32_pop_cleanup (int execute);
+
+PTW32_DLLPORT void PTW32_CDECL ptw32_push_cleanup (ptw32_cleanup_t * cleanup,
+                                 ptw32_cleanup_callback_t routine,
+                                 void *arg);
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+/*
+ * Thread Specific Data Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_key_create (pthread_key_t * key,
+                                void (PTW32_CDECL *destructor) (void *));
+
+PTW32_DLLPORT int PTW32_CDECL pthread_key_delete (pthread_key_t key);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setspecific (pthread_key_t key,
+                                 const void *value);
+
+PTW32_DLLPORT void * PTW32_CDECL pthread_getspecific (pthread_key_t key);
+
+
+/*
+ * Mutex Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_init (pthread_mutexattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_destroy (pthread_mutexattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getpshared (const pthread_mutexattr_t
+                                          * attr,
+                                          int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setpshared (pthread_mutexattr_t * attr,
+                                          int pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_settype (pthread_mutexattr_t * attr, int kind);
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_gettype (const pthread_mutexattr_t * attr, int *kind);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setrobust(
+                                           pthread_mutexattr_t *attr,
+                                           int robust);
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getrobust(
+                                           const pthread_mutexattr_t * attr,
+                                           int * robust);
+
+/*
+ * Barrier Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_init (pthread_barrierattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_destroy (pthread_barrierattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_getpshared (const pthread_barrierattr_t
+                                            * attr,
+                                            int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_setpshared (pthread_barrierattr_t * attr,
+                                            int pshared);
+
+/*
+ * Mutex Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_init (pthread_mutex_t * mutex,
+                                const pthread_mutexattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_destroy (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_lock (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_timedlock(pthread_mutex_t * mutex,
+                                    const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_trylock (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_unlock (pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_mutex_consistent (pthread_mutex_t * mutex);
+
+/*
+ * Spinlock Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_init (pthread_spinlock_t * lock, int pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_destroy (pthread_spinlock_t * lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_lock (pthread_spinlock_t * lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_trylock (pthread_spinlock_t * lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_spin_unlock (pthread_spinlock_t * lock);
+
+/*
+ * Barrier Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_barrier_init (pthread_barrier_t * barrier,
+                                  const pthread_barrierattr_t * attr,
+                                  unsigned int count);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrier_destroy (pthread_barrier_t * barrier);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_barrier_wait (pthread_barrier_t * barrier);
+
+/*
+ * Condition Variable Attribute Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_init (pthread_condattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_destroy (pthread_condattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_getpshared (const pthread_condattr_t * attr,
+                                         int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_condattr_setpshared (pthread_condattr_t * attr,
+                                         int pshared);
+
+/*
+ * Condition Variable Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_init (pthread_cond_t * cond,
+                               const pthread_condattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_destroy (pthread_cond_t * cond);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_wait (pthread_cond_t * cond,
+                               pthread_mutex_t * mutex);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_timedwait (pthread_cond_t * cond,
+                                    pthread_mutex_t * mutex,
+                                    const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_signal (pthread_cond_t * cond);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_cond_broadcast (pthread_cond_t * cond);
+
+/*
+ * Scheduling
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_setschedparam (pthread_t thread,
+                                   int policy,
+                                   const struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_getschedparam (pthread_t thread,
+                                   int *policy,
+                                   struct sched_param *param);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_setconcurrency (int);
+ 
+PTW32_DLLPORT int PTW32_CDECL pthread_getconcurrency (void);
+
+/*
+ * Read-Write Lock Functions
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_init(pthread_rwlock_t *lock,
+                                const pthread_rwlockattr_t *attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_destroy(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_tryrdlock(pthread_rwlock_t *);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_trywrlock(pthread_rwlock_t *);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_rdlock(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedrdlock(pthread_rwlock_t *lock,
+                                       const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_wrlock(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedwrlock(pthread_rwlock_t *lock,
+                                       const struct timespec *abstime);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_unlock(pthread_rwlock_t *lock);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_init (pthread_rwlockattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_destroy (pthread_rwlockattr_t * attr);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_getpshared (const pthread_rwlockattr_t * attr,
+                                           int *pshared);
+
+PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_setpshared (pthread_rwlockattr_t * attr,
+                                           int pshared);
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX - 1
+
+/*
+ * Signal Functions. Should be defined in <signal.h> but MSVC and MinGW32
+ * already provide a signal.h that doesn't define these.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_kill(pthread_t thread, int sig);
+
+/*
+ * Non-portable functions
+ */
+
+/*
+ * Compatibility with Linux.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setkind_np(pthread_mutexattr_t * attr,
+                                         int kind);
+PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getkind_np(pthread_mutexattr_t * attr,
+                                         int *kind);
+
+/*
+ * Possibly supported by other POSIX threads implementations
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_delay_np (struct timespec * interval);
+PTW32_DLLPORT int PTW32_CDECL pthread_num_processors_np(void);
+PTW32_DLLPORT unsigned __int64 PTW32_CDECL pthread_getunique_np(pthread_t thread);
+
+/*
+ * Useful if an application wants to statically link
+ * the lib rather than load the DLL at run-time.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_attach_np(void);
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_detach_np(void);
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_attach_np(void);
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_detach_np(void);
+
+/*
+ * Features that are auto-detected at load/run time.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthread_win32_test_features_np(int);
+enum ptw32_features {
+  PTW32_SYSTEM_INTERLOCKED_COMPARE_EXCHANGE = 0x0001, /* System provides it. */
+  PTW32_ALERTABLE_ASYNC_CANCEL              = 0x0002  /* Can cancel blocked threads. */
+};
+
+/*
+ * Register a system time change with the library.
+ * Causes the library to perform various functions
+ * in response to the change. Should be called whenever
+ * the application's top level window receives a
+ * WM_TIMECHANGE message. It can be passed directly to
+ * pthread_create() as a new thread if desired.
+ */
+PTW32_DLLPORT void * PTW32_CDECL pthread_timechange_handler_np(void *);
+
+#endif /*PTW32_LEVEL >= PTW32_LEVEL_MAX - 1 */
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+
+/*
+ * Returns the Win32 HANDLE for the POSIX thread.
+ */
+PTW32_DLLPORT HANDLE PTW32_CDECL pthread_getw32threadhandle_np(pthread_t thread);
+/*
+ * Returns the win32 thread ID for POSIX thread.
+ */
+PTW32_DLLPORT DWORD PTW32_CDECL pthread_getw32threadid_np (pthread_t thread);
+
+
+/*
+ * Protected Methods
+ *
+ * This function blocks until the given WIN32 handle
+ * is signaled or pthread_cancel had been called.
+ * This function allows the caller to hook into the
+ * PThreads cancel mechanism. It is implemented using
+ *
+ *              WaitForMultipleObjects
+ *
+ * on 'waitHandle' and a manually reset WIN32 Event
+ * used to implement pthread_cancel. The 'timeout'
+ * argument to TimedWait is simply passed to
+ * WaitForMultipleObjects.
+ */
+PTW32_DLLPORT int PTW32_CDECL pthreadCancelableWait (HANDLE waitHandle);
+PTW32_DLLPORT int PTW32_CDECL pthreadCancelableTimedWait (HANDLE waitHandle,
+                                        DWORD timeout);
+
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+/*
+ * Thread-Safe C Runtime Library Mappings.
+ */
+#if !defined(_UWIN)
+#  if defined(NEED_ERRNO)
+     PTW32_DLLPORT int * PTW32_CDECL _errno( void );
+#  else
+#    if !defined(errno)
+#      if (defined(_MT) || defined(_DLL))
+         __declspec(dllimport) extern int * __cdecl _errno(void);
+#        define errno   (*_errno())
+#      endif
+#    endif
+#  endif
+#endif
+
+/*
+ * Some compiler environments don't define some things.
+ */
+#if defined(__BORLANDC__)
+#  define _ftime ftime
+#  define _timeb timeb
+#endif
+
+#if defined(__cplusplus)
+
+/*
+ * Internal exceptions
+ */
+class ptw32_exception {};
+class ptw32_exception_cancel : public ptw32_exception {};
+class ptw32_exception_exit   : public ptw32_exception {};
+
+#endif
+
+#if PTW32_LEVEL >= PTW32_LEVEL_MAX
+
+/* FIXME: This is only required if the library was built using SEH */
+/*
+ * Get internal SEH tag
+ */
+PTW32_DLLPORT DWORD PTW32_CDECL ptw32_get_exception_services_code(void);
+
+#endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */
+
+#if !defined(PTW32_BUILD)
+
+#if defined(__CLEANUP_SEH)
+
+/*
+ * Redefine the SEH __except keyword to ensure that applications
+ * propagate our internal exceptions up to the library's internal handlers.
+ */
+#define __except( E ) \
+        __except( ( GetExceptionCode() == ptw32_get_exception_services_code() ) \
+                 ? EXCEPTION_CONTINUE_SEARCH : ( E ) )
+
+#endif /* __CLEANUP_SEH */
+
+#if defined(__CLEANUP_CXX)
+
+/*
+ * Redefine the C++ catch keyword to ensure that applications
+ * propagate our internal exceptions up to the library's internal handlers.
+ */
+#if defined(_MSC_VER)
+        /*
+         * WARNING: Replace any 'catch( ... )' with 'PtW32CatchAll'
+         * if you want Pthread-Win32 cancelation and pthread_exit to work.
+         */
+
+#if !defined(PtW32NoCatchWarn)
+
+#pragma message("Specify \"/DPtW32NoCatchWarn\" compiler flag to skip this message.")
+#pragma message("------------------------------------------------------------------")
+#pragma message("When compiling applications with MSVC++ and C++ exception handling:")
+#pragma message("  Replace any 'catch( ... )' in routines called from POSIX threads")
+#pragma message("  with 'PtW32CatchAll' or 'CATCHALL' if you want POSIX thread")
+#pragma message("  cancelation and pthread_exit to work. For example:")
+#pragma message("")
+#pragma message("    #if defined(PtW32CatchAll)")
+#pragma message("      PtW32CatchAll")
+#pragma message("    #else")
+#pragma message("      catch(...)")
+#pragma message("    #endif")
+#pragma message("        {")
+#pragma message("          /* Catchall block processing */")
+#pragma message("        }")
+#pragma message("------------------------------------------------------------------")
+
+#endif
+
+#define PtW32CatchAll \
+        catch( ptw32_exception & ) { throw; } \
+        catch( ... )
+
+#else /* _MSC_VER */
+
+#define catch( E ) \
+        catch( ptw32_exception & ) { throw; } \
+        catch( E )
+
+#endif /* _MSC_VER */
+
+#endif /* __CLEANUP_CXX */
+
+#endif /* ! PTW32_BUILD */
+
+#if defined(__cplusplus)
+}                               /* End of extern "C" */
+#endif                          /* __cplusplus */
+
+#if defined(PTW32__HANDLE_DEF)
+# undef HANDLE
+#endif
+#if defined(PTW32__DWORD_DEF)
+# undef DWORD
+#endif
+
+#undef PTW32_LEVEL
+#undef PTW32_LEVEL_MAX
+
+#endif /* ! RC_INVOKED */
+
+#endif /* PTHREAD_H */
diff --git a/pllrepo/src/queue.c b/pllrepo/src/queue.c
new file mode 100644
index 0000000..eecf3fb
--- /dev/null
+++ b/pllrepo/src/queue.c
@@ -0,0 +1,96 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file queue.c
+ */
+#include <stdio.h>
+#include "queue.h"
+#include "mem_alloc.h"
+
+int /* 1 on success; 0 if q is NULL or the allocation fails */
+pllQueueInit (pllQueue ** q)
+{
+  if (!q) return (0);             /* same NULL guard as the other pllQueue* calls */
+
+  *q = (pllQueue *) rax_malloc (sizeof (pllQueue));
+  if (!*q) return (0);
+
+  (*q)->head = (*q)->tail = NULL; /* the new queue starts empty */
+  return (1);
+}
+
+int
+pllQueueSize (pllQueue * q)
+{
+  /* Walk the list from head and count the items; a NULL queue has size 0. */
+  struct pllQueueItem * cur = q ? q->head : NULL;
+  int count = 0;
+
+  for (; cur; cur = cur->next)
+    count++;
+
+  return (count);
+}
+
+int
+pllQueueRemove (pllQueue * q, void ** item)
+{
+  /* Pop the head element: store its payload in *item and free the list node.
+     Returns 1 on success, 0 when q is NULL or the queue is empty. */
+  struct pllQueueItem * front;
+
+  if (q == NULL || q->head == NULL)
+    return (0);
+  front   = q->head;
+  *item   = front->item;
+  q->head = front->next;
+  if (q->head == NULL)
+    q->tail = NULL;               /* removed the last element */
+  rax_free (front);
+  return (1);
+}
+
+int
+pllQueueAppend (pllQueue * q, void * item)
+{
+  /* Push item at the tail of the queue.
+     Returns 1 on success, 0 when q is NULL or the node allocation fails. */
+  struct pllQueueItem * node;
+
+  if (q == NULL)
+    return (0);
+  node = (struct pllQueueItem *) rax_malloc (sizeof (struct pllQueueItem));
+  if (node == NULL)
+    return (0);
+  node->next = NULL;
+  node->item = item;
+  if (q->head != NULL)
+    q->tail->next = node;         /* link behind the current last element */
+  else
+    q->head = node;               /* first element of an empty queue */
+  q->tail = node;
+  return (1);
+}
diff --git a/pllrepo/src/queue.h b/pllrepo/src/queue.h
new file mode 100644
index 0000000..b359c4a
--- /dev/null
+++ b/pllrepo/src/queue.h
@@ -0,0 +1,48 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file queue.h
+ */
+#ifndef __pll_QUEUE__
+#define __pll_QUEUE__
+
+struct pllQueueItem            /* one node of the singly linked list */
+{  
+  void * item;                 /* payload pointer handed to pllQueueAppend */
+  struct pllQueueItem * next;  /* following node, NULL for the last one */
+}; 
+   
+typedef struct                 /* FIFO queue handle */
+{  
+  struct pllQueueItem * head;  /* oldest element, removed first; NULL when empty */
+  struct pllQueueItem * tail;  /* newest element, appended last; NULL when empty */
+} pllQueue; 
+
+int pllQueueInit (pllQueue ** q);
+int pllQueueSize (pllQueue * q);
+int pllQueueRemove (pllQueue * q, void ** item);
+int pllQueueAppend (pllQueue * q, void * item);
+#endif
diff --git a/pllrepo/src/randomTree.c b/pllrepo/src/randomTree.c
new file mode 100644
index 0000000..c1d9af4
--- /dev/null
+++ b/pllrepo/src/randomTree.c
@@ -0,0 +1,177 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file randomTree.c
+ */
+#include "mem_alloc.h"
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+static void insertTaxon (nodeptr p, nodeptr q)
+{
+  /* Splice node p into the branch between q and q->back.  q->back is saved
+     first because, presumably, hookupDefault() rewires it. */
+  nodeptr oldBack = q->back;
+
+  hookupDefault(p->next, q);
+  hookupDefault(p->next->next, oldBack);
+}
+
+static nodeptr buildNewTip (pllInstance *tr, nodeptr p)
+{
+  /* Take the next unused node from tr->nodep, hook it to p and clear the
+     back pointers of its next and next->next slots.  Returns that node. */
+  nodeptr fresh = tr->nodep[tr->nextnode];
+
+  tr->nextnode += 1;
+  hookupDefault(p, fresh);
+  fresh->next->back = fresh->next->next->back = (nodeptr)NULL;
+  return fresh;
+}
+
+static void buildSimpleTreeRandom (pllInstance *tr, int ip, int iq, int ir)
+{
+  /* Build the initial three-taxon tree from the nodes indexed ip, iq and ir:
+     ip and iq are hooked together, then the node that buildNewTip() takes
+     for ir is inserted into that branch. */
+  nodeptr first = tr->nodep[ip];
+  nodeptr joint;
+  int lowest = PLL_MIN(ip, iq);
+
+  if (ir < lowest)
+    lowest = ir;
+
+  /* tr->start is the node with the smallest of the three indices */
+  tr->start = tr->nodep[lowest];
+  tr->ntips = 3;
+
+  hookupDefault(first, tr->nodep[iq]);
+
+  joint = buildNewTip(tr, tr->nodep[ir]);
+  insertTaxon(joint, first);
+}
+
+static int randomInt(int n, pllInstance *tr)
+{
+  /* Scale one randum() draw by n and truncate to int; the assert requires
+     the result to lie in [0, n). */
+  const int pick = (int)(randum(&tr->randomNumberSeed) * (double)n);
+
+  assert(0 <= pick && pick < n);
+  return pick;
+}
+
+void makePermutation(int *perm, int n, pllInstance *tr)
+{
+  /* Fill perm[1..n] (1-based; perm[0] is not touched) with 1..n and shuffle
+     it in place: position pos is swapped with a position chosen by
+     randomInt() among pos..n. */
+  int pos, saved;
+
+  for (pos = 1; pos <= n; pos++)
+    perm[pos] = pos;
+
+  for (pos = 1; pos <= n; pos++)
+    {
+      const int offset = randomInt(n + 1 - pos, tr);
+      const int other  = pos + offset;
+
+      assert(other <= n);
+      saved       = perm[pos];
+      perm[pos]   = perm[other];
+      perm[other] = saved;
+    }
+}
+
+static int markBranches(nodeptr *branches, nodeptr p, int *counter, int numsp)
+{
+  /* Store p->next and p->next->next of every inner node reached from p in
+     branches[], advancing *counter; returns the number of entries added.
+     The recursive calls are sequenced (operand order of '+' is unspecified). */
+  int added = 2;
+  if(isTip(p->number, numsp))
+    return 0;
+
+  branches[(*counter)++] = p->next;
+  branches[(*counter)++] = p->next->next;
+  added += markBranches(branches, p->next->back, counter, numsp);
+  added += markBranches(branches, p->next->next->back, counter, numsp);
+  return added;
+}
+
+
+
+void pllMakeRandomTree(pllInstance *tr)
+{  /* Builds a random tree over all tr->mxtips taxa by stepwise random insertion. */
+  nodeptr 
+    p, 
+    f, 
+    randomBranch,
+    *branches = (nodeptr *)rax_malloc(sizeof(nodeptr) * (2 * tr->mxtips));    
+  /* branches: scratch list of candidate insertion branches; perm: 1-based taxon order */
+  int 
+    nextsp, 
+    *perm = (int *)rax_malloc((tr->mxtips + 1) * sizeof(int)), 
+    branchCounter;                      
+  
+  makePermutation(perm, tr->mxtips, tr);              
+  /* buildNewTip hands out nodes from tr->nodep starting at index mxtips + 1 */
+  tr->ntips = 0;                      
+  tr->nextnode = tr->mxtips + 1;    
+  /* start from the three-taxon tree of the first three permuted taxa (sets ntips to 3) */
+  buildSimpleTreeRandom(tr, perm[1], perm[2], perm[3]);
+  /* add the remaining taxa one at a time */
+  while(tr->ntips < tr->mxtips) 
+    {                
+      nextsp = ++(tr->ntips);             
+      p = tr->nodep[perm[nextsp]];            
+      /* hook the taxon to a newly taken node (reached through p->back below) */
+      buildNewTip(tr, p);     
+      /* collect the branches of the current tree, starting next to some tip */
+      f = findAnyTip(tr->start, tr->mxtips);
+      f = f->back;
+      
+      branchCounter = 1;
+      branches[0] = f;
+      markBranches(branches, f, &branchCounter, tr->mxtips);
+      /* an unrooted binary tree with k tips has 2k - 3 branches (k = ntips - 1 here) */
+      assert(branchCounter == ((2 * (tr->ntips - 1)) - 3));
+      /* pick one collected branch via randomInt and insert the new taxon into it */
+      randomBranch = branches[randomInt(branchCounter, tr)];
+      
+      insertTaxon(p->back, randomBranch);
+    }
+  
+  rax_free(perm);            
+  rax_free(branches);
+}
+
diff --git a/pllrepo/src/recom.c b/pllrepo/src/recom.c
new file mode 100644
index 0000000..5ab20c7
--- /dev/null
+++ b/pllrepo/src/recom.c
@@ -0,0 +1,689 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file recom.c
+ * @brief Functions used for recomputation of vectors (only a fraction of LH vectors stored in RAM)   
+ */
+#include "mem_alloc.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <time.h>
+#include <math.h>
+#ifndef WIN32
+#include <sys/time.h>
+#endif
+#include "pll.h"
+#include "pllInternal.h"
+
+/** @brief Locks node \a nodenum to force it to remain available in memory
+ *
+ * @warning If a node is available we don't need to recompute it, but we need to make sure it is not unpinned while building the rest of the traversal descriptor, i.e. unpinnable must be PLL_FALSE at this point; it will automatically be set to PLL_TRUE after the counter post-order instructions have been executed.
+Omitting this call, the traversal will likely still work as long as num_allocated_nodes >> log n, but wrong inner vectors will be used at the wrong moment of pllNewviewIterative, careful!
+ *
+ *  @param rvec 
+ *    Recomputation info
+ *
+ *  @param nodenum
+ *    Node id that must remain available in memory 
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+void protectNode(recompVectors *rvec, int nodenum, int mxtips)
+{
+  /* iNode is indexed by the inner-node offset nodenum - mxtips - 1 */
+  const int slot = rvec->iNode[nodenum - mxtips - 1];
+
+  assert(slot != PLL_NODE_UNPINNED);
+  assert(rvec->iVector[slot] == nodenum);
+
+  if(rvec->unpinnable[slot])
+    rvec->unpinnable[slot] = PLL_FALSE;
+}
+
+/** @brief Checks if \a nodenum  is currently pinned (available in RAM)
+ *
+ *  @note shall we document static functions? 
+ * 
+ *  @param rvec 
+ *    Recomputation info
+ *
+ *  @param nodenum
+ *    Node id to be checked
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+static pllBoolean isNodePinned(recompVectors *rvec, int nodenum, int mxtips)
+{
+  int slot;
+
+  assert(nodenum > mxtips);
+  slot = rvec->iNode[nodenum - mxtips - 1];
+
+  return (slot == PLL_NODE_UNPINNED) ? PLL_FALSE : PLL_TRUE;
+}
+
+/** @brief Checks if the likelihood entries at node \a p should be updated
+ *
+ * A node needs update if one of the following holds:
+ *    1. It is not oriented (p->x == 0) 
+ *    2. We are applying recomputations and node \a p is not currently available in RAM
+ *  
+ *  @param recompute 
+ *    PLL_TRUE if recomputation is currently applied 
+ *
+ *  @param p
+ *    Node to check whether it is associated with the likelihood vector
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+pllBoolean needsRecomp(pllBoolean recompute, recompVectors *rvec, nodeptr p, int mxtips)
+{
+  if(!p->x)                                               /* not oriented */
+    return PLL_TRUE;
+  if(recompute && !isNodePinned(rvec, p->number, mxtips)) /* node is not pinned */
+    return PLL_TRUE;
+  return PLL_FALSE;
+}
+
+
+
+/** @brief Allocates memory for recomputation structure
+ *  
+ *  
+ *  @todo this should not depend on tr (\a vectorRecomFraction should be a parameter)
+ *    (only tr->mxtips and tr->vectorRecomFraction are read; the result is stored in tr->rvec)
+ *
+ */
+void allocRecompVectorsInfo(pllInstance *tr)
+{
+  /* Allocate the recomputation bookkeeping for tr and store it in tr->rvec.
+     The slot count is derived from tr->vectorRecomFraction and the number
+     of inner nodes (tr->mxtips - 2); all slots start unused. */
+  const int innerCount = tr->mxtips - 2;
+  recompVectors *info = (recompVectors *) rax_malloc(sizeof(recompVectors));
+  int vectorCount;
+  int minVectors;
+  int k;
+
+  assert(tr->vectorRecomFraction > PLL_MIN_RECOM_FRACTION);
+  assert(tr->vectorRecomFraction < PLL_MAX_RECOM_FRACTION);
+
+  /* number of slots: one plus the requested fraction of the inner nodes */
+  vectorCount = (int) (1 + tr->vectorRecomFraction * (float)innerCount);
+
+  /* required minimum: 3 + log2(mxtips), truncated to int */
+  minVectors = 3 + ((int)(log((double)tr->mxtips)/log(2.0)));
+  //printBothOpen("Try to use %d ancestral vectors, min required %d\n", vectorCount, minVectors);
+
+  assert(vectorCount >= minVectors);
+  assert(vectorCount < tr->mxtips);
+
+  info->numVectors     = vectorCount;
+  info->allSlotsBusy   = PLL_FALSE;
+  info->maxVectorsUsed = 0;
+
+  /* slot tracking: every slot starts unused and not unpinnable */
+  info->iVector    = (int *) rax_malloc((size_t)vectorCount * sizeof(int));
+  info->unpinnable = (pllBoolean *) rax_malloc((size_t)vectorCount * sizeof(pllBoolean));
+
+  for(k = 0; k < vectorCount; k++)
+    {
+      info->iVector[k]    = PLL_SLOT_UNUSED;
+      info->unpinnable[k] = PLL_FALSE;
+    }
+
+  /* node tracking: every inner node starts unpinned */
+  info->iNode = (int *) rax_malloc((size_t)innerCount * sizeof(int));
+  info->stlen = (int *) rax_malloc((size_t)innerCount * sizeof(int));
+
+  for(k = 0; k < innerCount; k++)
+    {
+      info->iNode[k] = PLL_NODE_UNPINNED;
+      info->stlen[k] = PLL_INNER_NODE_INIT_STLEN;
+    }
+
+  /* hand the finished structure to the tree instance */
+  tr->rvec = info;
+}
+
+/** @brief Find the slot id with the minimum cost to be recomputed.
+ *  
+ *  The minimum cost is defined as the minimum subtree size. In general, the closer a vector is to the tips, 
+ *  the less recomputations are required to re-establish its likelihood entries
+ *
+ *  @todo remove _DEBUG_RECOMPUTATION code
+ *  
+ *  @param v
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+static int findUnpinnableSlotByCost(recompVectors *v, int mxtips)
+{
+  const int upperBound = mxtips * 2;  /* more expensive than the most expensive */
+  int bestCost = upperBound;
+  int bestSlot = -1;
+  int node;
+#ifdef _DEBUG_RECOMPUTATION 
+  double straTime = gettime();
+#endif 
+
+  for(node = 0; node < mxtips - 2; node++)
+  {
+    const int slot = v->iNode[node];
+
+    if(slot == PLL_NODE_UNPINNED)
+      continue;                       /* node is not pinned to any slot */
+
+    assert(slot >= 0 && slot < v->numVectors);
+
+    if(!v->unpinnable[slot])
+      continue;                       /* slot must not be unpinned */
+
+    assert(v->stlen[node] > 0);
+
+    if(v->stlen[node] >= bestCost)
+      continue;
+    bestCost = v->stlen[node];
+    bestSlot = slot;
+
+    /* a cost of 2 cannot be beaten, so stop searching */
+    if(bestCost == 2)
+      break;
+  }
+
+  assert(bestCost < upperBound && bestCost >= 2);
+  assert(bestSlot >= 0);
+  return bestSlot;
+}
+
+static void unpinAtomicSlot(recompVectors *v, int slot, int mxtips)
+{
+  /* Mark the slot as unused and, if a node was stored in it, mark that
+     node as unpinned as well. */
+  const int occupant = v->iVector[slot];
+
+  if(occupant != PLL_SLOT_UNUSED)
+    v->iNode[occupant - mxtips - 1] = PLL_NODE_UNPINNED;
+  v->iVector[slot] = PLL_SLOT_UNUSED;
+}
+
+/** @brief Evicts the cheapest unpinnable slot
+ *
+ *  Selects the slot with the minimum recomputation cost, unpins it and
+ *  returns its id so that the caller can reuse it.
+ *
+ *  @param v
+ *    Recomputation info
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ *  @return
+ *    Id of the slot that was unpinned
+ */
+static int findUnpinnableSlot(recompVectors *v, int mxtips)
+{
+  int
+    victim;
+
+  victim = findUnpinnableSlotByCost(v, mxtips);
+
+  assert(victim >= 0);
+  assert(v->unpinnable[victim]);
+
+  unpinAtomicSlot(v, victim, mxtips);
+
+  return victim;
+}
+
+/** @brief Returns a slot that can hold a new vector
+ *
+ *  The slots are scanned in ascending order and the first unused one is
+ *  returned. If every slot is occupied, \a v->allSlotsBusy is set and the
+ *  cheapest unpinnable slot is unpinned and returned instead.
+ *
+ *  @param v
+ *    Recomputation info
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ *  @return
+ *    Id of the slot to use
+ */
+static int findFreeSlot(recompVectors *v, int mxtips)
+{
+  int
+    idx;
+
+  assert(v->allSlotsBusy == PLL_FALSE);
+
+  for(idx = 0; idx < v->numVectors; idx++)
+  {
+    if(v->iVector[idx] == PLL_SLOT_UNUSED)
+      return idx;
+  }
+
+  /* nothing is free: remember that and fall back to the replacement strategy */
+  v->allSlotsBusy = PLL_TRUE;
+
+  return findUnpinnableSlot(v, mxtips);
+}
+
+
+/** @brief Binds inner node \a nodenum to slot \a slot
+ *
+ *  Both lookup tables are updated (slot to node and node to slot) and the slot
+ *  is flagged as not unpinnable, which keeps it out of the pool of eviction
+ *  candidates so that the contents of the vector are not overwritten.
+ *
+ *  @param v
+ *    Recomputation info
+ *
+ *  @param nodenum
+ *    node id
+ *
+ *  @param slot
+ *    slot id
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+static void pinAtomicNode(recompVectors *v, int nodenum, int slot, int mxtips)
+{
+  const int
+    inner_index = nodenum - mxtips - 1;
+
+  v->unpinnable[slot]   = PLL_FALSE;
+  v->iNode[inner_index] = slot;
+  v->iVector[slot]      = nodenum;
+}
+
+/** @brief Pins node \a nodenum to a slot chosen by the replacement strategy
+ *
+ *  The node must not be pinned yet. While unused slots remain, one of them is
+ *  taken; once all slots are busy, the cheapest unpinnable slot is evicted.
+ *
+ *  @param rvec
+ *    Recomputation info
+ *
+ *  @param nodenum
+ *    node id
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ *  @return
+ *    Id of the slot that now holds \a nodenum
+ */
+static int pinNode(recompVectors *rvec, int nodenum, int mxtips)
+{
+  int
+    chosen;
+
+  assert(!isNodePinned(rvec, nodenum, mxtips));
+
+  chosen = rvec->allSlotsBusy ? findUnpinnableSlot(rvec, mxtips)
+                              : findFreeSlot(rvec, mxtips);
+
+  assert(chosen >= 0);
+
+  pinAtomicNode(rvec, nodenum, chosen, mxtips);
+
+  /* keep track of the largest slot id handed out so far */
+  if(rvec->maxVectorsUsed < chosen)
+    rvec->maxVectorsUsed = chosen;
+
+  assert(rvec->iNode[nodenum - mxtips - 1] == chosen);
+
+  return chosen;
+}
+
+/** @brief Flags the slot of node \a nodenum as unpinnable
+ *
+ *  The slot holding node \a nodenum joins the pool of slot candidates that may
+ *  be overwritten. Node ids up to \a mxtips have no entry in \a v->iNode and
+ *  are ignored.
+ *
+ *  @param v
+ *    Recomputation info
+ *
+ *  @param nodenum
+ *    node id
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ */
+void unpinNode(recompVectors *v, int nodenum, int mxtips)
+{
+  int
+    held_slot;
+
+  if(nodenum <= mxtips)
+    return;
+
+  held_slot = v->iNode[nodenum - mxtips - 1];
+  assert(held_slot >= 0 && held_slot < v->numVectors);
+
+  /* the range is tested again so that builds without assertions stay in bounds */
+  if(held_slot >= 0 && held_slot < v->numVectors)
+    v->unpinnable[held_slot] = PLL_TRUE;
+}
+
+
+/** @brief Get the pinned slot that holds the likelihood vector for inner node \a nodenum
+ *
+ *  If node \a nodenum is not pinned to any slot yet, it is pinned now using the
+ *  minimum cost replacement strategy. In either case the slot is flagged as
+ *  not unpinnable before returning.
+ *
+ *  @param rvec
+ *    Recomputation info
+ *
+ *  @param nodenum
+ *    node id
+ *
+ *  @param slot
+ *    Output: id of the slot holding \a nodenum
+ *
+ *  @param mxtips
+ *    Number of tips in the tree
+ *
+ *  @return
+ *    \b PLL_TRUE if the node had to be pinned by this call (the slot needs
+ *    recomputation), \b PLL_FALSE if it was pinned already
+ */
+pllBoolean getxVector(recompVectors *rvec, int nodenum, int *slot, int mxtips)
+{
+  pllBoolean
+    mustRecompute;
+
+  *slot = rvec->iNode[nodenum - mxtips - 1];
+
+  if(*slot != PLL_NODE_UNPINNED)
+    mustRecompute = PLL_FALSE;
+  else
+  {
+    /* not resident: run the replacement strategy */
+    *slot = pinNode(rvec, nodenum, mxtips);
+    mustRecompute = PLL_TRUE;
+  }
+
+  assert(*slot >= 0 && *slot < rvec->numVectors);
+
+  rvec->unpinnable[*slot] = PLL_FALSE;
+
+  return mustRecompute;
+}
+
+
+#ifdef _DEBUG_RECOMPUTATION
+
+/* Debug helper: number of tips in the subtree below node record p
+   (a tip counts as 1, an inner node as the sum of its two children). */
+static int subtreeSize(nodeptr p, int maxTips)
+{
+  int
+    below_first,
+    below_second;
+
+  if(isTip(p->number, maxTips))
+    return 1;
+
+  below_first  = subtreeSize(p->next->back, maxTips);
+  below_second = subtreeSize(p->next->next->back, maxTips);
+
+  return below_first + below_second;
+}
+
+#endif
+
+/** @brief Annotates unoriented tree nodes with their subtree size 
+ *  
+ *  This function recursively updates the subtree size (\a rvec->stlen) of inner node \a p.
+ *  Inner children are only descended into when their \a x flag is not set; otherwise the
+ *  value already stored for them is reused.
+ *  @note The value stored for node \a p->number is 2 if both children are tips, the value of
+ *  the inner child plus 1 if exactly one child is a tip, and the sum of both children's values
+ *  otherwise, i.e. the number of tips in the subtree where node record \a p is the virtual root. 
+ *
+ *  @param p
+ *    Pointer to node 
+ *    
+ *  @param maxTips
+ *    Number of tips in the tree
+ *
+ *  @param rvec 
+ *    Recomputation info
+ *    
+ *  @param count
+ *    Number of visited nodes; incremented once for every inner node this function processes
+ */
+void computeTraversalInfoStlen(nodeptr p, int maxTips, recompVectors *rvec, int *count) 
+{
+  if(isTip(p->number, maxTips))
+    return;
+  else
+  {          
+    nodeptr 
+      q = p->next->back,
+        r = p->next->next->back;
+
+    *count += 1;
+    /* set xnode info at this point */     
+
+    /* case 1: both children are tips */
+    if(isTip(r->number, maxTips) && isTip(q->number, maxTips))  
+    {
+      rvec->stlen[p->number - maxTips - 1] = 2;	
+
+#ifdef _DEBUG_RECOMPUTATION
+      assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+    }
+    else
+    {
+      /* case 2: exactly one child is a tip */
+      if(isTip(r->number, maxTips) || isTip(q->number, maxTips))
+      {	     
+        nodeptr 
+          tmp;
+
+        /* swap so that r always refers to the inner child */
+        if(isTip(r->number, maxTips))
+        {
+          tmp = r;
+          r = q;
+          q = tmp;
+        }
+
+        /* children with the x flag set are not revisited
+           (presumably x marks a vector already oriented towards this record -- TODO confirm) */
+        if(!r->x)
+          computeTraversalInfoStlen(r, maxTips, rvec, count);
+
+        /* the tip child contributes 1 */
+        rvec->stlen[p->number - maxTips - 1] = rvec->stlen[r->number - maxTips - 1] + 1;
+
+#ifdef _DEBUG_RECOMPUTATION	      
+        assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+      }
+      else
+      {		 
+        /* case 3: both children are inner nodes */
+        if(!r->x)
+          computeTraversalInfoStlen(r, maxTips, rvec, count);
+        if(!q->x)
+          computeTraversalInfoStlen(q, maxTips, rvec, count); 
+
+        rvec->stlen[p->number - maxTips - 1] = rvec->stlen[q->number - maxTips - 1] + rvec->stlen[r->number - maxTips - 1];	
+
+#ifdef _DEBUG_RECOMPUTATION
+        assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+      }
+    }
+  }
+}
+
+
+
+
+/* pre-compute the node stlens (this needs to be known prior to running the strategy) */
+/** @brief Annotates all tree nodes with their subtree size 
+ *  
+ *  Similar to \a computeTraversalInfoStlen, but does a full traversal ignoring orientation.
+ *  The subtree size stored for an inner node is 2 if both children are tips, the size of
+ *  the inner child plus 1 if exactly one child is a tip, and the sum of both children's
+ *  sizes otherwise. The minimum cost is defined as the minimum subtree size. In general,
+ *  the closer a vector is to the tips, the fewer recomputations are required to
+ *  re-establish its likelihood entries.
+ *
+ *  @param p
+ *    Pointer to node 
+ *    
+ *  @param maxTips
+ *    Number of tips in the tree
+ *
+ *  @param rvec 
+ *    Recomputation info
+ */
+void computeFullTraversalInfoStlen(nodeptr p, int maxTips, recompVectors *rvec) 
+{
+  nodeptr
+    first,
+    second;
+
+  int
+    firstIsTip,
+    secondIsTip;
+
+  if(isTip(p->number, maxTips))
+    return;
+
+  first  = p->next->back;
+  second = p->next->next->back;
+
+  secondIsTip = isTip(second->number, maxTips);
+  firstIsTip  = isTip(first->number, maxTips);
+
+  if(firstIsTip && secondIsTip)
+  {
+    rvec->stlen[p->number - maxTips - 1] = 2;
+  }
+  else if(firstIsTip || secondIsTip)
+  {
+    /* exactly one child is a tip: descend into the other one */
+    nodeptr
+      inner = firstIsTip ? second : first;
+
+    computeFullTraversalInfoStlen(inner, maxTips, rvec);
+
+    rvec->stlen[p->number - maxTips - 1] = rvec->stlen[inner->number - maxTips - 1] + 1;
+  }
+  else
+  {
+    computeFullTraversalInfoStlen(second, maxTips, rvec);
+    computeFullTraversalInfoStlen(first, maxTips, rvec);
+
+    rvec->stlen[p->number - maxTips - 1] = rvec->stlen[first->number - maxTips - 1] + rvec->stlen[second->number - maxTips - 1];
+  }
+
+#ifdef _DEBUG_RECOMPUTATION
+  assert(rvec->stlen[p->number - maxTips - 1] == subtreeSize(p, maxTips));
+#endif
+}
+
+
+#ifdef _DEBUG_RECOMPUTATION
+
+/** @brief Allocates and clears the traversal statistics of \a tr
+ *
+ *  Creates a \a traversalCounter with a length-frequency table of \a tr->mxtips
+ *  entries, sets every counter to zero and stores it in \a tr->travCounter.
+ *
+ *  @param tr
+ *    The library instance
+ */
+void allocTraversalCounter(pllInstance *tr)
+{
+  traversalCounter
+    *counter = (traversalCounter *)rax_malloc(sizeof(traversalCounter));
+
+  int
+    len;
+
+  counter->travlenFreq = (unsigned int *)rax_malloc(tr->mxtips * sizeof(int));
+
+  for(len = 0; len < tr->mxtips; len++)
+    counter->travlenFreq[len] = 0;
+
+  counter->numTraversals = 0;
+  counter->tt = 0;
+  counter->ti = 0;
+  counter->ii = 0;
+
+  tr->travCounter = counter;
+}
+
+/* recomp */
+/* code to track traversal descriptor stats */
+
+/** @brief Updates the traversal statistics with the current traversal descriptor
+ *
+ *  Increments the number of traversals, classifies the entries 1 .. count-1 of
+ *  \a tr->td[0].ti by their tip case (tip/tip, tip/inner, inner/inner) and
+ *  records the descriptor length in the length-frequency table.
+ *
+ *  @param tr
+ *    The library instance
+ */
+void countTraversal(pllInstance *tr)
+{
+  traversalCounter
+    *stats = tr->travCounter;
+
+  traversalInfo
+    *entries = tr->td[0].ti;
+
+  int
+    step;
+
+  stats->numTraversals += 1;
+
+  /* entry 0 is not classified */
+  for(step = 1; step < tr->td[0].count; step++)
+  {
+    traversalInfo
+      *cur = &entries[step];
+
+    if(cur->tipCase == PLL_TIP_TIP)
+      stats->tt++;
+    else if(cur->tipCase == PLL_TIP_INNER)
+      stats->ti++;
+    else if(cur->tipCase == PLL_INNER_INNER)
+      stats->ii++;
+    else
+      assert(0);
+  }
+
+  stats->travlenFreq[tr->td[0].count] += 1;
+}
+
+
+/*
+void printTraversalInfo(pllInstance *tr)
+{
+  int 
+    k, 
+    total_steps = 0;
+
+  printBothOpen("Traversals : %d \n", tr->travCounter->numTraversals);
+  printBothOpen("Traversals tt: %d \n", tr->travCounter->tt);
+  printBothOpen("Traversals ti: %d \n", tr->travCounter->ti);
+  printBothOpen("Traversals ii: %d \n", tr->travCounter->ii);
+  printBothOpen("all: %d \n", tr->travCounter->tt + tr->travCounter->ii + tr->travCounter->ti);
+  printBothOpen("Traversals len freq  : \n");
+  
+  for(k = 0; k < tr->mxtips; k++)
+  {
+    total_steps += tr->travCounter->travlenFreq[k] * (k - 1);
+    if(tr->travCounter->travlenFreq[k] > 0)
+      printBothOpen("len %d : %d\n", k, tr->travCounter->travlenFreq[k]);
+  }
+  printBothOpen("all steps: %d \n", total_steps);
+}
+*/
+/*end code to track traversal descriptor stats */
+/* E recomp */
+
+/*
+void printVector(double *vector, int len, char *name)
+{ 
+  int i;
+  printBothOpen("LHVECTOR %s :", name);
+  for(i=0; i < len; i++)
+  {
+    printBothOpen("%.2f ", vector[i]);
+    if(i>10)
+    {
+      printBothOpen("...");
+      break; 
+    }
+  } 
+  printBothOpen("\n");
+} 
+*/
+
+#endif
+
diff --git a/pllrepo/src/restartHashTable.c b/pllrepo/src/restartHashTable.c
new file mode 100644
index 0000000..007e247
--- /dev/null
+++ b/pllrepo/src/restartHashTable.c
@@ -0,0 +1,357 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file restartHashTable.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+/*
+static pllBoolean treeNeedString(const char *fp, char c1, int *position)
+{
+  char 
+    c2 = fp[(*position)++];
+  
+  if(c2 == c1)  
+    return PLL_TRUE;
+  else  
+    {   
+      int 
+	lower = PLL_MAX(0, *position - 20),
+	upper = *position + 20;
+      
+      printf("Tree Parsing ERROR: Expecting '%c', found: '%c'\n", c1, c2); 
+      printf("Context: \n");
+      
+      while(lower < upper && fp[lower])
+	printf("%c", fp[lower++]);
+      
+      printf("\n");
+
+      return PLL_FALSE;
+  }
+} 
+
+
+static pllBoolean treeLabelEndString (char ch)
+{
+  switch(ch) 
+    {   
+    case '\0':  
+    case '\t':  
+    case '\n':  
+    case '\r': 
+    case ' ':
+    case ':':  
+    case ',':   
+    case '(':   
+    case ')':  
+    case ';':
+      return PLL_TRUE;
+    default:
+      break;
+    }
+  
+  return PLL_FALSE;
+} 
+
+static pllBoolean  treeGetLabelString (const char *fp, char *lblPtr, int maxlen, int *position)
+{
+  char 
+    ch;
+  
+  pllBoolean  
+    done, 
+    lblfound;
+
+  if (--maxlen < 0) 
+    lblPtr = (char *)NULL; 
+  else 
+    if(lblPtr == NULL) 
+      maxlen = 0;
+
+  ch = fp[(*position)++];
+  
+  done = treeLabelEndString(ch);
+
+  lblfound = !done;  
+
+  while(!done) 
+    {      
+      if(treeLabelEndString(ch)) 
+	break;     
+
+      if(--maxlen >= 0) 
+	*lblPtr++ = ch;
+      
+      ch = fp[(*position)++];      
+    }
+  
+  (*position)--; 
+
+  if (lblPtr != NULL) 
+    *lblPtr = '\0';
+
+  return lblfound;
+}
+
+static pllBoolean  treeFlushLabelString(const char *fp, int *position)
+{ 
+  return  treeGetLabelString(fp, (char *) NULL, (int) 0, position);
+} 
+
+static pllBoolean treeProcessLengthString (const char *fp, double *dptr, int *position)
+{ 
+  (*position)++;
+  
+  if(sscanf(&fp[*position], "%lf", dptr) != 1) 
+    {
+      printf("ERROR: treeProcessLength: Problem reading branch length\n");     
+      assert(0);
+    }
+
+  while(fp[*position] != ',' && fp[*position] != ')' && fp[*position] != ';')
+    *position = *position + 1;
+  
+  return  PLL_TRUE;
+}
+
+static int treeFlushLenString (const char *fp, int *position)
+{
+  double  
+    dummy;  
+  
+  char     
+    ch;
+
+  ch = fp[(*position)++];
+ 
+  if(ch == ':') 
+    {     
+      if(!treeProcessLengthString(fp, &dummy, position)) 
+	return 0;
+      return 1;	  
+    }
+    
+  (*position)--;
+
+  return 1;
+} 
+
+static int treeFindTipByLabelString(char  *str, pllInstance *tr)                    
+{
+  int lookup = lookupWord(str, tr->nameHash);
+
+  if(lookup > 0)
+    {
+      assert(! tr->nodep[lookup]->back);
+      return lookup;
+    }
+  else
+    { 
+      printf("ERROR: Cannot find tree species: %s\n", str);
+      return  0;
+    }
+}
+
+static int treeFindTipNameString (const char *fp, pllInstance *tr, int *position)
+{
+  char    str[PLL_NMLNGTH + 2];
+  int      n;
+
+  if (treeGetLabelString (fp, str, PLL_NMLNGTH + 2, position))
+    n = treeFindTipByLabelString(str, tr);
+  else
+    n = 0;
+   
+  return  n;
+} 
+
+static pllBoolean addElementLenString(const char *fp, pllInstance *tr, nodeptr p, int *position)
+{
+  nodeptr  
+    q;
+  
+  int      
+    n, 
+    fres;
+
+  char 
+    ch;
+  
+  if ((ch = fp[(*position)++]) == '(') 
+    { 
+      n = (tr->nextnode)++;
+      if (n > 2*(tr->mxtips) - 2) 
+	{
+	  if (tr->rooted || n > 2*(tr->mxtips) - 1) 
+	    {
+	      printf("ERROR: Too many internal nodes.  Is tree rooted?\n");
+	      printf("       Deepest splitting should be a trifurcation.\n");
+	      return PLL_FALSE;
+	    }
+	  else 
+	    {	   
+	      tr->rooted = PLL_TRUE;
+	    }
+	}
+      
+      q = tr->nodep[n];
+
+      if (!addElementLenString(fp, tr, q->next, position))        
+	return PLL_FALSE;
+      if (!treeNeedString(fp, ',', position))             
+	return PLL_FALSE;
+      if (!addElementLenString(fp, tr, q->next->next, position))  
+	return PLL_FALSE;
+      if (!treeNeedString(fp, ')', position))             
+	return PLL_FALSE;
+      
+     
+      treeFlushLabelString(fp, position);
+    }
+  else 
+    {   
+      (*position)--;
+     
+      if ((n = treeFindTipNameString(fp, tr, position)) <= 0)          
+	return PLL_FALSE;
+      q = tr->nodep[n];
+      
+      if (tr->start->number > n)  
+	tr->start = q;
+      (tr->ntips)++;
+    }
+  
+     
+  fres = treeFlushLenString(fp, position);
+  if(!fres) 
+    return PLL_FALSE;
+  
+  hookupDefault(p, q);
+
+  return PLL_TRUE;          
+}
+
+
+
+void treeReadTopologyString(char *treeString, pllInstance *tr)
+{ 
+  char 
+    *fp = treeString;
+
+  nodeptr  
+    p;
+  
+  int
+    position = 0, 
+    i;
+  
+  char 
+    ch;   
+    
+
+  for(i = 1; i <= tr->mxtips; i++)    
+    tr->nodep[i]->back = (node *)NULL;      
+  
+  for(i = tr->mxtips + 1; i < 2 * tr->mxtips; i++)
+    {
+      tr->nodep[i]->back = (nodeptr)NULL;
+      tr->nodep[i]->next->back = (nodeptr)NULL;
+      tr->nodep[i]->next->next->back = (nodeptr)NULL;
+      tr->nodep[i]->number = i;
+      tr->nodep[i]->next->number = i;
+      tr->nodep[i]->next->next->number = i;           
+    }
+      
+  tr->start       = tr->nodep[1];
+  tr->ntips       = 0;
+  tr->nextnode    = tr->mxtips + 1;    
+  tr->rooted      = PLL_FALSE;      
+  
+  p = tr->nodep[(tr->nextnode)++]; 
+   
+  assert(fp[position++] == '(');  
+    
+  if (! addElementLenString(fp, tr, p, &position))                 
+    assert(0);
+  
+  if (! treeNeedString(fp, ',', &position))                
+    assert(0);
+   
+  if (! addElementLenString(fp, tr, p->next, &position))           
+    assert(0);
+
+  if(!tr->rooted) 
+    {
+      if ((ch = fp[position++]) == ',') 
+	{ 
+	  if (! addElementLenString(fp, tr, p->next->next, &position)) 
+	    assert(0);	 
+	}
+      else 
+	assert(0);     
+    }
+  else
+    assert(0);
+        
+  if (! treeNeedString(fp, ')', &position))                
+    assert(0);
+
+  treeFlushLabelString(fp, &position);
+  
+  if (!treeFlushLenString(fp, &position))                         
+    assert(0);
+  
+  if (!treeNeedString(fp, ';', &position))       
+    assert(0);
+    
+  if(tr->rooted)     
+    assert(0);           
+  else           
+    tr->start = tr->nodep[1];   
+
+  printf("Tree parsed\n");
+
+} 
+*/
diff --git a/pllrepo/src/sched.h b/pllrepo/src/sched.h
new file mode 100644
index 0000000..f36a97a
--- /dev/null
+++ b/pllrepo/src/sched.h
@@ -0,0 +1,183 @@
+/*
+ * Module: sched.h
+ *
+ * Purpose:
+ *      Provides an implementation of POSIX realtime extensions
+ *      as defined in 
+ *
+ *              POSIX 1003.1b-1993      (POSIX.1b)
+ *
+ * --------------------------------------------------------------------------
+ *
+ *      Pthreads-win32 - POSIX Threads Library for Win32
+ *      Copyright(C) 1998 John E. Bossom
+ *      Copyright(C) 1999,2005 Pthreads-win32 contributors
+ * 
+ *      Contact Email: rpj at callisto.canberra.edu.au
+ * 
+ *      The current list of contributors is contained
+ *      in the file CONTRIBUTORS included with the source
+ *      code distribution. The list can also be seen at the
+ *      following World Wide Web location:
+ *      http://sources.redhat.com/pthreads-win32/contributors.html
+ * 
+ *      This library is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU Lesser General Public
+ *      License as published by the Free Software Foundation; either
+ *      version 2 of the License, or (at your option) any later version.
+ * 
+ *      This library is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *      Lesser General Public License for more details.
+ * 
+ *      You should have received a copy of the GNU Lesser General Public
+ *      License along with this library in the file COPYING.LIB;
+ *      if not, write to the Free Software Foundation, Inc.,
+ *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+#if !defined(_SCHED_H)
+#define _SCHED_H
+
+/*
+ * Select how much of the scheduling API this header exposes.
+ * PTW32_SCHED_LEVEL is derived from the feature-test macros below; each
+ * later match overrides the earlier ones. The level is always undefined
+ * before it is redefined, so that no incompatible macro redefinition occurs
+ * when several of the conditions hold at the same time.
+ */
+#undef PTW32_SCHED_LEVEL
+
+#if defined(_POSIX_SOURCE)
+#define PTW32_SCHED_LEVEL 0
+/* Early POSIX */
+#endif
+
+#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309
+#undef PTW32_SCHED_LEVEL
+#define PTW32_SCHED_LEVEL 1
+/* Include 1b, 1c and 1d */
+#endif
+
+#if defined(INCLUDE_NP)
+#undef PTW32_SCHED_LEVEL
+#define PTW32_SCHED_LEVEL 2
+/* Include Non-Portable extensions */
+#endif
+
+#define PTW32_SCHED_LEVEL_MAX 3
+
+#if ( defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112 )  || !defined(PTW32_SCHED_LEVEL)
+/* a lower level may already be set at this point; drop it first */
+#undef PTW32_SCHED_LEVEL
+#define PTW32_SCHED_LEVEL PTW32_SCHED_LEVEL_MAX
+/* Include everything */
+#endif
+
+
+#if defined(__GNUC__) && !defined(__declspec)
+# error Please upgrade your GNU compiler to one that supports __declspec.
+#endif
+
+/*
+ * When building the library, you should define PTW32_BUILD so that
+ * the variables/functions are exported correctly. When using the library,
+ * do NOT define PTW32_BUILD, and then the variables/functions will
+ * be imported correctly.
+ */
+#if !defined(PTW32_STATIC_LIB)
+#  if defined(PTW32_BUILD)
+#    define PTW32_DLLPORT __declspec (dllexport)
+#  else
+#    define PTW32_DLLPORT __declspec (dllimport)
+#  endif
+#else
+#  define PTW32_DLLPORT
+#endif
+
+/*
+ * This is a duplicate of what is in the autoconf config.h,
+ * which is only used when building the pthread-win32 libraries.
+ */
+
+#if !defined(PTW32_CONFIG_H)
+#  if defined(WINCE)
+#    define NEED_ERRNO
+#    define NEED_SEM
+#  endif
+#  if defined(__MINGW64__)
+#    define HAVE_STRUCT_TIMESPEC
+#    define HAVE_MODE_T
+#  elif defined(_UWIN) || defined(__MINGW32__)
+#    define HAVE_MODE_T
+#  endif
+#endif
+
+/*
+ *
+ */
+
+#if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX
+#if defined(NEED_ERRNO)
+#include "need_errno.h"
+#else
+#include <errno.h>
+#endif
+#endif /* PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX */
+
+#if (defined(__MINGW64__) || defined(__MINGW32__)) || defined(_UWIN)
+# if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX
+/* For pid_t */
+#  include <sys/types.h>
+/* Required by Unix 98 */
+#  include <time.h>
+# else
+   typedef int pid_t;
+# endif
+#else
+ typedef int pid_t;
+#endif
+
+/* Thread scheduling policies */
+
+/*
+ * The policies are numbered consecutively starting at 0. SCHED_MIN and
+ * SCHED_MAX are aliases for the lowest and the highest policy value and
+ * can be used for range checks.
+ */
+enum {
+  SCHED_OTHER = 0,
+  SCHED_FIFO,
+  SCHED_RR,
+  SCHED_MIN   = SCHED_OTHER,
+  SCHED_MAX   = SCHED_RR
+};
+
+/* Scheduling parameters; this implementation only carries a priority value. */
+struct sched_param {
+  int sched_priority;
+};
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif                          /* __cplusplus */
+
+/*
+ * Scheduling entry points. All of them use the __cdecl calling convention
+ * and are exported or imported through PTW32_DLLPORT.
+ */
+PTW32_DLLPORT int __cdecl sched_yield (void);
+
+PTW32_DLLPORT int __cdecl sched_get_priority_min (int policy);
+
+PTW32_DLLPORT int __cdecl sched_get_priority_max (int policy);
+
+/* NOTE(review): takes no sched_param argument, unlike the POSIX prototype -- confirm against callers */
+PTW32_DLLPORT int __cdecl sched_setscheduler (pid_t pid, int policy);
+
+PTW32_DLLPORT int __cdecl sched_getscheduler (pid_t pid);
+
+/*
+ * Note that this macro returns ENOTSUP rather than
+ * ENOSYS as might be expected. However, returning ENOSYS
+ * should mean that sched_get_priority_{min,max} are
+ * not implemented as well as sched_rr_get_interval.
+ * This is not the case, since we just don't support
+ * round-robin scheduling. Therefore I have chosen to
+ * return the same value as sched_setscheduler when
+ * SCHED_RR is passed to it.
+ *
+ * Both macro arguments are ignored: the expansion sets errno
+ * to ENOTSUP and evaluates to -1.
+ */
+#define sched_rr_get_interval(_pid, _interval) \
+  ( errno = ENOTSUP, (int) -1 )
+
+
+#if defined(__cplusplus)
+}                               /* End of extern "C" */
+#endif                          /* __cplusplus */
+
+#undef PTW32_SCHED_LEVEL
+#undef PTW32_SCHED_LEVEL_MAX
+
+#endif                          /* !_SCHED_H */
+
diff --git a/pllrepo/src/searchAlgo.c b/pllrepo/src/searchAlgo.c
new file mode 100644
index 0000000..c638d48
--- /dev/null
+++ b/pllrepo/src/searchAlgo.c
@@ -0,0 +1,3310 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file searchAlgo.c
+ * @brief Collection of routines for performing likelihood computation and branch optimization.
+ *
+ * Detailed description to appear soon.
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+/* Pairs a tree node with a likelihood value. */
+typedef struct bInf {
+  double likelihood;  /* set to PLL_UNLIKELY when the owning list is reset */
+  nodeptr node;       /* set to NULL when the owning list is reset */
+} bestInfo;
+
+/* A list of bestInfo entries together with its bookkeeping counters. */
+typedef struct iL {
+  bestInfo *list;     /* array of n entries */
+  int n;              /* number of entries in list */
+  int valid;          /* set to 0 on reset; presumably the number of entries currently in use -- TODO confirm */
+} infoList;
+
+double treeOptimizeRapid(pllInstance *tr, partitionList *pr, int mintrav, int maxtrav, bestlist *bt, infoList *iList);
+nniMove getBestNNIForBran(pllInstance* tr, partitionList *pr, nodeptr p, double curLH);
+void evalNNIForSubtree(pllInstance* tr, partitionList *pr, nodeptr p, nniMove* nniList, int* cnt, int* cnt_nni, double curLH);
+
+
+static int cmp_nni(const void* nni1, const void* nni2);
+static void pllTraverseUpdate (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav, pllRearrangeList * bestList);
+static int pllStoreRearrangement (pllRearrangeList * bestList, pllRearrangeInfo * rearr);
+static int pllTestInsertBIG (pllInstance * tr, partitionList * pr, nodeptr p, nodeptr q, pllRearrangeList * bestList);
+static int pllTestSPR (pllInstance * tr, partitionList * pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList);
+static void pllCreateSprInfoRollback (pllInstance * tr, pllRearrangeInfo * rearr, int numBranches);
+static void pllCreateNniInfoRollback (pllInstance * tr, pllRearrangeInfo * rearr);
+static void pllCreateRollbackInfo (pllInstance * tr, pllRearrangeInfo * rearr, int numBranches);
+static void pllRollbackNNI (pllInstance * tr, partitionList * pr, pllRollbackInfo * ri);
+static void pllRollbackSPR (partitionList * pr, pllRollbackInfo * ri);
+
+extern partitionLengths pLengths[PLL_MAX_MODEL];
+
+/** @brief Recursively update the partials of the subtree behind \a p
+
+    Visits all nodes reachable through the other records of the ring of \a p
+    and calls pllUpdatePartials on every inner node after its descendants
+    have been processed. Tips are left untouched.
+
+    @param tr
+      The library instance
+
+    @param pr
+      Partition list
+
+    @param p
+      Node record at which the traversal starts
+
+    @return
+      \b PLL_FALSE as soon as a recursive call returns \b PLL_FALSE,
+      otherwise \b PLL_TRUE
+*/
+pllBoolean initrav (pllInstance *tr, partitionList *pr, nodeptr p)
+{
+  nodeptr  child;
+
+  if (isTip(p->number, tr->mxtips))
+    return PLL_TRUE;
+
+  child = p->next;
+
+  do
+  {
+    pllBoolean ok = initrav(tr, pr, child->back);
+
+    if (!ok)
+      return PLL_FALSE;
+
+    child = child->next;
+  }
+  while (child != p);
+
+  pllUpdatePartials(tr, pr, p, PLL_FALSE);
+
+  return PLL_TRUE;
+} 
+
+
+/** @brief Optimize the length of a specific branch
+
+    Optimize the length of the branch connecting \a p and \a p->back.
+    One length is optimized per partition if per-gene branch lengths are
+    enabled in \a pr, otherwise a single joint length is optimized. A new
+    length is only stored for partitions that are not flagged as converged;
+    if it differs from the old one by more than \b PLL_DELTAZ the partition
+    is flagged as not smoothed.
+
+    @param tr
+      The library instance
+
+    @param pr
+      Partition list
+
+    @param p
+      Endpoints of branch to be optimized 
+
+    @note NOTE(review): the local buffers hold \b PLL_NUM_BRANCHES entries;
+      this presumably bounds the number of partitions -- confirm
+*/
+void update(pllInstance *tr, partitionList *pr, nodeptr p)
+{       
+  nodeptr  other;
+  int b;
+  double   newz[PLL_NUM_BRANCHES], oldz[PLL_NUM_BRANCHES];
+  int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  #ifdef _DEBUG_UPDATE
+    double 
+      startLH;
+  
+    pllEvaluateLikelihood (tr, p);
+  
+    startLH = tr->likelihood;
+  #endif
+
+  other = p->back;
+
+  /* remember the current lengths, they are the starting point of the optimization */
+  for(b = 0; b < numBranches; b++)
+    oldz[b] = other->z[b];
+
+  makenewzGeneric(tr, pr, p, other, oldz, PLL_NEWZPERCYCLE, newz, (numBranches > 1) ? PLL_TRUE : PLL_FALSE);
+
+  for(b = 0; b < numBranches; b++)
+  {
+    /* converged partitions keep their branch length */
+    if(tr->partitionConverged[b])
+      continue;
+
+    if(PLL_ABS(newz[b] - oldz[b]) > PLL_DELTAZ)
+      tr->partitionSmoothed[b] = PLL_FALSE;
+
+    p->z[b] = other->z[b] = newz[b];
+  }
+ 
+  #ifdef _DEBUG_UPDATE
+    pllEvaluateLikelihood (tr, p);
+  
+    if(tr->likelihood <= startLH)
+      {
+        if(fabs(tr->likelihood - startLH) > 0.01)
+  	{
+  	  printf("%f %f\n", startLH, tr->likelihood);
+  	  assert(0);      
+  	}
+      }
+  #endif
+}
+
+/** @brief Branch length optimization of subtree
+
+    Optimize the length of the branch connecting \a p and \a p->back, then
+    recursively the lengths of all branches in the subtrees rooted at
+    \a p->next and \a p->next->next. Afterwards the partials at \a p are
+    updated.
+
+    @param tr
+      The library instance
+
+    @param pr
+      Partition list
+
+    @param p
+      Endpoint of branches to be optimized
+*/
+void smooth (pllInstance *tr, partitionList *pr, nodeptr p)
+{
+  nodeptr  child;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  /* adjust the branch at p first */
+  update(tr, pr, p);
+
+  if (isTip(p->number, tr->mxtips))
+    return;
+
+  /* then all descendants */
+  for (child = p->next; child != p; child = child->next)
+    smooth(tr, pr, child->back);
+
+  pllUpdatePartials(tr, pr, p, (numBranches > 1 && !tr->useRecom) ? PLL_TRUE : PLL_FALSE);
+} 
+
+/**  @brief Check whether the branches in all partitions have been optimized
+ 
+     Inspects the smoothed flag of the first \a numBranches partitions. Every
+     partition that is flagged as smoothed is additionally marked as converged.
+
+     @param tr
+       The library instance 
+
+     @param numBranches
+       Number of partition entries to inspect
+
+     @return
+       \b PLL_FALSE if at least one inspected partition is not smoothed,
+       otherwise \b PLL_TRUE.
+             
+*/
+static pllBoolean allSmoothed(pllInstance *tr, int numBranches)
+{
+  pllBoolean everythingSmooth = PLL_TRUE;
+  int part;
+
+  for(part = 0; part < numBranches; part++)
+  {
+    if(tr->partitionSmoothed[part] != PLL_FALSE)
+    {
+      tr->partitionConverged[part] = PLL_TRUE;
+      continue;
+    }
+
+    everythingSmooth = PLL_FALSE;
+  }
+
+  return everythingSmooth;
+}
+
+
+/** @brief Optimize all branch lengths of a tree
+  
+    Perform up to \a maxtimes rounds of branch length optimization by running
+    smooth() on all neighbour nodes of node \a tr->start. The loop stops early
+    once a round leaves every partition smoothed. The converged flags are
+    cleared before the first round and again before returning.
+
+    @param tr
+      The library instance
+
+    @param pr
+      Partition list
+
+    @param maxtimes
+      Maximum number of optimization rounds to perform
+*/
+void smoothTree (pllInstance *tr, partitionList *pr, int maxtimes)
+{
+  nodeptr  p, q;
+  int i;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  p = tr->start;
+  for(i = 0; i < numBranches; i++)
+    tr->partitionConverged[i] = PLL_FALSE;
+
+  while (--maxtimes >= 0)
+  {
+    /* assume everything is smooth; update() clears the flag where a length still moved */
+    for(i = 0; i < numBranches; i++)
+      tr->partitionSmoothed[i] = PLL_TRUE;
+
+    smooth(tr, pr, p->back);
+    if (!isTip(p->number, tr->mxtips))
+    {
+      q = p->next;
+      while (q != p)
+      {
+        smooth(tr, pr, q->back);
+        q = q->next;
+      }
+    }
+
+    if (allSmoothed(tr, numBranches)) break;
+  }
+
+  for(i = 0; i < numBranches; i++)
+    tr->partitionConverged[i] = PLL_FALSE;
+} 
+
+
+/** @brief Optimize the branch length of edges around a specific node
+    
+    Perform up to \a maxtimes rounds in which the branch of every node record
+    in the ring of inner node \a p is optimized with update(). The loop stops
+    early once a round leaves every partition smoothed. Nothing is done if
+    \a p is a tip. On return the smoothed and converged flags of all
+    \b PLL_NUM_BRANCHES entries are cleared.
+
+    @param tr
+      The library instance
+
+    @param pr
+      Partition list
+
+    @param p
+      The node around which to optimize the edges
+
+    @param maxtimes
+      Maximum number of optimization rounds to perform
+*/
+void localSmooth (pllInstance *tr, partitionList *pr, nodeptr p, int maxtimes)
+{ 
+  nodeptr  branch;
+  int b, round;
+  int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  if (isTip(p->number, tr->mxtips))
+    return;
+
+  for(b = 0; b < PLL_NUM_BRANCHES; b++)
+    tr->partitionConverged[b] = PLL_FALSE;
+
+  for(round = 0; round < maxtimes; round++)
+  {
+    for(b = 0; b < PLL_NUM_BRANCHES; b++)
+      tr->partitionSmoothed[b] = PLL_TRUE;
+
+    /* walk once around the ring, starting with p itself */
+    branch = p;
+    do
+    {
+      update(tr, pr, branch);
+      branch = branch->next;
+    }
+    while (branch != p);
+
+    if (allSmoothed(tr, numBranches))
+      break;
+  }
+
+  for(b = 0; b < PLL_NUM_BRANCHES; b++)
+  {
+    tr->partitionConverged[b] = PLL_FALSE;
+    tr->partitionSmoothed[b] = PLL_FALSE;
+  }
+}
+
+
+
+
+/** @brief Reset an \a infoList
+
+    Marks the list as holding no valid records and overwrites each of its
+    \a n slots with a \b NULL node and a likelihood of \b PLL_UNLIKELY.
+    The storage itself is kept.
+
+    @param iList
+      The \a infoList to reset.
+*/
+static void resetInfoList(infoList *iList)
+{
+  int slot;
+
+  for (slot = iList->n - 1; slot >= 0; slot--)
+  {
+    iList->list[slot].likelihood = PLL_UNLIKELY;
+    iList->list[slot].node = (nodeptr)NULL;
+  }
+
+  iList->valid = 0;
+}
+
+/** @brief Initialize an \a infoList
+
+    Allocates storage for \a n \a bestInfo records, stores \a n in the list
+    and puts the list into the same state resetInfoList() produces: no valid
+    records, every slot holding a \b NULL node and \b PLL_UNLIKELY.
+
+    @param iList
+      The \a infoList to initialize.
+
+    @param n
+      Number of \a bestInfo slots to allocate.
+*/
+static void initInfoList(infoList *iList, int n)
+{
+  iList->list = (bestInfo *)rax_malloc((size_t)n * sizeof(bestInfo));
+  iList->n = n;
+
+  /* clears all n slots and sets valid to 0 */
+  resetInfoList(iList);
+}
+
+/** @brief Deallocate the contents of an \a infoList
+
+    Frees the \a bestInfo array owned by \a iList. The \a infoList structure
+    itself is not freed. The \a list pointer is set to \b NULL afterwards so
+    that a later accidental use fails at a well-defined address instead of
+    touching freed memory.
+
+    @param iList
+      The \a infoList whose storage is released.
+*/
+static void freeInfoList(infoList *iList)
+{
+  rax_free(iList->list);
+  iList->list = (bestInfo *)NULL;
+}
+
+
+/** @brief Insert a record in an \a infoList
+
+    Looks up the slot of \a iList with the smallest likelihood. If the given
+    \a likelihood is strictly greater than that minimum, the slot is
+    overwritten with the pair (\a node, \a likelihood) and \a iList->valid is
+    incremented, capped at \a iList->n. Otherwise the list is left untouched.
+
+    A list without slots (\a n <= 0 or no storage) is ignored instead of
+    reading \a list[0] out of bounds.
+
+    @param node
+      The given node
+
+    @param likelihood
+      The given likelihood
+
+    @param iList
+      The \a infoList in which the record may replace the current worst entry.
+*/
+static void insertInfoList(nodeptr node, double likelihood, infoList *iList)
+{
+  int
+    i,
+    min = 0;
+
+  double
+    min_l;
+
+  /* nothing to compare against: list[0] does not exist */
+  if(iList->n <= 0 || iList->list == NULL)
+    return;
+
+  min_l = iList->list[0].likelihood;
+
+  for(i = 1; i < iList->n; i++)
+  {
+    if(iList->list[i].likelihood < min_l)
+    {
+      min = i;
+      min_l = iList->list[i].likelihood;
+    }
+  }
+
+  if(likelihood > min_l)
+  {
+    iList->list[min].likelihood = likelihood;
+    iList->list[min].node = node;
+    if(iList->valid < iList->n)
+      iList->valid += 1;
+  }
+}
+
+
+/** @brief  Optimize branch lengths of a region
+
+    Optimize the branch lengths of only a specific region. update() is called
+    on \a p; then, while \a region is positive and \a p is an inner node, the
+    function recurses into the \a back node of every other member of the ring
+    of \a p with the radius reduced by one, and finally recomputes the partials
+    at \a p.
+
+    Every subtree below \a p receives the same remaining radius
+    (\a region - 1). The earlier code pre-decremented \a region inside the
+    sibling loop, so the second subtree was explored one edge less deep than
+    the first, contradicting the documented radius.
+
+    @param tr
+      The library instance.
+
+    @param pr
+      List of partitions.
+
+    @param p
+      Node to start branch optimization from.
+
+    @param region
+      The allowed node distance from \a p for which to still perform branch optimization.
+*/
+void smoothRegion (pllInstance *tr, partitionList *pr, nodeptr p, int region)
+{
+  nodeptr  q;
+
+  update(tr, pr, p);   /* Adjust branch */
+
+  if (region <= 0 || isTip(p->number, tr->mxtips))
+    return;
+
+  for (q = p->next; q != p; q = q->next)
+    smoothRegion(tr, pr, q->back, region - 1);
+
+  pllUpdatePartials(tr, pr, p, PLL_FALSE);
+}
+
+/** @brief Wrapper function for optimizing the branch lengths of a region \a maxtimes times
+
+    Runs up to \a maxtimes rounds in which smoothRegion() is invoked with radius
+    \a region on \a p and on every other node of its ring. Stops early once
+    allSmoothed() reports that no slot changed. Does nothing when \a p is a tip.
+
+    On return all \b PLL_NUM_BRANCHES entries of \a tr->partitionSmoothed and
+    \a tr->partitionConverged are \b PLL_FALSE.
+
+    @param tr
+      The library instance.
+
+    @param pr
+      List of partitions.
+
+    @param p
+      Node to start branch optimization from.
+
+    @param maxtimes
+      Maximum number of times to perform branch optimization.
+
+    @param region
+      The allowed node distance from \a p for which to still perform branch optimization.
+
+    @todo
+      In the previous version (before the model-sep merge) the loops were controlled by tr->numBranches,
+      and now they are controlled by a constant PLL_NUM_BRANCHES. What is right?
+*/
+void regionalSmooth (pllInstance *tr, partitionList *pr, nodeptr p, int maxtimes, int region)
+{
+  const int activeSlots = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+  nodeptr ringNode;
+  int slot, round;
+
+  if (isTip(p->number, tr->mxtips))
+    return;                                   /* Should be an error */
+
+  for (slot = 0; slot < PLL_NUM_BRANCHES; slot++)
+    tr->partitionConverged[slot] = PLL_FALSE;
+
+  for (round = 0; round < maxtimes; round++)
+  {
+    for (slot = 0; slot < PLL_NUM_BRANCHES; slot++)
+      tr->partitionSmoothed[slot] = PLL_TRUE;
+
+    /* p itself first, then the remaining members of its ring */
+    smoothRegion(tr, pr, p, region);
+    for (ringNode = p->next; ringNode != p; ringNode = ringNode->next)
+      smoothRegion(tr, pr, ringNode, region);
+
+    if (allSmoothed(tr, activeSlots))
+      break;
+  }
+
+  for (slot = 0; slot < PLL_NUM_BRANCHES; slot++)
+  {
+    tr->partitionSmoothed[slot] = PLL_FALSE;
+    tr->partitionConverged[slot] = PLL_FALSE;
+  }
+}
+
+
+
+
+/** @brief Split the tree into two components and optimize new branch length
+
+   Split the tree into two components. The disconnection point is node \a p.
+   First, a branch length is computed for the newly created branch between nodes
+   \a p->next->back and \a p->next->next->back (starting from the product of
+   their current branch lengths) and then the two nodes are connected (hookup).
+   The optimized lengths are also stored in \a tr->zqr. Disconnection is done by
+   setting \a p->next->next->back and \a p->next->back to \b NULL.
+
+   @param tr
+     The library instance
+
+   @param pr
+     List of partitions
+
+   @param p
+     The node at which the tree should be decomposed into two components.
+
+   @param numBranches
+     Number of branches per partition
+
+   @return
+     The node \a p->next->back as it was before the split, or \b NULL if the
+     temporary work buffer could not be allocated; in that case the tree is
+     left untouched.
+
+   @todo
+     Why do we return this node?
+
+   @image html removeBIG.png "The diagram shows in blue color the new edge that is created and in red the edges that are removed"
+*/
+nodeptr  removeNodeBIG (pllInstance *tr, partitionList *pr, nodeptr p, int numBranches)
+{
+  /* a single buffer holds both work arrays: [0,n) start values, [n,2n) results */
+  const size_t n = (size_t)numBranches;
+  double *work = (double *)rax_malloc(2 * n * sizeof(double));
+  double *zqr, *result;
+  nodeptr  q, r;
+  int i;
+
+  /* callers already treat a NULL return as failure */
+  if (work == NULL)
+    return (nodeptr)NULL;
+
+  zqr    = work;
+  result = work + n;
+
+  q = p->next->back;
+  r = p->next->next->back;
+
+  for(i = 0; i < numBranches; i++)
+    zqr[i] = q->z[i] * r->z[i];
+
+  makenewzGeneric(tr, pr, q, r, zqr, PLL_ITERATIONS, result, PLL_FALSE);
+
+  for(i = 0; i < numBranches; i++)
+    tr->zqr[i] = result[i];
+
+  hookup(q, r, result, numBranches);
+
+  p->next->next->back = p->next->back = (node *) NULL;
+
+  rax_free(work);
+  return  q;
+}
+
+/** @brief Split the tree into two components and recompute likelihood vectors
+
+    Split the tree into two components. The disconnection point is node \a p.
+    The partials of \a p->next->back and \a p->next->next->back are updated,
+    the two nodes are connected with the branch lengths stored in
+    \a tr->currentZQR, and \a p is detached by setting \a p->next->back and
+    \a p->next->next->back to \b NULL.
+
+    @param tr
+      The library instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      The node at which the tree should be decomposed into two components.
+
+    @return
+      The node \a p->next->back as it was before the split
+
+    @todo
+      Why do we return this node? Why do we set to tr->currentZQR and not compute
+      new optimized length? What is tr->currentZQR?
+*/
+nodeptr  removeNodeRestoreBIG (pllInstance *tr, partitionList *pr, nodeptr p)
+{
+  nodeptr firstNeighbour  = p->next->back;
+  nodeptr secondNeighbour = p->next->next->back;
+  int slots;
+
+  pllUpdatePartials(tr, pr, firstNeighbour, PLL_FALSE);
+  pllUpdatePartials(tr, pr, secondNeighbour, PLL_FALSE);
+
+  slots = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+  hookup(firstNeighbour, secondNeighbour, tr->currentZQR, slots);
+
+  /* detach p from both former neighbours */
+  p->next->back = (node *) NULL;
+  p->next->next->back = (node *) NULL;
+
+  return firstNeighbour;
+}
+
+/** @brief Connect two disconnected tree components
+
+   Connect two disconnected components by specifying an internal edge from one
+   component and a leaf from the other component. The internal edge \a e is the
+   edge between \a q and \a q->back. The leaf is specified by \a p.
+   Edge \a e is removed and two new edges are created. The first one is an edge
+   between \a p->next and \a q, and the second one is between \a p->next->next
+   and \a q->back. The new likelihood vector for node \a p is computed.
+
+   The current branch lengths of \a q are saved in \a tr->lzi. With
+   \a tr->thoroughInsertion set, start values for the three branches around
+   \a p are derived from pairwise estimates between \a q, \a q->back and
+   \a p->back, the edge \a p - \a p->back is re-hooked too, localSmooth() is
+   run on \a p and the resulting lengths are stored in \a tr->lzq, \a tr->lzr
+   and \a tr->lzs. Without it, both new edges get the square root of the old
+   branch length of \a q, clamped to [\b PLL_ZMIN, \b PLL_ZMAX].
+
+   @param tr
+     The library instance
+
+   @param pr
+     List of partitions
+
+   @param p
+     Root node of the component to insert
+
+   @param q
+     Endpoint of the edge into which \a p is inserted
+
+   @return
+     \b PLL_TRUE on success. \b PLL_FALSE if the temporary work buffer could
+     not be allocated; in that case no edge has been changed.
+
+   @note The function makes use of the \a thoroughInsertion flag
+
+   @todo
+     What is tr->lzi ? What is thorough insertion? Why do we optimize branch lengths
+     that will be removed? Add explanation
+
+   @image html pll.png "The diagram shows in blue colors the new edges that are created and in red the edge that is removed" 
+*/
+pllBoolean insertBIG (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{
+  nodeptr  r, s;
+  int i;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+  const size_t n = (size_t)numBranches;
+
+  r = q->back;
+  s = p->back;
+
+  for(i = 0; i < numBranches; i++)
+    tr->lzi[i] = q->z[i];
+
+  if(tr->thoroughInsertion)
+  {
+    /* one allocation carved into the seven per-slot work arrays */
+    double *work = (double *)rax_malloc(7 * n * sizeof(double));
+    double *zqr, *zqs, *zrs, *defaultArray, *e1, *e2, *e3;
+    double lzqr, lzqs, lzrs, lzsum, lzq, lzr, lzs;
+    /* loop invariants: logarithms of the branch-length bounds */
+    const double lzmin = log(PLL_ZMIN);
+    const double lzmax = log(PLL_ZMAX);
+
+    if (work == NULL)
+      return PLL_FALSE;
+
+    zqr          = work;
+    zqs          = zqr + n;
+    zrs          = zqs + n;
+    defaultArray = zrs + n;
+    e1           = defaultArray + n;
+    e2           = e1 + n;
+    e3           = e2 + n;
+
+    for(i = 0; i < numBranches; i++)
+      defaultArray[i] = PLL_DEFAULTZ;
+
+    makenewzGeneric(tr, pr, q, r, q->z, PLL_ITERATIONS, zqr, PLL_FALSE);
+    /* the branch lengths values will be estimated using q, r and s
+     * q-s are not connected, but both q and s have a valid LH vector , so we can call makenewzGeneric  to get a value for
+     * lzsum, which is then use to generate reasonable starting values e1, e2, e3 for the new branches we create after the
+     * insertion
+     */
+
+    makenewzGeneric(tr, pr, q, s, defaultArray, PLL_ITERATIONS, zqs, PLL_FALSE);
+    makenewzGeneric(tr, pr, r, s, defaultArray, PLL_ITERATIONS, zrs, PLL_FALSE);
+
+    for(i = 0; i < numBranches; i++)
+    {
+      lzqr = (zqr[i] > PLL_ZMIN) ? log(zqr[i]) : lzmin;
+      lzqs = (zqs[i] > PLL_ZMIN) ? log(zqs[i]) : lzmin;
+      lzrs = (zrs[i] > PLL_ZMIN) ? log(zrs[i]) : lzmin;
+      lzsum = 0.5 * (lzqr + lzqs + lzrs);
+
+      lzq = lzsum - lzrs;
+      lzr = lzsum - lzqs;
+      lzs = lzsum - lzqr;
+
+      if      (lzq > lzmax) {lzq = lzmax; lzr = lzqr; lzs = lzqs;}
+      else if (lzr > lzmax) {lzr = lzmax; lzq = lzqr; lzs = lzrs;}
+      else if (lzs > lzmax) {lzs = lzmax; lzq = lzqs; lzr = lzrs;}
+
+      e1[i] = exp(lzq);
+      e2[i] = exp(lzr);
+      e3[i] = exp(lzs);
+    }
+
+    hookup(p->next,       q, e1, numBranches);
+    hookup(p->next->next, r, e2, numBranches);
+    hookup(p,             s, e3, numBranches);
+
+    rax_free(work);
+  }
+  else
+  {
+    double *z = (double *)rax_malloc(n * sizeof(double));
+
+    if (z == NULL)
+      return PLL_FALSE;
+
+    for(i = 0; i < numBranches; i++)
+    {
+      z[i] = sqrt(q->z[i]);
+
+      if(z[i] < PLL_ZMIN)
+        z[i] = PLL_ZMIN;
+      if(z[i] > PLL_ZMAX)
+        z[i] = PLL_ZMAX;
+    }
+
+    hookup(p->next,       q, z, numBranches);
+    hookup(p->next->next, r, z, numBranches);
+    rax_free(z);
+  }
+
+  pllUpdatePartials(tr, pr,p, PLL_FALSE);
+
+  if(tr->thoroughInsertion)
+  {
+    localSmooth(tr, pr, p, PLL_MAX_LOCAL_SMOOTHING_ITERATIONS);
+    for(i = 0; i < numBranches; i++)
+    {
+      tr->lzq[i] = p->next->z[i];
+      tr->lzr[i] = p->next->next->z[i];
+      tr->lzs[i] = p->z[i];
+    }
+  }
+
+  return  PLL_TRUE;
+}
+
+/** @brief Connect two disconnected tree components without optimizing branch lengths
+
+   Connect two disconnected components by specifying an internal edge from one
+   component and a leaf from the other component. The internal edge \a e is the
+   edge between \a q and \a q->back. The leaf is specified by \a p.
+   Edge \a e is removed and two new edges are created. The first one is an edge
+   between \a p->next and \a q, and the second one is between \a p->next->next
+   and \a q->back. The new likelihood vector for node \a p is computed.
+
+   With \a tr->thoroughInsertion set, the three edges around \a p are hooked up
+   with the stored lengths \a tr->currentLZQ, \a tr->currentLZR and
+   \a tr->currentLZS. Otherwise both new edges get the square root of the
+   branch length of \a q, clamped to [\b PLL_ZMIN, \b PLL_ZMAX]. In contrast to
+   insertBIG() no branch length is estimated or smoothed here.
+
+   @param tr
+     The library instance
+
+   @param pr
+     List of partitions
+
+   @param p
+     Root node of the component to insert
+
+   @param q
+     Endpoint of the edge into which \a p is inserted
+
+   @return
+     Always \b PLL_TRUE
+
+   @note The function makes use of the \a thoroughInsertion flag
+*/
+pllBoolean insertRestoreBIG (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{
+  nodeptr qBack = q->back;
+  nodeptr pBack = p->back;
+  const int slots = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  if (!tr->thoroughInsertion)
+  {
+    double halved[PLL_NUM_BRANCHES];
+    int k;
+
+    for (k = 0; k < slots; k++)
+    {
+      double root = sqrt(q->z[k]);
+
+      if (root < PLL_ZMIN)
+        root = PLL_ZMIN;
+      if (root > PLL_ZMAX)
+        root = PLL_ZMAX;
+
+      halved[k] = root;
+    }
+
+    hookup(p->next,       q,     halved, slots);
+    hookup(p->next->next, qBack, halved, slots);
+  }
+  else
+  {
+    hookup(p->next,       q,     tr->currentLZQ, slots);
+    hookup(p->next->next, qBack, tr->currentLZR, slots);
+    hookup(p,             pBack, tr->currentLZS, slots);
+  }
+
+  pllUpdatePartials(tr, pr, p, PLL_FALSE);
+
+  return PLL_TRUE;
+}
+
+
+/** @brief Temporarily apply the best move of the current node and record it in \a bt
+
+    Rebuilds the topology described by \a tr->removeNode / \a tr->insertNode
+    using the stored branch lengths (\a tr->currentZQR and, for thorough
+    insertions, \a tr->currentLZQ, \a tr->currentLZR, \a tr->currentLZS),
+    passes it to saveBestTree() with \a tr->likelihood temporarily set to
+    \a tr->bestOfNode, and then puts every edge and \a tr->likelihood back to
+    the values they had on entry. No likelihood is recomputed here.
+
+    @param tr
+      The library instance
+
+    @param bt
+      Best-tree list that receives the temporary topology
+
+    @param numBranches
+      Number of branch-length slots per edge
+*/
+static void restoreTopologyOnly(pllInstance *tr, bestlist *bt, int numBranches)
+{
+  nodeptr pruned = tr->removeNode;
+  nodeptr target = tr->insertNode;
+  nodeptr left, right, targetBack, prunedBack;
+  double targetZ[PLL_NUM_BRANCHES], prunedZ[PLL_NUM_BRANCHES];
+  double leftZ[PLL_NUM_BRANCHES], rightZ[PLL_NUM_BRANCHES];
+  double savedLH = tr->likelihood;
+  int k;
+
+  left  = pruned->next->back;
+  right = pruned->next->next->back;
+
+  /* remember the lengths of the two edges that are about to be merged */
+  for (k = 0; k < numBranches; k++)
+  {
+    leftZ[k]  = left->z[k];
+    rightZ[k] = right->z[k];
+  }
+
+  /* prune: join the two neighbours and detach the subtree */
+  hookup(left, right, tr->currentZQR, numBranches);
+
+  pruned->next->next->back = pruned->next->back = (node *) NULL;
+
+  for (k = 0; k < numBranches; k++)
+  {
+    targetZ[k] = target->z[k];
+    prunedZ[k] = pruned->z[k];
+  }
+
+  targetBack = target->back;
+  prunedBack = pruned->back;
+
+  /* regraft into the target edge */
+  if (tr->thoroughInsertion)
+  {
+    hookup(pruned->next,       target,     tr->currentLZQ, numBranches);
+    hookup(pruned->next->next, targetBack, tr->currentLZR, numBranches);
+    hookup(pruned,             prunedBack, tr->currentLZS, numBranches);
+  }
+  else
+  {
+    double halved[PLL_NUM_BRANCHES];
+
+    for (k = 0; k < numBranches; k++)
+    {
+      halved[k] = sqrt(target->z[k]);
+      if (halved[k] < PLL_ZMIN)
+        halved[k] = PLL_ZMIN;
+      if (halved[k] > PLL_ZMAX)
+        halved[k] = PLL_ZMAX;
+    }
+    hookup(pruned->next,       target,     halved, numBranches);
+    hookup(pruned->next->next, targetBack, halved, numBranches);
+  }
+
+  /* record the topology under the likelihood found for it, then undo */
+  tr->likelihood = tr->bestOfNode;
+
+  saveBestTree(bt, tr, numBranches);
+
+  tr->likelihood = savedLH;
+
+  hookup(target, targetBack, targetZ, numBranches);
+
+  pruned->next->next->back = pruned->next->back = (nodeptr) NULL;
+
+  if (tr->thoroughInsertion)
+    hookup(pruned, prunedBack, prunedZ, numBranches);
+
+  hookup(pruned->next,       left,  leftZ,  numBranches);
+  hookup(pruned->next->next, right, rightZ, numBranches);
+}
+
+/** @brief Test the insertion of a pruned subtree into one edge and undo it
+
+    Inserts \a p into the edge \a q - \a q->back with insertBIG(), evaluates
+    the likelihood and records the move when it improves on what was seen so
+    far: a likelihood above \a tr->bestOfNode updates \a bestOfNode,
+    \a insertNode, \a removeNode and the stored branch lengths
+    (\a currentZQR, \a currentLZR, \a currentLZQ, \a currentLZS); a likelihood
+    above \a tr->endLH updates \a endLH, \a insertNode, \a removeNode and
+    \a currentZQR. Afterwards the topology is put back to its state before the
+    insertion.
+
+    When \a tr->doCutoff is set and the likelihood fell below the value
+    \a tr->endLH had on entry, the loss is accumulated in \a tr->lhAVG and
+    \a tr->lhDEC is incremented.
+
+    @param tr
+      The library instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Root node of the pruned component
+
+    @param q
+      Endpoint of the edge to test the insertion at
+
+    @return
+      \b PLL_FALSE if insertBIG() failed, or if the cutoff is active and the
+      likelihood loss reached \a tr->lhCutoff; otherwise \b PLL_TRUE.
+*/
+pllBoolean testInsertBIG (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{
+  const int slots = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+  double savedQ[PLL_NUM_BRANCHES], savedP[PLL_NUM_BRANCHES];
+  const double lhOnEntry = tr->endLH;
+  nodeptr qBack = q->back;
+  int k;
+
+  for (k = 0; k < slots; k++)
+  {
+    savedQ[k] = q->z[k];
+    savedP[k] = p->z[k];
+  }
+
+  if (! insertBIG(tr, pr, p, q))
+    return PLL_FALSE;
+
+  pllEvaluateLikelihood (tr, pr, p->next->next, PLL_FALSE, PLL_FALSE);
+
+  /* best move for the subtree currently being rearranged */
+  if (tr->likelihood > tr->bestOfNode)
+  {
+    tr->bestOfNode = tr->likelihood;
+    tr->insertNode = q;
+    tr->removeNode = p;
+    for (k = 0; k < slots; k++)
+    {
+      tr->currentZQR[k] = tr->zqr[k];
+      tr->currentLZR[k] = tr->lzr[k];
+      tr->currentLZQ[k] = tr->lzq[k];
+      tr->currentLZS[k] = tr->lzs[k];
+    }
+  }
+
+  /* best move overall */
+  if (tr->likelihood > tr->endLH)
+  {
+    tr->insertNode = q;
+    tr->removeNode = p;
+    for (k = 0; k < slots; k++)
+      tr->currentZQR[k] = tr->zqr[k];
+    tr->endLH = tr->likelihood;
+  }
+
+  /* reset the topology so that it is the same as it was before calling insertBIG */
+  hookup(q, qBack, savedQ, slots);
+
+  p->next->next->back = p->next->back = (nodeptr) NULL;
+
+  if (tr->thoroughInsertion)
+    hookup(p, p->back, savedP, slots);
+
+  if (!(tr->doCutoff) || !(tr->likelihood < lhOnEntry))
+    return PLL_TRUE;
+
+  tr->lhAVG += (lhOnEntry - tr->likelihood);
+  tr->lhDEC++;
+
+  return ((lhOnEntry - tr->likelihood) >= tr->lhCutoff) ? PLL_FALSE : PLL_TRUE;
+}
+
+
+/** @brief Recursively traverse tree and test insertion
+
+    Recursively traverses the tree structure starting from node \a q and
+    tests the insertion of the component specified by \a p at the edge
+    between \a q and \a q->back. Both distance counters are decremented on the
+    way down; insertions are only tested once \a mintrav has run out, and the
+    descent stops at tips, when \a maxtrav has run out, or when
+    testInsertBIG() returns \b PLL_FALSE.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Root node of the pruned tree component
+
+    @param q
+      Endpoint node of the edge to test the insertion
+
+    @param mintrav
+      Minimum radius around \a q to test the insertion
+
+    @param maxtrav
+      Maximum radius around \a q to test the insertion
+*/
+void addTraverseBIG(pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav)
+{
+  mintrav--;
+
+  /* close enough to test this edge; a rejected test prunes the whole descent */
+  if (mintrav <= 0 && ! testInsertBIG(tr, pr, p, q))
+    return;
+
+  if (isTip(q->number, tr->mxtips))
+    return;
+
+  maxtrav--;
+  if (maxtrav <= 0)
+    return;
+
+  addTraverseBIG(tr, pr, p, q->next->back, mintrav, maxtrav);
+  addTraverseBIG(tr, pr, p, q->next->next->back, mintrav, maxtrav);
+}
+
+
+
+
+/* Test insertions of `pruned` into the edges below both children of `anchor`;
+   does nothing when `anchor` is a tip. */
+static void rearrangeBIGscanBelow(pllInstance *tr, partitionList *pr, nodeptr pruned, nodeptr anchor, int mintrav, int maxtrav)
+{
+  if (isTip(anchor->number, tr->mxtips))
+    return;
+
+  addTraverseBIG(tr, pr, pruned, anchor->next->back, mintrav, maxtrav);
+  addTraverseBIG(tr, pr, pruned, anchor->next->next->back, mintrav, maxtrav);
+}
+
+/* Non-zero when `n` is an inner node with at least one inner child. */
+static int rearrangeBIGhasInnerChild(pllInstance *tr, nodeptr n)
+{
+  return ! isTip(n->number, tr->mxtips) &&
+         (! isTip(n->next->back->number, tr->mxtips) ||
+          ! isTip(n->next->next->back->number, tr->mxtips));
+}
+
+/** @brief  Compute the best SPR movement
+
+    Compute all SPR moves starting from \a p in the space defined by \a mintrav and
+    \a maxtrav and store the best in the \a tr structure.
+
+    Two prunings are tried in turn. First the node \a p is detached from its
+    two neighbours (unless \a p is a tip or both neighbours are tips) and
+    reinsertion is tested below each inner neighbour. Then the same is done
+    for \a p->back, provided it is an inner node and at least one of its
+    neighbours is an inner node with an inner child; here the minimum radius
+    is raised to at least 2. After each pruning the original edges and branch
+    lengths are restored and the partials of the pruned node are recomputed.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node from which to start the SPR moves testing
+
+    @param mintrav
+      Minimum distance from \a p where to start testing SPRs
+
+    @param maxtrav
+      Maximum distance from \a p where to test SPRs
+
+    @return
+       0 if the radius arguments are invalid, \b PLL_BADREAR if
+       removeNodeBIG() failed, otherwise 1
+
+    @todo
+      fix the return value
+*/
+int rearrangeBIG(pllInstance *tr, partitionList *pr, nodeptr p, int mintrav, int maxtrav)
+{
+  double   p1z[PLL_NUM_BRANCHES], p2z[PLL_NUM_BRANCHES], q1z[PLL_NUM_BRANCHES], q2z[PLL_NUM_BRANCHES];
+  nodeptr  p1, p2, q, q1, q2;
+  int      i, mintrav2;
+  const int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  if (maxtrav < 1 || mintrav > maxtrav)
+    return (0);
+
+  q = p->back;
+
+  /* --- prune at p --- */
+  if (! isTip(p->number, tr->mxtips))
+  {
+    p1 = p->next->back;
+    p2 = p->next->next->back;
+
+    if (! (isTip(p1->number, tr->mxtips) && isTip(p2->number, tr->mxtips)))
+    {
+      for (i = 0; i < numBranches; i++)
+      {
+        p1z[i] = p1->z[i];
+        p2z[i] = p2->z[i];
+      }
+
+      if (removeNodeBIG(tr, pr, p, numBranches) == (nodeptr)NULL)
+        return PLL_BADREAR;
+
+      rearrangeBIGscanBelow(tr, pr, p, p1, mintrav, maxtrav);
+      rearrangeBIGscanBelow(tr, pr, p, p2, mintrav, maxtrav);
+
+      /* put p back where it was */
+      hookup(p->next,       p1, p1z, numBranches);
+      hookup(p->next->next, p2, p2z, numBranches);
+      pllUpdatePartials(tr, pr, p, PLL_FALSE);
+    }
+  }
+
+  /* --- prune at p->back (maxtrav >= 1 is guaranteed by the guard above) --- */
+  if (! isTip(q->number, tr->mxtips))
+  {
+    q1 = q->next->back;
+    q2 = q->next->next->back;
+
+    if (rearrangeBIGhasInnerChild(tr, q1) || rearrangeBIGhasInnerChild(tr, q2))
+    {
+      for (i = 0; i < numBranches; i++)
+      {
+        q1z[i] = q1->z[i];
+        q2z[i] = q2->z[i];
+      }
+
+      if (removeNodeBIG(tr, pr, q, numBranches) == (nodeptr)NULL)
+        return PLL_BADREAR;
+
+      mintrav2 = mintrav > 2 ? mintrav : 2;
+
+      rearrangeBIGscanBelow(tr, pr, q, q1, mintrav2, maxtrav);
+      rearrangeBIGscanBelow(tr, pr, q, q2, mintrav2, maxtrav);
+
+      /* put q back where it was */
+      hookup(q->next,       q1, q1z, numBranches);
+      hookup(q->next->next, q2, q2z, numBranches);
+
+      pllUpdatePartials(tr, pr, q, PLL_FALSE);
+    }
+  }
+
+  return  1;
+}
+
+
+
+
+/** @brief Run one round of SPR moves over all nodes of the tree
+
+    For every node index 1 .. 2 * \a tr->mxtips - 2, rearrangeBIG() is called
+    with the given radii. Whenever a move raised \a tr->endLH above
+    \a tr->startLH, it is applied with restoreTreeFast().
+
+    With \a tr->thoroughInsertion set, applied moves are stored in \a bt with
+    saveBestTree(), and non-improving best moves are recorded through
+    restoreTopologyOnly(). Otherwise the best likelihood per node is collected
+    in \a iList, and afterwards the \a iList->valid collected nodes are
+    re-examined with \a tr->thoroughInsertion temporarily switched on.
+
+    When \a tr->doCutoff is set, \a tr->lhCutoff is recomputed at the start
+    from \a tr->likelihood (first call, \a itCount == 0) or from
+    \a tr->lhAVG / \a tr->lhDEC (later calls), halved if \a tr->bigCutoff is
+    set; \a itCount is incremented and the two accumulators are cleared.
+
+    NOTE: \a tr->permuteTreeoptimize is not honoured. No permutation of the
+    node order is generated anywhere in this function; the previous code
+    indexed a permutation array that was always \b NULL when the flag was set.
+    Nodes are therefore always visited in index order.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param mintrav
+      Minimum rearrangement radius passed to rearrangeBIG()
+
+    @param maxtrav
+      Maximum rearrangement radius passed to rearrangeBIG(); capped at
+      \a tr->mxtips - 3
+
+    @param bt
+      Best-tree list; reset on entry
+
+    @param iList
+      Candidate node list; reset on entry
+
+    @return
+      \a tr->startLH, i.e. the likelihood after the last applied move
+*/
+double treeOptimizeRapid(pllInstance *tr, partitionList *pr, int mintrav, int maxtrav, bestlist *bt, infoList *iList)
+{
+  int i, index;
+  const int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  nodeRectifier(tr);
+
+  if (maxtrav > tr->mxtips - 3)
+    maxtrav = tr->mxtips - 3;
+
+  resetInfoList(iList);
+
+  resetBestTree(bt);
+
+  tr->startLH = tr->endLH = tr->likelihood;
+
+  if(tr->doCutoff)
+  {
+    if(tr->bigCutoff)
+    {
+      if(tr->itCount == 0)
+        tr->lhCutoff = 0.5 * (tr->likelihood / -1000.0);
+      else
+        tr->lhCutoff = 0.5 * ((tr->lhAVG) / ((double)(tr->lhDEC)));
+    }
+    else
+    {
+      if(tr->itCount == 0)
+        tr->lhCutoff = tr->likelihood / -1000.0;
+      else
+        tr->lhCutoff = (tr->lhAVG) / ((double)(tr->lhDEC));
+    }
+
+    tr->itCount = tr->itCount + 1;
+    tr->lhAVG = 0;
+    tr->lhDEC = 0;
+  }
+
+  for(i = 1; i <= tr->mxtips + tr->mxtips - 2; i++)
+  {
+    tr->bestOfNode = PLL_UNLIKELY;
+
+    /* identity order; see NOTE on permuteTreeoptimize above */
+    index = i;
+
+    if(rearrangeBIG(tr, pr, tr->nodep[index], mintrav, maxtrav))
+    {
+      if(tr->thoroughInsertion)
+      {
+        if(tr->endLH > tr->startLH)
+        {
+          /* commit the best SPR found by rearrangeBIG */
+          restoreTreeFast(tr, pr);
+          tr->startLH = tr->endLH = tr->likelihood;
+          saveBestTree(bt, tr, numBranches);
+        }
+        else
+        {
+          if(tr->bestOfNode != PLL_UNLIKELY)
+            restoreTopologyOnly(tr, bt, numBranches);
+        }
+      }
+      else
+      {
+        insertInfoList(tr->nodep[index], tr->bestOfNode, iList);
+        if(tr->endLH > tr->startLH)
+        {
+          restoreTreeFast(tr, pr);
+          tr->startLH = tr->endLH = tr->likelihood;
+        }
+      }
+    }
+  }
+
+  if(!tr->thoroughInsertion)
+  {
+    /* second pass: re-examine the collected candidates thoroughly */
+    tr->thoroughInsertion = PLL_TRUE;
+
+    for(i = 0; i < iList->valid; i++)
+    {
+      tr->bestOfNode = PLL_UNLIKELY;
+
+      if(rearrangeBIG(tr, pr, iList->list[i].node, mintrav, maxtrav))
+      {
+        if(tr->endLH > tr->startLH)
+        {
+          restoreTreeFast(tr, pr);
+          tr->startLH = tr->endLH = tr->likelihood;
+          saveBestTree(bt, tr, numBranches);
+        }
+        else
+        {
+          if(tr->bestOfNode != PLL_UNLIKELY)
+            restoreTopologyOnly(tr, bt, numBranches);
+        }
+      }
+    }
+
+    tr->thoroughInsertion = PLL_FALSE;
+  }
+
+  return tr->startLH;
+}
+
+
+
+
+/** @brief Re-insert a pruned subtree when committing a stored move
+
+    With \a tr->thoroughInsertion set, the subtree is inserted with
+    insertBIG() and the likelihood is evaluated. Otherwise it is inserted with
+    insertRestoreBIG(), the partials of \a p->next->next and \a p->back are
+    updated (inner nodes only) until their \a x flag is set, and
+    \a tr->likelihood is set to \a tr->endLH without evaluating it.
+
+    @param tr
+      The library instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Root node of the pruned component
+
+    @param q
+      Endpoint of the edge to insert into
+
+    @return
+      \b PLL_FALSE if the insertion routine failed, otherwise \b PLL_TRUE
+*/
+pllBoolean testInsertRestoreBIG (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q)
+{
+  nodeptr x, y;
+  int xInner, yInner;
+
+  if (tr->thoroughInsertion)
+  {
+    if (! insertBIG(tr, pr, p, q))
+      return PLL_FALSE;
+
+    pllEvaluateLikelihood (tr, pr, p->next->next, PLL_FALSE, PLL_FALSE);
+    return PLL_TRUE;
+  }
+
+  if (! insertRestoreBIG(tr, pr, p, q))
+    return PLL_FALSE;
+
+  x = p->next->next;
+  y = p->back;
+
+  /* tips are never updated */
+  xInner = ! isTip(x->number, tr->mxtips);
+  yInner = ! isTip(y->number, tr->mxtips);
+
+  while ((xInner && ! x->x) || (yInner && ! y->x))
+  {
+    if (xInner && ! x->x)
+      pllUpdatePartials(tr, pr, x, PLL_FALSE);
+    if (yInner && ! y->x)
+      pllUpdatePartials(tr, pr, y, PLL_FALSE);
+  }
+
+  tr->likelihood = tr->endLH;
+
+  return PLL_TRUE;
+}
+
+/** @brief Apply the stored best move to the tree
+
+    Detaches \a tr->removeNode with removeNodeRestoreBIG() and re-inserts it at
+    \a tr->insertNode with testInsertRestoreBIG(). The return values of both
+    calls are ignored.
+
+    @param tr
+      The library instance
+
+    @param pr
+      List of partitions
+*/
+void restoreTreeFast(pllInstance *tr, partitionList *pr)
+{
+  nodeptr pruned = tr->removeNode;
+
+  (void) removeNodeRestoreBIG(tr, pr, pruned);
+  (void) testInsertRestoreBIG(tr, pr, pruned, tr->insertNode);
+}
+
+/*
+static void myfwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+  size_t  
+    bytes_written = fwrite(ptr, size, nmemb, stream);
+
+  assert(bytes_written == nmemb);
+}
+
+static void myfread(void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+  size_t
+    bytes_read;
+
+  bytes_read = fread(ptr, size, nmemb, stream);
+
+  assert(bytes_read == nmemb);
+}
+
+static void readTree(pllInstance *tr, partitionList *pr, FILE *f)
+{
+  int 
+    nodeNumber,   
+    x = tr->mxtips + 3 * (tr->mxtips - 1);
+
+  nodeptr
+    startAddress;
+
+  myfread(&nodeNumber, sizeof(int), 1, f);
+
+  tr->start = tr->nodep[nodeNumber];
+
+
+  myfread(&startAddress, sizeof(nodeptr), 1, f);
+
+  myfread(tr->nodeBaseAddress, sizeof(node), x, f);
+
+  {
+    int i;    
+
+    size_t         
+      offset;
+
+    pllBoolean 
+      addIt;
+
+    if(startAddress > tr->nodeBaseAddress)
+    {
+      addIt = PLL_FALSE;
+      offset = (size_t)startAddress - (size_t)tr->nodeBaseAddress;
+    }
+    else
+    {
+      addIt = PLL_TRUE;
+      offset = (size_t)tr->nodeBaseAddress - (size_t)startAddress;
+    }       
+
+    for(i = 0; i < x; i++)
+    {      	
+      if(addIt)
+      {	    
+        tr->nodeBaseAddress[i].next = (nodeptr)((size_t)tr->nodeBaseAddress[i].next + offset);	
+        tr->nodeBaseAddress[i].back = (nodeptr)((size_t)tr->nodeBaseAddress[i].back + offset);
+      }
+      else
+      {
+
+        tr->nodeBaseAddress[i].next = (nodeptr)((size_t)tr->nodeBaseAddress[i].next - offset);	
+        tr->nodeBaseAddress[i].back = (nodeptr)((size_t)tr->nodeBaseAddress[i].back - offset);	   
+      } 
+    }
+
+  }
+
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+  printBothOpen("RAxML Restart with likelihood: %1.50f\n", tr->likelihood);
+}
+
+static void readCheckpoint(pllInstance *tr, partitionList *pr)
+{
+  int  
+    restartErrors = 0,
+                  model; 
+
+  FILE 
+    *f = myfopen(binaryCheckpointInputName, "r");
+*/
+  /* cdta */   
+/*
+  myfread(&(tr->ckp), sizeof(checkPointState), 1, f);
+
+
+
+  if(tr->ckp.searchConvergenceCriterion != tr->searchConvergenceCriterion)
+  {
+    printf("restart error, you are trying to re-start a run where the ML search criterion was turned %s\n", (tr->ckp.searchConvergenceCriterion)?"ON":"OFF");
+    restartErrors++;
+  }  
+
+  if(tr->ckp.rateHetModel !=  tr->rateHetModel)
+  {
+    printf("restart error, you are trying to re-start a run with a different model of rate heterogeneity, the checkpoint was obtained under: %s\n", (tr->ckp.rateHetModel == PLL_GAMMA)?"GAMMA":"PSR");
+    restartErrors++;
+  }  
+
+  if(tr->ckp.maxCategories !=  tr->maxCategories)
+  {
+    printf("restart error, you are trying to re-start a run with %d per-site rate categories, the checkpoint was obtained with: %d\n", tr->maxCategories, tr->ckp.maxCategories);
+    restartErrors++;
+  }
+
+  if(tr->ckp.NumberOfModels != pr->numberOfPartitions)
+  {
+    printf("restart error, you are trying to re-start a run with %d partitions, the checkpoint was obtained with: %d partitions\n", (int)pr->numberOfPartitions, tr->ckp.NumberOfModels);
+    restartErrors++;      
+  }
+
+  if(tr->ckp.numBranches != pr->perGeneBranchLengths?pr->numberOfPartitions:1)
+  {
+    printf("restart error, you are trying to re-start a run where independent per-site branch length estimates were turned %s\n", (tr->ckp.numBranches > 1)?"ON":"OFF");
+    restartErrors++;
+  }
+
+  if(tr->ckp.originalCrunchedLength != tr->originalCrunchedLength)
+  {
+    printf("restart error, you are trying to re-start a run with %d site patterns, the checkpoint was obtained with: %d site patterns\n", tr->ckp.originalCrunchedLength, tr->originalCrunchedLength);
+    restartErrors++; 
+  }
+
+  if(tr->ckp.mxtips != tr->mxtips)
+  {
+    printf("restart error, you are trying to re-start a run with %d taxa, the checkpoint was obtained with: %d taxa\n", tr->mxtips, tr->ckp.mxtips);
+    restartErrors++; 
+  }
+
+  if(strcmp(tr->ckp.seq_file, seq_file) != 0)
+  {
+    printf("restart error, you are trying to re-start from alignemnt file %s, the checkpoint was obtained with file: %s\n", tr->ckp.seq_file, seq_file);
+    restartErrors++; 
+  }
+
+  printf("REstart errors: %d\n", restartErrors);
+
+  if(restartErrors > 0)
+  {
+    printf("User induced errors with the restart from checkpoint, exiting ...\n");
+
+    if(restartErrors > 4)
+      printf(" ... maybe you should do field work instead of trying to use a computer ...\n");
+    if(restartErrors > 6)
+      printf(" ... kala eisai telios ilithios;\n");
+
+    exit(-1);
+  }
+
+  tr->ntips = tr->mxtips;
+
+  tr->startLH    = tr->ckp.tr_startLH;
+  tr->endLH      = tr->ckp.tr_endLH;
+  tr->likelihood = tr->ckp.tr_likelihood;
+  tr->bestOfNode = tr->ckp.tr_bestOfNode;
+
+  tr->lhCutoff   = tr->ckp.tr_lhCutoff;
+  tr->lhAVG      = tr->ckp.tr_lhAVG;
+  tr->lhDEC      = tr->ckp.tr_lhDEC;
+  tr->itCount    = tr->ckp.tr_itCount;
+  tr->thoroughInsertion       = tr->ckp.tr_thoroughInsertion;
+
+
+
+  accumulatedTime = tr->ckp.accumulatedTime;
+*/
+  /* printf("Accumulated time so far: %f\n", accumulatedTime); */
+/*
+  tr->optimizeRateCategoryInvocations = tr->ckp.tr_optimizeRateCategoryInvocations;
+
+
+  myfread(tr->tree0, sizeof(char), tr->treeStringLength, f);
+  myfread(tr->tree1, sizeof(char), tr->treeStringLength, f);
+
+  if(tr->searchConvergenceCriterion)
+  {
+    int bCounter = 0;
+
+    if((tr->ckp.state == PLL_FAST_SPRS && tr->ckp.fastIterations > 0) ||
+        (tr->ckp.state == PLL_SLOW_SPRS && tr->ckp.thoroughIterations > 0))
+    { 
+
+#ifdef _DEBUG_CHECKPOINTING    
+      printf("parsing Tree 0\n");
+#endif
+
+      treeReadTopologyString(tr->tree0, tr);   
+
+      bitVectorInitravSpecial(tr->bitVectors, tr->nodep[1]->back, tr->mxtips, tr->vLength, tr->h, 0, PLL_BIPARTITIONS_RF, (branchInfo *)NULL,
+          &bCounter, 1, PLL_FALSE, PLL_FALSE, tr->threadID);
+
+      assert(bCounter == tr->mxtips - 3);
+    }
+
+    bCounter = 0;
+
+    if((tr->ckp.state == PLL_FAST_SPRS && tr->ckp.fastIterations > 1) ||
+        (tr->ckp.state == PLL_SLOW_SPRS && tr->ckp.thoroughIterations > 1))
+    {
+
+#ifdef _DEBUG_CHECKPOINTING
+      printf("parsing Tree 1\n");
+#endif
+
+      treeReadTopologyString(tr->tree1, tr); 
+
+      bitVectorInitravSpecial(tr->bitVectors, tr->nodep[1]->back, tr->mxtips, tr->vLength, tr->h, 1, PLL_BIPARTITIONS_RF, (branchInfo *)NULL,
+          &bCounter, 1, PLL_FALSE, PLL_FALSE, tr->threadID);
+
+      assert(bCounter == tr->mxtips - 3);
+    }
+  }
+
+  myfread(tr->rateCategory, sizeof(int), tr->originalCrunchedLength, f);
+  myfread(tr->patrat, sizeof(double), tr->originalCrunchedLength, f);
+  myfread(tr->patratStored, sizeof(double), tr->originalCrunchedLength, f);
+
+*/
+  /* need to read this as well in checkpoints, otherwise the branch lengths 
+     in the output tree files will be wrong, not the internal branch lengths though */
+/*
+  //TODO: Same problem as writing the checkpoint
+  //myfread(tr->fracchanges,  sizeof(double), pr->numberOfPartitions, f);
+  myfread(&(tr->fracchange),   sizeof(double), 1, f);
+*/
+  /* pInfo */
+/*
+  for(model = 0; model < pr->numberOfPartitions; model++)
+  {
+    int 
+      dataType = pr->partitionData[model]->dataType;
+
+    myfread(&(pr->partitionData[model]->numberOfCategories), sizeof(int), 1, f);
+    myfread(pr->partitionData[model]->perSiteRates, sizeof(double), tr->maxCategories, f);
+    myfread(pr->partitionData[model]->EIGN, sizeof(double), pLengths[dataType].eignLength, f);
+    myfread(pr->partitionData[model]->EV, sizeof(double),  pLengths[dataType].evLength, f);
+    myfread(pr->partitionData[model]->EI, sizeof(double),  pLengths[dataType].eiLength, f);
+
+    myfread(pr->partitionData[model]->frequencies, sizeof(double),  pLengths[dataType].frequenciesLength, f);
+    myfread(pr->partitionData[model]->tipVector, sizeof(double),  pLengths[dataType].tipVectorLength, f);
+    myfread(pr->partitionData[model]->substRates, sizeof(double),  pLengths[dataType].substRatesLength, f);
+    myfread(&(pr->partitionData[model]->alpha), sizeof(double), 1, f);
+    
+    if(pr->partitionData[model]->protModels == PLL_LG4M || pr->partitionData[model]->protModels == PLL_LG4X)
+	{
+	  int 
+	    k;
+	  
+	  for(k = 0; k < 4; k++)
+	    {
+	      myfread(pr->partitionData[model]->EIGN_LG4[k], sizeof(double), pLengths[dataType].eignLength, f);
+	      myfread(pr->partitionData[model]->EV_LG4[k], sizeof(double),  pLengths[dataType].evLength, f);
+	      myfread(pr->partitionData[model]->EI_LG4[k], sizeof(double),  pLengths[dataType].eiLength, f);    
+	      myfread(pr->partitionData[model]->frequencies_LG4[k], sizeof(double),  pLengths[dataType].frequenciesLength, f);
+	      myfread(pr->partitionData[model]->tipVector_LG4[k], sizeof(double),  pLengths[dataType].tipVectorLength, f);  
+	      myfread(pr->partitionData[model]->substRates_LG4[k], sizeof(double),  pLengths[dataType].substRatesLength, f);    
+	    }
+	}
+
+    pllMakeGammaCats(pr->partitionData[model]->alpha, pr->partitionData[model]->gammaRates, 4, tr->useMedian);
+  }
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier (tr, pr, PLL_THREAD_COPY_INIT_MODEL);
+#endif
+
+  updatePerSiteRates(tr, pr, PLL_FALSE);
+
+  readTree(tr, pr, f);
+
+  fclose(f); 
+
+}
+
+void restart(pllInstance *tr, partitionList *pr)
+{  
+  readCheckpoint(tr, pr);
+
+  switch(tr->ckp.state)
+  {
+    case PLL_REARR_SETTING:      
+      break;
+    case PLL_FAST_SPRS:
+      break;
+    case PLL_SLOW_SPRS:
+      break;
+    default:
+      assert(0);
+  }
+}
+*/
+
+/** @brief Smooth the branch lengths of the tree and re-evaluate its likelihood
+
+    Runs the branch length smoothing routine with an explicitly given
+    iteration limit (\a maxSmoothIterations) and afterwards evaluates the
+    likelihood of the tree at \a tr->start.
+
+    @param tr
+      The PLL instance
+
+    @param pr
+      List of partitions
+
+    @param maxSmoothIterations
+      Iteration limit handed to the branch length smoothing routine
+*/
+void pllOptimizeBranchLengths (pllInstance *tr, partitionList *pr, int maxSmoothIterations)
+{
+  /* the limit used to be hard-wired to (32 * smoothFactor) */
+  smoothTree (tr, pr, maxSmoothIterations);
+
+  /* evaluate at the start node, both trailing flags switched off */
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+}
+
+/** @brief Apply a single NNI move to the topology
+
+    Performs an NNI (Nearest Neighbor Interchange) around the branch that
+    connects \a p and \a q = \a p->back. With \a swap equal to
+    \b PLL_NNI_P_NEXT the subtrees hanging off \a p->next and \a q->next are
+    exchanged; for any other value of \a swap (normally
+    \b PLL_NNI_P_NEXTNEXT) the subtrees hanging off \a p->next->next and
+    \a q->next are exchanged. See the illustration for details.
+
+    @param tr
+      PLL instance
+
+    @param p
+      Node whose branch to \a p->back is the centre of the NNI
+
+    @param swap
+      Selects the slot of \a p taking part in the exchange:
+      \b PLL_NNI_P_NEXT uses \a p->next, everything else uses \a p->next->next
+
+    @return
+      \b PLL_TRUE on success. \b PLL_FALSE if \a p or \a p->back is a tip, in
+      which case \a errno is set to \b PLL_NNI_P_TIP or \b PLL_NNI_Q_TIP
+      respectively (the check on \a p->back is done first)
+
+    @todo
+      Error checking is only started here. Instead of checking the errors one
+      by one, a variadic function could take the results of all checks plus the
+      error code to assign when at least one of them fails
+
+    @image html nni.png "In case \a swap is set to \b PLL_NNI_P_NEXT then the dashed red edge between \a p and \a r is removed and the blue edges are created. If \a swap is set to \b PLL_NNI_P_NEXTNEXT then the dashed red edge between \a p and \a s is removed and the green edges are created. In both cases the black dashed edge is removed"
+*/
+int pllTopologyPerformNNI(pllInstance * tr, nodeptr p, int swap)
+{
+  nodeptr q = p->back;
+  nodeptr slot;      /* slot of p whose subtree is exchanged */
+  nodeptr detached;  /* subtree originally attached to that slot */
+
+  /* an NNI needs an inner branch: neither end may be a tip */
+  if (isTip(q->number, tr->mxtips))
+   {
+     errno = PLL_NNI_Q_TIP;
+     return (PLL_FALSE);
+   }
+  if (isTip(p->number, tr->mxtips))
+   {
+     errno = PLL_NNI_P_TIP;
+     return (PLL_FALSE);
+   }
+  assert(!isTip(q->number, tr->mxtips));
+  assert(!isTip(p->number, tr->mxtips));
+
+  slot     = (swap == PLL_NNI_P_NEXT) ? p->next : p->next->next;
+  detached = slot->back;
+
+  /* the second hookup reads slot->z only after the first hookup has run,
+     exactly as the two-branch formulation did */
+  hookupFull(slot,    q->next->back, q->next->z);
+  hookupFull(q->next, detached,      slot->z);
+
+  return PLL_TRUE;
+}
+
+/** @brief qsort comparator ordering NNI moves by ascending likelihood gain
+
+    Compares the \a deltaLH fields of two ::nniMove records directly. The
+    former implementation returned the scaled difference cast to \c int, which
+    reported moves differing by less than 1e-6 as equal and overflowed the
+    \c int range for large differences.
+
+    @param nni1
+      Pointer to the first ::nniMove
+
+    @param nni2
+      Pointer to the second ::nniMove
+
+    @return
+      Negative, zero or positive if the first move has a smaller, equal or
+      larger \a deltaLH than the second one
+*/
+static int cmp_nni(const void* nni1, const void* nni2) {
+	const nniMove* lhs = (const nniMove*) nni1;
+	const nniMove* rhs = (const nniMove*) nni2;
+
+	/* three-way compare without arithmetic on the values themselves */
+	return (lhs->deltaLH > rhs->deltaLH) - (lhs->deltaLH < rhs->deltaLH);
+}
+
+/** @brief Gets the best NNI move for a branch
+
+    Tries both NNI moves around the branch between \a p and \a p->back. For
+    each move the topology is changed, the partials at both ends of the branch
+    are recomputed, the central branch is optimized with \a update and the
+    likelihood is evaluated. Afterwards the topology and the central branch
+    length are put back. The move with the largest strictly positive gain over
+    \a curLH is returned; if neither move improves, a "no move" record
+    (\a nniType 0, \a deltaLH 0) is returned.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node to use as origin for performing NNI; neither \a p nor \a p->back
+      may be a tip (asserted)
+
+    @param curLH
+      The current likelihood, used as the reference for \a deltaLH
+
+    @return
+      The best NNI move. \a z holds the optimized and \a z0 the original
+      central branch lengths
+
+    @note The likelihood of each move is read from \a tr->likelihood only
+      after pllEvaluateLikelihood has been called for that move.
+*/
+nniMove getBestNNIForBran(pllInstance* tr, partitionList *pr, nodeptr p,
+		double curLH) {
+	nodeptr q = p->back;
+	assert( ! isTip(p->number, tr->mxtips));
+	assert( ! isTip(q->number, tr->mxtips));
+#ifdef _DEBUG_NNI
+	pllTreeToNewick(tr->tree_string, tr, tr->start->back, TRUE, FALSE, 0, 0, 0, SUMMARIZE_LH, 0,0);
+	fprintf(stderr, "%s\n", tr->tree_string);
+#endif
+
+	/* Backup the current branch length.
+	   NOTE(review): the loops below are bounded by pr->numberOfPartitions
+	   while the buffers have PLL_NUM_BRANCHES slots; the bound is kept because
+	   pllNniSearch reads the z / z0 arrays of the returned move with the
+	   same bound - confirm that numberOfPartitions <= PLL_NUM_BRANCHES holds. */
+	double z0[PLL_NUM_BRANCHES];
+	int i;
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		z0[i] = p->z[i];
+	}
+#ifdef _DEBUG_NNI
+	double lhOld = tr->likelihood;
+	printf("lhOld: %f \n", lhOld);
+#endif
+	double lh0 = curLH;
+
+#ifdef _DEBUG_NNI
+	printf("lh0: %f \n", lh0);
+#endif
+	/* nni0 means no NNI move is done; every field is initialised so that the
+	   record can be copied around safely */
+	nniMove nni0;
+	nni0.p = p;
+	nni0.nniType = 0;
+	nni0.deltaLH = 0;
+	nni0.likelihood = lh0;
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		nni0.z[i] = z0[i];
+		nni0.z0[i] = z0[i];
+	}
+
+	/* ---- NNI move of type 1 ---- */
+	pllTopologyPerformNNI(tr, p, PLL_NNI_P_NEXT);
+	/* Update partials, optimize the central branch, then evaluate */
+	pllUpdatePartials(tr, pr, p, PLL_FALSE);
+	pllUpdatePartials(tr, pr, q, PLL_FALSE);
+	update(tr, pr, p);
+	pllEvaluateLikelihood (tr, pr, p, PLL_FALSE, PLL_FALSE);
+	/* read the likelihood only now that it describes the changed topology */
+	double lh1 = tr->likelihood;
+
+	nniMove nni1;
+	nni1.p = p;
+	nni1.nniType = 1;
+	/* Store the optimized and unoptimized central branch length */
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		nni1.z[i] = p->z[i];
+		nni1.z0[i] = z0[i];
+	}
+	nni1.likelihood = lh1;
+	nni1.deltaLH = lh1 - lh0;
+#ifdef _DEBUG_NNI
+	printf("Delta likelihood of the 1.NNI move: %f\n", nni1.deltaLH);
+#endif
+
+	/* Undo the move by applying it a second time */
+	pllTopologyPerformNNI(tr, p, PLL_NNI_P_NEXT);
+	/* Restore the old branch length */
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		p->z[i] = z0[i];
+		p->back->z[i] = z0[i];
+	}
+
+#ifdef _DEBUG_NNI
+	printf("Restore topology\n");
+	pllTreeToNewick(tr->tree_string, tr, tr->start->back, TRUE, FALSE, 0, 0, 0, SUMMARIZE_LH, 0,0);
+	fprintf(stderr, "%s\n", tr->tree_string);
+	pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+	printf("Likelihood after restoring from NNI 1: %f\n", tr->likelihood);
+#endif
+
+	/* ---- NNI move of type 2 ---- */
+	pllTopologyPerformNNI(tr, p, PLL_NNI_P_NEXTNEXT);
+	pllUpdatePartials(tr, pr, p, PLL_FALSE);
+	pllUpdatePartials(tr, pr, q, PLL_FALSE);
+	update(tr, pr, p);
+	pllEvaluateLikelihood (tr, pr, p, PLL_FALSE, PLL_FALSE);
+	double lh2 = tr->likelihood;
+
+	/* Create the nniMove struct to store this move */
+	nniMove nni2;
+	nni2.p = p;
+	nni2.nniType = 2;
+
+	/* Store the optimized and unoptimized central branch length */
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		nni2.z[i] = p->z[i];
+		nni2.z0[i] = z0[i];
+	}
+	nni2.likelihood = lh2;
+	nni2.deltaLH = lh2 - lh0;
+#ifdef _DEBUG_NNI
+	printf("Delta likelihood of the 2.NNI move: %f\n", nni2.deltaLH);
+#endif
+
+	/* Undo the move and bring the partials back in line with the topology */
+	pllTopologyPerformNNI(tr, p, PLL_NNI_P_NEXTNEXT);
+	pllUpdatePartials(tr, pr, p, PLL_FALSE);
+	pllUpdatePartials(tr, pr, p->back, PLL_FALSE);
+	/* Restore the old branch length */
+	for (i = 0; i < pr->numberOfPartitions; i++) {
+		p->z[i] = z0[i];
+		p->back->z[i] = z0[i];
+	}
+
+	/* Pick the larger strictly positive gain; ties go to move 1. A type-2
+	   improvement is no longer lost when move 1 has a gain of exactly zero. */
+	if (nni1.deltaLH > 0 && nni1.deltaLH >= nni2.deltaLH) {
+		return nni1;
+	}
+	if (nni2.deltaLH > 0) {
+		return nni2;
+	}
+	return nni0;
+}
+
+/** @brief Collect the best NNI move of every inner branch in a subtree
+
+    If \a p is not a tip, the best NNI move for the branch at \a p (as computed
+    by ::getBestNNIForBran) is appended to \a nniList at position \a *cnt, and
+    the function recurses into the nodes behind every other slot of \a p.
+    Tips end the recursion without recording anything.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node at which the traversal currently stands
+
+    @param nniList
+      Output array for the recorded moves; no capacity check is done here
+
+    @param cnt
+      In/out: number of moves written to \a nniList so far
+
+    @param cnt_nni
+      In/out: number of recorded moves whose \a deltaLH is not zero
+
+    @param curLH
+      Reference likelihood handed on to ::getBestNNIForBran
+*/
+void evalNNIForSubtree(pllInstance* tr, partitionList *pr, nodeptr p,
+		nniMove* nniList, int* cnt, int* cnt_nni, double curLH) {
+	nodeptr slot;
+
+	/* nothing to evaluate at a tip */
+	if (isTip(p->number, tr->mxtips))
+		return;
+
+	nniList[*cnt] = getBestNNIForBran(tr, pr, p, curLH);
+	if (nniList[*cnt].deltaLH != 0.0)
+		++(*cnt_nni);
+	++(*cnt);
+
+	/* walk the remaining slots of this inner node and descend behind each */
+	for (slot = p->next; slot != p; slot = slot->next)
+		evalNNIForSubtree(tr, pr, slot->back, nniList, cnt, cnt_nni, curLH);
+}
+
+/** @brief Perform an NNI search
+
+    Evaluates the best NNI (Nearest Neighbor Interchange) move of every inner
+    branch, keeps the improving ones, filters out moves that conflict with a
+    better move and applies the remaining moves simultaneously. After applying
+    them all branches are re-optimized (and, if requested, the model
+    parameters as well). If the likelihood got worse the applied moves are
+    rolled back and only the better half of them is tried again, until either
+    the likelihood does not get worse or a single move has been tried.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param estimateModel
+      Determine whether the model parameters should be optimized
+
+    @return
+      \b PLL_TRUE if at least one improving NNI move was found and processed,
+      otherwise \b PLL_FALSE (no improving move, fewer than four taxa, or an
+      allocation failure)
+*/
+int pllNniSearch(pllInstance * tr, partitionList *pr, int estimateModel) {
+
+	double curScore = tr->likelihood;
+	int status = PLL_FALSE;
+	int maxBranches = tr->mxtips - 3; /* capacity of the NNI list */
+	nniMove* nniList = NULL;        /* best move of every visited branch */
+	nniMove* impNNIList = NULL;     /* moves with a positive gain */
+	nniMove* nonConfNNIList = NULL; /* improving moves that do not conflict */
+	int i, j, k;
+	int cnt = 0;     /* number of visited internal branches during NNI evaluation */
+	int cnt_nni = 0; /* number of recorded moves with a non-zero gain */
+	int numImp = 0;  /* number of moves copied into impNNIList */
+	int numNonConflictNNI;
+	double delta = 1.0; /* portion of NNI moves to apply */
+	int notImproved;
+
+	if (maxBranches <= 0)
+		return PLL_FALSE;
+
+	nniList = (nniMove*) malloc((size_t) maxBranches * sizeof(nniMove));
+	if (nniList == NULL)
+		return PLL_FALSE;
+
+	/* fill up the NNI list */
+	{
+		nodeptr p = tr->start->back;
+		nodeptr q;
+		for (q = p->next; q != p; q = q->next)
+			evalNNIForSubtree(tr, pr, q->back, nniList, &cnt, &cnt_nni, curScore);
+	}
+	if (cnt_nni == 0)
+		goto cleanup;
+
+	impNNIList = (nniMove*) malloc((size_t) cnt_nni * sizeof(nniMove));
+	if (impNNIList == NULL)
+		goto cleanup;
+
+	/* only the first cnt entries of nniList have been written */
+	for (i = 0; i < cnt; i++) {
+		if (nniList[i].deltaLH > 0.0) {
+			impNNIList[numImp] = nniList[i];
+			numImp++;
+		}
+	}
+	if (numImp == 0)
+		goto cleanup;
+
+	/* sort impNNIList by ascending gain */
+	qsort(impNNIList, (size_t) numImp, sizeof(nniMove), cmp_nni);
+
+	/* creating a list of non-conflicting positive NNI */
+	nonConfNNIList = (nniMove*) calloc((size_t) numImp, sizeof(nniMove));
+	if (nonConfNNIList == NULL)
+		goto cleanup;
+
+	/* the best NNI will always be taken */
+	nonConfNNIList[0] = impNNIList[numImp - 1];
+	numNonConflictNNI = 1;
+
+	/* Filter out conflicting NNI, going from the best to the worst move */
+	for (k = numImp - 2; k >= 0; k--) {
+		int conflict = PLL_FALSE;
+		for (j = 0; j < numNonConflictNNI; j++) {
+			if (impNNIList[k].p->number == nonConfNNIList[j].p->number
+					|| impNNIList[k].p->number
+							== nonConfNNIList[j].p->back->number) {
+				conflict = PLL_TRUE;
+				break;
+			}
+		}
+		if (!conflict) {
+			nonConfNNIList[numNonConflictNNI] = impNNIList[k];
+			numNonConflictNNI++;
+		}
+	}
+
+	/* Applying non-conflicting NNI moves */
+	do {
+		int numNNI2Apply = ceil(numNonConflictNNI * delta);
+		notImproved = PLL_FALSE;
+		for (i = 0; i < numNNI2Apply; i++) {
+			/* Just do the topological change */
+			pllTopologyPerformNNI(tr, nonConfNNIList[i].p, nonConfNNIList[i].nniType);
+			pllUpdatePartials(tr, pr, nonConfNNIList[i].p, PLL_FALSE);
+			pllUpdatePartials(tr, pr, nonConfNNIList[i].p->back, PLL_FALSE);
+			/* Apply the stored (optimized) branch length */
+			for (j = 0; j < pr->numberOfPartitions; j++) {
+				nonConfNNIList[i].p->z[j] = nonConfNNIList[i].z[j];
+				nonConfNNIList[i].p->back->z[j] = nonConfNNIList[i].z[j];
+			}
+		}
+		/* Re-optimize all branches */
+		smoothTree(tr, pr, 2);
+		pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+		if (estimateModel) {
+			modOpt(tr, pr, 0.1);
+		}
+		pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+		if (tr->likelihood < curScore) {
+#ifdef _DEBUG_NNI
+			printf("Tree likelihood gets worse after applying NNI\n");
+			printf("curScore = %30.20f\n", curScore);
+			printf("newScore = %30.20f\n", tr->likelihood);
+			printf("Rolling back the tree\n");
+#endif
+			for (i = 0; i < numNNI2Apply; i++) {
+				pllTopologyPerformNNI(tr, nonConfNNIList[i].p, nonConfNNIList[i].nniType);
+				/* Restore the branch length */
+				for (j = 0; j < pr->numberOfPartitions; j++) {
+					nonConfNNIList[i].p->z[j] = nonConfNNIList[i].z0[j];
+					nonConfNNIList[i].p->back->z[j] = nonConfNNIList[i].z0[j];
+				}
+			}
+			pllEvaluateLikelihood (tr, pr, tr->start, PLL_FALSE, PLL_FALSE);
+#ifdef _DEBUG_NNI
+			printf("Tree likelihood after rolling back = %f \n",
+					tr->likelihood);
+#endif
+			/* retry with half of the moves unless only one was applied */
+			notImproved = (numNNI2Apply > 1);
+			delta = delta * 0.5;
+		}
+	} while (notImproved);
+
+	status = PLL_TRUE;
+
+cleanup:
+	/* single exit point so that no list is leaked on any path */
+	free(nonConfNNIList);
+	free(impNNIList);
+	free(nniList);
+
+	return status;
+}
+
+
+/** @defgroup rearrangementGroup Topological rearrangements
+    
+    This set of functions handles the rearrangement of the tree topology
+*/
+
+
+/** @ingroup rearrangementGroup
+    @brief Create a list for storing topology rearrangements
+ 
+    Allocates space and initializes a structure that will hold information
+    of \a max topological rearrangements. The list starts out empty
+    (\a entries is 0).
+
+    @param max
+      Maximum number of elements that the structure should hold; must not
+      be negative
+
+    @return
+      Pointer to the new list, or \b NULL if \a max is negative or memory
+      could not be allocated
+    
+    @note This should be called for creating a storage space (list) for
+    routines such as ::pllRearrangeSearch which compute the best NNI/SPR/TBR rearrangements.
+    The list is released with ::pllDestroyRearrangeList.
+*/
+pllRearrangeList * pllCreateRearrangeList (int max)
+{
+  pllRearrangeList * bl;
+  size_t slots;
+
+  if (max < 0) return (NULL);
+
+  /* always request at least one slot so that a zero-capacity list still owns
+     a valid (non-NULL) array */
+  slots = (max > 0) ? (size_t) max : 1;
+
+  bl = (pllRearrangeList *) malloc (sizeof (pllRearrangeList));
+  if (bl == NULL) return (NULL);
+
+  bl->max_entries = max;
+  bl->entries     = 0;
+  bl->rearr       = (pllRearrangeInfo *) malloc (slots * sizeof (pllRearrangeInfo));
+  if (bl->rearr == NULL)
+   {
+     /* do not hand out a half-constructed list */
+     free (bl);
+     return (NULL);
+   }
+
+  return bl;
+}
+
+/** @ingroup rearrangementGroup
+    @brief Deallocator for topology rearrangements list
+    
+    Call this to destroy (deallocate) the memory taken by the \a bestList which holds
+    topological rearrangements. On return \a *bestList is set to \b NULL.
+    Passing \b NULL, or a pointer to a \b NULL list, is a no-op.
+
+    @param bestList
+      Address of the pointer to the list to be deallocated
+*/
+void pllDestroyRearrangeList (pllRearrangeList ** bestList)
+{
+  pllRearrangeList * bl;
+
+  /* nothing to release; also makes a repeated destroy harmless */
+  if (bestList == NULL || *bestList == NULL) return;
+
+  bl = *bestList;
+
+  rax_free (bl->rearr);
+  rax_free (bl);
+
+  *bestList = NULL;
+}
+
+
+/** @ingroup rearrangementGroup
+    @brief Insert a rearrangement move into the sorted list of best moves
+
+     The list \a bestList is kept sorted by descending likelihood. The move
+     described by \a rearr is placed in front of the first stored move that
+     has a strictly smaller likelihood. If the list is already full, the last
+     (worst) stored move is dropped to make room. A move that is not better
+     than any stored one is appended only while there is free space.
+
+     @param bestList
+       The list of information about the best rearrangement moves
+
+     @param rearr
+       Info about the current rearrangement move
+
+     @return
+       Returns \b PLL_FALSE if the rearrangement move doesn't make it in the list, otherwise \b PLL_TRUE
+*/
+static int pllStoreRearrangement (pllRearrangeList * bestList, pllRearrangeInfo * rearr)
+ {
+   pllRearrangeInfo * slots = bestList->rearr;
+   int isFull = !(bestList->entries < bestList->max_entries);
+   int pos    = 0;
+
+   /* linear scan for the first stored move that the new one beats */
+   while (pos < bestList->entries && !(rearr->likelihood > slots[pos].likelihood))
+     ++ pos;
+
+   if (pos == bestList->entries)
+    {
+      /* not better than anything stored: append if possible, else reject */
+      if (isFull) return (PLL_FALSE);
+
+      memcpy (&slots[pos], rearr, sizeof (pllRearrangeInfo));
+      ++ bestList->entries;
+      return (PLL_TRUE);
+    }
+
+   /* open a gap at pos by shifting the tail one slot to the right; a full
+      list shifts one element less, which drops its last entry */
+   if (isFull)
+    {
+      memmove (&slots[pos + 1], &slots[pos], (bestList->entries - pos - 1) * sizeof (pllRearrangeInfo));
+    }
+   else
+    {
+      memmove (&slots[pos + 1], &slots[pos], (bestList->entries - pos) * sizeof (pllRearrangeInfo));
+      ++ bestList->entries;
+    }
+
+   memcpy (&slots[pos], rearr, sizeof (pllRearrangeInfo));
+   return (PLL_TRUE);
+ }
+
+/** @ingroup rearrangementGroup
+    @brief Internal function for testing and saving an SPR move
+    
+    Inserts the pruned subtree specified by \a p at node \a q via \a insertBIG,
+    evaluates the likelihood and hands the move to ::pllStoreRearrangement,
+    which records it in the sorted list \a bestList if it is better than a
+    stored move or if there is still free space. Afterwards the connection
+    between \a q and \a q->back is re-established with the branch lengths that
+    were saved on entry, and \a p is detached again.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Root of the subtree that is to be pruned
+
+    @param q
+      Where to place the pruned subtree (between \a q and \a q->back)
+
+    @param bestList
+      Where to store the SPR move
+
+    @note Internal function which is not part of the PLL API and therefore should not be
+    called by the user
+
+    @return
+      \b PLL_FALSE if \a insertBIG reports failure (nothing is evaluated or
+      stored in that case), otherwise \b PLL_TRUE. Whether the move actually
+      made it into \a bestList is not reported.
+*/
+static int
+pllTestInsertBIG (pllInstance * tr, partitionList * pr, nodeptr p, nodeptr q, pllRearrangeList * bestList)
+{
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+  pllRearrangeInfo rearr;
+
+  double  qz[PLL_NUM_BRANCHES], pz[PLL_NUM_BRANCHES];
+  nodeptr  r;
+  //double startLH = tr->endLH;
+  int i;
+
+  /* remember q's neighbour and the branch lengths at q and p so that the
+     state before the insertion can be re-created further down */
+  r = q->back; 
+  for(i = 0; i < numBranches; i++)
+  {
+    qz[i] = q->z[i];
+    pz[i] = p->z[i];
+  }
+
+  if (! insertBIG(tr, pr, p, q))       return PLL_FALSE;
+
+  pllEvaluateLikelihood (tr, pr, p->next->next, PLL_FALSE, PLL_FALSE);
+  
+  /* describe the tested move; zqr is taken from tr->zqr
+     (presumably filled in by insertBIG - TODO confirm) */
+  rearr.rearrangeType  = PLL_REARRANGE_SPR;
+  rearr.likelihood     = tr->likelihood;
+  rearr.SPR.removeNode = p;
+  rearr.SPR.insertNode = q;
+  for (i = 0; i < numBranches; ++ i)
+   {
+     rearr.SPR.zqr[i] = tr->zqr[i];
+   }
+
+  /* the return value (stored / not stored) is deliberately not inspected */
+  pllStoreRearrangement (bestList, &rearr);
+
+/* Disabled: former bookkeeping of the single best move directly in tr
+  if(tr->likelihood > tr->bestOfNode)
+  {
+    pllStoreRearrangement (bestList, rearr)
+    tr->bestOfNode = tr->likelihood;
+    tr->insertNode = q;
+    tr->removeNode = p;   
+    for(i = 0; i < numBranches; i++)
+    {
+      tr->currentZQR[i] = tr->zqr[i];           
+      tr->currentLZR[i] = tr->lzr[i];
+      tr->currentLZQ[i] = tr->lzq[i];
+      tr->currentLZS[i] = tr->lzs[i];      
+    }
+  }
+
+  if(tr->likelihood > tr->endLH)
+  {			  
+    
+    tr->insertNode = q;
+    tr->removeNode = p;   
+    for(i = 0; i < numBranches; i++)
+      tr->currentZQR[i] = tr->zqr[i];      
+    tr->endLH = tr->likelihood;                      
+  }        
+*/
+  /* reset the topology so that it is the same as it was before calling insertBIG */
+  hookup(q, r, qz, numBranches);
+
+  /* detach the two slots of p that insertBIG had connected to q and r */
+  p->next->next->back = p->next->back = (nodeptr) NULL;
+
+  if(tr->thoroughInsertion)
+  {
+    /* re-hook p to its current neighbour with the branch lengths saved on
+       entry (presumably the thorough insertion changed them - TODO confirm) */
+    nodeptr s = p->back;
+    hookup(p, s, pz, numBranches);
+  } 
+
+/* Disabled: likelihood cutoff heuristic (see the TODO in pllComputeSPR)
+  if((tr->doCutoff) && (tr->likelihood < startLH))
+  {
+    tr->lhAVG += (startLH - tr->likelihood);
+    tr->lhDEC++;
+    if((startLH - tr->likelihood) >= tr->lhCutoff)
+      return PLL_FALSE;	    
+    else
+      return PLL_TRUE;
+  }
+  else
+    return PLL_TRUE;
+  */
+  return (PLL_TRUE);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Internal function for recursively traversing a tree and testing a possible subtree insertion
+
+    Walks the tree starting at \a q in the directions \a q->next->back and
+    \a q->next->next->back. At every visited node for which the decremented
+    \a mintrav has dropped to zero or below, the placement of the pruned
+    subtree rooted at \a p is tested with ::pllTestInsertBIG, which stores the
+    resulting SPR move in \a bestList if there is still space or if it is
+    better than a stored move. The walk along a path stops when a test fails,
+    when a tip is reached, or when the decremented \a maxtrav reaches zero.
+
+    @note This function is not part of the API and should not be called by the user.
+*/
+static void pllTraverseUpdate (pllInstance *tr, partitionList *pr, nodeptr p, nodeptr q, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{  
+  /* one step further away from the pruning point */
+  mintrav -= 1;
+
+  /* close enough to the minimum radius: test the insertion at q */
+  if (mintrav <= 0 && ! pllTestInsertBIG(tr, pr, p, q, bestList))
+    return;
+
+  /* tips have no children to descend into */
+  if (isTip(q->number, tr->mxtips))
+    return;
+
+  /* radius budget used up */
+  maxtrav -= 1;
+  if (maxtrav <= 0)
+    return;
+
+  pllTraverseUpdate(tr, pr, p, q->next->back,       mintrav, maxtrav, bestList);
+  pllTraverseUpdate(tr, pr, p, q->next->next->back, mintrav, maxtrav, bestList);
+} 
+
+
+/** @ingroup rearrangementGroup
+    @brief Internal function for computing SPR moves
+
+    Compute a list of at most \a bestList->max_entries SPR moves that can be
+    performed by pruning the subtree rooted at node \a p and testing all
+    possible placements in a radius of at least \a mintrav nodes and at most
+    \a maxtrav nodes from \a p. The same is then done for the subtree rooted
+    at \a p->back, with a minimum radius of at least 2.
+    Note that \a tr->thoroughInsertion affects the behaviour of the function (see note).
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node specifying the root of the pruned subtree, i.e. where to prune.
+
+    @param mintrav
+      Minimum distance from \a p where to try inserting the pruned subtree
+
+    @param maxtrav
+      Maximum distance from \a p where to try inserting the pruned subtree
+
+    @param bestList
+      The list of best topological rearrangements
+
+    @return
+      \b PLL_FALSE if \a maxtrav is smaller than 1 or \a mintrav exceeds
+      \a maxtrav, \b PLL_BADREAR if \a removeNodeBIG fails, otherwise
+      \b PLL_TRUE
+
+    @note This function is not part of the API and should not be called by the user
+    as it is called internally by the function \a pllComputeSPR. 
+    Also, setting \a tr->thoroughInsertion affects this function. For each tested SPR
+    the new branch lengths will also be optimized. This computes better likelihoods
+    but also slows down the method considerably.
+*/
+static int pllTestSPR (pllInstance * tr, partitionList * pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+  nodeptr 
+    p1, p2, q, q1, q2;
+  double 
+    p1z[PLL_NUM_BRANCHES], p2z[PLL_NUM_BRANCHES], q1z[PLL_NUM_BRANCHES], q2z[PLL_NUM_BRANCHES];
+  int
+    mintrav2, i;
+  int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  /* reject an empty or inverted search radius */
+  if (maxtrav < 1 || mintrav > maxtrav) return (PLL_FALSE);
+  q = p->back;
+
+  /* first direction: prune at p and search the subtrees behind p's two neighbours */
+  if (!isTip (p->number, tr->mxtips))
+   {
+     p1 = p->next->back;
+     p2 = p->next->next->back;
+
+     /* skipped when both neighbours are tips: neither would be traversed below */
+     if (!isTip (p1->number, tr->mxtips) || !isTip (p2->number, tr->mxtips))
+      {
+        /* save branch lengths before splitting the tree in two components */
+        for (i = 0; i < numBranches; ++ i)
+         {
+           p1z[i] = p1->z[i];
+           p2z[i] = p2->z[i];
+         }
+
+        /* split the tree in two components */
+        if (! removeNodeBIG (tr, pr, p, numBranches)) return PLL_BADREAR;
+
+        /* recursively traverse and perform SPR on the subtree rooted at p1 */
+        if (!isTip (p1->number, tr->mxtips))
+         {
+           pllTraverseUpdate (tr, pr, p, p1->next->back,       mintrav, maxtrav, bestList);
+           pllTraverseUpdate (tr, pr, p, p1->next->next->back, mintrav, maxtrav, bestList);
+         }
+
+        /* recursively traverse and perform SPR on the subtree rooted at p2 */
+        if (!isTip (p2->number, tr->mxtips))
+         {
+           pllTraverseUpdate (tr, pr, p, p2->next->back,       mintrav, maxtrav, bestList);
+           pllTraverseUpdate (tr, pr, p, p2->next->next->back, mintrav, maxtrav, bestList);
+         }
+
+        /* restore the topology as it was before the split */
+        hookup (p->next,       p1, p1z, numBranches);
+        hookup (p->next->next, p2, p2z, numBranches);
+        pllUpdatePartials (tr, pr, p, PLL_FALSE);
+      }
+   }
+
+  /* second direction: prune at q = p->back; the maxtrav test is always true
+     here because maxtrav < 1 was rejected at the top */
+  if (!isTip (q->number, tr->mxtips) && maxtrav > 0)
+   {
+     q1 = q->next->back;
+     q2 = q->next->next->back;
+
+    /* NOTE(review): proceeds only if q1 or q2 is an inner node that has at
+       least one inner child; this is stricter than the test used for p
+       above and the reason for the difference is not documented. */
+    if (
+        (
+         ! isTip(q1->number, tr->mxtips) && 
+         (! isTip(q1->next->back->number, tr->mxtips) || ! isTip(q1->next->next->back->number, tr->mxtips))
+        )
+        ||
+        (
+         ! isTip(q2->number, tr->mxtips) && 
+         (! isTip(q2->next->back->number, tr->mxtips) || ! isTip(q2->next->next->back->number, tr->mxtips))
+        )
+       )
+     {
+       /* save branch lengths before splitting the tree at q */
+       for (i = 0; i < numBranches; ++ i)
+        {
+          q1z[i] = q1->z[i];
+          q2z[i] = q2->z[i];
+        }
+
+       if (! removeNodeBIG (tr, pr, q, numBranches)) return PLL_BADREAR;
+
+       /* in this direction the minimum radius is raised to at least 2 */
+       mintrav2 = mintrav > 2 ? mintrav : 2;
+
+       if (!isTip (q1->number, tr->mxtips))
+        {
+          pllTraverseUpdate (tr, pr, q, q1->next->back,       mintrav2, maxtrav, bestList);
+          pllTraverseUpdate (tr, pr, q, q1->next->next->back, mintrav2, maxtrav, bestList);
+        }
+
+       if (!isTip (q2->number, tr->mxtips))
+        {
+          pllTraverseUpdate (tr, pr, q, q2->next->back,       mintrav2, maxtrav, bestList);
+          pllTraverseUpdate (tr, pr, q, q2->next->next->back, mintrav2, maxtrav, bestList);
+        }
+
+       /* restore the topology as it was before the split at q */
+       hookup (q->next,       q1, q1z, numBranches);
+       hookup (q->next->next, q2, q2z, numBranches);
+       pllUpdatePartials (tr, pr, q, PLL_FALSE);
+     }
+   }
+  return (PLL_TRUE);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Compute a list of possible SPR moves
+    
+    Resets the per-search bookkeeping in \a tr (\a startLH and \a endLH are
+    set to the current likelihood, \a bestOfNode to \b PLL_UNLIKELY) and then
+    lets ::pllTestSPR try the SPR moves that prune the subtree rooted at \a p
+    and re-insert it in a radius of at least \a mintrav nodes and at most
+    \a maxtrav nodes from \a p. The outcome of ::pllTestSPR is not reported.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node specifying the root of the pruned subtree, i.e. where to prune.
+
+    @param mintrav
+      Minimum distance from \a p where to try inserting the pruned subtree
+
+    @param maxtrav
+      Maximum distance from \a p where to try inserting the pruned subtree
+
+    @param bestList
+      The list in which the tested SPR moves are collected
+
+    @note
+      Setting \a tr->thoroughInsertion affects this function. For each tested SPR
+      the new branch lengths will also be optimized. This computes better likelihoods
+      but also slows down the method considerably.
+*/
+static void 
+pllComputeSPR (pllInstance * tr, partitionList * pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+  /* both bounds start out at the likelihood of the current tree */
+  tr->endLH      = tr->likelihood;
+  tr->startLH    = tr->endLH;
+
+  /* TODO: Add cutoff code */
+
+  tr->bestOfNode = PLL_UNLIKELY;
+
+  (void) pllTestSPR (tr, pr, p, mintrav, maxtrav, bestList);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Return the yielded likelihood of an NNI move, without altering the topology
+
+    This function performs the NNI move of type \a swapType at node \a p, optimizes
+    the branch with endpoints \a p and \a p->back and evaluates the resulting likelihood.
+    It then restores the topology and the original length of that branch,
+    re-evaluates the likelihood so that \a tr->likelihood describes the
+    restored tree again, and returns the likelihood that the NNI move yielded.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Where to perform the NNI move
+
+    @param swapType
+      What type of NNI move to perform
+
+    @return
+      The likelihood yielded from the NNI
+*/
+static double 
+pllTestNNILikelihood (pllInstance * tr, partitionList * pr, nodeptr p, int swapType)
+{
+  double lh;
+  double z0[PLL_NUM_BRANCHES];
+  int i;
+  /* same branch count rule as the SPR helpers, so that z0 is never indexed
+     beyond the branches actually in use */
+  int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  /* store the original branch lengths. They could be passed as a parameter
+     in order to avoid duplicate work because of the two NNI moves */
+  for (i = 0; i < numBranches; ++ i)
+   {
+     z0[i] = p->z[i];
+   }
+
+  /* perform NNI */
+  pllTopologyPerformNNI(tr, p, swapType);
+  /* recompute the likelihood vectors of the two subtrees rooted at p and p->back,
+     optimize the branch lengths and evaluate the likelihood  */
+  pllUpdatePartials (tr, pr, p,       PLL_FALSE);
+  pllUpdatePartials (tr, pr, p->back, PLL_FALSE);
+  update (tr, pr, p);
+  pllEvaluateLikelihood (tr, pr, p, PLL_FALSE, PLL_FALSE);
+  lh = tr->likelihood;
+
+  /* restore topology */
+  pllTopologyPerformNNI(tr, p, swapType);
+
+  /* put the original branch lengths back BEFORE the final evaluation;
+     otherwise tr->likelihood would be computed with the branch length that
+     was optimized for the NNI topology and the next caller reading
+     tr->likelihood as its reference value would be misled */
+  for (i = 0; i < numBranches; ++ i)
+   {
+     p->z[i] = p->back->z[i] = z0[i];
+   }
+
+  pllUpdatePartials (tr, pr, p,       PLL_FALSE);
+  pllUpdatePartials (tr, pr, p->back, PLL_FALSE);
+  pllEvaluateLikelihood (tr, pr, p, PLL_FALSE, PLL_FALSE);
+
+  return lh;
+}
+/** @ingroup rearrangementGroup
+    @brief Compares NNI likelihoods at a node and store in the rearrangement list
+
+    Compares the two possible NNI moves that can be performed at node \a p, and
+    if the likelihood improves from the one of the original topology, then 
+    it picks the one that yields the highest likelihood and tries to insert it in
+    the list of best rearrangement moves
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param bestList
+      Rearrangement moves list
+*/
+static void pllTestNNI (pllInstance * tr, partitionList * pr, nodeptr p, pllRearrangeList * bestList)
+{
+  pllRearrangeInfo move;
+  double lhOrig, lhNext, lhNextNext;
+
+  /* likelihood of the current topology, read before either move is tried */
+  lhOrig     = tr->likelihood;
+  lhNext     = pllTestNNILikelihood (tr, pr, p, PLL_NNI_P_NEXT);
+  lhNextNext = pllTestNNILikelihood (tr, pr, p, PLL_NNI_P_NEXTNEXT);
+
+  /* nothing to record when the current topology beats both moves */
+  if (!(lhOrig > lhNext && lhOrig > lhNextNext))
+   {
+     /* describe the better of the two moves; a tie selects PLL_NNI_P_NEXTNEXT */
+     move.rearrangeType  = PLL_REARRANGE_NNI;
+     move.likelihood     = PLL_MAX (lhNext, lhNextNext);
+     move.NNI.originNode = p;
+     move.NNI.swapType   = (lhNext > lhNextNext) ? PLL_NNI_P_NEXT : PLL_NNI_P_NEXTNEXT;
+     /* try to store it in the best list */
+     pllStoreRearrangement (bestList, &move);
+   }
+}
+
+/** @ingroup rearrangementGroup
+    @brief Recursive traversal of the tree structure for testing NNI moves
+ 
+    Recursively traverses the tree structure and tests all allowed NNI
+    moves in the area specified by \a mintrav and \a maxtrav. For more
+    information and details on the function arguments check ::pllSearchNNI
+*/
+static void 
+pllTraverseNNI (pllInstance *tr, partitionList *pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+  int nextMin;
+  /* nothing to test or to descend into at a tip */
+  if (isTip (p->number, tr->mxtips)) return;
+
+  /* minimum radius reached: test the NNIs at nodes p->next and p->next->next */
+  if (mintrav == 0)
+   {
+     pllTestNNI (tr, pr, p->next, bestList);
+     pllTestNNI (tr, pr, p->next->next, bestList);
+   }
+
+  /* maximum radius exhausted: stop descending */
+  if (maxtrav == 0) return;
+
+  /* NNIs at p->next->back and p->next->next->back would repeat the two above, so they are never tested
+  on their own; just recurse behind them (skipping tips), reducing both radii with mintrav floored at 0 */
+  nextMin = mintrav ? mintrav - 1 : 0;
+  if (!isTip (p->next->back->number, tr->mxtips))       pllTraverseNNI (tr, pr, p->next->back,       nextMin, maxtrav - 1, bestList);
+  if (!isTip (p->next->next->back->number, tr->mxtips)) pllTraverseNNI (tr, pr, p->next->next->back, nextMin, maxtrav - 1, bestList);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Compute a list of possible NNI moves
+    
+    Iteratively tries all possible NNI moves at each node that is at
+    least \a mintrav and at most \a maxtrav nodes far from node \a p.
+    At each NNI move, the likelihood is tested and if it is higher than
+    the likelihood of an element in the sorted (by likelihood) list 
+    \a bestList, or if there is still empty space in \a bestList, it is
+    inserted at the corresponding position.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param p
+      Node specifying the point where the NNI will be performed.
+
+    @param mintrav
+      Minimum distance from \a p where the NNI can be tested 
+
+    @param maxtrav
+      Maximum distance from \a p where to try NNIs
+*/
+static void
+pllSearchNNI (pllInstance * tr, partitionList * pr, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+  /* test the NNI at p itself up front (avoids conflicts); the traversals below never test their start node */
+  if (!mintrav)
+   {
+     pllTestNNI (tr, pr, p, bestList);
+   }
+
+  /* search on the p side, then on the p->back side with the maximum radius reduced by one */
+  pllTraverseNNI (tr, pr, p, mintrav, maxtrav, bestList);
+  if (maxtrav != 0) pllTraverseNNI (tr, pr, p->back, mintrav, maxtrav - 1, bestList);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Create rollback information for an SPR move
+    
+    Creates a structure of type ::pllRollbackInfo and fills it with rollback
+    information about the SPR move described in \a rearr. The rollback info
+    is stored in the PLL instance in a LIFO manner.
+
+    @param tr
+      PLL instance
+
+    @param rearr
+      Description of the SPR move
+
+    @param numBranches
+      Number of branch lengths saved per branch (number of partitions with per-gene branch lengths, otherwise 1)
+*/
+static void 
+pllCreateSprInfoRollback (pllInstance * tr, pllRearrangeInfo * rearr, int numBranches)
+{
+  pllRollbackInfo * sprRb;
+  double * zStore;
+  nodeptr p = rearr->SPR.removeNode, q = rearr->SPR.insertNode;
+  int i;
+
+  /* one allocation holds the record followed by four vectors of numBranches branch lengths */
+  sprRb = (pllRollbackInfo *) rax_malloc (sizeof (pllRollbackInfo) + 4 * numBranches * sizeof (double));
+  if (!sprRb) return;  /* out of memory: nothing is recorded, as when pllStackPush fails */
+  zStore = (double *) ((char *)sprRb + sizeof (pllRollbackInfo));
+  sprRb->SPR.zp   = zStore;
+  sprRb->SPR.zpn  = zStore + numBranches;
+  sprRb->SPR.zpnn = zStore + 2 * numBranches;
+  sprRb->SPR.zqr  = zStore + 3 * numBranches;
+
+  for (i = 0; i < numBranches; ++ i)
+   {
+     sprRb->SPR.zp[i]   = p->z[i];              /* branch at p */
+     sprRb->SPR.zpn[i]  = p->next->z[i];        /* branch at p->next */
+     sprRb->SPR.zpnn[i] = p->next->next->z[i];  /* branch at p->next->next */
+     sprRb->SPR.zqr[i]  = q->z[i];              /* branch at the insertion node q */
+   }
+
+  sprRb->SPR.pn  = p->next->back;
+  sprRb->SPR.pnn = p->next->next->back;
+  sprRb->SPR.r   = q->back;
+  sprRb->SPR.q   = q;
+  sprRb->SPR.p   = p;
+  sprRb->rearrangeType = PLL_REARRANGE_SPR;
+  /* the history stack takes the record; if the push fails (returns 0), free it instead of leaking it */
+  if (!pllStackPush (&(tr->rearrangeHistory), (void *) sprRb)) rax_free (sprRb);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Create rollback information for an NNI move
+
+    Creates a structure of type ::pllRollbackInfo and fills it with rollback
+    information about the NNI move described in \a rearr. The rollback info
+    is stored in the PLL instance in a LIFO manner
+
+    @param tr
+      PLL instance
+
+    @param rearr
+      Description of the NNI move
+*/
+static void
+pllCreateNniInfoRollback (pllInstance * tr, pllRearrangeInfo * rearr)
+{
+  /*TODO: add the branches ? */
+  pllRollbackInfo * ri;
+
+  ri = (pllRollbackInfo *) rax_malloc (sizeof (pllRollbackInfo));
+  if (!ri) return;  /* out of memory: nothing is recorded, as when pllStackPush fails */
+
+  ri->rearrangeType = PLL_REARRANGE_NNI;
+  ri->NNI.origin   = rearr->NNI.originNode;
+  ri->NNI.swapType = rearr->NNI.swapType;
+
+  /* the history stack takes the record; if the push fails (returns 0), free it instead of leaking it */
+  if (!pllStackPush (&(tr->rearrangeHistory), (void *) ri)) rax_free (ri);
+}
+
+
+/** @ingroup rearrangementGroup
+    @brief Generic function for creating rollback information
+
+    Creates a structure of type ::pllRollbackInfo and fills it with rollback
+    information about the move described in \a rearr. The rollback info
+    is stored in the PLL instance in a LIFO manner
+
+    @param tr
+      PLL instance
+
+    @param rearr
+      Description of the rearrangement move (NNI or SPR)
+
+    @param numBranches
+      Number of branch lengths kept per branch (only used for SPR moves)
+*/
+static void
+pllCreateRollbackInfo (pllInstance * tr, pllRearrangeInfo * rearr, int numBranches)
+{
+  /* dispatch on the move type; any other type is ignored and nothing is recorded */
+  if (rearr->rearrangeType == PLL_REARRANGE_NNI)
+   {
+     /* the NNI record is built without numBranches */
+     pllCreateNniInfoRollback (tr, rearr);
+   }
+  else if (rearr->rearrangeType == PLL_REARRANGE_SPR)
+   {
+     /* the SPR record stores numBranches lengths for each saved branch */
+     pllCreateSprInfoRollback (tr, rearr, numBranches);
+   }
+
+}
+
+
+/** @ingroup rearrangementGroup
+    @brief Rollback an SPR move
+
+    Perform a rollback (undo) on the last SPR move.
+    
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param ri
+      Rollback information
+*/
+static void
+pllRollbackSPR (partitionList * pr, pllRollbackInfo * ri)
+{
+  nodeptr p = ri->SPR.p;
+  int     numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  /* reconnect the saved neighbours with the saved branch lengths, then free the record */
+  hookup (p->next,       ri->SPR.pn,  ri->SPR.zpn,  numBranches);
+  hookup (p->next->next, ri->SPR.pnn, ri->SPR.zpnn, numBranches);
+  hookup (p,             p->back,     ri->SPR.zp,   numBranches);
+  hookup (ri->SPR.q,     ri->SPR.r,   ri->SPR.zqr,  numBranches);
+
+  rax_free (ri);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Rollback an NNI move
+
+    Perform a rollback (undo) on the last NNI move.
+    
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param ri
+      Rollback information
+*/
+static void
+pllRollbackNNI (pllInstance * tr, partitionList * pr, pllRollbackInfo * ri)
+{
+  nodeptr origin = ri->NNI.origin;
+  /* repeat the recorded swap, then refresh both ends of the branch, re-optimize it and re-evaluate */
+  pllTopologyPerformNNI(tr, origin, ri->NNI.swapType);
+  pllUpdatePartials (tr, pr, origin,       PLL_FALSE);
+  pllUpdatePartials (tr, pr, origin->back, PLL_FALSE);
+  update (tr, pr, origin);
+  pllEvaluateLikelihood (tr, pr, origin, PLL_FALSE, PLL_FALSE);
+
+  rax_free (ri);
+}
+
+/** @ingroup rearrangementGroup
+    @brief Rollback the last committed rearrangement move
+    
+    Perform a rollback (undo) on the last committed rearrangement move.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @return
+      Returns \b PLL_TRUE if the rollback was successful, otherwise \b PLL_FALSE
+      (if no rollback was done)
+*/
+int 
+pllRearrangeRollback (pllInstance * tr, partitionList * pr)
+{
+  pllRollbackInfo * record;
+
+  /* take the most recent record off the history; an empty history means nothing to undo */
+  record = (pllRollbackInfo *) pllStackPop (&(tr->rearrangeHistory));
+  if (!record) return (PLL_FALSE);
+
+  /* the type-specific rollback routines free the record themselves */
+  if (record->rearrangeType == PLL_REARRANGE_NNI)
+   {
+     pllRollbackNNI (tr, pr, record);
+     return (PLL_TRUE);
+   }
+  if (record->rearrangeType == PLL_REARRANGE_SPR)
+   {
+     pllRollbackSPR (pr, record);
+     return (PLL_TRUE);
+   }
+  /* unknown move type: drop the record and report that nothing was rolled back */
+  rax_free (record);
+  return (PLL_FALSE);
+}
+
+
+/** @ingroup rearrangementGroup
+    @brief Commit a rearrangement move
+
+    Applies the rearrangement move specified in \a rearr to the tree topology in \a tr. 
+    In case of SPR moves, if
+    \a tr->thoroughInsertion is set to \b PLL_TRUE, the new branch lengths are also optimized. 
+    The function stores rollback information in pllInstance::rearrangeHistory if \a saveRollbackInfo
+    is set to \b PLL_TRUE. This way, the rearrangement move can be rolled back (undone) by calling
+    ::pllRearrangeRollback
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param rearr
+      An element of a \a pllRearrangeInfo structure that contains information about the rearrangement move
+
+    @param saveRollbackInfo
+      If set to \b PLL_TRUE, rollback info will be kept for undoing the rearrangement move
+*/
+void
+pllRearrangeCommit (pllInstance * tr, partitionList * pr, pllRearrangeInfo * rearr, int saveRollbackInfo)
+{
+  int numBranches = pr->perGeneBranchLengths ? pr->numberOfPartitions : 1;
+
+  /* record the undo information before the tree is touched */
+  if (saveRollbackInfo)
+    pllCreateRollbackInfo (tr, rearr, numBranches);
+
+  if (rearr->rearrangeType == PLL_REARRANGE_NNI)
+   {
+     nodeptr origin = rearr->NNI.originNode;
+
+     /* swap, refresh both ends of the branch, optimize its length, re-evaluate */
+     pllTopologyPerformNNI(tr, origin, rearr->NNI.swapType);
+     pllUpdatePartials (tr, pr, origin, PLL_FALSE);
+     pllUpdatePartials (tr, pr, origin->back, PLL_FALSE);
+     update (tr, pr, origin);
+     pllEvaluateLikelihood (tr, pr, origin, PLL_FALSE, PLL_FALSE);
+   }
+  else if (rearr->rearrangeType == PLL_REARRANGE_SPR)
+   {
+     /* take removeNode out of the tree, then insert it at insertNode */
+     removeNodeBIG (tr, pr, rearr->SPR.removeNode, numBranches);
+     insertBIG     (tr, pr, rearr->SPR.removeNode, rearr->SPR.insertNode);
+   }
+}
+
+
+/******** new rearrangement functions ****************/
+
+/* change this to return the number of new elements in the list */
+/** @ingroup rearrangementGroup
+    @brief Search for rearrangement topologies
+    
+    Search for possible rearrangement moves of type \a rearrangeType in the
+    annular area defined by the minimal resp. maximal radii \a mintrav resp.
+    \a maxtrav. If the resulting likelihood is better than the current, try
+    to insert the move specification in \a bestList, which is a sorted list
+    that holds the rearrange info of the best moves sorted by likelihood
+    (descending order).
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param rearrangeType
+      Type of rearrangement. Can be \b PLL_REARRANGE_SPR or \b PLL_REARRANGE_NNI
+
+    @param p
+      Point of origin, i.e. where to start searching from
+
+    @param mintrav
+      The minimal radius of the annulus
+
+    @param maxtrav
+      The maximal radius of the annulus
+
+    @param bestList
+      List that holds the details of the best rearrangement moves found
+
+    @note
+      If \a bestList is not empty, the existing entries will not be altered unless
+      better rearrangement moves (that means yielding better likelihood) are found
+      and the list is full, in which case the entries with the worst likelihood will be
+      thrown away.
+*/
+void
+pllRearrangeSearch (pllInstance * tr, partitionList * pr, int rearrangeType, nodeptr p, int mintrav, int maxtrav, pllRearrangeList * bestList)
+{
+  /* SPR: candidate moves around p are collected by pllComputeSPR */
+  if (rearrangeType == PLL_REARRANGE_SPR)
+   {
+     pllComputeSPR (tr, pr, p, mintrav, maxtrav, bestList);
+     return;
+   }
+
+  /* NNI: candidate moves around p are collected by pllSearchNNI */
+  if (rearrangeType == PLL_REARRANGE_NNI)
+   {
+     pllSearchNNI (tr, pr, p, mintrav, maxtrav, bestList);
+     return;
+   }
+
+  /* PLL_REARRANGE_TBR and any other value: no search is carried out, bestList stays untouched */
+}
+
+
+static int
+determineRearrangementSetting(pllInstance *tr, partitionList *pr,
+    bestlist *bestT, bestlist *bt)
+{
+  int i, mintrav, maxtrav, bestTrav, impr, index, MaxFast, *perm = (int*) NULL;
+  double startLH;
+  pllBoolean cutoff;
+  /* Probe growing values of maxtrav (steps of 5) from the tree in bestT; return the last one that improved the likelihood */
+  MaxFast = 26; /* the probing loop stops once maxtrav reaches this value */
+
+  startLH = tr->likelihood;
+  /* remember the caller's cutoff flag and switch the cutoff off while probing */
+  cutoff = tr->doCutoff;
+  tr->doCutoff = PLL_FALSE;
+
+  mintrav = 1;
+  maxtrav = 5; /* overwritten with the same value two lines below */
+
+  bestTrav = maxtrav = 5;
+
+  impr = 1;
+
+  resetBestTree(bt);
+  /* optional permutation of the node indices 1..n used by the loop over all nodes */
+  if (tr->permuteTreeoptimize)
+    {
+      int n = tr->mxtips + tr->mxtips - 2;
+      perm = (int *) rax_malloc(sizeof(int) * (n + 1)); /* n + 1 entries: the loop below reads perm[1..n] */
+      makePermutation(perm, n, tr);
+    }
+  /* one pass per candidate maxtrav, as long as the previous pass improved */
+  while (impr && maxtrav < MaxFast)
+    {
+      recallBestTree(bestT, 1, tr, pr); /* every pass starts from the tree stored in bestT */
+      nodeRectifier(tr);
+
+      if (maxtrav > tr->ntips - 3) /* clamp the radius to ntips - 3 */
+        maxtrav = tr->ntips - 3;
+
+      tr->startLH = tr->endLH = tr->likelihood;
+      /* try rearrangements at every node index 1 .. 2 * mxtips - 2 */
+      for (i = 1; i <= tr->mxtips + tr->mxtips - 2; i++)
+        {
+
+          if (tr->permuteTreeoptimize)
+            index = perm[i];
+          else
+            index = i;
+
+          tr->bestOfNode = PLL_UNLIKELY;
+          if (rearrangeBIG(tr, pr, tr->nodep[index], mintrav, maxtrav))
+            {
+              if (tr->endLH > tr->startLH) /* endLH rose above startLH: restore the tree and rebase both bounds */
+                {
+                  restoreTreeFast(tr, pr);
+                  tr->startLH = tr->endLH = tr->likelihood;
+                }
+            }
+        }
+
+      pllOptimizeBranchLengths(tr, pr, 8);
+      saveBestTree(bt, tr,
+          pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+      /* keep probing only while the likelihood keeps improving */
+      if (tr->likelihood > startLH)
+        {
+          startLH = tr->likelihood;
+          bestTrav = maxtrav;
+          impr = 1;
+        }
+      else
+        {
+          impr = 0;
+        }
+      maxtrav += 5;
+      /* NOTE(review): doCutoff was set to PLL_FALSE above, so this branch runs only if a callee re-enables it - confirm */
+      if (tr->doCutoff)
+        {
+          tr->lhCutoff = (tr->lhAVG) / ((double) (tr->lhDEC));
+
+          tr->itCount = tr->itCount + 1;
+          tr->lhAVG = 0;
+          tr->lhDEC = 0;
+        }
+    }
+
+  recallBestTree(bt, 1, tr, pr); /* continue from the first tree stored in bt */
+  tr->doCutoff = cutoff;         /* give the caller its cutoff flag back */
+
+  if (tr->permuteTreeoptimize)
+    rax_free(perm);
+
+  return bestTrav;
+}
+
+
+static void hash_dealloc_bipentry (void * entry)
+{
+  pllBipartitionEntry * bip = (pllBipartitionEntry *)entry;
+  /* free whichever of the three vectors are set; the entry struct itself is not freed here */
+  if (bip->bitVector)     { rax_free(bip->bitVector); }
+  if (bip->treeVector)    { rax_free(bip->treeVector); }
+  if (bip->supportVector) { rax_free(bip->supportVector); }
+
+}
+
+/** @ingroup rearrangementGroup
+    @brief RAxML algorithm for ML search
+
+    RAxML algorithm for searching the Maximum Likelihood tree and model.
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @param estimateModel
+      If true, model parameters are optimized in a ML framework.
+
+    @note
+      For datasets with a large number of taxa, setting tr->searchConvergenceCriterion to
+    PLL_TRUE can reduce the execution time by up to 50%, as the search stops on topology convergence.
+*/
+int
+pllRaxmlSearchAlgorithm(pllInstance * tr, partitionList * pr,
+    pllBoolean estimateModel)
+{
+  pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+  pllOptimizeBranchLengths(tr, pr, 32);
+
+  unsigned int vLength = 0;
+  int i, impr, bestTrav, rearrangementsMax = 0, rearrangementsMin = 0,
+      thoroughIterations = 0, fastIterations = 0;
+
+  double lh, previousLh, difference, epsilon;
+  bestlist *bestT, *bt;
+  infoList iList;
+  pllOptimizeBranchLengths(tr, pr, 32); /* NOTE(review): same call as a few lines above with only declarations in between - looks redundant, confirm */
+
+  pllHashTable *h = NULL;
+  //hashtable *h = NULL;
+  unsigned int **bitVectors = (unsigned int**) NULL;
+
+  /* Security check... These variables might have not been initialized! */
+  if (tr->stepwidth == 0) tr->stepwidth = 5;
+  if (tr->max_rearrange == 0) tr->max_rearrange = 21;
+  /* bit vectors and hash table are only needed for the topology convergence check */
+  if (tr->searchConvergenceCriterion)
+    {
+      bitVectors = initBitVector(tr->mxtips, &vLength);
+      //h = initHashTable(tr->mxtips * 4);
+      h = pllHashInit (tr->mxtips * 4);
+    }
+  /* bestT holds one tree (the best so far), bt holds up to 20 candidate trees */
+  bestT = (bestlist *) rax_malloc(sizeof(bestlist));
+  bestT->ninit = 0;
+  initBestTree(bestT, 1, tr->mxtips);
+
+  bt = (bestlist *) rax_malloc(sizeof(bestlist));
+  bt->ninit = 0;
+  initBestTree(bt, 20, tr->mxtips);
+
+  initInfoList(&iList, 50);
+
+  difference = 10.0;
+  epsilon = tr->likelihoodEpsilon;
+
+  tr->thoroughInsertion = 0; /* the first (fast) phase runs with thorough insertion switched off */
+
+  if (estimateModel)
+    {
+      pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+      pllOptimizeModelParameters(tr, pr, 10.0);
+    }
+  else
+    pllOptimizeBranchLengths(tr, pr, 64);
+
+  saveBestTree(bestT, tr,
+      pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+  /* rearrangement radius for the fast phase: probed here unless the caller preset tr->initial */
+  if (!tr->initialSet)
+    bestTrav = tr->bestTrav = determineRearrangementSetting(tr, pr, bestT, bt);
+  else
+    bestTrav = tr->bestTrav = tr->initial;
+
+  if (estimateModel)
+    {
+      pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+      pllOptimizeModelParameters(tr, pr, 5.0);
+    }
+  else
+    pllOptimizeBranchLengths(tr, pr, 32);
+
+  saveBestTree(bestT, tr,
+      pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+  impr = 1;
+  if (tr->doCutoff)
+    tr->itCount = 0;
+  /* fast phase: loop while one of the candidate trees in bt improves on bestT */
+  while (impr)
+    {
+      recallBestTree(bestT, 1, tr, pr);
+
+      if (tr->searchConvergenceCriterion)
+        {
+          int bCounter = 0;
+
+          if (fastIterations > 1)
+            cleanupHashTable(h, (fastIterations % 2));
+
+          bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back, tr->mxtips,
+              vLength, h, fastIterations % 2, PLL_BIPARTITIONS_RF,
+              (branchInfo *) NULL, &bCounter, 1, PLL_FALSE, PLL_FALSE, 0);
+
+          assert(bCounter == tr->mxtips - 3);
+
+          if (fastIterations > 0)
+            {
+              double rrf = convergenceCriterion(h, tr->mxtips);
+
+              if (rrf <= 0.01) /* 1% cutoff */
+                {
+                  cleanupHashTable(h, 0);
+                  cleanupHashTable(h, 1);
+                  goto cleanup_fast; /* converged: leave the fast phase early */
+                }
+            }
+        }
+
+      fastIterations++;
+
+      pllOptimizeBranchLengths(tr, pr, 32);
+
+      saveBestTree(bestT, tr,
+          pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+
+      lh = previousLh = tr->likelihood;
+
+      treeOptimizeRapid(tr, pr, 1, bestTrav, bt, &iList);
+
+      impr = 0;
+      /* revisit every tree in bt; one that beats lh and differs from previousLh by more than epsilon replaces bestT */
+      for (i = 1; i <= bt->nvalid; i++)
+        {
+          recallBestTree(bt, i, tr, pr);
+
+          pllOptimizeBranchLengths(tr, pr, 8);
+
+          difference = (
+              (tr->likelihood > previousLh) ?
+                  tr->likelihood - previousLh : previousLh - tr->likelihood);
+          if (tr->likelihood > lh && difference > epsilon)
+            {
+              impr = 1;
+              lh = tr->likelihood;
+              saveBestTree(bestT, tr,
+                  pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+            }
+        }
+    }
+
+  if (tr->searchConvergenceCriterion)
+    {
+      cleanupHashTable(h, 0);
+      cleanupHashTable(h, 1);
+    }
+
+  cleanup_fast: /* reached after the fast loop ends or directly from its convergence check */
+  /* thorough phase: same scheme with thorough insertion switched on */
+  tr->thoroughInsertion = 1;
+  impr = 1;
+
+  recallBestTree(bestT, 1, tr, pr);
+  if (estimateModel)
+    {
+      pllEvaluateLikelihood(tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+      pllOptimizeModelParameters(tr, pr, 1.0);
+    }
+  else
+    pllOptimizeBranchLengths(tr, pr, 32);
+  /* the loop is left only through the two goto cleanup statements below */
+  while (1)
+    {
+      recallBestTree(bestT, 1, tr, pr);
+      if (impr)
+        {
+          rearrangementsMin = 1; /* after an improvement the radius window restarts at [1, stepwidth] */
+          rearrangementsMax = tr->stepwidth;
+
+          if (tr->searchConvergenceCriterion)
+            {
+              int bCounter = 0;
+
+              if (thoroughIterations > 1)
+                cleanupHashTable(h, (thoroughIterations % 2));
+
+              bitVectorInitravSpecial(bitVectors, tr->nodep[1]->back,
+                  tr->mxtips, vLength, h, thoroughIterations % 2,
+                  PLL_BIPARTITIONS_RF, (branchInfo *) NULL, &bCounter, 1,
+                  PLL_FALSE, PLL_FALSE, 0);
+
+              assert(bCounter == tr->mxtips - 3);
+
+              if (thoroughIterations > 0)
+                {
+                  double rrf = convergenceCriterion(h, tr->mxtips);
+
+                  if (rrf <= 0.01) /* 1% cutoff */
+                    {
+                      goto cleanup;
+                    }
+                }
+            }
+
+          thoroughIterations++;
+        }
+      else
+        {
+          rearrangementsMax += tr->stepwidth; /* no improvement: shift the radius window up by one step */
+          rearrangementsMin += tr->stepwidth;
+          if (rearrangementsMax > tr->max_rearrange)
+            goto cleanup; /* window beyond max_rearrange: the search is finished */
+        }
+      pllOptimizeBranchLengths(tr, pr, 32);
+
+      previousLh = lh = tr->likelihood;
+      saveBestTree(bestT, tr,
+          pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+
+      treeOptimizeRapid(tr, pr, rearrangementsMin, rearrangementsMax, bt,
+          &iList);
+
+      impr = 0;
+      /* same candidate check as in the fast phase */
+      for (i = 1; i <= bt->nvalid; i++)
+        {
+          recallBestTree(bt, i, tr, pr);
+
+          pllOptimizeBranchLengths(tr, pr, 8);
+
+          difference = (
+              (tr->likelihood > previousLh) ?
+                  tr->likelihood - previousLh : previousLh - tr->likelihood);
+          if (tr->likelihood > lh && difference > epsilon)
+            {
+              impr = 1;
+              lh = tr->likelihood;
+              saveBestTree(bestT, tr,
+                  pr->perGeneBranchLengths ? pr->numberOfPartitions : 1);
+            }
+        }
+
+    }
+
+  cleanup: /* release everything allocated above, then do a final optimization pass */
+  if (tr->searchConvergenceCriterion)
+    {
+      freeBitVectors(bitVectors, 2 * tr->mxtips);
+      rax_free(bitVectors);
+      //freeHashTable(h);
+      //rax_free(h);
+      pllHashDestroy(&h, hash_dealloc_bipentry);
+    }
+
+  freeBestTree(bestT);
+  rax_free(bestT);
+  freeBestTree(bt);
+  rax_free(bt);
+
+  freeInfoList(&iList);
+
+  if (estimateModel) {
+      pllOptimizeModelParameters(tr, pr, epsilon);
+  }
+  pllOptimizeBranchLengths(tr, pr, 64);
+
+  return 0; /* the only return statement: the result is always 0 */
+}
+
diff --git a/pllrepo/src/semaphore.h b/pllrepo/src/semaphore.h
new file mode 100644
index 0000000..c6e9407
--- /dev/null
+++ b/pllrepo/src/semaphore.h
@@ -0,0 +1,169 @@
+/*
+ * Module: semaphore.h
+ *
+ * Purpose:
+ *	Semaphores aren't actually part of the PThreads standard.
+ *	They are defined by the POSIX Standard:
+ *
+ *		POSIX 1003.1b-1993	(POSIX.1b)
+ *
+ * --------------------------------------------------------------------------
+ *
+ *      Pthreads-win32 - POSIX Threads Library for Win32
+ *      Copyright(C) 1998 John E. Bossom
+ *      Copyright(C) 1999,2005 Pthreads-win32 contributors
+ * 
+ *      Contact Email: rpj at callisto.canberra.edu.au
+ * 
+ *      The current list of contributors is contained
+ *      in the file CONTRIBUTORS included with the source
+ *      code distribution. The list can also be seen at the
+ *      following World Wide Web location:
+ *      http://sources.redhat.com/pthreads-win32/contributors.html
+ * 
+ *      This library is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU Lesser General Public
+ *      License as published by the Free Software Foundation; either
+ *      version 2 of the License, or (at your option) any later version.
+ * 
+ *      This library is distributed in the hope that it will be useful,
+ *      but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *      Lesser General Public License for more details.
+ * 
+ *      You should have received a copy of the GNU Lesser General Public
+ *      License along with this library in the file COPYING.LIB;
+ *      if not, write to the Free Software Foundation, Inc.,
+ *      59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+#if !defined( SEMAPHORE_H )
+#define SEMAPHORE_H
+
+#undef PTW32_SEMAPHORE_LEVEL
+
+#if defined(_POSIX_SOURCE)
+#define PTW32_SEMAPHORE_LEVEL 0
+/* Early POSIX */
+#endif
+
+#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309
+#undef PTW32_SEMAPHORE_LEVEL
+#define PTW32_SEMAPHORE_LEVEL 1
+/* Include 1b, 1c and 1d */
+#endif
+
+#if defined(INCLUDE_NP)
+#undef PTW32_SEMAPHORE_LEVEL
+#define PTW32_SEMAPHORE_LEVEL 2
+/* Include Non-Portable extensions */
+#endif
+
+#define PTW32_SEMAPHORE_LEVEL_MAX 3
+
+#if !defined(PTW32_SEMAPHORE_LEVEL)
+#define PTW32_SEMAPHORE_LEVEL PTW32_SEMAPHORE_LEVEL_MAX
+/* Include everything */
+#endif
+
+#if defined(__GNUC__) && ! defined (__declspec)
+# error Please upgrade your GNU compiler to one that supports __declspec.
+#endif
+
+/*
+ * When building the library, you should define PTW32_BUILD so that
+ * the variables/functions are exported correctly. When using the library,
+ * do NOT define PTW32_BUILD, and then the variables/functions will
+ * be imported correctly.
+ */
+#if !defined(PTW32_STATIC_LIB)
+#  if defined(PTW32_BUILD)
+#    define PTW32_DLLPORT __declspec (dllexport)
+#  else
+#    define PTW32_DLLPORT __declspec (dllimport)
+#  endif
+#else
+#  define PTW32_DLLPORT
+#endif
+
+/*
+ * This is a duplicate of what is in the autoconf config.h,
+ * which is only used when building the pthread-win32 libraries.
+ */
+
+#if !defined(PTW32_CONFIG_H)
+#  if defined(WINCE)
+#    define NEED_ERRNO
+#    define NEED_SEM
+#  endif
+#  if defined(__MINGW64__)
+#    define HAVE_STRUCT_TIMESPEC
+#    define HAVE_MODE_T
+#  elif defined(_UWIN) || defined(__MINGW32__)
+#    define HAVE_MODE_T
+#  endif
+#endif
+
+/*
+ * errno definitions are only pulled in when the full API level is selected.
+ */
+
+#if PTW32_SEMAPHORE_LEVEL >= PTW32_SEMAPHORE_LEVEL_MAX
+#if defined(NEED_ERRNO)
+#include "need_errno.h"
+#else
+#include <errno.h>
+#endif
+#endif /* PTW32_SEMAPHORE_LEVEL >= PTW32_SEMAPHORE_LEVEL_MAX */
+
+#define _POSIX_SEMAPHORES
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif				/* __cplusplus */
+
+#if !defined(HAVE_MODE_T)
+typedef unsigned int mode_t;
+#endif
+
+
+typedef struct sem_t_ * sem_t;
+
+PTW32_DLLPORT int __cdecl sem_init (sem_t * sem,
+			    int pshared,
+			    unsigned int value);
+
+PTW32_DLLPORT int __cdecl sem_destroy (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_trywait (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_wait (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_timedwait (sem_t * sem,
+				 const struct timespec * abstime);
+
+PTW32_DLLPORT int __cdecl sem_post (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_post_multiple (sem_t * sem,
+				     int count);
+
+PTW32_DLLPORT int __cdecl sem_open (const char * name,
+			    int oflag,
+			    mode_t mode,
+			    unsigned int value);
+
+PTW32_DLLPORT int __cdecl sem_close (sem_t * sem);
+
+PTW32_DLLPORT int __cdecl sem_unlink (const char * name);
+
+PTW32_DLLPORT int __cdecl sem_getvalue (sem_t * sem,
+				int * sval);
+
+#if defined(__cplusplus)
+}				/* End of extern "C" */
+#endif				/* __cplusplus */
+
+#undef PTW32_SEMAPHORE_LEVEL
+#undef PTW32_SEMAPHORE_LEVEL_MAX
+
+#endif				/* !SEMAPHORE_H */
diff --git a/pllrepo/src/ssort.c b/pllrepo/src/ssort.c
new file mode 100644
index 0000000..b08cbe7
--- /dev/null
+++ b/pllrepo/src/ssort.c
@@ -0,0 +1,121 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file ssort.c
+ * Detailed description to appear soon.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "mem_alloc.h"
+
+/*  string sorting implementation from:
+ *  Bentley J. L., Sedgewick R.: Fast Algorithms for Sorting and Searching 
+ *  Strings. In Proceedings of ACM-SIAM Symposium on Discrete Algorithms 
+ *  (SODA) 1997.
+ */
+
+static void 
+vecswap (int i, int j, int n, char ** x, int * oi)
+{
+  int k;
+  /* exchange n consecutive entries starting at i with those starting at j, in x and oi alike */
+  for (k = 0; k < n; ++ k, ++ i, ++ j)
+   {
+     PLL_SWAP_PTR (x[i], x[j]);
+     PLL_SWAP_INT (oi[i], oi[j]);
+   }
+}
+
+static void 
+ssort1 (char ** x, int n, int depth, int * oi)
+{
+  int           a, b, c, d, r, v;
+  /* order x[0..n-1] by the character at index depth and beyond; oi is permuted in step with x */
+  if (n <= 1) return;
+
+  a = rand() % n;
+  /* move a randomly chosen pivot string to the front */
+  PLL_SWAP_PTR (x[0], x[a]);
+  PLL_SWAP_INT (oi[0], oi[a]);
+
+  v = x[0][depth]; /* pivot character */
+
+  a = b = 1;
+  c = d = n - 1;
+  /* b scans upwards and c downwards; strings whose character equals v are parked at both ends (before a, after d) */
+  for (;;)
+   {
+     while (b <= c && (r = x[b][depth] - v) <= 0)
+      {
+        if (r == 0)
+         {
+           PLL_SWAP_PTR (x[a], x[b]);
+           PLL_SWAP_INT (oi[a], oi[b]);
+           ++ a;
+         }
+        ++ b;
+      }
+     while (b <= c && (r = x[c][depth] - v) >= 0)
+      {
+        if (r == 0)
+         {
+           PLL_SWAP_PTR (x[c], x[d]);
+           PLL_SWAP_INT (oi[c], oi[d]);
+           -- d;
+         }
+        -- c;
+      }
+     if (b > c) break;
+     PLL_SWAP_PTR (x[b], x[c]); /* x[b] is larger and x[c] smaller than the pivot: exchange them */
+     PLL_SWAP_INT (oi[b], oi[c]);
+     ++ b; -- c;
+   }
+  r = PLL_MIN (a,     b - a);      vecswap (0, b - r, r, x, oi); /* bring the parked equal strings from the left end to the middle */
+  r = PLL_MIN (d - c, n - d - 1);  vecswap (b, n - r, r, x, oi); /* and those from the right end */
+  r = b - a; ssort1 (x, r, depth, oi); /* strings whose character is smaller than v */
+  if (x[r][depth] != 0) /* equal group: go one character deeper unless the strings end here */
+   {
+     ssort1 (x + r, a + n - d - 1, depth + 1, oi + r);
+   }
+  r = d - c; ssort1 (x + n - r, r, depth, oi + n - r); /* strings whose character is larger than v */
+}
+
+int * 
+pllssort1main (char ** x, int n)
+{
+  int * oi;
+  int i;
+
+  /* sort the n strings of x in place; the returned array maps each sorted position to its original index */
+  oi = (int *) rax_malloc (n * sizeof (int));
+  if (!oi) return (NULL);  /* allocation failed: x is left unsorted and NULL is returned */
+
+  for (i = 0; i < n; ++ i) oi[i] = i;
+  ssort1 (x, n, 0, oi);
+
+  return (oi);
+}
+
diff --git a/pllrepo/src/stack.c b/pllrepo/src/stack.c
new file mode 100644
index 0000000..062cf2e
--- /dev/null
+++ b/pllrepo/src/stack.c
@@ -0,0 +1,85 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file stack.c
+ * @brief Generic stack implementation
+ *
+ * Detailed description to appear soon.
+ */
+#include <stdio.h>
+#include "stack.h"
+#include "mem_alloc.h"
+
+/* Count the elements on the stack by walking the list from *stack until the
+   terminating NULL link.  The stack itself is not modified. */
+int pllStackSize (pllStack ** stack)
+{
+  int count = 0;
+  pllStack * node;
+
+  for (node = *stack; node; node = node->next)
+  {
+    ++ count;
+  }
+
+  return (count);
+}
+
+/* Push item onto the stack whose top pointer is *head.
+   Returns 1 on success, or 0 if the new node could not be allocated (the
+   stack is left unchanged in that case). */
+int 
+pllStackPush (pllStack ** head, void * item)
+{
+  pllStack * node = (pllStack *) rax_malloc (sizeof (pllStack));
+
+  if (node)
+   {
+     node->item = item;
+     node->next = *head;      /* old top becomes the second element */
+     *head      = node;
+     return (1);
+   }
+
+  return (0);
+}
+
+/* Remove the top element of the stack and return the item it carried.
+   The node is released with rax_free; the item itself is not freed.
+   Returns NULL when the stack is empty. */
+void * pllStackPop (pllStack ** head)
+{
+  pllStack * top = *head;
+  void * payload;
+
+  if (top == NULL) return (NULL);
+
+  payload = top->item;
+  *head   = top->next;
+  rax_free (top);
+
+  return (payload);
+}
+ 
+/* Pop every element until the stack is empty.  The popped items are
+   discarded without being freed, so the caller keeps ownership of them. */
+void 
+pllStackClear (pllStack ** stack)
+{
+  while (*stack != NULL)
+   {
+     pllStackPop (stack);
+   }
+}
+
diff --git a/pllrepo/src/stack.h b/pllrepo/src/stack.h
new file mode 100644
index 0000000..2ec64bd
--- /dev/null
+++ b/pllrepo/src/stack.h
@@ -0,0 +1,48 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file stack.h
+ * @brief Generic stack implementation
+ *
+ * Detailed description to appear soon.
+ */
+#ifndef __pll_STACK__
+#define __pll_STACK__
+
+/* One node of a singly linked stack; an empty stack is a NULL pointer. */
+typedef struct pllStack
+{
+  void * item;               /* payload supplied by the caller */
+  struct pllStack * next;    /* element below this one, NULL at the bottom */
+} pllStack;
+
+/* Pop all elements; the items themselves are not freed. */
+void  pllStackClear (pllStack ** stack);
+/* Remove the top element and return its item, or NULL if the stack is empty. */
+void * pllStackPop (pllStack ** head);
+/* Push item; returns 1 on success and 0 if allocation failed. */
+int pllStackPush (pllStack ** head, void * item);
+/* Number of elements currently on the stack. */
+int pllStackSize (pllStack ** stack);
+
+#endif
diff --git a/pllrepo/src/topologies.c b/pllrepo/src/topologies.c
new file mode 100644
index 0000000..f19bf3d
--- /dev/null
+++ b/pllrepo/src/topologies.c
@@ -0,0 +1,778 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file topologies.c
+ * @brief Miscellaneous functions working with tree topology
+*/
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+/* Recursive helper of saveTopolRELL.
+ *
+ * For an inner node p, every other node q in p's ring is recorded as one
+ * connection (q, q->back, branch lengths and, when the tree is grouped or
+ * constrained, the two constraint values) in tpl->connect[*i]; *i is then
+ * advanced and the traversal continues at q->back.  Tips end the recursion.
+ */
+static void saveTopolRELLRec(pllInstance *tr, nodeptr p, topolRELL *tpl, int *i, int numsp)
+{
+  nodeptr q;
+  int k;
+
+  if(isTip(p->number, numsp))
+    return;
+
+  for(q = p->next; q != p; q = q->next)
+    {
+      const int slot = *i;
+
+      tpl->connect[slot].p = q;
+      tpl->connect[slot].q = q->back;
+
+      if(tr->grouped ||  tr->constrained)
+        {
+          tpl->connect[slot].cp = tr->constraintVector[q->number];
+          tpl->connect[slot].cq = tr->constraintVector[q->back->number];
+        }
+
+      for(k = 0; k < PLL_NUM_BRANCHES; k++)
+        tpl->connect[slot].z[k] = q->z[k];
+
+      /* claim the slot before descending so the callee writes the next one */
+      *i = slot + 1;
+
+      saveTopolRELLRec(tr, q->back, tpl, i, numsp);
+    }
+}
+
+/* Store the current topology of tr in tpl.
+ *
+ * The branch at tr->start is written to connection slot 0, the remaining
+ * branches are collected recursively starting from tr->start->back.  The
+ * likelihood is copied as well and tpl->start is set to 1.  The function
+ * asserts that exactly 2 * mxtips - 3 connections were written.
+ */
+static void saveTopolRELL(pllInstance *tr, topolRELL *tpl)
+{
+  nodeptr root = tr->start;
+  int     b;
+  int     i = 0;
+
+  tpl->start      = 1;
+  tpl->likelihood = tr->likelihood;
+
+  tpl->connect[0].p = root;
+  tpl->connect[0].q = root->back;
+
+  if(tr->grouped ||  tr->constrained)
+    {
+      tpl->connect[0].cp = tr->constraintVector[root->number];
+      tpl->connect[0].cq = tr->constraintVector[root->back->number];
+    }
+
+  for(b = 0; b < PLL_NUM_BRANCHES; b++)
+    tpl->connect[0].z[b] = root->z[b];
+
+  i = 1;                        /* slot 0 is taken by the start branch */
+
+  saveTopolRELLRec(tr, root->back, tpl, &i, tr->mxtips);
+
+  assert(i == 2 * tr->mxtips - 3);
+}
+
+
+/* Rebuild the topology stored in tpl inside tr.
+ *
+ * Every saved connection is re-established with hookup() using the saved
+ * branch lengths.  The constraint values are written back only when the
+ * tree is grouped or constrained: saveTopolRELL fills cp/cq under exactly
+ * that condition, so in any other case those fields were never initialised
+ * and must not be copied into tr->constraintVector.
+ *
+ * Finally the likelihood is restored and tr->start is reset to the node
+ * whose index was saved in tpl->start.
+ */
+static void restoreTopolRELL(pllInstance *tr, topolRELL *tpl, int numBranches)
+{
+  int i;
+  const int haveConstraints = (tr->grouped || tr->constrained);
+  
+  for (i = 0; i < 2 * tr->mxtips - 3; i++) 
+    {
+      hookup(tpl->connect[i].p, tpl->connect[i].q, tpl->connect[i].z,  numBranches);
+
+      if (haveConstraints)
+        {
+          tr->constraintVector[tpl->connect[i].p->number] = tpl->connect[i].cp;
+          tr->constraintVector[tpl->connect[i].q->number] = tpl->connect[i].cq;
+        }
+    }
+  
+
+  tr->likelihood = tpl->likelihood;
+  tr->start      = tr->nodep[tpl->start];
+  /* TODO */
+}
+
+
+
+/** @brief Allocate a list of topology snapshots
+  *
+  * Allocates \a n \a topolRELL entries in \a rl. Each entry gets room for
+  * 2 * mxtips - 3 connections (one per branch of the tree in \a tr) and its
+  * likelihood is initialised to \b PLL_UNLIKELY.
+  *
+  * @param rl
+  *   List to be initialised
+  *
+  * @param tr
+  *   PLL instance; only \a mxtips is read, to size the connection arrays
+  *
+  * @param n
+  *   Number of topology entries to allocate
+  *
+  * @note
+  *   Allocation results are not checked.
+  */
+void initTL(topolRELL_LIST *rl, pllInstance *tr, int n)
+{
+  int slot;
+  topolRELL *entry;
+
+  rl->t   = (topolRELL **)rax_malloc(sizeof(topolRELL *) * n);
+  rl->max = n;
+
+  for(slot = 0; slot < n; slot++)
+    {
+      entry = (topolRELL *)rax_malloc(sizeof(topolRELL));
+      entry->connect    = (connectRELL *)rax_malloc((2 * tr->mxtips - 3) * sizeof(connectRELL));
+      entry->likelihood = PLL_UNLIKELY;
+      rl->t[slot] = entry;
+    }
+}
+
+/** @brief Deallocate the space associated with a topology list
+  *
+  * Frees the connection array and the entry itself for each of the
+  * \a rl->max entries, then the entry table. The \a topolRELL_LIST
+  * structure \a rl itself is not freed.
+  *
+  * @param rl
+  *   List whose contents are to be released
+  */
+void freeTL(topolRELL_LIST *rl)
+{
+  int slot = 0;
+
+  while(slot < rl->max)
+    {
+      topolRELL *entry = rl->t[slot++];
+
+      rax_free(entry->connect);
+      rax_free(entry);
+    }
+
+  rax_free(rl->t);
+}
+
+
+/** @brief Restore the n-th saved topology of a list into the tree
+  *
+  * @param rl           List of saved topologies
+  * @param tr           PLL instance that receives the topology
+  * @param n            Index of the entry to restore, 0 <= n < rl->max (asserted)
+  * @param numBranches  Number of branch lengths per branch, passed on to hookup
+  */
+void restoreTL(topolRELL_LIST *rl, pllInstance *tr, int n, int numBranches)
+{
+  topolRELL *saved;
+
+  assert(n >= 0 && n < rl->max);    
+
+  saved = rl->t[n];
+  restoreTopolRELL(tr, saved, numBranches);
+}
+
+
+
+/** @brief Reset a topology list
+  *
+  * Sets the likelihood of every entry back to \b PLL_UNLIKELY. The stored
+  * connections are left as they are.
+  *
+  * @param rl
+  *   List to reset
+  */
+void resetTL(topolRELL_LIST *rl)
+{
+  int remaining = rl->max;
+
+  while(remaining-- > 0)
+    rl->t[remaining]->likelihood = PLL_UNLIKELY;
+}
+
+
+
+/** @brief Save the current tree into a list entry if it is better
+  *
+  * Stores the topology of \a tr in entry \a index of \a rl, but only when
+  * the likelihood of \a tr is strictly greater than the one already stored
+  * in that entry.
+  *
+  * @param rl     List of saved topologies
+  * @param tr     PLL instance holding the current tree
+  * @param index  Entry to (possibly) overwrite, 0 <= index < rl->max (asserted)
+  */
+void saveTL(topolRELL_LIST *rl, pllInstance *tr, int index)
+{ 
+  topolRELL *slot;
+
+  assert(index >= 0 && index < rl->max);    
+
+  slot = rl->t[index];
+
+  if(!(tr->likelihood > slot->likelihood))
+    return;
+
+  saveTopolRELL(tr, slot);
+}
+
+
+/* Return the address of the node's number, used as its sort key. */
+static void  *tipValPtr (nodeptr p)
+{ 
+  void *key = &(p->number);
+
+  return  key;
+}
+
+
+/* Three-way comparison of the two ints that v1 and v2 point to:
+   -1 if *v1 < *v2, 0 if they are equal, +1 otherwise. */
+static int  cmpTipVal (void *v1, void *v2)
+{
+  const int  lhs = *((int *) v1);
+  const int  rhs = *((int *) v2);
+
+  if (lhs < rhs) return -1;
+  if (lhs > rhs) return  1;
+  return  0;
+}
+
+
+/*  These are the only routines that need to UNDERSTAND topologies */
+
+/** @brief Allocate and initialize space for a tree topology
+    
+    Allocate and initialize a \a topol structure for a tree topology of
+    \a maxtips tips. The link array gets room for 2 * maxtips - 3
+    connections. If the link array cannot be allocated, the already
+    allocated \a topol structure is released again so that nothing leaks.
+
+    @param maxtips
+      Number of tips of topology
+
+    @return
+      Pointer to the allocated \a topol structure, or \b NULL (after printing
+      an error message) if memory could not be obtained
+*/
+topol  *setupTopol (int maxtips)
+{
+  topol   *tpl;
+
+  tpl = (topol *) rax_malloc(sizeof(topol));
+  if (tpl)
+    {
+      tpl->links = (connptr) rax_malloc((2*maxtips-3) * sizeof(pllConnect));
+      if (! tpl->links)
+        {
+          rax_free(tpl);          /* do not leak the half-built structure */
+          tpl = (topol *) NULL;
+        }
+    }
+
+  if (! tpl)
+    {
+      printf("ERROR: Unable to get topology memory");
+      return  (topol *) NULL;
+    }
+
+  tpl->likelihood  = PLL_UNLIKELY;
+  tpl->start       = (node *) NULL;
+  tpl->nextlink    = 0;
+  tpl->ntips       = 0;
+  tpl->nextnode    = 0;    
+  tpl->scrNum      = 0;     /* position in sorted list of scores */
+  tpl->tplNum      = 0;     /* position in sorted list of trees */	      
+  
+  return  tpl;
+} 
+
+
+/** @brief Deallocate the space occupied by a \a topol structure
+    
+    Releases the link array and then the structure itself. Passing \b NULL
+    is allowed and does nothing, so the result of a failed \a setupTopol can
+    be handed to this function safely.
+
+    @param tpl
+      The \a topol structure that is to be deallocated, or \b NULL
+*/
+void freeTopol (topol *tpl)
+{
+  if (! tpl) return;
+
+  rax_free(tpl->links);
+  rax_free(tpl);
+} 
+
+
+/* Record the branch (p, p->back) and everything behind p->back in tpl.
+ *
+ * The branch takes the next free entry of tpl->links.  If p->back is a tip
+ * the entry's sort key (valptr) is that tip's number.  Otherwise every other
+ * node in the ring of p->back is saved recursively and the resulting entries
+ * are chained through descend/sibling in ascending order of their keys; the
+ * entry then inherits the key of its first (smallest) child.
+ *
+ * Index 0 doubles as the "none" marker in descend/sibling.
+ *
+ * Returns the index of the entry created for this branch.
+ */
+static int saveSubtree (nodeptr p, topol *tpl, int numsp, int numBranches)  
+{
+  const connptr  links = tpl->links;
+  const int      self  = (tpl->nextlink)++;
+  const connptr  entry = links + self;
+  const nodeptr  far   = p->back;
+  nodeptr        ring;
+  int            child, prev, cur, b;
+
+  entry->p = p;
+  entry->q = far;
+
+  for(b = 0; b < numBranches; b++)
+    entry->z[b] = p->z[b];
+
+  entry->descend = 0;                   /* no children recorded yet */
+
+  if (isTip(far->number, numsp)) 
+    {
+      entry->valptr = tipValPtr(far);
+      return  self;
+    }
+
+  ring = far->next;                     /* first child */
+  do 
+    {
+      child = saveSubtree(ring, tpl, numsp, numBranches);
+
+      /* find the insertion point in the key-sorted child list */
+      prev = 0;
+      for (cur = entry->descend;
+           cur && (cmpTipVal(links[cur].valptr, links[child].valptr) < 0);
+           cur = links[cur].sibling)
+        {
+          prev = cur;
+        }
+
+      if (prev)
+        links[prev].sibling = child;
+      else
+        entry->descend = child;
+      links[child].sibling = cur;
+
+      ring = ring->next;                /* next child */
+    } 
+  while (ring != far);
+
+  entry->valptr = links[entry->descend].valptr;   /* key of first child */
+
+  return  self;
+}
+
+/** @brief Get the tip with the smallest number in a subtree
+    
+    If \a p0 is a tip it is returned directly. Otherwise the subtrees reached
+    through the \a back pointers of the other nodes in the ring of \a p0 are
+    searched recursively and the tip with the smallest number is returned.
+    The direction \a p0->back itself is not searched.
+
+    @param p0
+      Node at which the recursion starts
+
+    @param numsp
+      Number of species (tips) in the tree
+
+    @return
+      Tip with the smallest number found
+
+    @todo
+      Why do we return p0 immediately if it is a tip? Perhaps one of the two other nodes,
+      i.e. p0->next and p0->next->next, is a tip as well with a smaller number than p0.
+*/
+static nodeptr minSubtreeTip (nodeptr  p0, int numsp)
+{ 
+  nodeptr  best, candidate, ring;
+
+  if (isTip(p0->number, numsp)) 
+    return p0;
+
+  ring = p0->next;
+  best = minSubtreeTip(ring->back, numsp);
+
+  for (ring = ring->next; ring != p0; ring = ring->next)
+    {
+      candidate = minSubtreeTip(ring->back, numsp);
+      if (cmpTipVal(tipValPtr(candidate), tipValPtr(best)) < 0)
+        best = candidate;
+    }
+
+  return best;
+} 
+
+
+/** @brief Get the tip with the smallest number on either side of a branch
+
+    Searches the subtree at \a p and the subtree at \a p->back with
+    \a minSubtreeTip and returns the smaller of the two results. On a tie
+    the result for \a p->back is returned.
+
+    @param p      Node on one end of the branch
+    @param numsp  Number of species (tips) in the tree
+*/
+static nodeptr  minTreeTip (nodeptr  p, int numsp)
+{
+  nodeptr  nearSide = minSubtreeTip(p, numsp);
+  nodeptr  farSide  = minSubtreeTip(p->back, numsp);
+
+  if (cmpTipVal(tipValPtr(nearSide), tipValPtr(farSide)) < 0)
+    return nearSide;
+
+  return farSide;
+}
+
+/** @brief Save the tree topology in a \a topol structure
+    
+    Save the current tree topology in \a topol structure \a tpl.
+
+    The topology is saved in a standard order so that first branches from a
+    node contain lower value tips than do second branches from the node. The
+    traversal is rooted at the tip with the lowest value of all. Likelihood,
+    start node, tip count and next-node counter are copied from \a tr too.
+
+    @param tr           PLL instance holding the tree
+    @param tpl          Destination; its link list is overwritten
+    @param numBranches  Number of branch lengths stored per branch
+*/
+void saveTree (pllInstance *tr, topol *tpl, int numBranches)
+{
+  nodeptr  rootTip;
+  int      rootIndex;
+
+  tpl->nextlink = 0;                             /* start with an empty link list */
+
+  rootTip   = minTreeTip(tr->start, tr->mxtips);
+  rootIndex = saveSubtree(rootTip, tpl, tr->mxtips, numBranches);
+  tpl->links[rootIndex].sibling = 0;             /* the root entry has no sibling */
+
+  tpl->likelihood = tr->likelihood;
+  tpl->start      = tr->start;
+  tpl->ntips      = tr->ntips;
+  tpl->nextnode   = tr->nextnode;
+}
+
+
+/** @brief Transform tree to a given topology and evaluate likelihood
+
+   Disconnects all nodes of \a tr, reconnects them according to the links
+   stored in \a tpl, copies likelihood, start node, tip count and next-node
+   counter back, and finally re-evaluates the likelihood at \a tr->start.
+
+   @param tpl
+     Saved topology
+
+   @param tr
+     PLL instance
+
+   @param pr
+     List of partitions
+
+   @return
+     \b PLL_TRUE (always)
+
+   @todo
+     Remove the return value, unnecessary
+
+*/
+pllBoolean restoreTree (topol *tpl, pllInstance *tr, partitionList *pr)
+{ 
+  const int  lastNode = 2*(tr->mxtips) - 2;
+  int        idx;
+
+  /* clear every back pointer first so that no node points anywhere;
+     the ring walk relies on p->next leading back to p for tips as well */
+  for (idx = 1; idx <= lastNode; idx++) 
+    {  
+      nodeptr first = tr->nodep[idx];
+      nodeptr cur   = first;
+
+      do 
+        {
+          cur->back = (nodeptr) NULL;
+          cur = cur->next;
+        } 
+      while (cur != first);
+    }
+
+  /* re-create the saved connections */
+  for (idx = 0; idx < tpl->nextlink; idx++)     
+    {
+      connptr link = tpl->links + idx;
+
+      hookup(link->p, link->q, link->z, pr->perGeneBranchLengths?pr->numberOfPartitions:1);
+    }
+
+  tr->likelihood = tpl->likelihood;
+  tr->start      = tpl->start;
+  tr->ntips      = tpl->ntips;
+  tr->nextnode   = tpl->nextnode;    
+
+  pllEvaluateLikelihood (tr, pr, tr->start, PLL_TRUE, PLL_FALSE);
+
+  return PLL_TRUE;
+}
+
+
+
+/** @brief Initialize a list of best trees
+    
+    Initialize a list that will contain the best \a newkeep tree topologies,
+    i.e. the ones that yield the best likelihood. On first-time setup
+    (\a bt->ninit <= 0) the function allocates the starting topology
+    \a bt->start and two pointer tables of \a newkeep + 1 entries each.
+    Slots 0 .. \a newkeep of \a byScore are then filled with freshly
+    allocated topologies of \a numsp tips, and \a byTopol is made to point at
+    the same objects. Slot 0 is not a ranked tree; elsewhere in this file
+    \a byScore[0] is used as the scratch topology for the tree being inserted.
+
+    On a later call \a newkeep is clamped to the number of slots that were
+    already set up. A \a newkeep smaller than 1 clears the list and uses the
+    absolute value (at least 1) as the number of trees to keep.
+
+    \a bt->ninit is read before it is assigned here, so the caller has to
+    initialize it (a value <= 0 selects first-time setup).
+
+    @param bt
+      Pointer to \a bestlist to be initialized
+
+    @param newkeep
+      Number of new topologies to keep
+
+    @param numsp
+      Number of species (tips)
+
+    @return
+      number of tree topology slots in the list (minus the starting one),
+      or 0 if an allocation during first-time setup failed
+
+    @note
+      NOTE(review): when one of the two table allocations fails, the
+      function returns 0 without releasing \a bt->start or the table that
+      did get allocated.
+
+    @todo
+      Is there a reason that this function is so complex? Many of the checks
+      are unnecessary as the function is called only at two places in the
+      code with newkeep=1 and newkeep=20
+*/
+int initBestTree (bestlist *bt, int newkeep, int numsp)
+{ /* initBestTree */
+  int  i;
+
+  bt->nkeep = 0;
+
+  if (bt->ninit <= 0)                    /* first-time setup */
+    {
+      if (! (bt->start = setupTopol(numsp)))  return  0;
+      bt->ninit    = -1;                 /* makes the slot loop below start at index 0 */
+      bt->nvalid   = 0;
+      bt->numtrees = 0;
+      bt->best     = PLL_UNLIKELY;
+      bt->improved = PLL_FALSE;
+      bt->byScore  = (topol **) rax_malloc((newkeep + 1) * sizeof(topol *));
+      bt->byTopol  = (topol **) rax_malloc((newkeep + 1) * sizeof(topol *));
+      if (! bt->byScore || ! bt->byTopol) {
+        printf( "initBestTree: malloc failure\n");
+        return 0;
+      }
+    }
+  else if (PLL_ABS(newkeep) > bt->ninit) {   /* cannot keep more than was set up */
+    if (newkeep <  0) newkeep = -(bt->ninit);
+    else newkeep = bt->ninit;
+  }
+
+  if (newkeep < 1) {    /*  Use negative newkeep to clear list  */
+    newkeep = -newkeep;
+    if (newkeep < 1) newkeep = 1;
+    bt->nvalid = 0;
+    bt->best = PLL_UNLIKELY;
+  }
+  
+  if (bt->nvalid >= newkeep) {           /* list shrinks: drop the tail */
+    bt->nvalid = newkeep;
+    bt->worst = bt->byScore[newkeep]->likelihood;
+  }
+  else 
+    {
+      bt->worst = PLL_UNLIKELY;
+    }
+  
+  for (i = bt->ninit + 1; i <= newkeep; i++)   /* allocate the slots that do not exist yet */
+    {    
+      if (! (bt->byScore[i] = setupTopol(numsp)))  break;
+      bt->byTopol[i] = bt->byScore[i];
+      bt->ninit = i;                     /* highest slot index set up so far */
+    }
+  
+  return  (bt->nkeep = PLL_MIN(newkeep, bt->ninit));
+} /* initBestTree */
+
+
+
+/* Empty the best-tree list without releasing any memory: no entry is valid
+   any more, best and worst scores fall back to PLL_UNLIKELY and the
+   "improved" flag is cleared. */
+void resetBestTree (bestlist *bt)
+{
+  bt->nvalid   = 0;
+  bt->improved = PLL_FALSE;
+  bt->worst    = PLL_UNLIKELY;
+  bt->best     = PLL_UNLIKELY;
+}
+
+
+/* Release everything owned by a best-tree list: the topologies in slots
+   ninit .. 0 of byScore (byTopol refers to the same objects and is therefore
+   not walked), the two pointer tables and the starting topology.
+   bt->ninit is left at -1.  Always returns PLL_TRUE. */
+pllBoolean  freeBestTree(bestlist *bt)
+{
+  for (; bt->ninit >= 0; bt->ninit--)
+    freeTopol(bt->byScore[bt->ninit]);
+
+  rax_free(bt->byTopol);
+  rax_free(bt->byScore);
+
+  freeTopol(bt->start);
+
+  return PLL_TRUE;
+}
+
+
+/*  Compare two trees, assuming that each is in standard order.  Return
+ *  -1 if first precedes second, 0 if they are identical, or +1 if first
+ *  follows second in standard order.  Lower number tips precede higher
+ *  number tips.  A tip precedes a corresponding internal node.  Internal
+ *  nodes are compared child by child; the one that runs out of children
+ *  first precedes the other.
+ *
+ *  p10 and p20 are the bases of the two link arrays, p1 and p2 the entries
+ *  being compared.
+ */
+
+static int  cmpSubtopol (connptr p10, connptr p1, connptr p20, connptr p2)
+{
+  connptr  child1, child2;
+  int  result;
+  
+  if (! p1->descend)
+    {
+      if (p2->descend) return -1;                /* p1 = tip, p2 = node */
+      return cmpTipVal(p1->valptr, p2->valptr);  /* two tips */
+    }
+  if (! p2->descend) return  1;                  /* p2 = tip, p1 = node */
+  
+  /* two internal nodes: walk both child lists in parallel */
+  child1 = p10 + p1->descend;
+  child2 = p20 + p2->descend;
+  for (;;)
+    {
+      result = cmpSubtopol(p10, child1, p20, child2);
+      if (result)  return result;
+
+      if (! child1->sibling)                     /* first list exhausted */
+        return child2->sibling ? -1 : 0;
+      if (! child2->sibling)                     /* only second list exhausted */
+        return  1;
+
+      child1 = p10 + child1->sibling;
+      child2 = p20 + child2->sibling;
+    }
+}
+
+
+
+/* Ordering function for saved topologies (both arguments are topol *).
+   The numbers of the nodes stored in the first link are compared first;
+   only if they are equal are the two link trees compared recursively. */
+static int  cmpTopol (void *tpl1, void *tpl2)
+{ 
+  connptr  links1  = ((topol *) tpl1)->links;
+  connptr  links2  = ((topol *) tpl2)->links;
+  int      rootCmp = cmpTipVal(tipValPtr(links1->p), tipValPtr(links2->p));
+
+  return  rootCmp ? rootCmp : cmpSubtopol(links1, links1, links2, links2);
+} 
+
+
+
+/* Ordering function by score (both arguments are topol *).  A higher
+   likelihood sorts first: -1 if tpl1 is better, 0 if equal, +1 otherwise. */
+static int  cmpTplScore (void *tpl1, void *tpl2)
+{ 
+  const double  lhs = ((topol *) tpl1)->likelihood;
+  const double  rhs = ((topol *) tpl2)->likelihood;
+
+  if (lhs > rhs)  return -1;
+  if (lhs == rhs) return  0;
+  return  1;
+}
+
+
+
+/*  Find an item in a sorted list of n items.  If the item is in the list,
+ *  return its index.  If it is not in the list, return the negative of the
+ *  position into which it should be inserted.
+ *
+ *  Positions are 1-based: position k refers to list[k-1].  The list has to
+ *  be sorted in ascending order with respect to cmpFunc, which is called as
+ *  cmpFunc(item, element) and returns <0, 0 or >0.  For an empty list the
+ *  result is -1.
+ */
+
+static int  findInList (void *item, void *list[], int n, int (* cmpFunc)(void *, void *))
+{
+  int  mid, hi, lo, cmp = 0;
+  
+  if (n < 1) return  -1;                    /*  No match; first index  */
+  
+  lo = 1;
+  mid = 0;                                  /*  0 = nothing probed yet  */
+  hi = n;
+  while (lo < hi) {                         /*  Binary search over positions lo..hi  */
+    mid = (lo + hi) >> 1;
+    cmp = (* cmpFunc)(item, list[mid-1]);
+    if (cmp) {
+      if (cmp < 0) hi = mid;
+      else lo = mid + 1;
+    }
+    else  return  mid;                        /*  Exact match  */
+  }
+  
+  if (lo != mid) {                          /*  Position lo was not the last one probed  */
+    cmp = (* cmpFunc)(item, list[lo-1]);
+    if (cmp == 0) return lo;
+  }
+  if (cmp > 0) lo++;                         /*  Result of step = 0 test  */
+  return  -lo;
+} 
+
+
+
+/* Save the current tree of tr into the scratch slot byScore[0] and look it
+   up among the bt->nvalid topologies in byTopol[1..].  The result follows
+   the findInList convention: a positive 1-based position if the topology
+   is already present, otherwise the negated insertion position. */
+static int  findTreeInList (bestlist *bt, pllInstance *tr, int numBranches)
+{
+  topol  *scratch = bt->byScore[0];
+  void  **sorted;
+
+  saveTree(tr, scratch, numBranches);
+
+  sorted = (void **) (& (bt->byTopol[1]));
+  return  findInList((void *) scratch, sorted, bt->nvalid, cmpTopol);
+} 
+
+
+/** @brief Save the current tree in the \a bestlist structure
+    
+    Save the current tree topology in \a bestlist structure \a bt.
+
+    The tree is first written to the scratch slot \a byScore[0] and looked
+    up by topology. If the topology is already listed, its entry is replaced.
+    If it is not listed and its likelihood is below \a bt->worst, nothing is
+    stored. Otherwise it is inserted, taking over the worst-scoring slot
+    (growing the list by one while fewer than \a nkeep trees are valid). Both
+    sorted tables are shifted so that the tree ends up at its position by
+    score and by topology, and the displaced topology becomes the new
+    scratch slot.
+
+    @param tr
+      The PLL instance
+    
+    @param bt
+      The \a bestlist structure
+    
+    @param numBranches
+      Number of branch lengths stored per branch (passed on to \a saveTree)
+
+    @return
+      Position of the tree in the list sorted by score, or 0 if the tree was
+      not good enough to be kept
+
+    @todo
+      What to do with the return value? Should we simplify the code?
+*/
+int  saveBestTree (bestlist *bt, pllInstance *tr, int numBranches)
+{    
+  topol  *tpl, *reuse;
+  int  tplNum, scrNum, reuseScrNum, reuseTplNum, i, oldValid, newValid;
+  
+  tplNum = findTreeInList(bt, tr, numBranches);   /* also saves the tree into byScore[0] */
+  tpl = bt->byScore[0];
+  oldValid = newValid = bt->nvalid;
+  
+  if (tplNum > 0) {                      /* Topology is in list  */
+    reuse = bt->byTopol[tplNum];         /* Matching topol  */
+    reuseScrNum = reuse->scrNum;
+    reuseTplNum = reuse->tplNum;
+  }
+  /* Good enough to keep? */
+  else if (tr->likelihood < bt->worst)  return 0;
+  
+  else {                                 /* Topology is not in list */
+    tplNum = -tplNum;                    /* Add to list (not replace) */
+    if (newValid < bt->nkeep) bt->nvalid = ++newValid;
+    reuseScrNum = newValid;              /* Take worst tree */
+    reuse = bt->byScore[reuseScrNum];
+    reuseTplNum = (newValid > oldValid) ? newValid : reuse->tplNum;
+    if (tr->likelihood > bt->start->likelihood) bt->improved = PLL_TRUE;
+  }
+  
+  scrNum = findInList((void *) tpl, (void **) (& (bt->byScore[1])),
+		      oldValid, cmpTplScore);
+  scrNum = PLL_ABS(scrNum);              /* position by score, found or not */
+  
+  if (scrNum < reuseScrNum)              /* shift entries down to open slot scrNum */
+    for (i = reuseScrNum; i > scrNum; i--)
+      (bt->byScore[i] = bt->byScore[i-1])->scrNum = i;
+  
+  else if (scrNum > reuseScrNum) {       /* shift entries up to close the vacated slot */
+    scrNum--;
+    for (i = reuseScrNum; i < scrNum; i++)
+      (bt->byScore[i] = bt->byScore[i+1])->scrNum = i;
+  }
+  
+  if (tplNum < reuseTplNum)              /* same shifting for the table sorted by topology */
+    for (i = reuseTplNum; i > tplNum; i--)
+      (bt->byTopol[i] = bt->byTopol[i-1])->tplNum = i;
+  
+  else if (tplNum > reuseTplNum) {
+    tplNum--;
+    for (i = reuseTplNum; i < tplNum; i++)
+      (bt->byTopol[i] = bt->byTopol[i+1])->tplNum = i;
+  }
+  
+  
+  
+  tpl->scrNum = scrNum;
+  tpl->tplNum = tplNum;
+  bt->byTopol[tplNum] = bt->byScore[scrNum] = tpl;
+  bt->byScore[0] = reuse;                /* displaced topology becomes the scratch slot */
+  
+  if (scrNum == 1)  bt->best = tr->likelihood;
+  if (newValid == bt->nkeep) bt->worst = bt->byScore[newValid]->likelihood;
+  
+  return  scrNum;
+} 
+
+
+/** @brief Restore the best tree from \a bestlist structure
+    
+    Restore the \a rank-th best tree from the \a bestlist structure \a bt.
+    The rank is clamped to the range 1 .. \a bt->nvalid first; if the list
+    holds no valid tree the clamped rank is 0 and nothing is restored.
+
+    @param bt
+      The \a bestlist structure containing the stored best trees
+
+    @param rank
+      The rank (by score) of the tree we want to retrieve
+
+    @param tr
+      PLL instance
+
+    @param pr
+      List of partitions
+
+    @return
+      Index (rank) of restored topology in \a bestlist, or \b PLL_FALSE if
+      \a restoreTree reported failure
+*/
+int  recallBestTree (bestlist *bt, int rank, pllInstance *tr, partitionList *pr)
+{ 
+  int  which = (rank < 1) ? 1 : rank;
+
+  if (which > bt->nvalid)  which = bt->nvalid;
+
+  if (which > 0 && ! restoreTree(bt->byScore[which], tr, pr))
+    return PLL_FALSE;
+
+  return  which;
+}
+
+
+
+
diff --git a/pllrepo/src/trash.c b/pllrepo/src/trash.c
new file mode 100644
index 0000000..5247c25
--- /dev/null
+++ b/pllrepo/src/trash.c
@@ -0,0 +1,129 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file trash.c
+ */
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>  
+#endif
+
+#include <limits.h>
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+ 
+  
+/** @brief Reorder nodes in PLL tree
+
+    Re-order the internal nodes of the tree of PLL instance \a tr in a preorder
+    traversal such that they start from \a p. For every inner node visited,
+    the entry of \a np whose ring contains \a p is located and \a p itself is
+    stored in the next free inner-node slot of \a tr->nodep. The traversal
+    then continues at \a p->next->back and \a p->next->next->back.
+    
+    @param tr
+      PLL instance
+
+    @param np
+      Array of node pointers (copy of the inner-node part of \a tr->nodep)
+
+    @param p
+      Node from where the preorder traversal should start
+
+    @param count
+      Number of inner nodes placed so far; incremented for every inner node
+      visited
+*/
+static void reorderNodes(pllInstance *tr, nodeptr *np, nodeptr p, int *count)
+{
+  const int firstInner = tr->mxtips + 1;
+  const int lastInner  = tr->mxtips + tr->mxtips - 1;
+  int i, found = 0;
+
+  if(isTip(p->number, tr->mxtips))
+    return;
+
+  for(i = firstInner; i <= lastInner; i++)
+    {
+      if (p == np[i] || p == np[i]->next || p == np[i]->next->next)
+        {
+          /* whichever ring member matched, it is p that gets stored */
+          tr->nodep[*count + tr->mxtips + 1] = p;
+          *count = *count + 1;
+          found = 1;
+          break;
+        }
+    }
+
+  assert(found != 0);
+
+  reorderNodes(tr, np, p->next->back, count);
+  reorderNodes(tr, np, p->next->next->back, count);
+}
+
+/* Renumber the inner-node slots of tr->nodep in preorder.
+ *
+ * tr->start is set to tip 1 and tr->rooted to PLL_FALSE.  A temporary copy
+ * of the inner-node pointers is taken, and reorderNodes then refills
+ * tr->nodep starting the traversal at tr->start->back.
+ *
+ * TODO why is tr->rooted set to PLL_FALSE here ?
+ */
+void nodeRectifier(pllInstance *tr)
+{
+  const int firstInner = tr->mxtips + 1;
+  const int lastInner  = tr->mxtips + tr->mxtips - 1;
+  nodeptr  *snapshot   = (nodeptr *)rax_malloc(2 * tr->mxtips * sizeof(nodeptr));
+  int       placed     = 0;
+  int       i;
+
+  tr->rooted      = PLL_FALSE;
+  tr->start       = tr->nodep[1];
+
+  for(i = lastInner; i >= firstInner; i--)
+    snapshot[i] = tr->nodep[i];
+
+  reorderNodes(tr, snapshot, tr->start->back, &placed);
+
+  rax_free(snapshot);
+}
+
+/* Walk from p along next->back until a tip is reached and return it.
+   If p is a tip already, p itself is returned. */
+nodeptr findAnyTip(nodeptr p, int numsp)
+{ 
+  while (! isTip(p->number, numsp))
+    p = p->next->back;
+
+  return  p;
+} 
+
diff --git a/pllrepo/src/treeIO.c b/pllrepo/src/treeIO.c
new file mode 100644
index 0000000..0a63b40
--- /dev/null
+++ b/pllrepo/src/treeIO.c
@@ -0,0 +1,236 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file treeIO.c
+ */
+#include "mem_alloc.h"
+
+#include "mem_alloc.h"
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h> 
+#endif
+
+#include <math.h>
+#include <time.h> 
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pll.h"
+#include "pllInternal.h"
+
+extern char *likelihood_key;
+extern char *ntaxa_key;
+extern char *smoothed_key;
+extern int partCount;
+
+/* Count the tips in the subtree at p: a tip counts as 1, an inner node as
+   the sum over the subtrees reached through the back pointers of the other
+   nodes in its ring (the direction p->back is not included). */
+int countTips(nodeptr p, int numsp)
+{
+  nodeptr ring;
+  int     total = 0;
+
+  if(isTip(p->number, numsp))
+    return 1;
+
+  for(ring = p->next; ring != p; ring = ring->next)
+    total += countTips(ring->back, numsp);
+
+  return total;
+}
+
+
+/* Convert the internal branch value z of the branch at p into a branch
+ * length x = -log(z) * fracchange, with z clamped from below to PLL_ZMIN.
+ *
+ * With a single joint branch length the global tr->fracchange is used.
+ * With per-partition branch lengths, perGene selects what is returned:
+ *   PLL_SUMMARIZE_LH  sum over all partitions of the per-partition length
+ *                     weighted by the partition's partitionContribution
+ *   otherwise         length for partition index perGene, which must lie
+ *                     in [0, numBranches)
+ * perGene must never be PLL_NO_BRANCHES.
+ */
+static double getBranchLength(pllInstance *tr, partitionList *pr, int perGene, nodeptr p)
+{
+  double 
+    z = 0.0,
+    x = 0.0;
+  int numBranches = pr->perGeneBranchLengths?pr->numberOfPartitions:1;
+
+  assert(perGene != PLL_NO_BRANCHES);
+	      
+  if(numBranches == 1)
+    {
+      assert(tr->fracchange != -1.0);
+      z = p->z[0];
+      if (z < PLL_ZMIN) 
+	z = PLL_ZMIN;      	 
+      
+      x = -log(z) * tr->fracchange;           
+    }
+  else
+    {
+      if(perGene == PLL_SUMMARIZE_LH)
+	{
+	  int 
+	    i;
+	  
+	  double 
+	    avgX = 0.0;
+		      
+	  for(i = 0; i < numBranches; i++)
+	    {
+	      assert(pr->partitionData[i]->partitionContribution != -1.0);
+	      assert(pr->partitionData[i]->fracchange != -1.0);
+	      z = p->z[i];
+	      if(z < PLL_ZMIN) 
+		z = PLL_ZMIN;      	 
+	      x = -log(z) * pr->partitionData[i]->fracchange;
+	      avgX += x * pr->partitionData[i]->partitionContribution;
+	    }
+
+	  x = avgX;
+	}
+      else
+	{	
+	  /* check the index before it is used to address partitionData */
+	  assert(perGene >= 0 && perGene < numBranches);
+	  assert(pr->partitionData[perGene]->fracchange != -1.0);
+	  
+	  z = p->z[perGene];
+	  
+	  if(z < PLL_ZMIN) 
+	    z = PLL_ZMIN;      	 
+	  
+	  x = -log(z) * pr->partitionData[perGene]->fracchange;
+	}
+    }
+
+  return x;
+}
+
+/* Recursive worker of pllTreeToNewick.
+ *
+ * Writes the Newick text of the subtree at p to treestr and returns a
+ * pointer to the terminating NUL, i.e. the position where writing continues.
+ * A tip is written as its name (printNames) or its number; an inner node as
+ * "(" child "," child ")", with p->back added as a third child when p is
+ * tr->start->back.  After the node itself the branch annotation follows:
+ *   - for tr->start->back the tree is closed with ";\n", preceded by ":0.0"
+ *     when branch lengths are printed and rellTree is off;
+ *   - with rellTree / branchLabelSupport / printSHSupport, inner branches
+ *     additionally carry p->bInf->support;
+ *   - otherwise ":<length>" if printBranchLengths is set, else nothing.
+ *
+ * printLikelihood and finalPrint are only passed through.  The caller has
+ * to provide a buffer large enough for the whole tree; no bounds are
+ * checked here.
+ */
+static char *pllTreeToNewickREC(char *treestr, pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean printBranchLengths, pllBoolean printNames,
+			    pllBoolean printLikelihood, pllBoolean rellTree, pllBoolean finalPrint, int perGene, pllBoolean branchLabelSupport, pllBoolean printSHSupport)
+{
+  if(! isTip(p->number, tr->mxtips))
+    {
+      *treestr++ = '(';
+      treestr = pllTreeToNewickREC(treestr, tr, pr, p->next->back, printBranchLengths, printNames, printLikelihood, rellTree,
+                                   finalPrint, perGene, branchLabelSupport, printSHSupport);
+      *treestr++ = ',';
+      treestr = pllTreeToNewickREC(treestr, tr, pr, p->next->next->back, printBranchLengths, printNames, printLikelihood, rellTree,
+                                   finalPrint, perGene, branchLabelSupport, printSHSupport);
+      if(p == tr->start->back)
+        {
+          /* top-level node: the start branch supplies the third subtree */
+          *treestr++ = ',';
+          treestr = pllTreeToNewickREC(treestr, tr, pr, p->back, printBranchLengths, printNames, printLikelihood, rellTree,
+                                       finalPrint, perGene, branchLabelSupport, printSHSupport);
+        }
+      *treestr++ = ')';
+    }
+  else
+    {
+      if(printNames)
+        {
+          char *label = tr->nameList[p->number];
+
+          sprintf(treestr, "%s", label);
+        }
+      else
+        sprintf(treestr, "%d", p->number);
+
+      for (; *treestr; treestr++)
+        ;
+    }
+
+  if(p == tr->start->back)
+    {
+      if(printBranchLengths && !rellTree)
+        sprintf(treestr, ":0.0;\n");
+      else
+        sprintf(treestr, ";\n");
+    }
+  else if(rellTree || branchLabelSupport || printSHSupport)
+    {
+      if(( !isTip(p->number, tr->mxtips)) &&
+         ( !isTip(p->back->number, tr->mxtips)))
+        {
+          /* inner branch: support value is available */
+          assert(p->bInf != (branchInfo *)NULL);
+
+          if(rellTree)
+            sprintf(treestr, "%d:%8.20f", p->bInf->support, p->z[0]);
+          if(branchLabelSupport)
+            sprintf(treestr, ":%8.20f[%d]", p->z[0], p->bInf->support);
+          if(printSHSupport)
+            sprintf(treestr, ":%8.20f[%d]", getBranchLength(tr, pr, perGene, p), p->bInf->support);
+        }
+      else
+        {
+          if(rellTree || branchLabelSupport)
+            sprintf(treestr, ":%8.20f", p->z[0]);
+          if(printSHSupport)
+            sprintf(treestr, ":%8.20f", getBranchLength(tr, pr, perGene, p));
+        }
+    }
+  else if(printBranchLengths)
+    sprintf(treestr, ":%8.20f", getBranchLength(tr, pr, perGene, p));
+  else
+    sprintf(treestr, "%s", "\0");
+
+  for (; *treestr; treestr++)
+    ;
+
+  return  treestr;
+}
+
+
+/* Write the tree as a Newick string into treestr, starting the traversal at
+ * node p, and return a pointer to the terminating NUL of the result.
+ *
+ * rellTree, branchLabelSupport and printSHSupport are mutually exclusive
+ * (asserted).  The buffer must be large enough for the whole string; its
+ * size is not checked.  See pllTreeToNewickREC for the output format.
+ */
+char *pllTreeToNewick(char *treestr, pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean printBranchLengths, pllBoolean printNames, pllBoolean printLikelihood,
+		  pllBoolean rellTree, pllBoolean finalPrint, int perGene, pllBoolean branchLabelSupport, pllBoolean printSHSupport)
+{ 
+  if(rellTree)
+    assert(!branchLabelSupport && !printSHSupport);
+
+  if(branchLabelSupport)
+    assert(!rellTree && !printSHSupport);
+
+  if(printSHSupport)
+    assert(!branchLabelSupport && !rellTree);
+
+  /* The worker appends every piece at the previous terminator and returns
+     the position of the final NUL, which is the first NUL after treestr. */
+  return pllTreeToNewickREC(treestr, tr, pr, p, printBranchLengths, printNames, printLikelihood, rellTree,
+                            finalPrint, perGene, branchLabelSupport, printSHSupport);
+}
+
diff --git a/pllrepo/src/treeIO.h b/pllrepo/src/treeIO.h
new file mode 100644
index 0000000..c91a1ab
--- /dev/null
+++ b/pllrepo/src/treeIO.h
@@ -0,0 +1,23 @@
+/*
+ * treeIO.h
+ *
+ *  Created on: Nov 22, 2012
+ *      Author: tung
+ */
+
+/*
+I just put some declarations of the functions that I need here.
+Please extend this file. It's important to have a header file.
+It makes things much easier for integration with other software.
+*/
+
+#ifndef TREEIO_H_
+#define TREEIO_H_
+
+#include "pll.h"
+
+/* The parameter list has to mirror the definition of pllTreeToNewick(), which
+   takes a pllInstance and a partitionList (not the former `tree *`). */
+char *pllTreeToNewick(char *treestr, pllInstance *tr, partitionList *pr, nodeptr p, pllBoolean printBranchLengths, pllBoolean printNames, pllBoolean printLikelihood,
+		  pllBoolean rellTree, pllBoolean finalPrint, int perGene, pllBoolean branchLabelSupport, pllBoolean printSHSupport);
+double getBranchLength(pllInstance *tr, partitionList *pr, int perGene, nodeptr p);
+
+#endif /* TREEIO_H_ */
diff --git a/pllrepo/src/utils.c b/pllrepo/src/utils.c
new file mode 100644
index 0000000..e7d0c42
--- /dev/null
+++ b/pllrepo/src/utils.c
@@ -0,0 +1,3734 @@
+/** 
+ * PLL (version 1.0.0) a software library for phylogenetic inference
+ * Copyright (C) 2013 Tomas Flouri and Alexandros Stamatakis
+ *
+ * Derived from 
+ * RAxML-HPC, a program for sequential and parallel estimation of phylogenetic
+ * trees by Alexandros Stamatakis
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ * 
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * For any other enquiries send an Email to Tomas Flouri
+ * Tomas.Flouri at h-its.org
+ *
+ * When publishing work that uses PLL please cite PLL
+ * 
+ * @file utils.c
+ *  
+ * @brief Miscellaneous general utility and helper functions
+ */
+#ifdef WIN32
+#include <direct.h>
+#endif
+
+#ifndef WIN32
+#include <sys/times.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+#include <math.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdarg.h>
+#include <limits.h>
+#include <assert.h>
+#include <errno.h>
+#include "cycle.h"
+
+
+#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#if (defined(__AVX) || defined(__SSE3))
+#include <xmmintrin.h>
+#endif
+/*
+   special bug fix, enforces denormalized numbers to be flushed to zero,
+   without this program is a tiny bit faster though.
+#include <emmintrin.h> 
+#define MM_DAZ_MASK    0x0040
+#define MM_DAZ_ON    0x0040
+#define MM_DAZ_OFF    0x0000
+*/
+#endif
+
+#include "pll.h"
+#include "pllInternal.h"
+
+#define GLOBAL_VARIABLES_DEFINITION
+
+#include "globalVariables.h"
+
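+/* Lookup tables, indexed by character code (0..255), that map the characters
+   of the BIN/DNA/AA alphabets to numeric codes; entries of -1 presumably mark
+   characters that do not belong to the alphabet.
+   NOTE(review): the element type is plain char, which is unsigned on some
+   platforms, so -1 may be stored as 255 there; confirm how callers test for
+   invalid entries. */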
+
+static const char PLL_MAP_BIN[256] =
+ {
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  3, -1, -1,
+    1,  2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  3,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+  };
+
+static const char PLL_MAP_NT[256] =
+ {
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15,
+   -1,  1, 14,  2, 13, -1, -1,  4, 11, -1, -1, 12, -1,  3, 15, 15,
+   -1, -1,  5,  6,  8,  8,  7,  9, 15, 10, -1, -1, -1, -1, -1, -1,
+   -1,  1, 14,  2, 13, -1, -1,  4, 11, -1, -1, 12, -1,  3, 15, 15,
+   -1, -1,  5,  6,  8,  8,  7,  9, 15, 10, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+ };
+
+static const char PLL_MAP_AA[256] =
+ {
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, 22, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22,
+   -1,  0, 20,  4,  3,  6, 13,  7,  8,  9, -1, 11, 10, 12,  2, -1,
+   14,  5,  1, 15, 16, -1, 19, 17, 22, 18, 21, -1, -1, -1, -1, -1,
+   -1,  0, 20,  4,  3,  6, 13,  7,  8,  9, -1, 11, 10, 12,  2, -1,
+   14,  5,  1, 15, 16, -1, 19, 17, 22, 18, 21, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+ };
+
+
+
+
+
+static void pllTreeInitDefaults (pllInstance * tr, int tips);
+static void getInnerBranchEndPointsRecursive (nodeptr p, int tips, int * i, node **nodes);
+#if (!defined(_FINE_GRAIN_MPI) && !defined(_USE_PTHREADS))
+static void initializePartitionsSequential(pllInstance *tr, partitionList *pr);
+#endif
+
+/** @defgroup instanceLinkingGroup Linking topology, partition scheme and alignment to the PLL instance
+    
+    This set of functions handles the linking of topology, partition scheme and multiple sequence alignment
+    with the PLL instance
+*/
+/***************** UTILITY FUNCTIONS **************************/
+
+#if (!defined(_SVID_SOURCE) && !defined(_BSD_SOURCE) && !defined(_POSIX_C_SOURCE) && !defined(_XOPEN_SOURCE) && !defined(_POSIX_SOURCE))
+/* Fallback re-entrant tokenizer with the calling convention of strtok_r():
+   pass the string on the first call and NULL afterwards; *save_ptr carries
+   the scan position between calls. The input string is modified in place.
+   Returns the next token, or NULL when only delimiters (or nothing) remain. */
+static char *
+my_strtok_r (char * s, const char * delim, char **save_ptr)
+{
+  char *start;
+  char *stop;
+
+  /* Continue from the saved position when no new string is given */
+  start = (s != NULL) ? s : *save_ptr;
+
+  /* Step over delimiters in front of the next token */
+  start += strspn (start, delim);
+  if (*start == '\0')
+   {
+     *save_ptr = start;
+     return NULL;
+   }
+
+  /* Look for the delimiter that ends the token */
+  stop = strpbrk (start, delim);
+  if (stop != NULL)
+   {
+     /* Cut the token off and remember where to resume */
+     *stop = '\0';
+     *save_ptr = stop + 1;
+   }
+  else
+   {
+     /* Last token: resume at the string terminator */
+     *save_ptr = strchr (start, '\0');
+   }
+
+  return start;
+}
+#endif
+
+#if (defined(_SVID_SOURCE) || defined(_BSD_SOURCE) || defined(_POSIX_C_SOURCE) || defined(_XOPEN_SOURCE) || defined(_POSIX_SOURCE))
+#define STRTOK_R strtok_r
+#else
+#define STRTOK_R my_strtok_r
+#endif
+
+
+
+
+/* Copy the executeModel flag of every partition in pr into the first
+   traversal descriptor of tr (tr->td[0].executeModel). */
+void storeExecuteMaskInTraversalDescriptor(pllInstance *tr, partitionList *pr)
+{
+  int part = 0;
+
+  while(part < pr->numberOfPartitions)
+    {
+      tr->td[0].executeModel[part] = pr->partitionData[part]->executeModel;
+      part++;
+    }
+}
+
+/* Copy one entry of value per partition of pr into the first traversal
+   descriptor of tr (tr->td[0].parameterValues). */
+void storeValuesInTraversalDescriptor(pllInstance *tr, partitionList *pr, double *value)
+{
+  int part = 0;
+
+  while(part < pr->numberOfPartitions)
+    {
+      tr->td[0].parameterValues[part] = value[part];
+      part++;
+    }
+}
+/** Return the bitVector member of the pLengths entry of a data type.
+    dataType must lie strictly between PLL_MIN_MODEL and PLL_MAX_MODEL
+    (asserted). */
+const unsigned int *getBitVector(int dataType)
+{
+  assert(PLL_MIN_MODEL < dataType && dataType < PLL_MAX_MODEL);
+
+  return pLengths[dataType].bitVector;
+}
+
+/*
+int getStates(int dataType)
+{
+  assert(PLL_MIN_MODEL < dataType && dataType < PLL_MAX_MODEL);
+
+  return pLengths[dataType].states;
+}
+*/
+/** Return the undetermined member of the pLengths entry of a data type.
+    dataType must lie strictly between PLL_MIN_MODEL and PLL_MAX_MODEL
+    (asserted). */
+int getUndetermined(int dataType)
+{
+  assert(PLL_MIN_MODEL < dataType && dataType < PLL_MAX_MODEL);
+
+  return pLengths[dataType].undetermined;
+}
+
+/** Look up the partitionLengths record that belongs to a partition.
+
+    @param p
+      Partition descriptor. Its dataType selects the entry of pLengths; its
+      states and maxTipStates fields must not be -1 (asserted)
+
+    @return
+      Pointer to the pLengths entry of the partition's data type
+*/
+const partitionLengths *getPartitionLengths(pInfo *p)
+{
+  int dataType  = p->dataType;
+  int states    = p->states;
+  int tipLength = p->maxTipStates;
+
+  assert(states != -1 && tipLength != -1);
+  assert(PLL_MIN_MODEL < dataType && dataType < PLL_MAX_MODEL);
+
+  return &pLengths[dataType];
+}
+
+/** @brief Number of discrete rate categories of a rate heterogeneity model
+
+    @param rateHetModel
+      Either \b PLL_CAT or \b PLL_GAMMA
+
+    @return
+      1 for \b PLL_CAT and 4 for \b PLL_GAMMA. Any other value trips an
+      assertion; if assertions are compiled out (NDEBUG) 0 is returned
+*/
+size_t discreteRateCategories(int rateHetModel)
+{
+  /* initialised so that no indeterminate value is ever returned when
+     assert() is disabled and an unknown model is passed in */
+  size_t
+    result = 0;
+
+  switch(rateHetModel)
+  {
+    case PLL_CAT:
+      result = 1;
+      break;
+    case PLL_GAMMA:
+      result = 4;
+      break;
+    default:
+      assert(0);
+  }
+
+  return result;
+}
+
+
+
+/** @brief Current wall-clock time in seconds
+
+    Intended for measuring elapsed time as the difference of two calls.
+    On WIN32 the resolution is one second; elsewhere gettimeofday() supplies
+    microseconds.
+
+    @return
+      Seconds since the epoch
+*/
+double gettime(void)
+{
+#ifdef WIN32
+  /* Use the full calendar time. A value built only from the minute and
+     second fields of the local time wraps around every hour, which makes
+     the difference of two readings negative across an hour boundary. */
+  return (double)time(NULL);
+#else
+  struct timeval ttime;
+  gettimeofday(&ttime , NULL);
+  return ttime.tv_sec + ttime.tv_usec * 0.000001;
+#endif
+}
+
+/* Return an integer derived from the current time (judging by the name it is
+   meant as a seed for a random number generator).
+   WIN32: seconds elapsed since the start of the current year in local time.
+   Otherwise: the sum of the seconds and microseconds fields reported by
+   gettimeofday(). */
+int gettimeSrand(void)
+{
+#ifdef WIN32
+  time_t now = time(NULL);
+  struct tm lt = *localtime(&now);
+
+  return 24*60*60*lt.tm_yday + 60*60*lt.tm_hour + 60*lt.tm_min  + lt.tm_sec;
+#else
+  struct timeval tv;
+
+  gettimeofday(&tv, NULL);
+
+  return tv.tv_sec + tv.tv_usec;
+#endif
+}
+/** Multiplicative congruential pseudo-random number generator.
+
+    The low 32 bits of \a *seed are split into chunks of 12, 12 and 8 bits so
+    that each partial product stays small. The chunks are multiplied by the
+    constant 406 * 4096 + 1549 = 1664525, the low 32 bits of the product are
+    written back to \a *seed, and that new state divided by 2^32 is returned.
+
+    @param seed
+      Generator state; read and then overwritten with the next state
+
+    @return
+      A value in the range [0,1)
+*/
+double randum (long  *seed)
+{
+  long  sum, mult0, mult1, seed0, seed1, seed2, newseed0, newseed1, newseed2;
+  double res;
+
+  mult0 = 1549;                      /* low 12 bits of the multiplier */
+  seed0 = *seed & 4095;              /* bits 0..11 of the state */
+  sum  = mult0 * seed0;
+  newseed0 = sum & 4095;
+  sum >>= 12;                        /* carry into the next chunk */
+  seed1 = (*seed >> 12) & 4095;      /* bits 12..23 of the state */
+  mult1 =  406;                      /* next 12 bits of the multiplier */
+  sum += mult0 * seed1 + mult1 * seed0;
+  newseed1 = sum & 4095;
+  sum >>= 12;
+  seed2 = (*seed >> 24) & 255;       /* bits 24..31 of the state */
+  sum += mult0 * seed2 + mult1 * seed1;
+  newseed2 = sum & 255;
+
+  *seed = newseed2 << 24 | newseed1 << 12 | newseed0;
+  /* 0.00390625 = 1/256 and 0.000244140625 = 1/4096, i.e. res = new state / 2^32 */
+  res = 0.00390625 * (newseed2 + 0.000244140625 * (newseed1 + 0.000244140625 * newseed0));
+
+  return res;
+}
+
+
+/********************* END UTILITY FUNCTIONS ********************/
+
+
+/******************************some functions for the likelihood computation ****************************/
+
+
+/** @brief Check whether a node is a tip.
+
+    A node counts as a tip when its number does not exceed the number of
+    tips, i.e. when \a number <= \a maxTips.
+
+    @param number
+      Node number to be checked; must be positive (asserted)
+
+    @param maxTips
+      Number of tips in the tree
+
+    @return
+      \b PLL_TRUE if tip, \b PLL_FALSE otherwise
+  */
+pllBoolean isTip(int number, int maxTips)
+{
+  assert(number > 0);
+
+  return (number <= maxTips) ? PLL_TRUE : PLL_FALSE;
+}
+
+/** @brief Set the orientation of a node
+
+    Moves the orientation flag \a x onto node \a p: if \a p->next (checked
+    first) or \a p->next->next currently has \a x set, that value is copied to
+    \a p->x and cleared on the node it was taken from. Afterwards \a p->x must
+    be non-zero (asserted), meaning that the conditional likelihood vector is
+    oriented on \a p.
+
+    @param p
+      Node which we want to orient
+*/
+void getxnode (nodeptr p)
+{
+  nodeptr holder = p->next;
+
+  /* p->next has priority; otherwise look at p->next->next */
+  if (!holder->x)
+    holder = holder->next;
+
+  if (holder->x)
+  {
+    p->x = holder->x;
+    holder->x = 0;
+  }
+
+  assert(p->x);
+}
+
+
+/** @brief Connect two nodes and assign branch lengths 
+  * 
+  * Makes \a p and \a q each other's \a back pointer and stores \a z[i] as the
+  * branch length of both nodes for every \e i below \a numBranches
+  *
+  * @param p
+  *   Node \a p
+  * 
+  * @param q
+  *   Node \a q
+  *
+  * @param z
+  *   Branch lengths to assign; at least \a numBranches entries are read
+  *
+  * @param numBranches
+  *   Number of entries of \a z to copy
+  */
+void hookup (nodeptr p, nodeptr q, double *z, int numBranches)
+{
+  int branch;
+
+  p->back = q;
+  q->back = p;
+
+  for(branch = 0; branch < numBranches; branch++)
+    {
+      q->z[branch] = z[branch];
+      p->z[branch] = q->z[branch];
+    }
+}
+
+/* Connect node p with q and copy the complete branch length vector z
+   (PLL_NUM_BRANCHES entries) to both of them */
+void hookupFull (nodeptr p, nodeptr q, double *z)
+{
+  const size_t bytes = PLL_NUM_BRANCHES*sizeof(double);
+
+  p->back = q;
+  q->back = p;
+
+  memcpy(p->z, z, bytes);
+  memcpy(q->z, z, bytes);
+}
+
+/* Connect node p with q and set all PLL_NUM_BRANCHES branch lengths of both
+   nodes to PLL_DEFAULTZ */
+void hookupDefault (nodeptr p, nodeptr q)
+{
+  int branch = 0;
+
+  p->back = q;
+  q->back = p;
+
+  while(branch < PLL_NUM_BRANCHES)
+    {
+      p->z[branch] = q->z[branch] = PLL_DEFAULTZ;
+      branch++;
+    }
+}
+
+
+/***********************reading and initializing input ******************/
+
+
+
+/* Return non-zero if ch is a blank, newline, tab or carriage return,
+   and zero for every other character */
+pllBoolean whitechar (int ch)
+{
+  switch(ch)
+    {
+      case ' ':
+      case '\n':
+      case '\t':
+      case '\r':
+        return 1;
+      default:
+        return 0;
+    }
+}
+/*
+static unsigned int KISS32(void)
+{
+  static unsigned int 
+    x = 123456789, 
+      y = 362436069,
+      z = 21288629,
+      w = 14921776,
+      c = 0;
+
+  unsigned int t;
+
+  x += 545925293;
+  y ^= (y<<13); 
+  y ^= (y>>17); 
+  y ^= (y<<5);
+  t = z + w + c; 
+  z = w; 
+  c = (t>>31); 
+  w = t & 2147483647;
+
+  return (x+y+w);
+}
+*/
+
+/** @brief Get a random subtree
+
+    Returns a randomly picked inner node record of the tree in PLL instance
+    \a tr. A candidate is drawn by choosing a random entry of \a tr->nodep with
+    an index between \a mxtips + 1 and 2 * \a mxtips - 2 and then stepping to
+    one of the three records linked through \a next. The draw is repeated as
+    long as both \a p->next->back and \a p->next->next->back are tips, so at
+    least one of these two neighbours of the returned node is not a tip.
+
+    NOTE(review): \a rand() % (tr->mxtips - 2) requires \a tr->mxtips > 2;
+    verify that callers guarantee this.
+
+    @param tr
+      PLL instance
+
+    @return
+      The root node of the randomly picked subtree
+*/
+nodeptr pllGetRandomSubtree(pllInstance *tr)
+{
+  nodeptr p;
+  do
+  {
+    int exitDirection = rand() % 3; /* which of the three records of the node to use */
+    p = tr->nodep[(rand() % (tr->mxtips - 2)) + 1 + tr->mxtips];
+    switch(exitDirection)
+    {
+      case 0:
+        break;
+      case 1:
+        p = p->next;
+        break;
+      case 2:
+        p = p->next->next;
+        break;
+      default:
+        assert(0);
+    }
+  }
+  while(isTip(p->next->back->number, tr->mxtips) && isTip(p->next->next->back->number, tr->mxtips));
+  assert(!isTip(p->number, tr->mxtips));
+  return p;
+}
+/* Small example routine that runs the ancestral state computation on every
+   inner node of the subtree rooted at p.
+
+   The subtree is visited in post-order: both child subtrees are handled
+   before the node itself.
+*/
+void computeAllAncestralVectors(nodeptr p, pllInstance *tr, partitionList *pr)
+{
+  /* a tip carries the real sequence, so there is nothing to reconstruct */
+  if(isTip(p->number, tr->mxtips))
+    return;
+
+  /* recurse into the two subtrees hanging off this inner node */
+  computeAllAncestralVectors(p->next->back, tr, pr);
+  computeAllAncestralVectors(p->next->next->back, tr, pr);
+
+  /* now the ancestral state at p itself can be computed */
+  pllUpdatePartialsAncestral(tr, pr, p);
+
+  /* print it to the terminal; the two PLL_TRUE flags ask for the marginal
+     probabilities as well as the discrete inner sequence (ACGT etc.), which
+     always shows the state with the highest probability */
+  printAncestralState(p, PLL_TRUE, PLL_TRUE, tr, pr);
+}
+
+
+/** Allocate the per-partition working buffers of every partition in
+    \a localPartitions.
+
+    Buffer sizes are derived from the partitionLengths entry returned by
+    getPartitionLengths(), from the partition's \a width and \a states, and
+    from \a localTree->mxtips and \a localTree->maxCategories. Memory is
+    obtained with rax_malloc(), rax_calloc() and rax_posix_memalign(); none of
+    the results is checked in this function.
+
+    NOTE(review): \a ancestralVectorWidth is accumulated below but not read
+    again inside this function.
+
+    @param localTree
+      PLL instance supplying tip count, thread id and the option flags
+      (fastScaling, saveMemory, rateHetModel) consulted here
+
+    @param localPartitions
+      Partition list whose partitionData entries receive the buffers
+*/
+void initializePartitionData(pllInstance *localTree, partitionList * localPartitions)
+{
+  /* in ancestralVectorWidth we store the total length in bytes (!) of 
+     one conditional likelihood array !
+     we need to know this length such that in the pthreads version the master thread can actually 
+     gather the scattered ancestral probabilities from the threads such that they can be printed to screen!
+  */
+
+  size_t 
+    maxCategories = (size_t)localTree->maxCategories;
+
+  size_t 
+    ancestralVectorWidth = 0,
+    model; 
+
+  int 
+    tid  = localTree->threadID,
+    innerNodes = localTree->mxtips - 2;
+
+  if(tid > 0)
+      localTree->rateCategory    = (int *)    rax_calloc((size_t)localTree->originalCrunchedLength, sizeof(int));           
+
+  for(model = 0; model < (size_t)localPartitions->numberOfPartitions; model++)
+    {
+      size_t 
+        width = localPartitions->partitionData[model]->width;
+
+      const partitionLengths 
+        *pl = getPartitionLengths(localPartitions->partitionData[model]);
+
+      /* 
+         globalScaler needs to be 2 * localTree->mxtips such that scalers of inner AND tip nodes can be added without a case switch
+         to this end, it must also be initialized with zeros -> calloc
+      */
+
+      localPartitions->partitionData[model]->globalScaler    = (unsigned int *)rax_calloc(2 *(size_t)localTree->mxtips, sizeof(unsigned int));
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->left),  PLL_BYTE_ALIGNMENT, (size_t)pl->leftLength * (maxCategories + 1) * sizeof(double));
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->right), PLL_BYTE_ALIGNMENT, (size_t)pl->rightLength * (maxCategories + 1) * sizeof(double));
+      localPartitions->partitionData[model]->EIGN              = (double*)rax_malloc((size_t)pl->eignLength * sizeof(double));
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->EV),    PLL_BYTE_ALIGNMENT, (size_t)pl->evLength * sizeof(double));
+      localPartitions->partitionData[model]->EI                = (double*)rax_malloc((size_t)pl->eiLength * sizeof(double));
+      localPartitions->partitionData[model]->substRates        = (double *)rax_malloc((size_t)pl->substRatesLength * sizeof(double));
+      localPartitions->partitionData[model]->frequencies       = (double*)rax_malloc((size_t)pl->frequenciesLength * sizeof(double));
+      localPartitions->partitionData[model]->freqExponents     = (double*)rax_malloc(pl->frequenciesLength * sizeof(double));
+      localPartitions->partitionData[model]->empiricalFrequencies       = (double*)rax_malloc((size_t)pl->frequenciesLength * sizeof(double));
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->tipVector), PLL_BYTE_ALIGNMENT, (size_t)pl->tipVectorLength * sizeof(double));
+      //localPartitions->partitionData[model]->partitionName      = NULL;   // very important since it is deallocated in pllPartitionDestroy
+      
+       /* the LG4M / LG4X protein models get four additional sets of model arrays */
+       if(localPartitions->partitionData[model]->dataType == PLL_AA_DATA
+               && (localPartitions->partitionData[model]->protModels == PLL_LG4M || localPartitions->partitionData[model]->protModels == PLL_LG4X))
+        {
+          int 
+            k;
+          
+          for(k = 0; k < 4; k++)
+            {       
+              localPartitions->partitionData[model]->EIGN_LG4[k]              = (double*)rax_malloc(pl->eignLength * sizeof(double));
+              rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->EV_LG4[k]), PLL_BYTE_ALIGNMENT, pl->evLength * sizeof(double));
+              localPartitions->partitionData[model]->EI_LG4[k]                = (double*)rax_malloc(pl->eiLength * sizeof(double));
+              localPartitions->partitionData[model]->substRates_LG4[k]        = (double *)rax_malloc(pl->substRatesLength * sizeof(double));
+              localPartitions->partitionData[model]->frequencies_LG4[k]       = (double*)rax_malloc(pl->frequenciesLength * sizeof(double));
+              rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->tipVector_LG4[k]), PLL_BYTE_ALIGNMENT, pl->tipVectorLength * sizeof(double));
+            }
+        }
+
+      localPartitions->partitionData[model]->symmetryVector    = (int *)rax_malloc((size_t)pl->symmetryVectorLength  * sizeof(int));
+      localPartitions->partitionData[model]->frequencyGrouping = (int *)rax_malloc((size_t)pl->frequencyGroupingLength  * sizeof(int));
+
+      localPartitions->partitionData[model]->perSiteRates      = (double *)rax_malloc(sizeof(double) * maxCategories);
+
+      localPartitions->partitionData[model]->nonGTR = PLL_FALSE;
+
+      localPartitions->partitionData[model]->gammaRates = (double*)rax_malloc(sizeof(double) * 4);
+      localPartitions->partitionData[model]->yVector = (unsigned char **)rax_malloc(sizeof(unsigned char*) * ((size_t)localTree->mxtips + 1));
+
+
+      localPartitions->partitionData[model]->xVector = (double **)rax_calloc(sizeof(double*), (size_t)localTree->mxtips);
+
+      if (localPartitions->partitionData[model]->ascBias)
+       {
+         localPartitions->partitionData[model]->ascOffset    = 4 * localPartitions->partitionData[model]->states * localPartitions->partitionData[model]->states;
+         localPartitions->partitionData[model]->ascVector    = (double *)rax_malloc(innerNodes * 
+                                                                                    localPartitions->partitionData[model]->ascOffset * 
+                                                                                    sizeof(double));
+         localPartitions->partitionData[model]->ascExpVector = (int *)rax_calloc(innerNodes *
+                                                                                 localPartitions->partitionData[model]->states,
+                                                                                 sizeof(int));
+         localPartitions->partitionData[model]->ascSumBuffer = (double *)rax_malloc(localPartitions->partitionData[model]->ascOffset * sizeof(double)); 
+       }
+
+
+      /* 
+         Initializing the xVector array like this is absolutely required !!!!
+         I don't know which programming genius removed this, but it must absolutely stay in here!!!!
+      */
+      
+      {
+        int k;
+        
+        for(k = 0; k < localTree->mxtips; k++)
+              localPartitions->partitionData[model]->xVector[k] = (double*)NULL;       
+      }
+
+
+      localPartitions->partitionData[model]->xSpaceVector = (size_t *)rax_calloc((size_t)localTree->mxtips, sizeof(size_t));
+
+      const size_t span = (size_t)(localPartitions->partitionData[model]->states) *
+              discreteRateCategories(localTree->rateHetModel);
+
+#ifdef __MIC_NATIVE
+
+      // Alexey: sum buffer padding for Xeon PHI
+      const int aligned_width = width % PLL_VECTOR_WIDTH == 0 ? width : width + (PLL_VECTOR_WIDTH - (width % PLL_VECTOR_WIDTH));
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->sumBuffer), PLL_BYTE_ALIGNMENT, aligned_width *
+                                                                                      span *
+                                                                                      sizeof(double));
+
+      // Alexey: fill padding entries with 1. (will be corrected with site weights, s. below)
+      {
+          int k;
+          for (k = width*span; k < aligned_width*span; ++k)
+              localPartitions->partitionData[model]->sumBuffer[k] = 1.;
+      }
+
+#else
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->sumBuffer), PLL_BYTE_ALIGNMENT, width *
+                                              span *
+                                              sizeof(double));
+#endif
+
+      /* Initialize buffers to store per-site log likelihoods */
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->perSiteLikelihoods), PLL_BYTE_ALIGNMENT, width * sizeof(double));
+
+      /* initialize data structures for per-site likelihood scaling */
+
+      if(localTree->fastScaling)
+        {
+           localPartitions->partitionData[model]->expVector      = (int **)NULL;
+           localPartitions->partitionData[model]->expSpaceVector = (size_t *)NULL;
+        }
+      else
+        {        
+          localPartitions->partitionData[model]->expVector      = (int **)rax_malloc(sizeof(int*) * innerNodes);
+           
+          /* 
+             Initializing the expVector array like this is absolutely required !!!!
+             Not doing this can (and did) cause segmentation faults !!!!
+          */
+          
+          {
+            int k;
+
+            for(k = 0; k < innerNodes; k++)
+              localPartitions->partitionData[model]->expVector[k] = (int*)NULL; 
+          }
+
+          localPartitions->partitionData[model]->expSpaceVector = (size_t *)rax_calloc(innerNodes, sizeof(size_t));
+        }
+
+      /* data structure to store the marginal ancestral probabilities in the sequential version or for each thread */
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->ancestralBuffer), PLL_BYTE_ALIGNMENT, width *
+                                                                                 (size_t)(localPartitions->partitionData[model]->states) *
+                                                                                 sizeof(double));
+
+      /* count and accumulate how many bytes we will need for storing a full ancestral vector. for this we add over the per-partition space requirements in bytes */
+      /* ancestralVectorWidth += ((size_t)(pr->partitionData[model]->upper - pr->partitionData[model]->lower) * (size_t)(localPartitions->partitionData[model]->states) * sizeof(double)); */
+      ancestralVectorWidth += ((size_t)(localPartitions->partitionData[model]->upper - localPartitions->partitionData[model]->lower) * (size_t)(localPartitions->partitionData[model]->states) * sizeof(double));
+      /* :TODO: do we have to use the original tree for that   */
+
+#ifdef __MIC_NATIVE
+
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->wgt), PLL_BYTE_ALIGNMENT, aligned_width * sizeof(int));
+
+      // Alexey: fill padding entries with 0.
+      {
+          int k;
+          for (k = width; k < aligned_width; ++k)
+              localPartitions->partitionData[model]->wgt[k] = 0;
+      }
+#else
+      rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->wgt), PLL_BYTE_ALIGNMENT, width * sizeof(int));
+#endif
+
+      /* rateCategory must be assigned using rax_calloc() at start up there is only one rate category 0 for all sites */
+
+      localPartitions->partitionData[model]->rateCategory = (int *)rax_calloc(width, sizeof(int));
+
+      /* gap bookkeeping is only allocated for non-empty partitions when the memory saving option is on */
+      if(width > 0 && localTree->saveMemory)
+        {
+          localPartitions->partitionData[model]->gapVectorLength = ((int)width / 32) + 1;
+          assert(4 == sizeof(unsigned int));
+          localPartitions->partitionData[model]->gapVector = (unsigned int*)rax_calloc((size_t)localPartitions->partitionData[model]->gapVectorLength * 2 * (size_t)localTree->mxtips, sizeof(unsigned int));
+          rax_posix_memalign ((void **)&(localPartitions->partitionData[model]->gapColumn),PLL_BYTE_ALIGNMENT, ((size_t)localTree->mxtips) *
+                                                                               ((size_t)(localPartitions->partitionData[model]->states)) *
+                                                                               discreteRateCategories(localTree->rateHetModel) * sizeof(double));
+        }
+      else
+        {
+          localPartitions->partitionData[model]->gapVectorLength = 0;
+          localPartitions->partitionData[model]->gapVector = (unsigned int*)NULL;
+          localPartitions->partitionData[model]->gapColumn = (double*)NULL;
+        }              
+    }
+}
+
+/* Return n rounded up to a multiple of 2: computes (n + 1) / 2 * 2 in
+   integer arithmetic */
+int virtual_width( int n ) {
+    const int global_vw = 2;
+    const int blocks = (n + 1) / global_vw;
+
+    return blocks * global_vw;
+}
+
+
+/* Second stage of instance set-up:
+   - with tr->saveMemory set, mark in each partition's gapVector every site
+     of every tip whose yVector entry equals the data type's undetermined code;
+   - with tr->useRecom set, call allocRecompVectorsInfo(), otherwise set
+     tr->rvec to NULL. */
+void initMemorySavingAndRecom(pllInstance *tr, partitionList *pr)
+{
+  /* initialize gap bit vectors at tips when memory saving option is enabled */
+  if(tr->saveMemory)
+    {
+      size_t model;
+
+      for(model = 0; model < (size_t)pr->numberOfPartitions; model++)
+        {
+          pInfo *part = pr->partitionData[model];
+          int undetermined = getUndetermined(part->dataType);
+          size_t width = part->width;
+          size_t tip;
+          size_t site;
+
+          /* nothing to mark in an empty partition */
+          if(width == 0)
+            continue;
+
+          for(tip = 1; tip <= (size_t)(tr->mxtips); tip++)
+            {
+              for(site = 0; site < width; site++)
+                {
+                  /* one bit per site, 32 sites per word, gapVectorLength words per tip */
+                  if(part->yVector[tip][site] == undetermined)
+                    part->gapVector[part->gapVectorLength * tip + site / 32] |= mask32[site % 32];
+                }
+            }
+        }
+    }
+
+  /* recom */
+  if(!tr->useRecom)
+    tr->rvec = (recompVectors*)NULL;
+  else
+    allocRecompVectorsInfo(tr);
+  /* E recom */
+}
+
+/** @brief Get the length of a specific branch
+
+    Reads the stored value \a p->z[partition_id], clamps it to the interval
+    [\b PLL_ZMIN, \b PLL_ZMAX] and decodes it from the PLL representation as
+    -log(z) * \a tr->fracchange.
+
+    @param tr
+      PLL instance; \a tr->fracchange must differ from -1.0 (asserted)
+
+    @param p
+      Specifies one end-point of the branch. The other one is \a p->back
+
+    @param partition_id
+      Specifies the partition; must be in [0, \b PLL_NUM_BRANCHES) (asserted)
+
+    @return
+      The branch length
+*/
+double pllGetBranchLength (pllInstance *tr, nodeptr p, int partition_id)
+{
+  double z;
+
+  assert(partition_id < PLL_NUM_BRANCHES);
+  assert(partition_id >= 0);
+  assert(tr->fracchange != -1.0);
+
+  z = p->z[partition_id];
+
+  /* keep the stored value inside the permitted range before decoding */
+  if(z < PLL_ZMIN)
+    z = PLL_ZMIN;
+  if(z > PLL_ZMAX)
+    z = PLL_ZMAX;
+
+  return (-log(z) * tr->fracchange);
+}
+
+/** @brief Set the length of a specific branch
+
+    Encodes \a bl to the PLL representation as exp(-bl / \a tr->fracchange),
+    clamps the result to [\b PLL_ZMIN, \b PLL_ZMAX] and stores it in
+    \a p->z[partition_id]. Only \a p is written; \a p->back is not touched.
+
+    @param tr
+      PLL instance; \a tr->fracchange must differ from -1.0 (asserted)
+
+    @param p
+      Specifies one end-point of the branch. The other one is \a p->back
+
+    @param partition_id
+      Specifies the partition; must be in [0, \b PLL_NUM_BRANCHES) (asserted)
+
+    @param bl
+      Branch length
+*/
+void pllSetBranchLength (pllInstance *tr, nodeptr p, int partition_id, double bl)
+{
+  double encoded;
+
+  assert(partition_id < PLL_NUM_BRANCHES);
+  assert(partition_id >= 0);
+  assert(tr->fracchange != -1.0);
+
+  encoded = exp((-1 * bl)/tr->fracchange);
+
+  /* keep the encoded value inside the permitted range */
+  if(encoded < PLL_ZMIN)
+    encoded = PLL_ZMIN;
+  if(encoded > PLL_ZMAX)
+    encoded = PLL_ZMAX;
+
+  p->z[partition_id] = encoded;
+}
+
+#if (!defined(_FINE_GRAIN_MPI) && !defined(_USE_PTHREADS))
+/* Sequential (non-MPI, non-pthreads) partition set-up:
+   1. check that each partition's width equals upper - lower,
+   2. allocate the per-partition buffers via initializePartitionData(),
+   3. point each partition's tip sequences into tr->yVector at the partition's
+      lower bound and copy its slice of the pattern weights tr->aliaswgt,
+   4. finish with initMemorySavingAndRecom(). */
+static void initializePartitionsSequential(pllInstance *tr, partitionList *pr)
+{
+  size_t model;
+
+  for(model = 0; model < (size_t)pr->numberOfPartitions; model++)
+    {
+      assert(pr->partitionData[model]->width == pr->partitionData[model]->upper - pr->partitionData[model]->lower);
+    }
+
+  initializePartitionData(tr, pr);
+
+  /* figure in tip sequence data per-site pattern weights */
+  for(model = 0; model < (size_t)pr->numberOfPartitions; model++)
+    {
+      pInfo *part = pr->partitionData[model];
+      size_t lower = part->lower;
+      size_t width = part->upper - lower;
+      size_t tip;
+
+      for(tip = 1; tip <= (size_t)tr->mxtips; tip++)
+        part->yVector[tip] = &(tr->yVector[tip][part->lower]);
+
+      memcpy((void*)(&(part->wgt[0])), (void*)(&(tr->aliaswgt[lower])), sizeof(int) * width);
+    }
+
+  initMemorySavingAndRecom(tr, pr);
+}
+#endif
+
+
+/* interface to outside  */
+//void initializePartitions(pllInstance *tr, pllInstance *localTree, partitionList *pr, partitionList *localPr, int tid, int n)
+//{
+//#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+//  initializePartitionsMaster(tr,localTree,pr,localPr,tid,n);
+//#else
+//  initializePartitionsSequential(tr, pr);
+//#endif
+//}
+
+/* Releases a linkage list: the partition list of each of its entries, the
+   entry array and finally the list structure itself. */
+static void freeLinkageList( linkageList* ll)
+{
+  int idx = 0;
+
+  while (idx < ll->entries)
+  {
+    rax_free(ll->ld[idx].partitionList);
+    ++idx;
+  }
+
+  rax_free(ll->ld);
+  rax_free(ll);
+}
+
+/** @brief free all data structures associated to a partition
+    
+    Frees the buffers of every partition in the list, the linkage lists
+    (alpha, frequency, rate), the partition array and the list structure
+    itself, and finally sets \a *partitions to \b NULL.
+
+    When built with \a _USE_PTHREADS the master first signals
+    \a PLL_THREAD_EXIT_GRACEFULLY through pllMasterBarrier and stops the
+    threads; with \a _FINE_GRAIN_MPI the master signals the same job code.
+    In both builds only the master frees the linkage lists, and
+    \a tr->y_ptr is released at the end.
+
+    @param tr
+      PLL instance. \a tr->mxtips gives the number of \a xVector entries
+      freed per partition; \a mxtips - 2 entries of \a expVector are freed
+
+    @param partitions
+      Address of the pointer to the partition list; \a *partitions is
+      \b NULL on return
+*/
+void 
+pllPartitionsDestroy (pllInstance * tr, partitionList ** partitions)
+{
+  int i, j, tips;
+  partitionList * pl = *partitions;
+
+#ifdef _USE_PTHREADS
+  int tid = tr->threadID;
+  if (MASTER_P) {
+     pllMasterBarrier (tr, pl, PLL_THREAD_EXIT_GRACEFULLY);
+     pllStopPthreads (tr);
+    }
+#endif
+
+  tips = tr->mxtips;
+
+#ifdef _USE_PTHREADS
+  if (MASTER_P) {
+#endif
+#ifdef _FINE_GRAIN_MPI
+if (MASTER_P) {
+     pllMasterBarrier (tr, pl, PLL_THREAD_EXIT_GRACEFULLY);
+#endif
+  freeLinkageList(pl->alphaList);
+  freeLinkageList(pl->freqList); 
+  freeLinkageList(pl->rateList);
+#ifdef _FINE_GRAIN_MPI
+}
+#endif
+
+#ifdef _USE_PTHREADS
+  }
+#endif
+  for (i = 0; i < pl->numberOfPartitions; ++ i)
+   {
+     rax_free (pl->partitionData[i]->gammaRates);
+     rax_free (pl->partitionData[i]->perSiteRates);
+     rax_free (pl->partitionData[i]->globalScaler);
+     rax_free (pl->partitionData[i]->left);
+     rax_free (pl->partitionData[i]->right);
+     rax_free (pl->partitionData[i]->EIGN);
+     rax_free (pl->partitionData[i]->EV);
+     rax_free (pl->partitionData[i]->EI);
+     rax_free (pl->partitionData[i]->substRates);
+     rax_free (pl->partitionData[i]->frequencies);
+     rax_free (pl->partitionData[i]->freqExponents);
+     rax_free (pl->partitionData[i]->empiricalFrequencies);
+     rax_free (pl->partitionData[i]->tipVector);
+     rax_free (pl->partitionData[i]->symmetryVector);
+     rax_free (pl->partitionData[i]->frequencyGrouping);
+     for (j = 0; j < tips; ++ j)                          /* NOTE(review): assumes xVector holds mxtips entries - confirm against its allocation */
+       rax_free (pl->partitionData[i]->xVector[j]);
+     rax_free (pl->partitionData[i]->xVector);
+     rax_free (pl->partitionData[i]->yVector);
+     rax_free (pl->partitionData[i]->xSpaceVector);
+     rax_free (pl->partitionData[i]->sumBuffer);
+     rax_free (pl->partitionData[i]->ancestralBuffer);
+     rax_free (pl->partitionData[i]->wgt);
+     rax_free (pl->partitionData[i]->rateCategory);
+     rax_free (pl->partitionData[i]->gapVector);
+     rax_free (pl->partitionData[i]->gapColumn);
+     rax_free (pl->partitionData[i]->perSiteLikelihoods);
+     rax_free (pl->partitionData[i]->partitionName);
+     rax_free (pl->partitionData[i]->expSpaceVector);
+     /*TODO: Deallocate all entries of expVector */
+     if (pl->partitionData[i]->expVector)                 /* expVector may be absent, so its entries are only walked when it is set */
+      {
+        for (j = 0; j < tips - 2; ++ j)
+          rax_free (pl->partitionData[i]->expVector[j]);
+      }
+     rax_free (pl->partitionData[i]->expVector);
+     rax_free (pl->partitionData[i]);
+   }
+  rax_free (pl->partitionData);
+  rax_free (pl);
+
+  *partitions = NULL;                                     /* the caller's pointer must not dangle */
+
+#if (defined(_USE_PTHREADS) || defined(_FINE_GRAIN_MPI))
+     rax_free (tr->y_ptr);
+#endif
+}
+
+/** @ingroup instanceLinkingGroup
+    @brief Correspondence check between partitions and alignment
+
+    This function checks whether the partitions to be created and the given
+    alignment correspond, that is, whether each site of the alignment is
+    assigned to exactly one partition.
+
+    A scheme is rejected if it contains no partition, if a region starts
+    before site 1, ends after the last site or has a stride smaller than 1,
+    if a site is covered more than once, or if a site is not covered at all.
+
+    @param parts
+      A list of partitions suggested by the caller
+
+    @param alignmentData
+      The multiple sequence alignment
+    
+    @return
+      Returns \a 1 in case of success, otherwise \a 0 (also when the
+      bookkeeping array cannot be allocated)
+*/
+int
+pllPartitionsValidate (pllQueue * parts, pllAlignmentData * alignmentData)
+{
+  char * used;
+  struct pllQueueItem * elm;
+  struct pllQueueItem * regionItem;
+  pllPartitionRegion * region;
+  pllPartitionInfo * pi;
+  int i;
+  int valid = 0;
+
+  /* check if the list contains at least one partition */
+  if (!pllQueueSize (parts))
+    return (0);
+
+  /* one flag per site, set once the site has been assigned a partition */
+  used = (char *) rax_calloc (alignmentData->sequenceLength, sizeof (char));
+  if (!used && alignmentData->sequenceLength > 0)
+    return (0);
+
+  /* traverse all partitions and their respective regions and mark sites */
+  for (elm = parts->head; elm; elm = elm->next)
+   {
+     pi = (pllPartitionInfo *) elm->item;
+
+     for (regionItem = pi->regionList->head; regionItem; regionItem = regionItem->next)
+      {
+        region = (pllPartitionRegion *) regionItem->item;
+
+        /* a stride below 1 would walk backwards out of used[] or never
+           advance, so it is rejected together with out-of-range bounds */
+        if (region->start < 1 ||
+            region->end > alignmentData->sequenceLength ||
+            region->stride < 1)
+          goto done;
+
+        for (i = region->start - 1; i < region->end; i += region->stride)
+         {
+           if (used[i])
+             goto done;          /* site already claimed by another region */
+           used[i] = 1;
+
+           /* leave before the increment could step past the end (and,
+              for huge strides, overflow the index) */
+           if (region->stride >= region->end - i)
+             break;
+         }
+      }
+   }
+
+  /* check whether all sites were assigned a partition */
+  for (i = 0; i < alignmentData->sequenceLength; ++ i)
+    if (used[i] != 1)
+      goto done;
+
+  valid = 1;
+
+done:
+  rax_free (used);
+  return (valid);
+}
+
+/** @brief Exchange two columns of a sequence buffer
+
+    Exchanges the characters at positions \a s1 and \a s2 in rows 1 through
+    \a nTaxa of \a buf. Row 0 is never accessed.
+
+    @param buf
+      Memory buffer holding \a nTaxa + 1 rows
+
+    @param s1
+      First site
+
+    @param s2
+      Second site
+
+    @param nTaxa
+      Number of taxa, i.e. size of site
+*/
+static __inline void
+swapSite (unsigned char ** buf, int s1, int s2, int nTaxa)
+{
+  int row = 1;
+
+  while (row <= nTaxa)
+  {
+    unsigned char * seq = buf[row];
+    int saved = seq[s1];
+
+    seq[s1] = seq[s2];
+    seq[s2] = saved;
+    ++ row;
+  }
+}
+
+/** @brief Constructs the list of partitions according to the proposed partition scheme
+    
+    A static function that constructs the \a partitionList structure according to
+    the partition scheme \b AFTER the sites have been repositioned in contiguous
+    regions according to the partition scheme.
+
+    @param parts
+      Queue of \a pllPartitionInfo items, one per partition to create
+
+    @param bounds
+      An array of the new starting and ending positions of sites in the
+      alignment for each partition. It holds one (lower, upper) couple per
+      element of \a parts. The upper bound is a site that is not included in
+      the partition
+
+    @return
+      The newly allocated partition list, or \b NULL if a partition has an
+      invalid data type or memory could not be allocated
+
+    @note Only \a perGeneBranchLengths and \a partitionData are initialized
+    in the returned list; the remaining fields are left for the caller to set.
+*/
+static partitionList * createPartitions (pllQueue * parts, int * bounds)
+{
+  partitionList * pl;
+  pllPartitionInfo * pi;
+  pInfo * part;
+  struct pllQueueItem * elm;
+  int i, j;
+  int nparts = 0;
+  int slots;
+
+  /* The pointer array keeps at least PLL_NUM_BRANCHES slots as before, but
+     grows when the scheme has more partitions than that, so that the loop
+     below can never write past its end. */
+  for (elm = parts->head; elm; elm = elm->next) ++ nparts;
+  slots = (nparts > PLL_NUM_BRANCHES) ? nparts : PLL_NUM_BRANCHES;
+
+  pl = (partitionList *) rax_malloc (sizeof (partitionList));
+  if (!pl) return (NULL);
+  
+  // TODO: fix this
+  pl->perGeneBranchLengths =      0;
+
+  pl->partitionData = (pInfo **) rax_calloc (slots, sizeof (pInfo *));
+  if (!pl->partitionData)
+   {
+     rax_free (pl);
+     return (NULL);
+   }
+  
+  for (i = 0, elm = parts->head; elm; elm = elm->next, ++ i)
+   {
+     pi = (pllPartitionInfo *) elm->item;
+
+     /* check whether the data type is valid, and in case it's not, deallocate
+        and return NULL */
+     if (pi->dataType <= PLL_MIN_MODEL || pi->dataType >= PLL_MAX_MODEL)
+       goto fail;
+
+     part = (pInfo *) rax_malloc (sizeof (pInfo));
+     if (!part)
+       goto fail;
+
+     /* set before the entry is published so that the failure path can
+        always release the name */
+     part->partitionName = NULL;
+     pl->partitionData[i] = part;
+
+     part->lower = bounds[i << 1];
+     part->upper = bounds[(i << 1) + 1];
+     part->width = bounds[(i << 1) + 1] - bounds[i << 1];
+     part->partitionWeight = 1.0 * (double) part->width;
+
+     //the two flags below are required to allow users to set 
+     //alpha parameters and substitution rates in the Q matrix 
+     //to fixed values. These parameters will then not be optimized 
+     //in the model parameter optimization functions
+     //by default we assume that all parameters are being optimized, i.e., 
+     //this has to be explicitly set by the user 
+     
+     part->optimizeAlphaParameter    = PLL_TRUE;
+     part->optimizeSubstitutionRates = PLL_TRUE;
+     part->dataType                  = pi->dataType;
+     part->protModels                = -1;
+     part->protUseEmpiricalFreqs     = -1;
+     part->maxTipStates              = pLengths[pi->dataType].undetermined + 1;
+     part->optimizeBaseFrequencies   = pi->optimizeBaseFrequencies;
+     part->ascBias                   = pi->ascBias;
+     part->parsVect                  = NULL;
+
+     if (pi->dataType == PLL_AA_DATA)
+      {
+        /* Take over the requested protein model first: the GTR test below
+           has to look at the model the caller asked for, not at the -1
+           placeholder assigned above. */
+        part->protUseEmpiricalFreqs = pi->protUseEmpiricalFreqs;
+        part->protModels            = pi->protModels;
+
+        if (part->protModels != PLL_GTR)
+          part->optimizeSubstitutionRates = PLL_FALSE;
+      }
+
+     part->states                = pLengths[part->dataType].states;
+     part->numberOfCategories    =        1;
+     part->autoProtModels        =        0;
+     part->nonGTR                =        PLL_FALSE;
+     part->partitionContribution =     -1.0;
+     part->partitionLH           =      0.0;
+     part->fracchange            =      1.0;
+     part->executeModel          =     PLL_TRUE;
+
+     part->partitionName = (char *) rax_malloc ((strlen (pi->partitionName) + 1) * sizeof (char));
+     if (!part->partitionName)
+       goto fail;
+     strcpy (part->partitionName, pi->partitionName);
+   }
+
+  return (pl);
+
+fail:
+  /* the array was zero-initialized, so unused slots are NULL and skipped */
+  for (j = 0; j < nparts; ++ j)
+   {
+     if (pl->partitionData[j])
+      {
+        rax_free (pl->partitionData[j]->partitionName);
+        rax_free (pl->partitionData[j]);
+      }
+   }
+  rax_free (pl->partitionData);
+  rax_free (pl);
+  return (NULL);
+}
+
+
+/** @ingroup instanceLinkingGroup
+    @brief Constructs the proposed partition scheme 
+
+    This function constructs the proposed partition scheme. It assumes
+    that the partition scheme is correct.
+
+    The columns of \a alignmentData->sequenceData are swapped in place so
+    that the sites of each partition become contiguous, following the order
+    in which partitions and regions appear in \a parts. The resulting
+    (lower, upper) column bounds of every partition are then handed to
+    createPartitions. The alignment stays reordered even if that call fails.
+
+    @note This function \b does \b not validate the partition scheme.
+    The user must manually call the ::pllPartitionsValidate function
+    for validation
+    
+    @param parts
+      A list of partitions suggested by the caller
+
+    @param alignmentData
+      The multiple sequence alignment; its site order is modified
+
+    @return
+      Returns a pointer to \a partitionList structure of partitions in case of success, \b NULL otherwise
+*/
+partitionList * pllPartitionsCommit (pllQueue * parts, pllAlignmentData * alignmentData)
+{
+  int * oi;                         /* bookkeeping for the in-place permutation; starts as the identity */
+  int i, j, dst;
+  struct pllQueueItem * elm;
+  struct pllQueueItem * regionItem;
+  pllPartitionRegion * region;
+  pllPartitionInfo * pi;
+  partitionList * pl;
+  int * newBounds;                  /* (lower, upper) column couple per partition */
+  int k, nparts;
+  int tmpvar;
+ 
+
+  dst = k = 0;                      /* dst: next free column; k: index of the current partition */
+  oi  = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
+  for (i = 0; i < alignmentData->sequenceLength; ++ i) oi[i] = i;
+
+  nparts = pllQueueSize (parts);
+  newBounds = (int *) rax_malloc (2 * nparts * sizeof (int));
+
+  /* reposition the sites in the alignment */
+  for (elm = parts->head; elm; elm = elm->next, ++ k)
+   {
+     pi = (pllPartitionInfo *) elm->item;
+     
+     newBounds[k << 1] = dst;   /* set the lower column for this partition */
+     for (regionItem = pi->regionList->head; regionItem; regionItem = regionItem->next)
+      {
+        region = (pllPartitionRegion *) regionItem->item;
+
+        for (i = region->start - 1; i < region->end && i < alignmentData->sequenceLength; i += region->stride)   /* region sites are 1-based; the walk is clipped to the alignment length */
+         {
+           if (oi[i] == i)          /* position i is still recorded as itself: swap it straight into dst */
+            {
+              swapSite (alignmentData->sequenceData, dst, i, alignmentData->sequenceCount);
+              tmpvar = oi[i];
+              oi[i] = oi[dst];
+              oi[dst++] = tmpvar;
+            }
+           else                     /* otherwise follow the recorded indices from i until the position j whose entry is i */
+            {
+              j = i;
+              while (oi[j] != i) j = oi[j];
+
+              swapSite (alignmentData->sequenceData, dst, j, alignmentData->sequenceCount);
+              tmpvar = oi[j];
+              oi[j] = oi[dst];
+              oi[dst++] = tmpvar;
+            }
+         }
+      }
+     newBounds[(k << 1) + 1] = dst;    /* set the upper limit (exclusive) for this partition */
+   }
+  if ((pl = createPartitions (parts, newBounds)))   /* numberOfPartitions and dirty are only set when creation succeeded */
+   { 
+     pl->numberOfPartitions = nparts;
+     pl->dirty = PLL_FALSE;
+   }
+  
+  rax_free (newBounds);
+  rax_free (oi);
+
+  return (pl);
+}
+
+/** @brief Copy one column from a buffer into another buffer
+
+    For rows 1 through \a nTaxa, writes the character at position \a from of
+    \a src to position \a to of \a dst. Row 0 of either buffer is never
+    accessed.
+
+    @param dst
+      Destination buffer
+
+    @param src
+      Source buffer
+
+    @param to
+      At which position in \a dst to copy the site to
+
+    @param from
+      Which site from \a src to copy
+
+    @param nTaxa
+      Number of taxa, i.e. size of site
+*/
+static __inline void
+copySite (unsigned char ** dst, unsigned char ** src, int to, int from, int nTaxa)
+{
+  int row = 1;
+
+  while (row <= nTaxa)
+   {
+     unsigned char * target = dst[row];
+
+     target[to] = src[row][from];
+     ++ row;
+   }
+}
+
+/** @brief Remove duplicate sites from alignment and update weights vector
+
+    Removes duplicate sites from the alignment given the partitions list
+    and updates the weight vector of the alignment and the boundaries
+    (upper, lower, width) for each partition.
+
+    For every partition the columns are transposed into NUL-terminated
+    strings (one per site), passed to pllssort1main, and each site that
+    equals its predecessor in the resulting array is dropped. The alignment
+    and the weight vector are then reallocated for the reduced length: a
+    kept site gets weight 1, and every dropped duplicate increments the
+    weight of the preceding kept site. The previous contents of
+    \a siteWeights are discarded.
+
+    @param alignmentData
+      The multiple sequence alignment; \a sequenceLength, \a sequenceData
+      and \a siteWeights are replaced
+    
+    @param pl
+      List of partitions; \a lower, \a upper and \a width of each partition
+      are rewritten to the new column positions
+
+*/
+void 
+pllAlignmentRemoveDups (pllAlignmentData * alignmentData, partitionList * pl)
+{
+  int i, j, k, p;
+  char *** sites;          /* sites[p][i]: site i of partition p as a string over all taxa */
+  void ** memptr;          /* backing storage for the strings of each partition */
+  int ** oi;               /* per partition: 1 = keep the site, 0 = duplicate of the previous one */
+  int dups = 0;
+  int lower;
+
+  /* allocate space for the transposed alignments (sites) for every partition */
+  sites  = (char ***) rax_malloc (pl->numberOfPartitions * sizeof (char **));
+  memptr = (void **)  rax_malloc (pl->numberOfPartitions * sizeof (void *));
+  oi     = (int **)   rax_malloc (pl->numberOfPartitions * sizeof (int *));
+
+  /* transpose the sites by partition */
+  for (p = 0; p < pl->numberOfPartitions; ++ p)
+   {
+     sites[p]  = (char **) rax_malloc (sizeof (char *) * pl->partitionData[p]->width);
+     memptr[p] = rax_malloc (sizeof (char) * (alignmentData->sequenceCount + 1) * pl->partitionData[p]->width);   /* one extra byte per site for the terminator */
+
+     for (i = 0; i < pl->partitionData[p]->width; ++ i)
+      {
+        sites[p][i] = (char *) ((char*)memptr[p] + sizeof (char) * i * (alignmentData->sequenceCount + 1));
+      }
+
+     for (i = 0; i < pl->partitionData[p]->width; ++ i)
+      {
+        for (j = 0; j < alignmentData->sequenceCount; ++ j)
+         {
+           sites[p][i][j] = alignmentData->sequenceData[j + 1][pl->partitionData[p]->lower + i]; 
+         }
+        sites[p][i][j] = 0;   /* terminate so that strcmp can compare whole sites */
+      }
+
+     oi[p] = pllssort1main (sites[p], pl->partitionData[p]->width);   /* presumably sorts sites[p] so that equal sites become adjacent - confirm against pllssort1main */
+
+     for (i = 0; i < pl->partitionData[p]->width; ++ i) oi[p][i] = 1;   /* the returned array is reused as keep flags; its returned contents are overwritten */
+
+     for (i = 1; i < pl->partitionData[p]->width; ++ i)   /* starts at 1, so the first site of a partition is always kept */
+      {
+        if (! strcmp (sites[p][i], sites[p][i - 1]))
+         {
+           ++ dups;
+           oi[p][i] = 0;
+         }
+      }
+   }
+
+  /* allocate memory for the alignment without duplicates*/
+  rax_free (alignmentData->sequenceData[1]);   /* NOTE(review): the new block is stored in sequenceData[0]; assumes rows 0 and 1 shared the old allocation - confirm */
+  rax_free (alignmentData->siteWeights);
+
+  alignmentData->sequenceLength = alignmentData->sequenceLength - dups;
+  alignmentData->sequenceData[0] = (unsigned char *) rax_malloc ((alignmentData->sequenceLength + 1) * sizeof (unsigned char) * alignmentData->sequenceCount);
+  for (i = 0; i < alignmentData->sequenceCount; ++ i)
+   {
+     alignmentData->sequenceData[i + 1] = (unsigned char *) (alignmentData->sequenceData[0] + sizeof (unsigned char) * i * (alignmentData->sequenceLength + 1));
+     alignmentData->sequenceData[i + 1][alignmentData->sequenceLength] = 0;
+   }
+
+  alignmentData->siteWeights    = (int *) rax_malloc ((alignmentData->sequenceLength) * sizeof (int));
+  alignmentData->siteWeights[0] = 1;
+
+  /* transpose sites back to alignment */
+  for (p = 0, k = 0; p < pl->numberOfPartitions; ++ p)   /* k: next free column of the compacted alignment */
+   {
+     lower = k;
+     for (i = 0; i < pl->partitionData[p]->width; ++ i)
+      {
+        if (!oi[p][i])
+         {
+           ++ alignmentData->siteWeights[k - 1];   /* duplicate: add its weight to the last kept site */
+         }
+        else
+         {
+           alignmentData->siteWeights[k] = 1;
+           for (j = 0; j < alignmentData->sequenceCount; ++ j)
+            {
+              alignmentData->sequenceData[j + 1][k] = sites[p][i][j];
+            }
+           ++ k;
+         }
+      }
+     pl->partitionData[p]->lower = lower;
+     pl->partitionData[p]->upper = k;
+     pl->partitionData[p]->width = k - lower;
+   }
+
+  /* deallocate storage for transposed alignment (sites) */
+  for (p = 0; p < pl->numberOfPartitions; ++ p)
+   {
+     rax_free (oi[p]);
+     rax_free (memptr[p]);
+     rax_free (sites[p]);
+   }
+  rax_free (oi);
+  rax_free (sites);
+  rax_free (memptr);
+}
+
+
+/** @brief Compute the empirical frequencies of a partition
+  
+    Compute the empirical frequencies of partition \a partition and store them in
+    \a pfreqs.
+
+    The estimate starts from uniform frequencies and is refined in eight
+    passes over the columns [\a partition->lower, \a partition->upper) of the
+    alignment. In each pass every character distributes its site weight over
+    the states it is compatible with, proportionally to the current estimate,
+    and the accumulated sums are normalized into the next estimate.
+
+    @param partition
+      The partition for which to compute empirical frequencies
+
+    @param alignmentData
+      The multiple sequence alignment. Its characters are translated with
+      the character map that belongs to the data type of \a partition
+
+    @param smoothFrequencies
+      Currently unused
+
+    @param bitMask
+      Table indexed by the translated character; bit \a l of an entry is set
+      if the character is compatible with state \a l
+
+    @param pfreqs
+      Array of size \a partition->states where the empirical frequencies for this partition are stored
+
+    @return
+      \b 1 on success, \b 0 if the data type of the partition is not
+      supported or the alignment contains a character whose translation is
+      negative
+*/
+static int genericBaseFrequenciesAlignment (pInfo * partition, 
+                                              pllAlignmentData * alignmentData, 
+                                              pllBoolean smoothFrequencies,
+                                              const unsigned int * bitMask, 
+                                              double * pfreqs)
+{
+  double 
+    wj, 
+    acc,
+    sumf[64],   
+    temp[64];
+ 
+  int     
+    i, 
+    j, 
+    k, 
+    l,
+    numFreqs,
+    lower,
+    upper;
+
+  unsigned char  *yptr;  
+  const char * map;
+
+  (void) smoothFrequencies;
+  
+  switch (partition->dataType)
+   {
+     case PLL_BINARY_DATA:
+       map = PLL_MAP_BIN;
+       break;                 /* without this, binary data fell through to the nucleotide map */
+     case PLL_DNA_DATA:
+       map = PLL_MAP_NT;
+       break;
+     case PLL_AA_DATA:
+       map = PLL_MAP_AA;
+       break;
+     default:
+       assert(0);
+       return (0);            /* never continue with an unset map when asserts are compiled out */
+   }
+
+  numFreqs = partition->states;
+  lower    = partition->lower;
+  upper    = partition->upper;
+
+  for(l = 0; l < numFreqs; l++)     
+    pfreqs[l] = 1.0 / ((double)numFreqs);
+          
+  for (k = 1; k <= 8; k++) 
+    {                                                   
+      for(l = 0; l < numFreqs; l++)
+        sumf[l] = 0.0;
+              
+      for (i = 1; i <= alignmentData->sequenceCount; i++) 
+        {                
+          yptr = alignmentData->sequenceData[i];
+          
+          for(j = lower; j < upper; j++) 
+            {
+              /* a negative translation marks a character that is not valid
+                 for this data type */
+              if (map[yptr[j]] < 0) return (0);
+              unsigned int code = bitMask[(unsigned char)map[yptr[j]]];
+              assert(code >= 1);
+              
+              /* current estimate restricted to the compatible states */
+              for(l = 0; l < numFreqs; l++)
+                {
+                  if((code >> l) & 1)
+                    temp[l] = pfreqs[l];
+                  else
+                    temp[l] = 0.0;
+                }                             
+              
+              for(l = 0, acc = 0.0; l < numFreqs; l++)
+                {
+                  if(temp[l] != 0.0)
+                    acc += temp[l];
+                }
+              
+              /* share the site weight among the compatible states */
+              wj = alignmentData->siteWeights[j] / acc;
+              
+              for(l = 0; l < numFreqs; l++)
+                {
+                  if(temp[l] != 0.0)                
+                    sumf[l] += wj * temp[l];                                                                                               
+                }
+            }
+        }                     
+      
+      for(l = 0, acc = 0.0; l < numFreqs; l++)
+        {
+          if(sumf[l] != 0.0)
+            acc += sumf[l];
+        }
+              
+      /* normalize the sums into the estimate used by the next pass */
+      for(l = 0; l < numFreqs; l++)
+        pfreqs[l] = sumf[l] / acc;           
+    }
+
+  /* TODO: frequency smoothing and the zero-frequency check are not
+     implemented here; smoothFrequencies is ignored. */
+  return (1);
+  
+}
+
+/* Computes the empirical state frequencies of `partition` from the encoded
+   tip sequences stored in the instance (tr->yVector, weighted by
+   tr->aliaswgt) and writes them to `pfreqs`, which must hold
+   partition->states entries. Starting from uniform frequencies, eight
+   refinement passes are made; in each pass every character distributes its
+   site weight over the states enabled in its bitMask entry, proportionally
+   to the current estimate. `smoothFrequencies` is not used. */
+static void  genericBaseFrequenciesInstance (pInfo * partition, 
+                                             pllInstance * tr, 
+                                             pllBoolean smoothFrequencies,
+                                             const unsigned int * bitMask, 
+                                             double * pfreqs)
+{
+  double sums[64];     /* weight collected per state during one pass   */
+  double share[64];    /* current estimate masked by the character     */
+  double weight;
+  double total;
+
+  const int nStates = partition->states;
+  const int first   = partition->lower;
+  const int last    = partition->upper;
+
+  int round;
+  int tip;
+  int site;
+  int s;
+
+  /* start from the uniform distribution */
+  for (s = 0; s < nStates; s++)
+    pfreqs[s] = 1.0 / ((double)nStates);
+
+  for (round = 0; round < 8; round++)
+    {
+      for (s = 0; s < nStates; s++)
+        sums[s] = 0.0;
+
+      for (tip = 1; tip <= tr->mxtips; tip++)
+        {
+          unsigned char *seq = tr->yVector[tip];
+
+          for (site = first; site < last; site++)
+            {
+              unsigned int code = bitMask[seq[site]];
+              assert(code >= 1);
+
+              /* mask the estimate and add up what remains */
+              total = 0.0;
+              for (s = 0; s < nStates; s++)
+                {
+                  share[s] = ((code >> s) & 1) ? pfreqs[s] : 0.0;
+                  if (share[s] != 0.0)
+                    total += share[s];
+                }
+
+              weight = tr->aliaswgt[site] / total;
+
+              for (s = 0; s < nStates; s++)
+                {
+                  if (share[s] != 0.0)
+                    sums[s] += weight * share[s];
+                }
+            }
+        }
+
+      total = 0.0;
+      for (s = 0; s < nStates; s++)
+        {
+          if (sums[s] != 0.0)
+            total += sums[s];
+        }
+
+      /* the normalized sums are the estimate for the next pass */
+      for (s = 0; s < nStates; s++)
+        pfreqs[s] = sums[s] / total;
+    }
+
+  /* TODO: frequency smoothing and the zero-frequency check are not
+     implemented here. */
+}
+
+/**  Compute the empirical base frequencies of an alignment
+
+     Computes the empirical base frequencies per partition of an alignment \a alignmentData
+     given the partition structure \a pl.
+
+     @param alignmentData The alignment structure for which to compute the empirical base frequencies
+     @param pl            List of partitions
+     @return Returns a list of frequencies for each partition: an array of
+             \a pl->numberOfPartitions arrays, each holding as many values as
+             the partition has states. Returns \b NULL if the frequencies of
+             a partition cannot be computed, if memory cannot be allocated,
+             or if a partition has an unknown data type (\a errno is then
+             set to \a PLL_UNKNOWN_MOLECULAR_DATA_TYPE). Nothing is left
+             allocated when \b NULL is returned.
+*/
+double ** pllBaseFrequenciesAlignment (pllAlignmentData * alignmentData, partitionList * pl)
+{
+  int
+    i,
+    model;
+
+  double 
+    **freqs = (double **) rax_malloc (pl->numberOfPartitions * sizeof (double *));
+
+  if (!freqs)
+    return (double **)NULL;
+
+  for (model = 0; model < pl->numberOfPartitions; ++ model)
+    {
+      int ok = 0;
+
+      freqs[model] = (double *) rax_malloc (pl->partitionData[model]->states * sizeof (double));
+      
+      if (freqs[model])
+        {
+          switch  (pl->partitionData[model]->dataType)
+            {
+            case PLL_BINARY_DATA:
+            case PLL_AA_DATA:
+            case PLL_DNA_DATA:
+              ok = genericBaseFrequenciesAlignment (pl->partitionData[model], 
+                                                    alignmentData, 
+                                                    pLengths[pl->partitionData[model]->dataType].smoothFrequencies,
+                                                    pLengths[pl->partitionData[model]->dataType].bitVector,
+                                                    freqs[model]
+                                                   );
+              break;
+            default:
+              errno = PLL_UNKNOWN_MOLECULAR_DATA_TYPE;
+              break;
+            }
+        }
+
+      if (!ok)
+        {
+          /* release everything allocated so far, the current row included,
+             on every failure path */
+          for (i = 0; i <= model; ++ i) rax_free (freqs[i]);
+          rax_free (freqs);
+          return (double **)NULL;
+        }
+    }
+  
+  return (freqs);
+}
+
+/**  Empirical base frequencies of the alignment stored in an instance
+
+     For each partition of \a pl, allocates an array with one entry per
+     state and fills it with the empirical frequencies computed from the
+     alignment incorporated in \a tr.
+
+     @param tr The instance for which to compute the empirical base frequencies
+     @param pl List of partitions
+     @return Returns a list of frequencies for each partition, or \b NULL
+             (with \a errno set to \a PLL_UNKNOWN_MOLECULAR_DATA_TYPE and
+             all arrays released) if a partition has an unknown data type
+*/
+double ** pllBaseFrequenciesInstance (pllInstance * tr, partitionList * pl)
+{
+  int part;
+  int k;
+  double ** result = (double **) rax_malloc (pl->numberOfPartitions * sizeof (double *));
+
+  for (part = 0; part < pl->numberOfPartitions; ++ part)
+    {
+      pInfo * info = pl->partitionData[part];
+      int type;
+
+      result[part] = (double *) rax_malloc (info->states * sizeof (double));
+      type = info->dataType;
+
+      /* only amino acid, nucleotide and binary data are supported */
+      if (type != PLL_AA_DATA && type != PLL_DNA_DATA && type != PLL_BINARY_DATA)
+        {
+          errno = PLL_UNKNOWN_MOLECULAR_DATA_TYPE;
+          for (k = 0; k <= part; ++ k) rax_free (result[k]);
+          rax_free (result);
+          return (double **)NULL;
+        }
+
+      genericBaseFrequenciesInstance (info,
+                                      tr,
+                                      pLengths[type].smoothFrequencies,
+                                      pLengths[type].bitVector,
+                                      result[part]);
+    }
+
+  return (result);
+}
+
+/** @brief Release a list of empirical frequencies
+
+    Frees the \a models frequency arrays of \a *empiricalFrequencies and the
+    list itself, then sets \a *empiricalFrequencies to \b NULL. Nothing is
+    done if \a empiricalFrequencies or \a *empiricalFrequencies is \b NULL,
+    so the \b NULL returned by a failed frequency computation can be passed
+    safely.
+
+    @param empiricalFrequencies
+      Address of the list of per-partition frequency arrays
+
+    @param models
+      Number of arrays in the list
+*/
+void
+pllEmpiricalFrequenciesDestroy (double *** empiricalFrequencies, int models)
+{
+  int i;
+
+  if (!empiricalFrequencies || !*empiricalFrequencies)
+    return;
+
+  for (i = 0; i < models; ++ i)
+   {
+     rax_free ((*empiricalFrequencies)[i]);
+   }
+  rax_free (*empiricalFrequencies);
+
+  *empiricalFrequencies = NULL;
+}
+
+/** @brief Load an alignment into a PLL instance
+
+    Copies the site weights of \a alignmentData into \a tr->aliaswgt,
+    allocates the per-site arrays of the instance (\a rateCategory,
+    \a patrat, \a patratStored, \a lhs) and the sequence storage
+    \a tr->yVector, copies every sequence to the row of the tip whose name
+    matches its label, calls pllBaseSubstitute and finally fills
+    \a tr->tipNames from the name hash.
+
+    @param tr
+      PLL instance; \a tr->mxtips must equal the number of sequences
+
+    @param alignmentData
+      The multiple sequence alignment
+
+    @param partitions
+      List of partitions, passed on to pllBaseSubstitute
+
+    @return
+      \b 1 on success. \b 0 if the number of sequences differs from
+      \a tr->mxtips or if a sequence label is not found in \a tr->nameHash;
+      in the latter case everything allocated by this call is released and
+      the corresponding pointers of \a tr are reset to \b NULL
+*/
+int pllLoadAlignment (pllInstance * tr, pllAlignmentData * alignmentData, partitionList * partitions)
+{
+  int i;
+  nodeptr node;
+  pllHashItem * hItem;
+
+  if (tr->mxtips != alignmentData->sequenceCount) return (0);
+
+  tr->aliaswgt = (int *) rax_malloc (alignmentData->sequenceLength * sizeof (int));
+  memcpy (tr->aliaswgt, alignmentData->siteWeights, alignmentData->sequenceLength * sizeof (int));
+
+  tr->originalCrunchedLength = alignmentData->sequenceLength;
+  tr->rateCategory           = (int *)   rax_calloc (tr->originalCrunchedLength, sizeof (int));
+  tr->patrat                 = (double*) rax_malloc((size_t)tr->originalCrunchedLength * sizeof(double));
+  tr->patratStored           = (double*) rax_malloc((size_t)tr->originalCrunchedLength * sizeof(double));
+  tr->lhs                    = (double*) rax_malloc((size_t)tr->originalCrunchedLength * sizeof(double));
+
+  /* allocate memory for the alignment: one contiguous block stored in row 0,
+     with rows 1..sequenceCount pointing into it */
+  tr->yVector    = (unsigned char **) rax_malloc ((alignmentData->sequenceCount + 1) * sizeof (unsigned char *));
+
+  tr->yVector[0] = (unsigned char *)  rax_malloc (sizeof (unsigned char) * (alignmentData->sequenceLength + 1) * alignmentData->sequenceCount);
+  for (i = 1; i <= alignmentData->sequenceCount; ++ i) 
+   {                     
+     tr->yVector[i] = (unsigned char *) (tr->yVector[0] + (i - 1) * (alignmentData->sequenceLength + 1) * sizeof (unsigned char));
+     tr->yVector[i][alignmentData->sequenceLength] = 0;
+   }                     
+                         
+  /* place sequences to tips */                              
+  for (i = 1; i <= alignmentData->sequenceCount; ++ i)                      
+   {                     
+     if (!pllHashSearch (tr->nameHash, alignmentData->sequenceLabels[i],(void **)&node)) 
+      {
+        /* Undo every allocation made above, the weight copy included, and
+           clear the pointers so that the instance holds no dangling
+           references after the failure. */
+        rax_free (tr->aliaswgt);
+        rax_free (tr->rateCategory);
+        rax_free (tr->patrat);
+        rax_free (tr->patratStored);
+        rax_free (tr->lhs);
+        rax_free (tr->yVector[0]);
+        rax_free (tr->yVector);
+
+        tr->aliaswgt     = NULL;
+        tr->rateCategory = NULL;
+        tr->patrat       = NULL;
+        tr->patratStored = NULL;
+        tr->lhs          = NULL;
+        tr->yVector      = NULL;
+        return (0);
+      }
+     memcpy (tr->yVector[node->number], alignmentData->sequenceData[i], alignmentData->sequenceLength);
+   }
+
+  /* Do the base substitution (from A,C,G....  ->   0,1,2,3....)*/
+  pllBaseSubstitute (tr, partitions);
+
+  /* Populate tipNames; the strings stay owned by the hash table */
+  tr->tipNames = (char **) rax_calloc(tr->mxtips + 1, sizeof (char *));
+  for (i = 0; (unsigned int)i < tr->nameHash->size; ++ i)
+   {
+     hItem = tr->nameHash->Items[i];
+
+     for (; hItem; hItem = hItem->next)
+      {
+        tr->tipNames[((nodeptr)hItem->data)->number] = hItem->str; 
+      }
+   }
+
+  return (1);
+}
+
+/** @brief Create a PLL instance
+
+    Allocates a zero-initialized instance and fills it from the attributes
+    in \a attr. The likelihood epsilon is set to 0.01, the use of the median
+    is disabled, and the maximum number of rate categories is 4 for
+    \a PLL_GAMMA and 25 for \a PLL_CAT. With \a _FINE_GRAIN_MPI, pllLockMPI
+    is called on the new instance before returning.
+
+    @param attr
+      Attributes of the instance to create
+
+    @return
+      The new instance, or \b NULL if \a attr is \b NULL, if its rate
+      heterogeneity model is neither \a PLL_GAMMA nor \a PLL_CAT, or if the
+      instance cannot be allocated
+*/
+pllInstance * pllCreateInstance (pllInstanceAttr * attr)
+{
+  pllInstance * tr;
+
+  if (!attr) return NULL;
+
+  if (attr->rateHetModel != PLL_GAMMA && attr->rateHetModel != PLL_CAT) return NULL;
+
+  tr = (pllInstance *) rax_calloc (1, sizeof (pllInstance));
+  if (!tr) return NULL;
+
+  tr->threadID          = 0;
+  tr->rateHetModel      = attr->rateHetModel;
+  tr->fastScaling       = attr->fastScaling;
+  tr->saveMemory        = attr->saveMemory;
+  tr->useRecom          = attr->useRecom;
+  tr->likelihoodEpsilon = 0.01;
+  
+  tr->randomNumberSeed = attr->randomNumberSeed;
+  tr->parsimonyScore   = NULL;
+
+  /* remove it from the library */
+  tr->useMedian         = PLL_FALSE;
+
+  tr->maxCategories     = (attr->rateHetModel == PLL_GAMMA) ? 4 : 25;
+
+  tr->numberOfThreads   = attr->numberOfThreads;
+  tr->rearrangeHistory  = NULL;
+
+  /* Lock the slave processors at this point */
+#ifdef _FINE_GRAIN_MPI
+  pllLockMPI (tr);
+#endif
+
+  return (tr);
+}
+
+/** @brief Set up an empty PLL tree structure for a given number of tips
+    
+    Allocates the node pool, the node pointer table, the name list, the
+    newick string buffers, the name hash table and the first traversal
+    descriptor of \a tr, and fills them with default values.
+
+    @param tr    PLL instance to initialize
+    @param tips  Number of tips (taxa) of the tree
+
+    @todo
+      STILL NOT FINISHED
+*/
+static void pllTreeInitDefaults (pllInstance * tr, int tips)
+{
+  const int innerCount = tips - 1;
+  nodeptr pool, cur, prev, first;
+  int n, slot, b;
+
+  /* TODO: make a proper static setupTree function */
+
+  tr->mxtips    = tips;
+  tr->bigCutoff = PLL_FALSE;
+
+  /* buffers for tree strings */
+  tr->treeStringLength = tr->mxtips * (PLL_NMLNGTH + 128) + 256 + tr->mxtips * 2;
+  tr->tree_string      = (char *) rax_calloc ((size_t)tr->treeStringLength, sizeof(char));
+  tr->tree0            = (char *) rax_calloc ((size_t)tr->treeStringLength, sizeof(char));
+  tr->tree1            = (char *) rax_calloc ((size_t)tr->treeStringLength, sizeof(char));
+  tr->constraintVector = (int *)  rax_malloc ((2 * tr->mxtips) * sizeof(int));
+
+  /* one node record per tip plus three per inner node */
+  pool = (nodeptr) rax_malloc ((tips + 3 * innerCount) * sizeof (node));
+  assert (pool);
+  tr->nodeBaseAddress = pool;
+
+  tr->nameList = (char **)   rax_malloc ((tips + 1) * sizeof (char *));
+  tr->nodep    = (nodeptr *) rax_malloc ((2 * tips) * sizeof (nodeptr));
+  assert (tr->nameList && tr->nodep);
+
+  tr->autoProteinSelectionType = PLL_AUTO_ML;
+
+  tr->nodep[0] = NULL;
+
+  /* TODO: The line below was commented... why? */
+  tr->fracchange    = -1;
+  tr->rawFracchange = -1;
+
+  /* tips: a single record each, whose next pointer refers to itself */
+  for (n = 1; n <= tips; ++ n, ++ pool)
+   {
+     pool->x      = 0;
+     pool->xBips  = 0;
+     pool->number = n;
+     pool->next   = pool;
+     pool->back   = NULL;
+     pool->bInf   = NULL;
+     tr->nodep[n] = pool;
+   }
+
+  /* inner nodes: three records linked into a ring; only the first record
+     of each ring starts with the x and xBips flags set */
+  for (n = tips + 1; n <= tips + innerCount; ++ n)
+   {
+     first = pool;
+     prev  = NULL;
+
+     for (slot = 0; slot < 3; ++ slot, ++ pool)
+      {
+        cur = pool;
+
+        cur->xBips  = (slot == 0) ? 1 : 0;
+        cur->x      = (slot == 0) ? 1 : 0;
+        cur->number = n;
+        cur->next   = prev;
+        cur->bInf   = NULL;
+        cur->back   = NULL;
+        cur->hash   = 0;
+
+        prev = cur;
+      }
+
+     /* prev is now the third record; close the ring and publish it */
+     first->next  = prev;
+     tr->nodep[n] = prev;
+   }
+
+  tr->likelihood = PLL_UNLIKELY;
+  tr->start      = NULL;
+  tr->ntips      = 0;
+  tr->nextnode   = 0;
+
+  for (b = 0; b < PLL_NUM_BRANCHES; ++ b)
+    tr->partitionSmoothed[b] = PLL_FALSE;
+
+  tr->bitVectors = NULL;
+  tr->vLength    = 0;
+
+  /* TODO: Fix hash type */
+  tr->nameHash = pllHashInit (10 * tr->mxtips);
+
+  /* TODO: do these options really fit here or should they be put elsewhere? */
+  tr->td[0].count           = 0;
+  tr->td[0].ti              = (traversalInfo *) rax_malloc (sizeof(traversalInfo) * (size_t)tr->mxtips);
+  tr->td[0].parameterValues = (double *)        rax_malloc (sizeof(double) * (size_t)PLL_NUM_BRANCHES);
+  tr->td[0].executeModel    = (pllBoolean *)    rax_malloc (sizeof(pllBoolean) * (size_t)PLL_NUM_BRANCHES);
+
+  for (b = 0; b < PLL_NUM_BRANCHES; ++ b)
+    tr->td[0].executeModel[b] = PLL_TRUE;
+}
+
+
+/** @brief Test whether all leaves of a parsed tree are known taxa
+   
+   Walks the node list of the parsed tree \a nTree and looks up the name of
+   every leaf (node with rank 0) in the name hash table of \a pInst.
+
+   @param pInst
+     PLL instance
+
+   @param nTree
+     Parsed newick tree structure
+
+   @return
+     \b PLL_TRUE if every leaf name is found in the hash table, \b PLL_FALSE
+     if a name is missing or if the instance has no name hash table
+*/
+static int
+checkTreeInclusion (pllInstance * pInst, pllNewickTree * nTree)
+{
+  pllStack * it;
+  pllNewickNodeInfo * info;
+  void * ignored;
+
+  if (pInst->nameHash == NULL) return (PLL_FALSE);
+
+  for (it = nTree->tree; it != NULL; it = it->next)
+   {
+     info = (pllNewickNodeInfo *) it->item;
+
+     /* only leaves carry taxon names that need to be checked */
+     if (info->rank) continue;
+
+     if (!pllHashSearch (pInst->nameHash, info->name, &ignored))
+       return (PLL_FALSE);
+   }
+
+  return (PLL_TRUE);
+}
+
+/** @brief Rescale the branch between a node and its back pointer
+
+    For each of the \b PLL_NUM_BRANCHES entries, recomputes the branch value as
+    exp (log (z) * old_fracchange / new_fracchange), clamps it to the range
+    [\b PLL_ZMIN, \b PLL_ZMAX] and stores it at both ends of the branch.
+
+    @param p               Node at one end of the branch
+    @param old_fracchange  Scaling factor the current values are based on
+    @param new_fracchange  Scaling factor to convert to
+*/
+static void
+updateBranchLength (nodeptr p, double old_fracchange, double new_fracchange)
+{
+  int k;
+
+  for (k = 0; k < PLL_NUM_BRANCHES; ++ k)
+   {
+     double scaled = exp ((log (p->z[k]) * old_fracchange) / new_fracchange);
+
+     if (scaled < PLL_ZMIN) scaled = PLL_ZMIN;
+     if (scaled > PLL_ZMAX) scaled = PLL_ZMAX;
+
+     /* both directions of the branch hold the same value */
+     p->back->z[k] = scaled;
+     p->z[k]       = scaled;
+   }
+}
+
+/** @brief Rescale the branch at a node and all branches below it
+
+    Updates the branch at \a p and, unless \a p is a tip, descends into the
+    two subtrees reached through the other two records of the inner node.
+
+    @param p               Node to start from
+    @param tips            Number of tips, passed to isTip()
+    @param old_fracchange  Scaling factor the current values are based on
+    @param new_fracchange  Scaling factor to convert to
+*/
+static void
+updateAllBranchLengthsRecursive (nodeptr p, int tips, double old_fracchange, double new_fracchange)
+{
+  updateBranchLength (p, old_fracchange, new_fracchange);
+
+  /* a tip has no subtrees to descend into */
+  if (isTip (p->number, tips)) return;
+
+  updateAllBranchLengthsRecursive (p->next->back,       tips, old_fracchange, new_fracchange);
+  updateAllBranchLengthsRecursive (p->next->next->back, tips, old_fracchange, new_fracchange);
+}
+
+/** @brief Rescale all branches of the tree
+
+    Starts at the node behind \a tr->start, which is asserted to be a tip, and
+    rescales every branch reachable from there.
+
+    @param tr              PLL instance
+    @param old_fracchange  Scaling factor the current values are based on
+    @param new_fracchange  Scaling factor to convert to
+*/
+static void
+updateAllBranchLengths (pllInstance * tr, double old_fracchange, double new_fracchange)
+{
+  nodeptr startTip = tr->start;
+
+  assert (isTip (startTip->number, tr->mxtips));
+
+  updateAllBranchLengthsRecursive (startTip->back, tr->mxtips, old_fracchange, new_fracchange);
+}
+
+
+/** @brief Relink the taxa
+    
+    Relink the taxa by performing a preorder traversal of the unrooted binary tree.
+    We assume that the tree is rooted such that the root is the only node of
+    out-degree 3 and in-degree 0, while all the other inner nodes have in-degree
+    1 and out-degree 2. Finally, the leaves have in-degree 1 and out-degree 0.
+
+    If \a taxaExist is not set, the tree structure of \a pInst is first
+    (re)initialized with pllTreeInitDefaults() and every leaf name is copied
+    into the name list and added to the name hash table. Otherwise the tip
+    for every leaf is looked up by name in the existing hash table.
+
+    @param pInst
+      PLL instance
+
+    @param nTree
+      Parsed newick tree structure
+
+    @param taxaExist
+      Is the set of taxa of \a nTree a subset of the taxa of the current tree
+
+    @return
+      \b PLL_TRUE on success. \b PLL_FALSE if \a taxaExist is set but a leaf
+      name is missing from the name hash table (with assertions enabled this
+      case aborts instead).
+*/
+static int
+linkTaxa (pllInstance * pInst, pllNewickTree * nTree, int taxaExist)
+{
+  nodeptr 
+    parent,
+    child = NULL;
+  pllStack 
+    * nodeStack = NULL,
+    * current;
+  int
+    i,
+    j,
+    found,
+    inner = nTree->tips + 1,
+    leaf  = 1;
+  double z;
+  pllNewickNodeInfo * nodeInfo;
+
+  if (!taxaExist) pllTreeInitDefaults (pInst, nTree->tips);
+
+  /* Place the ternary root node 3 times on the stack such that later on
+     three nodes use it as their parent */
+  current = nTree->tree;
+  for (parent = pInst->nodep[inner], i  = 0; i < 3; ++ i, parent = parent->next)
+    pllStackPush (&nodeStack, parent);
+  ++ inner;
+
+  /* now traverse the rest of the nodes */
+  for (current = current->next; current; current = current->next)
+   {
+     parent   = (nodeptr) pllStackPop (&nodeStack);
+     nodeInfo = (pllNewickNodeInfo *) current->item;
+
+     /* if inner node place it twice on the stack (out-degree 2) */
+     if (nodeInfo->rank)
+      {
+        child = pInst->nodep[inner ++];
+        pllStackPush (&nodeStack, child->next);
+        pllStackPush (&nodeStack, child->next->next);
+      }
+     else /* check if taxon already exists, i.e. we loaded another tree topology */
+      {
+        if (taxaExist)
+         {
+           /* The lookup must not live inside assert(): with NDEBUG the
+              assert expression is not evaluated and child would never
+              be assigned. */
+           found = pllHashSearch (pInst->nameHash, nodeInfo->name, (void **) &child);
+           assert (found);
+           if (!found)
+            {
+              pllStackClear (&nodeStack);
+              return PLL_FALSE;
+            }
+         }
+        else
+         {
+           child = pInst->nodep[leaf];
+           pInst->nameList[leaf] = strdup (nodeInfo->name);
+           pllHashAdd (pInst->nameHash, pllHashString(pInst->nameList[leaf], pInst->nameHash->size), pInst->nameList[leaf], (void *) (pInst->nodep[leaf]));
+           ++ leaf;
+         }
+      }
+     assert (parent);
+     /* link parent and child */
+     parent->back = child;
+     child->back  = parent;
+
+     if (!taxaExist) pInst->fracchange = 1;
+
+     /* set the branch length: z = exp (-length / fracchange), clamped
+        to [PLL_ZMIN, PLL_ZMAX] */
+     z = exp ((-1 * atof (nodeInfo->branch)) / pInst->fracchange);
+     if (z < PLL_ZMIN) z = PLL_ZMIN;
+     if (z > PLL_ZMAX) z = PLL_ZMAX;
+     for (j = 0; j < PLL_NUM_BRANCHES; ++ j)
+       parent->z[j] = child->z[j] = z;
+   }
+  pllStackClear (&nodeStack);
+
+  return PLL_TRUE;
+}
+
+/** @brief Get the instantaneous rate matrix
+    
+    Computes the 4x4 instantaneous rate matrix (Q) for partition \a model
+    of the partition list \a pr and stores it row by row in \a outBuffer.
+    Each off-diagonal entry is the product of a substitution rate and the
+    frequency of the target state, each diagonal entry is the negated sum of
+    the other entries of its row, and the whole matrix is divided by the
+    frequency-weighted mean of the negated diagonal.
+
+    Note that the six substitution rates of the partition are divided by
+    the last rate in place before the matrix is built.
+    
+    @param pr        List of partitions
+    @param model     Index of partition to use
+    @param outBuffer Where to store the 16 entries of the rate matrix 
+
+    @todo Currently, the Q matrix can be only obtained for DNA GTR data.
+
+    @return Returns \b PLL_TRUE in case of success, otherwise \b PLL_FALSE
+*/
+int pllGetInstRateMatrix (partitionList * pr, int model, double * outBuffer)
+{
+  /* for every row: the three off-diagonal columns in ascending order and
+     the index of the substitution rate used for each of them */
+  static const int column[4][3] = { {1, 2, 3}, {0, 2, 3}, {0, 1, 3}, {0, 1, 2} };
+  static const int rateOf[4][3] = { {0, 1, 2}, {0, 3, 4}, {1, 3, 5}, {2, 4, 5} };
+
+  double * rates;
+  double * pi;
+  double * row;
+  double   scale = 0;
+  int      r, k;
+
+  if (pr->partitionData[model]->dataType != PLL_DNA_DATA) return (PLL_FALSE);
+
+  rates = pr->partitionData[model]->substRates;
+  pi    = pr->partitionData[model]->frequencies;
+
+  /* normalize substitution rates */
+  for (k = 0; k < 6; ++ k) rates[k] /= rates[5];
+
+  for (r = 0; r < 4; ++ r)
+   {
+     row = outBuffer + r * 4;
+
+     for (k = 0; k < 3; ++ k)
+       row[column[r][k]] = (rates[rateOf[r][k]] * pi[column[r][k]]);
+
+     /* the diagonal makes the row sum to zero */
+     row[r] = -(row[column[r][0]] + row[column[r][1]] + row[column[r][2]]);
+   }
+
+  for (r = 0; r < 4; ++ r)  scale += pi[r] * (-outBuffer[r * 4 + r]);
+  for (k = 0; k < 16; ++ k) outBuffer[k] /= scale;
+
+  return (PLL_TRUE);
+}
+
+/** @ingroup instanceLinkingGroup
+    @brief Initializes the PLL tree topology according to a parsed newick tree
+
+    Links the nodes of the instance according to the parsed newick tree and
+    sets the start node of the tree to the first tip. If the instance already
+    has a name hash table that contains every leaf name of \a newick, the
+    existing taxa are reused; otherwise the tree structure is initialized
+    from scratch.
+
+    @param tr
+      The PLL instance
+
+    @param newick
+      The \a pllNewickTree wrapper structure that contains the parsed newick tree
+
+    @param useDefaultz
+      If set to \b PLL_TRUE then resetBranches() is called on the instance
+      after linking.
+*/
+void
+pllTreeInitTopologyNewick (pllInstance * tr, pllNewickTree * newick, int useDefaultz)
+{
+  /* reuse the current taxa only if all leaves of the new tree are known */
+  const int taxaKnown = tr->nameHash && checkTreeInclusion (tr, newick);
+
+  linkTaxa (tr, newick, taxaKnown);
+
+  tr->start = tr->nodep[1];
+
+  if (useDefaultz == PLL_TRUE)
+   {
+     resetBranches (tr);
+   }
+}
+
+/** @brief Get the oriented pointer of a round-about node
+
+    Returns the record of the round-about node of \a p whose \a x flag is set.
+    The records are examined in the order \a p, \a p->next; if neither has
+    the flag set, \a p->next->next is returned without checking it. If \a p is
+    a tip (its number does not exceed \a mxtips), \a p itself is returned.
+
+    @param pInst  PLL instance
+    @param p      One of the three pointers of a round-about node
+
+    @return  Returns the pointer that has the orientation
+*/
+nodeptr pllGetOrientedNodePointer (pllInstance * pInst, nodeptr p)
+{
+  nodeptr second;
+
+  if (p->number <= pInst->mxtips) return p;   /* tip */
+  if (p->x) return p;
+
+  second = p->next;
+
+  return second->x ? second : second->next;
+}
+
+
+//void
+//pllTreeInitTopologyNewick (pllInstance * tr, pllNewickTree * nt, int useDefaultz)
+//{
+//  pllStack * nodeStack = NULL;
+//  pllStack * head;
+//  pllNewickNodeInfo * item;
+//  int i, j, k;
+//  
+///*
+//  for (i = 0; i < partitions->numberOfPartitions; ++ i)
+//   {
+//     partitions->partitionData[i] = (pInfo *) rax_malloc (sizeof (pInfo));
+//     partitions->partitionData[i]->partitionContribution = -1.0;
+//     partitions->partitionData[i]->partitionLH           =  0.0;
+//     partitions->partitionData[i]->fracchange            =  1.0;
+//   }
+//*/
+// 
+//
+// if (tr->nameHash)
+//  {
+//    if (checkTreeInclusion (tr, nt))
+//     {
+//       printf ("It is a subset\n");
+//     }
+//    else
+//     {
+//       printf ("It is not a subset\n");
+//     }
+//  }
+//  
+//  pllTreeInitDefaults (tr, nt->tips);
+//
+//  i = nt->tips + 1;
+//  j = 1;
+//  nodeptr v;
+//  
+//  
+//  for (head = nt->tree; head; head = head->next)
+//  {
+//    item = (pllNewickNodeInfo *) head->item;
+//    if (!nodeStack)
+//     {
+//       pllStackPush (&nodeStack, tr->nodep[i]);
+//       pllStackPush (&nodeStack, tr->nodep[i]->next);
+//       pllStackPush (&nodeStack, tr->nodep[i]->next->next);
+//       ++i;
+//     }
+//    else
+//     {
+//       v = (nodeptr) pllStackPop (&nodeStack);
+//       if (item->rank)  /* internal node */
+//        {
+//          v->back           = tr->nodep[i];
+//          tr->nodep[i]->back = v; //t->nodep[v->number]
+//          pllStackPush (&nodeStack, tr->nodep[i]->next);
+//          pllStackPush (&nodeStack, tr->nodep[i]->next->next);
+//          double z = exp((-1 * atof(item->branch))/tr->fracchange);
+//          if(z < PLL_ZMIN) z = PLL_ZMIN;
+//          if(z > PLL_ZMAX) z = PLL_ZMAX;
+//          for (k = 0; k < PLL_NUM_BRANCHES; ++ k)
+//             v->z[k] = tr->nodep[i]->z[k] = z;
+//
+//          ++ i;
+//        }
+//       else             /* leaf */
+//        {
+//          v->back           = tr->nodep[j];
+//          tr->nodep[j]->back = v; //t->nodep[v->number];
+//
+//          double z = exp((-1 * atof(item->branch))/tr->fracchange);
+//          if(z < PLL_ZMIN) z = PLL_ZMIN;
+//          if(z > PLL_ZMAX) z = PLL_ZMAX;
+//          for (k = 0; k < PLL_NUM_BRANCHES; ++ k)
+//            v->z[k] = tr->nodep[j]->z[k] = z;
+//            
+//          //t->nameList[j] = strdup (item->name);
+//          tr->nameList[j] = (char *) rax_malloc ((strlen (item->name) + 1) * sizeof (char));
+//          strcpy (tr->nameList[j], item->name);
+//          
+//          pllHashAdd (tr->nameHash, tr->nameList[j], (void *) (tr->nodep[j]));
+//          ++ j;
+//        }
+//     }
+//  }
+//  
+//  tr->start = tr->nodep[1];
+//  
+//  pllStackClear (&nodeStack);
+//
+//  if (useDefaultz == PLL_TRUE) 
+//    resetBranches (tr);
+//}
+
+/** @brief Initialize PLL tree with a random topology
+
+    Sets up the default tree structure for \a tips taxa, copies the taxon
+    names into the instance, registers them in the name hash table and then
+    builds a random topology with pllMakeRandomTree().
+
+    @todo
+      Perhaps pass a seed?
+
+    @param tr
+      The PLL instance
+
+    @param tips
+      Number of tips
+
+    @param nameList
+      Taxa labels; entries 1 to \a tips are read
+*/
+void 
+pllTreeInitTopologyRandom (pllInstance * tr, int tips, char ** nameList)
+{
+  int t;
+
+  pllTreeInitDefaults (tr, tips);
+
+  for (t = 1; t <= tips; ++ t)
+   {
+     /* the instance keeps its own copy of every label */
+     const size_t bytes = (strlen (nameList[t]) + 1) * sizeof (char);
+     char * label       = (char *) rax_malloc (bytes);
+
+     strcpy (label, nameList[t]);
+     tr->nameList[t] = label;
+
+     pllHashAdd (tr->nameHash, pllHashString(label, tr->nameHash->size), label, (void *) (tr->nodep[t]));
+   }
+
+  pllMakeRandomTree (tr);
+}
+
+
+/** @brief Initialize a tree that corresponds to a given (already parsed) alignment 
+
+    Sets up the default tree structure with one tip per sequence of the
+    alignment, copies the sequence labels into the instance and registers
+    them in the name hash table. No topology is created.
+
+    @param tr
+      The PLL instance
+
+    @param alignmentData
+      Parsed alignment; \a sequenceCount and labels 1 to \a sequenceCount
+      are read
+*/
+void 
+pllTreeInitTopologyForAlignment (pllInstance * tr, pllAlignmentData * alignmentData)
+{
+  const int seqCount = alignmentData->sequenceCount;
+  char ** labels     = alignmentData->sequenceLabels;
+  int s;
+
+  pllTreeInitDefaults (tr, seqCount);
+
+  for (s = 1; s <= seqCount; ++ s)
+   {
+     /* the instance keeps its own copy of every label */
+     char * label = (char *) rax_malloc ((strlen (labels[s]) + 1) * sizeof (char));
+
+     strcpy (label, labels[s]);
+     tr->nameList[s] = label;
+
+     pllHashAdd (tr->nameHash, pllHashString(label, tr->nameHash->size), label, (void *) (tr->nodep[s]));
+   }
+}
+
+
+/** @brief Compute a randomized stepwise addition order parsimony tree
+
+    Allocates the parsimony data structures, builds the parsimony tree with
+    pllMakeParsimonyTreeFast() and releases the data structures again.
+
+    @todo
+      check functions that are invoked for potential memory leaks!
+
+    @param tr
+      The PLL instance
+
+    @param partitions
+      The partitions
+
+    @param sprDist
+      SPR distance for the SPR search in parsimony
+*/
+void
+pllComputeRandomizedStepwiseAdditionParsimonyTree (pllInstance * tr, partitionList * partitions, int sprDist)
+{
+  /* set up */
+  allocateParsimonyDataStructures (tr, partitions);
+
+  /* build the tree */
+  pllMakeParsimonyTreeFast (tr, partitions, sprDist);
+
+  /* tear down */
+  pllFreeParsimonyDataStructures (tr, partitions);
+}
+
+/** @brief Encode the alignment data to the PLL numerical representation
+    
+    Transforms the alignment stored in \a tr->yVector to the PLL internal
+    representation by replacing, in place, each character with its entry in
+    the lookup table of the partition's data type (\b PLL_MAP_NT,
+    \b PLL_MAP_BIN or \b PLL_MAP_AA). For every partition the columns from
+    \a lower (inclusive) to \a upper (exclusive) of tips 1 to \a mxtips are
+    rewritten.
+
+    A partition with any other data type triggers an assertion failure; if
+    assertions are disabled, that partition is left untouched.
+
+    @param tr             PLL instance holding the alignment in \a yVector
+    @param partitions     List of partitions
+*/
+void pllBaseSubstitute (pllInstance * tr, partitionList * partitions)
+{
+  const char * d;
+  int i, j, k;
+
+  for (i = 0; i < partitions->numberOfPartitions; ++ i)
+   {
+     d = NULL;
+
+     switch (partitions->partitionData[i]->dataType)
+      {
+        case PLL_DNA_DATA:
+          d = PLL_MAP_NT;
+          break;
+        case PLL_BINARY_DATA:
+          d = PLL_MAP_BIN;
+          break;
+        case PLL_AA_DATA:
+          d = PLL_MAP_AA;
+          break;
+        default:
+          assert(0);
+          break;
+      }
+
+     /* With NDEBUG the assert above is a no-op; never index through an
+        unset lookup table. */
+     if (!d) continue;
+     
+     for (j = 1; j <= tr->mxtips; ++ j)
+      {
+        for (k = partitions->partitionData[i]->lower; k < partitions->partitionData[i]->upper; ++ k)
+         {
+           tr->yVector[j][k] = d[tr->yVector[j][k]];
+         }
+      }
+   }
+}
+
+/** @brief Clear the rearrangements history of a PLL instance
+    
+    Pops every rollback record from the rearrangement history stack of the
+    PLL instance \a tr and frees it, until the stack yields no more items.
+
+    @param tr
+      PLL instance
+*/
+void pllClearRearrangeHistory (pllInstance * tr)
+{
+  for (;;)
+   {
+     pllRollbackInfo * entry = (pllRollbackInfo *) pllStackPop (&(tr->rearrangeHistory));
+
+     if (!entry) break;
+
+     rax_free (entry);
+   }
+}
+
+/** @brief Deallocate the PLL instance
+
+    Frees the taxon names, the name hash table, the alignment, the per-site
+    arrays, the first traversal descriptor, the node storage, the tree string
+    buffers, the rearrangement history and finally the instance itself.
+
+    @param tr
+      The PLL instance
+*/
+void
+pllDestroyInstance (pllInstance * tr)
+{
+  int t;
+
+  /* taxon names (entries 1..mxtips) and the table holding them */
+  for (t = 1; t <= tr->mxtips; ++ t)
+   {
+     rax_free (tr->nameList[t]);
+   }
+  rax_free (tr->nameList);
+  rax_free (tr->tipNames);
+  pllHashDestroy (&(tr->nameHash), NULL);
+
+  /* alignment: entry 0 is freed first, then the table of row pointers */
+  if (tr->yVector)
+   {
+     if (tr->yVector[0]) rax_free (tr->yVector[0]);
+     rax_free (tr->yVector);
+   }
+
+  /* per-site arrays */
+  rax_free (tr->aliaswgt);
+  rax_free (tr->rateCategory);
+  rax_free (tr->patrat);
+  rax_free (tr->patratStored);
+  rax_free (tr->lhs);
+
+  /* traversal descriptor */
+  rax_free (tr->td[0].parameterValues);
+  rax_free (tr->td[0].executeModel);
+  rax_free (tr->td[0].ti);
+
+  /* node storage */
+  rax_free (tr->nodep);
+  rax_free (tr->nodeBaseAddress);
+  rax_free (tr->constraintVector);
+
+  /* tree string buffers */
+  rax_free (tr->tree_string);
+  rax_free (tr->tree0);
+  rax_free (tr->tree1);
+
+  pllClearRearrangeHistory (tr);
+
+  rax_free (tr);
+
+#ifdef _FINE_GRAIN_MPI
+  pllFinalizeMPI ();
+#endif
+
+}
+
+/* Initialize a parameter linkage list for a certain parameter type (can be whatever).
+   The input is an integer vector that contains NumberOfModels (numberOfPartitions) elements.
+
+   If we want to have all alpha parameters unlinked and have, say, 4 partitions, the input
+   vector would look like this: {0, 1, 2, 3}. If we want to link partitions 0 and 3, the vector
+   should look like this: {0, 1, 2, 0}.
+*/
+
+
+
+/** @brief Parse and install the Q matrix symmetry pattern of a partition
+
+    Splits \a linkageString at commas, converts every token with atoi() and
+    validates the resulting vector: exactly one entry per substitution rate
+    ((states * states - states) / 2 entries), entry j must not exceed j, and
+    an entry must not exceed the largest preceding entry by more than one.
+    On success the vector is copied to the \a symmetryVector of the
+    partition, \a nonGTR is set if fewer than the maximum number of distinct
+    rates are used, and optimization of the substitution rates is enabled.
+
+    @param linkageString
+      Comma-separated symmetry pattern; the string itself is not modified
+
+    @param pr
+      List of partitions
+
+    @param model
+      Index of the partition to configure
+
+    @return
+      \b PLL_TRUE on success. On failure \b PLL_FALSE is returned, the
+      partition is left unchanged and \a errno is set to
+      \b PLL_SUBSTITUTION_RATE_OUT_OF_BOUNDS (too many entries),
+      \b PLL_INVALID_Q_MATRIX_SYMMETRY (too few entries, or entry j larger
+      than j) or \b PLL_Q_MATRIX_SYMMETRY_OUT_OF_BOUNDS.
+*/
+static int init_Q_MatrixSymmetries(char *linkageString, partitionList * pr, int model)
+{
+  int 
+    states = pr->partitionData[model]->states,
+    numberOfRates = ((states * states - states) / 2), 
+    *list = (int *)rax_malloc(sizeof(int) * numberOfRates),
+    j,
+    max = -1,
+    result = PLL_FALSE;
+
+  char
+    *str1,
+    *saveptr,
+    *ch,
+    *token;
+
+  /* tokenize a private copy, the tokenizer writes into its input */
+  ch = (char *) rax_malloc (strlen (linkageString) + 1);
+  strcpy (ch, linkageString);
+
+
+  for(j = 0, str1 = ch; ;j++, str1 = (char *)NULL) 
+    {
+      token = STRTOK_R(str1, ",", &saveptr);
+      if(token == (char *)NULL)
+        break;
+      if(!(j < numberOfRates))
+        {
+          errno = PLL_SUBSTITUTION_RATE_OUT_OF_BOUNDS;
+          goto cleanup;
+        }
+      list[j] = atoi(token);     
+    }
+
+  /* fewer entries than rates would leave the tail of list[] uninitialized */
+  if(j < numberOfRates)
+    {
+      errno = PLL_INVALID_Q_MATRIX_SYMMETRY;
+      goto cleanup;
+    }
+
+  for(j = 0; j < numberOfRates; j++)
+    {
+      if(!(list[j] <= j))
+        {
+          errno = PLL_INVALID_Q_MATRIX_SYMMETRY;
+          goto cleanup;
+        }
+      
+      if(!(list[j] <= max + 1))
+        {
+          errno = PLL_Q_MATRIX_SYMMETRY_OUT_OF_BOUNDS;
+          goto cleanup;
+        }
+      
+      if(list[j] > max)
+        max = list[j];
+    }  
+  
+  for(j = 0; j < numberOfRates; j++)  
+    pr->partitionData[model]->symmetryVector[j] = list[j];    
+
+  //less than the maximum possible number of rate parameters
+
+  if(max < numberOfRates - 1)    
+    pr->partitionData[model]->nonGTR = PLL_TRUE;
+
+  pr->partitionData[model]->optimizeSubstitutionRates = PLL_TRUE;
+
+  result = PLL_TRUE;
+
+  /* single exit: both temporary buffers are released on every path */
+cleanup:
+  rax_free(ch);
+  rax_free(list);
+
+  return result;
+}
+
+/** @brief Check parameter linkage across partitions for consistency
+ *
+ * Verifies, for every group of linked partitions, that the linked parameters
+ * agree with those of the first partition of the group:
+ *  - substitution rates: same \a nonGTR and \a optimizeSubstitutionRates
+ *    settings, same symmetry vector (for non-GTR models) and same rates;
+ *    additionally, for a protein reference partition, rate optimization
+ *    must be enabled exactly when the model is GTR or non-GTR
+ *  - alpha: same \a optimizeAlphaParameter setting and same alpha value
+ *  - base frequencies: same \a optimizeBaseFrequencies setting and same
+ *    frequencies
+ *
+ * The checks only run while the partition list is marked dirty; after a
+ * successful run the dirty flag is cleared.
+ *
+ * @param pr
+ *   List of partitions
+ *
+ * @return
+ *   \b PLL_TRUE if the list is not dirty or all checks pass. Otherwise
+ *   \b PLL_FALSE, with \a errno set to the code of the first violated check.
+ *
+ * @todo
+ *   Call this in more functions, right now it's only invoked in the wrapper 
+ *   for modOpt() 
+ */
+static int checkLinkageConsistency(partitionList *pr)
+{
+  linkageList
+    *ll;
+
+  int
+    i,
+    j,
+    k;
+
+  /* nothing changed since the last successful check */
+  if(!pr->dirty)
+    return PLL_TRUE;
+
+  /* first deal with rates */
+
+  ll = pr->rateList;
+
+  for(i = 0; i < ll->entries; i++)
+    {
+      const int
+        members = ll->ld[i].partitions,
+        ref = ll->ld[i].partitionList[0];
+
+      if(pr->partitionData[ref]->dataType == PLL_AA_DATA)
+        {
+          /* free rates exist only for GTR or symmetry-restricted models */
+          const int
+            needsOptimization = (pr->partitionData[ref]->protModels == PLL_GTR || pr->partitionData[ref]->nonGTR),
+            expected = needsOptimization ? PLL_TRUE : PLL_FALSE;
+
+          if(pr->partitionData[ref]->optimizeSubstitutionRates != expected)
+            {
+              errno = PLL_INCONSISTENT_SUBST_RATE_OPTIMIZATION_SETTING;
+              return PLL_FALSE;
+            }
+        }
+
+      for(k = 1; k < members; k++)
+        {
+          const int
+            other = ll->ld[i].partitionList[k],
+            nStates = pr->partitionData[other]->states,
+            nRates = ((nStates * nStates - nStates) / 2);
+
+          if(pr->partitionData[ref]->nonGTR != pr->partitionData[other]->nonGTR ||
+             pr->partitionData[ref]->optimizeSubstitutionRates != pr->partitionData[other]->optimizeSubstitutionRates)
+            {
+              errno = PLL_INCONSISTENT_SUBST_RATE_OPTIMIZATION_SETTING;
+              return PLL_FALSE;
+            }
+
+          if(pr->partitionData[ref]->nonGTR)
+            {
+              for(j = 0; j < nRates; j++)
+                {
+                  if(pr->partitionData[ref]->symmetryVector[j] != pr->partitionData[other]->symmetryVector[j])
+                    {
+                      errno = PLL_INCONSISTENT_Q_MATRIX_SYMMETRIES_ACROSS_LINKED_PARTITIONS;
+                      return PLL_FALSE;
+                    }
+                }
+            }
+
+          for(j = 0; j < nRates; j++)
+            {
+              if(pr->partitionData[ref]->substRates[j] != pr->partitionData[other]->substRates[j])
+                {
+                  errno = PLL_INCONSISTENT_Q_MATRIX_ENTRIES_ACROSS_LINKED_PARTITIONS;
+                  return PLL_FALSE;
+                }
+            }
+        }
+    }
+
+  /* then deal with alpha parameters */
+
+  ll = pr->alphaList;
+
+  for(i = 0; i < ll->entries; i++)
+    {
+      const int
+        members = ll->ld[i].partitions;
+
+      int
+        ref;
+
+      /* a group with a single member cannot be inconsistent */
+      if(members <= 1)
+        continue;
+
+      ref = ll->ld[i].partitionList[0];
+
+      for(k = 1; k < members; k++)
+        {
+          const int
+            other = ll->ld[i].partitionList[k];
+
+          if(pr->partitionData[ref]->optimizeAlphaParameter != pr->partitionData[other]->optimizeAlphaParameter)
+            {
+              errno = PLL_INCONSISTENT_ALPHA_STATES_ACROSS_LINKED_PARTITIONS;
+              return PLL_FALSE;
+            }
+
+          if(pr->partitionData[ref]->alpha != pr->partitionData[other]->alpha)
+            {
+              errno = PLL_INCONSISTENT_ALPHA_VALUES_ACROSS_LINKED_PARTITIONS;
+              return PLL_FALSE;
+            }
+        }
+    }
+
+  /* and then deal with base frequencies */
+
+  ll = pr->freqList;
+
+  for(i = 0; i < ll->entries; i++)
+    {
+      const int
+        members = ll->ld[i].partitions;
+
+      int
+        ref;
+
+      /* a group with a single member cannot be inconsistent */
+      if(members <= 1)
+        continue;
+
+      ref = ll->ld[i].partitionList[0];
+
+      for(k = 1; k < members; k++)
+        {
+          const int
+            other = ll->ld[i].partitionList[k],
+            nStates = pr->partitionData[other]->states;
+
+          if(pr->partitionData[ref]->optimizeBaseFrequencies != pr->partitionData[other]->optimizeBaseFrequencies)
+            {
+              errno = PLL_INCONSISTENT_FREQUENCY_STATES_ACROSS_LINKED_PARTITIONS;
+              return PLL_FALSE;
+            }
+
+          for(j = 0; j < nStates; j++)
+            {
+              if(pr->partitionData[ref]->frequencies[j] != pr->partitionData[other]->frequencies[j])
+                {
+                  errno = PLL_INCONSISTENT_FREQUENCY_VALUES_ACROSS_LINKED_PARTITIONS;
+                  return PLL_FALSE;
+                }
+            }
+        }
+    }
+
+  pr->dirty = PLL_FALSE;
+
+  return PLL_TRUE;
+}
+/** @brief Set symmetries among parameters in the Q matrix
+    
+    Allows to link some or all rate parameters in the Q-matrix 
+    for obtaining simpler models than GTR. The partition list is marked
+    dirty regardless of whether the pattern was accepted.
+
+    @param string
+      string describing the symmetry pattern among the rates in the Q matrix
+
+    @param pr
+      List of partitions
+      
+    @param model
+      Index of the partition for which we want to set the Q matrix symmetries
+
+    @return
+      The result of init_Q_MatrixSymmetries(): \b PLL_TRUE if the pattern
+      was installed, otherwise \b PLL_FALSE
+*/
+int pllSetSubstitutionRateMatrixSymmetries(char *string, partitionList * pr, int model)
+{
+  const int accepted = init_Q_MatrixSymmetries(string, pr, model);
+
+  /* request a new linkage consistency check */
+  pr->dirty = PLL_TRUE;
+
+  return accepted;
+}
+
+/** @defgroup modelParamsGroup Model parameters setup and retrieval
+    
+    This set of functions is responsible for setting, retrieving, and optimizing
+    model parameters. It also contains functions for linking model parameters
+    across partitions.
+*/
+
+/** @ingroup modelParamsGroup
+    @brief Set the alpha parameter of the Gamma model to a fixed value for a partition
+    
+    Sets the alpha parameter of the gamma model of rate heterogeneity to a fixed value,
+    recomputes the discrete gamma rate categories of the partition and disables the
+    optimization of this parameter. The partition list is marked dirty so that the
+    consistency of linked parameters is checked again.
+
+    @param alpha
+      alpha value; must lie within [\b PLL_ALPHA_MIN, \b PLL_ALPHA_MAX],
+      otherwise an assertion fails
+
+    @param model
+      Index of the partition for which we want to set the alpha value; must be
+      a valid partition index, otherwise an assertion fails
+
+    @param pr
+      List of partitions
+      
+    @param tr
+      Library instance for which we want to fix alpha 
+
+    @todo
+      test if this works with the parallel versions
+*/
+void pllSetFixedAlpha(double alpha, int model, partitionList * pr, pllInstance *tr)
+{
+  //make sure that we are setting alpha for a partition within the current range 
+  //of partitions
+  double old_fracchange = tr->fracchange;
+
+  assert(model >= 0 && model < pr->numberOfPartitions);
+
+  assert(alpha >= PLL_ALPHA_MIN && alpha <= PLL_ALPHA_MAX);
+
+  //set the alpha parameter 
+  
+  pr->partitionData[model]->alpha = alpha;
+
+  //do the discretization of the gamma curve
+
+  pllMakeGammaCats(pr->partitionData[model]->alpha, pr->partitionData[model]->gammaRates, 4, tr->useMedian);
+
+  //broadcast the changed parameters to all threads/MPI processes 
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier(tr, pr, PLL_THREAD_COPY_ALPHA);
+#endif
+
+  pr->partitionData[model]->optimizeAlphaParameter = PLL_FALSE;
+
+  //a model parameter changed: the linkage consistency has to be re-checked,
+  //so the flag must be set here, not cleared
+  pr->dirty = PLL_TRUE;
+  updateAllBranchLengths (tr, old_fracchange, tr->fracchange);
+}
+
+/** @ingroup modelParamsGroup
+    @brief Get the rate categories of the Gamma model of a partition
+
+    Copies the four gamma rate categories of partition \a pid of partition
+    list \a pr to \a outBuffer.
+
+    @param pr   List of partitions
+    @param pid  Index of partition to use
+    @param outBuffer  Output buffer with room for four doubles
+*/
+void pllGetGammaRates (partitionList * pr, int pid, double * outBuffer)
+{
+  /* TODO: Change the hardcoded 4 and also add a check that this partition
+     really uses gamma. Currently, instance is also not required */
+  const size_t bytes = 4 * sizeof (double);
+
+  memcpy (outBuffer, pr->partitionData[pid]->gammaRates, bytes);
+}
+
+/** @ingroup modelParamsGroups
+    @brief Get the alpha parameter of the Gamma model of a partition
+
+    Returns the alpha parameter of the gamma model of rate heterogeneity
+    of partition \a pid from partition list \a pr.
+
+    @param pr   List of partitions
+    @param pid  Index of partition to use
+
+    @return
+      Alpha parameter
+*/
+double pllGetAlpha (partitionList * pr, int pid)
+{
+  double shape = pr->partitionData[pid]->alpha; /* TODO: check if the partition uses gamma */
+  return shape;
+}
+
+
+/** @ingroup modelParamsGroups
+    @brief Get the base frequencies of a partition
+
+    Gets the base frequencies of partition \a model from partition list
+    \a pr and stores them in \a outBuffer. Note that \a outBuffer
+    must be of size s, where s is the number of states.
+
+    @param pr        List of partitions
+    @param model     Index of the partition for which we want to get the base frequencies
+    @param outBuffer Buffer where to store the base frequencies; it must have room
+                     for as many doubles as the partition has states
+*/
+void pllGetBaseFrequencies(partitionList * pr, int model, double * outBuffer)
+{
+  memcpy (outBuffer, pr->partitionData[model]->frequencies, sizeof (double) * pr->partitionData[model]->states); /* one double per state */
+}
+
+
+/** @ingroup modelParamsGroups
+    @brief Set all base frequencies to a fixed value for a partition
+    
+    Sets all base frequencies of a partition to fixed values and disables 
+    ML optimization of these parameters 
+
+    @param f
+      array containing the base frequencies
+
+    @param  length
+      length of array f, this needs to be as long as the number of 
+      states in the model, otherwise an assertion will fail!
+
+    @param model
+      Index of the partition for which we want to set the frequencies 
+
+    @param pr
+      List of partitions
+      
+    @param tr
+      Library instance for which we want to fix the base frequencies
+
+    @todo
+      test if this works with the parallel versions
+*/
+void pllSetFixedBaseFrequencies(double *f, int length, int model, partitionList * pr, pllInstance *tr)
+{
+  int 
+    i;
+
+  double 
+    acc = 0.0,
+    old_fracchange;
+  //remember fracchange as it is on entry; it is handed to updateAllBranchLengths at the end
+  old_fracchange = tr->fracchange;
+
+  //make sure that we are setting the base frequencies for a partition within the current range 
+  //of partitions
+  assert(model >= 0 && model < pr->numberOfPartitions);
+
+  //make sure that the length of the input array f containing the frequencies 
+  //is as long as the number of states in the model 
+  assert(length == pr->partitionData[model]->states);
+
+  //note: every check in this function is an assert(), so none of them runs when NDEBUG is defined
+  //make sure that the base frequencies sum approximately to 1.0
+  
+  for(i = 0; i < length; i++)
+    acc += f[i];
+
+  if(fabs(acc - 1.0) > 0.000001)
+    assert(0);
+
+  //copy the base frequencies 
+  memcpy(pr->partitionData[model]->frequencies, f, sizeof(double) * length);
+
+  //re-calculate the Q matrix 
+  pllInitReversibleGTR(tr, pr, model);
+
+
+  //broadcast the new Q matrix to all threads/processes 
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier (tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+  //the frequencies are fixed from now on: flag them as not to be optimized
+  pr->partitionData[model]->optimizeBaseFrequencies = PLL_FALSE;
+  //presumably makes the linkage consistency check run again (cf. pllOptimizeModelParameters) - TODO confirm
+  pr->dirty = PLL_TRUE;
+  updateAllBranchLengths (tr, old_fracchange, tr->fracchange);
+}
+
+/** @ingroup modelParamsGroups
+    @brief Set that the base frequencies are optimized under ML
+    
+    The base frequencies for partition \a model are reset to 1/states each and
+    will be optimized under ML
+    @param model
+      Index of the partition for which we want to optimize base frequencies 
+    @param pr
+      List of partitions
+    @param tr
+      Library instance for which we want to optimize the base frequencies
+    @return
+      \b PLL_TRUE on success; \b PLL_FALSE with \a errno set if \a model is out of
+      range or the frequencies do not sum to 1.0
+    @todo
+      test if this works with the parallel versions
+*/
+int pllSetOptimizeBaseFrequencies(int model, partitionList * pr, pllInstance *tr)
+{
+  int
+    states,
+    i;
+
+  double 
+    initialFrequency,
+    acc = 0.0;
+
+  //make sure that we are setting the base frequencies for a partition within the current range 
+  //of partitions; unlike the assert-based setters this is reported through errno and the return value
+  if(!(model >= 0 && model < pr->numberOfPartitions))
+    {
+      errno = PLL_PARTITION_OUT_OF_BOUNDS;
+      return PLL_FALSE;
+    }
+
+  //get the number of states/frequencies in this partition 
+  states = pr->partitionData[model]->states;
+
+  //set all frequencies to 1/states
+  
+  initialFrequency = 1.0 / (double)states;
+
+  for(i = 0; i < states; i++)
+    pr->partitionData[model]->frequencies[i] = initialFrequency;
+
+  //make sure that the base frequencies sum approximately to 1.0
+  
+  for(i = 0; i < states; i++)
+    acc += pr->partitionData[model]->frequencies[i];
+
+  if(fabs(acc - 1.0) > 0.000001)
+    {
+      errno = PLL_BASE_FREQUENCIES_DO_NOT_SUM_TO_1;
+      return PLL_FALSE;
+    }
+
+  //re-calculate the Q matrix 
+  pllInitReversibleGTR(tr, pr, model);
+
+  //broadcast the new Q matrix to all threads/processes 
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier (tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+  //from now on the frequencies of this partition are flagged as to be optimized
+  pr->partitionData[model]->optimizeBaseFrequencies = PLL_TRUE;
+  //NOTE(review): unlike the pllSetFixed* setters, no updateAllBranchLengths call follows here - confirm intended
+  pr->dirty = PLL_TRUE;
+
+  return PLL_TRUE;
+}
+
+
+
+
+/** @ingroup modelParamsGroups
+    @brief Get the substitution rates for a specific partition
+
+    Gets the substitution rates of partition \a model from partition list
+    \a pr and stores them in \a outBuffer. Note that \a outBuffer
+    must be of size (s * s - s) / 2, where s is the number of states, i.e.
+    the number of upper diagonal entries of the Q matrix.
+
+    @param pr        List of partitions
+    @param model     Index of partition for which we want to get the substitution rates
+    @param outBuffer Buffer where to store the substitution rates; it must hold
+                     (s * s - s) / 2 doubles
+*/
+void pllGetSubstitutionMatrix (partitionList * pr, int model, double * outBuffer)
+{
+  /* s states give (s * s - s) / 2 entries above the diagonal of the Q matrix */
+  int
+    s = pr->partitionData[model]->states,
+    upperEntries = (s * s - s) / 2;
+
+  /* hand the caller a copy of exactly that many rates */
+  memcpy (outBuffer,
+          pr->partitionData[model]->substRates, upperEntries * sizeof (double));
+}
+
+/** @ingroup modelParamsGroups
+     @brief Set all substitution rates for a specific partition and disable ML optimization for them
+    
+    Sets all substitution rates of a partition to fixed values and disables 
+    ML optimization of these parameters. It will automatically re-scale the relative rates  
+    such that the last rate is 1.0 
+
+    @param q
+      array containing the substitution rates
+
+    @param length
+      length of array q, this needs to be as long as: (s * s - s) / 2,
+      i.e., the number of upper diagonal entries of the Q matrix
+
+    @param model
+      Index of the partition for which we want to set/fix the substitution rates
+
+    @param pr
+      List of partitions
+      
+    @param tr
+      Library instance for which we want to fix the substitution rates 
+
+    @todo
+      test if this works with the parallel versions
+*/
+void pllSetFixedSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr)
+{
+  pllSetSubstitutionMatrix(q, length, model, pr, tr); /* checks model and length via assert, then installs the rescaled rates */
+  pr->partitionData[model]->optimizeSubstitutionRates = PLL_FALSE; /* flag the rates as not to be optimized */
+}
+
+/** @ingroup modelParamsGroups
+     @brief Set all substitution rates for a specific partition
+    
+    Sets all substitution rates of a partition to the given values.
+    It will automatically re-scale the relative rates such that the last rate is 1.0 
+
+    @param q
+      array containing the substitution rates
+
+    @param length
+      length of array q, this needs to be as long as: (s * s - s) / 2,
+      i.e., the number of upper diagonal entries of the Q matrix
+
+    @param model
+      Index of the partition for which we want to set/fix the substitution rates
+
+    @param pr
+      List of partitions
+      
+    @param tr
+      Library instance for which we want to fix the substitution rates 
+
+    @todo
+      test if this works with the parallel versions
+*/
+void pllSetSubstitutionMatrix(double *q, int length, int model, partitionList * pr,  pllInstance *tr)
+{
+  int 
+    i,
+    numberOfRates; 
+
+  double
+    scaler,
+    old_fracchange;
+  //remember fracchange as it is on entry; it is handed to updateAllBranchLengths at the end
+  old_fracchange = tr->fracchange;
+
+  //make sure that we are setting the Q matrix for a partition within the current range 
+  //of partitions
+  assert(model >= 0 && model < pr->numberOfPartitions);
+  //a model with s states has (s * s - s) / 2 substitution rates
+  numberOfRates = (pr->partitionData[model]->states * pr->partitionData[model]->states - pr->partitionData[model]->states) / 2;
+
+  //  make sure that the length of the array containing the substitution rates 
+  //  corresponds to the number of states in the model
+
+  assert(length == numberOfRates);
+
+  //automatically scale the last rate to 1.0 if this is not already the case
+  //NOTE(review): a last rate of 0.0 gives an infinite scaler; presumably only the assert in the loop below catches it
+  if(q[length - 1] != 1.0)    
+    scaler = 1.0 / q[length - 1]; 
+  else
+    scaler = 1.0;
+
+  //set the rates for the partition and make sure that they are within the allowed bounds 
+
+  for(i = 0; i < length; i++)
+    {
+      double
+        r = q[i] * scaler;
+      
+      assert(r >= PLL_RATE_MIN && r <= PLL_RATE_MAX);
+      
+      pr->partitionData[model]->substRates[i] = r;
+    }
+
+  //re-calculate the Q matrix 
+  pllInitReversibleGTR(tr, pr, model);
+
+  //broadcast the new Q matrix to all threads/processes 
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  pllMasterBarrier (tr, pr, PLL_THREAD_COPY_RATES);
+#endif
+  
+
+  pr->dirty = PLL_TRUE;
+  updateAllBranchLengths (tr, old_fracchange, tr->fracchange);
+}
+
+
+
+
+/** @ingroup modelParamsGroups
+    @brief Build the linkage list for one parameter type from a per-partition index vector
+
+    \a linkList holds one entry per partition (pr->numberOfPartitions entries); partitions
+    with equal entries share the parameter. With 4 partitions {0, 1, 2, 3} leaves everything
+    unlinked, {0, 1, 2, 0} links partitions 0 and 3.
+
+    @return the newly allocated linkage list, or NULL with errno set to
+            PLL_LINKAGE_LIST_OUT_OF_BOUNDS when \a linkList is malformed (nothing is allocated then)
+*/
+linkageList* initLinkageList(int *linkList, partitionList *pr)
+{
+  int 
+    k,
+    partitions,
+    numberOfModels = 0,
+    i,
+    pos;
+  
+  linkageList 
+    *ll;
+    
+  /* figure out how many distinct parameters we need to estimate 
+     in total, if all parameters are linked the result will be 1 if all 
+     are unlinked the result will be pr->numberOfPartitions */
+  
+  for(i = 0; i < pr->numberOfPartitions; i++)
+    {
+      if(!(linkList[i] >= 0 && linkList[i] < pr->numberOfPartitions))
+        {
+          errno = PLL_LINKAGE_LIST_OUT_OF_BOUNDS;
+          return (linkageList*)NULL;
+        }
+      /* an entry may neither exceed its own position nor skip past the next unused index */
+      if(!(linkList[i] <= i && linkList[i] <= numberOfModels + 1))
+        {
+          errno = PLL_LINKAGE_LIST_OUT_OF_BOUNDS;
+          return (linkageList*)NULL;
+        }
+
+      if(linkList[i] > numberOfModels)
+        numberOfModels = linkList[i];
+
+    }
+
+  numberOfModels++;
+  
+  /* allocate the linkage list data structure that contains information which parameters of which partition are 
+     linked with each other. The allocation is deferred until here so that the error returns above leak nothing.
+
+     Note that we need a separate invocation of initLinkageList() and a separate linkage list 
+     for each parameter type */
+  ll          = (linkageList*)rax_malloc(sizeof(linkageList));
+  ll->entries = numberOfModels;
+  ll->ld      = (linkageData*)rax_malloc(sizeof(linkageData) * numberOfModels);
+
+  /* now loop over the number of free parameters and assign the corresponding partitions to each parameter */
+
+  for(i = 0; i < numberOfModels; i++)
+    {
+      /* 
+         the valid flag is used for distinguishing between DNA and protein data partitions.
+         This can be used to enable/disable parameter optimization for the parameter 
+         associated to the corresponding partitions. This feature is used in optRatesGeneric 
+         to first optimize all DNA GTR rate matrices and then all PROT GTR rate matrices */
+
+      ll->ld[i].valid = PLL_TRUE;
+      partitions = 0;
+
+      /* now figure out how many partitions share this joint parameter */
+
+      for(k = 0; k < pr->numberOfPartitions; k++)
+        if(linkList[k] == i)
+          partitions++;     
+
+      /* assign a list to store the partitions that share the parameter */
+
+      ll->ld[i].partitions = partitions;
+      ll->ld[i].partitionList = (int*)rax_malloc(sizeof(int) * partitions);
+      
+      /* now store the respective partition indices in this list */
+      
+      for(k = 0, pos = 0; k < pr->numberOfPartitions; k++)
+        if(linkList[k] == i)
+          ll->ld[i].partitionList[pos++] = k;
+    }
+
+  /* return the linkage list for the parameter */
+
+  return ll;
+}
+
+
+
+static linkageList* initLinkageListString(char *linkageString, partitionList * pr)
+{
+  int 
+    *list = (int*)rax_malloc(sizeof(int) * pr->numberOfPartitions),
+    j;
+
+  linkageList 
+    *l = (linkageList*)NULL;
+
+  char
+    *str1,
+    *saveptr,
+    *ch,
+    *token;
+  
+  /* STRTOK_R writes into the string it tokenizes, so work on a private copy */
+  ch = (char *) rax_malloc (strlen (linkageString) + 1);
+  strcpy (ch, linkageString);
+  /* read at most one index per partition; stop as soon as the string holds more than that */
+  for(j = 0, str1 = ch; ;j++, str1 = (char *)NULL) 
+    {
+      token = STRTOK_R(str1, ",", &saveptr);
+      if(token == (char *)NULL || j >= pr->numberOfPartitions)
+        break;
+      list[j] = atoi(token);   /* NOTE(review): atoi turns a non-numeric token into 0 without reporting it */
+    }
+  rax_free(ch);
+
+  /* only a string with exactly one index per partition fills the whole list; anything else is rejected */
+  if(token == (char *)NULL && j == pr->numberOfPartitions)
+    l = initLinkageList(list, pr);
+  rax_free(list);
+  if(l == (linkageList*)NULL)
+    errno = PLL_LINKAGE_LIST_OUT_OF_BOUNDS;   /* set last so that freeing cannot disturb it */
+  return l;
+}
+
+/** @ingroup modelParamsGroups
+    @brief Link alpha parameters across partitions
+    
+    Links alpha parameters across partitions (GAMMA model of rate heterogeneity)
+
+    @param string
+      string describing the linkage pattern    
+
+    @param pr
+      List of partitions
+
+    @todo
+      test behavior/impact/mem-leaks of this when PSR model is used 
+      it shouldn't do any harm, but it would be better to check!
+*/
+int pllLinkAlphaParameters(char *string, partitionList *pr)
+{
+  /* an alpha linkage list is expected to exist already: it is released
+     unconditionally before the replacement is built from the string */
+  freeLinkageList(pr->alphaList);
+  pr->alphaList = initLinkageListString(string, pr);
+
+  /* the dirty flag is raised whether or not a new list could be built */
+  pr->dirty = PLL_TRUE;
+
+  /* no list at all means that the linkage
+     string could not be turned into one */
+  return (pr->alphaList != NULL) ? PLL_TRUE : PLL_FALSE;
+}
+
+/** @ingroup modelParamsGroups
+    @brief Link base frequency parameters across partitions
+    
+    Links base frequency parameters across partitions
+
+    @param string
+      string describing the linkage pattern    
+
+    @param pr
+      List of partitions
+
+    @todo
+      semantics of this function not clear yet: right now this only has an effect 
+      when we do a ML estimate of base frequencies 
+      when we use empirical or model-defined (protein data) base frequencies, one could 
+      maybe average over the per-partition frequencies, but the averages would need to be weighted 
+      according to the number of patterns per partition 
+*/
+int pllLinkFrequencies(char *string, partitionList *pr)
+{
+  /* a frequency linkage list is expected to exist already: it is released
+     unconditionally before the replacement is built from the string */
+  freeLinkageList(pr->freqList);
+  pr->freqList = initLinkageListString(string, pr);
+
+  /* the dirty flag is raised whether or not a new list could be built */
+  pr->dirty = PLL_TRUE;
+
+  /* no list at all means that the linkage
+     string could not be turned into one */
+  return (pr->freqList != NULL) ? PLL_TRUE : PLL_FALSE;
+}
+
+/** @ingroup modelParamsGroups
+    @brief Link Substitution matrices across partitions
+    
+    Links substitution matrices (Q matrices) across partitions
+
+    @param string
+      string describing the linkage pattern    
+
+    @param pr
+      List of partitions
+
+    @todo
+      re-think/re-design how this is done for protein
+      models
+*/
+int pllLinkRates(char *string, partitionList *pr)
+{
+  //assumes that it has already been assigned once
+  freeLinkageList(pr->rateList);
+  
+  pr->rateList = initLinkageListString(string, pr);
+  
+  pr->dirty = PLL_TRUE;  
+  //report failure when no linkage list could be built (testing pr->dirty here could never fail)
+  if(!pr->rateList)
+    return PLL_FALSE;
+  else
+    return PLL_TRUE;
+}
+
+
+
+
+/** @ingroup modelParamsGroups
+    @brief Initialize partitions according to model parameters
+    
+    Initializes partitions according to model parameters; alpha, base frequency and
+    rate parameters start out unlinked across partitions.
+    @param tr              The PLL instance
+    @param partitions      List of partitions
+    @return                Returns \b PLL_TRUE in case of success, otherwise \b PLL_FALSE
+                           (returned when the base frequencies cannot be obtained)
+*/
+int pllInitModel (pllInstance * tr, partitionList * partitions) 
+{
+  double ** ef;
+  int
+    i,
+    *unlinked;
+  double old_fracchange = tr->fracchange;
+
+  ef = pllBaseFrequenciesInstance (tr, partitions);
+  /* bail out before this function has allocated anything else, so the early return leaks nothing */
+  if(!ef)
+    return PLL_FALSE;
+
+  
+#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#if (defined(__AVX) || defined(__SSE3))
+  _mm_setcsr( _mm_getcsr() | _MM_FLUSH_ZERO_ON);
+#endif
+#endif 
+
+#ifdef _USE_PTHREADS
+  tr->threadID = 0;
+#ifndef _PORTABLE_PTHREADS
+  /* not very portable thread to core pinning if PORTABLE_PTHREADS is not defined
+     by default the code below is deactivated */
+  pinToCore(0);
+#endif
+#endif
+
+#if (defined(_FINE_GRAIN_MPI) || defined(_USE_PTHREADS))
+  /* 
+     this main function is the master thread, so if we want to run RAxML with n threads,
+     we use pllStartPthreads to start the n-1 worker threads */
+  
+#ifdef _USE_PTHREADS
+  pllStartPthreads (tr, partitions);
+#endif
+
+  /* via pllMasterBarrier() we invoke parallel regions in which all Pthreads work on computing something, mostly likelihood 
+     computations. Have a look at execFunction() in axml.c where we switch between the different types of parallel regions.
+
+     Although not necessary, below we copy the info stored on tr->partitionData to corresponding copies in each thread.
+     While this is shared memory and we don't really need to copy stuff, it was implemented like this to allow for an easier 
+     transition to a distributed memory implementation (MPI).
+     */
+#ifdef _FINE_GRAIN_MPI
+  //MPI_Bcast (&(partitions->numberOfPartitions), 1, MPI_INT, MPI_ROOT, MPI_COMM_WORLD);
+  MPI_Bcast (&(partitions->numberOfPartitions), 1, MPI_INT, 0, MPI_COMM_WORLD);
+#endif
+  
+  /* mpi version now also uses the generic barrier */
+  pllMasterBarrier (tr, partitions, PLL_THREAD_INIT_PARTITION);
+#else  /* SEQUENTIAL */
+  /* 
+     allocate the required data structures for storing likelihood vectors etc 
+     */
+
+  //initializePartitions(tr, tr, partitions, partitions, 0, 0);
+  initializePartitionsSequential (tr, partitions);
+#endif
+  
+  //initializePartitions (tr, tr, partitions, partitions, 0, 0);
+  
+  initModel (tr, ef, partitions);
+
+  pllEmpiricalFrequenciesDestroy (&ef, partitions->numberOfPartitions);
+  unlinked = (int *)rax_malloc(sizeof(int) * partitions->numberOfPartitions);
+  for(i = 0; i < partitions->numberOfPartitions; i++)
+    unlinked[i] = i;
+
+  //by default everything is unlinked initially: the identity vector gives every partition its own parameter
+  partitions->alphaList = initLinkageList(unlinked, partitions);
+  partitions->freqList  = initLinkageList(unlinked, partitions);
+  partitions->rateList  = initLinkageList(unlinked, partitions);
+
+  rax_free(unlinked);
+
+  updateAllBranchLengths (tr, old_fracchange ? old_fracchange : 1,  tr->fracchange);
+  pllEvaluateLikelihood (tr, partitions, tr->start, PLL_TRUE, PLL_FALSE);
+
+  return PLL_TRUE;
+}
+ 
+/** @ingroup modelParamsGroups
+    @brief Optimize all free model parameters of the likelihood model
+    
+    Checks the parameter linkage for consistency and, if it is consistent, runs the
+    model parameter optimization.
+    @param tr
+      The PLL instance
+    @param pr
+      List of partitions
+    @param likelihoodEpsilon
+      Specifies up to which epsilon in likelihood values the iterative routine will 
+      be optimizing the parameters  
+    @return
+      \b PLL_TRUE on success, \b PLL_FALSE if the linkage consistency check fails
+*/
+int pllOptimizeModelParameters(pllInstance *tr, partitionList *pr, double likelihoodEpsilon)
+{
+  /* raising the dirty flag forces the consistency check below to run */
+  pr->dirty = PLL_TRUE;
+  /* optimize only when the linkage set-up passes the check */
+  if(checkLinkageConsistency(pr))
+    {
+      modOpt(tr, pr, likelihoodEpsilon);
+      return PLL_TRUE;
+    }
+
+  return PLL_FALSE;
+}
+
+/** @brief Read the contents of a file
+    
+    Reads the file \a filename and returns its content. In addition
+    the size of the file is stored in the output variable \a filesize.
+    The content of the variable \a filesize can be anything and will
+    be overwritten.
+
+    @param filename
+      Name of the input file
+
+    @param filesize
+      Output parameter where the size of the file (in bytes) will be stored
+
+    @return
+      Contents of the file as a newly allocated, zero-terminated buffer, or NULL on failure
+*/
+char * 
+pllReadFile (const char * filename, long * filesize)
+{
+  FILE * fp;
+  char * rawdata;
+
+  // binary mode is required: opening with "r" does not work on Windows
+  printf("[PLL] Reading file %s...\n", filename);
+  fp = fopen (filename, "rb");
+  if (!fp) return (NULL);
+  // announce success only once the file is known to be open
+  printf("[PLL] Success!\n");
+
+  /* obtain file size; fseek reports failure with any non-zero value */
+  if (fseek (fp, 0, SEEK_END) != 0)
+   {
+     fclose (fp);
+     return (NULL);
+   }
+
+  *filesize = ftell (fp);
+
+  if (*filesize == -1) 
+   {
+     fclose (fp);
+     return (NULL);
+   }
+  rewind (fp);
+
+  /* allocate buffer (one extra byte for the terminator) and read file contents */
+  rawdata = (char *) rax_malloc (((*filesize) + 1) * sizeof (char));
+  if (rawdata) 
+   {
+     if (fread (rawdata, sizeof (char), *filesize, fp) != (size_t) *filesize) 
+      {
+        rax_free (rawdata);
+        rawdata = NULL;
+      }
+     else
+      {
+        rawdata[*filesize] = 0;
+      }
+   }
+
+  fclose (fp);
+
+  return (rawdata);
+}
+
+static void getInnerBranchEndPointsRecursive (nodeptr p, int tips, int * i, node **nodes)
+{
+  int which;
+  /* handle the two nodes reached via p->next and p->next->next, in that order */
+  for (which = 0; which < 2; which++)
+   {
+     nodeptr q = which ? p->next->next : p->next;
+
+     if (isTip (q->back->number, tips)) continue;   /* a tip behind q ends the descent */
+     nodes[*i] = q; ++*i;                           /* record q in the next free slot */
+     getInnerBranchEndPointsRecursive(q->back, tips, i, nodes);
+   }
+}
+
+node ** pllGetInnerBranchEndPoints (pllInstance * tr)
+{
+  int filled = 0;     /* number of slots written by the traversal */
+  node ** result;
+
+  /* zero-initialized array of tr->mxtips - 3 node pointers */
+  result = (node **) rax_calloc(tr->mxtips - 3, sizeof(node *));
+
+  /* tr->start has to be a tip; the traversal begins at the node behind it */
+  assert (isTip(tr->start->number, tr->mxtips));
+  getInnerBranchEndPointsRecursive(tr->start->back, tr->mxtips, &filled, result);
+
+  /* the array is returned to the caller without being freed here */
+  return result;
+}
+
+#if defined WIN32 || defined _WIN32 || defined __WIN32__
+void* rax_calloc(size_t count, size_t size) { /* zeroed allocation; NULL if count * size overflows or rax_malloc fails */
+	void* res = (size != 0 && count > (size_t)-1 / size) ? NULL : rax_malloc(size * count);
+	if (res) memset(res,0,size * count);
+	return res;
+}
+#endif
+
diff --git a/pruning.cpp b/pruning.cpp
new file mode 100644
index 0000000..2c8fc27
--- /dev/null
+++ b/pruning.cpp
@@ -0,0 +1,183 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "pruning.h"
+#include <algorithm>
+
+/*********************************************
+	class Pruning
+*********************************************/
+/**
+	Steffen Klaere's pruning algorithm
+*/
+void Pruning::run(Params &params, vector<PDTaxaSet> &taxa_set)
+{
+	// min_size is unconditionally overwritten with the requested subset size
+	params.min_size = params.sub_size;
+	// capacity of the ordered leaf list; it shrinks by two for every deleted leaf
+	list_size = 2*(leafNum-params.sub_size)-1;
+	if (!initialset.empty())
+		doInitialSet();
+	buildLeaves();
+	// one front leaf is deleted per pass, counting down from leafNum to sub_size
+	int remaining = leafNum;
+	while (remaining > params.sub_size) {
+		deleteExNode(nearestLeaf());
+		list_size -= 2;
+		remaining--;
+	}
+	taxa_set.resize(1);
+	taxa_set[0].setTree(*this);
+}
+
+void Pruning::doInitialSet() {
+	// height 1 is the marker that addLeaf() tests to keep a node out of the leaf list
+	NodeVector::iterator cur = initialset.begin();
+	while (cur != initialset.end()) (*cur++)->height = 1;
+}
+
+void Pruning::printLeaves()
+{
+	// one output line per stored leaf: its id, then the length held by its first neighbor
+	LeafSet::iterator cur = leaves.begin();
+	while (cur != leaves.end()) {
+		cout << (*cur)->id << " " << (*cur)->neighbors[0]->length << endl;
+		++cur;
+	}
+}
+
+
+void Pruning::buildLeaves(Node *node, Node *dad)
+{
+	if (!node) node = root;	// a NULL node means: start the traversal at the root
+	if (node->isLeaf())
+		addLeaf(node);	// addLeaf() itself decides whether the leaf is actually stored
+	FOR_NEIGHBOR_IT(node, dad, it)	// presumably visits every neighbor of node except dad - TODO confirm macro
+		buildLeaves((*it)->node, node);
+}
+
+LeafSet::iterator Pruning::findNode(Node *node)
+{
+	// scan the run of entries that compare equivalent to node for this very pointer
+	pair<LeafSet::iterator, LeafSet::iterator> same = leaves.equal_range(node);
+	LeafSet::iterator cur = same.first;
+	while (cur != same.second && *cur != node)
+		++cur;
+	return (cur != same.second) ? cur : leaves.end();
+}
+
+void Pruning::deleteExNode(LeafSet::iterator pos)
+{
+	// delete from the tree: node is the leaf at pos, innode the node behind its first neighbor
+	Node *node = *pos;
+	Node *innode = node->neighbors[0]->node;
+	Node *othernodes[2] = { NULL, NULL };
+	int i;
+	NeighborVec::iterator it;
+	double length = 0;
+	// should_merge stays true unless a third neighbor other than node is met below
+	bool should_merge = true;
+	//for (it = innode->neighbors.begin(); it != innode->neighbors.end(); it++)
+		//if ((*it)->node != node)
+	FOR_NEIGHBOR(innode, node, it)	{	// presumably the macro form of the commented-out loop above - TODO confirm
+		length += (*it)->length;
+		if (othernodes[0] == NULL)
+			othernodes[0] = (*it)->node;
+		else if (othernodes[1] == NULL)
+			othernodes[1] = (*it)->node;
+		else
+			should_merge = false;
+	}
+	// NOTE(review): if innode has fewer than two other neighbors, othernodes[1] is still NULL and is dereferenced below
+	if (should_merge)
+	{	
+		// merge two branches; leaf ends are taken out of the set first and re-added further down
+		for (i = 0; i < 2; i++)
+			if (othernodes[i]->isLeaf()) {
+				LeafSet::iterator temp = findNode(othernodes[i]);
+				if (temp != leaves.end())
+					leaves.erase(temp);
+			}
+
+		for (i = 0; i < 2; i++)
+			for (it = othernodes[i]->neighbors.begin(); it != othernodes[i]->neighbors.end(); it++)
+				if ((*it)->node == innode)
+				{
+					(*it)->node = othernodes[1-i];	// point past innode straight at the other end
+					(*it)->length = length;	// the merged branch carries the summed length
+				}
+	} else {
+		// simply delete the neighbor of innode that points to node
+		for (it = innode->neighbors.begin(); it != innode->neighbors.end(); it++)
+			if ((*it)->node == node) {
+				innode->neighbors.erase(it);
+				break;
+			}
+	}
+	// delete the element at pos from the leaf set
+	leaves.erase(pos);
+	if (should_merge)
+	for (i = 0; i < 2; i++)
+		if (othernodes[i]->isLeaf())
+			addLeaf(othernodes[i]);
+
+	// also delete the last element if necessary, i.e. if the set holds more than list_size entries
+	if (leaves.size() > list_size && leaves.size() > 1) {
+		LeafSet::iterator last = leaves.end();
+		last--;
+		leaves.erase(last);
+	}	
+
+	if (node == root)
+	{	// find another root: the first element of the leaf set takes over
+		root = *(leaves.begin());
+	}
+}
+
+LeafSet::iterator Pruning::nearestLeaf()
+{
+	return leaves.begin();	// the first element of the ordered leaf set
+}
+
+
+/**
+	insert a leaf into the LeafSet
+*/
+void Pruning::addLeaf(Node* leaf) {
+	// leaves marked with height 1 (the initial set) are never stored,
+	// and a non-positive capacity leaves no room for anything
+	if (leaf->height == 1 || list_size <= 0)
+		return;
+
+	// room left: simply store the leaf
+	if (leaves.size() < list_size) {
+		leaves.insert(leaf);
+		return;
+	}
+
+	// list is full: the new leaf replaces the last entry only if the
+	// length held by its first neighbor is strictly smaller
+	LeafSet::iterator tail = leaves.end();
+	--tail;
+	if (leaf->neighbors[0]->length < (*tail)->neighbors[0]->length) {
+		leaves.erase(tail);
+		leaves.insert(leaf);
+	}
+}
+
diff --git a/pruning.h b/pruning.h
new file mode 100644
index 0000000..8e59dab
--- /dev/null
+++ b/pruning.h
@@ -0,0 +1,112 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef PRUNING_H
+#define PRUNING_H
+
+#include "pdtree.h"
+
+/**
+Implementation of Pruning algorithm with complexity O(n*log(n-k))
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class Pruning : public PDTree
+{
+public:
+	/**
+		construct from program parameters
+		@param params program parameters
+	*/
+    Pruning(Params &params) : 
+		PDTree(params) {}
+
+	/**
+		constructor, get from another tree
+		@param tree another PDTree
+	*/
+    Pruning(PDTree &tree) : 
+		PDTree(tree) {}
+
+	/**
+		default constructor; list_size is not initialized here, run() assigns it
+	*/
+	Pruning() : PDTree() {};
+
+	/**
+		run the algorithm
+		@param params program parameters
+		@param taxa_set (OUT) vector of PD sets
+	*/
+	void run(Params &params, vector<PDTaxaSet> &taxa_set);
+
+	/**
+		delete an external node 
+		@param pos the position of the node in the LeafSet
+	*/
+	void deleteExNode(LeafSet::iterator pos);
+
+	/**
+		build the list of all leaves into field leaves.
+		@param node the starting node, NULL to start from the root
+		@param dad dad of the node, used to direct the search
+	*/
+	void buildLeaves(Node *node = NULL, Node *dad = NULL);
+
+	/**
+		print all leaves to the screen
+	*/
+	void printLeaves();
+
+	/**
+		find the iterator to the leaf node in leaves field
+		@param node a leaf node.
+		@return iterator to the leaf node in leaves field, or leaves.end() if it is not stored there
+	*/
+	LeafSet::iterator findNode(Node *node);
+
+	/**
+		@return leaves.begin().
+	*/
+	LeafSet::iterator nearestLeaf();
+
+	/**
+		mark the nodes in the initial set to be not PRUNABLE (their height is set to 1)
+	*/
+	void doInitialSet();
+
+	/**
+		insert a leaf into the LeafSet
+		@param leaf leaf node to be inserted
+	*/
+	void addLeaf(Node* leaf);
+
+	/**
+		leaf set of the tree, used for pruning algorithm
+	*/
+	LeafSet leaves;
+
+	/**
+		maximum size of the ordered list of leaves
+	*/
+	int list_size;
+
+};
+
+#endif
diff --git a/split.cpp b/split.cpp
new file mode 100644
index 0000000..74a1506
--- /dev/null
+++ b/split.cpp
@@ -0,0 +1,572 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "split.h"
+
+Split::Split()
+		: vector<UINT>(), ntaxa(0), weight(0.0)
+{
+	// an empty split: no taxa, no bit words, zero weight
+	// (the remaining members are default-constructed)
+}
+
+Split::Split(int antaxa, double aweight)
+		: vector<UINT>(), weight(aweight)
+{
+	// setNTaxa() stores the taxon count and sizes the cleared bit vector
+	setNTaxa(antaxa);
+}
+
+/**
+	copy constructor: duplicates every field of sp
+	@param sp split to be copied from
+*/
+Split::Split(const Split &sp)
+		: vector<UINT>(sp) // the base-class copy duplicates the bit words
+{
+	ntaxa = sp.ntaxa;
+	weight = sp.weight;
+	// the name must travel with the split as well, otherwise copies lose it
+	name = sp.name;
+}
+
+
+
+/**
+	construct the split from a list of taxon ids
+	@param antaxa number of taxa
+	@param aweight weight of split
+	@param taxa_list ids (0..antaxa-1) of the taxa on one side of the split
+
+	The stored side is normalised with shouldInvert()/invert():
+	- a list holding more than half of the taxa is replaced by its complement;
+	- a list holding exactly half of the taxa is replaced by its complement
+	  only if taxon 0 is not in the list.
+	Duplicate ids in taxa_list are harmless: each id only ORs its bit
+	into the bit vector.
+*/
+Split::Split(int antaxa, double aweight, vector<int> taxa_list)
+		: vector<UINT>()
+{
+	ntaxa = antaxa;
+	weight = aweight;
+
+	// one bit per taxon, rounded up to whole UINT words, all cleared
+	resize((ntaxa + UINT_BITS -1) / UINT_BITS, 0);
+
+	for (vector<int>::iterator it = taxa_list.begin(); it != taxa_list.end(); it++)
+	{
+		int value = *it;
+		// an id outside 0..ntaxa-1 would address a word outside the vector
+		assert(value >= 0 && value < ntaxa);
+		int bit_pos = value / UINT_BITS;
+		int bit_off = value % UINT_BITS;
+		// shift an unsigned one: for bit_off == UINT_BITS-1 a signed
+		// (1 << bit_off) would shift into the sign bit of int
+		(*this)[bit_pos] |= ((UINT) 1) << bit_off;
+	}
+
+	// keep the canonical side of the split
+	if (shouldInvert())
+		invert();
+}
+
+void Split::invert() {
+	for (iterator uit = begin(); uit != end(); uit++) {
+		// bits used in this word; a zero remainder means the last word is full
+		int num_bits = (uit+1 == end() && ntaxa % UINT_BITS) ? ntaxa % UINT_BITS : UINT_BITS;
+		UINT mask = (num_bits == UINT_BITS) ? ~(UINT)0 : (((UINT)1 << num_bits) - 1);
+		*uit = mask & ~(*uit); // flip taxon bits, keep padding bits cleared
+	}
+}
+
+
+bool Split::shouldInvert() {
+	// compare twice the member count against the total number of taxa
+	int doubled = countTaxa() * 2;
+	if (doubled != ntaxa)
+		return doubled > ntaxa;
+	// exactly half: invert only when taxon 0 is on the other side
+	return !containTaxon(0);
+}
+
+
+/**
+	set the number of taxa and reset the split to the empty set
+	@param antaxa number of taxa
+*/
+void Split::setNTaxa(int antaxa)
+{
+	ntaxa = antaxa;
+	// one bit per taxon, rounded up to a whole number of UINT words
+	size_type num_words = (ntaxa + UINT_BITS - 1) / UINT_BITS;
+	assign(num_words, (UINT) 0); // sizes the vector and clears every word
+}
+
+int Split::countTaxa() const {
+	int count = 0;
+	const size_t nwords = size();
+	for (size_t i = 0; i < nwords; i++) {
+		const UINT word = (*this)[i];
+		// unsigned mask: a signed (1 << j) would shift into the sign bit
+		for (UINT j = 0; j < UINT_BITS && i*UINT_BITS+j < (size_t)getNTaxa(); j++)
+			if (word & ((UINT)1 << j)) count++;
+	}
+	return count;
+}
+
+// Write one line: the weight, a tab, then the 0-based id of every member
+// taxon (each id followed by a single space).
+void Split::report(ostream &out)
+{
+	out << getWeight() << '\t';
+	const size_t ntax = (size_t) getNTaxa();
+	for (size_t i = 0; i < size(); i++)
+		for (UINT j = 0; j < UINT_BITS && (i*UINT_BITS+j < ntax); j++)
+			// unsigned mask: a signed (1 << j) would shift into the sign bit
+			if ((*this)[i] & ((UINT) 1 << j))
+				out << i * UINT_BITS + j << " ";
+	out << endl;
+}
+
+
+int Split::firstTaxon() {
+	const size_t ntax = (size_t) getNTaxa();
+	for (size_t i = 0; i < size(); i++) {
+		if ((*this)[i] == 0) continue; // no member in this word
+		// unsigned mask: a signed (1 << j) would shift into the sign bit
+		for (UINT j = 0; j < UINT_BITS && (i*UINT_BITS+j < ntax); j++)
+			if ((*this)[i] & ((UINT) 1 << j)) return (int) (i * UINT_BITS + j);
+	}
+	return -1; // empty set
+}
+
+bool Split::isEmpty() {
+	// empty means that no word has any bit set
+	iterator it = begin();
+	while (it != end() && *it == 0) it++;
+	return it == end();
+}
+
+
+/**
+	Two splits are compatible when at least one of the four intersections
+	A&B, A&~B, ~A&B, ~A&~B is empty (A = this split, B = sp).
+	@param sp the other split
+	@return true if this split is compatible with sp
+*/
+bool Split::compatible(Split &sp)
+{
+	assert(sp.size() == size() && sp.ntaxa == ntaxa); // same size required
+
+	UINT res = 0, res2 = 0, res3 = 0, res4 = 0;
+	for (iterator it = begin(), sit = sp.begin(); it != end(); it++, sit++)
+	{
+		// bits used in this word; a zero remainder means the last word is full
+		int num_bits = (it+1 == end() && ntaxa % UINT_BITS) ? ntaxa % UINT_BITS : UINT_BITS;
+		// mask built in UINT: signed (1 << (num_bits-1)) breaks for 0 and UINT_BITS
+		UINT mask = (num_bits == UINT_BITS) ? ~(UINT)0 : (((UINT)1 << num_bits) - 1);
+		UINT it2 = mask & ~(*it);   // complement of this split in this word
+		UINT sit2 = mask & ~(*sit); // complement of sp in this word
+
+		res |= (*it) & (*sit);
+		res2 |= (it2) & (sit2);
+		res3 |= (*it) & (sit2);
+		res4 |= (it2) & (*sit);
+		if (res != 0 && res2 != 0 && res3 != 0 && res4 != 0) return false;
+	}
+	return true;
+}
+
+/**
+	A split is preserved in taxa_set when taxa_set has at least one taxon
+	inside this split and at least one taxon outside of it.
+	@param taxa_set set of taxa
+	@return true if this split is preserved in the set taxa_set
+*/
+bool Split::preserved(Split &taxa_set)
+{
+	// both bit vectors must describe the same taxon universe
+	assert(taxa_set.size() == size() && taxa_set.ntaxa == ntaxa);
+
+	bool seen_inside = false;  // some taxon of taxa_set lies in this split
+	bool seen_outside = false; // some taxon of taxa_set lies outside of it
+	iterator sit = taxa_set.begin();
+	for (iterator it = begin(); it != end(); ++it, ++sit)
+	{
+		const UINT shared = (*it) & (*sit);
+		if (shared != 0) seen_inside = true;
+		if (shared != (*sit)) seen_outside = true;
+		if (seen_inside && seen_outside)
+			return true;
+	}
+	return false;
+}
+
+/**
+	Test whether exactly one taxon stands alone on one side of the split.
+
+	All ntaxa bit positions are scanned while counting the set bits
+	(bit1s) and the cleared bits (bit0s) and remembering the position of
+	the first bit of each kind. The scan stops early as soon as both
+	counters exceed one, because the split cannot be trivial any more.
+
+	Examples with four taxa: the set {2} yields 2, the set {0,1,3} also
+	yields 2 (the single taxon outside), and the set {0,1} yields -1.
+
+	@return the id of the first set bit if exactly one bit is set,
+		otherwise the id of the first cleared bit if exactly one bit is
+		cleared, otherwise -1
+*/
+int Split::trivial() {
+	int id0 = 0, id1 = 0, pos = 0;
+	int bit0s = 0, bit1s = 0;
+
+	for (iterator it = begin(); it != end(); it++, pos++) {
+		UINT content = *it;
+
+		// number of taxon bits stored in this word
+		int max_step = UINT_BITS;
+		if ((it + 1) == end()) {
+			// the last word may be partially used; a zero remainder
+			// means that it is completely used
+			max_step = ntaxa % UINT_BITS;
+			if (!max_step) max_step = UINT_BITS;
+		}
+
+		for (int i = 0; i < max_step; i++) {
+			// shift an unsigned one: a signed (1 << i) would shift into
+			// the sign bit of int for the top bit of the word
+			if ((content & ((UINT) 1 << i)) != 0) {
+				bit1s ++;
+				if (bit1s == 1)
+					id1 = pos * UINT_BITS + i;
+			}
+			else {
+				bit0s ++;
+				if (bit0s == 1)
+					id0 = pos * UINT_BITS + i;
+			}
+			// more than one bit of each kind: not trivial
+			if (bit1s > 1 && bit0s > 1)
+				return -1;
+		}
+	}
+
+	// a single set bit takes precedence over a single cleared bit
+	if (bit1s == 1)
+		return id1;
+	else if (bit0s == 1)
+		return id0;
+	else
+		return -1;
+}
+
+
+/**
+	add a taxon into the split (no effect if it is already a member)
+	@param tax_id id of taxon from 0..ntaxa-1
+*/
+void Split::addTaxon(int tax_id)
+{
+	assert(tax_id >= 0 && tax_id < ntaxa);
+	// word index / bit offset; the one is unsigned so the top bit is a valid shift
+	(*this)[tax_id / UINT_BITS] |= (UINT) 1 << (tax_id % UINT_BITS);
+}
+
+/**
+	remove a taxon from the split (no effect if it is not a member)
+	@param tax_id id of taxon from 0..ntaxa-1
+*/
+void Split::removeTaxon(int tax_id)
+{
+	assert(tax_id >= 0 && tax_id < ntaxa);
+	int pos = tax_id / UINT_BITS, off = tax_id % UINT_BITS;
+	// clear one bit with an unsigned mask instead of signed -1 - (1 << off)
+	(*this)[pos] &= ~((UINT) 1 << off);
+}
+
+/**
+	membership test
+	@param tax_id id of taxon from 0..ntaxa-1
+	@return true if tax_id is in the set
+*/
+bool Split::containTaxon(int tax_id)
+{
+	assert(tax_id >= 0 && tax_id < ntaxa);
+	// unsigned mask: a signed (1 << off) would shift into the sign bit
+	return ((*this)[tax_id / UINT_BITS] & ((UINT) 1 << (tax_id % UINT_BITS))) != 0;
+}
+
+// Collect the ids of all member taxa, in increasing order, into invec.
+void Split::getTaxaList(vector<int> &invec) {
+	invec.clear();
+	int tax = 0; // global taxon id of bit i in the current word
+	for (iterator it = begin(); it != end(); it++)
+		for (int i = 0; i < UINT_BITS && tax < ntaxa; i++, tax++)
+			// unsigned mask: a signed (1 << i) would shift into the sign bit
+			if ((*it) & ((UINT) 1 << i)) invec.push_back(tax);
+}
+
+
+// Split the ids 0..ntaxa-1 by membership: members to invec, others to outvec.
+void Split::getTaxaList(vector<int> &invec, vector<int> &outvec) {
+	invec.clear();
+	outvec.clear();
+	int tax = 0; // global taxon id of bit i in the current word
+	for (iterator it = begin(); it != end(); it++)
+		for (int i = 0; i < UINT_BITS && tax < ntaxa; i++, tax++) {
+			// unsigned mask: a signed (1 << i) would shift into the sign bit
+			const bool inside = ((*it) & ((UINT) 1 << i)) != 0;
+			(inside ? invec : outvec).push_back(tax);
+		}
+}
+
+bool Split::operator<(const Split &sp) const {
+	return sp.countTaxa() > countTaxa(); // order splits by number of member taxa
+}
+
+// Union: add every taxon of sp to this split.
+Split &Split::operator+=(Split &sp) {
+	assert(sp.ntaxa == ntaxa);
+	iterator other = sp.begin();
+	for (iterator mine = begin(); mine != end(); ++mine, ++other)
+		*mine = (*mine) | (*other);
+	return *this;
+}
+
+// Intersection: keep only the taxa that are also contained in sp.
+Split &Split::operator*=(Split &sp) {
+	assert(sp.ntaxa == ntaxa);
+	iterator other = sp.begin();
+	for (iterator mine = begin(); mine != end(); ++mine, ++other)
+		*mine = (*mine) & (*other);
+	return *this;
+}
+
+// Set difference: drop every taxon that is contained in sp.
+Split &Split::operator-=(Split &sp) {
+	assert(sp.ntaxa == ntaxa);
+	iterator other = sp.begin();
+	for (iterator mine = begin(); mine != end(); ++mine, ++other)
+		*mine = (*mine) & ~(*other);
+	return *this;
+}
+
+// Equality compares the taxon count and the bit words; the weight is ignored.
+bool Split::operator==(const Split &sp) const{
+	if (ntaxa != sp.ntaxa) return false;
+	const_iterator lhs = begin(), rhs = sp.begin();
+	while (lhs != end() && (*lhs) == (*rhs)) { ++lhs; ++rhs; }
+	return lhs == end();
+}
+
+// True when every taxon of this split is also contained in sp.
+bool Split::subsetOf (Split &sp) {
+	assert(ntaxa == sp.ntaxa);
+	iterator mine = begin(), other = sp.begin();
+	for (; mine != end(); ++mine, ++other) if ((*mine) & ~(*other)) return false;
+	return true;
+}
+
+Split &Split::operator= (const Split &sp) {
+	assert(ntaxa == sp.ntaxa); // assignment never changes the taxon count
+	vector<UINT>::operator= (sp);
+	weight = sp.weight; name = sp.name; // the name is part of the copied state
+	return *this;
+}
+
+/*
+void Split::copy(const Split &sp) {
+	assert(ntaxa == sp.ntaxa);
+	for (iterator it = begin(), it2 = sp.begin(); it != end(); it++, it2++)
+		(*it) = (*it2);
+	weight = sp.weight;
+}
+*/
+
+// Add randomly chosen taxa until the set holds `size` members; taxa that
+// are already in the set are kept and count towards `size`.
+void Split::randomize(int size) {
+	assert(size < ntaxa);
+	int num = countTaxa();
+	const int MAX_STEP = 20; // upper bound on the number of passes
+	const int PROB_STEP = 5; // raise the probability after PROB_STEP and 2*PROB_STEP passes
+	for (int step = 0; step < MAX_STEP && num < size; step++) {
+		// probability of including a taxon that is still outside
+		double prob = (double)(size - num) / ntaxa;
+		// be more aggressive after many passes; the last pass takes everything
+		if (step >= PROB_STEP) prob *= 2.0;
+		if (step >= PROB_STEP*2) prob *= 2.0;
+		if (step == MAX_STEP - 1) prob = 1.0;
+		// scan all taxa and pick the missing ones at random
+		for (int cnt = 0; cnt < ntaxa && num < size; cnt++)
+			if (!containTaxon(cnt) && ( random_double() <= prob )) {
+				addTaxon(cnt);
+				num++;
+			}
+	}
+	if (num >= size) return;
+	// the count is streamed right before " taxa.", hence the leading blank
+	cerr << "WARNING: random set has less than " << size << " taxa." << endl;
+}
+
+
+// True when the two sets share at least one taxon.
+bool Split::overlap(Split &sp) {
+	assert(ntaxa == sp.ntaxa);
+	iterator mine = begin(), other = sp.begin();
+	// stop at the first word with a common bit
+	while (mine != end() && ((*mine) & (*other)) == 0) { ++mine; ++other; }
+	return mine != end();
+}
+
+
+// Empty destructor body: no explicit clean-up is performed here.
+Split::~Split() {}
+
+bool Split::containAny(IntVector &tax_id) {
+	// linear scan that stops at the first id found in this set
+	IntVector::iterator it = tax_id.begin();
+	while (it != tax_id.end() && !containTaxon(*it)) it++;
+	return it != tax_id.end();
+}
+
+// Project this split onto the taxa of taxa_mask, renumbered 0,1,2,...; returns a new Split.
+Split *Split::extractSubSplit(Split &taxa_mask) {
+	assert(taxa_mask.getNTaxa() == getNTaxa());
+	Split *sub = new Split(taxa_mask.countTaxa());
+	int next_id = 0; // id of the next selected taxon inside sub
+	for (int tax = 0; tax < ntaxa; tax++) {
+		if (!taxa_mask.containTaxon(tax)) continue;
+		if (containTaxon(tax)) sub->addTaxon(next_id);
+		next_id++;
+	}
+	assert(next_id == sub->getNTaxa());
+	return sub;
+}
+
+
+/**
+	Solve k-means problem for one-dimensional data with dynamic programming
+	@param n number of data points, n >= 1
+	@param ncat number of clusters (at most n); 0 means one cluster per data point
+	@param data data point of size n: x[0..n-1]
+	@param center (OUT) output k centers of k clusters: center[0...k-1] will be filled
+	@param cluster (OUT) cluster assignments for each data point: cluster[0...n-1] will be filled
+	@return the minimum sum of squares over all k clusters
+*/
+double kMeansOneDim(int n, int ncat, double *data, double *center, int *cluster) {
+	int i, j, m, k = ncat;
+	if (ncat == 0) k = n;
+	// the tables below are indexed up to [k-1][n-1] and row i needs i+1 points
+	assert(n >= 1 && k >= 1 && k <= n);
+
+	/**
+		dynamic programming cost matrix,
+		c[i][j] = cost of i+1 clusters for {x[0]...x[j]}
+	*/
+	double **c = new double*[k];
+	/**
+		id is used to trace back the minimal solution:
+		id[i][j] = index of the last point before the (i+1)-th cluster, -1 if none
+	*/
+	int **id = new int*[k];
+	/**
+		c1[i][j] = cost of 1 cluster for {x[i]...x[j]}
+	*/
+	double **c1 = new double*[n];
+	/**
+		m1[i][j] = mean of {x[i]...x[j]}
+	*/
+	double **m1 = new double*[n];
+
+	double *x = new double[n]; // sorted data points
+
+	// allocate the rows; c, id, c1 and m1 hold row pointers and are therefore
+	// allocated as pointer arrays, which also matches their delete [] below
+	for (i = 0; i < k; i++) c[i] = new double[n];
+	for (i = 0; i < k; i++) id[i] = new int[n];
+	for (i = 0; i < n; i++) c1[i] = new double[n];
+	for (i = 0; i < n; i++) m1[i] = new double[n];
+
+	// first sort data into x
+	memmove(x, data, sizeof(double)*n);
+	std::sort(x, x+n);
+	// first compute c1 matrix
+	for (i = 0; i < n; i++) {
+		double sum = 0.0;
+		for (j = i; j < n; j++) {
+			sum += x[j];
+			double mean = sum / (j-i+1);
+			m1[i][j] = mean;
+			double ss = 0;
+			for (m = i; m <= j; m++)
+				ss += (x[m]-mean)*(x[m]-mean); // sum of squared difference
+			c1[i][j] = ss;
+		}
+	}
+
+	/* now compute dynamic programming matrix */
+	// initialize the 1st row
+	for (j = 0; j < n; j++) {
+		c[0][j] = c1[0][j];
+		id[0][j] = -1;
+	}
+	for (i = 1; i < k; i++) {
+		// no i+1 clusters exist for less than i+1 data points
+		for (j = 0; j < i; j++) { c[i][j] = INFINITY; id[i][j] = -1; }
+		for (j = i; j < n; j++) {
+			c[i][j] = INFINITY;
+			// the last cluster is {x[m+1]...x[j]}, the first i clusters cover x[0..m]
+			for (m = i-1; m < j; m++)
+				if (c[i][j] > c[i-1][m] + c1[m+1][j]) {
+					c[i][j] = c[i-1][m] + c1[m+1][j];
+					id[i][j] = m;
+				}
+		}
+	}
+
+	double min_cost = c[k-1][n-1];
+	int *bound = new int[k+1];
+	// now trace back: cluster i covers x[bound[i]+1 ... bound[i+1]]
+	bound[k] = n-1;
+	for (i = k-1; i >= 0; i--) {
+		bound[i] = id[i][bound[i+1]];
+	}
+
+	for (i = 0; i < k; i++) {
+		center[i] = m1[bound[i]+1][bound[i+1]];
+		for (j = 0; j < n; j++)
+			if (data[j] <= x[bound[i+1]] && data[j] >= x[bound[i]+1])
+				cluster[j] = i;
+	}
+
+	// free memory
+	delete [] bound;
+	for (i = n-1; i >= 0; i--) delete [] m1[i];
+	for (i = n-1; i >= 0; i--) delete [] c1[i];
+	for (i = k-1; i >= 0; i--) delete [] id[i];
+	for (i = k-1; i >= 0; i--) delete [] c[i];
+
+	delete [] x;
+	delete [] m1;
+	delete [] c1;
+	delete [] id;
+	delete [] c;
+
+	return min_cost;
+}
diff --git a/split.h b/split.h
new file mode 100644
index 0000000..640ac82
--- /dev/null
+++ b/split.h
@@ -0,0 +1,298 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef SPLIT_H
+#define SPLIT_H
+
+#include <vector>
+#include <string>
+#include "tools.h"
+#include "msplitsblock.h"
+
+using namespace std;
+
+
+const int UINT_BITS = static_cast<int>(sizeof(UINT)) * 8; // bits per UINT word, counting 8 bits per byte
+const int BITS_DIV = (sizeof(int) == 2) ? 4 : (sizeof(int) == 4 ? 5 : 6); // 4 or 5 for a 2- or 4-byte int, else 6
+const int BITS_MODULO = UINT_BITS - 1; // one less than the word width
+
+/**
+Defining a split, also a set of taxa, stored as one bit per taxon in a vector of UINT words.
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class Split : public vector<UINT>
+{
+public:
+
+	friend class MSplitsBlock;
+	friend class SplitGraph;
+	friend class PDNetwork;
+	friend class CircularNetwork;
+	friend class PDTree;
+    friend class MTreeSet;
+
+	/**
+		empty constructor
+	*/
+    Split();
+
+	/**
+		constructor
+		@param antaxa number of taxa
+		@param aweight weight of split
+	*/
+	Split(int antaxa, double aweight = 0.0);
+
+
+	/**
+		copy constructor
+		@param sp split to be copied from
+	*/
+	Split(const Split &sp);
+
+	/**
+		construct the split from a taxa list
+		@param antaxa number of taxa
+		@param aweight weight of split
+		@param taxa_list list of taxa in one side of split
+	*/
+    Split(int antaxa, double aweight, vector<int> taxa_list);
+
+	/**
+		print the weight and the ids of the taxa contained in this split
+		@param out the output stream
+	*/	
+	void report(ostream &out);
+
+	/**
+		destructor
+	*/
+    ~Split();
+
+	/**
+		get number of taxa
+		@return number of taxa
+	*/
+	inline int getNTaxa() const {
+		return ntaxa;
+	}
+
+	/**
+		get number of taxa being in the split
+		@return number of taxa
+	*/
+	int countTaxa() const;
+
+	/**
+		copy from another split (disabled, kept for reference)
+	*/
+	//void copy(Split &sp); 
+
+	/**
+		set number of taxa; the split becomes the empty set
+		@param antaxa number of taxa
+	*/
+	void setNTaxa(int antaxa);
+
+	/**
+		get the first taxon in the set
+		@return the first taxon or -1 if empty
+	*/
+	int firstTaxon();
+
+	/**
+		@return TRUE if the set is empty
+	*/
+	bool isEmpty();
+
+	/**
+		get weight
+		@return weight
+	*/
+	inline double getWeight() const {
+		return weight;
+	}
+
+	/**
+		set weight
+		@param aweight the new weight
+	*/
+	inline void setWeight(double aweight) {
+		weight = aweight;
+	}
+
+	/**
+		check whether the split should be inverted (number of taxa > ntaxa / 2)
+		@return TRUE yes, should be inverted
+	*/
+	bool shouldInvert();
+
+	/**
+		invert the split (0->1, 1->0)
+	*/
+	void invert();
+
+	/**
+		@param sp the other split
+		@return true if this split is compatible with sp
+	*/
+	bool compatible(Split &sp);
+
+	/**	
+		@param taxa_set set of taxa
+		@return true if this split is preserved in the set taxa_set
+	*/
+	bool preserved(Split &taxa_set);
+
+	/**
+		if the split is trivial (contains 1 taxon in 1 side), return the taxon id, 
+		otherwise return -1
+		@return the taxon id if the split is trivial 
+	*/
+	int trivial();
+
+	/**
+		add a taxon into the split
+		@param tax_id id of taxon from 0..ntaxa-1
+	*/
+	void addTaxon(int tax_id); 
+
+	/**
+		remove a taxon from the split
+		@param tax_id id of taxon from 0..ntaxa-1
+	*/
+	void removeTaxon(int tax_id); 
+
+	/**
+		@param tax_id id of taxon from 0..ntaxa-1
+		@return true if tax_id is in the set
+	*/
+	bool containTaxon(int tax_id); 
+
+	/**
+		@param tax_id vector of id of taxa from 0..ntaxa-1
+		@return true if SOME taxon in tax_id is in the set
+	*/
+	bool containAny(IntVector &tax_id); 
+
+	/**
+		@param tax_id vector of id of taxa from 0..ntaxa-1
+		@return true if ALL taxa in tax_id are in the set (NOTE(review): no definition in split.cpp - confirm before use)
+	*/
+	bool containAll(IntVector &tax_id); 
+
+	/**
+		get the list of taxa contained in split
+		@param invec (OUT) taxa in this side of split
+	*/
+	void getTaxaList(vector<int> &invec);
+
+	/**
+		get the list of taxa contained in split and not contained in split
+		@param invec (OUT) taxa in this side of split
+		@param outvec (OUT) taxa on the other side
+	*/
+	void getTaxaList(vector<int> &invec, vector<int> &outvec);
+
+	/**
+	 *  Test whether the current split is smaller than \a sp
+	 *  @param sp the other split to compare
+	 *  @return true if the current split contains fewer taxa than \a sp
+	 */
+    bool operator<(const Split &sp) const;
+
+	/**
+		compare two splits, do not compare the weight
+		@param sp the target split
+		@return TRUE if equal, FALSE otherwise
+	*/
+	bool operator==(const Split &sp) const;
+
+	/**
+		add all taxa from another split into this split (union)
+		@param sp a split
+	*/
+	Split &operator+=(Split &sp);
+
+	/**
+		get the intersection with another split
+		@param sp a split
+	*/
+	Split &operator*=(Split &sp);
+
+	/**
+		get the set difference with another split
+		@param sp a split
+	*/
+	Split &operator-=(Split &sp);
+
+	/**
+		@return TRUE if there is overlapped taxon with sp, FALSE otherwise
+		@param sp a split
+	*/
+	bool overlap(Split &sp);
+
+	/**
+		assignment: copies the taxa and the weight of sp
+		@param sp a split with the same number of taxa
+	*/
+	Split &operator= (const Split &sp);
+
+	/**
+		subset operator
+		@param sp a split
+		@return TRUE if this set is a subset of sp
+	*/
+	bool subsetOf (Split &sp);
+
+
+	/**
+		randomize the set of a specific size
+		@param size number of taxa in the resulting set
+	*/
+	void randomize(int size);
+	/** restrict this split to the taxa of taxa_mask; returns a Split allocated with new */
+	Split *extractSubSplit(Split &taxa_mask);
+	/** @return reference to the name of the split */
+	string &getName() { return name; }
+protected:
+	/**
+		number of taxa
+	*/
+	int ntaxa; 
+
+	/**
+		weight of split
+	*/
+	double weight;
+    
+    /** 2018-08-23: split name */
+    string name;
+
+};
+
+inline int splitweightcmp(const Split* a, const Split* b)
+{
+	return (b->getWeight() < a->getWeight()); // 1 when a is heavier than b, else 0
+}
+
+typedef Split TaxaSet;
+
+#endif
diff --git a/splitgraph.cpp b/splitgraph.cpp
new file mode 100644
index 0000000..915b08d
--- /dev/null
+++ b/splitgraph.cpp
@@ -0,0 +1,703 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "splitgraph.h"
+#include <iostream>
+#include <fstream>
+#include <cctype>
+#include <algorithm>
+#include "node.h"
+#include "ncl/ncl.h"
+#include "myreader.h"
+#include "mtree.h"
+#include "mtreeset.h"
+
+
+bool compareSplit(Split* sp1, Split* sp2) {
+	// primary key: number of member taxa; tie-break: smallest member id
+	int n1 = sp1->countTaxa(), n2 = sp2->countTaxa();
+	if (n1 == n2)
+		return sp1->firstTaxon() < sp2->firstTaxon();
+	return n1 < n2;
+}
+
+//#define MY_DEBUG
+/********************************************************
+	Defining SplitGraph methods
+********************************************************/
+SplitGraph::SplitGraph()
+		: vector<Split*>()
+{	// nothing is owned yet: every pointer starts out as NULL
+	taxa = NULL;
+	splits = NULL;
+	pda = NULL;
+	sets = NULL;
+	trees = NULL;
+	mtrees = NULL;
+	areas_boundary = NULL;
+}
+
+// Construct the graph and load its content right away through init().
+SplitGraph::SplitGraph(Params &params)
+		: vector<Split*>() { init(params); }
+
+void SplitGraph::createBlocks() { // allocate the taxa, splits, pda, sets and trees blocks
+	taxa = new NxsTaxaBlock();
+	splits = new MSplitsBlock(this);
+	pda = new MPdaBlock(this);
+	sets = new MSetsBlock();
+	trees = new TreesBlock(taxa); // constructed from the taxa block created above
+	// mtrees is not touched here
+}
+
+
+void SplitGraph::init(Params &params) // load splits, taxa and optional taxa sets as directed by params
+{
+	mtrees = NULL;
+	if (params.intype == IN_NEWICK) {
+		// read the input file, can contain more than 1 tree
+		mtrees = new MTreeSet(params.user_file, params.is_rooted, params.tree_burnin, params.tree_max_count);
+		//mtree = new MTree(params.user_file, params.is_rooted);
+		// rooted input: both size parameters are increased by one
+		if (params.is_rooted) {
+			params.sub_size++;
+			params.min_size++;
+		}
+		if (mtrees->isRooted() && params.root != NULL)
+			outError(ERR_CONFLICT_ROOT);
+		// NOTE(review): presumably fills *this with the splits of the trees - confirm in MTreeSet
+		mtrees->convertSplits(*this, params.split_threshold, params.split_weight_summary, params.split_weight_threshold);
+
+		if (verbose_mode >= VB_DEBUG)
+			saveFileStarDot(cout);
+	} else {
+		createBlocks(); // the blocks registered with the reader below
+		if (params.is_rooted) 
+			outError(ERR_ROOT_NET);
+	
+	 	cout << "Reading input file " << params.user_file << "..." << endl;
+
+		MyReader nexus(params.user_file);
+	
+		nexus.Add(taxa);
+		nexus.Add(splits);
+		nexus.Add(pda);
+		nexus.Add(sets);
+		nexus.Add(trees);
+
+		MyToken token(nexus.inf);
+		nexus.Execute(token);
+		if (trees->GetNumTrees() > 0) { 
+			if (getNSplits() > 0) 
+				outError("Ambiguous input file, pls only specify either SPLITS block or TREES block");
+			convertFromTreesBlock(params.tree_burnin, params.tree_max_count, params.split_threshold, 
+				params.split_weight_summary, params.split_weight_threshold, params.tree_weight_file);
+		}
+
+	}
+	
+	if (verbose_mode >= VB_DEBUG)
+		taxa->Report(cout);
+	//splits->Report(cout);
+	//reportConflict(cout);
+	if (params.pdtaxa_file != NULL) { // taxa sets come from a separate file
+		if (sets->getNSets() > 0)
+			outError("Taxa sets were already specified in the input file");
+		cout << "Reading taxa sets in file " << params.pdtaxa_file << "..." << endl;
+	
+		bool nexus_formated = (detectInputFile(params.pdtaxa_file) == IN_NEXUS);
+		if (nexus_formated) {
+			MyReader nexus(params.pdtaxa_file);
+			nexus.Add(sets);
+			MyToken token(nexus.inf);
+			nexus.Execute(token);
+		} else {
+			readTaxaSets(params.pdtaxa_file, sets);
+		}
+		if (sets->getNSets() == 0)
+			outError("No taxa sets found");
+	}
+
+	areas_boundary = NULL;
+	if (params.areas_boundary_file) {
+		if (sets->getNSets() == 0) outError("No taxon sets defined yet");
+		areas_boundary = new double [sets->getNSets() * sets->getNSets()]; // one entry per ordered pair of sets
+		cout << "Reading sets relation file " << params.areas_boundary_file << "..." << endl;
+		readAreasBoundary(params.areas_boundary_file, sets, areas_boundary);
+	}
+
+	if (verbose_mode >= VB_DEBUG && sets->getNSets() > 0)
+		sets->Report(cout);
+
+	if (sets->getNSets() > 0 && taxa->GetNumTaxonLabels() == 0) {
+		AddTaxaFromSets();
+	}
+	if (taxa->GetNumTaxonLabels() == 0)
+		outError("No taxa found");
+	if (getNSplits() == 0) {
+		// no splits is not an error: fall back to a star tree
+		createStarTree();
+	}
+	cout << getNTaxa()-params.is_rooted <<
+		" taxa and " << getNSplits()-params.is_rooted << " splits." << endl; // is_rooted is subtracted from both counts
+
+}
+
+// Number of splits for which Split::trivial() reports a taxon id (>= 0).
+int SplitGraph::getNTrivialSplits() {
+	int num_trivial = 0;
+	for (iterator sp = begin(); sp != end(); ++sp)
+		num_trivial += ((*sp)->trivial() >= 0) ? 1 : 0;
+	return num_trivial;
+}
+
+
+// Fill the graph with one weight-1 split per taxon, each holding that taxon only.
+void SplitGraph::createStarTree() {
+	cout << "No splits found, creating a star tree with branch length of 1..." << endl;
+	const int num_taxa = taxa->GetNumTaxonLabels();
+	for (int tax = 0; tax < num_taxa; ++tax) {
+		push_back(new Split(num_taxa, 1.0));
+		back()->addTaxon(tax);
+	}
+	cout << "NOTE: subsequent PD will correspond to species richness." << endl;
+}
+
+
+void SplitGraph::AddTaxaFromSets() {
+	cout << "Taking taxa from SETS block..." << endl;
+	for (int set_id = 0; set_id < sets->getNSets(); set_id++) {
+		vector<string> &names = sets->getSet(set_id)->taxlist; // only unknown names are added
+		for (vector<string>::iterator it = names.begin(); it != names.end(); it++)
+			if (!taxa->IsAlreadyDefined(NxsString(it->c_str())))
+				taxa->AddTaxonLabel(NxsString(it->c_str()));
+	}
+}
+
+void SplitGraph::freeMem() {
+	// release the owned splits, last one first
+	for (reverse_iterator it = rbegin(); it != rend(); it++) delete *it;
+	clear();
+	// areas_boundary comes from new[] in init(), so it needs delete[];
+	// every pointer is reset so that a repeated call cannot free twice
+	delete [] areas_boundary; areas_boundary = NULL;
+	delete trees; trees = NULL;
+	delete sets; sets = NULL;
+	delete pda; pda = NULL;
+	delete splits; splits = NULL;
+	delete taxa; taxa = NULL;
+	delete mtrees; mtrees = NULL;
+}
+
+// The destructor releases everything through freeMem().
+SplitGraph::~SplitGraph() {
+	freeMem();
+}
+
+
+// Read the trees of the TREES block (after the burn-in, at most max_count of
+// them) into a new MTreeSet and pass it to MTreeSet::convertSplits with this graph.
+void SplitGraph::convertFromTreesBlock(int burnin, int max_count, double split_threshold,
+	int split_weight_summary, double weight_threshold, const char *tree_weight_file) {
+	cout << trees->GetNumTrees() << " tree(s) loaded" << endl;
+	if (burnin >= trees->GetNumTrees())
+		outError("Burnin value is too large");
+	if (burnin > 0)
+		cout << burnin << " beginning tree(s) discarded" << endl;
+	mtrees = new MTreeSet();
+
+	for (int i = burnin; i < trees->GetNumTrees() && (i < burnin+max_count); i++) {
+		stringstream newick(trees->GetTranslatedTreeDescription(i), ios::in | ios::out | ios::app);
+		newick << ";";
+		MTree *mtree = mtrees->newTree();
+		bool rooted = trees->IsRootedTree(i);
+		mtree->readTree(newick, rooted);
+		mtrees->push_back(mtree);
+		mtrees->tree_weights.push_back(1); // default weight of one per tree
+	}
+	mtrees->checkConsistency();
+
+	// NOTE(review): presumably overwrites the default weights - confirm in readIntVector
+	if (tree_weight_file)
+		readIntVector(tree_weight_file, burnin, max_count, mtrees->tree_weights);
+
+	if (mtrees->size() != mtrees->tree_weights.size())
+		outError("Tree file and tree weight file have different number of entries");
+	mtrees->convertSplits(*this, split_threshold, split_weight_summary, weight_threshold);
+}
+
+
+
+// Print the number of splits, then one numbered line per split.
+// NOTE: the splits are sorted in place with compareSplit before printing.
+void SplitGraph::report(ostream &out)
+{
+	out << endl << "Split network contains ";
+
+	if (empty())
+	{
+		out << "no split" << endl;
+		return;
+	}
+
+	if (size() == 1)
+		out << "one split" << endl;
+	else
+		out << size() << " splits" << endl;
+
+	// order: fewer taxa first, ties broken by the first taxon
+	sort(begin(), end(), compareSplit);
+	int index = 1; // splits are numbered from 1 in the listing
+	for (iterator it = begin(); it != end(); ++it, ++index)
+	{
+		out << '\t' << index << '\t';
+		(*it)->report(out);
+	}
+}
+
+// Per split: its 1-based index, then the indices of all other compatible splits.
+void SplitGraph::reportConflict(ostream &out)
+{
+	out << "Compatible splits: " << endl;
+	int row = 1;
+	for (iterator i = begin(); i != end(); ++i, ++row)
+	{
+		out << row << '\t';
+		int col = 1;
+		for (iterator j = begin(); j != end(); ++j, ++col) {
+			if (j == i) continue; // a split is not compared with itself
+			if ((*i)->compatible(*(*j))) out << col << " ";
+		}
+		out << endl;
+	}
+}
+
+/**
+	calculate sum of weights of preserved splits in the taxa_set
+	@param taxa_set a set of taxa
+	@return the sum of getWeight() over all splits preserved in taxa_set
+*/
+double SplitGraph::calcWeight(Split &taxa_set)
+{
+	double total = 0.0;
+	for (iterator sp = begin(); sp != end(); ++sp)
+		if ((*sp)->preserved(taxa_set)) total += (*sp)->getWeight();
+	return total;
+}
+
+// Count the splits that are preserved in taxa_set.
+int SplitGraph::countSplits(Split &taxa_set)
+{
+	int num_preserved = 0;
+	for (iterator sp = begin(); sp != end(); ++sp)
+		if ((*sp)->preserved(taxa_set)) ++num_preserved;
+	return num_preserved;
+}
+
+// Count the non-trivial splits (trivial() < 0) that are preserved in taxa_set.
+int SplitGraph::countInternalSplits(Split &taxa_set)
+{
+	int num_internal = 0;
+	for (iterator sp = begin(); sp != end(); ++sp)
+		if ((*sp)->trivial() < 0 && (*sp)->preserved(taxa_set)) ++num_internal;
+	return num_internal;
+}
+
+/**
+	calculate sum of weights of all splits
+*/
+double SplitGraph::calcWeight() {
+	double total = 0.0;
+	iterator sp = begin();
+	while (sp != end()) total += (*sp++)->weight;
+	return total;
+}
+
+// Sum of the weights of all trivial splits (trivial() >= 0).
+double SplitGraph::calcTrivialWeight() {
+	double total = 0.0;
+	for (iterator sp = begin(); sp != end(); ++sp)
+		if ((*sp)->trivial() >= 0) total += (*sp)->weight;
+	return total;
+}
+
+/**
+	@return the largest split weight; the search starts from -1e6, which is
+	therefore also the value returned when there is no split
+*/
+double SplitGraph::maxWeight() {
+	double largest = -1e6;
+	for (iterator sp = begin(); sp != end(); ++sp) {
+		const double w = (*sp)->weight;
+		largest = (largest < w) ? w : largest;
+	}
+	return largest;
+}
+
+/**
+	generate pairs of random taxa sets with a given number of taxa in common
+	and write them to a file. Every repetition draws 2*size - overlap distinct
+	taxa at random; the first `size` of them form the first set, the last
+	`size` of them the second set, so that `overlap` taxa belong to both.
+	Each set is written as its size followed by one taxon label per line.
+	@param filename output file name
+	@param size size of each taxa set
+	@param overlap number of taxa common to both sets (must not exceed size)
+	@param times number of pairs to generate
+*/
+void SplitGraph::generateTaxaSet(char *filename, int size, int overlap, int times) {
+	ofstream out(filename);
+	if (!out.is_open())
+		outError(ERR_WRITE_OUTPUT, filename);
+	assert(overlap <= size);
+	int total = 2*size - overlap;
+	int ntaxa = getNTaxa();
+	// The rejection sampling below needs `total` distinct taxa; with fewer taxa
+	// available it would loop forever, so refuse such a request up front.
+	if (total > ntaxa)
+		outError("Not enough taxa to generate taxa sets with the requested size and overlap");
+	for (int cnt = 0; cnt < times; cnt++) {
+		// draw `total` distinct random taxon indices
+		IntVector ranvec;
+		BoolVector occur(ntaxa, false);
+		int i;
+		for (i = 0; i < total; i++) {
+			int rnum;
+			do { rnum = random_int(ntaxa); } while (occur[rnum]);
+			ranvec.push_back(rnum);
+			occur[rnum] = true;
+		}
+		// now write the first set: indices [0, size)
+		out << size << endl;
+		for (i = 0; i < size; i++)
+			out << taxa->GetTaxonLabel(ranvec[i]) << endl;
+		out << endl;
+		// now write the second set: indices [size-overlap, total)
+		out << size << endl;
+		for (i = size-overlap; i < total; i++)
+			out << taxa->GetTaxonLabel(ranvec[i]) << endl;
+		out << endl;
+	}
+	out.close();
+}
+
+/**
+	calculate the distance matrix and print it to a file in phylip .dist
+	format: the number of taxa, then one row per taxon consisting of its
+	label followed by its distances to all taxa
+	@param filename output file name
+*/
+void SplitGraph::calcDistance(char *filename) {
+	ofstream out(filename);
+	if (!out.is_open())
+		outError(ERR_WRITE_OUTPUT, filename);
+
+	matrix(double) dist;
+	calcDistance(dist);
+
+	const int ntaxa = getNTaxa();
+	out << ntaxa << endl;
+
+	for (int row = 0; row < ntaxa; row++) {
+		out << taxa->GetTaxonLabel(row) << "   ";
+		for (int col = 0; col < ntaxa; col++)
+			out << dist[row][col] << "  ";
+		out << endl;
+	}
+	out.close();
+}
+
+/**
+	calculate the distance matrix induced by the split system: the distance
+	between two taxa is the total weight of the splits whose two taxa lists
+	(as returned by Split::getTaxaList) place them on different sides
+	@param dist (OUT) ntaxa x ntaxa distance matrix; any previous content is discarded
+*/
+void SplitGraph::calcDistance(matrix(double) &dist) {
+	int ntaxa = getNTaxa();
+	iterator it;
+	vector<int> vi, vj;
+	vector<int>::iterator i, j;
+
+	// assign() rather than resize(): resize() keeps the existing entries of a
+	// matrix that is already populated, and the accumulation below would then
+	// add the split weights on top of those stale values.
+	dist.resize(ntaxa);
+	for (matrix(double)::iterator di = dist.begin(); di != dist.end(); di++)
+		di->assign(ntaxa, 0.0);
+
+	for (it = begin(); it != end(); it++) {
+		(*it)->getTaxaList(vi, vj);
+		// every pair taken from the two lists is separated by this split
+		for (i = vi.begin(); i != vi.end(); i++)
+			for (j = vj.begin(); j != vj.end(); j++) {
+				dist[*i][*j] += (*it)->weight;
+				dist[*j][*i] += (*it)->weight;
+			}
+	}
+
+}
+
+
+/**
+	calculate the distance matrix with rows and columns permuted by taxa_order:
+	entry (i, j) of the result is the distance between taxa taxa_order[i]
+	and taxa_order[j]
+	@param dist (OUT) distance matrix
+	@param taxa_order an order of taxa
+*/
+void SplitGraph::calcDistance(matrix(double) &dist, vector<int> &taxa_order) {
+	const int ntaxa = getNTaxa();
+
+	// distances in the natural taxon order
+	matrix(double) natural_dist;
+	calcDistance(natural_dist);
+
+	dist.resize(ntaxa);
+	for (int row = 0; row < ntaxa; row++) {
+		dist[row].resize(ntaxa);
+		const int src_row = taxa_order[row];
+		for (int col = 0; col < ntaxa; col++)
+			dist[row][col] = natural_dist[src_row][taxa_order[col]];
+	}
+}
+
+/**
+	check a distance matrix against the split system.
+	NOTE: the check is currently switched off by the unconditional return at
+	the top of the body, so this function always returns true and everything
+	after that statement is unreachable.
+	@param dist distance matrix
+	@return always TRUE (see note)
+*/
+bool SplitGraph::checkCircular(matrix(double) &dist) {
+	return true;
+
+	// --- disabled: for every triple of taxa, twice the weight of the splits
+	// preserved in the triple must equal the sum of the pairwise distances ---
+	const int ntaxa = getNTaxa();
+	Split triple(ntaxa, 0.0);
+	for (int a = 0; a < ntaxa-2; a++)
+		for (int b = a+1; b < ntaxa-1; b++)
+			for (int c = b+1; c < ntaxa; c++) {
+				triple.addTaxon(a);
+				triple.addTaxon(b);
+				triple.addTaxon(c);
+				triple.weight = calcWeight(triple);
+				const double dist_sum = dist[a][b] + dist[a][c] + dist[b][c];
+				if (fabs(2 * triple.weight - dist_sum) > 0.0001) {
+					cout << "Taxa " << a << " " << b << " " << c;
+					cout << " do not satisfy circular equation!" << endl;
+					cout << "Weight = " << triple.weight << endl;
+					cout << "Sum dist/2 = " << dist_sum / 2.0 << endl;
+					cout << "dist = " << dist[a][b] << " " << dist[a][c] << " "
+						 << dist[b][c] << endl;
+					return false;
+				}
+				triple.removeTaxon(a);
+				triple.removeTaxon(b);
+				triple.removeTaxon(c);
+			}
+	return true;
+}
+
+/**
+	generate a random circular split graph and save it in NEXUS format to
+	params.user_file. Taxa are named T1..Tn and placed on the cycle in index
+	order; every split is a run of consecutive taxa with a weight drawn from
+	randomLen(params).
+	@param params program parameters; sub_size gives the number of taxa,
+		num_splits the wanted number of splits (3 * sub_size if not positive)
+*/
+void SplitGraph::generateCircular(Params &params) {
+	const int ntaxa = params.sub_size;
+	int num_splits = params.num_splits;
+	if (num_splits <= 0)
+		num_splits = 3 * ntaxa;
+	if (num_splits < ntaxa)
+		outError(ERR_FEW_SPLITS);
+
+	taxa = new NxsTaxaBlock();
+	splits = new MSplitsBlock(this);
+
+	// a candidate internal split is kept when its random_double() draw does
+	// not exceed this threshold
+	double threshold = 0.0;
+	if (ntaxa > 3)
+		threshold = (double)(num_splits - ntaxa) / (ntaxa*(ntaxa-3)/2);
+
+	// insert all trivial splits, one taxon label and one cycle position each
+	for (int t = 0; t < ntaxa; t++) {
+		Split *leaf_split = new Split(ntaxa, randomLen(params));
+		leaf_split->addTaxon(t);
+		push_back(leaf_split);
+
+		ostringstream label;
+		label << "T" << (t+1);
+		taxa->AddTaxonLabel(NxsString(label.str().c_str()));
+		splits->cycle.push_back(t);
+	}
+
+	// randomly insert internal splits {first..last} until num_splits is reached
+	for (int first = 0; first < ntaxa-2 && getNSplits() < num_splits; first++) {
+		for (int last = first+1; last < ntaxa && last < ntaxa-3+first; last++) {
+			if (random_double() > threshold)
+				continue;
+			Split *block = new Split(ntaxa, randomLen(params));
+			for (int t = first; t <= last; t++)
+				block->addTaxon(t);
+			push_back(block);
+			if (getNSplits() >= num_splits)
+				break;
+		}
+	}
+
+	ofstream out(params.user_file);
+	if (!out.is_open()) {
+		outError(ERR_WRITE_OUTPUT, params.user_file);
+	}
+
+	saveFileNexus(out);
+	out.close();
+}
+
+/**
+	write the split system in NEXUS format: a Taxa block with all taxon
+	labels, followed by a Splits block with (for a circular system) the cycle
+	and one matrix row per split holding its weight and its 1-based taxa
+	@param out output stream
+	@param omit_trivial TRUE to omit trivial splits, FALSE otherwise
+*/
+void SplitGraph::saveFileNexus(ostream &out, bool omit_trivial) {
+	const int ntaxa = getNTaxa();
+
+	// Taxa block
+	out << "#nexus" << endl << endl;
+	out << "BEGIN Taxa;" << endl;
+	out << "DIMENSIONS ntax=" << ntaxa << ";" << endl;
+	out << "TAXLABELS" << endl;
+	for (int t = 0; t < ntaxa; t++)
+		out << "[" << t+1 << "] '" << taxa->GetTaxonLabel(t) << "'" << endl;
+	out << ";" << endl << "END; [Taxa]" << endl << endl;
+
+	// Splits block header
+	out << "BEGIN Splits;" << endl;
+	int nsplits_written = getNSplits();
+	if (omit_trivial)
+		nsplits_written -= getNTrivialSplits();
+	out << "DIMENSIONS ntax=" << ntaxa << " nsplits=" << nsplits_written << ";" << endl;
+	out << "FORMAT labels=no weights=yes confidences=no intervals=no;" << endl;
+	if (isCircular()) {
+		out << "CYCLE";
+		for (int pos = 0; pos < ntaxa; pos++)
+			out << " " << splits->cycle[pos] + 1;
+		out << ";" << endl;
+	}
+
+	// one matrix row per split
+	out << "MATRIX" << endl;
+	for (iterator sp = begin(); sp != end(); ++sp) {
+		if (omit_trivial && (*sp)->trivial() >= 0)
+			continue;
+		out << "\t" << (*sp)->weight << "\t";
+		for (int t = 0; t < ntaxa; t++) {
+			if ((*sp)->containTaxon(t))
+				out << " " << t+1;
+		}
+		out << "," << endl;
+	}
+	out << ";" << endl << "END; [Splits]" << endl << endl;
+}
+
+/**
+	write the split system in star-dot format (eg **...*): one line per split
+	with one character per taxon, a tab and the split weight. The pattern is
+	oriented so that taxon 0 is always printed as '*'.
+	@param out output stream
+	@param omit_trivial TRUE to omit trivial splits, FALSE otherwise
+*/
+void SplitGraph::saveFileStarDot(ostream &out, bool omit_trivial) {
+	const int ntaxa = getNTaxa();
+	for (iterator sp = begin(); sp != end(); ++sp) {
+		if (omit_trivial && (*sp)->trivial() >= 0)
+			continue;
+		// when taxon 0 is not in the split, swap the two symbols
+		const bool has_first = (*sp)->containTaxon(0);
+		const char inside_mark = has_first ? '*' : '.';
+		const char outside_mark = has_first ? '.' : '*';
+		for (int t = 0; t < ntaxa; t++)
+			out << ((*sp)->containTaxon(t) ? inside_mark : outside_mark);
+		out << "\t" << (*sp)->weight << endl;
+	}
+}
+
+/**
+	save the split system to a file. Any stream failure while opening or
+	writing is reported through outError(ERR_WRITE_OUTPUT, out_file).
+	@param out_file output file name
+	@param file_format IN_NEXUS to write NEXUS format, anything else for star-dot format
+	@param omit_trivial TRUE to omit trivial splits, FALSE otherwise
+*/
+void SplitGraph::saveFile(const char* out_file, InputType file_format, bool omit_trivial) {
+    try {
+        ofstream out;
+        // make open/write failures throw so that they end up in the handler below
+        out.exceptions(ios::failbit | ios::badbit);
+        out.open(out_file);
+        if (file_format == IN_NEXUS)
+            saveFileNexus(out, omit_trivial);
+        else
+            saveFileStarDot(out, omit_trivial);
+        out.close();
+    } catch (const ios::failure &) {
+        // caught by const reference: no copy of the exception, no slicing
+        outError(ERR_WRITE_OUTPUT, out_file);
+    }
+}
+
+/**
+	multiply the weight of every split by a norm factor, optionally rounding
+	@param norm factor every weight is multiplied with
+	@param make_int TRUE to round the scaled weights to integers (precision is then ignored)
+	@param precision number of decimal digits to round to, negative for no rounding
+*/
+void SplitGraph::scaleWeight(double norm, bool make_int, int precision) {
+	if (make_int) {
+		for (iterator sp = begin(); sp != end(); ++sp)
+			(*sp)->setWeight( round((*sp)->getWeight()*norm) );
+		return;
+	}
+
+	if (precision < 0) {
+		for (iterator sp = begin(); sp != end(); ++sp)
+			(*sp)->setWeight( (*sp)->getWeight()*norm );
+		return;
+	}
+
+	// round to `precision` decimals: shift left, round, shift back
+	const double decimal_shift = pow((double)10.0, precision);
+	for (iterator sp = begin(); sp != end(); ++sp)
+		(*sp)->setWeight( round((*sp)->getWeight()*norm*decimal_shift)/decimal_shift );
+}
+
+/**
+	search the split system for a split, in either orientation
+	@param sp target split to search for
+	@return TRUE if some split equals sp or the inverted copy of sp
+*/
+bool SplitGraph::containSplit(Split &sp) {
+	Split inverted(sp);
+	inverted.invert();
+
+	for (iterator it = begin(); it != end(); ++it) {
+		Split &candidate = *(*it);
+		if (candidate == sp)
+			return true;
+		if (candidate == inverted)
+			return true;
+	}
+	return false;
+}
+
+/**
+	compute the boundary length of a set of areas from areas_boundary, read
+	as an nareas x nareas row-major table: the diagonal entry of every
+	selected area is added, and twice the entry of every selected pair
+	(i < j) is subtracted
+	@param area a set of area IDs
+	@return boundary length, 0 if areas_boundary is not set
+*/
+double SplitGraph::computeBoundary(Split &area) {
+	if (!areas_boundary)
+		return 0.0;
+
+	const int nareas = sets->getNSets();
+	double boundary = 0.0;
+	for (int a = 0; a < nareas; a++) {
+		if (!area.containTaxon(a))
+			continue;
+		boundary += areas_boundary[a*nareas+a];
+		for (int b = a+1; b < nareas; b++) {
+			if (area.containTaxon(b))
+				boundary -= 2.0 * areas_boundary[a*nareas+b];
+		}
+	}
+	return boundary;
+}
+
+/**
+	check the compatibility of sp against all splits in this graph
+	@param sp the target split
+	@return TRUE if sp is compatible with every split here (also when there is no split), otherwise FALSE
+*/
+bool SplitGraph::compatible(Split *sp) {
+	iterator it = begin();
+	// stop at the first split that conflicts with sp
+	while (it != end() && (*it)->compatible(*sp))
+		++it;
+	return it == end();
+}
+
+/**
+	greedily collect compatible splits: the splits are visited in the order
+	produced by sorting with splitweightcmp, and a copy of each split that is
+	compatible with everything collected so far is appended to maxsg
+	@param maxsg (OUT) set of compatible splits in a split graph class; missing
+		taxa/splits/pda blocks are created, the taxon labels and the cycle
+		are copied from this graph
+*/
+void SplitGraph::findMaxCompatibleSplits(SplitGraph &maxsg) {
+
+	const int ntaxa = getNTaxa();
+	// maximum number of compatible splits = 2n-3!
+	const int max_splits = ntaxa * 2 - 3;
+
+	// sorted view on the splits of this graph (by weight in descending order);
+	// the pointers are still owned by this graph
+	SplitSet by_weight;
+	by_weight.insert(by_weight.end(), begin(), end());
+	sort(by_weight.begin(), by_weight.end(), splitweightcmp);
+
+	// make sure the target graph has all its blocks
+	if (!maxsg.taxa)
+		maxsg.taxa = new NxsTaxaBlock();
+	if (!maxsg.splits)
+		maxsg.splits = new MSplitsBlock(&maxsg);
+	if (!maxsg.pda)
+		maxsg.pda = new MPdaBlock(&maxsg);
+
+	for (int t = 0; t < ntaxa; t++)
+		maxsg.taxa->AddTaxonLabel(taxa->GetTaxonLabel(t));
+
+	// make the cycle
+	maxsg.splits->cycle = splits->cycle;
+
+	// make the splits
+	for (SplitSet::iterator cand = by_weight.begin(); cand != by_weight.end(); ++cand) {
+		if (!maxsg.compatible(*cand))
+			continue;
+		maxsg.push_back(new Split(*(*cand)));
+		if (maxsg.size() >= max_splits)
+			break;
+	}
+
+	// Must stay: ~SplitSet() deletes every element it still holds, and these
+	// pointers belong to this graph. Emptying the view first prevents that.
+	by_weight.clear();
+}
+
+/**
+	test every triple of splits: the triple passes when at least one of four
+	three-way intersections (built with Split::operator*= after inverting
+	some of the three splits) is empty.
+	NOTE(review): only one family of four side combinations is tested per
+	triple; whether the complementary family also needs testing should be
+	confirmed against the intended definition of weak compatibility.
+	@return TRUE if all triples pass (always for fewer than 3 splits)
+*/
+bool SplitGraph::isWeaklyCompatible() {
+	const int nsplits = getNSplits();
+	if (nsplits < 3) return true;
+
+	// which of the three splits are inverted right before each of the four
+	// tests; the inversions accumulate from one test to the next
+	static const bool flip[4][3] = {
+		{false, false, false},
+		{true,  true,  false},
+		{false, true,  true },
+		{true,  true,  false}
+	};
+
+	for (int a = 0; a < nsplits-2; a++)
+		for (int b = a+1; b < nsplits-1; b++)
+			for (int c = b+1; c < nsplits; c++) {
+				// work on copies, the stored splits must not be inverted
+				Split s1(*(*this)[a]);
+				Split s2(*(*this)[b]);
+				Split s3(*(*this)[c]);
+				bool found_empty = false;
+				for (int test = 0; test < 4 && !found_empty; test++) {
+					if (flip[test][0]) s1.invert();
+					if (flip[test][1]) s2.invert();
+					if (flip[test][2]) s3.invert();
+					Split common(s1);
+					common *= s2;
+					common *= s3;
+					found_empty = common.isEmpty();
+				}
+				if (!found_empty)
+					return false;
+			}
+	return true;
+}
+
+
+/**
+	collect the labels of all taxa, in taxon index order
+	@param taxname (OUT) taxon labels; any previous content is discarded
+*/
+void SplitGraph::getTaxaName(vector<string> &taxname) {
+	taxname.clear();
+	const int ntaxa = getNTaxa();
+	for (int t = 0; t < ntaxa; t++) {
+		taxname.push_back(taxa->GetTaxonLabel(t));
+	}
+}
+
+/**
+	look up a taxon by its label
+	@param name a name string
+	@return ID of the first taxon whose label equals name, -1 if not found
+*/
+int SplitGraph::findLeafName(string &name) {
+	const int ntaxa = getNTaxa();
+	int id = 0;
+	while (id < ntaxa && !(taxa->GetTaxonLabel(id) == name))
+		id++;
+	return (id < ntaxa) ? id : -1;
+}
diff --git a/splitgraph.h b/splitgraph.h
new file mode 100644
index 0000000..6002a67
--- /dev/null
+++ b/splitgraph.h
@@ -0,0 +1,412 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef SPLITGRAPH_H
+#define SPLITGRAPH_H
+
+#include <list>
+#include <vector>
+#include <string>
+#include "split.h"
+#include "ncl/ncl.h"
+#include "msplitsblock.h"
+#include "mpdablock.h"
+#include "msetsblock.h"
+#include "node.h"
+#include "splitset.h"
+#include "mtree.h"
+
+
+class MTreeSet;
+
+using namespace std;
+
+/**
+SplitGraph class: a split system stored as a vector of Split pointers,
+together with the taxa, splits, PDA, sets and trees blocks that belong to it.
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class SplitGraph : public vector<Split*>
+{
+public:
+
+	friend class MTree;
+	friend class MTreeSet;
+	friend class ECOpd;
+
+/********************************************************
+	CONSTRUCTORs, INITIALIZATION AND DESTRUCTORs
+********************************************************/
+
+	/**
+		empty constructor
+	*/
+    SplitGraph();
+
+	/**
+		construct split graph from the parameters by calling init(params).
+		@param params program parameters
+	*/
+    SplitGraph(Params &params);
+
+	/**
+		init split graph from the parameters
+		@param params program parameters
+	*/
+    void init(Params &params);
+	
+	/**
+		if no taxa block found, but the sets block is present, then 
+		this function will be invoked. It takes the taxa names from the sets block.
+	*/
+	void AddTaxaFromSets();
+
+	/**
+		NOTE(review): the original description was left unfinished ("this
+		function is invoked"). Judging by the name it presumably builds a
+		star tree; confirm against the implementation and complete this text.
+	*/
+	void createStarTree();
+
+
+	/**
+		new all blocks: taxa, splits, pda
+	*/
+	void createBlocks();
+
+	/** free allocated memory, called by destructor */
+	void freeMem();
+	
+	/**
+		destructor
+	*/
+    virtual ~SplitGraph();
+
+	/**
+		convert the collection of trees in TREES block into this split graph
+		@param burnin the number of beginning trees to be discarded
+		@param max_count max number of trees to consider
+		@param split_threshold only keep those splits which appear more than this threshold 
+		@param split_weight_summary passed on to MTreeSet::convertSplits; presumably
+			selects how split weights are summarized over the trees (TODO confirm)
+		@param weight_threshold minimum weight cutoff
+		@param tree_weight_file file with the tree weights; it is an error if the
+			number of weights differs from the number of trees
+	*/
+	void convertFromTreesBlock(int burnin, int max_count, double split_threshold, 
+		int split_weight_summary, double weight_threshold, const char *tree_weight_file);
+
+/********************************************************
+	PRINT INFORMATION
+********************************************************/
+
+	/**
+		print infos of split graph: the number of splits and one line per split.
+		The splits are sorted in place (compareSplit) before being printed.
+		@param out the output stream
+	*/	
+	void report(ostream &out);
+
+	/**
+		print infos of compatibility graph of splits: for every split, the
+		1-based indices of the other splits that are compatible with it
+		@param out the output stream
+	*/	
+	void reportConflict(ostream &out);
+
+/********************************************************
+	GET INFORMATION
+********************************************************/
+
+	/**
+		calculate sum of weights of all splits
+	*/
+	double calcWeight();
+
+
+	/**
+		calculate sum of weights of all trivial splits
+	*/
+	double calcTrivialWeight();
+
+	/**
+		calculate sum of weights of preserved splits in the taxa_set
+		@param taxa_set a set of taxa
+	*/
+	double calcWeight(Split &taxa_set);
+
+	/**
+		count how many splits are covered by the taxon set
+		@param taxa_set a set of taxa
+	*/
+	int countSplits(Split &taxa_set);
+
+	/**
+		count how many internal splits are covered by the taxon set
+		@param taxa_set a set of taxa
+	*/
+	int countInternalSplits(Split &taxa_set);
+
+	/**
+		generate pairs of random taxa set with overlap of taxa in common
+		and write them to a file
+		@param filename output file name
+		@param size size of the taxa set
+		@param overlap number of taxa common in both sets (at most size)
+		@param times number of times repeated
+	*/
+	void generateTaxaSet(char *filename, int size, int overlap, int times);
+
+	/**
+		scale the weight of all splits to a norm factor
+		@param norm normalized factor
+		@param make_int TRUE to round weights to int, FALSE otherwise
+		@param precision numerical precision, default (-1) for no rounding;
+			ignored when make_int is TRUE
+	*/
+	void scaleWeight(double norm, bool make_int = false, int precision = -1);
+
+
+	/**
+		@return TRUE if split sp (or its inverted copy) is contained in the split system
+		@param sp target split to search for
+	*/
+	bool containSplit(Split &sp);
+
+	/**
+		compute the boundary length of the area set, using areas_boundary variable
+		@param area a set of area ID
+		@return boundary length, 0 if areas_boundary is not set
+	*/
+	double computeBoundary(Split &area);
+
+	 /**
+	  @return max split weight (-1e6 if there is no split)
+	 */
+	double maxWeight();
+	
+	/**
+	 * @param name a name string
+	 * @return ID of leaf corresponding to name, -1 if not found
+	 */
+	int findLeafName(string &name);
+
+/********************************************************
+	compatibility
+********************************************************/
+
+	/**
+		find a set of compatible splits greedily: the splits are visited in
+		the order given by sorting with splitweightcmp, and each one that is
+		compatible with all splits chosen so far is copied into maxsg
+		@param maxsg (OUT) set of compatible splits in a split graph class
+	*/
+	void findMaxCompatibleSplits(SplitGraph &maxsg);
+
+	/**
+		check the compatibility of sp against all splits in this set
+		@param sp the target split
+		@return TRUE if sp is compatible with all splits here, otherwise FALSE
+	*/
+	bool compatible(Split *sp);
+
+/********************************************************
+	OTHER STUFFS
+********************************************************/
+
+	/**
+		@return number of taxa, read from the first split; the graph must
+		contain at least one split (checked with assert)
+	*/
+	int getNTaxa() {
+		assert(size() > 0);
+		return (*begin())->ntaxa;
+	}
+
+	/**
+		@return number of areas, i.e. the number of sets in the SETS block
+	*/
+	int getNAreas() {
+		return sets->getNSets();
+	}
+
+	/**
+		@return number of splits
+	*/
+	int getNSplits() {
+		return size();
+	}
+
+	/**
+		@return number of trivial splits
+	*/
+	int getNTrivialSplits();
+
+	/**
+		@return taxa block
+	*/
+	NxsTaxaBlock *getTaxa() {
+		return taxa;
+	}
+
+	void getTaxaName(vector<string> &taxname); // (OUT) taxname: labels of all taxa, in taxon index order
+
+	/**
+		@return splits block
+	*/
+	MSplitsBlock *getSplitsBlock() {
+		return splits;
+	}
+
+	/**
+		@return PDA block
+	*/
+	MPdaBlock *getPdaBlock() {
+		return pda;
+	}
+
+	/**
+		@return SETS block
+	*/
+	MSetsBlock *getSetsBlock() {
+		return sets;
+	}
+
+	/**
+		@return TREES block
+	*/
+	NxsTreesBlock *getTreesBlock() {
+		return trees;
+	}
+
+	MTreeSet *getMTrees() {
+		return mtrees;
+	}
+
+	/**
+		@return TRUE if splits graph is circular, i.e. the splits block
+		holds a non-empty cycle
+	*/
+	bool isCircular() {
+		return splits->cycle.size() != 0;
+	}
+
+	/**
+		@return TRUE if split system is weakly compatible
+	*/
+	bool isWeaklyCompatible();
+
+	/**
+		@return TRUE if it is the cost-constrained PD problem
+	*/
+	bool isBudgetConstraint() {
+		return pda->cost_constrained;
+	}
+
+	/**
+		@return TRUE if the distance matrix is consistent with a circular
+		splits graph. NOTE: the implementation currently returns TRUE
+		unconditionally; the actual check is disabled.
+		@param mat distance matrix
+	*/
+	bool checkCircular(matrix(double) &mat);
+
+	/**
+		get the ID of the taxon around the circle in a circular splits graph
+		@param i a position, 0 <= i < number of taxa (checked with assert)
+		@return the entry of the cycle at position i
+	*/
+	int getCircleId(int i) {
+		assert(i >= 0 && i < getNTaxa());
+		return splits->cycle[i];
+	}
+
+	/**
+		generate a random circular split graph and save it to params.user_file
+		@param params program parameters
+	*/
+	void generateCircular(Params &params);
+
+	/**
+		save split systems to a file in NEXUS format
+		@param out output stream
+		@param omit_trivial TRUE to omit trivial splits, FALSE otherwise
+	*/
+	void saveFileNexus(ostream &out, bool omit_trivial = false);
+
+	/**
+		save split systems to a file in star-dot format (eg **...*)
+		@param out output stream
+		@param omit_trivial TRUE to omit trivial splits, FALSE otherwise
+	*/
+	void saveFileStarDot(ostream &out, bool omit_trivial = false);
+
+	/**
+		save split systems to a file
+		@param out_file output file name
+		@param file_format IN_NEXUS for NEXUS format, anything else for star-dot format
+		@param omit_trivial TRUE to omit trivial splits, FALSE otherwise
+	*/
+	void saveFile(const char* out_file, InputType file_format, bool omit_trivial = false);
+
+	/**
+		calculate the distance matrix, print to file in phylip format
+		@param filename output file name
+	*/
+	void calcDistance(char *filename);
+
+
+	/**
+		calculate the distance matrix
+		@param dist (OUT) distance matrix
+	*/
+	void calcDistance(matrix(double) &dist);
+
+	/**
+		calculate the distance matrix, based on the taxa_order
+		@param dist (OUT) distance matrix; entry (i, j) is the distance
+			between taxa taxa_order[i] and taxa_order[j]
+		@param taxa_order an order of taxa
+	*/
+	void calcDistance(matrix(double) &dist, vector<int> &taxa_order);
+
+
+protected:
+
+	/**
+		taxa block
+	*/
+	NxsTaxaBlock *taxa;
+
+	/**
+		splits block
+	*/
+	MSplitsBlock *splits;
+
+	/**
+		PDA block
+	*/
+	MPdaBlock *pda;
+
+	
+	/**
+		SETS block
+	*/
+	MSetsBlock *sets;
+
+	/**
+		relationship between the sets. For example, the common boundary length between two areas.
+		Read by computeBoundary() as an nareas x nareas row-major table; may be NULL.
+	*/
+	double *areas_boundary;
+
+	/**
+		TREES block
+	*/
+	NxsTreesBlock *trees;
+
+	/**
+		storing set of trees if the split graph is converted from it
+	*/
+	MTreeSet *mtrees;
+
+};
+
+#endif
diff --git a/splitset.cpp b/splitset.cpp
new file mode 100644
index 0000000..555b1a1
--- /dev/null
+++ b/splitset.cpp
@@ -0,0 +1,59 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "splitset.h"
+
+/**
+	construct an empty set of splits
+*/
+SplitSet::SplitSet() : vector<Split*>() {
+	// nothing to set up beyond the empty base vector
+}
+
+/**
+	@return the weight of the first split in the vector, 0 if the set is empty
+*/
+double SplitSet::getWeight() {
+	if (!empty())
+		return front()->getWeight();
+	return 0;
+}
+
+
+/**
+	delete all element splits (last to first), then resize to 0
+*/
+void SplitSet::removeAll() {
+	for (reverse_iterator sp = rbegin(); sp != rend(); ++sp) {
+		delete *sp; // deleting a NULL entry is a no-op
+	}
+	clear();
+}
+
+
+/**
+	check the compatibility of sp against all splits in this set
+	@param sp the target split
+	@return TRUE if sp is compatible with every split here (also when the set is empty), otherwise FALSE
+*/
+bool SplitSet::compatible(Split *sp) {
+	iterator it = begin();
+	// stop at the first split that conflicts with sp
+	while (it != end() && (*it)->compatible(*sp))
+		++it;
+	return it == end();
+}
+
+
+/**
+	destructor: deletes every split still stored in the set
+*/
+SplitSet::~SplitSet() {
+	removeAll();
+}
+
+
diff --git a/splitset.h b/splitset.h
new file mode 100644
index 0000000..890337d
--- /dev/null
+++ b/splitset.h
@@ -0,0 +1,57 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef SPLITSET_H
+#define SPLITSET_H
+
+#include <vector>
+#include "split.h"
+
+/**
+Vector of Splits that owns its elements: the destructor deletes every Split
+pointer that is still stored.
+
+NOTE: no copy constructor or assignment operator is declared here, so a copied
+set shares the same pointers with the original. A set that only borrows
+pointers owned elsewhere must be emptied with clear() before it is destroyed.
+
+@author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
+*/
+class SplitSet : public vector<Split*>
+{
+public:
+    SplitSet();
+	
+	/**
+		release the memory of all element splits, then resize to 0
+	*/
+	void removeAll();
+	
+	/**
+		get the weight of the first split in the vector
+		@return weight of the first split, 0 if the vector is empty
+	*/
+	double getWeight();
+
+	/**
+		check the compatibility of sp against all splits in this set
+		@param sp the target split
+		@return TRUE if sp is compatible with all splits here, otherwise FALSE
+	*/
+	bool compatible(Split *sp);
+
+    virtual ~SplitSet(); // calls removeAll(): deletes all stored splits
+
+};
+
+#endif
diff --git a/stoprule.cpp b/stoprule.cpp
new file mode 100644
index 0000000..858ad19
--- /dev/null
+++ b/stoprule.cpp
@@ -0,0 +1,501 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "stoprule.h"
+#include "timeutil.h"
+
+/**
+	constructor: a fixed-iteration stopping rule with default thresholds
+	and no time limit (both time fields are -1)
+*/
+StopRule::StopRule()
+{
+	// stopping criterion
+	stop_condition = SC_FIXED_ITERATION;
+
+	// iteration bounds and steps
+	min_iteration = 0;
+	max_iteration = 0;
+	unsuccess_iteration = 100;
+	step_iteration = 100;
+
+	// statistical thresholds
+	confidence_value = 0.95;
+	min_correlation = 0.99;
+
+	// run-time limit
+	start_real_time = -1.0;
+	max_run_time = -1.0;
+
+	// running state
+	predicted_iteration = 0;
+	curIteration = 0;
+}
+
+/**
+	take over the stopping-rule settings from the program parameters and
+	record the current real time as the start time
+	@param params program parameters
+*/
+void StopRule::initialize(Params &params) {
+	// stopping criterion
+	stop_condition = params.stop_condition;
+
+	// iteration bounds and steps
+	min_iteration = params.min_iterations;
+	max_iteration = params.max_iterations;
+	unsuccess_iteration = params.unsuccess_iteration;
+	step_iteration = params.step_iterations;
+
+	// statistical thresholds
+	confidence_value = params.stop_confidence;
+	min_correlation = params.min_correlation;
+
+	// run-time limit: maxtime is in minutes, max_run_time in seconds
+	start_real_time = getRealTime();
+	max_run_time = params.maxtime * 60;
+}
+
+/**
+	destructor: nothing to release explicitly
+*/
+StopRule::~StopRule() {
+}
+//
+//int StopRule::getNumIterations() {
+//	if (stop_condition == SC_FIXED_ITERATION || predicted_iteration == 0)
+//		return min_iteration;
+//	return predicted_iteration;
+//}
+
+//int StopRule::getPredictedIteration(int cur_iteration) {
+//	double realtime_secs = getRealTime() - start_real_time;
+//
+//	switch (stop_condition) {
+//	case SC_FIXED_ITERATION:
+//		return min_iteration;
+//	case SC_WEIBULL:
+//		if (predicted_iteration == 0)
+//			return min_iteration;
+//		else
+//			return predicted_iteration;
+//	case SC_UNSUCCESS_ITERATION:
+//		return getLastImprovedIteration() + unsuccess_iteration;
+//	case SC_BOOTSTRAP_CORRELATION:
+//		return ((cur_iteration+step_iteration-1)/step_iteration)*step_iteration;
+//	case SC_REAL_TIME:
+////		return ((max_run_time - realtime_secs)/max_run_time);
+//		assert(0); // TODO
+//		return 0;
+//	}
+//}
+
+/**
+	decide whether the search should stop now, according to stop_condition
+	@param cur_iteration current iteration number
+	@param cur_correlation current correlation value, only used by SC_BOOTSTRAP_CORRELATION
+	@return TRUE if the stop condition is met; FALSE otherwise and for an unknown stop condition
+*/
+bool StopRule::meetStopCondition(int cur_iteration, double cur_correlation) {
+	if (stop_condition == SC_FIXED_ITERATION)
+		return cur_iteration > min_iteration;
+
+	if (stop_condition == SC_WEIBULL) {
+		// no prediction available yet: fall back to the minimum
+		if (predicted_iteration == 0)
+			return cur_iteration > min_iteration;
+		return cur_iteration > predicted_iteration;
+	}
+
+	if (stop_condition == SC_UNSUCCESS_ITERATION)
+		return cur_iteration > getLastImprovedIteration() + unsuccess_iteration;
+
+	if (stop_condition == SC_BOOTSTRAP_CORRELATION) {
+		// hard cap on the number of iterations
+		if (cur_iteration > max_iteration)
+			return true;
+		// otherwise: correlation reached and no recent improvement
+		return (cur_correlation >= min_correlation)
+				&& (cur_iteration > getLastImprovedIteration() + unsuccess_iteration);
+	}
+
+	if (stop_condition == SC_REAL_TIME)
+		return (getRealTime() - start_real_time >= max_run_time);
+
+	return false;
+}
+
+/**
+	estimate the remaining run time. For SC_REAL_TIME this is the time limit
+	minus the elapsed time; for all other conditions the average time of the
+	cur_iteration - 1 finished iterations is extrapolated to the expected
+	total number of iterations.
+	@param cur_iteration current iteration number
+	@param cur_correlation current correlation value (currently unused)
+	@return estimated remaining time in seconds; 0 when there is nothing to
+		extrapolate from (cur_iteration <= 1 or unknown stop condition)
+*/
+double StopRule::getRemainingTime(int cur_iteration, double cur_correlation) {
+	double realtime_secs = getRealTime() - start_real_time;
+	// defaults to "no iterations left" so that it is never read uninitialized
+	int niterations = cur_iteration;
+	switch (stop_condition) {
+	case SC_REAL_TIME:
+		return max_run_time - realtime_secs;
+	case SC_FIXED_ITERATION:
+		niterations = min_iteration;
+		break;
+	case SC_WEIBULL:
+		niterations = (predicted_iteration == 0) ? min_iteration : predicted_iteration;
+		break;
+	case SC_UNSUCCESS_ITERATION:
+		niterations = getLastImprovedIteration() + unsuccess_iteration;
+		break;
+	case SC_BOOTSTRAP_CORRELATION:
+		// next multiple of step_iteration, but not before the unsuccess limit
+		niterations = max(((cur_iteration+step_iteration-1)/step_iteration)*step_iteration, getLastImprovedIteration() + unsuccess_iteration);
+		break;
+	default:
+		// unknown stop condition: keep the default
+		break;
+	}
+	// Without a finished iteration the divisor below would be zero (or
+	// negative) and the estimate inf/NaN or meaningless.
+	if (cur_iteration <= 1)
+		return 0.0;
+	return (niterations - cur_iteration) * realtime_secs / (cur_iteration - 1);
+}
+
+//void StopRule::setStopCondition(STOP_CONDITION sc) {
+//	stop_condition = sc;
+//}
+//
+//void StopRule::setIterationNum(int min_it, int max_it) {
+//	min_iteration = min_it;
+//	max_iteration = max_it;
+//}
+//
+//void StopRule::setConfidenceValue(double confidence_val)
+//{
+//	confidence_value = confidence_val;
+//	assert(confidence_value > 0 && confidence_value < 1);
+//}
+//
+//void StopRule::setUnsuccessIteration(int unsuccess_iteration) {
+//	this->unsuccess_iteration = unsuccess_iteration;
+//}
+//
+//void StopRule::setMinCorrelation(double min_correlation, int step_iteration) {
+//	this->min_correlation = min_correlation;
+//	this->step_iteration = step_iteration;
+//}
+//
+//void StopRule::setRealTime(double start_real_time, double max_un_time) {
+//	this->start_real_time = start_real_time;
+//	this->max_run_time = max_run_time;
+//}
+
+
+/**
+	compute a prediction from the recorded improvement iterations; at least
+	4 records are required
+	@param upperTime (OUT) result of cmpUpperTime for 1 - confidence_value;
+		left untouched when there are fewer than 4 records
+	@return result of cmpExtinctTime, or 0 when there are fewer than 4 records
+*/
+double StopRule::predict (double &upperTime) {
+	if (time_vec.size() < 4)
+		return 0;
+
+	const double extinct_time = cmpExtinctTime (time_vec.size());
+	const double alpha = 1.0 - confidence_value;
+	upperTime = cmpUpperTime (time_vec.size(), alpha);
+	return extinct_time;
+}
+
+/**
+	record an iteration in which an improvement was found (newest first) and,
+	for the SC_WEIBULL stop condition, update predicted_iteration, clamped
+	to the range [min_iteration, max_iteration]
+	@param iteration the iteration number to record
+*/
+void StopRule::addImprovedIteration(int iteration) {
+	// newest record goes to the front, see getLastImprovedIteration()
+	time_vec.insert(time_vec.begin(), iteration);
+
+	// only the Weibull rule works with a predicted iteration count
+	if (stop_condition != SC_WEIBULL)
+		return;
+
+	double upper_bound;
+	if (predict(upper_bound) == 0)
+		return; // no prediction available
+
+	predicted_iteration = upper_bound;
+	if (predicted_iteration > max_iteration)
+		predicted_iteration = max_iteration;
+	if (predicted_iteration < min_iteration)
+		predicted_iteration = min_iteration;
+}
+
+/**
+	@return the most recently recorded improvement iteration (the front of
+	time_vec), 0 if none has been recorded
+*/
+int StopRule::getLastImprovedIteration() {
+	if (!time_vec.empty())
+		return time_vec[0];
+	return 0;
+}
+
+/**
+	Invert a square matrix by LU decomposition with row pivoting (the
+	routine is tagged "luinverse" at its closing brace).
+	@param oriMat input matrix; its top-left size x size part is copied into a work matrix
+	@param invMat (OUT) resized to size x size and filled with the inverse, one column at a time
+	@param size dimension of the matrix
+	NOTE: prints a message and calls exit(1) if some row of oriMat is entirely zero.
+	NOTE(review): the result of calloc() is used without a NULL check.
+*/
+void StopRule::cmpInvMat (DoubleMatrix &oriMat, DoubleMatrix &invMat, int size) {
+	//invMat.setLimit (size, size);
+	double eps = 1.0e-20; /* substituted for a pivot that is exactly zero */
+	int i, j, k, l, maxi=0, idx, ix, jx;
+	double sum, tmp, maxb, aw;
+
+	invMat.resize(size);
+	for (i = 0; i < size; i++) invMat[i].resize(size);
+
+	// index[j] remembers which row was swapped into position j
+	IntVector index (size);
+	// wk: scratch array, first the per-row scaling factors, later one right-hand side / solution column
+	double *wk;
+	// omtrx: working copy that is overwritten with the LU factors
+	DoubleMatrix omtrx (size);
+	for (i = 0; i < size; i++) omtrx[i].resize(size);
+
+
+
+	/* copy oriMat to omtrx */
+	for (i = 0; i < size; i++)
+		for (j = 0; j < size; j++)
+			omtrx[i][j] = oriMat[i][j];
+
+	wk = (double *) calloc((size_t)size, sizeof(double));
+	// aw flips sign on every row swap; it is not read again in this function
+	aw = 1.0;
+	// row scaling: wk[i] = 1 / (largest absolute entry of row i)
+	for (i = 0; i < size; i++) {
+		maxb = 0.0;
+		for (j = 0; j < size; j++) {
+			if (fabs(omtrx[i][j]) > maxb)
+				maxb = fabs(omtrx[i][j]);
+		}
+		if (maxb == 0.0) {
+			/* Singular matrix */
+			cout << "\n\n\nHALT: PLEASE REPORT ERROR D TO DEVELOPERS\n\n\n";
+			//OutStream::write(oriMat, cout);
+			exit(1);
+		}
+		wk[i] = 1.0 / maxb;
+	}
+	// decomposition, processed column by column
+	for (j = 0; j < size; j++) {
+		// entries above the diagonal of column j
+		for (i = 0; i < j; i++) {
+			sum = omtrx[i][j];
+			for (k = 0; k < i; k++)
+				sum -= omtrx[i][k] * omtrx[k][j];
+			omtrx[i][j] = sum;
+		}
+		// entries on/below the diagonal; pick the pivot row by largest scaled magnitude
+		maxb = 0.0;
+		for (i = j; i < size; i++) {
+			sum = omtrx[i][j];
+			for (k = 0; k < j; k++)
+				sum -= omtrx[i][k] * omtrx[k][j];
+			omtrx[i][j] = sum;
+			tmp = wk[i] * fabs(sum);
+			if (tmp >= maxb) {
+				maxb = tmp;
+				maxi = i;
+			}
+		}
+		// swap the pivot row into position j (and carry its scaling factor along)
+		if (j != maxi) {
+			for (k = 0; k < size; k++) {
+				tmp = omtrx[maxi][k];
+				omtrx[maxi][k] = omtrx[j][k];
+				omtrx[j][k] = tmp;
+			}
+			aw = -aw;
+			wk[maxi] = wk[j];
+		}
+		index[j] = maxi;
+		// avoid a division by zero below and in the back substitution
+		if (omtrx[j][j] == 0.0)
+			omtrx[j][j] = eps;
+		// divide the sub-diagonal part of column j by the pivot
+		if (j != size - 1) {
+			tmp = 1.0 / omtrx[j][j];
+			for (i = j + 1; i < size; i++)
+				omtrx[i][j] *= tmp;
+		}
+	}
+	// solve for each unit vector e_jx; the solution is column jx of the inverse
+	for (jx = 0; jx < size; jx++) {
+		for (ix = 0; ix < size; ix++)
+			wk[ix] = 0.0;
+		wk[jx] = 1.0;
+		// l = index of the first non-zero right-hand-side entry met so far (-1: none yet)
+		l = -1;
+		// forward substitution, replaying the recorded row swaps
+		for (i = 0; i < size; i++) {
+			idx = index[i];
+			sum = wk[idx];
+			wk[idx] = wk[i];
+			if (l != -1) {
+				for (j = l; j < i; j++)
+					sum -= omtrx[i][j] * wk[j];
+			} else if (sum != 0.0)
+				l = i;
+			wk[i] = sum;
+		}
+		// back substitution
+		for (i = size - 1; i >= 0; i--) {
+			sum = wk[i];
+			for (j = i + 1; j < size; j++)
+				sum -= omtrx[i][j] * wk[j];
+			wk[i] = sum / omtrx[i][i];
+		}
+		for (ix = 0; ix < size; ix++)
+			invMat[ix][jx] = wk[ix];
+	}
+	free((char *)wk);
+	wk = NULL;
+} /* luinverse */
+
+/**
+	Read a square matrix from a text file: first the dimension, then
+	size*size entries in row-major order.
+	@param fileName input file name
+	@param oriMat (OUT) resized to size x size and filled from the file;
+		emptied if the dimension cannot be read
+	@param size (OUT) matrix dimension read from the file; 0 if the file
+		cannot be opened or does not start with a non-negative integer
+*/
+void StopRule::readMat (char *fileName, DoubleMatrix &oriMat, int &size) {
+	size = 0;
+	std::ifstream inFile_;
+	inFile_.open(fileName);
+	// A failed open or a malformed/negative dimension must never reach
+	// resize(): depending on the language standard a failed extraction
+	// leaves the target untouched, i.e. possibly uninitialized.
+	if (!(inFile_ >> size) || size < 0) {
+		size = 0;
+		oriMat.clear();
+		return;
+	}
+	oriMat.resize(size);
+	for (int i = 0; i < size; i++) oriMat[i].resize(size);
+	for (int row_ = 0; row_ < size; row_ ++)
+		for (int col_ = 0; col_ < size; col_ ++)
+			inFile_ >> oriMat[row_][col_];
+	inFile_.close ();
+}
+
+/**
+	Matrix-matrix product: proMat = mat1 * mat2.
+	@param mat1 left factor
+	@param mat2 right factor; its first row determines the number of result columns
+	@param proMat (OUT) resized to mat1.size() rows and mat2[0].size() columns
+*/
+void StopRule::multiple (DoubleMatrix &mat1, DoubleMatrix &mat2, DoubleMatrix &proMat) {
+	proMat.resize(mat1.size());
+	const int num_rows = proMat.size();
+	const int num_cols = mat2[0].size();
+	for (int r = 0; r < num_rows; r++)
+		proMat[r].resize(num_cols);
+	for (int r = 0; r < num_rows; r++) {
+		for (int c = 0; c < num_cols; c++) {
+			proMat[r][c] = 0.0;
+			// inner dimension is taken from the first row of mat1
+			for (int t = 0; t < mat1[0].size(); t++)
+				proMat[r][c] += mat1[r][t] * mat2[t][c];
+		}
+	}
+}
+
+/**
+	Matrix-vector product: proVec = mat1 * vec2.
+	@param mat1 matrix factor
+	@param vec2 column vector
+	@param proVec (OUT) resized to mat1.size() entries
+*/
+void StopRule::multiple (DoubleMatrix &mat1, DoubleVector &vec2, DoubleVector &proVec) {
+	proVec.resize(mat1.size());
+	for (int r = 0; r < mat1.size (); r++) {
+		proVec[r] = 0.0;
+		// number of columns is taken from the first row of mat1
+		for (int c = 0; c < mat1[0].size(); c++)
+			proVec[r] += mat1[r][c] * vec2[c];
+	}
+}
+
+/**
+	Vector-matrix product: proVec = vec1^T * mat2.
+	@param vec1 row vector
+	@param mat2 matrix factor; its first row determines the result length
+	@param proVec (OUT) resized to mat2[0].size() entries
+*/
+void StopRule::multiple (DoubleVector &vec1, DoubleMatrix &mat2, DoubleVector &proVec) {
+	proVec.resize(mat2[0].size());
+	for (int c = 0; c < mat2[0].size(); c++) {
+		proVec[c] = 0.0;
+		for (int r = 0; r < mat2.size(); r++)
+			proVec[c] += vec1[r] * mat2[r][c];
+	}
+}
+
+/**
+	Outer product: proMat[i][j] = vec1[i] * vec2[j].
+	@param vec1 column vector
+	@param vec2 row vector
+	@param proMat (OUT) resized to vec1.size() rows and vec2.size() columns
+*/
+void StopRule::multiple (DoubleVector &vec1, DoubleVector &vec2, DoubleMatrix &proMat) {
+	proMat.resize(vec1.size());
+	for (int r = 0; r < vec1.size(); r++)
+		proMat[r].resize(vec2.size());
+
+	for (int r = 0; r < vec1.size(); r++) {
+		for (int c = 0; c < vec2.size(); c++)
+			proMat[r][c] = vec1[r] * vec2[c];
+	}
+}
+
+/**
+	Inner product of two vectors.
+	@param vec1 first vector; its length bounds the summation
+	@param vec2 second vector
+	@return sum over i of vec1[i] * vec2[i]
+*/
+double StopRule::multiple (DoubleVector &vec1, DoubleVector &vec2) {
+	double dot = 0.0;
+	for (int i = 0; i < vec1.size(); i++) {
+		dot += vec1[i] * vec2[i];
+	}
+	return dot;
+}
+
+/* THE FOLLOWING CODE COMES FROM tools.c in Yang's PAML package */
+//----------------------------------------------------------------------------------------
+double StopRule::cmpLnGamma (double alpha) {
+	/* Returns ln(gamma(alpha)) for alpha>0, accurate to 10 decimal places.
+	   Stirling's formula is used for the central polynomial part of the procedure.
+	   Pike MC & Hill ID (1966) Algorithm 291: Logarithm of the gamma function.
+	   Communications of the Association for Computing Machinery, 9:684
+	*/
+	double x = alpha;
+	double f = 0;
+	double z;
+
+	if (x < 7) {
+		// shift the argument up to >= 7, accumulating the product of the skipped values
+		f = 1;
+		z = x - 1;
+		for (;;) {
+			z += 1;
+			if (!(z < 7))
+				break;
+			f *= z;
+		}
+		x = z;
+		f = -log(f);
+	}
+	z = 1/(x*x);
+	const double main_part = f + (x-0.5)*log(x) - x + .918938533204673;
+	const double series_part =
+	        (((-.000595238095238*z+.000793650793651)*z-.002777777777778)*z
+	           +.083333333333333)/x;
+	return main_part + series_part;
+} //end of function cmpLnGamma
+
+
+/**
+	Load time_vec from a vector, storing its elements in reverse order.
+	@param tmpTimeVec_ source values; its last element becomes time_vec[0]
+*/
+void StopRule::readVector(DoubleVector &tmpTimeVec_) {
+	time_vec.resize(tmpTimeVec_.size());
+	for (int dst = 0; dst < tmpTimeVec_.size(); dst++) {
+		const int src = tmpTimeVec_.size() - dst - 1;
+		time_vec[dst] = tmpTimeVec_[src];
+	}
+}
+
+/**
+	Read improved iteration numbers from a text file into time_vec.
+	Only values strictly larger than the previously accepted one are kept,
+	and they are stored in reverse order (last accepted value first).
+	Reading stops at end of file or at the first token that is not a
+	number; a file that cannot be opened yields an empty time_vec.
+	@param fileName file name
+*/
+void StopRule::readFile (const char *fileName) {
+	std::ifstream inFile_;
+	inFile_.open (fileName);
+
+	DoubleVector tmpTimeVec_;
+
+	double old_time = -1.0;
+	double tmpTime_;
+	// Loop on the extraction itself rather than on eof(): a stream that
+	// failed to open, or that hits a non-numeric token, sets failbit but
+	// never eofbit, which made the former eof() loop spin forever. This
+	// also keeps a failed final extraction from contributing a value.
+	while (inFile_ >> tmpTime_) {
+		if (tmpTime_ > old_time) {
+			tmpTimeVec_.push_back(tmpTime_);
+			old_time = tmpTime_;
+		}
+	}
+	inFile_.close ();
+
+	time_vec.resize(tmpTimeVec_.size());
+	for (int count_ = 0; count_ < tmpTimeVec_.size(); count_ ++)
+		time_vec[count_] = tmpTimeVec_[tmpTimeVec_.size() - count_ - 1];
+}
+
+/**
+	Compute the estimate used by cmpLamdaMat() and cmpUpperTime() from the
+	first k entries of time_vec.
+	@param k number of entries of time_vec to use
+	@return 1/(k-1) times the sum, over j = 1..k-2, of
+		log( (time_vec[0]-time_vec[k-1]) / (time_vec[0]-time_vec[j]) )
+*/
+double StopRule::cmpMuy (int k) {
+	double log_sum = 0.0;
+	for (int j = 1; j < k - 1; j++) {
+		log_sum += log ( (time_vec[0] - time_vec[ k - 1]) / (time_vec[0] - time_vec[j]) );
+	}
+	return (1.0 / (k - 1.0) ) * log_sum;
+}
+
+
+/**
+	Build the symmetric k x k matrix with entries
+	Gamma(2*muy+i+1) * Gamma(muy+j+1) / ( Gamma(muy+i+1) * Gamma(j+1) ) for j <= i,
+	where muy = cmpMuy(k).
+	@param k matrix dimension
+	@param lamdaMat (OUT) resized to k x k and filled
+*/
+void StopRule::cmpLamdaMat (int k, DoubleMatrix &lamdaMat) {
+	lamdaMat.resize(k);
+	for (int row = 0; row < k; row++)
+		lamdaMat[row].resize(k);
+
+	const double muy = cmpMuy (k);
+	for (int i = 0; i < k; i++) {
+		for (int j = 0; j <= i; j++) {
+			// work in log space so that the gamma ratios cannot cause a division by zero
+			const double log_entry = cmpLnGamma (2*muy + i + 1) + cmpLnGamma (muy + j + 1) -
+			                         cmpLnGamma (muy + i + 1) - cmpLnGamma (j + 1);
+			const double entry = exp(log_entry);
+			lamdaMat[i][j] = entry;
+			lamdaMat[j][i] = entry;
+		}
+	}
+}
+
+/**
+	Compute the weight vector a = (L^-1 e) / (e^T L^-1 e), where L is the
+	k x k matrix built by cmpLamdaMat() and e is the all-ones vector.
+	@param k dimension, i.e. the number of entries of time_vec taken into account
+	@param aVec (OUT) resulting weight vector of length k
+*/
+void StopRule::cmpVecA (int k, DoubleVector &aVec) {
+	DoubleVector eVec_ (k, 1.0);
+
+	DoubleMatrix lamdaMat_;
+	cmpLamdaMat (k, lamdaMat_);
+
+	DoubleMatrix invLamdaMat_;
+	cmpInvMat (lamdaMat_, invLamdaMat_, k);
+
+	// The former debugging product lamdaMat_ * invLamdaMat_ was never read;
+	// it is no longer computed, which saves an O(k^3) multiplication per call.
+
+	// normalizing constant e^T L^-1 e
+	DoubleVector tmp1Vec_;
+	multiple (eVec_, invLamdaMat_, tmp1Vec_);
+	double tmp2_ = multiple (tmp1Vec_, eVec_);
+	double invTmp2_ = 1.0 / tmp2_;
+
+	// scale L^-1 first, then multiply by e (same operation order as before)
+	for (int row_ = 0; row_ < k; row_ ++)
+		for (int col_ = 0; col_ < k; col_ ++)
+			invLamdaMat_[row_][col_] *= invTmp2_;
+
+	multiple (invLamdaMat_, eVec_, aVec);
+}
+
+/**
+	Point estimate computed as the weighted sum of the first k entries of
+	time_vec, with the weights produced by cmpVecA().
+	@param k number of entries of time_vec to use
+	@return sum over i < k of a[i] * time_vec[i]
+*/
+double StopRule::cmpExtinctTime (int k) {
+	DoubleVector weights;
+	cmpVecA (k, weights);
+	double estimate = 0.0;
+	for (int i = 0; i < k; i++) {
+		estimate += weights[i] * time_vec[i];
+	}
+	return estimate;
+}
+
+
+
+/**
+	Upper bound computed from the first k entries of time_vec.
+	@param k number of entries of time_vec to use
+	@param alpha level; predict() passes 1 - confidence_value
+	@return time_vec[0] + (time_vec[0] - time_vec[k-1]) / ( (-log(alpha)/k)^(-muy) - 1 ),
+		with muy = cmpMuy(k)
+*/
+double StopRule::cmpUpperTime (int k, double alpha) {
+	const double muy = cmpMuy (k);
+	const double base = -log (alpha) / k ;
+	const double scale = pow (base, -muy);
+	const double span = time_vec[0] - time_vec[k - 1];
+	return time_vec[0] + span / (scale - 1.0);
+}
+
diff --git a/stoprule.h b/stoprule.h
new file mode 100644
index 0000000..1619e8f
--- /dev/null
+++ b/stoprule.h
@@ -0,0 +1,176 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef STOPRULE_H
+#define STOPRULE_H
+
+#include "tools.h"
+
+
+/**
+Stopping rule
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class StopRule
+{
+public:
+
+	/**
+		constructor
+	*/
+    StopRule();
+
+	/**
+		NOTE(review): implementation not visible here; presumably loads the
+		stopping-rule settings from the program parameters -- confirm
+		@param params program parameters
+	*/
+    void initialize(Params &params);
+	/**
+		destructor
+	*/
+    ~StopRule();
+
+	/**
+		read improved iteration number from a file
+		@param fileName file name
+	*/
+	void readFile (const char *fileName);
+
+	/**
+		Add the iteration number that improve trees
+		@param iteration improved iteration number
+	*/
+	void addImprovedIteration(int iteration);
+
+	/**
+		Get the last iteration number that improved trees
+		@return the last iteration number that improved trees
+	*/
+	int getLastImprovedIteration();
+
+	/**
+		main function to check the stop condition
+		@param cur_iteration current iteration number
+		@param cur_correlation current correlation coefficient for bootstrap convergence
+		@return TRUE if stop condition is met, FALSE otherwise
+	*/
+	bool meetStopCondition(int cur_iteration, double cur_correlation);
+	
+	/**
+		get the remaining time to converge, in seconds
+		@param cur_iteration current iteration number
+		@param cur_correlation current correlation coefficient for bootstrap convergence
+	*/
+	double getRemainingTime(int cur_iteration, double cur_correlation);
+
+	/**
+		(declaration disabled)
+		@return the number of iterations required to stop the search
+	*/
+//	int getNumIterations();
+
+	/**
+		(declaration disabled)
+		@return predicted iteration, 0 if no prediction has been made
+	*/
+//	int getPredictedIteration(int cur_iteration);
+
+
+    /** @return the stored current iteration number */
+    int getCurIt() const {
+        return curIteration;
+    }
+
+    /**
+        store the current iteration number
+        @param curIteration new value
+    */
+    void setCurIt(int curIteration) {
+        StopRule::curIteration = curIteration;
+    }
+
+private:
+
+    /**
+	 *  Current iteration number
+	 */
+	int curIteration;
+
+	/**
+		run the prediction on the recorded improved iterations
+		@param upperTime (OUT) upper bound from cmpUpperTime(); untouched if fewer than 4 records
+		@return estimate from cmpExtinctTime(), or 0 if fewer than 4 records
+	*/
+	double predict (double &upperTime);
+
+	/**
+		stop condition 
+	*/
+	STOP_CONDITION stop_condition;	
+	
+	/**
+		confidence value of prediction
+	*/
+	double confidence_value;
+
+	/**
+		minimum number of iterations
+	*/
+	int min_iteration;
+
+	/**
+		maximum number of iterations
+	*/
+	int max_iteration;
+
+	/**
+		predicted number of iterations
+	*/
+	int predicted_iteration;
+
+	/** number of unsuccessful iterations to stop the search */
+	int unsuccess_iteration;
+
+	/** bootstrap correlation threshold to stop */
+	double min_correlation;
+
+	/** step size for checking bootstrap convergence */
+	int step_iteration;
+
+	/** max wall-clock running time to stop */
+	double max_run_time;
+
+    /** starting real time of the program */
+    double start_real_time;
+
+	/* FOLLOWING CODES ARE FROM IQPNNI version 3 */	
+
+//	int nTime_;
+	/** recorded improved iteration numbers, most recent first */
+	DoubleVector time_vec;
+
+	/** invert the size x size matrix oriMat into invMat (LU decomposition) */
+	void cmpInvMat (DoubleMatrix &oriMat, DoubleMatrix &invMat, int size);
+
+	/** read a square matrix (dimension first, then the entries row by row) from a file */
+	void readMat (char *fileName, DoubleMatrix &oriMat, int &size);
+
+	/** proMat = mat1 * mat2 */
+	void multiple (DoubleMatrix &mat1, DoubleMatrix &mat2, DoubleMatrix &proMat);
+
+
+	/** proVec = mat1 * vec2 */
+	void multiple (DoubleMatrix &mat1, DoubleVector &vec2, DoubleVector &proVec);
+
+	/** proVec = vec1^T * mat2 */
+	void multiple (DoubleVector &vec1, DoubleMatrix &mat2, DoubleVector &proVec);
+	/** outer product: proMat[i][j] = vec1[i] * vec2[j] */
+	void multiple (DoubleVector &vec1, DoubleVector &vec2, DoubleMatrix &proMat);
+	/** @return inner product of vec1 and vec2 */
+	double multiple (DoubleVector &vec1, DoubleVector &vec2);
+
+	/** load time_vec from tmpTimeVec_ in reverse order */
+	void readVector(DoubleVector &tmpTimeVec_);
+
+	/* THE FOLLOWING CODE COMES FROM tools.c in Yang's PAML package */
+	//----------------------------------------------------------------------------------------
+	/** @return ln(gamma(alpha)) for alpha > 0 */
+	double cmpLnGamma (double alpha);
+
+	/** @return estimate computed from the first k entries of time_vec; used by cmpLamdaMat() and cmpUpperTime() */
+	double cmpMuy (int k);
+
+	/** fill the symmetric k x k matrix of gamma-function ratios based on cmpMuy(k) */
+	void cmpLamdaMat (int k, DoubleMatrix &lamdaMat);
+
+	/** compute the weight vector aVec of length k used by cmpExtinctTime() */
+	void cmpVecA (int k, DoubleVector &aVec);
+
+	/** @return weighted sum of the first k entries of time_vec (weights from cmpVecA) */
+	double cmpExtinctTime (int k);
+	/** @return upper bound computed from the first k entries of time_vec at level alpha */
+	double cmpUpperTime (int k, double alpha);
+};
+
+#endif
diff --git a/superalignment.cpp b/superalignment.cpp
new file mode 100644
index 0000000..542f0b0
--- /dev/null
+++ b/superalignment.cpp
@@ -0,0 +1,610 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <stdarg.h>
+#include "phylotree.h"
+#include "superalignment.h"
+#include "phylosupertree.h"
+
+/** Default constructor: creates a super alignment without partitions. */
+SuperAlignment::SuperAlignment() : Alignment()
+{
+}
+
+/**
+	Build the super alignment from the alignments of all trees of a super tree.
+	Collects the partition alignments, the union of their sequence names and,
+	in taxa_index, the index of every sequence inside every partition (-1 if
+	absent), then builds the presence/absence patterns.
+	@param super_tree super tree whose elements provide one alignment each
+*/
+SuperAlignment::SuperAlignment(PhyloSuperTree *super_tree)
+ : Alignment()
+{
+	const int num_parts = super_tree->size();
+	int part = 0;
+	for (PhyloSuperTree::iterator tree_it = super_tree->begin(); tree_it != super_tree->end(); tree_it++, part++) {
+		partitions.push_back((*tree_it)->aln);
+		const int part_nseq = (*tree_it)->aln->getNSeq();
+		for (int sub_seq = 0; sub_seq < part_nseq; sub_seq++) {
+			const int id = getSeqID((*tree_it)->aln->getSeqName(sub_seq));
+			if (id >= 0) {
+				// name already known: just record where it sits in this partition
+				taxa_index[id][part] = sub_seq;
+				continue;
+			}
+			// new name: append it with "absent" (-1) in every other partition
+			seq_names.push_back((*tree_it)->aln->getSeqName(sub_seq));
+			IntVector row(num_parts, -1);
+			row[part] = sub_seq;
+			taxa_index.push_back(row);
+		}
+	}
+	// now the patterns of sequence-genes presence/absence
+	buildPattern();
+}
+
+/**
+	Rebuild the binary presence/absence patterns of the super alignment from
+	taxa_index: one site per partition, state 1 where the sequence is present
+	in that partition and 0 where it is absent.
+*/
+void SuperAlignment::buildPattern() {
+	const int num_parts = partitions.size();
+
+	// binary type because the super alignment presents the presence/absence of taxa in the partitions
+	seq_type = SEQ_BINARY;
+	num_states = 2;
+	STATE_UNKNOWN = 2;
+	site_pattern.resize(num_parts, -1);
+	clear();
+	pattern_index.clear();
+
+	// lower the verbosity while adding patterns, to avoid printing gappy sites in addPattern
+	VerboseMode saved_mode = verbose_mode;
+	verbose_mode = min(verbose_mode, VB_MIN);
+
+	const int num_seqs = getNSeq();
+	for (int part = 0; part < num_parts; part++) {
+		Pattern presence;
+		for (int seq = 0; seq < num_seqs; seq++)
+			presence.push_back((taxa_index[seq][part] >= 0)? 1 : 0);
+		addPattern(presence, part);
+	}
+
+	verbose_mode = saved_mode;
+	countConstSite();
+	buildSeqStates();
+}
+
+
+
+/**
+	Refresh column "part" of taxa_index by looking up every sequence name of
+	the super alignment in partition "part" (-1 when the name is absent).
+	Asserts that every sequence of the partition is matched by some name of
+	the super alignment.
+	@param part partition index
+*/
+void SuperAlignment::linkSubAlignment(int part) {
+	assert(taxa_index.size() == getNSeq());
+	const int num_seqs = getNSeq();
+	vector<bool> matched;
+	matched.resize(partitions[part]->getNSeq(), false);
+	for (int seq = 0; seq < num_seqs; seq++) {
+		const int sub_id = partitions[part]->getSeqID(getSeqName(seq));
+		taxa_index[seq][part] = (sub_id < 0) ? -1 : sub_id;
+		if (sub_id >= 0)
+			matched[sub_id] = true;
+	}
+	// sanity check that all seqnames in partition must be present in superalignment
+	for (int sub_seq = 0; sub_seq < matched.size(); sub_seq++)
+		assert(matched[sub_seq]);
+}
+
+/**
+	Fill this super alignment with a subset of the sequences of another one.
+	For every partition of the source a new Alignment holding the selected
+	sequences present in that partition is created and linked.
+	@param aln source alignment; must be a super alignment
+	@param seq_id indices (in aln) of the sequences to keep
+	@param min_true_char not used by this implementation
+*/
+void SuperAlignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char) {
+	assert(aln->isSuperAlignment());
+	SuperAlignment *source = (SuperAlignment*)aln;
+
+	for (IntVector::iterator sid = seq_id.begin(); sid != seq_id.end(); sid++) {
+		assert(*sid >= 0 && *sid < aln->getNSeq());
+		seq_names.push_back(aln->getSeqName(*sid));
+	}
+
+	// Alignment::extractSubAlignment(aln, seq_id, 0) is deliberately not
+	// called here: the original author marked that call as buggy.
+
+	taxa_index.resize(getNSeq());
+	for (int seq = 0; seq < getNSeq(); seq++)
+		taxa_index[seq].resize(source->partitions.size(), -1);
+
+	partitions.resize(source->partitions.size());
+	int part = 0;
+	for (vector<Alignment*>::iterator src_part = source->partitions.begin(); src_part != source->partitions.end(); src_part++, part++) {
+		// indices, inside this partition, of the kept sequences that are present in it
+		IntVector kept_in_part;
+		for (IntVector::iterator sid = seq_id.begin(); sid != seq_id.end(); sid++) {
+			const int sub_id = source->taxa_index[*sid][part];
+			if (sub_id >= 0)
+				kept_in_part.push_back(sub_id);
+		}
+		Alignment *sub_aln = new Alignment;
+		sub_aln->extractSubAlignment(*src_part, kept_in_part, 0);
+		partitions[part] = sub_aln;
+		linkSubAlignment(part);
+	}
+
+	// now build the patterns based on taxa_index
+	buildPattern();
+}
+
+/**
+	Detect sequences that are identical over all partitions and build a new
+	super alignment without the redundant ones.
+	Two sequences count as identical only if they are present in exactly the
+	same partitions and agree on every pattern of those partitions.
+	@param not_remove name of a sequence that must never be removed
+	@param keep_two if TRUE, the first duplicate of each sequence is kept as well
+	@param removed_seqs (OUT) names of the removed sequences are appended here
+	@param target_seqs (OUT) for each removed sequence, the name of the kept sequence it equals
+	@return this object if nothing was removed, otherwise a newly allocated
+		SuperAlignment holding the remaining sequences
+*/
+Alignment *SuperAlignment::removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs) {
+	IntVector checked;
+	vector<bool> removed;
+	checked.resize(getNSeq(), 0);
+	removed.resize(getNSeq(), false);
+	int seq1;
+
+	for (seq1 = 0; seq1 < getNSeq(); seq1++) {
+		if (checked[seq1]) continue;
+		bool first_ident_seq = true;
+		for (int seq2 = seq1+1; seq2 < getNSeq(); seq2++) {
+			if (getSeqName(seq2) == not_remove) continue;
+			bool equal_seq = true;
+			int part = 0;
+			// check if seq1 and seq2 are identical over all partitions
+			for (vector<Alignment*>::iterator ait = partitions.begin(); ait != partitions.end(); ait++, part++) {
+				int subseq1 = taxa_index[seq1][part];
+				int subseq2 = taxa_index[seq2][part];
+				if (subseq1 < 0 && subseq2 < 0) // continue if both seqs are absent in this partition
+					continue;
+				if (subseq1 < 0 || subseq2 < 0) {
+					// One sequence is present and the other is absent for this gene,
+					// so they are not identical. Index 0 is a valid "present" index:
+					// the former "> 0" tests let such pairs fall through and read a
+					// pattern at index -1.
+					equal_seq = false;
+					break;
+				}
+				// now if both seqs are present, check sequence content
+				for (iterator it = (*ait)->begin(); it != (*ait)->end(); it++)
+					if  ((*it)[subseq1] != (*it)[subseq2]) {
+						equal_seq = false;
+						break;
+					}
+				// a mismatch settles the question; no need to scan further partitions
+				if (!equal_seq)
+					break;
+			}
+			if (equal_seq) {
+				if (removed_seqs.size() < getNSeq()-3 && (!keep_two || !first_ident_seq)) {
+					removed_seqs.push_back(getSeqName(seq2));
+					target_seqs.push_back(getSeqName(seq1));
+					removed[seq2] = true;
+				}
+				checked[seq2] = 1;
+				first_ident_seq = false;
+			}
+		}
+		checked[seq1] = 1;
+	}
+
+	if (removed_seqs.empty()) return this; // do nothing if the list is empty
+
+	if (removed_seqs.size() >= getNSeq()-3)
+		outWarning("Your alignment contains too many identical sequences!");
+
+	// now remove identical sequences
+	IntVector keep_seqs;
+	for (seq1 = 0; seq1 < getNSeq(); seq1++)
+		if (!removed[seq1]) keep_seqs.push_back(seq1);
+	SuperAlignment *aln;
+	aln = new SuperAlignment;
+	aln->extractSubAlignment(this, keep_seqs, 0);
+	return aln;
+}
+
+/*
+void SuperAlignment::checkGappySeq() {
+	int nseq = getNSeq(), part = 0, i;
+	IntVector gap_only_seq;
+	gap_only_seq.resize(nseq, 1);
+	//cout << "Checking gaps..." << endl;
+	for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++, part++) {
+		IntVector keep_seqs;
+		for (i = 0; i < nseq; i++)
+			if (taxa_index[i][part] >= 0)
+			if (!(*it)->isGapOnlySeq(taxa_index[i][part])) {
+				keep_seqs.push_back(taxa_index[i][part]);
+				gap_only_seq[i] = 0;
+			}
+		if (keep_seqs.size() < (*it)->getNSeq()) {
+			cout << "Discard " << (*it)->getNSeq() - keep_seqs.size() 
+				 << " sequences from partition number " << part+1 << endl;
+			Alignment *aln = new Alignment;
+			aln->extractSubAlignment((*it), keep_seqs, 0);
+			delete (*it);
+			(*it) = aln;
+			linkSubAlignment(part);
+		}
+		cout << __func__ << " num_states = " << (*it)->num_states << endl;
+	}
+	int wrong_seq = 0;
+	for (i = 0; i < nseq; i++)
+		if (gap_only_seq[i]) {
+			cout << "ERROR: Sequence " << getSeqName(i) << " contains only gaps or missing data" << endl;
+			wrong_seq++;
+		}
+	if (wrong_seq) {
+		outError("Some sequences (see above) are problematic, please check your alignment again");
+		}
+}
+*/
+/**
+	Append, for every site of every partition, the index of its pattern in
+	the concatenated pattern list (partition-local pattern index plus the
+	number of patterns of all preceding partitions).
+	@param pattern_index (OUT) vector the indices are appended to
+*/
+void SuperAlignment::getSitePatternIndex(IntVector &pattern_index) {
+	int ptn_offset = 0;
+	for (size_t part = 0; part < partitions.size(); part++) {
+		Alignment *part_aln = partitions[part];
+		for (size_t site = 0; site < part_aln->site_pattern.size(); site++)
+			pattern_index.push_back(part_aln->site_pattern[site] + ptn_offset);
+		ptn_offset += part_aln->getNPattern();
+	}
+}
+
+/**
+	Collect the pattern frequencies of all partitions, concatenated in
+	partition order.
+	@param pattern_freq (OUT) emptied first, then filled
+*/
+void SuperAlignment::getPatternFreq(IntVector &pattern_freq) {
+	if (!isSuperAlignment()) outError("Internal error: ", __func__);
+	pattern_freq.clear();
+	for (size_t part = 0; part < partitions.size(); part++) {
+		IntVector part_freq;
+		partitions[part]->getPatternFreq(part_freq);
+		pattern_freq.insert(pattern_freq.end(), part_freq.begin(), part_freq.end());
+	}
+}
+
+/**
+	Turn this (still partition-less) object into a bootstrap replicate of a
+	super alignment by bootstrapping each of its partitions separately.
+	@param aln source alignment; must be a super alignment
+	@param pattern_freq must be NULL (not supported here)
+	@param spec must be NULL (not supported here)
+*/
+void SuperAlignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq, const char *spec) {
+	if (!aln->isSuperAlignment()) outError("Internal error: ", __func__);
+	if (pattern_freq) outError("Unsupported yet.", __func__);
+	if (spec) outError("Unsupported yet.", __func__);
+	Alignment::copyAlignment(aln);
+	SuperAlignment *source = (SuperAlignment*) aln;
+	if (!partitions.empty()) outError("Internal error: ", __func__);
+	vector<Alignment*>::iterator src_part = source->partitions.begin();
+	while (src_part != source->partitions.end()) {
+		Alignment *replicate = new Alignment;
+		replicate->createBootstrapAlignment(*src_part);
+		partitions.push_back(replicate);
+		src_part++;
+	}
+	taxa_index = source->taxa_index;
+}
+
+/**
+	Draw bootstrap pattern frequencies for all partitions.
+	Thin wrapper around the int* overload: allocates a buffer with one slot
+	per pattern of every partition and copies the result into pattern_freq.
+	@param pattern_freq (OUT) emptied first, then filled with the resampled frequencies
+	@param spec resampling specification, forwarded to the int* overload
+*/
+void SuperAlignment::createBootstrapAlignment(IntVector &pattern_freq, const char *spec) {
+	if (!isSuperAlignment()) outError("Internal error: ", __func__);
+	int total_ptn = 0;
+	for (size_t part = 0; part < partitions.size(); part++)
+		total_ptn += partitions[part]->getNPattern();
+	pattern_freq.clear();
+	int *freq_buf = new int[total_ptn];
+	createBootstrapAlignment(freq_buf, spec);
+	pattern_freq.assign(freq_buf, freq_buf + total_ptn);
+	delete [] freq_buf;
+}
+
+
+/**
+	Draw bootstrap pattern frequencies for all partitions into a raw array.
+	@param pattern_freq (OUT) array with one slot per pattern of every
+		partition, in partition order
+	@param spec NULL: resample sites within each partition;
+		"GENE...": resample whole partitions with replacement;
+		"GENESITE...": resample partitions, then resample sites within each
+		drawn partition. Any other value is reported via outError().
+*/
+void SuperAlignment::createBootstrapAlignment(int *pattern_freq, const char *spec) {
+	if (!isSuperAlignment()) outError("Internal error: ", __func__);
+	if (spec && strncmp(spec, "GENE", 4) != 0) outError("Unsupported yet. ", __func__);
+
+	if (spec && strncmp(spec, "GENE", 4) == 0) {
+		// resampling whole genes
+		int nptn = 0;
+		// part_pos[p]: offset of partition p's first pattern in pattern_freq
+		IntVector part_pos;
+		for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
+			part_pos.push_back(nptn);
+			nptn += (*it)->getNPattern();
+		}
+		memset(pattern_freq, 0, nptn * sizeof(int));
+		for (int i = 0; i < partitions.size(); i++) {
+			int part = random_int(partitions.size());
+			Alignment *aln = partitions[part];
+			if (strncmp(spec,"GENESITE",8) == 0) {
+				// then resampling sites in resampled gene
+				int nsite = aln->getNSite();
+				for (int j = 0; j < nsite; j++) {
+					// getPatternID() is given a SITE index, so the draw must
+					// range over all sites; bounding it by the number of
+					// patterns only ever sampled the first getNPattern() sites
+					int ptn_id = aln->getPatternID(random_int(nsite));
+					pattern_freq[ptn_id + part_pos[part]]++;
+				}
+
+			} else {
+				for (int j = 0; j < aln->getNPattern(); j++)
+					pattern_freq[j + part_pos[part]] += aln->at(j).frequency;
+			}
+		}
+	} else {
+		// resampling sites within genes
+		int offset = 0;
+		for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
+			(*it)->createBootstrapAlignment(pattern_freq + offset);
+			offset += (*it)->getNPattern();
+		}
+	}
+}
+
+/**
+ * shuffle alignment by randomizing the order of sites
+ */
+void SuperAlignment::shuffleAlignment() {
+	if (!isSuperAlignment()) outError("Internal error: ", __func__);
+	// shuffling is delegated to each partition in turn
+	for (size_t part = 0; part < partitions.size(); part++)
+		partitions[part]->shuffleAlignment();
+}
+
+
+/**
+	Observed distance between two sequences: fraction of differing positions
+	among the positions where both sequences carry a state below num_states,
+	accumulated over all partitions that contain both sequences.
+	@param seq1 index of the first sequence
+	@param seq2 index of the second sequence
+	@return the observed distance, or MAX_GENETIC_DIST if no position is comparable
+*/
+double SuperAlignment::computeObsDist(int seq1, int seq2) {
+	int num_diff = 0;
+	int num_compared = 0;
+	for (int part = 0; part < getNSite(); part++) {
+		const int id1 = taxa_index[seq1][part];
+		const int id2 = taxa_index[seq2][part];
+		// skip partitions where one of the two sequences is absent
+		if (id1 < 0 || id2 < 0) continue;
+		Alignment *part_aln = partitions[part];
+		const int part_states = part_aln->num_states;
+		for (Alignment::iterator ptn = part_aln->begin(); ptn != part_aln->end(); ptn++) {
+			if ((*ptn)[id1] >= part_states || (*ptn)[id2] >= part_states)
+				continue;
+			num_compared += (*ptn).frequency;
+			if ((*ptn)[id1] != (*ptn)[id2])
+				num_diff += (*ptn).frequency;
+		}
+	}
+	if (!num_compared)
+		return MAX_GENETIC_DIST; // return +INF if no overlap between two sequences
+	return ((double)num_diff) / num_compared;
+}
+
+
+/**
+	Corrected distance between two sequences, computed from the observed
+	distance p as -log(1 - z*p) / z with z = n/(n-1), where n is the number
+	of states of the first partition.
+	@param seq1 index of the first sequence
+	@param seq2 index of the second sequence
+	@return 0 if there are no partitions; MAX_GENETIC_DIST if the correction
+		is undefined (1 - z*p <= 0); the corrected distance otherwise
+*/
+double SuperAlignment::computeDist(int seq1, int seq2) {
+	if (partitions.empty()) return 0.0;
+	double obs_dist = computeObsDist(seq1, seq2);
+	int num_states = partitions[0]->num_states;
+	double z = (double)num_states / (num_states-1);
+	double x = 1.0 - (z * obs_dist);
+
+	// too long distance between the two sequences: the logarithm is undefined
+	if (x <= 0)
+		return MAX_GENETIC_DIST;
+
+	// The per-partition "average distance" variant that used to follow this
+	// return was unreachable (and never incremented its counter); it has
+	// been removed.
+	return -log(x) / z;
+}
+
+/** Destructor: deletes the partition alignments, last one first. */
+SuperAlignment::~SuperAlignment()
+{
+	while (!partitions.empty()) {
+		delete partitions.back();
+		partitions.pop_back();
+	}
+}
+
+/**
+	Print the concatenation of all partitions to a stream: a header line
+	"<#sequences> <#characters>" followed by one line per sequence.
+	Partitions in which a sequence is absent are filled with '?'.
+	@param out output stream
+	@param append not used when writing to a stream
+*/
+void SuperAlignment::printCombinedAlignment(ostream &out, bool append) {
+	vector<Alignment*>::iterator pit;
+	int final_length = 0;
+	// a codon site is counted as 3 characters, consistent with the
+	// file-name overload of this function
+	for (pit = partitions.begin(); pit != partitions.end(); pit++)
+		if ((*pit)->seq_type == SEQ_CODON)
+			final_length += 3*(*pit)->getNSite();
+		else
+			final_length += (*pit)->getNSite();
+
+	out << getNSeq() << " " << final_length << endl;
+	StrVector::iterator it;
+	int max_len = getMaxSeqNameLength();
+	if (max_len < 10) max_len = 10;
+	int seq_id = 0;
+	for (it = seq_names.begin(); it != seq_names.end(); it++, seq_id++) {
+		out.width(max_len);
+		out << left << (*it) << " ";
+		int part = 0;
+		for (pit = partitions.begin(); pit != partitions.end(); pit++, part++) {
+			int part_seq_id = taxa_index[seq_id][part];
+			int nsite = (*pit)->getNSite();
+			if (part_seq_id >= 0) {
+				for (int i = 0; i < nsite; i++)
+					out << (*pit)->convertStateBackStr((*pit)->getPattern(i) [part_seq_id]);
+			} else {
+				// pad with as many characters as a present sequence would occupy,
+				// so that every row matches the announced length
+				int nchar = ((*pit)->seq_type == SEQ_CODON) ? 3*nsite : nsite;
+				string str(nchar, '?');
+				out << str;
+			}
+		}
+		out << endl;
+	}
+}
+
+/**
+	Print the concatenation of all partitions to a file: a header line
+	"<#sequences> <#characters>" followed by one line per sequence.
+	Partitions in which a sequence is absent are filled with '?'.
+	Write failures are reported through outError(ERR_WRITE_OUTPUT, file_name).
+	@param file_name output file name
+	@param append TRUE to append to the file, FALSE to overwrite it
+*/
+void SuperAlignment::printCombinedAlignment(const char *file_name, bool append) {
+	vector<Alignment*>::iterator pit;
+	int final_length = 0;
+	// a codon site is printed as 3 characters
+	for (pit = partitions.begin(); pit != partitions.end(); pit++)
+		if ((*pit)->seq_type == SEQ_CODON)
+			final_length += 3*(*pit)->getNSite();
+		else
+			final_length += (*pit)->getNSite();
+	try {
+		ofstream out;
+		out.exceptions(ios::failbit | ios::badbit);
+
+		if (append)
+			out.open(file_name, ios_base::out | ios_base::app);
+		else
+			out.open(file_name);
+		out << getNSeq() << " " << final_length << endl;
+		StrVector::iterator it;
+		int max_len = getMaxSeqNameLength();
+		if (max_len < 10) max_len = 10;
+		int seq_id = 0;
+		for (it = seq_names.begin(); it != seq_names.end(); it++, seq_id++) {
+			out.width(max_len);
+			out << left << (*it) << " ";
+			int part = 0;
+			for (pit = partitions.begin(); pit != partitions.end(); pit++, part++) {
+				int part_seq_id = taxa_index[seq_id][part];
+				int nsite = (*pit)->getNSite();
+				if (part_seq_id >= 0) {
+					for (int i = 0; i < nsite; i++)
+						out << (*pit)->convertStateBackStr((*pit)->getPattern(i) [part_seq_id]);
+				} else {
+					// pad with as many characters as the header counts for this
+					// partition (3 per codon site), so every row has final_length characters
+					int nchar = ((*pit)->seq_type == SEQ_CODON) ? 3*nsite : nsite;
+					string str(nchar, '?');
+					out << str;
+				}
+			}
+			out << endl;
+		}
+		out.close();
+		cout << "Concatenated alignment was printed to " << file_name << endl;
+	} catch (ios::failure &) {
+		outError(ERR_WRITE_OUTPUT, file_name);
+	}
+}
+
+/**
+	Write every partition to its own file, named
+	"<params.aln_output or params.out_prefix>.<partition name>", in PHYLIP
+	or FASTA format according to params.aln_output_format. Nothing is
+	written for any other format value.
+	@param params program parameters
+	@param part_info partition information; must have one entry per partition
+*/
+void SuperAlignment::printSubAlignments(Params &params, vector<PartitionInfo> &part_info) {
+	assert(part_info.size() == partitions.size());
+	int part = 0;
+	for (vector<Alignment*>::iterator pit = partitions.begin(); pit != partitions.end(); pit++, part++) {
+		string filename;
+		if (params.aln_output)
+			filename = params.aln_output;
+		else
+			filename = params.out_prefix;
+		filename += "." + part_info[part].name;
+		if (params.aln_output_format == ALN_PHYLIP) {
+			(*pit)->printPhylip(filename.c_str(), false, NULL, params.aln_nogaps, false, NULL);
+		} else if (params.aln_output_format == ALN_FASTA) {
+			(*pit)->printFasta(filename.c_str(), false, NULL, params.aln_nogaps, false, NULL);
+		}
+	}
+}
+
+/**
+	@return the sum of computeUnconstrainedLogL() over all partitions
+*/
+double SuperAlignment::computeUnconstrainedLogL() {
+	double total_logl = 0.0;
+	for (size_t part = 0; part < partitions.size(); part++) {
+		total_logl += partitions[part]->computeUnconstrainedLogL();
+	}
+	return total_logl;
+}
+
+/**
+	Fraction of the concatenated sequence-by-site matrix that is not covered
+	by any partition (a sequence absent from a partition contributes that
+	partition's sites as missing).
+	@return 1 - sum_p( nseq_p * nsite_p ) / ( nseq * sum_p nsite_p )
+*/
+double SuperAlignment::computeMissingData() {
+	// Accumulate in double: the products nseq * nsite overflow a 32-bit int
+	// on large data sets.
+	double present_cells = 0.0;
+	double total_sites = 0.0;
+	vector<Alignment*>::iterator pit;
+	for (pit = partitions.begin(); pit != partitions.end(); pit++) {
+		present_cells += (double)(*pit)->getNSeq() * (*pit)->getNSite();
+		total_sites += (*pit)->getNSite();
+	}
+	return 1.0 - present_cells / ((double)getNSeq() * total_sites);
+}
+
+/**
+	Concatenate a subset of the partitions into one newly allocated Alignment.
+	All selected partitions must share the same sequence type and number of
+	states. The result contains the union of the taxa of the selected
+	partitions; a taxon absent from a partition gets STATE_UNKNOWN there.
+	@param ids indices of the partitions to concatenate
+	@return the new alignment
+	NOTE(review): partitions[ids[0]] is read unconditionally, so ids
+	presumably must not be empty -- confirm against callers.
+*/
+Alignment *SuperAlignment::concatenateAlignments(IntVector &ids) {
+	// union_taxa[i] == 1 iff sequence i is present in at least one selected partition
+	string union_taxa;
+	int nsites = 0, nstates = 0, i;
+	SeqType sub_type = SEQ_UNKNOWN;
+	// first pass: validate the partitions, count sites, collect the union of taxa
+	for (i = 0; i < ids.size(); i++) {
+		int id = ids[i];
+		if (id < 0 || id >= partitions.size())
+			outError("Internal error ", __func__);
+		if (nstates == 0) nstates = partitions[id]->num_states;
+		if (sub_type == SEQ_UNKNOWN) sub_type = partitions[id]->seq_type;
+		if (sub_type != partitions[id]->seq_type)
+			outError("Cannot concatenate sub-alignments of different type");
+		if (nstates != partitions[id]->num_states)
+			outError("Cannot concatenate sub-alignments of different #states");
+
+		// presence/absence (1/0) of every taxon in partition id, as built by buildPattern()
+		string taxa_set = getPattern(id);
+		nsites += partitions[id]->getNSite();
+		if (i == 0) union_taxa = taxa_set; else {
+			for (int j = 0; j < union_taxa.length(); j++)
+				if (taxa_set[j] == 1) union_taxa[j] = 1;
+		}
+	}
+
+	Alignment *aln = new Alignment;
+	for (i = 0; i < union_taxa.length(); i++)
+		if (union_taxa[i] == 1) {
+			aln->seq_names.push_back(getSeqName(i));
+		}
+	aln->num_states = nstates;
+	aln->seq_type = sub_type;
+	aln->site_pattern.resize(nsites, -1);
+    aln->clear();
+    aln->pattern_index.clear();
+    // unknown-state code and genetic code are taken from the first selected partition
+    aln->STATE_UNKNOWN = partitions[ids[0]]->STATE_UNKNOWN;
+    aln->genetic_code = partitions[ids[0]]->genetic_code;
+
+    // second pass: append the patterns of each partition, widened to the union of taxa
+    int site = 0;
+    for (i = 0; i < ids.size(); i++) {
+    	int id = ids[i];
+		string taxa_set = getPattern(id);
+    	for (Alignment::iterator it = partitions[id]->begin(); it != partitions[id]->end(); it++) {
+    		Pattern pat;
+    		// part_seq walks through the sequences of this partition in super-alignment order
+    		int part_seq = 0;
+    		for (int seq = 0; seq < union_taxa.size(); seq++)
+    			if (union_taxa[seq] == 1) {
+    				char ch = aln->STATE_UNKNOWN;
+    				if (taxa_set[seq] == 1) {
+    					ch = (*it)[part_seq++];
+    				}
+    				pat.push_back(ch);
+    			}
+    		assert(part_seq == partitions[id]->getNSeq());
+    		aln->addPattern(pat, site, (*it).frequency);
+    		// IMPORTANT BUG FIX FOLLOW
+    		// map every site covered by this pattern (one per unit of frequency) to its index
+    		int ptnindex = aln->pattern_index[pat];
+            for (int j = 0; j < (*it).frequency; j++)
+                aln->site_pattern[site++] = ptnindex;
+
+    	}
+    }
+    aln->countConstSite();
+    aln->buildSeqStates();
+
+	return aln;
+}
diff --git a/superalignment.h b/superalignment.h
new file mode 100644
index 0000000..7ae6a11
--- /dev/null
+++ b/superalignment.h
@@ -0,0 +1,229 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef SUPERALIGNMENT_H
+#define SUPERALIGNMENT_H
+
+#include "alignment.h"
+
+
+struct PartitionInfo {	// description of one partition plus values cached for it during tree search
+	string name; // partition name
+	string model_name; // name of the substitution model used for this partition
+	string aln_file; // alignment file associated with this partition
+	string sequence_type; // sequence type (DNA/AA/BIN)
+	string position_spec; // position specification, e.g., "1-100\1 1-100\2"
+
+	double cur_score;	// current log-likelihood
+	double part_rate;	// partition heterogeneity rate
+	int    evalNNIs;	// number of evaluated NNIs on subtree
+
+	//DoubleVector null_score; // log-likelihood of each branch collapsed to zero
+	//DoubleVector opt_score;  // optimized log-likelihood for every branch
+	//DoubleVector nni1_score; // log-likelihood for 1st NNI for every branch
+	//DoubleVector nni2_score; // log-likelihood for 2nd NNI for every branch
+
+	DoubleVector cur_brlen;  // current branch lengths
+	//DoubleVector opt_brlen;  // optimized branch lengths for every branch
+	DoubleVector nni1_brlen; // branch length for 1st NNI for every branch
+	DoubleVector nni2_brlen; // branch length for 2nd NNI for every branch
+
+	//double *mem_ptnlh; // total memory allocated for all pattern likelihood vectors
+	double *cur_ptnlh; // current pattern likelihoods of the tree (NOTE(review): raw pointer, owner not visible here)
+	//double *nni1_ptnlh; // pattern likelihoods of 1st NNI tree
+	//double *nni2_ptnlh; // pattern likelihoods of 2nd NNI tree
+	NNIMove nniMoves[2]; // NOTE(review): presumably the 1st and 2nd NNI of a branch, as in nni1_/nni2_brlen - confirm
+};
+
+class PhyloSuperTree;
+
+/**
+Super alignment representing presence/absence of sequences in
+k partitions for a total of n sequences. It has the form:
+		Site_1 Site_2 ... Site_k
+Seq_1     1      0    ...   1
+Seq_2     0      1    ...   0
+...      ...
+Seq_n     1      1    ...   0
+
+Where (i,j)=1 means Seq_i is present in partition j, 0 otherwise
+
+So data is binary.
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+
+class SuperAlignment : public Alignment
+{
+public:
+	/** constructor initialize from a supertree */
+    SuperAlignment(PhyloSuperTree *super_tree);
+
+	/** constructor initialize empty alignment */
+    SuperAlignment();
+
+    /** destructor */
+    ~SuperAlignment();
+
+    /** return that this is a super-alignment structure */
+	virtual bool isSuperAlignment() { return true; }
+
+	/**
+	 * create taxa_index from super-alignment to sub-alignment
+	 * @param part index of sub-alignment
+	 */
+	void linkSubAlignment(int part);
+
+	/**
+	 * @param pattern_index (OUT) vector of size = alignment length storing pattern index of all sites
+	 * the index of sites in 2nd, 3rd,... genes have to be increased by the number of patterns in previous genes
+	 * so that all indices are distinguishable
+	*/
+	virtual void getSitePatternIndex(IntVector &pattern_index);
+
+	/**
+	 * @param pattern_freq (OUT) vector of site-pattern frequencies for all sub-alignments
+	*/
+	virtual void getPatternFreq(IntVector &pattern_freq);
+
+
+    /**
+            extract sub-alignment of a sub-set of sequences
+            @param aln original input alignment
+            @param seq_id ID of sequences to extract from
+            @param min_true_char the minimum number of non-gap characters, true_char<min_true_char -> delete the sequence
+     */
+    virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char);
+
+    /**
+     * remove identical sequences from alignment
+     * @param not_remove name of sequence where removal is avoided
+     * @param keep_two TRUE to keep 2 out of k identical sequences, false to keep only 1
+     * @param removed_seqs (OUT) name of removed sequences
+     * @param target_seqs (OUT) corresponding name of kept sequence that is identical to the removed sequences
+     * @return this if no sequences were removed, or new alignment if at least 1 sequence was removed
+     */
+    virtual Alignment *removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs);
+
+
+	/**
+		Quit if some sequences contain only gaps or missing data
+	*/
+	//virtual void checkGappySeq(bool force_error = true);
+
+	/**
+		create a non-parametric bootstrap alignment by resampling sites within partitions
+		@param aln input alignment
+		@param pattern_freq (OUT) if not NULL, will store the resampled pattern frequencies
+        @param spec bootstrap specification of the form "l1:b1,l2:b2,...,lk:bk"
+            	to randomly draw b1 sites from the first l1 sites, etc. Note that l1+l2+...+lk
+            	must equal m, where m is the alignment length. Otherwise, an error will occur.
+            	If spec == NULL, a standard procedure is applied, i.e., randomly draw m sites.
+	*/
+	virtual void createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq = NULL, const char *spec = NULL);
+
+	/**
+		resampling pattern frequency by a non-parametric bootstrap 
+		@param pattern_freq (OUT) resampled pattern frequencies
+        @param spec bootstrap specification, see above
+	*/
+	virtual void createBootstrapAlignment(IntVector &pattern_freq, const char *spec = NULL);
+
+	/**
+		resampling pattern frequency by a non-parametric bootstrap
+		@param pattern_freq (OUT) resampled pattern frequencies, written through a raw int pointer
+        @param spec bootstrap specification, see above
+	*/
+	virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL);
+
+	/**
+	 * shuffle alignment by randomizing the order of sites over all sub-alignments
+	 */
+	virtual void shuffleAlignment();
+
+	/**
+		compute the observed (Hamming) distance (number of different pairs of positions per site)
+			between two sequences
+		@param seq1 index of sequence 1
+		@param seq2 index of sequence 2
+		@return the observed distance between seq1 and seq2 (between 0.0 and 1.0)
+	*/
+	virtual double computeObsDist(int seq1, int seq2);
+
+	/**
+		compute the Jukes-Cantor corrected distance between 2 sequences over all partitions
+		@param seq1 index of sequence 1
+		@param seq2 index of sequence 2		
+		@return the distance between seq1 and seq2
+	*/
+	virtual double computeDist(int seq1, int seq2);
+
+	/**
+	 * print the super-alignment to a file
+	 * @param filename name of the output file
+	 * @param append TRUE to append to this file, false to write new file
+	 */
+	void printCombinedAlignment(const char *filename, bool append = false);
+
+	void printCombinedAlignment(ostream &out, bool append = false);
+
+	/** print all sub alignments into files with prefix, suffix is the charset name
+	 * @param params program parameters (NOTE(review): presumably the source of the output prefix - confirm)
+	 * @param part_info information on each partition (NOTE(review): presumably supplies the charset names - confirm)
+	 */
+	void printSubAlignments(Params &params, vector<PartitionInfo> &part_info);
+
+	/**
+		@return unconstrained log-likelihood (without a tree)
+	*/
+	virtual double computeUnconstrainedLogL();
+
+	/**
+	 * @return proportion of missing data in super alignment
+	 */
+	double computeMissingData();
+
+	/**
+	 * build all patterns of super alignment from partitions and taxa_index
+	 * it is in form of a binary alignment, where 0 means absence and 1 means presence
+	 * of a gene in a sequence
+	 */
+	void buildPattern();
+
+	/**
+		actual partition alignments
+	*/
+	vector<Alignment*> partitions;
+
+	/**
+		matrix represents the index of taxon i in partition j, -1 if the taxon is not present
+	*/
+	vector<IntVector> taxa_index;
+
+	/**
+	 * concatenate subset of alignments
+	 * @param ids IDs of sub-alignments
+	 * @return concatenated alignment
+	 */
+    Alignment *concatenateAlignments(IntVector &ids);
+
+
+};
+
+#endif
diff --git a/superalignmentpairwise.cpp b/superalignmentpairwise.cpp
new file mode 100644
index 0000000..49544b3
--- /dev/null
+++ b/superalignmentpairwise.cpp
@@ -0,0 +1,74 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "superalignmentpairwise.h"
+
+SuperAlignmentPairwise::SuperAlignmentPairwise()
+ : AlignmentPairwise()
+{	// default construction: the body is empty, so no per-partition objects are created here
+}
+
+SuperAlignmentPairwise::SuperAlignmentPairwise(PhyloSuperTree *atree, int seq1, int seq2)
+ : AlignmentPairwise()
+{
+	tree = atree;
+	seq_id1 = seq1;
+	seq_id2 = seq2;
+	SuperAlignment *super_aln = (SuperAlignment*) atree->aln;
+	PhyloSuperTree::iterator subtree = atree->begin();
+	for (int part = 0; subtree != atree->end(); ++subtree, ++part) {
+		const int first = super_aln->taxa_index[seq1][part];
+		const int second = super_aln->taxa_index[seq2][part];
+		if (first < 0 || second < 0) continue;	// a negative index excludes this partition for the pair
+		partitions.push_back(new AlignmentPairwise((*subtree), first, second));
+	}
+}
+
+double SuperAlignmentPairwise::computeFunction(double value) {
+	// the result is the sum of every partition's function value at the same x
+	double total = 0.0;
+	for (size_t p = 0; p < partitions.size(); p++)
+		total += partitions[p]->computeFunction(value);
+	return total;
+}
+
+
+void SuperAlignmentPairwise::computeFuncDerv(double value, double &df, double &ddf) {
+	// Both outputs are reset and then accumulate the first and second
+	// derivatives reported by each partition for the same x-value.
+	df = 0.0;
+	ddf = 0.0;
+	for (size_t p = 0; p < partitions.size(); p++) {
+		AlignmentPairwise *part = partitions[p];
+		double part_df, part_ddf;
+		part->computeFuncDerv(value, part_df, part_ddf);
+		df += part_df;
+		ddf += part_ddf;
+	}
+}
+
+
+SuperAlignmentPairwise::~SuperAlignmentPairwise()
+{
+	// free the per-partition objects created with new, last one first, leaving the vector empty
+	for (; !partitions.empty(); partitions.pop_back())
+		delete partitions.back();
+}
+
+
diff --git a/superalignmentpairwise.h b/superalignmentpairwise.h
new file mode 100644
index 0000000..5c6c86d
--- /dev/null
+++ b/superalignmentpairwise.h
@@ -0,0 +1,66 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef SUPERALIGNMENTPAIRWISE_H
+#define SUPERALIGNMENTPAIRWISE_H
+
+#include "alignmentpairwise.h"
+#include "superalignment.h"
+#include "phylosupertree.h"
+
+/**
+	Pairwise alignment of two sequences across the partitions of a PhyloSuperTree. @author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class SuperAlignmentPairwise : public AlignmentPairwise
+{
+public:
+    SuperAlignmentPairwise();
+	/**
+		construct the pairwise alignment from two sequences of the super-alignment of a super tree
+		@param atree super tree; its aln member is used as a SuperAlignment
+		@param seq1 ID of the first sequence
+		@param seq2 ID of the second sequence
+	*/
+	SuperAlignmentPairwise(PhyloSuperTree *atree, int seq1, int seq2);
+
+    ~SuperAlignmentPairwise();
+
+	/**
+		compute the likelihood for a distance between two sequences. Used for the ML optimization of the distance.
+		@param value x-value of the function
+		@return sum of computeFunction(value) over all entries of partitions
+	*/
+	virtual double computeFunction(double value);
+
+
+	/**
+		This function calculates the first derivative f'(value) and the 2nd derivative f''(value),
+		each summed over all entries of partitions.
+		@param value x-value of the function
+		@param df (OUT) first derivative
+		@param ddf (OUT) second derivative
+		There is no return value; results are delivered only through df and ddf.
+	*/
+	virtual void computeFuncDerv(double value, double &df, double &ddf);
+	/** one pairwise object per partition, filled by the 3-argument constructor and deleted by the destructor */
+	vector<AlignmentPairwise*> partitions;
+
+};
+
+#endif
diff --git a/supernode.cpp b/supernode.cpp
new file mode 100644
index 0000000..2802295
--- /dev/null
+++ b/supernode.cpp
@@ -0,0 +1,56 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "supernode.h"
+
+SuperNode::SuperNode()	// every constructor forwards its arguments to PhyloNode and then calls init()
+ : PhyloNode()
+{
+	init();
+}
+
+// construct from a node id
+SuperNode::SuperNode(int aid) : PhyloNode(aid)
+{
+	init();
+}
+// construct from a node id and an integer name
+SuperNode::SuperNode(int aid, int aname) : PhyloNode (aid, aname) {
+	init();
+}
+
+// construct from a node id and a C-string name
+SuperNode::SuperNode(int aid, const char *aname) : PhyloNode(aid, aname) {
+	init();
+}
+
+void SuperNode::init() {	// called by every constructor; currently does nothing
+	//partial_lh = NULL;
+}
+
+
+void SuperNode::addNeighbor(Node *node, double length, int id) {	// appends a heap-allocated SuperNeighbor to neighbors
+	neighbors.push_back(new SuperNeighbor(node, length, id));
+}
+
+SuperNode::~SuperNode()
+{	// empty body: this class releases nothing itself
+}
+
+
diff --git a/supernode.h b/supernode.h
new file mode 100644
index 0000000..65bfc38
--- /dev/null
+++ b/supernode.h
@@ -0,0 +1,114 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef SUPERNODE_H
+#define SUPERNODE_H
+
+#include "phylonode.h"
+
+typedef vector<PhyloNeighbor*> PhyloNeighborVec;	// vector of PhyloNeighbor pointers
+
+/**
+A neighbor in a phylogenetic SUPER tree
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class SuperNeighbor : public PhyloNeighbor {
+
+	friend class SuperNode;
+	friend class PhyloSuperTree;
+
+public:
+	/**
+		construct class with a node and length; both are forwarded to PhyloNeighbor
+		@param anode the other end of the branch
+		@param alength length of branch
+	*/
+	SuperNeighbor(Node *anode, double alength) : PhyloNeighbor(anode, alength) {	
+	}
+
+	/**
+		construct class with a node, length and branch ID; all are forwarded to PhyloNeighbor
+		@param anode the other end of the branch
+		@param alength length of branch
+		@param aid branch ID
+	*/
+	SuperNeighbor(Node *anode, double alength, int aid) : PhyloNeighbor(anode, alength, aid) {	
+	}
+
+	/**
+		vector of size m (m = #partitions); NOTE(review): presumably entry j is the linked neighbor in partition tree j - confirm
+	*/
+	PhyloNeighborVec link_neighbors;
+
+};
+
+/**
+Node of a super tree
+
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class SuperNode : public PhyloNode
+{
+	friend class PhyloSuperTree;
+
+public:
+	/**
+		default constructor
+	*/
+    SuperNode();
+
+	/**
+		constructor 
+		@param aid id of this node
+	*/
+	SuperNode(int aid);
+
+	/**
+		constructor 
+		@param aid id of this node
+		@param aname name of this node, given as an integer
+	*/
+	SuperNode(int aid, int aname);
+
+	/**
+		constructor 
+		@param aid id of this node
+		@param aname name of this node, given as a C string
+	*/
+	SuperNode(int aid, const char *aname);
+
+	/**
+		initialization, called by every constructor
+	*/
+	void init();
+
+	/**
+		add a neighbor; the new entry is created as a SuperNeighbor
+		@param node the neighbor node
+		@param length branch length
+		@param id branch ID (defaults to -1)
+	*/
+	virtual void addNeighbor(Node *node, double length, int id = -1);
+
+    ~SuperNode();
+
+};
+
+#endif
diff --git a/timeutil.h b/timeutil.h
new file mode 100644
index 0000000..7151d70
--- /dev/null
+++ b/timeutil.h
@@ -0,0 +1,287 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef TIMEUTIL_H
+#define TIMEUTIL_H
+
+#include <iqtree_config.h>
+#include <stdlib.h>
+
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+#if !defined(_MSC_VER)
+#include <sys/time.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+//#if defined(_MSC_VER)
+//#define inline __inline
+//#endif
+
+#if (defined _WIN32 || defined __WIN32__ || defined WIN32) 
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x500
+#endif
+#endif
+
+#ifdef HAVE_GETRUSAGE
+	#include <sys/resource.h>
+#else 
+	#if (defined _WIN32 || defined __WIN32__) && ! defined __CYGWIN__
+	# include <windows.h>
+	#else
+	# include <sys/times.h>
+	# include <unistd.h>
+	#endif
+#endif /* HAVE_GETRUSAGE */
+
+/*********************************************
+ * gettimeofday() replacements, compiled only when HAVE_GETTIMEOFDAY is not defined
+ ********************************************/
+#ifndef HAVE_GETTIMEOFDAY
+	#if defined WIN32 || defined _WIN32 || defined __WIN32__
+	#include <sys/timeb.h>
+	#include <sys/types.h>
+	#include <winsock.h>
+
+	struct timezone {	/* placeholder type; the function below never reads it */
+		char dummy;
+	};
+
+	__inline void gettimeofday(struct timeval* t, void* timezone)	/* returns void; timezone is ignored */
+	{       
+		struct _timeb timebuffer;
+		_ftime( &timebuffer );
+		t->tv_sec=timebuffer.time;
+		t->tv_usec=1000*timebuffer.millitm;	/* milliseconds scaled to microseconds */
+	}
+	#else /* UNIX */
+	#include <sys/time.h>
+	__inline void gettimeofday(struct timeval* t, void* timezone) {	/* returns void; timezone is ignored */
+		time_t cur_time;
+		time(&cur_time);
+		t->tv_sec = cur_time;
+		t->tv_usec = 0;	/* time() only delivers whole seconds */
+	}
+	#endif
+#endif /* HAVE_GETTIMEOFDAY */
+
+
+
+/**
+ * @return user-mode CPU time consumed by this process, in seconds (resolution up to microseconds).
+ * The Windows and times() variants call abort() when the system query fails.
+ */
+__inline double getCPUTime() {
+#ifdef HAVE_GETRUSAGE
+	struct rusage usage;
+	getrusage(RUSAGE_SELF, &usage);
+	return (usage.ru_utime.tv_sec + (double)usage.ru_utime.tv_usec / 1.0e6);
+#elif (defined _WIN32 || defined __WIN32__) && ! defined __CYGWIN__
+	/* Only user_time is used below; the other three are required by the API.  */
+	FILETIME creation_time;
+	FILETIME exit_time;
+	FILETIME kernel_time;
+	FILETIME user_time;
+
+	if (GetProcessTimes (GetCurrentProcess (),
+						&creation_time, &exit_time,
+						&kernel_time, &user_time))
+	{
+		/* Convert to microseconds, rounding.  */
+		uint64_t user_usec = ((((uint64_t) user_time.dwHighDateTime << 32) | (uint64_t) user_time.dwLowDateTime) + 5) / 10;
+		return (double)user_usec / 1.0e6;
+	}
+#else
+	/* Only the user-mode tick count (tms_utime) is used below.  */
+	struct tms time;
+
+	if (times (&time) != (clock_t) -1) {
+		long clocks_per_second = sysconf (_SC_CLK_TCK);	/* sysconf() returns -1 on error: keep it signed so the test below rejects it */
+		if (clocks_per_second > 0) {
+			uint64_t user_usec;
+			user_usec = (((uint64_t) time.tms_utime * (uint64_t) 1000000U) + (uint64_t) clocks_per_second / 2) / (uint64_t) clocks_per_second;
+			return (double)user_usec / 1.0e6;
+		}
+	}
+#endif
+	abort();
+}
+
+/**
+ * @return wall-clock time in seconds as a double: the value of omp_get_wtime() when built
+ * with OpenMP, otherwise the gettimeofday() seconds plus the microsecond part as a fraction
+ */
+__inline double getRealTime() {
+#ifndef _OPENMP
+	/* the result of gettimeofday() is not tested: the fallback versions in this header return void */
+	struct timeval now;
+	gettimeofday(&now, NULL);
+	return ((double)now.tv_usec / 1.0e6 + now.tv_sec);
+#else
+	return omp_get_wtime();
+#endif
+}
+/*
+#if defined _WIN32 || defined __WIN32__ || defined WIN32
+#include <windows.h>
+#include <winbase.h>
+inline uint64_t getTotalSystemMemory()
+{
+    MEMORYSTATUSEX status;
+    status.dwLength = sizeof(status);
+    GlobalMemoryStatusEx(&status);
+    return status.ullTotalPhys;
+}
+
+#elif defined __APPLE__ || defined __MACH__
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+inline uint64_t getTotalSystemMemory()
+{
+	int mib[2];
+	uint64_t physical_memory;
+	mib[0] = CTL_HW;
+	mib[1] = HW_MEMSIZE;
+	size_t length = sizeof(uint64_t);
+	sysctl(mib, 2, &physical_memory, &length, NULL, 0);
+	return physical_memory;
+}
+#else
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/sysinfo.h>
+
+inline uint64_t getTotalSystemMemory()
+{
+    struct sysinfo memInfo;
+	sysinfo (&memInfo);
+	int64_t totalram = memInfo.totalram;
+	return (totalram * memInfo.mem_unit);
+}
+
+#endif*/ /* for declaring getTotalSystemMemory() */
+
+/*
+ * Author:  David Robert Nadeau
+ * Site:    http://NadeauSoftware.com/
+ * License: Creative Commons Attribution 3.0 Unported License
+ *          http://creativecommons.org/licenses/by/3.0/deed.en_US
+ */
+
+#if defined(_WIN32)
+#include <Windows.h>
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#if defined(BSD)
+#include <sys/sysctl.h>
+#endif
+
+#else
+#error "Unable to define getMemorySize( ) for an unknown OS."
+#endif
+
+
+
+/**
+ * Returns the size of physical memory (RAM) in bytes; the sysctl() variants return 0 when the query fails.
+ */
+__inline uint64_t getMemorySize( )
+{
+#if defined(_WIN32) && (defined(__CYGWIN__) || defined(__CYGWIN32__))
+	/* Cygwin under Windows. ------------------------------------ */
+	/* New 64-bit MEMORYSTATUSEX isn't available.  Use old 32.bit */
+#warning "getMemorySize() will be wrong if RAM is actually > 4GB"
+	MEMORYSTATUS status;
+	status.dwLength = sizeof(status);
+	GlobalMemoryStatus( &status );
+	return (uint64_t)status.dwTotalPhys;
+
+#elif defined(_WIN32)
+	/* Windows. ------------------------------------------------- */
+	/* Use new 64-bit MEMORYSTATUSEX, not old 32-bit MEMORYSTATUS */
+	MEMORYSTATUSEX status;
+	status.dwLength = sizeof(status);
+	GlobalMemoryStatusEx( &status );
+	return (uint64_t)status.ullTotalPhys;
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
+	/* UNIX variants. ------------------------------------------- */
+	/* Prefer sysctl() over sysconf() except sysctl() HW_REALMEM and HW_PHYSMEM */
+
+#if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64))
+	int mib[2];
+	mib[0] = CTL_HW;
+#if defined(HW_MEMSIZE)
+	mib[1] = HW_MEMSIZE;            /* OSX. --------------------- */
+#elif defined(HW_PHYSMEM64)
+	mib[1] = HW_PHYSMEM64;          /* NetBSD, OpenBSD. --------- */
+#endif
+	uint64_t size = 0;               /* 64-bit */
+	size_t len = sizeof( size );
+	if ( sysctl( mib, 2, &size, &len, NULL, 0 ) == 0 )
+		return (uint64_t)size;
+	return 0L;			/* Failed? */
+
+#elif defined(_SC_AIX_REALMEM)
+	/* AIX. ----------------------------------------------------- */
+	return (uint64_t)sysconf( _SC_AIX_REALMEM ) * (uint64_t)1024L;
+
+#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
+	/* FreeBSD, Linux, OpenBSD, and Solaris. -------------------- */
+	return (uint64_t)sysconf( _SC_PHYS_PAGES ) *
+		(uint64_t)sysconf( _SC_PAGESIZE );
+
+#elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE)
+	/* Legacy. -------------------------------------------------- */
+	return (uint64_t)sysconf( _SC_PHYS_PAGES ) *
+		(uint64_t)sysconf( _SC_PAGE_SIZE );
+
+#elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM))
+	/* DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX. -------- */
+	int mib[2];
+	mib[0] = CTL_HW;
+#if defined(HW_REALMEM)
+	mib[1] = HW_REALMEM;		/* FreeBSD. ----------------- */
+#elif defined(HW_PHYSMEM)
+	mib[1] = HW_PHYSMEM;		/* Others. ------------------ */
+#endif
+	uint64_t size = 0;		/* zeroed 64-bit buffer that receives the sysctl() result */
+	size_t len = sizeof( size );
+	if ( sysctl( mib, 2, &size, &len, NULL, 0 ) == 0 )
+		return (uint64_t)size;
+	return 0L;			/* Failed? */
+#endif /* sysctl and sysconf variants */
+
+#else
+	return 0L;			/* Unknown OS. */
+#endif
+}
+
+#endif
diff --git a/tinatree.cpp b/tinatree.cpp
new file mode 100644
index 0000000..794bfa6
--- /dev/null
+++ b/tinatree.cpp
@@ -0,0 +1,141 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#include "tinatree.h"
+
+/** Default constructor: only runs the PhyloTree default constructor. */
+TinaTree::TinaTree() : PhyloTree() {}
+/**
+ * Constructor with given alignment; the alignment is forwarded to PhyloTree.
+ * @param alignment alignment handed to the PhyloTree constructor
+ */
+TinaTree::TinaTree(Alignment *alignment)
+    : PhyloTree(alignment)
+{
+}
+
+
+/** Destructor: the body is empty, no cleanup is performed at this level. */
+TinaTree::~TinaTree() {}
+
+
+/**
+ * SLOW VERSION: recursively compute the parsimony score of one alignment
+ * pattern for the part of the tree reached from `node` away from `dad`.
+ * @param ptn    pattern ID
+ * @param states (OUT) set of admissible states at the current node (bit mask)
+ * @param node   the current node; NULL means "start at the root"
+ * @param dad    dad of the node, used to direct the search
+ * @return the parsimony score accumulated over the visited nodes
+ */
+int TinaTree::computeParsimonyScore(int ptn, int &states, PhyloNode *node, PhyloNode *dad) {
+    int num_changes = 0;
+    states = 0;
+    if (node == NULL)
+        node = (PhyloNode*) root;
+    if (node->degree() > 3)
+        outError("Does not work with multifurcating tree");
+    if (verbose_mode == VB_DEBUG)
+        cout << ptn << " " << node->id << "  " << node->name << endl;
+
+    if (node->isLeaf()) {
+        // turn the character observed at this leaf into a bit mask of states
+        char leaf_state;
+        if (node->name == ROOT_NAME)
+            leaf_state = aln->STATE_UNKNOWN;
+        else {
+            assert(node->id < aln->getNSeq());
+            leaf_state = (*aln)[ptn][node->id];
+        }
+
+        if (leaf_state == aln->STATE_UNKNOWN)
+            states = (1 << aln->num_states) - 1;    // all state bits set
+        else if (leaf_state < aln->num_states)
+            states = (1 << leaf_state);             // exactly one state bit
+        else
+            states = leaf_state - 3;                // ambiguous character, for DNA, RNA
+    }
+    if (!node->isLeaf() || node == root) {
+        // start from this node's own mask when it has one, otherwise from the
+        // neutral elements of OR (0) and AND (all bits)
+        int any_states = 0;
+        int common_states = (1 << aln->num_states) - 1;
+        if (states != 0)
+            any_states = common_states = states;
+
+        FOR_NEIGHBOR_IT(node, dad, nei_it) {
+            int child_states;
+            int child_score = computeParsimonyScore(ptn, child_states, (PhyloNode*) ((*nei_it)->node), node);
+            any_states |= child_states;
+            common_states &= child_states;
+            num_changes += child_score;
+        }
+
+        // keep the intersection when it is non-empty; otherwise fall back to
+        // the union and count one more change
+        states = common_states;
+        if (states == 0) {
+            states = any_states;
+            num_changes++;
+        }
+    }
+    return num_changes;
+}
+
+
+/**
+ * SLOW VERSION: compute the parsimony score of the tree, given the alignment.
+ * Patterns flagged is_const are skipped; every other pattern contributes its
+ * score multiplied by the pattern frequency. At verbosity VB_MAX and above
+ * each scored pattern is also printed together with its score.
+ * @return the parsimony score
+ */
+int TinaTree::computeParsimonyScore() {
+    assert(root && root->isLeaf());
+
+    int total = 0;
+    for (int ptn = 0; ptn < aln->size(); ptn++) {
+        if (aln->at(ptn).is_const)
+            continue;
+
+        int root_states;
+        int ptn_score = computeParsimonyScore(ptn, root_states);
+        total += ptn_score * (*aln)[ptn].frequency;
+
+        if (verbose_mode < VB_MAX)
+            continue;
+        for (int seq = 0; seq < aln->getNSeq(); seq++)
+            cout << aln->convertStateBackStr(aln->at(ptn)[seq]);
+        cout << " " << ptn_score << endl;
+    }
+    if (verbose_mode >= VB_MAX)
+        cout << endl;
+    return total;
+}
+
+/**
+ * Run the recursive overload with its default node/dad arguments and check
+ * that the running index ended at (nodeNum - 1) * 2.
+ */
+void TinaTree::initializeAllPartialLh() {
+    int next_slot;      // set to 0 by the recursive overload when it starts
+    int unused_lh;      // only passed through
+    initializeAllPartialLh(next_slot, unused_lh);
+    assert(next_slot == (nodeNum - 1)*2);
+}
+
+/**
+ * Hand out slices of central_partial_pars to the two neighbor records of
+ * every (node, dad) pair, walking the tree recursively.
+ * @param index   (IN/OUT) next free slice; reset to 0 when node is NULL
+ * @param indexlh passed through to the recursive calls, not used otherwise
+ * @param node    the current node; NULL means "start at the root"
+ * @param dad     dad of the node, used to direct the traversal
+ */
+void TinaTree::initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node, PhyloNode *dad) {
+    int block_size = getBitsBlockSize();
+    if (node == NULL) {
+        node = (PhyloNode*) root;
+
+        // allocate the big central partial parsimony memory on first use
+        if (central_partial_pars == NULL) {
+            if (verbose_mode >= VB_MED)
+                cout << "Allocating " << (leafNum - 1)*4 * block_size * sizeof (UINT) << " bytes for partial parsimony vectors" << endl;
+            central_partial_pars = new UINT[(leafNum-1)*4*block_size];
+            // NOTE(review): a plain new[] signals failure by throwing
+            // std::bad_alloc, so this check presumably never fires - confirm
+            if (!central_partial_pars)
+                outError("Not enough memory for partial parsimony vectors");
+        }
+        index = 0;
+    }
+    if (dad != NULL) {
+        // assign a region in central_partial_pars to both neighbors
+        // (node->dad first, then dad->node)
+        PhyloNeighbor *node_to_dad = (PhyloNeighbor*) node->findNeighbor(dad);
+        node_to_dad->partial_pars = central_partial_pars + (index * block_size);
+
+        PhyloNeighbor *dad_to_node = (PhyloNeighbor*) dad->findNeighbor(node);
+        dad_to_node->partial_pars = central_partial_pars + ((index + 1) * block_size);
+
+        index += 2;
+        assert(index < nodeNum * 2 - 1);
+    }
+    FOR_NEIGHBOR_IT(node, dad, child_it)
+        initializeAllPartialLh(index, indexlh, (PhyloNode*) (*child_it)->node, node);
+}
diff --git a/tinatree.h b/tinatree.h
new file mode 100644
index 0000000..5f40542
--- /dev/null
+++ b/tinatree.h
@@ -0,0 +1,61 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+#ifndef TINATREE_H
+#define TINATREE_H
+
+#include "phylotree.h"
+
+/**
+	PhyloTree subclass declaring a slow recursive parsimony scorer and its own
+	initializeAllPartialLh() overrides.
+	@author BUI Quang Minh <minh.bui at univie.ac.at>
+*/
+class TinaTree : public PhyloTree
+{
+public:
+    /** Default constructor. */
+    TinaTree();
+
+    /**
+     * Constructor with given alignment
+     * @param alignment alignment forwarded to PhyloTree
+     */
+    TinaTree(Alignment *alignment);
+
+    /** Destructor. */
+    ~TinaTree();
+
+    /**
+            SLOW VERSION: compute the parsimony score of the tree, given the alignment
+            @return the parsimony score
+     */
+    int computeParsimonyScore();
+
+    /**
+            SLOW VERSION: compute the parsimony score of a single pattern
+            @param ptn pattern ID
+            @param states (OUT) set of admissible states at the current node (in binary code)
+            @param node the current node (NULL to start at the root)
+            @param dad dad of the node, used to direct the search
+            @return the parsimony score
+     */
+    int computeParsimonyScore(int ptn, int &states, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+    /**
+            Non-recursive entry point; the definition calls the recursive overload.
+     */
+	virtual void initializeAllPartialLh();
+
+    /**
+            Recursive worker behind initializeAllPartialLh()
+            @param index (IN/OUT) running slot index
+            @param indexlh (IN/OUT) second running index
+            @param node the current node (NULL to start at the root)
+            @param dad dad of the node, used to direct the traversal
+     */
+	virtual void initializeAllPartialLh(int &index, int &indexlh, PhyloNode *node = NULL, PhyloNode *dad = NULL);
+
+};
+
+#endif
diff --git a/tools.cpp b/tools.cpp
new file mode 100644
index 0000000..41a42d6
--- /dev/null
+++ b/tools.cpp
@@ -0,0 +1,3654 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(WIN32)
+#include <execinfo.h>
+#include <cxxabi.h>
+#endif
+
+#include <cerrno>
+#include <climits>
+
+#include "tools.h"
+#include "timeutil.h"
+
+VerboseMode verbose_mode;
+
+/*
+        WIN32 does not define gettimeofday() function.
+        Here declare it extra for WIN32 only.
+ */
+//#if defined(WIN32) && !defined(HAVE_GETTIMEOFDAY)
+#if defined(WIN32)
+#include <sstream>
+#endif
+//
+//struct timezone {
+//};
+//
+//void gettimeofday(struct timeval* t, void* timezone) {
+//    struct _timeb timebuffer;
+//    _ftime(&timebuffer);
+//    t->tv_sec = timebuffer.time;
+//    t->tv_usec = 1000 * timebuffer.millitm;
+//}
+//#else
+//#include <sys/time.h>
+//#endif
+
+
+/********************************************************
+        Defining DoubleMatrix methods
+ ********************************************************/
+
+/*DoubleMatrix::DoubleMatrix(int arows, int acols) {
+        rows = arows;
+        cols = acols;
+        size =  rows * cols;
+        value = new double[size];
+}
+
+void DoubleMatrix::setZero() {
+        memset(value, 0, size * sizeof(double));
+}
+
+
+DoubleMatrix::~DoubleMatrix() {
+        if (value) delete value;
+        value = NULL;
+}
+ */
+
+/********************************************************
+        Miscellaneous
+ ********************************************************/
+
+/**
+        Output an error to screen, then exit program
+        @param error error message
+ */
+/*
+void outError(char *error)
+{
+        cerr << "ERROR: " << error << endl;
+        exit(2);
+}
+ */
+
+
+/**
+        Print "ERROR: <message>" to cerr and, if requested, exit the program.
+        A stack trace is printed first when error compares equal to ERR_NO_MEMORY.
+        @param error error message
+        @param quit true to call exit(2) after printing
+ */
+void outError(const char *error, bool quit) {
+    const bool out_of_memory = (error == ERR_NO_MEMORY);
+    if (out_of_memory)
+        print_stacktrace(cerr);
+
+    cerr << "ERROR: " << error << endl;
+
+    if (!quit)
+        return;
+    exit(2);
+}
+
+/**
+        String overload: forwards the message text to the C-string version.
+        @param error error message
+        @param quit forwarded unchanged
+ */
+void outError(string error, bool quit) {
+    const char *text = error.c_str();
+    outError(text, quit);
+}
+
+/**
+        Report the concatenation of two C strings as one error message.
+        @param error first part of the message
+        @param msg text appended directly after error
+        @param quit forwarded unchanged
+ */
+void outError(const char *error, const char *msg, bool quit) {
+    string full(error);
+    full.append(msg);
+    outError(full, quit);
+}
+
+/**
+        Report a C string followed by a string as one error message.
+        @param error first part of the message
+        @param msg text appended directly after error
+        @param quit forwarded unchanged
+ */
+void outError(const char *error, string msg, bool quit) {
+    // msg is a by-value copy, so the prefix can be spliced in place
+    msg.insert(0, error);
+    outError(msg, quit);
+}
+
+/**
+        Print "WARNING: <message>" to cerr; the program keeps running.
+        @param warn warning message
+ */
+void outWarning(const char *warn) {
+    cerr << "WARNING: ";
+    cerr << warn << endl;
+}
+
+/**
+        String overload: forwards the message text to the C-string version.
+        @param warn warning message
+ */
+void outWarning(string warn) {
+    const char *text = warn.c_str();
+    outWarning(text);
+}
+
+/**
+        Draw a random length as -mean_len * log(u), then pull it back towards
+        [min_len, max_len] with a small random offset when it falls outside.
+        @param params supplies mean_len, min_len and max_len
+        @return the generated length
+ */
+double randomLen(Params &params) {
+    // presumably random_int(n) yields 0..n-1, which keeps u in (0, 1] - confirm
+    const double u = static_cast<double> (random_int(999) + 1) / 1000;
+    double len = -params.mean_len * log(u);
+
+    if (len < params.min_len) {
+        // too short: place it slightly above the minimum
+        const double delta = static_cast<double> (random_int(1000)) / 1000.0;
+        len = params.min_len + delta / 1000.0;
+    }
+
+    if (len > params.max_len) {
+        // too long: place it slightly below the maximum
+        const double delta = static_cast<double> (random_int(1000)) / 1000.0;
+        len = params.max_len - delta / 1000.0;
+    }
+    return len;
+}
+
+//From Tung
+
+/**
+        Format an int through a string stream.
+        @param number value to format
+        @return the text the stream produced for number
+ */
+string convertIntToString(int number) {
+    ostringstream out;
+    out << number;
+    return out.str();
+}
+
+/**
+        Format a 64-bit integer through a string stream.
+        @param number value to format
+        @return the text the stream produced for number
+ */
+string convertInt64ToString(int64_t number) {
+    ostringstream out;
+    out << number;
+    return out.str();
+}
+
+/**
+        Format a double through a string stream with the stream's default
+        formatting settings.
+        @param number value to format
+        @return the text the stream produced for number
+ */
+string convertDoubleToString(double number) {
+    ostringstream out;
+    out << number;
+    return out.str();
+}
+
+//From Tung
+
+/**
+        Copy the content of one file into another.
+        @param SRC path of the file to read
+        @param DEST path of the file to write
+        @return false if either file could not be opened, true otherwise
+ */
+bool copyFile(const char SRC[], const char DEST[]) {
+    // both streams are opened in binary mode
+    std::ifstream src(SRC, std::ios::binary);
+    std::ofstream dest(DEST, std::ios::binary);
+
+    if (!(src.is_open() && dest.is_open()))
+        return false; // could not be copied
+
+    dest << src.rdbuf(); // stream the whole source buffer across
+    dest.close();
+    src.close();
+    return true;
+}
+
+/**
+        Check whether a path exists by asking stat() for its attributes.
+        A failing stat() is reported as "does not exist"; stat() can also fail
+        for other reasons, which are not distinguished here.
+        @param strFilename path to test
+        @return true if stat() succeeded
+ */
+bool fileExists(string strFilename) {
+    struct stat info;
+    return stat(strFilename.c_str(), &info) == 0;
+}
+
+/**
+        Convert a whole string to an int (base 10).
+        @param str string to parse
+        @return the parsed value
+        @throw string if str does not start with an integer, has trailing
+               characters, or holds a value that does not fit into an int
+ */
+int convert_int(const char *str) throw (string) {
+    char *endptr;
+    errno = 0; // strtol reports overflow only through errno
+    long value = strtol(str, &endptr, 10);
+
+    bool nothing_parsed = (endptr == str);
+    // the old test compared an int against HUGE_VALL and could never be true
+    bool out_of_range = (errno == ERANGE || value < INT_MIN || value > INT_MAX);
+    if (nothing_parsed || out_of_range || *endptr != 0) {
+        string err = "Expecting integer, but found \"";
+        err += str;
+        err += "\" instead";
+        throw err;
+    }
+
+    return (int) value;
+}
+
+/**
+        Convert the leading integer of a string to an int (base 10).
+        Trailing characters are allowed.
+        @param str string to parse
+        @param end_pos (OUT) offset in str of the first character after the number
+        @return the parsed value
+        @throw string if str does not start with an integer or the value does
+               not fit into an int
+ */
+int convert_int(const char *str, int &end_pos) throw (string) {
+	char *endptr;
+	errno = 0; // strtol reports overflow only through errno
+	long value = strtol(str, &endptr, 10);
+
+	// the old test compared an int against HUGE_VALL and could never be true
+	bool out_of_range = (errno == ERANGE || value < INT_MIN || value > INT_MAX);
+	if (endptr == str || out_of_range) {
+		string err = "Expecting integer, but found \"";
+		err += str;
+		err += "\" instead";
+		throw err;
+	}
+	end_pos = endptr - str;
+	return (int) value;
+}
+
+/**
+        Parse a comma-separated list of integers (base 10).
+        @param str string to parse
+        @param vec (OUT) cleared first, then filled with the parsed values
+        @throw string if an element is not an integer or does not fit into an int
+ */
+void convert_int_vec(const char *str, IntVector &vec) throw (string) {
+    const char *beginptr = str;
+    char *endptr;
+    vec.clear();
+    do {
+		errno = 0; // strtol reports overflow only through errno
+		long value = strtol(beginptr, &endptr, 10);
+
+		// the old test compared an int against HUGE_VALL and could never be true
+		bool out_of_range = (errno == ERANGE || value < INT_MIN || value > INT_MAX);
+		if (endptr == beginptr || out_of_range) {
+			string err = "Expecting integer, but found \"";
+			err += beginptr;
+			err += "\" instead";
+			throw err;
+		}
+		vec.push_back((int) value);
+		// step over a single separating comma, if there is one
+		if (*endptr == ',') endptr++;
+		beginptr = endptr;
+    } while (*endptr != 0);
+}
+
+
+/**
+        Convert a whole string to a double.
+        @param str string to parse
+        @return the parsed value
+        @throw string if str does not start with a number, has trailing
+               characters, or the parsed magnitude equals HUGE_VALF
+ */
+double convert_double(const char *str) throw (string) {
+    char *stop;
+    const double value = strtod(str, &stop);
+
+    const bool nothing_parsed = (value == 0.0 && stop == str);
+    const bool too_large = (fabs(value) == HUGE_VALF);
+    const bool trailing = (*stop != 0);
+    if (!(nothing_parsed || too_large || trailing))
+        return value;
+
+    throw string("Expecting floating-point number, but found \"") + str + "\" instead";
+}
+
+/**
+        Convert the leading number of a string to a double.
+        Trailing characters are allowed.
+        @param str string to parse
+        @param end_pos (OUT) offset in str of the first character after the number
+        @return the parsed value
+        @throw string if str does not start with a number or the parsed
+               magnitude equals HUGE_VALF
+ */
+double convert_double(const char *str, int &end_pos) throw (string) {
+	char *stop;
+	const double value = strtod(str, &stop);
+
+	const bool nothing_parsed = (value == 0.0 && stop == str);
+	const bool too_large = (fabs(value) == HUGE_VALF);
+	if (nothing_parsed || too_large)
+		throw string("Expecting floating-point number, but found \"") + str + "\" instead";
+
+	end_pos = stop - str;
+	return value;
+}
+
+/**
+        Parse a comma-separated list of floating-point numbers.
+        @param str string to parse
+        @param vec (OUT) cleared first, then filled with the parsed values
+        @throw string if an element is not a number or its magnitude equals HUGE_VALF
+ */
+void convert_double_vec(const char *str, DoubleVector &vec) throw (string) {
+    const char *cursor = str;
+    char *stop;
+    vec.clear();
+    do {
+		const double value = strtod(cursor, &stop);
+
+		const bool nothing_parsed = (value == 0.0 && stop == cursor);
+		if (nothing_parsed || fabs(value) == HUGE_VALF)
+			throw string("Expecting floating-point number, but found \"") + cursor + "\" instead";
+
+		vec.push_back(value);
+		// step over a single separating comma, if there is one
+		if (*stop == ',')
+			stop++;
+		cursor = stop;
+    } while (*stop != 0);
+}
+
+/**
+        Format a duration as "<hours>h:<minutes>m:<seconds>s".
+        The fractional part of sec is dropped with floor().
+        @param sec duration in seconds
+        @return the formatted duration
+ */
+string convert_time(const double sec) {
+    const int whole = (int) floor(sec);
+    const int hours = whole / 3600;
+    const int mins = (whole % 3600) / 60;
+    const int secs = whole % 60;
+
+    ostringstream out;
+    out << hours << "h:" << mins << "m:" << secs << "s";
+    return out.str();
+}
+
+/**
+        Parse one base-10 int field of a range string.
+        @param str start of the field
+        @param endptr (OUT) first character after the field
+        @param allow_colon true if the field may be followed by ':' as well as
+               by the end of the string
+        @return the parsed value
+        @throw string if the field is not an integer, does not fit into an
+               int, or is followed by an unexpected character
+ */
+static int parseIntRangeField(const char *str, char **endptr, bool allow_colon) {
+    errno = 0; // strtol reports overflow only through errno
+    long value = strtol(str, endptr, 10);
+
+    bool nothing_parsed = (*endptr == str);
+    // the old test compared an int against HUGE_VALL and could never be true
+    bool out_of_range = (errno == ERANGE || value < INT_MIN || value > INT_MAX);
+    bool bad_terminator = (**endptr != 0) && !(allow_colon && **endptr == ':');
+    if (nothing_parsed || out_of_range || bad_terminator) {
+        string err = "Expecting integer, but found \"";
+        err += str;
+        err += "\" instead";
+        throw err;
+    }
+    return (int) value;
+}
+
+/**
+        Parse an integer range of the form "a", "a:b" or "a:b:c".
+        "a" sets only upper (lower and step_size are left untouched),
+        "a:b" sets lower and upper, "a:b:c" additionally sets step_size.
+        @param str string to parse
+        @param lower (OUT) lower bound, written only when two fields are present
+        @param upper (OUT) upper bound
+        @param step_size (OUT) step size, written only when three fields are present
+        @throw string if a field is not an integer or does not fit into an int
+ */
+void convert_range(const char *str, int &lower, int &upper, int &step_size) throw (string) {
+    char *endptr;
+
+    // parse the lower bound of the range
+    int first = parseIntRangeField(str, &endptr, true);
+    upper = first;
+    if (*endptr == 0) return;
+
+    // parse the upper bound of the range
+    int second = parseIntRangeField(endptr + 1, &endptr, true);
+    lower = first;
+    upper = second;
+    if (*endptr == 0) return;
+
+    // parse the step size of the range, which must end the string
+    step_size = parseIntRangeField(endptr + 1, &endptr, false);
+}
+
+/**
+        Parse a floating-point range of the form "a", "a:b" or "a:b:c".
+        "a" sets only upper (lower and step_size are left untouched),
+        "a:b" sets lower and upper, "a:b:c" additionally sets step_size.
+        @param str string to parse
+        @param lower (OUT) lower bound, written only when two fields are present
+        @param upper (OUT) upper bound
+        @param step_size (OUT) step size, written only when three fields are present
+        @throw string if a field is not a number or is followed by an
+               unexpected character
+ */
+void convert_range(const char *str, double &lower, double &upper, double &step_size) throw (string) {
+    char *stop;
+
+    // first field: the lower bound, or the only value
+    const double first = strtod(str, &stop);
+    if ((first == 0.0 && stop == str) || fabs(first) == HUGE_VALF || (*stop != 0 && *stop != ':'))
+        throw string("Expecting floating-point number, but found \"") + str + "\" instead";
+    upper = first;
+    if (*stop == 0)
+        return;
+
+    // second field: the upper bound
+    str = stop + 1;
+    const double second = strtod(str, &stop);
+    if ((second == 0.0 && stop == str) || fabs(second) == HUGE_VALF || (*stop != 0 && *stop != ':'))
+        throw string("Expecting floating-point number, but found \"") + str + "\" instead";
+    lower = first;
+    upper = second;
+    if (*stop == 0)
+        return;
+
+    // third field: the step size, which must end the string
+    str = stop + 1;
+    const double third = strtod(str, &stop);
+    if ((third == 0.0 && stop == str) || fabs(third) == HUGE_VALF || *stop != 0)
+        throw string("Expecting floating-point number, but found \"") + str + "\" instead";
+    step_size = third;
+}
+
+/**
+        Split a string at every comma.
+        @param str string to split
+        @param vec (OUT) cleared first, then filled with the pieces; the text
+               after the last comma (possibly empty) is always added
+ */
+void convert_string_vec(const char *str, StrVector &vec) throw (string) {
+    vec.clear();
+    const char *cursor = str;
+    for (;;) {
+    	const char *comma = strchr(cursor, ',');
+    	if (comma == NULL) {
+    		// no separator left: the remainder is the last element
+    		vec.push_back(string(cursor));
+    		return;
+    	}
+    	vec.push_back(string(cursor, comma - cursor));
+    	cursor = comma + 1;
+    }
+}
+
+/**
+        Read a scale factor followed by (taxon name, weight) pairs from
+        params.param_file. Reading stops at the end of the file or after ntaxa
+        pairs. Errors are reported through outError().
+        @param params supplies param_file
+        @param ntaxa maximum number of pairs to read
+        @param scale (OUT) the scale factor
+        @param tax_name (OUT) taxon names are appended here
+        @param tax_weight (OUT) taxon weights are appended here
+ */
+void readWeightFile(Params &params, int ntaxa, double &scale, StrVector &tax_name, DoubleVector &tax_weight) {
+    cout << "Reading scale factor and taxa weights file " << params.param_file << " ..." << endl;
+    try {
+        ifstream in;
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(params.param_file);
+        string taxon, token;
+
+        in >> token;
+        scale = convert_double(token.c_str());
+
+        while (!in.eof() && ntaxa > 0) {
+            // a failed name read only ends the loop, so failbit must not throw here
+            in.exceptions(ios::badbit);
+            if (!(in >> taxon))
+                break;
+            // a name without a weight is an error again
+            in.exceptions(ios::failbit | ios::badbit);
+
+            tax_name.push_back(taxon);
+            in >> token;
+            tax_weight.push_back(convert_double(token.c_str()));
+            ntaxa--;
+        }
+        // reset the stream state before failbit is armed again
+        in.clear();
+        in.exceptions(ios::failbit | ios::badbit);
+        in.close();
+    } catch (ios::failure) {
+        outError(ERR_READ_INPUT);
+    } catch (string str) {
+        outError(str);
+    }
+}
+
+/**
+        Read whitespace-separated strings from a file.
+        Reading stops at the end of the file or after max_num strings.
+        A stream failure is reported through outError(ERR_READ_INPUT).
+        @param filename file to read
+        @param max_num maximum number of strings to read
+        @param strv (OUT) the strings are appended here
+ */
+void readStringFile(const char* filename, int max_num, StrVector &strv) {
+    try {
+        ifstream in;
+        // failbit is armed while opening so that a missing file throws
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(filename);
+        string token;
+
+        // a failed read only ends the loop, so failbit must not throw here
+        in.exceptions(ios::badbit);
+        while (!in.eof() && max_num > 0) {
+            if (!(in >> token))
+                break;
+            strv.push_back(token);
+            max_num--;
+        }
+        // reset the stream state before failbit is armed again
+        in.clear();
+        in.exceptions(ios::failbit | ios::badbit);
+        in.close();
+    } catch (ios::failure) {
+        outError(ERR_READ_INPUT);
+    }
+}
+
+/**
+        Announce and read params.initial_file with readStringFile().
+        @param params supplies initial_file
+        @param ntaxa maximum number of names to read
+        @param tax_name (OUT) the names are appended here
+ */
+void readInitTaxaFile(Params &params, int ntaxa, StrVector &tax_name) {
+    cout << "Reading initial taxa set file ";
+    cout << params.initial_file << " ..." << endl;
+    readStringFile(params.initial_file, ntaxa, tax_name);
+}
+
+/**
+        Write a string to a file; a message is printed to cout if the file
+        cannot be opened.
+        @param myString text to write
+        @param filename file to write to
+ */
+void printString2File(string myString, string filename) {
+    ofstream out(filename.c_str());
+    if (!out.is_open()) {
+        cout << "Unable to open file " << filename << endl;
+        return;
+    }
+    out << myString;
+    out.close();
+}
+
+/**
+        Announce and read params.initial_area_file with readStringFile().
+        @param params supplies initial_area_file
+        @param nareas maximum number of names to read
+        @param area_name (OUT) the names are appended here
+ */
+void readInitAreaFile(Params &params, int nareas, StrVector &area_name) {
+    cout << "Reading initial area file ";
+    cout << params.initial_area_file << " ..." << endl;
+    readStringFile(params.initial_area_file, nareas, area_name);
+}
+
+/**
+        Read a square matrix of area boundaries from a text file.
+        The file starts with the number of areas, followed by one row per
+        area: the area name and then one value per area. The number and names
+        of the areas must match `areas`, every diagonal element must be larger
+        than 1e-6 and the matrix must be symmetric. Errors are reported
+        through outError().
+        @param file_name file to read
+        @param areas area sets the file is checked against
+        @param areas_boundary (OUT) row-major matrix; must have room for
+               nset * nset values
+ */
+void readAreasBoundary(char *file_name, MSetsBlock *areas, double *areas_boundary) {
+
+    try {
+        ifstream in;
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(file_name);
+
+        int nset;
+        in >> nset;
+        if (nset != areas->getNSets())
+            throw "File has different number of areas";
+        int pos = 0, seq1, seq2;
+        for (seq1 = 0; seq1 < nset; seq1++) {
+            string seq_name;
+            in >> seq_name;
+            if (seq_name != areas->getSet(seq1)->name)
+                throw "Area name " + seq_name + " is different from " + areas->getSet(seq1)->name;
+            for (seq2 = 0; seq2 < nset; seq2++) {
+                in >> areas_boundary[pos++];
+            }
+        }
+        // check the diagonal and the symmetry of the matrix; the outer loop
+        // runs over ALL rows so that the last diagonal element is validated
+        // too (its inner symmetry loop is simply empty)
+        for (seq1 = 0; seq1 < nset; seq1++) {
+            if (areas_boundary[seq1 * nset + seq1] <= 1e-6)
+                throw "Diagonal elements of distance matrix should represent the boundary of single areas";
+            for (seq2 = seq1 + 1; seq2 < nset; seq2++)
+                if (areas_boundary[seq1 * nset + seq2] != areas_boundary[seq2 * nset + seq1])
+                    throw "Shared boundary between " + areas->getSet(seq1)->name + " and " + areas->getSet(seq2)->name + " is not symmetric";
+        }
+
+
+        in.close();
+        cout << "Areas relation matrix was read from " << file_name << endl;
+    } catch (const char *str) {
+        outError(str);
+    } catch (string str) {
+        outError(str);
+    } catch (ios::failure) {
+        outError(ERR_READ_INPUT, file_name);
+    }
+
+}
+
+/**
+        Read taxa sets from a text file.
+        Each set is written as the number of taxa followed by that many taxon
+        names. Every set is appended to `sets` and named after its 1-based
+        position in the file ("1", "2", ...). Errors are reported through
+        outError().
+        @param filename file to read
+        @param sets (OUT) block receiving the new sets
+ */
+void readTaxaSets(char *filename, MSetsBlock *sets) {
+    TaxaSetNameVector *allsets = sets->getSets();
+    try {
+        int count = 0;
+        ifstream in;
+        // set the failbit and badbit
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(filename);
+
+        // remove the failbit
+        in.exceptions(ios::badbit);
+        while (!in.eof()) {
+            int ntaxa = 0;
+            string size_str;
+            if (!(in >> size_str)) break;
+            ntaxa = convert_int(size_str.c_str());
+            if (ntaxa <= 0) throw "Number of taxa must be > 0";
+            count++;
+            //allsets->resize(allsets->size()+1);
+            TaxaSetName *myset = new TaxaSetName;
+            allsets->push_back(myset);
+            // "name += count" would append the character whose code is count,
+            // not the decimal number, so format the number explicitly
+            myset->name = convertIntToString(count);
+            for (; ntaxa > 0; ntaxa--) {
+                string taxon;
+                if (!(in >> taxon)) throw "Cannot read in taxon name";
+                if ((ntaxa > 1) && in.eof()) throw "Unexpected end of file while reading taxon names";
+                myset->taxlist.push_back(taxon);
+            }
+        }
+        in.clear();
+        // set the failbit again
+        in.exceptions(ios::failbit | ios::badbit);
+        in.close();
+        if (count == 0) throw "No set found, you must specify at least 1 set";
+    } catch (ios::failure) {
+        outError(ERR_READ_INPUT);
+    } catch (const char *str) {
+        outError(str);
+    } catch (string str) {
+        outError(str);
+    }
+}
+
+/**
+        Draw two different random numbers with random_int().
+        @param size passed to random_int() for the first draw; the second draw
+               uses size - 1
+        @param first (OUT) the first draw
+        @param second (OUT) the second draw, moved up by one when it is not
+               below first so that the two values differ
+ */
+void get2RandNumb(const int size, int &first, int &second) {
+    first = random_int(size);
+    // one candidate fewer for the second pick ...
+    second = random_int(size - 1);
+    // ... and the picks at or above the first one are shifted past it
+    if (first <= second)
+        second++;
+}
+
+void quickStartGuide();
+
+void parseArg(int argc, char *argv[], Params &params) {
+    int cnt;
+    verbose_mode = VB_MIN;
+    params.tree_gen = NONE;
+    params.user_file = NULL;
+    params.fai = false;
+    params.testAlpha = false;
+    params.testAlphaEps = 100.0;
+    params.exh_ai = false;
+    params.alpha_invar_file = NULL;
+    params.out_prefix = NULL;
+    params.out_file = NULL;
+    params.sub_size = 0;
+    params.pd_proportion = 0.0;
+    params.min_proportion = 0.0;
+    params.step_proportion = 0.01;
+    params.min_size = 0;
+    params.step_size = 1;
+    params.find_all = false;
+    params.run_mode = DETECTED;
+    params.detected_mode = DETECTED;
+    params.param_file = NULL;
+    params.initial_file = NULL;
+    params.initial_area_file = NULL;
+    params.pdtaxa_file = NULL;
+    params.areas_boundary_file = NULL;
+    params.boundary_modifier = 1.0;
+    params.dist_file = NULL;
+    params.compute_obs_dist = false;
+    params.compute_jc_dist = true;
+    params.compute_ml_dist = true;
+    params.compute_ml_tree = true;
+    params.budget_file = NULL;
+    params.overlap = 0;
+    params.is_rooted = false;
+    params.sample_size = -1;
+    params.repeated_time = 1;
+    //params.nr_output = 10000;
+    params.nr_output = 0;
+    //params.smode = EXHAUSTIVE;
+    params.intype = IN_OTHER;
+    params.budget = -1;
+    params.min_budget = -1;
+    params.step_budget = 1;
+    params.root = NULL;
+    params.num_splits = 0;
+    params.min_len = 0.001;
+    params.mean_len = 0.1;
+    params.max_len = 0.999;
+    params.num_zero_len = 0;
+    params.pd_limit = 100;
+    params.calc_pdgain = false;
+    params.multi_tree = false;
+    params.second_tree = NULL;
+    params.tree_weight_file = NULL;
+    params.consensus_type = CT_NONE;
+    params.find_pd_min = false;
+    params.branch_cluster = 0;
+    params.taxa_order_file = NULL;
+    params.endemic_pd = false;
+    params.exclusive_pd = false;
+    params.complement_area = NULL;
+    params.scaling_factor = -1;
+    params.numeric_precision = -1;
+    params.binary_programming = false;
+    params.quad_programming = false;
+    params.test_input = TEST_NONE;
+    params.tree_burnin = 0;
+    params.tree_max_count = 1000000;
+    params.split_threshold = 0.0;
+    params.split_weight_threshold = -1000;
+    params.split_weight_summary = SW_SUM;
+    params.gurobi_format = true;
+    params.gurobi_threads = 1;
+    params.num_bootstrap_samples = 0;
+    params.bootstrap_spec = NULL;
+
+    params.aln_file = NULL;
+    params.treeset_file = NULL;
+    params.topotest_replicates = 0;
+    params.do_weighted_test = false;
+    params.do_au_test = false;
+    params.siteLL_file = NULL; //added by MA
+    params.partition_file = NULL;
+    params.partition_type = 0;
+    params.partfinder_rcluster = 100;
+    params.remove_empty_seq = true;
+    params.terrace_aware = true;
+    params.sequence_type = NULL;
+    params.aln_output = NULL;
+    params.aln_site_list = NULL;
+    params.aln_output_format = ALN_PHYLIP;
+    params.gap_masked_aln = NULL;
+    params.concatenate_aln = NULL;
+    params.aln_nogaps = false;
+    params.aln_no_const_sites = false;
+//    params.parsimony = false;
+//    params.parsimony_tree = false;
+    params.tree_spr = false;
+    params.nexus_output = false;
+    params.k_representative = 4;
+    params.loglh_epsilon = 0.001;
+    params.numSmoothTree = 1;
+    params.nni5 = true;
+    params.leastSquareBranch = false;
+    params.pars_branch_length = false;
+    params.bayes_branch_length = false;
+    params.manuel_analytic_approx = false;
+    params.leastSquareNNI = false;
+    params.ls_var_type = OLS;
+    params.maxCandidates = 1000;
+    params.popSize = 5;
+    params.p_delete = -1;
+    params.min_iterations = -1;
+    params.max_iterations = 1;
+    params.num_param_iterations = 100;
+    params.stop_condition = SC_UNSUCCESS_ITERATION;
+    params.stop_confidence = 0.95;
+    params.model_name = "";
+    params.model_set = NULL;
+    params.model_subset = NULL;
+    params.state_freq_set = NULL;
+    params.ratehet_set = NULL;
+    params.model_def_file = NULL;
+    params.model_test_again = false;
+    params.model_test_and_tree = 0;
+    params.model_test_separate_rate = false;
+    params.optimize_mixmodel_weight = false;
+    params.store_trans_matrix = false;
+    //params.freq_type = FREQ_EMPIRICAL;
+    params.freq_type = FREQ_UNKNOWN;
+    params.min_rate_cats = 2;
+    params.num_rate_cats = 4;
+    params.max_rate_cats = 10;
+    params.gamma_shape = -1.0;
+    params.gamma_median = false;
+    params.p_invar_sites = -1.0;
+    params.optimize_model_rate_joint = false;
+    params.optimize_by_newton = true;
+    params.optimize_alg = "2-BFGS-B";
+    params.fixed_branch_length = false;
+    params.iqp_assess_quartet = IQP_DISTANCE;
+    params.iqp = false;
+    params.write_intermediate_trees = 0;
+    params.avoid_duplicated_trees = false;
+    params.rf_dist_mode = 0;
+    params.mvh_site_rate = false;
+    params.rate_mh_type = true;
+    params.discard_saturated_site = false;
+    params.mean_rate = 1.0;
+    params.aLRT_threshold = 101;
+    params.aLRT_replicates = 0;
+    params.localbp_replicates = 0;
+    params.SSE = LK_EIGEN_SSE;
+    params.lk_no_avx = false;
+    params.print_site_lh = 0;
+    params.print_site_rate = false;
+    params.print_site_posterior = 0;
+    params.print_tree_lh = false;
+    params.lambda = 1;
+    params.speed_conf = 1.0;
+    params.whtest_simulations = 1000;
+    params.mcat_type = MCAT_LOG + MCAT_PATTERN;
+    params.rate_file = NULL;
+    params.ngs_file = NULL;
+    params.ngs_mapped_reads = NULL;
+    params.ngs_ignore_gaps = true;
+    params.do_pars_multistate = false;
+    params.gene_pvalue_file = NULL;
+    params.gene_scale_factor = -1;
+    params.gene_pvalue_loga = false;
+    params.second_align = NULL;
+    params.ncbi_taxid = 0;
+    params.ncbi_taxon_level = NULL;
+    params.ncbi_names_file = NULL;
+    params.ncbi_ignore_level = NULL;
+
+	params.eco_dag_file  = NULL;
+	params.eco_type = NULL;
+	params.eco_detail_file = NULL;
+	params.k_percent = 0;
+	params.diet_min = 0;
+	params.diet_max = 0;
+	params.diet_step = 0;
+	params.eco_weighted = false;
+	params.eco_run = 0;
+
+	params.upper_bound = false;
+	params.upper_bound_NNI = false;
+	params.upper_bound_frac = 0.0;
+
+    params.gbo_replicates = 0;
+	params.ufboot_epsilon = 0.5;
+    params.check_gbo_sample_size = 0;
+    params.use_rell_method = true;
+    params.use_elw_method = false;
+    params.use_weighted_bootstrap = false;
+    params.use_max_tree_per_bootstrap = true;
+    params.max_candidate_trees = 0;
+    params.distinct_trees = false;
+    params.online_bootstrap = true;
+    params.min_correlation = 0.99;
+    params.step_iterations = 100;
+    params.store_candidate_trees = false;
+	params.print_ufboot_trees = false;
+    //const double INF_NNI_CUTOFF = -1000000.0;
+    params.nni_cutoff = -1000000.0;
+    params.estimate_nni_cutoff = false;
+    params.nni_sort = false;
+    //params.nni_opt_5branches = false;
+    params.testNNI = false;
+    params.approximate_nni = false;
+    params.do_compression = false;
+
+    params.new_heuristic = true;
+    params.iteration_multiple = 1;
+    params.initPS = 0.5;
+#ifdef USING_PLL
+    params.pll = true;
+#else
+    params.pll = false;
+#endif
+    params.modeps = 0.01;
+    params.parbran = false;
+    params.binary_aln_file = NULL;
+    params.maxtime = 1000000;
+    params.reinsert_par = false;
+    params.bestStart = true;
+    params.snni = true; // turn on sNNI default now
+//    params.autostop = true; // turn on auto stopping rule by default now
+    params.unsuccess_iteration = 100;
+    params.speednni = true; // turn on reduced hill-climbing NNI by default now
+    params.reduction = false;
+    params.numInitTrees = 100;
+    params.fix_stable_splits = false;
+    params.numSupportTrees = 20;
+//    params.sprDist = 20;
+    params.sprDist = 6;
+    params.numNNITrees = 20;
+    params.avh_test = 0;
+    params.bootlh_test = 0;
+    params.bootlh_partitions = NULL;
+    params.site_freq_file = NULL;
+#ifdef _OPENMP
+    params.num_threads = 0;
+#else
+    params.num_threads = 1;
+#endif
+    params.model_test_criterion = MTC_BIC;
+    params.model_test_stop_rule = MTC_ALL;
+    params.model_test_sample_size = 0;
+    params.root_state = NULL;
+    params.print_bootaln = false;
+	params.print_subaln = false;
+	params.print_partition_info = false;
+	params.print_conaln = false;
+	params.count_trees = false;
+	params.print_branch_lengths = false;
+	params.lh_mem_save = LM_PER_NODE; // auto detect
+	params.start_tree = STT_PLL_PARSIMONY;
+	params.print_splits_file = false;
+    params.ignore_identical_seqs = true;
+    params.write_init_tree = false;
+    params.write_local_optimal_trees = false;
+    params.freq_const_patterns = NULL;
+    params.no_rescale_gamma_invar = false;
+    params.compute_seq_identity_along_tree = false;
+
+	if (params.nni5) {
+	    params.nni_type = NNI5;
+	} else {
+	    params.nni_type = NNI1;
+	}
+
+    struct timeval tv;
+    struct timezone tz;
+    // initialize random seed based on current time
+    gettimeofday(&tv, &tz);
+    //params.ran_seed = (unsigned) (tv.tv_sec+tv.tv_usec);
+    params.ran_seed = (unsigned) (tv.tv_usec);
+
+    for (cnt = 1; cnt < argc; cnt++) {
+        try {
+
+            if (strcmp(argv[cnt], "-h") == 0 || strcmp(argv[cnt], "--help") == 0) {
+#ifdef IQ_TREE
+                usage_iqtree(argv, false);
+#else
+                usage(argv, false);
+#endif
+                continue;
+            }
+			if (strcmp(argv[cnt], "-ho") == 0 || strcmp(argv[cnt], "-?") == 0) {
+				usage_iqtree(argv, false);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-hh") == 0
+					|| strcmp(argv[cnt], "-hhh") == 0) {
+				usage(argv, true);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-v0") == 0) {
+				verbose_mode = VB_QUIET;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-v") == 0 || strcmp(argv[cnt], "-v1") == 0) {
+				verbose_mode = VB_MED;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-vv") == 0
+					|| strcmp(argv[cnt], "-v2") == 0) {
+				verbose_mode = VB_MAX;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-vvv") == 0
+					|| strcmp(argv[cnt], "-v3") == 0) {
+				verbose_mode = VB_DEBUG;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-k") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -k <num_taxa>";
+				convert_range(argv[cnt], params.min_size, params.sub_size,
+						params.step_size);
+				params.k_representative = params.min_size;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pre") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -pre <output_prefix>";
+				params.out_prefix = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pp") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -pp <pd_proportion>";
+				convert_range(argv[cnt], params.min_proportion,
+						params.pd_proportion, params.step_proportion);
+				if (params.pd_proportion < 0 || params.pd_proportion > 1)
+					throw "PD proportion must be between 0 and 1";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mk") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -mk <min_taxa>";
+				params.min_size = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bud") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bud <budget>";
+				convert_range(argv[cnt], params.min_budget, params.budget,
+						params.step_budget);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mb") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -mb <min_budget>";
+				params.min_budget = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-o") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -o <taxon>";
+				params.root = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-optalg") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -opt_alg <1-BFGS|2-BFGS|EM>";
+				params.optimize_alg = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-root") == 0) {
+				params.is_rooted = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-all") == 0) {
+				params.find_all = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-g") == 0
+					|| strcmp(argv[cnt], "--greedy") == 0) {
+				params.run_mode = GREEDY;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pr") == 0
+					|| strcmp(argv[cnt], "--pruning") == 0) {
+				params.run_mode = PRUNING;
+				//continue; } if (strcmp(argv[cnt],"--both") == 0) {
+				//params.run_mode = BOTH_ALG;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-e") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -e <file>";
+				params.param_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-if") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -if <file>";
+				params.initial_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nni_nr_step") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -nni_nr_step <newton_raphson_steps>";
+				NNI_MAX_NR_STEP = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-ia") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -ia <file>";
+				params.initial_area_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-u") == 0) {
+				// file containing budget information
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -u <file>";
+				params.budget_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-dd") == 0) {
+				// compute distribution of PD score on random sets
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -dd <sample_size>";
+				params.run_mode = PD_DISTRIBUTION;
+				params.sample_size = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-ts") == 0) {
+				// calculate PD score a taxa set listed in the file
+				cnt++;
+				//params.run_mode = PD_USER_SET;
+				if (cnt >= argc)
+					throw "Use -ts <taxa_file>";
+				params.pdtaxa_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bound") == 0) {
+				// boundary length of areas
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bound <file>";
+				params.areas_boundary_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-blm") == 0) {
+				// boundary length modifier
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -blm <boundary_modifier>";
+				params.boundary_modifier = convert_double(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-dist") == 0
+					|| strcmp(argv[cnt], "-d") == 0) {
+				// calculate distance matrix from the tree
+				params.run_mode = CALC_DIST;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -dist <distance_file>";
+				params.dist_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-djc") == 0) {
+				params.compute_ml_dist = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-dobs") == 0) {
+				params.compute_obs_dist = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-r") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -r <num_taxa>";
+				params.sub_size = convert_int(argv[cnt]);
+				params.tree_gen = YULE_HARDING;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rs") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rs <alignment_file>";
+				params.tree_gen = YULE_HARDING;
+				params.aln_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rstar") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rstar <num_taxa>";
+				params.sub_size = convert_int(argv[cnt]);
+				params.tree_gen = STAR_TREE;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-ru") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -ru <num_taxa>";
+				params.sub_size = convert_int(argv[cnt]);
+				params.tree_gen = UNIFORM;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rcat") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rcat <num_taxa>";
+				params.sub_size = convert_int(argv[cnt]);
+				params.tree_gen = CATERPILLAR;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rbal") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rbal <num_taxa>";
+				params.sub_size = convert_int(argv[cnt]);
+				params.tree_gen = BALANCED;
+				continue;
+			}
+            if (strcmp(argv[cnt], "-keep_ident") == 0) {
+                params.ignore_identical_seqs = false;
+                continue;
+            }
+			if (strcmp(argv[cnt], "-rcsg") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rcsg <num_taxa>";
+				params.sub_size = convert_int(argv[cnt]);
+				params.tree_gen = CIRCULAR_SPLIT_GRAPH;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rpam") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rpam <num_splits>";
+				params.num_splits = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rlen") == 0) {
+				cnt++;
+				if (cnt >= argc - 2)
+					throw "Use -rlen <min_len> <mean_len> <max_len>";
+				params.min_len = convert_double(argv[cnt]);
+				params.mean_len = convert_double(argv[cnt + 1]);
+				params.max_len = convert_double(argv[cnt + 2]);
+				cnt += 2;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rzero") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rzero <num_zero_branch>";
+				params.num_zero_len = convert_int(argv[cnt]);
+				if (params.num_zero_len < 0)
+					throw "num_zero_len must not be negative";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rset") == 0) {
+				cnt++;
+				if (cnt >= argc - 1)
+					throw "Use -rset <overlap> <outfile>";
+				params.overlap = convert_int(argv[cnt]);
+				cnt++;
+				params.pdtaxa_file = argv[cnt];
+				params.tree_gen = TAXA_SET;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rep") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rep <repeated_times>";
+				params.repeated_time = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-lim") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -lim <pd_limit>";
+				params.pd_limit = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-noout") == 0) {
+				params.nr_output = 0;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-1out") == 0) {
+				params.nr_output = 1;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-oldout") == 0) {
+				params.nr_output = 100;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nexout") == 0) {
+				params.nexus_output = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-exhaust") == 0) {
+				params.run_mode = EXHAUSTIVE;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-seed") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -seed <random_seed>";
+				params.ran_seed = (unsigned) convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pdgain") == 0) {
+				params.calc_pdgain = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-sup") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -sup <target_tree_file>";
+				params.second_tree = argv[cnt];
+				params.consensus_type = CT_ASSIGN_SUPPORT;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-suptag") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -suptag <tagname or ALL>";
+				params.support_tag = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-sup2") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -sup2 <target_tree_file>";
+				params.second_tree = argv[cnt];
+				params.consensus_type = CT_ASSIGN_SUPPORT_EXTENDED;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-treew") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -treew <tree_weight_file>";
+				params.tree_weight_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-con") == 0) {
+				params.consensus_type = CT_CONSENSUS_TREE;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-net") == 0) {
+				params.consensus_type = CT_CONSENSUS_NETWORK;
+			} /**MINH ANH: to serve some statistics on tree*/
+			else if (strcmp(argv[cnt], "-comp") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -comp <treefile>";
+				params.consensus_type = COMPARE;
+				params.second_tree = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-stats") == 0) {
+				params.run_mode = STATS;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-gbo") == 0) { //guided bootstrap
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -gbo <site likelihod file>";
+				params.siteLL_file = argv[cnt];
+				//params.run_mode = GBO;
+			} // MA
+			else if (strcmp(argv[cnt], "-mprob") == 0) { //compute multinomial distribution probability
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -mprob <ref_alignment>";
+				params.second_align = argv[cnt];
+				//params.run_mode = MPRO;
+			} // MA
+			else if (strcmp(argv[cnt], "-min") == 0) {
+				params.find_pd_min = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-excl") == 0) {
+				params.exclusive_pd = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-endem") == 0) {
+				params.endemic_pd = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-compl") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -compl <area_name>";
+				params.complement_area = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-cluster") == 0) {
+				params.branch_cluster = 4;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -cluster <taxa_order_file>";
+				params.taxa_order_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-taxa") == 0) {
+				params.run_mode = PRINT_TAXA;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-area") == 0) {
+				params.run_mode = PRINT_AREA;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-scale") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -scale <scaling_factor>";
+				params.scaling_factor = convert_double(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-scaleg") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -scaleg <gene_scale_factor>";
+				params.gene_scale_factor = convert_double(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-scalebranch") == 0) {
+				params.run_mode = SCALE_BRANCH_LEN;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -scalebranch <scaling_factor>";
+				params.scaling_factor = convert_double(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-scalenode") == 0) {
+				params.run_mode = SCALE_NODE_NAME;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -scalenode <scaling_factor>";
+				params.scaling_factor = convert_double(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-prec") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -prec <numeric_precision>";
+				params.numeric_precision = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-lp") == 0) {
+				params.run_mode = LINEAR_PROGRAMMING;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-lpbin") == 0) {
+				params.run_mode = LINEAR_PROGRAMMING;
+				params.binary_programming = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-qp") == 0) {
+				params.gurobi_format = true;
+				params.quad_programming = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-quiet") == 0) {
+				verbose_mode = VB_QUIET;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mult") == 0) {
+				params.multi_tree = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bi") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bi <burnin_value>";
+				params.tree_burnin = convert_int(argv[cnt]);
+				if (params.tree_burnin < 0)
+					throw "Burnin value must not be negative";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-tm") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -tm <tree_max_count>";
+				params.tree_max_count = convert_int(argv[cnt]);
+				if (params.tree_max_count < 0)
+					throw "tree_max_count must not be negative";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-minsup") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -minsup <split_threshold>";
+				params.split_threshold = convert_double(argv[cnt]);
+				if (params.split_threshold < 0 || params.split_threshold > 1)
+					throw "Split threshold must be between 0 and 1";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-tw") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -tw <split_weight_threshold>";
+				params.split_weight_threshold = convert_double(argv[cnt]);
+				if (params.split_weight_threshold < 0)
+					throw "Split weight threshold is negative";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-swc") == 0) {
+				params.split_weight_summary = SW_COUNT;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-swa") == 0) {
+				params.split_weight_summary = SW_AVG_ALL;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-swp") == 0) {
+				params.split_weight_summary = SW_AVG_PRESENT;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-iwc") == 0) {
+				params.test_input = TEST_WEAKLY_COMPATIBLE;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-aln") == 0
+					|| strcmp(argv[cnt], "-s") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -aln, -s <alignment_file>";
+				params.aln_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-z") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -aln, -z <user_trees_file>";
+				params.treeset_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-zb") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -zb <#replicates>";
+				params.topotest_replicates = convert_int(argv[cnt]);
+				if (params.topotest_replicates < 1000)
+					throw "Please specify at least 1000 replicates";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-zw") == 0) {
+				params.do_weighted_test = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-zau") == 0) {
+				params.do_au_test = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-sp") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -sp <partition_file>";
+				params.partition_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-spp") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -spp <type of partition model>";
+				params.partition_file = argv[cnt];
+				params.partition_type = 'p';
+				continue;
+			}
+			if (strcmp(argv[cnt], "-spj") == 0 || strcmp(argv[cnt], "-q") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -q <type of partition model>";
+				params.partition_file = argv[cnt];
+				params.partition_type = 'j';
+				continue;
+			}
+			if (strcmp(argv[cnt], "-M") == 0) {
+                params.partition_type = 0;
+                continue;
+            }
+            if (strcmp(argv[cnt], "-rcluster") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rcluster <percent>";
+                params.partfinder_rcluster = convert_double(argv[cnt]);
+                if (params.partfinder_rcluster < 0 || params.partfinder_rcluster > 100)
+                    throw "rcluster percentage must be between 0 and 100";
+				continue;
+            }
+			if (strcmp(argv[cnt], "-keep_empty_seq") == 0) {
+				params.remove_empty_seq = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-no_terrace") == 0) {
+				params.terrace_aware = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-sf") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -sf <ngs_file>";
+				params.ngs_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-sm") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -sm <ngs_mapped_read_file>";
+				params.ngs_mapped_reads = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-ngs_gap") == 0) {
+				params.ngs_ignore_gaps = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-st") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -st BIN or -st DNA or -st AA or -st CODON or -st MORPH";
+				params.sequence_type = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-starttree") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -starttree BIONJ|PARS|PLLPARS";
+				if (strcmp(argv[cnt], "BIONJ") == 0)
+					params.start_tree = STT_BIONJ;
+				else if (strcmp(argv[cnt], "PARS") == 0)
+					params.start_tree = STT_PARSIMONY;
+				else if (strcmp(argv[cnt], "PLLPARS") == 0)
+					params.start_tree = STT_PLL_PARSIMONY;
+				else
+					throw "Invalid option, please use -starttree with BIONJ or PARS or PLLPARS";
+				continue;
+			}
+
+			if (strcmp(argv[cnt], "-ao") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -ao <alignment_file>";
+				params.aln_output = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-as") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -as <aln_site_list>";
+				params.aln_site_list = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-an") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -an <ref_seq_name>";
+				params.ref_seq_name = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-af") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -af phy|fasta";
+				if (strcmp(argv[cnt], "phy") == 0)
+					params.aln_output_format = ALN_PHYLIP;
+				else if (strcmp(argv[cnt], "fasta") == 0)
+					params.aln_output_format = ALN_FASTA;
+				else
+					throw "Unknown output format";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-am") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -am <gap_masked_aln>";
+				params.gap_masked_aln = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-ac") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -ac <concatenate_aln>";
+				params.concatenate_aln = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nogap") == 0) {
+				params.aln_nogaps = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-noconst") == 0) {
+				params.aln_no_const_sites = true;
+				continue;
+			}
+//			if (strcmp(argv[cnt], "-parstree") == 0) {
+				// maximum parsimony
+//				params.parsimony_tree = true;
+//            continue; } if (strcmp(argv[cnt], "-pars") == 0) {
+//                // maximum parsimony
+//                params.parsimony = true;
+//				continue;
+//			}
+			if (strcmp(argv[cnt], "-spr") == 0) {
+				// subtree pruning and regrafting
+				params.tree_spr = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-krep") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -krep <num_k>";
+				params.k_representative = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pdel") == 0
+					|| strcmp(argv[cnt], "-p") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -pdel <probability>";
+				params.p_delete = convert_double(argv[cnt]);
+				if (params.p_delete < 0.0 || params.p_delete > 1.0)
+					throw "Probability of deleting a leaf must be between 0 and 1";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pers") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -pers <perturbation_strength>";
+				params.initPS = convert_double(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-n") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -n <#iterations>";
+				params.min_iterations = convert_int(argv[cnt]);
+				params.stop_condition = SC_FIXED_ITERATION;
+//                params.autostop = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nparam") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -nparam <#iterations>";
+				params.num_param_iterations = convert_int(argv[cnt]);
+				if (params.num_param_iterations < 0)
+					throw "Number of parameter optimization iterations (-nparam) must be non negative";
+				continue;
+			}
+
+			if (strcmp(argv[cnt], "-nb") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -nb <#bootstrap_replicates>";
+				params.min_iterations = convert_int(argv[cnt]);
+				params.iqp_assess_quartet = IQP_BOOTSTRAP;
+				params.avoid_duplicated_trees = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mod") == 0
+					|| strcmp(argv[cnt], "-m") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -mod <model_name>";
+				params.model_name = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mset") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -mset <model_set>";
+				params.model_set = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-msub") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -msub <model_subset>";
+				params.model_subset = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mfreq") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -mfreq <state_freq_set>";
+				params.state_freq_set = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mrate") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -mrate <rate_set>";
+				params.ratehet_set = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mdef") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -mdef <model_definition_file>";
+				params.model_def_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mredo") == 0) {
+				params.model_test_again = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mtree") == 0) {
+				params.model_test_and_tree = 1;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mretree") == 0) {
+				params.model_test_and_tree = 2;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-msep") == 0) {
+				params.model_test_separate_rate = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mwopt") == 0) {
+				params.optimize_mixmodel_weight = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mh") == 0) {
+				params.mvh_site_rate = true;
+				params.discard_saturated_site = false;
+				params.SSE = LK_NORMAL;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mhs") == 0) {
+				params.mvh_site_rate = true;
+				params.discard_saturated_site = true;
+				params.SSE = LK_NORMAL;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rl") == 0) {
+				params.rate_mh_type = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nr") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -nr <mean_rate>";
+				params.mean_rate = convert_double(argv[cnt]);
+				if (params.mean_rate < 0)
+					throw "Wrong mean rate for MH model";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mstore") == 0) {
+				params.store_trans_matrix = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nni_lh") == 0) {
+				params.nni_lh = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-lmd") == 0) {
+				cnt++;
+				params.lambda = convert_double(argv[cnt]);
+				if (params.lambda > 1.0)
+					throw "Lambda must be in (0,1]";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nosse") == 0) {
+				params.SSE = LK_NORMAL;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-slowsse") == 0) {
+				params.SSE = LK_SSE;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-fastlk") == 0) {
+				params.SSE = LK_EIGEN;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-fastsse") == 0
+					|| strcmp(argv[cnt], "-fasttipsse") == 0) {
+				params.SSE = LK_EIGEN_SSE;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-noavx") == 0) {
+				params.lk_no_avx = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-f") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -f <c | o | u | q>";
+				if (strcmp(argv[cnt], "q") == 0 || strcmp(argv[cnt], "EQ") == 0)
+					params.freq_type = FREQ_EQUAL;
+				else if (strcmp(argv[cnt], "c") == 0
+						|| strcmp(argv[cnt], "EM") == 0)
+					params.freq_type = FREQ_EMPIRICAL;
+				else if (strcmp(argv[cnt], "o") == 0
+						|| strcmp(argv[cnt], "ES") == 0)
+					params.freq_type = FREQ_ESTIMATE;
+				else if (strcmp(argv[cnt], "u") == 0
+						|| strcmp(argv[cnt], "UD") == 0)
+					params.freq_type = FREQ_USER_DEFINED;
+				else
+					throw "Use -f <c | o | u | q>";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-fs") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -fs <site_freq_file>";
+				params.site_freq_file = argv[cnt];
+				params.SSE = LK_NORMAL;
+				continue;
+			}
+
+			if (strcmp(argv[cnt], "-fconst") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -fconst <const_pattern_frequencies>";
+				params.freq_const_patterns = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-c") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -c <#rate_category>";
+				params.num_rate_cats = convert_int(argv[cnt]);
+				if (params.num_rate_cats < 1)
+					throw "Wrong number of rate categories";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-cmin") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -cmin <#min_rate_category>";
+				params.min_rate_cats = convert_int(argv[cnt]);
+				if (params.min_rate_cats < 2)
+					throw "Wrong number of rate categories for -cmin";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-cmax") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -cmax <#max_rate_category>";
+				params.max_rate_cats = convert_int(argv[cnt]);
+				if (params.max_rate_cats < 2)
+					throw "Wrong number of rate categories for -cmax";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-a") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -a <gamma_shape>";
+				params.gamma_shape = convert_double(argv[cnt]);
+//				if (params.gamma_shape < 0)
+//					throw "Wrong number of gamma shape parameter (alpha)";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-gmean") == 0) {
+				params.gamma_median = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-gmedian") == 0) {
+				params.gamma_median = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-i") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -i <p_invar_sites>";
+				params.p_invar_sites = convert_double(argv[cnt]);
+				if (params.p_invar_sites < 0)
+					throw "Wrong number of proportion of invariable sites";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-brent") == 0) {
+				params.optimize_by_newton = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-jointopt") == 0) {
+				params.optimize_model_rate_joint = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-brent_ginvar") == 0) {
+				params.optimize_model_rate_joint = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-fixbr") == 0) {
+				params.fixed_branch_length = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-sr") == 0) {
+				params.stop_condition = SC_WEIBULL;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -sr <#max_iteration>";
+				params.max_iterations = convert_int(argv[cnt]);
+				if (params.max_iterations <= params.min_iterations)
+					throw "Specified max iteration must be greater than min iteration";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nm") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -nm <#max_iteration>";
+				params.max_iterations = convert_int(argv[cnt]);
+				if (params.max_iterations <= params.min_iterations)
+					throw "Specified max iteration must be greater than min iteration";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-sc") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -sc <stop_confidence_value>";
+				params.stop_confidence = convert_double(argv[cnt]);
+				if (params.stop_confidence <= 0.5
+						|| params.stop_confidence >= 1)
+					throw "Stop confidence value must be in range (0.5,1)";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-gurobi") == 0) {
+				params.gurobi_format = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-gthreads") == 0) {
+				params.gurobi_format = true;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -gthreads <gurobi_threads>";
+				params.gurobi_threads = convert_int(argv[cnt]);
+				if (params.gurobi_threads < 1)
+					throw "Wrong number of threads";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-b") == 0 || strcmp(argv[cnt], "-bo") == 0) {
+				params.multi_tree = true;
+				if (strcmp(argv[cnt], "-bo") == 0)
+					params.compute_ml_tree = false;
+				if (strcmp(argv[cnt], "-b") == 0)
+					params.consensus_type = CT_CONSENSUS_TREE;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -b <num_bootstrap_samples>";
+				params.num_bootstrap_samples = convert_int(argv[cnt]);
+				if (params.num_bootstrap_samples < 1)
+					throw "Wrong number of bootstrap samples";
+				if (params.num_bootstrap_samples == 1)
+					params.compute_ml_tree = false;
+				if (params.num_bootstrap_samples == 1)
+					params.consensus_type = CT_NONE;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bspec") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bspec <bootstrap_specification>";
+				params.bootstrap_spec = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bc") == 0) {
+				params.multi_tree = true;
+				params.compute_ml_tree = false;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bc <num_bootstrap_samples>";
+				params.num_bootstrap_samples = convert_int(argv[cnt]);
+				if (params.num_bootstrap_samples < 1)
+					throw "Wrong number of bootstrap samples";
+				if (params.num_bootstrap_samples > 1)
+					params.consensus_type = CT_CONSENSUS_TREE;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-iqppars") == 0) {
+				params.iqp_assess_quartet = IQP_PARSIMONY;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-iqp") == 0) {
+				params.iqp = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wlt") == 0) {
+				// write all candidate trees
+				params.write_local_optimal_trees = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wt") == 0) {
+				params.write_intermediate_trees = 1;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wt2") == 0) {
+				params.write_intermediate_trees = 2;
+				params.avoid_duplicated_trees = true;
+				params.print_tree_lh = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wt3") == 0) {
+				params.write_intermediate_trees = 3;
+				params.avoid_duplicated_trees = true;
+				params.print_tree_lh = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wbl") == 0) {
+				params.print_branch_lengths = true;
+				continue;
+			}
+            if (strcmp(argv[cnt], "-wit") == 0) {
+                params.write_init_tree = true;
+                continue;
+            }
+			if (strcmp(argv[cnt], "-nodup") == 0) {
+				params.avoid_duplicated_trees = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rf_all") == 0) {
+				params.rf_dist_mode = RF_ALL_PAIR;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rf_adj") == 0) {
+				params.rf_dist_mode = RF_ADJACENT_PAIR;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rf") == 0) {
+				params.rf_dist_mode = RF_TWO_TREE_SETS;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rf <second_tree>";
+				params.second_tree = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rf2") == 0) {
+				params.rf_dist_mode = RF_TWO_TREE_SETS_EXTENDED;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -rf2 <second_tree>";
+				params.second_tree = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-aLRT") == 0) {
+				cnt++;
+				if (cnt + 1 >= argc)
+					throw "Use -aLRT <threshold%> <#replicates>";
+				params.aLRT_threshold = convert_int(argv[cnt]);
+				if (params.aLRT_threshold < 85 || params.aLRT_threshold > 101)
+					throw "aLRT threshold must be between 85 and 100";
+				cnt++;
+				params.aLRT_replicates = convert_int(argv[cnt]);
+				if (params.aLRT_replicates < 1000
+						&& params.aLRT_replicates != 0)
+					throw "aLRT replicates must be at least 1000";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-alrt") == 0) {
+				cnt++;
+				params.aLRT_replicates = convert_int(argv[cnt]);
+				if (params.aLRT_replicates < 1000
+						&& params.aLRT_replicates != 0)
+					throw "aLRT replicates must be at least 1000";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-lbp") == 0) {
+				cnt++;
+				params.localbp_replicates = convert_int(argv[cnt]);
+				if (params.localbp_replicates < 1000
+						&& params.localbp_replicates != 0)
+					throw "Local bootstrap (LBP) replicates must be at least 1000";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wsl") == 0) {
+				params.print_site_lh = 1;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wslg") == 0) {
+				params.print_site_lh = 2;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wsr") == 0) {
+				params.print_site_rate = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wsp") == 0) {
+				params.print_site_posterior = 1;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wba") == 0) {
+				params.print_bootaln = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wsa") == 0) {
+				params.print_subaln = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wtl") == 0) {
+				params.print_tree_lh = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wpi") == 0) {
+				params.print_partition_info = true;
+				params.print_conaln = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wca") == 0) {
+				params.print_conaln = true;
+				continue;
+			}
+
+			if (strcmp(argv[cnt], "-wsplits") == 0) {
+				params.print_splits_file = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-ns") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -ns <num_simulations>";
+				params.whtest_simulations = convert_int(argv[cnt]);
+				if (params.whtest_simulations < 1)
+					throw "Wrong number of simulations for WH-test";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-mr") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -mr <rate_file>";
+				params.rate_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-cat_mean") == 0) {
+				params.mcat_type |= MCAT_MEAN;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-cat_nolog") == 0) {
+				params.mcat_type &= (127 - MCAT_LOG);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-cat_site") == 0) {
+				params.mcat_type &= (127 - MCAT_PATTERN);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-tina") == 0) {
+				params.do_pars_multistate = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pval") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -pval <gene_pvalue_file>";
+				params.gene_pvalue_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nnitest") == 0) {
+				params.testNNI = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-anni") == 0) {
+				params.approximate_nni = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nnicut") == 0) {
+				params.estimate_nni_cutoff = true;
+				//nni_cutoff = -5.41/2;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nnichi2") == 0) {
+				params.nni_cutoff = -5.41 / 2;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nnicutval") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -nnicutval <log_diff_value>";
+				params.nni_cutoff = convert_double(argv[cnt]);
+				if (params.nni_cutoff >= 0)
+					throw "cutoff value for -nnicutval must be negative";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nnisort") == 0) {
+				params.nni_sort = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-plog") == 0) {
+				params.gene_pvalue_loga = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-dmp") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -dmp <ncbi_taxid>";
+				params.ncbi_taxid = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-dmplevel") == 0
+					|| strcmp(argv[cnt], "-dmprank") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -dmprank <ncbi_taxon_rank>";
+				params.ncbi_taxon_level = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-dmpignore") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -dmpignore <ncbi_ignore_level>";
+				params.ncbi_ignore_level = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-dmpname") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -dmpname <ncbi_names_file>";
+				params.ncbi_names_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-eco") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -eco <eco_dag_file>";
+				params.eco_dag_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-k%") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -k% <k in %>";
+				//convert_range(argv[cnt], params.k_percent, params.sub_size, params.step_size);
+				params.k_percent = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-diet") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -diet <d in %>";
+				convert_range(argv[cnt], params.diet_min, params.diet_max,
+						params.diet_step);
+				//params.diet = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-up") == 0) {
+				params.upper_bound = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-upNNI") == 0) {
+ 				params.upper_bound_NNI = true;
+			}
+			if (strcmp(argv[cnt], "-upFrac") == 0) {
+				cnt++;
+				if (cnt >= argc)
+				  throw "Use -upFrac <fraction>";
+				params.upper_bound_frac = convert_double(argv[cnt]);
+			}
+			if (strcmp(argv[cnt], "-ecoR") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -ecoR <run number>";
+				params.eco_run = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bb") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bb <#replicates>";
+				params.gbo_replicates = convert_int(argv[cnt]);
+				params.avoid_duplicated_trees = true;
+				if (params.gbo_replicates < 1000)
+					throw "#replicates must be >= 1000";
+				params.consensus_type = CT_CONSENSUS_TREE;
+				params.stop_condition = SC_BOOTSTRAP_CORRELATION;
+				//params.nni5Branches = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-beps") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -beps <epsilon>";
+				params.ufboot_epsilon = convert_double(argv[cnt]);
+				if (params.ufboot_epsilon <= 0.0)
+					throw "Epsilon must be positive";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-wbt") == 0) {
+				params.print_ufboot_trees = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bs") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bs <begin_sampling_size>";
+				params.check_gbo_sample_size = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bmax") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bmax <max_candidate_trees>";
+				params.max_candidate_trees = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bcor") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bcor <min_correlation>";
+				params.min_correlation = convert_double(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nstep") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -nstep <step_iterations>";
+				params.step_iterations = convert_int(argv[cnt]);
+				if (params.step_iterations < 10
+						|| params.step_iterations % 2 == 1)
+					throw "At least step size of 10 and even number please";
+				params.min_iterations = params.step_iterations;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-boff") == 0) {
+				params.online_bootstrap = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nostore") == 0
+					|| strcmp(argv[cnt], "-memsave") == 0) {
+				params.store_candidate_trees = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-lhmemsave") == 0) {
+				params.lh_mem_save = LM_PER_NODE;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nolhmemsave") == 0) {
+				params.lh_mem_save = LM_ALL_BRANCH;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-storetrees") == 0) {
+				params.store_candidate_trees = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nodiff") == 0) {
+				params.distinct_trees = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-treediff") == 0) {
+				params.distinct_trees = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-norell") == 0) {
+				params.use_rell_method = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-elw") == 0) {
+				params.use_elw_method = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-noweight") == 0) {
+				params.use_weighted_bootstrap = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nomore") == 0) {
+				params.use_max_tree_per_bootstrap = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bweight") == 0) {
+				params.use_weighted_bootstrap = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bmore") == 0) {
+				params.use_max_tree_per_bootstrap = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-gz") == 0) {
+				params.do_compression = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-newheu") == 0) {
+				params.new_heuristic = true;
+				// Enable RAxML kernel
+				continue;
+			}
+			if (strcmp(argv[cnt], "-maxtime") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -maxtime <time_in_minutes>";
+				params.maxtime = convert_double(argv[cnt]);
+				params.min_iterations = 1000000;
+				params.stop_condition = SC_REAL_TIME;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-numpars") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -numpars <number_of_parsimony_trees>";
+				params.numInitTrees = convert_int(argv[cnt]);
+				if (params.numInitTrees < params.numNNITrees)
+					params.numNNITrees = params.numInitTrees;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-fss") == 0) {
+				params.fix_stable_splits = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-toppars") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -toppars <number_of_top_parsimony_trees>";
+				params.numNNITrees = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-nsp") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -nsp <number_of_support_trees>";
+				params.numSupportTrees = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-fixai") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -fixai <alpha_invar_file>";
+				params.alpha_invar_file = argv[cnt];
+				continue;
+			}
+
+			if (strcmp(argv[cnt], "--test-alpha") == 0) {
+				params.testAlpha = true;
+				continue;
+			}
+            if (strcmp(argv[cnt], "--test-alpha-eps") == 0) {
+                cnt++;
+                if (cnt >= argc)
+                    throw "Use --test-alpha-eps <logl_eps>";
+                params.testAlphaEps = convert_double(argv[cnt]);
+                params.testAlpha = true;
+                continue;
+            }
+
+            if (strcmp(argv[cnt], "-fai") == 0) {
+                params.fai = true;
+                continue;
+            }
+
+            if (strcmp(argv[cnt], "-eai") == 0) {
+                params.exh_ai = true;
+                continue;
+            }
+			if (strcmp(argv[cnt], "-poplim") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -poplim <max_pop_size>";
+				params.maxCandidates = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-popsize") == 0
+					|| strcmp(argv[cnt], "-numcand") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -numcand <number_of_candidate_trees>";
+				params.popSize = convert_int(argv[cnt]);
+				assert(params.popSize < params.numInitTrees);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-beststart") == 0) {
+				params.bestStart = true;
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -best_start <binary_alignment_file>";
+				params.binary_aln_file = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pll") == 0) {
+				params.pll = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-me") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -me <model_epsilon>";
+				params.modeps = convert_double(argv[cnt]);
+				if (params.modeps <= 0.0)
+					throw "Model epsilon must be positive";
+				if (params.modeps > 0.1)
+					throw "Model epsilon must not be larger than 0.1";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pars_ins") == 0) {
+				params.reinsert_par = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-allnni") == 0) {
+				params.speednni = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-reduction") == 0) {
+				params.reduction = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-snni") == 0) {
+				params.snni = true;
+				// dont need to turn this on here
+				//params.autostop = true;
+				//params.speednni = true;
+				// Minh: why do you turn this on? it doubles curPerStrength at some point
+				//params.adaptPert = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-iqpnni") == 0) {
+				params.snni = false;
+				params.start_tree = STT_BIONJ;
+				params.reduction = false;
+				params.numNNITrees = 1;
+//            continue; } if (strcmp(argv[cnt], "-auto") == 0) {
+//            	params.autostop = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-stop_cond") == 0 || strcmp(argv[cnt], "-numstop") == 0) {
+				if (params.stop_condition != SC_BOOTSTRAP_CORRELATION)
+					params.stop_condition = SC_UNSUCCESS_ITERATION;
+				cnt++;
+				params.unsuccess_iteration = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-lsbran") == 0) {
+				params.leastSquareBranch = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-manuel") == 0) {
+				params.manuel_analytic_approx = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-parsbran") == 0) {
+				params.pars_branch_length = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bayesbran") == 0) {
+				params.bayes_branch_length = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-fivebran") == 0
+					|| strcmp(argv[cnt], "-nni5") == 0) {
+				params.nni5 = true;
+				params.nni_type = NNI5;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-onebran") == 0
+					|| strcmp(argv[cnt], "-nni1") == 0) {
+				params.nni_type = NNI1;
+				params.nni5 = false;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-smooth") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -smooth <num_iterations>";
+				params.numSmoothTree = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-lsnni") == 0) {
+				params.leastSquareNNI = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-lsvar") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -lsvar <o|ft|fm|st|p>";
+				if (strcmp(argv[cnt], "o") == 0
+						|| strcmp(argv[cnt], "ols") == 0) {
+					params.ls_var_type = OLS;
+					continue;
+				}
+				if (strcmp(argv[cnt], "ft") == 0
+						|| strcmp(argv[cnt], "first_taylor") == 0) {
+					params.ls_var_type = WLS_FIRST_TAYLOR;
+					continue;
+				}
+				if (strcmp(argv[cnt], "fm") == 0
+						|| strcmp(argv[cnt], "fitch_margoliash") == 0) {
+					params.ls_var_type = WLS_FITCH_MARGOLIASH;
+					continue;
+				}
+				if (strcmp(argv[cnt], "st") == 0
+						|| strcmp(argv[cnt], "second_taylor") == 0) {
+					params.ls_var_type = WLS_SECOND_TAYLOR;
+					continue;
+				}
+				if (strcmp(argv[cnt], "p") == 0
+						|| strcmp(argv[cnt], "pauplin") == 0) {
+					params.ls_var_type = WLS_PAUPLIN;
+				} else {
+					throw "Use -lsvar <o|ft|fm|st|p>";
+				}
+				continue;
+			}
+			if (strcmp(argv[cnt], "-eps") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -eps <log-likelihood epsilon>";
+				params.loglh_epsilon = convert_double(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-pb") == 0) { // Enable parsimony branch length estimation
+				params.parbran = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-x") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -x <iteration_multiple>";
+				params.iteration_multiple = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-sp_iter") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -sp_iter <number_iteration>";
+				params.speedup_iter = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-avh") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -avh <arndt_#bootstrap>";
+				params.avh_test = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bootlh") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bootlh <#replicates>";
+				params.bootlh_test = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-bootpart") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -bootpart <part1_length,part2_length,...>";
+				params.bootlh_partitions = argv[cnt];
+				continue;
+			}
+			if (strcmp(argv[cnt], "-AIC") == 0) {
+				params.model_test_criterion = MTC_AIC;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-AICc") == 0 || strcmp(argv[cnt], "-AICC") == 0) {
+				params.model_test_criterion = MTC_AICC;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-merit") == 0) {
+                cnt++;
+				if (cnt >= argc)
+					throw "Use -merit AIC|AICC|BIC";
+                if (strcmp(argv[cnt], "AIC") == 0)
+                    params.model_test_stop_rule = MTC_AIC;
+                else if (strcmp(argv[cnt], "AICc") == 0 || strcmp(argv[cnt], "AICC") == 0)
+                    params.model_test_stop_rule = MTC_AICC;
+                else if (strcmp(argv[cnt], "BIC") == 0)
+                    params.model_test_stop_rule = MTC_BIC;
+                else throw "Use -merit AIC|AICC|BIC";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-ms") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -ms <model_test_sample_size>";
+				params.model_test_sample_size = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-omp") == 0 || strcmp(argv[cnt], "-nt") == 0) {
+				cnt++;
+				if (cnt >= argc)
+				throw "Use -nt <num_threads>";
+				params.num_threads = convert_int(argv[cnt]);
+				if (params.num_threads < 1)
+					throw "At least 1 thread please";
+				continue;
+			}
+			if (strcmp(argv[cnt], "-rootstate") == 0) {
+                cnt++;
+                if (cnt >= argc)
+                    throw "Use -rootstate <rootstate>";
+                params.root_state = argv[cnt];
+                params.SSE = LK_NORMAL;
+                continue;
+			}
+			if (strcmp(argv[cnt], "-ct") == 0) {
+            	params.count_trees = true;
+            	continue;
+			}
+			if (strcmp(argv[cnt], "-sprdist") == 0 || strcmp(argv[cnt], "-sprrad") == 0) {
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -sprrad <SPR radius used in parsimony search>";
+				params.sprDist = convert_int(argv[cnt]);
+				continue;
+			}
+			if (strcmp(argv[cnt], "-no_rescale_gamma_invar") == 0) {
+				params.no_rescale_gamma_invar = true;
+				continue;
+			}
+
+			if (strcmp(argv[cnt], "-wsi") == 0) {
+				params.compute_seq_identity_along_tree = true;
+				continue;
+			}
+			if (strcmp(argv[cnt], "-t") == 0 || strcmp(argv[cnt], "-te") == 0) {
+                if (strcmp(argv[cnt], "-te") == 0) {
+                    params.min_iterations = 0;
+                    params.stop_condition = SC_FIXED_ITERATION;
+                }
+				cnt++;
+				if (cnt >= argc)
+					throw "Use -t,-te <start_tree | BIONJ | PARS | PLLPARS>";
+				if (strcmp(argv[cnt], "BIONJ") == 0)
+					params.start_tree = STT_BIONJ;
+				else if (strcmp(argv[cnt], "PARS") == 0)
+					params.start_tree = STT_PARSIMONY;
+				else if (strcmp(argv[cnt], "PLLPARS") == 0)
+					params.start_tree = STT_PLL_PARSIMONY;
+                else if (strcmp(argv[cnt], "RANDOM") == 0)
+					params.start_tree = STT_RANDOM_TREE;
+				else
+                    params.user_file = argv[cnt];
+				continue;
+			}
+            
+			if (argv[cnt][0] == '-') {
+                string err = "Invalid \"";
+                err += argv[cnt];
+                err += "\" option.";
+                throw err;
+            } else {
+                if (params.user_file == NULL)
+                    params.user_file = argv[cnt];
+                else
+                    params.out_file = argv[cnt];
+            }
+            if (params.root != NULL && params.is_rooted)
+                throw "Not allowed to specify both -o <taxon> and -root";
+
+        }
+        // try
+        catch (const char *str) {
+            outError(str);
+            //} catch (char *str) {
+            //outError(str);
+        } catch (string str) {
+            outError(str);
+        } catch (...) {
+            string err = "Unknown argument \"";
+            err += argv[cnt];
+            err += "\"";
+            outError(err);
+        }
+
+    } // for
+    if (!params.user_file && !params.aln_file && !params.ngs_file && !params.ngs_mapped_reads && !params.partition_file) {
+#ifdef IQ_TREE
+        quickStartGuide();
+//        usage_iqtree(argv, false);
+#else
+        usage(argv, false);
+#endif
+    }
+    if (!params.out_prefix) {
+    	if (params.eco_dag_file)
+    		params.out_prefix = params.eco_dag_file;
+    	else if (params.partition_file)
+            params.out_prefix = params.partition_file;
+        else if (params.aln_file)
+            params.out_prefix = params.aln_file;
+        else if (params.ngs_file)
+            params.out_prefix = params.ngs_file;
+        else if (params.ngs_mapped_reads)
+            params.out_prefix = params.ngs_mapped_reads;
+        else
+            params.out_prefix = params.user_file;
+    }
+}
+
+// Declared here, defined in another translation unit. Both help screens
+// below call it with cout before printing their own text.
+extern void printCopyright(ostream &out);
+
+/**
+ * Print the help screen (general, PD, budget-constraint, area-analysis,
+ * viability-constraint and miscellaneous options) to standard output and
+ * terminate the process.
+ *
+ * @param argv command-line arguments; only argv[0] is read, as the program
+ *        name shown in the "Usage:" line
+ * @param full_command currently unused: the only statement that consulted it
+ *        is commented out, so the complete option list is always printed
+ *
+ * This function never returns to its caller: it ends with exit(0).
+ */
+void usage(char* argv[], bool full_command) {
+    printCopyright(cout);
+    cout << "Usage: " << argv[0] << " [OPTIONS] <file_name> [<output_file>]" << endl;
+    cout << "GENERAL OPTIONS:" << endl;
+    cout << "  -hh               Print this help dialog" << endl;
+    cout << "  -h                Print help options for phylogenetic inference" << endl;
+    cout << "  <file_name>       User tree in NEWICK format or split network in NEXUS format" << endl;
+    cout << "  <output_file>     Output file to store results, default is '<file_name>.pda'" << endl;
+    cout << "  -k <num_taxa>     Find optimal set of size <num_taxa>" << endl;
+    cout << "  -k <min>:<max>    Find optimal sets of size from <min> to <max>" << endl;
+    cout << "  -k <min>:<max>:<step>" << endl;
+    cout << "                    Find optimal sets of size min, min+step, min+2*step,..." << endl;
+    cout << "  -o <taxon>        Root name to compute rooted PD (default: unrooted)" << endl;
+    cout << "  -if <file>        File containing taxa to be included into optimal sets" << endl;
+    cout << "  -e <file>         File containing branch/split scale and taxa weights" << endl;
+    cout << "  -all              Identify all multiple optimal sets" << endl;
+    // The companion option is -all (printed just above); plain -a is parsed
+    // as the gamma shape parameter, so the help text must not refer to it.
+    cout << "  -lim <max_limit>  The maximum number of optimal sets for each k if -all is specified" << endl;
+    cout << "  -min              Compute minimal sets (default: maximal)" << endl;
+    cout << "  -1out             Print taxa sets and scores to separate files" << endl;
+    cout << "  -oldout           Print output compatible with version 0.3" << endl;
+    cout << "  -v                Verbose mode" << endl;
+    cout << endl;
+    cout << "OPTIONS FOR PHYLOGENETIC DIVERSITY (PD):" << endl;
+    cout << "  -root             Make the tree ROOTED, default is unrooted" << endl;
+    cout << "    NOTE: this option and -o <taxon> cannot be both specified" << endl;
+    cout << "  -g                Run greedy algorithm only (default: auto)" << endl;
+    cout << "  -pr               Run pruning algorithm only (default: auto)" << endl;
+    cout << endl;
+    /*
+    cout << "OPTIONS FOR SPLIT DIVERSITY:" << endl;
+    cout << "  -exhaust          Force to use exhaustive search" << endl;
+    cout << "    NOTE: by default, the program applies dynamic programming algorithm" << endl;
+    cout << "          on circular networks and exhaustive search on general networks" << endl;
+    cout << endl;*/
+    cout << "OPTIONS FOR BUDGET CONSTRAINTS:" << endl;
+    cout << "  -u <file>         File containing total budget and taxa preservation costs" << endl;
+    cout << "  -b <budget>       Total budget to conserve taxa" << endl;
+    cout << "  -b <min>:<max>    Find all sets with budget from <min> to <max>" << endl;
+    cout << "  -b <min>:<max>:<step>" << endl;
+    cout << "                    Find optimal sets with budget min, min+step, min+2*step,..." << endl;
+    cout << endl;
+    cout << "OPTIONS FOR AREA ANALYSIS:" << endl;
+    cout << "  -ts <taxa_file>   Compute/maximize PD/SD of areas (combine with -k to maximize)" << endl;
+    cout << "  -excl             Compute exclusive PD/SD" << endl;
+    cout << "  -endem            Compute endemic PD/SD" << endl;
+    cout << "  -compl <areas>    Compute complementary PD/SD given the listed <areas>" << endl;
+    cout << endl;
+
+    cout << "OPTIONS FOR VIABILITY CONSTRAINTS:" << endl;
+    cout << "  -eco <food_web>   File containing food web matrix" << endl;
+    cout << "  -k% <n>           Find optimal set of size relative the total number of taxa" << endl;
+    cout << "  -diet <min_diet>  Minimum diet portion (%) to be preserved for each predator" << endl;
+    cout << endl;
+    // Disabled short-help exit; while it stays commented out, full_command
+    // has no effect.
+    //if (!full_command) exit(0);
+
+    cout << "MISCELLANEOUS:" << endl;
+    cout << "  -dd <sample_size> Compute PD distribution of random sets of size k" << endl;
+    /*
+    cout << "  -gbo <sitelh_file> Compute and output the alignment of (normalized)" << endl;
+    cout << "                    expected frequencies given in site_ll_file" << endl;
+	*/
+
+    //	cout << "  -rep <times>        Repeat algorithm a number of times." << endl;
+    //	cout << "  -noout              Print no output file." << endl;
+    cout << endl;
+    //cout << "HIDDEN OPTIONS: see the source code file pda.cpp::parseArg()" << endl;
+
+    // Help is a terminal action: end the program with a success status.
+    exit(0);
+}
+
+/**
+ * Print the IQ-TREE command-line help to stdout and terminate the program.
+ *
+ * @param argv          program arguments; only argv[0] is used, for the "Usage:" line
+ * @param full_command  reserved for printing additional options; the corresponding
+ *                      branch is currently empty, so the flag has no effect
+ *
+ * This function does not return: it always ends with exit(0).
+ */
+void usage_iqtree(char* argv[], bool full_command) {
+    printCopyright(cout);
+    cout << "Usage: " << argv[0] << " -s <alignment> [OPTIONS]" << endl << endl;
+    cout << "GENERAL OPTIONS:" << endl
+            << "  -? or -h             Printing this help dialog" << endl
+            << "  -s <alignment>       Input alignment in PHYLIP/FASTA/NEXUS/CLUSTAL/MSF format" << endl
+            << "  -st <data_type>      BIN, DNA, AA, NT2AA, CODON, MORPH (default: auto-detect)" << endl
+            << "  -q <partition_file>  Edge-linked partition model (file in NEXUS/RAxML format)" << endl
+            << " -spp <partition_file> Like -q option but allowing partition-specific rates" << endl
+            << "  -sp <partition_file> Edge-unlinked partition model (like -M option of RAxML)" << endl
+            << "  -t <start_tree_file> | BIONJ | RANDOM" << endl
+            << "                       Starting tree (default: 100 parsimony trees and BIONJ)" << endl
+            << "  -te <user_tree_file> Evaluating a fixed user tree (no tree search performed)" << endl
+            << "  -z <trees_file>      Evaluating user trees at the end (can be used with -t, -te)" << endl
+            << "  -o <outgroup_taxon>  Outgroup taxon name for writing .treefile" << endl
+            << "  -pre <PREFIX>        Using <PREFIX> for output files (default: aln/partition)" << endl
+#ifdef _OPENMP
+            << "  -nt <#cpu_cores>     Number of cores/threads to use (REQUIRED)" << endl
+#endif
+            << "  -seed <number>       Random seed number, normally used for debugging purpose" << endl
+            << "  -v, -vv, -vvv        Verbose mode, printing more messages to screen" << endl
+            << endl << "NEW STOCHASTIC TREE SEARCH ALGORITHM:" << endl
+            << "  -pll                 Use phylogenetic likelihood library (PLL) (default: off)" << endl
+            << "  -numpars <number>    Number of initial parsimony trees (default: 100)" << endl
+            << "  -toppars <number>    Number of best parsimony trees (default: 20)" << endl
+            << "  -sprrad <number>     Radius for parsimony SPR search (default: 6)" << endl
+            // fixed typo: "defaut" -> "default"
+            << "  -numcand <number>    Size of the candidate tree set (default: 5)" << endl
+            << "  -pers <proportion>   Perturbation strength for randomized NNI (default: 0.5)" << endl
+            << "  -allnni              Perform more thorough NNI search (default: off)" << endl
+            << "  -numstop <number>    Number of unsuccessful iterations to stop (default: 100)" << endl
+            << "  -n <#iterations>     Fix number of iterations to <#iterations> (default: auto)" << endl
+            << "  -iqp                 Use the IQP tree perturbation (default: randomized NNI)" << endl
+            << "  -iqpnni              Switch back to the old IQPNNI tree search algorithm" << endl
+            << endl << "ULTRAFAST BOOTSTRAP:" << endl
+            << "  -bb <#replicates>    Ultrafast bootstrap (>=1000)" << endl
+            << "  -wbt                 Write bootstrap trees to .ufboot file (default: none)" << endl
+//            << "  -n <#iterations>     Minimum number of iterations (default: 100)" << endl
+            << "  -nm <#iterations>    Maximum number of iterations (default: 1000)" << endl
+            << "  -nstep <#iterations> #Iterations for UFBoot stopping rule (default: 100)" << endl
+            << "  -bcor <min_corr>     Minimum correlation coefficient (default: 0.99)" << endl
+            << "  -beps <epsilon>      RELL epsilon to break tie (default: 0.5)" << endl
+            << endl << "STANDARD NON-PARAMETRIC BOOTSTRAP:" << endl
+            << "  -b <#replicates>     Bootstrap + ML tree + consensus tree (>=100)" << endl
+            << "  -bc <#replicates>    Bootstrap + consensus tree" << endl
+            << "  -bo <#replicates>    Bootstrap only" << endl
+            << "  -t <threshold>       Minimum bootstrap support [0...1) for consensus tree" << endl
+            << endl << "SINGLE BRANCH TEST:" << endl
+            << "  -alrt <#replicates>  SH-like approximate likelihood ratio test (SH-aLRT)" << endl
+            << "  -lbp <#replicates>   Fast local bootstrap probabilities" << endl
+            << endl << "AUTOMATIC MODEL SELECTION:" << endl
+            << "  -m TESTONLY          Standard model selection (like jModelTest, ProtTest)" << endl
+            << "  -m TEST              Like -m TESTONLY but followed by tree reconstruction" << endl
+            << "  -m TESTNEWONLY       New model selection with FreeRate model replacing I+G" << endl
+            << "  -m TESTNEW           Like -m TESTNEWONLY but followed by tree reconstruction" << endl
+            << "  -m TESTMERGEONLY     Select best-fit partition scheme (like PartitionFinder)" << endl
+            << "  -m TESTMERGE         Like -m TESTMERGEONLY but followed by tree reconstruction" << endl
+            << "  -rcluster <percent>  Percentage of partition pairs (relaxed clustering alg.)" << endl
+            << "  -mset program        Restrict search to models supported by other programs" << endl
+            << "                       (i.e., raxml, phyml or mrbayes)" << endl
+            << "  -mset m1,...,mk      Restrict search to models in a comma-separated list" << endl
+            << "                       (e.g. -mset WAG,LG,JTT)" << endl
+            << "  -msub source         Restrict search to AA models designed for specific sources" << endl
+            << "                       (i.e., nuclear, mitochondrial, chloroplast or viral)" << endl
+            << "  -mfreq f1,...,fk     Restrict search to using a list of state frequencies" << endl
+            << "                       (default protein: -mfreq FU,F; codon: -mfreq ,F1x4,F3x4,F)" << endl
+            << "  -mrate r1,...,rk     Restrict search to using a list of rate-across-sites models" << endl
+            << "                       (e.g. -mrate E,I,G,I+G,R)" << endl
+            << "  -cmin <kmin>         Min #categories for FreeRate model [+R] (default: 2)" << endl
+            << "  -cmax <kmax>         Max #categories for FreeRate model [+R] (default: 10)" << endl
+            // fixed: the option was printed with an en-dash, which cannot be
+            // copy-pasted onto a command line; it must be an ASCII hyphen
+            << "  -merit AIC|AICc|BIC  Optimality criterion to use (default: all)" << endl
+//            << "  -msep                Perform model selection and then rate selection" << endl
+            << "  -mtree               Performing full tree search for each model considered" << endl
+            << "  -mredo               Ignoring model results computed earlier (default: no)" << endl
+            << "  -mdef <nexus_file>   A model definition NEXUS file (see Manual)" << endl
+
+            << endl << "SUBSTITUTION MODEL:" << endl
+            << "  -m <model_name>" << endl
+            << "                  DNA: HKY (default), JC, F81, K2P, K3P, K81uf, TN/TrN, TNef," << endl
+            << "                       TIM, TIMef, TVM, TVMef, SYM, GTR, or 6-digit model" << endl
+            << "                       specification (e.g., 010010 = HKY)" << endl
+            << "              Protein: WAG (default), Poisson, cpREV, mtREV, Dayhoff, mtMAM," << endl
+            << "                       JTT, LG, mtART, mtZOA, VT, rtREV, DCMut, PMB, HIVb," << endl
+            << "                       HIVw, JTTDCMut, FLU, Blosum62" << endl
+            << "      Protein mixture: C10,...,C60, EX2, EX3, EHO, UL2, UL3, EX_EHO, LG4M, LG4X," << endl
+            << "                       JTTCF4G" << endl
+            << "               Binary: JC2 (default), GTR2" << endl
+            << "      Empirical codon: KOSI07, SCHN05" << endl
+            << "    Mechanistic codon: GY (default), MG, MGK, GY0K, GY1KTS, GY1KTV, GY2K," << endl
+            << "                       MG1KTS, MG1KTV, MG2K" << endl
+            << " Semi-empirical codon: XX_YY where XX is empirical and YY is mechanistic model" << endl
+            << "       Morphology/SNP: MK (default), ORDERED" << endl
+            << "            Otherwise: Name of file containing user-model parameters" << endl
+            << "                       (rate parameters and state frequencies)" << endl
+            << "  -m <model_name>+F or +FO or +FU or +FQ (default: auto)" << endl
+            << "                       counted, optimized, user-defined, equal state frequency" << endl
+            << "  -m <model_name>+F1x4 or +F3x4" << endl
+            << "                       Codon frequencies" << endl
+            << "  -m <model_name>+ASC  Ascertainment bias correction for morphological/SNP data" << endl
+            << "  -m \"MIX{m1,...mK}\"   Mixture model with K components" << endl
+            << "  -m \"FMIX{f1,...fK}\"  Frequency mixture model with K components" << endl
+            << "  -mwopt               Turn on optimizing mixture weights (default: none)" << endl
+            << endl << "RATE HETEROGENEITY:" << endl
+            << "  -m <model_name>+I or +G[n] or +I+G[n] or +R[n]" << endl
+            << "                       Invar, Gamma, Invar+Gamma, or FreeRate model where 'n' is" << endl
+            << "                       number of categories (default: n=4)" << endl
+            << "  -a <Gamma_shape>     Gamma shape parameter for site rates (default: estimate)" << endl
+            // fixed: the text said "Computing mean", contradicting both the
+            // option name and the stated default
+            << "  -gmedian             Computing median for Gamma rate category (default: mean)" << endl
+            << "  -i <p_invar>         Proportion of invariable sites (default: estimate)" << endl
+            << "  -mh                  Computing site-specific rates to .mhrate file using" << endl
+            << "                       Meyer & von Haeseler (2003) method" << endl
+            //<< "  -c <#categories>     Number of Gamma rate categories (default: 4)" << endl
+            << endl << "TEST OF MODEL HOMOGENEITY:" << endl
+            << "  -m WHTEST            Testing model (GTR+G) homogeneity assumption using" << endl
+            << "                       Weiss & von Haeseler (2003) method" << endl
+            << "  -ns <#simulations>   #Simulations to obtain null-distribution (default: 1000)" << endl
+//            << endl << "TREE INFERENCE:" << endl
+//            << "  -p <probability>     IQP: Probability of deleting leaves (default: auto)" << endl
+//            << "  -k <#representative> IQP: Size of representative leaf set (default: 4)" << endl
+//            << "  -n <#iterations>     Number of iterations  (default: auto)" << endl
+//            << "  -sr <#iterations>    Stopping rule with max. #iterations (default: off)" << endl
+//            << "  -sc <confidence>     Confidence value for stopping rule (default: 0.95)" << endl
+//            << "  -spc <level>         Confidence level for NNI adaptive search (default 0.95)" << endl
+//            << "  -sp_iter <number>    #iterations before NNI adaptive heuristic is started" << endl
+//            << "  -lmd <lambda>        lambda parameter for the PhyML search (default 0.75)" << endl
+//            << "  -nosse               Disable SSE instructions" << endl
+//            << "  -wt                  Writing all intermediate trees into .treels file" << endl
+//            << "  -d <file>            Reading genetic distances from file (default: JC)" << endl
+//            << "  -fixbr               Fix branch lengths of <treefile>" << endl
+//            << "  -seed <number>       Random seed number, normally used for debugging purpose" << endl
+//            << "  -v, -vv, -vvv        Verbose mode, printing more messages to screen" << endl
+            << endl << "CONSENSUS RECONSTRUCTION:" << endl
+            << "  -t <tree_file>       Set of input trees for consensus reconstruction" << endl
+            << "  -minsup <threshold>  Min split support in range [0,1]; 0.5 for majority-rule" << endl
+            << "                       consensus (default: 0, i.e. extended consensus)" << endl
+            << "  -bi <burnin>         Discarding <burnin> trees at beginning of <treefile>" << endl
+            << "  -con                 Computing consensus tree to .contree file" << endl
+            << "  -net                 Computing consensus network to .nex file" << endl
+            << "  -sup <target_tree>   Assigning support values for <target_tree> to .suptree" << endl
+            << "  -suptag <name>       Node name (or ALL) to assign tree IDs where node occurs" << endl
+            << endl << "ROBINSON-FOULDS DISTANCE:" << endl
+            << "  -rf_all              Computing all-to-all RF distances of trees in <treefile>" << endl
+            << "  -rf <treefile2>      Computing all RF distances between two sets of trees" << endl
+            << "                       stored in <treefile> and <treefile2>" << endl
+            << "  -rf_adj              Computing RF distances of adjacent trees in <treefile>" << endl
+            << endl << "TREE TOPOLOGY TEST:" << endl
+            << "  -zb <#replicates>    BP,KH,SH,ELW tests with RELL for trees passed via -z" << endl
+            << "  -zw                  Also performing weighted-KH and weighted-SH tests" << endl
+            << endl;
+
+    cout << "GENERATING RANDOM TREES:" << endl;
+    cout << "  -r <num_taxa>        Create a random tree under Yule-Harding model." << endl;
+    cout << "  -ru <num_taxa>       Create a random tree under Uniform model." << endl;
+    cout << "  -rcat <num_taxa>     Create a random caterpillar tree." << endl;
+    cout << "  -rbal <num_taxa>     Create a random balanced tree." << endl;
+    cout << "  -rcsg <num_taxa>     Create a random circular split network." << endl;
+    cout << "  -rlen <min_len> <mean_len> <max_len>  " << endl;
+    cout << "                       min, mean, and max branch lengths of random trees." << endl;
+
+    cout << endl << "MISCELLANEOUS:" << endl
+            << "  -wt                  Write locally optimal trees into .treels file" << endl
+            << "  -fixbr               Fix branch lengths of <treefile>." << endl
+            << "                       Used with -n 0 to compute log-likelihood of <treefile>" << endl
+            << "  -wsl                 Writing site log-likelihoods to .sitelh file" << endl
+            << "  -wslg                Writing site log-likelihoods per Gamma category" << endl
+            << "  -fconst f1,...,fN    Add constant patterns into alignment (N=#nstates)" << endl;
+//            << "  -d <file>            Reading genetic distances from file (default: JC)" << endl
+//            << "  -d <outfile>         Calculate the distance matrix inferred from tree" << endl
+//            << "  -stats <outfile>     Output some statistics about branch lengths" << endl
+//            << "  -comp <treefile>     Compare tree with each in the input trees" << endl;
+
+
+    cout << endl;
+
+    if (full_command) {
+        //TODO Print other options here (to be added)
+    }
+
+    exit(0);
+}
+
+/**
+ * Print the copyright banner followed by a short list of example command
+ * lines to stdout, then terminate the program with exit(0).
+ * The multi-threading hint (example 6) is only compiled in when _OPENMP is defined.
+ */
+void quickStartGuide() {
+    printCopyright(cout);
+
+    cout << "Minimal command-line examples (replace 'iqtree ...' with actual path to executable):" << endl << endl;
+
+    // Example 1: plain ML tree with automatic model selection
+    cout << "1. Reconstruct maximum-likelihood tree from a sequence alignment (example.phy)" << endl;
+    cout << "   with the best-fit substitution model automatically selected:" << endl;
+    cout << "     iqtree -s example.phy -m TEST" << endl << endl;
+
+    // Example 2: branch supports
+    cout << "2. Reconstruct ML tree and assess branch supports with ultrafast bootstrap" << endl;
+    cout << "   and SH-aLRT test (1000 replicates):" << endl;
+    cout << "     iqtree -s example.phy -m TEST -alrt 1000 -bb 1000" << endl << endl;
+
+    // Example 3: partitioned analysis
+    cout << "3. Perform partitioned analysis with partition definition file (example.nex)" << endl;
+    cout << "   in Nexus or RAxML format using edge-linked model and gene-specific rates:" << endl;
+    cout << "     iqtree -s example.phy -spp example.nex -m TEST" << endl << endl;
+    cout << "   (for edge-unlinked model replace '-spp' with '-sp' option)" << endl << endl;
+
+    // Example 4: partition merging
+    cout << "4. Merge partitions to reduce model complexity:" << endl;
+    cout << "     iqtree -s example.phy -sp example.nex -m TESTMERGE" << endl << endl;
+
+    // Example 5: model selection only
+    cout << "5. Perform model selection only: use '-m TESTONLY' or '-m TESTMERGEONLY'" << endl << endl;
+
+#ifdef _OPENMP
+    // Example 6: only meaningful for the OpenMP build
+    cout << "6. Use 4 CPU cores to speed up computation: add '-nt 4' option" << endl << endl;
+#endif
+
+    cout << "To show all available options: run 'iqtree -h'" << endl << endl;
+    cout << "Have a look at the tutorial and manual for more information:" << endl;
+    cout << "     http://www.cibiv.at/software/iqtree" << endl << endl;
+
+    exit(0);
+}
+
+/**
+ * Guess the format of an input file from its first two non-blank characters.
+ *
+ * The stream is opened with failbit/badbit exceptions enabled, so a file that
+ * cannot be opened, or that ends before two characters could be extracted,
+ * is reported through outError("Cannot read file ", input_file).
+ *
+ * @param input_file path of the file to inspect
+ * @return IN_NEXUS for '#', IN_NEWICK for '(' or '[', IN_FASTA for '>',
+ *         IN_CLUSTAL for "CL", IN_MSF for "!!", IN_PHYLIP for a leading digit,
+ *         IN_OTHER for anything else (and after outError, should it return)
+ */
+InputType detectInputFile(char *input_file) {
+
+    try {
+        ifstream in;
+        in.exceptions(ios::failbit | ios::badbit);
+        in.open(input_file);
+
+        // initialised so that no path can ever inspect an indeterminate value
+        unsigned char ch = 0, ch2 = 0;
+        int count = 0;
+        // formatted extraction already skips white space; the loop additionally
+        // steps over up to 20 characters with code <= 32
+        do {
+            in >> ch;
+        } while (ch <= 32 && !in.eof() && count++ < 20);
+        in >> ch2;
+        in.close();
+        switch (ch) {
+            case '#': return IN_NEXUS;
+            case '(': return IN_NEWICK;
+            case '[': return IN_NEWICK;
+            case '>': return IN_FASTA;
+            case 'C': if (ch2 == 'L') return IN_CLUSTAL; else return IN_OTHER;
+            case '!': if (ch2 == '!') return IN_MSF; else return IN_OTHER;
+            default:
+                if (isdigit(ch)) return IN_PHYLIP;
+                return IN_OTHER;
+        }
+    } catch (const ios::failure &) {
+        // caught by const reference: avoids copying/slicing the exception object
+        outError("Cannot read file ", input_file);
+    }
+    return IN_OTHER;
+}
+
+/**
+ * Ask the user on stdout/stdin whether an existing file may be overwritten.
+ *
+ * @param filename path of the file about to be written
+ * @return true if the file cannot be opened for reading (nothing to overwrite)
+ *         or the user answered 'y'/'Y'; false for any other answer, including
+ *         the case where no answer can be read from stdin
+ */
+bool overwriteFile(char *filename) {
+    ifstream infile(filename);
+    if (!infile.is_open())
+        return true;
+    infile.close();
+
+    cout << "Overwrite " << filename << " (y/n)? ";
+    char ch = 'n';
+    // a failed read (EOF, closed stdin) previously left ch uninitialised;
+    // treat it as a refusal instead
+    if (!(cin >> ch))
+        return false;
+    return ch == 'Y' || ch == 'y';
+}
+
+/**
+ * Split a comma-separated list of area names and add every item to a set.
+ *
+ * Empty items between two commas (or before a leading comma) are inserted as
+ * empty strings; a single trailing comma does not produce an extra item.
+ *
+ * @param area_names NUL-terminated comma-separated list (must not be NULL)
+ * @param areas      set receiving the names; existing content is kept
+ */
+void parseAreaName(char *area_names, set<string> &areas) {
+    const string all = area_names;
+    // walk the string with size_type positions: string::npos does not fit
+    // into an int, and re-copying the tail for every token is unnecessary
+    string::size_type start = 0;
+    while (start < all.length()) {
+        string::size_type comma = all.find(',', start);
+        if (comma == string::npos)
+            comma = all.length();
+        areas.insert(all.substr(start, comma - start));
+        start = comma + 1;
+    }
+}
+
+/**
+ * Natural logarithm of num! computed as log(1) + log(2) + ... + log(num).
+ *
+ * @param num non-negative integer
+ * @return log(num!), 0.0 for num == 0, and -1.0 as an error marker for num < 0
+ */
+double logFac(const int num) {
+    if (num < 0)
+        return -1.0;
+
+    double sum = 0.0;
+    int k = 0;
+    while (k < num) {
+        ++k;
+        sum += log((double) k);
+    }
+    return sum;
+}
+
+/**
+ * Pick a random position in the range [begin, end) using std::rand().
+ *
+ * @param begin iterator to the first element
+ * @param end   iterator one past the last element
+ * @return iterator to the selected element, or end if the range is empty.
+ *         Ranges longer than RAND_MAX + 1 elements cannot be covered by a
+ *         single std::rand() draw; only their first RAND_MAX + 1 elements are
+ *         reachable.
+ */
+template <typename I>
+I random_element(I begin, I end)
+{
+    const unsigned long n = std::distance(begin, end);
+    if (n == 0)
+        return end; // nothing to choose from; also avoids a division by zero
+
+    // RAND_MAX + 1 overflows int when RAND_MAX == INT_MAX, so widen first
+    const unsigned long range = (unsigned long) RAND_MAX + 1UL;
+    const unsigned long divisor = (n >= range) ? 1UL : range / n;
+
+    // rejection sampling: discard draws that fall into the incomplete last bucket
+    unsigned long k;
+    do { k = std::rand() / divisor; } while (k >= n);
+
+    // std::advance modifies its argument and returns void
+    std::advance(begin, k);
+    return begin;
+}
+
+/**
+ * Empirical quantile of a sample (no interpolation between order statistics).
+ *
+ * @param v sample values, in any order; the vector itself is not modified
+ * @param q requested quantile
+ * @return the minimum for q <= 0 (or NaN), the maximum for q >= 1, otherwise
+ *         the element at index floor((size - 1) * q) of the sorted sample.
+ *         A default-constructed T is returned for an empty sample.
+ */
+template <class T>
+inline T quantile(const vector<T>& v, const double q) {
+    if (v.empty()) return T();
+    if (!(q > 0)) return *std::min_element(v.begin(), v.end());
+    if (q >= 1) return *std::max_element(v.begin(), v.end());
+
+    // work on a copy so the caller's ordering is preserved
+    vector<T> w(v);
+    const typename vector<T>::size_type ind =
+        (typename vector<T>::size_type) ((w.size() - 1) * q);
+    // partial sort: only the ind-th order statistic has to be in place
+    std::nth_element(w.begin(), w.begin() + ind, w.end());
+    return w[ind];
+}
+
+#define RAN_STANDARD 1
+#define RAN_SPRNG    2
+#define RAN_RAND4    3
+
+#define RAN_TYPE 2
+
+#if RAN_TYPE == RAN_STANDARD
+
+/**
+ * RAN_STANDARD variant: seed the C library generator with srand() and
+ * announce the generator in use on stdout.
+ * @param seed value handed to srand()
+ * @return the seed that was passed in
+ */
+int init_random(int seed) {
+    srand(seed);
+    cout << "(Using rand() - Standard Random Number Generator)" << endl;
+    return seed;
+}
+
+/** RAN_STANDARD variant: nothing to release; always returns 0. */
+int finish_random() {
+	return 0;
+}
+
+
+#elif RAN_TYPE == RAN_RAND4
+/******************************************************************************/
+/* random numbers generator  (Numerical recipes)                              */
+/******************************************************************************/
+
+/* variable */
+long _idum;
+
+/* definitions */
+#define IM1 2147483563
+#define IM2 2147483399
+#define AM (1.0/IM1)
+#define IMM1 (IM1-1)
+#define IA1 40014
+#define IA2 40692
+#define IQ1 53668
+#define IQ2 52774
+#define IR1 12211
+#define IR2 3791
+#define NTAB 32
+#define NDIV (1+IMM1/NTAB)
+#define EPS 1.2e-7
+#define RNMX (1.0-EPS)
+
+double randomunitintervall()
+/* Long period (> 2e18) random number generator. Returns a uniform random
+   deviate between 0.0 and 1.0 (exclusive of endpoint values).
+
+   State: the file-level variable _idum plus the function-local statics
+   _idum2, iy and the shuffle table iv[]. A value of _idum <= 0 on entry
+   (re-)initialises the generator from |_idum|.
+
+   Source:
+   Press et al., "Numerical recipes in C", Cambridge University Press, 1992
+   (chapter 7 "Random numbers", ran2 random number generator) */ {
+    int j;
+    long k;
+    static long _idum2 = 123456789;
+    static long iy = 0;
+    static long iv[NTAB];
+    double temp;
+
+    if (_idum <= 0) {
+        /* initialisation: make the seed strictly positive (0 becomes 1) */
+        if (-(_idum) < 1)
+            _idum = 1;
+        else
+            _idum = -(_idum);
+        _idum2 = (_idum);
+        /* NTAB + 8 steps of the first generator: the first 8 results are
+           discarded, the remaining NTAB fill the shuffle table */
+        for (j = NTAB + 7; j >= 0; j--) {
+            k = (_idum) / IQ1;
+            _idum = IA1 * (_idum - k * IQ1) - k*IR1;
+            if (_idum < 0)
+                _idum += IM1;
+            if (j < NTAB)
+                iv[j] = _idum;
+        }
+        iy = iv[0];
+    }
+    /* advance first generator: _idum = (IA1 * _idum) mod IM1, written with
+       quotient IQ1 and remainder IR1 so the product never overflows */
+    k = (_idum) / IQ1;
+    _idum = IA1 * (_idum - k * IQ1) - k*IR1;
+    if (_idum < 0)
+        _idum += IM1;
+    /* advance second generator the same way with IA2/IQ2/IR2/IM2 */
+    k = _idum2 / IQ2;
+    _idum2 = IA2 * (_idum2 - k * IQ2) - k*IR2;
+    if (_idum2 < 0)
+        _idum2 += IM2;
+    /* previous output selects the table slot; the slot is combined with the
+       second generator and then refilled from the first */
+    j = iy / NDIV;
+    iy = iv[j] - _idum2;
+    iv[j] = _idum;
+    if (iy < 1)
+        iy += IMM1;
+    /* scale to (0,1) and clamp so that 1.0 itself is never returned */
+    if ((temp = AM * iy) > RNMX)
+        return RNMX;
+    else
+        return temp;
+} /* randomunitintervall */
+
+#undef IM1
+#undef IM2
+#undef AM
+#undef IMM1
+#undef IA1
+#undef IA2
+#undef IQ1
+#undef IQ2
+#undef IR1
+#undef IR2
+#undef NTAB
+#undef NDIV
+#undef EPS
+#undef RNMX
+
+/**
+ * RAND4 variant: store the negated seed in _idum and announce the generator
+ * on stdout. In PARALLEL builds each process additionally draws PP_Myid
+ * numbers up front ("leapfrog" start offset).
+ * @param seed seed value; it is stored negated, and randomunitintervall()
+ *             re-initialises itself whenever _idum <= 0
+ * @return the seed that was passed in
+ */
+int init_random(int seed) /* RAND4 */ {
+    //    srand((unsigned) time(NULL));
+    //    if (seed < 0)
+    // 	seed = rand();
+    _idum = -(long) seed;
+#ifndef PARALLEL
+    cout << "(Using RAND4 Random Number Generator)" << endl;
+#else /* PARALLEL */
+    {
+        int n;
+        if (PP_IamMaster) {
+            cout << "(Using RAND4 Random Number Generator with leapfrog method)" << endl;
+        }
+        /* skip as many numbers as this process' rank */
+        for (n = 0; n < PP_Myid; n++)
+            (void) randomunitintervall();
+        if (verbose_mode >= VB_MED) {
+            cout << "(" << PP_Myid << ") !!! random seed set to " << seed << ", " << n << " drawn !!!" << endl;
+        }
+    }
+#endif
+    return (seed);
+} /* initrandom */
+
+/** RAND4 variant: nothing to release; always returns 0. */
+int finish_random() {
+	return 0;
+}
+/******************/
+
+#else /* SPRNG */
+
+/******************/
+
+int *randstream;
+
+/**
+ * SPRNG variant: create the global random stream `randstream`.
+ *
+ * A negative seed is replaced by the value of make_sprng_seed(). In serial
+ * builds a single stream (index 0 of 1) is created; in PARALLEL builds each
+ * process creates stream PP_Myid out of PP_NumProcs. With verbose_mode >=
+ * VB_MED the stream description is printed via print_sprng().
+ *
+ * @param seed requested seed, or a negative value to have one generated
+ * @return the seed actually used
+ */
+int init_random(int seed) {
+    //    srand((unsigned) time(NULL));
+    if (seed < 0)
+        seed = make_sprng_seed();
+#ifndef PARALLEL
+    cout << "(Using SPRNG - Scalable Parallel Random Number Generator)" << endl;
+    randstream = init_sprng(0, 1, seed, SPRNG_DEFAULT); /*init stream*/
+    if (verbose_mode >= VB_MED) {
+        print_sprng(randstream);
+    }
+#else /* PARALLEL */
+    if (PP_IamMaster) {
+        cout << "(Using SPRNG - Scalable Parallel Random Number Generator)" << endl;
+    }
+    /* MPI_Bcast(&seed, 1, MPI_UNSIGNED, PP_MyMaster, MPI_COMM_WORLD); */
+    randstream = init_sprng(PP_Myid, PP_NumProcs, seed, SPRNG_DEFAULT); /*initialize stream*/
+    if (verbose_mode >= VB_MED) {
+        cout << "(" << PP_Myid << ") !!! random seed set to " << seed << " !!!" << endl;
+        print_sprng(randstream);
+    }
+#endif /* PARALLEL */
+    return (seed);
+} /* initrandom */
+
+/** SPRNG variant: release the global stream `randstream` and return the result of free_sprng(). */
+int finish_random() {
+	return free_sprng(randstream);
+}
+
+#endif /* USE_SPRNG */
+
+/******************/
+
+/* returns a random integer in the range [0; n - 1] */
+int random_int(int n) {
+    // scale one draw from random_double() by n and truncate towards -infinity
+    const double scaled = random_double() * n;
+    return (int) floor(scaled);
+} /* randominteger */
+
+//int randint(int a, int b) {
+//	return a + (RAND_MAX * rand() + rand()) % (b + 1 - a);
+//}
+//
+
+/**
+ * Draw one random double from the generator selected at compile time.
+ *
+ * - FIXEDINTRAND defined: prints a warning to stderr and always returns 0.0.
+ * - serial build: rand() / (RAND_MAX + 1) for RAN_STANDARD, sprng(randstream)
+ *   for RAN_SPRNG, randomunitintervall() otherwise.
+ * - PARALLEL build: sprng(randstream) for RAN_SPRNG; otherwise
+ *   PP_NumProcs - 1 values are drawn and discarded before the value that is
+ *   returned, and the counters PP_randn / PP_rand are updated.
+ */
+double random_double() {
+#ifndef FIXEDINTRAND
+#ifndef PARALLEL
+#if RAN_TYPE == RAN_STANDARD
+    return ((double) rand()) / ((double) RAND_MAX + 1);
+#elif RAN_TYPE == RAN_SPRNG
+    return sprng(randstream);
+#else /* NO_SPRNG */
+    return randomunitintervall();
+#endif /* NO_SPRNG */
+#else /* NOT PARALLEL */
+#if RAN_TYPE == RAN_SPRNG
+    return sprng(randstream);
+#else /* NO_SPRNG */
+    int m;
+    /* discard the numbers that belong to the other processes */
+    for (m = 1; m < PP_NumProcs; m++)
+        (void) randomunitintervall();
+    PP_randn += (m - 1);
+    PP_rand++;
+    return randomunitintervall();
+#endif /* NO_SPRNG */
+#endif /* NOT PARALLEL */
+#else /* FIXEDINTRAND */
+    cerr << "!!! fixed \"random\" integers for testing purposes !!!" << endl;
+    return 0.0;
+#endif /* FIXEDINTRAND */
+
+}
+
+/* Following part is taken from ModelTest software */
+#define	BIGX            20.0                                 /* max value to represent exp (x) */
+#define	LOG_SQRT_PI     0.5723649429247000870717135          /* log (sqrt (pi)) */
+#define	I_SQRT_PI       0.5641895835477562869480795          /* 1 / sqrt (pi) */
+#define	Z_MAX           6.0                                  /* maximum meaningful z value */
+#define	ex(x)           (((x) < -BIGX) ? 0.0 : exp (x))
+
+/************** Normalz: probability of normal z value *********************/
+
+/*
+ALGORITHM:	Adapted from a polynomial approximation in:
+                        Ibbetson D, Algorithm 209
+                        Collected Algorithms of the CACM 1963 p. 616
+                Note:
+                        This routine has six digit accuracy, so it is only useful for absolute
+                        z values < 6.  For |z| >= 6.0, Normalz() returns 0.0 (z < 0) or 1.0 (z > 0).
+ */
+
+/*
+ * z: normal z value.
+ * Returns the cumulative probability from -oo to z. Internally x is computed
+ * from |z| and mapped to (1 + x) / 2 for z > 0 and (1 - x) / 2 otherwise.
+ */
+double Normalz(double z) /*VAR returns cumulative probability from -oo to z VAR normal z value */ {
+    double y, x, w;
+
+    if (z == 0.0)
+        x = 0.0;
+    else {
+        y = 0.5 * fabs(z);
+        if (y >= (Z_MAX * 0.5))
+            x = 1.0; /* |z| >= Z_MAX: saturate */
+        else if (y < 1.0) {
+            /* |z| < 2: polynomial in w = y*y, scaled by 2*y */
+            w = y*y;
+            x = ((((((((0.000124818987 * w
+                    - 0.001075204047) * w + 0.005198775019) * w
+                    - 0.019198292004) * w + 0.059054035642) * w
+                    - 0.151968751364) * w + 0.319152932694) * w
+                    - 0.531923007300) * w + 0.797884560593) * y * 2.0;
+        } else {
+            /* 2 <= |z| < Z_MAX: polynomial in the shifted variable y - 2 */
+            y -= 2.0;
+            x = (((((((((((((-0.000045255659 * y
+                    + 0.000152529290) * y - 0.000019538132) * y
+                    - 0.000676904986) * y + 0.001390604284) * y
+                    - 0.000794620820) * y - 0.002034254874) * y
+                    + 0.006549791214) * y - 0.010557625006) * y
+                    + 0.011630447319) * y - 0.009279453341) * y
+                    + 0.005353579108) * y - 0.002141268741) * y
+                    + 0.000535310849) * y + 0.999936657524;
+        }
+    }
+    return (z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5));
+}
+
+
+/**************  ChiSquare: probability of chi square value *************/
+
+/*ALGORITHM Compute probability of chi square value.
+Adapted from: 	Hill, I. D. and Pike, M. C.  Algorithm 299. Collected Algorithms for the CACM 1967 p. 243
+Updated for rounding errors based on remark in ACM TOMS June 1985, page 185. Found in Perlman.lib*/
+
+/*
+ * Probability of a chi-square value.
+ *   x:  obtained chi-square value
+ *   df: degrees of freedom
+ * Returns 1.0 when x <= 0 or df < 1. For df == 1 the result is
+ * 2 * Normalz(-sqrt(x)); for larger df further terms are added, using a
+ * logarithmic form of the series when x / 2 exceeds BIGX.
+ */
+double computePValueChiSquare(double x, int df) /* x: obtained chi-square value,  df: degrees of freedom */ {
+    double a, y, s;
+    double e, c, z;
+    int even; /* true if df is an even number */
+
+    if (x <= 0.0 || df < 1)
+        return (1.0);
+
+    y = 1;
+
+    a = 0.5 * x;
+    even = (2 * (df / 2)) == df;
+    if (df > 1)
+        y = ex(-a);
+    /* leading term: exp(-x/2) for even df, two-sided normal tail for odd df */
+    s = (even ? y : (2.0 * Normalz(-sqrt(x))));
+    if (df > 2) {
+        /* from here on x is reused as the upper bound of the summation index z */
+        x = 0.5 * (df - 1.0);
+        z = (even ? 1.0 : 0.5);
+        if (a > BIGX) {
+            /* large argument: accumulate the terms in log space; ex() flushes
+               exponents below -BIGX to zero */
+            e = (even ? 0.0 : LOG_SQRT_PI);
+            c = log(a);
+            while (z <= x) {
+                e = log(z) + e;
+                s += ex(c * z - a - e);
+                z += 1.0;
+            }
+            return (s);
+        } else {
+            /* moderate argument: plain recurrence e *= a / z, summed into c */
+            e = (even ? 1.0 : (I_SQRT_PI / sqrt(a)));
+            c = 0.0;
+            while (z <= x) {
+                e = e * (a / z);
+                c = c + e;
+                z += 1.0;
+            }
+            return (c * y + s);
+        }
+    } else
+        return (s);
+}
+
+
+/**
+ * Remove leading and trailing blanks (space, newline, carriage return, tab)
+ * from a string in place. A string made only of blanks becomes empty.
+ * @param str string to trim
+ */
+void trimString(string &str) {
+    static const char *const blanks = " \n\r\t";
+
+    // drop everything in front of the first non-blank character
+    const string::size_type first_kept = str.find_first_not_of(blanks);
+    str.erase(0, first_kept);
+
+    // drop everything behind the last non-blank character
+    const string::size_type last_kept = str.find_last_not_of(blanks);
+    str.erase(last_kept + 1);
+}
+
+/**
+ * Access the single shared Params object.
+ * The object is a function-local static, so it is constructed on the first
+ * call and the same instance is returned by reference on every call.
+ */
+Params& Params::getInstance() {
+    static Params instance;
+    return instance;
+}
+
+
+/**
+ * Number of processors available to the program.
+ *
+ * Despite its name this function reports the number of logical processors
+ * (GetSystemInfo on Windows, sysconf(_SC_NPROCESSORS_ONLN) elsewhere).
+ * The former CPUID-based hyper-threading detection was placed behind an
+ * unconditional return and never executed; it has been removed because its
+ * x86-only inline assembly prevented compilation on other architectures.
+ *
+ * @return processor count, at least 1 (also when the system query fails)
+ */
+int countPhysicalCPUCores() {
+#if defined(_WIN32) || defined(WIN32)
+    SYSTEM_INFO systeminfo;
+    GetSystemInfo( &systeminfo );
+    const long logicalcpucount = (long) systeminfo.dwNumberOfProcessors;
+#else
+    const long logicalcpucount = sysconf( _SC_NPROCESSORS_ONLN );
+#endif
+    // sysconf reports failure as -1; never hand a non-positive count to callers
+    if (logicalcpucount < 1)
+        return 1;
+    return (int) logicalcpucount;
+}
+
+// stacktrace.h (c) 2008, Timo Bingmann from http://idlebox.net/
+// published under the WTFPL v2.0
+
+/** Print a demangled stack backtrace of the caller function to FILE* out. */
+
+#if  defined(WIN32) || defined(__CYGWIN__) 
+
+// donothing for WIN32
+// Stub used when WIN32 or __CYGWIN__ is defined: both arguments are ignored and nothing is printed.
+void print_stacktrace(ostream &out, unsigned int max_frames) {}
+
+#else
+
+/**
+ * Print a demangled stack backtrace of the caller to a stream.
+ *
+ * @param out        stream receiving one line per stack frame
+ * @param max_frames maximum number of frames to collect
+ *
+ * When compiled with OpenMP the whole body is an "omp master" block.
+ * Frames whose symbol line cannot be parsed are skipped silently.
+ *
+ * NOTE(review): the result of backtrace_symbols() is used without a NULL
+ * check, and addrlist is a variable-length array, which is a compiler
+ * extension in C++ - confirm both are acceptable for the supported platforms.
+ */
+void print_stacktrace(ostream &out, unsigned int max_frames)
+{
+#ifdef _OPENMP
+#pragma omp master
+{
+#endif
+    out << "STACK TRACE FOR DEBUGGING:" << endl;
+
+    // storage array for stack trace address data
+    void* addrlist[max_frames+1];
+
+    // retrieve current stack addresses
+    int addrlen = backtrace(addrlist, sizeof(addrlist) / sizeof(void*));
+
+//    if (addrlen == 0) {
+//        out << "  <empty, possibly corrupt>" << endl;
+//        return;
+//    }
+
+    // resolve addresses into strings containing "filename(function+address)",
+    // this array must be free()-ed
+    char** symbollist = backtrace_symbols(addrlist, addrlen);
+
+    // allocate string which will be filled with the demangled function name
+    size_t funcnamesize = 256;
+    char* funcname = (char*)malloc(funcnamesize);
+
+    // iterate over the returned symbol lines. skip the first, it is the
+    // address of this function.
+    for (int i = 1; i < addrlen; i++)
+    {
+	char *begin_name = 0, *begin_offset = 0;
+
+	// find parentheses and +address offset surrounding the mangled name:
+#ifdef __clang__
+      // OSX style stack trace
+      // NOTE(review): *(p-1) is evaluated for the first character as well,
+      // i.e. one byte in front of the string, if the line starts with '_'.
+      for ( char *p = symbollist[i]; *p; ++p )
+      {
+         if (( *p == '_' ) && ( *(p-1) == ' ' ))
+            begin_name = p-1;
+         else if ( *p == '+' )
+            begin_offset = p-1;
+      }
+
+      if ( begin_name && begin_offset && ( begin_name < begin_offset ))
+      {
+         *begin_name++ = '\0';
+         *begin_offset++ = '\0';
+
+         // mangled name is now in [begin_name, begin_offset) and caller
+         // offset in [begin_offset, end_offset). now apply
+         // __cxa_demangle():
+         int status;
+         char* ret = abi::__cxa_demangle( begin_name, &funcname[0],
+                                          &funcnamesize, &status );
+         if ( status == 0 )
+         {
+            funcname = ret; // use possibly realloc()-ed string
+//            out << "  " << symbollist[i] << " : " << funcname << "+"<< begin_offset << endl;
+            out << i << "   "  << funcname << endl;
+         } else {
+            // demangling failed. Output function name as a C function with
+            // no arguments.
+//             out << "  " << symbollist[i] << " : " << begin_name << "()+"<< begin_offset << endl;
+            out << i << "   " << begin_name << "()" << endl;
+         }
+
+#else // !DARWIN - but is posix
+         // ./module(function+0x15c) [0x8048a6d]
+    char *end_offset = 0;
+	for (char *p = symbollist[i]; *p; ++p)
+	{
+	    if (*p == '(')
+		begin_name = p;
+	    else if (*p == '+')
+		begin_offset = p;
+	    else if (*p == ')' && begin_offset) {
+		end_offset = p;
+		break;
+	    }
+	}
+
+	if (begin_name && begin_offset && end_offset
+	    && begin_name < begin_offset)
+	{
+	    *begin_name++ = '\0';
+	    *begin_offset++ = '\0';
+	    *end_offset = '\0';
+
+	    // mangled name is now in [begin_name, begin_offset) and caller
+	    // offset in [begin_offset, end_offset). now apply
+	    // __cxa_demangle():
+
+	    int status;
+	    char* ret = abi::__cxa_demangle(begin_name,
+					    funcname, &funcnamesize, &status);
+	    if (status == 0) {
+            funcname = ret; // use possibly realloc()-ed string
+//            out << "  " << symbollist[i] << " : " << funcname << "+"<< begin_offset << endl;
+            out << i << "   " << funcname << endl;
+	    }
+	    else {
+            // demangling failed. Output function name as a C function with
+            // no arguments.
+//            out << "  " << symbollist[i] << " : " << begin_name << "()+"<< begin_offset << endl;
+            out << i << "   " << begin_name << "()" << endl;
+	    }
+#endif
+	// this brace closes the "if (begin_name && ...)" opened in whichever
+	// preprocessor branch above was compiled
+	}
+	else
+	{
+	    // couldn't parse the line? print the whole line.
+	    // (the print statement is disabled, so such frames produce no output)
+//	    out << i << ". " << symbollist[i] << endl;
+	}
+    }
+
+    free(funcname);
+    free(symbollist);
+#ifdef _OPENMP
+}
+#endif
+
+}
+
+#endif // WIN32
diff --git a/tools.h b/tools.h
new file mode 100644
index 0000000..b1a1d02
--- /dev/null
+++ b/tools.h
@@ -0,0 +1,2246 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef TOOLS_H
+#define TOOLS_H
+
+#include <iqtree_config.h>
+#include <vector>
+#include <string>
+#include <set>
+#include <map>
+#include <iostream>
+#include <fstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+
+//#include <sys/time.h>
+//#include <time.h>
+#include <sys/stat.h>
+#include <math.h>
+#include "ncl/ncl.h"
+#include "msetsblock.h"
+
+#define SPRNG
+#include "sprng/sprng.h"
+
+
+#define USE_HASH_MAP
+
+#ifdef __GNUC__
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#else
+#define GCC_VERSION 0
+#endif
+
+// for MSVC
+#ifndef __func__
+#define __func__ __FUNCTION__
+#endif
+
+#if defined(USE_HASH_MAP)
+	#if defined(_MSC_VER)
+		#include <unordered_map>
+		#include <unordered_set>
+    #elif defined(__clang__)
+		#include <tr1/unordered_map>
+		#include <tr1/unordered_set>
+		using namespace std::tr1;    
+	#elif !defined(__GNUC__)
+		#include <hash_map>
+		#include <hash_set>
+		using namespace stdext;
+	#elif GCC_VERSION < 40300
+		#include <ext/hash_map>
+		#include <ext/hash_set>
+		using namespace __gnu_cxx;
+		#define unordered_map hash_map
+		#define unordered_set hash_set
+	#else
+		#include <tr1/unordered_map>
+		#include <tr1/unordered_set>
+		using namespace std::tr1;
+	#endif
+#else
+	#include <map>
+	#include <set>
+#endif
+
+using namespace std;
+
+
+#if	defined(USE_HASH_MAP) && GCC_VERSION < 40300 && !defined(_MSC_VER) && !defined(__clang__)
+/*
+        Specialization of the hash functor for string keys (not for Split, as this
+        comment used to say); it delegates to the hasher for C strings (const char*).
+ */
+#if !defined(__GNUC__)
+namespace stdext {
+#else
+namespace __gnu_cxx {
+#endif
+    template<>
+    struct hash<string> {
+        // take the key by const reference so that hashing does not copy the string
+        size_t operator()(const string &str) const {
+            hash<const char*> hash_str;
+            return hash_str(str.c_str());
+        }
+    };
+} // namespace
+#endif // USE_HASH_MAP
+
+
+/**
+        Ordinary least-squares fit of the line y = a + b*x through n points (x[i], y[i]).
+ */
+class Linear {
+public:
+    /**
+            Fit the regression line to the given sample.
+            @param n number of points; must be positive
+            @param x abscissae, array of n values that must not all be equal
+            @param y ordinates, array of n values
+     */
+    Linear(int n, const double *x, const double *y) {
+        // an empty sample would make the means below 0/0
+        assert(n > 0);
+
+        // means of x and y
+        double xa = 0, ya = 0;
+        for (int i = 0; i < n; i++) {
+            xa += x[i];
+            ya += y[i];
+        }
+        xa /= n;
+        ya /= n;
+
+        // centred sums of squares (xx, yy) and of cross products (xy)
+        double xx = 0, yy = 0, xy = 0;
+        for (int i = 0; i < n; i++) {
+            double tmpx = x[i] - xa, tmpy = y[i] - ya;
+            xx += tmpx * tmpx;
+            yy += tmpy * tmpy;
+            xy += tmpx * tmpy;
+        }
+
+        // all x equal would give an infinite slope
+        assert(fabs(xx) != 0);
+
+        m_b = xy / xx;
+        m_a = ya - m_b * xa;
+        // a constant y is fitted exactly, so report a coefficient of 1 instead of 0/0
+        m_coeff = (fabs(yy) == 0) ? 1 : xy / sqrt(xx * yy);
+    }
+
+    //! Returns the value of the regression line at x
+    double getValue(double x) const {
+        return m_a + m_b * x;
+    }
+
+    //! Returns the slope of the regression line
+    double getSlope() const { return m_b; }
+
+    //! Returns the intercept on the Y axis of the regression line
+    double getIntercept() const { return m_a; }
+
+    //! Returns the linear regression coefficient
+    double getCoefficient() const { return m_coeff; }
+
+private:
+    //! intercept, slope and regression coefficient of the fitted line
+    double m_a, m_b, m_coeff;
+};
+
+/**
+        vector of double-precision numbers
+ */
+typedef vector<double> DoubleVector;
+
+/**
+        vector of int (same underlying type as IntVector)
+ */
+typedef vector<int> IntList;
+
+
+/**
+        vector of int (same underlying type as IntList)
+ */
+typedef vector<int> IntVector;
+
+/**
+        vector of bool
+ */
+typedef vector<bool> BoolVector;
+
+
+/**
+        vector of char
+ */
+typedef vector<char> CharVector;
+
+/**
+        vector of string
+ */
+typedef vector<string> StrVector;
+
+
+/**
+        generic matrix type: matrix(T) expands to a vector of vectors of T
+ */
+#define matrix(T) vector<vector<T> >
+
+/**
+        matrix of double, i.e. a vector of vectors of double (see the typedef below)
+ */
+/* earlier array-based implementation, disabled in favour of the typedef below:
+class DoubleMatrix {
+public:
+        double *value;
+        int rows, cols, size;
+        DoubleMatrix(int arows, int acols);
+        //inline double operator() (int i, int j);
+        inline double &operator() (int i, int j) { return value[i * cols + j]; };
+        inline double *operator[] (int i) {	return value + (i*cols); };
+        virtual ~DoubleMatrix();
+        void setZero();
+};
+ */
+typedef matrix(double) DoubleMatrix;
+/** shorthand for unsigned int */
+typedef unsigned int UINT;
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+        run mode of the program, i.e. which algorithm is applied (see Params::run_mode); DETECTED means the real mode is kept in Params::detected_mode
+ */
+enum RunMode {
+    DETECTED, GREEDY, PRUNING, BOTH_ALG, EXHAUSTIVE, DYNAMIC_PROGRAMMING,
+    CALC_DIST, PD_USER_SET, PRINT_TAXA, PRINT_AREA, SCALE_BRANCH_LEN,
+    SCALE_NODE_NAME, PD_DISTRIBUTION, LINEAR_PROGRAMMING, STATS //, GBO, MPRO
+}; // STATS and GBO added by MA (STATS for some statistics on tree, GBO = guided 'bootstrap'); GBO and MPRO are currently commented out
+
+/**
+        kind of random tree or split graph to generate (see Params::tree_gen)
+ */
+enum TreeGenType {
+    NONE, YULE_HARDING, UNIFORM, CATERPILLAR, BALANCED,
+    CIRCULAR_SPLIT_GRAPH, TAXA_SET, STAR_TREE
+};
+
+/**
+        flags used when writing a tree (each value is a distinct power of two, so flags can be combined bitwise):
+                WT_BR_LEN - output branch length
+                WT_BR_CLADE - put branch length into internal node name
+                WT_TAXON_ID - output taxon ID
+                WT_INT_NODE - when drawing a tree, draw the internal node ID
+                WT_BR_SCALE - when drawing a tree, draw each branch proportional to its length
+                WT_SORT_TAXA - sort the taxa s.t. subtrees with least taxon ID come first
+                WT_APPEND    - append to the output file
+                WT_NEWLINE   - print a newline afterwards
+                WT_BR_LEN_FIXED_WIDTH - print branch length in fixed number format
+ */
+const int WT_BR_LEN = 1;
+const int WT_BR_CLADE = 2;
+const int WT_TAXON_ID = 4;
+const int WT_INT_NODE = 8;
+const int WT_BR_SCALE = 16;
+const int WT_SORT_TAXA = 32;
+const int WT_APPEND = 64;
+const int WT_NEWLINE = 128;
+const int WT_BR_LEN_FIXED_WIDTH = 256;
+const int WT_BR_ID = 512; // not described in the list above; presumably outputs branch IDs - TODO confirm
+const int WT_BR_LEN_ROUNDING = 1024; // not described in the list above; presumably rounds branch lengths - TODO confirm
+const int TRUE = 1; // integer truth value
+const int FALSE = 0; // integer false value
+
+/**
+ *  Different ways of doing an NNI.
+ *  TOPO_ONLY: only change the tree topology
+ *  TOPO_UPDATE_LV: the same as above, but the partial likelihoods are updated in addition
+ *  NNI1: optimize the central branch after changing the tree topology
+ *  NNI5: optimize the 5 affected branches after changing the tree topology
+ */
+enum NNI_Type {
+    TOPO_ONLY,
+    TOPO_UPDATE_LV,
+    NNI1,
+    NNI5
+};
+
+/**
+        modes for computing Robinson-Foulds distances (see Params::rf_dist_mode)
+ */
+const int RF_ADJACENT_PAIR = 1;
+const int RF_ALL_PAIR = 2;
+const int RF_TWO_TREE_SETS = 3;
+const int RF_TWO_TREE_SETS_EXTENDED = 4; // works for trees with non-equal taxon sets
+
+/**
+        ways to summarize split weights (see Params::split_weight_summary)
+ */
+const int SW_COUNT = 1; // just count the number of splits
+const int SW_SUM = 2; // take the sum of all split weights
+const int SW_AVG_ALL = 3; // take the split weight average over all trees
+const int SW_AVG_PRESENT = 4; // take the split weight average over the trees in which the split is present
+
+/**
+        search mode
+ */
+//enum SearchMode {EXHAUSTIVE, EXHAUSTIVE_CIRCULAR};
+
+/**
+        input file format, as named by the enumerators (Newick, Nexus, FASTA, PHYLIP, Clustal, MSF, or other); see Params::intype
+ */
+enum InputType {
+    IN_NEWICK, IN_NEXUS, IN_FASTA, IN_PHYLIP, IN_CLUSTAL, IN_MSF, IN_OTHER
+};
+
+/**
+        verbose mode: determines how much output is printed to the screen.
+ */
+enum VerboseMode {
+    VB_QUIET, VB_MIN, VB_MED, VB_MAX, VB_DEBUG
+};
+
+/**
+        global verbose level for screen output (only declared here; defined elsewhere)
+ */
+extern VerboseMode verbose_mode;
+
+/**
+        type of consensus building (see Params::consensus_type)
+ */
+enum ConsensusType {
+    CT_NONE, CT_CONSENSUS_TREE, CT_CONSENSUS_NETWORK,
+    CT_ASSIGN_SUPPORT, CT_ASSIGN_SUPPORT_EXTENDED, COMPARE
+};
+/** kind of test applied to the input split system (see Params::test_input) */
+enum TestType {
+    TEST_NONE, TEST_COMPATIBLE, TEST_CIRCULAR, TEST_WEAKLY_COMPATIBLE, TEST_K_COMPATIBLE
+};
+
+/**
+        state frequency type (see Params::freq_type)
+ */
+enum StateFreqType {
+    FREQ_UNKNOWN, FREQ_USER_DEFINED, FREQ_EQUAL, FREQ_EMPIRICAL, FREQ_ESTIMATE,
+    FREQ_CODON_1x4, FREQ_CODON_3x4, FREQ_CODON_3x4C, // special frequencies for codon models
+    FREQ_MIXTURE // mixture-frequency model
+};
+
+/**
+        alignment output format (see Params::aln_output_format)
+ */
+
+enum AlnFormat {
+    ALN_PHYLIP, ALN_FASTA
+};
+/** criterion for model testing - presumably AIC, corrected AIC (AICc), BIC, or all of them; TODO confirm */
+enum ModelTestCriterion {
+    MTC_AIC, MTC_AICC, MTC_BIC, MTC_ALL
+};
+
+/**
+        stopping condition type (see Params::stop_condition)
+ */
+enum STOP_CONDITION {
+    SC_FIXED_ITERATION, SC_WEIBULL, SC_UNSUCCESS_ITERATION, SC_BOOTSTRAP_CORRELATION, SC_REAL_TIME
+};
+/** criterion to assess important quartets (see Params::iqp_assess_quartet) */
+enum IQP_ASSESS_QUARTET {
+    IQP_DISTANCE, IQP_PARSIMONY, IQP_BOOTSTRAP
+};
+/** type of least-squares variance (see Params::ls_var_type) */
+enum LEAST_SQUARE_VAR {
+    OLS, WLS_FIRST_TAYLOR, WLS_FITCH_MARGOLIASH, WLS_SECOND_TAYLOR, WLS_PAUPLIN
+};
+/** type of starting tree (see Params::start_tree) */
+enum START_TREE_TYPE {
+	STT_BIONJ, STT_PARSIMONY, STT_PLL_PARSIMONY, STT_RANDOM_TREE
+};
+
+const int MCAT_LOG = 1; // bit flag for Params::mcat_type: categorize by log(rate) for Meyer & von Haeseler model
+const int MCAT_MEAN = 2; // bit flag: take the mean of rates for each category for Meyer & von Haeseler model
+const int MCAT_PATTERN = 4; // bit flag: categorize site-patterns instead of sites for Meyer & von Haeseler model
+// upper bound on genetic distance - presumably a cap applied to estimated distances; TODO confirm
+const double MAX_GENETIC_DIST = 9.0;
+
+struct NNIInfo { // likelihood scores and branch lengths recorded around an NNI (see the fields below)
+    double lh_score[4]; // tree log-likelihoods, in order: zero-branch tree, current tree, NNI tree 1, NNI tree 2
+    double br_len[4]; // branch lengths, in order: current branch, optimized branch, NNI branch 1, NNI branch 2
+    int nni_round; // presumably the NNI round this record belongs to - TODO confirm
+    int iqpnni_iteration; // presumably the IQPNNI iteration this record belongs to - TODO confirm
+};
+
+enum LikelihoodKernel { // likelihood kernel choice; this is the type of Params::SSE
+	LK_NORMAL, LK_SSE, LK_EIGEN, LK_EIGEN_SSE
+};
+// likelihood memory-saving mode - presumably: auto-detect, store for all branches, or store per node; TODO confirm
+enum LhMemSave {
+	LM_DETECT, LM_ALL_BRANCH, LM_PER_NODE
+};
+
+/** maximum number of Newton-Raphson steps for NNI branch evaluation (only declared here; defined elsewhere) */
+extern int NNI_MAX_NR_STEP;
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+        program parameters, everything is specified here
+        Use singleton pattern to avoid using global variable or
+        having to pass the params variable around
+ */
+class Params {
+public:
+    static Params& getInstance();
+private:
+    Params () {}; // Disable constructor
+    // Temporarily commented out because void PhyloSuperTree::readPartition(Params &params)
+    // makes a copy of params?
+    //Params (Params const&) {}; // Disable copy constructor
+    //void operator=(Params const&) {}; // Disable assignment
+public:
+
+    /**
+    *  Fast and accurate optimization for alpha and p_invar
+    */
+    bool fai;
+
+	/**
+	 *  Use random restart strategy for estimating alpha and p_invar
+	 */
+	bool testAlpha;
+
+    /**
+     *  Logl epsilon to test for initial alpha and pinvar values.
+     *  This does not need to be small (default value = 100)
+     */
+    double testAlphaEps;
+
+    /**
+     *  Perform exhaustive search for parameter alpha and p_invar
+     */
+    bool exh_ai;
+
+	/**
+	 *  User file contains the alpha and invar parameters
+	 */
+	char* alpha_invar_file;
+
+	/**
+	 * Turn on feature to identify stable splits and fix them during tree search
+	 */
+	bool fix_stable_splits;
+
+	/**
+	 *  Number of distinct locally optimal trees
+	 */
+	int numSupportTrees;
+
+	/**
+	 *  Number of starting parsimony trees
+	 */
+	int numInitTrees;
+
+	/**
+	 *  SPR distance (radius) for parsimony tree
+	 */
+	int sprDist;
+
+	/**
+	 *  Number of NNI locally optimal trees generated from the set of parsimony trees
+	 *  Default = 20 (out of 100 parsimony trees)
+	 */
+	int numNNITrees;
+
+	/**
+	 *  Number of best trees in the candidate set used to generate perturbed trees
+	 */
+	int popSize;
+
+	/**
+	 *  Maximum number of trees stored in the candidate tree set
+	 */
+	int maxCandidates;
+
+	/**
+	 *  heuristics for speeding up NNI evaluation
+	 */
+	bool speednni;
+
+	/**
+	 *  use reduction technique to constrain tree space
+	 */
+	bool reduction;
+
+	/**
+	 *  portion of NNI used for perturbing the tree
+	 */
+	double initPS;
+
+	/**
+	 *  logl epsilon for model parameter optimization
+	 */
+	double modeps;
+
+	/**
+	 *  New search heuristics (DEFAULT: ON)
+	 */
+	bool snni;
+
+	/**
+	 *  Specify how the branch lengths are optimized after each NNI operation
+	 *  (No optimization, 1 branch optimization, 5 branch optimization)
+	 */
+    NNI_Type nni_type;
+
+    /**
+     *  Different type of Least Square variances
+     */
+	LEAST_SQUARE_VAR ls_var_type;
+
+	/**
+	 *  Threshold (likelihood difference between NNI and current tree)
+	 *  to start optimizing 5 branches
+	 */
+	double nniThresHold;
+
+	/**
+	 *  Optimize 5 branches on NNI tree
+	 */
+	bool nni5;
+
+    /**
+     *  Number of branch length optimization rounds performed after
+     *  each NNI step (DEFAULT: 1)
+     */
+    int numSmoothTree;
+
+    /**
+     *   compute least square branches for a given tree
+     */
+    bool leastSquareBranch;
+
+    /** TRUE to apply Manuel's analytic approximation formulae for branch length */
+    bool manuel_analytic_approx;
+
+    /** TRUE to compute parsimony branch length of final tree */
+    bool pars_branch_length;
+
+    /** TRUE to compute bayesian branch length for the final tree */
+    bool bayes_branch_length;
+
+    /**
+     *  use Least Square to evaluate NNI
+     */
+    bool leastSquareNNI;
+
+    /**
+     *  epsilon value used to compare log-likelihood between trees
+     */
+    double loglh_epsilon;
+
+    /*
+     *  reinsert leaves back to tree using parsimony
+     */
+    bool reinsert_par;
+
+    /*
+     *  Option to evaluate 10 different starting trees and take the best
+     */
+    bool bestStart;
+
+    /**
+     *  Maximum running time of the tree search in minutes
+     */
+    double maxtime;
+
+    /**
+     *  Turn on parsimony branch length estimation
+     */
+    bool parbran;
+
+    /**
+     *  option to turn on phylogenetic library
+     */
+    bool pll;
+
+    /**
+     *  OBSOLETE! Stopping rule for the tree search
+     */
+//    bool autostop;
+
+    /**
+     *  Number of maximum unsuccessful iterations after the search is stopped.
+     *  Used for the automatic stopping rule
+     */
+    int unsuccess_iteration;
+
+    char *binary_aln_file;
+
+    /**
+     *  the speed up heuristic will be used after
+     *  speedup_iter iteration
+     */
+    int speedup_iter;
+
+    /**
+     *  starting CPU time of the program
+     */
+    double startCPUTime;
+
+    /** starting real time of the program */
+    double start_real_time;
+
+    /**
+     *  Number iteration = num_taxa * iteration_multiple
+     */
+    int iteration_multiple;
+    /**
+             input file name
+     */
+    char *user_file;
+
+    /* type of starting tree */
+    START_TREE_TYPE start_tree;
+
+    /**
+            prefix of the output file, default is the same as input file
+     */
+    char *out_prefix;
+
+    /**
+            alignment file name
+     */
+    char *aln_file;
+
+    /**
+            file containing multiple trees to evaluate at the end
+     */
+    char *treeset_file;
+
+    /** number of bootstrap replicates for tree topology test */
+    int topotest_replicates;
+
+    /** true to perform weighted SH and KH test */
+    bool do_weighted_test;
+
+    /** true to do the approximately unbiased (AU) test */
+    bool do_au_test;
+
+    /**
+            file specifying partition model
+     */
+    char *partition_file;
+
+    /**
+     * 		defines the relation between edge lengths in supertree and subtrees
+     * 		0 (NULL) for separate edge length (default)
+     * 		'p' for proportional edge length
+     * 		'j' for joint edge length
+     */
+    char partition_type;
+
+    /** percentage for rcluster algorithm like PartitionFinder */
+    double partfinder_rcluster; 
+
+    /** remove all-gap sequences in partition model to account for terrace default: TRUE */
+    bool remove_empty_seq;
+
+    /** use terrace aware data structure for partition models, default: TRUE */
+    bool terrace_aware;
+
+    /**
+            B, D, or P for Binary, DNA, or Protein sequences
+     */
+    char *sequence_type;
+
+    /**
+            alignment output file name
+     */
+    char *aln_output;
+
+    /**
+            file containing site likelihood as input for 'guided bootstrap' (added by MA)
+     */
+    char *siteLL_file;
+
+    /**
+            alignment where the gappy patterns will be superimposed into the input alignment
+     */
+    char *gap_masked_aln;
+
+    /**
+            alignment to be concatenated into the input alignment
+     */
+    char *concatenate_aln;
+
+    /**
+            file containing the list of site positions to keep, format:
+            pos1 pos2
+            ....
+     */
+    char *aln_site_list;
+
+    /**
+            name of the reference sequence where aln_site_list is based on,
+            NULL to take alignment positions.
+     */
+    char *ref_seq_name;
+
+    /**
+            alignment output format
+     */
+    AlnFormat aln_output_format;
+
+    /**
+            TRUE to discard all gappy positions
+     */
+    bool aln_nogaps;
+
+    /**
+     * TRUE to discard all constant sites
+     */
+    bool aln_no_const_sites;
+
+    /**
+            OBSOLETE compute parsimony score on trees
+     */
+//    bool parsimony;
+
+    /**
+            compute random step-wise addition parsimony tree instead of BIONJ
+     */
+//    bool parsimony_tree;
+
+    /**
+             output file name
+     */
+    char *out_file;
+
+    /**
+             size of the maximal PD-tree
+     */
+    int sub_size;
+
+    /**
+             min size of the maximal PD-tree
+             used to calculate all PD-k trees from min_size to sub_size
+     */
+    int min_size;
+
+    /**
+            step_size when running from min_size to sub_size
+     */
+    int step_size;
+
+    /**
+            conservation proportion, another way to input the set size
+     */
+    double pd_proportion;
+
+    /**
+            min conservation proportion
+     */
+    double min_proportion;
+
+    /**
+            step conservation proportion
+     */
+    double step_proportion;
+
+    /**
+            sample size for computing PD distribution
+     */
+    int sample_size;
+
+
+    /**
+            TRUE if want to find all optimal PD-k set
+            with the same maximal PD score
+     */
+    bool find_all;
+
+    /**
+             type of random tree to be generated
+     */
+    TreeGenType tree_gen;
+
+    /**
+            when generating random split graph, specify the number of
+            splits here!
+     */
+    int num_splits;
+
+    /**
+             running mode: which algorithms to be applied
+     */
+    RunMode run_mode;
+
+    /**
+             real running mode if run_mode == DETECTED
+     */
+    RunMode detected_mode;
+
+    /**
+             parameter file
+     */
+    char *param_file;
+
+    /**
+            file containing taxa names to be included into the PD-tree
+     */
+    char *initial_file;
+
+    /**
+            file containing area names to be included into the PD set
+     */
+    char *initial_area_file;
+
+    /**
+            file containing a list of specific taxa sets which user wants
+            to compute PD score on these sets only
+     */
+    char *pdtaxa_file;
+
+    /**
+            sets relation file, in form of a distance matrix file
+     */
+    char *areas_boundary_file;
+
+    /**
+            boundary length modifier
+     */
+    double boundary_modifier;
+
+    /**
+            output file to store the distance matrix
+     */
+    char *dist_file;
+
+    /**
+            TRUE to compute the observed distances instead of Jukes-Cantor distances, default: FALSE
+     */
+    bool compute_obs_dist;
+
+    /**
+            TRUE to compute the Jukes-Cantor distances, default: FALSE
+     */
+    bool compute_jc_dist;
+
+    /**
+            TRUE to compute the maximum-likelihood distances
+     */
+    bool compute_ml_dist;
+
+    /**
+            TRUE to compute the maximum-likelihood tree
+     */
+    bool compute_ml_tree;
+
+    /**
+            file containing budget information
+     */
+    char *budget_file;
+
+    /**
+            used when generating pair of taxa set with overlapping
+     */
+    int overlap;
+
+    // private use
+    /**
+             number of times to repeat the algorithms
+     */
+    int repeated_time;
+
+    /**
+             print no tree to output
+     */
+    int nr_output;
+
+    /**
+            input type, tree or splits graph
+     */
+    InputType intype;
+
+    /**
+            total budget, for cost constrained PD problem
+     */
+    int budget;
+
+    /**
+            minimum budget, for cost constrained PD problem
+     */
+    int min_budget;
+
+    /**
+            step_budget when running from min_budget to budget
+     */
+    int step_budget;
+
+    /**
+            name of the root taxon
+     */
+    const char *root;
+
+    /**
+            true if tree is forced to be rooted
+     */
+    bool is_rooted;
+
+
+    /**
+            min branch length, used to create random tree/network
+     */
+    double min_len;
+
+    /**
+            mean branch length, used to create random tree/network
+     */
+    double mean_len;
+
+    /**
+            max branch length, used to create random tree/network
+     */
+    double max_len;
+
+    /**
+            number of internal branches to set zero length
+     */
+    int num_zero_len;
+
+    /**
+            random number seed
+     */
+    unsigned int ran_seed;
+
+    /**
+            run time of the algorithm
+     */
+    double run_time;
+
+    /**
+            limit on the number of optimal PD sets
+     */
+    int pd_limit;
+
+    /**
+            TRUE if one wants to calculate the PD gain matrix in terms of delta_k^j = pd(PD_k \/ {j}) - pd_k
+     */
+    bool calc_pdgain;
+
+    /**
+            TRUE if tree file contains more than 1 tree
+     */
+    bool multi_tree;
+
+    /**
+            2nd user tree used in assignBootstrapSupport
+     */
+    char *second_tree;
+
+    /** 
+        tag each branch with the tree ID where it occurs; "ALL" to tag all branches
+    */
+    char *support_tag;
+
+    /**
+            2nd alignment used in computing multinomialProb (Added by MA)
+     */
+    char *second_align;
+    /**
+            type of consensus building
+     */
+    ConsensusType consensus_type;
+
+    /**
+            file containing weights for every tree in the input tree file
+     */
+    char *tree_weight_file;
+
+    /**
+            set the TRUE if want to find the minimal PD set, instead of the default maximal PD set
+     */
+    bool find_pd_min;
+
+    /**
+            set TRUE to find area's endemic PD instead of regular PD
+     */
+    bool endemic_pd;
+
+    /**
+            set TRUE to find exclusive PD instead of regular PD
+     */
+    bool exclusive_pd;
+
+    /**
+            to find PD complementarity given this area
+     */
+    char *complement_area;
+
+    /**
+            used for likelihood mapping: for each branch, print the four cluster
+     */
+    int branch_cluster;
+
+    /**
+            file containing taxa order
+     */
+    char *taxa_order_file;
+
+    /**
+            to scale branch length or clade support with a factor
+     */
+    double scaling_factor;
+
+    /**
+            TRUE if always use binary linear programming
+     */
+    bool binary_programming;
+
+    /**
+            test the input split system in one of the TestType
+     */
+    TestType test_input;
+
+    /**
+            burnin value: number of beginning trees to be discarded
+     */
+    int tree_burnin;
+
+    /**
+            maximum number of trees to consider (for e.g. consensus tree construction)
+     */
+    int tree_max_count;
+
+    /**
+        threshold of split frequency, splits appear less than threshold will be discarded
+     */
+    double split_threshold;
+
+    /**
+            threshold of split weight, splits with weight less than or equal to threshold will be discarded
+     */
+    double split_weight_threshold;
+
+    /**
+            Way to summarize split weight in the consensus tree or network: SW_SUM, SW_AVG_ALL, or SW_AVG_PRESENT
+     */
+    double split_weight_summary;
+
+    /**
+            TRUE if use quadratic programming (for GUROBI)
+     */
+    bool quad_programming;
+
+    /**
+            true if one wants to optimize tree by subtree pruning and regrafting
+     */
+    bool tree_spr;
+
+    /**
+            true if printing out of optimal sets in NEXUS format
+     */
+    bool nexus_output;
+
+    /**
+            k-representative parameter, used for IQP algorithm
+     */
+    int k_representative;
+
+    /**
+            probability of deleting a leaf, used for IQP algorithm
+     */
+    double p_delete;
+
+    /**
+            min number of iqpnni iterations
+     */
+    int min_iterations;
+
+    /**
+            max number of iqpnni iterations
+     */
+    int max_iterations;
+
+    /**
+            stop condition, one of the STOP_CONDITION values (e.g. SC_FIXED_ITERATION)
+     */
+    STOP_CONDITION stop_condition;
+
+    /**
+            confidence value for stop rule
+     */
+    double stop_confidence;
+
+    /** number iterations for parameter optimization, default: 100 */
+    int num_param_iterations;
+
+    /**
+            name of the substitution model (e.g., HKY, GTR, TN+I+G, JC+G, etc.)
+     */
+    string model_name;
+
+    /** set of models for testing */
+    char *model_set;
+
+    /** subset of models for testing, e.g. viral, mitochondrial */
+    char *model_subset;
+
+    /** set of state frequencies model for testing */
+    char *state_freq_set;
+
+    /** set of rate heterogeneity model for testing */
+    char *ratehet_set;
+
+    /** model definition file */
+    char *model_def_file;
+
+    /** true to redo model testing even if .model file exists */
+    bool model_test_again;
+
+    /** 0: use the same tree for model testing 
+        1: estimate tree for each model, but initialize the tree for next model 
+           by the tree reconstructed from the previous model
+        2: estimate tree for each model independently
+        */
+    short int model_test_and_tree;
+
+    /** true to first test equal rate model, then test rate heterogeneity (default: false) */
+    bool model_test_separate_rate;
+
+    /** TRUE to optimize mixture model weights */
+    bool optimize_mixmodel_weight;
+
+    /**
+            TRUE to store transition matrix into a hash table for computation efficiency
+     */
+    bool store_trans_matrix;
+
+    /**
+            state frequency type
+     */
+    StateFreqType freq_type;
+
+
+    /**
+            the number of rate categories
+     */
+    int num_rate_cats;
+
+    /**
+            minimum number of rate categories
+     */
+    int min_rate_cats;
+
+    /**
+            maximum number of rate categories
+     */
+    int max_rate_cats;
+
+    /**
+            shape parameter (alpha) of the Gamma distribution for site rates
+     */
+    double gamma_shape;
+
+    /**
+            TRUE to use median rate for discrete categories, FALSE to use mean rate instead
+     */
+    bool gamma_median;
+
+    /**
+            proportion of invariable sites
+     */
+    double p_invar_sites;
+
+    /** TRUE to optimize all model and rate parameters jointly by BFGS, default: FALSE */
+    bool optimize_model_rate_joint;
+
+    /**
+            TRUE if you want to optimize branch lengths by Newton-Raphson method
+     */
+    bool optimize_by_newton;
+
+    /** optimization algorithm for parameter estimation: 1-BFGS, 2-BFGS, EM */
+    string optimize_alg;
+
+    /**
+            TRUE if you want to fix branch lengths during model optimization
+     */
+    bool fixed_branch_length;
+
+    /**
+            criterion to assess important quartet
+     */
+    IQP_ASSESS_QUARTET iqp_assess_quartet;
+
+    /**
+     *      Using IQP algorithm to do tree perturbation
+     */
+    bool iqp;
+
+    /**
+            the LP file is in gurobi format or not
+     */
+    bool gurobi_format;
+
+    /**
+            number of threads for gurobi call
+     */
+    bool gurobi_threads;
+
+    /**
+            TRUE if doing bootstrap on the input trees (good, bad, ugly)
+     */
+    int num_bootstrap_samples;
+
+    /** bootstrap specification of the form "l1:b1,l2:b2,...,lk:bk"
+        to randomly draw b1 sites from the first l1 sites, etc. Note that l1+l2+...+lk
+        must equal m, where m is the alignment length. Otherwise, an error will occur.
+        The default bootstrap_spec == NULL, a standard procedure is applied, i.e., randomly draw m sites.
+    */
+    char *bootstrap_spec;
+
+    /**
+            1 if output all intermediate trees from every IQPNNI iteration
+            2 if output all intermediate trees + 1-NNI-away trees
+     */
+    int write_intermediate_trees;
+
+    /**
+     *  Write out all candidate trees (the locally optimal trees)
+     */
+    int write_local_optimal_trees;
+
+    /**
+        TRUE to avoid duplicated trees while writing intermediate trees
+     */
+    bool avoid_duplicated_trees;
+
+    /**
+            Robinson-Foulds distance computation mode: RF_ADJACENT PAIR, RF_ALL_PAIR
+     */
+    int rf_dist_mode;
+
+    /**
+            compute the site-specific rates by Meyer & von Haeseler method
+     */
+    bool mvh_site_rate;
+
+    /**
+            FALSE to use MH Model, FALSE for using tree-likelihood
+     */
+    bool rate_mh_type;
+
+    /**
+            TRUE to discard saturated sites for the Meyer & von Haeseler (2003) model
+     */
+    bool discard_saturated_site;
+
+    /**
+            rates will be normalized to this mean value
+     */
+    double mean_rate;
+
+    /**
+            Percentage threshold to accept a branch of the approximate likelihood ratio test
+            (aLRT) with SH-like interpretation. See Guindon et al. (2010) Syst. Biol. for details.
+            Default: 90%.
+     */
+    int aLRT_threshold;
+
+    /**
+            number of replicates, default: 1000
+     */
+    int aLRT_replicates;
+
+    /**
+            number of replicates for local bootstrap probabilities method of Adachi & Hasegawa (1996) in MOLPHY
+     */
+    int localbp_replicates;
+
+    /**
+            SSE Option
+     */
+    LikelihoodKernel SSE;
+
+    /** TRUE to not use AVX even if it is available in the CPU, default: FALSE */
+    bool lk_no_avx;
+
+    /**
+     	 	0: do not print anything
+            1: print site log-likelihood
+            2: print site log-likelihood per Gamma category
+     */
+    int print_site_lh;
+
+    /** TRUE to print site-specific rates, default: FALSE */
+    bool print_site_rate;
+
+    /* 1: print site posterior probability */
+    int print_site_posterior;
+
+    /**
+            TRUE to print tree log-likelihood
+     */
+    bool print_tree_lh;
+
+    bool print_branch_lengths;
+
+    /****** adaptive NNI search heuristic ******/
+
+    /**
+     *  Output log-likelihood
+     */
+    bool nni_lh;
+
+    /**
+     *  The number of IQP iterations before the heuristic is applied
+     */
+    int speedUpFromIter;
+
+    /**
+     *  Lambda in PhyML algorithm
+     */
+    double lambda;
+
+    /**
+     * Confidence level for the speed up heuristics
+     */
+    double speed_conf;
+
+    bool new_heuristic;
+
+    /***** WH-test (Weiss & von Haeseler 2003) *****/
+
+    /**
+            Results of Weiss & Haeseler test of model homogeneity
+     */
+    double whtest_simulations;
+    double whtest_delta;
+    double whtest_delta_quantile;
+    double whtest_p_value;
+
+
+    /**
+            bit-wise type including MCAT_LOG, MCAT_MEAN
+     */
+    int mcat_type;
+
+    /**
+            initial rate file in format:
+            Site Rate
+            1  f_1
+            2  f_2
+            ...
+     */
+    char *rate_file;
+
+    /***** NGS stuffs   ********/
+
+    /**
+            next-generation sequencing input file for Fritz project
+     */
+    char *ngs_file;
+
+    /**
+            next-generation sequencing input file containing mapped reads to the reference genome
+     */
+    char *ngs_mapped_reads;
+
+    bool ngs_ignore_gaps;
+
+    bool do_pars_multistate;
+
+    /**
+            File containing p-values of the genes, for GSS project with Roland
+     */
+    char *gene_pvalue_file;
+
+    /**
+            scaling factor for the p-values
+     */
+    double gene_scale_factor;
+
+    /**
+            transforming pvalues to logarithms
+     */
+    bool gene_pvalue_loga;
+
+    /***** variables for reading NCBI taxonomy tree *******/
+
+    /**
+            NCBI taxonomy ID, for processing nodes.dmp file
+     */
+    int ncbi_taxid;
+
+    /**
+            NCBI taxon rank, restricting the tree to that rank
+     */
+    const char *ncbi_taxon_level;
+
+    /**
+            rank to ignore, e.g., "no rank", branch length to such node will be set to zero
+     */
+    const char *ncbi_ignore_level;
+
+    /**
+            typically names.dmp from NCBI
+     */
+    const char *ncbi_names_file;
+
+    /**********************************************/
+    /******* variables for ECOpd analysis *********/
+
+	/**
+		eco_dag_file - contains the food web in matrix form (n species, nxn matrix), 0 for no connection, 1 for predation of j predator on i prey
+	*/
+	char *eco_dag_file;
+
+    /**
+		eco_detail_file - contains IDs of species present in the final set and/or species absent in the TREE or SPLIT system, but present in the food web
+	*/
+	const char *eco_detail_file;
+
+	/*
+	 * the type of the phylo input - tree or network
+	 */
+	const char *eco_type;
+
+	/*
+		k% - percent of species to be conserved
+	 */
+	int k_percent;
+
+    /*
+		diet - percent of species diet to be preserved for species survival
+	*/
+	int diet_min;
+	int diet_max;
+	int diet_step;
+
+    /*
+		eco_run - run number, used when random branch length is assigned to the edges of an input tree
+	*/
+	int eco_run;
+
+    /*
+		eco_weighted - indicates whether to treat the food web as weighted or not weighted
+	*/
+	bool eco_weighted;
+
+    /**********************************************/
+    /****** variables for upper bound tests *******/
+	bool upper_bound;
+	bool upper_bound_NNI;
+	/*
+	 * fraction of current likelihood by which UB will be increased.
+	 * if UBincreased < L, ignore corresponding NNI
+	 */
+	double upper_bound_frac;
+
+
+    /**********************************************/
+    /**** variables for ultra-fast bootstrap ******/
+
+    /**
+            number of replicates for guided bootstrap
+     */
+    int gbo_replicates;
+
+	/* interval (l-epsilon,l+epsilon) indicates tie for bootstrap tree
+	 * in this case, one tree is picked up at random
+	 */
+	double ufboot_epsilon;
+
+    /**
+            TRUE to check with different max_candidate_trees
+     */
+    int check_gbo_sample_size;
+
+    /**
+            TRUE to use RELL method of Shimodaira & Hasegawa, FALSE otherwise
+     */
+    bool use_rell_method;
+
+    /**
+            TRUE to use ELW method of Strimmer & Rambaut for new bootstrap, FALSE otherwise
+     */
+    bool use_elw_method;
+
+    /**
+            TRUE to weight each bootstrap sample by its probability, FALSE otherwise
+     */
+    bool use_weighted_bootstrap;
+
+    /**
+            TRUE to use the single ML tree per bootstrap, FALSE to include several sub-optima
+     */
+    bool use_max_tree_per_bootstrap;
+
+    /** maximum number of candidate trees to consider for new bootstrap */
+    int max_candidate_trees;
+
+    /** TRUE if user_file contains topologically distinct trees */
+    bool distinct_trees;
+
+    /** NEW: TRUE to update bootstrap trees during the search (do not store treels_ptnlh).
+            FALSE to call runGuidedBootstrap() at the end */
+    bool online_bootstrap;
+
+    /** minimal correlation coefficient for bootstrap stopping rule */
+    double min_correlation;
+
+    /** number of iterations between bootstrap stopping rule check */
+    int step_iterations;
+
+    /** TRUE to store all candidate trees in memory */
+    bool store_candidate_trees;
+
+	/** true to print all UFBoot trees to a file */
+	bool print_ufboot_trees;
+
+    /****** variables for NNI cutoff heuristics ******/
+
+    /**
+            TRUE to empirically estimate nni_cutoff
+     */
+    bool estimate_nni_cutoff;
+
+    /**
+            logl difference with zero-branch tree, to cutoff before evaluating NNI
+     */
+    double nni_cutoff;
+
+    /**
+            sort the NNI before evaluating
+     */
+    bool nni_sort;
+
+    /**
+            Obsolete: TRUE to optimize 5 branches around NNI
+     */
+    //bool nni_opt_5branches;
+
+    /** print some output info for NNI */
+    bool testNNI;
+
+    /** TRUE to do approximate NNIs with approximate branch lengths before a normal NNI */
+    bool approximate_nni;
+
+
+    /** TRUE to compress big file using zlib */
+    bool do_compression;
+
+    /**
+            number of bootstrap samples for AvH curiosity
+     */
+    int avh_test;
+
+    /**
+            number of bootstrap samples for Arndt's bootstrap plot
+     */
+    int bootlh_test;
+
+    /**
+            partition definition for Arndt's bootstrap plot
+     */
+    char* bootlh_partitions;
+
+    /** precision when printing out for floating-point number */
+    int numeric_precision;
+
+    /** file containing state-frequencies per site for site-specific state frequency model
+     * each line has n+1 entries (n=number of states):
+     * site_ID state1_freq state2_freq ... staten_freq
+     * where site_ID is from 1 to m (m=number of sites)
+     */
+    char *site_freq_file;
+
+    /** number of threads for OpenMP version     */
+    int num_threads;
+
+    /** either MTC_AIC, MTC_AICc, MTC_BIC */
+    ModelTestCriterion model_test_criterion;
+
+    /** either MTC_AIC, MTC_AICc, MTC_BIC, or MTC_ALL to stop +R increasing categories */
+    ModelTestCriterion model_test_stop_rule;
+
+    /** sample size for AICc and BIC */
+    int model_test_sample_size;
+
+    /** root state, for Tina's zombie domain */
+    char *root_state;
+
+	/**
+	 * TRUE to print bootstrap alignments, default: false
+	 */
+	bool print_bootaln;
+
+	/** true to print sub alignments of super alignment, default: false */
+	bool print_subaln;
+
+	/** print partition information */
+	bool print_partition_info;
+
+	/** TRUE to print concatenated alignment, default: false */
+	bool print_conaln;
+
+	/** true to count all distinct trees visited during tree search */
+	bool count_trees;
+
+	/* -1 (auto-detect): will be set to 0 if there is enough memory, 1 otherwise
+	 * 0: store all partial likelihood vectors
+	 * 1: only store 1 partial likelihood vector per node */
+	LhMemSave lh_mem_save;
+
+	/* TRUE to print .splits file in star-dot format */
+	bool print_splits_file;
+    
+    /** TRUE (default) to ignore identical sequences and add them back at the end */
+    bool ignore_identical_seqs;
+
+    /** TRUE to write initial tree to a file (default: false) */
+    bool write_init_tree;
+
+    /** frequencies of const patterns to be inserted into alignment */
+    char *freq_const_patterns;
+
+    /** BQM 2015-02-25: true to NOT rescale Gamma+Invar rates by (1-p_invar) */
+    bool no_rescale_gamma_invar;
+
+    /** true to compute sequence identity along tree */
+    bool compute_seq_identity_along_tree;
+};
+
+/**
+        related measures for PD
+ */
+struct PDRelatedMeasures {
+    /**
+            names of the areas (presumably index-aligned with the score vectors below -- TODO confirm)
+     */
+    vector<string> setName;
+
+    /**
+            PD score of each area
+     */
+    DoubleVector PDScore;
+
+    /**
+            exclusive PD score of each area
+     */
+    DoubleVector exclusivePD;
+
+    /**
+            PD-endemism score of each area, given all other areas
+     */
+    DoubleVector PDEndemism;
+
+    /**
+            PD-complementarity score of each area, given some provided area
+     */
+    DoubleVector PDComplementarity;
+
+};
+
+
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+        @return TRUE if ch compares <= 32, i.e. an ASCII control character or the space character (where plain char is signed, negative values also qualify)
+ */
+inline bool controlchar(char ch) {
+    return ch <= 32;
+}
+
+inline bool is_newick_token(char ch) { // TRUE if ch is one of the seven characters  : ; , ) ( [ ]
+    return ch == ':' || ch == ';' || ch == ',' || ch == ')' || ch == '(' || ch == '[' || ch == ']';
+}
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+        print error message then exit program
+ */
+//void outError(char *error);
+
+/**
+        print error message then exit program
+ */
+void outError(const char *error, bool quit = true);
+
+/**
+        print error message then exit program
+ */
+void outError(string error, bool quit = true);
+
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+        print double error messages then exit program
+ */
+void outError(const char *error, const char *msg, bool quit = true);
+
+/**
+        print double error messages then exit program
+ */
+void outError(const char *error, string msg, bool quit = true);
+
+/**
+        Output a warning message to screen
+        @param warn warning message
+ */
+void outWarning(const char *warn);
+void outWarning(string warn);
+
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+
+/**
+        generate a random branch length under an exponential distribution
+        with mean params.mean_len. Also make sure that the resulting
+        length is in the range (params.min_len, params.max_len)
+        @return the random branch length
+ */
+double randomLen(Params &params);
+
+/**
+        convert string to int, with error checking
+        @param str original string
+        @return the integer
+ */
+/**
+        Compute the logarithm of the factorial of an integer number
+        @param num: the number
+        @return logarithm of (num! = 1*2*...*num)
+ */
+double logFac(const int num);
+
+/**
+ * Function to randomly select an element in a C++ container
+ *
+ * @param begin
+ * @param end
+ * @return
+ */
+template <typename I>
+I random_element(I begin, I end);
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+/*
+        Error message texts. Those ending in a blank (ERR_NO_TAXON, ERR_NO_AREA, ERR_NO_ROOT, ERR_WRITE_OUTPUT) look like prefixes to be followed by a name -- presumably appended by the caller, TODO confirm
+ */
+const char ERR_NO_TAXON[] = "Find no taxon with name ";
+const char ERR_NO_AREA[] = "Find no area with name ";
+const char ERR_NO_ROOT[] = "Root taxon not found: ";
+const char ERR_ROOT_NET[] = "-root option is not available for network";
+const char ERR_CONFLICT_ROOT[] = "Tree is already rooted, -o <taxon> is not allowed.";
+const char ERR_DUPLICATED_TAXA[] = "Duplicated taxa name in the tree.";
+const char ERR_FEW_TAXA[] = "Number of taxa must be greater than 2.";
+const char ERR_NO_SPLITS[] = "No splits found!";
+const char ERR_FEW_SPLITS[] = "Number of splits must be at least equal to the number of taxa";
+const char ERR_NEG_BRANCH[] = "Negative branch length not allowed.";
+const char ERR_NO_MEMORY[] = "Not enough memory!";
+
+const char ERR_READ_INPUT[] = "File not found or incorrect input, pls check it again.";
+const char ERR_UNEXPECTED_EOF[] = "Unexpected end of file.";
+const char ERR_READ_ANY[] = "Unidentified error while reading file, pls check it carefully again.";
+const char ERR_WRITE_OUTPUT[] = "Cannot write to file ";
+
+const char ERR_NO_K[] = "You must specify the number of taxa in the PD set.";
+const char ERR_TOO_SMALL_K[] = "Size of PD-set must be at least the size of initial set.";
+const char ERR_NO_BUDGET[] = "Total budget is not specified or less than zero.";
+const char ERR_TOO_SMALL_BUDGET[] = "Not enough budget to conserve the inital set of taxa.";
+
+const char ERR_INTERNAL[] = "Internal error, pls contact authors!";
+
+/*--------------------------------------------------------------*/
+/*--------------------------------------------------------------*/
+
+/**
+ * convert int to string
+ * @param number the number to convert
+ * @return string
+ */
+string convertIntToString(int number);
+string convertInt64ToString(int64_t number);
+
+string convertDoubleToString(double number);
+
+/**
+ *
+ * @param SRC
+ * @param DEST
+ * @return bool
+ */
+bool copyFile(const char SRC[], const char DEST[]);
+
+/**
+ * Check if the file exists
+ * @param strFilename
+ * @return
+ */
+bool fileExists(string strFilename);
+
+/**
+        convert string to int, with error checking
+        @param str original string
+        @return the number
+ */
+int convert_int(const char *str) throw (string);
+
+/**
+        convert string to int, with error checking
+        @param str original string
+        @param end_pos end position
+        @return the number
+ */
+int convert_int(const char *str, int &end_pos) throw (string);
+
+/**
+        convert comma-separated string to integer vector, with error checking
+        @param str original string with integers separated by comma
+        @param vec (OUT) integer vector
+ */
+void convert_int_vec(const char *str, IntVector &vec) throw (string);
+
+/**
+        convert string to double, with error checking
+        @param str original string
+        @return the double
+ */
+double convert_double(const char *str) throw (string);
+
+/**
+        convert string to double, with error checking
+        @param str original string
+        @param end_pos end position
+        @return the double
+ */
+double convert_double(const char *str, int &end_pos) throw (string);
+
+/**
+        convert comma-separated string to double vector, with error checking
+        @param str original string with numbers separated by comma
+        @param vec (OUT) double vector
+ */
+void convert_double_vec(const char *str, DoubleVector &vec) throw (string);
+
+/**
+ * Convert seconds to hour, minute, second
+ * @param sec
+ * @return string represent hour, minute, second
+ */
+string convert_time(const double sec);
+
+
+/**
+        convert a string to a range lower:upper:step_size with error checking
+        @param str original string
+        @param lower (OUT) lower bound of the range
+        @param upper (OUT) upper bound of the range
+        @param step_size (OUT) step size of the range
+ */
+void convert_range(const char *str, int &lower, int &upper, int &step_size) throw (string);
+
+/**
+        convert a string to a range lower:upper:step_size with error checking
+        @param str original string
+        @param lower (OUT) lower bound of the range
+        @param upper (OUT) upper bound of the range
+        @param step_size (OUT) step size of the range
+ */
+void convert_range(const char *str, double &lower, double &upper, double &step_size) throw (string);
+
+void convert_string_vec(const char *str, StrVector &str_vec) throw (string);
+
+/**
+        read the file containing branch/split scaling factor and taxa weights
+        @param params program parameters
+        @param ntaxa total number of taxa
+        @param scale (OUT) scaling factor
+        @param tax_name (OUT) vector of taxa names
+        @param tax_weight (OUT) vector of corresponding taxa weights
+ */
+void readWeightFile(Params &params, int ntaxa, double &scale, StrVector &tax_name, DoubleVector &tax_weight);
+
+/**
+        read the initial taxa set from the file
+        @param params program parameters
+        @param ntaxa number of taxa
+        @param tax_name (OUT) vector of taxa names
+ */
+void readInitTaxaFile(Params &params, int ntaxa, StrVector &tax_name);
+
+/**
+        read the initial area set from the file
+        @param params program parameters
+        @param nareas number of areas
+        @param area_name (OUT) vector of area names
+ */
+void readInitAreaFile(Params &params, int nareas, StrVector &area_name);
+
+
+/**
+        read a list of taxa set from a file, not in nexus format but as follows:
+        n1
+        tax-name-1
+        ...
+        tax-name-n1
+
+        n2
+        tax-name-1
+        ...
+        tax-name-n2
+        ....
+
+        @param filename file name
+        @param sets (OUT) the returned sets of taxa
+ */
+void readTaxaSets(char *filename, MSetsBlock *sets);
+
+/**
+        read areas shared boundary file, in form of a standard distance matrix
+        @param file_name file name
+        @param areas the read sets block
+        @param areas_shared_boundary (OUT) shared boundary length between areas.
+                Diagonal elements represent the boundary length of single areas
+ */
+void readAreasBoundary(char *file_name, MSetsBlock *areas, double *areas_shared_boundary);
+
+/**
+        parse program argument into params
+        @param argc number of arguments
+        @param argv list of arguments
+        @param params (OUT) program parameters
+ */
+void parseArg(int argc, char *argv[], Params &params);
+
+/**
+        detect the format of input file
+        @param input_file file name
+        @return
+                IN_NEWICK if file in newick format,
+                IN_NEXUS if in nexus format,
+                IN_FASTA if in fasta format,
+                IN_PHYLIP if in phylip format,
+                IN_OTHER if file format unknown.
+ */
+InputType detectInputFile(char *input_file);
+
+/**
+        if file exists, ask user to overwrite it or not
+        @param filename file name
+        @return TRUE if agree to overwrite an existing file, or simply file does not exist
+ */
+bool overwriteFile(char *filename);
+
+/**
+        print usage information
+        @param argv program arguments list
+        @param full_command TRUE to print all available commands, FALSE to print normal usage dialog
+ */
+void usage(char* argv[], bool full_command);
+
+/**
+ *   Print a string into a file
+ */
+void printString2File(string myString, string filename);
+
+/**
+ * print usage for iq-tree
+ * @param argv program arguments list
+ * @param full_command TRUE to print all available commands, FALSE to print normal usage dialog
+ */
+void usage_iqtree(char* argv[], bool full_command);
+
+/**
+        parse area name string, where names are separated by commas
+        @param area_names a string of name
+        @param areas (OUT) a set of name string
+ */
+void parseAreaName(char *area_names, set<string> &areas);
+
+/**
+ * generate 2 different random integer numbers smaller than a specific integer threshold
+ * @param size integer threshold
+ * @param first (OUT) first random integer number
+ * @param second (OUT) second random integer number
+ */
+void get2RandNumb(const int size, int &first, int &second);
+
+/*
+inline double getCPUTime(clock_t startTime) {
+        return double(clock() - startTime) / CLOCKS_PER_SEC;
+}*/
+
+
+/**
+ *  Fills the range [first, last) with sequentially increasing values,
+ *  starting with value and repetitively evaluating ++value.
+ *  Introduced in C++11 --> this is a reimplementation
+ */
+template<class ForwardIterator, class T>
+void iota( ForwardIterator first, ForwardIterator last, T value );
+
+/**
+        compute p-value for a chi-square value
+        @param x chi-square value
+        @param df degree of freedom
+        @return p-value
+ */
+double computePValueChiSquare(double x, int df);
+
+/*--------------------------------------------------------------*/
+/* random number generator */
+/*--------------------------------------------------------------*/
+
+/**
+ * initialize the random number generator
+ * @param seed seed for generator
+ */
+int init_random(int seed);
+
+/**
+ * finalize random number generator (e.g. free memory)
+ */
+int finish_random();
+
+/**
+ * returns a random integer in the range [0; n - 1]
+ * @param n upper-bound of random number
+ */
+int random_int(int n);
+
+/**
+ *  return a random integer in the range [a,b]
+ */
+//int randint(int a, int b);
+
+/**
+ * returns a random integer in the range [0; RAND_MAX - 1]
+ * = random_int(RAND_MAX)
+ */
+int random_int();
+
+/**
+ * returns a random floating-point number in the range [0; 1)
+ */
+double random_double();
+
+template <class T>
+void my_random_shuffle (T first, T last) // shuffles [first,last) in place; T must support last - first and first[i]
+{
+	int n = last - first; // number of elements in the range
+	for (int i=n-1; i>0; --i) { // walk from the back: position i is swapped with a position drawn by random_int(i+1)
+		swap (first[i],first[random_int(i+1)]);
+	}
+}
+
+/**
+ * recursive quicksort of arr[left..right] (both bounds inclusive, ascending, compares with operator< only); every swap in arr is mirrored in index
+ */
+template <class T>
+void quicksort_index(T* arr, int* index, int left, int right) {
+    int i = left, j = right, tmp2;
+    T tmp, pivot = arr[(left + right) / 2]; // pivot is a copy of the middle element
+
+    /* partition: move elements smaller than the pivot to the left, larger ones to the right */
+    while (i <= j) {
+        while (arr[i] < pivot)
+            i++;
+        while (pivot < arr[j])
+            j--;
+        if (i <= j) {
+            tmp = arr[i];
+            arr[i] = arr[j];
+            arr[j] = tmp;
+            tmp2 = index[i]; // keep index[] in step with arr[]
+            index[i] = index[j];
+            index[j] = tmp2;
+            i++;
+            j--;
+        }
+    };
+
+    /* recursion into the sub-ranges that still contain more than one element */
+    if (left < j)
+        quicksort_index(arr, index, left, j);
+    if (i < right)
+        quicksort_index(arr, index, i, right);
+}
+
+/**
+ * generic function for sorting by index, preserving entries in [first,last) (the sort runs on a temporary copy)
+ * @param first first element
+ * @param last one past the last element
+ * @param index (OUT) ordered index of elements from smallest to largest; receives last - first entries
+ */
+template <class T>
+void sort_index(T* first, T* last, int *index) {
+    T* x;
+    int i;
+    T* arr = new T[last - first]; // scratch copy that gets sorted instead of the input
+    for (x = first, i = 0; x != last; x++, i++) {
+        index[i] = i; // start from the identity permutation
+        arr[i] = *x;
+    }
+    assert(last - first == i);
+    quicksort_index(arr, index, 0, (last - first) - 1); // NOTE(review): for an empty range this passes right = -1 and quicksort_index still reads arr[0]
+    delete [] arr;
+}
+
+/**
+ * print the header of summary file
+ */
+void summarizeHeader(ostream &out, Params &params, bool budget_constraint, InputType analysis_type);
+
+/**
+ * print footer of summary file
+ */
+void summarizeFooter(ostream &out, Params &params);
+
+
+/**
+    remove white space at the beginning and end of the string
+    @param str (IN/OUT) string to be trimmed
+*/
+void trimString(string &str);
+
+/**
+    get number of processor cores
+*/
+int countPhysicalCPUCores();
+
+void print_stacktrace(ostream &out, unsigned int max_frames = 63);
+
+/**
+    in-place quicksort of arr[left..right] (both bounds inclusive, ascending); when arr2 is non-NULL its elements are swapped in step with arr
+*/
+template<class T1, class T2>
+void quicksort(T1* arr, int left, int right, T2* arr2 = NULL) {
+      assert(left <= right); // an empty range is not accepted
+      int i = left, j = right;
+      T1 pivot = arr[(left + right) / 2]; // pivot is a copy of the middle element
+
+      /* partition: move elements smaller than the pivot to the left, larger ones to the right */
+      while (i <= j) {
+            while (arr[i] < pivot)
+                  i++;
+            while (arr[j] > pivot)
+                  j--;
+            if (i <= j) {
+                  T1 tmp = arr[i];
+                  arr[i] = arr[j];
+                  arr[j] = tmp;
+                  if (arr2) { // keep the companion array in step with arr
+                      T2 tmp2 = arr2[i];
+                      arr2[i] = arr2[j];
+                      arr2[j] = tmp2;
+                  }
+                  i++;
+                  j--;
+            }
+      };
+
+      /* recursion into the sub-ranges that still contain more than one element */
+      if (left < j)
+            quicksort(arr, left, j, arr2);
+      if (i < right)
+            quicksort(arr, i, right, arr2);
+}
+
+/* An optimized version of Cédric Lauradoux's 64-bit merging3 algorithm
+   implemented by Kim Walisch, see:
+   http://code.google.com/p/primesieve/source/browse/trunk/src/soe/bithacks.h
+   Modified ever so slightly to maintain the same API. Note that
+   it assumes the buffer is a multiple of 64 bits in length: n counts int-sized elements of buf and any remainder after converting to 64-bit words is not counted.
+*/
+inline uint32_t popcount_lauradoux(unsigned *buf, int n) {
+  const uint64_t* data = (uint64_t*) buf; // the buffer is read as 64-bit words
+  uint32_t size = n/(sizeof(uint64_t)/sizeof(int)); // number of 64-bit words to process
+  const uint64_t m1  = (0x5555555555555555ULL);
+  const uint64_t m2  = (0x3333333333333333ULL);
+  const uint64_t m4  = (0x0F0F0F0F0F0F0F0FULL);
+  const uint64_t m8  = (0x00FF00FF00FF00FFULL);
+  const uint64_t m16 = (0x0000FFFF0000FFFFULL);
+  const uint64_t h01 = (0x0101010101010101ULL);
+
+  uint32_t bitCount = 0;
+  uint32_t i, j;
+  uint64_t count1, count2, half1, half2, acc;
+  uint64_t x;
+  uint32_t limit30 = size - size % 30; // largest multiple of 30 not exceeding size
+
+  // 64-bit tree merging (merging3): blocks of 30 words, handled three words at a time
+  for (i = 0; i < limit30; i += 30, data += 30) {
+    acc = 0;
+    for (j = 0; j < 30; j += 3) {
+      count1  =  data[j];
+      count2  =  data[j+1];
+      half1   =  data[j+2];
+      half2   =  data[j+2];
+      half1  &=  m1;
+      half2   = (half2  >> 1) & m1;
+      count1 -= (count1 >> 1) & m1;
+      count2 -= (count2 >> 1) & m1;
+      count1 +=  half1;
+      count2 +=  half2;
+      count1  = (count1 & m2) + ((count1 >> 2) & m2);
+      count1 += (count2 & m2) + ((count2 >> 2) & m2);
+      acc    += (count1 & m4) + ((count1 >> 4) & m4);
+    }
+    acc = (acc & m8) + ((acc >>  8)  & m8);
+    acc = (acc       +  (acc >> 16)) & m16;
+    acc =  acc       +  (acc >> 32);
+    bitCount += (uint32_t)acc;
+  }
+
+  // count the bits of the remaining 64-bit words (at most 29, i.e. MAX 29*8 bytes) using
+  // "Counting bits set, in parallel" from the "Bit Twiddling Hacks",
+  // the code uses wikipedia's 64-bit popcount_3() implementation:
+  // http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
+  for (i = 0; i < size - limit30; i++) { // data already points past the full 30-word blocks
+    x = data[i];
+    x =  x       - ((x >> 1)  & m1);
+    x = (x & m2) + ((x >> 2)  & m2);
+    x = (x       +  (x >> 4)) & m4;
+    bitCount += (uint32_t)((x * h01) >> 56);
+  }
+  return bitCount;
+}
+
+#endif
diff --git a/upperbounds.cpp b/upperbounds.cpp
new file mode 100644
index 0000000..c66fea9
--- /dev/null
+++ b/upperbounds.cpp
@@ -0,0 +1,907 @@
+/*
+ * upperbounds.cpp
+ *
+ *  Created on: Aug 13, 2014
+ *      Author: olga
+ */
+#include "upperbounds.h"
+#include "phylonode.h"
+#include <string.h>
+#include "timeutil.h"
+
+/**
+ * Entry point of the Upper Bounds analysis.
+ *
+ * Opens the four result files (append mode, exceptions enabled on
+ * failbit/badbit), then visits every branch A|B of the input tree. Branches
+ * with 3 or fewer taxa on either side are skipped. For the others:
+ *   - TEST 1 (enabled):  sumFraction() is called on the two end nodes;
+ *   - TEST 2 (disabled): the subtrees T_A and T_B are extracted and one line is
+ *     appended to "results.trueSplits.ub".
+ *
+ * Output files:
+ *   out         -> "results.trueSplits.ub"        -> upper bounds for all splits from an input tree
+ *   out_split   -> "results.trueSplits.ub.splits" -> list of all splits from an input tree
+ *   out_within  -> "results.within.ub"            -> comparison of upper bounds within  Split Families
+ *   out_between -> "results.between.ub"           -> comparison of upper bounds between Split Families
+ * Only `out` is written by the code below; the other three are opened and closed.
+ *
+ * Columns written by TEST 2 (tab separated):
+ *   leafNum, getNSite(), min(|A|,|B|), brLen, L(A|B), L(A)+L(B), UB_true, UB_true2,
+ *   L(A|B)/(L(A)+L(B)), L(A|B)/UB_true, L(A|B)/UB_true2, coef, coef2
+ *
+ * @param params     program parameters (out_prefix is read, but the file names are fixed)
+ * @param alignment  not used in this function
+ * @param tree       input tree whose branches are analysed
+ */
+void UpperBounds(Params *params, Alignment* alignment, IQTree* tree){
+
+	const std::ios_base::openmode appendMode = std::ofstream::out | std::ofstream::app;
+
+	// UpperBounds file (fixed name; the prefix based name "<prefix>.ub" is not used)
+	string ubFileName = params->out_prefix;
+	ubFileName = "results.trueSplits.ub";
+	ofstream out;
+	out.exceptions(ios::failbit | ios::badbit);
+	out.open(ubFileName.c_str(), appendMode);
+
+	// Details on split A|B
+	string splitFileName = params->out_prefix;
+	splitFileName = "results.trueSplits.ub.splits";
+	ofstream out_split;
+	out_split.exceptions(ios::failbit | ios::badbit);
+	out_split.open(splitFileName.c_str(), appendMode);
+
+	// Within family info
+	string withinFileName = params->out_prefix;
+	withinFileName = "results.within.ub";
+	ofstream out_within;
+	out_within.exceptions(ios::failbit | ios::badbit);
+	out_within.open(withinFileName.c_str(), appendMode);
+
+	// Between families info
+	string betweenFileName = params->out_prefix;
+	betweenFileName = "results.between.ub";
+	ofstream out_between;
+	out_between.exceptions(ios::failbit | ios::badbit);
+	out_between.open(betweenFileName.c_str(), appendMode);
+
+	cout<<"Starting Upper Bounds analysis.."<<endl;
+
+	NodeVector branch1, branch2;
+	tree->getBranches(branch1, branch2);
+	int allSplits = 0;	// number of analysed splits; counted only, never reported
+
+	// Switches of the two experiments.
+	const bool runFractionTest = true;	// TEST 1: [ai/(ai+bi)] and [bi/(ai+bi)] fractions
+	const bool runSubtreeTest  = false;	// TEST 2: upper bounds for each split of the input tree
+
+	// A loop over all A|B present on tree T
+	for (size_t b = 0; b != branch1.size(); ++b) {
+		vector<int> taxaA, taxaB;
+		tree->getTaxaID(taxaA,branch1[b],branch2[b]);
+		tree->getTaxaID(taxaB,branch2[b],branch1[b]);
+
+		// IQTree does not compute lh of tree with less than 4 taxa.
+		if (taxaA.size() <= 3 || taxaB.size() <= 3)
+			continue;
+
+		if (runFractionTest) {
+			allSplits++;
+			sumFraction(((PhyloNode*) branch1[b]), ((PhyloNode*) branch2[b]), tree);
+		}
+
+		if (runSubtreeTest) {
+			allSplits++;
+
+			// Subtrees T_A and T_B with their likelihoods
+			PhyloTree *treeA = extractSubtreeUB(taxaA,tree,params,1);
+			PhyloTree *treeB = extractSubtreeUB(taxaB,tree,params,1);
+
+			// Upper bound for the given split from the input tree
+			double brLen = branch1[b]->findNeighbor(branch2[b])->length;
+			double coef  = tree->aln->getNSite()*(log(1+3*exp(-brLen)) - log(1-exp(-brLen)));
+			double coef2  = tree->aln->getNSite()*log(1+3*exp(-brLen));
+			double UB_true  = coef + treeA->getCurScore() + treeB->getCurScore();
+			double UB_true2 = coef2 + treeA->getCurScore() + treeB->getCurScore();
+
+			out<<tree->leafNum<<"\t"<<tree->aln->getNSite()<<"\t"<<min(taxaA.size(),taxaB.size())<<"\t"<<brLen<<"\t"
+					<<tree->getCurScore()<<"\t"<<treeA->getCurScore() + treeB->getCurScore()<<"\t"<<UB_true<<"\t"<<UB_true2<<"\t"
+					<<tree->getCurScore()/(treeA->getCurScore() + treeB->getCurScore())<<"\t"
+					<<tree->getCurScore()/UB_true<<"\t"<<tree->getCurScore()/UB_true2<<"\t"<<coef<<"\t"<<coef2<<endl;
+		}
+	}
+	// END: the loop over all A|B present on tree T
+
+	out_within.close();
+	out_between.close();
+	out.close();
+	out_split.close();
+}
+
+/**
+ * Build the subtree of `tree` that is spanned by the taxa listed in `ids`,
+ * together with the matching sub-alignment.
+ *
+ * @param ids     ids of the taxa to keep; each id is used as an index into a
+ *                mask of tree->leafNum entries
+ * @param tree    parent tree; it is cast to PhyloTree to reach its alignment,
+ *                model, rate and model factory
+ * @param params  not used in this function
+ * @param sw      if 1, the parent's model, rate and model factory are set on the
+ *                copy, partial likelihoods are initialised and the likelihood of
+ *                the copy is computed and stored as its current score
+ * @return        newly allocated subtree (the caller owns it)
+ */
+PhyloTree* extractSubtreeUB(IntVector &ids, MTree* tree, Params *params, int sw) {
+	PhyloTree *parent = (PhyloTree*)tree;
+
+	// Mask over the leaves of the parent tree: 1 marks a taxon that is kept.
+	string taxa_set;
+	if (tree->leafNum > 0)
+		taxa_set.assign(tree->leafNum, '\0');
+	for (IntVector::iterator id = ids.begin(); id != ids.end(); ++id)
+		taxa_set[*id] = 1;
+
+	PhyloTree *subtree = new PhyloTree(); // this will be a new subtree
+	Alignment *subAln = new Alignment();
+	subAln->extractSubAlignment(parent->aln,ids,0);
+
+	subtree->copyTree(tree, taxa_set);
+	subtree->setAlignment(subAln);
+
+	if (sw != 1)
+		return subtree;
+
+	// Likelihood of the subtree under the parent's model settings.
+	subtree->setModel(parent->getModel());
+	subtree->setRate(parent->getRate());
+	subtree->setModelFactory(parent->getModelFactory());
+	subtree->initializeAllPartialLh();
+	subtree->setCurScore(subtree->computeLikelihood());
+	return subtree;
+}
+
+/**
+ * Debug print of a tree to cout: first all taxa, then all internal nodes.
+ * Each node is printed as "name:id(" followed by "name:id[length]," for every
+ * neighbor and a closing ")".
+ */
+void printTreeUB(MTree *tree){
+	NodeVector leaves;
+	tree->getTaxa(leaves);
+	cout<<"Taxa nodes:"<<endl;
+	for (size_t k = 0; k < leaves.size(); ++k) {
+		Node *leaf = leaves[k];
+		cout<<leaf->name<<":"<<leaf->id<<"(";
+		for (size_t n = 0; n < leaf->neighbors.size(); ++n)
+			cout<<leaf->neighbors[n]->node->name<<":"<<leaf->neighbors[n]->node->id<<"["<<leaf->neighbors[n]->length<<"]"<<",";
+		cout<<")"<<endl;
+	}
+
+	NodeVector inner;
+	tree->getInternalNodes(inner);
+	cout<<"Internal nodes:"<<endl;
+	if (inner.size() == 0) {
+		cout<<"no internal nodes"<<endl;
+		return;
+	}
+	for (size_t k = 0; k < inner.size(); ++k) {
+		Node *in = inner[k];
+		cout<<in->name<<":"<<in->id<<"(";
+		for (size_t n = 0; n < in->neighbors.size(); ++n)
+			cout<<in->neighbors[n]->node->name<<":"<<in->neighbors[n]->node->id<<"["<<in->neighbors[n]->length<<"]"<<",";
+		cout<<")"<<endl;
+	}
+}
+
+/**
+ * Create a new node in treeR and connect it to `parent` by a branch of random
+ * length (both directions get the same length). Returns the new node.
+ */
+static Node* attachRandomLeafUB(MExtTree *treeR, Node *parent, Params &params){
+	Node *leaf = treeR->newNode();
+	double len = randomLen(params);
+	parent->addNeighbor(leaf, len);
+	leaf->addNeighbor(parent, len);
+	return leaf;
+}
+
+/**
+ * Generate a random tree with as many leaves as `tree` has (tree->leafNum):
+ * start from a star with 3 leaves and repeatedly turn a randomly chosen leaf
+ * into an internal node with two new leaves. Branch lengths come from
+ * randomLen(). The taxa of the result are named after the sequences of
+ * tree->aln. outError(ERR_FEW_TAXA) is called for fewer than 3 leaves.
+ *
+ * @return newly allocated MExtTree, returned as MTree*
+ */
+MTree* generateRandomYH_UB(Params &params, PhyloTree *tree){
+	MExtTree* treeR = new MExtTree();
+	const bool binary = TRUE;
+
+	const int size = tree->leafNum;
+	if (size < 3)
+		outError(ERR_FEW_TAXA);
+
+	treeR->root = treeR->newNode();
+	NodeVector myleaves;
+	NodeVector innodes;
+	innodes.push_back(treeR->root);
+
+	// create initial tree with 3 leaves
+	for (int k = 0; k < 3; k++)
+		myleaves.push_back(attachRandomLeafUB(treeR, treeR->root, params));
+
+	// additionally add a leaf
+	for (int k = 3; k < size; k++) {
+		const int index = binary ? random_int(k) : random_int(k + innodes.size());
+
+		if (index >= k) {
+			// an internal node was drawn: add only 1 new leaf to it
+			myleaves.push_back(attachRandomLeafUB(treeR, innodes[index-k], params));
+			continue;
+		}
+
+		// a leaf was drawn: it becomes internal and gets two new leaves,
+		// the first one takes over its slot in the list of leaves
+		Node *grown = myleaves[index];
+		innodes.push_back(grown);
+		myleaves[index] = attachRandomLeafUB(treeR, grown, params);
+		myleaves.push_back(attachRandomLeafUB(treeR, grown, params));
+	}
+
+	treeR->root = myleaves[0];
+	// indexing the leaves
+	treeR->setLeavesName(myleaves);
+	treeR->leafNum = myleaves.size();
+	treeR->nodeNum = treeR->leafNum;
+	treeR->initializeTree();
+
+	// rename the taxa after the sequences of the alignment
+	NodeVector taxa;
+	treeR->getTaxa(taxa);
+	assert(taxa.size() == size);
+	for (NodeVector::iterator leaf = taxa.begin(); leaf != taxa.end(); leaf++)
+		(*leaf)->name = tree->aln->getSeqName((*leaf)->id);
+
+	return (MTree*)treeR;
+}
+
+/**
+ * Generate a random tree that contains the split A|B and return an upper bound for it.
+ *
+ * Two random trees are generated for the taxa of treeAorgn and treeBorgn, each
+ * is extended by one extra node (extendingTree) which becomes its root, and the
+ * two roots are joined by a branch. The joined tree gets the alignment, model,
+ * rate and model factory of treeORGN and its likelihood is computed.
+ *
+ * @param treeORGN   original tree (source of alignment and model settings)
+ * @param treeAorgn  subtree on the A side (only its leaf number and alignment names are used by the generator)
+ * @param treeBorgn  subtree on the B side
+ * @param taxaA      taxon ids of A
+ * @param taxaB      taxon ids of B
+ * @param params     program parameters
+ * @param brLen      length of the branch A|B; if 0 a random length is drawn
+ * @return           nSite*(log(1+3e^-len) - log(1-e^-len)) + UpperBoundAB(taxaA, taxaB, ...)
+ *
+ * NOTE(review): the trees created here (tree, treeA, treeB) are never released.
+ */
+double RandomTreeAB(PhyloTree* treeORGN, PhyloTree* treeAorgn, PhyloTree* treeBorgn, IntVector &taxaA, IntVector &taxaB, Params* params, double brLen){
+	PhyloTree *tree  = new PhyloTree();
+
+	// generateRandomYH_UB() allocates the trees itself; pre-allocating
+	// MTree objects here only leaked them.
+	MTree *treeA = generateRandomYH_UB(*params,treeAorgn);
+	MTree *treeB = generateRandomYH_UB(*params,treeBorgn);
+
+	// each subtree gets one extra node on a random branch, which becomes its root
+	extendingTree(treeA,params);
+	extendingTree(treeB,params);
+
+	// join the two roots by a (so far zero length) branch
+	treeA->root->name = "NewNodeA";
+	treeB->root->name = "NewNodeB";
+	treeA->root->addNeighbor(treeB->root,0.0,tree->branchNum);
+	treeB->root->addNeighbor(treeA->root,0.0,tree->branchNum);
+
+	tree->copyTree(treeA);
+
+	// end nodes of the branch A|B in the copied tree
+	NodeVector brID;
+	//brID= getBranchABid(brLen, tree);
+	brID.push_back(tree->findNodeName(treeA->root->name));
+	brID.push_back(tree->findNodeName(treeB->root->name));
+
+	if(brLen == 0){
+		brLen = randomLen(*params);
+	}
+
+	tree->findNodeID(brID[0]->id)->findNeighbor(brID[1])->length = brLen;
+	tree->findNodeID(brID[1]->id)->findNeighbor(brID[0])->length = brLen;
+
+	tree->setAlignment(treeORGN->aln);
+	tree->setModel(((PhyloTree*)treeORGN)->getModel());
+	tree->setRate(((PhyloTree*)treeORGN)->getRate());
+	tree->setModelFactory(((PhyloTree*)treeORGN)->getModelFactory());
+	tree->initializeAllPartialLh();
+
+	tree->setCurScore(tree->computeLikelihood());
+	tree->params = params;
+	// branch length optimization is currently switched off:
+	//tree->curScore = tree->optimizeAllBranches(50);
+
+	double len = tree->findNodeID(brID[0]->id)->findNeighbor(brID[1])->length;
+
+	// The file is still opened (and thereby created) although the line that
+	// wrote to it is commented out.
+	string out_file = "results.branches.ub";
+	ofstream out;
+	out.exceptions(ios::failbit | ios::badbit);
+	out.open((char*)out_file.c_str(),std::ofstream::out | std::ofstream::app);
+
+	double coef = tree->aln->getNSite()*(log(1+3*exp(-len)) - log(1-exp(-len)));
+	double U = coef + UpperBoundAB(taxaA, taxaB, tree, params);
+
+	// leafNum		alnLen		brLen (before opt)		brLen (after opt)		coef 		UB
+	//out<<treeORGN->leafNum<<"\t"<<treeORGN->aln->getNSite()<<"\t"<<brLen<<"\t"<<len<<"\t"<<coef<<"\t"<<U<<endl;
+
+	out.close();
+	return U;
+}
+
+/**
+ * Sum of the current scores of the two subtrees of `tree` spanned by taxaA and
+ * taxaB (both extracted with likelihood computation switched on).
+ *
+ * NOTE(review): the two extracted subtrees are not released here.
+ */
+double UpperBoundAB(IntVector &taxaA, IntVector &taxaB, PhyloTree* tree, Params *params){
+	PhyloTree *subtreeA = extractSubtreeUB(taxaA,tree,params,1);
+	PhyloTree *subtreeB = extractSubtreeUB(taxaB,tree,params,1);
+
+	return subtreeA->getCurScore() + subtreeB->getCurScore();
+}
+
+/**
+ * Find the first branch of `tree` whose length is exactly 0.0 and return its
+ * two end nodes. If there is no such branch, outError() is called and an empty
+ * vector is returned. The brLen argument is not used.
+ */
+NodeVector getBranchABid(double brLen, PhyloTree* tree){
+	NodeVector firstEnds, secondEnds;
+	tree->getBranches(firstEnds, secondEnds);
+
+	NodeVector found;
+	for (size_t k = 0; k < firstEnds.size(); ++k) {
+		if (firstEnds[k]->findNeighbor(secondEnds[k])->length != 0.0)
+			continue;
+		found.push_back(firstEnds[k]);
+		found.push_back(secondEnds[k]);
+		return found;
+	}
+
+	outError("UpperBounds: did not find matching branch:(");
+	return found;
+}
+
+/**
+ * Insert one new node named "NewNode" on a randomly chosen branch of the tree
+ * and make it the root.
+ *
+ * A node id in the range [leafNum, nodeNum-1] is drawn, then one of its first
+ * two neighbors. The new node takes over the old branch (length and id) towards
+ * that neighbor and is connected to the drawn node by a new branch with a
+ * random length. nodeNum and branchNum are both increased by one.
+ *
+ * NOTE(review): rand() % maxR divides by zero when tree->nodeNum == 1.
+ */
+void extendingTree(MTree *tree, Params* params){
+
+	// Choose random internal node
+	int maxR = tree->nodeNum-1;
+	int randomNodeID = rand() % maxR;
+	//cout<<"randomNodeID = "<<randomNodeID<<endl;
+	// ids below leafNum are shifted upwards so that the assert below holds
+	// (presumably ids >= leafNum belong to internal nodes - TODO confirm)
+	if(randomNodeID<tree->leafNum){
+		if(randomNodeID+tree->leafNum > tree->nodeNum-1){
+			randomNodeID += tree->nodeNum - tree->leafNum;
+			//cout<<"adding nodeNum-leafNum"<<endl;
+		}
+		else{
+			randomNodeID+=tree->leafNum;
+			//cout<<"adding leafNum"<<endl;
+		}
+	}
+
+	//cout<<"leafNum = "<<tree->leafNum-1<<" < random = "<<randomNodeID<<" < nodeNum = "<<tree->nodeNum-1<<endl;
+
+	assert(randomNodeID < tree->nodeNum && randomNodeID > tree->leafNum-1);
+
+	// Choose random neighbor
+	// (only neighbors[0] and neighbors[1] can be drawn)
+	int randomNeiID = rand() % 2;
+	//cout<<"randomNeiID = "<<randomNeiID<<endl;
+
+
+	Node *randomNode;
+	randomNode = tree->findNodeID(randomNodeID);
+	Node *randomNeiNode = tree->findNodeID(randomNodeID)->neighbors[randomNeiID]->node;
+
+	// Create new node ----------------------------
+	// its id is the current nodeNum
+	string str;
+	str = "NewNode";
+	const char *ch = str.c_str();
+	Node* newNode1 = tree->newNode(tree->nodeNum,ch);
+	tree->nodeNum++;
+
+	// Add new node as a neighbor to randomNei->node
+	// the old branch (its length and id) now connects randomNeiNode and the new node
+	double len = tree->findNodeID(randomNodeID)->neighbors[randomNeiID]->length;
+	int    id  = tree->findNodeID(randomNodeID)->neighbors[randomNeiID]->id;
+
+	randomNeiNode->findNeighbor(randomNode)->node = newNode1;
+	newNode1->addNeighbor(randomNeiNode,len,id);
+
+
+	//Change randomNei with this new node for randomNode. Create new branch.
+	// the new branch gets the id branchNum and a random length
+	randomNode->neighbors[randomNeiID]->node = newNode1;
+	randomNode->neighbors[randomNeiID]->id = tree->branchNum;
+	tree->branchNum++;
+	randomNode->neighbors[randomNeiID]->length = randomLen(*params);
+
+	newNode1->addNeighbor(randomNode,randomNode->neighbors[randomNeiID]->length,randomNode->neighbors[randomNeiID]->id);
+
+	// the inserted node becomes the root of the tree
+	tree->root = newNode1;
+
+}
+
+/**
+ * Return the partial likelihood vector stored on `nei`, computing it first
+ * (with `dad` as the node that owns the neighbor) if it is not marked as computed.
+ */
+static double* partialLhForUB(PhyloTree *tree, PhyloNeighbor *nei, PhyloNode *dad){
+	if(nei->get_partial_lh_computed() == 0)
+		tree->computePartialLikelihood(nei, dad);
+	return nei->get_partial_lh();
+}
+
+/**
+ * Evaluate likelihood upper bounds of the two NNIs around the branch
+ * (node1,node2) and return the NNI with the larger bound.
+ *
+ * Correspondence of subtrees and branch lengths:
+ *
+ * Nodes, incident to node1 with corresponding branches:
+ * nniMoves[0].node1Nei_it	| t[0]
+ * node1Nei2_it				| t[1]
+ *
+ * Nodes, incident to node2 with corresponding branches:
+ * nniMoves[0].node2Nei_it	| t[2]
+ * nniMoves[1].node2Nei_it	| t[3]
+ *
+ * NNIs:
+ * nniMoves[0] -> swapping (nniMoves[0].node1Nei_it	| t[0]) with (nniMoves[0].node2Nei_it	| t[2])
+ * corresponding coef: q1 = logC(t[0]+t[3]) + logC(t[1]+t[2])
+ *
+ * nniMoves[1] -> swapping (nniMoves[1].node1Nei_it	| t[0]) with (nniMoves[1].node2Nei_it	| t[3])
+ * corresponding coef: q2 = logC(t[0]+t[2]) + logC(t[1]+t[3])
+ *
+ * Side effects: one line (score, both bounds and their differences to the
+ * score) is appended to "<out_prefix>.UB.NNI.upperBounds", and
+ * tree->skippedNNIub is increased for every bound below the current score.
+ *
+ * @param node1  first end node of the branch
+ * @param node2  second end node of the branch
+ * @param tree   tree the branch belongs to
+ * @return       the chosen NNI with newloglh set to its upper bound
+ *
+ * NOTE(review): the code assumes node1 and node2 each have exactly two other
+ * neighbors, and the per-pattern sums index the partial likelihoods as
+ * [pattern*nstates + state], i.e. without a rate category dimension - confirm.
+ */
+NNIMove getBestNNIForBranUB(PhyloNode *node1, PhyloNode *node2, PhyloTree *tree){
+
+	NNIMove nniMoves[2];
+
+	// Initialize node1 and node2 in nniMoves
+	nniMoves[0].node1 = nniMoves[1].node1 = node1;
+	nniMoves[0].node2 = nniMoves[1].node2 = node2;
+
+	// Initialize two NNIs
+	int cnt;
+	double t[4] = {0.0, 0.0, 0.0, 0.0};
+	FOR_NEIGHBOR_IT(node1, node2, node1_it) {
+			cnt = 0;
+			t[0]=(*node1_it)->length;
+			FOR_NEIGHBOR_IT(node2, node1, node2_it) {
+				// for both cnt = 0,1 this is the same neighbor of node1,
+				// which will be swapped with nei1 and nei2 of node2
+				nniMoves[cnt].node1Nei_it = node1_it;
+				nniMoves[cnt].node2Nei_it = node2_it;
+				t[cnt+2] = (*node2_it)->length;
+				cnt++;
+			}
+			break;
+	}
+
+	// The other neighbor of node1. Its branch length belongs to t[1]; indexing
+	// with cnt here would hit t[2], because cnt is 2 after the loop above.
+	NeighborVec::iterator node1Nei2_it;
+	FOR_NEIGHBOR_IT(node1, node2, node1_it){
+		if ((*node1_it)->node != (*nniMoves[0].node1Nei_it)->node){
+			t[1]=(*node1_it)->length;
+			node1Nei2_it = node1_it;
+			break;
+		}
+	}
+
+	double L[4]; // log likelihoods of 4 subtrees
+	double score[4];
+	L[0] = L[1] = L[2] = L[3] = 0.0;
+
+	double UB = 0.0; // in log terms
+	int nsite = tree->aln->getNSite();
+	UB = nsite*logC(node1->findNeighbor(node2)->length,tree); // coefficient c
+
+	int nptn = tree->aln->getNPattern();
+	int nstates = tree->aln->num_states;
+	int i,x,k;
+	IntVector ptnFreq;
+	tree->aln->getPatternFreq(ptnFreq);
+
+	// The four subtrees in the order of the table above.
+	PhyloNeighbor *sub[4];
+	sub[0] = (PhyloNeighbor*) (*nniMoves[0].node1Nei_it);
+	sub[1] = (PhyloNeighbor*) (*node1Nei2_it);
+	sub[2] = (PhyloNeighbor*) (*nniMoves[0].node2Nei_it);
+	sub[3] = (PhyloNeighbor*) (*nniMoves[1].node2Nei_it);
+
+	// Partial likelihoods; the dad of a neighbor is the node that owns it:
+	// node1 for the first two subtrees, node2 for the last two.
+	double *plh[4];
+	plh[0] = partialLhForUB(tree, sub[0], node1);
+	plh[1] = partialLhForUB(tree, sub[1], node1);
+	plh[2] = partialLhForUB(tree, sub[2], node2);
+	plh[3] = partialLhForUB(tree, sub[3], node2);
+
+	for(i = 0; i<nptn; i++){
+		// per pattern: sum over states of state frequency times partial likelihood
+		for(k = 0; k < 4; k++){
+			score[k] = 0.0;
+			for(x = 0; x < nstates; x++)
+				score[k] += tree->getModel()->state_freq[x]*plh[k][i*nstates+x];
+			L[k] += log(score[k])*ptnFreq[i];
+		}
+
+		assert(isnormal(L[0] + L[1] + L[2] + L[3]));
+	}
+
+	for(k = 0; k < 4; k++)
+		L[k] = L[k] + sub[k]->get_lh_scale_factor();
+
+	UB += L[0] + L[1] + L[2] + L[3];
+
+	double q1 = logC(t[0]+t[3],tree) + logC(t[1]+t[2],tree);
+	double q2 = logC(t[0]+t[2],tree) + logC(t[1]+t[3],tree);
+
+	double UBq1 = UB + nsite*q1;
+	double UBq2 = UB + nsite*q2;
+
+	string out_file_UB = tree->params->out_prefix;
+	out_file_UB += ".UB.NNI.upperBounds";
+	ofstream out_UB;
+	out_UB.exceptions(ios::failbit | ios::badbit);
+	out_UB.open((char*)out_file_UB.c_str(),std::ofstream::out | std::ofstream::app);
+
+	out_UB << tree->getCurScore() << "\t" << UBq1 << "\t" << UBq2 << "\t" << tree->getCurScore() - UBq1 << "\t" << tree->getCurScore() - UBq2 << endl;
+
+	out_UB.close();
+
+	// count the NNIs whose upper bound is below the current score
+	if(UBq1 < tree->getCurScore()){
+		tree->skippedNNIub += 1;
+	}
+	if(UBq2 < tree->getCurScore()){
+		tree->skippedNNIub += 1;
+	}
+
+	// Decide which NNI has a larger UB (we base our decision on q coefficients)
+	if(q1 > q2){
+		// NNI 1:
+		nniMoves[0].newloglh = UBq1;
+		return nniMoves[0];
+	} else {
+		// NNI 2:
+		nniMoves[1].newloglh = UBq2;
+		return nniMoves[1];
+	}
+}
+
+/**
+ * Coefficient used in the upper bounds: log of (largest entry of the
+ * transition matrix for branch length t) / (smallest state frequency).
+ *
+ * The smallest state frequency is cached in tree->minStateFreq; it is
+ * (re)computed whenever that field is 0.0.
+ *
+ * @param t     branch length
+ * @param tree  tree providing the alignment, model and model factory
+ * @return      log(maxTransProb / tree->minStateFreq)
+ */
+double logC(double t, PhyloTree* tree){
+	//double c = log((1+3*exp(-t)))-log(1-exp(-t));
+
+	int i, m = tree->aln->num_states*tree->aln->num_states, n = tree->aln->num_states;
+	double* TransMatrix = new double[m];
+	tree->getModelFactory()->computeTransMatrix(t,TransMatrix);
+	double maxTransProb = 0.0;
+	for(i = 0; i < m; i++)
+		if(TransMatrix[i]>maxTransProb)
+			maxTransProb = TransMatrix[i];
+	// the matrix is only needed for its maximum; release it, this function is
+	// called several times per evaluated branch
+	delete [] TransMatrix;
+
+	if(tree->minStateFreq == 0.0){
+		tree->minStateFreq = tree->getModel()->state_freq[0];
+		for(i = 1; i < n; i++){
+			if(tree->minStateFreq > tree->getModel()->state_freq[i])
+				tree->minStateFreq = tree->getModel()->state_freq[i];
+		}
+
+	}
+	//assert(isnormal(log(maxTransProb/tree->minStateFreq)));
+	return log(maxTransProb/tree->minStateFreq);
+}
+
+/**
+ * For the branch (node1,node2) print, for every alignment pattern, which
+ * fraction of the sum over all state pairs comes from matching pairs (x == y)
+ * and which from non-matching pairs.
+ *
+ * The partial likelihoods on both sides of the branch are computed if needed
+ * and multiplied with the array returned by getModel()->getEigenvectors()
+ * before the pair sums are formed. One line per pattern is written to cout:
+ * branch length, fraction of matching pairs, fraction of non-matching pairs
+ * and the full sum.
+ *
+ * @param node1  first end node of the branch
+ * @param node2  second end node of the branch
+ * @param tree   tree the branch belongs to
+ */
+void sumFraction(PhyloNode *node1, PhyloNode *node2, PhyloTree *tree){
+	PhyloNeighbor* nei1 = (PhyloNeighbor*) node1->findNeighbor(node2);
+	PhyloNeighbor* nei2 = (PhyloNeighbor*) node2->findNeighbor(node1);
+
+	if(nei1->get_partial_lh_computed() == 0){
+		tree->computePartialLikelihood(nei1, node1);
+	}
+	double* T1_partial_lh = nei1->get_partial_lh();
+
+	if(nei2->get_partial_lh_computed() == 0){
+		tree->computePartialLikelihood(nei2, node2);
+	}
+	double* T2_partial_lh = nei2->get_partial_lh();
+
+	// score[0]: matching pairs, score[1]: non-matching pairs, score[2]: all pairs
+	double score[3];
+
+	int nptn = tree->aln->getNPattern();
+	int nstates = tree->aln->num_states;
+	int j,i,x,y;
+
+	// vectors instead of variable length arrays, which are not standard C++
+	vector<double> plhx(nstates);
+	vector<double> plhy(nstates);
+
+	double *eigen = tree->getModel()->getEigenvectors();
+
+	for(i = 0; i<nptn; i++){
+		score[0] = score[1] = score[2] = 0.0;
+
+		// computing partial likelihoods
+		for(x = 0; x < nstates; x++){
+			plhx[x] = 0.0;
+			plhy[x] = 0.0;
+			for(j = 0; j<nstates; j++){
+				plhx[x]+= T1_partial_lh[i*nstates+j]*eigen[x*nstates+j];
+				plhy[x]+= T2_partial_lh[i*nstates+j]*eigen[x*nstates+j];
+			}
+		}
+
+		for(x = 0; x < nstates; x++){
+			for(y = 0; y < nstates; y++){
+				if(x == y){
+				// Term for a pair of matching nucleotides --------------------------
+					score[0] += plhx[x]*plhy[y];
+				} else {
+				// Term for a pair of non-matching nucleotides ----------------------
+					score[1] += plhx[x]*plhy[y];
+				}
+			// Full sum ---------------------------------------------------------
+			score[2] += plhx[x]*plhy[y];
+			}
+		}
+
+		assert(isnormal(score[0] + score[1] + score[2]));
+
+		cout<<"BranchLEN |"<< nei1->length
+			<<"| FRACTION of ai (sum over matching pairs) |"<<score[0]/score[2]
+		    <<"| FRACTION of bi (sum over non-matching pairs) |"<<score[1]/score[2]
+		    <<"| likelihood |"<<score[2]
+		    <<endl;
+
+	}
+}
diff --git a/upperbounds.h b/upperbounds.h
new file mode 100644
index 0000000..6433381
--- /dev/null
+++ b/upperbounds.h
@@ -0,0 +1,89 @@
+/*
+ * upperbounds.h
+ *
+ *  Created on: Aug 13, 2014
+ *      Author: olga
+ */
+
+#ifndef UPPERBOUNDS_H_
+#define UPPERBOUNDS_H_
+
+/**
+	main function to carry out Upper Bounds analysis
+*/
+#include "iqtree.h"
+#include "mexttree.h"
+#include "alignment.h"
+#include "phylotree.h"
+
+class PhyloTree;
+class IQTree;
+
+void UpperBounds(Params* params, Alignment* alignment, IQTree* tree);
+
+void printUB();
+
+void printTreeUB(MTree *tree);
+
+/**
+ * extracting subtree spanned by corresponding taxa together with subalignment
+ *
+ * ids  - vector of taxa ids, the subtree is spanned by these taxa
+ * type - specifies the copying procedure:
+ * 		= 0, normal (PhyloTree) tree->copyTree;
+ * 			 two branches, incident to one of the end nodes of the branch splitting two subtrees, are collapsed
+ * 		= 1, remember the end of the branch, do not collapse two branches
+ * 			 this is used for the subtree with "artificial root",
+ * 			 this will be a leaf without any nucleotide fixed at any site
+ */
+PhyloTree* extractSubtreeUB(IntVector &ids, MTree* tree, Params *params, int sw = 0);
+
+// Slightly changed functions from usual MTree.copy, to allow for non collapsed branches: type=1
+void copyTreeUB(MTree *tree, MTree *treeCopy, string &taxa_set);
+Node* copyTreeUBnode(MTree *tree, MTree *treeCopy, string &taxa_set, double &len, Node *node = NULL, Node *dad = NULL);
+
+// With an artificial root, an internal node might get an ID < taxaNUM. To stay consistent with the rest of the program,
+// reindex all taxa from 0 to taxaNUM-1, and then internal nodes will get ids from taxaNUM to nodesNUM.
+void reindexTaxonIDs(MTree *tree);
+
+
+/*
+ * Generate random YH tree, which is considered as a subtree of parent tree
+ * input tree - is a subtree,
+ * output - log-likelihood of generated subtree
+ */
+MTree* generateRandomYH_UB(Params &params, PhyloTree *tree);
+
+/*
+ * This function generates a random tree that has A|B split.
+ * One can also specify the length of corresponding branch.
+ * t(A|B) = brLen
+ */
+double RandomTreeAB(PhyloTree* treeORGN, PhyloTree* treeAorgn, PhyloTree* treeBorgn, IntVector &taxaA, IntVector &taxaB, Params *params, double brLen = 0.0);
+
+/*
+ * This function computes the product of logLhs of two subtrees corresponding to a given split A|B on tree.
+ */
+double UpperBoundAB(IntVector &taxaA, IntVector &taxaB, PhyloTree* tree, Params *params);
+
+/*
+ * Auxiliary function for RandomTreeAB: randomly chooses a node and inserts a new branch with randomLen
+ */
+void extendingTree(MTree* tree, Params* params);
+
+NodeVector getBranchABid(double brLen, PhyloTree* tree);
+
+/*
+ * Applying UBs to NNI search
+ */
+NNIMove getBestNNIForBranUB(PhyloNode *node1, PhyloNode *node2, PhyloTree *tree);
+double logC(double t, PhyloTree* tree);
+
+/**
+ * Tests on fractions ai/(ai+bi) and bi/(ai+bi)
+ * (fractions of sums for matching and non-matching pairs of nucleotides on the ends of branch)
+ */
+
+void sumFraction(PhyloNode *node1, PhyloNode *node2, PhyloTree *tree);
+
+#endif /* UPPERBOUNDS_H_ */
diff --git a/vectorclass/CMakeLists.txt b/vectorclass/CMakeLists.txt
new file mode 100644
index 0000000..e62252d
--- /dev/null
+++ b/vectorclass/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_library(vectorclass
+instrset_detect.cpp
+)
diff --git a/vectorclass/changelog.txt b/vectorclass/changelog.txt
new file mode 100755
index 0000000..dd30ddb
--- /dev/null
+++ b/vectorclass/changelog.txt
@@ -0,0 +1,110 @@
+change log for vectorclass.zip
+------------------------------
+
+2015-10-24 version 1.16
+  * workaround for problem in Clang compiler extended to version 3.09 because not fixed yet by Clang
+    (vectorf128.h line 134)
+  * recognize problem with Apple version of Clang reporting wrong version number
+  * remove various minor problems with Clang
+  * function pow(vector, int) modified to strengthen type checking and avoid compiler warnings
+  * manual discusses dynamic allocation of arrays of vectors
+  * various minor changes
+
+
+2015-10-17 version 1.15
+  * added files ranvec1.h and ranvec1.cpp for random number generator
+  * constructors to make boolean vectors from their elements
+  * constructors and = operators to broadcast boolean scalar into boolean vectors
+  * various lookup functions improved
+  * operators &, |, ^, ~, etc. defined for various boolean vectors to avoid conversion
+    to integer vectors
+  * nmul_add functions
+  * mul_add etc. moved to main header files
+  * explicit fused multiply-and-add used in math functions to improve performance 
+    on compilers that don't automatically insert FMA
+
+
+2014-07-24 version 1.14
+  * support for AVX-512f instruction set and 512-bit vectors:
+    Vec16i, Vec16ui, Vec8q, Vec8uq, Vec16f, Vec8d, and corresponding boolean vectors
+  * new define MAX_VECTOR_SIZE, valid values are 128, 256 and 512
+  * added hyperbolic functions sinh, cosh, tanh, asinh, acosh, atanh
+  * size() member function on all vector classes returns the number of elements
+  * functions for conversion between boolean vectors and integer bitfields
+  * extracting an element from a boolean vector now returns a bool, not an int
+  * improved precision in exp2 and exp10 functions
+  * various bug fixes
+
+
+2014-05-11 version 1.13
+  * pow function improved
+  * mul_add, mul_sub, mul_sub_x functions
+  * propagation of error codes through nan_code function
+  * "denormal" renamed to "subnormal" everywhere, in accordance with IEEE 754-2008 standard
+
+
+2014-04-20 version 1.12
+  * inline implementation of mathematical functions added (vectormath_exp.h vectormath_trig.h vectormath_common.h)
+  * vectormath.h renamed to vectormath_lib.h because a new alternative is added
+  * gather functions with constant indexes
+  * function sign_combine
+  * function pow_const(vector, const int)
+  * function pow_ratio(vector, const int, const int)
+  * functions horizontal_find_first, horizontal_count
+  * function recipr_sqrt removed
+  * functions round_to_int64_limited, truncate_to_int64_limited, to_double_limited
+  * function cubic_root renamed to cbrt
+  * function atan(vector,vector) renamed to atan2
+  * function if_mul
+  * function Vec4i round_to_int(Vec2d)
+  * operator & (float vector, boolean vector)
+  * operator &= (int vector, int vector)
+  * removed constructor Vec128b(int) and Vec256b(int) to avoid implicit conversion
+  * removed signalling nan function
+  * minor improvements in various blend and lookup functions
+
+
+2014-03-01 version 1.11
+  * fixed missing unsigned operators >>= in vectori256.h
+
+
+2013-10-04 version 1.10
+  * clear distinction between boolean vectors and integer vectors for the sake of 
+    compatibility with mask registers in forthcoming AVX512 instruction set
+  * added function if_add
+  * tentative support for clang version 3.3 with workaround for bugs
+  * remove ambiguity for builtin m128i operator == in clang compiler. 
+  * problems in clang compiler, bug reports filed at clang
+    (http://llvm.org/bugs/show_bug.cgi?id=17164, 17312)
+  * instrset.h fixes problem with macros named min and max in MS windows.h
+  * workaround problem in MS Visual Studio 11.0. Bug report 735861 and 804274
+  * minor bug fixes
+
+
+2013-03-31 version 1.03 beta
+  * bug fix for Vec2d cos (Vec2d const & x), VECTORMATH = 1
+
+
+2012-08-01 version 1.02 beta
+  * added file vector3d.h for 3-dimensional vectors
+  * added file complexvec.h for complex numbers and complex vectors
+  * added file quaternion.h for quaternions
+  * added function change_sign for floating point vectors
+  * added operators +, -, *, / between floating point vectors and scalars to remove overloading ambiguity
+
+
+2012-07-08 version 1.01 beta
+  * added file decimal.h with Number <-> string conversion functions: 
+    bin2bcd, bin2ascii, bin2hex_ascii, ascii2bin
+  * added andnot function for boolean vectors
+  * added functions shift_bytes_up and shift_bytes_down
+  * added operators for unsigned integer vector classes: >>=, &, &&, |, ||, ^, ~
+  * inteldispatchpatch.cpp removed. Use asmlib instead (www.agner.org/optimize/#asmlib)
+  * prefix ++ and -- operators now return a reference, postfix operators return a value
+  * various improvements in permute and blend functions
+  * minor improvement in abs function
+  * added version number to VECTORCLASS_H
+
+
+2012-05-30 version 1.00 beta
+  * first public release
diff --git a/vectorclass/dispatch_example.cpp b/vectorclass/dispatch_example.cpp
new file mode 100755
index 0000000..640a683
--- /dev/null
+++ b/vectorclass/dispatch_example.cpp
@@ -0,0 +1,99 @@
+/*************************  dispatch_example.cpp   ****************************
+| Author:        Agner Fog
+| Date created:  2012-05-30
+| Last modified: 2014-07-23
+| Version:       1.14
+| Project:       vector classes
+| Description:
+| Example of CPU dispatching.
+|
+| # Example of compiling this with GCC compiler:
+| # Compile dispatch_example.cpp five times for different instruction sets:
+| g++ -O3 -msse2    -c dispatch_example.cpp -od2.o
+| g++ -O3 -msse4.1  -c dispatch_example.cpp -od5.o
+| g++ -O3 -mavx     -c dispatch_example.cpp -od7.o
+| g++ -O3 -mavx2    -c dispatch_example.cpp -od8.o
+| g++ -O3 -mavx512f -c dispatch_example.cpp -od9.o
+| g++ -O3 -msse2 -otest instrset_detect.cpp d2.o d5.o d7.o d8.o d9.o
+| ./test
+|
+| (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
+\*****************************************************************************/
+
+#include <stdio.h>
+
+#define MAX_VECTOR_SIZE 512
+#include "vectorclass.h"
+
+
+// define function type (change this to fit your purpose. Should not contain vector types)
+typedef float MyFuncType(float*);
+
+// function prototypes for each version
+MyFuncType  myfunc, myfunc_SSE2, myfunc_SSE41, myfunc_AVX, myfunc_AVX2, myfunc_AVX512, myfunc_dispatch; 
+
+// Define function name depending on which instruction set we compile for
+#if   INSTRSET == 2                    // SSE2
+#define FUNCNAME myfunc_SSE2
+#elif INSTRSET == 5                    // SSE4.1
+#define FUNCNAME myfunc_SSE41
+#elif INSTRSET == 7                    // AVX
+#define FUNCNAME myfunc_AVX
+#elif INSTRSET == 8                    // AVX2
+#define FUNCNAME myfunc_AVX2
+#elif INSTRSET == 9                    // AVX512
+#define FUNCNAME myfunc_AVX512
+#endif
+
+// Worker routine; this file is compiled once per instruction set and FUNCNAME names each copy
+float FUNCNAME (float * f) {
+    Vec16f lanes;                      // 16 single-precision elements
+    lanes.load(f);                     // fill the vector from f
+    return horizontal_add(lanes);      // add up all elements and hand back the total
+}
+
+
+#if INSTRSET == 2
+// make dispatcher in only the lowest of the compiled versions
+
+// Function pointer initially points to the dispatcher.
+// After first call it points to the selected version
+MyFuncType * myfunc_pointer = &myfunc_dispatch;            // function pointer
+
+// Dispatcher: binds myfunc_pointer to the best compiled version for this CPU, then forwards the call
+float myfunc_dispatch(float * f) {
+    const int level = instrset_detect();                   // instruction set level found at run time
+    if (level < 2) {
+        // Error: lowest instruction set not supported; myfunc_pointer is left unchanged
+        fprintf(stderr, "\nError: Instruction set SSE2 not supported on this computer");
+        return 0.f;
+    }
+    MyFuncType * chosen = &myfunc_SSE2;                    // level >= 2 is guaranteed from here on
+    if      (level >= 9) chosen = &myfunc_AVX512;          // AVX512 version
+    else if (level >= 8) chosen = &myfunc_AVX2;            // AVX2 version
+    else if (level >= 7) chosen = &myfunc_AVX;             // AVX version
+    else if (level >= 5) chosen = &myfunc_SSE41;           // SSE4.1 version
+    myfunc_pointer = chosen;                               // remember the selection for later calls
+    return chosen(f);                                      // continue in the selected version
+}
+
+
+// Public entry point: forwards to whichever function myfunc_pointer currently targets
+inline float myfunc(float * f) {
+    return myfunc_pointer(f);                              // call through the function pointer
+}
+
+
+// Demo driver: sums the values 1..16 through the dispatched entry point and prints the result
+int main(int argc, char* argv[])
+{
+    float input[16];                                       // will hold 1, 2, ..., 16
+    for (int k = 0; k < 16; k++) input[k] = (float)(k + 1);
+
+    const float total = myfunc(input);                     // goes through the dispatcher on first use
+    printf("\nsum = %8.3f \n", total);                     // print result
+    return 0;
+}
+
+#endif  // INSTRSET == 2
+
diff --git a/vectorclass/instrset.h b/vectorclass/instrset.h
new file mode 100755
index 0000000..4fb83e2
--- /dev/null
+++ b/vectorclass/instrset.h
@@ -0,0 +1,203 @@
+/****************************  instrset.h   **********************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2014-10-22
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file for various compiler-specific tasks and other common tasks to 
+* vector class library:
+* > selects the supported instruction set
+* > defines integer types
+* > defines compiler version macros
+* > undefines certain macros that prevent function overloading
+* > defines template class to represent compile-time integer constant
+* > defines template for compile-time error messages
+*
+* (c) Copyright 2012 - 2014 GNU General Public License www.gnu.org/licenses
+******************************************************************************/
+
+#ifndef INSTRSET_H
+#define INSTRSET_H 116
+
+// Detect 64 bit mode
+#if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64) ) && ! defined(__x86_64__)
+#define __x86_64__ 1  // There are many different macros for this, decide on only one
+#endif
+
+// Find instruction set from compiler macros if INSTRSET not defined (first matching test wins, newest set first)
+// Note: Microsoft compilers do not define these macros automatically
+#ifndef INSTRSET
+#if defined ( __AVX512F__ ) || defined ( __AVX512__ ) // || defined ( __AVX512ER__ ) 
+#define INSTRSET 9                     // AVX512F
+#elif defined ( __AVX2__ )
+#define INSTRSET 8                     // AVX2
+#elif defined ( __AVX__ )
+#define INSTRSET 7                     // AVX
+#elif defined ( __SSE4_2__ )
+#define INSTRSET 6                     // SSE4.2
+#elif defined ( __SSE4_1__ )
+#define INSTRSET 5                     // SSE4.1
+#elif defined ( __SSSE3__ )
+#define INSTRSET 4                     // SSSE3
+#elif defined ( __SSE3__ )
+#define INSTRSET 3                     // SSE3
+#elif defined ( __SSE2__ ) || defined ( __x86_64__ )
+#define INSTRSET 2                     // SSE2 (also selected for any x86-64 target)
+#elif defined ( __SSE__ )
+#define INSTRSET 1                     // SSE
+#elif defined ( _M_IX86_FP )           // Defined in MS compiler. 1: SSE, 2: SSE2
+#define INSTRSET _M_IX86_FP
+#else 
+#define INSTRSET 0                     // none of the macros above is defined
+#endif // instruction set defines
+#endif // INSTRSET
+
+// Include the appropriate header file for intrinsic functions
+#if INSTRSET > 7                       // AVX2 and later
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
+#include <x86intrin.h>                 // x86intrin.h includes header files for whatever instruction 
+                                       // sets are specified on the compiler command line, such as:
+                                       // xopintrin.h, fma4intrin.h
+#else
+#include <immintrin.h>                 // MS version of immintrin.h covers AVX, AVX2 and FMA3
+#endif // __GNUC__
+#elif INSTRSET == 7
+#include <immintrin.h>                 // AVX
+#elif INSTRSET == 6
+#include <nmmintrin.h>                 // SSE4.2
+#elif INSTRSET == 5
+#include <smmintrin.h>                 // SSE4.1
+#elif INSTRSET == 4
+#include <tmmintrin.h>                 // SSSE3
+#elif INSTRSET == 3
+#include <pmmintrin.h>                 // SSE3
+#elif INSTRSET == 2
+#include <emmintrin.h>                 // SSE2
+#elif INSTRSET == 1
+#include <xmmintrin.h>                 // SSE
+#endif // INSTRSET
+
+#if INSTRSET >= 8 && !defined(__FMA__)
+// Assume that all processors that have AVX2 also have FMA3
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (__clang__)
+// Prevent error message in g++ when using FMA intrinsics with avx2:
+#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
+#else
+#define __FMA__  1
+#endif
+#endif
+
+// AMD  instruction sets
+#if defined (__XOP__) || defined (__FMA4__)
+#ifdef __GNUC__
+#include <x86intrin.h>                 // AMD XOP (Gnu)
+#else
+#include <ammintrin.h>                 // AMD XOP (Microsoft)
+#endif //  __GNUC__
+#elif defined (__SSE4A__)              // AMD SSE4A
+#include <ammintrin.h>
+#endif // __XOP__ 
+
+// FMA3 instruction set
+#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__))  && ! defined (__INTEL_COMPILER)
+#include <fmaintrin.h> 
+#endif // __FMA__ 
+
+// FMA4 instruction set
+#if defined (__FMA4__) && (defined(__GNUC__) || defined(__clang__))
+#include <fma4intrin.h> // must have both x86intrin.h and fma4intrin.h, don't know why
+#endif // __FMA4__ 
+
+
+// Define integer types with known size
+#if defined(__GNUC__) || defined(__clang__) || (defined(_MSC_VER) && _MSC_VER >= 1600)
+  // Compilers supporting C99 or C++0x have stdint.h defining these integer types
+  #include <stdint.h>
+#elif defined(_MSC_VER)
+  // Older Microsoft compilers have their own definitions
+  typedef signed   __int8  int8_t;
+  typedef unsigned __int8  uint8_t;
+  typedef signed   __int16 int16_t;
+  typedef unsigned __int16 uint16_t;
+  typedef signed   __int32 int32_t;
+  typedef unsigned __int32 uint32_t;
+  typedef signed   __int64 int64_t;
+  typedef unsigned __int64 uint64_t;
+  #ifndef _INTPTR_T_DEFINED
+    #define _INTPTR_T_DEFINED
+    #ifdef  __x86_64__
+      typedef int64_t intptr_t;
+    #else
+      typedef int32_t intptr_t;
+    #endif
+  #endif
+#else
+  // This works with most compilers
+  typedef signed   char      int8_t;
+  typedef unsigned char      uint8_t;
+  typedef signed   short int int16_t;
+  typedef unsigned short int uint16_t;
+  typedef signed   int       int32_t;
+  typedef unsigned int       uint32_t;
+  typedef long long          int64_t;
+  typedef unsigned long long uint64_t;
+  #ifdef  __x86_64__
+    typedef int64_t intptr_t;
+  #else
+    typedef int32_t intptr_t;
+  #endif
+#endif
+
+#include <stdlib.h>                              // define abs(int)
+
+#ifdef _MSC_VER                                  // Microsoft compiler or compatible Intel compiler
+#include <intrin.h>                              // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
+#endif // _MSC_VER
+
+// functions in instrset_detect.cpp
+int  instrset_detect(void);                      // tells which instruction sets are supported
+bool hasFMA3(void);                              // true if FMA3 instructions supported
+bool hasFMA4(void);                              // true if FMA4 instructions supported
+bool hasXOP (void);                              // true if XOP  instructions supported
+
+// GCC version
+#if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__)
+#define GCC_VERSION  ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__))
+#endif
+
+// Clang version
+#if defined (__clang__)
+#define CLANG_VERSION  ((__clang_major__) * 10000 + (__clang_minor__) * 100 + (__clang_patchlevel__))
+// Problem: The version number is not consistent across platforms
+// http://llvm.org/bugs/show_bug.cgi?id=12643
+// Apple bug 18746972
+#endif
+
+// Fix problem with macros named min and max in WinDef.h
+#ifdef _MSC_VER
+#if defined (_WINDEF_) && defined(min) && defined(max)
+#undef min
+#undef max
+#endif
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#endif
+
+// Empty template classes that carry an integer constant in their template argument; the macros below create a temporary of that type
+template <int32_t  n> class Const_int_t  {};     // represent compile-time signed integer constant
+template <uint32_t n> class Const_uint_t {};     // represent compile-time unsigned integer constant
+#define const_int(n)  (Const_int_t <n>())        // n must be compile-time integer constant
+#define const_uint(n) (Const_uint_t<n>())        // n must be compile-time unsigned integer constant
+
+// Compile-time check helper: an object can only be created from the <true> form
+template <bool> class Static_error_check {       // general case: public constructor
+    public: Static_error_check() {}
+};
+template <> class Static_error_check<false> {    // <false>: constructor is private, so creating an object fails to compile
+    private: Static_error_check() {}
+};
+
+
+#endif // INSTRSET_H
diff --git a/vectorclass/instrset_detect.cpp b/vectorclass/instrset_detect.cpp
new file mode 100755
index 0000000..03c5777
--- /dev/null
+++ b/vectorclass/instrset_detect.cpp
@@ -0,0 +1,153 @@
+/**************************  instrset_detect.cpp   ****************************
+| Author:        Agner Fog
+| Date created:  2012-05-30
+| Last modified: 2014-07-23
+| Version:       1.14
+| Project:       vector classes
+| Description:
+| Functions for checking which instruction sets are supported.
+|
+| (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
+\*****************************************************************************/
+
+#include "instrset.h"
+
+// Run the CPUID instruction with eax = functionnumber and ecx = 0.
+// The four result registers are stored in the caller's array as
+// output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
+static inline void cpuid (int output[4], int functionnumber) {	
+#if defined (_MSC_VER) || defined (__INTEL_COMPILER)       // Microsoft or Intel compiler, intrin.h included
+
+    __cpuidex(output, functionnumber, 0);                  // intrinsic function for CPUID; last argument is the ecx input
+
+#elif defined(__GNUC__) || defined(__clang__)              // use inline assembly, Gnu/AT&T syntax
+
+   int a, b, c, d;                                         // bound to eax, ebx, ecx, edx by the "=a","=b","=c","=d" constraints
+   __asm("cpuid" : "=a"(a),"=b"(b),"=c"(c),"=d"(d) : "a"(functionnumber),"c"(0) : );
+   output[0] = a;
+   output[1] = b;
+   output[2] = c;
+   output[3] = d;
+
+#else                                                      // unknown platform. try inline assembly with masm/intel syntax
+
+    __asm {
+        mov eax, functionnumber
+        xor ecx, ecx
+        cpuid;
+        mov esi, output
+        mov [esi],    eax
+        mov [esi+4],  ebx
+        mov [esi+8],  ecx
+        mov [esi+12], edx
+    }
+
+#endif
+}
+
+// Run the XGETBV instruction with ecx = ctr; returns the result as one value with eax in the low and edx in the high 32 bits
+static inline int64_t xgetbv (int ctr) {	
+#if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) // Microsoft or Intel compiler supporting _xgetbv intrinsic
+
+    return _xgetbv(ctr);                                   // intrinsic function for XGETBV
+
+#elif defined(__GNUC__)                                    // use inline assembly, Gnu/AT&T syntax
+
+   uint32_t a, d;                                          // receive eax and edx through the "=a" and "=d" constraints
+   __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
+   return a | (uint64_t(d) << 32);                         // combine: edx is the upper half, eax the lower half
+
+#else  // #elif defined (_WIN32)                           // other compiler. try inline assembly with masm/intel/MS syntax
+
+   uint32_t a, d;                                          // filled from eax and edx by the mov instructions below
+    __asm {
+        mov ecx, ctr
+        _emit 0x0f
+        _emit 0x01
+        _emit 0xd0 ; // xgetbv
+        mov a, eax
+        mov d, edx
+    }
+   return a | (uint64_t(d) << 32);                         // combine: edx is the upper half, eax the lower half
+
+#endif
+}
+
+
+/* find the highest supported instruction set (the result is remembered, later calls return it directly)
+    return value:
+    0           = 80386 instruction set
+    1  or above = SSE (XMM) supported by CPU (not testing for O.S. support)
+    2  or above = SSE2
+    3  or above = SSE3
+    4  or above = Supplementary SSE3 (SSSE3)
+    5  or above = SSE4.1
+    6  or above = SSE4.2
+    7  or above = AVX supported by CPU and operating system
+    8  or above = AVX2
+    9  or above = AVX512F (CPUID feature flag set and AVX512 register state reported in cpuid leaf 0xD)
+*/
+int instrset_detect(void) {
+    static int iset = -1;                                  // remember value for next call
+    if (iset >= 0) {
+        return iset;                                       // called before
+    }
+    iset = 0;                                              // default value
+    int abcd[4] = {0,0,0,0};                               // cpuid results: eax, ebx, ecx, edx
+    cpuid(abcd, 0);                                        // call cpuid function 0
+    if (abcd[0] == 0) return iset;                         // no further cpuid function supported
+    cpuid(abcd, 1);                                        // call cpuid function 1 for feature flags
+    if ((abcd[3] & (1 <<  0)) == 0) return iset;           // no floating point
+    if ((abcd[3] & (1 << 23)) == 0) return iset;           // no MMX
+    if ((abcd[3] & (1 << 15)) == 0) return iset;           // no conditional move
+    if ((abcd[3] & (1 << 24)) == 0) return iset;           // no FXSAVE
+    if ((abcd[3] & (1 << 25)) == 0) return iset;           // no SSE
+    iset = 1;                                              // 1: SSE supported
+    if ((abcd[3] & (1 << 26)) == 0) return iset;           // no SSE2
+    iset = 2;                                              // 2: SSE2 supported
+    if ((abcd[2] & (1 <<  0)) == 0) return iset;           // no SSE3
+    iset = 3;                                              // 3: SSE3 supported
+    if ((abcd[2] & (1 <<  9)) == 0) return iset;           // no SSSE3
+    iset = 4;                                              // 4: SSSE3 supported
+    if ((abcd[2] & (1 << 19)) == 0) return iset;           // no SSE4.1
+    iset = 5;                                              // 5: SSE4.1 supported
+    if ((abcd[2] & (1 << 23)) == 0) return iset;           // no POPCNT
+    if ((abcd[2] & (1 << 20)) == 0) return iset;           // no SSE4.2
+    iset = 6;                                              // 6: SSE4.2 supported
+    if ((abcd[2] & (1 << 27)) == 0) return iset;           // no OSXSAVE (xgetbv must not be used without it)
+    if ((xgetbv(0) & 6) != 6)       return iset;           // AVX not enabled in O.S.
+    if ((abcd[2] & (1 << 28)) == 0) return iset;           // no AVX
+    iset = 7;                                              // 7: AVX supported
+    cpuid(abcd, 7);                                        // call cpuid leaf 7 for feature flags
+    if ((abcd[1] & (1 <<  5)) == 0) return iset;           // no AVX2
+    iset = 8;                                              // 8: AVX2 supported
+    if ((abcd[1] & (1 << 16)) == 0) return iset;           // no AVX512F feature flag (leaf 7, ebx bit 16)
+    cpuid(abcd, 0xD);                                      // call cpuid leaf 0xD for XSAVE state information
+    if ((abcd[0] & 0x60) != 0x60)   return iset;           // AVX512 register state not supported
+    iset = 9;                                              // 9: AVX512F supported
+    return iset;
+}
+
+// FMA3 query: true only when instrset_detect() reports AVX (level 7 or above)
+// and cpuid function 1 sets bit 12 of ecx
+bool hasFMA3(void) {
+    int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays all zero when cpuid is skipped
+    if (instrset_detect() >= 7) cpuid(regs, 1);            // read the feature flags only if AVX is available
+    return (regs[2] & (1 << 12)) != 0;                     // ecx bit 12 indicates FMA3
+}
+
+// FMA4 query: true only when instrset_detect() reports AVX (level 7 or above)
+// and cpuid function 0x80000001 sets bit 16 of ecx
+bool hasFMA4(void) {
+    int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays all zero when cpuid is skipped
+    if (instrset_detect() >= 7) cpuid(regs, 0x80000001);   // read the extended flags only if AVX is available
+    return (regs[2] & (1 << 16)) != 0;                     // ecx bit 16 indicates FMA4
+}
+
+// XOP query: true only when instrset_detect() reports AVX (level 7 or above)
+// and cpuid function 0x80000001 sets bit 11 of ecx
+bool hasXOP(void) {
+    int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays all zero when cpuid is skipped
+    if (instrset_detect() >= 7) cpuid(regs, 0x80000001);   // read the extended flags only if AVX is available
+    return (regs[2] & (1 << 11)) != 0;                     // ecx bit 11 indicates XOP
+}
diff --git a/vectorclass/license.txt b/vectorclass/license.txt
new file mode 100755
index 0000000..52a8372
--- /dev/null
+++ b/vectorclass/license.txt
@@ -0,0 +1,619 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
diff --git a/vectorclass/special.zip b/vectorclass/special.zip
new file mode 100755
index 0000000..dfb1e13
Binary files /dev/null and b/vectorclass/special.zip differ
diff --git a/vectorclass/vectorclass.h b/vectorclass/vectorclass.h
new file mode 100755
index 0000000..6509bca
--- /dev/null
+++ b/vectorclass/vectorclass.h
@@ -0,0 +1,69 @@
+/****************************  vectorclass.h   ********************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2014-10-24
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining vector classes as interface to intrinsic functions 
+* in x86 microprocessors with SSE2 and later instruction sets up to AVX512.
+*
+* Instructions:
+* Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired 
+* instruction set, which must be at least SSE2. Specify the supported 
+* instruction set by a command line define, e.g. __SSE4_1__ if the 
+* compiler does not automatically do so.
+*
+* Each vector object is represented internally in the CPU as a vector
+* register with 128, 256 or 512 bits.
+*
+* This header file includes the appropriate header files depending on the
+* supported instruction set
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2012 - 2014 GNU General Public License www.gnu.org/licenses
+******************************************************************************/
+#ifndef VECTORCLASS_H
+#define VECTORCLASS_H  116
+
+// Maximum vector size, bits. Allowed values are 128, 256, 512
+#ifndef MAX_VECTOR_SIZE
+#define MAX_VECTOR_SIZE 256
+#endif
+
+#include "instrset.h"        // Select supported instruction set
+
+#if INSTRSET < 2             // SSE2 required
+  #error Please compile for the SSE2 instruction set or higher
+#else
+
+#include "vectori128.h"      // 128-bit integer vectors
+#include "vectorf128.h"      // 128-bit floating point vectors
+
+#if MAX_VECTOR_SIZE >= 256
+#if INSTRSET >= 8
+  #include "vectori256.h"    // 256-bit integer vectors, requires AVX2 instruction set
+#else
+  #include "vectori256e.h"   // 256-bit integer vectors, emulated
+#endif  // INSTRSET >= 8
+#if INSTRSET >= 7
+  #include "vectorf256.h"    // 256-bit floating point vectors, requires AVX instruction set
+#else
+  #include "vectorf256e.h"   // 256-bit floating point vectors, emulated
+#endif  //  INSTRSET >= 7
+#endif  //  MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+#if INSTRSET >= 9
+  #include "vectori512.h"    // 512-bit integer vectors, requires AVX512 instruction set
+  #include "vectorf512.h"    // 512-bit floating point vectors, requires AVX512 instruction set
+#else
+  #include "vectori512e.h"   // 512-bit integer vectors, emulated
+  #include "vectorf512e.h"   // 512-bit floating point vectors, emulated
+#endif  //  INSTRSET >= 9
+#endif  //  MAX_VECTOR_SIZE >= 512
+
+#endif  // INSTRSET < 2 
+
+#endif  // VECTORCLASS_H
diff --git a/vectorclass/vectorclass.pdf b/vectorclass/vectorclass.pdf
new file mode 100755
index 0000000..87bcdfc
Binary files /dev/null and b/vectorclass/vectorclass.pdf differ
diff --git a/vectorclass/vectorf128.h b/vectorclass/vectorf128.h
new file mode 100755
index 0000000..0c12ece
--- /dev/null
+++ b/vectorclass/vectorf128.h
@@ -0,0 +1,2619 @@
+/****************************  vectorf128.h   *******************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2014-10-24
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining floating point vector classes as interface to 
+* intrinsic functions in x86 microprocessors with SSE2 and later instruction
+* sets up to AVX.
+*
+* Instructions:
+* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired 
+* instruction set, which must be at least SSE2. Specify the supported 
+* instruction set by a command line define, e.g. __SSE4_1__ if the 
+* compiler does not automatically do so.
+*
+* The following vector classes are defined here:
+* Vec4f     Vector of 4 single precision floating point numbers
+* Vec4fb    Vector of 4 Booleans for use with Vec4f
+* Vec2d     Vector of 2 double precision floating point numbers
+* Vec2db    Vector of 2 Booleans for use with Vec2d
+*
+* Each vector object is represented internally in the CPU as a 128-bit register.
+* This header file defines operators and functions for these vectors.
+*
+* For example:
+* Vec2d a(1.0, 2.0), b(3.0, 4.0), c;
+* c = a + b;     // now c contains (4.0, 6.0)
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+#ifndef VECTORF128_H
+#define VECTORF128_H
+
+#include "vectori128.h"  // Define integer vectors
+
+
+
+/*****************************************************************************
+*
+*          select functions
+*
+*****************************************************************************/
+// Select between two __m128 sources, element by element. Used in various functions 
+// and operators. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are 
+// allowed. The implementation depends on the instruction set: 
+// If SSE4.1 is supported then only bit 31 in each dword of s is checked, 
+// otherwise all bits in s are used.
+static inline __m128 selectf (__m128 const & s, __m128 const & a, __m128 const & b) {
+#if INSTRSET < 5    // no SSE4.1: merge (s & a) | (~s & b) by hand
+    __m128 const from_a = _mm_and_ps(s, a);
+    __m128 const from_b = _mm_andnot_ps(s, b);
+    return _mm_or_ps(from_a, from_b);
+#else               // SSE4.1: one blend instruction picks a where s is set, else b
+    return _mm_blendv_ps(b, a, s);
+#endif
+}
+
+// Same, with two __m128d sources. Used in various functions
+// and operators. Corresponds to this pseudocode:
+// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other 
+// values are allowed. The implementation depends on the instruction set: 
+// If SSE4.1 is supported then only bit 63 in each qword of s is checked, 
+// otherwise all bits in s are used.
+static inline __m128d selectd (__m128d const & s, __m128d const & a, __m128d const & b) {
+#if INSTRSET < 5    // no SSE4.1: merge (s & a) | (~s & b) by hand
+    __m128d const from_a = _mm_and_pd(s, a);
+    __m128d const from_b = _mm_andnot_pd(s, b);
+    return _mm_or_pd(from_a, from_b);
+#else               // SSE4.1: one blend instruction picks a where s is set, else b
+    return _mm_blendv_pd(b, a, s);
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Vec4fb: Vector of 4 Booleans for use with Vec4f
+*
+*****************************************************************************/
+
+class Vec4fb {
+protected:
+    __m128 xmm; // four 32-bit lanes holding the Boolean elements
+public:
+    // Default constructor (xmm is left uninitialized):
+    Vec4fb() {
+    }
+    // Build from four individual Booleans; -(int)true is -1, i.e. all bits set:
+    Vec4fb(bool b0, bool b1, bool b2, bool b3)
+        : xmm(_mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3))) {
+    }
+    // Wrap a raw __m128 as used by the intrinsic functions:
+    Vec4fb(__m128 const & x)
+        : xmm(x) {
+    }
+    // Assign from a raw __m128 as used by the intrinsic functions:
+    Vec4fb & operator = (__m128 const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Broadcast one Boolean into all four lanes:
+    Vec4fb(bool b)
+        : xmm(_mm_castsi128_ps(_mm_set1_epi32(-int32_t(b)))) {
+    }
+    // Assign one Boolean to all four lanes:
+    Vec4fb & operator = (bool b) {
+        xmm = _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b)));
+        return *this;
+    }
+private: // Kept private to prevent constructing or assigning from int, etc.
+    Vec4fb(int b);
+    Vec4fb & operator = (int x);
+public:
+    // Reinterpret the integer Boolean vector Vec4ib as a float Boolean vector:
+    Vec4fb(Vec4ib const & x)
+        : xmm(_mm_castsi128_ps(x)) {
+    }
+    // Assign from the integer Boolean vector Vec4ib:
+    Vec4fb & operator = (Vec4ib const & x) {
+        xmm = _mm_castsi128_ps(x);
+        return *this;
+    }
+    // Implicit conversion to the raw __m128 used by the intrinsic functions
+    operator __m128() const {
+        return xmm;
+    }
+#if defined (__clang__) && CLANG_VERSION < 30900 || defined(__apple_build_version__)
+#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY  // clang 3.3 - 3.5 has silent conversion between intrinsic vector types. 
+                                          // I expected this to be fixed in version 3.4 but it still exists!
+                                          // http://llvm.org/bugs/show_bug.cgi?id=17164
+                                          // Problem: The version number is not consistent across platforms
+                                          // The Apple build has different version numbers. Too bad!
+                                          // http://llvm.org/bugs/show_bug.cgi?id=12643
+
+#else
+    // Type cast operator to convert to type Vec4ib used as Boolean for integer vectors
+    operator Vec4ib() const {
+        return _mm_castps_si128(xmm);
+    }
+#endif
+    // Change a single element; only the low two bits of index are used.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec4fb const & insert(uint32_t index, bool value) {
+        // Reading four dwords starting at maskl+4-(index & 3) places the single
+        // all-ones dword of this table in lane (index & 3) and zeros elsewhere
+        static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0};
+        __m128 const lane = _mm_loadu_ps((float const*)(maskl+4-(index & 3)));
+        // value == true sets the lane, value == false clears it
+        xmm = value
+            ? _mm_or_ps(xmm, lane)
+            : _mm_andnot_ps(lane, xmm);
+        return *this;
+    }
+    // Read one element; the bits are reinterpreted as Vec4ib, which does the extraction
+    bool extract(uint32_t index) const {
+        Vec4ib as_int(_mm_castps_si128(xmm));
+        return as_int.extract(index);
+    }
+    // Read-only element access; operator [] cannot be used to write an element.
+    bool operator [] (uint32_t index) const {
+        return this->extract(index);
+    }
+    static int size() {
+        return 4;       // number of elements
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec4fb
+*
+*****************************************************************************/
+
+// Element-wise AND of two Boolean vectors
+static inline Vec4fb operator & (Vec4fb const & a, Vec4fb const & b) {
+    return Vec4fb(_mm_and_ps(a, b));
+}
+static inline Vec4fb operator && (Vec4fb const & a, Vec4fb const & b) {
+    return Vec4fb(_mm_and_ps(a, b));   // same as a & b; both operands are always evaluated
+}
+
+// In-place AND: a becomes a & b
+static inline Vec4fb & operator &= (Vec4fb & a, Vec4fb const & b) {
+    Vec4fb const both = a & b;
+    return a = both;
+}
+
+// Element-wise OR of two Boolean vectors
+static inline Vec4fb operator | (Vec4fb const & a, Vec4fb const & b) {
+    return Vec4fb(_mm_or_ps(a, b));
+}
+static inline Vec4fb operator || (Vec4fb const & a, Vec4fb const & b) {
+    return Vec4fb(_mm_or_ps(a, b));    // same as a | b; both operands are always evaluated
+}
+
+// In-place OR: a becomes a | b
+static inline Vec4fb & operator |= (Vec4fb & a, Vec4fb const & b) {
+    Vec4fb const either = a | b;
+    return a = either;
+}
+
+// Element-wise XOR of two Boolean vectors
+static inline Vec4fb operator ^ (Vec4fb const & a, Vec4fb const & b) {
+    return Vec4fb(_mm_xor_ps(a, b));
+}
+
+// In-place XOR: a becomes a ^ b
+static inline Vec4fb & operator ^= (Vec4fb & a, Vec4fb const & b) {
+    Vec4fb const differing = a ^ b;
+    return a = differing;
+}
+
+// Bitwise complement: flips every bit of every element
+static inline Vec4fb operator ~ (Vec4fb const & a) {
+    __m128 const all_ones = _mm_castsi128_ps(_mm_set1_epi32(-1));
+    return Vec4fb(_mm_xor_ps(a, all_ones));
+}
+
+// Logical not, delegated to the integer Boolean vector Vec4ib.
+// Less efficient than operator ~; use only where not all bits in an element are the same.
+static inline Vec4fb operator ! (Vec4fb const & a) {
+    Vec4ib const as_int(a);
+    return Vec4fb( ! as_int);
+}
+
+// Functions for Vec4fb
+
+// andnot: a & ~ b  (the intrinsic complements its first argument, hence b comes first)
+static inline Vec4fb andnot(Vec4fb const & a, Vec4fb const & b) {
+    return Vec4fb(_mm_andnot_ps(b, a));
+}
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and: true if all bits are 1 (evaluated on the raw 128 bits via Vec128b)
+static inline bool horizontal_and (Vec4fb const & a) {
+    return horizontal_and(Vec128b(_mm_castps_si128(static_cast<__m128>(a))));
+}
+
+// horizontal_or: true if at least one bit is 1 (evaluated on the raw 128 bits via Vec128b)
+static inline bool horizontal_or (Vec4fb const & a) {
+    return horizontal_or(Vec128b(_mm_castps_si128(static_cast<__m128>(a))));
+}
+
+
+/*****************************************************************************
+*
+*          Vec2db: Vector of 2 Booleans for use with Vec2d
+*
+*****************************************************************************/
+
+class Vec2db {
+protected:
+    __m128d xmm; // two 64-bit lanes holding the Boolean elements
+public:
+    // Default constructor (xmm is left uninitialized):
+    Vec2db() {
+    }
+    // Constructor to build from all elements.
+    // Each bool is written to both 32-bit halves of its 64-bit element:
+    Vec2db(bool b0, bool b1) {
+        xmm = _mm_castsi128_pd(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1)); 
+    }
+    // Constructor to convert from type __m128d used in intrinsics:
+    Vec2db(__m128d const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128d used in intrinsics:
+    Vec2db & operator = (__m128d const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Constructor to broadcast scalar value (-int32_t(true) is -1, i.e. all bits set):
+    Vec2db(bool b) {
+        xmm = _mm_castsi128_pd(_mm_set1_epi32(-int32_t(b)));
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec2db & operator = (bool b) {
+        *this = Vec2db(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc.
+    Vec2db(int b);
+    Vec2db & operator = (int x);
+public:
+    // Constructor to convert from type Vec2qb used as Boolean for integer vectors
+    Vec2db(Vec2qb const & x) {
+        xmm = _mm_castsi128_pd(x);
+    }
+    // Assignment operator to convert from type Vec2qb used as Boolean for integer vectors
+    Vec2db & operator = (Vec2qb const & x) {
+        xmm = _mm_castsi128_pd(x);
+        return *this;
+    }
+    // Type cast operator to convert to __m128d used in intrinsics
+    operator __m128d() const {
+        return xmm;
+    }
+#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+    // Type cast operator to convert to type Vec2qb used as Boolean for integer vectors
+    operator Vec2qb() const {
+        return _mm_castpd_si128(xmm);
+    }
+#endif
+    // Member function to change a single element in vector; only bit 0 of index is used.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec2db const & insert(uint32_t index, bool value) {
+        static const int32_t maskl[8] = {0,0,0,0,-1,-1,0,0};
+        __m128 mask  = _mm_loadu_ps((float const*)(maskl+4-(index&1)*2)); // mask with FFFFFFFFFFFFFFFF at index position
+        if (value) {
+            xmm = _mm_or_pd(xmm,_mm_castps_pd(mask));      // set the selected element
+        }
+        else {
+            xmm = _mm_andnot_pd(_mm_castps_pd(mask),xmm);  // clear the selected element
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector (explicit cast, as in Vec4fb::extract)
+    bool extract(uint32_t index) const {
+        return Vec2qb(_mm_castpd_si128(xmm)).extract(index);
+    }
+    // Extract a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    static int size() {
+        return 2;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec2db
+*
+*****************************************************************************/
+
+// Element-wise AND of two Boolean vectors
+static inline Vec2db operator & (Vec2db const & a, Vec2db const & b) {
+    return Vec2db(_mm_and_pd(a, b));
+}
+static inline Vec2db operator && (Vec2db const & a, Vec2db const & b) {
+    return Vec2db(_mm_and_pd(a, b));   // same as a & b; both operands are always evaluated
+}
+
+// In-place AND: a becomes a & b
+static inline Vec2db & operator &= (Vec2db & a, Vec2db const & b) {
+    Vec2db const both = a & b;
+    return a = both;
+}
+
+// Element-wise OR of two Boolean vectors
+static inline Vec2db operator | (Vec2db const & a, Vec2db const & b) {
+    return Vec2db(_mm_or_pd(a, b));
+}
+static inline Vec2db operator || (Vec2db const & a, Vec2db const & b) {
+    return Vec2db(_mm_or_pd(a, b));    // same as a | b; both operands are always evaluated
+}
+
+// In-place OR: a becomes a | b
+static inline Vec2db & operator |= (Vec2db & a, Vec2db const & b) {
+    Vec2db const either = a | b;
+    return a = either;
+}
+
+// Element-wise XOR of two Boolean vectors
+static inline Vec2db operator ^ (Vec2db const & a, Vec2db const & b) {
+    return Vec2db(_mm_xor_pd(a, b));
+}
+
+// In-place XOR: a becomes a ^ b
+static inline Vec2db & operator ^= (Vec2db & a, Vec2db const & b) {
+    Vec2db const differing = a ^ b;
+    return a = differing;
+}
+
+// Bitwise complement: flips every bit of every element
+static inline Vec2db operator ~ (Vec2db const & a) {
+    __m128d const all_ones = _mm_castsi128_pd(_mm_set1_epi32(-1));
+    return Vec2db(_mm_xor_pd(a, all_ones));
+}
+
+// Logical not, delegated to the integer Boolean vector Vec2qb.
+// Less efficient than operator ~; use only where not all bits in an element are the same.
+static inline Vec2db operator ! (Vec2db const & a) {
+    Vec2qb const as_int(a);
+    return Vec2db( ! as_int);
+}
+
+// Functions for Vec2db
+
+// andnot: a & ~ b  (the intrinsic complements its first argument, hence b comes first)
+static inline Vec2db andnot(Vec2db const & a, Vec2db const & b) {
+    return Vec2db(_mm_andnot_pd(b, a));
+}
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and: true if all bits are 1 (evaluated on the raw 128 bits via Vec128b)
+static inline bool horizontal_and (Vec2db const & a) {
+    return horizontal_and(Vec128b(_mm_castpd_si128(static_cast<__m128d>(a))));
+}
+
+// horizontal_or: true if at least one bit is 1 (evaluated on the raw 128 bits via Vec128b)
+static inline bool horizontal_or (Vec2db const & a) {
+    return horizontal_or(Vec128b(_mm_castpd_si128(static_cast<__m128d>(a))));
+}
+
+
+
+/*****************************************************************************
+*
+*          Vec4f: Vector of 4 single precision floating point values
+*
+*****************************************************************************/
+
+class Vec4f {
+protected:
+    __m128 xmm; // Float vector
+public:
+    // Default constructor (xmm is left uninitialized):
+    Vec4f() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec4f(float f) {
+        xmm = _mm_set1_ps(f);
+    }
+    // Constructor to build from all elements:
+    Vec4f(float f0, float f1, float f2, float f3) {
+        xmm = _mm_setr_ps(f0, f1, f2, f3); 
+    }
+    // Constructor to convert from type __m128 used in intrinsics:
+    Vec4f(__m128 const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128 used in intrinsics:
+    Vec4f & operator = (__m128 const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m128 used in intrinsics
+    operator __m128() const {
+        return xmm;
+    }
+    // Member function to load from array (unaligned)
+    Vec4f & load(float const * p) {
+        xmm = _mm_loadu_ps(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 16
+    // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1,
+    // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 16.
+    Vec4f & load_a(float const * p) {
+        xmm = _mm_load_ps(p);
+        return *this;
+    }
+    // Member function to store into array (unaligned)
+    void store(float * p) const {
+        _mm_storeu_ps(p, xmm);
+    }
+    // Member function to store into array, aligned by 16
+    // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1,
+    // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 16.
+    void store_a(float * p) const {
+        _mm_store_ps(p, xmm);
+    }
+    // Partial load. Load n elements (n = 1..4) from p and set the rest to 0
+    Vec4f & load_partial(int n, float const * p) {
+        __m128 t1, t2;
+        switch (n) {
+        case 1:
+            xmm = _mm_load_ss(p); break;
+        case 2:                          // two floats are fetched as one 64-bit load
+            xmm = _mm_castpd_ps(_mm_load_sd((double const*)p)); break;
+        case 3:                          // low pair as one 64-bit load, third float separately
+            t1 = _mm_castpd_ps(_mm_load_sd((double const*)p));
+            t2 = _mm_load_ss(p + 2);
+            xmm = _mm_movelh_ps(t1, t2); break;
+        case 4:
+            load(p); break;
+        default:                         // any other n: nothing is read, result is all zero
+            xmm = _mm_setzero_ps();
+        }
+        return *this;
+    }
+    // Partial store. Store the first n elements (n = 1..4) to p
+    void store_partial(int n, float * p) const {
+        __m128 t1;
+        switch (n) {
+        case 1:
+            _mm_store_ss(p, xmm); break;
+        case 2:                          // two floats are written as one 64-bit store
+            _mm_store_sd((double*)p, _mm_castps_pd(xmm)); break;
+        case 3:                          // low pair as one 64-bit store, then element 2
+            _mm_store_sd((double*)p, _mm_castps_pd(xmm));
+            t1 = _mm_movehl_ps(xmm,xmm);
+            _mm_store_ss(p + 2, t1); break;
+        case 4:
+            store(p); break;
+        default:;                        // any other n: nothing is written
+        }
+    }
+    // cut off vector to n elements. The last 4-n elements are set to zero
+    Vec4f & cutoff(int n) {
+        if (uint32_t(n) >= 4) return *this;  // n >= 4 or negative n: vector is left unchanged
+        static const union {        
+            int32_t i[8];
+            float   f[8];
+        } mask = {{-1,-1,-1,-1,0,0,0,0}};    // window at offset 4-n has n all-ones dwords, then zeros
+        xmm = _mm_and_ps(xmm, Vec4f().load(mask.f + 4 - n));
+        return *this;
+    }
+    // Member function to change a single element in vector; only the low two bits of index are used.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec4f const & insert(uint32_t index, float value) {
+#if INSTRSET >= 5   // SSE4.1 supported
+        switch (index & 3) {             // the destination lane is encoded in bits 4-5 of the immediate
+        case 0:
+            xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 0 << 4);  break;
+        case 1:
+            xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 1 << 4);  break;
+        case 2:
+            xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 2 << 4);  break;
+        default:
+            xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 3 << 4);  break;
+        }
+#else
+        static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0};
+        __m128 broad = _mm_set1_ps(value);  // broadcast value into all elements
+        __m128 mask  = _mm_loadu_ps((float const*)(maskl+4-(index & 3))); // mask with FFFFFFFF at index position
+        xmm = selectf(mask,broad,xmm);
+#endif
+        return *this;
+    }
+    // Member function extract a single element from vector; only the low two bits of index are used
+    float extract(uint32_t index) const {
+        float x[4];
+        store(x);
+        return x[index & 3];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    float operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    static int size() {
+        return 4;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec4f
+*
+*****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec4f operator + (Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_add_ps(a, b));
+}
+
+// vector operator + : add a scalar to every element (the scalar is broadcast first)
+static inline Vec4f operator + (Vec4f const & a, float b) {
+    return Vec4f(_mm_add_ps(a, _mm_set1_ps(b)));
+}
+static inline Vec4f operator + (float a, Vec4f const & b) {
+    return Vec4f(_mm_add_ps(_mm_set1_ps(a), b));
+}
+
+// vector operator += : add b to a in place
+static inline Vec4f & operator += (Vec4f & a, Vec4f const & b) {
+    Vec4f const sum = a + b;
+    return a = sum;
+}
+
+// postfix operator ++ : add 1 to every element, return the previous value
+static inline Vec4f operator ++ (Vec4f & a, int) {
+    Vec4f const before(a);
+    a = before + 1.0f;
+    return before;
+}
+
+// prefix operator ++ : add 1 to every element, return the updated vector
+static inline Vec4f & operator ++ (Vec4f & a) {
+    a += Vec4f(1.0f);
+    return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec4f operator - (Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_sub_ps(a, b));
+}
+
+// vector operator - : subtract with one scalar operand (the scalar is broadcast first)
+static inline Vec4f operator - (Vec4f const & a, float b) {
+    return Vec4f(_mm_sub_ps(a, _mm_set1_ps(b)));
+}
+static inline Vec4f operator - (float a, Vec4f const & b) {
+    return Vec4f(_mm_sub_ps(_mm_set1_ps(a), b));
+}
+
+// vector operator - : unary minus. Flips the sign bit, even for 0, INF and NAN
+static inline Vec4f operator - (Vec4f const & a) {
+    __m128 const sign_bit = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return Vec4f(_mm_xor_ps(a, sign_bit));
+}
+
+// vector operator -= : subtract b from a in place
+static inline Vec4f & operator -= (Vec4f & a, Vec4f const & b) {
+    Vec4f const difference = a - b;
+    return a = difference;
+}
+
+// postfix operator -- : subtract 1 from every element, return the previous value
+static inline Vec4f operator -- (Vec4f & a, int) {
+    Vec4f const before(a);
+    a = before - 1.0f;
+    return before;
+}
+
+// prefix operator -- : subtract 1 from every element, return the updated vector
+static inline Vec4f & operator -- (Vec4f & a) {
+    a -= Vec4f(1.0f);
+    return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec4f operator * (Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_mul_ps(a, b));
+}
+
+// vector operator * : multiply every element by a scalar (the scalar is broadcast first)
+static inline Vec4f operator * (Vec4f const & a, float b) {
+    return Vec4f(_mm_mul_ps(a, _mm_set1_ps(b)));
+}
+static inline Vec4f operator * (float a, Vec4f const & b) {
+    return Vec4f(_mm_mul_ps(_mm_set1_ps(a), b));
+}
+
+// vector operator *= : multiply a by b in place
+static inline Vec4f & operator *= (Vec4f & a, Vec4f const & b) {
+    Vec4f const product = a * b;
+    return a = product;
+}
+
+// vector operator / : divide element by element
+static inline Vec4f operator / (Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_div_ps(a, b));
+}
+
+// vector operator / : divide with one scalar operand (the scalar is broadcast first)
+static inline Vec4f operator / (Vec4f const & a, float b) {
+    return Vec4f(_mm_div_ps(a, _mm_set1_ps(b)));
+}
+static inline Vec4f operator / (float a, Vec4f const & b) {
+    return Vec4f(_mm_div_ps(_mm_set1_ps(a), b));
+}
+
+// vector operator /= : divide a by b in place
+static inline Vec4f & operator /= (Vec4f & a, Vec4f const & b) {
+    Vec4f const quotient = a / b;
+    return a = quotient;
+}
+
+// vector operator == : element is true where a == b
+static inline Vec4fb operator == (Vec4f const & a, Vec4f const & b) {
+    return Vec4fb(_mm_cmpeq_ps(a, b));
+}
+
+// vector operator != : element is true where a != b
+static inline Vec4fb operator != (Vec4f const & a, Vec4f const & b) {
+    return Vec4fb(_mm_cmpneq_ps(a, b));
+}
+
+// vector operator < : element is true where a < b
+static inline Vec4fb operator < (Vec4f const & a, Vec4f const & b) {
+    return Vec4fb(_mm_cmplt_ps(a, b));
+}
+
+// vector operator <= : element is true where a <= b
+static inline Vec4fb operator <= (Vec4f const & a, Vec4f const & b) {
+    return Vec4fb(_mm_cmple_ps(a, b));
+}
+
+// vector operator > : element is true where a > b (evaluated as b < a)
+static inline Vec4fb operator > (Vec4f const & a, Vec4f const & b) {
+    return Vec4fb(_mm_cmplt_ps(b, a));
+}
+
+// vector operator >= : element is true where a >= b (evaluated as b <= a)
+static inline Vec4fb operator >= (Vec4f const & a, Vec4f const & b) {
+    return Vec4fb(_mm_cmple_ps(b, a));
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and of the raw float bits
+static inline Vec4f operator & (Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_and_ps(a, b));
+}
+
+// vector operator &= : bitwise and, in place
+static inline Vec4f & operator &= (Vec4f & a, Vec4f const & b) {
+    Vec4f const masked = a & b;
+    return a = masked;
+}
+
+// vector operator & : bitwise and of a Vec4f with a Boolean vector Vec4fb, in either order
+static inline Vec4f operator & (Vec4f const & a, Vec4fb const & b) {
+    return Vec4f(_mm_and_ps(a, b));
+}
+static inline Vec4f operator & (Vec4fb const & a, Vec4f const & b) {
+    return Vec4f(_mm_and_ps(a, b));
+}
+
+// vector operator | : bitwise or of the raw float bits
+static inline Vec4f operator | (Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_or_ps(a, b));
+}
+
+// vector operator |= : bitwise or, in place
+static inline Vec4f & operator |= (Vec4f & a, Vec4f const & b) {
+    Vec4f const merged = a | b;
+    return a = merged;
+}
+
+// vector operator ^ : bitwise xor of the raw float bits
+static inline Vec4f operator ^ (Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_xor_ps(a, b));
+}
+
+// vector operator ^= : bitwise xor, in place
+static inline Vec4f & operator ^= (Vec4f & a, Vec4f const & b) {
+    Vec4f const toggled = a ^ b;
+    return a = toggled;
+}
+
+// vector operator ! : logical not. Element is true where a compares equal to 0.0f
+static inline Vec4fb operator ! (Vec4f const & a) {
+    return Vec4fb(_mm_cmpeq_ps(a, _mm_set1_ps(0.0f)));
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec4f
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
+static inline Vec4f select (Vec4fb const & s, Vec4f const & a, Vec4f const & b) {
+    return Vec4f(selectf(s, a, b));
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i]  (b is masked with f, then added)
+static inline Vec4f if_add (Vec4fb const & f, Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_add_ps(a, _mm_and_ps(f, b)));
+}
+
+// Conditional multiply: result[i] = f[i] ? (a[i] * b[i]) : a[i]  (multiplies by 1 where f is false)
+static inline Vec4f if_mul (Vec4fb const & f, Vec4f const & a, Vec4f const & b) {
+    return a * select(f, b, Vec4f(1.f));
+}
+
+
+// General arithmetic functions, etc.
+
+// Horizontal add: returns the sum of all four vector elements as a scalar.
+static inline float horizontal_add (Vec4f const & a) {
+#if  INSTRSET < 3   // SSE2 only: add upper half onto lower half, then add the two partial sums
+    __m128 const upper  = _mm_movehl_ps(a,a);
+    __m128 const pairs  = _mm_add_ps(a,upper);
+    __m128 const second = _mm_shuffle_ps(pairs,pairs,1);
+    __m128 const total  = _mm_add_ss(pairs,second);
+    return _mm_cvtss_f32(total);
+#else               // SSE3: two horizontal pairwise adds
+    __m128 const pairs = _mm_hadd_ps(a,a);
+    __m128 const total = _mm_hadd_ps(pairs,pairs);
+    return _mm_cvtss_f32(total);        
+#endif
+}
+
+// function max: element-wise a > b ? a : b
+static inline Vec4f max(Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_max_ps(a, b));
+}
+
+// function min: element-wise a < b ? a : b
+static inline Vec4f min(Vec4f const & a, Vec4f const & b) {
+    return Vec4f(_mm_min_ps(a, b));
+}
+
+// function abs: absolute value
+// Clears the sign bit of every element, even for -0.0f, -INF and -NAN
+static inline Vec4f abs(Vec4f const & a) {
+    __m128 const sign_bit = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
+    return Vec4f(_mm_andnot_ps(sign_bit, a));   // ~sign_bit & a
+}
+
+// function sqrt: square root of every element
+static inline Vec4f sqrt(Vec4f const & a) {
+    return Vec4f(_mm_sqrt_ps(a));
+}
+
+// function square: a * a, element by element
+static inline Vec4f square(Vec4f const & a) {
+    return Vec4f(_mm_mul_ps(a, a));
+}
+
+// pow(vector,int) function template: square-and-multiply over the bits of |n|.
+// A negative n returns the reciprocal of x0^|n|. |n| is kept in an unsigned variable so that
+// n == INT_MIN needs no signed negation (-n overflows there, which is undefined behaviour).
+template <typename VTYPE>
+static inline VTYPE pow_template_i(VTYPE const & x0, int n) {
+    VTYPE x = x0;                      // a^(2^i)
+    VTYPE y(1.0f);                     // accumulator
+    unsigned int m = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;  // |n|
+    while (true) {                     // loop for each bit in m
+        if (m & 1) y *= x;             // multiply if bit = 1
+        m >>= 1;                       // get next bit of m
+        if (m == 0) break;             // finished
+        x *= x;                        // x = a^2, a^4, a^8, etc.
+    }
+    if (n < 0) return VTYPE(1.0f) / y; // reciprocal for negative exponents
+    return y;
+}
+
+// pow(Vec4f, int):
+// The primary template is only declared here. Its purpose is to prevent a
+// float exponent from being implicitly converted to int when
+// pow(vector, float) is called and vectormath_exp.h is not included
+
+template <typename TT> static Vec4f pow(Vec4f const & a, TT n);
+
+// Raise floating point numbers to integer power n (forwards to pow_template_i)
+template <>
+inline Vec4f pow<int>(Vec4f const & x0, int n) {
+    return pow_template_i(x0, n);
+}
+
+// allow an unsigned exponent: it is converted to int and handled the same way
+template <>
+inline Vec4f pow<uint32_t>(Vec4f const & x0, uint32_t n) {
+    return pow_template_i(x0, static_cast<int>(n));
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec4f pow_n(Vec4f const & a) {
+    if (n < 0)    return Vec4f(1.0f) / pow_n<-n>(a);  // negative exponent: reciprocal of a^(-n)
+    if (n == 0)   return Vec4f(1.0f);                 // a^0 is 1
+    if (n >= 256) return pow(a, n);                   // exponents beyond the unrolled range use pow(a, n)
+    Vec4f x = a;                       // a^(2^i)
+    Vec4f y;                           // accumulator
+    const int lowest = n - (n & (n-1));// lowest set bit in n
+    if (n & 1) y = x;                  // bit 0 set: y starts as a
+    if (n < 2) return y;
+    x = x*x;                           // x^2
+    if (n & 2) {                       // y is first assigned at the lowest set bit, multiplied afterwards
+        if (lowest == 2) y = x; else y *= x;
+    }
+    if (n < 4) return y;
+    x = x*x;                           // x^4
+    if (n & 4) {
+        if (lowest == 4) y = x; else y *= x;
+    }
+    if (n < 8) return y;
+    x = x*x;                           // x^8
+    if (n & 8) {
+        if (lowest == 8) y = x; else y *= x;
+    }
+    if (n < 16) return y;
+    x = x*x;                           // x^16
+    if (n & 16) {
+        if (lowest == 16) y = x; else y *= x;
+    }
+    if (n < 32) return y;
+    x = x*x;                           // x^32
+    if (n & 32) {
+        if (lowest == 32) y = x; else y *= x;
+    }
+    if (n < 64) return y;
+    x = x*x;                           // x^64
+    if (n & 64) {
+        if (lowest == 64) y = x; else y *= x;
+    }
+    if (n < 128) return y;
+    x = x*x;                           // x^128
+    if (n & 128) {
+        if (lowest == 128) y = x; else y *= x;
+    }
+    return y;
+}
+
+// pow(vector, const_int): the exponent is carried by the type Const_int_t<n>,
+// so the call is forwarded to pow_n<n>
+template <int n>
+static inline Vec4f pow(Vec4f const & a, Const_int_t<n>) {
+    Vec4f const result = pow_n<n>(a);
+    return result;
+}
+
+// implement the same as macro pow_const(vector, int)
+// n is parenthesized so that an exponent expression containing '>' is not
+// taken as the end of the template argument list
+#define pow_const(x,n) pow_n<(n)>(x)
+
+
+// avoid unsafe optimization in function round
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5
+static inline Vec4f round(Vec4f const & a) __attribute__ ((optimize("-fno-unsafe-math-optimizations")));
+#elif defined(__clang__) && INSTRSET < 5
+// static inline Vec4f round(Vec4f const & a) __attribute__ ((optnone));
+// This doesn't work, but current versions of Clang (3.5) don't optimize away signedmagic, even with -funsafe-math-optimizations
+// Add volatile to b if future versions fail
+// NOTE(review): in the next line && binds tighter than ||, so INSTRSET < 5 is only tested
+// for __INTEL_COMPILER and not for _MSC_VER - confirm that this is intended
+#elif defined (_MSC_VER) || defined(__INTEL_COMPILER) && INSTRSET < 5
+#pragma float_control(push)
+#pragma float_control(precise,on)
+#define FLOAT_CONTROL_PRECISE_FOR_ROUND
+#endif
+// function round: round to nearest integer (even). (result as float vector)
+// The SSE2 path rounds in whatever mode the MXCSR control register currently selects
+// (truncate, floor and ceil call round with a changed MXCSR).
+// Elements with abs(a) >= 2^23, and NAN, are returned unchanged.
+static inline Vec4f round(Vec4f const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    return _mm_round_ps(a, 0);
+#else // SSE2. Use magic number method
+    // Note: assume MXCSR control register is set to round to nearest
+    // (don't use conversion to int, it will limit the value to +/- 2^31)
+    Vec4f signmask    = _mm_castsi128_ps(constant4i<(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000>());  // -0.0
+    Vec4f magic       = _mm_castsi128_ps(constant4i<0x4B000000,0x4B000000,0x4B000000,0x4B000000>());  // magic number = 2^23
+    Vec4f sign        = _mm_and_ps(a, signmask);                                    // signbit of a
+    Vec4f signedmagic = _mm_or_ps(magic, sign);                                     // magic number with sign of a
+    // volatile
+    Vec4f b = a + signedmagic;                                                      // round by adding magic number
+    Vec4f r = b - signedmagic;                                                      // .. and subtracting it again
+    // The magic number method is only valid for abs(a) < 2^23. A float of 2^23 or more
+    // is already an integer, and a + signedmagic may then be inexact and round to a
+    // neighbouring value (8388609.0f would come back as 8388608.0f).
+    // Such elements are passed through unchanged.
+    __m128 in_range = _mm_cmplt_ps(_mm_andnot_ps(signmask, a), magic);              // abs(a) < 2^23 (false for NAN)
+    return _mm_or_ps(_mm_and_ps(in_range, r), _mm_andnot_ps(in_range, a));          // in_range ? r : a
+#endif
+}
+#ifdef FLOAT_CONTROL_PRECISE_FOR_ROUND
+#pragma float_control(pop)
+#endif
+
+// function truncate: round towards zero. (result as float vector)
+static inline Vec4f truncate(Vec4f const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    return _mm_round_ps(a, 3);
+#else  // SSE2. Run the magic number method of round() with MXCSR switched to truncation
+       // (conversion to int would limit the value to 2^31)
+    uint32_t const saved_csr = _mm_getcsr();      // MXCSR as set by the caller
+    uint32_t const trunc_csr = saved_csr | (3 << 13); // rounding control, bit 13-14 = 11
+    _mm_setcsr(trunc_csr);                        // change MXCSR
+    Vec4f const truncated = round(a);             // rounds in the mode just selected
+    _mm_setcsr(saved_csr);                        // restore MXCSR
+    return truncated;
+#endif
+}
+
+// function floor: round towards minus infinity. (result as float vector)
+static inline Vec4f floor(Vec4f const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    return _mm_round_ps(a, 1);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+    uint32_t t1 = _mm_getcsr();        // MXCSR
+    // Clear both rounding control bits before setting 01. A plain OR leaves a bit
+    // that is already set in place and could select 11 (truncate) instead.
+    uint32_t t2 = (t1 & ~(uint32_t)(3 << 13)) | (1 << 13); // bit 13-14 = 01
+    _mm_setcsr(t2);                    // change MXCSR
+    Vec4f r = round(a);                // use magic number method
+    _mm_setcsr(t1);                    // restore MXCSR
+    return r;
+#endif
+}
+
+// function ceil: round towards plus infinity. (result as float vector)
+static inline Vec4f ceil(Vec4f const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    return _mm_round_ps(a, 2);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+    uint32_t t1 = _mm_getcsr();        // MXCSR
+    // Clear both rounding control bits before setting 10. A plain OR leaves a bit
+    // that is already set in place and could select 11 (truncate) instead.
+    uint32_t t2 = (t1 & ~(uint32_t)(3 << 13)) | (2 << 13); // bit 13-14 = 10
+    _mm_setcsr(t2);                    // change MXCSR
+    Vec4f r = round(a);                // use magic number method
+    _mm_setcsr(t1);                    // restore MXCSR
+    return r;
+#endif
+}
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+// Note: assumes that the MXCSR rounding control is set to round to nearest
+static inline Vec4i round_to_int(Vec4f const & a) {
+    __m128i const rounded = _mm_cvtps_epi32(a);
+    return rounded;
+}
+
+// function truncate_to_int: convert to integer vector, rounding towards zero
+static inline Vec4i truncate_to_int(Vec4f const & a) {
+    __m128i const truncated = _mm_cvttps_epi32(a);
+    return truncated;
+}
+
+// function to_float: convert each 32-bit integer element to float
+static inline Vec4f to_float(Vec4i const & a) {
+    __m128 const converted = _mm_cvtepi32_ps(a);
+    return converted;
+}
+
+// Approximate math functions
+
+// approximate reciprocal 1/a, element by element
+// (Faster than 1.f / a. relative accuracy better than 2^-11)
+static inline Vec4f approx_recipr(Vec4f const & a) {
+    __m128 const estimate = _mm_rcp_ps(a);
+    return estimate;
+}
+
+// approximate reciprocal square root 1/sqrt(a), element by element
+// (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11)
+static inline Vec4f approx_rsqrt(Vec4f const & a) {
+    __m128 const estimate = _mm_rsqrt_ps(a);
+    return estimate;
+}
+
+// Fused multiply and add functions
+
+// Multiply and add: a * b + c
+// Uses a fused instruction when __FMA__ or __FMA4__ is defined
+static inline Vec4f mul_add(Vec4f const & a, Vec4f const & b, Vec4f const & c) {
+#if defined (__FMA__)
+    return _mm_fmadd_ps(a, b, c);
+#elif defined (__FMA4__)
+    return _mm_macc_ps(a, b, c);
+#else
+    Vec4f const product = a * b;       // separate multiply ..
+    return product + c;                // .. and add
+#endif
+}
+
+// Multiply and subtract: a * b - c
+// Uses a fused instruction when __FMA__ or __FMA4__ is defined
+static inline Vec4f mul_sub(Vec4f const & a, Vec4f const & b, Vec4f const & c) {
+#if defined (__FMA__)
+    return _mm_fmsub_ps(a, b, c);
+#elif defined (__FMA4__)
+    return _mm_msub_ps(a, b, c);
+#else
+    Vec4f const product = a * b;       // separate multiply ..
+    return product - c;                // .. and subtract
+#endif
+}
+
+// Multiply and inverse subtract: c - a * b
+// Uses a fused instruction when __FMA__ or __FMA4__ is defined
+static inline Vec4f nmul_add(Vec4f const & a, Vec4f const & b, Vec4f const & c) {
+#if defined (__FMA__)
+    return _mm_fnmadd_ps(a, b, c);
+#elif defined (__FMA4__)
+    return _mm_nmacc_ps(a, b, c);
+#else
+    Vec4f const product = a * b;       // separate multiply ..
+    return c - product;                // .. and subtract from c
+#endif
+}
+
+
+// Multiply and subtract with extra precision on the intermediate calculations, 
+// even if FMA instructions not supported, using Veltkamp-Dekker split.
+// Returns a * b - c. Without FMA, a and b are each split into a high part (low 12
+// bits of the 32-bit pattern cleared) and a low part (the remainder), c is subtracted
+// from the product of the high parts, and the remaining partial products are added last.
+static inline Vec4f mul_sub_x(Vec4f const & a, Vec4f const & b, Vec4f const & c) {
+#ifdef __FMA__
+    return _mm_fmsub_ps(a, b, c);
+#elif defined (__FMA4__)
+    return _mm_msub_ps(a, b, c);
+#else
+    // calculate a * b - c with extra precision
+    Vec4i upper_mask = -(1 << 12);                         // mask to remove lower 12 bits
+    Vec4f a_high = a & Vec4f(_mm_castsi128_ps(upper_mask));// split into high and low parts
+    Vec4f b_high = b & Vec4f(_mm_castsi128_ps(upper_mask));
+    Vec4f a_low  = a - a_high;                             // what the mask removed from a
+    Vec4f b_low  = b - b_high;                             // what the mask removed from b
+    Vec4f r1 = a_high * b_high;                            // this product is exact
+    Vec4f r2 = r1 - c;                                     // subtract c from high product
+    Vec4f r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product
+    // NOTE(review): the commented-out term below looks like a correction for the
+    // rounding error of r1 - c that has been left out - confirm before enabling it
+    return r3; // + ((r2 - r1) + c);
+#endif
+}
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+static inline Vec4i exponent(Vec4f const & a) {
+    Vec4ui const bits     = _mm_castps_si128(a);  // bit pattern of a as 32-bit integers
+    Vec4ui const unsigned_bits = bits << 1;       // sign bit shifted out at the top
+    Vec4ui const biased   = unsigned_bits >> 24;  // 8-bit exponent field moved down to bit 0
+    Vec4i  const unbiased = Vec4i(biased) - 0x7F; // remove the exponent bias
+    return unbiased;
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f 
+// Only the 23 fraction bits of a are kept; sign and exponent are replaced
+static inline Vec4f fraction(Vec4f const & a) {
+    Vec4ui const bits    = _mm_castps_si128(a);   // bit pattern of a as 32-bit integers
+    Vec4ui const rebased = Vec4ui((bits & 0x007FFFFF) | 0x3F800000); // fraction bits with exponent 0 + bias
+    return _mm_castsi128_ps(rebased);             // reinterpret as float
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+static inline Vec4f exp2(Vec4i const & n) {
+    Vec4i const lower_bounded = max(n, -0x7F);           // nothing below -127
+    Vec4i const clamped       = min(lower_bounded, 0x80);// nothing above +128
+    Vec4i const biased        = clamped + 0x7F;          // exponent field value, 0 .. 255
+    Vec4i const bits          = biased << 23;            // exponent field starts at bit 23
+    return _mm_castsi128_ps(bits);                       // reinterpret as float
+}
+//static Vec4f exp2(Vec4f const & x); // defined in vectormath_exp.h
+
+
+// Control word manipulation
+// ------------------------
+// The MXCSR control word has the following bits:
+//  0:    Invalid Operation Flag
+//  1:    Denormal Flag (=subnormal)
+//  2:    Divide-by-Zero Flag
+//  3:    Overflow Flag
+//  4:    Underflow Flag
+//  5:    Precision Flag
+//  6:    Denormals Are Zeros (=subnormals)
+//  7:    Invalid Operation Mask
+//  8:    Denormal Operation Mask (=subnormal)
+//  9:    Divide-by-Zero Mask
+// 10:    Overflow Mask
+// 11:    Underflow Mask
+// 12:    Precision Mask
+// 13-14: Rounding control
+//        00: round to nearest or even
+//        01: round down towards -infinity
+//        10: round up   towards +infinity
+//        11: round towards zero (truncate)
+// 15: Flush to Zero
+
+// Function get_control_word:
+// Returns the current value of the MXCSR control word
+static inline uint32_t get_control_word() {
+    uint32_t const mxcsr = _mm_getcsr();
+    return mxcsr;
+}
+
+// Function set_control_word:
+// Replaces the whole MXCSR control word with w
+static inline void set_control_word(uint32_t w) {
+    _mm_setcsr(w);
+    return;
+}
+
+// Function no_subnormals:
+// Set "Denormals Are Zeros" and "Flush to Zero" mode to avoid the extremely
+// time-consuming denormals in case of underflow.
+// All other bits of the MXCSR control word are left as they are
+static inline void no_subnormals() {
+    uint32_t const daz_and_ftz = (1 << 6) | (1 << 15);   // bit 6 = Denormals Are Zeros, bit 15 = Flush to Zero
+    set_control_word(get_control_word() | daz_and_ftz);
+}
+
+// Function reset_control_word:
+// Set the MXCSR control word to the default value 0x1F80.
+// This will mask floating point exceptions, set rounding mode to nearest (or even),
+// and allow denormals.
+static inline void reset_control_word() {
+    uint32_t const default_mxcsr = 0x1F80;   // bits 7-12 (the exception masks) set, all else clear
+    set_control_word(default_mxcsr);
+}
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec4f(-0.0f)) gives true, while Vec4f(-0.0f) < Vec4f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec4fb sign_bit(Vec4f const & a) {
+    Vec4i const bits   = _mm_castps_si128(a);     // bit pattern of a as 32-bit integers
+    Vec4i const spread = bits >> 31;              // sign bit copied into all 32 bits
+    return _mm_castsi128_ps(spread);              // reinterpret as 32-bit Boolean
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec4f sign_combine(Vec4f const & a, Vec4f const & b) {
+    Vec4f const signmask  = _mm_castsi128_ps(constant4i<(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000>());  // -0.0
+    Vec4f const sign_of_b = b & signmask;         // sign bit of b, all other bits zero
+    return a ^ sign_of_b;                         // flip the sign of a where it is set
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero, 
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec4fb is_finite(Vec4f const & a) {
+    Vec4i const bits    = _mm_castps_si128(a);    // bit pattern of a as 32-bit integers
+    Vec4i const no_sign = bits << 1;              // sign bit shifted out, exponent now in the top 8 bits
+    Vec4i const finite  = Vec4i(no_sign & 0xFF000000) != 0xFF000000; // exponent field is not all 1s
+    return Vec4ib(finite);
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec4fb is_inf(Vec4f const & a) {
+    Vec4i const bits    = _mm_castps_si128(a);    // bit pattern of a as 32-bit integers
+    Vec4i const no_sign = bits << 1;              // sign bit shifted out
+    return no_sign == Vec4i(0xFF000000);          // exponent is all 1s, fraction is 0
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec4fb is_nan(Vec4f const & a) {
+    Vec4i const bits     = _mm_castps_si128(a);   // bit pattern of a as 32-bit integers
+    Vec4i const no_sign  = bits << 1;             // sign bit shifted out
+    Vec4i const exp_mask = 0xFF000000;            // position of the exponent after the shift
+    Vec4i const exp_bits = no_sign & exp_mask;    // exponent
+    Vec4i const frac_bits = _mm_andnot_si128(exp_mask,no_sign); // fraction (everything but the exponent)
+    return Vec4ib((exp_bits == exp_mask) & (frac_bits != 0));   // exponent = all 1s and fraction != 0
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for normal numbers, zero, NAN and INF
+static inline Vec4fb is_subnormal(Vec4f const & a) {
+    Vec4i const bits     = _mm_castps_si128(a);   // bit pattern of a as 32-bit integers
+    Vec4i const no_sign  = bits << 1;             // sign bit shifted out
+    Vec4i const exp_mask = 0xFF000000;            // position of the exponent after the shift
+    Vec4i const exp_bits = no_sign & exp_mask;    // exponent
+    Vec4i const frac_bits = _mm_andnot_si128(exp_mask,no_sign); // fraction (everything but the exponent)
+    return Vec4ib((exp_bits == 0) & (frac_bits != 0));          // exponent = 0 and fraction != 0
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for normal numbers, NAN and INF
+static inline Vec4fb is_zero_or_subnormal(Vec4f const & a) {
+    Vec4i exp_bits = _mm_castps_si128(a);         // bit pattern of a as 32-bit integers
+    exp_bits &= 0x7F800000;                       // keep only the exponent field
+    return exp_bits == 0;                         // exponent = 0
+}
+
+// Function infinite4f: returns a vector where all elements are +INF
+static inline Vec4f infinite4f() {
+    __m128i const inf_bits = _mm_set1_epi32(0x7F800000);  // exponent all 1s, fraction 0, sign 0
+    return _mm_castsi128_ps(inf_bits);
+}
+
+// Function nan4f: returns a vector where all elements are NAN (quiet)
+// n is added to the bit pattern 0x7FC00000 of every element
+static inline Vec4f nan4f(int n = 0x10) {
+    __m128i const nan_bits = _mm_set1_epi32(0x7FC00000 + n);
+    return _mm_castsi128_ps(nan_bits);
+}
+
+
+/*****************************************************************************
+*
+*          Vector Vec4f permute and blend functions
+*
+******************************************************************************
+*
+* The permute function can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select. A negative index will generate zero.
+*
+* Example:
+* Vec4f a(10.f,11.f,12.f,13.f);        // a is (10,11,12,13)
+* Vec4f b, c;
+* b = permute4f<0,0,2,2>(a);           // b is (10,10,12,12)
+* c = permute4f<3,2,-1,-1>(a);         // c is (13,12, 0, 0)
+*
+*
+* The blend function can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where indexes 0 - 3 indicate an element from the first source
+* vector and indexes 4 - 7 indicate an element from the second source vector.
+* A negative index will generate zero.
+*
+*
+* Example:
+* Vec4f a(10.f,11.f,12.f,13.f);        // a is (10, 11, 12, 13)
+* Vec4f b(20.f,21.f,22.f,23.f);        // b is (20, 21, 22, 23)
+* Vec4f c;
+* c = blend4f<1,4,-1,7> (a,b);         // c is (11, 20,  0, 23)
+*
+* Don't worry about the complicated code for these functions. Most of the
+* code is resolved at compile time to generate only a few instructions.
+*****************************************************************************/
+
+// permute vector Vec4f
+// Result element k is a[ik & 3], or zero where zeroing is requested for ik.
+// All the conditions below depend only on the template parameters, so each
+// instantiation uses exactly one of the cases.
+// NOTE(review): do_zero also requires bit 7 of the OR of the indexes to be set, so
+// negative indexes with bit 7 clear (e.g. -256) do not request zeroing - presumably
+// a "don't care" marker; confirm
+template <int i0, int i1, int i2, int i3>
+static inline Vec4f permute4f(Vec4f const & a) {
+    // is shuffling needed
+    const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0) || (i2 != 2 && i2 >= 0) || (i3 != 3 && i3 >= 0);
+    // is zeroing needed
+    const bool do_zero    = (i0 | i1 | i2 | i3) < 0 && ((i0 | i1 | i2 | i3) & 0x80);
+
+    if (!do_shuffle && !do_zero) {
+        return a;                                          // trivial case: do nothing
+    }
+    if (do_zero && !do_shuffle) {                          // zeroing, not shuffling
+        if ((i0 & i1 & i2 & i3) < 0) return _mm_setzero_ps(); // zero everything
+        // zero some elements
+        __m128i mask1 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >(); // all 1s where the index is >= 0
+        return  _mm_and_ps(a,_mm_castsi128_ps(mask1));     // zero with AND mask
+    }
+    if (do_shuffle && !do_zero) {                          // shuffling, not zeroing        
+        return _mm_shuffle_ps(a, a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6);
+    }
+    // both shuffle and zero
+    if ((i0 & i1) < 0 && (i2 | i3) >= 0) {                 // zero low half, shuffle high half
+        return _mm_shuffle_ps(_mm_setzero_ps(), a, (i2&3)<<4 | (i3&3)<<6);
+    }
+    if ((i0 | i1) >= 0 && (i2 & i3) < 0) {                 // shuffle low half, zero high half
+        return _mm_shuffle_ps(a, _mm_setzero_ps(), (i0&3) | (i1&3)<<2);
+    }
+#if  INSTRSET >= 4  // SSSE3
+    // With SSSE3 we can do both with the PSHUFB instruction
+    const int j0 = (i0 & 3) << 2;                          // byte offset of the selected element
+    const int j1 = (i1 & 3) << 2;
+    const int j2 = (i2 & 3) << 2;
+    const int j3 = (i3 & 3) << 2;
+    // one byte index per result byte; -1 for all four bytes of an element with negative index
+    __m128i mask2 = constant4i <
+        i0 < 0 ? -1 : j0 | (j0+1)<<8 | (j0+2)<<16 | (j0+3) << 24,
+        i1 < 0 ? -1 : j1 | (j1+1)<<8 | (j1+2)<<16 | (j1+3) << 24,
+        i2 < 0 ? -1 : j2 | (j2+1)<<8 | (j2+2)<<16 | (j2+3) << 24,
+        i3 < 0 ? -1 : j3 | (j3+1)<<8 | (j3+2)<<16 | (j3+3) << 24 > ();
+    return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a),mask2));
+#else
+    // without SSSE3: shuffle first, then zero the elements with negative index
+    __m128 t1 = _mm_shuffle_ps(a, a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6); // shuffle
+    __m128i mask3 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >();
+    return _mm_and_ps(t1,_mm_castsi128_ps(mask3));     // zero with AND mask
+#endif
+}
+
+
+// blend vectors Vec4f
+// Indexes 0 - 3 select an element from a, indexes 4 - 7 select an element from b,
+// and a negative index gives zero. The special cases below are tried in order; all
+// the conditions depend only on the template parameters.
+template <int i0, int i1, int i2, int i3>
+static inline Vec4f blend4f(Vec4f const & a, Vec4f const & b) {
+
+    // Combine all the indexes into a single bitfield, with 8 bits for each
+    const int m1 = (i0&7) | (i1&7)<<8 | (i2&7)<<16 | (i3&7)<<24; 
+
+    // Mask to zero out negative indexes
+    const int m2 = (i0<0?0:0xFF) | (i1<0?0:0xFF)<<8 | (i2<0?0:0xFF)<<16 | (i3<0?0:0xFF)<<24;
+
+    if ((m1 & 0x04040404 & m2) == 0) {
+        // no elements from b
+        return permute4f<i0,i1,i2,i3>(a);
+    }
+    if (((m1^0x04040404) & 0x04040404 & m2) == 0) {
+        // no elements from a
+        return permute4f<i0&~4, i1&~4, i2&~4, i3&~4>(b);
+    }
+    if (((m1 & ~0x04040404) ^ 0x03020100) == 0 && m2 == -1) {
+        // selecting without shuffling or zeroing
+        __m128i sel = constant4i <i0 & 4 ? 0 : -1, i1 & 4 ? 0 : -1, i2 & 4 ? 0 : -1, i3 & 4 ? 0 : -1> ();
+        return selectf(_mm_castsi128_ps(sel), a, b);
+    }
+#ifdef __XOP__     // Use AMD XOP instruction PPERM
+    // (int) cast: 0x80808080 is an unsigned literal, which would otherwise make the
+    // template argument unsigned and out of range for the int parameter
+    __m128i maska = constant4i <
+        i0 < 0 ? (int)0x80808080 : (i0*4 & 31) + (((i0*4 & 31) + 1) << 8) + (((i0*4 & 31) + 2) << 16) + (((i0*4 & 31) + 3) << 24),
+        i1 < 0 ? (int)0x80808080 : (i1*4 & 31) + (((i1*4 & 31) + 1) << 8) + (((i1*4 & 31) + 2) << 16) + (((i1*4 & 31) + 3) << 24),
+        i2 < 0 ? (int)0x80808080 : (i2*4 & 31) + (((i2*4 & 31) + 1) << 8) + (((i2*4 & 31) + 2) << 16) + (((i2*4 & 31) + 3) << 24),
+        i3 < 0 ? (int)0x80808080 : (i3*4 & 31) + (((i3*4 & 31) + 1) << 8) + (((i3*4 & 31) + 2) << 16) + (((i3*4 & 31) + 3) << 24) > ();
+    return _mm_castsi128_ps(_mm_perm_epi8(_mm_castps_si128(a), _mm_castps_si128(b), maska));
+#else
+    if ((((m1 & ~0x04040404) ^ 0x03020100) & m2) == 0) {
+        // selecting and zeroing, not shuffling
+        __m128i sel1  = constant4i <i0 & 4 ? 0 : -1, i1 & 4 ? 0 : -1, i2 & 4 ? 0 : -1, i3 & 4 ? 0 : -1> ();
+        __m128i mask1 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >();
+        __m128 t1 = selectf(_mm_castsi128_ps(sel1), a, b);   // select
+        return  _mm_and_ps(t1, _mm_castsi128_ps(mask1));     // zero
+    }
+    // special cases unpckhps, unpcklps, shufps
+    Vec4f t;
+    if (((m1 ^ 0x05010400) & m2) == 0) {                     // indexes 0,4,1,5
+        t = _mm_unpacklo_ps(a, b);
+        goto DOZERO;
+    }
+    if (((m1 ^ 0x01050004) & m2) == 0) {                     // indexes 4,0,5,1
+        t = _mm_unpacklo_ps(b, a);
+        goto DOZERO;
+    }
+    if (((m1 ^ 0x07030602) & m2) == 0) {                     // indexes 2,6,3,7
+        t = _mm_unpackhi_ps(a, b);
+        goto DOZERO;
+    }
+    if (((m1 ^ 0x03070206) & m2) == 0) {                     // indexes 6,2,7,3
+        t = _mm_unpackhi_ps(b, a);
+        goto DOZERO;
+    }    
+    // first two elements from a, last two from b
+    if (((m1^0x04040000) & 0x04040404 & m2) == 0) {
+        t = _mm_shuffle_ps(a, b, (i0&3) + ((i1&3)<<2) + ((i2&3)<<4) + ((i3&3)<<6));
+        goto DOZERO;
+    } 
+    // first two elements from b, last two from a
+    if (((m1^0x00000404) & 0x04040404 & m2) == 0) {
+        t = _mm_shuffle_ps(b, a, (i0&3) + ((i1&3)<<2) + ((i2&3)<<4) + ((i3&3)<<6));
+        goto DOZERO;
+    }
+    {   // general case. combine two permutes
+        // a1 keeps the elements taken from a and b1 those taken from b; every other
+        // element gets index -1 in the respective permute, and the two are ORed together
+        __m128 a1 = permute4f <
+            (uint32_t)i0 < 4 ? i0 : -1,
+            (uint32_t)i1 < 4 ? i1 : -1,
+            (uint32_t)i2 < 4 ? i2 : -1,
+            (uint32_t)i3 < 4 ? i3 : -1  > (a);
+        __m128 b1 = permute4f <
+            (uint32_t)(i0^4) < 4 ? (i0^4) : -1,
+            (uint32_t)(i1^4) < 4 ? (i1^4) : -1,
+            (uint32_t)(i2^4) < 4 ? (i2^4) : -1,
+            (uint32_t)(i3^4) < 4 ? (i3^4) : -1  > (b);
+        return  _mm_or_ps(a1,b1);
+    }
+DOZERO:
+    // common exit of the special cases above: t holds the shuffled elements
+    if ((i0|i1|i2|i3) & 0x80) {
+        // zero some elements
+        __m128i mask1 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >();
+        t = _mm_and_ps(t,_mm_castsi128_ps(mask1));     // zero with AND mask
+    }        
+    return t;
+
+#endif // __XOP__
+}
+
+// change signs on vectors Vec4f
+// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
+// (any nonzero index changes the sign)
+template <int i0, int i1, int i2, int i3>
+static inline Vec4f change_sign(Vec4f const & a) {
+    if ((i0 | i1 | i2 | i3) == 0) return a;            // nothing to change
+    // (int) cast: 0x80000000 is an unsigned literal, which would otherwise make the
+    // template argument unsigned and out of range for the int parameter
+    __m128i mask = constant4i<i0 ? (int)0x80000000 : 0, i1 ? (int)0x80000000 : 0, i2 ? (int)0x80000000 : 0, i3 ? (int)0x80000000 : 0>();
+    return  _mm_xor_ps(a, _mm_castsi128_ps(mask));     // flip sign bits
+}
+
+
+/*****************************************************************************
+*
+*          Vec2d: Vector of 2 double precision floating point values
+*
+*****************************************************************************/
+
+// Wrapper around one __m128d register holding two doubles, element 0 in the low half.
+// Converts implicitly from and to __m128d, so it can be passed directly to intrinsics.
+class Vec2d {
+protected:
+    __m128d xmm; // double vector
+public:
+    // Default constructor (leaves the vector uninitialized):
+    Vec2d() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec2d(double d) {
+        xmm = _mm_set1_pd(d);
+    }
+    // Constructor to build from all elements:
+    Vec2d(double d0, double d1) {
+        xmm = _mm_setr_pd(d0, d1); 
+    }
+    // Constructor to convert from type __m128d used in intrinsics:
+    Vec2d(__m128d const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128d used in intrinsics:
+    Vec2d & operator = (__m128d const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m128d used in intrinsics
+    operator __m128d() const {
+        return xmm;
+    }
+    // Member function to load from array (unaligned)
+    Vec2d & load(double const * p) {
+        xmm = _mm_loadu_pd(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 16
+    // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1,
+    // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 16.
+    // Returns a non-const reference, the same as load
+    Vec2d & load_a(double const * p) {
+        xmm = _mm_load_pd(p);
+        return *this;
+    }
+    // Member function to store into array (unaligned)
+    void store(double * p) const {
+        _mm_storeu_pd(p, xmm);
+    }
+    // Member function to store into array, aligned by 16
+    // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1,
+    // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 16.
+    void store_a(double * p) const {
+        _mm_store_pd(p, xmm);
+    }
+    // Partial load. Load n elements and set the rest to 0
+    // Any n other than 1 or 2 gives an all-zero vector and does not read from p
+    Vec2d & load_partial(int n, double const * p) {
+        if (n == 1) {
+            xmm = _mm_load_sd(p);
+        }
+        else if (n == 2) {
+            load(p);
+        }
+        else {
+            xmm = _mm_setzero_pd();
+        }
+        return *this;
+    }
+    // Partial store. Store n elements
+    // Any n other than 1 or 2 stores nothing
+    void store_partial(int n, double * p) const {
+        if (n == 1) {
+            _mm_store_sd(p, xmm);
+        }
+        else if (n == 2) {
+            store(p);
+        }
+    }
+    // cut off vector to n elements. The last 2-n elements are set to zero
+    // (done through Vec4f::cutoff on the same bits, where each double counts as two floats)
+    Vec2d & cutoff(int n) {
+        xmm = _mm_castps_pd(Vec4f(_mm_castpd_ps(xmm)).cutoff(n*2));
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    // index 0 replaces element 0, any other index replaces element 1
+    Vec2d const & insert(uint32_t index, double value) {
+        __m128d v2 = _mm_set_sd(value);
+        if (index == 0) {
+            xmm = _mm_shuffle_pd(v2,xmm,2);    // low = value, high = old element 1
+        }
+        else {
+            xmm = _mm_shuffle_pd(xmm,v2,0);    // low = old element 0, high = value
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Only the lowest bit of index is used
+    double extract(uint32_t index) const {
+        double x[2];
+        store(x);
+        return x[index & 1];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    double operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Number of elements in the vector
+    static int size() {
+        return 2;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec2d
+*
+*****************************************************************************/
+
+// vector operator + : element-wise sum of two vectors
+static inline Vec2d operator + (Vec2d const & a, Vec2d const & b) {
+    __m128d const sum = _mm_add_pd(a, b);
+    return sum;
+}
+
+// vector operator + : add vector and scalar (the scalar is broadcast to both elements)
+static inline Vec2d operator + (Vec2d const & a, double b) {
+    Vec2d const rhs(b);
+    return a + rhs;
+}
+static inline Vec2d operator + (double a, Vec2d const & b) {
+    Vec2d const lhs(a);
+    return lhs + b;
+}
+
+// vector operator += : add b to a in place and return a
+static inline Vec2d & operator += (Vec2d & a, Vec2d const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1.0 to every element, return the value before the increment
+static inline Vec2d operator ++ (Vec2d & a, int) {
+    Vec2d const before = a;
+    a = a + 1.0;
+    return before;
+}
+
+// prefix operator ++ : add 1.0 to every element, return the incremented vector
+static inline Vec2d & operator ++ (Vec2d & a) {
+    return a = a + 1.0;
+}
+
+// vector operator - : element-wise difference of two vectors
+static inline Vec2d operator - (Vec2d const & a, Vec2d const & b) {
+    __m128d const difference = _mm_sub_pd(a, b);
+    return difference;
+}
+
+// vector operator - : subtract vector and scalar (the scalar is broadcast to both elements)
+static inline Vec2d operator - (Vec2d const & a, double b) {
+    Vec2d const rhs(b);
+    return a - rhs;
+}
+static inline Vec2d operator - (double a, Vec2d const & b) {
+    Vec2d const lhs(a);
+    return lhs - b;
+}
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec2d operator - (Vec2d const & a) {
+    // bit 63 of each double is the top bit of the upper 32-bit half
+    __m128d const signbits = _mm_castsi128_pd(_mm_setr_epi32(0,0x80000000,0,0x80000000));
+    return _mm_xor_pd(a, signbits);
+}
+
+// vector operator -= : subtract b from a in place and return a
+static inline Vec2d & operator -= (Vec2d & a, Vec2d const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1.0 from every element, return the value before the decrement
+static inline Vec2d operator -- (Vec2d & a, int) {
+    Vec2d const before = a;
+    a = a - 1.0;
+    return before;
+}
+
+// prefix operator -- : subtract 1.0 from every element, return the decremented vector
+static inline Vec2d & operator -- (Vec2d & a) {
+    return a = a - 1.0;
+}
+
+// vector operator * : element-wise product of two vectors
+static inline Vec2d operator * (Vec2d const & a, Vec2d const & b) {
+    __m128d const product = _mm_mul_pd(a, b);
+    return product;
+}
+
+// vector operator * : multiply vector and scalar (the scalar is broadcast to both elements)
+static inline Vec2d operator * (Vec2d const & a, double b) {
+    Vec2d const rhs(b);
+    return a * rhs;
+}
+static inline Vec2d operator * (double a, Vec2d const & b) {
+    Vec2d const lhs(a);
+    return lhs * b;
+}
+
+// vector operator *= : multiply a by b in place and return a
+static inline Vec2d & operator *= (Vec2d & a, Vec2d const & b) {
+    return a = a * b;
+}
+
+// vector operator / : divide element by element
+static inline Vec2d operator / (Vec2d const & a, Vec2d const & b) {
+    __m128d const quotient = _mm_div_pd(a, b);
+    return quotient;
+}
+
+// vector operator / : divide vector and scalar (the scalar is broadcast to both elements)
+static inline Vec2d operator / (Vec2d const & a, double b) {
+    Vec2d const rhs(b);
+    return a / rhs;
+}
+static inline Vec2d operator / (double a, Vec2d const & b) {
+    Vec2d const lhs(a);
+    return lhs / b;
+}
+
+// vector operator /= : divide a by b in place and return a
+static inline Vec2d & operator /= (Vec2d & a, Vec2d const & b) {
+    return a = a / b;
+}
+
+// vector operator == : true for each element where a == b
+static inline Vec2db operator == (Vec2d const & a, Vec2d const & b) {
+    __m128d const mask = _mm_cmpeq_pd(a, b);
+    return mask;
+}
+
+// vector operator != : true for each element where a != b
+static inline Vec2db operator != (Vec2d const & a, Vec2d const & b) {
+    __m128d const mask = _mm_cmpneq_pd(a, b);
+    return mask;
+}
+
+// vector operator < : true for each element where a < b
+static inline Vec2db operator < (Vec2d const & a, Vec2d const & b) {
+    __m128d const mask = _mm_cmplt_pd(a, b);
+    return mask;
+}
+
+// vector operator <= : true for each element where a <= b
+static inline Vec2db operator <= (Vec2d const & a, Vec2d const & b) {
+    __m128d const mask = _mm_cmple_pd(a, b);
+    return mask;
+}
+
+// vector operator > : true for each element where a > b
+// (evaluated as b < a)
+static inline Vec2db operator > (Vec2d const & a, Vec2d const & b) {
+    Vec2db const swapped_less = b < a;
+    return swapped_less;
+}
+
+// vector operator >= : true for each element where a >= b
+// (evaluated as b <= a)
+static inline Vec2db operator >= (Vec2d const & a, Vec2d const & b) {
+    Vec2db const swapped_less_equal = b <= a;
+    return swapped_less_equal;
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and of the two 128-bit patterns
+static inline Vec2d operator & (Vec2d const & a, Vec2d const & b) {
+    __m128d const bits = _mm_and_pd(a, b);
+    return bits;
+}
+
+// vector operator &= : bitwise and in place, returns a
+static inline Vec2d & operator &= (Vec2d & a, Vec2d const & b) {
+    return a = a & b;
+}
+
+// vector operator & : bitwise and of Vec2d and Vec2db, in either order
+static inline Vec2d operator & (Vec2d const & a, Vec2db const & b) {
+    __m128d const bits = _mm_and_pd(a, b);
+    return bits;
+}
+static inline Vec2d operator & (Vec2db const & a, Vec2d const & b) {
+    __m128d const bits = _mm_and_pd(a, b);
+    return bits;
+}
+
+// vector operator | : bitwise or of the two 128-bit patterns
+static inline Vec2d operator | (Vec2d const & a, Vec2d const & b) {
+    __m128d const bits = _mm_or_pd(a, b);
+    return bits;
+}
+
+// vector operator |= : bitwise or in place, returns a
+static inline Vec2d & operator |= (Vec2d & a, Vec2d const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor of the two 128-bit patterns
+static inline Vec2d operator ^ (Vec2d const & a, Vec2d const & b) {
+    __m128d const bits = _mm_xor_pd(a, b);
+    return bits;
+}
+
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec2d & operator ^= (Vec2d & a, Vec2d const & b) {
+    // a ^ b yields a Vec2d; assigning it hands back a reference to a
+    return a = a ^ b;
+}
+
+// vector operator ! : logical not. Returns Boolean vector, true where the element compares equal to 0.0
+static inline Vec2db operator ! (Vec2d const & a) {
+    return _mm_cmpeq_pd(a, _mm_setzero_pd());
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec2d
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
+// Each 64-bit element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
+// No other values are allowed.
+static inline Vec2d select (Vec2db const & s, Vec2d const & a, Vec2d const & b) {
+    return selectd(s,a,b);   // selectd is defined outside this part of the file
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i]; b is masked with f before the addition
+static inline Vec2d if_add (Vec2db const & f, Vec2d const & a, Vec2d const & b) {
+    return a + Vec2d(_mm_and_pd(f, b));
+}
+
+// Conditional multiply: result[i] = f[i] ? (a[i] * b[i]) : a[i]; unselected elements are multiplied by 1.0
+static inline Vec2d if_mul (Vec2db const & f, Vec2d const & a, Vec2d const & b) {
+    return a * select(f, b, Vec2d(1.));
+}
+
+
+// General arithmetic functions, etc.
+
+// Horizontal add: returns the sum of the two elements of a.
+static inline double horizontal_add (Vec2d const & a) {
+#if  INSTRSET >= 3  // SSE3
+    __m128d pair_sum = _mm_hadd_pd(a,a);                          // a[0]+a[1]
+    return _mm_cvtsd_f64(pair_sum);
+#else
+    __m128  as_float = _mm_castpd_ps(a);
+    __m128d upper    = _mm_castps_pd(_mm_movehl_ps(as_float,as_float)); // a[1] brought down to the low position
+    __m128d total    = _mm_add_sd(a,upper);                        // a[0]+a[1] in the low element
+    return _mm_cvtsd_f64(total);
+#endif
+}
+
+// function max: element-wise a > b ? a : b
+static inline Vec2d max(Vec2d const & a, Vec2d const & b) {
+    return Vec2d(_mm_max_pd(a,b));
+}
+
+// function min: element-wise a < b ? a : b
+static inline Vec2d min(Vec2d const & a, Vec2d const & b) {
+    return Vec2d(_mm_min_pd(a,b));
+}
+
+// function abs: absolute value
+// Clears the sign bit, also for -0.0, -INF and -NAN
+static inline Vec2d abs(Vec2d const & a) {
+    __m128d keep_bits = _mm_castsi128_pd(_mm_setr_epi32(-1,0x7FFFFFFF,-1,0x7FFFFFFF)); // every bit except bit 63
+    return _mm_and_pd(a,keep_bits);
+}
+
+// function sqrt: element-wise square root
+static inline Vec2d sqrt(Vec2d const & a) {
+    return Vec2d(_mm_sqrt_pd(a));
+}
+
+// function square: element-wise a * a
+static inline Vec2d square(Vec2d const & a) {
+    return a * a;   // uses the Vec2d operator * defined earlier in the file
+}
+
+// pow(Vec2d, n): primary template, declared here without a body.
+// The purpose of this template is to prevent implicit conversion of a float
+// exponent to int when calling pow(vector, float) and vectormath_exp.h is
+// not included
+
+template <typename TT> static Vec2d pow(Vec2d const & a, TT n);
+
+// Raise floating point numbers to integer power n (specialization for int)
+template <>
+inline Vec2d pow<int>(Vec2d const & x0, int n) {
+    return pow_template_i<Vec2d>(x0, n);        // pow_template_i is defined outside this part of the file
+}
+
+// allow conversion from unsigned int: n is cast to int before forwarding
+template <>
+inline Vec2d pow<uint32_t>(Vec2d const & x0, uint32_t n) {
+    return pow_template_i<Vec2d>(x0, (int)n);
+}
+
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant.
+template <int n>
+static inline Vec2d pow_n(Vec2d const & a) {
+    if (n < 0)    return Vec2d(1.0) / pow_n<-n>(a);        // negative power: reciprocal of the positive power
+    if (n == 0)   return Vec2d(1.0);
+    if (n >= 256) return pow(a, n);                        // too large for the unrolled code below: use pow
+    Vec2d x = a;                       // a^(2^i), squared once per step
+    Vec2d y;                           // accumulator, first assigned at the lowest set bit of n
+    const int lowest = n - (n & (n-1));// lowest set bit in n (n & (n-1) clears that bit)
+    if (n & 1) y = x;
+    if (n < 2) return y;
+    x = x*x;                           // x^2
+    if (n & 2) {
+        if (lowest == 2) y = x; else y *= x;   // first contribution initializes y, later ones multiply
+    }
+    if (n < 4) return y;
+    x = x*x;                           // x^4
+    if (n & 4) {
+        if (lowest == 4) y = x; else y *= x;
+    }
+    if (n < 8) return y;
+    x = x*x;                           // x^8
+    if (n & 8) {
+        if (lowest == 8) y = x; else y *= x;
+    }
+    if (n < 16) return y;
+    x = x*x;                           // x^16
+    if (n & 16) {
+        if (lowest == 16) y = x; else y *= x;
+    }
+    if (n < 32) return y;
+    x = x*x;                           // x^32
+    if (n & 32) {
+        if (lowest == 32) y = x; else y *= x;
+    }
+    if (n < 64) return y;
+    x = x*x;                           // x^64
+    if (n & 64) {
+        if (lowest == 64) y = x; else y *= x;
+    }
+    if (n < 128) return y;
+    x = x*x;                           // x^128
+    if (n & 128) {
+        if (lowest == 128) y = x; else y *= x;
+    }
+    return y;
+}
+
+template <int n>
+static inline Vec2d pow(Vec2d const & a, Const_int_t<n>) {
+    return pow_n<n>(a);   // exponent comes from the type of the unnamed second argument
+}
+
+
+// avoid unsafe optimization in function round (the attribute must be on the Vec2d overload defined below)
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5
+static inline Vec2d round(Vec2d const & a) __attribute__ ((optimize("-fno-unsafe-math-optimizations")));
+#elif defined (FLOAT_CONTROL_PRECISE_FOR_ROUND)
+#pragma float_control(push)
+#pragma float_control(precise,on)
+#endif
+// function round: round to nearest integer (even). (result as double vector)
+static inline Vec2d round(Vec2d const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    return _mm_round_pd(a, 0);
+#else // SSE2. Use magic number method
+    // Note: assume MXCSR control register is set to round-to-nearest
+    // (don't use conversion to int, it will limit the value to +/- 2^31)
+    Vec2d signmask    = _mm_castsi128_pd(constant4i<0,(int)0x80000000,0,(int)0x80000000>());  // -0.0
+    Vec2d magic       = _mm_castsi128_pd(constant4i<0,0x43300000,0,0x43300000>());  // magic number = 2^52
+    Vec2d sign        = _mm_and_pd(a, signmask);                                    // signbit of a
+    Vec2d signedmagic = _mm_or_pd(magic, sign);                                     // magic number with sign of a
+    return a + signedmagic - signedmagic;                                           // add then subtract; must not be simplified to a
+#endif
+}
+#if defined (FLOAT_CONTROL_PRECISE_FOR_ROUND)
+#pragma float_control(pop)
+#endif
+
+// function truncate: round towards zero. (result as double vector)
+static inline Vec2d truncate(Vec2d const & a) {
+// (note: may fail on MS Visual Studio 2008, works in later versions)
+#if INSTRSET >= 5   // SSE4.1 supported
+    return _mm_round_pd(a, 3);
+#else  // SSE2. Magic number method under truncation mode (conversion to int would limit the value to 2^31)
+    uint32_t saved_csr = _mm_getcsr();            // MXCSR as found on entry
+    uint32_t trunc_csr = saved_csr | (3 << 13);   // bit 13-14 = 11
+    _mm_setcsr(trunc_csr);                        // switch MXCSR
+    Vec2d truncated = round(a);                   // magic number method, now using the mode just set
+    _mm_setcsr(saved_csr);                        // put the entry value of MXCSR back
+    return truncated;
+#endif
+}
+
+// function floor: round towards minus infinity. (result as double vector)
+// (note: may fail on MS Visual Studio 2008, works in later versions)
+static inline Vec2d floor(Vec2d const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    return _mm_round_pd(a, 1);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+    uint32_t t1 = _mm_getcsr();                    // MXCSR
+    uint32_t t2 = (t1 & ~(3u << 13)) | (1u << 13); // clear bits 13-14 first so any incoming mode becomes 01
+    _mm_setcsr(t2);                                // change MXCSR
+    Vec2d r = round(a);                            // use magic number method
+    _mm_setcsr(t1);                                // restore MXCSR
+    return r;
+#endif
+}
+
+// function ceil: round towards plus infinity. (result as double vector)
+static inline Vec2d ceil(Vec2d const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    return _mm_round_pd(a, 2);
+#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
+    uint32_t t1 = _mm_getcsr();                    // MXCSR
+    uint32_t t2 = (t1 & ~(3u << 13)) | (2u << 13); // clear bits 13-14 first so any incoming mode becomes 10
+    _mm_setcsr(t2);                                // change MXCSR
+    Vec2d r = round(a);                            // use magic number method
+    _mm_setcsr(t1);                                // restore MXCSR
+    return r;
+#endif
+}
+
+// function truncate_to_int: round towards zero; a and b are packed into one 32-bit integer vector.
+static inline Vec4i truncate_to_int(Vec2d const & a, Vec2d const & b) {
+    Vec4i from_a = _mm_cvttpd_epi32(a);
+    Vec4i from_b = _mm_cvttpd_epi32(b);
+    return blend4i<0,1,4,5> (from_a, from_b);   // elements 0-1 of from_a followed by elements 0-1 of from_b
+}
+
+// function truncate_to_int64: round towards zero, one scalar conversion per element. (inefficient)
+static inline Vec2q truncate_to_int64(Vec2d const & a) {
+    double lanes[2];
+    a.store(lanes);
+    return Vec2q(static_cast<int64_t>(lanes[0]), static_cast<int64_t>(lanes[1]));
+}
+
+// function truncate_to_int64_limited: round towards zero.
+// result as 64-bit integer vector, but with limited range
+static inline Vec2q truncate_to_int64_limited(Vec2d const & a) {
+    // Note: the conversion below truncates; the range is limited by its 32-bit intermediate result
+    Vec4i t1 = _mm_cvttpd_epi32(a);
+    return extend_low(t1);             // widen the two low 32-bit results to 64 bits
+}
+
+// function round_to_int: round to nearest integer (even).
+// a and b are packed into one 32-bit integer vector
+static inline Vec4i round_to_int(Vec2d const & a, Vec2d const & b) {
+    // Note: assume MXCSR control register is set to rounding
+    Vec4i from_a = _mm_cvtpd_epi32(a);
+    Vec4i from_b = _mm_cvtpd_epi32(b);
+    return blend4i<0,1,4,5> (from_a, from_b);   // elements 0-1 of from_a followed by elements 0-1 of from_b
+}
+// function round_to_int: round to nearest integer (even).
+// result as 32-bit integer vector. Upper two values of result are 0
+static inline Vec4i round_to_int(Vec2d const & a) {
+    // single-operand form: no second vector to blend in
+    return Vec4i(_mm_cvtpd_epi32(a));
+}
+
+// function round_to_int64: round to nearest or even. (inefficient)
+static inline Vec2q round_to_int64(Vec2d const & a) {
+    return truncate_to_int64(round(a));   // round in floating point first, then convert element by element
+}
+
+// function round_to_int64_limited: round to nearest integer (even)
+// result as 64-bit integer vector, but with limited range
+static inline Vec2q round_to_int64_limited(Vec2d const & a) {
+    // Note: assume MXCSR control register is set to rounding
+    Vec4i t1 = _mm_cvtpd_epi32(a);     // 32-bit intermediate result limits the range
+    return extend_low(t1);             // widen the two low 32-bit results to 64 bits
+}
+
+// function to_double: convert 64-bit integer elements to double, one scalar conversion each (inefficient)
+static inline Vec2d to_double(Vec2q const & a) {
+    int64_t lanes[2];
+    a.store(lanes);
+    return Vec2d(static_cast<double>(lanes[0]), static_cast<double>(lanes[1]));
+}
+
+// function to_double_limited: convert integer vector elements to double vector
+// only valid for abs(x) < 2^31
+static inline Vec2d to_double_limited(Vec2q const & x) {
+    Vec4i low_words = permute4i<0,2,-256,-256>(Vec4i(x));   // 32-bit elements 0 and 2 moved to positions 0 and 1
+    return _mm_cvtepi32_pd(low_words);
+}
+
+// function to_double_low: convert 32-bit integer elements [0] and [1] to a double vector
+static inline Vec2d to_double_low(Vec4i const & a) {
+    return Vec2d(_mm_cvtepi32_pd(a));
+}
+
+// function to_double_high: convert 32-bit integer elements [2] and [3] to a double vector
+static inline Vec2d to_double_high(Vec4i const & a) {
+    return _mm_cvtepi32_pd(_mm_srli_si128(a,8));   // byte shift by 8 brings elements 2-3 down to 0-1
+}
+
+// function compress: convert two Vec2d to one Vec4f (low fills elements 0-1, high fills elements 2-3)
+static inline Vec4f compress (Vec2d const & low, Vec2d const & high) {
+    Vec4f low_ps  = _mm_cvtpd_ps(low);
+    Vec4f high_ps = _mm_cvtpd_ps(high);
+    return blend4f<0,1,4,5> (low_ps, high_ps);
+}
+
+// Function extend_low : widen Vec4f elements [0] and [1] to double precision
+static inline Vec2d extend_low (Vec4f const & a) {
+    return Vec2d(_mm_cvtps_pd(a));
+}
+
+// Function extend_high : widen Vec4f elements [2] and [3] to double precision
+static inline Vec2d extend_high (Vec4f const & a) {
+    return Vec2d(_mm_cvtps_pd(_mm_movehl_ps(a,a)));   // movehl brings the upper pair down first
+}
+
+
+// Fused multiply and add functions
+
+// Multiply and add: a * b + c, using an FMA3 or FMA4 intrinsic when enabled at compile time
+static inline Vec2d mul_add(Vec2d const & a, Vec2d const & b, Vec2d const & c) {
+#if defined (__FMA__)
+    return Vec2d(_mm_fmadd_pd(a, b, c));
+#elif defined (__FMA4__)
+    return Vec2d(_mm_macc_pd(a, b, c));
+#else
+    return (a * b) + c;
+#endif
+}
+
+// Multiply and subtract: a * b - c, using an FMA3 or FMA4 intrinsic when enabled at compile time
+static inline Vec2d mul_sub(Vec2d const & a, Vec2d const & b, Vec2d const & c) {
+#if defined (__FMA__)
+    return Vec2d(_mm_fmsub_pd(a, b, c));
+#elif defined (__FMA4__)
+    return Vec2d(_mm_msub_pd(a, b, c));
+#else
+    return (a * b) - c;
+#endif
+}
+
+// Multiply and inverse subtract: c - a * b, using an FMA3 or FMA4 intrinsic when enabled at compile time
+static inline Vec2d nmul_add(Vec2d const & a, Vec2d const & b, Vec2d const & c) {
+#if defined (__FMA__)
+    return Vec2d(_mm_fnmadd_pd(a, b, c));
+#elif defined (__FMA4__)
+    return Vec2d(_mm_nmacc_pd(a, b, c));
+#else
+    return c - (a * b);
+#endif
+}
+
+
+// Multiply and subtract (a * b - c) with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec2d mul_sub_x(Vec2d const & a, Vec2d const & b, Vec2d const & c) {
+#ifdef __FMA__
+    return _mm_fmsub_pd(a, b, c);
+#elif defined (__FMA4__)
+    return _mm_msub_pd(a, b, c);
+#else
+    // calculate a * b - c with extra precision
+    Vec2q upper_mask = -(1LL << 27);                       // all bits set except the lower 27
+    Vec2d a_high = a & Vec2d(_mm_castsi128_pd(upper_mask));// split into high and low parts
+    Vec2d b_high = b & Vec2d(_mm_castsi128_pd(upper_mask));
+    Vec2d a_low  = a - a_high;                             // remainder held in the masked-off bits
+    Vec2d b_low  = b - b_high;
+    Vec2d r1 = a_high * b_high;                            // this product is exact
+    Vec2d r2 = r1 - c;                                     // subtract c from high product
+    Vec2d r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product
+    return r3; // + ((r2 - r1) + c);   (this extra correction term is commented out)
+#endif
+}
+
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+static inline Vec2q exponent(Vec2d const & a) {
+    Vec2uq bits     = _mm_castpd_si128(a);   // raw 64-bit pattern of each element
+    Vec2uq no_sign  = bits << 1;             // sign bit shifted out at the top
+    Vec2uq biased   = no_sign >> 53;         // logical shift leaves the exponent field at bit 0
+    Vec2q  unbiased = Vec2q(biased) - 0x3FF; // remove the bias
+    return unbiased;
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25
+static inline Vec2d fraction(Vec2d const & a) {
+    Vec2uq bits    = _mm_castpd_si128(a);   // raw 64-bit pattern of each element
+    Vec2uq rebased = Vec2uq((bits & 0x000FFFFFFFFFFFFFll) | 0x3FF0000000000000ll); // keep mantissa, exponent field = bias
+    return _mm_castsi128_pd(rebased);
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+static inline Vec2d exp2(Vec2q const & n) {
+    Vec2q at_least = max(n,  -0x3FF);      // clamp from below
+    Vec2q clamped  = min(at_least,  0x400);// clamp from above
+    Vec2q biased   = clamped + 0x3FF;      // exponent field value, 0 .. 0x7FF
+    Vec2q pattern  = biased << 52;         // exponent field starts at bit 52
+    return _mm_castsi128_pd(pattern);      // reinterpret the bit pattern as double
+}
+//static Vec2d exp2(Vec2d const & x); // defined in vectormath_exp.h
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec2d(-0.0)) gives true, while Vec2d(-0.0) < Vec2d(0.0) gives false
+static inline Vec2db sign_bit(Vec2d const & a) {
+    Vec2q bits   = _mm_castpd_si128(a);    // raw 64-bit pattern of each element
+    Vec2q spread = bits >> 63;             // signed shift copies bit 63 into every bit
+    return _mm_castsi128_pd(spread);       // reinterpret as 64-bit Boolean
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec2d sign_combine(Vec2d const & a, Vec2d const & b) {
+    Vec2d sign_only = _mm_castsi128_pd(constant4i<0,(int)0x80000000,0,(int)0x80000000>());  // only bit 63 set, i.e. -0.0
+    return a ^ (b & sign_only);            // xor the sign bit of b into a
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+static inline Vec2db is_finite(Vec2d const & a) {
+    Vec2q bits     = _mm_castpd_si128(a);        // raw 64-bit pattern of each element
+    Vec2q no_sign  = bits << 1;                  // sign bit shifted out
+    Vec2q exp_mask = 0xFFE0000000000000ll;       // exponent field after the shift
+    Vec2qb finite  = Vec2q(no_sign & exp_mask) != exp_mask;  // exponent field is not all 1s
+    return finite;
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+static inline Vec2db is_inf(Vec2d const & a) {
+    Vec2q bits    = _mm_castpd_si128(a);    // raw 64-bit pattern of each element
+    Vec2q no_sign = bits << 1;              // sign bit shifted out
+    return no_sign == 0xFFE0000000000000ll; // exponent field all 1s and fraction all 0s
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+static inline Vec2db is_nan(Vec2d const & a) {
+    Vec2q bits      = _mm_castpd_si128(a);              // raw 64-bit pattern of each element
+    Vec2q no_sign   = bits << 1;                        // sign bit shifted out
+    Vec2q exp_mask  = 0xFFE0000000000000ll;             // exponent field after the shift
+    Vec2q exp_bits  = no_sign & exp_mask;               // exponent field only
+    Vec2q frac_bits = _mm_andnot_si128(exp_mask,no_sign);// everything outside the exponent field
+    return Vec2qb((exp_bits==exp_mask) & (frac_bits!=0)); // exponent all 1s and fraction nonzero
+}
+
+// Function is_subnormal: gives true for elements that are subnormal (denormal)
+// false for normal numbers, zero, NAN and INF
+static inline Vec2db is_subnormal(Vec2d const & a) {
+    Vec2q t1 = _mm_castpd_si128(a);    // reinterpret as 64-bit integer
+    Vec2q t2 = t1 << 1;                // shift out sign bit
+    Vec2q t3 = 0xFFE0000000000000ll;   // exponent mask
+    Vec2q t4 = t2 & t3;                // exponent
+    Vec2q t5 = _mm_andnot_si128(t3,t2);// fraction
+    return Vec2qb((t4==0) & (t5!=0));  // exponent = 0 and fraction != 0
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for normal numbers, NAN and INF
+static inline Vec2db is_zero_or_subnormal(Vec2d const & a) {
+    Vec2q t = _mm_castpd_si128(a);     // reinterpret as 64-bit integer
+          t &= 0x7FF0000000000000ll;   // isolate exponent
+    return t == 0;                     // exponent = 0
+}
+
+// Function infinite2d: returns a vector where both elements are +INF
+static inline Vec2d infinite2d() {
+    return Vec2d(_mm_castsi128_pd(_mm_setr_epi32(0,0x7FF00000,0,0x7FF00000)));
+}
+
+// Function nan2d: returns a vector where both elements are +NAN (quiet); n is placed in the low 32 bits
+static inline Vec2d nan2d(int n = 0x10) {
+    return Vec2d(_mm_castsi128_pd(_mm_setr_epi32(n, 0x7FF80000, n, 0x7FF80000)));
+}
+
+
+/*****************************************************************************
+*
+*          Functions for reinterpretation between vector types
+*
+*****************************************************************************/
+
+static inline __m128i reinterpret_i (__m128i const & x) {
+    return x;                          // already an integer vector: returned unchanged
+}
+
+static inline __m128i reinterpret_i (__m128  const & x) {
+    return _mm_castps_si128(x);        // float vector viewed as integer vector
+}
+
+static inline __m128i reinterpret_i (__m128d const & x) {
+    return _mm_castpd_si128(x);        // double vector viewed as integer vector
+}
+
+static inline __m128  reinterpret_f (__m128i const & x) {
+    return _mm_castsi128_ps(x);        // integer vector viewed as float vector
+}
+
+static inline __m128  reinterpret_f (__m128  const & x) {
+    return x;                          // already a float vector: returned unchanged
+}
+
+static inline __m128  reinterpret_f (__m128d const & x) {
+    return _mm_castpd_ps(x);           // double vector viewed as float vector
+}
+
+static inline __m128d reinterpret_d (__m128i const & x) {
+    return _mm_castsi128_pd(x);        // integer vector viewed as double vector
+}
+
+static inline __m128d reinterpret_d (__m128  const & x) {
+    return _mm_castps_pd(x);           // float vector viewed as double vector
+}
+
+static inline __m128d reinterpret_d (__m128d const & x) {
+    return x;                          // already a double vector: returned unchanged
+}
+
+
+/*****************************************************************************
+*
+*          Vector permute and blend functions
+*
+******************************************************************************
+*
+* The permute function can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select. An index of -1 will generate zero. An index of -256 means don't care.
+*
+* Example:
+* Vec2d a(10., 11.);              // a is (10, 11)
+* Vec2d b, c;
+* b = permute2d<1,1>(a);          // b is (11, 11)
+* c = permute2d<-1,0>(a);         // c is ( 0, 10)
+*
+*
+* The blend function can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where indexes 0 - 1 indicate an element from the first source
+* vector and indexes 2 - 3 indicate an element from the second source vector.
+* An index of -1 will generate zero.
+*
+*
+* Example:
+* Vec2d a(10., 11.);              // a is (10, 11)
+* Vec2d b(20., 21.);              // b is (20, 21)
+* Vec2d c;
+* c = blend2d<0,3> (a,b);         // c is (10, 21)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// permute vector Vec2d: result element 0 is chosen by i0, element 1 by i1
+template <int i0, int i1>
+static inline Vec2d permute2d(Vec2d const & a) {
+    // is shuffling needed (some element does not stay in its own position)
+    const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0);
+    // is zeroing needed (a negative index with bit 7 set; -256 has bit 7 clear)
+    const bool do_zero    = ((i0 | i1) < 0 && (i0 | i1) & 0x80);
+
+    if (do_zero && !do_shuffle) {                          // zeroing, not shuffling
+        if ((i0 & i1) < 0) return _mm_setzero_pd();        // both indexes negative: zero everything
+        // zero some elements: all-ones mask for elements to keep, zero mask for the others
+        __m128i mask1 = constant4i< -int(i0>=0), -int(i0>=0), -int(i1>=0), -int(i1>=0) >();
+        return  _mm_and_pd(a,_mm_castsi128_pd(mask1));     // zero with AND mask
+    }
+    else if (do_shuffle && !do_zero) {                     // shuffling, not zeroing
+        return _mm_shuffle_pd(a, a, (i0&1) | (i1&1)<<1);
+    }
+    else if (do_shuffle && do_zero) {                      // shuffling and zeroing
+        // both shuffle and zero: take the zeroed half from an all-zero operand
+        if (i0 < 0 && i1 >= 0) {                           // zero low half, shuffle high half
+            return _mm_shuffle_pd(_mm_setzero_pd(), a, (i1 & 1) << 1);
+        }
+        if (i0 >= 0 && i1 < 0) {                           // shuffle low half, zero high half
+            return _mm_shuffle_pd(a, _mm_setzero_pd(), i0 & 1);
+        }
+    }
+    return a;                                              // trivial case: do nothing
+}
+
+
+// blend vectors Vec2d: indexes 0-1 select from a, 2-3 select from b, negative indexes go through permute2d
+template <int i0, int i1>
+static inline Vec2d blend2d(Vec2d const & a, Vec2d const & b) {
+
+    // Combine all the indexes into a single bitfield, with 8 bits for each
+    const int m1 = (i0 & 3) | (i1 & 3) << 8; 
+
+    // Mask to zero out negative indexes
+    const int m2 = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8;
+
+    if ((m1 & 0x0202 & m2) == 0) {
+        // no elements from b, only elements from a and possibly zero
+        return permute2d <i0, i1> (a);
+    }
+    if (((m1^0x0202) & 0x0202 & m2) == 0) {
+        // no elements from a, only elements from b and possibly zero
+        return permute2d <i0 & ~2, i1 & ~2> (b);   // clearing bit 1 turns indexes 2-3 into 0-1
+    }
+    // selecting from both a and b without zeroing
+    if ((i0 & 2) == 0) { // first element from a, second element from b
+        return _mm_shuffle_pd(a, b, (i0 & 1) | (i1 & 1) << 1);
+    }
+    else {         // first element from b, second element from a
+        return _mm_shuffle_pd(b, a, (i0 & 1) | (i1 & 1) << 1);
+    }
+}
+
+// change signs on vectors Vec2d
+// Each index i0 - i1 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1>
+static inline Vec2d change_sign(Vec2d const & a) {
+    if ((i0 | i1) == 0) return a;                      // no sign to flip
+    __m128i mask = constant4i<0, i0 ? (int)0x80000000 : 0, 0, i1 ? (int)0x80000000 : 0> ();  // cast avoids narrowing unsigned to int
+    return  _mm_xor_pd(a, _mm_castsi128_pd(mask));     // flip sign bits
+}
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec4i a(2,0,0,3);               // index  a is (  2,   0,   0,   3)
+* Vec4f b(1.0f,1.1f,1.2f,1.3f);   // table  b is (1.0, 1.1, 1.2, 1.3)
+* Vec4f c;
+* c = lookup4 (a,b);              // result c is (1.2, 1.0, 1.0, 1.3)
+*
+*****************************************************************************/
+
+static inline Vec4f lookup4(Vec4i const & index, Vec4f const & table) {
+#if INSTRSET >= 7  // AVX
+    return _mm_permutevar_ps(table, index);
+#else
+    int32_t ii[4];
+    float   tt[6];                     // larger than the 4 table entries: the 64-bit loads below read one float past the indexed one
+    table.store(tt);  (index & 3).store(ii);   // indexes are reduced modulo 4
+    __m128 r01 = _mm_loadh_pi(_mm_load_ss(&tt[ii[0]]), (const __m64 *)&tt[ii[1]]);  // entry 0 in element 0, entry 1 in element 2
+    __m128 r23 = _mm_loadh_pi(_mm_load_ss(&tt[ii[2]]), (const __m64 *)&tt[ii[3]]);  // entry 2 in element 0, entry 3 in element 2
+    return _mm_shuffle_ps(r01, r23, 0x88);     // pick elements 0 and 2 of each intermediate vector
+#endif
+}
+
+static inline Vec4f lookup8(Vec4i const & index, Vec4f const & table0, Vec4f const & table1) {
+#if INSTRSET >= 8  // AVX2
+    __m256 tt = _mm256_insertf128_ps(_mm256_castps128_ps256(table0), table1, 1); // combine tables
+
+#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
+    // bug in MS VS 11 beta: operands in wrong order
+    __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), _mm256_castps_si256(tt)));
+    r = _mm_and_ps(r,r); // fix another bug in VS 11 beta (would store r as 256 bits aligned by 16)
+#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    // Gcc 4.7.0 has wrong parameter type and operands in wrong order
+    __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), tt));
+#else
+    // no bug version
+    __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index)));
+#endif
+    return r;
+
+#elif INSTRSET >= 7  // AVX
+    __m128  r0 = _mm_permutevar_ps(table0, index);         // candidate from table0
+    __m128  r1 = _mm_permutevar_ps(table1, index);         // candidate from table1
+    __m128i i4 = _mm_slli_epi32(index, 29);                // index bit 2 moved to bit 31
+    return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4));    // bit 31 chooses r1 over r0
+
+#elif INSTRSET >= 5  // SSE4.1
+    Vec4f   r0 = lookup4(index, table0);                   // candidate from table0
+    Vec4f   r1 = lookup4(index, table1);                   // candidate from table1
+    __m128i i4 = _mm_slli_epi32(index, 29);                // index bit 2 moved to bit 31
+    return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4));    // bit 31 chooses r1 over r0
+
+#else               // SSE2
+    Vec4f   r0 = lookup4(index, table0);                   // candidate from table0
+    Vec4f   r1 = lookup4(index, table1);                   // candidate from table1
+    __m128i i4 = _mm_srai_epi32(_mm_slli_epi32(index, 29), 31);  // index bit 2 spread over the whole element
+    return selectf(_mm_castsi128_ps(i4), r1, r0);          // full-width mask chooses r1 over r0
+#endif
+}
+
+template <int n>
+static inline Vec4f lookup(Vec4i const & index, float const * table) {
+    if (n <= 0) return 0.0f;
+    if (n <= 4) return lookup4(index, Vec4f().load(table));   // NOTE(review): loads 4 floats even when n < 4 - confirm table is large enough
+    if (n <= 8) {
+#if INSTRSET >= 8  // AVX2
+        __m256 tt = _mm256_loadu_ps(table);                   // NOTE(review): loads 8 floats even when n < 8 - confirm table is large enough
+#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
+        // bug in MS VS 11 beta: operands in wrong order
+        __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), _mm256_castps_si256(tt)));
+        r = _mm_and_ps(r,r); // fix another bug in VS 11 beta (would store r as 256 bits aligned by 16)
+#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+        // Gcc 4.7.0 has wrong parameter type and operands in wrong order
+        __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), tt));
+#else
+        // no bug version
+        __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index)));
+#endif
+        return r;
+#else   // not AVX2
+        return lookup8(index, Vec4f().load(table), Vec4f().load(table+4));
+#endif  // INSTRSET
+    }
+    // n > 8. Limit index so that no element outside table[0..n-1] is read
+    Vec4ui index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec4ui(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1 (unsigned compare, so negative indexes also map to n-1)
+        index1 = min(Vec4ui(index), n-1);
+    }
+#if INSTRSET >= 8  // AVX2
+    return _mm_i32gather_ps(table, index1, 4);             // scale 4 = sizeof(float)
+#else
+    uint32_t ii[4];  index1.store(ii);
+    return Vec4f(table[ii[0]], table[ii[1]], table[ii[2]], table[ii[3]]);
+#endif
+}
+
+static inline Vec2d lookup2(Vec2q const & index, Vec2d const & table) {
+#if INSTRSET >= 7  // AVX
+    return _mm_permutevar_pd(table, index + index);   // index doubled, i.e. index << 1
+#else
+    double  entries[2];
+    int32_t words[4];                                 // two 64-bit indexes stored as four 32-bit words
+    (index & 1).store(words);  table.store(entries);  // indexes are reduced modulo 2
+    return Vec2d(entries[words[0]], entries[words[2]]);  // words 0 and 2 are read as the two indexes
+#endif
+}
+
+static inline Vec2d lookup4(Vec2q const & index, Vec2d const & table0, Vec2d const & table1) {
+#if INSTRSET >= 7  // AVX
+    Vec2q doubled = index + index;                          // index << 1
+    __m128d from0 = _mm_permutevar_pd(table0, doubled);     // candidate from table0
+    __m128d from1 = _mm_permutevar_pd(table1, doubled);     // candidate from table1
+    __m128i pick1 = _mm_slli_epi64(index, 62);              // index bit 1 moved to bit 63
+    return _mm_blendv_pd(from0, from1, _mm_castsi128_pd(pick1));
+#else
+    double  entries[4];
+    int32_t words[4];                                       // two 64-bit indexes stored as four 32-bit words
+    table0.store(entries);  table1.store(entries + 2);      // both tables laid out back to back
+    (index & 3).store(words);                               // indexes are reduced modulo 4
+    return Vec2d(entries[words[0]], entries[words[2]]);     // words 0 and 2 are read as the two indexes
+#endif
+}
+
+template <int n>
+static inline Vec2d lookup(Vec2q const & index, double const * table) {
+    if (n <= 0) return 0.0;
+    if (n <= 2) return lookup2(index, Vec2d().load(table));   // NOTE(review): loads 2 doubles even when n == 1 - confirm table is large enough
+#if INSTRSET < 8  // not AVX2
+    if (n <= 4) return lookup4(index, Vec2d().load(table), Vec2d().load(table + 2));  // NOTE(review): loads 4 doubles even when n == 3
+#endif
+    // Limit index so that no element outside table[0..n-1] is read
+    Vec2uq index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec2uq(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1 (unsigned compare, so negative indexes also map to n-1)
+        index1 = min(Vec2uq(index), n-1);
+    }
+#if INSTRSET >= 8  // AVX2
+    return _mm_i64gather_pd(table, index1, 8);             // scale 8 = sizeof(double)
+#else
+    uint32_t ii[4];  index1.store(ii);                     // two 64-bit indexes stored as four 32-bit words
+    return Vec2d(table[ii[0]], table[ii[2]]);              // words 0 and 2 are read as the two indexes
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Gather functions with fixed indexes
+*
+*****************************************************************************/
+// Load the elements with fixed indices i0, i1, i2, i3 from array a via the integer gather
+template <int i0, int i1, int i2, int i3>
+static inline Vec4f gather4f(void const * a) {
+    return _mm_castsi128_ps(gather4i<i0, i1, i2, i3>(a));
+}
+
+// Load the elements with fixed indices i0, i1 from array a via the integer gather
+template <int i0, int i1>
+static inline Vec2d gather2d(void const * a) {
+    return _mm_castsi128_pd(gather2q<i0, i1>(a));
+}
+
+
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec4fb const & x) {
+    return horizontal_find_first(Vec4ib(x));   // forwards to the Vec4ib overload, defined outside this part of the file
+}
+
+static inline int horizontal_find_first(Vec2db const & x) {
+    return horizontal_find_first(Vec2qb(x));   // forwards to the Vec2qb overload, defined outside this part of the file
+}
+
+// Count the number of elements that are true
+static inline uint32_t horizontal_count(Vec4fb const & x) {
+    return horizontal_count(Vec4ib(x));   // forwards to the Vec4ib overload, defined outside this part of the file
+}
+
+static inline uint32_t horizontal_count(Vec2db const & x) {
+    return horizontal_count(Vec2qb(x));   // forwards to the Vec2qb overload, defined outside this part of the file
+}
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: convert boolean vector Vec4fb to integer bitfield
+static inline uint8_t to_bits(Vec4fb const & x) {
+    return to_bits(Vec4ib(x));   // forwards to the Vec4ib overload, defined outside this part of the file
+}
+
+// to_Vec4fb: convert integer bitfield to boolean vector
+static inline Vec4fb to_Vec4fb(uint8_t x) {
+    return Vec4fb(to_Vec4ib(x));   // built by to_Vec4ib (defined outside this part of the file), then converted
+}
+
+// to_bits: convert boolean vector Vec2db to integer bitfield
+static inline uint8_t to_bits(Vec2db const & x) {
+    return to_bits(Vec2qb(x));   // forwards to the Vec2qb overload, defined outside this part of the file
+}
+
+// to_Vec2db: convert integer bitfield to boolean vector
+static inline Vec2db to_Vec2db(uint8_t x) {
+    return Vec2db(to_Vec2qb(x));   // built by to_Vec2qb (defined outside this part of the file), then converted
+}
+
+#endif // VECTORF128_H
diff --git a/vectorclass/vectorf256.h b/vectorclass/vectorf256.h
new file mode 100755
index 0000000..75bc267
--- /dev/null
+++ b/vectorclass/vectorf256.h
@@ -0,0 +1,3166 @@
+/****************************  vectorf256.h   *******************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2014-10-22
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining 256-bit floating point vector classes as interface
+* to intrinsic functions in x86 microprocessors with AVX instruction set.
+*
+* Instructions:
+* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired 
+* instruction set, which must be at least AVX.
+*
+* The following vector classes are defined here:
+* Vec8f     Vector of 8 single precision floating point numbers
+* Vec8fb    Vector of 8 Booleans for use with Vec8f
+* Vec4d     Vector of 4 double precision floating point numbers
+* Vec4db    Vector of 4 Booleans for use with Vec4d
+*
+* Each vector object is represented internally in the CPU as a 256-bit register.
+* This header file defines operators and functions for these vectors.
+*
+* For example:
+* Vec4d a(1., 2., 3., 4.), b(5., 6., 7., 8.), c;
+* c = a + b;     // now c contains (6., 8., 10., 12.)
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+
+// check combination of header files
+#if defined (VECTORF256_H)
+#if    VECTORF256_H != 2
+#error Two different versions of vectorf256.h included
+#endif
+#else
+#define VECTORF256_H  2
+
+#if INSTRSET < 7   // AVX required
+#error Please compile for the AVX instruction set or higher
+#endif
+
+#include "vectorf128.h"  // Define 128-bit vectors
+
+
+
+/*****************************************************************************
+*
+*          select functions
+*
+*****************************************************************************/
+// Element-wise choice between two __m256 sources. Helper for various functions
+// and operators. Equivalent pseudocode:
+//   for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Every element of s must be either 0 (false) or 0xFFFFFFFF (true).
+static inline __m256 selectf (__m256 const & s, __m256 const & a, __m256 const & b) {
+    // blendv takes the "false" source first and the "true" source second
+    __m256 const chosen = _mm256_blendv_ps (b, a, s);
+    return chosen;
+}
+
+// Element-wise choice between two __m256d sources. Helper for various functions
+// and operators. Equivalent pseudocode:
+//   for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Every element of s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
+// No other values are allowed.
+static inline __m256d selectd (__m256d const & s, __m256d const & a, __m256d const & b) {
+    // blendv takes the "false" source first and the "true" source second
+    __m256d const chosen = _mm256_blendv_pd (b, a, s);
+    return chosen;
+}
+
+
+
+/*****************************************************************************
+*
+*          Generate compile-time constant vector
+*
+*****************************************************************************/
+// Produce a compile-time constant __m256 whose eight 32-bit elements hold the
+// bit patterns of the integers i0..i7. The constant is kept in a static union
+// and returned through its __m256 member.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline __m256 constant8f() {
+    static const union {
+        int     lanes[8];
+        __m256  vec;
+    } bits = {{i0,i1,i2,i3,i4,i5,i6,i7}};
+    return bits.vec;
+}
+
+
+/*****************************************************************************
+*
+*         Join two 128-bit vectors
+*
+*****************************************************************************/
+// set_m128r(lo,hi): join two __m128 values into one __m256. lo is cast into the
+// lower 128 bits and hi is inserted into the upper 128 bits (argument order is
+// low first, high second).
+// Used in place of _mm256_set_m128(hi,lo), which is not defined in all versions of immintrin.h
+#define set_m128r(lo,hi) _mm256_insertf128_ps(_mm256_castps128_ps256(lo),(hi),1)
+
+
+/*****************************************************************************
+*
+*          Vec8fb: Vector of 8 Booleans for use with Vec8f
+*
+*****************************************************************************/
+
+class Vec8fb {
+protected:
+    __m256 ymm; // Eight Boolean elements, each held as a 32-bit 0 / -1 mask
+public:
+    // Default constructor; the vector is left uninitialized
+    Vec8fb() {
+    }
+    // Build from eight individual Booleans; b0 becomes element 0
+    Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) {
+#if INSTRSET >= 8  // AVX2
+        __m256i const masks = _mm256_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7);
+        ymm = _mm256_castsi256_ps(masks);
+#else
+        // Without AVX2 the two 128-bit halves are built separately and joined
+        __m128i const lower = _mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3);
+        __m128i const upper = _mm_setr_epi32(-(int)b4, -(int)b5, -(int)b6, -(int)b7);
+        ymm = set_m128r(_mm_castsi128_ps(lower), _mm_castsi128_ps(upper));
+#endif
+    }
+    // Build from two Vec4fb; a0 supplies elements 0-3 and a1 elements 4-7
+    Vec8fb(Vec4fb const & a0, Vec4fb const & a1) {
+        __m128 const lower = a0;
+        __m128 const upper = a1;
+        ymm = set_m128r(lower, upper);
+    }
+    // Convert from the intrinsic type __m256
+    Vec8fb(__m256 const & x) : ymm(x) {
+    }
+    // Assign from the intrinsic type __m256
+    Vec8fb & operator = (__m256 const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Broadcast a single Boolean into all eight elements
+    Vec8fb(bool b) {
+#if INSTRSET >= 8  // AVX2
+        __m256i const masks = _mm256_set1_epi32(-(int)b);
+        ymm = _mm256_castsi256_ps(masks);
+#else
+        __m128 const half = _mm_castsi128_ps(_mm_set1_epi32(-(int)b));
+        ymm = set_m128r(half,half);
+#endif
+    }
+    // Assign a single Boolean to all eight elements
+    Vec8fb & operator = (bool b) {
+        return *this = Vec8fb(b);
+    }
+private: // Declared but not defined: prevents constructing from int, etc.
+    Vec8fb(int b);
+    Vec8fb & operator = (int x);
+public:
+    // Convert to the intrinsic type __m256
+    operator __m256() const {
+        return ymm;
+    }
+#if defined (VECTORI256_H)
+#if VECTORI256_H >= 2  // AVX2 version
+    // Convert from Vec8ib, the Boolean type used with integer vectors
+    Vec8fb(Vec8ib const & x) : ymm(_mm256_castsi256_ps(x)) {
+    }
+    // Assign from Vec8ib, the Boolean type used with integer vectors
+    Vec8fb & operator = (Vec8ib const & x) {
+        ymm = _mm256_castsi256_ps(x);
+        return *this;
+    }
+#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+    // Convert to Vec8ib, the Boolean type used with integer vectors
+    operator Vec8ib() const {
+        return _mm256_castps_si256(ymm);
+    }
+#endif
+#else
+    // Convert from Vec8ib, the Boolean type used with integer vectors
+    Vec8fb(Vec8ib const & x) {
+        __m128 const lower = _mm_castsi128_ps(x.get_low());
+        __m128 const upper = _mm_castsi128_ps(x.get_high());
+        ymm = set_m128r(lower, upper);
+    }
+    // Assign from Vec8ib, the Boolean type used with integer vectors
+    Vec8fb & operator = (Vec8ib const & x) {
+        __m128 const lower = _mm_castsi128_ps(x.get_low());
+        __m128 const upper = _mm_castsi128_ps(x.get_high());
+        ymm = set_m128r(lower, upper);
+        return *this;
+    }
+    // Convert to Vec8ib, the Boolean type used with integer vectors
+    operator Vec8ib() const {
+        __m128i const lower = _mm_castps_si128(get_low());
+        __m128i const upper = _mm_castps_si128(get_high());
+        return Vec8i(lower, upper);
+    }
+#endif
+#endif // VECTORI256_H
+    // Change a single element; index is taken modulo 8.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8fb const & insert(uint32_t index, bool value) {
+        // Reading 8 words starting at offset 8-(index&7) puts the single -1
+        // word of this table at position (index & 7)
+        static const int32_t window[16] = {0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0};
+        float const * const start = (float const*)(window+8-(index & 7));
+        __m256 const sel = _mm256_loadu_ps(start);
+        if (!value) {
+            ymm = _mm256_andnot_ps(sel,ymm);   // clear the selected element
+        }
+        else {
+            ymm = _mm256_or_ps(ymm,sel);       // set the selected element
+        }
+        return *this;
+    }
+    // Read a single element; index is taken modulo 8
+    bool extract(uint32_t index) const {
+        union {
+            float   f[8];
+            int32_t i[8];
+        } view;
+        _mm256_storeu_ps(view.f, ymm);
+        int32_t const word = view.i[index & 7];
+        return word != 0;
+    }
+    // Read a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Lower half (elements 0-3) as Vec4fb
+    Vec4fb get_low() const {
+        Vec4fb const lower = _mm256_castps256_ps128(ymm);
+        return lower;
+    }
+    // Upper half (elements 4-7) as Vec4fb
+    Vec4fb get_high() const {
+        Vec4fb const upper = _mm256_extractf128_ps(ymm,1);
+        return upper;
+    }
+    // Number of elements
+    static int size () {
+        return 8;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec8fb
+*
+*****************************************************************************/
+
+// vector operator & : element-by-element and of two Boolean vectors
+static inline Vec8fb operator & (Vec8fb const & a, Vec8fb const & b) {
+    return Vec8fb(_mm256_and_ps(a, b));
+}
+// vector operator && : same result as operator &
+static inline Vec8fb operator && (Vec8fb const & a, Vec8fb const & b) {
+    Vec8fb const both = a & b;
+    return both;
+}
+
+// vector operator &= : and b into a, returning a
+static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const & b) {
+    return a = a & b;
+}
+
+// vector operator | : element-by-element or of two Boolean vectors
+static inline Vec8fb operator | (Vec8fb const & a, Vec8fb const & b) {
+    return Vec8fb(_mm256_or_ps(a, b));
+}
+// vector operator || : same result as operator |
+static inline Vec8fb operator || (Vec8fb const & a, Vec8fb const & b) {
+    Vec8fb const either = a | b;
+    return either;
+}
+
+// vector operator |= : or b into a, returning a
+static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : element-by-element exclusive or of two Boolean vectors
+static inline Vec8fb operator ^ (Vec8fb const & a, Vec8fb const & b) {
+    return Vec8fb(_mm256_xor_ps(a, b));
+}
+
+// vector operator ^= : exclusive-or b into a, returning a
+static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not, done by xor with an all-ones constant
+static inline Vec8fb operator ~ (Vec8fb const & a) {
+    __m256 const all_ones = constant8f<-1,-1,-1,-1,-1,-1,-1,-1>();
+    return Vec8fb(_mm256_xor_ps(a, all_ones));
+}
+
+// vector operator ! : logical not, delegated to operator ! of Vec8ib
+// (operator ! is less efficient than operator ~. Use only where not
+// all bits in an element are the same)
+static inline Vec8fb operator ! (Vec8fb const & a) {
+    Vec8ib const as_int = Vec8ib(a);
+    return Vec8fb(!as_int);
+}
+
+// Functions for Vec8fb
+
+// andnot: a & ~ b
+static inline Vec8fb andnot(Vec8fb const & a, Vec8fb const & b) {
+    // the intrinsic inverts its FIRST operand, hence the swapped argument order
+    __m256 const masked = _mm256_andnot_ps(b, a);
+    return Vec8fb(masked);
+}
+
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and. Returns true if all elements are true.
+// Implemented with _mm256_testc_ps against an all-ones constant.
+static inline bool horizontal_and (Vec8fb const & a) {
+    __m256 const all_ones = constant8f<-1,-1,-1,-1,-1,-1,-1,-1>();
+    int const carry = _mm256_testc_ps(a, all_ones);
+    return carry != 0;
+}
+
+// horizontal_or. Returns true if at least one element is true.
+// Implemented with _mm256_testz_ps of a against itself.
+static inline bool horizontal_or (Vec8fb const & a) {
+    int const none_set = _mm256_testz_ps(a,a);
+    return none_set == 0;
+}
+
+
+/*****************************************************************************
+*
+*          Vec4db: Vector of 4 Booleans for use with Vec4d
+*
+*****************************************************************************/
+
+class Vec4db {
+protected:
+    __m256d ymm; // Four Boolean elements, each held as a 64-bit 0 / -1 mask
+public:
+    // Default constructor; the vector is left uninitialized
+    Vec4db() {
+    }
+    // Constructor to build from all elements; b0 becomes element 0
+    Vec4db(bool b0, bool b1, bool b2, bool b3) {
+#if INSTRSET >= 8  // AVX2
+        ymm = _mm256_castsi256_pd(_mm256_setr_epi64x(-(int64_t)b0, -(int64_t)b1, -(int64_t)b2, -(int64_t)b3)); 
+#else
+        // Without AVX2: each 64-bit mask is made of two identical 32-bit words
+        __m128 blo = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1));
+        __m128 bhi = _mm_castsi128_ps(_mm_setr_epi32(-(int)b2, -(int)b2, -(int)b3, -(int)b3));
+        // set_m128r takes (low, high). blo must go first so that b0,b1 land in
+        // elements 0,1 exactly as in the AVX2 branch above.
+        ymm = _mm256_castps_pd(set_m128r(blo,bhi));
+#endif
+    }
+    // Constructor to build from two Vec2db; a0 is the lower half, a1 the upper half
+    Vec4db(Vec2db const & a0, Vec2db const & a1) {
+        ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0),_mm_castpd_ps(a1)));
+        //ymm = _mm256_set_m128d(a1, a0);
+    }
+    // Constructor to convert from type __m256d used in intrinsics:
+    Vec4db(__m256d const & x) {
+        ymm = x;
+    }
+    // Assignment operator to convert from type __m256d used in intrinsics:
+    Vec4db & operator = (__m256d const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec4db(bool b) {
+#if INSTRSET >= 8  // AVX2
+        ymm = _mm256_castsi256_pd(_mm256_set1_epi64x(-(int64_t)b));
+#else
+        __m128 b1 = _mm_castsi128_ps(_mm_set1_epi32(-(int)b));
+        ymm = _mm256_castps_pd(set_m128r(b1,b1));
+#endif
+    }
+    // Assignment operator to broadcast scalar value.
+    // Filling every 32-bit word with 0 / -1 gives 64-bit elements of 0 / -1.
+    Vec4db & operator = (bool b) {
+        ymm = _mm256_castsi256_pd(_mm256_set1_epi32(-int32_t(b)));
+        return *this;
+    }
+private: // Declared but not defined: prevents constructing from int, etc.
+    Vec4db(int b);
+    Vec4db & operator = (int x);
+public:
+    // Type cast operator to convert to __m256d used in intrinsics
+    operator __m256d() const {
+        return ymm;
+    }
+#ifdef VECTORI256_H  
+#if VECTORI256_H == 2  // 256 bit integer vectors are available, AVX2
+    // Constructor to convert from type Vec4qb used as Boolean for integer vectors
+    Vec4db(Vec4qb const & x) {
+        ymm = _mm256_castsi256_pd(x);
+    }
+    // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
+    Vec4db & operator = (Vec4qb const & x) {
+        ymm = _mm256_castsi256_pd(x);
+        return *this;
+    }
+#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+    // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
+    operator Vec4qb() const {
+        return _mm256_castpd_si256(ymm);
+    }
+#endif
+#else   // 256 bit integer vectors emulated without AVX2
+    // Constructor to convert from type Vec4qb used as Boolean for integer vectors
+    Vec4db(Vec4qb const & x) {
+        *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high()));
+    }
+    // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
+    Vec4db & operator = (Vec4qb const & x) {
+        *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high()));
+        return *this;
+    }
+    // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
+    operator Vec4qb() const {
+        return Vec4q(_mm_castpd_si128(get_low()), _mm_castpd_si128(get_high()));
+    }
+#endif
+#endif // VECTORI256_H
+    // Member function to change a single element in vector; index is taken modulo 4.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec4db const & insert(uint32_t index, bool value) {
+        // Reading 8 words starting at offset 8-(index&3)*2 puts the pair of -1
+        // words of this table at 64-bit position (index & 3)
+        static const int32_t maskl[16] = {0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0};
+        __m256d mask = _mm256_loadu_pd((double const*)(maskl+8-(index&3)*2)); // mask with FFFFFFFFFFFFFFFF at index position
+        if (value) {
+            ymm = _mm256_or_pd(ymm,mask);
+        }
+        else {
+            ymm = _mm256_andnot_pd(mask,ymm);
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector; index is taken modulo 4
+    bool extract(uint32_t index) const {
+        // Scratch space for exactly one 256-bit vector: 4 doubles = 8 32-bit words
+        union {
+            double  f[4];
+            int32_t i[8];
+        } u;
+        _mm256_storeu_pd(u.f, ymm);
+        // Test the upper 32-bit word of the selected 64-bit element
+        return u.i[(index & 3) * 2 + 1] != 0;
+    }
+    // Extract a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec2db:
+    Vec2db get_low() const {
+        return _mm256_castpd256_pd128(ymm);
+    }
+    Vec2db get_high() const {
+        return _mm256_extractf128_pd(ymm,1);
+    }
+    // Number of elements
+    static int size () {
+        return 4;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec4db
+*
+*****************************************************************************/
+
+// vector operator & : element-by-element and of two Boolean vectors
+static inline Vec4db operator & (Vec4db const & a, Vec4db const & b) {
+    return Vec4db(_mm256_and_pd(a, b));
+}
+// vector operator && : same result as operator &
+static inline Vec4db operator && (Vec4db const & a, Vec4db const & b) {
+    Vec4db const both = a & b;
+    return both;
+}
+
+// vector operator &= : and b into a, returning a
+static inline Vec4db & operator &= (Vec4db & a, Vec4db const & b) {
+    return a = a & b;
+}
+
+// vector operator | : element-by-element or of two Boolean vectors
+static inline Vec4db operator | (Vec4db const & a, Vec4db const & b) {
+    return Vec4db(_mm256_or_pd(a, b));
+}
+// vector operator || : same result as operator |
+static inline Vec4db operator || (Vec4db const & a, Vec4db const & b) {
+    Vec4db const either = a | b;
+    return either;
+}
+
+// vector operator |= : or b into a, returning a
+static inline Vec4db & operator |= (Vec4db & a, Vec4db const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : element-by-element exclusive or of two Boolean vectors
+static inline Vec4db operator ^ (Vec4db const & a, Vec4db const & b) {
+    return Vec4db(_mm256_xor_pd(a, b));
+}
+
+// vector operator ^= : exclusive-or b into a, returning a
+static inline Vec4db & operator ^= (Vec4db & a, Vec4db const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not, done by xor with an all-ones constant
+static inline Vec4db operator ~ (Vec4db const & a) {
+    __m256d const all_ones = _mm256_castps_pd (constant8f<-1,-1,-1,-1,-1,-1,-1,-1>());
+    return Vec4db(_mm256_xor_pd(a, all_ones));
+}
+
+// vector operator ! : logical not, delegated to operator ! of Vec4qb
+// (operator ! is less efficient than operator ~. Use only where not
+// all bits in an element are the same)
+static inline Vec4db operator ! (Vec4db const & a) {
+    Vec4qb const as_int = Vec4qb(a);
+    return Vec4db(!as_int);
+}
+
+// Functions for Vec4db
+
+// andnot: a & ~ b
+static inline Vec4db andnot(Vec4db const & a, Vec4db const & b) {
+    // the intrinsic inverts its FIRST operand, hence the swapped argument order
+    __m256d const masked = _mm256_andnot_pd(b, a);
+    return Vec4db(masked);
+}
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and (Vec4db const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    // Reuse the Vec256b implementation on the same bits
+    Vec256b const bits = Vec256b(_mm256_castpd_si256(a));
+    return horizontal_and(bits);
+#else  // split into 128 bit vectors
+    // And the two halves together, then reduce the 128-bit result
+    Vec2db const lower = a.get_low();
+    Vec2db const upper = a.get_high();
+    return horizontal_and(lower & upper);
+#endif
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or (Vec4db const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    // Reuse the Vec256b implementation on the same bits
+    Vec256b const bits = Vec256b(_mm256_castpd_si256(a));
+    return horizontal_or(bits);
+#else  // split into 128 bit vectors
+    // Or the two halves together, then reduce the 128-bit result
+    Vec2db const lower = a.get_low();
+    Vec2db const upper = a.get_high();
+    return horizontal_or(lower | upper);
+#endif
+}
+
+
+ /*****************************************************************************
+*
+*          Vec8f: Vector of 8 single precision floating point values
+*
+*****************************************************************************/
+
+class Vec8f {
+protected:
+    __m256 ymm; // Eight single precision values
+public:
+    // Default constructor; the vector is left uninitialized
+    Vec8f() {
+    }
+    // Broadcast one value into all eight elements
+    Vec8f(float f) : ymm(_mm256_set1_ps(f)) {
+    }
+    // Build from eight individual values; f0 becomes element 0
+    Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7)
+        : ymm(_mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7)) {
+    }
+    // Build from two Vec4f; a0 supplies elements 0-3 and a1 elements 4-7
+    Vec8f(Vec4f const & a0, Vec4f const & a1) {
+        __m128 const lower = a0;
+        __m128 const upper = a1;
+        ymm = set_m128r(lower, upper);
+    }
+    // Convert from the intrinsic type __m256
+    Vec8f(__m256 const & x) : ymm(x) {
+    }
+    // Assign from the intrinsic type __m256
+    Vec8f & operator = (__m256 const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Convert to the intrinsic type __m256
+    operator __m256() const {
+        return ymm;
+    }
+    // Load eight floats from p; no alignment requirement
+    Vec8f & load(float const * p) {
+        ymm = _mm256_loadu_ps(p);
+        return *this;
+    }
+    // Load eight floats from p, which must be aligned by 32.
+    // You may use load_a instead of load if you are certain that p points to an
+    // address divisible by 32.
+    Vec8f & load_a(float const * p) {
+        ymm = _mm256_load_ps(p);
+        return *this;
+    }
+    // Store eight floats to p; no alignment requirement
+    void store(float * p) const {
+        _mm256_storeu_ps(p, ymm);
+    }
+    // Store eight floats to p, which must be aligned by 32.
+    // You may use store_a instead of store if you are certain that p points to an
+    // address divisible by 32.
+    void store_a(float * p) const {
+        _mm256_store_ps(p, ymm);
+    }
+    // Partial load. Load n elements and set the rest to 0.
+    // n outside 1..8 gives an all-zero vector.
+    Vec8f & load_partial(int n, float const * p) {
+        if (n <= 0 || n > 8) {
+            ymm = _mm256_setzero_ps();
+        }
+        else if (n <= 4) {
+            // only the lower half receives data
+            *this = Vec8f(Vec4f().load_partial(n, p), _mm_setzero_ps());
+        }
+        else {
+            // lower half is full; upper half gets the remaining n-4 elements
+            *this = Vec8f(Vec4f().load(p), Vec4f().load_partial(n - 4, p + 4));
+        }
+        return *this;
+    }
+    // Partial store. Store n elements; nothing is stored when n > 8
+    void store_partial(int n, float * p) const {
+        if (n > 8) {
+            return;
+        }
+        if (n > 4) {
+            get_low().store(p);
+            get_high().store_partial(n - 4, p + 4);
+            return;
+        }
+        get_low().store_partial(n, p);
+    }
+    // cut off vector to n elements. The last 8-n elements are set to zero.
+    // The vector is unchanged when n, viewed as unsigned, is 8 or more.
+    Vec8f & cutoff(int n) {
+        if (uint32_t(n) < 8) {
+            // Reading 8 words starting at offset 8-n yields n all-ones words
+            // followed by 8-n zero words
+            static const union {
+                int32_t i[16];
+                float   f[16];
+            } window = {{-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0}};
+            Vec8fb const keep = Vec8fb(Vec8f().load(window.f + 8 - n));
+            *this = Vec8fb(*this) & keep;
+        }
+        return *this;
+    }
+    // Change a single element. Any index above 6 changes element 7.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8f const & insert(uint32_t index, float value) {
+        __m256 const splat = _mm256_broadcast_ss(&value);
+        // The blend mask must be a compile-time constant, hence one case per element
+        switch (index) {
+        case 0:  ymm = _mm256_blend_ps (ymm, splat, 0x01);  break;
+        case 1:  ymm = _mm256_blend_ps (ymm, splat, 0x02);  break;
+        case 2:  ymm = _mm256_blend_ps (ymm, splat, 0x04);  break;
+        case 3:  ymm = _mm256_blend_ps (ymm, splat, 0x08);  break;
+        case 4:  ymm = _mm256_blend_ps (ymm, splat, 0x10);  break;
+        case 5:  ymm = _mm256_blend_ps (ymm, splat, 0x20);  break;
+        case 6:  ymm = _mm256_blend_ps (ymm, splat, 0x40);  break;
+        default: ymm = _mm256_blend_ps (ymm, splat, 0x80);  break;
+        }
+        return *this;
+    }
+    // Read a single element; index is taken modulo 8
+    float extract(uint32_t index) const {
+        float lanes[8];
+        store(lanes);
+        return lanes[index & 7];
+    }
+    // Read a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    float operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Lower half (elements 0-3) as Vec4f
+    Vec4f get_low() const {
+        Vec4f const lower = _mm256_castps256_ps128(ymm);
+        return lower;
+    }
+    // Upper half (elements 4-7) as Vec4f
+    Vec4f get_high() const {
+        Vec4f const upper = _mm256_extractf128_ps(ymm,1);
+        return upper;
+    }
+    // Number of elements
+    static int size () {
+        return 8;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec8f
+*
+*****************************************************************************/
+
+// vector operator + : element-by-element sum of two vectors
+static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) {
+    return Vec8f(_mm256_add_ps(a, b));
+}
+
+// vector operator + : vector plus scalar; the scalar is broadcast first
+static inline Vec8f operator + (Vec8f const & a, float b) {
+    Vec8f const splat(b);
+    return a + splat;
+}
+// vector operator + : scalar plus vector; the scalar is broadcast first
+static inline Vec8f operator + (float a, Vec8f const & b) {
+    Vec8f const splat(a);
+    return splat + b;
+}
+
+// vector operator += : add b to a, returning a
+static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1 to every element, return the previous value
+static inline Vec8f operator ++ (Vec8f & a, int) {
+    Vec8f const before = a;
+    a = a + Vec8f(1.0f);
+    return before;
+}
+
+// prefix operator ++ : add 1 to every element, return the updated vector
+static inline Vec8f & operator ++ (Vec8f & a) {
+    return a = a + Vec8f(1.0f);
+}
+
+// vector operator - : element-by-element difference of two vectors
+static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) {
+    return Vec8f(_mm256_sub_ps(a, b));
+}
+
+// vector operator - : vector minus scalar; the scalar is broadcast first
+static inline Vec8f operator - (Vec8f const & a, float b) {
+    Vec8f const splat(b);
+    return a - splat;
+}
+// vector operator - : scalar minus vector; the scalar is broadcast first
+static inline Vec8f operator - (float a, Vec8f const & b) {
+    Vec8f const splat(a);
+    return splat - b;
+}
+
+// vector operator - : unary minus
+// Flips the sign bit of every element by xor, even for 0, INF and NAN
+static inline Vec8f operator - (Vec8f const & a) {
+    __m256 const sign_bits = constant8f<(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000> ();
+    return Vec8f(_mm256_xor_ps(a, sign_bits));
+}
+
+// vector operator -= : subtract b from a, returning a
+static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1 from every element, return the previous value
+static inline Vec8f operator -- (Vec8f & a, int) {
+    Vec8f const before = a;
+    a = a - Vec8f(1.0f);
+    return before;
+}
+
+// prefix operator -- : subtract 1 from every element, return the updated vector
+static inline Vec8f & operator -- (Vec8f & a) {
+    return a = a - Vec8f(1.0f);
+}
+
+// vector operator * : element-by-element product of two vectors
+static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) {
+    return Vec8f(_mm256_mul_ps(a, b));
+}
+
+// vector operator * : vector times scalar; the scalar is broadcast first
+static inline Vec8f operator * (Vec8f const & a, float b) {
+    Vec8f const splat(b);
+    return a * splat;
+}
+// vector operator * : scalar times vector; the scalar is broadcast first
+static inline Vec8f operator * (float a, Vec8f const & b) {
+    Vec8f const splat(a);
+    return splat * b;
+}
+
+// vector operator *= : multiply a by b, returning a
+static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) {
+    return a = a * b;
+}
+
+// vector operator / : element-by-element quotient of two vectors
+static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) {
+    return Vec8f(_mm256_div_ps(a, b));
+}
+
+// vector operator / : vector divided by scalar; the scalar is broadcast first
+static inline Vec8f operator / (Vec8f const & a, float b) {
+    Vec8f const splat(b);
+    return a / splat;
+}
+// vector operator / : scalar divided by vector; the scalar is broadcast first
+static inline Vec8f operator / (float a, Vec8f const & b) {
+    Vec8f const splat(a);
+    return splat / b;
+}
+
+// vector operator /= : divide a by b, returning a
+static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) {
+    return a = a / b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8fb operator == (Vec8f const & a, Vec8f const & b) {
+    // compare predicate 0 is _CMP_EQ_OQ
+    __m256 const mask = _mm256_cmp_ps(a, b, 0);
+    return Vec8fb(mask);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8fb operator != (Vec8f const & a, Vec8f const & b) {
+    // compare predicate 4 is _CMP_NEQ_UQ
+    __m256 const mask = _mm256_cmp_ps(a, b, 4);
+    return Vec8fb(mask);
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8fb operator < (Vec8f const & a, Vec8f const & b) {
+    // compare predicate 1 is _CMP_LT_OS
+    __m256 const mask = _mm256_cmp_ps(a, b, 1);
+    return Vec8fb(mask);
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec8fb operator <= (Vec8f const & a, Vec8f const & b) {
+    // compare predicate 2 is _CMP_LE_OS
+    __m256 const mask = _mm256_cmp_ps(a, b, 2);
+    return Vec8fb(mask);
+}
+
+// vector operator > : returns true for elements for which a > b
+// Evaluated as b < a
+static inline Vec8fb operator > (Vec8f const & a, Vec8f const & b) {
+    Vec8fb const swapped = b < a;
+    return swapped;
+}
+
+// vector operator >= : returns true for elements for which a >= b
+// Evaluated as b <= a
+static inline Vec8fb operator >= (Vec8f const & a, Vec8f const & b) {
+    Vec8fb const swapped = b <= a;
+    return swapped;
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and of the raw bits of two float vectors
+static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) {
+    return Vec8f(_mm256_and_ps(a, b));
+}
+
+// vector operator &= : and b into a, returning a
+static inline Vec8f & operator &= (Vec8f & a, Vec8f const & b) {
+    return a = a & b;
+}
+
+// vector operator & : bitwise and of Vec8f and Vec8fb (value on the left)
+static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) {
+    return Vec8f(_mm256_and_ps(a, b));
+}
+// vector operator & : bitwise and of Vec8fb and Vec8f (mask on the left)
+static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) {
+    return Vec8f(_mm256_and_ps(a, b));
+}
+
+// vector operator | : bitwise or of the raw bits of two float vectors
+static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) {
+    return Vec8f(_mm256_or_ps(a, b));
+}
+
+// vector operator |= : or b into a, returning a
+static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise exclusive or of the raw bits of two float vectors
+static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) {
+    return Vec8f(_mm256_xor_ps(a, b));
+}
+
+// vector operator ^= : exclusive-or b into a, returning a
+static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ! : logical not. Returns a Boolean vector that is true
+// for elements that compare equal to zero
+static inline Vec8fb operator ! (Vec8f const & a) {
+    Vec8f const zero(0.0f);
+    return a == zero;
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec8f
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
+static inline Vec8f select (Vec8fb const & s, Vec8f const & a, Vec8f const & b) {
+    // blendv takes the "false" source first and the "true" source second
+    __m256 const chosen = _mm256_blendv_ps (b, a, s);
+    return Vec8f(chosen);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8f if_add (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
+    // and-ing with the mask leaves b where f is true and zero bits elsewhere
+    Vec8f const addend = Vec8f(f) & b;
+    return a + addend;
+}
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec8f if_mul (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
+    // multiply by b where f is true and by 1 elsewhere
+    Vec8f const factor = select(f, b, 1.f);
+    return a * factor;
+}
+
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline float horizontal_add (Vec8f const & a) {
+    // two rounds of pairwise adds reduce each 128-bit half to its own sum
+    __m256 const pairs = _mm256_hadd_ps(a,a);
+    __m256 const quads = _mm256_hadd_ps(pairs,pairs);
+    // add the sum of the upper half to the sum of the lower half
+    __m128 const upper = _mm256_extractf128_ps(quads,1);
+    __m128 const lower = _mm256_castps256_ps128(quads);
+    return _mm_cvtss_f32(_mm_add_ss(lower,upper));
+}
+
+// function max: a > b ? a : b, element by element
+static inline Vec8f max(Vec8f const & a, Vec8f const & b) {
+    return Vec8f(_mm256_max_ps(a,b));
+}
+
+// function min: a < b ? a : b, element by element
+static inline Vec8f min(Vec8f const & a, Vec8f const & b) {
+    return Vec8f(_mm256_min_ps(a,b));
+}
+
+// function abs: absolute value
+// Clears the sign bit of every element, even for -0.0f, -INF and -NAN
+static inline Vec8f abs(Vec8f const & a) {
+    __m256 const no_sign = constant8f<0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF> ();
+    return Vec8f(_mm256_and_ps(a,no_sign));
+}
+
+// function sqrt: square root of every element
+static inline Vec8f sqrt(Vec8f const & a) {
+    return Vec8f(_mm256_sqrt_ps(a));
+}
+
+// function square: a * a, element by element
+static inline Vec8f square(Vec8f const & a) {
+    Vec8f const squared = a * a;
+    return squared;
+}
+
+// pow(Vec8f, int): primary template, declared only. The integer exponent
+// types that are supported are provided as explicit specializations below.
+template <typename TT> static Vec8f pow(Vec8f const & a, TT n);
+
+// Raise floating point numbers to integer power n (run-time exponent).
+// The work is done by pow_template_i.
+template <>
+inline Vec8f pow<int>(Vec8f const & x0, int n) {
+    Vec8f const result = pow_template_i<Vec8f>(x0, n);
+    return result;
+}
+
+// Same for an unsigned exponent, which is cast to int first
+template <>
+inline Vec8f pow<uint32_t>(Vec8f const & x0, uint32_t n) {
+    int const exponent = (int)n;
+    Vec8f const result = pow_template_i<Vec8f>(x0, exponent);
+    return result;
+}
+
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant.
+// Uses square-and-multiply over the bits of n, unrolled for bits 0..7:
+// x is squared once per step and multiplied into the accumulator y for
+// every bit of n that is set. Because n is a template constant, every
+// condition below is a compile-time constant.
+template <int n>
+static inline Vec8f pow_n(Vec8f const & a) {
+    // negative exponent: reciprocal of the positive power
+    if (n < 0)    return Vec8f(1.0f) / pow_n<-n>(a);
+    // a^0 = 1
+    if (n == 0)   return Vec8f(1.0f);
+    // exponents beyond the unrolled range go to the general pow function
+    if (n >= 256) return pow(a, n);
+    Vec8f x = a;                       // a^(2^i)
+    Vec8f y;                           // accumulator
+    // y is deliberately left uninitialized: it is first written by plain
+    // assignment at the lowest set bit of n, and only multiplied afterwards
+    const int lowest = n - (n & (n-1));// lowest set bit in n
+    if (n & 1) y = x;
+    if (n < 2) return y;
+    x = x*x;                           // x^2
+    if (n & 2) {
+        if (lowest == 2) y = x; else y *= x;
+    }
+    if (n < 4) return y;
+    x = x*x;                           // x^4
+    if (n & 4) {
+        if (lowest == 4) y = x; else y *= x;
+    }
+    if (n < 8) return y;
+    x = x*x;                           // x^8
+    if (n & 8) {
+        if (lowest == 8) y = x; else y *= x;
+    }
+    if (n < 16) return y;
+    x = x*x;                           // x^16
+    if (n & 16) {
+        if (lowest == 16) y = x; else y *= x;
+    }
+    if (n < 32) return y;
+    x = x*x;                           // x^32
+    if (n & 32) {
+        if (lowest == 32) y = x; else y *= x;
+    }
+    if (n < 64) return y;
+    x = x*x;                           // x^64
+    if (n & 64) {
+        if (lowest == 64) y = x; else y *= x;
+    }
+    if (n < 128) return y;
+    x = x*x;                           // x^128
+    if (n & 128) {
+        if (lowest == 128) y = x; else y *= x;
+    }
+    return y;
+}
+
+// pow with a compile-time exponent passed as a Const_int_t<n> tag: forwards to pow_n<n>
+template <int n>
+static inline Vec8f pow(Vec8f const & a, Const_int_t<n>) {
+    Vec8f const result = pow_n<n>(a);
+    return result;
+}
+
+
+// function round: round each element to the nearest integer, ties to even (result stays a float vector)
+static inline Vec8f round(Vec8f const & a) {
+    return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT);   // rounding control 0
+}
+
+// function truncate: round each element towards zero (result stays a float vector)
+static inline Vec8f truncate(Vec8f const & a) {
+    return _mm256_round_ps(a, _MM_FROUND_TO_ZERO);   // rounding control 3
+}
+
+// function floor: round each element towards minus infinity (result stays a float vector)
+static inline Vec8f floor(Vec8f const & a) {
+    return _mm256_round_ps(a, _MM_FROUND_TO_NEG_INF);   // rounding control 1
+}
+
+// function ceil: round each element towards plus infinity (result stays a float vector)
+static inline Vec8f ceil(Vec8f const & a) {
+    return _mm256_round_ps(a, _MM_FROUND_TO_POS_INF);   // rounding control 2
+}
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+#if VECTORI256_H > 1  // AVX2
+// function round_to_int: convert each element to the nearest integer (even), result as integer vector
+static inline Vec8i round_to_int(Vec8f const & a) {
+    // The conversion follows the MXCSR rounding mode, assumed here to be round-to-nearest
+    Vec8i const rounded = _mm256_cvtps_epi32(a);
+    return rounded;
+}
+
+// function truncate_to_int: convert each element to an integer, rounding towards zero
+static inline Vec8i truncate_to_int(Vec8f const & a) {
+    Vec8i const truncated = _mm256_cvttps_epi32(a);
+    return truncated;
+}
+
+// function to_float: convert each 32-bit integer element to float
+static inline Vec8f to_float(Vec8i const & a) {
+    Vec8f const converted = _mm256_cvtepi32_ps(a);
+    return converted;
+}
+#else // no AVX2
+
+// function round_to_int: convert each element to the nearest integer (even), result as integer vector
+// Version without AVX2: the two 128-bit halves are converted separately
+static inline Vec8i round_to_int(Vec8f const & a) {
+    // The conversion follows the MXCSR rounding mode, assumed here to be round-to-nearest
+    __m128i const lower = _mm_cvtps_epi32(a.get_low());
+    __m128i const upper = _mm_cvtps_epi32(a.get_high());
+    return Vec8i(lower, upper);
+}
+
+// function truncate_to_int: convert each element to an integer, rounding towards zero
+// Version without AVX2: the two 128-bit halves are converted separately
+static inline Vec8i truncate_to_int(Vec8f const & a) {
+    __m128i const lower = _mm_cvttps_epi32(a.get_low());
+    __m128i const upper = _mm_cvttps_epi32(a.get_high());
+    return Vec8i(lower, upper);
+}
+
+// function to_float: convert each 32-bit integer element to float
+// Version without AVX2: the two 128-bit halves are converted separately
+static inline Vec8f to_float(Vec8i const & a) {
+    __m128 const lower = _mm_cvtepi32_ps(a.get_low());
+    __m128 const upper = _mm_cvtepi32_ps(a.get_high());
+    return Vec8f(lower, upper);
+}
+#endif
+#endif // VECTORI256_H
+
+
+// Fused multiply and add functions
+
+// Multiply and add: a * b + c, using a fused instruction when FMA3 or FMA4 is enabled
+static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
+#if defined (__FMA__)
+    return _mm256_fmadd_ps(a, b, c);
+#elif defined (__FMA4__)
+    return _mm256_macc_ps(a, b, c);
+#else
+    Vec8f const product = a * b;       // no fused instruction: separate multiply and add
+    return product + c;
+#endif
+}
+
+// Multiply and subtract: a * b - c, using a fused instruction when FMA3 or FMA4 is enabled
+static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
+#if defined (__FMA__)
+    return _mm256_fmsub_ps(a, b, c);
+#elif defined (__FMA4__)
+    return _mm256_msub_ps(a, b, c);
+#else
+    Vec8f const product = a * b;       // no fused instruction: separate multiply and subtract
+    return product - c;
+#endif
+}
+
+// Multiply and inverse subtract: c - a * b, using a fused instruction when FMA3 or FMA4 is enabled
+static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
+#if defined (__FMA__)
+    return _mm256_fnmadd_ps(a, b, c);
+#elif defined (__FMA4__)
+    return _mm256_nmacc_ps(a, b, c);
+#else
+    Vec8f const product = a * b;       // no fused instruction: separate multiply and subtract
+    return c - product;
+#endif
+}
+
+
+// Multiply and subtract (a * b - c) with extra precision on the intermediate calculations, 
+// even if FMA instructions are not supported, using a Veltkamp-Dekker split of a and b
+static inline Vec8f mul_sub_x(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
+#ifdef __FMA__
+    return _mm256_fmsub_ps(a, b, c);
+#elif defined (__FMA4__)
+    return _mm256_msub_ps(a, b, c);
+#else
+    // calculate a * b - c with extra precision
+    const int b12 = -(1 << 12);                  // mask to remove lower 12 bits
+    Vec8f upper_mask = constant8f<b12,b12,b12,b12,b12,b12,b12,b12>();
+    Vec8f a_high = a & upper_mask;               // split into high and low parts
+    Vec8f b_high = b & upper_mask;
+    Vec8f a_low  = a - a_high;                   // remainder held in the 12 bits that were masked off
+    Vec8f b_low  = b - b_high;
+    Vec8f r1 = a_high * b_high;                  // this product is exact
+    Vec8f r2 = r1 - c;                           // subtract c from high product
+    Vec8f r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product
+    return r3; // + ((r2 - r1) + c);
+#endif
+}
+
+
+// Approximate math functions
+
+// approximate reciprocal (faster than 1.f / a; relative accuracy better than 2^-11)
+static inline Vec8f approx_recipr(Vec8f const & a) {
+    Vec8f const estimate = _mm256_rcp_ps(a);
+    return estimate;
+}
+
+// approximate reciprocal square root (faster than 1.f / sqrt(a); relative accuracy better than 2^-11)
+static inline Vec8f approx_rsqrt(Vec8f const & a) {
+    Vec8f const estimate = _mm256_rsqrt_ps(a);
+    return estimate;
+}
+
+
+// Math functions using fast bit manipulation
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available (with or without AVX2)
+// Extract the biased exponent field of each element and return it, unbiased, as an integer
+// exponent(a) = floor(log2(abs(a))) for normal a
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+static inline Vec8i exponent(Vec8f const & a) {
+#if  VECTORI256_H > 1  // AVX2
+    Vec8ui t1 = _mm256_castps_si256(a);// reinterpret as 32-bit integer
+    Vec8ui t2 = t1 << 1;               // shift out sign bit
+    Vec8ui t3 = t2 >> 24;              // shift down logical to position 0
+    Vec8i  t4 = Vec8i(t3) - 0x7F;      // subtract bias from exponent
+    return t4;
+#else  // no AVX2
+    return Vec8i(exponent(a.get_low()), exponent(a.get_high()));  // handle the two 128-bit halves separately
+#endif
+}
+#endif
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f 
+// The sign bit is discarded: the result always lies in [1.0f, 2.0f)
+static inline Vec8f fraction(Vec8f const & a) {
+    // The guard was "VECTORI256_H > 2", unlike every other AVX2 guard in this file ("> 1"),
+    // so this branch was not selected where the sibling functions take their AVX2 path
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec8ui t1 = _mm256_castps_si256(a);   // reinterpret as 32-bit integer
+    Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F800000; // keep mantissa bits, set exponent to 0 + bias
+    return _mm256_castsi256_ps(t2);
+#else
+    // no AVX2: handle the two 128-bit halves separately
+    return Vec8f(fraction(a.get_low()), fraction(a.get_high()));
+#endif
+}
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available (with or without AVX2)
+// Fast calculation of pow(2,n) with n integer, by writing n + bias into the exponent field
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+static inline Vec8f exp2(Vec8i const & n) {
+#if  VECTORI256_H > 1  // AVX2
+    Vec8i t1 = max(n,  -0x7F);         // limit to allowed range
+    Vec8i t2 = min(t1,  0x80);
+    Vec8i t3 = t2 + 0x7F;              // add bias
+    Vec8i t4 = t3 << 23;               // put exponent into position 23
+    return _mm256_castsi256_ps(t4);    // reinterpret as float
+#else
+    return Vec8f(exp2(n.get_low()), exp2(n.get_high()));  // handle the two 128-bit halves separately
+#endif
+}
+//static inline Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h
+
+#endif // VECTORI256_H
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set,
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb sign_bit(Vec8f const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec8i t1 = _mm256_castps_si256(a);    // reinterpret as 32-bit integer
+    Vec8i t2 = t1 >> 31;                  // extend sign bit
+    return _mm256_castsi256_ps(t2);       // reinterpret as 32-bit Boolean
+#else
+    return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high()));  // handle the two 128-bit halves separately
+#endif
+}
+
+// Function sign_combine: flips the sign of a in every element where b has its sign bit set
+// Equivalent to select(sign_bit(b), -a, a)
+static inline Vec8f sign_combine(Vec8f const & a, Vec8f const & b) {
+    // only bit 31 set in each element, i.e. the bit pattern of -0.0f
+    Vec8f const sign_only = constant8f<(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000>();
+    Vec8f const sign_of_b = b & sign_only;
+    return a ^ sign_of_b;
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero, 
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_finite(Vec8f const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec8i t1 = _mm256_castps_si256(a);    // reinterpret as 32-bit integer
+    Vec8i t2 = t1 << 1;                // shift out sign bit
+    Vec8ib t3 = Vec8i(t2 & 0xFF000000) != 0xFF000000; // exponent field is not all 1s
+    return t3;
+#else
+    return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high()));  // handle the two 128-bit halves separately
+#endif
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF,
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_inf(Vec8f const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer
+    Vec8i t2 = t1 << 1;                // shift out sign bit
+    return t2 == 0xFF000000;           // exponent is all 1s, fraction is 0
+#else
+    return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high()));  // handle the two 128-bit halves separately
+#endif
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN,
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_nan(Vec8f const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer
+    Vec8i t2 = t1 << 1;                // shift out sign bit
+    Vec8i t3 = 0xFF000000;             // exponent mask
+    Vec8i t4 = t2 & t3;                // exponent
+    Vec8i t5 = _mm256_andnot_si256(t3,t2);// fraction
+    return Vec8ib(t4 == t3 && t5 != 0);// exponent = all 1s and fraction != 0
+#else
+    return Vec8fb(is_nan(a.get_low()), is_nan(a.get_high()));  // handle the two 128-bit halves separately
+#endif
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal),
+// false for normal numbers, zero, NAN and INF
+static inline Vec8fb is_subnormal(Vec8f const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec8i t1 = _mm256_castps_si256(a);    // reinterpret as 32-bit integer
+    Vec8i t2 = t1 << 1;                   // shift out sign bit
+    Vec8i t3 = 0xFF000000;                // exponent mask
+    Vec8i t4 = t2 & t3;                   // exponent
+    Vec8i t5 = _mm256_andnot_si256(t3,t2);// fraction
+    return Vec8ib(t4 == 0 && t5 != 0);    // exponent = 0 and fraction != 0
+#else
+    return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high()));  // handle the two 128-bit halves separately
+#endif
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal),
+// false for normal numbers, NAN and INF
+static inline Vec8fb is_zero_or_subnormal(Vec8f const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1   // 256 bit integer vectors are available, AVX2
+    Vec8i t = _mm256_castps_si256(a);            // reinterpret as 32-bit integer
+          t &= 0x7F800000;                       // isolate exponent
+    return t == 0;                               // exponent = 0
+#else
+    return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));  // handle the two 128-bit halves separately
+#endif
+}
+
+// Function infinite8f: returns a vector where all 8 elements are +INF
+static inline Vec8f infinite8f() {
+    // 0x7F800000 = exponent field all ones, fraction zero, sign clear
+    Vec8f const all_inf = constant8f<0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000>();
+    return all_inf;
+}
+
+// Function nan8f: returns a vector where all 8 elements are +NAN (quiet)
+// n is added to the quiet-NAN bit pattern 0x7FC00000 (default 0x10)
+static inline Vec8f nan8f(int n = 0x10) {
+    __m256i const nan_bits = _mm256_set1_epi32(0x7FC00000 + n);
+    return _mm256_castsi256_ps(nan_bits);
+}
+
+// change signs on vectors Vec8f
+// Each index i0 - i7 is nonzero (normally 1) for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f change_sign(Vec8f const & a) {
+    if ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a;   // nothing to flip
+    // 0x80000000 is an unsigned literal; the (int) cast keeps each template argument a signed int,
+    // as the other constant8f call sites in this file do, and avoids a narrowing conversion
+    // in a template argument (rejected by C++11 compilers)
+    __m256 mask = constant8f<i0 ? (int)0x80000000 : 0, i1 ? (int)0x80000000 : 0, i2 ? (int)0x80000000 : 0, i3 ? (int)0x80000000 : 0,
+        i4 ? (int)0x80000000 : 0, i5 ? (int)0x80000000 : 0, i6 ? (int)0x80000000 : 0, i7 ? (int)0x80000000 : 0> ();
+    return _mm256_xor_ps(a, mask);   // xor with the sign bit flips the sign
+}
+
+
+/*****************************************************************************
+*
+*          Vec4d: Vector of 4 double precision floating point values
+*
+*****************************************************************************/
+
+class Vec4d {
+protected:
+    __m256d ymm; // double vector (4 x 64-bit)
+public:
+    // Default constructor (leaves the vector uninitialized):
+    Vec4d() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec4d(double d) {
+        ymm = _mm256_set1_pd(d);
+    }
+    // Constructor to build from all elements (d0 goes to element 0):
+    Vec4d(double d0, double d1, double d2, double d3) {
+        ymm = _mm256_setr_pd(d0, d1, d2, d3); 
+    }
+    // Constructor to build from two Vec2d (a0 = low half, a1 = high half):
+    Vec4d(Vec2d const & a0, Vec2d const & a1) {
+        ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0), _mm_castpd_ps(a1)));
+        //ymm = _mm256_set_m128d(a1, a0);
+    }
+    // Constructor to convert from type __m256d used in intrinsics:
+    Vec4d(__m256d const & x) {
+        ymm = x;
+    }
+    // Assignment operator to convert from type __m256d used in intrinsics:
+    Vec4d & operator = (__m256d const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m256d used in intrinsics
+    operator __m256d() const {
+        return ymm;
+    }
+    // Member function to load from array (unaligned)
+    Vec4d & load(double const * p) {
+        ymm = _mm256_loadu_pd(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 32
+    Vec4d & load_a(double const * p) {
+        ymm = _mm256_load_pd(p);
+        return *this;
+    }
+    // Member function to store into array (unaligned)
+    void store(double * p) const {
+        _mm256_storeu_pd(p, ymm);
+    }
+    // Member function to store into array, aligned by 32
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 32
+    void store_a(double * p) const {
+        _mm256_store_pd(p, ymm);
+    }
+    // Partial load. Load n elements and set the rest to 0. For n <= 0 or n > 4 the whole vector is set to 0
+    Vec4d & load_partial(int n, double const * p) {
+        if (n > 0 && n <= 2) {
+            *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd());
+        }
+        else if (n > 2 && n <= 4) {
+            *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2));
+        }
+        else {
+            ymm = _mm256_setzero_pd();
+        }
+        return *this;
+    }
+    // Partial store. Store n elements. Nothing is stored here when n > 4
+    void store_partial(int n, double * p) const {
+        if (n <= 2) {
+            get_low().store_partial(n, p);
+        }
+        else if (n <= 4) {
+            get_low().store(p);
+            get_high().store_partial(n - 2, p + 2);
+        }
+    }
+    // cut off vector to n elements. The last 4-n elements are set to zero
+    Vec4d & cutoff(int n) {
+        ymm = _mm256_castps_pd(Vec8f(_mm256_castpd_ps(ymm)).cutoff(n*2));  // each double spans two float lanes
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec4d const & insert(uint32_t index, double value) {
+        __m256d v0 = _mm256_broadcast_sd(&value);
+        switch (index) {
+        case 0:
+            ymm = _mm256_blend_pd (ymm, v0, 1);  break;
+        case 1:
+            ymm = _mm256_blend_pd (ymm, v0, 2);  break;
+        case 2:
+            ymm = _mm256_blend_pd (ymm, v0, 4);  break;
+        default:  // index 3, and any index above 3
+            ymm = _mm256_blend_pd (ymm, v0, 8);  break;
+        }
+        return *this;
+    }
+    // Member function to extract a single element from vector (index is taken modulo 4)
+    double extract(uint32_t index) const {
+        double x[4];
+        store(x);
+        return x[index & 3];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    double operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec2d:
+    Vec2d get_low() const {
+        return _mm256_castpd256_pd128(ymm);
+    }
+    Vec2d get_high() const {
+        return _mm256_extractf128_pd(ymm,1);
+    }
+    static int size () {  // number of elements in the vector
+        return 4;
+    }
+};
+
+
+
+/*****************************************************************************
+*
+*          Operators for Vec4d
+*
+*****************************************************************************/
+
+// vector operator + : per-element sum of a and b
+static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) {
+    Vec4d const sum = _mm256_add_pd(a, b);
+    return sum;
+}
+
+// vector operator + : add a scalar to every element (scalar on either side)
+static inline Vec4d operator + (Vec4d const & a, double b) {
+    Vec4d const broadcast(b);
+    return a + broadcast;
+}
+static inline Vec4d operator + (double a, Vec4d const & b) {
+    Vec4d const broadcast(a);
+    return broadcast + b;
+}
+
+// vector operator += : add b to a in place and return a
+static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) {
+    Vec4d const sum = a + b;
+    a = sum;
+    return a;
+}
+
+// postfix operator ++ : add 1.0 to every element, return the value from before the increment
+static inline Vec4d operator ++ (Vec4d & a, int) {
+    Vec4d const previous(a);
+    a = a + Vec4d(1.0);
+    return previous;
+}
+
+// prefix operator ++ : add 1.0 to every element, return the incremented vector
+static inline Vec4d & operator ++ (Vec4d & a) {
+    Vec4d const incremented = a + Vec4d(1.0);
+    a = incremented;
+    return a;
+}
+
+// vector operator - : per-element difference a - b
+static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) {
+    Vec4d const difference = _mm256_sub_pd(a, b);
+    return difference;
+}
+
+// vector operator - : subtract with a scalar broadcast to every element (scalar on either side)
+static inline Vec4d operator - (Vec4d const & a, double b) {
+    Vec4d const broadcast(b);
+    return a - broadcast;
+}
+static inline Vec4d operator - (double a, Vec4d const & b) {
+    Vec4d const broadcast(a);
+    return broadcast - b;
+}
+
+// vector operator - : unary minus
+// Flips the sign bit of every element, also for 0, INF and NAN
+static inline Vec4d operator - (Vec4d const & a) {
+    // bit 63 of each double = bit 31 of the upper 32-bit half of each element
+    __m256d const sign_mask = _mm256_castps_pd(constant8f<0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000> ());
+    return _mm256_xor_pd(a, sign_mask);
+}
+
+// vector operator -= : subtract b from a in place and return a
+static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) {
+    Vec4d const difference = a - b;
+    a = difference;
+    return a;
+}
+
+// postfix operator -- : subtract 1.0 from every element, return the value from before the decrement
+static inline Vec4d operator -- (Vec4d & a, int) {
+    Vec4d const previous(a);
+    a = a - Vec4d(1.0);
+    return previous;
+}
+
+// prefix operator -- : subtract 1.0 from every element, return the decremented vector
+static inline Vec4d & operator -- (Vec4d & a) {
+    Vec4d const decremented = a - Vec4d(1.0);
+    a = decremented;
+    return a;
+}
+
+// vector operator * : per-element product of a and b
+static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) {
+    Vec4d const product = _mm256_mul_pd(a, b);
+    return product;
+}
+
+// vector operator * : multiply every element by a scalar (scalar on either side)
+static inline Vec4d operator * (Vec4d const & a, double b) {
+    Vec4d const broadcast(b);
+    return a * broadcast;
+}
+static inline Vec4d operator * (double a, Vec4d const & b) {
+    Vec4d const broadcast(a);
+    return broadcast * b;
+}
+
+// vector operator *= : multiply a by b in place and return a
+static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) {
+    Vec4d const product = a * b;
+    a = product;
+    return a;
+}
+
+// vector operator / : per-element quotient a / b
+static inline Vec4d operator / (Vec4d const & a, Vec4d const & b) {
+    Vec4d const quotient = _mm256_div_pd(a, b);
+    return quotient;
+}
+
+// vector operator / : divide with a scalar broadcast to every element (scalar on either side)
+static inline Vec4d operator / (Vec4d const & a, double b) {
+    Vec4d const broadcast(b);
+    return a / broadcast;
+}
+static inline Vec4d operator / (double a, Vec4d const & b) {
+    Vec4d const broadcast(a);
+    return broadcast / b;
+}
+
+// vector operator /= : divide a by b in place and return a
+static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) {
+    Vec4d const quotient = a / b;
+    a = quotient;
+    return a;
+}
+
+// vector operator == : Boolean vector, true in the elements where a == b
+static inline Vec4db operator == (Vec4d const & a, Vec4d const & b) {
+    return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);    // predicate 0
+}
+
+// vector operator != : Boolean vector, true in the elements where a != b
+static inline Vec4db operator != (Vec4d const & a, Vec4d const & b) {
+    return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ);   // predicate 4
+}
+
+// vector operator < : Boolean vector, true in the elements where a < b
+static inline Vec4db operator < (Vec4d const & a, Vec4d const & b) {
+    return _mm256_cmp_pd(a, b, _CMP_LT_OS);    // predicate 1
+}
+
+// vector operator <= : Boolean vector, true in the elements where a <= b
+static inline Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
+    return _mm256_cmp_pd(a, b, _CMP_LE_OS);    // predicate 2
+}
+
+// vector operator > : Boolean vector, true in the elements where a > b (computed as b < a)
+static inline Vec4db operator > (Vec4d const & a, Vec4d const & b) {
+    Vec4db const b_below_a = b < a;
+    return b_below_a;
+}
+
+// vector operator >= : Boolean vector, true in the elements where a >= b (computed as b <= a)
+static inline Vec4db operator >= (Vec4d const & a, Vec4d const & b) {
+    Vec4db const b_not_above_a = b <= a;
+    return b_not_above_a;
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and of the raw bit patterns
+static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) {
+    Vec4d const bits = _mm256_and_pd(a, b);
+    return bits;
+}
+
+// vector operator &= : bitwise and in place, returns a
+static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) {
+    Vec4d const bits = a & b;
+    a = bits;
+    return a;
+}
+
+// vector operator & : bitwise and of a Vec4d with a Boolean vector Vec4db (either order)
+static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) {
+    Vec4d const masked = _mm256_and_pd(a, b);
+    return masked;
+}
+static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) {
+    Vec4d const masked = _mm256_and_pd(a, b);
+    return masked;
+}
+
+// vector operator | : bitwise or of the raw bit patterns
+static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) {
+    Vec4d const bits = _mm256_or_pd(a, b);
+    return bits;
+}
+
+// vector operator |= : bitwise or in place, returns a
+static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) {
+    Vec4d const bits = a | b;
+    a = bits;
+    return a;
+}
+
+// vector operator ^ : bitwise xor of the raw bit patterns
+static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) {
+    Vec4d const bits = _mm256_xor_pd(a, b);
+    return bits;
+}
+
+// vector operator ^= : bitwise xor in place, returns a
+static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) {
+    Vec4d const bits = a ^ b;
+    a = bits;
+    return a;
+}
+
+// vector operator ! : logical not. Boolean vector, true in the elements where a == 0.0
+static inline Vec4db operator ! (Vec4d const & a) {
+    Vec4d const zero(0.0);
+    return a == zero;
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec4d
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). 
+// No other values are allowed.
+static inline Vec4d select (Vec4db const & s, Vec4d const & a, Vec4d const & b) {
+    // blendv takes the "false" operand first and the "true" operand second
+    Vec4d const chosen = _mm256_blendv_pd(b, a, s);
+    return chosen;
+}
+
+// Conditional add: for every element i, result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4d if_add (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
+    Vec4d const addend = Vec4d(f) & b;   // b where f is true, all bits zero (+0.0) elsewhere
+    return a + addend;
+}
+
+// Conditional multiply: for every element i, result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
+    Vec4d const factor = select(f, b, Vec4d(1.));   // b where f is true, 1.0 elsewhere
+    return a * factor;
+}
+
+
+// General arithmetic functions, etc.
+
+// Horizontal add: returns the sum of all four vector elements
+static inline double horizontal_add (Vec4d const & a) {
+    __m256d const pair_sums = _mm256_hadd_pd(a, a);                  // a0+a1 in the low lane, a2+a3 in the high lane
+    __m128d const high_sum  = _mm256_extractf128_pd(pair_sums, 1);
+    __m128d const low_sum   = _mm256_castpd256_pd128(pair_sums);
+    __m128d const total     = _mm_add_sd(low_sum, high_sum);
+    return _mm_cvtsd_f64(total);
+}
+
+// function max: per-element maximum of a and b (a > b ? a : b)
+static inline Vec4d max(Vec4d const & a, Vec4d const & b) {
+    Vec4d const larger = _mm256_max_pd(a, b);
+    return larger;
+}
+
+// function min: per-element minimum of a and b (a < b ? a : b)
+static inline Vec4d min(Vec4d const & a, Vec4d const & b) {
+    Vec4d const smaller = _mm256_min_pd(a, b);
+    return smaller;
+}
+
+// function abs: absolute value of each element
+// The sign bit is cleared unconditionally, also for -0.0, -INF and -NAN
+static inline Vec4d abs(Vec4d const & a) {
+    // all bits set except bit 63 of each double (bit 31 of the upper 32-bit half)
+    __m256d const clear_sign = _mm256_castps_pd(constant8f<-1,0x7FFFFFFF,-1,0x7FFFFFFF,-1,0x7FFFFFFF,-1,0x7FFFFFFF> ());
+    return _mm256_and_pd(a, clear_sign);
+}
+
+// function sqrt: per-element square root
+static inline Vec4d sqrt(Vec4d const & a) {
+    Vec4d const root = _mm256_sqrt_pd(a);
+    return root;
+}
+
+// function square: per-element a * a
+static inline Vec4d square(Vec4d const & a) {
+    Vec4d const squared = a * a;
+    return squared;
+}
+
+// pow(Vec4d, int): primary template, declared only; the int and uint32_t specializations follow
+template <typename TT> static Vec4d pow(Vec4d const & a, TT n);
+
+// Raise each element of x0 to the run-time integer power n (delegates to pow_template_i)
+template <>
+inline Vec4d pow<int>(Vec4d const & x0, int n) {
+    Vec4d const result = pow_template_i<Vec4d>(x0, n);
+    return result;
+}
+
+// Overload for unsigned exponents: n is converted to int and handled like pow<int>
+template <>
+inline Vec4d pow<uint32_t>(Vec4d const & x0, uint32_t n) {
+    int const signed_n = (int)n;
+    return pow_template_i<Vec4d>(x0, signed_n);
+}
+
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant (square-and-multiply, unrolled over bits 0-7 of n)
+template <int n>
+static inline Vec4d pow_n(Vec4d const & a) {
+    if (n < 0)    return Vec4d(1.0) / pow_n<-n>(a);    // negative power: reciprocal of the positive power
+    if (n == 0)   return Vec4d(1.0);                   // a^0 = 1
+    if (n >= 256) return pow(a, n);                    // more than 8 bits: use the run-time pow
+    Vec4d x = a;                       // a^(2^i)
+    Vec4d y;                           // accumulator; left uninitialized, its first write is a plain assignment
+    const int lowest = n - (n & (n-1));// lowest set bit in n
+    if (n & 1) y = x;                  // bit 0 set: accumulator starts as a^1
+    if (n < 2) return y;
+    x = x*x;                           // x^2
+    if (n & 2) {
+        if (lowest == 2) y = x; else y *= x;   // lowest set bit initializes y, higher bits multiply in
+    }
+    if (n < 4) return y;
+    x = x*x;                           // x^4
+    if (n & 4) {
+        if (lowest == 4) y = x; else y *= x;
+    }
+    if (n < 8) return y;
+    x = x*x;                           // x^8
+    if (n & 8) {
+        if (lowest == 8) y = x; else y *= x;
+    }
+    if (n < 16) return y;
+    x = x*x;                           // x^16
+    if (n & 16) {
+        if (lowest == 16) y = x; else y *= x;
+    }
+    if (n < 32) return y;
+    x = x*x;                           // x^32
+    if (n & 32) {
+        if (lowest == 32) y = x; else y *= x;
+    }
+    if (n < 64) return y;
+    x = x*x;                           // x^64
+    if (n & 64) {
+        if (lowest == 64) y = x; else y *= x;
+    }
+    if (n < 128) return y;
+    x = x*x;                           // x^128
+    if (n & 128) {
+        if (lowest == 128) y = x; else y *= x;
+    }
+    return y;
+}
+
+// pow with a compile-time exponent passed as a Const_int_t<n> tag: forwards to pow_n<n>
+template <int n>
+static inline Vec4d pow(Vec4d const & a, Const_int_t<n>) {
+    Vec4d const result = pow_n<n>(a);
+    return result;
+}
+
+
+// function round: round each element to the nearest integer, ties to even (result stays a double vector)
+static inline Vec4d round(Vec4d const & a) {
+    return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT);   // rounding control 0
+}
+
+// function truncate: round each element towards zero (result stays a double vector)
+static inline Vec4d truncate(Vec4d const & a) {
+    return _mm256_round_pd(a, _MM_FROUND_TO_ZERO);   // rounding control 3
+}
+
+// function floor: round each element towards minus infinity (result stays a double vector)
+static inline Vec4d floor(Vec4d const & a) {
+    return _mm256_round_pd(a, _MM_FROUND_TO_NEG_INF);   // rounding control 1
+}
+
+// function ceil: round each element towards plus infinity (result stays a double vector)
+static inline Vec4d ceil(Vec4d const & a) {
+    return _mm256_round_pd(a, _MM_FROUND_TO_POS_INF);   // rounding control 2
+}
+
+// function round_to_int: convert each element to the nearest 32-bit integer (even), result as integer vector
+static inline Vec4i round_to_int(Vec4d const & a) {
+    // The conversion follows the MXCSR rounding mode, assumed here to be round-to-nearest
+    Vec4i const rounded = _mm256_cvtpd_epi32(a);
+    return rounded;
+}
+
+// function truncate_to_int: convert each element to a 32-bit integer, rounding towards zero
+static inline Vec4i truncate_to_int(Vec4d const & a) {
+    Vec4i const truncated = _mm256_cvttpd_epi32(a);
+    return truncated;
+}
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+
+// function truncate_to_int64: round towards zero, result as 64-bit integer vector
+// (inefficient: goes through memory and converts one element at a time)
+static inline Vec4q truncate_to_int64(Vec4d const & a) {
+    double lanes[4];
+    a.store(lanes);
+    int64_t const q0 = int64_t(lanes[0]);
+    int64_t const q1 = int64_t(lanes[1]);
+    int64_t const q2 = int64_t(lanes[2]);
+    int64_t const q3 = int64_t(lanes[3]);
+    return Vec4q(q0, q1, q2, q3);
+}
+
+// function truncate_to_int64_limited: round towards zero.
+// result as 64-bit integer vector, but with limited range: the AVX2 path converts through 32-bit integers
+static inline Vec4q truncate_to_int64_limited(Vec4d const & a) {
+#if VECTORI256_H > 1
+    // Note: this is a truncating conversion (cvtt), so it does not depend on the MXCSR rounding mode
+    Vec2q   b = _mm256_cvttpd_epi32(a);                    // truncate to 32-bit integers
+    __m256i c = permute4q<0,-256,1,-256>(Vec4q(b,b));      // get bits 64-127 to position 128-191
+    __m256i s = _mm256_srai_epi32(c, 31);                  // sign extension bits
+    return      _mm256_unpacklo_epi32(c, s);               // interleave with sign extensions
+#else
+    return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high()));  // handle the two 128-bit halves separately
+#endif
+} 
+
+// function round_to_int64: round to nearest or even, result as 64-bit integer vector (inefficient)
+static inline Vec4q round_to_int64(Vec4d const & a) {
+    Vec4d const rounded = round(a);        // round first, so the truncation below is exact
+    return truncate_to_int64(rounded);
+}
+
+// function round_to_int64_limited: round to nearest integer (even)
+// result as 64-bit integer vector, but with limited range: the AVX2 path converts through 32-bit integers
+static inline Vec4q round_to_int64_limited(Vec4d const & a) {
+#if VECTORI256_H > 1
+    // Note: assume MXCSR control register is set to rounding
+    Vec2q   b = _mm256_cvtpd_epi32(a);                     // round to 32-bit integers
+    __m256i c = permute4q<0,-256,1,-256>(Vec4q(b,b));      // get bits 64-127 to position 128-191
+    __m256i s = _mm256_srai_epi32(c, 31);                  // sign extension bits
+    return      _mm256_unpacklo_epi32(c, s);               // interleave with sign extensions
+#else
+    return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high()));  // handle the two 128-bit halves separately
+#endif
+}
+
+// function to_double: convert 64-bit integer vector elements to double
+// (inefficient: goes through memory and converts one element at a time)
+static inline Vec4d to_double(Vec4q const & a) {
+    int64_t lanes[4];
+    a.store(lanes);
+    double const d0 = double(lanes[0]);
+    double const d1 = double(lanes[1]);
+    double const d2 = double(lanes[2]);
+    double const d3 = double(lanes[3]);
+    return Vec4d(d0, d1, d2, d3);
+}
+
+// function to_double_limited: convert 64-bit integer vector elements to double
+// Only the low 32 bits of each element are used, so the input is limited to abs(x) < 2^31
+static inline Vec4d to_double_limited(Vec4q const & x) {
+    // gather the low 32-bit half of each 64-bit element into the lower four positions
+    Vec8i low_halves = permute8i<0,2,4,6,-256,-256,-256,-256>(Vec8i(x));
+    return _mm256_cvtepi32_pd(low_halves.get_low());  // AVX
+}
+
+#endif // VECTORI256_H
+
+// function to_double: convert each 32-bit integer element to double
+static inline Vec4d to_double(Vec4i const & a) {
+    Vec4d const converted = _mm256_cvtepi32_pd(a);
+    return converted;
+}
+
+// function compress: convert two Vec4d to one Vec8f (low fills elements 0-3, high fills elements 4-7)
+static inline Vec8f compress (Vec4d const & low, Vec4d const & high) {
+    __m128 const low_floats  = _mm256_cvtpd_ps(low);
+    __m128 const high_floats = _mm256_cvtpd_ps(high);
+    return Vec8f(low_floats, high_floats);
+}
+
+// Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d
+static inline Vec4d extend_low(Vec8f const & a) {
+    __m128 const lower_floats = _mm256_castps256_ps128(a);
+    return _mm256_cvtps_pd(lower_floats);
+}
+
+// Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d
+static inline Vec4d extend_high (Vec8f const & a) {
+    __m128 const upper_floats = _mm256_extractf128_ps(a, 1);
+    return _mm256_cvtps_pd(upper_floats);
+}
+
+// Fused multiply and add functions
+
+// Multiply and add: a * b + c, using a fused instruction when FMA3 or FMA4 is enabled
+static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
+#if defined (__FMA__)
+    return _mm256_fmadd_pd(a, b, c);
+#elif defined (__FMA4__)
+    return _mm256_macc_pd(a, b, c);
+#else
+    Vec4d const product = a * b;       // no fused instruction: separate multiply and add
+    return product + c;
+#endif
+}
+
+
+// Multiply and subtract: a * b - c, using a fused instruction when FMA3 or FMA4 is enabled
+static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
+#if defined (__FMA__)
+    return _mm256_fmsub_pd(a, b, c);
+#elif defined (__FMA4__)
+    return _mm256_msub_pd(a, b, c);
+#else
+    Vec4d const product = a * b;       // no fused instruction: separate multiply and subtract
+    return product - c;
+#endif
+}
+
+// Multiply and inverse subtract: c - a * b
+// Uses a fused instruction when FMA3 or FMA4 is enabled, otherwise a separate multiply and subtract
+static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
+#if defined (__FMA__)
+    __m256d fused = _mm256_fnmadd_pd(a, b, c);     // FMA3
+    return fused;
+#elif defined (__FMA4__)
+    __m256d fused = _mm256_nmacc_pd(a, b, c);      // FMA4
+    return fused;
+#else
+    Vec4d product = a * b;                         // no FMA: two separate operations
+    return c - product;
+#endif
+}
+
+// Multiply and subtract (a * b - c) with extra precision on the intermediate product.
+// With FMA3/FMA4 the fused instruction is used directly; without FMA the operands
+// are split into high and low parts (Veltkamp-Dekker split) and the partial products summed
+static inline Vec4d mul_sub_x(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
+#if defined (__FMA__)
+    return _mm256_fmsub_pd(a, b, c);
+#elif defined (__FMA4__)
+    return _mm256_msub_pd(a, b, c);
+#else
+    // mask that clears the lower 27 bits of every double (low dword & 0xF8000000, high dword kept)
+    Vec4d hi_mask = _mm256_castps_pd(constant8f<(int)0xF8000000,-1,(int)0xF8000000,-1,(int)0xF8000000,-1,(int)0xF8000000,-1>());
+    // split each factor into a high part and the remaining low part
+    Vec4d a_hi = a & hi_mask;
+    Vec4d b_hi = b & hi_mask;
+    Vec4d a_lo = a - a_hi;
+    Vec4d b_lo = b - b_hi;
+    Vec4d hi_product = a_hi * b_hi;                // this product is exact
+    Vec4d hi_minus_c = hi_product - c;             // subtract c from the high product first
+    Vec4d cross      = a_hi * b_lo + b_hi * a_lo;  // mixed high/low terms
+    Vec4d lo_product = a_lo * b_lo;
+    // same summation order as before: (high - c) + cross terms + low product.
+    // A further correction term + ((hi_minus_c - hi_product) + c) is not applied.
+    Vec4d result = hi_minus_c + cross + lo_product;
+    return result;
+#endif
+}
+
+
+// Math functions using fast bit manipulation
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+static inline Vec4q exponent(Vec4d const & a) {
+#if VECTORI256_H > 1  // AVX2
+    Vec4uq bits    = _mm256_castpd_si256(a);       // raw bit pattern as 64-bit integers
+    Vec4uq no_sign = bits << 1;                    // push the sign bit out at the top
+    Vec4uq biased  = no_sign >> 53;                // logical shift: exponent field now at bit 0
+    return Vec4q(biased) - 0x3FF;                  // remove the exponent bias
+#else
+    // no 256-bit integer instructions: process the two 128-bit halves separately
+    return Vec4q(exponent(a.get_low()), exponent(a.get_high()));
+#endif
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25
+static inline Vec4d fraction(Vec4d const & a) {
+#if VECTORI256_H > 1  // AVX2
+    Vec4uq bits = _mm256_castpd_si256(a);          // raw bit pattern as 64-bit integers
+    // keep the 52 mantissa bits, clear the sign and force the exponent field to the bias (2^0)
+    Vec4uq unit_scaled = Vec4uq((bits & 0x000FFFFFFFFFFFFF) | 0x3FF0000000000000);
+    return _mm256_castsi256_pd(unit_scaled);
+#else
+    // no 256-bit integer instructions: process the two 128-bit halves separately
+    return Vec4d(fraction(a.get_low()), fraction(a.get_high()));
+#endif
+}
+
+// Fast calculation of pow(2,n) with n integer, done by writing n + bias
+// directly into the exponent field of a double
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+static inline Vec4d exp2(Vec4q const & n) {
+#if VECTORI256_H > 1  // AVX2
+    Vec4q clamped_low = max(n,  -0x3FF);           // lower limit: biased exponent 0 (zero)
+    Vec4q clamped     = min(clamped_low,  0x400);  // upper limit: biased exponent 0x7FF (INF)
+    Vec4q biased      = clamped + 0x3FF;           // add exponent bias
+    Vec4q positioned  = biased << 52;              // move into the exponent field
+    return _mm256_castsi256_pd(positioned);        // reinterpret as double
+#else
+    // no 256-bit integer instructions: process the two 128-bit halves separately
+    return Vec4d(exp2(n.get_low()), exp2(n.get_high()));
+#endif
+}
+//static inline Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h
+#endif
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false
+static inline Vec4db sign_bit(Vec4d const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec4q bits   = _mm256_castpd_si256(a);         // raw bit pattern as 64-bit integers
+    Vec4q spread = bits >> 63;                     // arithmetic shift copies the sign bit to all 64 bits
+    return _mm256_castsi256_pd(spread);            // all-ones / all-zeros element = 64-bit Boolean
+#else
+    // no 256-bit integer instructions: process the two 128-bit halves separately
+    return Vec4db(sign_bit(a.get_low()),sign_bit(a.get_high()));
+#endif
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec4d sign_combine(Vec4d const & a, Vec4d const & b) {
+    // -0.0 in every element: only the sign bit (bit 31 of the upper dword) is set
+    Vec4d minus_zero = _mm256_castps_pd(constant8f<0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000>());
+    Vec4d flip = b & minus_zero;                   // isolate the sign bits of b
+    return a ^ flip;                               // toggle the sign of a where b is negative
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+static inline Vec4db is_finite(Vec4d const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec4q bits     = _mm256_castpd_si256(a);       // raw bit pattern as 64-bit integers
+    Vec4q no_sign  = bits << 1;                    // shift out the sign bit
+    Vec4q exp_mask = 0xFFE0000000000000;           // exponent field after the shift
+    Vec4qb finite  = Vec4q(no_sign & exp_mask) != exp_mask;  // exponent field is not all 1s
+    return finite;
+#else
+    // no 256-bit integer instructions: process the two 128-bit halves separately
+    return Vec4db(is_finite(a.get_low()),is_finite(a.get_high()));
+#endif
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+static inline Vec4db is_inf(Vec4d const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec4q bits    = _mm256_castpd_si256(a);        // raw bit pattern as 64-bit integers
+    Vec4q no_sign = bits << 1;                     // shift out the sign bit
+    // infinity: exponent field all 1s and fraction exactly 0
+    return no_sign == 0xFFE0000000000000;
+#else
+    // no 256-bit integer instructions: process the two 128-bit halves separately
+    return Vec4db(is_inf(a.get_low()),is_inf(a.get_high()));
+#endif
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+static inline Vec4db is_nan(Vec4d const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec4q bits     = _mm256_castpd_si256(a);       // raw bit pattern as 64-bit integers
+    Vec4q no_sign  = bits << 1;                    // shift out the sign bit
+    Vec4q exp_mask = 0xFFE0000000000000;           // exponent field after the shift
+    Vec4q exp_bits = no_sign & exp_mask;           // exponent
+    Vec4q frac     = _mm256_andnot_si256(exp_mask,no_sign); // fraction = everything outside the mask
+    return Vec4qb(exp_bits == exp_mask && frac != 0);       // exponent = all 1s and fraction != 0
+#else
+    // no 256-bit integer instructions: process the two 128-bit halves separately
+    return Vec4db(is_nan(a.get_low()),is_nan(a.get_high()));
+#endif
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec4db is_subnormal(Vec4d const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec4q bits     = _mm256_castpd_si256(a);       // raw bit pattern as 64-bit integers
+    Vec4q no_sign  = bits << 1;                    // shift out the sign bit
+    Vec4q exp_mask = 0xFFE0000000000000;           // exponent field after the shift
+    Vec4q exp_bits = no_sign & exp_mask;           // exponent
+    Vec4q frac     = _mm256_andnot_si256(exp_mask,no_sign); // fraction = everything outside the mask
+    return Vec4qb(exp_bits == 0 && frac != 0);     // exponent = 0 and fraction != 0
+#else
+    // no 256-bit integer instructions: process the two 128-bit halves separately
+    return Vec4db(is_subnormal(a.get_low()),is_subnormal(a.get_high()));
+#endif
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec4db is_zero_or_subnormal(Vec4d const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec4q exp_bits = _mm256_castpd_si256(a);       // raw bit pattern as 64-bit integers
+    exp_bits &= 0x7FF0000000000000ll;              // keep only the exponent field
+    return exp_bits == 0;                          // exponent = 0 means zero or subnormal
+#else
+    // no 256-bit integer instructions: process the two 128-bit halves separately
+    return Vec4db(is_zero_or_subnormal(a.get_low()),is_zero_or_subnormal(a.get_high()));
+#endif
+}
+
+// Function infinite4d: returns a vector where all elements are +INF
+static inline Vec4d infinite4d() {
+    // each double is built from two dwords: low = 0, high = 0x7FF00000
+    // (sign 0, exponent all 1s, fraction 0)
+    __m256 inf_bits = constant8f<0,0x7FF00000,0,0x7FF00000,0,0x7FF00000,0,0x7FF00000>();
+    return _mm256_castps_pd(inf_bits);
+}
+
+// Function nan4d: returns a vector where all elements are +NAN (quiet)
+// n is added to the quiet-NAN bit pattern, i.e. it ends up in the low fraction bits
+static inline Vec4d nan4d(int n = 0x10) {
+#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
+    Vec4q pattern(0x7FF8000000000000 + n);         // exponent all 1s, top fraction bit set, plus n
+    return _mm256_castsi256_pd(pattern);
+#else
+    // no 256-bit integer instructions: build the two 128-bit halves separately
+    return Vec4d(nan2d(n),nan2d(n));
+#endif
+}
+
+// change signs on vectors Vec4d
+// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
+// (any nonzero index is treated as "change sign")
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d change_sign(Vec4d const & a) {
+    if ((i0 | i1 | i2 | i3) == 0) return a;        // nothing to flip
+    // The sign bit of each double is bit 31 of its upper 32-bit word, so the mask has
+    // 0x80000000 in the odd dword positions of the selected elements.
+    // The literal is cast to int, as elsewhere in this file: 0x80000000 is an unsigned
+    // value that does not fit in the int template parameter, and an uncast
+    // unsigned-to-int narrowing in a template argument is rejected by C++11 compilers.
+    __m256 mask = constant8f<0, i0 ? (int)0x80000000 : 0, 0, i1 ? (int)0x80000000 : 0, 0, i2 ? (int)0x80000000 : 0, 0, i3 ? (int)0x80000000 : 0> ();
+    return _mm256_xor_pd(a, _mm256_castps_pd(mask));  // XOR toggles the selected sign bits
+}
+
+
+/*****************************************************************************
+*
+*          Functions for reinterpretation between vector types
+*
+*****************************************************************************/
+
+#if defined (VECTORI256_H) && VECTORI256_H >= 2
+// AVX2 vectors defined
+
+
+// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors.
+// It is recommended to compile with -fabi-version=0 to get the latest abi version
+#if !defined (GCC_VERSION) || (defined (__GXX_ABI_VERSION) && __GXX_ABI_VERSION >= 1004)  
+// Bit-pattern reinterpretation between the three 256-bit intrinsic register types.
+// reinterpret_i returns __m256i, reinterpret_f returns __m256, reinterpret_d returns __m256d.
+// Each overload either returns its argument unchanged or applies the matching
+// _mm256_castXX_YY intrinsic; no element values are converted.
+static inline __m256i reinterpret_i (__m256i const & x) {
+    return x;
+}
+
+static inline __m256i reinterpret_i (__m256  const & x) {
+    return _mm256_castps_si256(x);
+}
+
+static inline __m256i reinterpret_i (__m256d const & x) {
+    return _mm256_castpd_si256(x);
+}
+
+static inline __m256  reinterpret_f (__m256i const & x) {
+    return _mm256_castsi256_ps(x);
+}
+
+static inline __m256  reinterpret_f (__m256  const & x) {
+    return x;
+}
+
+static inline __m256  reinterpret_f (__m256d const & x) {
+    return _mm256_castpd_ps(x);
+}
+
+static inline __m256d reinterpret_d (__m256i const & x) {
+    return _mm256_castsi256_pd(x);
+}
+
+static inline __m256d reinterpret_d (__m256  const & x) {
+    return _mm256_castps_pd(x);
+}
+
+static inline __m256d reinterpret_d (__m256d const & x) {
+    return x;
+}
+
+#else  // __GXX_ABI_VERSION < 1004
+
+// Same reinterpretation functions for the old Gcc ABI branch: the overloads take the
+// vector class types (Vec32c, Vec16s, Vec8i, Vec4q, Vec8f, Vec4d) as parameters instead
+// of the raw intrinsic types. The returned types are still __m256i, __m256 and __m256d.
+static inline __m256i reinterpret_i (Vec32c const & x) {
+    return x;
+}
+
+static inline __m256i reinterpret_i (Vec16s const & x) {
+    return x;
+}
+
+static inline __m256i reinterpret_i (Vec8i const & x) {
+    return x;
+}
+
+static inline __m256i reinterpret_i (Vec4q const & x) {
+    return x;
+}
+
+static inline __m256i reinterpret_i (Vec8f  const & x) {
+    return _mm256_castps_si256(x);
+}
+
+static inline __m256i reinterpret_i (Vec4d const & x) {
+    return _mm256_castpd_si256(x);
+}
+
+// integer and double vectors reinterpreted as 8 x float
+static inline __m256  reinterpret_f (Vec32c const & x) {
+    return _mm256_castsi256_ps(x);
+}
+
+static inline __m256  reinterpret_f (Vec16s const & x) {
+    return _mm256_castsi256_ps(x);
+}
+
+static inline __m256  reinterpret_f (Vec8i const & x) {
+    return _mm256_castsi256_ps(x);
+}
+
+static inline __m256  reinterpret_f (Vec4q const & x) {
+    return _mm256_castsi256_ps(x);
+}
+
+static inline __m256  reinterpret_f (Vec8f  const & x) {
+    return x;
+}
+
+static inline __m256  reinterpret_f (Vec4d const & x) {
+    return _mm256_castpd_ps(x);
+}
+
+// integer and float vectors reinterpreted as 4 x double
+static inline __m256d reinterpret_d (Vec32c const & x) {
+    return _mm256_castsi256_pd(x);
+}
+
+static inline __m256d reinterpret_d (Vec16s const & x) {
+    return _mm256_castsi256_pd(x);
+}
+
+static inline __m256d reinterpret_d (Vec8i const & x) {
+    return _mm256_castsi256_pd(x);
+}
+
+static inline __m256d reinterpret_d (Vec4q const & x) {
+    return _mm256_castsi256_pd(x);
+}
+
+static inline __m256d reinterpret_d (Vec8f  const & x) {
+    return _mm256_castps_pd(x);
+}
+
+static inline __m256d reinterpret_d (Vec4d const & x) {
+    return x;
+}
+
+#endif  // __GXX_ABI_VERSION
+
+#else
+// AVX2 emulated in vectori256e.h, AVX supported
+
+// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors.
+// It is recommended to compile with -fabi-version=0 to get the latest abi version
+#if !defined (GCC_VERSION) || (defined (__GXX_ABI_VERSION) && __GXX_ABI_VERSION >= 1004)  
+
+// Reinterpretation functions for the branch where 256-bit integer vectors are emulated:
+// the integer result type is Vec256ie, built from the reinterpreted lower and upper
+// halves of the floating point vector.
+static inline Vec256ie reinterpret_i (__m256  const & x) {
+    Vec8f xx(x);
+    return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
+}
+
+static inline Vec256ie reinterpret_i (__m256d const & x) {
+    Vec4d xx(x);
+    return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
+}
+
+// float <-> double reinterpretation uses the AVX cast intrinsics directly
+static inline __m256  reinterpret_f (__m256  const & x) {
+    return x;
+}
+
+static inline __m256  reinterpret_f (__m256d const & x) {
+    return _mm256_castpd_ps(x);
+}
+
+static inline __m256d reinterpret_d (__m256  const & x) {
+    return _mm256_castps_pd(x);
+}
+
+static inline __m256d reinterpret_d (__m256d const & x) {
+    return x;
+}
+
+#else  // __GXX_ABI_VERSION < 1004
+
+// Old Gcc ABI variant of the emulated-integer branch: the parameters are the vector
+// class types Vec8f / Vec4d. The integer result is a Vec256ie built from the
+// reinterpreted lower and upper halves.
+static inline Vec256ie reinterpret_i (Vec8f const & x) {
+    Vec8f xx(x);
+    return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
+}
+
+static inline Vec256ie reinterpret_i (Vec4d const & x) {
+    Vec4d xx(x);
+    return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
+}
+
+// float <-> double reinterpretation uses the AVX cast intrinsics directly
+static inline __m256  reinterpret_f (Vec8f const & x) {
+    return x;
+}
+
+static inline __m256  reinterpret_f (Vec4d const & x) {
+    return _mm256_castpd_ps(x);
+}
+
+static inline __m256d reinterpret_d (Vec8f const & x) {
+    return _mm256_castps_pd(x);
+}
+
+static inline __m256d reinterpret_d (Vec4d const & x) {
+    return x;
+}
+
+#endif  // __GXX_ABI_VERSION
+
+// Reinterpretation from the emulated 256-bit integer type Vec256ie (both ABI variants).
+// The float and double results are assembled from the reinterpreted lower and upper halves.
+static inline Vec256ie reinterpret_i (Vec256ie const & x) {
+    return x;
+}
+
+static inline __m256  reinterpret_f (Vec256ie const & x) {
+    return Vec8f(Vec4f(reinterpret_f(x.get_low())), Vec4f(reinterpret_f(x.get_high())));
+}
+
+static inline __m256d reinterpret_d (Vec256ie const & x) {
+    return Vec4d(Vec2d(reinterpret_d(x.get_low())), Vec2d(reinterpret_d(x.get_high())));
+}
+
+#endif  // VECTORI256_H
+
+
+/*****************************************************************************
+*
+*          Vector permute and blend functions
+*
+******************************************************************************
+*
+* The permute function can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select. An index of -1 will generate zero. An index of -256 means don't care.
+*
+* Example:
+* Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
+* Vec4d b;
+* b = permute4d<1,0,-1,3>(a);     // b is (11, 10,  0, 13)
+*
+*
+* The blend function can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where indexes 0 - 3 indicate an element from the first source
+* vector and indexes 4 - 7 indicate an element from the second source vector.
+* A negative index will generate zero.
+*
+*
+* Example:
+* Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
+* Vec4d b(20., 21., 22., 23.);    // b is (20, 21, 22, 23)
+* Vec4d c;
+* c = blend4d<4,3,7,-1> (a,b);    // c is (20, 13, 23,  0)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// permute vector Vec4d
+// Each template index i0 - i3 selects the source element (0 - 3) for the corresponding
+// result element. An index of -1 gives zero, an index of -256 means don't care.
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d permute4d(Vec4d const & a) {
+
+    const int ior = i0 | i1 | i2 | i3;  // OR indexes
+
+    // is zeroing needed
+    const bool do_zero    = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100
+
+    // is shuffling needed
+    const bool do_shuffle = (i0>0) || (i1!=1 && i1>=0) || (i2!=2 && i2>=0) || (i3!=3 && i3>=0);
+
+    if (!do_shuffle) {       // no shuffling needed
+        if (do_zero) {       // zeroing
+            if ((i0 & i1 & i2 & i3) < 0) {
+                return _mm256_setzero_pd(); // zero everything
+            }
+            // zero some elements: each double is covered by two 32-bit mask words
+            __m256d const mask = _mm256_castps_pd (
+                constant8f< -int(i0>=0), -int(i0>=0), -int(i1>=0), -int(i1>=0), -int(i2>=0), -int(i2>=0), -int(i3>=0), -int(i3>=0) > ());
+            return _mm256_and_pd(a, mask);     // zero with AND mask
+        }
+        else {
+            return a;  // do nothing
+        }
+    }
+#if INSTRSET >= 8  // AVX2: use VPERMPD
+    __m256d x = _mm256_permute4x64_pd(a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6);
+    if (do_zero) {       // zeroing
+        // zero some elements
+        __m256d const mask2 = _mm256_castps_pd (
+            constant8f< -int(i0>=0), -int(i0>=0), -int(i1>=0), -int(i1>=0), -int(i2>=0), -int(i2>=0), -int(i3>=0), -int(i3>=0) > ());
+        x = _mm256_and_pd(x, mask2);     // zero with AND mask
+    }
+    return x;
+#else   // AVX
+
+    // Needed contents of low/high part of each source register in VSHUFPD
+    // 0: a.low, 1: a.high, 3: zero
+    // Bits 0-3 describe the low 128-bit half, bits 4-7 the high 128-bit half.
+    const int s1 = (i0 < 0 ? 3 : (i0 & 2) >> 1) | (i2 < 0 ? 0x30 : (i2 & 2) << 3);
+    const int s2 = (i1 < 0 ? 3 : (i1 & 2) >> 1) | (i3 < 0 ? 0x30 : (i3 & 2) << 3);
+    // permute mask
+    const int sm = (i0 < 0 ? 0 : (i0 & 1)) | (i1 < 0 ? 1 : (i1 & 1)) << 1 | (i2 < 0 ? 0 : (i2 & 1)) << 2 | (i3 < 0 ? 1 : (i3 & 1)) << 3;
+
+    if (s1 == 0x01 || s1 == 0x11 || s2 == 0x01 || s2 == 0x11) {
+        // too expensive to use 256 bit permute, split into two 128 bit permutes
+        Vec2d alo = a.get_low();
+        Vec2d ahi = a.get_high();
+        Vec2d rlo = blend2d<i0, i1> (alo, ahi);
+        Vec2d rhi = blend2d<i2, i3> (alo, ahi);
+        return Vec4d(rlo, rhi);
+    }
+
+    // make operands for VSHUFPD
+    // There is no AND mask after the shuffle in this path, so every half marked Z
+    // must really be zero in the operand when do_zero is set. The cast intrinsic
+    // _mm256_castpd128_pd256 leaves the upper half undefined and is therefore not
+    // used for the LZ and HZ cases.
+    __m256d r1, r2;
+
+    switch (s1) {
+    case 0x00:  // LL
+        r1 = _mm256_insertf128_pd(a,_mm256_castpd256_pd128(a),1);  break;
+    case 0x03:  // ZL
+        r1 = _mm256_insertf128_pd(do_zero ? _mm256_setzero_pd() : __m256d(a), _mm256_castpd256_pd128(a), 1);
+        break;
+    case 0x10:  // LH
+        r1 = a;  break;
+    case 0x13:  // ZH
+        r1 = do_zero ? _mm256_and_pd(a, _mm256_castps_pd(constant8f<0,0,0,0,-1,-1,-1,-1>())) : __m256d(a);  break;
+    case 0x30:  // LZ
+        // keep the low half, clear the high half with an explicit mask
+        r1 = do_zero ? _mm256_and_pd(a, _mm256_castps_pd(constant8f<-1,-1,-1,-1,0,0,0,0>())) : __m256d(a);  break;
+    case 0x31:  // HZ
+        // control 0x81: low half = a.high (select 1), high half zeroed (bit 7)
+        r1 = _mm256_permute2f128_pd(a, a, 0x81);  break;
+    case 0x33:  // ZZ
+        r1 = do_zero ? _mm256_setzero_pd() : __m256d(a);  break;
+    }
+
+    if (s2 == s1) {
+        if (sm == 0x0A) return r1;   // shuffle would be the identity on r1
+        r2 = r1;
+    }
+    else {
+        switch (s2) {
+        case 0x00:  // LL
+            r2 = _mm256_insertf128_pd(a,_mm256_castpd256_pd128(a),1);  break;
+        case 0x03:  // ZL
+            r2 = _mm256_insertf128_pd(do_zero ? _mm256_setzero_pd() : __m256d(a), _mm256_castpd256_pd128(a), 1);
+            break;
+        case 0x10:  // LH
+            r2 = a;  break;
+        case 0x13:  // ZH
+            r2 = do_zero ? _mm256_and_pd(a,_mm256_castps_pd(constant8f<0,0,0,0,-1,-1,-1,-1>())) : __m256d(a);  break;
+        case 0x30:  // LZ
+            // keep the low half, clear the high half with an explicit mask
+            r2 = do_zero ? _mm256_and_pd(a, _mm256_castps_pd(constant8f<-1,-1,-1,-1,0,0,0,0>())) : __m256d(a);  break;
+        case 0x31:  // HZ
+            // control 0x81: low half = a.high (select 1), high half zeroed (bit 7)
+            r2 = _mm256_permute2f128_pd(a, a, 0x81);  break;
+        case 0x33:  // ZZ
+            r2 = do_zero ? _mm256_setzero_pd() : __m256d(a);  break;
+        }
+    }
+    // even result elements come from r1, odd result elements from r2
+    return  _mm256_shuffle_pd(r1, r2, sm);
+
+#endif  // INSTRSET >= 8
+}
+
+
+// blend vectors Vec4d
+// Each template index selects one result element: 0 - 3 take an element from a,
+// 4 - 7 take an element from b, a negative index gives zero.
+// The function tests the compile-time indexes against a series of special cases
+// (single VSHUFPD, blend without permute, single-source permute, 128-bit lane permute)
+// and falls back to two permutes combined by a blend.
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) {
+
+    // Combine all the indexes into a single bitfield, with 8 bits for each
+    const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24; 
+
+    // Mask to zero out negative indexes
+    const uint32_t mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24;
+
+    if (mz == 0) return _mm256_setzero_pd();  // all zero
+    
+    __m256d t1;
+    // pattern (a[0|1], b[0|1], a[2|3], b[2|3]), ignoring bit 0 and zeroed positions
+    if ((((m1 & 0xFEFEFEFE) ^ 0x06020400) & mz) == 0) {
+        // fits VSHUFPD(a,b)
+        t1 = _mm256_shuffle_pd(a, b, (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3);
+        if (mz == 0xFFFFFFFF) return t1;
+        // no-shuffle permute, used only to zero the elements with negative index
+        return permute4d<i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3> (t1);
+    }
+    // pattern (b[0|1], a[0|1], b[2|3], a[2|3])
+    if ((((m1 & 0xFEFEFEFE) ^0x02060004) & mz) == 0) {
+        // fits VSHUFPD(b,a)
+        t1 = _mm256_shuffle_pd(b, a, (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3);
+        if (mz == 0xFFFFFFFF) return t1;
+        return permute4d<i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3> (t1);
+    }
+    // every used element stays in its own position (low two index bits = position)
+    if ((((m1 & 0x03030303) ^ 0x03020100) & mz) == 0) {
+        // blend and zero, no permute
+        if ((m1 & 0x04040404 & mz) == 0) {
+            t1 = a;
+        }
+        else if (((m1 ^ 0x04040404) & 0x04040404 & mz) == 0) {
+            t1 = b;
+        }
+        else {
+            // bit 2 of each index tells whether the element comes from b
+            t1 = _mm256_blend_pd(a, b, (i0&4)>>2 | (i1&4)>>1 | (i2&4) | (i3&4) << 1);
+        }
+        if (mz == 0xFFFFFFFF) return t1;
+        return permute4d<i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3> (t1);
+    }
+    if ((m1 & 0x04040404 & mz) == 0) {
+        // all from a
+        return permute4d<i0, i1, i2, i3> (a);
+    }
+    if (((m1 ^ 0x04040404) & 0x04040404 & mz) == 0) {
+        // all from b
+        return permute4d<i0 ^ 4, i1 ^ 4, i2 ^ 4, i3 ^ 4> (b);
+    }
+    // check if we can do 128-bit blend/permute
+    // (even result positions take an even source element, odd positions an odd one)
+    if (((m1 ^ 0x01000100) & 0x01010101 & mz) == 0) {
+        // j0, j1: 128-bit source lane for the low and high result half,
+        // taken from the first non-negative index of each pair
+        const uint32_t j0 = uint32_t((i0 >= 0 ? i0 : i1 >= 0 ? i1 : -1) >> 1);
+        const uint32_t j1 = uint32_t((i2 >= 0 ? i2 : i3 >= 0 ? i3 : -1) >> 1);
+        if (((m1 ^ ((j0 & 3) * 0x00000202 | (j1 & 3) * 0x02020000)) & 0x06060606 & mz) == 0) {
+            t1 = _mm256_permute2f128_pd(a, b, (j0 & 0x0F) | (j1 & 0x0F) << 4);
+            const bool partialzero = (((i0 | i1) ^ j0) & 0x80) != 0 || (((i2 | i3) ^ j1) & 0x80) != 0;
+            if (partialzero) {
+                // zero some elements
+                __m256d mask = _mm256_castps_pd (constant8f < 
+                    i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, 
+                    i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ());
+                return _mm256_and_pd(t1, mask);
+            }
+            else return t1;
+        }
+    }
+    // general case. combine two permutes
+    // a1 holds the elements wanted from a, b1 the elements wanted from b;
+    // positions not taken from the respective source are don't care (-0x100)
+    Vec4d a1 = permute4d <
+        (uint32_t)i0 < 4 ? i0 : -0x100,
+        (uint32_t)i1 < 4 ? i1 : -0x100,
+        (uint32_t)i2 < 4 ? i2 : -0x100,
+        (uint32_t)i3 < 4 ? i3 : -0x100 > (a);
+    Vec4d b1 = permute4d <
+        (uint32_t)(i0^4) < 4 ? (i0^4) : -0x100,
+        (uint32_t)(i1^4) < 4 ? (i1^4) : -0x100,
+        (uint32_t)(i2^4) < 4 ? (i2^4) : -0x100,
+        (uint32_t)(i3^4) < 4 ? (i3^4) : -0x100 > (b);   
+    t1 = _mm256_blend_pd(a1, b1, (i0&4)>>2 | (i1&4)>>1 | (i2&4) | (i3&4) << 1);
+    if (mz == 0xFFFFFFFF) return t1;
+    // zero the elements with negative index
+    return permute4d<i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3> (t1);
+}
+
+/*****************************************************************************
+*
+*          Vector Vec8f permute and blend functions
+*
+*****************************************************************************/
+
+// permute vector Vec8f
+// Each template index i0 - i7 selects the source element (0 - 7) for the corresponding
+// result element. An index of -1 gives zero, an index of -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f permute8f(Vec8f const & a) {
+
+    __m256 t1, mask;
+
+    const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7;  // OR indexes
+
+    // is zeroing needed
+    const bool do_zero    = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100
+
+    // is shuffling needed
+    const bool do_shuffle = (i0>0) || (i1!=1 && i1>=0) || (i2!=2 && i2>=0) || (i3!=3 && i3>=0) ||
+        (i4!=4 && i4>=0) || (i5!=5 && i5>=0) || (i6!=6 && i6>=0) || (i7!=7 && i7>=0);
+
+    if (!do_shuffle) {       // no shuffling needed
+        if (do_zero) {       // zeroing
+            if ((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0) {
+                return _mm256_setzero_ps(); // zero everything
+            }
+            // zero some elements
+            mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
+            return _mm256_and_ps(a, mask);     // zero with AND mask
+        }
+        else {
+            return a;  // do nothing
+        }
+    }
+
+#if INSTRSET >= 8  // AVX2: use VPERMPS
+    if (do_shuffle) {    // shuffling
+        // the element indexes are passed to VPERMPS as an integer vector stored in a float register
+        mask = constant8f< i0 & 7, i1 & 7, i2 & 7, i3 & 7, i4 & 7, i5 & 7, i6 & 7, i7 & 7 > ();
+#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
+        // bug in MS VS 11 beta: operands in wrong order. fixed in 11.0
+        t1 = _mm256_permutevar8x32_ps(mask, _mm256_castps_si256(a));      //  problem in immintrin.h
+#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+        // Gcc 4.7.0 has wrong parameter type and operands in wrong order. fixed in version 4.7.1
+        t1 = _mm256_permutevar8x32_ps(mask, a);
+#else   // no bug version
+        t1 = _mm256_permutevar8x32_ps(a, _mm256_castps_si256(mask));
+#endif
+    }
+    else {
+        t1 = a;          // no shuffling
+    }
+    if (do_zero) {       // zeroing
+        if ((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0) {
+            return _mm256_setzero_ps(); // zero everything
+        }
+        // zero some elements
+        mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
+        t1 = _mm256_and_ps(t1, mask);     // zero with AND mask
+    }
+    return t1;
+#else   // AVX
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<<8 | (i3&7)<<12 | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28;
+
+    // Mask to zero out negative indexes
+    const int m2 = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    // Check if it is possible to use VSHUFPS. Index n must match index n+4 on bit 0-1, and even index n must match odd index n+1 on bit 2
+    const bool sps = ((m1 ^ (m1 >> 16)) & 0x3333 & m2 & (m2 >> 16)) == 0  &&  ((m1 ^ (m1 >> 4)) & 0x04040404 & m2 & m2 >> 4) == 0;
+
+    if (sps) {   // can use VSHUFPS
+
+        // Index of each pair (i[n],i[n+1])
+        const int j0 = i0 >= 0 ? i0 : i1;
+        const int j1 = i2 >= 0 ? i2 : i3;
+        const int j2 = i4 >= 0 ? i4 : i5;
+        const int j3 = i6 >= 0 ? i6 : i7;
+
+        // Index of each pair (i[n],i[n+4])
+        const int k0 = i0 >= 0 ? i0 : i4;
+        const int k1 = i1 >= 0 ? i1 : i5;
+        const int k2 = i2 >= 0 ? i2 : i6;
+        const int k3 = i3 >= 0 ? i3 : i7;
+
+        // Needed contents of low/high part of each source register in VSHUFPS
+        // 0: a.low, 1: a.high, 3: zero or don't care
+        // Bits 0-3 describe the low 128-bit half, bits 4-7 the high 128-bit half.
+        const int s1 = (j0 < 0 ? 3 : (j0 & 4) >> 2) | (j2 < 0 ? 0x30 : (j2 & 4) << 2);
+        const int s2 = (j1 < 0 ? 3 : (j1 & 4) >> 2) | (j3 < 0 ? 0x30 : (j3 & 4) << 2);
+
+        // calculate cost of using VSHUFPS
+        const int cost1 = (s1 == 0x01 || s1 == 0x11) ? 2 : (s1 == 0x00 || s1 == 0x03 || s1 == 0x31) ? 1 : 0;
+        const int cost2 = (s2 == s1) ? 0 : (s2 == 0x01 || s2 == 0x11) ? 2 : (s2 == 0x00 || (s2 == 0x03 && (s1 & 0xF0) != 0x00) || (s2 == 0x31 && (s1 & 0x0F) != 0x01)) ? 1 : 0;
+
+        if (cost1 + cost2 <= 3) {
+
+            // permute mask
+            const int sm = (k0 < 0 ? 0 : (k0 & 3)) | (k1 < 0 ? 1 : (k1 & 3)) << 2 | (k2 < 0 ? 2 : (k2 & 3)) << 4 | (k3 < 0 ? 3 : (k3 & 3)) << 6;
+
+            // make operands for VSHUFPS
+            // Halves marked Z may hold anything here: zeroing is done by the AND mask
+            // after the shuffle.
+            __m256 r1, r2;
+
+            switch (s1) {
+            case 0x00:  // LL
+            case 0x03:  // ZL
+                r1 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);  break;
+            case 0x01:  // HL
+                r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
+                r1 = _mm256_insertf128_ps(r1,_mm256_castps256_ps128(a),1);  break;
+            case 0x10:  // LH
+            case 0x13:  // ZH
+            case 0x30:  // LZ
+            case 0x33:  // ZZ
+                r1 = a;  break;
+            case 0x11:  // HH
+                r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
+                r1 = _mm256_insertf128_ps(r1,_mm256_castps256_ps128(r1),1);  break;
+            case 0x31:  // HZ
+                r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));  break;
+            }
+
+            if (s2 == s1) {
+                if (sm == 0xE4) return r1;   // shuffle would be the identity on r1
+                r2 = r1;
+            }
+            else {
+                switch (s2) {
+                case 0x00:  // LL
+                    r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);  break;
+                case 0x03:  // ZL
+                    if ((s1 & 0xF0) == 0x00) r2 = r1;   // r1 already has a.low in its high half
+                    else {
+                        r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);
+                    }
+                    break;
+                case 0x01:  // HL
+                    // a.high in the low half, then a.low inserted into the high half.
+                    // The insert must build on r2 (not r1), otherwise the low half is
+                    // whatever r1 holds instead of a.high.
+                    r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
+                    r2 = _mm256_insertf128_ps(r2,_mm256_castps256_ps128(a),1);  break;
+                case 0x10:  // LH
+                case 0x13:  // ZH
+                case 0x30:  // LZ
+                case 0x33:  // ZZ
+                    r2 = a;  break;
+                case 0x11:  // HH
+                    r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
+                    r2 = _mm256_insertf128_ps(r2,_mm256_castps256_ps128(r2),1);  break;
+                case 0x31:  // HZ
+                    if ((s1 & 0x0F) == 0x01) r2 = r1;   // r1 already has a.high in its low half
+                    else {
+                        r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
+                    }
+                    break;
+                }
+            }
+
+            // now the permute instruction
+            t1 = _mm256_shuffle_ps(r1, r2, sm);
+
+            if (do_zero) {
+                // zero some elements
+                mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
+                t1 = _mm256_and_ps(t1, mask);     // zero with AND mask
+            }
+            return t1;
+        }
+    }
+    // not using VSHUFPS. Split into low and high part
+    Vec4f alo = a.get_low();
+    Vec4f ahi = a.get_high();
+    Vec4f rlo = blend4f<i0, i1, i2, i3> (alo, ahi);
+    Vec4f rhi = blend4f<i4, i5, i6, i7> (alo, ahi);
+    return Vec8f(rlo, rhi);
+#endif
+}
+
+
+// blend8f: build a Vec8f from a and b with compile-time indexes: 0-7 = a[i], 8-15 = b[i-8], negative = zero, -0x100 = don't care
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) {
+
+    const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7;  // OR indexes
+
+    // is zeroing needed
+    const bool do_zero  = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    __m256 t1, mask;
+
+    if (mz == 0) return _mm256_setzero_ps();  // all zero
+
+    if ((m1 & 0x88888888 & mz) == 0) {
+        // all from a
+        return permute8f<i0, i1, i2, i3, i4, i5, i6, i7> (a);
+    }
+
+    if (((m1 ^ 0x88888888) & 0x88888888 & mz) == 0) {
+        // all from b
+        return permute8f<i0&~8, i1&~8, i2&~8, i3&~8, i4&~8, i5&~8, i6&~8, i7&~8> (b);
+    }
+
+    if ((((m1 & 0x77777777) ^ 0x76543210) & mz) == 0) {
+        // blend and zero, no permute
+        mask = constant8f<(i0&8)?0:-1, (i1&8)?0:-1, (i2&8)?0:-1, (i3&8)?0:-1, (i4&8)?0:-1, (i5&8)?0:-1, (i6&8)?0:-1, (i7&8)?0:-1> ();
+        t1   = select(mask, a, b);
+        if (!do_zero) return t1;
+        // zero some elements
+        mask = constant8f< (i0<0&&(i0&8)) ? 0 : -1, (i1<0&&(i1&8)) ? 0 : -1, (i2<0&&(i2&8)) ? 0 : -1, (i3<0&&(i3&8)) ? 0 : -1, 
+            (i4<0&&(i4&8)) ? 0 : -1, (i5<0&&(i5&8)) ? 0 : -1, (i6<0&&(i6&8)) ? 0 : -1, (i7<0&&(i7&8)) ? 0 : -1 > ();
+        return _mm256_and_ps(t1, mask);
+    }
+
+    // check if we can do 128-bit blend/permute
+    if (((m1 ^ 0x32103210) & 0x33333333 & mz) == 0) {
+        const uint32_t j0 = (i0 >= 0 ? i0 : i1 >= 0 ? i1 : i2 >= 0 ? i2 : i3 >= 0 ? i3 : -1) >> 2;
+        const uint32_t j1 = (i4 >= 0 ? i4 : i5 >= 0 ? i5 : i6 >= 0 ? i6 : i7 >= 0 ? i7 : -1) >> 2;
+        if (((m1 ^ ((j0 & 3) * 0x00004444 | (j1 & 3) * 0x44440000)) & 0xCCCCCCCC & mz) == 0) {
+            t1 = _mm256_permute2f128_ps(a, b, (j0 & 0x0F) | (j1 & 0x0F) << 4);
+            const bool partialzero = (((i0 | i1 | i2 | i3) ^ j0) & 0x80) != 0 || (((i4 | i5 | i6 | i7) ^ j1) & 0x80) != 0;
+            if (partialzero) {
+                // zero some elements
+                mask = constant8f< i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, 
+                    i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();
+                return _mm256_and_ps(t1, mask);
+            }
+            else return t1;
+        }
+    }
+    // Not checking special cases for vunpckhps, vunpcklps: they are too rare
+
+    // Check if it is possible to use VSHUFPS. 
+    // Index n must match index n+4 on bit 0-1, and even index n must match odd index n+1 on bit 2-3
+    const bool sps = ((m1 ^ (m1 >> 16)) & 0x3333 & mz & (mz >> 16)) == 0  &&  ((m1 ^ (m1 >> 4)) & 0x0C0C0C0C & mz & mz >> 4) == 0;
+
+    if (sps) {   // can use VSHUFPS
+
+        // Index of each pair (i[n],i[n+1])
+        const int j0 = i0 >= 0 ? i0 : i1;
+        const int j1 = i2 >= 0 ? i2 : i3;
+        const int j2 = i4 >= 0 ? i4 : i5;
+        const int j3 = i6 >= 0 ? i6 : i7;
+
+        // Index of each pair (i[n],i[n+4])
+        const int k0 = i0 >= 0 ? i0 : i4;
+        const int k1 = i1 >= 0 ? i1 : i5;
+        const int k2 = i2 >= 0 ? i2 : i6;
+        const int k3 = i3 >= 0 ? i3 : i7;
+
+        // Needed contents of low/high part of each source register in VSHUFPS
+        // 0: a.low, 1: a.high, 2: b.low, 3: b.high, 4: zero or don't care
+        const int s1 = (j0 < 0 ? 4 : (j0 & 0xC) >> 2) | (j2 < 0 ? 0x30 : (j2 & 0xC) << 2);
+        const int s2 = (j1 < 0 ? 3 : (j1 & 0xC) >> 2) | (j3 < 0 ? 0x30 : (j3 & 0xC) << 2);
+
+        // permute mask
+        const int sm = (k0 < 0 ? 0 : (k0 & 3)) | (k1 < 0 ? 1 : (k1 & 3)) << 2 | (k2 < 0 ? 2 : (k2 & 3)) << 4 | (k3 < 0 ? 3 : (k3 & 3)) << 6;
+
+        __m256 r1, r2;
+        __m128 ahi = _mm256_extractf128_ps(a,1);    // 1
+        __m128 bhi = _mm256_extractf128_ps(b,1);    // 3
+
+        switch (s1) {
+        case 0x00:  case 0x04:
+            r1 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);  break;
+        case 0x01:  case 0x41:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(a),1);  break;
+        case 0x02:
+            r1 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(a),1);  break;
+        case 0x03:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(a),1);  break;
+        case 0x10:  case 0x14:  case 0x40:  case 0x44:
+            r1 = a;  break;
+        case 0x11:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),ahi,1);  break;
+        case 0x12:
+            r1 = _mm256_insertf128_ps(b,ahi,1);  break;
+        case 0x13:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),ahi,1);  break;
+        case 0x20:
+            r1 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(b),1);  break;
+        case 0x21:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(b),1);  break;
+        case 0x22:  case 0x24:  case 0x42:
+            r1 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(b),1);  break;
+        case 0x23:  case 0x43:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(b),1);  break;
+        case 0x30:
+            r1 = _mm256_insertf128_ps(a,bhi,1);  break;
+        case 0x31:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),bhi,1);  break;
+        case 0x32:  case 0x34:
+            r1 = b;  break;
+        case 0x33:
+            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),bhi,1);  break;
+        }
+        if (s2 == s1 || ((s2 & 0x04) && ((s1 ^ s2) & 0xF0) == 0) || ((s2 & 0x40) && ((s1 ^ s2) & 0x0F) == 0)) {
+            // can use r2 = r1
+            if (sm == 0xE4) return r1;  // no shuffling needed
+            r2 = r1;
+        }
+        else {
+            switch (s2) {
+            case 0x00:  case 0x04:
+                r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);  break;
+            case 0x01:  case 0x41:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(a),1);  break;
+            case 0x02:
+                r2 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(a),1);  break;
+            case 0x03:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(a),1);  break;
+            case 0x10:  case 0x14:  case 0x40:  case 0x44:
+                r2 = a;  break;
+            case 0x11:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),ahi,1);  break;
+            case 0x12:
+                r2 = _mm256_insertf128_ps(b,ahi,1);  break;
+            case 0x13:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),ahi,1);  break;
+            case 0x20:
+                r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(b),1);  break;
+            case 0x21:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(b),1);  break;
+            case 0x22:  case 0x24:  case 0x42:
+                r2 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(b),1);  break;
+            case 0x23:  case 0x43:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(b),1);  break;
+            case 0x30:
+                r2 = _mm256_insertf128_ps(a,bhi,1);  break;
+            case 0x31:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),bhi,1);  break;
+            case 0x32:  case 0x34:
+                r2 = b;  break;
+            case 0x33:
+                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),bhi,1);  break;
+            }
+        }
+
+        // now the shuffle instruction
+        t1 = _mm256_shuffle_ps(r1, r2, sm);
+
+        if (do_zero) {
+            // zero some elements
+            mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
+            t1 = _mm256_and_ps(t1, mask);     // zero with AND mask
+        }
+        return t1;
+    }
+
+    // Check if we can use 64-bit blend. Even numbered indexes must be even and odd numbered
+    // indexes must be equal to the preceding index + 1, except for negative indexes.
+    if (((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ m1 >> 4) & 0x0E0E0E0E & mz & mz >> 4) == 0) {
+
+        const bool partialzero = int((i0 ^ i1) | (i2 ^ i3) | (i4 ^ i5) | (i6 ^ i7)) < 0; // part of a 64-bit block is zeroed
+        const int blank1 = partialzero ? -0x100 : -1;  // ignore or zero
+        const int n0 = i0 >= 0 ? i0/2 : i1 >= 0 ? i1/2 : blank1;  // indexes for 64 bit blend; >= 0 because index 0 is a valid source
+        const int n1 = i2 >= 0 ? i2/2 : i3 >= 0 ? i3/2 : blank1;
+        const int n2 = i4 >= 0 ? i4/2 : i5 >= 0 ? i5/2 : blank1;
+        const int n3 = i6 >= 0 ? i6/2 : i7 >= 0 ? i7/2 : blank1;
+        t1 = _mm256_castpd_ps (blend4d<n0,n1,n2,n3> (_mm256_castps_pd(a), _mm256_castps_pd(b)));
+        if (blank1 == -1 || !do_zero) {    
+            return  t1;
+        }
+        // need more zeroing
+        mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
+        return _mm256_and_ps(t1, mask);     // zero with AND mask
+    }
+
+    // general case: permute and blend and possible zero
+    const int blank2 = do_zero ? -1 : -0x100;  // ignore or zero
+
+    Vec8f ta = permute8f <
+        (uint32_t)i0 < 8 ? i0 : blank2,
+        (uint32_t)i1 < 8 ? i1 : blank2,
+        (uint32_t)i2 < 8 ? i2 : blank2,
+        (uint32_t)i3 < 8 ? i3 : blank2,
+        (uint32_t)i4 < 8 ? i4 : blank2,
+        (uint32_t)i5 < 8 ? i5 : blank2,
+        (uint32_t)i6 < 8 ? i6 : blank2,
+        (uint32_t)i7 < 8 ? i7 : blank2 > (a);
+    Vec8f tb = permute8f <
+        (uint32_t)(i0^8) < 8 ? (i0^8) : blank2,
+        (uint32_t)(i1^8) < 8 ? (i1^8) : blank2,
+        (uint32_t)(i2^8) < 8 ? (i2^8) : blank2,
+        (uint32_t)(i3^8) < 8 ? (i3^8) : blank2,
+        (uint32_t)(i4^8) < 8 ? (i4^8) : blank2,
+        (uint32_t)(i5^8) < 8 ? (i5^8) : blank2,
+        (uint32_t)(i6^8) < 8 ? (i6^8) : blank2,
+        (uint32_t)(i7^8) < 8 ? (i7^8) : blank2 > (b);
+
+    if (blank2 == -1) {    
+        return  _mm256_or_ps(ta, tb); 
+    }
+    // no zeroing, need to blend
+    const int maskb = ((i0 >> 3) & 1) | ((i1 >> 2) & 2) | ((i2 >> 1) & 4) | (i3 & 8) | 
+        ((i4 << 1) & 0x10) | ((i5 << 2) & 0x20) | ((i6 << 3) & 0x40) | ((i7 << 4) & 0x80);
+    return _mm256_blend_ps(ta, tb, maskb);  // blend
+}
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec4i a(2,0,0,3);               // index  a is (  2,   0,   0,   3)
+* Vec4f b(1.0f,1.1f,1.2f,1.3f);   // table  b is (1.0, 1.1, 1.2, 1.3)
+* Vec4f c;
+* c = lookup4 (a,b);              // result c is (1.2, 1.0, 1.0, 1.3)
+*
+*****************************************************************************/
+
+#ifdef VECTORI256_H  // Vec8i and Vec4q must be defined
+
+static inline Vec8f lookup8(Vec8i const & index, Vec8f const & table) {
+#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2
+#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)        
+    // MS VS 11 beta declares this intrinsic with the operands swapped (fixed in 11.0)
+    return _mm256_permutevar8x32_ps(_mm256_castsi256_ps(index), _mm256_castps_si256(table)); 
+#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    // Gcc 4.7.0 declares it with a wrong parameter type and swapped operands (fixed in 4.7.1)
+    return _mm256_permutevar8x32_ps(_mm256_castsi256_ps(index), table);
+#else
+    // correctly declared intrinsic
+    return _mm256_permutevar8x32_ps(table, index);
+#endif
+
+#else // AVX
+    // copy of table with its two 128-bit halves exchanged
+    __m256  hi_first = _mm256_castps128_ps256(_mm256_extractf128_ps(table, 1));
+    __m256  swapped  = _mm256_insertf128_ps(hi_first, _mm256_castps256_ps128(table), 1);
+    // both halves of the index joined in one 256-bit register
+    __m256i idx = _mm256_insertf128_si256(_mm256_castsi128_si256(index.get_low()), index.get_high(), 1);
+    // permute within each 128-bit part, once on the table and once on the swapped copy
+    __m256  same_lane  = _mm256_permutevar_ps(table,   idx);
+    __m256  other_lane = _mm256_permutevar_ps(swapped, idx);
+    // shift index bit 2 up to bit 31 for the blend; the upper half has that bit inverted first
+    __m128i pick_lo = _mm_slli_epi32(index.get_low(),      29);
+    __m128i pick_hi = _mm_slli_epi32(index.get_high() ^ 4, 29);
+    __m256  picker  = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(pick_lo)), _mm_castsi128_ps(pick_hi), 1);
+    // blend the two permutes under control of picker
+    return _mm256_blendv_ps(same_lane, other_lane, picker);
+#endif
+}
+
+// lookup<n>: use every element of index to pick a float from table.
+// n is the compile-time table size; in the general path indexes are limited to 0 .. n-1.
+template <int n>
+static inline Vec8f lookup(Vec8i const & index, float const * table) {
+    if (n <= 0) return 0;
+    if (n <= 4) {
+        // load four floats into a Vec4f and look up each half of index in it
+        Vec4f quad = Vec4f().load(table);
+        return Vec8f(lookup4(index.get_low(), quad), lookup4(index.get_high(), quad));
+    }
+#if INSTRSET < 8  // not AVX2
+    if (n <= 8) return lookup8(index, Vec8f().load(table));
+#endif
+    // Bring the indexes into range before touching memory
+    Vec8ui bounded;
+    if ((n & (n-1)) != 0) {
+        // n is not a power of 2: limit to n-1
+        bounded = min(Vec8ui(index), n-1);
+    }
+    else {
+        // n is a power of 2: index modulo n by masking
+        bounded = Vec8ui(index) & (n-1);
+    }
+#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2
+    return _mm256_i32gather_ps(table, bounded, 4);
+#else // AVX
+    return Vec8f(
+        table[bounded[0]], table[bounded[1]], table[bounded[2]], table[bounded[3]],
+        table[bounded[4]], table[bounded[5]], table[bounded[6]], table[bounded[7]]);
+#endif
+}
+
+static inline Vec4d lookup4(Vec4q const & index, Vec4d const & table) {
+#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2
+    // VPERMPD only takes constant indexes, so the doubles are moved as pairs of
+    // 32-bit elements with VPERMPS: element k of the index becomes (2k, 2k+1)
+    Vec8i dw_base  = permute8i<0,0,2,2,4,4,6,6> (Vec8i(index+index));
+    Vec8i dw_index = dw_base + Vec8i(constant8i<0,1,0,1,0,1,0,1>());
+#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)        
+    // MS VS 11 beta declares this intrinsic with the operands swapped (fixed in 11.0)
+    return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(dw_index), _mm256_castpd_si256(table))); 
+#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    // Gcc 4.7.0 declares it with a wrong parameter type and swapped operands
+    return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(dw_index), _mm256_castpd_ps(table)));
+#else
+    // correctly declared intrinsic
+    return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castpd_ps(table), dw_index));
+#endif
+
+#else // AVX
+    // copy of table with its two 128-bit halves exchanged
+    __m256d hi_first = _mm256_castpd128_pd256(_mm256_extractf128_pd(table, 1));
+    __m256d swapped  = _mm256_insertf128_pd(hi_first, _mm256_castpd256_pd128(table), 1);
+    // index + index, i.e. index << 1
+    __m128i twice_lo = index.get_low()  + index.get_low();
+    __m128i twice_hi = index.get_high() + index.get_high();
+    // both doubled halves joined in one 256-bit register
+    __m256i idx = _mm256_insertf128_si256(_mm256_castsi128_si256(twice_lo), twice_hi, 1);
+    // permute within each 128-bit part, once on the table and once on the swapped copy
+    __m256d same_lane  = _mm256_permutevar_pd(table,   idx);
+    __m256d other_lane = _mm256_permutevar_pd(swapped, idx);
+    // shift index bit 1 up to bit 63 for the blend; the upper half has that bit inverted first
+    __m128i pick_lo = _mm_slli_epi64(index.get_low(),      62);
+    __m128i pick_hi = _mm_slli_epi64(index.get_high() ^ 2, 62);
+    __m256d picker  = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_castsi128_pd(pick_lo)), _mm_castsi128_pd(pick_hi), 1);
+    // blend the two permutes under control of picker
+    return _mm256_blendv_pd(same_lane, other_lane, picker);
+#endif
+}
+
+// lookup<n>: use every element of index to pick a double from table.
+// n is the compile-time table size; in the general path indexes are limited to 0 .. n-1.
+template <int n>
+static inline Vec4d lookup(Vec4q const & index, double const * table) {
+    if (n <= 0) return 0;
+    if (n <= 2) {
+        // load two doubles into a Vec2d and look up each half of index in it
+        Vec2d pair = Vec2d().load(table);
+        return Vec4d(lookup2(index.get_low(), pair), lookup2(index.get_high(), pair));
+    }
+#if INSTRSET < 8  // not AVX2
+    if (n <= 4) return lookup4(index, Vec4d().load(table));
+#endif
+    // Bring the indexes into range, working on the index vector as eight 32-bit elements
+    Vec8ui bounded;
+    if ((n & (n-1)) != 0) {
+        // n is not a power of 2: limit to n-1
+        bounded = min(Vec8ui(index), constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>() );
+    }
+    else {
+        // n is a power of 2: index modulo n by masking
+        bounded = Vec8ui(index) & constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>();
+    }
+#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2
+    return _mm256_i64gather_pd(table, bounded, 8);
+#else // AVX
+    // read the four table entries one by one
+    Vec4q wide = Vec4q(bounded);
+    return Vec4d(table[wide[0]], table[wide[1]], table[wide[2]], table[wide[3]]);
+#endif
+}
+#endif  // VECTORI256_H
+
+/*****************************************************************************
+*
+*          Gather functions with fixed indexes
+*
+*****************************************************************************/
+// Load eight elements from array a at the compile-time indices i0 .. i7 (fetched by gather8i, then reinterpreted as Vec8f)
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f gather8f(void const * a) {
+    return reinterpret_f(gather8i<i0, i1, i2, i3, i4, i5, i6, i7>(a));
+}
+
+// Load four elements from array a at the compile-time indices i0 .. i3 (fetched by gather4q, then reinterpreted as Vec4d)
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d gather4d(void const * a) {
+    return reinterpret_d(gather4q<i0, i1, i2, i3>(a));
+}
+
+
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false (delegates to the Vec8ib overload)
+static inline int horizontal_find_first(Vec8fb const & x) {
+    return horizontal_find_first(Vec8ib(x));
+}
+
+static inline int horizontal_find_first(Vec4db const & x) {   // Vec4db version: delegates to the Vec4qb overload
+    return horizontal_find_first(Vec4qb(x));
+}
+
+// Count the number of elements that are true (delegates to the Vec8ib overload)
+static inline uint32_t horizontal_count(Vec8fb const & x) {
+    return horizontal_count(Vec8ib(x));
+}
+
+static inline uint32_t horizontal_count(Vec4db const & x) {   // Vec4db version: delegates to the Vec4qb overload
+    return horizontal_count(Vec4qb(x));
+}
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: convert boolean vector Vec8fb to integer bitfield (delegates to the Vec8ib overload)
+static inline uint8_t to_bits(Vec8fb const & x) {
+    return to_bits(Vec8ib(x));
+}
+
+// to_Vec8fb: convert integer bitfield to boolean vector (built with to_Vec8ib, then converted to Vec8fb)
+static inline Vec8fb to_Vec8fb(uint8_t x) {
+    return Vec8fb(to_Vec8ib(x));
+}
+
+// to_bits: convert boolean vector Vec4db to integer bitfield (delegates to the Vec4qb overload)
+static inline uint8_t to_bits(Vec4db const & x) {
+    return to_bits(Vec4qb(x));
+}
+
+// to_Vec4db: convert integer bitfield to boolean vector (built with to_Vec4qb, then converted to Vec4db)
+static inline Vec4db to_Vec4db(uint8_t x) {
+    return Vec4db(to_Vec4qb(x));
+}
+
+#endif // VECTORF256_H
diff --git a/vectorclass/vectorf256e.h b/vectorclass/vectorf256e.h
new file mode 100755
index 0000000..6c9f4b7
--- /dev/null
+++ b/vectorclass/vectorf256e.h
@@ -0,0 +1,2069 @@
+/****************************  vectorf256e.h   *******************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2014-10-22
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining 256-bit floating point vector classes as interface
+* to intrinsic functions. Emulated for processors without AVX instruction set.
+*
+* The following vector classes are defined here:
+* Vec8f     Vector of 8 single precision floating point numbers
+* Vec8fb    Vector of 8 Booleans for use with Vec8f
+* Vec4d     Vector of 4 double precision floating point numbers
+* Vec4db    Vector of 4 Booleans for use with Vec4d
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+
+// check combination of header files
+#ifdef VECTORF256_H
+#if    VECTORF256_H != 1
+#error Two different versions of vectorf256.h included
+#endif
+#else
+#define VECTORF256_H  1
+
+#if defined (VECTORI256_H) &&  VECTORI256_H >= 2
+#error wrong combination of header files. Use vectorf256.h instead of vectorf256e.h if you have AVX2
+#endif
+
+
+#include "vectorf128.h"  // Define 128-bit vectors
+
+
+/*****************************************************************************
+*
+*          base class Vec256fe and Vec256de
+*
+*****************************************************************************/
+// Stand-in for __m256 when AVX is not supported:
+// the 256 bits are stored as two separate __m128 halves.
+class Vec256fe {
+protected:
+    __m128 y0;                         // low half
+    __m128 y1;                         // high half
+public:
+    // Default constructor: both halves are left uninitialized
+    Vec256fe(void) {}
+    // Build from a low half x0 and a high half x1
+    Vec256fe(__m128 x0, __m128 x1) : y0(x0), y1(x1) {}
+    // Return the low half
+    __m128 get_low() const { return y0; }
+    // Return the high half
+    __m128 get_high() const { return y1; }
+};
+
+
+// Stand-in for __m256d when AVX is not supported:
+// the 256 bits are stored as two separate __m128d halves.
+class Vec256de {
+public:
+    // Default constructor: both halves are left uninitialized
+    Vec256de() {}
+    // Build from a low half x0 and a high half x1
+    Vec256de(__m128d x0, __m128d x1) : y0(x0), y1(x1) {}
+    // Return the low half
+    __m128d get_low() const { return y0; }
+    // Return the high half
+    __m128d get_high() const { return y1; }
+protected:
+    __m128d y0;                        // low half
+    __m128d y1;                        // high half
+};
+
+
+
+/*****************************************************************************
+*
+*          select functions
+*
+*****************************************************************************/
+// Select between two Vec256fe sources, element by element. Used in various functions 
+// and operators. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). Each half is forwarded to the __m128 selectf in the same (s, a, b) order.
+static inline Vec256fe selectf (Vec256fe const & s, Vec256fe const & a, Vec256fe const & b) {
+    return Vec256fe(selectf(s.get_low(), a.get_low(), b.get_low()), selectf(s.get_high(), a.get_high(), b.get_high()));
+}
+
+// Select between two Vec256de sources, element by element.
+// Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other 
+// values are allowed. Each half is forwarded to the __m128d selectd in the same (s, a, b) order.
+static inline Vec256de selectd (Vec256de const & s, Vec256de const & a, Vec256de const & b) {
+    return Vec256de(selectd(s.get_low(), a.get_low(), b.get_low()), selectd(s.get_high(), a.get_high(), b.get_high()));
+}
+
+
+
+/*****************************************************************************
+*
+*          Generate compile-time constant vector
+*
+*****************************************************************************/
+// Build a Vec256fe whose eight 32-bit elements hold the bit patterns of the integer
+// template arguments i0 .. i7. The values sit in a static table read back as two __m128.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec256fe constant8f() {
+    static const union {
+        int      ints[8];
+        __m128   halves[2];
+    } table = {{i0,i1,i2,i3,i4,i5,i6,i7}};
+    return Vec256fe(table.halves[0], table.halves[1]);
+}
+
+
+/*****************************************************************************
+*
+*          Vec8fb: Vector of 8 Booleans for use with Vec8f
+*
+*****************************************************************************/
+
+class Vec8fb : public Vec256fe {
+public:
+    // Default constructor (elements are left uninitialized):
+    Vec8fb() {
+    }
+    // Constructor to build from all elements (b0-b3 go to the low half, b4-b7 to the high half):
+    Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) {
+        y0 = Vec4fb(b0, b1, b2, b3);
+        y1 = Vec4fb(b4, b5, b6, b7);
+    }
+    // Constructor to build from two Vec4fb (a0 = low half, a1 = high half):
+    Vec8fb(Vec4fb const & a0, Vec4fb const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256fe
+    Vec8fb(Vec256fe const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec256fe
+    Vec8fb & operator = (Vec256fe const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    }
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+    // Constructor to convert from type Vec8ib used as Boolean for integer vectors
+    Vec8fb(Vec8ib const & x) {
+        y0 = _mm_castsi128_ps(Vec8i(x).get_low());
+        y1 = _mm_castsi128_ps(Vec8i(x).get_high());
+    }
+    // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors
+    Vec8fb & operator = (Vec8ib const & x) {
+        y0 = _mm_castsi128_ps(Vec8i(x).get_low());
+        y1 = _mm_castsi128_ps(Vec8i(x).get_high());
+        return *this;
+    }
+    // Constructor to broadcast the same value into all elements (only compiled when VECTORI256_H is defined):
+    Vec8fb(bool b) {
+        y1 = y0 = Vec4fb(b);
+    }
+    // Assignment operator to broadcast scalar value (only compiled when VECTORI256_H is defined):
+    Vec8fb & operator = (bool b) {
+        y0 = y1 = Vec4fb(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc. (declared but not defined here)
+    Vec8fb(int b);
+    Vec8fb & operator = (int x);
+public:
+    // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
+    operator Vec8ib() const {
+        return Vec8i(_mm_castps_si128(y0), _mm_castps_si128(y1));
+    }
+#endif // VECTORI256_H
+    // Member function to change a single element in vector. Returns a reference to this vector.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8fb const & insert(uint32_t index, bool value) {
+        if (index < 4) {
+            y0 = Vec4fb(y0).insert(index, value);
+        }
+        else {
+            y1 = Vec4fb(y1).insert(index-4, value);   // elements 4-7 are stored in the high half
+        }
+        return *this;
+    }
+    // Member function to extract a single element from vector
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    bool extract(uint32_t index) const {
+        if (index < 4) {
+            return Vec4fb(y0).extract(index);
+        }
+        else {
+            return Vec4fb(y1).extract(index-4);       // elements 4-7 are stored in the high half
+        }
+    }
+    // Extract a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec4fb:
+    Vec4fb get_low() const {
+        return y0;
+    }
+    Vec4fb get_high() const {
+        return y1;
+    }
+    static int size () {                              // number of elements in the vector
+        return 8;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec8fb
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and, applied separately to the low and the high half
+static inline Vec8fb operator & (Vec8fb const & a, Vec8fb const & b) {
+    return Vec8fb(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+static inline Vec8fb operator && (Vec8fb const & a, Vec8fb const & b) {   // vector operator && : forwards to operator &
+    return a & b;
+}
+
+// vector operator &= : replace a by (a & b)
+// and hand back a reference to the updated a
+static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or, applied separately to the low and the high half
+static inline Vec8fb operator | (Vec8fb const & a, Vec8fb const & b) {
+    return Vec8fb(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec8fb operator || (Vec8fb const & a, Vec8fb const & b) {   // vector operator || : forwards to operator |
+    return a | b;
+}
+
+// vector operator |= : replace a by (a | b)
+// and hand back a reference to the updated a
+static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor, applied separately to the low and the high half
+static inline Vec8fb operator ^ (Vec8fb const & a, Vec8fb const & b) {
+    return Vec8fb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ^= : replace a by (a ^ b)
+// and hand back a reference to the updated a
+static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not, applied separately to the low and the high half
+static inline Vec8fb operator ~ (Vec8fb const & a) {
+    return Vec8fb(~a.get_low(), ~a.get_high());
+}
+
+// vector operator ! : logical not, applied separately to the low and the high half
+// (operator ! is less efficient than operator ~. Use it only where not
+// all bits in an element are the same)
+static inline Vec8fb operator ! (Vec8fb const & a) {
+    return Vec8fb(!a.get_low(), !a.get_high());
+}
+
+// Functions for Vec8fb
+
+// andnot: a & ~ b, computed by the Vec4fb andnot on the low and the high half
+static inline Vec8fb andnot(Vec8fb const & a, Vec8fb const & b) {
+    return Vec8fb(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
+}
+
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1 (the two halves are ANDed, then reduced by the Vec4fb version)
+static inline bool horizontal_and (Vec8fb const & a) {
+    return horizontal_and(a.get_low() & a.get_high());
+}
+
+// horizontal_or. Returns true if at least one bit is 1 (the two halves are ORed, then reduced by the Vec4fb version)
+static inline bool horizontal_or (Vec8fb const & a) {
+    return horizontal_or(a.get_low() | a.get_high());
+}
+
+
+
+/*****************************************************************************
+*
+*          Vec4db: Vector of 4 Booleans for use with Vec4d
+*
+*****************************************************************************/
+
+class Vec4db : public Vec256de {
+public:
+    // Default constructor (elements are left uninitialized):
+    Vec4db() {
+    }
+    // Constructor to build from all elements (b0, b1 go to the low half, b2, b3 to the high half):
+    Vec4db(bool b0, bool b1, bool b2, bool b3) {
+        y0 = Vec2db(b0, b1);
+        y1 = Vec2db(b2, b3);
+    }
+    // Constructor to build from two Vec2db (a0 = low half, a1 = high half):
+    Vec4db(Vec2db const & a0, Vec2db const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256de
+    Vec4db(Vec256de const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec256de
+    Vec4db & operator = (Vec256de const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    }
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+    // Constructor to convert from type Vec4qb used as Boolean for integer vectors
+    Vec4db(Vec4qb const & x) {
+        y0 = _mm_castsi128_pd(Vec4q(x).get_low());
+        y1 = _mm_castsi128_pd(Vec4q(x).get_high());
+    }
+    // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
+    Vec4db & operator = (Vec4qb const & x) {
+        y0 = _mm_castsi128_pd(Vec4q(x).get_low());
+        y1 = _mm_castsi128_pd(Vec4q(x).get_high());
+        return *this;
+    }
+    // Constructor to broadcast the same value into all elements (only compiled when VECTORI256_H is defined):
+    Vec4db(bool b) {
+        y1 = y0 = Vec2db(b);
+    }
+    // Assignment operator to broadcast scalar value (only compiled when VECTORI256_H is defined):
+    Vec4db & operator = (bool b) {
+        y0 = y1 = Vec2db(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc. (declared but not defined here)
+    Vec4db(int b);
+    Vec4db & operator = (int x);
+public:
+    // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
+    operator Vec4qb() const {
+        return Vec4q(_mm_castpd_si128(y0), _mm_castpd_si128(y1));
+    }
+#endif // VECTORI256_H
+    // Member function to change a single element in vector. Returns a reference to this vector.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec4db const & insert(uint32_t index, bool value) {
+        if (index < 2) {
+            y0 = Vec2db(y0).insert(index, value);
+        }
+        else {
+            y1 = Vec2db(y1).insert(index - 2, value);   // elements 2-3 are stored in the high half
+        }
+        return *this;
+    }
+    // Member function to extract a single element from vector
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    bool extract(uint32_t index) const {
+        if (index < 2) {
+            return Vec2db(y0).extract(index);
+        }
+        else {
+            return Vec2db(y1).extract(index - 2);       // elements 2-3 are stored in the high half
+        }
+    }
+    // Extract a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec2db:
+    Vec2db get_low() const {
+        return y0;
+    }
+    Vec2db get_high() const {
+        return y1;
+    }
+    static int size () {                                // number of elements in the vector
+        return 4;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec4db
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and, applied to the low and high halves separately
+static inline Vec4db operator & (Vec4db const & a, Vec4db const & b) {
+    Vec2db const lo = a.get_low()  & b.get_low();
+    Vec2db const hi = a.get_high() & b.get_high();
+    return Vec4db(lo, hi);
+}
+// vector operator && : same result as operator &
+static inline Vec4db operator && (Vec4db const & a, Vec4db const & b) {
+    Vec4db const both = a & b;
+    return both;
+}
+
+// vector operator &= : bitwise and, result stored back into a
+static inline Vec4db & operator &= (Vec4db & a, Vec4db const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or, applied to the low and high halves separately
+static inline Vec4db operator | (Vec4db const & a, Vec4db const & b) {
+    Vec2db const lo = a.get_low()  | b.get_low();
+    Vec2db const hi = a.get_high() | b.get_high();
+    return Vec4db(lo, hi);
+}
+// vector operator || : same result as operator |
+static inline Vec4db operator || (Vec4db const & a, Vec4db const & b) {
+    Vec4db const either = a | b;
+    return either;
+}
+
+// vector operator |= : bitwise or, result stored back into a
+static inline Vec4db & operator |= (Vec4db & a, Vec4db const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor, applied to the low and high halves separately
+static inline Vec4db operator ^ (Vec4db const & a, Vec4db const & b) {
+    Vec2db const lo = a.get_low()  ^ b.get_low();
+    Vec2db const hi = a.get_high() ^ b.get_high();
+    return Vec4db(lo, hi);
+}
+
+// vector operator ^= : bitwise xor, result stored back into a
+static inline Vec4db & operator ^= (Vec4db & a, Vec4db const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not of both halves
+static inline Vec4db operator ~ (Vec4db const & a) {
+    Vec2db const lo = ~a.get_low();
+    Vec2db const hi = ~a.get_high();
+    return Vec4db(lo, hi);
+}
+
+// vector operator ! : logical not of both halves
+// (operator ! is less efficient than operator ~. Use only where not
+// all bits in an element are the same)
+static inline Vec4db operator ! (Vec4db const & a) {
+    Vec2db const lo = !a.get_low();
+    Vec2db const hi = !a.get_high();
+    return Vec4db(lo, hi);
+}
+
+// Functions for Vec4db
+
+// andnot: a & ~ b, evaluated per half through the Vec2db overload
+static inline Vec4db andnot(Vec4db const & a, Vec4db const & b) {
+    Vec2db const lo = andnot(a.get_low(),  b.get_low());
+    Vec2db const hi = andnot(a.get_high(), b.get_high());
+    return Vec4db(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+// The two halves are and'ed together first, then reduced by the Vec2db overload.
+static inline bool horizontal_and (Vec4db const & a) {
+    Vec2db const folded = a.get_low() & a.get_high();
+    return horizontal_and(folded);
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+// The two halves are or'ed together first, then reduced by the Vec2db overload.
+static inline bool horizontal_or (Vec4db const & a) {
+    Vec2db const folded = a.get_low() | a.get_high();
+    return horizontal_or(folded);
+}
+
+
+
+/*****************************************************************************
+*
+*          Vec8f: Vector of 8 single precision floating point values
+*
+*****************************************************************************/
+
+// Vector of 8 floats held in the two halves y0 (elements 0-3) and y1 (elements 4-7)
+// inherited from Vec256fe.
+class Vec8f : public Vec256fe {
+public:
+    // Default constructor:
+    Vec8f() {
+    }
+    // Broadcast constructor: every element receives the value f
+    Vec8f(float f) {
+        y0 = _mm_set1_ps(f);
+        y1 = y0;
+    }
+    // Element-wise constructor: f0..f3 fill the low half, f4..f7 the high half
+    Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) {
+        y0 = _mm_setr_ps(f0, f1, f2, f3);
+        y1 = _mm_setr_ps(f4, f5, f6, f7);
+    }
+    // Build from two Vec4f: a0 becomes the low half, a1 the high half
+    Vec8f(Vec4f const & a0, Vec4f const & a1) {
+        y0 = a0;
+        y1 = a1;
+    }
+    // Converting constructor from type Vec256fe
+    Vec8f(Vec256fe const & x) {
+        y0 = x.get_low();
+        y1 = x.get_high();
+    }
+    // Converting assignment from type Vec256fe
+    Vec8f & operator = (Vec256fe const & x) {
+        y0 = x.get_low();
+        y1 = x.get_high();
+        return *this;
+    }
+    // Load 8 floats from an array (unaligned)
+    Vec8f & load(float const * p) {
+        y0 = _mm_loadu_ps(p);
+        y1 = _mm_loadu_ps(p + 4);
+        return *this;
+    }
+    // Load 8 floats from an array, aligned by 32.
+    // load_a may replace load only when p is certain to point to an address
+    // divisible by 32.
+    Vec8f & load_a(float const * p) {
+        y0 = _mm_load_ps(p);
+        y1 = _mm_load_ps(p + 4);
+        return *this;
+    }
+    // Store 8 floats into an array (unaligned)
+    void store(float * p) const {
+        _mm_storeu_ps(p,     y0);
+        _mm_storeu_ps(p + 4, y1);
+    }
+    // Store 8 floats into an array, aligned by 32.
+    // store_a may replace store only when p is certain to point to an address
+    // divisible by 32.
+    void store_a(float * p) const {
+        _mm_store_ps(p,     y0);
+        _mm_store_ps(p + 4, y1);
+    }
+    // Partial load. Load n elements and set the rest to 0.
+    // Any n outside 1..8 gives an all-zero vector.
+    Vec8f & load_partial(int n, float const * p) {
+        if (n <= 0 || n > 8) {
+            y0 = _mm_setzero_ps();
+            y1 = _mm_setzero_ps();
+        }
+        else if (n <= 4) {
+            // only the low half receives data
+            y0 = Vec4f().load_partial(n, p);
+            y1 = _mm_setzero_ps();
+        }
+        else {
+            // low half is complete, high half receives the remaining n-4
+            y0 = Vec4f().load(p);
+            y1 = Vec4f().load_partial(n - 4, p + 4);
+        }
+        return *this;
+    }
+    // Partial store. Store n elements. Nothing is stored when n > 8.
+    void store_partial(int n, float * p) const {
+        if (n > 8) {
+            return;
+        }
+        if (n > 4) {
+            get_low().store(p);
+            get_high().store_partial(n - 4, p + 4);
+            return;
+        }
+        get_low().store_partial(n, p);
+    }
+    // cut off vector to n elements. The last 8-n elements are set to zero.
+    // The vector is left unchanged when uint32_t(n) >= 8.
+    Vec8f & cutoff(int n) {
+        if (uint32_t(n) < 8) {
+            if (n < 4) {
+                y0 = Vec4f(y0).cutoff(n);
+                y1 = Vec4f(0.0f);
+            }
+            else {
+                y1 = Vec4f(y1).cutoff(n - 4);
+            }
+        }
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8f const & insert(uint32_t index, float value) {
+        if (index >= 4) {
+            y1 = Vec4f(y1).insert(index - 4, value);
+        }
+        else {
+            y0 = Vec4f(y0).insert(index, value);
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    float extract(uint32_t index) const {
+        return (index < 4) ? Vec4f(y0).extract(index)
+                           : Vec4f(y1).extract(index - 4);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    float operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec4f:
+    Vec4f get_low() const {
+        return y0;
+    }
+    Vec4f get_high() const {
+        return y1;
+    }
+    // Number of elements in the vector
+    static int size () {
+        return 8;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec8f
+*
+*****************************************************************************/
+
+// vector operator + : add element by element (low and high halves separately)
+static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = a.get_low()  + b.get_low();
+    Vec4f const hi = a.get_high() + b.get_high();
+    return Vec8f(lo, hi);
+}
+
+// vector operator + : add vector and scalar (the scalar is broadcast first)
+static inline Vec8f operator + (Vec8f const & a, float b) {
+    Vec8f const rhs(b);
+    return a + rhs;
+}
+// vector operator + : add scalar and vector (the scalar is broadcast first)
+static inline Vec8f operator + (float a, Vec8f const & b) {
+    Vec8f const lhs(a);
+    return lhs + b;
+}
+
+// vector operator += : add b to a in place
+static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : adds 1.0f to every element, returns the value before the increment
+static inline Vec8f operator ++ (Vec8f & a, int) {
+    Vec8f const before(a);
+    a = a + Vec8f(1.0f);
+    return before;
+}
+
+// prefix operator ++ : adds 1.0f to every element, returns the updated vector
+static inline Vec8f & operator ++ (Vec8f & a) {
+    return a = a + Vec8f(1.0f);
+}
+
+// vector operator - : subtract element by element (low and high halves separately)
+static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = a.get_low()  - b.get_low();
+    Vec4f const hi = a.get_high() - b.get_high();
+    return Vec8f(lo, hi);
+}
+
+// vector operator - : subtract scalar from vector (the scalar is broadcast first)
+static inline Vec8f operator - (Vec8f const & a, float b) {
+    Vec8f const rhs(b);
+    return a - rhs;
+}
+// vector operator - : subtract vector from scalar (the scalar is broadcast first)
+static inline Vec8f operator - (float a, Vec8f const & b) {
+    Vec8f const lhs(a);
+    return lhs - b;
+}
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec8f operator - (Vec8f const & a) {
+    Vec4f const lo = -a.get_low();
+    Vec4f const hi = -a.get_high();
+    return Vec8f(lo, hi);
+}
+
+// vector operator -= : subtract b from a in place
+static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtracts 1.0f from every element, returns the value before the decrement
+static inline Vec8f operator -- (Vec8f & a, int) {
+    Vec8f const before(a);
+    a = a - Vec8f(1.0f);
+    return before;
+}
+
+// prefix operator -- : subtracts 1.0f from every element, returns the updated vector
+static inline Vec8f & operator -- (Vec8f & a) {
+    return a = a - Vec8f(1.0f);
+}
+
+// vector operator * : multiply element by element (low and high halves separately)
+static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = a.get_low()  * b.get_low();
+    Vec4f const hi = a.get_high() * b.get_high();
+    return Vec8f(lo, hi);
+}
+
+// vector operator * : multiply vector by scalar (the scalar is broadcast first)
+static inline Vec8f operator * (Vec8f const & a, float b) {
+    Vec8f const rhs(b);
+    return a * rhs;
+}
+// vector operator * : multiply scalar by vector (the scalar is broadcast first)
+static inline Vec8f operator * (float a, Vec8f const & b) {
+    Vec8f const lhs(a);
+    return lhs * b;
+}
+
+// vector operator *= : multiply a by b in place
+static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) {
+    return a = a * b;
+}
+
+// vector operator / : divide element by element (low and high halves separately)
+static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = a.get_low()  / b.get_low();
+    Vec4f const hi = a.get_high() / b.get_high();
+    return Vec8f(lo, hi);
+}
+
+// vector operator / : divide vector by scalar (the scalar is broadcast first)
+static inline Vec8f operator / (Vec8f const & a, float b) {
+    Vec8f const rhs(b);
+    return a / rhs;
+}
+// vector operator / : divide scalar by vector (the scalar is broadcast first)
+static inline Vec8f operator / (float a, Vec8f const & b) {
+    Vec8f const lhs(a);
+    return lhs / b;
+}
+
+// vector operator /= : divide a by b in place
+static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) {
+    return a = a / b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8fb operator == (Vec8f const & a, Vec8f const & b) {
+    Vec4fb const lo = a.get_low()  == b.get_low();
+    Vec4fb const hi = a.get_high() == b.get_high();
+    return Vec8fb(lo, hi);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8fb operator != (Vec8f const & a, Vec8f const & b) {
+    Vec4fb const lo = a.get_low()  != b.get_low();
+    Vec4fb const hi = a.get_high() != b.get_high();
+    return Vec8fb(lo, hi);
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8fb operator < (Vec8f const & a, Vec8f const & b) {
+    Vec4fb const lo = a.get_low()  < b.get_low();
+    Vec4fb const hi = a.get_high() < b.get_high();
+    return Vec8fb(lo, hi);
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec8fb operator <= (Vec8f const & a, Vec8f const & b) {
+    Vec4fb const lo = a.get_low()  <= b.get_low();
+    Vec4fb const hi = a.get_high() <= b.get_high();
+    return Vec8fb(lo, hi);
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec8fb operator > (Vec8f const & a, Vec8f const & b) {
+    Vec4fb const lo = a.get_low()  > b.get_low();
+    Vec4fb const hi = a.get_high() > b.get_high();
+    return Vec8fb(lo, hi);
+}
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec8fb operator >= (Vec8f const & a, Vec8f const & b) {
+    Vec4fb const lo = a.get_low()  >= b.get_low();
+    Vec4fb const hi = a.get_high() >= b.get_high();
+    return Vec8fb(lo, hi);
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and (low and high halves separately)
+static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = a.get_low()  & b.get_low();
+    Vec4f const hi = a.get_high() & b.get_high();
+    return Vec8f(lo, hi);
+}
+
+// vector operator &= : bitwise and, result stored back into a
+static inline Vec8f & operator &= (Vec8f & a, Vec8f const & b) {
+    return a = a & b;
+}
+
+// vector operator & : bitwise and of Vec8f and Vec8fb (value on the left, mask on the right)
+static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) {
+    Vec4f const lo = a.get_low()  & b.get_low();
+    Vec4f const hi = a.get_high() & b.get_high();
+    return Vec8f(lo, hi);
+}
+// vector operator & : bitwise and of Vec8fb and Vec8f (mask on the left, value on the right)
+static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) {
+    Vec4f const lo = a.get_low()  & b.get_low();
+    Vec4f const hi = a.get_high() & b.get_high();
+    return Vec8f(lo, hi);
+}
+
+// vector operator | : bitwise or (low and high halves separately)
+static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = a.get_low()  | b.get_low();
+    Vec4f const hi = a.get_high() | b.get_high();
+    return Vec8f(lo, hi);
+}
+
+// vector operator |= : bitwise or, result stored back into a
+static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor (low and high halves separately)
+static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = a.get_low()  ^ b.get_low();
+    Vec4f const hi = a.get_high() ^ b.get_high();
+    return Vec8f(lo, hi);
+}
+
+// vector operator ^= : bitwise xor, result stored back into a
+static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec8fb operator ! (Vec8f const & a) {
+    Vec4fb const lo = !a.get_low();
+    Vec4fb const hi = !a.get_high();
+    return Vec8fb(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec8f
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
+static inline Vec8f select (Vec8fb const & s, Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = select(s.get_low(),  a.get_low(),  b.get_low());
+    Vec4f const hi = select(s.get_high(), a.get_high(), b.get_high());
+    return Vec8f(lo, hi);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+// b is and'ed with the mask first, then the masked value is added to a.
+static inline Vec8f if_add (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
+    Vec8f const masked = Vec8f(f) & b;
+    return a + masked;
+}
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+// Elements where f is false are multiplied by 1.
+static inline Vec8f if_mul (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
+    Vec8f const factor = select(f, b, Vec8f(1.f));
+    return a * factor;
+}
+
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+// The two halves are added element-wise, then the Vec4f overload sums the result.
+static inline float horizontal_add (Vec8f const & a) {
+    Vec4f const pair_sums = a.get_low() + a.get_high();
+    return horizontal_add(pair_sums);
+}
+
+// function max: a > b ? a : b
+static inline Vec8f max(Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = max(a.get_low(),  b.get_low());
+    Vec4f const hi = max(a.get_high(), b.get_high());
+    return Vec8f(lo, hi);
+}
+
+// function min: a < b ? a : b
+static inline Vec8f min(Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = min(a.get_low(),  b.get_low());
+    Vec4f const hi = min(a.get_high(), b.get_high());
+    return Vec8f(lo, hi);
+}
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec8f abs(Vec8f const & a) {
+    Vec4f const lo = abs(a.get_low());
+    Vec4f const hi = abs(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+// function sqrt: square root
+static inline Vec8f sqrt(Vec8f const & a) {
+    Vec4f const lo = sqrt(a.get_low());
+    Vec4f const hi = sqrt(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+// function square: a * a
+static inline Vec8f square(Vec8f const & a) {
+    Vec4f const lo = square(a.get_low());
+    Vec4f const hi = square(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+// pow(Vec8f, n): primary template, declared here without a body.
+// The explicit specializations for int and uint32_t that follow supply the code.
+template <typename TT> static Vec8f pow(Vec8f const & a, TT n);
+
+// Raise floating point numbers to integer power n
+// (forwards to the generic helper pow_template_i)
+template <>
+inline Vec8f pow<int>(Vec8f const & x0, int n) {
+    Vec8f const result = pow_template_i<Vec8f>(x0, n);
+    return result;
+}
+
+// allow conversion from unsigned int: n is cast to int before the call
+template <>
+inline Vec8f pow<uint32_t>(Vec8f const & x0, uint32_t n) {
+    int const exponent_as_int = (int)n;
+    return pow_template_i<Vec8f>(x0, exponent_as_int);
+}
+
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec8f pow_n(Vec8f const & a) {
+    Vec4f const lo = pow_n<n>(a.get_low());
+    Vec4f const hi = pow_n<n>(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+// pow overload selected by a Const_int_t<n> tag; forwards to pow_n<n>
+template <int n>
+static inline Vec8f pow(Vec8f const & a, Const_int_t<n>) {
+    Vec8f const result = pow_n<n>(a);
+    return result;
+}
+
+
+// function round: round to nearest integer (even). (result as float vector)
+static inline Vec8f round(Vec8f const & a) {
+    Vec4f const lo = round(a.get_low());
+    Vec4f const hi = round(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+// function truncate: round towards zero. (result as float vector)
+static inline Vec8f truncate(Vec8f const & a) {
+    Vec4f const lo = truncate(a.get_low());
+    Vec4f const hi = truncate(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+// function floor: round towards minus infinity. (result as float vector)
+static inline Vec8f floor(Vec8f const & a) {
+    Vec4f const lo = floor(a.get_low());
+    Vec4f const hi = floor(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+// function ceil: round towards plus infinity. (result as float vector)
+static inline Vec8f ceil(Vec8f const & a) {
+    Vec4f const lo = ceil(a.get_low());
+    Vec4f const hi = ceil(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec8i round_to_int(Vec8f const & a) {
+    // Note: assume MXCSR control register is set to rounding
+    Vec4i const lo = round_to_int(a.get_low());
+    Vec4i const hi = round_to_int(a.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec8i truncate_to_int(Vec8f const & a) {
+    Vec4i const lo = truncate_to_int(a.get_low());
+    Vec4i const hi = truncate_to_int(a.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function to_float: convert integer vector to float vector
+static inline Vec8f to_float(Vec8i const & a) {
+    Vec4f const lo = to_float(a.get_low());
+    Vec4f const hi = to_float(a.get_high());
+    return Vec8f(lo, hi);
+}
+#endif // VECTORI256_H 
+
+
+// Approximate math functions
+
+// approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11)
+static inline Vec8f approx_recipr(Vec8f const & a) {
+    Vec4f const lo = approx_recipr(a.get_low());
+    Vec4f const hi = approx_recipr(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11)
+static inline Vec8f approx_rsqrt(Vec8f const & a) {
+    Vec4f const lo = approx_rsqrt(a.get_low());
+    Vec4f const hi = approx_rsqrt(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+// Fused multiply and add functions
+
+// Multiply and add (delegates to the Vec4f mul_add for each half)
+static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
+    Vec4f const lo = mul_add(a.get_low(),  b.get_low(),  c.get_low());
+    Vec4f const hi = mul_add(a.get_high(), b.get_high(), c.get_high());
+    return Vec8f(lo, hi);
+}
+
+// Multiply and subtract (delegates to the Vec4f mul_sub for each half)
+static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
+    Vec4f const lo = mul_sub(a.get_low(),  b.get_low(),  c.get_low());
+    Vec4f const hi = mul_sub(a.get_high(), b.get_high(), c.get_high());
+    return Vec8f(lo, hi);
+}
+
+// Multiply and inverse subtract (delegates to the Vec4f nmul_add for each half)
+static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
+    Vec4f const lo = nmul_add(a.get_low(),  b.get_low(),  c.get_low());
+    Vec4f const hi = nmul_add(a.get_high(), b.get_high(), c.get_high());
+    return Vec8f(lo, hi);
+}
+
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec8f mul_sub_x(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
+    Vec4f const lo = mul_sub_x(a.get_low(),  b.get_low(),  c.get_low());
+    Vec4f const hi = mul_sub_x(a.get_high(), b.get_high(), c.get_high());
+    return Vec8f(lo, hi);
+}
+
+
+// Math functions using fast bit manipulation
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+static inline Vec8i exponent(Vec8f const & a) {
+    Vec4i const lo = exponent(a.get_low());
+    Vec4i const hi = exponent(a.get_high());
+    return Vec8i(lo, hi);
+}
+#endif
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
+static inline Vec8f fraction(Vec8f const & a) {
+    Vec4f const lo = fraction(a.get_low());
+    Vec4f const hi = fraction(a.get_high());
+    return Vec8f(lo, hi);
+}
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+// Fast calculation of pow(2,n) with n integer
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+static inline Vec8f exp2(Vec8i const & a) {
+    Vec4f const lo = exp2(a.get_low());
+    Vec4f const hi = exp2(a.get_high());
+    return Vec8f(lo, hi);
+}
+//static Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h
+#endif // VECTORI256_H
+
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb sign_bit(Vec8f const & a) {
+    Vec4fb const lo = sign_bit(a.get_low());
+    Vec4fb const hi = sign_bit(a.get_high());
+    return Vec8fb(lo, hi);
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec8f sign_combine(Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = sign_combine(a.get_low(),  b.get_low());
+    Vec4f const hi = sign_combine(a.get_high(), b.get_high());
+    return Vec8f(lo, hi);
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_finite(Vec8f const & a) {
+    Vec4fb const lo = is_finite(a.get_low());
+    Vec4fb const hi = is_finite(a.get_high());
+    return Vec8fb(lo, hi);
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_inf(Vec8f const & a) {
+    Vec4fb const lo = is_inf(a.get_low());
+    Vec4fb const hi = is_inf(a.get_high());
+    return Vec8fb(lo, hi);
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec8fb is_nan(Vec8f const & a) {
+    Vec4fb const lo = is_nan(a.get_low());
+    Vec4fb const hi = is_nan(a.get_high());
+    return Vec8fb(lo, hi);
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec8fb is_subnormal(Vec8f const & a) {
+    Vec4fb const lo = is_subnormal(a.get_low());
+    Vec4fb const hi = is_subnormal(a.get_high());
+    return Vec8fb(lo, hi);
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec8fb is_zero_or_subnormal(Vec8f const & a) {
+    Vec4fb const lo = is_zero_or_subnormal(a.get_low());
+    Vec4fb const hi = is_zero_or_subnormal(a.get_high());
+    return Vec8fb(lo, hi);
+}
+
+// Function infinite8f: returns a vector where all elements are +INF
+static inline Vec8f infinite8f() {
+    const int inf_bits = 0x7F800000;  // same bit pattern for each of the 8 elements
+    return constant8f<inf_bits,inf_bits,inf_bits,inf_bits,inf_bits,inf_bits,inf_bits,inf_bits>();
+}
+
+// Function nan8f: returns a vector where all elements are +NAN (quiet)
+// Both halves are produced by nan4f(n).
+static inline Vec8f nan8f(int n = 0x10) {
+    Vec4f const lo = nan4f(n);
+    Vec4f const hi = nan4f(n);
+    return Vec8f(lo, hi);
+}
+
+// change signs on vectors Vec8f
+// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f change_sign(Vec8f const & a) {
+    const bool any_flip = (i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) != 0;
+    if (!any_flip) {
+        return a;  // nothing to change
+    }
+    // i0-i3 apply to the low half, i4-i7 to the high half
+    return Vec8f(change_sign<i0,i1,i2,i3>(a.get_low()),
+                 change_sign<i4,i5,i6,i7>(a.get_high()));
+}
+
+
+/*****************************************************************************
+*
+*          Vec4d: Vector of 4 double precision floating point values
+*
+*****************************************************************************/
+
+// Vector of 4 doubles held in the two halves y0 (elements 0-1) and y1 (elements 2-3)
+// inherited from Vec256de.
+class Vec4d : public Vec256de {
+public:
+    // Default constructor:
+    Vec4d() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec4d(double d) {
+        y1 = y0 = _mm_set1_pd(d);
+    }
+    // Constructor to build from all elements:
+    Vec4d(double d0, double d1, double d2, double d3) {
+        y0 = _mm_setr_pd(d0, d1);
+        y1 = _mm_setr_pd(d2, d3);
+    }
+    // Constructor to build from two Vec2d (a0 = low half, a1 = high half):
+    Vec4d(Vec2d const & a0, Vec2d const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256de
+    Vec4d(Vec256de const & x) {
+        y0 = x.get_low();
+        y1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec256de
+    Vec4d & operator = (Vec256de const & x) {
+        y0 = x.get_low();
+        y1 = x.get_high();
+        return *this;
+    }
+    // Member function to load from array (unaligned)
+    Vec4d & load(double const * p) {
+        y0 = _mm_loadu_pd(p);
+        y1 = _mm_loadu_pd(p+2);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 32
+    Vec4d & load_a(double const * p) {
+        y0 = _mm_load_pd(p);
+        y1 = _mm_load_pd(p+2);
+        return *this;
+    }
+    // Member function to store into array (unaligned)
+    void store(double * p) const {
+        _mm_storeu_pd(p,   y0);
+        _mm_storeu_pd(p+2, y1);
+    }
+    // Member function to store into array, aligned by 32
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 32
+    void store_a(double * p) const {
+        _mm_store_pd(p,   y0);
+        _mm_store_pd(p+2, y1);
+    }
+    // Partial load. Load n elements and set the rest to 0
+    // Any n outside 1..4 gives an all-zero vector.
+    Vec4d & load_partial(int n, double const * p) {
+        if (n > 0 && n <= 2) {
+            *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd());
+        }
+        else if (n > 2 && n <= 4) {
+            *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2));
+        }
+        else {
+            y1 = y0 = _mm_setzero_pd();
+        }
+        return *this;
+    }
+    // Partial store. Store n elements. Nothing is stored when n > 4.
+    void store_partial(int n, double * p) const {
+        if (n <= 2) {
+            get_low().store_partial(n, p);
+        }
+        else if (n <= 4) {
+            get_low().store(p);
+            get_high().store_partial(n - 2, p + 2);
+        }
+    }
+    // cut off vector to n elements. The last 4-n elements are set to zero.
+    // The vector is left unchanged when uint32_t(n) >= 4.
+    Vec4d & cutoff(int n) {
+        if (uint32_t(n) >= 4) return *this;
+        else if (n >= 2) {
+            y1 = Vec2d(y1).cutoff(n - 2);
+        }
+        else {
+            y0 = Vec2d(y0).cutoff(n);
+            y1 = Vec2d(0.0);
+        }
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec4d const & insert(uint32_t index, double value) {
+        if (index < 2) {
+            y0 = Vec2d(y0).insert(index, value);
+        }
+        else {
+            y1 = Vec2d(y1).insert(index-2, value);
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    double extract(uint32_t index) const {
+        if (index < 2) {
+            return Vec2d(y0).extract(index);
+        }
+        else {
+            return Vec2d(y1).extract(index-2);
+        }
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    double operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec2d:
+    Vec2d get_low() const {
+        return y0;
+    }
+    Vec2d get_high() const {
+        return y1;
+    }
+    // Number of elements in the vector: two per half, four in total
+    static int size () {
+        return 4;
+    }
+};
+
+
+
+/*****************************************************************************
+*
+*          Operators for Vec4d
+*
+*****************************************************************************/
+
+// vector operator + : add element by element (low and high halves separately)
+static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) {
+    Vec2d const lo = a.get_low()  + b.get_low();
+    Vec2d const hi = a.get_high() + b.get_high();
+    return Vec4d(lo, hi);
+}
+
+// vector operator + : add vector and scalar (the scalar is broadcast first)
+static inline Vec4d operator + (Vec4d const & a, double b) {
+    Vec4d const rhs(b);
+    return a + rhs;
+}
+// vector operator + : add scalar and vector (the scalar is broadcast first)
+static inline Vec4d operator + (double a, Vec4d const & b) {
+    Vec4d const lhs(a);
+    return lhs + b;
+}
+
+// vector operator += : add b to a in place
+static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : adds 1.0 to every element, returns the value before the increment
+static inline Vec4d operator ++ (Vec4d & a, int) {
+    Vec4d const before(a);
+    a = a + Vec4d(1.0);
+    return before;
+}
+
+// prefix operator ++ : adds 1.0 to every element, returns the updated vector
+static inline Vec4d & operator ++ (Vec4d & a) {
+    return a = a + Vec4d(1.0);
+}
+
+// vector operator - : subtract element by element (low and high halves separately)
+static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) {
+    Vec2d const lo = a.get_low()  - b.get_low();
+    Vec2d const hi = a.get_high() - b.get_high();
+    return Vec4d(lo, hi);
+}
+
+// vector operator - : subtract scalar from vector (the scalar is broadcast first)
+static inline Vec4d operator - (Vec4d const & a, double b) {
+    Vec4d const rhs(b);
+    return a - rhs;
+}
+// vector operator - : subtract vector from scalar (the scalar is broadcast first)
+static inline Vec4d operator - (double a, Vec4d const & b) {
+    Vec4d const lhs(a);
+    return lhs - b;
+}
+
+// vector operator - : unary minus
+// Change sign bit, even for 0, INF and NAN
+static inline Vec4d operator - (Vec4d const & a) {
+    Vec2d const lo = -a.get_low();
+    Vec2d const hi = -a.get_high();
+    return Vec4d(lo, hi);
+}
+
+// vector operator -= : subtract b from a in place
+static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtracts 1.0 from every element, returns the value before the decrement
+static inline Vec4d operator -- (Vec4d & a, int) {
+    Vec4d const before(a);
+    a = a - Vec4d(1.0);
+    return before;
+}
+
+// prefix operator -- : subtracts 1.0 from every element, returns the updated vector
+static inline Vec4d & operator -- (Vec4d & a) {
+    return a = a - Vec4d(1.0);
+}
+
+// vector operator * : multiply element by element (low and high halves separately)
+static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) {
+    Vec2d const lo = a.get_low()  * b.get_low();
+    Vec2d const hi = a.get_high() * b.get_high();
+    return Vec4d(lo, hi);
+}
+
+// vector operator * : multiply vector by scalar (the scalar is broadcast first)
+static inline Vec4d operator * (Vec4d const & a, double b) {
+    Vec4d const rhs(b);
+    return a * rhs;
+}
+// vector operator * : multiply scalar by vector (the scalar is broadcast first)
+static inline Vec4d operator * (double a, Vec4d const & b) {
+    Vec4d const lhs(a);
+    return lhs * b;
+}
+
+// vector operator *= : multiply a by b in place
+static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) {
+    return a = a * b;
+}
+
+// vector operator / : divide element by element (low and high halves separately)
+static inline Vec4d operator / (Vec4d const & a, Vec4d const & b) {
+    Vec2d const lo = a.get_low()  / b.get_low();
+    Vec2d const hi = a.get_high() / b.get_high();
+    return Vec4d(lo, hi);
+}
+
+// vector operator / : divide vector by scalar (the scalar is broadcast first)
+static inline Vec4d operator / (Vec4d const & a, double b) {
+    Vec4d const rhs(b);
+    return a / rhs;
+}
+// vector operator / : divide scalar by vector (the scalar is broadcast first)
+static inline Vec4d operator / (double a, Vec4d const & b) {
+    Vec4d const lhs(a);
+    return lhs / b;
+}
+
+// vector operator /= : divide a by b in place
+static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) {
+    return a = a / b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec4db operator == (Vec4d const & a, Vec4d const & b) {
+    Vec2db const lo = a.get_low()  == b.get_low();
+    Vec2db const hi = a.get_high() == b.get_high();
+    return Vec4db(lo, hi);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec4db operator != (Vec4d const & a, Vec4d const & b) {
+    return Vec4db(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec4db operator < (Vec4d const & a, Vec4d const & b) {
+    return Vec4db(a.get_low() < b.get_low(), a.get_high() < b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
+    return Vec4db(a.get_low() <= b.get_low(), a.get_high() <= b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec4db operator > (Vec4d const & a, Vec4d const & b) {
+    return Vec4db(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator >= : returns true for elements for which a >= b
+static inline Vec4db operator >= (Vec4d const & a, Vec4d const & b) {
+    return Vec4db(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and
+static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) {
+    return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+// vector operator &= : bitwise and
+static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) {
+    a = a & b;
+    return a;
+}
+
+// vector operator & : bitwise and of Vec4d and Vec4db
+static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) {
+    return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) {
+    return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+// vector operator | : bitwise or
+static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) {
+    return Vec4d(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+
+// vector operator |= : bitwise or
+static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) {
+    a = a | b;
+    return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) {
+    return Vec4d(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) {
+    a = a ^ b;
+    return a;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+static inline Vec4db operator ! (Vec4d const & a) {
+    return Vec4db(!a.get_low(), !a.get_high());
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec4d
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
+// No other values are allowed.
+static inline Vec4d select (Vec4db const & s, Vec4d const & a, Vec4d const & b) {
+    return Vec4d(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4d if_add (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
+    return a + (Vec4d(f) & b);
+}
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
+    return a * select(f, b, 1.0);  // double literal: elements where f is false are multiplied by 1.0
+}
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline double horizontal_add (Vec4d const & a) {
+    return horizontal_add(a.get_low() + a.get_high());
+}
+
+// function max: a > b ? a : b
+static inline Vec4d max(Vec4d const & a, Vec4d const & b) {
+    return Vec4d(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec4d min(Vec4d const & a, Vec4d const & b) {
+    return Vec4d(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
+}
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0, -INF and -NAN
+static inline Vec4d abs(Vec4d const & a) {
+    return Vec4d(abs(a.get_low()), abs(a.get_high()));
+}
+
+// function sqrt: square root
+static inline Vec4d sqrt(Vec4d const & a) {
+    return Vec4d(sqrt(a.get_low()), sqrt(a.get_high()));
+}
+
+// function square: a * a
+static inline Vec4d square(Vec4d const & a) {
+    return Vec4d(square(a.get_low()), square(a.get_high()));
+}
+
+// pow(Vec4d, int):
+// Raise floating point numbers to integer power n
+template <typename TT> static Vec4d pow(Vec4d const & a, TT n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec4d pow<int>(Vec4d const & x0, int n) {
+    return pow_template_i<Vec4d>(x0, n);
+}
+
+// specialization for uint32_t exponents: n is cast to int before being passed on
+template <>
+inline Vec4d pow<uint32_t>(Vec4d const & x0, uint32_t n) {
+    return pow_template_i<Vec4d>(x0, (int)n);
+}
+
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec4d pow_n(Vec4d const & a) {
+    return Vec4d(pow_n<n>(a.get_low()), pow_n<n>(a.get_high()));
+}
+
+template <int n>
+static inline Vec4d pow(Vec4d const & a, Const_int_t<n>) {
+    return pow_n<n>(a);
+}
+
+
+// function round: round to nearest integer (even). (result as double vector)
+static inline Vec4d round(Vec4d const & a) {
+    return Vec4d(round(a.get_low()), round(a.get_high()));
+}
+
+// function truncate: round towards zero. (result as double vector)
+static inline Vec4d truncate(Vec4d const & a) {
+    return Vec4d(truncate(a.get_low()), truncate(a.get_high()));
+}
+
+// function floor: round towards minus infinity. (result as double vector)
+static inline Vec4d floor(Vec4d const & a) {
+    return Vec4d(floor(a.get_low()), floor(a.get_high()));
+}
+
+// function ceil: round towards plus infinity. (result as double vector)
+static inline Vec4d ceil(Vec4d const & a) {
+    return Vec4d(ceil(a.get_low()), ceil(a.get_high()));
+}
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+static inline Vec4i round_to_int(Vec4d const & a) {
+    // Note: assume MXCSR control register is set to rounding
+    return round_to_int(a.get_low(), a.get_high());
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec4i truncate_to_int(Vec4d const & a) {
+    return truncate_to_int(a.get_low(), a.get_high());
+}
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available
+
+// function truncate_to_int64: round towards zero. (inefficient)
+static inline Vec4q truncate_to_int64(Vec4d const & a) {
+    double aa[4];
+    a.store(aa);
+    return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3]));
+}
+
+// function truncate_to_int64_limited: round towards zero.
+// result as 64-bit integer vector, but with limited range
+static inline Vec4q truncate_to_int64_limited(Vec4d const & a) {
+    return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high()));
+}
+
+// function round_to_int64: round to nearest or even. (inefficient)
+static inline Vec4q round_to_int64(Vec4d const & a) {
+    return truncate_to_int64(round(a));
+}
+
+// function round_to_int64_limited: round to nearest integer
+// result as 64-bit integer vector, but with limited range
+static inline Vec4q round_to_int64_limited(Vec4d const & a) {
+    return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high()));
+}
+
+// function to_double: convert integer vector elements to double vector (inefficient)
+static inline Vec4d to_double(Vec4q const & a) {
+    int64_t aa[4];
+    a.store(aa);
+    return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3]));
+}
+
+// function to_double_limited: convert integer vector elements to double vector
+// limited to abs(x) < 2^31
+static inline Vec4d to_double_limited(Vec4q const & x) {
+    return Vec4d(to_double_limited(x.get_low()),to_double_limited(x.get_high()));
+}
+
+#endif  // VECTORI256_H
+
+// function to_double: convert integer vector to double vector
+static inline Vec4d to_double(Vec4i const & a) {
+    return Vec4d(to_double_low(a), to_double_high(a));
+}
+
+// function compress: convert two Vec4d to one Vec8f
+static inline Vec8f compress (Vec4d const & low, Vec4d const & high) {
+    return Vec8f(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
+}
+
+// Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d
+static inline Vec4d extend_low (Vec8f const & a) {
+    return Vec4d(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d
+static inline Vec4d extend_high (Vec8f const & a) {
+    return Vec4d(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+
+// Fused multiply and add functions
+
+// Multiply and add
+static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
+    return Vec4d(mul_add(a.get_low(),b.get_low(),c.get_low()), mul_add(a.get_high(),b.get_high(),c.get_high()));
+}
+
+// Multiply and subtract
+static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
+    return Vec4d(mul_sub(a.get_low(),b.get_low(),c.get_low()), mul_sub(a.get_high(),b.get_high(),c.get_high()));
+}
+
+// Multiply and inverse subtract
+static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
+    return Vec4d(nmul_add(a.get_low(),b.get_low(),c.get_low()), nmul_add(a.get_high(),b.get_high(),c.get_high()));
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations, 
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+static inline Vec4d mul_sub_x(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
+    return Vec4d(mul_sub_x(a.get_low(),b.get_low(),c.get_low()), mul_sub_x(a.get_high(),b.get_high(),c.get_high()));
+}
+
+
+// Math functions using fast bit manipulation
+
+#ifdef VECTORI256_H  // 256 bit integer vectors are available, AVX2
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+static inline Vec4q exponent(Vec4d const & a) {
+    return Vec4q(exponent(a.get_low()), exponent(a.get_high()));
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25 
+static inline Vec4d fraction(Vec4d const & a) {
+    return Vec4d(fraction(a.get_low()), fraction(a.get_high()));
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+static inline Vec4d exp2(Vec4q const & a) {
+    return Vec4d(exp2(a.get_low()), exp2(a.get_high()));
+}
+//static Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h
+#endif
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false
+static inline Vec4db sign_bit(Vec4d const & a) {
+    return Vec4db(sign_bit(a.get_low()), sign_bit(a.get_high()));
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec4d sign_combine(Vec4d const & a, Vec4d const & b) {
+    return Vec4d(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high()));
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero, 
+// false for INF and NAN
+static inline Vec4db is_finite(Vec4d const & a) {
+    return Vec4db(is_finite(a.get_low()), is_finite(a.get_high()));
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+static inline Vec4db is_inf(Vec4d const & a) {
+    return Vec4db(is_inf(a.get_low()), is_inf(a.get_high()));
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+static inline Vec4db is_nan(Vec4d const & a) {
+    return Vec4db(is_nan(a.get_low()), is_nan(a.get_high()));
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec4db is_subnormal(Vec4d const & a) {
+    return Vec4db(is_subnormal(a.get_low()), is_subnormal(a.get_high()));
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec4db is_zero_or_subnormal(Vec4d const & a) {
+    return Vec4db(is_zero_or_subnormal(a.get_low()),is_zero_or_subnormal(a.get_high()));
+}
+
+// Function infinite4d: returns a vector where all elements are +INF
+static inline Vec4d infinite4d() {
+    return Vec4d(infinite2d(), infinite2d());
+}
+
+// Function nan4d: returns a vector where all elements are +NAN (quiet)
+static inline Vec4d nan4d(int n = 0x10) {
+    return Vec4d(nan2d(n), nan2d(n));
+}
+
+// change signs on vectors Vec4d
+// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d change_sign(Vec4d const & a) {
+    if ((i0 | i1 | i2 | i3) == 0) return a;
+    Vec2d lo = change_sign<i0,i1>(a.get_low());
+    Vec2d hi = change_sign<i2,i3>(a.get_high());
+    return Vec4d(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Functions for reinterpretation between vector types
+*
+*****************************************************************************/
+
+static inline Vec256ie reinterpret_i (Vec256ie const & x) {
+    return x;
+}
+
+static inline Vec256ie reinterpret_i (Vec256fe  const & x) {
+    return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
+}
+
+static inline Vec256ie reinterpret_i (Vec256de const & x) {
+    return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
+}
+
+static inline Vec256fe  reinterpret_f (Vec256ie const & x) {
+    return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high()));
+}
+
+static inline Vec256fe  reinterpret_f (Vec256fe  const & x) {
+    return x;
+}
+
+static inline Vec256fe  reinterpret_f (Vec256de const & x) {
+    return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high()));
+}
+
+static inline Vec256de reinterpret_d (Vec256ie const & x) {
+    return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high()));
+}
+
+static inline Vec256de reinterpret_d (Vec256fe  const & x) {
+    return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high()));
+}
+
+static inline Vec256de reinterpret_d (Vec256de const & x) {
+    return x;
+}
+
+
+/*****************************************************************************
+*
+*          Vector permute and blend functions
+*
+******************************************************************************
+*
+* The permute function can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select. An index of -1 will generate zero. An index of -256 means don't care.
+*
+* Example:
+* Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
+* Vec4d b;
+* b = permute4d<1,0,-1,3>(a);     // b is (11, 10,  0, 13)
+*
+*
+* The blend function can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where indexes 0 - 3 indicate an element from the first source
+* vector and indexes 4 - 7 indicate an element from the second source vector.
+* A negative index will generate zero.
+*
+*
+* Example:
+* Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
+* Vec4d b(20., 21., 22., 23.);    // b is (20, 21, 22, 23)
+* Vec4d c;
+* c = blend4d<4,3,7,-1> (a,b);    // c is (20, 13, 23,  0)
+*****************************************************************************/
+
+// permute vector Vec4d
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d permute4d(Vec4d const & a) {
+    return Vec4d(blend2d<i0,i1> (a.get_low(), a.get_high()), 
+           blend2d<i2,i3> (a.get_low(), a.get_high()));
+}
+
+// helper for blend4d: n = 0/1 gives the low/high half of a, n = 2/3 the low/high half of b, any other n gives zero
+template <int n>
+static inline Vec2d select4(Vec4d const & a, Vec4d const & b) {
+    switch (n) {
+    case 0:
+        return a.get_low();
+    case 1:
+        return a.get_high();
+    case 2:
+        return b.get_low();
+    case 3:
+        return b.get_high();
+    }
+    return _mm_setzero_pd();
+}
+
+// blend vectors Vec4d: indexes 0 - 3 pick an element of a, 4 - 7 an element of b, a negative index gives zero
+template <int i0, int i1, int i2, int i3>
+static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) {
+    const int j0 = i0 >= 0 ? i0/2 : i0;
+    const int j1 = i1 >= 0 ? i1/2 : i1;
+    const int j2 = i2 >= 0 ? i2/2 : i2;
+    const int j3 = i3 >= 0 ? i3/2 : i3;    
+    Vec2d x0, x1;
+
+    if (j0 == j1 || i0 < 0 || i1 < 0) {  // one source half suffices: same half, or an index is negative
+        const int k0 = j0 >= 0 ? j0 : j1;
+        x0 = permute2d<i0 & -7, i1 & -7> (select4<k0> (a,b));
+    }
+    else {
+        x0 = blend2d<i0 & -7, (i1 & -7) | 2> (select4<j0>(a,b), select4<j1>(a,b));
+    }
+    if (j2 == j3 || i2 < 0 || i3 < 0) {  // one source half suffices: same half, or an index is negative
+        const int k1 = j2 >= 0 ? j2 : j3;
+        x1 = permute2d<i2 & -7, i3 & -7> (select4<k1> (a,b));
+    }
+    else {
+        x1 = blend2d<i2 & -7, (i3 & -7) | 2> (select4<j2>(a,b), select4<j3>(a,b));
+    }
+    return Vec4d(x0,x1);
+}
+
+/*****************************************************************************
+*
+*          Vector Vec8f permute and blend functions
+*
+*****************************************************************************/
+
+// permute vector Vec8f
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f permute8f(Vec8f const & a) {
+    return Vec8f(blend4f<i0,i1,i2,i3> (a.get_low(), a.get_high()), 
+                 blend4f<i4,i5,i6,i7> (a.get_low(), a.get_high()));
+}
+
+// helper for blend8f: n = 0/1 gives the low/high half of a, n = 2/3 the low/high half of b, any other n gives zero
+template <int n>
+static inline Vec4f select4(Vec8f const & a, Vec8f const & b) {
+    switch (n) {
+    case 0:
+        return a.get_low();
+    case 1:
+        return a.get_high();
+    case 2:
+        return b.get_low();
+    case 3:
+        return b.get_high();
+    }
+    return _mm_setzero_ps();
+}
+
+// blend vectors Vec8f
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) {
+    const int j0 = i0 >= 0 ? i0/4 : i0;
+    const int j1 = i1 >= 0 ? i1/4 : i1;
+    const int j2 = i2 >= 0 ? i2/4 : i2;
+    const int j3 = i3 >= 0 ? i3/4 : i3;
+    const int j4 = i4 >= 0 ? i4/4 : i4;
+    const int j5 = i5 >= 0 ? i5/4 : i5;
+    const int j6 = i6 >= 0 ? i6/4 : i6;
+    const int j7 = i7 >= 0 ? i7/4 : i7;
+    Vec4f x0, x1;
+
+    const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
+    const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+    const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
+    const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    if (r0 < 0) {
+        x0 = _mm_setzero_ps();
+    }
+    else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) { 
+        // i0 - i3 all from same source
+        x0 = permute4f<i0 & -13, i1 & -13, i2 & -13, i3 & -13> (select4<r0> (a,b));
+    }
+    else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) { 
+        // i0 - i3 all from two sources
+        const int k0 =  i0 >= 0 ? i0 & 3 : i0;
+        const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
+        const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
+        const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
+        x0 = blend4f<k0,k1,k2,k3> (select4<r0>(a,b), select4<s0>(a,b));
+    }
+    else {
+        // i0 - i3 from three or four different sources
+        x0 = blend4f<0,1,6,7> (
+             blend4f<i0 & -13, (i1 & -13) | 4, -0x100, -0x100> (select4<j0>(a,b), select4<j1>(a,b)),
+             blend4f<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4<j2>(a,b), select4<j3>(a,b)));
+    }
+
+    if (r1 < 0) {
+        x1 = _mm_setzero_ps();
+    }
+    else if (((m1 ^ r1*0x44440000u) & 0xCCCC0000 & mz) == 0) { 
+        // i4 - i7 all from same source
+        x1 = permute4f<i4 & -13, i5 & -13, i6 & -13, i7 & -13> (select4<r1> (a,b));
+    }
+    else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) { 
+        // i4 - i7 all from two sources
+        const int k4 =  i4 >= 0 ? i4 & 3 : i4;
+        const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
+        const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
+        const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
+        x1 = blend4f<k4,k5,k6,k7> (select4<r1>(a,b), select4<s1>(a,b));
+    }
+    else {
+        // i4 - i7 from three or four different sources
+        x1 = blend4f<0,1,6,7> (
+             blend4f<i4 & -13, (i5 & -13) | 4, -0x100, -0x100> (select4<j4>(a,b), select4<j5>(a,b)),
+             blend4f<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4<j6>(a,b), select4<j7>(a,b)));
+    }
+
+    return Vec8f(x0,x1);
+}
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec4i a(2,0,0,3);               // index  a is (  2,   0,   0,   3)
+* Vec4f b(1.0f,1.1f,1.2f,1.3f);   // table  b is (1.0, 1.1, 1.2, 1.3)
+* Vec4f c;
+* c = lookup4 (a,b);              // result c is (1.2, 1.0, 1.0, 1.3)
+*
+*****************************************************************************/
+
+#ifdef VECTORI256_H  // Vec8i and Vec4q must be defined
+
+static inline Vec8f lookup8(Vec8i const & index, Vec8f const & table) {
+    Vec4f  r0 = lookup8(index.get_low() , table.get_low(), table.get_high());
+    Vec4f  r1 = lookup8(index.get_high(), table.get_low(), table.get_high());
+    return Vec8f(r0, r1);
+}
+
+template <int n>
+static inline Vec8f lookup(Vec8i const & index, float const * table) {
+    if (n <= 0) return 0;
+    if (n <= 4) {
+        Vec4f table1 = Vec4f().load(table);        
+        return Vec8f(       
+            lookup4 (index.get_low(),  table1),
+            lookup4 (index.get_high(), table1));
+    }
+    if (n <= 8) {
+        return lookup8(index, Vec8f().load(table));
+    }
+    // Limit index
+    Vec8ui index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec8ui(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        index1 = min(Vec8ui(index), n-1);
+    }
+    return Vec8f(table[index1[0]],table[index1[1]],table[index1[2]],table[index1[3]],
+    table[index1[4]],table[index1[5]],table[index1[6]],table[index1[7]]);
+}
+
+static inline Vec4d lookup4(Vec4q const & index, Vec4d const & table) {
+    Vec2d  r0 = lookup4(index.get_low() , table.get_low(), table.get_high());
+    Vec2d  r1 = lookup4(index.get_high(), table.get_low(), table.get_high());
+    return Vec4d(r0, r1);
+}
+
+template <int n>
+static inline Vec4d lookup(Vec4q const & index, double const * table) {
+    if (n <= 0) return 0;
+    if (n <= 2) {
+        Vec2d table1 = Vec2d().load(table);        
+        return Vec4d(       
+            lookup2 (index.get_low(),  table1),
+            lookup2 (index.get_high(), table1));
+    }
+    // Limit index
+    Vec8ui index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec8ui(index) & constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>();
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        index1 = min(Vec8ui(index), constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>() );
+    }
+    Vec4q index2 = Vec4q(index1);
+    return Vec4d(table[index2[0]],table[index2[1]],table[index2[2]],table[index2[3]]);
+}
+#endif  // VECTORI256_H
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+
+static inline int horizontal_find_first(Vec8fb const & x) {
+    return horizontal_find_first(Vec8ib(x));
+}
+
+static inline int horizontal_find_first(Vec4db const & x) {
+    return horizontal_find_first(Vec4qb(x));
+} 
+
+// Count the number of elements that are true
+static inline uint32_t horizontal_count(Vec8fb const & x) {
+    return horizontal_count(Vec8ib(x));
+}
+
+static inline uint32_t horizontal_count(Vec4db const & x) {
+    return horizontal_count(Vec4qb(x));
+}
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8fb const & x) {
+    return to_bits(Vec8ib(x));
+}
+
+// to_Vec8fb: convert integer bitfield to boolean vector
+static inline Vec8fb to_Vec8fb(uint8_t x) {
+    return Vec8fb(to_Vec8ib(x));
+}
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4db const & x) {
+    return to_bits(Vec4qb(x));
+}
+
+// to_Vec4db: convert integer bitfield to boolean vector
+static inline Vec4db to_Vec4db(uint8_t x) {
+    return Vec4db(to_Vec4qb(x));
+}
+
+#endif // VECTORF256_H
diff --git a/vectorclass/vectorf512.h b/vectorclass/vectorf512.h
new file mode 100755
index 0000000..66c1263
--- /dev/null
+++ b/vectorclass/vectorf512.h
@@ -0,0 +1,2366 @@
+/****************************  vectorf512.h   *******************************
+* Author:        Agner Fog
+* Date created:  2014-07-23
+* Last modified: 2014-10-22
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining floating point vector classes as interface to intrinsic 
+* functions in x86 microprocessors with AVX512 and later instruction sets.
+*
+* Instructions:
+* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired 
+* instruction set, which must be at least AVX512F. 
+*
+* The following vector classes are defined here:
+* Vec16f    Vector of  16  single precision floating point numbers
+* Vec16fb   Vector of  16  Booleans for use with Vec16f
+* Vec8d     Vector of   8  double precision floating point numbers
+* Vec8db    Vector of   8  Booleans for use with Vec8d
+*
+* Each vector object is represented internally in the CPU as a 512-bit register.
+* This header file defines operators and functions for these vectors.
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+
+// check combination of header files
+#if defined (VECTORF512_H)
+#if    VECTORF512_H != 2
+#error Two different versions of vectorf512.h included
+#endif
+#else
+#define VECTORF512_H  2
+
+#include "vectori512.h"
+
+// Define missing intrinsic functions
+#if defined (GCC_VERSION) && GCC_VERSION < 41102 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+
+// Fallback for compilers lacking the intrinsic: reinterpret __m512d as __m512 (bits unchanged)
+static inline __m512 _mm512_castpd_ps(__m512d x) {
+    union {
+        __m512d src;
+        __m512  dst;
+    } pun;
+    pun.src = x;
+    return pun.dst;
+}
+
+// Fallback for compilers lacking the intrinsic: reinterpret __m512 as __m512d (bits unchanged)
+static inline __m512d _mm512_castps_pd(__m512 x) {
+    union {
+        __m512  src;
+        __m512d dst;
+    } pun;
+    pun.src = x;
+    return pun.dst;
+}
+
+
+// Fallback for compilers lacking the intrinsic: reinterpret __m512 as __m512i (bits unchanged)
+static inline __m512i _mm512_castps_si512(__m512 x) {
+    union {
+        __m512  src;
+        __m512i dst;
+    } pun;
+    pun.src = x;
+    return pun.dst;
+}
+
+// Fallback for compilers lacking the intrinsic: reinterpret __m512i as __m512 (bits unchanged)
+static inline __m512 _mm512_castsi512_ps(__m512i x) {
+    union {
+        __m512i src;
+        __m512  dst;
+    } pun;
+    pun.src = x;
+    return pun.dst;
+}
+
+// Fallback for compilers lacking the intrinsic: reinterpret __m512d as __m512i (bits unchanged)
+static inline __m512i _mm512_castpd_si512(__m512d x) {
+    union {
+        __m512d src;
+        __m512i dst;
+    } pun;
+    pun.src = x;
+    return pun.dst;
+}
+
+// Fallback for compilers lacking the intrinsic: reinterpret __m512i as __m512d (bits unchanged)
+static inline __m512d _mm512_castsi512_pd(__m512i x) {
+    union {
+        __m512i src;
+        __m512d dst;
+    } pun;
+    pun.src = x;
+    return pun.dst;
+}
+
+// Fallback for compilers lacking the intrinsic: widen __m256 to __m512.
+// Only the part of the union overlapping the __m256 member is written; the rest is not initialized.
+static inline __m512 _mm512_castps256_ps512(__m256 x) {
+    union {
+        __m256 narrow;
+        __m512 wide;
+    } pun;
+    pun.narrow = x;
+    return pun.wide;
+}
+
+// Fallback for compilers lacking the intrinsic: narrow __m512 to __m256
+// by reading the __m256 member that overlays the start of the union.
+static inline __m256 _mm512_castps512_ps256(__m512 x) {
+    union {
+        __m512 wide;
+        __m256 narrow;
+    } pun;
+    pun.wide = x;
+    return pun.narrow;
+}
+
+// Fallback for compilers lacking the intrinsic: widen __m256d to __m512d.
+// Only the part of the union overlapping the __m256d member is written; the rest is not initialized.
+static inline __m512d _mm512_castpd256_pd512(__m256d x) {
+    union {
+        __m256d narrow;
+        __m512d wide;
+    } pun;
+    pun.narrow = x;
+    return pun.wide;
+}
+
+// Fallback for compilers lacking the intrinsic: narrow __m512d to __m256d
+// by reading the __m256d member that overlays the start of the union.
+static inline __m256d _mm512_castpd512_pd256(__m512d x) {
+    union {
+        __m512d wide;
+        __m256d narrow;
+    } pun;
+    pun.wide = x;
+    return pun.narrow;
+}
+
+#endif 
+
+
+/*****************************************************************************
+*
+*          Vec16fb: Vector of 16 Booleans for use with Vec16f
+*
+*****************************************************************************/
+// Boolean companion type for Vec16f. It adds no data of its own: every constructor and
+// assignment below writes the m16 member, which is not declared here and therefore comes
+// from the Vec16b base class.
+class Vec16fb : public Vec16b {
+public:
+    // Default constructor: leaves the mask uninitialized
+    Vec16fb () {
+    }
+    // Constructor to convert from the generic 16-element Boolean vector:
+    Vec16fb (Vec16b x) {
+        m16 = x;
+    }
+    // Constructor to build from all elements:
+    Vec16fb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
+        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) :
+        Vec16b(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {
+    }
+    // Constructor to convert from type __mmask16 used in intrinsics:
+    Vec16fb (__mmask16 x) {
+        m16 = x;
+    }
+    // Constructor to broadcast single value:
+    Vec16fb(bool b) : Vec16b(b) {}
+private: // Prevent constructing from int, etc.
+    // Declared but not defined here: an int argument selects this inaccessible overload
+    // instead of silently converting to bool or __mmask16.
+    Vec16fb(int b);
+public:
+    // Constructor to make from two halves
+    Vec16fb (Vec8fb const & x0, Vec8fb const & x1) {
+        m16 = Vec16b(Vec8ib(x0), Vec8ib(x1));
+    }
+    // Assignment operator to convert from type __mmask16 used in intrinsics:
+    Vec16fb & operator = (__mmask16 x) {
+        m16 = x;
+        return *this;
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec16fb & operator = (bool b) {
+        m16 = Vec16b(b);
+        return *this;
+    }
+private: // Prevent assigning int because of ambiguity
+    // Declared but not defined here, for the same reason as the private int constructor.
+    Vec16fb & operator = (int x);
+public:
+};
+
+// Define operators for Vec16fb
+
+// vector operator & : element-wise and of two Boolean vectors (done on the Vec16b base type)
+static inline Vec16fb operator & (Vec16fb a, Vec16fb b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs & rhs;
+}
+// vector operator && : same as operator & (both operands are always evaluated)
+static inline Vec16fb operator && (Vec16fb a, Vec16fb b) {
+    Vec16fb const both = a & b;
+    return both;
+}
+
+// vector operator | : element-wise or of two Boolean vectors (done on the Vec16b base type)
+static inline Vec16fb operator | (Vec16fb a, Vec16fb b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs | rhs;
+}
+// vector operator || : same as operator | (both operands are always evaluated)
+static inline Vec16fb operator || (Vec16fb a, Vec16fb b) {
+    Vec16fb const either = a | b;
+    return either;
+}
+
+// vector operator ^ : element-wise xor of two Boolean vectors (done on the Vec16b base type)
+static inline Vec16fb operator ^ (Vec16fb a, Vec16fb b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs ^ rhs;
+}
+
+// vector operator ~ : invert every element (done on the Vec16b base type)
+static inline Vec16fb operator ~ (Vec16fb a) {
+    Vec16b const plain(a);
+    return ~plain;
+}
+
+// vector operator ! : element not; identical to operator ~
+static inline Vec16fb operator ! (Vec16fb a) {
+    Vec16fb const inverted = ~a;
+    return inverted;
+}
+
+// vector operator &= : and b into a, returning a
+static inline Vec16fb & operator &= (Vec16fb & a, Vec16fb b) {
+    return a = a & b;
+}
+
+// vector operator |= : or b into a, returning a
+static inline Vec16fb & operator |= (Vec16fb & a, Vec16fb b) {
+    return a = a | b;
+}
+
+// vector operator ^= : xor b into a, returning a
+static inline Vec16fb & operator ^= (Vec16fb & a, Vec16fb b) {
+    return a = a ^ b;
+}
+
+
+/*****************************************************************************
+*
+*          Vec8db: Vector of 8 Booleans for use with Vec8d
+*
+*****************************************************************************/
+
+// Boolean companion type for Vec8d. It adds no data of its own: every constructor and
+// assignment below writes the m16 member, which is not declared here and therefore is
+// inherited through the Vec8b base class.
+class Vec8db : public Vec8b {
+public:
+    // Default constructor: leaves the mask uninitialized
+    Vec8db () {
+    }
+    // Constructor to convert from the generic 16-element Boolean vector:
+    Vec8db (Vec16b x) {
+        m16 = x;
+    }
+    // Constructor to build from all elements:
+    Vec8db(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) :
+        Vec8b(x0, x1, x2, x3, x4, x5, x6, x7) {
+    }
+    // Constructor to convert from type __mmask8 used in intrinsics:
+    Vec8db (__mmask8 x) {
+        m16 = x;
+    }
+    // Constructor to build from two halves
+    Vec8db (Vec4db const & x0, Vec4db const & x1) {
+        m16 = Vec8qb(Vec4qb(x0), Vec4qb(x1));
+    }
+    // Assignment operator to convert from type __mmask8 used in intrinsics:
+    Vec8db & operator = (__mmask8 x) {
+        m16 = x;
+        return *this;
+    }
+    // Constructor to broadcast single value:
+    Vec8db(bool b) : Vec8b(b) {}
+    // Assignment operator to broadcast scalar:
+    Vec8db & operator = (bool b) {
+        m16 = Vec8b(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc.
+    // Both are declared but not defined here: an int argument selects these inaccessible
+    // overloads instead of silently converting to bool or __mmask8.
+    Vec8db(int b);
+    Vec8db & operator = (int x);
+public:
+    // Number of elements in the vector
+    static int size () {
+        return 8;
+    }
+};
+
+// Define operators for Vec8db
+
+// vector operator & : element-wise and of two Boolean vectors (done on the Vec16b base type)
+static inline Vec8db operator & (Vec8db a, Vec8db b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs & rhs;
+}
+// vector operator && : same as operator & (both operands are always evaluated)
+static inline Vec8db operator && (Vec8db a, Vec8db b) {
+    Vec8db const both = a & b;
+    return both;
+}
+
+// vector operator | : element-wise or of two Boolean vectors (done on the Vec16b base type)
+static inline Vec8db operator | (Vec8db a, Vec8db b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs | rhs;
+}
+// vector operator || : same as operator | (both operands are always evaluated)
+static inline Vec8db operator || (Vec8db a, Vec8db b) {
+    Vec8db const either = a | b;
+    return either;
+}
+
+// vector operator ^ : element-wise xor of two Boolean vectors (done on the Vec16b base type)
+static inline Vec8db operator ^ (Vec8db a, Vec8db b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs ^ rhs;
+}
+
+// vector operator ~ : invert the mask (done on the Vec16b base type)
+static inline Vec8db operator ~ (Vec8db a) {
+    Vec16b const plain(a);
+    return ~plain;
+}
+
+// vector operator ! : element not; identical to operator ~
+static inline Vec8db operator ! (Vec8db a) {
+    Vec8db const inverted = ~a;
+    return inverted;
+}
+
+// vector operator &= : and b into a, returning a
+static inline Vec8db & operator &= (Vec8db & a, Vec8db b) {
+    return a = a & b;
+}
+
+// vector operator |= : or b into a, returning a
+static inline Vec8db & operator |= (Vec8db & a, Vec8db b) {
+    return a = a | b;
+}
+
+// vector operator ^= : xor b into a, returning a
+static inline Vec8db & operator ^= (Vec8db & a, Vec8db b) {
+    return a = a ^ b;
+}
+
+
+/*****************************************************************************
+*
+*          Vec16f: Vector of 16 single precision floating point values
+*
+*****************************************************************************/
+
+// Wrapper around one __m512 holding 16 single precision floats.
+// Converts implicitly to and from __m512 so it can be passed straight to intrinsics.
+class Vec16f {
+protected:
+    __m512 zmm; // Float vector
+public:
+    // Default constructor: leaves the vector uninitialized
+    Vec16f() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec16f(float f) {
+        zmm = _mm512_set1_ps(f);
+    }
+    // Constructor to build from all elements (f0 is element 0):
+    Vec16f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7,
+    float f8, float f9, float f10, float f11, float f12, float f13, float f14, float f15) {
+        zmm = _mm512_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15);
+    }
+    // Constructor to build from two Vec8f: a0 becomes the low half, a1 the high half.
+    // The insert is done in the double domain (64x4), hence the casts back and forth.
+    Vec16f(Vec8f const & a0, Vec8f const & a1) {
+        zmm = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(_mm512_castps256_ps512(a0)), _mm256_castps_pd(a1), 1));
+    }
+    // Constructor to convert from type __m512 used in intrinsics:
+    Vec16f(__m512 const & x) {
+        zmm = x;
+    }
+    // Assignment operator to convert from type __m512 used in intrinsics:
+    Vec16f & operator = (__m512 const & x) {
+        zmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m512 used in intrinsics
+    operator __m512() const {
+        return zmm;
+    }
+    // Member function to load from array (unaligned)
+    Vec16f & load(float const * p) {
+        zmm = _mm512_loadu_ps(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 64
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 64.
+    Vec16f & load_a(float const * p) {
+        zmm = _mm512_load_ps(p);
+        return *this;
+    }
+    // Member function to store into array (unaligned)
+    void store(float * p) const {
+        _mm512_storeu_ps(p, zmm);
+    }
+    // Member function to store into array, aligned by 64
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 64.
+    void store_a(float * p) const {
+        _mm512_store_ps(p, zmm);
+    }
+    // Partial load. Load n elements and set the rest to 0
+    // The mask has its n lowest bits set.
+    Vec16f & load_partial(int n, float const * p) {
+        zmm = _mm512_maskz_loadu_ps(__mmask16((1 << n) - 1), p);
+        return *this;
+    }
+    // Partial store. Store n elements
+    void store_partial(int n, float * p) const {
+        _mm512_mask_storeu_ps(p, __mmask16((1 << n) - 1), zmm);
+    }
+    // cut off vector to n elements. The last 16-n elements are set to zero
+    Vec16f & cutoff(int n) {
+        zmm = _mm512_maskz_mov_ps(__mmask16((1 << n) - 1), zmm);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    Vec16f const & insert(uint32_t index, float value) {
+        // The masked broadcast used here only exists for integers, so the float's bit
+        // pattern is handed over as an int32_t. A union is used for the reinterpretation
+        // (the same idiom as abs() and sign_combine() in this file) instead of
+        // dereferencing a casted pointer, which violated the strict aliasing rule.
+        union {
+            float   f;
+            int32_t i;
+        } bits;
+        bits.f = value;
+        zmm = _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_castps_si512(zmm), __mmask16(1 << index), bits.i));
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // The index is taken modulo 16.
+    float extract(uint32_t index) const {
+        float a[16];
+        store(a);
+        return a[index & 15];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    float operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec8f:
+    Vec8f get_low() const {
+        return _mm512_castps512_ps256(zmm);
+    }
+    Vec8f get_high() const {
+        return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(zmm),1));
+    }
+    // Number of elements in the vector
+    static int size () {
+        return 16;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec16f
+*
+*****************************************************************************/
+
+// vector operator + : element-wise sum of two vectors
+static inline Vec16f operator + (Vec16f const & a, Vec16f const & b) {
+    __m512 const sum = _mm512_add_ps(a, b);
+    return sum;
+}
+
+// vector operator + : vector plus scalar; the scalar is broadcast to all elements first
+static inline Vec16f operator + (Vec16f const & a, float b) {
+    Vec16f const broadcast(b);
+    return a + broadcast;
+}
+static inline Vec16f operator + (float a, Vec16f const & b) {
+    Vec16f const broadcast(a);
+    return broadcast + b;
+}
+
+// vector operator += : add b to a in place, returning a
+static inline Vec16f & operator += (Vec16f & a, Vec16f const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1.0f to every element, return the value from before the increment
+static inline Vec16f operator ++ (Vec16f & a, int) {
+    Vec16f const previous = a;
+    a = a + 1.0f;
+    return previous;
+}
+
+// prefix operator ++ : add 1.0f to every element, return the updated vector
+static inline Vec16f & operator ++ (Vec16f & a) {
+    return a = a + 1.0f;
+}
+
+// vector operator - : element-wise difference of two vectors
+static inline Vec16f operator - (Vec16f const & a, Vec16f const & b) {
+    __m512 const difference = _mm512_sub_ps(a, b);
+    return difference;
+}
+
+// vector operator - : vector minus scalar / scalar minus vector;
+// the scalar is broadcast to all elements first
+static inline Vec16f operator - (Vec16f const & a, float b) {
+    Vec16f const broadcast(b);
+    return a - broadcast;
+}
+static inline Vec16f operator - (float a, Vec16f const & b) {
+    Vec16f const broadcast(a);
+    return broadcast - b;
+}
+
+// vector operator - : unary minus
+// Flips the sign bit of every element with an integer xor, so it also applies to 0, INF and NAN
+static inline Vec16f operator - (Vec16f const & a) {
+    Vec16i bits = Vec16i(_mm512_castps_si512(a));   // reinterpret as 32-bit integers
+    Vec16i flipped = bits ^ 0x80000000;             // toggle bit 31
+    return _mm512_castsi512_ps(flipped);
+}
+
+// vector operator -= : subtract b from a in place, returning a
+static inline Vec16f & operator -= (Vec16f & a, Vec16f const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1.0f from every element, return the value from before the decrement
+static inline Vec16f operator -- (Vec16f & a, int) {
+    Vec16f const previous = a;
+    a = a - 1.0f;
+    return previous;
+}
+
+// prefix operator -- : subtract 1.0f from every element, return the updated vector
+static inline Vec16f & operator -- (Vec16f & a) {
+    return a = a - 1.0f;
+}
+
+// vector operator * : element-wise product of two vectors
+static inline Vec16f operator * (Vec16f const & a, Vec16f const & b) {
+    __m512 const product = _mm512_mul_ps(a, b);
+    return product;
+}
+
+// vector operator * : vector times scalar; the scalar is broadcast to all elements first
+static inline Vec16f operator * (Vec16f const & a, float b) {
+    Vec16f const broadcast(b);
+    return a * broadcast;
+}
+static inline Vec16f operator * (float a, Vec16f const & b) {
+    Vec16f const broadcast(a);
+    return broadcast * b;
+}
+
+// vector operator *= : multiply a by b in place, returning a
+static inline Vec16f & operator *= (Vec16f & a, Vec16f const & b) {
+    return a = a * b;
+}
+
+// vector operator / : element-wise quotient of two vectors
+static inline Vec16f operator / (Vec16f const & a, Vec16f const & b) {
+    __m512 const quotient = _mm512_div_ps(a, b);
+    return quotient;
+}
+
+// vector operator / : vector divided by scalar / scalar divided by vector;
+// the scalar is broadcast to all elements first
+static inline Vec16f operator / (Vec16f const & a, float b) {
+    Vec16f const broadcast(b);
+    return a / broadcast;
+}
+static inline Vec16f operator / (float a, Vec16f const & b) {
+    Vec16f const broadcast(a);
+    return broadcast / b;
+}
+
+// vector operator /= : divide a by b in place, returning a
+static inline Vec16f & operator /= (Vec16f & a, Vec16f const & b) {
+    return a = a / b;
+}
+
+// vector operator == : returns true for elements for which a == b
+// _CMP_EQ_OQ is comparison predicate 0
+static inline Vec16fb operator == (Vec16f const & a, Vec16f const & b) {
+    return _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+}
+
+// vector operator != : returns true for elements for which a != b
+// _CMP_NEQ_UQ is comparison predicate 4
+static inline Vec16fb operator != (Vec16f const & a, Vec16f const & b) {
+    return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ);
+}
+
+// vector operator < : returns true for elements for which a < b
+// _CMP_LT_OS is comparison predicate 1
+static inline Vec16fb operator < (Vec16f const & a, Vec16f const & b) {
+    return _mm512_cmp_ps_mask(a, b, _CMP_LT_OS);
+}
+
+// vector operator <= : returns true for elements for which a <= b
+// _CMP_LE_OS is comparison predicate 2
+static inline Vec16fb operator <= (Vec16f const & a, Vec16f const & b) {
+    return _mm512_cmp_ps_mask(a, b, _CMP_LE_OS);
+}
+
+// vector operator > : returns true for elements for which a > b
+// Implemented as operator < with the operands swapped
+static inline Vec16fb operator > (Vec16f const & a, Vec16f const & b) {
+    Vec16fb const swapped_less = (b < a);
+    return swapped_less;
+}
+
+// vector operator >= : returns true for elements for which a >= b
+// Implemented as operator <= with the operands swapped
+static inline Vec16fb operator >= (Vec16f const & a, Vec16f const & b) {
+    Vec16fb const swapped_less_equal = (b <= a);
+    return swapped_less_equal;
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and, carried out on the 32-bit integer view of both operands
+static inline Vec16f operator & (Vec16f const & a, Vec16f const & b) {
+    Vec16i a_bits = Vec16i(_mm512_castps_si512(a));
+    Vec16i b_bits = Vec16i(_mm512_castps_si512(b));
+    return _mm512_castsi512_ps(a_bits & b_bits);
+}
+
+// vector operator &= : bitwise and of b into a, returning a
+static inline Vec16f & operator &= (Vec16f & a, Vec16f const & b) {
+    return a = a & b;
+}
+
+// vector operator & : and of Vec16f with Vec16fb.
+// Elements whose mask bit is set are kept; all other elements become zero.
+static inline Vec16f operator & (Vec16f const & a, Vec16fb const & b) {
+    __m512 const kept = _mm512_maskz_mov_ps(b, a);
+    return kept;
+}
+static inline Vec16f operator & (Vec16fb const & a, Vec16f const & b) {
+    Vec16f const kept = b & a;   // same operation with the operands swapped
+    return kept;
+}
+
+// vector operator | : bitwise or, carried out on the 32-bit integer view of both operands
+static inline Vec16f operator | (Vec16f const & a, Vec16f const & b) {
+    Vec16i a_bits = Vec16i(_mm512_castps_si512(a));
+    Vec16i b_bits = Vec16i(_mm512_castps_si512(b));
+    return _mm512_castsi512_ps(a_bits | b_bits);
+}
+
+// vector operator |= : bitwise or of b into a, returning a
+static inline Vec16f & operator |= (Vec16f & a, Vec16f const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor, carried out on the 32-bit integer view of both operands
+static inline Vec16f operator ^ (Vec16f const & a, Vec16f const & b) {
+    Vec16i a_bits = Vec16i(_mm512_castps_si512(a));
+    Vec16i b_bits = Vec16i(_mm512_castps_si512(b));
+    return _mm512_castsi512_ps(a_bits ^ b_bits);
+}
+
+// vector operator ^= : bitwise xor of b into a, returning a
+static inline Vec16f & operator ^= (Vec16f & a, Vec16f const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ! : logical not. Returns a Boolean vector that is true where a == 0.0f
+static inline Vec16fb operator ! (Vec16f const & a) {
+    Vec16f const zero(0.0f);
+    return a == zero;
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec16f
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// s is passed to the intrinsic as the write mask: elements of a overwrite b where the mask is set.
+static inline Vec16f select (Vec16fb const & s, Vec16f const & a, Vec16f const & b) {
+    __m512 const chosen = _mm512_mask_mov_ps(b, s, a);
+    return chosen;
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+// a doubles as the pass-through source for elements whose mask bit is clear.
+static inline Vec16f if_add (Vec16fb const & f, Vec16f const & a, Vec16f const & b) {
+    __m512 const result = _mm512_mask_add_ps(a, f, a, b);
+    return result;
+}
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+// a doubles as the pass-through source for elements whose mask bit is clear.
+static inline Vec16f if_mul (Vec16fb const & f, Vec16f const & a, Vec16f const & b) {
+    __m512 const result = _mm512_mask_mul_ps(a, f, a, b);
+    return result;
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// With the Intel compiler the reduce intrinsic is used; otherwise the two halves are
+// added and the Vec8f version finishes the job.
+static inline float horizontal_add (Vec16f const & a) {
+#if defined(__INTEL_COMPILER)
+    return _mm512_reduce_add_ps(a);
+#else
+    Vec8f const halves_sum = a.get_low() + a.get_high();
+    return horizontal_add(halves_sum);
+#endif
+}
+
+// function max: element-wise a > b ? a : b
+static inline Vec16f max(Vec16f const & a, Vec16f const & b) {
+    __m512 const larger = _mm512_max_ps(a,b);
+    return larger;
+}
+
+// function min: element-wise a < b ? a : b
+static inline Vec16f min(Vec16f const & a, Vec16f const & b) {
+    __m512 const smaller = _mm512_min_ps(a,b);
+    return smaller;
+}
+
+// function abs: absolute value
+// Clears the sign bit by and-ing with 0x7FFFFFFF, so it also applies to -0.0f, -INF and -NAN
+static inline Vec16f abs(Vec16f const & a) {
+    union {
+        int32_t i;
+        float   f;
+    } all_but_sign = {0x7FFFFFFF};
+    Vec16f const mask(all_but_sign.f);
+    return a & mask;
+}
+
+// function sqrt: element-wise square root
+static inline Vec16f sqrt(Vec16f const & a) {
+    __m512 const root = _mm512_sqrt_ps(a);
+    return root;
+}
+
+// function square: element-wise a * a
+static inline Vec16f square(Vec16f const & a) {
+    Vec16f const squared = a * a;
+    return squared;
+}
+
+// pow(Vec16f, n): generic declaration, specialized below for integer exponent types
+template <typename TT> static Vec16f pow(Vec16f const & a, TT n);
+
+// Specialization for a signed integer exponent: forwards to pow_template_i
+template <>
+inline Vec16f pow<int>(Vec16f const & x0, int n) {
+    Vec16f const result = pow_template_i<Vec16f>(x0, n);
+    return result;
+}
+
+// Specialization for an unsigned exponent: n is cast to int and handled like the signed case
+template <>
+inline Vec16f pow<uint32_t>(Vec16f const & x0, uint32_t n) {
+    int const signed_n = (int)n;
+    return pow_template_i<Vec16f>(x0, signed_n);
+}
+
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant.
+// Negative n returns 1 / a^(-n); n == 0 returns 1; n >= 256 falls back to the run-time pow(a, n).
+// For 0 < n < 256 the power is built by repeated squaring, unrolled one bit of n at a time.
+// Since n is a template constant, every test on n below is a compile-time constant.
+template <int n>
+static inline Vec16f pow_n(Vec16f const & a) {
+    if (n < 0)    return Vec16f(1.0f) / pow_n<-n>(a);
+    if (n == 0)   return Vec16f(1.0f);
+    if (n >= 256) return pow(a, n);
+    Vec16f x = a;                      // a^(2^i)
+    Vec16f y;                          // accumulator
+    // y is deliberately left uninitialized: it is first written by plain assignment
+    // when the lowest set bit of n is reached, and only multiplied into after that.
+    const int lowest = n - (n & (n-1));// lowest set bit in n
+    if (n & 1) y = x;
+    if (n < 2) return y;
+    x = x*x;                           // x^2
+    if (n & 2) {
+        if (lowest == 2) y = x; else y *= x;
+    }
+    if (n < 4) return y;
+    x = x*x;                           // x^4
+    if (n & 4) {
+        if (lowest == 4) y = x; else y *= x;
+    }
+    if (n < 8) return y;
+    x = x*x;                           // x^8
+    if (n & 8) {
+        if (lowest == 8) y = x; else y *= x;
+    }
+    if (n < 16) return y;
+    x = x*x;                           // x^16
+    if (n & 16) {
+        if (lowest == 16) y = x; else y *= x;
+    }
+    if (n < 32) return y;
+    x = x*x;                           // x^32
+    if (n & 32) {
+        if (lowest == 32) y = x; else y *= x;
+    }
+    if (n < 64) return y;
+    x = x*x;                           // x^64
+    if (n & 64) {
+        if (lowest == 64) y = x; else y *= x;
+    }
+    if (n < 128) return y;
+    x = x*x;                           // x^128
+    if (n & 128) {
+        if (lowest == 128) y = x; else y *= x;
+    }
+    return y;
+}
+
+// pow(a, Const_int_t<n>()): the exponent is carried in the argument type, so the
+// compile-time pow_n<n> is used
+template <int n>
+static inline Vec16f pow(Vec16f const & a, Const_int_t<n>) {
+    Vec16f const powered = pow_n<n>(a);
+    return powered;
+}
+
+
+// function round: round to nearest integer (even). (result as float vector)
+// _MM_FROUND_TO_NEAREST_INT is rounding control 0
+static inline Vec16f round(Vec16f const & a) {
+    return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT);
+}
+
+// function truncate: round towards zero. (result as float vector)
+// _MM_FROUND_TO_ZERO is rounding control 3
+static inline Vec16f truncate(Vec16f const & a) {
+    return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO);
+}
+
+// function floor: round towards minus infinity. (result as float vector)
+// _MM_FROUND_TO_NEG_INF is rounding control 1
+static inline Vec16f floor(Vec16f const & a) {
+    return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF);
+}
+
+// function ceil: round towards plus infinity. (result as float vector)
+// _MM_FROUND_TO_POS_INF is rounding control 2
+static inline Vec16f ceil(Vec16f const & a) {
+    return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF);
+}
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+// The rounding argument is spelled out: _MM_FROUND_TO_NEAREST_INT (0) combined with
+// _MM_FROUND_NO_EXC gives the same immediate value as _MM_FROUND_NO_EXC alone.
+static inline Vec16i round_to_int(Vec16f const & a) {
+    return _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec16i truncate_to_int(Vec16f const & a) {
+    __m512i const truncated = _mm512_cvtt_roundps_epi32(a, _MM_FROUND_NO_EXC);
+    return truncated;
+}
+
+// function to_float: convert a vector of 32-bit signed integers to a float vector
+static inline Vec16f to_float(Vec16i const & a) {
+    __m512 const as_float = _mm512_cvtepi32_ps(a);
+    return as_float;
+}
+
+
+// Approximate math functions
+
+// approximate reciprocal (Faster than 1.f / a).
+// Relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512;
+// this version uses the AVX512 rcp14 instruction.
+static inline Vec16f approx_recipr(Vec16f const & a) {
+    __m512 const estimate = _mm512_rcp14_ps(a);
+    return estimate;
+}
+
+// approximate reciprocal square root (Faster than 1.f / sqrt(a)).
+// Relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512;
+// this version uses the AVX512 rsqrt14 instruction.
+static inline Vec16f approx_rsqrt(Vec16f const & a) {
+    __m512 const estimate = _mm512_rsqrt14_ps(a);
+    return estimate;
+}
+
+
+// Fused multiply and add functions
+
+// Multiply and add: a * b + c as one fused operation
+static inline Vec16f mul_add(Vec16f const & a, Vec16f const & b, Vec16f const & c) {
+    __m512 const fused = _mm512_fmadd_ps(a, b, c);
+    return fused;
+}
+
+// Multiply and subtract: a * b - c as one fused operation
+static inline Vec16f mul_sub(Vec16f const & a, Vec16f const & b, Vec16f const & c) {
+    __m512 const fused = _mm512_fmsub_ps(a, b, c);
+    return fused;
+}
+
+// Multiply and inverse subtract: c - a * b as one fused operation
+static inline Vec16f nmul_add(Vec16f const & a, Vec16f const & b, Vec16f const & c) {
+    __m512 const fused = _mm512_fnmadd_ps(a, b, c);
+    return fused;
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations.
+// Here this is the same fused multiply-subtract that mul_sub uses.
+static inline Vec16f mul_sub_x(Vec16f const & a, Vec16f const & b, Vec16f const & c) {
+    __m512 const fused = _mm512_fmsub_ps(a, b, c);
+    return fused;
+}
+
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+static inline Vec16i exponent(Vec16f const & a) {
+    Vec16ui raw       = _mm512_castps_si512(a); // bit pattern as unsigned 32-bit integers
+    Vec16ui no_sign   = raw << 1;               // sign bit shifted out at the top
+    Vec16ui biased    = no_sign >> 24;          // logical shift: exponent field now at bit 0
+    Vec16i  unbiased  = Vec16i(biased) - 0x7F;  // remove the bias of 127
+    return unbiased;
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
+static inline Vec16f fraction(Vec16f const & a) {
+#if 1
+    // Mantissa normalized to the interval [1,2), sign forced to zero
+    return _mm512_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero);
+#else
+    // Alternative kept for reference: bit manipulation on the 32-bit integer view.
+    // The temporaries must be 16-element vectors to hold all 512 bits
+    // (they were declared as the 8-element Vec8ui before, which could not compile).
+    Vec16ui t1 = _mm512_castps_si512(a);   // reinterpret as 32-bit integer
+    Vec16ui t2 = (t1 & 0x007FFFFF) | 0x3F800000; // set exponent to 0 + bias
+    return _mm512_castsi512_ps(t2);
+#endif
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+static inline Vec16f exp2(Vec16i const & n) {
+    Vec16i lower_clamped = max(n,  -0x7F);          // not below -127
+    Vec16i clamped       = min(lower_clamped,  0x80);// not above +128
+    Vec16i biased        = clamped + 0x7F;          // add the exponent bias
+    Vec16i float_bits    = biased << 23;            // move into the exponent field
+    return _mm512_castsi512_ps(float_bits);         // reinterpret the bits as float
+}
+//static Vec16f exp2(Vec16f const & x); // defined in vectormath_exp.h
+
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec16f(-0.0f)) gives true, while Vec16f(-0.0f) < Vec16f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb sign_bit(Vec16f const & a) {
+    Vec16i as_int = _mm512_castps_si512(a);    // bit pattern as signed 32-bit integers
+    return Vec16fb(as_int < 0);                // negative integer <=> bit 31 is set
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec16f sign_combine(Vec16f const & a, Vec16f const & b) {
+    union {
+        uint32_t i;
+        float    f;
+    } sign_only = {0x80000000};
+    Vec16f const b_sign = b & Vec16f(sign_only.f);   // keep nothing but the sign bit of b
+    return a ^ b_sign;                               // flip the sign of a where it is set
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_finite(Vec16f const & a) {
+    Vec16i  as_int  = _mm512_castps_si512(a);  // bit pattern as 32-bit integers
+    Vec16i  no_sign = as_int << 1;             // sign bit shifted out
+    Vec16ib exp_not_max = Vec16i(no_sign & 0xFF000000) != 0xFF000000; // exponent field is not all 1s
+    return Vec16fb(exp_not_max);
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_inf(Vec16f const & a) {
+    Vec16i as_int  = _mm512_castps_si512(a); // bit pattern as 32-bit integers
+    Vec16i no_sign = as_int << 1;            // sign bit shifted out
+    return Vec16fb(no_sign == 0xFF000000);   // exponent all 1s and fraction all 0s
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_nan(Vec16f const & a) {
+    Vec16i as_int   = _mm512_castps_si512(a);              // bit pattern as 32-bit integers
+    Vec16i no_sign  = as_int << 1;                         // sign bit shifted out
+    Vec16i exp_mask = 0xFF000000;                          // exponent field after the shift
+    Vec16i exp_bits = no_sign & exp_mask;                  // exponent only
+    Vec16i frac_bits = _mm512_andnot_si512(exp_mask,no_sign);// everything but the exponent
+    return Vec16fb(exp_bits == exp_mask && frac_bits != 0);// exponent all 1s and fraction != 0
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec16fb is_subnormal(Vec16f const & a) {
+    Vec16i as_int   = _mm512_castps_si512(a);              // bit pattern as 32-bit integers
+    Vec16i no_sign  = as_int << 1;                         // sign bit shifted out
+    Vec16i exp_mask = 0xFF000000;                          // exponent field after the shift
+    Vec16i exp_bits = no_sign & exp_mask;                  // exponent only
+    Vec16i frac_bits = _mm512_andnot_si512(exp_mask,no_sign);// everything but the exponent
+    return Vec16fb(exp_bits == 0 && frac_bits != 0);       // exponent = 0 and fraction != 0
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec16fb is_zero_or_subnormal(Vec16f const & a) {
+    Vec16i exp_bits = _mm512_castps_si512(a);     // bit pattern as 32-bit integers
+    exp_bits &= 0x7F800000;                       // keep only the exponent field
+    return Vec16fb(exp_bits == 0);                // true where the exponent field is 0
+}
+
+// Function infinite16f: returns a vector where all elements are +INF
+// (bit pattern 0x7F800000 broadcast to every element)
+static inline Vec16f infinite16f() {
+    union {
+        int32_t i;
+        float   f;
+    } plus_inf = {0x7F800000};
+    Vec16f const all_inf(plus_inf.f);
+    return all_inf;
+}
+
+// Function nan16f: returns a vector where all elements are +NAN (quiet)
+// n is added to the base bit pattern 0x7FC00000
+static inline Vec16f nan16f(int n = 0x10) {
+    union {
+        int32_t i;
+        float   f;
+    } quiet_nan = {0x7FC00000 + n};
+    Vec16f const all_nan(quiet_nan.f);
+    return all_nan;
+}
+
+// change signs on vectors Vec16f
+// Each index i0 - i15 is 1 for changing sign on the corresponding element, 0 for no change
+// (only the lowest bit of each index is used)
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16f change_sign(Vec16f const & a) {
+    // bit k of the mask is the lowest bit of index ik
+    const __mmask16 flip_mask = __mmask16((i0&1) | (i1&1)<<1 | (i2&1)<< 2 | (i3&1)<<3 | (i4&1)<<4 | (i5&1)<<5 | (i6&1)<<6 | (i7&1)<<7
+        | (i8&1)<<8 | (i9&1)<<9 | (i10&1)<<10 | (i11&1)<<11 | (i12&1)<<12 | (i13&1)<<13 | (i14&1)<<14 | (i15&1)<<15);
+    if ((uint16_t)flip_mask == 0) {
+        return a;                       // nothing to flip
+    }
+    // 0x80000000 in the selected elements, 0 elsewhere
+    __m512 sign_bits = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(flip_mask, 0x80000000));
+    return a ^ sign_bits;
+}
+
+
+
+/*****************************************************************************
+*
+*          Vec8d: Vector of 8 double precision floating point values
+*
+*****************************************************************************/
+
+// Vec8d wraps one __m512d register and exposes it as a vector of 8 doubles.
+// Element 0 is the first value passed to the element-wise constructor and the
+// first value written by store().
+class Vec8d {
+protected:
+    __m512d zmm; // double vector
+public:
+    // Default constructor: leaves the vector uninitialized
+    Vec8d() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec8d(double d) {
+        zmm = _mm512_set1_pd(d);
+    }
+    // Constructor to build from all elements (d0 becomes element 0):
+    Vec8d(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7) {
+        zmm = _mm512_setr_pd(d0, d1, d2, d3, d4, d5, d6, d7);
+    }
+    // Constructor to build from two Vec4d: a0 becomes elements 0-3, a1 elements 4-7
+    Vec8d(Vec4d const & a0, Vec4d const & a1) {
+        zmm = _mm512_insertf64x4(_mm512_castpd256_pd512(a0), a1, 1);
+    }
+    // Constructor to convert from type __m512d used in intrinsics:
+    Vec8d(__m512d const & x) {
+        zmm = x;
+    }
+    // Assignment operator to convert from type __m512d used in intrinsics:
+    Vec8d & operator = (__m512d const & x) {
+        zmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m512d used in intrinsics
+    operator __m512d() const {
+        return zmm;
+    }
+    // Member function to load 8 doubles from array (unaligned)
+    Vec8d & load(double const * p) {
+        zmm = _mm512_loadu_pd(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 64
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 64
+    Vec8d & load_a(double const * p) {
+        zmm = _mm512_load_pd(p);
+        return *this;
+    }
+    // Member function to store 8 doubles into array (unaligned)
+    void store(double * p) const {
+        _mm512_storeu_pd(p, zmm);
+    }
+    // Member function to store into array, aligned by 64
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 64
+    void store_a(double * p) const {
+        _mm512_store_pd(p, zmm);
+    }
+    // Partial load. Load n elements and set the rest to 0.
+    // The mask (1<<n)-1 selects the lowest n elements; n is expected to be in 0..8
+    Vec8d & load_partial(int n, double const * p) {
+        zmm = _mm512_maskz_loadu_pd(__mmask8((1<<n)-1), p);
+        return *this;
+    }
+    // Partial store. Store the lowest n elements; n is expected to be in 0..8
+    void store_partial(int n, double * p) const {
+        _mm512_mask_storeu_pd(p, __mmask8((1<<n)-1), zmm);
+    }
+    // cut off vector to n elements. The last 8-n elements are set to zero
+    Vec8d & cutoff(int n) {
+        zmm = _mm512_maskz_mov_pd(__mmask8((1<<n)-1), zmm);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    // index is expected to be in 0..7
+    Vec8d const & insert(uint32_t index, double value) {
+        // A masked scalar broadcast writes 'value' into the selected element only.
+        // This replaces the former *(int64_t*)&value pointer cast, which violated the
+        // strict aliasing rule. The unsigned shift avoids overflowing a signed int.
+        zmm = _mm512_mask_broadcastsd_pd(zmm, __mmask8(1u << index), _mm_set_sd(value));
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // The index is taken modulo 8
+    double extract(uint32_t index) const {
+        double a[8];
+        store(a);
+        return a[index & 7];
+    }
+
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    double operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec4d:
+    // get_low returns elements 0-3, get_high returns elements 4-7
+    Vec4d get_low() const {
+        return _mm512_castpd512_pd256(zmm);
+    }
+    Vec4d get_high() const {
+        return _mm512_extractf64x4_pd(zmm,1);
+    }
+    // Number of elements in the vector
+    static int size () {
+        return 8;
+    }
+};
+
+
+
+/*****************************************************************************
+*
+*          Operators for Vec8d
+*
+*****************************************************************************/
+
+// vector operator + : element-wise sum of two vectors
+static inline Vec8d operator + (Vec8d const & a, Vec8d const & b) {
+    const __m512d sum = _mm512_add_pd(a, b);
+    return sum;
+}
+
+// vector operator + : add vector and scalar
+// The scalar is broadcast into all 8 elements before the addition
+static inline Vec8d operator + (Vec8d const & a, double b) {
+    const Vec8d rhs(b);
+    return a + rhs;
+}
+static inline Vec8d operator + (double a, Vec8d const & b) {
+    const Vec8d lhs(a);
+    return lhs + b;
+}
+
+// vector operator += : add b to a in place and return a reference to a
+static inline Vec8d & operator += (Vec8d & a, Vec8d const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1.0 to every element, return the value before the increment
+static inline Vec8d operator ++ (Vec8d & a, int) {
+    const Vec8d before = a;
+    a = a + 1.0;
+    return before;
+}
+
+// prefix operator ++ : add 1.0 to every element, return the incremented vector
+static inline Vec8d & operator ++ (Vec8d & a) {
+    return a = a + 1.0;
+}
+
+// vector operator - : element-wise difference a - b
+static inline Vec8d operator - (Vec8d const & a, Vec8d const & b) {
+    const __m512d difference = _mm512_sub_pd(a, b);
+    return difference;
+}
+
+// vector operator - : subtract vector and scalar
+// The scalar is broadcast into all 8 elements before the subtraction
+static inline Vec8d operator - (Vec8d const & a, double b) {
+    const Vec8d rhs(b);
+    return a - rhs;
+}
+static inline Vec8d operator - (double a, Vec8d const & b) {
+    const Vec8d lhs(a);
+    return lhs - b;
+}
+
+// vector operator - : unary minus
+// Flips the sign bit of every element, even for 0, INF and NAN
+static inline Vec8d operator - (Vec8d const & a) {
+    const Vec8q bits     = Vec8q(_mm512_castpd_si512(a));   // the doubles seen as 64-bit integers
+    const Vec8q signMask = Vec8q(0x8000000000000000);       // only the sign bit set
+    return _mm512_castsi512_pd(bits ^ signMask);
+}
+
+// vector operator -= : subtract b from a in place and return a reference to a
+static inline Vec8d & operator -= (Vec8d & a, Vec8d const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1.0 from every element, return the value before the decrement
+static inline Vec8d operator -- (Vec8d & a, int) {
+    const Vec8d before = a;
+    a = a - 1.0;
+    return before;
+}
+
+// prefix operator -- : subtract 1.0 from every element, return the decremented vector
+static inline Vec8d & operator -- (Vec8d & a) {
+    return a = a - 1.0;
+}
+
+// vector operator * : element-wise product of two vectors
+static inline Vec8d operator * (Vec8d const & a, Vec8d const & b) {
+    const __m512d product = _mm512_mul_pd(a, b);
+    return product;
+}
+
+// vector operator * : multiply vector and scalar
+// The scalar is broadcast into all 8 elements before the multiplication
+static inline Vec8d operator * (Vec8d const & a, double b) {
+    const Vec8d rhs(b);
+    return a * rhs;
+}
+static inline Vec8d operator * (double a, Vec8d const & b) {
+    const Vec8d lhs(a);
+    return lhs * b;
+}
+
+// vector operator *= : multiply a by b in place and return a reference to a
+static inline Vec8d & operator *= (Vec8d & a, Vec8d const & b) {
+    return a = a * b;
+}
+
+// vector operator / : divide element by element
+static inline Vec8d operator / (Vec8d const & a, Vec8d const & b) {
+    const __m512d quotient = _mm512_div_pd(a, b);
+    return quotient;
+}
+
+// vector operator / : divide vector and scalar
+// The scalar is broadcast into all 8 elements before the division
+static inline Vec8d operator / (Vec8d const & a, double b) {
+    const Vec8d divisor(b);
+    return a / divisor;
+}
+static inline Vec8d operator / (double a, Vec8d const & b) {
+    const Vec8d dividend(a);
+    return dividend / b;
+}
+
+// vector operator /= : divide a by b in place and return a reference to a
+static inline Vec8d & operator /= (Vec8d & a, Vec8d const & b) {
+    return a = a / b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8db operator == (Vec8d const & a, Vec8d const & b) {
+    // _CMP_EQ_OQ is comparison predicate 0
+    return _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8db operator != (Vec8d const & a, Vec8d const & b) {
+    // _CMP_NEQ_UQ is comparison predicate 4
+    return _mm512_cmp_pd_mask(a, b, _CMP_NEQ_UQ);
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8db operator < (Vec8d const & a, Vec8d const & b) {
+    // _CMP_LT_OS is comparison predicate 1
+    return _mm512_cmp_pd_mask(a, b, _CMP_LT_OS);
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec8db operator <= (Vec8d const & a, Vec8d const & b) {
+    // _CMP_LE_OS is comparison predicate 2
+    return _mm512_cmp_pd_mask(a, b, _CMP_LE_OS);
+}
+
+// vector operator > : returns true for elements for which a > b
+// Implemented as b < a with the operands swapped
+static inline Vec8db operator > (Vec8d const & a, Vec8d const & b) {
+    const Vec8db swappedLess = b < a;
+    return swappedLess;
+}
+
+// vector operator >= : returns true for elements for which a >= b
+// Implemented as b <= a with the operands swapped
+static inline Vec8db operator >= (Vec8d const & a, Vec8d const & b) {
+    const Vec8db swappedLessEq = b <= a;
+    return swappedLessEq;
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and of the raw bit patterns
+static inline Vec8d operator & (Vec8d const & a, Vec8d const & b) {
+    const Vec8q lhsBits = Vec8q(_mm512_castpd_si512(a));
+    const Vec8q rhsBits = Vec8q(_mm512_castpd_si512(b));
+    return _mm512_castsi512_pd(lhsBits & rhsBits);
+}
+
+// vector operator &= : bitwise and in place, returns a reference to a
+static inline Vec8d & operator &= (Vec8d & a, Vec8d const & b) {
+    return a = a & b;
+}
+
+// vector operator & : bitwise and of Vec8d and Vec8db
+// Elements whose Boolean is true are kept, all other elements become zero
+static inline Vec8d operator & (Vec8d const & a, Vec8db const & b) {
+    const __m512d kept = _mm512_maskz_mov_pd(b, a);
+    return kept;
+}
+
+// Same operation with the operands in the opposite order
+static inline Vec8d operator & (Vec8db const & a, Vec8d const & b) {
+    const Vec8d kept = b & a;
+    return kept;
+}
+
+// vector operator | : bitwise or of the raw bit patterns
+static inline Vec8d operator | (Vec8d const & a, Vec8d const & b) {
+    const Vec8q lhsBits = Vec8q(_mm512_castpd_si512(a));
+    const Vec8q rhsBits = Vec8q(_mm512_castpd_si512(b));
+    return _mm512_castsi512_pd(lhsBits | rhsBits);
+}
+
+// vector operator |= : bitwise or in place, returns a reference to a
+static inline Vec8d & operator |= (Vec8d & a, Vec8d const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor of the raw bit patterns
+static inline Vec8d operator ^ (Vec8d const & a, Vec8d const & b) {
+    const Vec8q lhsBits = Vec8q(_mm512_castpd_si512(a));
+    const Vec8q rhsBits = Vec8q(_mm512_castpd_si512(b));
+    return _mm512_castsi512_pd(lhsBits ^ rhsBits);
+}
+
+// vector operator ^= : bitwise xor in place, returns a reference to a
+static inline Vec8d & operator ^= (Vec8d & a, Vec8d const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ! : logical not. Returns Boolean vector
+// An element gives true when it compares equal to 0.0
+static inline Vec8db operator ! (Vec8d const & a) {
+    const Vec8d zero(0.0);
+    return a == zero;
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec8d
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8d select (Vec8db const & s, Vec8d const & a, Vec8d const & b) {
+    // Start from b and overwrite with a wherever s is true
+    const __m512d chosen = _mm512_mask_mov_pd (b, s, a);
+    return chosen;
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8d if_add (Vec8db const & f, Vec8d const & a, Vec8d const & b) {
+    // a is the pass-through source for the elements where f is false
+    const __m512d result = _mm512_mask_add_pd(a, f, a, b);
+    return result;
+}
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b) {
+    // a is the pass-through source for the elements where f is false
+    const __m512d result = _mm512_mask_mul_pd(a, f, a, b);
+    return result;
+}
+
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline double horizontal_add (Vec8d const & a) {
+#if defined(__INTEL_COMPILER)
+    return _mm512_reduce_add_pd(a);
+#else
+    // Add the upper four elements onto the lower four, then reduce the Vec4d
+    const Vec4d halfSums = a.get_low() + a.get_high();
+    return horizontal_add(halfSums);
+#endif
+}
+
+// function max: element-wise a > b ? a : b
+static inline Vec8d max(Vec8d const & a, Vec8d const & b) {
+    const __m512d larger = _mm512_max_pd(a,b);
+    return larger;
+}
+
+// function min: element-wise a < b ? a : b
+static inline Vec8d min(Vec8d const & a, Vec8d const & b) {
+    const __m512d smaller = _mm512_min_pd(a,b);
+    return smaller;
+}
+
+// function abs: absolute value
+// Clears the sign bit, even for -0.0, -INF and -NAN
+static inline Vec8d abs(Vec8d const & a) {
+    const Vec8q bits      = Vec8q(_mm512_castpd_si512(a));  // the doubles seen as 64-bit integers
+    const Vec8q magnitude = Vec8q(0x7FFFFFFFFFFFFFFF);      // every bit except the sign bit
+    return _mm512_castsi512_pd(bits & magnitude);
+}
+
+// function sqrt: element-wise square root
+static inline Vec8d sqrt(Vec8d const & a) {
+    const __m512d root = _mm512_sqrt_pd(a);
+    return root;
+}
+
+// function square: multiplies every element by itself
+static inline Vec8d square(Vec8d const & a) {
+    const Vec8d squared = a * a;
+    return squared;
+}
+
+// pow(Vec8d, n): generic declaration. Only the specializations below are defined here
+template <typename TT> static Vec8d pow(Vec8d const & a, TT n);
+
+// Raise floating point numbers to integer power n (n known at run time)
+// The work is delegated to pow_template_i
+template <>
+inline Vec8d pow<int>(Vec8d const & x0, int n) {
+    const int exponent = n;
+    return pow_template_i<Vec8d>(x0, exponent);
+}
+
+// allow conversion from unsigned int: the exponent is cast to int and handled as above
+template <>
+inline Vec8d pow<uint32_t>(Vec8d const & x0, uint32_t n) {
+    const int exponent = (int)n;
+    return pow_template_i<Vec8d>(x0, exponent);
+}
+
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+// Works by repeated squaring: x runs through a^1, a^2, a^4, ... a^128, and every power
+// whose bit is set in n is multiplied into the accumulator y. Because n is a template
+// parameter, all the tests on n below are on a compile-time constant.
+template <int n>
+static inline Vec8d pow_n(Vec8d const & a) {
+    if (n < 0)    return Vec8d(1.0) / pow_n<-n>(a); // negative power: reciprocal of the positive power
+    if (n == 0)   return Vec8d(1.0);                // a^0 = 1
+    if (n >= 256) return pow(a, n);                 // more than 8 bits: use the run-time pow instead
+    Vec8d x = a;                       // a^(2^i)
+    Vec8d y;                           // accumulator; first assigned when the lowest set bit of n is reached
+    const int lowest = n - (n & (n-1));// lowest set bit in n
+    if (n & 1) y = x;
+    if (n < 2) return y;
+    x = x*x;                           // x^2
+    if (n & 2) {
+        // the first contributing power initializes y, later ones are multiplied in
+        if (lowest == 2) y = x; else y *= x;
+    }
+    if (n < 4) return y;
+    x = x*x;                           // x^4
+    if (n & 4) {
+        if (lowest == 4) y = x; else y *= x;
+    }
+    if (n < 8) return y;
+    x = x*x;                           // x^8
+    if (n & 8) {
+        if (lowest == 8) y = x; else y *= x;
+    }
+    if (n < 16) return y;
+    x = x*x;                           // x^16
+    if (n & 16) {
+        if (lowest == 16) y = x; else y *= x;
+    }
+    if (n < 32) return y;
+    x = x*x;                           // x^32
+    if (n & 32) {
+        if (lowest == 32) y = x; else y *= x;
+    }
+    if (n < 64) return y;
+    x = x*x;                           // x^64
+    if (n & 64) {
+        if (lowest == 64) y = x; else y *= x;
+    }
+    if (n < 128) return y;
+    x = x*x;                           // x^128
+    if (n & 128) {
+        if (lowest == 128) y = x; else y *= x;
+    }
+    return y;
+}
+
+// pow with the exponent given as a Const_int_t<n> tag: forwards to pow_n<n>
+template <int n>
+static inline Vec8d pow(Vec8d const & a, Const_int_t<n>) {
+    const Vec8d result = pow_n<n>(a);
+    return result;
+}
+
+
+// function round: round to nearest integer (even). (result as double vector)
+static inline Vec8d round(Vec8d const & a) {
+    // _MM_FROUND_TO_NEAREST_INT is rounding mode 0
+    return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEAREST_INT);
+}
+
+// function truncate: round towards zero. (result as double vector)
+static inline Vec8d truncate(Vec8d const & a) {
+    // _MM_FROUND_TO_ZERO is rounding mode 3
+    return _mm512_roundscale_pd(a, _MM_FROUND_TO_ZERO);
+}
+
+// function floor: round towards minus infinity. (result as double vector)
+static inline Vec8d floor(Vec8d const & a) {
+    // _MM_FROUND_TO_NEG_INF is rounding mode 1
+    return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF);
+}
+
+// function ceil: round towards plus infinity. (result as double vector)
+static inline Vec8d ceil(Vec8d const & a) {
+    // _MM_FROUND_TO_POS_INF is rounding mode 2
+    return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF);
+}
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+// Note: assumes the MXCSR control register is set to round-to-nearest
+static inline Vec8i round_to_int(Vec8d const & a) {
+    const __m256i rounded = _mm512_cvtpd_epi32(a);
+    return rounded;
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+static inline Vec8i truncate_to_int(Vec8d const & a) {
+    const __m256i truncated = _mm512_cvttpd_epi32(a);
+    return truncated;
+}
+
+// function truncate_to_int64: round towards zero. (inefficient)
+// The vector is stored to memory and every element is converted with a scalar cast
+static inline Vec8q truncate_to_int64(Vec8d const & a) {
+    // in 64-bit mode, use __int64 _mm_cvttsd_si64(__m128d a) ?
+    double elem[8];
+    a.store(elem);
+    return Vec8q(int64_t(elem[0]), int64_t(elem[1]), int64_t(elem[2]), int64_t(elem[3]),
+                 int64_t(elem[4]), int64_t(elem[5]), int64_t(elem[6]), int64_t(elem[7]));
+}
+
+// function truncate_to_int64_limited: round towards zero.
+// result as 64-bit integer vector, but with limited range:
+// the conversion goes through 32-bit integers
+static inline Vec8q truncate_to_int64_limited(Vec8d const & a) {
+    Vec4q   narrow = _mm512_cvttpd_epi32(a);               // truncate to eight 32-bit integers (256 bits)
+    // Move each pair of 32-bit results to the start of its own 128-bit lane
+    __m512i spread = permute8q<0,-256,1,-256,2,-256,3,-256>(Vec8q(narrow,narrow));
+    __m512i signs  = _mm512_srai_epi32(spread, 31);        // sign extension bits
+    return           _mm512_unpacklo_epi32(spread, signs); // interleave with sign extensions
+}
+
+// function round_to_int64: round to nearest or even. (inefficient)
+// Rounds in the floating point domain first, then converts element by element
+static inline Vec8q round_to_int64(Vec8d const & a) {
+    const Vec8d rounded = round(a);
+    return truncate_to_int64(rounded);
+}
+
+// function round_to_int64_limited: round to nearest integer (even)
+// result as 64-bit integer vector, but with limited range:
+// the conversion goes through 32-bit integers
+static inline Vec8q round_to_int64_limited(Vec8d const & a) {
+    // Note: assume MXCSR control register is set to rounding
+    Vec4q   narrow = _mm512_cvtpd_epi32(a);                // round to eight 32-bit integers (256 bits)
+    // Move each pair of 32-bit results to the start of its own 128-bit lane
+    __m512i spread = permute8q<0,-256,1,-256,2,-256,3,-256>(Vec8q(narrow,narrow));
+    __m512i signs  = _mm512_srai_epi32(spread, 31);        // sign extension bits
+    return           _mm512_unpacklo_epi32(spread, signs); // interleave with sign extensions
+}
+
+// function to_double: convert 64-bit integer vector elements to double vector (inefficient)
+// The vector is stored to memory and every element is converted with a scalar cast
+static inline Vec8d to_double(Vec8q const & a) {
+    int64_t elem[8];
+    a.store(elem);
+    return Vec8d(double(elem[0]), double(elem[1]), double(elem[2]), double(elem[3]),
+                 double(elem[4]), double(elem[5]), double(elem[6]), double(elem[7]));
+}
+
+// function to_double_limited: convert integer vector elements to double vector
+// limited to abs(x) < 2^31
+static inline Vec8d to_double_limited(Vec8q const & x) {
+    // Gather the low 32 bits of each 64-bit element into the lower half of the vector
+    Vec16i lowWords = permute16i<0,2,4,6,8,10,12,14,-256,-256,-256,-256,-256,-256,-256,-256>(Vec16i(x));
+    // Convert those eight 32-bit integers to double
+    return _mm512_cvtepi32_pd(lowWords.get_low());
+}
+
+// function to_double: convert vector of 8 32-bit integers to double vector
+static inline Vec8d to_double(Vec8i const & a) {
+    const __m512d converted = _mm512_cvtepi32_pd(a);
+    return converted;
+}
+
+// function compress: convert two Vec8d to one Vec16f
+// 'low' supplies elements 0-7 of the result and 'high' elements 8-15
+static inline Vec16f compress (Vec8d const & low, Vec8d const & high) {
+    __m256 lowFloats  = _mm512_cvtpd_ps(low);
+    __m256 highFloats = _mm512_cvtpd_ps(high);
+    return Vec16f(lowFloats, highFloats);
+}
+
+// Function extend_low : convert Vec16f vector elements 0 - 7 to Vec8d
+static inline Vec8d extend_low(Vec16f const & a) {
+    const __m256 lowerHalf = _mm512_castps512_ps256(a);
+    return _mm512_cvtps_pd(lowerHalf);
+}
+
+// Function extend_high : convert Vec16f vector elements 8 - 15 to Vec8d
+static inline Vec8d extend_high (Vec16f const & a) {
+    const __m256 upperHalf = a.get_high();
+    return _mm512_cvtps_pd(upperHalf);
+}
+
+
+// Fused multiply and add functions
+
+// Multiply and add: a * b + c as one fused operation
+static inline Vec8d mul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+    const __m512d fused = _mm512_fmadd_pd(a, b, c);
+    return fused;
+}
+
+// Multiply and subtract: a * b - c as one fused operation
+static inline Vec8d mul_sub(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+    const __m512d fused = _mm512_fmsub_pd(a, b, c);
+    return fused;
+}
+
+// Multiply and inverse subtract: c - a * b as one fused operation
+static inline Vec8d nmul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+    const __m512d fused = _mm512_fnmadd_pd(a, b, c);
+    return fused;
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations.
+// Here this is the same fused a * b - c instruction that mul_sub uses
+static inline Vec8d mul_sub_x(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+    const __m512d fused = _mm512_fmsub_pd(a, b, c);
+    return fused;
+}
+
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+static inline Vec8q exponent(Vec8d const & a) {
+    Vec8uq rawBits  = _mm512_castpd_si512(a);  // reinterpret as 64-bit integer
+    Vec8uq noSign   = rawBits << 1;            // shift out sign bit
+    Vec8uq biased   = noSign >> 53;            // logical shift brings the 11 exponent bits to position 0
+    Vec8q  unbiased = Vec8q(biased) - 0x3FF;   // subtract bias from exponent
+    return unbiased;
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25
+static inline Vec8d fraction(Vec8d const & a) {
+    // Mantissa normalized to the interval [1,2), with the sign bit cleared
+    const __m512d mantissa = _mm512_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero);
+    return mantissa;
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+static inline Vec8d exp2(Vec8q const & n) {
+    Vec8q clampedLow = max(n,  -0x3FF);          // limit to allowed range
+    Vec8q clamped    = min(clampedLow,  0x400);
+    Vec8q biased     = clamped + 0x3FF;          // add bias
+    Vec8q expField   = biased << 52;             // put exponent into position 52
+    return _mm512_castsi512_pd(expField);        // reinterpret as double
+}
+//static Vec8d exp2(Vec8d const & x); // defined in vectormath_exp.h
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec8d(-0.0)) gives true, while Vec8d(-0.0) < Vec8d(0.0) gives false
+static inline Vec8db sign_bit(Vec8d const & a) {
+    // As a signed 64-bit integer the value is negative exactly when the top bit is set
+    Vec8q asInteger = _mm512_castpd_si512(a);
+    return Vec8db(asInteger < 0);
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+static inline Vec8d sign_combine(Vec8d const & a, Vec8d const & b) {
+    union {
+        uint64_t i;
+        double f;
+    } signMask;
+    signMask.i = 0x8000000000000000;  // mask for sign bit
+    // Isolate the sign bit of b and xor it into a
+    return a ^ (b & Vec8d(signMask.f));
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+static inline Vec8db is_finite(Vec8d const & a) {
+    Vec8q  rawBits  = _mm512_castpd_si512(a);        // reinterpret as 64-bit integer
+    Vec8q  noSign   = rawBits << 1;                  // shift out sign bit
+    Vec8q  expMask  = 0xFFE0000000000000;            // exponent mask
+    Vec8qb notAllOnes = Vec8q(noSign & expMask) != expMask; // exponent field is not all 1s
+    return Vec8db(notAllOnes);
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+static inline Vec8db is_inf(Vec8d const & a) {
+    Vec8q rawBits = _mm512_castpd_si512(a);          // reinterpret as 64-bit integer
+    Vec8q noSign  = rawBits << 1;                    // shift out sign bit
+    return Vec8db(noSign == 0xFFE0000000000000);     // exponent is all 1s, fraction is 0
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+static inline Vec8db is_nan(Vec8d const & a) {
+    Vec8q rawBits  = _mm512_castpd_si512(a);               // reinterpret as 64-bit integer
+    Vec8q noSign   = rawBits << 1;                         // shift out sign bit
+    Vec8q expMask  = 0xFFE0000000000000;                   // exponent mask
+    Vec8q expBits  = noSign & expMask;                     // exponent
+    Vec8q fracBits = _mm512_andnot_si512(expMask,noSign);  // fraction
+    return Vec8db(expBits == expMask && fracBits != 0);    // exponent = all 1s and fraction != 0
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for normal numbers, zero, NAN and INF
+static inline Vec8db is_subnormal(Vec8d const & a) {
+    Vec8q rawBits  = _mm512_castpd_si512(a);               // reinterpret as 64-bit integer
+    Vec8q noSign   = rawBits << 1;                         // shift out sign bit
+    Vec8q expMask  = 0xFFE0000000000000;                   // exponent mask
+    Vec8q expBits  = noSign & expMask;                     // exponent
+    Vec8q fracBits = _mm512_andnot_si512(expMask,noSign);  // fraction
+    return Vec8db(expBits == 0 && fracBits != 0);          // exponent = 0 and fraction != 0
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for normal numbers, NAN and INF
+static inline Vec8db is_zero_or_subnormal(Vec8d const & a) {
+    Vec8q expField = _mm512_castpd_si512(a);     // reinterpret as 64-bit integer
+    expField &= 0x7FF0000000000000ll;            // isolate exponent
+    return Vec8db(expField == 0);                // exponent = 0
+}
+
+// Function infinite8d: returns a vector where all 8 elements are +INF
+static inline Vec8d infinite8d() {
+    // 0x7FF0000000000000 has every exponent bit set and a zero fraction.
+    // The union is used to read that bit pattern back as a double.
+    union {
+        uint64_t i;
+        double f;
+    } bits;
+    bits.i = 0x7FF0000000000000;
+    return Vec8d(bits.f);
+}
+
+// Function nan8d: returns a vector where all elements are +NAN (quiet NAN)
+// n is added to the quiet-NAN bit pattern 0x7FF8000000000000, so it ends up in the fraction bits
+static inline Vec8d nan8d(int n = 0x10) {
+    union {
+        uint64_t i;
+        double f;
+    } bits;
+    bits.i = 0x7FF8000000000000 + uint64_t(n);
+    return Vec8d(bits.f);
+}
+
+// change signs on vectors Vec8d
+// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d change_sign(Vec8d const & a) {
+    // Bit k of the mask is set when element k must have its sign flipped
+    const int selected = (i0&1) | (i1&1)<<1 | (i2&1)<<2 | (i3&1)<<3 | (i4&1)<<4 | (i5&1)<<5 | (i6&1)<<6 | (i7&1)<<7;
+    const __mmask8 flip = __mmask8(selected);
+    // Nothing selected: return the input untouched
+    if ((uint8_t)flip == 0) return a;
+    // Vector holding only the sign bit in the selected elements and zero elsewhere
+    __m512d signBits = _mm512_castsi512_pd(_mm512_maskz_set1_epi64(flip, 0x8000000000000000));
+    return a ^ signBits;
+}
+
+
+/*****************************************************************************
+*
+*          Functions for reinterpretation between vector types
+*
+*****************************************************************************/
+
+// AVX512 requires gcc version 4.9 or higher. Apparently the problem with mangling intrinsic vector types no longer exists in gcc 4.x
+
+// reinterpret_i: view any 512-bit vector as an integer vector (__m512i).
+// The bits are not changed; only the type is.
+
+// Integer vector: already the right type
+static inline __m512i reinterpret_i (__m512i const & x) {
+    return x;
+}
+
+// Float vector to integer vector
+static inline __m512i reinterpret_i (__m512  const & x) {
+    const __m512i asInt = _mm512_castps_si512(x);
+    return asInt;
+}
+
+// Double vector to integer vector
+static inline __m512i reinterpret_i (__m512d const & x) {
+    const __m512i asInt = _mm512_castpd_si512(x);
+    return asInt;
+}
+
+// reinterpret_f: view any 512-bit vector as a float vector (__m512).
+// The bits are not changed; only the type is.
+
+// Integer vector to float vector
+static inline __m512  reinterpret_f (__m512i const & x) {
+    const __m512 asFloat = _mm512_castsi512_ps(x);
+    return asFloat;
+}
+
+// Float vector: already the right type
+static inline __m512  reinterpret_f (__m512  const & x) {
+    return x;
+}
+
+// Double vector to float vector
+static inline __m512  reinterpret_f (__m512d const & x) {
+    const __m512 asFloat = _mm512_castpd_ps(x);
+    return asFloat;
+}
+
+// reinterpret_d: view any 512-bit vector as a double vector (__m512d).
+// The bits are not changed; only the type is.
+
+// Integer vector to double vector
+static inline __m512d reinterpret_d (__m512i const & x) {
+    const __m512d asDouble = _mm512_castsi512_pd(x);
+    return asDouble;
+}
+
+// Float vector to double vector
+static inline __m512d reinterpret_d (__m512  const & x) {
+    const __m512d asDouble = _mm512_castps_pd(x);
+    return asDouble;
+}
+
+// Double vector: already the right type
+static inline __m512d reinterpret_d (__m512d const & x) {
+    return x;
+}
+
+/*****************************************************************************
+*
+*          Vector permute functions
+*
+******************************************************************************
+*
+* These permute functions can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to select.
+* An index of -1 will generate zero. An index of -256 means don't care.
+*
+* Example:
+* Vec8d a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+* Vec8d b;
+* b = permute8d<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// Permute vector of 8 doubles.
+// Index -1 gives 0, index -256 means don't care.
+// All decisions below are made on compile-time constants derived from the template
+// indexes; the function picks the cheapest instruction that realizes the pattern.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d permute8d(Vec8d const & a) {
+
+    // Combine indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<< 8 | (i3&7)<<12 | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000);
+    const int m2 = m1 & mz;
+
+    // zeroing needed
+    // (bit 7 is set in -1 but not in -256, so don't-care indexes do not force zeroing)
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;
+
+    // special case: all zero
+    if (mz == 0) return  _mm512_setzero_pd();
+
+    // mask for elements not zeroed
+    const __mmask8  z = __mmask8((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7);
+    // same with 2 bits for each element
+    // (used with the 32-bit shuffle below, where every double occupies two 32-bit slots)
+    const __mmask16 zz = __mmask16((i0>=0?3:0) | (i1>=0?0xC:0) | (i2>=0?0x30:0) | (i3>=0?0xC0:0) | (i4>=0?0x300:0) | (i5>=0?0xC00:0) | (i6>=0?0x3000:0) | (i7>=0?0xC000:0));
+
+    // every non-negative index equals its own position
+    if (((m1 ^ 0x76543210) & mz) == 0) {
+        // no shuffling
+        if (dozero) {
+            // zero some elements
+            return _mm512_maskz_mov_pd(z, a);
+        }
+        return a;                                 // do nothing
+    }
+
+    // bits 1-2 of every index (the 128-bit lane number) match the destination lane
+    if (((m1 ^ 0x66442200) & 0x66666666 & mz) == 0) {
+        // no exchange of data between the four 128-bit lanes
+        // pat: the low index bit (position inside the lane) for the two elements of a lane,
+        // collected from all lanes and replicated to every lane
+        const int pat = ((m2 | m2 >> 8 | m2 >> 16 | m2 >> 24) & 0x11) * 0x01010101;
+        // 32-bit shuffle control: a double is moved as a pair of 32-bit words, so picking
+        // double 0 gives the field value 4 (words 0,1) and double 1 gives 14 (words 2,3)
+        const int pmask = ((pat & 1) * 10 + 4) | ((((pat >> 4) & 1) * 10 + 4) << 4);
+        if (((m1 ^ pat) & mz & 0x11111111) == 0) {
+            // same permute pattern in all lanes
+            if (dozero) {  // permute within lanes and zero
+                return _mm512_castsi512_pd(_mm512_maskz_shuffle_epi32(zz, _mm512_castpd_si512(a), (_MM_PERM_ENUM)pmask));
+            }
+            else {  // permute within lanes
+                return _mm512_castsi512_pd(_mm512_shuffle_epi32(_mm512_castpd_si512(a), (_MM_PERM_ENUM)pmask));
+            }
+        }
+        // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
+    }
+    // first test: the low bit of every index matches the low bit of its position
+    // second test: both elements of a destination lane come from the same source lane
+    if ((((m1 ^ 0x10101010) & 0x11111111 & mz) == 0) 
+    &&  ((m1 ^ (m1 >> 4)) & 0x06060606 & mz & (mz >> 4)) == 0) {
+        // permute lanes only. no permutation within each lane
+        const int m3 = m2 | (m2 >> 4);
+        // s: two bits per destination lane giving the source lane
+        const int s = ((m3 >> 1) & 3) | (((m3 >> 9) & 3) << 2) | (((m3 >> 17) & 3) << 4) | (((m3 >> 25) & 3) << 6);
+        if (dozero) {
+            // permute lanes and zero some 64-bit elements
+            return  _mm512_maskz_shuffle_f64x2(z, a, a, (_MM_PERM_ENUM)s);
+        }
+        else {
+            // permute lanes
+            return _mm512_shuffle_f64x2(a, a, (_MM_PERM_ENUM)s);
+        }
+    }
+    // full permute needed
+    // index vector: each 64-bit index is written as a pair of 32-bit constants (index, 0)
+    const __m512i pmask = constant16i<i0&7, 0, i1&7, 0, i2&7, 0, i3&7, 0, i4&7, 0, i5&7, 0, i6&7, 0, i7&7, 0>();
+    if (dozero) {
+        // full permute and zeroing
+        return _mm512_maskz_permutexvar_pd(z, pmask, a);
+    }
+    else {    
+        return _mm512_permutexvar_pd(pmask, a);
+    }
+}
+
+
+
+// Permute vector of 16 floats.
+// Index -1 gives 0, index -256 means don't care.
+// All decisions below are made on compile-time constants derived from the template
+// indexes; the function picks the cheapest instruction that realizes the pattern.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16f permute16f(Vec16f const & a) {
+
+    // Combine indexes into a single bitfield, with 4 bits for each
+    const uint64_t m1 = (i0&15) | (i1&15)<<4 | (i2&15)<< 8 | (i3&15)<<12 | (i4&15)<<16 | (i5&15)<<20 | (i6&15)<<24 | (i7&15LL)<<28   // 15LL avoids sign extension of (int32_t | int64_t)
+        | (i8&15LL)<<32 | (i9&15LL)<<36 | (i10&15LL)<<40 | (i11&15LL)<<44 | (i12&15LL)<<48 | (i13&15LL)<<52 | (i14&15LL)<<56 | (i15&15LL)<<60;
+
+    // Mask to zero out negative indexes
+    const uint64_t mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000ULL) | (i8<0?0:0xF00000000) 
+        | (i9<0?0:0xF000000000) | (i10<0?0:0xF0000000000) | (i11<0?0:0xF00000000000) | (i12<0?0:0xF000000000000) | (i13<0?0:0xF0000000000000) | (i14<0?0:0xF00000000000000) | (i15<0?0:0xF000000000000000);
+
+    const uint64_t m2 = m1 & mz;
+
+    // zeroing needed
+    // (bit 7 is set in -1 but not in -256, so don't-care indexes do not force zeroing)
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0;
+
+    // special case: all zero
+    if (mz == 0) return  _mm512_setzero_ps();
+
+    // mask for elements not zeroed
+    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7
+        | (i8>=0)<<8 | (i9>=0)<<9 | (i10>=0)<<10 | (i11>=0)<<11 | (i12>=0)<<12 | (i13>=0)<<13 | (i14>=0)<<14 | (i15>=0)<<15);
+
+    // every non-negative index equals its own position
+    if (((m1 ^ 0xFEDCBA9876543210) & mz) == 0) {
+        // no shuffling
+        if (dozero) {
+            // zero some elements
+            return _mm512_maskz_mov_ps(z, a);
+        }
+        return a;                                 // do nothing
+    }
+
+    // bits 2-3 of every index (the 128-bit lane number) match the destination lane
+    if (((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
+        // no exchange of data between the four 128-bit lanes
+        // pat: the two low index bits (position inside the lane) for the four elements of a lane,
+        // collected from all lanes and replicated to every lane
+        const uint64_t pat = ((m2 | (m2 >> 16) | (m2 >> 32) | (m2 >> 48)) & 0x3333) * 0x0001000100010001;
+        // 8-bit shuffle control with 2 bits per element of the lane
+        const int pmask = (pat & 3) | (((pat >> 4) & 3) << 2) | (((pat >> 8) & 3) << 4) | (((pat >> 12) & 3) << 6);
+        if (((m1 ^ pat) & 0x3333333333333333 & mz) == 0) {
+            // same permute pattern in all lanes
+            if (dozero) {  // permute within lanes and zero
+                return _mm512_castsi512_ps(_mm512_maskz_shuffle_epi32(z, _mm512_castps_si512(a), (_MM_PERM_ENUM)pmask));
+            }
+            else {  // permute within lanes
+                return _mm512_castsi512_ps(_mm512_shuffle_epi32(_mm512_castps_si512(a), (_MM_PERM_ENUM)pmask));
+            }
+        }
+        // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
+    }
+    // lane: for each destination lane, the source lane bits (bits 2-3) of its used indexes or'ed together
+    const uint64_t lane = (m2 | m2 >> 4 | m2 >> 8 | m2 >> 12) & 0x000C000C000C000C;
+    // first test: the position inside the lane is unchanged for every used index
+    // second test: all used indexes of a destination lane name the same source lane
+    if ((((m1 ^ 0x3210321032103210) & 0x3333333333333333 & mz) == 0) 
+    &&  ((m1 ^ (lane * 0x1111)) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
+        // permute lanes only. no permutation within each lane
+        // s: two bits per destination lane giving the source lane
+        const uint64_t s = ((lane >> 2) & 3) | (((lane >> 18) & 3) << 2) | (((lane >> 34) & 3) << 4) | (((lane >> 50) & 3) << 6);
+        if (dozero) {
+            // permute lanes and zero some 32-bit elements
+            return  _mm512_maskz_shuffle_f32x4(z, a, a, (_MM_PERM_ENUM)s);
+        }
+        else {
+            // permute lanes
+            return _mm512_shuffle_f32x4(a, a, (_MM_PERM_ENUM)s);
+        }
+    }
+    // full permute needed
+    const __m512i pmask = constant16i<i0&15, i1&15, i2&15, i3&15, i4&15, i5&15, i6&15, i7&15, i8&15, i9&15, i10&15, i11&15, i12&15, i13&15, i14&15, i15&15>();
+    if (dozero) {
+        // full permute and zeroing
+        return _mm512_maskz_permutexvar_ps(z, pmask, a);
+    }
+    else {    
+        return _mm512_permutexvar_ps(pmask, a);
+    }
+}
+
+
+/*****************************************************************************
+*
+*          Vector blend functions
+*
+******************************************************************************
+*
+* These blend functions can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where higher indexes indicate an element from the second source
+* vector. For example, if each vector has 8 elements, then indexes 0 - 7
+* will select an element from the first vector and indexes 8 - 15 will select 
+* an element from the second vector. A negative index will generate zero.
+*
+* Example:
+* Vec8d a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8d b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+* Vec8d c;
+* c = blend8d<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 
+static inline Vec8d blend8d(Vec8d const & a, Vec8d const & b) {  
+    // Compile-time blend of two Vec8d: indexes 0-7 pick elements of a, 8-15 pick elements of b, and an index of -1 gives zero.
+    // Combine indexes into a single bitfield, with 4 bits for each (bit 3 of a nibble is set when that element comes from b)
+    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<< 8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
+
+    // Mask to zero out negative indexes: nibble k is 0xF when index k is non-negative and 0 when it is negative
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000);
+    const int m2 = m1 & mz;     // NOTE(review): m2 is not referenced again in this function
+
+    // zeroing needed: true when at least one index has bit 7 set, which is the case for -1
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;
+
+    // mask for elements not zeroed: bit k is set when index k is non-negative
+    const __mmask8 z = __mmask8((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7);
+
+    // special case: all zero (every index is negative)
+    if (mz == 0) return  _mm512_setzero_pd();
+
+    // special case: all from a (no non-negative index has bit 3 set), so a single-source permute of a is enough
+    if ((m1 & 0x88888888 & mz) == 0) {
+        return permute8d <i0, i1, i2, i3, i4, i5, i6, i7> (a);
+    }
+
+    // special case: all from b (every non-negative index has bit 3 set); i^8 turns 8-15 into the position 0-7 inside b
+    if ((~m1 & 0x88888888 & mz) == 0) {
+        return permute8d <i0^8, i1^8, i2^8, i3^8, i4^8, i5^8, i6^8, i7^8> (b);
+    }
+
+    // special case: blend without permute (the low 3 bits of every non-negative index equal its own position)
+    if (((m1 ^ 0x76543210) & 0x77777777 & mz) == 0) {
+        __mmask8 blendmask = __mmask8((i0&8)>>3 | (i1&8)>>2 | (i2&8)>>1 | (i3&8)>>0 | (i4&8)<<1 | (i5&8)<<2 | (i6&8)<<3 | (i7&8)<<4 );  // bit k = bit 3 of index k
+        __m512d t = _mm512_mask_blend_pd(blendmask, a, b);
+        if (dozero) {
+            t = _mm512_maskz_mov_pd(z, t);
+        }
+        return t;
+    }
+    // special case: all data stay within their lane (bits 1-2 of every non-negative index equal bits 1-2 of its position)
+    if (((m1 ^ 0x66442200) & 0x66666666 & mz) == 0) {
+
+        // mask for elements from a and b: nibble k of mb is 0xF when index k has bit 3 set (element taken from b)
+        const uint32_t mb = ((i0&8)?0xF:0) | ((i1&8)?0xF0:0) | ((i2&8)?0xF00:0) | ((i3&8)?0xF000:0) | ((i4&8)?0xF0000:0) | ((i5&8)?0xF00000:0) | ((i6&8)?0xF000000:0) | ((i7&8)?0xF0000000:0);
+        const uint32_t mbz = mb & mz;     // mask for nonzero elements from b
+        const uint32_t maz = ~mb & mz;    // mask for nonzero elements from a
+        const uint32_t m1a = m1 & maz;
+        const uint32_t m1b = m1 & mbz;
+        const uint32_t pata = ((m1a | m1a >> 8 | m1a >> 16 | m1a >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from a
+        const uint32_t patb = ((m1b | m1b >> 8 | m1b >> 16 | m1b >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from b
+
+        if (((m1 ^ pata) & 0x11111111 & maz) == 0 && ((m1 ^ patb) & 0x11111111 & mbz) == 0) {
+            // Same permute pattern in all lanes:
+            // todo!!: make special case for PSHUFD
+
+            // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
+            // and we are saving 64 bytes of data cache.
+            // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
+            __m512d ta = permute8d< (maz&0xF)?i0&7:-1, (maz&0xF0)?i1&7:-1, (maz&0xF00)?i2&7:-1, (maz&0xF000)?i3&7:-1, 
+                (maz&0xF0000)?i4&7:-1, (maz&0xF00000)?i5&7:-1, (maz&0xF000000)?i6&7:-1, (maz&0xF0000000)?i7&7:-1> (a);
+            // write mask for elements from b: two mask bits per 64-bit element, because the shuffle below works on 32-bit elements
+            const __mmask16 sb = ((mbz&0xF)?3:0) | ((mbz&0xF0)?0xC:0) | ((mbz&0xF00)?0x30:0) | ((mbz&0xF000)?0xC0:0) | ((mbz&0xF0000)?0x300:0) | ((mbz&0xF00000)?0xC00:0) | ((mbz&0xF000000)?0x3000:0) | ((mbz&0xF0000000)?0xC000:0);
+            // permute index for elements from b: nibble value 4 selects 32-bit halves 0,1 (low double), 14 selects halves 2,3 (high double)
+            const int pi = ((patb & 1) * 10 + 4) | ((((patb >> 4) & 1) * 10 + 4) << 4);
+            // 2. Permute elements from b and combine with elements from a through write mask
+            return _mm512_castsi512_pd(_mm512_mask_shuffle_epi32(_mm512_castpd_si512(ta), sb, _mm512_castpd_si512(b), (_MM_PERM_ENUM)pi));
+        }
+        // not same permute pattern in all lanes. use full permute
+    }
+    // general case: full permute. Every index is followed by a 0 argument; NOTE(review): presumably the upper 32 bits of a 64-bit index - confirm against constant16i
+    const __m512i pmask = constant16i<i0&0xF, 0, i1&0xF, 0, i2&0xF, 0, i3&0xF, 0, i4&0xF, 0, i5&0xF, 0, i6&0xF, 0, i7&0xF, 0>();
+    if (dozero) {
+        return _mm512_maskz_permutex2var_pd(z, a, pmask, b);
+    }
+    else {
+        return _mm512_permutex2var_pd(a, pmask, b);
+    }
+}
+
+
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16f blend16f(Vec16f const & a, Vec16f const & b) {  
+    // Compile-time blend of two Vec16f: indexes 0-15 pick elements of a, 16-31 pick elements of b, and an index of -1 gives zero.
+    // Combine indexes into a single bitfield, with 4 bits for each indicating shuffle, but not source
+    const uint64_t m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xFLL)<<28
+        | (i8&0xFLL)<<32 | (i9&0xFLL)<<36 | (i10&0xFLL)<<40 | (i11&0xFLL)<<44 | (i12&0xFLL)<<48 | (i13&0xFLL)<<52 | (i14&0xFLL)<<56 | (i15&0xFLL)<<60;
+
+    // Mask to zero out negative indexes: nibble k is 0xF when index k is non-negative and 0 when it is negative
+    const uint64_t mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000ULL)
+        | (i8<0?0:0xF00000000) | (i9<0?0:0xF000000000) | (i10<0?0:0xF0000000000) | (i11<0?0:0xF00000000000) | (i12<0?0:0xF000000000000) | (i13<0?0:0xF0000000000000) | (i14<0?0:0xF00000000000000) | (i15<0?0:0xF000000000000000);
+    const uint64_t m2 = m1 & mz;     // NOTE(review): m2 is not referenced again in this function
+
+    // collect bit 4 of each index = select source (nibble k of ms is 0xF when index k has bit 4 set, i.e. the element comes from b)
+    const uint64_t ms = ((i0&16)?0xF:0) | ((i1&16)?0xF0:0) | ((i2&16)?0xF00:0) | ((i3&16)?0xF000:0) | ((i4&16)?0xF0000:0) | ((i5&16)?0xF00000:0) | ((i6&16)?0xF000000:0) | ((i7&16)?0xF0000000ULL:0)
+        | ((i8&16)?0xF00000000:0) | ((i9&16)?0xF000000000:0) | ((i10&16)?0xF0000000000:0) | ((i11&16)?0xF00000000000:0) | ((i12&16)?0xF000000000000:0) | ((i13&16)?0xF0000000000000:0) | ((i14&16)?0xF00000000000000:0) | ((i15&16)?0xF000000000000000:0);
+
+    // zeroing needed: true when at least one index has bit 7 set, which is the case for -1
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0;
+
+    // mask for elements not zeroed: bit k is set when index k is non-negative
+    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7 
+        | (i8>=0)<<8 | (i9>=0)<<9 | (i10>=0)<<10 | (i11>=0)<<11 | (i12>=0)<<12 | (i13>=0)<<13 | (i14>=0)<<14 | (i15>=0)<<15);
+
+    // special case: all zero (every index is negative)
+    if (mz == 0) return  _mm512_setzero_ps();
+
+    // special case: all from a (no non-negative index has bit 4 set), so a single-source permute of a is enough
+    if ((ms & mz) == 0) {
+        return permute16f<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a);
+    }
+
+    // special case: all from b (every non-negative index has bit 4 set); i^16 turns 16-31 into the position 0-15 inside b
+    if ((~ms & mz) == 0) {
+        return permute16f<i0^16,i1^16,i2^16,i3^16,i4^16,i5^16,i6^16,i7^16,i8^16,i9^16,i10^16,i11^16,i12^16,i13^16,i14^16,i15^16 > (b);
+    }
+
+    // special case: blend without permute (the low 4 bits of every non-negative index equal its own position)
+    if (((m1 ^ 0xFEDCBA9876543210) & mz) == 0) {
+        __mmask16 blendmask = __mmask16((i0&16)>>4 | (i1&16)>>3 | (i2&16)>>2 | (i3&16)>>1 | (i4&16) | (i5&16)<<1 | (i6&16)<<2 | (i7&16)<<3
+            | (i8&16)<<4 | (i9&16)<<5 | (i10&16)<<6 | (i11&16)<<7 | (i12&16)<<8 | (i13&16)<<9 | (i14&16)<<10 | (i15&16)<<11);  // bit k = bit 4 of index k
+        __m512 t = _mm512_mask_blend_ps(blendmask, a, b);
+        if (dozero) {
+            t = _mm512_maskz_mov_ps(z, t);
+        }
+        return t;
+    }
+
+    // special case: all data stay within their lane (bits 2-3 of every non-negative index equal bits 2-3 of its position)
+    if (((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
+
+        // mask for elements from a and b (same nibble mask as ms: 0xF marks an element taken from b)
+        const uint64_t mb  = ms;
+        const uint64_t mbz = mb & mz;     // mask for nonzero elements from b
+        const uint64_t maz = ~mb & mz;    // mask for nonzero elements from a
+        const uint64_t m1a = m1 & maz;
+        const uint64_t m1b = m1 & mbz;
+        const uint64_t pata = ((m1a | m1a >> 16 | m1a >> 32 | m1a >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from a
+        const uint64_t patb = ((m1b | m1b >> 16 | m1b >> 32 | m1b >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from b
+
+        if (((m1 ^ pata) & 0x3333333333333333 & maz) == 0 && ((m1 ^ patb) & 0x3333333333333333 & mbz) == 0) {
+            // Same permute pattern in all lanes:
+            // todo!!: special case for SHUFPS
+
+            // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
+            // and we are saving 64 bytes of data cache.
+            // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
+            __m512 ta = permute16f< (maz&0xF)?i0&15:-1, (maz&0xF0)?i1&15:-1, (maz&0xF00)?i2&15:-1, (maz&0xF000)?i3&15:-1, 
+                (maz&0xF0000)?i4&15:-1, (maz&0xF00000)?i5&15:-1, (maz&0xF000000)?i6&15:-1, (maz&0xF0000000)?i7&15:-1,
+                (maz&0xF00000000)?i8&15:-1, (maz&0xF000000000)?i9&15:-1, (maz&0xF0000000000)?i10&15:-1, (maz&0xF00000000000)?i11&15:-1, 
+                (maz&0xF000000000000)?i12&15:-1, (maz&0xF0000000000000)?i13&15:-1, (maz&0xF00000000000000)?i14&15:-1, (maz&0xF000000000000000)?i15&15:-1> (a);
+            // write mask for elements from b: bit k is set when element k is a non-negative index into b
+            const __mmask16 sb = ((mbz&0xF)?1:0) | ((mbz&0xF0)?0x2:0) | ((mbz&0xF00)?0x4:0) | ((mbz&0xF000)?0x8:0) | ((mbz&0xF0000)?0x10:0) | ((mbz&0xF00000)?0x20:0) | ((mbz&0xF000000)?0x40:0) | ((mbz&0xF0000000)?0x80:0) 
+                | ((mbz&0xF00000000)?0x100:0) | ((mbz&0xF000000000)?0x200:0) | ((mbz&0xF0000000000)?0x400:0) | ((mbz&0xF00000000000)?0x800:0) | ((mbz&0xF000000000000)?0x1000:0) | ((mbz&0xF0000000000000)?0x2000:0) | ((mbz&0xF00000000000000)?0x4000:0) | ((mbz&0xF000000000000000)?0x8000:0);
+            // permute index for elements from b: two bits per position within a lane, taken from the low 2 bits of the source index
+            const int pi = (patb & 3) | (((patb >> 4) & 3) << 2) | (((patb >> 8) & 3) << 4) | (((patb >> 12) & 3) << 6);
+            // 2. Permute elements from b and combine with elements from a through write mask
+            return _mm512_castsi512_ps(_mm512_mask_shuffle_epi32(_mm512_castps_si512(ta), sb, _mm512_castps_si512(b), (_MM_PERM_ENUM)pi));
+        }
+        // not same permute pattern in all lanes. use full permute
+    }
+
+    // general case: full permute; each index keeps its low 5 bits, where bit 4 selects b in the two-source permute
+    const __m512i pmask = constant16i<i0&0x1F, i1&0x1F, i2&0x1F, i3&0x1F, i4&0x1F, i5&0x1F, i6&0x1F, i7&0x1F, 
+        i8&0x1F, i9&0x1F, i10&0x1F, i11&0x1F, i12&0x1F, i13&0x1F, i14&0x1F, i15&0x1F>();
+    if (dozero) {
+        return _mm512_maskz_permutex2var_ps(z, a, pmask, b);        
+    }
+    else {
+        return _mm512_permutex2var_ps(a, pmask, b);
+    }
+}
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec8q a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+* Vec8d b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8d c;
+* c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+*
+*****************************************************************************/
+
+static inline Vec16f lookup16(Vec16i const & index, Vec16f const & table) {   // table lookup with the 16-element vector 'table' as the table
+    return _mm512_permutexvar_ps(index, table);   // variable permute: element k of the result is the element of table selected by index[k]
+}
+
+template <int n>
+static inline Vec16f lookup(Vec16i const & index, float const * table) {   // look up 16 floats in an array of n floats; n is a compile-time constant
+    if (n <= 0) return 0;                           // no table entries: the result is a Vec16f constructed from 0
+    if (n <= 16) {                                  // the table is handled as one vector
+        Vec16f table1 = Vec16f().load((float*)table);   // NOTE(review): looks like a whole vector is loaded even when n < 16 - confirm the table is padded
+        return lookup16(index, table1);
+    }
+    if (n <= 32) {                                  // the table is handled as two vectors and one two-source permute
+        Vec16f table1 = Vec16f().load((float*)table);
+        Vec16f table2 = Vec16f().load((float*)table + 16);
+        return _mm512_permutex2var_ps(table1, index, table2);
+    }
+    // n > 32. Limit index so that the gather below only uses offsets 0 .. n-1
+    Vec16ui index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec16ui(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1 (the index is treated as unsigned for the comparison)
+        index1 = min(Vec16ui(index), uint32_t(n-1));
+    }
+    return _mm512_i32gather_ps(index1, (const float*)table, 4);   // scale 4 = size of one float in bytes
+}
+
+
+static inline Vec8d lookup8(Vec8q const & index, Vec8d const & table) {   // table lookup with the 8-element vector 'table' as the table
+    return _mm512_permutexvar_pd(index, table);   // variable permute: element k of the result is the element of table selected by index[k]
+}
+
+template <int n>
+static inline Vec8d lookup(Vec8q const & index, double const * table) {   // look up 8 doubles in an array of n doubles; n is a compile-time constant
+    if (n <= 0) return 0;                           // no table entries: the result is a Vec8d constructed from 0
+    if (n <= 8) {                                   // the table is handled as one vector
+        Vec8d table1 = Vec8d().load((double*)table);    // NOTE(review): looks like a whole vector is loaded even when n < 8 - confirm the table is padded
+        return lookup8(index, table1);
+    }
+    if (n <= 16) {                                  // the table is handled as two vectors and one two-source permute
+        Vec8d table1 = Vec8d().load((double*)table);
+        Vec8d table2 = Vec8d().load((double*)table + 8);
+        return _mm512_permutex2var_pd(table1, index, table2);
+    }
+    // n > 16. Limit index so that the gather below only uses offsets 0 .. n-1
+    Vec8uq index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec8uq(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1 (the index is treated as unsigned for the comparison)
+        index1 = min(Vec8uq(index), uint32_t(n-1));
+    }
+    return _mm512_i64gather_pd(index1, (const double*)table, 8);   // scale 8 = size of one double in bytes
+}
+
+
+/*****************************************************************************
+*
+*          Gather functions with fixed indexes
+*
+*****************************************************************************/
+// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15 (compile-time constants; a is read as an array of float)
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
+int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16f gather16f(void const * a) {
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15)>=0> Negative_array_index;  // Error message if index is negative
+    // find smallest and biggest index, using only compile-time constant expressions (pairwise reduction down to imin and imax)
+    const int i01min   = i0  < i1  ? i0  : i1;
+    const int i23min   = i2  < i3  ? i2  : i3;
+    const int i45min   = i4  < i5  ? i4  : i5;
+    const int i67min   = i6  < i7  ? i6  : i7;
+    const int i89min   = i8  < i9  ? i8  : i9;
+    const int i1011min = i10 < i11 ? i10 : i11;
+    const int i1213min = i12 < i13 ? i12 : i13;
+    const int i1415min = i14 < i15 ? i14 : i15;
+    const int i0_3min   = i01min   < i23min    ? i01min   : i23min;
+    const int i4_7min   = i45min   < i67min    ? i45min   : i67min;
+    const int i8_11min  = i89min   < i1011min  ? i89min   : i1011min;
+    const int i12_15min = i1213min < i1415min  ? i1213min : i1415min;
+    const int i0_7min   = i0_3min  < i4_7min   ? i0_3min  : i4_7min;
+    const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
+    const int imin      = i0_7min  < i8_15min  ? i0_7min  : i8_15min;
+    const int i01max   = i0  > i1  ? i0  : i1;
+    const int i23max   = i2  > i3  ? i2  : i3;
+    const int i45max   = i4  > i5  ? i4  : i5;
+    const int i67max   = i6  > i7  ? i6  : i7;
+    const int i89max   = i8  > i9  ? i8  : i9;
+    const int i1011max = i10 > i11 ? i10 : i11;
+    const int i1213max = i12 > i13 ? i12 : i13;
+    const int i1415max = i14 > i15 ? i14 : i15;
+    const int i0_3max   = i01max   > i23max    ? i01max   : i23max;
+    const int i4_7max   = i45max   > i67max    ? i45max   : i67max;
+    const int i8_11max  = i89max   > i1011max  ? i89max   : i1011max;
+    const int i12_15max = i1213max > i1415max  ? i1213max : i1415max;
+    const int i0_7max   = i0_3max  > i4_7max   ? i0_3max  : i4_7max;
+    const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
+    const int imax      = i0_7max  > i8_15max  ? i0_7max  : i8_15max;
+    if (imax - imin <= 15) {
+        // load one contiguous block and permute (all indexes lie inside one window of 16 consecutive elements)
+        if (imax > 15) {
+            // make sure we don't read past the end of the array: the block starts at imax-15, so its last element is index imax
+            Vec16f b = Vec16f().load((float const *)a + imax-15);
+            return permute16f<i0-imax+15, i1-imax+15, i2-imax+15, i3-imax+15, i4-imax+15, i5-imax+15, i6-imax+15, i7-imax+15,
+                i8-imax+15, i9-imax+15, i10-imax+15, i11-imax+15, i12-imax+15, i13-imax+15, i14-imax+15, i15-imax+15> (b);
+        }
+        else {  // NOTE(review): this block starts at imin, so it appears to cover imin .. imin+15, which can extend beyond index imax - confirm
+            Vec16f b = Vec16f().load((float const *)a + imin);
+            return permute16f<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin,
+                i8-imin, i9-imin, i10-imin, i11-imin, i12-imin, i13-imin, i14-imin, i15-imin> (b);
+        }
+    }
+    if ((i0<imin+16  || i0>imax-16)  && (i1<imin+16  || i1>imax-16)  && (i2<imin+16  || i2>imax-16)  && (i3<imin+16  || i3>imax-16)
+    &&  (i4<imin+16  || i4>imax-16)  && (i5<imin+16  || i5>imax-16)  && (i6<imin+16  || i6>imax-16)  && (i7<imin+16  || i7>imax-16)    
+    &&  (i8<imin+16  || i8>imax-16)  && (i9<imin+16  || i9>imax-16)  && (i10<imin+16 || i10>imax-16) && (i11<imin+16 || i11>imax-16)
+    &&  (i12<imin+16 || i12>imax-16) && (i13<imin+16 || i13>imax-16) && (i14<imin+16 || i14>imax-16) && (i15<imin+16 || i15>imax-16) ) {
+        // load two contiguous blocks and blend (every index is within the 16 elements from imin or the 16 elements ending at imax)
+        Vec16f b = Vec16f().load((float const *)a + imin);
+        Vec16f c = Vec16f().load((float const *)a + imax-15);
+        const int j0  = i0 <imin+16 ? i0 -imin : 31-imax+i0;   // blend index: 0-15 addresses block b, 16-31 addresses block c
+        const int j1  = i1 <imin+16 ? i1 -imin : 31-imax+i1;
+        const int j2  = i2 <imin+16 ? i2 -imin : 31-imax+i2;
+        const int j3  = i3 <imin+16 ? i3 -imin : 31-imax+i3;
+        const int j4  = i4 <imin+16 ? i4 -imin : 31-imax+i4;
+        const int j5  = i5 <imin+16 ? i5 -imin : 31-imax+i5;
+        const int j6  = i6 <imin+16 ? i6 -imin : 31-imax+i6;
+        const int j7  = i7 <imin+16 ? i7 -imin : 31-imax+i7;
+        const int j8  = i8 <imin+16 ? i8 -imin : 31-imax+i8;
+        const int j9  = i9 <imin+16 ? i9 -imin : 31-imax+i9;
+        const int j10 = i10<imin+16 ? i10-imin : 31-imax+i10;
+        const int j11 = i11<imin+16 ? i11-imin : 31-imax+i11;
+        const int j12 = i12<imin+16 ? i12-imin : 31-imax+i12;
+        const int j13 = i13<imin+16 ? i13-imin : 31-imax+i13;
+        const int j14 = i14<imin+16 ? i14-imin : 31-imax+i14;
+        const int j15 = i15<imin+16 ? i15-imin : 31-imax+i15;
+        return blend16f<j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15>(b, c);
+    }
+    // use gather instruction (the indexes do not fit one or two contiguous blocks); scale 4 = size of one float in bytes
+    return _mm512_i32gather_ps(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), (const float *)a, 4);
+}
+
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d gather8d(void const * a) {   // Load elements from array a with indices i0 .. i7 (compile-time constants; a is read as an array of double)
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index;  // Error message if index is negative
+    // find smallest and biggest index, using only compile-time constant expressions (pairwise reduction down to imin and imax)
+    const int i01min = i0 < i1 ? i0 : i1;
+    const int i23min = i2 < i3 ? i2 : i3;
+    const int i45min = i4 < i5 ? i4 : i5;
+    const int i67min = i6 < i7 ? i6 : i7;
+    const int i0123min = i01min < i23min ? i01min : i23min;
+    const int i4567min = i45min < i67min ? i45min : i67min;
+    const int imin = i0123min < i4567min ? i0123min : i4567min;
+    const int i01max = i0 > i1 ? i0 : i1;
+    const int i23max = i2 > i3 ? i2 : i3;
+    const int i45max = i4 > i5 ? i4 : i5;
+    const int i67max = i6 > i7 ? i6 : i7;
+    const int i0123max = i01max > i23max ? i01max : i23max;
+    const int i4567max = i45max > i67max ? i45max : i67max;
+    const int imax = i0123max > i4567max ? i0123max : i4567max;
+    if (imax - imin <= 7) {
+        // load one contiguous block and permute (all indexes lie inside one window of 8 consecutive elements)
+        if (imax > 7) {
+            // make sure we don't read past the end of the array: the block starts at imax-7, so its last element is index imax
+            Vec8d b = Vec8d().load((double const *)a + imax-7);
+            return permute8d<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7> (b);
+        }
+        else {  // NOTE(review): this block starts at imin, so it appears to cover imin .. imin+7, which can extend beyond index imax - confirm
+            Vec8d b = Vec8d().load((double const *)a + imin);
+            return permute8d<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin> (b);
+        }
+    }
+    if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
+    &&  (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
+        // load two contiguous blocks and blend (every index is within the 8 elements from imin or the 8 elements ending at imax)
+        Vec8d b = Vec8d().load((double const *)a + imin);
+        Vec8d c = Vec8d().load((double const *)a + imax-7);
+        const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;   // blend index: 0-7 addresses block b, 8-15 addresses block c
+        const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
+        const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
+        const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
+        const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
+        const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
+        const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
+        const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
+        return blend8d<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+    // use gather instruction (the indexes do not fit one or two contiguous blocks); scale 8 = size of one double in bytes
+    return _mm512_i64gather_pd(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), (const double *)a, 8);
+}
+
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false. The work is done by the Vec16ib overload.
+static inline int horizontal_find_first(Vec16fb const & x) {
+    return horizontal_find_first(Vec16ib(x));   // convert the float Boolean vector to Vec16ib and forward
+}
+
+static inline int horizontal_find_first(Vec8db const & x) {   // same search for a Vec8db Boolean vector
+    return horizontal_find_first(Vec8qb(x));   // convert the double Boolean vector to Vec8qb and forward to that overload
+}
+
+// Count the number of elements that are true. The work is done by the Vec16ib overload.
+static inline uint32_t horizontal_count(Vec16fb const & x) {
+    return horizontal_count(Vec16ib(x));   // convert the float Boolean vector to Vec16ib and forward
+}
+
+static inline uint32_t horizontal_count(Vec8db const & x) {   // same count for a Vec8db Boolean vector
+    return horizontal_count(Vec8qb(x));   // convert the double Boolean vector to Vec8qb and forward to that overload
+}
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield (16 elements, returned in a uint16_t)
+static inline uint16_t to_bits(Vec16fb x) {
+    return to_bits(Vec16ib(x));   // forward to the Vec16ib overload
+}
+
+// to_Vec16fb: convert integer bitfield to boolean vector (inverse direction of to_bits for Vec16fb)
+static inline Vec16fb to_Vec16fb(uint16_t x) {
+    return Vec16fb(to_Vec16ib(x));   // build the integer Boolean vector first, then convert it to Vec16fb
+}
+
+// to_bits: convert boolean vector to integer bitfield (8 elements, returned in a uint8_t)
+static inline uint8_t to_bits(Vec8db x) {
+    return to_bits(Vec8qb(x));   // forward to the Vec8qb overload
+}
+
+// to_Vec8db: convert integer bitfield to boolean vector (inverse direction of to_bits for Vec8db)
+static inline Vec8db to_Vec8db(uint8_t x) {
+    return Vec8db(to_Vec8qb(x));   // build the integer Boolean vector first, then convert it to Vec8db
+}
+
+#endif // VECTORF512_H
diff --git a/vectorclass/vectorf512e.h b/vectorclass/vectorf512e.h
new file mode 100755
index 0000000..a0077b3
--- /dev/null
+++ b/vectorclass/vectorf512e.h
@@ -0,0 +1,2127 @@
+/****************************  vectorf512e.h  *******************************
+* Author:        Agner Fog
+* Date created:  2014-07-23
+* Last modified: 2014-10-22
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining floating point vector classes as interface to intrinsic 
+* functions in x86 microprocessors with AVX512 and later instruction sets.
+*
+* Instructions:
+* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired 
+* instruction set, which must be at least AVX512F. 
+*
+* The following vector classes are defined here:
+* Vec16f    Vector of  16  single precision floating point numbers
+* Vec16fb   Vector of  16  Booleans for use with Vec16f
+* Vec8d     Vector of   8  double precision floating point numbers
+* Vec8db    Vector of   8  Booleans for use with Vec8d
+*
+* Each vector object is represented internally as two 256-bit halves (low and high).
+* This header file defines operators and functions for these vectors.
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+
+// check combination of header files
+#if defined (VECTORF512_H)
+#if    VECTORF512_H != 1
+#error Two different versions of vectorf512.h included
+#endif
+#else
+#define VECTORF512_H 1
+
+#include "vectori512e.h"
+
+
+/*****************************************************************************
+*
+*          Vec16fb: Vector of 16 Booleans for use with Vec16f
+*
+*****************************************************************************/
+class Vec16fb : public Vec16b {   // the halves z0 and z1 used below are not declared in this class, so they come from the Vec16b base
+public:
+    // Default constructor: assigns no value to the elements
+    Vec16fb () {
+    }
+    // Constructor to build from all elements (forwarded unchanged to the Vec16b constructor):
+    Vec16fb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
+        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) :
+        Vec16b(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {
+    }
+    // Constructor from Vec16b: copies the low half into z0 and the high half into z1
+    Vec16fb (Vec16b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Constructor from two Vec8fb: x0 becomes the low half (z0), x1 the high half (z1)
+    Vec16fb (Vec8fb const & x0, Vec8fb const & x1) {
+        z0 = x0;
+        z1 = x1;
+    }
+    // Constructor to broadcast scalar value:
+    Vec16fb(bool b) : Vec16b(b) {
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec16fb & operator = (bool b) {
+        *this = Vec16b(b);   // NOTE(review): presumably converted through the Vec16b constructor above and then copy-assigned - confirm
+        return *this;
+    }
+private: // Prevent constructing from int, etc. (these two members are only declared here, with no body)
+    Vec16fb(int b);
+    Vec16fb & operator = (int x);
+public:
+
+    // Get low and high half: z0 / z1 are viewed as Vec8i and passed through reinterpret_f to give a Vec8fb
+    Vec8fb get_low() const {
+        return reinterpret_f(Vec8i(z0));
+    }
+    Vec8fb get_high() const {
+        return reinterpret_f(Vec8i(z1));
+    }
+};
+
+// Define operators for Vec16fb
+
+// vector operator & : bitwise and (the low halves and the high halves are combined separately)
+static inline Vec16fb operator & (Vec16fb const & a, Vec16fb const & b) {
+    return Vec16fb(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec16fb operator && (Vec16fb const & a, Vec16fb const & b) {   // logical and, defined as the bitwise and
+    return a & b;   // an overloaded && always evaluates both operands (no short-circuit)
+}
+
+// vector operator | : bitwise or (the low halves and the high halves are combined separately)
+static inline Vec16fb operator | (Vec16fb const & a, Vec16fb const & b) {
+    return Vec16fb(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec16fb operator || (Vec16fb const & a, Vec16fb const & b) {   // logical or, defined as the bitwise or
+    return a | b;   // an overloaded || always evaluates both operands (no short-circuit)
+}
+
+// vector operator ^ : bitwise xor (the low halves and the high halves are combined separately)
+static inline Vec16fb operator ^ (Vec16fb const & a, Vec16fb const & b) {
+    return Vec16fb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not (applied to the low half and the high half separately)
+static inline Vec16fb operator ~ (Vec16fb const & a) {
+    return Vec16fb(~a.get_low(), ~a.get_high());
+}
+
+// vector operator ! : element not (implemented as the bitwise not of the whole vector)
+static inline Vec16fb operator ! (Vec16fb const & a) {
+    return ~a;
+}
+
+// vector operator &= : bitwise and, stored back into the left operand
+static inline Vec16fb & operator &= (Vec16fb & a, Vec16fb const & b) {
+    // the assignment expression itself yields the reference to a that is returned
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, stored back into the left operand
+static inline Vec16fb & operator |= (Vec16fb & a, Vec16fb const & b) {
+    // the assignment expression itself yields the reference to a that is returned
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, stored back into the left operand
+static inline Vec16fb & operator ^= (Vec16fb & a, Vec16fb const & b) {
+    // the assignment expression itself yields the reference to a that is returned
+    return a = a ^ b;
+}
+
+
+/*****************************************************************************
+*
+*          Vec8db: Vector of 8 Booleans for use with Vec8d
+*
+*****************************************************************************/
+
+class Vec8db : public Vec512b {   // the halves z0 and z1 used below are not declared in this class, so they come from the Vec512b base
+public:
+    // Default constructor: assigns no value to the elements
+    Vec8db () {
+    }
+    // Constructor to build from all elements: x0-x3 go into the low half, x4-x7 into the high half
+    Vec8db(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) {
+        z0 = Vec4qb(x0, x1, x2, x3);
+        z1 = Vec4qb(x4, x5, x6, x7);
+    }
+    // Construct from Vec512b: copies the low half into z0 and the high half into z1
+    Vec8db (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Constructor from two Vec4db: x0 becomes the low half (z0), x1 the high half (z1)
+    Vec8db (Vec4db const & x0, Vec4db const & x1) {
+        z0 = x0;
+        z1 = x1;
+    }
+    // Constructor to broadcast single value:
+    Vec8db(bool b) {
+        z0 = z1 = Vec8i(-int32_t(b));   // -1 (all bits set) for true, 0 for false, broadcast into both halves
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec8db & operator = (bool b) {
+        *this = Vec8db(b);
+        return *this;
+    }
+private: 
+    // Prevent constructing from int, etc. because of ambiguity
+    Vec8db(int b);
+    // Prevent assigning int because of ambiguity
+    Vec8db & operator = (int x);
+public:
+    Vec8db & insert (int index, bool a) {   // Member function to change a single element; true is stored as -1, false as 0
+        if (index < 4) {
+            z0 = Vec4q(z0).insert(index, -(int64_t)a);
+        }
+        else {
+            z1 = Vec4q(z1).insert(index-4, -(int64_t)a);   // elements 4-7 live in the high half at positions 0-3
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector (any nonzero 64-bit element reads as true)
+    bool extract(uint32_t index) const {
+        if (index < 4) {
+            return Vec4q(z0).extract(index) != 0;
+        }
+        else {
+            return Vec4q(z1).extract(index-4) != 0;
+        }
+    }
+    // Extract a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Get low and high half: z0 / z1 are viewed as Vec4q and passed through reinterpret_d to give a Vec4db
+    Vec4db get_low() const {
+        return reinterpret_d(Vec4q(z0));
+    }
+    Vec4db get_high() const {
+        return reinterpret_d(Vec4q(z1));
+    }
+    static int size () {   // number of elements in the vector
+        return 8;
+    }
+};
+
+// Define operators for Vec8db
+
+// vector operator & and && : and, applied to each half separately
+static inline Vec8db operator & (Vec8db const & a, Vec8db const & b) {
+    Vec4db const lo = a.get_low()  & b.get_low();
+    Vec4db const hi = a.get_high() & b.get_high();
+    return Vec8db(lo, hi);
+}
+static inline Vec8db operator && (Vec8db const & a, Vec8db const & b) {
+    return Vec8db(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+
+// vector operator | and || : or, applied to each half separately
+static inline Vec8db operator | (Vec8db const & a, Vec8db const & b) {
+    Vec4db const lo = a.get_low()  | b.get_low();
+    Vec4db const hi = a.get_high() | b.get_high();
+    return Vec8db(lo, hi);
+}
+static inline Vec8db operator || (Vec8db const & a, Vec8db const & b) {
+    return Vec8db(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+
+// vector operator ^ : exclusive or, applied to each half separately
+static inline Vec8db operator ^ (Vec8db const & a, Vec8db const & b) {
+    Vec4db const lo = a.get_low()  ^ b.get_low();
+    Vec4db const hi = a.get_high() ^ b.get_high();
+    return Vec8db(lo, hi);
+}
+
+// vector operator ~ : complement, applied to each half separately
+static inline Vec8db operator ~ (Vec8db const & a) {
+    Vec4db const lo = ~a.get_low();
+    Vec4db const hi = ~a.get_high();
+    return Vec8db(lo, hi);
+}
+
+// vector operator ! : element not (same result as operator ~)
+static inline Vec8db operator ! (Vec8db const & a) {
+    return Vec8db(~a.get_low(), ~a.get_high());
+}
+
+// vector operator &= : and a with b in place, returns a
+static inline Vec8db & operator &= (Vec8db & a, Vec8db const & b) {
+    return a = a & b;
+}
+
+// vector operator |= : or a with b in place, returns a
+static inline Vec8db & operator |= (Vec8db & a, Vec8db const & b) {
+    return a = a | b;
+}
+
+// vector operator ^= : xor a with b in place, returns a
+static inline Vec8db & operator ^= (Vec8db & a, Vec8db const & b) {
+    return a = a ^ b;
+}
+
+
+/*****************************************************************************
+*
+*          Vec16f: Vector of 16 single precision floating point values
+*
+*****************************************************************************/
+
+class Vec16f {
+protected:
+    Vec8f z0;                          // elements 0 - 7
+    Vec8f z1;                          // elements 8 - 15
+public:
+    // Default constructor: performs no explicit initialization
+    Vec16f() {
+    }
+    // Broadcast constructor: every element gets the value f
+    Vec16f(float f) : z0(f), z1(f) {
+    }
+    // Element-wise constructor: f0 - f7 fill the low half, f8 - f15 the high half
+    Vec16f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7,
+    float f8, float f9, float f10, float f11, float f12, float f13, float f14, float f15)
+    : z0(f0, f1, f2, f3, f4, f5, f6, f7), z1(f8, f9, f10, f11, f12, f13, f14, f15) {
+    }
+    // Constructor from two Vec8f: a0 becomes the low half, a1 the high half
+    Vec16f(Vec8f const & a0, Vec8f const & a1) : z0(a0), z1(a1) {
+    }
+    // Low half (elements 0 - 7)
+    Vec8f get_low() const {
+        return z0;
+    }
+    // High half (elements 8 - 15)
+    Vec8f get_high() const {
+        return z1;
+    }
+    // Load 16 floats from p (no alignment requirement)
+    Vec16f & load(float const * p) {
+        z0.load(p);
+        z1.load(p + 8);
+        return *this;
+    }
+    // Load 16 floats from p, aligned by 64.
+    // Use only if you are certain that p points to an address divisible by 64.
+    Vec16f & load_a(float const * p) {
+        z0.load_a(p);
+        z1.load_a(p + 8);
+        return *this;
+    }
+    // Store 16 floats to p (no alignment requirement)
+    void store(float * p) const {
+        z0.store(p);
+        z1.store(p + 8);
+    }
+    // Store 16 floats to p, aligned by 64.
+    // Use only if you are certain that p points to an address divisible by 64.
+    void store_a(float * p) const {
+        z0.store_a(p);
+        z1.store_a(p + 8);
+    }
+    // Partial load. Load n elements and set the rest to 0
+    Vec16f & load_partial(int n, float const * p) {
+        if (n >= 8) {
+            // low half is complete, the remainder goes to the high half
+            z0.load(p);
+            z1.load_partial(n - 8, p + 8);
+            return *this;
+        }
+        z0.load_partial(n, p);
+        z1 = Vec8f(0.f);
+        return *this;
+    }
+    // Partial store. Store n elements
+    void store_partial(int n, float * p) const {
+        if (n >= 8) {
+            z0.store(p);
+            z1.store_partial(n - 8, p + 8);
+            return;
+        }
+        z0.store_partial(n, p);
+    }
+    // Cut off vector to n elements. The last 16-n elements are set to zero
+    Vec16f & cutoff(int n) {
+        if (n >= 8) {
+            z1.cutoff(n - 8);
+            return *this;
+        }
+        z0.cutoff(n);
+        z1 = Vec8f(0.f);
+        return *this;
+    }
+    // Member function to change a single element in vector.
+    // Indexes below 8 address the low half, all others the high half.
+    Vec16f const & insert(uint32_t index, float value) {
+        if (index >= 8) {
+            z1.insert(index - 8, value);
+        }
+        else {
+            z0.insert(index, value);
+        }
+        return *this;
+    }
+    // Member function to read a single element; only the low 4 bits of index are used
+    float extract(uint32_t index) const {
+        float lanes[16];
+        store(lanes);
+        return lanes[index & 15];
+    }
+    // Read-only element access; forwards to extract.
+    // Use the store function if extracting more than one element.
+    float operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Number of elements
+    static int size () {
+        return 16;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Operators for Vec16f
+*
+*****************************************************************************/
+
+// vector operator + : element-wise sum, computed on each half separately
+static inline Vec16f operator + (Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = a.get_low()  + b.get_low();
+    Vec8f const hi = a.get_high() + b.get_high();
+    return Vec16f(lo, hi);
+}
+
+// vector operator + : the scalar operand is broadcast to a vector before adding
+static inline Vec16f operator + (Vec16f const & a, float b) {
+    Vec16f const rhs(b);
+    return a + rhs;
+}
+static inline Vec16f operator + (float a, Vec16f const & b) {
+    Vec16f const lhs(a);
+    return lhs + b;
+}
+
+// vector operator += : add b to a in place, returns a
+static inline Vec16f & operator += (Vec16f & a, Vec16f const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : adds 1.0f to every element, returns the value before the increment
+static inline Vec16f operator ++ (Vec16f & a, int) {
+    Vec16f before(a);
+    a = a + 1.0f;
+    return before;
+}
+
+// prefix operator ++ : adds 1.0f to every element, returns the incremented vector
+static inline Vec16f & operator ++ (Vec16f & a) {
+    return a = a + 1.0f;
+}
+
+// vector operator - : element-wise difference, computed on each half separately
+static inline Vec16f operator - (Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = a.get_low()  - b.get_low();
+    Vec8f const hi = a.get_high() - b.get_high();
+    return Vec16f(lo, hi);
+}
+
+// vector operator - : the scalar operand is broadcast to a vector before subtracting
+static inline Vec16f operator - (Vec16f const & a, float b) {
+    Vec16f const rhs(b);
+    return a - rhs;
+}
+static inline Vec16f operator - (float a, Vec16f const & b) {
+    Vec16f const lhs(a);
+    return lhs - b;
+}
+
+// vector operator - : unary minus, applied to each half separately
+// Change sign bit, even for 0, INF and NAN
+static inline Vec16f operator - (Vec16f const & a) {
+    Vec8f const lo = -a.get_low();
+    Vec8f const hi = -a.get_high();
+    return Vec16f(lo, hi);
+}
+
+// vector operator -= : subtract b from a in place, returns a
+static inline Vec16f & operator -= (Vec16f & a, Vec16f const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtracts 1.0f from every element, returns the value before the decrement
+static inline Vec16f operator -- (Vec16f & a, int) {
+    Vec16f before(a);
+    a = a - 1.0f;
+    return before;
+}
+
+// prefix operator -- : subtracts 1.0f from every element, returns the decremented vector
+static inline Vec16f & operator -- (Vec16f & a) {
+    return a = a - 1.0f;
+}
+
+// vector operator * : element-wise product, computed on each half separately
+static inline Vec16f operator * (Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = a.get_low()  * b.get_low();
+    Vec8f const hi = a.get_high() * b.get_high();
+    return Vec16f(lo, hi);
+}
+
+// vector operator * : the scalar operand is broadcast to a vector before multiplying
+static inline Vec16f operator * (Vec16f const & a, float b) {
+    Vec16f const rhs(b);
+    return a * rhs;
+}
+static inline Vec16f operator * (float a, Vec16f const & b) {
+    Vec16f const lhs(a);
+    return lhs * b;
+}
+
+// vector operator *= : multiply a by b in place, returns a
+static inline Vec16f & operator *= (Vec16f & a, Vec16f const & b) {
+    return a = a * b;
+}
+
+// vector operator / : element-wise quotient, computed on each half separately
+static inline Vec16f operator / (Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = a.get_low()  / b.get_low();
+    Vec8f const hi = a.get_high() / b.get_high();
+    return Vec16f(lo, hi);
+}
+
+// vector operator / : the scalar operand is broadcast to a vector before dividing
+static inline Vec16f operator / (Vec16f const & a, float b) {
+    Vec16f const rhs(b);
+    return a / rhs;
+}
+static inline Vec16f operator / (float a, Vec16f const & b) {
+    Vec16f const lhs(a);
+    return lhs / b;
+}
+
+// vector operator /= : divide a by b in place, returns a
+static inline Vec16f & operator /= (Vec16f & a, Vec16f const & b) {
+    return a = a / b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16fb operator == (Vec16f const & a, Vec16f const & b) {
+    Vec8fb const lo = a.get_low()  == b.get_low();
+    Vec8fb const hi = a.get_high() == b.get_high();
+    return Vec16fb(lo, hi);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16fb operator != (Vec16f const & a, Vec16f const & b) {
+    Vec8fb const lo = a.get_low()  != b.get_low();
+    Vec8fb const hi = a.get_high() != b.get_high();
+    return Vec16fb(lo, hi);
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec16fb operator < (Vec16f const & a, Vec16f const & b) {
+    Vec8fb const lo = a.get_low()  < b.get_low();
+    Vec8fb const hi = a.get_high() < b.get_high();
+    return Vec16fb(lo, hi);
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec16fb operator <= (Vec16f const & a, Vec16f const & b) {
+    Vec8fb const lo = a.get_low()  <= b.get_low();
+    Vec8fb const hi = a.get_high() <= b.get_high();
+    return Vec16fb(lo, hi);
+}
+
+// vector operator > : returns true for elements for which a > b
+// Evaluated as b < a on each half
+static inline Vec16fb operator > (Vec16f const & a, Vec16f const & b) {
+    return Vec16fb(b.get_low() < a.get_low(), b.get_high() < a.get_high());
+}
+
+// vector operator >= : returns true for elements for which a >= b
+// Evaluated as b <= a on each half
+static inline Vec16fb operator >= (Vec16f const & a, Vec16f const & b) {
+    return Vec16fb(b.get_low() <= a.get_low(), b.get_high() <= a.get_high());
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and, computed on each half separately
+static inline Vec16f operator & (Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = a.get_low()  & b.get_low();
+    Vec8f const hi = a.get_high() & b.get_high();
+    return Vec16f(lo, hi);
+}
+
+// vector operator &= : bitwise and of a with b in place, returns a
+static inline Vec16f & operator &= (Vec16f & a, Vec16f const & b) {
+    return a = a & b;
+}
+
+// vector operator & : bitwise and of Vec16f and Vec16fb, in either operand order
+static inline Vec16f operator & (Vec16f const & a, Vec16fb const & b) {
+    Vec8f const lo = a.get_low()  & b.get_low();
+    Vec8f const hi = a.get_high() & b.get_high();
+    return Vec16f(lo, hi);
+}
+static inline Vec16f operator & (Vec16fb const & a, Vec16f const & b) {
+    return Vec16f(b.get_low() & a.get_low(), b.get_high() & a.get_high());
+}
+
+// vector operator | : bitwise or, computed on each half separately
+static inline Vec16f operator | (Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = a.get_low()  | b.get_low();
+    Vec8f const hi = a.get_high() | b.get_high();
+    return Vec16f(lo, hi);
+}
+
+// vector operator |= : bitwise or of a with b in place, returns a
+static inline Vec16f & operator |= (Vec16f & a, Vec16f const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor, computed on each half separately
+static inline Vec16f operator ^ (Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = a.get_low()  ^ b.get_low();
+    Vec8f const hi = a.get_high() ^ b.get_high();
+    return Vec16f(lo, hi);
+}
+
+// vector operator ^= : bitwise xor of a with b in place, returns a
+static inline Vec16f & operator ^= (Vec16f & a, Vec16f const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ! : logical not of each half. Returns Boolean vector
+static inline Vec16fb operator ! (Vec16f const & a) {
+    Vec8fb const lo = !a.get_low();
+    Vec8fb const hi = !a.get_high();
+    return Vec16fb(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec16f
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
+static inline Vec16f select (Vec16fb const & s, Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = select(s.get_low(),  a.get_low(),  b.get_low());
+    Vec8f const hi = select(s.get_high(), a.get_high(), b.get_high());
+    return Vec16f(lo, hi);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+// Computed on each half separately
+static inline Vec16f if_add (Vec16fb const & f, Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = if_add(f.get_low(),  a.get_low(),  b.get_low());
+    Vec8f const hi = if_add(f.get_high(), a.get_high(), b.get_high());
+    return Vec16f(lo, hi);
+}
+
+// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+// Computed on each half separately
+static inline Vec16f if_mul (Vec16fb const & f, Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = if_mul(f.get_low(),  a.get_low(),  b.get_low());
+    Vec8f const hi = if_mul(f.get_high(), a.get_high(), b.get_high());
+    return Vec16f(lo, hi);
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// The two halves are added element-wise first, then the Vec8f sum is reduced.
+static inline float horizontal_add (Vec16f const & a) {
+    Vec8f const folded = a.get_low() + a.get_high();
+    return horizontal_add(folded);
+}
+
+// function max: a > b ? a : b, computed on each half separately
+static inline Vec16f max(Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = max(a.get_low(),  b.get_low());
+    Vec8f const hi = max(a.get_high(), b.get_high());
+    return Vec16f(lo, hi);
+}
+
+// function min: a < b ? a : b, computed on each half separately
+static inline Vec16f min(Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = min(a.get_low(),  b.get_low());
+    Vec8f const hi = min(a.get_high(), b.get_high());
+    return Vec16f(lo, hi);
+}
+
+// function abs: absolute value, computed on each half separately
+// Removes sign bit, even for -0.0f, -INF and -NAN
+static inline Vec16f abs(Vec16f const & a) {
+    Vec8f const lo = abs(a.get_low());
+    Vec8f const hi = abs(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+// function sqrt: square root, computed on each half separately
+static inline Vec16f sqrt(Vec16f const & a) {
+    Vec8f const lo = sqrt(a.get_low());
+    Vec8f const hi = sqrt(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+// function square: a * a, computed on each half separately
+static inline Vec16f square(Vec16f const & a) {
+    Vec8f const lo = a.get_low()  * a.get_low();
+    Vec8f const hi = a.get_high() * a.get_high();
+    return Vec16f(lo, hi);
+}
+
+// pow(Vec16f, n): primary template, specialized below for int and uint32_t exponents
+template <typename TT> static Vec16f pow(Vec16f const & a, TT n);
+
+// Specialization for int: raise floating point numbers to integer power n
+// by forwarding to pow_template_i
+template <>
+inline Vec16f pow<int>(Vec16f const & x0, int n) {
+    Vec16f const result = pow_template_i<Vec16f>(x0, n);
+    return result;
+}
+
+// Specialization for uint32_t: the exponent is cast to int before forwarding
+template <>
+inline Vec16f pow<uint32_t>(Vec16f const & x0, uint32_t n) {
+    int const n_signed = (int)n;
+    return pow_template_i<Vec16f>(x0, n_signed);
+}
+
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+// n < 0    : returns 1.0f / pow_n<-n>(a)
+// n == 0   : returns a vector of 1.0f
+// n >= 256 : falls back to the run-time pow(a, n)
+// otherwise: repeated squaring. x holds a^1, a^2, a^4, ... a^128 in turn and is
+// combined into y for every bit that is set in n.
+template <int n>
+static inline Vec16f pow_n(Vec16f const & a) {
+    if (n < 0)    return Vec16f(1.0f) / pow_n<-n>(a);
+    if (n == 0)   return Vec16f(1.0f);
+    if (n >= 256) return pow(a, n);
+    Vec16f x = a;                      // a^(2^i)
+    Vec16f y;                          // accumulator
+    // y has no initial value: it is assigned (y = x) at the lowest set bit of n
+    // and multiplied (y *= x) at every higher set bit
+    const int lowest = n - (n & (n-1));// lowest set bit in n
+    if (n & 1) y = x;
+    if (n < 2) return y;
+    x = x*x;                           // x^2
+    if (n & 2) {
+        if (lowest == 2) y = x; else y *= x;
+    }
+    if (n < 4) return y;
+    x = x*x;                           // x^4
+    if (n & 4) {
+        if (lowest == 4) y = x; else y *= x;
+    }
+    if (n < 8) return y;
+    x = x*x;                           // x^8
+    if (n & 8) {
+        if (lowest == 8) y = x; else y *= x;
+    }
+    if (n < 16) return y;
+    x = x*x;                           // x^16
+    if (n & 16) {
+        if (lowest == 16) y = x; else y *= x;
+    }
+    if (n < 32) return y;
+    x = x*x;                           // x^32
+    if (n & 32) {
+        if (lowest == 32) y = x; else y *= x;
+    }
+    if (n < 64) return y;
+    x = x*x;                           // x^64
+    if (n & 64) {
+        if (lowest == 64) y = x; else y *= x;
+    }
+    if (n < 128) return y;
+    x = x*x;                           // x^128
+    if (n & 128) {
+        if (lowest == 128) y = x; else y *= x;
+    }
+    return y;
+}
+
+// pow with the exponent passed as a Const_int_t<n> tag: forwards to pow_n<n>
+template <int n>
+static inline Vec16f pow(Vec16f const & a, Const_int_t<n>) {
+    Vec16f const result = pow_n<n>(a);
+    return result;
+}
+
+
+// function round: round to nearest integer (even). (result as float vector)
+// Computed on each half separately
+static inline Vec16f round(Vec16f const & a) {
+    Vec8f const lo = round(a.get_low());
+    Vec8f const hi = round(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+// function truncate: round towards zero. (result as float vector)
+// Computed on each half separately
+static inline Vec16f truncate(Vec16f const & a) {
+    Vec8f const lo = truncate(a.get_low());
+    Vec8f const hi = truncate(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+// function floor: round towards minus infinity. (result as float vector)
+// Computed on each half separately
+static inline Vec16f floor(Vec16f const & a) {
+    Vec8f const lo = floor(a.get_low());
+    Vec8f const hi = floor(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+// function ceil: round towards plus infinity. (result as float vector)
+// Computed on each half separately
+static inline Vec16f ceil(Vec16f const & a) {
+    Vec8f const lo = ceil(a.get_low());
+    Vec8f const hi = ceil(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+// Computed on each half separately
+static inline Vec16i round_to_int(Vec16f const & a) {
+    Vec8i const lo = round_to_int(a.get_low());
+    Vec8i const hi = round_to_int(a.get_high());
+    return Vec16i(lo, hi);
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+// Computed on each half separately
+static inline Vec16i truncate_to_int(Vec16f const & a) {
+    Vec8i const lo = truncate_to_int(a.get_low());
+    Vec8i const hi = truncate_to_int(a.get_high());
+    return Vec16i(lo, hi);
+}
+
+// function to_float: convert integer vector to float vector, one half at a time
+static inline Vec16f to_float(Vec16i const & a) {
+    Vec8f const lo = to_float(a.get_low());
+    Vec8f const hi = to_float(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+
+// Approximate math functions
+
+// approximate reciprocal (Faster than 1.f / a.
+// relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512)
+// Computed on each half separately
+static inline Vec16f approx_recipr(Vec16f const & a) {
+    Vec8f const lo = approx_recipr(a.get_low());
+    Vec8f const hi = approx_recipr(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+// approximate reciprocal squareroot (Faster than 1.f / sqrt(a).
+// Relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512)
+// Computed on each half separately
+static inline Vec16f approx_rsqrt(Vec16f const & a) {
+    Vec8f const lo = approx_rsqrt(a.get_low());
+    Vec8f const hi = approx_rsqrt(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+
+// Fused multiply and add functions
+
+// Multiply and add: forwards to the Vec8f mul_add on each half
+static inline Vec16f mul_add(Vec16f const & a, Vec16f const & b, Vec16f const & c) {
+    Vec8f const lo = mul_add(a.get_low(),  b.get_low(),  c.get_low());
+    Vec8f const hi = mul_add(a.get_high(), b.get_high(), c.get_high());
+    return Vec16f(lo, hi);
+}
+
+// Multiply and subtract: forwards to the Vec8f mul_sub on each half
+static inline Vec16f mul_sub(Vec16f const & a, Vec16f const & b, Vec16f const & c) {
+    Vec8f const lo = mul_sub(a.get_low(),  b.get_low(),  c.get_low());
+    Vec8f const hi = mul_sub(a.get_high(), b.get_high(), c.get_high());
+    return Vec16f(lo, hi);
+}
+
+// Multiply and inverse subtract: forwards to the Vec8f nmul_add on each half
+static inline Vec16f nmul_add(Vec16f const & a, Vec16f const & b, Vec16f const & c) {
+    Vec8f const lo = nmul_add(a.get_low(),  b.get_low(),  c.get_low());
+    Vec8f const hi = nmul_add(a.get_high(), b.get_high(), c.get_high());
+    return Vec16f(lo, hi);
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split
+// Forwards to the Vec8f mul_sub_x on each half
+static inline Vec16f mul_sub_x(Vec16f const & a, Vec16f const & b, Vec16f const & c) {
+    Vec8f const lo = mul_sub_x(a.get_low(),  b.get_low(),  c.get_low());
+    Vec8f const hi = mul_sub_x(a.get_high(), b.get_high(), c.get_high());
+    return Vec16f(lo, hi);
+}
+
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
+// Computed on each half separately
+static inline Vec16i exponent(Vec16f const & a) {
+    Vec8i const lo = exponent(a.get_low());
+    Vec8i const hi = exponent(a.get_high());
+    return Vec16i(lo, hi);
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
+// Computed on each half separately
+static inline Vec16f fraction(Vec16f const & a) {
+    Vec8f const lo = fraction(a.get_low());
+    Vec8f const hi = fraction(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =    0 gives 1.0f
+// n >=  128 gives +INF
+// n <= -127 gives 0.0f
+// This function will never produce denormals, and never raise exceptions
+// Computed on each half separately
+static inline Vec16f exp2(Vec16i const & n) {
+    Vec8f const lo = exp2(n.get_low());
+    Vec8f const hi = exp2(n.get_high());
+    return Vec16f(lo, hi);
+}
+//static Vec16f exp2(Vec16f const & x); // defined in vectormath_exp.h
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0f, -INF and -NAN
+// Note that sign_bit(Vec16f(-0.0f)) gives true, while Vec16f(-0.0f) < Vec16f(0.0f) gives false
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb sign_bit(Vec16f const & a) {
+    Vec8fb const lo = sign_bit(a.get_low());
+    Vec8fb const hi = sign_bit(a.get_high());
+    return Vec16fb(lo, hi);
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+// Computed on each half separately
+static inline Vec16f sign_combine(Vec16f const & a, Vec16f const & b) {
+    Vec8f const lo = sign_combine(a.get_low(),  b.get_low());
+    Vec8f const hi = sign_combine(a.get_high(), b.get_high());
+    return Vec16f(lo, hi);
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_finite(Vec16f const & a) {
+    Vec8fb const lo = is_finite(a.get_low());
+    Vec8fb const hi = is_finite(a.get_high());
+    return Vec16fb(lo, hi);
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_inf(Vec16f const & a) {
+    Vec8fb const lo = is_inf(a.get_low());
+    Vec8fb const hi = is_inf(a.get_high());
+    return Vec16fb(lo, hi);
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+static inline Vec16fb is_nan(Vec16f const & a) {
+    Vec8fb const lo = is_nan(a.get_low());
+    Vec8fb const hi = is_nan(a.get_high());
+    return Vec16fb(lo, hi);
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec16fb is_subnormal(Vec16f const & a) {
+    Vec8fb const lo = is_subnormal(a.get_low());
+    Vec8fb const hi = is_subnormal(a.get_high());
+    return Vec16fb(lo, hi);
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec16fb is_zero_or_subnormal(Vec16f const & a) {
+    Vec8fb const lo = is_zero_or_subnormal(a.get_low());
+    Vec8fb const hi = is_zero_or_subnormal(a.get_high());
+    return Vec16fb(lo, hi);
+}
+
+// Function infinite16f: returns a vector where all elements are +INF
+// Built from two copies of infinite8f()
+static inline Vec16f infinite16f() {
+    Vec8f const half_inf(infinite8f());
+    return Vec16f(half_inf, half_inf);
+}
+
+// Function nan16f: returns a vector where all elements are +NAN (quiet)
+// n is passed on to nan8f; both halves get the same value
+static inline Vec16f nan16f(int n = 0x10) {
+    Vec8f const half_nan(nan8f(n));
+    return Vec16f(half_nan, half_nan);
+}
+
+// change signs on vectors Vec16f
+// Each index i0 - i15 is 1 for changing sign on the corresponding element, 0 for no change
+// i0 - i7 are applied to the low half, i8 - i15 to the high half
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16f change_sign(Vec16f const & a) {
+    Vec8f const lo = change_sign<i0,i1,i2,i3,i4,i5,i6,i7>(a.get_low());
+    Vec8f const hi = change_sign<i8,i9,i10,i11,i12,i13,i14,i15>(a.get_high());
+    return Vec16f(lo, hi);
+}
+
+
+
+/*****************************************************************************
+*
+*          Vec8d: Vector of 8 double precision floating point values
+*
+*****************************************************************************/
+
+class Vec8d {
+protected:
+    Vec4d z0;                          // elements 0 - 3
+    Vec4d z1;                          // elements 4 - 7
+public:
+    // Default constructor: performs no explicit initialization
+    Vec8d() {
+    }
+    // Broadcast constructor: every element gets the value d
+    Vec8d(double d) : z0(d), z1(d) {
+    }
+    // Element-wise constructor: d0 - d3 fill the low half, d4 - d7 the high half
+    Vec8d(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7)
+    : z0(d0, d1, d2, d3), z1(d4, d5, d6, d7) {
+    }
+    // Constructor from two Vec4d: a0 becomes the low half, a1 the high half
+    Vec8d(Vec4d const & a0, Vec4d const & a1) : z0(a0), z1(a1) {
+    }
+    // Load 8 doubles from p (no alignment requirement)
+    Vec8d & load(double const * p) {
+        z0.load(p);
+        z1.load(p + 4);
+        return *this;
+    }
+    // Load 8 doubles from p, aligned by 64.
+    // Use only if you are certain that p points to an address divisible by 64
+    Vec8d & load_a(double const * p) {
+        z0.load_a(p);
+        z1.load_a(p + 4);
+        return *this;
+    }
+    // Store 8 doubles to p (no alignment requirement)
+    void store(double * p) const {
+        z0.store(p);
+        z1.store(p + 4);
+    }
+    // Store 8 doubles to p, aligned by 64.
+    // Use only if you are certain that p points to an address divisible by 64
+    void store_a(double * p) const {
+        z0.store_a(p);
+        z1.store_a(p + 4);
+    }
+    // Partial load. Load n elements and set the rest to 0
+    Vec8d & load_partial(int n, double const * p) {
+        if (n >= 4) {
+            // low half is complete, the remainder goes to the high half
+            z0.load(p);
+            z1.load_partial(n - 4, p + 4);
+            return *this;
+        }
+        z0.load_partial(n, p);
+        z1 = Vec4d(0.);
+        return *this;
+    }
+    // Partial store. Store n elements
+    void store_partial(int n, double * p) const {
+        if (n >= 4) {
+            z0.store(p);
+            z1.store_partial(n - 4, p + 4);
+            return;
+        }
+        z0.store_partial(n, p);
+    }
+    // Cut off vector to n elements. The last 8-n elements are set to zero
+    Vec8d & cutoff(int n) {
+        if (n >= 4) {
+            z1.cutoff(n - 4);
+            return *this;
+        }
+        z0.cutoff(n);
+        z1 = Vec4d(0.);
+        return *this;
+    }
+    // Member function to change a single element in vector.
+    // Indexes below 4 address the low half, all others the high half.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8d const & insert(uint32_t index, double value) {
+        if (index >= 4) {
+            z1.insert(index - 4, value);
+        }
+        else {
+            z0.insert(index, value);
+        }
+        return *this;
+    }
+    // Member function to read a single element; only the low 3 bits of index are used
+    double extract(uint32_t index) const {
+        double lanes[8];
+        store(lanes);
+        return lanes[index & 7];
+    }
+
+    // Read-only element access; forwards to extract.
+    // Use the store function if extracting more than one element.
+    double operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Low half (elements 0 - 3)
+    Vec4d get_low() const {
+        return z0;
+    }
+    // High half (elements 4 - 7)
+    Vec4d get_high() const {
+        return z1;
+    }
+    // Number of elements
+    static int size () {
+        return 8;
+    }
+};
+
+
+
+/*****************************************************************************
+*
+*          Operators for Vec8d
+*
+*****************************************************************************/
+
+// vector operator + : element-wise sum, computed on each half separately
+static inline Vec8d operator + (Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = a.get_low()  + b.get_low();
+    Vec4d const hi = a.get_high() + b.get_high();
+    return Vec8d(lo, hi);
+}
+
+// vector operator + : the scalar operand is broadcast to a vector before adding
+static inline Vec8d operator + (Vec8d const & a, double b) {
+    Vec8d const rhs(b);
+    return a + rhs;
+}
+static inline Vec8d operator + (double a, Vec8d const & b) {
+    Vec8d const lhs(a);
+    return lhs + b;
+}
+
+// vector operator += : add b to a in place, returns a
+static inline Vec8d & operator += (Vec8d & a, Vec8d const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : adds 1.0 to every element, returns the value before the increment
+static inline Vec8d operator ++ (Vec8d & a, int) {
+    Vec8d before(a);
+    a = a + 1.0;
+    return before;
+}
+
+// prefix operator ++ : adds 1.0 to every element, returns the incremented vector
+static inline Vec8d & operator ++ (Vec8d & a) {
+    return a = a + 1.0;
+}
+
+// vector operator - : element-wise difference, computed on each half separately
+static inline Vec8d operator - (Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = a.get_low()  - b.get_low();
+    Vec4d const hi = a.get_high() - b.get_high();
+    return Vec8d(lo, hi);
+}
+
+// vector operator - : the scalar operand is broadcast to a vector before subtracting
+static inline Vec8d operator - (Vec8d const & a, double b) {
+    Vec8d const rhs(b);
+    return a - rhs;
+}
+static inline Vec8d operator - (double a, Vec8d const & b) {
+    Vec8d const lhs(a);
+    return lhs - b;
+}
+
+// vector operator - : unary minus, applied to each half separately
+// Change sign bit, even for 0, INF and NAN
+static inline Vec8d operator - (Vec8d const & a) {
+    Vec4d const lo = -a.get_low();
+    Vec4d const hi = -a.get_high();
+    return Vec8d(lo, hi);
+}
+
+// vector operator -= : subtract b from a in place, returns a
+static inline Vec8d & operator -= (Vec8d & a, Vec8d const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtracts 1.0 from every element, returns the value before the decrement
+static inline Vec8d operator -- (Vec8d & a, int) {
+    Vec8d before(a);
+    a = a - 1.0;
+    return before;
+}
+
+// prefix operator -- : subtracts 1.0 from every element, returns the decremented vector
+static inline Vec8d & operator -- (Vec8d & a) {
+    return a = a - 1.0;
+}
+
+// vector operator * : element-wise product, computed on each half separately
+static inline Vec8d operator * (Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = a.get_low()  * b.get_low();
+    Vec4d const hi = a.get_high() * b.get_high();
+    return Vec8d(lo, hi);
+}
+
+// vector operator * : the scalar operand is broadcast to a vector before multiplying
+static inline Vec8d operator * (Vec8d const & a, double b) {
+    Vec8d const rhs(b);
+    return a * rhs;
+}
+static inline Vec8d operator * (double a, Vec8d const & b) {
+    Vec8d const lhs(a);
+    return lhs * b;
+}
+
+// vector operator *= : multiply a by b in place, returns a
+static inline Vec8d & operator *= (Vec8d & a, Vec8d const & b) {
+    return a = a * b;
+}
+
+// vector operator / : element-wise quotient, computed on each half separately
+static inline Vec8d operator / (Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = a.get_low()  / b.get_low();
+    Vec4d const hi = a.get_high() / b.get_high();
+    return Vec8d(lo, hi);
+}
+
+// vector operator / : the scalar operand is broadcast to a vector before dividing
+static inline Vec8d operator / (Vec8d const & a, double b) {
+    Vec8d const rhs(b);
+    return a / rhs;
+}
+static inline Vec8d operator / (double a, Vec8d const & b) {
+    Vec8d const lhs(a);
+    return lhs / b;
+}
+
+// vector operator /= : divide a by b in place, returns a
+static inline Vec8d & operator /= (Vec8d & a, Vec8d const & b) {
+    return a = a / b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8db operator == (Vec8d const & a, Vec8d const & b) {
+    Vec4db const lo = a.get_low()  == b.get_low();
+    Vec4db const hi = a.get_high() == b.get_high();
+    return Vec8db(lo, hi);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8db operator != (Vec8d const & a, Vec8d const & b) {
+    Vec4db const lo = a.get_low()  != b.get_low();
+    Vec4db const hi = a.get_high() != b.get_high();
+    return Vec8db(lo, hi);
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8db operator < (Vec8d const & a, Vec8d const & b) {
+    Vec4db const lo = a.get_low()  < b.get_low();
+    Vec4db const hi = a.get_high() < b.get_high();
+    return Vec8db(lo, hi);
+}
+
+// vector operator <= : returns true for elements for which a <= b
+static inline Vec8db operator <= (Vec8d const & a, Vec8d const & b) {
+    Vec4db const lo = a.get_low()  <= b.get_low();
+    Vec4db const hi = a.get_high() <= b.get_high();
+    return Vec8db(lo, hi);
+}
+
+// vector operator > : returns true for elements for which a > b
+// Evaluated as b < a on each half
+static inline Vec8db operator > (Vec8d const & a, Vec8d const & b) {
+    return Vec8db(b.get_low() < a.get_low(), b.get_high() < a.get_high());
+}
+
+// vector operator >= : returns true for elements for which a >= b
+// Evaluated as b <= a on each half
+static inline Vec8db operator >= (Vec8d const & a, Vec8d const & b) {
+    return Vec8db(b.get_low() <= a.get_low(), b.get_high() <= a.get_high());
+}
+
+// Bitwise logical operators
+
+// vector operator & : bitwise and, computed on each half separately
+static inline Vec8d operator & (Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = a.get_low()  & b.get_low();
+    Vec4d const hi = a.get_high() & b.get_high();
+    return Vec8d(lo, hi);
+}
+
+// vector operator &= : bitwise and of a with b in place, returns a
+static inline Vec8d & operator &= (Vec8d & a, Vec8d const & b) {
+    return a = a & b;
+}
+
+// vector operator & : bitwise and of Vec8d and Vec8db, in either operand order
+static inline Vec8d operator & (Vec8d const & a, Vec8db const & b) {
+    Vec4d const lo = a.get_low()  & b.get_low();
+    Vec4d const hi = a.get_high() & b.get_high();
+    return Vec8d(lo, hi);
+}
+
+static inline Vec8d operator & (Vec8db const & a, Vec8d const & b) {
+    return Vec8d(b.get_low() & a.get_low(), b.get_high() & a.get_high());
+}
+
+// vector operator | : bitwise or, computed on each half separately
+static inline Vec8d operator | (Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = a.get_low()  | b.get_low();
+    Vec4d const hi = a.get_high() | b.get_high();
+    return Vec8d(lo, hi);
+}
+
+// vector operator |= : bitwise or of a with b in place, returns a
+static inline Vec8d & operator |= (Vec8d & a, Vec8d const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor, computed on each half separately
+static inline Vec8d operator ^ (Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = a.get_low()  ^ b.get_low();
+    Vec4d const hi = a.get_high() ^ b.get_high();
+    return Vec8d(lo, hi);
+}
+
+// vector operator ^= : bitwise xor of a with b in place, returns a
+static inline Vec8d & operator ^= (Vec8d & a, Vec8d const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ! : logical not. Returns a Boolean vector built from
+// the logical not of each Vec4d half
+static inline Vec8db operator ! (Vec8d const & a) {
+    Vec4db const lo = !a.get_low();
+    Vec4db const hi = !a.get_high();
+    return Vec8db(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Functions for Vec8d
+*
+*****************************************************************************/
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// The selection is done separately on the lower and upper halves.
+static inline Vec8d select (Vec8db const & s, Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = select(s.get_low(),  a.get_low(),  b.get_low());
+    Vec4d const hi = select(s.get_high(), a.get_high(), b.get_high());
+    return Vec8d(lo, hi);
+}
+
+// Conditional add: for all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+// Forwards each half to the Vec4d version of if_add.
+static inline Vec8d if_add (Vec8db const & f, Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = if_add(f.get_low(),  a.get_low(),  b.get_low());
+    Vec4d const hi = if_add(f.get_high(), a.get_high(), b.get_high());
+    return Vec8d(lo, hi);
+}
+
+// Conditional multiply: for all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
+// Forwards each half to the Vec4d version of if_mul.
+static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = if_mul(f.get_low(),  a.get_low(),  b.get_low());
+    Vec4d const hi = if_mul(f.get_high(), a.get_high(), b.get_high());
+    return Vec8d(lo, hi);
+}
+
+
+// General arithmetic functions, etc.
+
+// Horizontal add: calculates the sum of all vector elements.
+// The two halves are added element-wise first, then the Vec4d sum is reduced.
+static inline double horizontal_add (Vec8d const & a) {
+    Vec4d const halves_sum = a.get_low() + a.get_high();
+    return horizontal_add(halves_sum);
+}
+
+// function max: a > b ? a : b, evaluated by the Vec4d max on each half
+static inline Vec8d max(Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = max(a.get_low(),  b.get_low());
+    Vec4d const hi = max(a.get_high(), b.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function min: a < b ? a : b, evaluated by the Vec4d min on each half
+static inline Vec8d min(Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = min(a.get_low(),  b.get_low());
+    Vec4d const hi = min(a.get_high(), b.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function abs: absolute value
+// Removes sign bit, even for -0.0, -INF and -NAN
+// (delegates to the Vec4d abs on each half)
+static inline Vec8d abs(Vec8d const & a) {
+    Vec4d const lo = abs(a.get_low());
+    Vec4d const hi = abs(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function sqrt: square root, computed by the Vec4d sqrt on each half
+static inline Vec8d sqrt(Vec8d const & a) {
+    Vec4d const lo = sqrt(a.get_low());
+    Vec4d const hi = sqrt(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function square: a * a (element-wise product of the vector with itself)
+static inline Vec8d square(Vec8d const & a) {
+    Vec8d const squared = a * a;
+    return squared;
+}
+
+// pow(Vec8d, TT): primary template for raising a Vec8d to a power n of type TT.
+// Only declared here; explicit specializations for int and uint32_t follow below.
+template <typename TT> static Vec8d pow(Vec8d const & a, TT n);
+
+// Raise floating point numbers to integer power n (run-time exponent).
+// Specialization of pow for int; the work is done by pow_template_i.
+template <>
+inline Vec8d pow<int>(Vec8d const & x0, int n) {
+    Vec8d const result = pow_template_i<Vec8d>(x0, n);
+    return result;
+}
+
+// allow conversion from unsigned int:
+// the exponent is converted to int and passed to pow_template_i
+template <>
+inline Vec8d pow<uint32_t>(Vec8d const & x0, uint32_t n) {
+    int const exponent_as_int = static_cast<int>(n);
+    return pow_template_i<Vec8d>(x0, exponent_as_int);
+}
+
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+//
+// Handles three special ranges first:
+//   n <  0   : returns 1.0 / a^(-n)
+//   n == 0   : returns a vector of 1.0
+//   n >= 256 : falls back to pow(a, n) with n passed as a run-time argument
+// For 0 < n < 256 the power is built by repeated squaring: x runs through
+// a, a^2, a^4, ..., a^128 and y collects the factors whose bit is set in n.
+// y is deliberately left unassigned until the lowest set bit of n is reached:
+// at that bit it is assigned (y = x), at every higher set bit it is multiplied
+// (y *= x), so it is never read before it has been given a value.
+template <int n>
+static inline Vec8d pow_n(Vec8d const & a) {
+    if (n < 0)    return Vec8d(1.0) / pow_n<-n>(a);
+    if (n == 0)   return Vec8d(1.0);
+    if (n >= 256) return pow(a, n);
+    Vec8d x = a;                       // a^(2^i)
+    Vec8d y;                           // accumulator
+    const int lowest = n - (n & (n-1));// lowest set bit in n
+    if (n & 1) y = x;
+    if (n < 2) return y;               // no higher bits set: done
+    x = x*x;                           // x^2
+    if (n & 2) {
+        if (lowest == 2) y = x; else y *= x;
+    }
+    if (n < 4) return y;
+    x = x*x;                           // x^4
+    if (n & 4) {
+        if (lowest == 4) y = x; else y *= x;
+    }
+    if (n < 8) return y;
+    x = x*x;                           // x^8
+    if (n & 8) {
+        if (lowest == 8) y = x; else y *= x;
+    }
+    if (n < 16) return y;
+    x = x*x;                           // x^16
+    if (n & 16) {
+        if (lowest == 16) y = x; else y *= x;
+    }
+    if (n < 32) return y;
+    x = x*x;                           // x^32
+    if (n & 32) {
+        if (lowest == 32) y = x; else y *= x;
+    }
+    if (n < 64) return y;
+    x = x*x;                           // x^64
+    if (n & 64) {
+        if (lowest == 64) y = x; else y *= x;
+    }
+    if (n < 128) return y;
+    x = x*x;                           // x^128
+    if (n & 128) {
+        if (lowest == 128) y = x; else y *= x;
+    }
+    return y;
+}
+
+// pow overload taking a Const_int_t<n> tag argument: the exponent n comes
+// from the tag's template parameter and the call is forwarded to pow_n<n>
+template <int n>
+static inline Vec8d pow(Vec8d const & a, Const_int_t<n>) {
+    Vec8d const result = pow_n<n>(a);
+    return result;
+}
+
+
+// function round: round to nearest integer (even). (result as double vector)
+// Delegates to the Vec4d round on each half.
+static inline Vec8d round(Vec8d const & a) {
+    Vec4d const lo = round(a.get_low());
+    Vec4d const hi = round(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function truncate: round towards zero. (result as double vector)
+// Delegates to the Vec4d truncate on each half.
+static inline Vec8d truncate(Vec8d const & a) {
+    Vec4d const lo = truncate(a.get_low());
+    Vec4d const hi = truncate(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function floor: round towards minus infinity. (result as double vector)
+// Delegates to the Vec4d floor on each half.
+static inline Vec8d floor(Vec8d const & a) {
+    Vec4d const lo = floor(a.get_low());
+    Vec4d const hi = floor(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function ceil: round towards plus infinity. (result as double vector)
+// Delegates to the Vec4d ceil on each half.
+static inline Vec8d ceil(Vec8d const & a) {
+    Vec4d const lo = ceil(a.get_low());
+    Vec4d const hi = ceil(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function round_to_int: round to nearest integer (even). (result as integer vector)
+// Note: assume MXCSR control register is set to rounding
+// Each half is converted by the Vec4d round_to_int; the two results form a Vec8i.
+static inline Vec8i round_to_int(Vec8d const & a) {
+    Vec4i const lo = round_to_int(a.get_low());
+    Vec4i const hi = round_to_int(a.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function truncate_to_int: round towards zero. (result as integer vector)
+// Each half is converted by the Vec4d truncate_to_int; the two results form a Vec8i.
+static inline Vec8i truncate_to_int(Vec8d const & a) {
+    Vec4i const lo = truncate_to_int(a.get_low());
+    Vec4i const hi = truncate_to_int(a.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function truncate_to_int64: round towards zero. (inefficient)
+// Each half is converted by the Vec4d truncate_to_int64.
+static inline Vec8q truncate_to_int64(Vec8d const & a) {
+    Vec4q const lo = truncate_to_int64(a.get_low());
+    Vec4q const hi = truncate_to_int64(a.get_high());
+    return Vec8q(lo, hi);
+}
+
+// function truncate_to_int64_limited: round towards zero.
+// result as 64-bit integer vector, but with limited range
+// Note: assume MXCSR control register is set to rounding
+// Each half is converted by the Vec4d truncate_to_int64_limited.
+static inline Vec8q truncate_to_int64_limited(Vec8d const & a) {
+    Vec4q const lo = truncate_to_int64_limited(a.get_low());
+    Vec4q const hi = truncate_to_int64_limited(a.get_high());
+    return Vec8q(lo, hi);
+}
+
+// function round_to_int64: round to nearest or even. (inefficient)
+// Each half is converted by the Vec4d round_to_int64.
+static inline Vec8q round_to_int64(Vec8d const & a) {
+    Vec4q const lo = round_to_int64(a.get_low());
+    Vec4q const hi = round_to_int64(a.get_high());
+    return Vec8q(lo, hi);
+}
+
+// function round_to_int64_limited: round to nearest integer (even)
+// result as 64-bit integer vector, but with limited range
+// Note: assume MXCSR control register is set to rounding
+// Each half is converted by the Vec4d round_to_int64_limited.
+static inline Vec8q round_to_int64_limited(Vec8d const & a) {
+    Vec4q const lo = round_to_int64_limited(a.get_low());
+    Vec4q const hi = round_to_int64_limited(a.get_high());
+    return Vec8q(lo, hi);
+}
+
+// function to_double: convert integer vector elements to double vector (inefficient)
+// Each 4-element half of the Vec8q is converted separately.
+static inline Vec8d to_double(Vec8q const & a) {
+    Vec4d const lo = to_double(a.get_low());
+    Vec4d const hi = to_double(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function to_double_limited: convert integer vector elements to double vector
+// limited to abs(x) < 2^31
+// Each 4-element half of the Vec8q is converted separately.
+static inline Vec8d to_double_limited(Vec8q const & a) {
+    Vec4d const lo = to_double_limited(a.get_low());
+    Vec4d const hi = to_double_limited(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function to_double: convert integer vector to double vector
+// Each 4-element half of the Vec8i is converted separately.
+static inline Vec8d to_double(Vec8i const & a) {
+    Vec4d const lo = to_double(a.get_low());
+    Vec4d const hi = to_double(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// function compress: convert two Vec8d to one Vec16f
+// 'low' supplies the lower Vec8f of the result, 'high' the upper Vec8f.
+static inline Vec16f compress (Vec8d const & low, Vec8d const & high) {
+    Vec8f const from_low  = compress(low.get_low(),  low.get_high());
+    Vec8f const from_high = compress(high.get_low(), high.get_high());
+    return Vec16f(from_low, from_high);
+}
+
+// Function extend_low : convert Vec16f vector elements 0 - 7 to Vec8d
+// (both halves of the result come from the lower Vec8f of a)
+static inline Vec8d extend_low(Vec16f const & a) {
+    Vec4d const lo = extend_low(a.get_low());
+    Vec4d const hi = extend_high(a.get_low());
+    return Vec8d(lo, hi);
+}
+
+// Function extend_high : convert Vec16f vector elements 8 - 15 to Vec8d
+// (both halves of the result come from the upper Vec8f of a)
+static inline Vec8d extend_high (Vec16f const & a) {
+    Vec4d const lo = extend_low(a.get_high());
+    Vec4d const hi = extend_high(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+
+// Fused multiply and add functions
+
+// Multiply and add; delegates to the Vec4d mul_add on each half
+static inline Vec8d mul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+    Vec4d const lo = mul_add(a.get_low(),  b.get_low(),  c.get_low());
+    Vec4d const hi = mul_add(a.get_high(), b.get_high(), c.get_high());
+    return Vec8d(lo, hi);
+}
+
+// Multiply and subtract; delegates to the Vec4d mul_sub on each half
+static inline Vec8d mul_sub(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+    Vec4d const lo = mul_sub(a.get_low(),  b.get_low(),  c.get_low());
+    Vec4d const hi = mul_sub(a.get_high(), b.get_high(), c.get_high());
+    return Vec8d(lo, hi);
+}
+
+// Multiply and inverse subtract; delegates to the Vec4d nmul_add on each half
+static inline Vec8d nmul_add(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+    Vec4d const lo = nmul_add(a.get_low(),  b.get_low(),  c.get_low());
+    Vec4d const hi = nmul_add(a.get_high(), b.get_high(), c.get_high());
+    return Vec8d(lo, hi);
+}
+
+// Multiply and subtract with extra precision on the intermediate calculations,
+// even if FMA instructions not supported, using Veltkamp-Dekker split.
+// Delegates to the Vec4d mul_sub_x on each half.
+static inline Vec8d mul_sub_x(Vec8d const & a, Vec8d const & b, Vec8d const & c) {
+    Vec4d const lo = mul_sub_x(a.get_low(),  b.get_low(),  c.get_low());
+    Vec4d const hi = mul_sub_x(a.get_high(), b.get_high(), c.get_high());
+    return Vec8d(lo, hi);
+}
+
+
+// Math functions using fast bit manipulation
+
+// Extract the exponent as an integer
+// exponent(a) = floor(log2(abs(a)));
+// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
+// Delegates to the Vec4d exponent on each half.
+static inline Vec8q exponent(Vec8d const & a) {
+    Vec4q const lo = exponent(a.get_low());
+    Vec4q const hi = exponent(a.get_high());
+    return Vec8q(lo, hi);
+}
+
+// Extract the fraction part of a floating point number
+// a = 2^exponent(a) * fraction(a), except for a = 0
+// fraction(1.0) = 1.0, fraction(5.0) = 1.25
+// Delegates to the Vec4d fraction on each half.
+static inline Vec8d fraction(Vec8d const & a) {
+    Vec4d const lo = fraction(a.get_low());
+    Vec4d const hi = fraction(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+// Fast calculation of pow(2,n) with n integer
+// n  =     0 gives 1.0
+// n >=  1024 gives +INF
+// n <= -1023 gives 0.0
+// This function will never produce denormals, and never raise exceptions
+// Delegates to the Vec4q exp2 on each half.
+static inline Vec8d exp2(Vec8q const & n) {
+    Vec4d const lo = exp2(n.get_low());
+    Vec4d const hi = exp2(n.get_high());
+    return Vec8d(lo, hi);
+}
+//static Vec8d exp2(Vec8d const & x); // defined in vectormath_exp.h
+
+
+// Categorization functions
+
+// Function sign_bit: gives true for elements that have the sign bit set
+// even for -0.0, -INF and -NAN
+// Note that sign_bit(Vec8d(-0.0)) gives true, while Vec8d(-0.0) < Vec8d(0.0) gives false
+// Delegates to the Vec4d sign_bit on each half.
+static inline Vec8db sign_bit(Vec8d const & a) {
+    Vec4db const lo = sign_bit(a.get_low());
+    Vec4db const hi = sign_bit(a.get_high());
+    return Vec8db(lo, hi);
+}
+
+// Function sign_combine: changes the sign of a when b has the sign bit set
+// same as select(sign_bit(b), -a, a)
+// Delegates to the Vec4d sign_combine on each half.
+static inline Vec8d sign_combine(Vec8d const & a, Vec8d const & b) {
+    Vec4d const lo = sign_combine(a.get_low(),  b.get_low());
+    Vec4d const hi = sign_combine(a.get_high(), b.get_high());
+    return Vec8d(lo, hi);
+}
+
+// Function is_finite: gives true for elements that are normal, denormal or zero,
+// false for INF and NAN
+// Delegates to the Vec4d is_finite on each half.
+static inline Vec8db is_finite(Vec8d const & a) {
+    Vec4db const lo = is_finite(a.get_low());
+    Vec4db const hi = is_finite(a.get_high());
+    return Vec8db(lo, hi);
+}
+
+// Function is_inf: gives true for elements that are +INF or -INF
+// false for finite numbers and NAN
+// Delegates to the Vec4d is_inf on each half.
+static inline Vec8db is_inf(Vec8d const & a) {
+    Vec4db const lo = is_inf(a.get_low());
+    Vec4db const hi = is_inf(a.get_high());
+    return Vec8db(lo, hi);
+}
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// Delegates to the Vec4d is_nan on each half.
+static inline Vec8db is_nan(Vec8d const & a) {
+    Vec4db const lo = is_nan(a.get_low());
+    Vec4db const hi = is_nan(a.get_high());
+    return Vec8db(lo, hi);
+}
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+// Delegates to the Vec4d is_subnormal on each half.
+static inline Vec8db is_subnormal(Vec8d const & a) {
+    Vec4db const lo = is_subnormal(a.get_low());
+    Vec4db const hi = is_subnormal(a.get_high());
+    return Vec8db(lo, hi);
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+// Delegates to the Vec4d is_zero_or_subnormal on each half.
+static inline Vec8db is_zero_or_subnormal(Vec8d const & a) {
+    Vec4db const lo = is_zero_or_subnormal(a.get_low());
+    Vec4db const hi = is_zero_or_subnormal(a.get_high());
+    return Vec8db(lo, hi);
+}
+
+// Function infinite8d: returns a vector where all elements are +INF
+// (the Vec4d from infinite4d() is used for both halves)
+static inline Vec8d infinite8d() {
+    Vec4d const half = infinite4d();
+    return Vec8d(half, half);
+}
+
+// Function nan8d: returns a vector where all elements are +NAN (quiet NAN)
+// n is passed on to nan4d(); the resulting Vec4d is used for both halves
+static inline Vec8d nan8d(int n = 0x10) {
+    Vec4d const half = nan4d(n);
+    return Vec8d(half, half);
+}
+
+// change signs on vectors Vec8d
+// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
+// i0 - i3 are applied to the lower half, i4 - i7 to the upper half.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d change_sign(Vec8d const & a) {
+    Vec4d const lo = change_sign<i0,i1,i2,i3>(a.get_low());
+    Vec4d const hi = change_sign<i4,i5,i6,i7>(a.get_high());
+    return Vec8d(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Functions for reinterpretation between vector types
+*
+*****************************************************************************/
+
+// reinterpret_i for an argument that is already a Vec512ie: returns it unchanged
+static inline Vec512ie reinterpret_i (Vec512ie const & x) {
+    return x;
+}
+
+// reinterpret_i for Vec16f: applies reinterpret_i to the lower and upper
+// halves and combines the two results into a Vec512ie
+static inline Vec512ie reinterpret_i (Vec16f  const & x) {
+    return Vec512ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
+}
+
+// reinterpret_i for Vec8d: applies reinterpret_i to the lower and upper
+// halves and combines the two results into a Vec512ie
+static inline Vec512ie reinterpret_i (Vec8d const & x) {
+    return Vec512ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
+}
+
+// reinterpret_f for Vec512ie: applies reinterpret_f to each half, converts
+// each result to Vec8f and combines them into a Vec16f
+static inline Vec16f  reinterpret_f (Vec512ie const & x) {
+    Vec8f const lo = Vec8f(reinterpret_f(x.get_low()));
+    Vec8f const hi = Vec8f(reinterpret_f(x.get_high()));
+    return Vec16f(lo, hi);
+}
+
+// reinterpret_f for an argument that is already a Vec16f: returns it unchanged
+static inline Vec16f  reinterpret_f (Vec16f  const & x) {
+    return x;
+}
+
+// reinterpret_f for Vec8d: applies reinterpret_f to each Vec4d half, converts
+// each result to Vec8f and combines them into a Vec16f
+static inline Vec16f  reinterpret_f (Vec8d const & x) {
+    Vec8f const lo = Vec8f(reinterpret_f(x.get_low()));
+    Vec8f const hi = Vec8f(reinterpret_f(x.get_high()));
+    return Vec16f(lo, hi);
+}
+
+// reinterpret_d for Vec512ie: applies reinterpret_d to each half, converts
+// each result to Vec4d and combines them into a Vec8d
+static inline Vec8d reinterpret_d (Vec512ie const & x) {
+    Vec4d const lo = Vec4d(reinterpret_d(x.get_low()));
+    Vec4d const hi = Vec4d(reinterpret_d(x.get_high()));
+    return Vec8d(lo, hi);
+}
+
+// reinterpret_d for Vec16f: applies reinterpret_d to each Vec8f half, converts
+// each result to Vec4d and combines them into a Vec8d
+static inline Vec8d reinterpret_d (Vec16f  const & x) {
+    Vec4d const lo = Vec4d(reinterpret_d(x.get_low()));
+    Vec4d const hi = Vec4d(reinterpret_d(x.get_high()));
+    return Vec8d(lo, hi);
+}
+
+// reinterpret_d for an argument that is already a Vec8d: returns it unchanged
+static inline Vec8d reinterpret_d (Vec8d const & x) {
+    return x;
+}
+
+
+/*****************************************************************************
+*
+*          Vector permute functions
+*
+******************************************************************************
+*
+* These permute functions can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to select.
+* An index of -1 will generate zero. An index of -256 means don't care.
+*
+* Example:
+* Vec8d a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+* Vec8d b;
+* b = permute8d<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// Permute vector of 8 double
+// Index -1 gives 0, index -256 means don't care.
+// Each result half is a blend4d over the two halves of a: i0 - i3 select
+// the lower result half, i4 - i7 the upper result half.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d permute8d(Vec8d const & a) {
+    Vec4d const src_lo = a.get_low();
+    Vec4d const src_hi = a.get_high();
+    Vec4d const out_lo = blend4d<i0,i1,i2,i3> (src_lo, src_hi);
+    Vec4d const out_hi = blend4d<i4,i5,i6,i7> (src_lo, src_hi);
+    return Vec8d(out_lo, out_hi);
+}
+
+// Permute vector of 16 float
+// Index -1 gives 0, index -256 means don't care.
+// Each result half is a blend8f over the two halves of a: i0 - i7 select
+// the lower result half, i8 - i15 the upper result half.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16f permute16f(Vec16f const & a) {
+    Vec8f const src_lo = a.get_low();
+    Vec8f const src_hi = a.get_high();
+    Vec8f const out_lo = blend8f<i0,i1,i2 ,i3 ,i4 ,i5 ,i6 ,i7 > (src_lo, src_hi);
+    Vec8f const out_hi = blend8f<i8,i9,i10,i11,i12,i13,i14,i15> (src_lo, src_hi);
+    return Vec16f(out_lo, out_hi);
+}
+
+
+/*****************************************************************************
+*
+*          Vector blend functions
+*
+******************************************************************************
+*
+* These blend functions can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where higher indexes indicate an element from the second source
+* vector. For example, if each vector has 8 elements, then indexes 0 - 7
+* will select an element from the first vector and indexes 8 - 15 will select 
+* an element from the second vector. A negative index will generate zero.
+*
+* Example:
+* Vec8d a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8d b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+* Vec8d c;
+* c = blend8d<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// helper function used below
+// Picks one Vec4d half out of the pair (a, b) by compile-time number n:
+// 0 = low half of a, 1 = high half of a, 2 = low half of b, 3 = high half of b.
+// Any other n gives a vector of zeroes.
+template <int n>
+static inline Vec4d select4(Vec8d const & a, Vec8d const & b) {
+    if (n == 0) return a.get_low();
+    if (n == 1) return a.get_high();
+    if (n == 2) return b.get_low();
+    if (n == 3) return b.get_high();
+    return Vec4d(0.);
+}
+
+// blend vectors Vec8d
+// Indexes 0 - 7 select an element of a, 8 - 15 an element of b, a negative
+// index gives zero. The result is built as two Vec4d halves: x0 from i0 - i3
+// and x1 from i4 - i7. For each half the code picks, at compile time, the
+// cheapest of four strategies depending on how many distinct 4-element source
+// quarters (a low, a high, b low, b high) the four indexes refer to.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 
+static inline Vec8d blend8d(Vec8d const & a, Vec8d const & b) {  
+    // jN = source quarter of index iN (0 - 3, as numbered by select4);
+    // negative indexes are passed through unchanged
+    const int j0 = i0 >= 0 ? i0/4 : i0;
+    const int j1 = i1 >= 0 ? i1/4 : i1;
+    const int j2 = i2 >= 0 ? i2/4 : i2;
+    const int j3 = i3 >= 0 ? i3/4 : i3;
+    const int j4 = i4 >= 0 ? i4/4 : i4;
+    const int j5 = i5 >= 0 ? i5/4 : i5;
+    const int j6 = i6 >= 0 ? i6/4 : i6;
+    const int j7 = i7 >= 0 ? i7/4 : i7;
+    Vec4d x0, x1;
+
+    // r0 / r1: first non-negative source quarter in each half (negative if there is none)
+    const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
+    const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+    // s0 / s1: first non-negative source quarter that differs from r0 / r1,
+    // falling back to the last j of the half
+    const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
+    const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    if (r0 < 0) {
+        // i0 - i3 are all negative
+        x0 = Vec4d(0.);
+    }
+    else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) { 
+        // i0 - i3 all from same source
+        // (the upper two bits of every non-negative index nibble equal r0;
+        //  '& -13' clears bits 2 and 3, leaving the position within the quarter)
+        x0 = permute4d<i0 & -13, i1 & -13, i2 & -13, i3 & -13> (select4<r0> (a,b));
+    }
+    else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) { 
+        // i0 - i3 all from two sources
+        // kN = position within the quarter, with 4 added when the element
+        // comes from the second blend4d operand (quarter s0)
+        const int k0 =  i0 >= 0 ? i0 & 3 : i0;
+        const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
+        const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
+        const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
+        x0 = blend4d<k0,k1,k2,k3> (select4<r0>(a,b), select4<s0>(a,b));
+    }
+    else {
+        // i0 - i3 from three or four different sources
+        // (elements 0,1 and elements 2,3 are blended separately, then merged)
+        x0 = blend4d<0,1,6,7> (
+             blend4d<i0 & -13, (i1 & -13) | 4, -0x100, -0x100> (select4<j0>(a,b), select4<j1>(a,b)),
+             blend4d<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4<j2>(a,b), select4<j3>(a,b)));
+    }
+
+    // Same four cases for the upper half, using the upper 16 bits of m1 and mz
+    if (r1 < 0) {
+        x1 = Vec4d(0.);
+    }
+    else if (((m1 ^ uint32_t(r1)*0x44440000u) & 0xCCCC0000 & mz) == 0) { 
+        // i4 - i7 all from same source
+        x1 = permute4d<i4 & -13, i5 & -13, i6 & -13, i7 & -13> (select4<r1> (a,b));
+    }
+    else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) { 
+        // i4 - i7 all from two sources
+        const int k4 =  i4 >= 0 ? i4 & 3 : i4;
+        const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
+        const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
+        const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
+        x1 = blend4d<k4,k5,k6,k7> (select4<r1>(a,b), select4<s1>(a,b));
+    }
+    else {
+        // i4 - i7 from three or four different sources
+        x1 = blend4d<0,1,6,7> (
+             blend4d<i4 & -13, (i5 & -13) | 4, -0x100, -0x100> (select4<j4>(a,b), select4<j5>(a,b)),
+             blend4d<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4<j6>(a,b), select4<j7>(a,b)));
+    }
+
+    return Vec8d(x0,x1);
+}
+
+// helper function used below
+// Picks one Vec8f half out of the pair (a, b) by compile-time number n:
+// 0 = low half of a, 1 = high half of a, 2 = low half of b, 3 = high half of b.
+// Any other n gives a vector of zeroes.
+template <int n>
+static inline Vec8f select4(Vec16f const & a, Vec16f const & b) {
+    if (n == 0) return a.get_low();
+    if (n == 1) return a.get_high();
+    if (n == 2) return b.get_low();
+    if (n == 3) return b.get_high();
+    return Vec8f(0.f);
+}
+
+// blend vectors Vec16f
+// Indexes 0 - 15 select an element of a, 16 - 31 an element of b. The result
+// is built as two Vec8f halves: x0 from i0 - i7 and x1 from i8 - i15. For each
+// half the code picks, at compile time, one of four strategies depending on
+// how many distinct 8-element source halves (a low, a high, b low, b high)
+// the eight indexes refer to.
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16f blend16f(Vec16f const & a, Vec16f const & b) {
+
+    // jN = source half of index iN (0 - 3, as numbered by select4);
+    // negative indexes are passed through unchanged
+    const int j0  = i0  >= 0 ? i0 /8 : i0;
+    const int j1  = i1  >= 0 ? i1 /8 : i1;
+    const int j2  = i2  >= 0 ? i2 /8 : i2;
+    const int j3  = i3  >= 0 ? i3 /8 : i3;
+    const int j4  = i4  >= 0 ? i4 /8 : i4;
+    const int j5  = i5  >= 0 ? i5 /8 : i5;
+    const int j6  = i6  >= 0 ? i6 /8 : i6;
+    const int j7  = i7  >= 0 ? i7 /8 : i7;
+    const int j8  = i8  >= 0 ? i8 /8 : i8;
+    const int j9  = i9  >= 0 ? i9 /8 : i9;
+    const int j10 = i10 >= 0 ? i10/8 : i10;
+    const int j11 = i11 >= 0 ? i11/8 : i11;
+    const int j12 = i12 >= 0 ? i12/8 : i12;
+    const int j13 = i13 >= 0 ? i13/8 : i13;
+    const int j14 = i14 >= 0 ? i14/8 : i14;
+    const int j15 = i15 >= 0 ? i15/8 : i15;
+
+    Vec8f x0, x1;
+
+    // r0 / r1: first non-negative source in each result half (negative if there is none)
+    // s0 / s1: first non-negative source that differs from r0 / r1, falling
+    //          back to the last j of the half
+    const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2  >= 0 ? j2  : j3  >= 0 ? j3  : j4  >= 0 ? j4  : j5  >= 0 ? j5  : j6  >= 0 ? j6  : j7;
+    const int r1 = j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
+    const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2  : (j3 >= 0 && j3 != r0) ? j3 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7;
+    const int s1 = (j9 >= 0 && j9 != r1) ? j9 : (j10>= 0 && j10!= r1) ? j10 : (j11>= 0 && j11!= r1) ? j11: (j12>= 0 && j12!= r1) ? j12: (j13>= 0 && j13!= r1) ? j13: (j14>= 0 && j14!= r1) ? j14: j15;
+
+    if (r0 < 0) {
+        // i0 - i7 are all negative
+        x0 = Vec8f(0.f);
+    }
+    else if (r0 == s0) {
+        // i0 - i7 all from same source
+        // ('& -25' clears bits 3 and 4, leaving the position within the source half)
+        x0 = permute8f<i0&-25, i1&-25, i2&-25, i3&-25, i4&-25, i5&-25, i6&-25, i7&-25> (select4<r0> (a,b));
+    }
+    else if ((j2<0||j2==r0||j2==s0) && (j3<0||j3==r0||j3==s0) && (j4<0||j4==r0||j4==s0) && (j5<0||j5==r0||j5==s0) && (j6<0||j6==r0||j6==s0) && (j7<0||j7==r0||j7==s0)) {
+        // i0 - i7 all from two sources
+        // kN = position within the source half, with 8 added when the element
+        // comes from the second blend8f operand (source s0)
+        const int k0 =  i0 >= 0 ? (i0 & 7) : i0;
+        const int k1 = (i1 >= 0 ? (i1 & 7) : i1) | (j1 == s0 ? 8 : 0);
+        const int k2 = (i2 >= 0 ? (i2 & 7) : i2) | (j2 == s0 ? 8 : 0);
+        const int k3 = (i3 >= 0 ? (i3 & 7) : i3) | (j3 == s0 ? 8 : 0);
+        const int k4 = (i4 >= 0 ? (i4 & 7) : i4) | (j4 == s0 ? 8 : 0);
+        const int k5 = (i5 >= 0 ? (i5 & 7) : i5) | (j5 == s0 ? 8 : 0);
+        const int k6 = (i6 >= 0 ? (i6 & 7) : i6) | (j6 == s0 ? 8 : 0);
+        const int k7 = (i7 >= 0 ? (i7 & 7) : i7) | (j7 == s0 ? 8 : 0);
+        x0 = blend8f<k0,k1,k2,k3,k4,k5,k6,k7> (select4<r0>(a,b), select4<s0>(a,b));
+    }
+    else {
+        // i0 - i7 from three or four different sources
+        // Two intermediate vectors are built: one holding the wanted elements
+        // of a (sources 0 and 1), one holding the wanted elements of b
+        // (sources 2 and 3), each already at its final position. nN then
+        // takes position N from the first (+0) or the second (+8) intermediate.
+        const int n0 = j0 >= 0 ? j0 /2*8 + 0 : j0;
+        const int n1 = j1 >= 0 ? j1 /2*8 + 1 : j1;
+        const int n2 = j2 >= 0 ? j2 /2*8 + 2 : j2;
+        const int n3 = j3 >= 0 ? j3 /2*8 + 3 : j3;
+        const int n4 = j4 >= 0 ? j4 /2*8 + 4 : j4;
+        const int n5 = j5 >= 0 ? j5 /2*8 + 5 : j5;
+        const int n6 = j6 >= 0 ? j6 /2*8 + 6 : j6;
+        const int n7 = j7 >= 0 ? j7 /2*8 + 7 : j7;
+        x0 = blend8f<n0, n1, n2, n3, n4, n5, n6, n7> (
+             blend8f< j0   & 2 ? -256 : i0 &15,  j1   & 2 ? -256 : i1 &15,  j2   & 2 ? -256 : i2 &15,  j3   & 2 ? -256 : i3 &15,  j4   & 2 ? -256 : i4 &15,  j5   & 2 ? -256 : i5 &15,  j6   & 2 ? -256 : i6 &15,  j7   & 2 ? -256 : i7 &15> (a.get_low(),a.get_high()),
+             blend8f<(j0^2)& 6 ? -256 : i0 &15, (j1^2)& 6 ? -256 : i1 &15, (j2^2)& 6 ? -256 : i2 &15, (j3^2)& 6 ? -256 : i3 &15, (j4^2)& 6 ? -256 : i4 &15, (j5^2)& 6 ? -256 : i5 &15, (j6^2)& 6 ? -256 : i6 &15, (j7^2)& 6 ? -256 : i7 &15> (b.get_low(),b.get_high()));
+    }
+
+    // Same four cases for the upper result half (i8 - i15)
+    if (r1 < 0) {
+        x1 = Vec8f(0.f);
+    }
+    else if (r1 == s1) {
+        // i8 - i15 all from same source
+        x1 = permute8f<i8&-25, i9&-25, i10&-25, i11&-25, i12&-25, i13&-25, i14&-25, i15&-25> (select4<r1> (a,b));
+    }
+    else if ((j10<0||j10==r1||j10==s1) && (j11<0||j11==r1||j11==s1) && (j12<0||j12==r1||j12==s1) && (j13<0||j13==r1||j13==s1) && (j14<0||j14==r1||j14==s1) && (j15<0||j15==r1||j15==s1)) {
+        // i8 - i15 all from two sources
+        const int k8 =  i8 >= 0 ? (i8 & 7) : i8;
+        const int k9 = (i9 >= 0 ? (i9 & 7) : i9 ) | (j9 == s1 ? 8 : 0);
+        const int k10= (i10>= 0 ? (i10& 7) : i10) | (j10== s1 ? 8 : 0);
+        const int k11= (i11>= 0 ? (i11& 7) : i11) | (j11== s1 ? 8 : 0);
+        const int k12= (i12>= 0 ? (i12& 7) : i12) | (j12== s1 ? 8 : 0);
+        const int k13= (i13>= 0 ? (i13& 7) : i13) | (j13== s1 ? 8 : 0);
+        const int k14= (i14>= 0 ? (i14& 7) : i14) | (j14== s1 ? 8 : 0);
+        const int k15= (i15>= 0 ? (i15& 7) : i15) | (j15== s1 ? 8 : 0);
+        x1 = blend8f<k8,k9,k10,k11,k12,k13,k14,k15> (select4<r1>(a,b), select4<s1>(a,b));
+    }
+    else {
+        // i8 - i15 from three or four different sources
+        const int n8 = j8 >= 0 ? j8 /2*8 + 0 : j8 ;
+        const int n9 = j9 >= 0 ? j9 /2*8 + 1 : j9 ;
+        const int n10= j10>= 0 ? j10/2*8 + 2 : j10;
+        const int n11= j11>= 0 ? j11/2*8 + 3 : j11;
+        const int n12= j12>= 0 ? j12/2*8 + 4 : j12;
+        const int n13= j13>= 0 ? j13/2*8 + 5 : j13;
+        const int n14= j14>= 0 ? j14/2*8 + 6 : j14;
+        const int n15= j15>= 0 ? j15/2*8 + 7 : j15;
+        x1 = blend8f<n8, n9, n10, n11, n12, n13, n14, n15> (
+             blend8f< j8   & 2 ? -256 : i8 &15,  j9   & 2 ? -256 : i9 &15,  j10   & 2 ? -256 : i10 &15,  j11   & 2 ? -256 : i11 &15,  j12   & 2 ? -256 : i12 &15,  j13   & 2 ? -256 : i13 &15,  j14   & 2 ? -256 : i14 &15,  j15   & 2 ? -256 : i15 &15> (a.get_low(),a.get_high()),
+             blend8f<(j8^2)& 6 ? -256 : i8 &15, (j9^2)& 6 ? -256 : i9 &15, (j10^2)& 6 ? -256 : i10 &15, (j11^2)& 6 ? -256 : i11 &15, (j12^2)& 6 ? -256 : i12 &15, (j13^2)& 6 ? -256 : i13 &15, (j14^2)& 6 ? -256 : i14 &15, (j15^2)& 6 ? -256 : i15 &15> (b.get_low(),b.get_high()));
+    }
+    return Vec16f(x0,x1);
+}
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec8q a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+* Vec8d b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8d c;
+* c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+*
+*****************************************************************************/
+
+// lookup16: use the 16 integers in index to pick elements out of the
+// 16-element vector table. The table is stored to a local array and each
+// half of index is looked up in it with the 8-element lookup<16>.
+static inline Vec16f lookup16(Vec16i const & index, Vec16f const & table) {
+    float scratch[16];
+    table.store(scratch);
+    Vec8f const lo = lookup<16>(index.get_low(),  scratch);
+    Vec8f const hi = lookup<16>(index.get_high(), scratch);
+    return Vec16f(lo, hi);
+}
+
+// lookup<n>: use the 16 integers in index to pick elements out of an array
+// of floats; n is the compile-time table size.
+//   n <= 0  : returns 0
+//   n <= 8  : loads 8 floats from table and uses lookup8 on each index half
+//   n <= 16 : loads 16 floats from table and uses lookup16
+//   n > 16  : indexes are limited to the table (modulo n when n is a power
+//             of 2, otherwise clamped to n-1) and elements are read one by one
+template <int n>
+static inline Vec16f lookup(Vec16i const & index, float const * table) {
+    if (n <= 0) {
+        return 0;
+    }
+    if (n <= 8) {
+        Vec8f const small_table = Vec8f().load(table);
+        Vec8f const lo = lookup8 (index.get_low(),  small_table);
+        Vec8f const hi = lookup8 (index.get_high(), small_table);
+        return Vec16f(lo, hi);
+    }
+    if (n <= 16) {
+        return lookup16(index, Vec16f().load(table));
+    }
+    // n > 16: bring every index into range before touching memory
+    Vec16ui bounded;
+    if ((n & (n-1)) == 0) {
+        bounded = Vec16ui(index) & (n-1);       // power of 2: index modulo n
+    }
+    else {
+        bounded = min(Vec16ui(index), n-1);     // otherwise: limit to n-1
+    }
+    return Vec16f(
+        table[bounded[0]],  table[bounded[1]],  table[bounded[2]],  table[bounded[3]],
+        table[bounded[4]],  table[bounded[5]],  table[bounded[6]],  table[bounded[7]],
+        table[bounded[8]],  table[bounded[9]],  table[bounded[10]], table[bounded[11]],
+        table[bounded[12]], table[bounded[13]], table[bounded[14]], table[bounded[15]]);
+}
+
+
+// lookup8: use the 8 integers in index to pick elements out of the
+// 8-element vector table. The table is stored to a local array and each
+// half of index is looked up in it with the 4-element lookup<8>.
+static inline Vec8d lookup8(Vec8q const & index, Vec8d const & table) {
+    double scratch[8];
+    table.store(scratch);
+    Vec4d const lo = lookup<8>(index.get_low(),  scratch);
+    Vec4d const hi = lookup<8>(index.get_high(), scratch);
+    return Vec8d(lo, hi);
+}
+
+// lookup<n>: use the 8 integers in index to pick elements out of an array
+// of doubles; n is the compile-time table size.
+//   n <= 0 : returns 0
+//   n <= 4 : loads 4 doubles from table and uses lookup4 on each index half
+//   n <= 8 : loads 8 doubles from table and uses lookup8
+//   n > 8  : indexes are limited to the table (modulo n when n is a power
+//            of 2, otherwise clamped to n-1) and elements are read one by one
+template <int n>
+static inline Vec8d lookup(Vec8q const & index, double const * table) {
+    if (n <= 0) {
+        return 0;
+    }
+    if (n <= 4) {
+        Vec4d const small_table = Vec4d().load(table);
+        Vec4d const lo = lookup4 (index.get_low(),  small_table);
+        Vec4d const hi = lookup4 (index.get_high(), small_table);
+        return Vec8d(lo, hi);
+    }
+    if (n <= 8) {
+        return lookup8(index, Vec8d().load(table));
+    }
+    // n > 8: bring every index into range before touching memory
+    Vec8uq bounded;
+    if ((n & (n-1)) == 0) {
+        bounded = Vec8uq(index) & (n-1);        // power of 2: index modulo n
+    }
+    else {
+        bounded = min(Vec8uq(index), n-1);      // otherwise: limit to n-1
+    }
+    return Vec8d(
+        table[bounded[0]], table[bounded[1]], table[bounded[2]], table[bounded[3]],
+        table[bounded[4]], table[bounded[5]], table[bounded[6]], table[bounded[7]]);
+}
+
+/*****************************************************************************
+*
+*          Gather functions with fixed indexes
+*
+*****************************************************************************/
+// Load 16 floats from array a: result element k is ((float const *)a)[ik]. All indexes are compile-time constants
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
+int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16f gather16f(void const * a) {
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15)>=0> Negative_array_index;  // Error message if index is negative (the OR of all indexes is negative if any one of them is)
+    // find smallest (imin) and biggest (imax) index by pairwise reduction, using only compile-time constant expressions
+    const int i01min   = i0  < i1  ? i0  : i1;
+    const int i23min   = i2  < i3  ? i2  : i3;
+    const int i45min   = i4  < i5  ? i4  : i5;
+    const int i67min   = i6  < i7  ? i6  : i7;
+    const int i89min   = i8  < i9  ? i8  : i9;
+    const int i1011min = i10 < i11 ? i10 : i11;
+    const int i1213min = i12 < i13 ? i12 : i13;
+    const int i1415min = i14 < i15 ? i14 : i15;
+    const int i0_3min   = i01min   < i23min    ? i01min   : i23min;
+    const int i4_7min   = i45min   < i67min    ? i45min   : i67min;
+    const int i8_11min  = i89min   < i1011min  ? i89min   : i1011min;
+    const int i12_15min = i1213min < i1415min  ? i1213min : i1415min;
+    const int i0_7min   = i0_3min  < i4_7min   ? i0_3min  : i4_7min;
+    const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
+    const int imin      = i0_7min  < i8_15min  ? i0_7min  : i8_15min;   // smallest of the 16 indexes
+    const int i01max   = i0  > i1  ? i0  : i1;
+    const int i23max   = i2  > i3  ? i2  : i3;
+    const int i45max   = i4  > i5  ? i4  : i5;
+    const int i67max   = i6  > i7  ? i6  : i7;
+    const int i89max   = i8  > i9  ? i8  : i9;
+    const int i1011max = i10 > i11 ? i10 : i11;
+    const int i1213max = i12 > i13 ? i12 : i13;
+    const int i1415max = i14 > i15 ? i14 : i15;
+    const int i0_3max   = i01max   > i23max    ? i01max   : i23max;
+    const int i4_7max   = i45max   > i67max    ? i45max   : i67max;
+    const int i8_11max  = i89max   > i1011max  ? i89max   : i1011max;
+    const int i12_15max = i1213max > i1415max  ? i1213max : i1415max;
+    const int i0_7max   = i0_3max  > i4_7max   ? i0_3max  : i4_7max;
+    const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
+    const int imax      = i0_7max  > i8_15max  ? i0_7max  : i8_15max;   // biggest of the 16 indexes
+    if (imax - imin <= 15) {
+        // all indexes lie within one window of 16 elements: load one contiguous block and permute
+        if (imax > 15) {
+            // make sure we don't read past the end of the array: load the 16 elements that end at imax
+            Vec16f b = Vec16f().load((float const *)a + imax-15);
+            return permute16f<i0-imax+15, i1-imax+15, i2-imax+15, i3-imax+15, i4-imax+15, i5-imax+15, i6-imax+15, i7-imax+15,
+                i8-imax+15, i9-imax+15, i10-imax+15, i11-imax+15, i12-imax+15, i13-imax+15, i14-imax+15, i15-imax+15> (b);
+        }
+        else {
+            Vec16f b = Vec16f().load((float const *)a + imin);   // load the 16 elements that start at imin
+            return permute16f<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin,
+                i8-imin, i9-imin, i10-imin, i11-imin, i12-imin, i13-imin, i14-imin, i15-imin> (b);
+        }
+    }
+    if ((i0<imin+16  || i0>imax-16)  && (i1<imin+16  || i1>imax-16)  && (i2<imin+16  || i2>imax-16)  && (i3<imin+16  || i3>imax-16)
+    &&  (i4<imin+16  || i4>imax-16)  && (i5<imin+16  || i5>imax-16)  && (i6<imin+16  || i6>imax-16)  && (i7<imin+16  || i7>imax-16)    
+    &&  (i8<imin+16  || i8>imax-16)  && (i9<imin+16  || i9>imax-16)  && (i10<imin+16 || i10>imax-16) && (i11<imin+16 || i11>imax-16)
+    &&  (i12<imin+16 || i12>imax-16) && (i13<imin+16 || i13>imax-16) && (i14<imin+16 || i14>imax-16) && (i15<imin+16 || i15>imax-16) ) {
+        // every index is in the 16 elements starting at imin or in the 16 elements ending at imax: load these two contiguous blocks and blend
+        Vec16f b = Vec16f().load((float const *)a + imin);
+        Vec16f c = Vec16f().load((float const *)a + imax-15);
+        const int j0  = i0 <imin+16 ? i0 -imin : 31-imax+i0;    // blend index: 0..15 selects from b, 16..31 selects from c
+        const int j1  = i1 <imin+16 ? i1 -imin : 31-imax+i1;
+        const int j2  = i2 <imin+16 ? i2 -imin : 31-imax+i2;
+        const int j3  = i3 <imin+16 ? i3 -imin : 31-imax+i3;
+        const int j4  = i4 <imin+16 ? i4 -imin : 31-imax+i4;
+        const int j5  = i5 <imin+16 ? i5 -imin : 31-imax+i5;
+        const int j6  = i6 <imin+16 ? i6 -imin : 31-imax+i6;
+        const int j7  = i7 <imin+16 ? i7 -imin : 31-imax+i7;
+        const int j8  = i8 <imin+16 ? i8 -imin : 31-imax+i8;
+        const int j9  = i9 <imin+16 ? i9 -imin : 31-imax+i9;
+        const int j10 = i10<imin+16 ? i10-imin : 31-imax+i10;
+        const int j11 = i11<imin+16 ? i11-imin : 31-imax+i11;
+        const int j12 = i12<imin+16 ? i12-imin : 31-imax+i12;
+        const int j13 = i13<imin+16 ? i13-imin : 31-imax+i13;
+        const int j14 = i14<imin+16 ? i14-imin : 31-imax+i14;
+        const int j15 = i15<imin+16 ? i15-imin : 31-imax+i15;
+        return blend16f<j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15>(b, c);
+    }
+    // the indexes are too scattered for the two cases above: use lookup function with a table of imax+1 elements
+    return lookup<imax+1>(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), (const float *)a);
+}
+
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d gather8d(void const * a) {
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index;  // Error message if index is negative
+    // Load 8 doubles: result element k is ((double const *)a)[ik]. First find the smallest (imin) and biggest (imax) index at compile time
+    const int i01min = i0 < i1 ? i0 : i1;
+    const int i23min = i2 < i3 ? i2 : i3;
+    const int i45min = i4 < i5 ? i4 : i5;
+    const int i67min = i6 < i7 ? i6 : i7;
+    const int i0123min = i01min < i23min ? i01min : i23min;
+    const int i4567min = i45min < i67min ? i45min : i67min;
+    const int imin = i0123min < i4567min ? i0123min : i4567min;   // smallest of the 8 indexes
+    const int i01max = i0 > i1 ? i0 : i1;
+    const int i23max = i2 > i3 ? i2 : i3;
+    const int i45max = i4 > i5 ? i4 : i5;
+    const int i67max = i6 > i7 ? i6 : i7;
+    const int i0123max = i01max > i23max ? i01max : i23max;
+    const int i4567max = i45max > i67max ? i45max : i67max;
+    const int imax = i0123max > i4567max ? i0123max : i4567max;   // biggest of the 8 indexes
+    if (imax - imin <= 7) {
+        // all indexes lie within one window of 8 elements: load one contiguous block and permute
+        if (imax > 7) {
+            // make sure we don't read past the end of the array: load the 8 elements that end at imax
+            Vec8d b = Vec8d().load((double const *)a + imax-7);
+            return permute8d<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7> (b);
+        }
+        else {
+            Vec8d b = Vec8d().load((double const *)a + imin);   // load the 8 elements that start at imin
+            return permute8d<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin> (b);
+        }
+    }
+    if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
+    &&  (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
+        // every index is in the 8 elements starting at imin or in the 8 elements ending at imax: load these two contiguous blocks and blend
+        Vec8d b = Vec8d().load((double const *)a + imin);
+        Vec8d c = Vec8d().load((double const *)a + imax-7);
+        const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;   // blend index: 0..7 selects from b, 8..15 selects from c
+        const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
+        const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
+        const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
+        const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
+        const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
+        const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
+        const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
+        return blend8d<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+    // the indexes are too scattered for the two cases above: use lookup function with a table of imax+1 elements
+    return lookup<imax+1>(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), (const double *)a);
+}
+
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec16fb const & x) {
+    int const lo = horizontal_find_first(x.get_low());     // search the lower half first
+    if (lo >= 0) return lo;
+    int const hi = horizontal_find_first(x.get_high());    // then the upper half
+    return hi < 0 ? hi                                     // not found: pass the negative result on
+                  : hi + 8;                                // found: the upper half starts at element 8
+}
+
+static inline int horizontal_find_first(Vec8db const & x) {
+    int const lo = horizontal_find_first(x.get_low());     // search the lower half first
+    if (lo >= 0) return lo;
+    int const hi = horizontal_find_first(x.get_high());    // then the upper half
+    return hi < 0 ? hi                                     // not found: pass the negative result on
+                  : hi + 4;                                // found: the upper half starts at element 4
+}
+
+// Number of true elements in x: the sum of the counts of its two halves
+static inline uint32_t horizontal_count(Vec16fb const & x) {
+    return horizontal_count(x.get_high()) + horizontal_count(x.get_low());
+}
+
+static inline uint32_t horizontal_count(Vec8db const & x) {
+    return horizontal_count(x.get_high()) + horizontal_count(x.get_low());  // add the counts of the two halves
+}
+
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+static inline uint16_t to_bits(Vec16fb const & x) {        // to_bits: convert boolean vector to integer bitfield
+    Vec16ib const retyped(x);                              // re-type as Vec16ib and reuse its to_bits
+    return to_bits(retyped);
+}
+
+// to_Vec16fb: convert a 16-bit integer bitfield to a boolean vector (the bit-to-element mapping is that of to_Vec16ib)
+static inline Vec16fb to_Vec16fb(uint16_t x) {
+    return Vec16fb(to_Vec16ib(x));                         // expand with to_Vec16ib, then convert the result to Vec16fb
+}
+
+static inline uint8_t to_bits(Vec8db const & x) {          // to_bits: convert boolean vector to integer bitfield
+    Vec8qb const retyped(x);                               // re-type as Vec8qb and reuse its to_bits
+    return to_bits(retyped);
+}
+
+// to_Vec8db: convert an 8-bit integer bitfield to a boolean vector (the bit-to-element mapping is that of to_Vec8qb)
+static inline Vec8db to_Vec8db(uint8_t x) {
+    return Vec8db(to_Vec8qb(x));                           // expand with to_Vec8qb, then convert the result to Vec8db
+}
+
+#endif // VECTORF512_H
diff --git a/vectorclass/vectori128.h b/vectorclass/vectori128.h
new file mode 100755
index 0000000..8ec5df0
--- /dev/null
+++ b/vectorclass/vectori128.h
@@ -0,0 +1,6146 @@
+/****************************  vectori128.h   *******************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2014-10-24
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining integer vector classes as interface to intrinsic 
+* functions in x86 microprocessors with SSE2 and later instruction sets
+* up to AVX.
+*
+* Instructions:
+* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired 
+* instruction set, which must be at least SSE2. Specify the supported 
+* instruction set by a command line define, e.g. __SSE4_1__ if the 
+* compiler does not automatically do so.
+*
+* The following vector classes are defined here:
+* Vec128b   Vector of 128  1-bit unsigned  integers or Booleans
+* Vec16c    Vector of  16  8-bit signed    integers
+* Vec16uc   Vector of  16  8-bit unsigned  integers
+* Vec16cb   Vector of  16  Booleans for use with Vec16c and Vec16uc
+* Vec8s     Vector of   8  16-bit signed   integers
+* Vec8us    Vector of   8  16-bit unsigned integers
+* Vec8sb    Vector of   8  Booleans for use with Vec8s and Vec8us
+* Vec4i     Vector of   4  32-bit signed   integers
+* Vec4ui    Vector of   4  32-bit unsigned integers
+* Vec4ib    Vector of   4  Booleans for use with Vec4i and Vec4ui
+* Vec2q     Vector of   2  64-bit signed   integers
+* Vec2uq    Vector of   2  64-bit unsigned integers
+* Vec2qb    Vector of   2  Booleans for use with Vec2q and Vec2uq
+*
+* Each vector object is represented internally in the CPU as a 128-bit register.
+* This header file defines operators and functions for these vectors.
+*
+* For example:
+* Vec4i a(1,2,3,4), b(5,6,7,8), c;
+* c = a + b;     // now c contains (6,8,10,12)
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2012 - 2013 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+#ifndef VECTORI128_H
+#define VECTORI128_H
+
+#include "instrset.h"  // Select supported instruction set
+
+#if INSTRSET < 2   // SSE2 required
+#error Please compile for the SSE2 instruction set or higher
+#endif
+
+
+
+/*****************************************************************************
+*
+*          Vector of 128 1-bit unsigned integers or Booleans
+*
+*****************************************************************************/
+class Vec128b {
+protected:
+    __m128i xmm; // Integer vector
+public:
+    // Default constructor: xmm is left uninitialized
+    Vec128b() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    // removed because of undesired implicit conversions
+    // (was: Vec128b(int i) { xmm = _mm_set1_epi32(-(i & 1)); })
+
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec128b(__m128i const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec128b & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m128i used in intrinsics
+    operator __m128i() const {
+        return xmm;
+    }
+    // Member function to load 16 bytes from array (unaligned). Returns *this
+    Vec128b & load(void const * p) {
+        xmm = _mm_loadu_si128((__m128i const*)p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 16. Returns *this, like load.
+    // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1,
+    // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 16.
+    Vec128b & load_a(void const * p) {
+        xmm = _mm_load_si128((__m128i const*)p);
+        return *this;
+    }
+    // Member function to store 16 bytes into array (unaligned)
+    void store(void * p) const {
+        _mm_storeu_si128((__m128i*)p, xmm);
+    }
+    // Member function to store into array, aligned by 16
+    // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1,
+    // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 16.
+    void store_a(void * p) const {
+        _mm_store_si128((__m128i*)p, xmm);
+    }
+    // Member function to change a single bit. Only the low 7 bits of index and bit 0 of value are used
+    // Note: This function is inefficient. Use load function if changing more than one bit
+    Vec128b const & set_bit(uint32_t index, int value) {
+        static const union {
+            uint64_t i[4];
+            __m128i  x[2];
+        } u = {{1,0,0,1}};                 // 2 vectors with bit 0 and 64 set, respectively
+        int w = (index >> 6) & 1;          // qword index
+        int bi = index & 0x3F;             // bit index within qword w
+        __m128i mask = u.x[w];
+        mask = _mm_sll_epi64(mask,_mm_cvtsi32_si128(bi)); // mask with bit number bi of qword w set
+        if (value & 1) {
+            xmm = _mm_or_si128(mask,xmm);                 // set the bit
+        }
+        else {
+            xmm = _mm_andnot_si128(mask,xmm);             // clear the bit: (~mask) & xmm
+        }
+        return *this;
+    }
+    // Member function to get a single bit (returns 0 or 1). Only the low 7 bits of index are used
+    // Note: This function is inefficient. Use store function if reading more than one bit
+    int get_bit(uint32_t index) const {
+        union {
+            __m128i x;
+            uint8_t i[16];
+        } u;
+        u.x = xmm; 
+        int w = (index >> 3) & 0xF;            // byte index
+        int bi = index & 7;                    // bit index within byte w
+        return (u.i[w] >> bi) & 1;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return get_bit(index) != 0;
+    }
+    static int size() {
+        return 128;                            // number of elements (bits)
+    }
+};
+
+
+// Define operators for this class
+
+// vector operator & : bitwise and of all 128 bits
+static inline Vec128b operator & (Vec128b const & a, Vec128b const & b) {
+    return Vec128b(_mm_and_si128(a, b));
+}
+static inline Vec128b operator && (Vec128b const & a, Vec128b const & b) {
+    return operator & (a, b);                              // same as &
+}
+
+// vector operator | : bitwise or of all 128 bits
+static inline Vec128b operator | (Vec128b const & a, Vec128b const & b) {
+    return Vec128b(_mm_or_si128(a, b));
+}
+static inline Vec128b operator || (Vec128b const & a, Vec128b const & b) {
+    return operator | (a, b);                              // same as |
+}
+
+// vector operator ^ : bitwise exclusive or of all 128 bits
+static inline Vec128b operator ^ (Vec128b const & a, Vec128b const & b) {
+    return Vec128b(_mm_xor_si128(a, b));
+}
+
+// vector operator ~ : bitwise not, computed as xor with an all-ones vector
+static inline Vec128b operator ~ (Vec128b const & a) {
+    return Vec128b(_mm_xor_si128(_mm_set1_epi32(-1), a));
+}
+
+// vector operator &= : bitwise and, result stored in a
+static inline Vec128b & operator &= (Vec128b & a, Vec128b const & b) {
+    // assign the combined value and return the left operand by reference
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, result stored in a
+static inline Vec128b & operator |= (Vec128b & a, Vec128b const & b) {
+    // assign the combined value and return the left operand by reference
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec128b & operator ^= (Vec128b & a, Vec128b const & b) {
+    // assign the combined value and return the left operand by reference
+    return a = a ^ b;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b (the bits of a that are not set in b)
+static inline Vec128b andnot (Vec128b const & a, Vec128b const & b) {
+    return Vec128b(_mm_andnot_si128(b, a));                // the intrinsic inverts its first operand
+}
+
+
+/*****************************************************************************
+*
+*          Generate compile-time constant vector
+*
+*****************************************************************************/
+// Build a vector from four integers known at compile time; the value is kept in
+// a static constant in memory. The result can be converted to any integer vector type
+template <int i0, int i1, int i2, int i3>
+static inline __m128i constant4i() {
+    static const union {
+        int     words[4];                                  // the initialized member: i0 is element 0
+        __m128i vec;                                       // the same 16 bytes seen as a vector
+    } packed = {{i0,i1,i2,i3}};
+    return packed.vec;
+}
+
+
+/*****************************************************************************
+*
+*          selectb function
+*
+*****************************************************************************/
+// Select between two sources, byte by byte. Used in various functions and operators.
+// In pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Every byte of s must be either 0 (false) or 0xFF (true); other values are not allowed,
+// because the two implementations below treat them differently:
+// with SSE4.1 only bit 7 of each byte of s is checked,
+// without it every bit of s selects the corresponding bit of a or b.
+static inline __m128i selectb (__m128i const & s, __m128i const & a, __m128i const & b) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    return _mm_blendv_epi8 (b, a, s);
+#else
+    __m128i const from_a = _mm_and_si128(s,a);             // bits of a where s is 1
+    __m128i const from_b = _mm_andnot_si128(s,b);          // bits of b where s is 0
+    return _mm_or_si128(from_a, from_b);
+#endif
+}
+
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and. Returns true if all 128 bits are 1
+static inline bool horizontal_and (Vec128b const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported. Use PTEST against an all-ones constant
+    return 0 != _mm_testc_si128(a,constant4i<-1,-1,-1,-1>());
+#else
+    __m128i const upper  = _mm_unpackhi_epi64(a,a);        // upper 64 bits copied down
+    __m128i const both64 = _mm_and_si128(a,upper);         // low 64 bits = lower half & upper half
+#ifdef __x86_64__
+    int64_t const folded = _mm_cvtsi128_si64(both64);      // transfer the low 64 bits to integer
+    return  int64_t(-1) == folded;
+#else
+    __m128i const shifted = _mm_srli_epi64(both64,32);     // bits 32-63 moved down
+    __m128i const both32  = _mm_and_si128(both64,shifted); // low 32 bits = and of all four dwords
+    int     const folded  = _mm_cvtsi128_si32(both32);     // transfer the low 32 bits to integer
+    return  -1 == folded;
+#endif  // __x86_64__
+#endif  // INSTRSET
+}
+
+// horizontal_or. Returns true if at least one of the 128 bits is 1
+static inline bool horizontal_or (Vec128b const & a) {
+#if INSTRSET >= 5   // SSE4.1 supported. Use PTEST: testz gives 1 when (a & a) is all zero
+    return ! _mm_testz_si128(a,a);
+#else
+    __m128i const upper  = _mm_unpackhi_epi64(a,a);        // upper 64 bits copied down
+    __m128i const any64  = _mm_or_si128(a,upper);          // low 64 bits = lower half | upper half
+#ifdef __x86_64__
+    int64_t const folded = _mm_cvtsi128_si64(any64);       // transfer the low 64 bits to integer
+    return  int64_t(0) != folded;
+#else
+    __m128i const shifted = _mm_srli_epi64(any64,32);      // bits 32-63 moved down
+    __m128i const any32   = _mm_or_si128(any64,shifted);   // low 32 bits = or of all four dwords
+    int     const folded  = _mm_cvtsi128_si32(any32);      // transfer the low 32 bits to integer
+    return  0 != folded;
+#endif  // __x86_64__
+#endif  // INSTRSET
+}
+
+
+
+/*****************************************************************************
+*
+*          Vector of 16 8-bit signed integers
+*
+*****************************************************************************/
+
+class Vec16c : public Vec128b {
+public:
+    // Default constructor: the elements are left uninitialized
+    Vec16c() {
+    }
+    // Constructor to broadcast the same value into all elements (i is converted to char):
+    Vec16c(int i) {
+        xmm = _mm_set1_epi8((char)i);
+    }
+    // Constructor to build from all elements; i0 becomes element 0:
+    Vec16c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7,
+        int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15) {
+        xmm = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+    }
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec16c(__m128i const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec16c & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m128i used in intrinsics
+    operator __m128i() const {
+        return xmm;
+    }
+    // Member function to load 16 elements from array (unaligned). Returns *this
+    Vec16c & load(void const * p) {
+        xmm = _mm_loadu_si128((__m128i const*)p);
+        return *this;
+    }
+    // Member function to load 16 elements from array (aligned by 16). Returns *this
+    Vec16c & load_a(void const * p) {
+        xmm = _mm_load_si128((__m128i const*)p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0. Returns *this
+    Vec16c & load_partial(int n, void const * p) {
+        if      (n >= 16) load(p);
+        else if (n <= 0)  *this = 0;
+        else if (((int)(intptr_t)p & 0xFFF) < 0xFF0) {
+            // p is at least 16 bytes from a 4096-byte page boundary. OK to read 16 bytes
+            load(p);
+        }
+        else {
+            // worst case. read 1 byte at a time and suffer store forwarding penalty
+            char x[16] = {0};                              // zeroed, so that no indeterminate bytes are loaded
+            for (int i = 0; i < n; i++) x[i] = ((char const *)p)[i];
+            load(x);
+        }
+        cutoff(n);                                         // zero the elements from n and up
+        return *this;
+    }
+    // Partial store. Store n elements; nothing is written if n <= 0
+    void store_partial(int n, void * p) const {
+        if (n >= 16) {
+            store(p);
+            return;
+        }
+        if (n <= 0) return;
+        // we are not using _mm_maskmoveu_si128 because it is too slow on many processors
+        union {        
+            int8_t  c[16];
+            int16_t s[8];
+            int32_t i[4];
+            int64_t q[2];
+        } u;
+        store(u.c);
+        int j = 0;                                         // number of bytes written so far
+        if (n & 8) {                                       // write the n bytes in pieces of 8, 4, 2 and 1
+            *(int64_t*)p = u.q[0];
+            j += 8;
+        }
+        if (n & 4) {
+            ((int32_t*)p)[j/4] = u.i[j/4];
+            j += 4;
+        }
+        if (n & 2) {
+            ((int16_t*)p)[j/2] = u.s[j/2];
+            j += 2;
+        }
+        if (n & 1) {
+            ((int8_t*)p)[j]    = u.c[j];
+        }
+    }
+    // cut off vector to n elements. The last 16-n elements are set to zero
+    Vec16c & cutoff(int n) {
+        if (uint32_t(n) >= 16) return *this;               // n >= 16 or n < 0: nothing is changed
+        static const int8_t mask[32] = {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+        *this &= Vec16c().load(mask+16-n);                 // the 16 bytes from mask+16-n are n times -1 followed by zeros
+        return *this;
+    }
+    // Member function to change a single element in vector. Only the low 4 bits of index are used
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec16c const & insert(uint32_t index, int8_t value) {
+        static const int8_t maskl[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            -1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+        __m128i broad = _mm_set1_epi8(value);  // broadcast value into all elements
+        __m128i mask  = _mm_loadu_si128((__m128i const*)(maskl+16-(index & 0x0F))); // mask with FF at index position
+        xmm = selectb(mask,broad,xmm);
+        return *this;
+    }
+    // Member function extract a single element from vector. Only the low 4 bits of index are used
+    int8_t extract(uint32_t index) const {
+        int8_t x[16];
+        store(x);
+        return x[index & 0x0F];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int8_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    static int size() {
+        return 16;                             // number of elements
+    }
+};
+
+/*****************************************************************************
+*
+*          Vec16cb: Vector of 16 Booleans for use with Vec16c and Vec16uc
+*
+*****************************************************************************/
+
+class Vec16cb : public Vec16c {
+public:
+    // Default constructor: the elements are left uninitialized
+    Vec16cb() {}
+    // Constructor to build from all elements. true is stored as -1, false as 0:
+    Vec16cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
+        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15)
+        : Vec16c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), 
+            -int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15)) {
+    }
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec16cb(__m128i const & x)
+        : Vec16c(x) {
+    }
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec16cb & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Constructor to broadcast scalar value (true gives -1 in every element, false gives 0):
+    Vec16cb(bool b) : Vec16c(-int8_t(b)) {
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec16cb & operator = (bool b) {
+        // build the broadcast vector, copy it into this object and return this object
+        return *this = Vec16cb(b);
+    }
+private: // Prevent constructing from int, etc.
+    Vec16cb(int b);
+    Vec16cb & operator = (int x);
+public:
+    Vec16cb & insert (int index, bool a) {
+        Vec16c::insert(index, -(int)a);                    // store -1 for true, 0 for false
+        return *this;
+    }
+    // Member function extract a single element from vector: true if the stored element is not 0
+    bool extract(uint32_t index) const {
+        return 0 != Vec16c::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec16cb
+*
+*****************************************************************************/
+
+// vector operator & : and of two boolean vectors
+static inline Vec16cb operator & (Vec16cb const & a, Vec16cb const & b) {
+    Vec128b const bits = Vec128b(a) & Vec128b(b);          // do the work on the raw 128 bits
+    return Vec16cb(bits);
+}
+static inline Vec16cb operator && (Vec16cb const & a, Vec16cb const & b) {
+    return operator & (a, b);                              // same as &
+}
+// vector operator &= : and, result stored in a
+static inline Vec16cb & operator &= (Vec16cb & a, Vec16cb const & b) {
+    return a = a & b;
+}
+
+// vector operator | : or of two boolean vectors
+static inline Vec16cb operator | (Vec16cb const & a, Vec16cb const & b) {
+    Vec128b const bits = Vec128b(a) | Vec128b(b);          // do the work on the raw 128 bits
+    return Vec16cb(bits);
+}
+static inline Vec16cb operator || (Vec16cb const & a, Vec16cb const & b) {
+    return operator | (a, b);                              // same as |
+}
+// vector operator |= : or, result stored in a
+static inline Vec16cb & operator |= (Vec16cb & a, Vec16cb const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : exclusive or of two boolean vectors
+static inline Vec16cb operator ^ (Vec16cb const & a, Vec16cb const & b) {
+    Vec128b const bits = Vec128b(a) ^ Vec128b(b);          // do the work on the raw 128 bits
+    return Vec16cb(bits);
+}
+// vector operator ^= : exclusive or, result stored in a
+static inline Vec16cb & operator ^= (Vec16cb & a, Vec16cb const & b) {
+    return a = a ^ b;
+}
+
+static inline Vec16cb operator ~ (Vec16cb const & a) {     // vector operator ~ : bitwise not
+    Vec128b const inverted = ~ Vec128b(a);                 // flip all 128 bits
+    return Vec16cb(inverted);
+}
+
+// vector operator ! : element not. Implemented as the bitwise not of the whole vector
+static inline Vec16cb operator ! (Vec16cb const & a) {
+    return operator ~ (a);
+}
+
+static inline Vec16cb andnot (Vec16cb const & a, Vec16cb const & b) {  // vector function andnot: a & ~ b
+    Vec128b const kept = andnot(Vec128b(a), Vec128b(b));   // do the work on the raw 128 bits
+    return Vec16cb(kept);
+}
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec16c
+*
+*****************************************************************************/
+
+// vector operator + : add element by element (8-bit add without saturation)
+static inline Vec16c operator + (Vec16c const & a, Vec16c const & b) {
+    return Vec16c(_mm_add_epi8(a, b));
+}
+
+// vector operator += : add, result stored in a
+static inline Vec16c & operator += (Vec16c & a, Vec16c const & b) {
+    // assign the sum and return the left operand by reference
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1 to every element, return the value from before the increment
+static inline Vec16c operator ++ (Vec16c & a, int) {
+    Vec16c previous(a);                                    // copy to return
+    a = a + 1;
+    return previous;
+}
+
+// prefix operator ++ : add 1 to every element
+static inline Vec16c & operator ++ (Vec16c & a) {
+    // assign the incremented value and return the operand by reference
+    return a = a + 1;
+}
+
+// vector operator - : subtract element by element (8-bit subtract without saturation)
+static inline Vec16c operator - (Vec16c const & a, Vec16c const & b) {
+    return Vec16c(_mm_sub_epi8(a, b));
+}
+
+// vector operator - : unary minus, computed as 0 - a
+static inline Vec16c operator - (Vec16c const & a) {
+    return Vec16c(_mm_sub_epi8(_mm_setzero_si128(), a));
+}
+
+// vector operator -= : subtract, result stored in a
+static inline Vec16c & operator -= (Vec16c & a, Vec16c const & b) {
+    // assign the difference and return the left operand by reference
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1 from every element, return the value from before the decrement
+static inline Vec16c operator -- (Vec16c & a, int) {
+    Vec16c previous(a);                                    // copy to return
+    a = a - 1;
+    return previous;
+}
+
+// prefix operator -- : subtract 1 from every element
+static inline Vec16c & operator -- (Vec16c & a) {
+    // assign the decremented value and return the operand by reference
+    return a = a - 1;
+}
+
+// vector operator * : multiply element by element
+static inline Vec16c operator * (Vec16c const & a, Vec16c const & b) {
+    // SSE2 has no 8-bit multiply, so the bytes are multiplied as two sets of 16-bit lanes
+    __m128i const a_hi    = _mm_srli_epi16(a,8);           // odd numbered elements of a, moved down
+    __m128i const b_hi    = _mm_srli_epi16(b,8);           // odd numbered elements of b, moved down
+    __m128i const prod_hi = _mm_mullo_epi16(a_hi,b_hi);    // product of odd numbered elements
+    __m128i const odd     = _mm_slli_epi16(prod_hi,8);     // put odd numbered elements back in place
+    __m128i const even    = _mm_mullo_epi16(a,b);          // low byte of each lane = product of even numbered elements
+    __m128i const pick    = _mm_set1_epi32(0x00FF00FF);    // mask for even positions
+    // interleave: even positions from 'even', odd positions from 'odd'
+    return selectb(pick,even,odd);
+}
+
+// vector operator *= : multiply, result stored in a
+static inline Vec16c & operator *= (Vec16c & a, Vec16c const & b) {
+    // assign the product and return the left operand by reference
+    return a = a * b;
+}
+
+// vector operator << : shift left all elements by b bits. A shift count outside 0..7 gives all zeros
+static inline Vec16c operator << (Vec16c const & a, int b) {
+    // mask of the bits that stay inside their own byte. The count is checked first because shifting a
+    // 32-bit value by 32 or more is undefined, and a negative b becomes such a count after the cast
+    uint32_t mask = (uint32_t)b < 8 ? (uint32_t)0xFF >> (uint32_t)b : 0;
+    __m128i am    = _mm_and_si128(a,_mm_set1_epi8((char)mask));  // remove bits that will overflow
+    return _mm_sll_epi16(am,_mm_cvtsi32_si128(b));         // 16-bit shifts
+}
+
+// vector operator <<= : shift left, result stored in a
+static inline Vec16c & operator <<= (Vec16c & a, int b) {
+    // assign the shifted value and return the left operand by reference
+    return a = a << b;
+}
+
+// vector operator >> : shift right arithmetic all elements
+static inline Vec16c operator >> (Vec16c const & a, int b) {
+    // the even and the odd numbered elements are shifted separately, as 16-bit lanes
+    __m128i const even_up   = _mm_slli_epi16(a,8);                            // even numbered elements of a. get sign bit in position
+    __m128i const even_done = _mm_sra_epi16(even_up,_mm_cvtsi32_si128(b+8));  // shift arithmetic, back to position
+    __m128i const odd_done  = _mm_sra_epi16(a,_mm_cvtsi32_si128(b));          // shift odd numbered elements arithmetic
+    __m128i const pick      = _mm_set1_epi32(0x00FF00FF);                     // mask for even positions
+    return selectb(pick,even_done,odd_done);                                  // interleave even and odd
+}
+
+// vector operator >>= : shift right arithmetic, result stored in a
+static inline Vec16c & operator >>= (Vec16c & a, int b) {
+    // assign the shifted value and return the left operand by reference
+    return a = a >> b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16cb operator == (Vec16c const & a, Vec16c const & b) {
+    return Vec16cb(_mm_cmpeq_epi8(a,b));
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16cb operator != (Vec16c const & a, Vec16c const & b) {
+#ifdef __XOP__  // AMD XOP instruction set: direct compare
+    return Vec16cb(_mm_comneq_epi8(a,b));
+#else  // SSE2 instruction set: invert the result of ==
+    return ~ (a == b);
+#endif
+}
+
+// vector operator > : signed element-wise compare; true for elements where a > b
+static inline Vec16cb operator > (Vec16c const & a, Vec16c const & b) {
+    __m128i const greater = _mm_cmpgt_epi8(a,b);
+    return greater;
+}
+
+// vector operator < : signed element-wise compare; true for elements where a < b
+// Implemented as the greater-than compare with the operands swapped
+static inline Vec16cb operator < (Vec16c const & a, Vec16c const & b) {
+    Vec16cb const swapped = (b > a);
+    return swapped;
+}
+
+// vector operator >= : signed element-wise compare; true for elements where a >= b
+static inline Vec16cb operator >= (Vec16c const & a, Vec16c const & b) {
+#ifndef __XOP__  // SSE2 instruction set: a >= b is the complement of b > a
+    Vec16cb const less = (b > a);
+    return Vec16cb(Vec16c(~less));
+#else            // AMD XOP instruction set has a dedicated compare
+    return _mm_comge_epi8(a,b);
+#endif
+}
+
+// vector operator <= : signed element-wise compare; true for elements where a <= b
+// Implemented as the >= compare with the operands swapped
+static inline Vec16cb operator <= (Vec16c const & a, Vec16c const & b) {
+    Vec16cb const swapped = (b >= a);
+    return swapped;
+}
+
+// vector operator & : bitwise and, delegated to the Vec128b operator
+static inline Vec16c operator & (Vec16c const & a, Vec16c const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec16c(lhs & rhs);
+}
+// vector operator && : same as the bitwise operator &
+static inline Vec16c operator && (Vec16c const & a, Vec16c const & b) {
+    Vec16c const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and, result stored in a
+static inline Vec16c & operator &= (Vec16c & a, Vec16c const & b) {
+    return a = (a & b);
+}
+
+// vector operator | : bitwise or, delegated to the Vec128b operator
+static inline Vec16c operator | (Vec16c const & a, Vec16c const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec16c(lhs | rhs);
+}
+// vector operator || : same as the bitwise operator |
+static inline Vec16c operator || (Vec16c const & a, Vec16c const & b) {
+    Vec16c const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or, result stored in a
+static inline Vec16c & operator |= (Vec16c & a, Vec16c const & b) {
+    return a = (a | b);
+}
+
+// vector operator ^ : bitwise exclusive or, delegated to the Vec128b operator
+static inline Vec16c operator ^ (Vec16c const & a, Vec16c const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec16c(lhs ^ rhs);
+}
+// vector operator ^= : bitwise exclusive or, result stored in a
+static inline Vec16c & operator ^= (Vec16c & a, Vec16c const & b) {
+    return a = (a ^ b);
+}
+
+// vector operator ~ : bitwise complement, delegated to the Vec128b operator
+static inline Vec16c operator ~ (Vec16c const & a) {
+    Vec128b const bits(a);
+    return Vec16c(~bits);
+}
+
+// vector operator ! : logical not; true for elements that are equal to zero
+static inline Vec16cb operator ! (Vec16c const & a) {
+    __m128i const zero   = _mm_setzero_si128();
+    __m128i const iszero = _mm_cmpeq_epi8(a,zero);
+    return iszero;
+}
+
+// Functions for this class
+
+// Element-wise choice between two operands, equivalent to:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Every byte of s has to be 0 (false) or -1 (true); other values are not allowed.
+static inline Vec16c select (Vec16cb const & s, Vec16c const & a, Vec16c const & b) {
+    __m128i const picked = selectb(s,a,b);
+    return picked;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+static inline Vec16c if_add (Vec16cb const & f, Vec16c const & a, Vec16c const & b) {
+    Vec16c const addend = Vec16c(f) & b;     // b where f is true, zero elsewhere
+    return a + addend;
+}
+
+// Horizontal add: returns the sum of all vector elements.
+// The sum is truncated to 8 bits, so overflow wraps around
+static inline int32_t horizontal_add (Vec16c const & a) {
+    __m128i const halves = _mm_sad_epu8(a,_mm_setzero_si128());  // one partial sum per 64-bit half
+    __m128i const upper  = _mm_shuffle_epi32(halves,2);          // bring the upper partial sum down
+    __m128i const total  = _mm_add_epi16(halves,upper);
+    return (int8_t)_mm_cvtsi128_si32(total);                     // keep 8 bits, sign extend to 32 bits
+}
+
+// Horizontal add extended: returns the sum of all vector elements.
+// Elements are sign-extended before they are added, which avoids overflow
+static inline int32_t horizontal_add_x (Vec16c const & a) {
+#ifdef __XOP__       // AMD XOP instruction set
+    __m128i const quads = _mm_haddq_epi8(a);
+    __m128i const upper = _mm_shuffle_epi32(quads,0x0E);         // high element
+    __m128i const total = _mm_add_epi32(quads,upper);            // sum
+    return _mm_cvtsi128_si32(total);
+#else
+    // Widen to 16 bits: even bytes are shifted up and back down arithmetically,
+    // odd bytes only need the arithmetic shift down.
+    __m128i const even16 = _mm_srai_epi16(_mm_slli_epi16(a,8),8);
+    __m128i const odd16  = _mm_srai_epi16(a,8);
+    __m128i const pairs  = _mm_add_epi16(even16,odd16);          // 8 sums of two elements each
+#if INSTRSET >= 4     // SSSE3: three horizontal add steps
+    __m128i const s4 = _mm_hadd_epi16(pairs,pairs);
+    __m128i const s2 = _mm_hadd_epi16(s4,s4);
+    __m128i const s1 = _mm_hadd_epi16(s2,s2);
+#else                 // SSE2: fold the upper part onto the lower part three times
+    __m128i const s4 = _mm_add_epi16(pairs,_mm_shuffle_epi32(pairs,0x0E));
+    __m128i const s2 = _mm_add_epi16(s4,_mm_shuffle_epi32(s4,0x01));
+    __m128i const s1 = _mm_add_epi16(s2,_mm_shufflelo_epi16(s2,0x01));
+#endif
+    return (int16_t)_mm_cvtsi128_si32(s1);                       // 16 bit sum, sign extend to 32 bits
+#endif
+}
+
+
+// function add_saturated: signed element-wise addition with saturation
+static inline Vec16c add_saturated(Vec16c const & a, Vec16c const & b) {
+    __m128i const sum = _mm_adds_epi8(a, b);
+    return sum;
+}
+
+// function sub_saturated: signed element-wise subtraction with saturation
+static inline Vec16c sub_saturated(Vec16c const & a, Vec16c const & b) {
+    __m128i const difference = _mm_subs_epi8(a, b);
+    return difference;
+}
+
+// function max: element-wise signed maximum, a > b ? a : b
+static inline Vec16c max(Vec16c const & a, Vec16c const & b) {
+#if INSTRSET < 5    // SSE2: use the unsigned byte maximum on biased values
+    __m128i const bias     = _mm_set1_epi32(0x80808080);
+    __m128i const biased_a = _mm_xor_si128(a,bias);              // flip sign bit = add 0x80
+    __m128i const biased_b = _mm_xor_si128(b,bias);
+    __m128i const biggest  = _mm_max_epu8(biased_a,biased_b);
+    return _mm_xor_si128(biggest,bias);                          // remove the bias again
+#else               // SSE4.1 has a signed byte maximum
+    return _mm_max_epi8(a,b);
+#endif
+}
+
+// function min: element-wise signed minimum, a < b ? a : b
+static inline Vec16c min(Vec16c const & a, Vec16c const & b) {
+#if INSTRSET < 5    // SSE2: use the unsigned byte minimum on biased values
+    __m128i const bias     = _mm_set1_epi32(0x80808080);
+    __m128i const biased_a = _mm_xor_si128(a,bias);              // flip sign bit = add 0x80
+    __m128i const biased_b = _mm_xor_si128(b,bias);
+    __m128i const smallest = _mm_min_epu8(biased_a,biased_b);
+    return _mm_xor_si128(smallest,bias);                         // remove the bias again
+#else               // SSE4.1 has a signed byte minimum
+    return _mm_min_epi8(a,b);
+#endif
+}
+
+// function abs: element-wise absolute value, a >= 0 ? a : -a
+static inline Vec16c abs(Vec16c const & a) {
+#if INSTRSET < 4      // SSE2
+    __m128i const negated = _mm_sub_epi8(_mm_setzero_si128(), a);
+    // seen as unsigned bytes, the negative one of a and -a is the bigger one, so the minimum is the absolute value
+    return _mm_min_epu8(a, negated);
+#else                 // SSSE3 supported
+    return _mm_sign_epi8(a,a);
+#endif
+}
+
+// function abs_saturated: like abs, but saturates instead of overflowing
+static inline Vec16c abs_saturated(Vec16c const & a) {
+    __m128i const magnitude = abs(a);
+    // abs leaves 0x80 negative; the compare yields -1 for exactly those elements
+    __m128i const wrapped   = _mm_cmpgt_epi8(_mm_setzero_si128(),magnitude);
+    return _mm_add_epi8(magnitude,wrapped);                      // adding -1 turns 0x80 into 0x7F
+}
+
+// function rotate_left: rotate every 8-bit element left by b bits.
+// A negative count rotates right
+static inline Vec16c rotate_left(Vec16c const & a, int b) {
+#ifndef __XOP__  // SSE2 instruction set: build the rotation from 16-bit shifts
+    __m128i const lowmask = _mm_set1_epi32(0x00FF00FF);          // selects the even numbered bytes
+    __m128i const up      = _mm_cvtsi32_si128(b & 7);            // b modulo 8
+    __m128i const down    = _mm_cvtsi32_si128((8-b) & 7);        // 8-b modulo 8
+    __m128i const lo      = _mm_and_si128(a,lowmask);            // even numbered bytes of a
+    __m128i const hi      = _mm_andnot_si128(lowmask,a);         // odd numbered bytes of a
+    // rotated byte = (byte << b) | (byte >> (8-b)); bits that land in the other byte are discarded by the final select
+    __m128i const lorot   = _mm_or_si128(_mm_sll_epi16(lo,up),_mm_srl_epi16(lo,down));
+    __m128i const hirot   = _mm_or_si128(_mm_sll_epi16(hi,up),_mm_srl_epi16(hi,down));
+    return selectb(lowmask,lorot,hirot);                         // all bytes rotated
+#else            // AMD XOP instruction set
+    return _mm_rot_epi8(a,_mm_set1_epi8(b));
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 16 8-bit unsigned integers
+*
+*****************************************************************************/
+
+// Vector of 16 unsigned 8-bit integers.
+// Derives from Vec16c and uses the xmm member it inherits; only the constructors,
+// the loaders and the element accessors are redefined here with unsigned types.
+class Vec16uc : public Vec16c {
+public:
+    // Default constructor: xmm is not assigned here
+    Vec16uc() {
+    };
+    // Constructor to broadcast the same value into all elements
+    // (only the low 8 bits of i are used, because of the cast to char):
+    Vec16uc(uint32_t i) {
+        xmm = _mm_set1_epi8((char)i);
+    };
+    // Constructor to build from all elements; i0 becomes element 0:
+    Vec16uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7,
+        uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15) {
+        xmm = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+    };
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec16uc(__m128i const & x) {
+        xmm = x;
+    };
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec16uc & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    };
+    // Member function to load 16 bytes from array (unaligned)
+    Vec16uc & load(void const * p) {
+        xmm = _mm_loadu_si128((__m128i const*)p);
+        return *this;
+    }
+    // Member function to load 16 bytes from array (aligned)
+    // _mm_load_si128 requires p to be aligned to 16 bytes
+    Vec16uc & load_a(void const * p) {
+        xmm = _mm_load_si128((__m128i const*)p);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    // Forwards to Vec16c::insert
+    Vec16uc const & insert(uint32_t index, uint8_t value) {
+        Vec16c::insert(index, value);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Forwards to Vec16c::extract and returns the result as uint8_t
+    uint8_t extract(uint32_t index) const {
+        return Vec16c::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint8_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+// Define operators for this class
+
+// vector operator << : shift every 8-bit element left by b bits
+static inline Vec16uc operator << (Vec16uc const & a, uint32_t b) {
+    // The shift itself runs on 16-bit lanes, so the bits that would spill
+    // into the neighbouring byte are cleared beforehand.
+    uint32_t const keep    = (uint32_t)0xFF >> (uint32_t)b;
+    __m128i  const trimmed = _mm_and_si128(a,_mm_set1_epi8((char)keep));
+    __m128i  const count   = _mm_cvtsi32_si128(b);
+    return _mm_sll_epi16(trimmed,count);
+}
+
+// vector operator << : shift left all elements; forwards to the uint32_t overload
+static inline Vec16uc operator << (Vec16uc const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a << count;
+}
+
+// vector operator >> : logical (zero-filling) right shift of every 8-bit element by b bits
+static inline Vec16uc operator >> (Vec16uc const & a, uint32_t b) {
+    // The shift itself runs on 16-bit lanes, so the bits that would spill
+    // into the neighbouring byte are cleared beforehand.
+    uint32_t const keep    = (uint32_t)0xFF << (uint32_t)b;
+    __m128i  const trimmed = _mm_and_si128(a,_mm_set1_epi8((char)keep));
+    __m128i  const count   = _mm_cvtsi32_si128(b);
+    return _mm_srl_epi16(trimmed,count);
+}
+
+// vector operator >> : shift right logical all elements; forwards to the uint32_t overload
+static inline Vec16uc operator >> (Vec16uc const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a >> count;
+}
+
+// vector operator >>= : logical right shift of all elements of a by b bits, in place
+static inline Vec16uc & operator >>= (Vec16uc & a, int b) {
+    return a = (a >> b);
+}
+
+// vector operator >= : unsigned element-wise compare; true for elements where a >= b
+static inline Vec16cb operator >= (Vec16uc const & a, Vec16uc const & b) {
+#ifndef __XOP__  // SSE2 instruction set: a >= b exactly when a equals max(a,b)
+    __m128i const larger = _mm_max_epu8(a,b);
+    return _mm_cmpeq_epi8(larger,a);
+#else            // AMD XOP instruction set has a dedicated compare
+    return _mm_comge_epu8(a,b);
+#endif
+}
+
+// vector operator <= : unsigned element-wise compare; true for elements where a <= b
+// Implemented as the >= compare with the operands swapped
+static inline Vec16cb operator <= (Vec16uc const & a, Vec16uc const & b) {
+    Vec16cb const swapped = (b >= a);
+    return swapped;
+}
+
+// vector operator > : unsigned element-wise compare; true for elements where a > b
+static inline Vec16cb operator > (Vec16uc const & a, Vec16uc const & b) {
+#ifndef __XOP__  // SSE2 instruction set: a > b is the complement of b >= a
+    Vec16cb const notgreater = (b >= a);
+    return Vec16cb(Vec16c(~notgreater));
+#else            // AMD XOP instruction set has a dedicated compare
+    return _mm_comgt_epu8(a,b);
+#endif
+}
+
+// vector operator < : unsigned element-wise compare; true for elements where a < b
+// Implemented as the > compare with the operands swapped
+static inline Vec16cb operator < (Vec16uc const & a, Vec16uc const & b) {
+    Vec16cb const swapped = (b > a);
+    return swapped;
+}
+
+// vector operator + : element-wise add, computed with the Vec16c operator
+static inline Vec16uc operator + (Vec16uc const & a, Vec16uc const & b) {
+    Vec16c const sum = Vec16c(a) + Vec16c(b);
+    return Vec16uc (sum);
+}
+
+// vector operator - : element-wise subtract, computed with the Vec16c operator
+static inline Vec16uc operator - (Vec16uc const & a, Vec16uc const & b) {
+    Vec16c const difference = Vec16c(a) - Vec16c(b);
+    return Vec16uc (difference);
+}
+
+// vector operator * : element-wise multiply, computed with the Vec16c operator
+static inline Vec16uc operator * (Vec16uc const & a, Vec16uc const & b) {
+    Vec16c const product = Vec16c(a) * Vec16c(b);
+    return Vec16uc (product);
+}
+
+// vector operator & : bitwise and, delegated to the Vec128b operator
+static inline Vec16uc operator & (Vec16uc const & a, Vec16uc const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec16uc(lhs & rhs);
+}
+// vector operator && : same as the bitwise operator &
+static inline Vec16uc operator && (Vec16uc const & a, Vec16uc const & b) {
+    Vec16uc const both = a & b;
+    return both;
+}
+
+// vector operator | : bitwise or, delegated to the Vec128b operator
+static inline Vec16uc operator | (Vec16uc const & a, Vec16uc const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec16uc(lhs | rhs);
+}
+// vector operator || : same as the bitwise operator |
+static inline Vec16uc operator || (Vec16uc const & a, Vec16uc const & b) {
+    Vec16uc const either = a | b;
+    return either;
+}
+
+// vector operator ^ : bitwise exclusive or, delegated to the Vec128b operator
+static inline Vec16uc operator ^ (Vec16uc const & a, Vec16uc const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec16uc(lhs ^ rhs);
+}
+
+// vector operator ~ : bitwise complement, delegated to the Vec128b operator
+static inline Vec16uc operator ~ (Vec16uc const & a) {
+    Vec128b const bits(a);
+    return Vec16uc(~bits);
+}
+
+// Functions for this class
+
+// Element-wise choice between two operands, equivalent to:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Every byte of s has to be 0 (false) or -1 (true); other values are not allowed.
+// (s is signed)
+static inline Vec16uc select (Vec16cb const & s, Vec16uc const & a, Vec16uc const & b) {
+    __m128i const picked = selectb(s,a,b);
+    return picked;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+static inline Vec16uc if_add (Vec16cb const & f, Vec16uc const & a, Vec16uc const & b) {
+    Vec16uc const addend = Vec16uc(f) & b;   // b where f is true, zero elsewhere
+    return a + addend;
+}
+
+// Horizontal add: returns the sum of all vector elements.
+// The sum is truncated to 16 bits; the largest possible sum of 16 unsigned bytes
+// is 16*255 = 4080, so nothing is lost by the truncation
+// (Note: horizontal_add_x(Vec16uc) is slightly faster)
+static inline uint32_t horizontal_add (Vec16uc const & a) {
+    __m128i const halves = _mm_sad_epu8(a,_mm_setzero_si128());  // one partial sum per 64-bit half
+    __m128i const upper  = _mm_shuffle_epi32(halves,2);          // bring the upper partial sum down
+    __m128i const total  = _mm_add_epi16(halves,upper);
+    return (uint16_t)_mm_cvtsi128_si32(total);                   // keep 16 bits
+}
+
+// Horizontal add extended: returns the sum of all vector elements.
+// Elements are zero-extended before they are added, which avoids overflow
+static inline uint32_t horizontal_add_x (Vec16uc const & a) {
+    __m128i const halves = _mm_sad_epu8(a,_mm_setzero_si128());  // one partial sum per 64-bit half
+    __m128i const upper  = _mm_shuffle_epi32(halves,2);          // bring the upper partial sum down
+    __m128i const total  = _mm_add_epi16(halves,upper);
+    return _mm_cvtsi128_si32(total);
+}
+
+// function add_saturated: unsigned element-wise addition with saturation
+static inline Vec16uc add_saturated(Vec16uc const & a, Vec16uc const & b) {
+    __m128i const sum = _mm_adds_epu8(a, b);
+    return sum;
+}
+
+// function sub_saturated: unsigned element-wise subtraction with saturation
+static inline Vec16uc sub_saturated(Vec16uc const & a, Vec16uc const & b) {
+    __m128i const difference = _mm_subs_epu8(a, b);
+    return difference;
+}
+
+// function max: element-wise unsigned maximum, a > b ? a : b
+static inline Vec16uc max(Vec16uc const & a, Vec16uc const & b) {
+    __m128i const biggest = _mm_max_epu8(a,b);
+    return biggest;
+}
+
+// function min: element-wise unsigned minimum, a < b ? a : b
+static inline Vec16uc min(Vec16uc const & a, Vec16uc const & b) {
+    __m128i const smallest = _mm_min_epu8(a,b);
+    return smallest;
+}
+
+
+    
+/*****************************************************************************
+*
+*          Vector of 8 16-bit signed integers
+*
+*****************************************************************************/
+
+// Vector of 8 signed 16-bit integers.
+// The data lives in the member xmm, which is not declared in this class
+// (presumably inherited from Vec128b, as is the store function used below).
+class Vec8s : public Vec128b {
+public:
+    // Default constructor: xmm is not assigned here
+    Vec8s() {
+    };
+    // Constructor to broadcast the same value into all elements
+    // (i is truncated to 16 bits):
+    Vec8s(int i) {
+        xmm = _mm_set1_epi16((int16_t)i);
+    };
+    // Constructor to build from all elements; i0 becomes element 0:
+    Vec8s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7) {
+        xmm = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7);
+    };
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec8s(__m128i const & x) {
+        xmm = x;
+    };
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec8s & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    };
+    // Type cast operator to convert to __m128i used in intrinsics
+    operator __m128i() const {
+        return xmm;
+    };
+    // Member function to load from array (unaligned)
+    Vec8s & load(void const * p) {
+        xmm = _mm_loadu_si128((__m128i const*)p);
+        return *this;
+    }
+    // Member function to load from array (aligned)
+    // _mm_load_si128 requires p to be aligned to 16 bytes
+    Vec8s & load_a(void const * p) {
+        xmm = _mm_load_si128((__m128i const*)p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    // Note: unless p is within 16 bytes of the end of a 4 kB page, a full 16 bytes
+    // are read from p even when n < 8; the surplus elements are then zeroed by cutoff.
+    Vec8s & load_partial(int n, void const * p) {
+        if      (n >= 8) load(p);
+        else if (n <= 0)  *this = 0;
+        else if (((int)(intptr_t)p & 0xFFF) < 0xFF0) {
+            // p is at least 16 bytes from a page boundary. OK to read 16 bytes
+            load(p);
+        }
+        else {
+            // worst case. read one element at a time and suffer store forwarding penalty
+            // The buffer is zero-initialized so that load(x) never reads indeterminate values
+            int16_t x[8] = {0};
+            for (int i = 0; i < n; i++) x[i] = ((int16_t const *)p)[i];
+            load(x);
+        }
+        cutoff(n);
+        return *this;
+    }
+    // Partial store. Store n elements
+    // n >= 8 stores the whole vector, n <= 0 stores nothing
+    void store_partial(int n, void * p) const {
+        if (n >= 8) {
+            store(p);
+            return;
+        }
+        if (n <= 0) return;
+        // we are not using _mm_maskmoveu_si128 because it is too slow on many processors
+        union {        
+            int8_t  c[16];
+            int16_t s[8];
+            int32_t i[4];
+            int64_t q[2];
+        } u;
+        store(u.c);
+        // n is 1..7 here: write it out as an optional block of 4, 2 and 1 elements;
+        // j is the byte offset already written
+        int j = 0;
+        if (n & 4) {
+            *(int64_t*)p = u.q[0];
+            j += 8;
+        }
+        if (n & 2) {
+            ((int32_t*)p)[j/4] = u.i[j/4];
+            j += 4;
+        }
+        if (n & 1) {
+            ((int16_t*)p)[j/2] = u.s[j/2];
+        }
+    }
+    // cut off vector to n elements. The last 8-n elements are set to zero
+    // Implemented with the byte-wise Vec16c::cutoff, hence the factor 2
+    Vec8s & cutoff(int n) {
+        *this = Vec16c(xmm).cutoff(n * 2);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    // _mm_insert_epi16 needs a compile-time constant position, hence the switch.
+    // An index above 7 leaves the vector unchanged
+    Vec8s const & insert(uint32_t index, int16_t value) {
+        switch(index) {
+        case 0:
+            xmm = _mm_insert_epi16(xmm,value,0);  break;
+        case 1:
+            xmm = _mm_insert_epi16(xmm,value,1);  break;
+        case 2:
+            xmm = _mm_insert_epi16(xmm,value,2);  break;
+        case 3:
+            xmm = _mm_insert_epi16(xmm,value,3);  break;
+        case 4:
+            xmm = _mm_insert_epi16(xmm,value,4);  break;
+        case 5:
+            xmm = _mm_insert_epi16(xmm,value,5);  break;
+        case 6:
+            xmm = _mm_insert_epi16(xmm,value,6);  break;
+        case 7:
+            xmm = _mm_insert_epi16(xmm,value,7);  break;
+        }
+        return *this;
+    };
+    // Member function extract a single element from vector
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    // _mm_extract_epi16 needs a compile-time constant position, hence the switch.
+    // An index above 7 returns 0
+    int16_t extract(uint32_t index) const {
+        switch(index) {
+        case 0:
+            return (int16_t)_mm_extract_epi16(xmm,0);
+        case 1:
+            return (int16_t)_mm_extract_epi16(xmm,1);
+        case 2:
+            return (int16_t)_mm_extract_epi16(xmm,2);
+        case 3:
+            return (int16_t)_mm_extract_epi16(xmm,3);
+        case 4:
+            return (int16_t)_mm_extract_epi16(xmm,4);
+        case 5:
+            return (int16_t)_mm_extract_epi16(xmm,5);
+        case 6:
+            return (int16_t)_mm_extract_epi16(xmm,6);
+        case 7:
+            return (int16_t)_mm_extract_epi16(xmm,7);
+        }
+        return 0;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int16_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Number of elements in the vector
+    static int size() {
+        return 8;
+    }
+};
+
+/*****************************************************************************
+*
+*          Vec8sb: Vector of 8 Booleans for use with Vec8s and Vec8us
+*
+*****************************************************************************/
+
+// Vector of 8 Booleans, stored as 16-bit elements of a Vec8s.
+// The constructors and insert below encode true as -1 and false as 0.
+class Vec8sb : public Vec8s {
+public:
+    // Constructor to build from all elements (each bool is negated: true -> -1, false -> 0):
+    Vec8sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) {
+        xmm = Vec8s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7));
+    }
+    // Default constructor: xmm is not assigned here
+    Vec8sb() {
+    }
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec8sb(__m128i const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec8sb & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Constructor to broadcast scalar value (all elements -1 for true, 0 for false):
+    Vec8sb(bool b) : Vec8s(-int16_t(b)) {
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec8sb & operator = (bool b) {
+        *this = Vec8sb(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc.
+    // Declared but not defined in this class, so that int arguments do not
+    // silently convert to bool
+    Vec8sb(int b);
+    Vec8sb & operator = (int x);
+public:
+    // Member function to change a single element in vector (stores -1 for true, 0 for false)
+    Vec8sb & insert (int index, bool a) {
+        Vec8s::insert(index, -(int)a);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    // Any nonzero element is reported as true
+    bool extract(uint32_t index) const {
+        return Vec8s::extract(index) != 0;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec8sb
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and, delegated to the Vec128b operator
+static inline Vec8sb operator & (Vec8sb const & a, Vec8sb const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec8sb(lhs & rhs);
+}
+// vector operator && : same as the bitwise operator &
+static inline Vec8sb operator && (Vec8sb const & a, Vec8sb const & b) {
+    Vec8sb const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and, result stored in a
+static inline Vec8sb & operator &= (Vec8sb & a, Vec8sb const & b) {
+    return a = (a & b);
+}
+
+// vector operator | : bitwise or, delegated to the Vec128b operator
+static inline Vec8sb operator | (Vec8sb const & a, Vec8sb const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec8sb(lhs | rhs);
+}
+// vector operator || : same as the bitwise operator |
+static inline Vec8sb operator || (Vec8sb const & a, Vec8sb const & b) {
+    Vec8sb const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or, result stored in a
+static inline Vec8sb & operator |= (Vec8sb & a, Vec8sb const & b) {
+    return a = (a | b);
+}
+
+// vector operator ^ : bitwise exclusive or, delegated to the Vec128b operator
+static inline Vec8sb operator ^ (Vec8sb const & a, Vec8sb const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec8sb(lhs ^ rhs);
+}
+// vector operator ^= : bitwise exclusive or, result stored in a
+static inline Vec8sb & operator ^= (Vec8sb & a, Vec8sb const & b) {
+    return a = (a ^ b);
+}
+
+// vector operator ~ : bitwise complement, delegated to the Vec128b operator
+static inline Vec8sb operator ~ (Vec8sb const & a) {
+    Vec128b const bits(a);
+    return Vec8sb(~bits);
+}
+
+// vector operator ! : element not, implemented as the bitwise complement
+static inline Vec8sb operator ! (Vec8sb const & a) {
+    Vec8sb const inverted = ~a;
+    return inverted;
+}
+
+// vector function andnot: forwards both operands to the Vec128b version of andnot
+static inline Vec8sb andnot (Vec8sb const & a, Vec8sb const & b) {
+    Vec128b const first(a), second(b);
+    return Vec8sb(andnot(first, second));
+}
+
+
+/*****************************************************************************
+*
+*         operators for Vec8s
+*
+*****************************************************************************/
+
+// vector operator + : element-wise 16-bit addition
+static inline Vec8s operator + (Vec8s const & a, Vec8s const & b) {
+    __m128i const sum = _mm_add_epi16(a, b);
+    return sum;
+}
+
+// vector operator += : add b to a element by element, result stored in a
+static inline Vec8s & operator += (Vec8s & a, Vec8s const & b) {
+    return a = (a + b);
+}
+
+// postfix operator ++ : add 1 to every element of a, return the value a had before
+static inline Vec8s operator ++ (Vec8s & a, int) {
+    Vec8s const before(a);
+    a = a + 1;
+    return before;
+}
+
+// prefix operator ++ : add 1 to every element of a, return the updated a
+static inline Vec8s & operator ++ (Vec8s & a) {
+    return a = (a + 1);
+}
+
+// vector operator - : element-wise 16-bit subtraction
+static inline Vec8s operator - (Vec8s const & a, Vec8s const & b) {
+    __m128i const difference = _mm_sub_epi16(a, b);
+    return difference;
+}
+
+// vector operator - : unary minus, computed as 0 - a
+static inline Vec8s operator - (Vec8s const & a) {
+    __m128i const zero = _mm_setzero_si128();
+    return _mm_sub_epi16(zero, a);
+}
+
+// vector operator -= : subtract b from a element by element, result stored in a
+static inline Vec8s & operator -= (Vec8s & a, Vec8s const & b) {
+    return a = (a - b);
+}
+
+// postfix operator -- : subtract 1 from every element of a, return the value a had before
+static inline Vec8s operator -- (Vec8s & a, int) {
+    Vec8s const before(a);
+    a = a - 1;
+    return before;
+}
+
+// prefix operator -- : subtract 1 from every element of a, return the updated a
+static inline Vec8s & operator -- (Vec8s & a) {
+    return a = (a - 1);
+}
+
+// vector operator * : element-wise multiplication, keeping the low 16 bits of each product
+static inline Vec8s operator * (Vec8s const & a, Vec8s const & b) {
+    __m128i const product = _mm_mullo_epi16(a, b);
+    return product;
+}
+
+// vector operator *= : multiply a by b element by element, result stored in a
+static inline Vec8s & operator *= (Vec8s & a, Vec8s const & b) {
+    return a = (a * b);
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+
+// vector operator << : shift every 16-bit element left by b bits
+static inline Vec8s operator << (Vec8s const & a, int b) {
+    __m128i const count = _mm_cvtsi32_si128(b);
+    return _mm_sll_epi16(a,count);
+}
+
+// vector operator <<= : shift all elements of a left by b bits, in place
+static inline Vec8s & operator <<= (Vec8s & a, int b) {
+    return a = (a << b);
+}
+
+// vector operator >> : arithmetic (sign-propagating) right shift of every 16-bit element
+static inline Vec8s operator >> (Vec8s const & a, int b) {
+    __m128i const count = _mm_cvtsi32_si128(b);
+    return _mm_sra_epi16(a,count);
+}
+
+// vector operator >>= : arithmetic right shift of all elements of a by b bits, in place
+static inline Vec8s & operator >>= (Vec8s & a, int b) {
+    return a = (a >> b);
+}
+
+// vector operator == : element-wise equality; true for elements where a == b
+static inline Vec8s operator == (Vec8s const & a, Vec8s const & b) {
+    __m128i const equal = _mm_cmpeq_epi16(a, b);
+    return equal;
+}
+
+// vector operator != : element-wise inequality; true for elements where a != b
+static inline Vec8s operator != (Vec8s const & a, Vec8s const & b) {
+#ifndef __XOP__  // SSE2 instruction set: invert the equality mask
+    Vec8s const equal = (a == b);
+    return Vec8s (~equal);
+#else            // AMD XOP instruction set has a dedicated compare
+    return _mm_comneq_epi16(a,b);
+#endif
+}
+
+// vector operator > : signed element-wise compare; true for elements where a > b
+static inline Vec8s operator > (Vec8s const & a, Vec8s const & b) {
+    __m128i const greater = _mm_cmpgt_epi16(a, b);
+    return greater;
+}
+
+// vector operator < : signed element-wise compare; true for elements where a < b
+// Implemented as the > compare with the operands swapped
+static inline Vec8s operator < (Vec8s const & a, Vec8s const & b) {
+    Vec8s const swapped = (b > a);
+    return swapped;
+}
+
+// vector operator >= : signed element-wise compare; true for elements where a >= b
+static inline Vec8s operator >= (Vec8s const & a, Vec8s const & b) {
+#ifndef __XOP__  // SSE2 instruction set: a >= b is the complement of b > a
+    Vec8s const less = (b > a);
+    return Vec8s (~less);
+#else            // AMD XOP instruction set has a dedicated compare
+    return _mm_comge_epi16(a,b);
+#endif
+}
+
+// vector operator <= : signed element-wise compare; true for elements where a <= b
+// Implemented as the >= compare with the operands swapped
+static inline Vec8s operator <= (Vec8s const & a, Vec8s const & b) {
+    Vec8s const swapped = (b >= a);
+    return swapped;
+}
+
+// vector operator & : bitwise and, delegated to the Vec128b operator
+static inline Vec8s operator & (Vec8s const & a, Vec8s const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec8s(lhs & rhs);
+}
+// vector operator && : same as the bitwise operator &
+static inline Vec8s operator && (Vec8s const & a, Vec8s const & b) {
+    Vec8s const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and, result stored in a
+static inline Vec8s & operator &= (Vec8s & a, Vec8s const & b) {
+    return a = (a & b);
+}
+
+// vector operator | : bitwise or, delegated to the Vec128b operator
+static inline Vec8s operator | (Vec8s const & a, Vec8s const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec8s(lhs | rhs);
+}
+// vector operator || : same as the bitwise operator |
+static inline Vec8s operator || (Vec8s const & a, Vec8s const & b) {
+    Vec8s const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or, result stored in a
+static inline Vec8s & operator |= (Vec8s & a, Vec8s const & b) {
+    return a = (a | b);
+}
+
+// vector operator ^ : bitwise exclusive or, delegated to the Vec128b operator
+static inline Vec8s operator ^ (Vec8s const & a, Vec8s const & b) {
+    Vec128b const lhs(a), rhs(b);
+    return Vec8s(lhs ^ rhs);
+}
+// vector operator ^= : bitwise exclusive or, result stored in a
+static inline Vec8s & operator ^= (Vec8s & a, Vec8s const & b) {
+    return a = (a ^ b);
+}
+
+// vector operator ~ : bitwise complement, delegated to the Vec128b operator
+static inline Vec8s operator ~ (Vec8s const & a) {
+    Vec128b const bits(a);
+    return Vec8s(~bits);
+}
+
+// vector operator ! : logical not; true for elements that are equal to zero
+static inline Vec8s operator ! (Vec8s const & a) {
+    __m128i const zero   = _mm_setzero_si128();
+    __m128i const iszero = _mm_cmpeq_epi16(a,zero);
+    return iszero;
+}
+
+// Functions for this class
+
+// Element-wise choice between two operands, equivalent to:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Every byte of s has to be 0 (false) or -1 (true); other values are not allowed.
+// (s is signed)
+static inline Vec8s select (Vec8s const & s, Vec8s const & a, Vec8s const & b) {
+    __m128i const picked = selectb(s,a,b);
+    return picked;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+static inline Vec8s if_add (Vec8sb const & f, Vec8s const & a, Vec8s const & b) {
+    Vec8s const addend = Vec8s(f) & b;       // b where f is true, zero elsewhere
+    return a + addend;
+}
+
+// Horizontal add: returns the sum of all vector elements.
+// The sum is truncated to 16 bits, so overflow wraps around
+static inline int32_t horizontal_add (Vec8s const & a) {
+#ifdef __XOP__       // AMD XOP instruction set
+    __m128i const quads = _mm_haddq_epi16(a);
+    __m128i const total = _mm_add_epi32(quads,_mm_shuffle_epi32(quads,0x0E));   // add the high element
+#elif  INSTRSET >= 4  // SSSE3: three horizontal add steps
+    __m128i const s4    = _mm_hadd_epi16(a,a);
+    __m128i const s2    = _mm_hadd_epi16(s4,s4);
+    __m128i const total = _mm_hadd_epi16(s2,s2);
+#else                 // SSE2: fold the upper part onto the lower part three times
+    __m128i const s4    = _mm_add_epi16(a,_mm_shuffle_epi32(a,0x0E));           // 4 sums
+    __m128i const s2    = _mm_add_epi16(s4,_mm_shuffle_epi32(s4,0x01));         // 2 sums
+    __m128i const total = _mm_add_epi16(s2,_mm_shufflelo_epi16(s2,0x01));       // 1 sum
+#endif
+    return (int16_t)_mm_cvtsi128_si32(total);                    // keep 16 bits, sign extend to 32 bits
+}
+
+// Horizontal add extended: Calculates the sum of all 8 vector elements.
+// Elements are sign extended to 32 bits before adding to avoid overflow.
+// One of three implementations is compiled, depending on the instruction set.
+static inline int32_t horizontal_add_x (Vec8s const & a) {
+#ifdef __XOP__       // AMD XOP instruction set
+    __m128i sum1  = _mm_haddq_epi16(a);                    // add groups of 4 elements into 64-bit sums
+    __m128i sum2  = _mm_shuffle_epi32(sum1,0x0E);          // high element
+    __m128i sum3  = _mm_add_epi32(sum1,sum2);              // sum
+    return          _mm_cvtsi128_si32(sum3);               // low 32 bits of the sum
+#elif  INSTRSET >= 4  // SSSE3
+    __m128i aeven = _mm_slli_epi32(a,16);                  // even numbered elements of a. get sign bit in position
+            aeven = _mm_srai_epi32(aeven,16);              // sign extend even numbered elements
+    __m128i aodd  = _mm_srai_epi32(a,16);                  // sign extend odd  numbered elements
+    __m128i sum1  = _mm_add_epi32(aeven,aodd);             // add even and odd elements
+    __m128i sum2  = _mm_hadd_epi32(sum1,sum1);             // horizontally add 4 elements in 2 steps
+    __m128i sum3  = _mm_hadd_epi32(sum2,sum2);
+    return  _mm_cvtsi128_si32(sum3);                       // 32 bit sum
+#else                 // SSE2
+    __m128i aeven = _mm_slli_epi32(a,16);                  // even numbered elements of a. get sign bit in position
+            aeven = _mm_srai_epi32(aeven,16);              // sign extend even numbered elements
+    __m128i aodd  = _mm_srai_epi32(a,16);                  // sign extend odd  numbered elements
+    __m128i sum1  = _mm_add_epi32(aeven,aodd);             // add even and odd elements
+    __m128i sum2  = _mm_shuffle_epi32(sum1,0x0E);          // 2 high elements
+    __m128i sum3  = _mm_add_epi32(sum1,sum2);              // 2 sums
+    __m128i sum4  = _mm_shuffle_epi32(sum3,0x01);          // 1 high element
+    __m128i sum5  = _mm_add_epi32(sum3,sum4);              // 1 sum
+    return  _mm_cvtsi128_si32(sum5);                       // 32 bit sum
+#endif
+}
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec8s add_saturated(Vec8s const & a, Vec8s const & b) {
+    __m128i const sum = _mm_adds_epi16(a, b);              // saturating signed 16-bit add
+    return sum;
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec8s sub_saturated(Vec8s const & a, Vec8s const & b) {
+    __m128i const diff = _mm_subs_epi16(a, b);             // saturating signed 16-bit subtract
+    return diff;
+}
+
+// function max: a > b ? a : b (signed comparison, element by element)
+static inline Vec8s max(Vec8s const & a, Vec8s const & b) {
+    __m128i const larger = _mm_max_epi16(a,b);
+    return larger;
+}
+
+// function min: a < b ? a : b (signed comparison, element by element)
+static inline Vec8s min(Vec8s const & a, Vec8s const & b) {
+    __m128i const smaller = _mm_min_epi16(a,b);
+    return smaller;
+}
+
+// function abs: a >= 0 ? a : -a
+// No saturation is applied here; abs_saturated handles the 0x8000 case
+static inline Vec8s abs(Vec8s const & a) {
+#if INSTRSET >= 4     // SSSE3 supported
+    __m128i const magnitude = _mm_sign_epi16(a,a);         // negate the elements that are negative
+    return magnitude;
+#else                 // SSE2
+    __m128i const zero    = _mm_setzero_si128();
+    __m128i const negated = _mm_sub_epi16(zero, a);        // -a
+    return _mm_max_epi16(a, negated);                      // the larger of a and -a
+#endif
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec8s abs_saturated(Vec8s const & a) {
+    __m128i const magnitude = abs(a);                          // plain abs(a)
+    __m128i const overflow  = _mm_srai_epi16(magnitude,15);    // -1 where the sign bit is still set, else 0
+    return _mm_add_epi16(magnitude,overflow);                  // subtract 1 if 0x8000
+}
+
+// function rotate_left: rotate the bits of every element left by b positions
+// Use negative count to rotate right
+static inline Vec8s rotate_left(Vec8s const & a, int b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    __m128i const counts = _mm_set1_epi16(b);              // same count for every element
+    return _mm_rot_epi16(a,counts);
+#else  // SSE2 instruction set
+    // Only the low 4 bits of each count are used
+    __m128i const up_count   = _mm_cvtsi32_si128(b & 0x0F);
+    __m128i const down_count = _mm_cvtsi32_si128((16-b) & 0x0F);
+    __m128i const upper      = _mm_sll_epi16(a,up_count);      // a << b
+    __m128i const lower      = _mm_srl_epi16(a,down_count);    // a >> (16 - b)
+    return _mm_or_si128(upper,lower);                          // combine both parts
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 8 16-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec8us : public Vec8s {
+public:
+    // Default constructor:
+    Vec8us() {
+    }
+    // Constructor to broadcast the same value into all elements.
+    // The value is converted to 16 bits first:
+    Vec8us(uint32_t i) {
+        xmm = _mm_set1_epi16(static_cast<int16_t>(i));
+    }
+    // Constructor to build from all elements, i0 is element number 0:
+    Vec8us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7) {
+        xmm = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7);
+    }
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec8us(__m128i const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec8us & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Member function to load from array (unaligned)
+    Vec8us & load(void const * p) {
+        __m128i const * const src = static_cast<__m128i const*>(p);
+        xmm = _mm_loadu_si128(src);
+        return *this;
+    }
+    // Member function to load from array (aligned)
+    Vec8us & load_a(void const * p) {
+        __m128i const * const src = static_cast<__m128i const*>(p);
+        xmm = _mm_load_si128(src);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8us const & insert(uint32_t index, uint16_t value) {
+        Vec8s::insert(index, value);                       // the work is done by the signed base class
+        return *this;
+    }
+    // Member function extract a single element from vector
+    uint16_t extract(uint32_t index) const {
+        return static_cast<uint16_t>(Vec8s::extract(index));
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint16_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : add element by element, using the Vec8s operator
+static inline Vec8us operator + (Vec8us const & a, Vec8us const & b) {
+    Vec8s const lhs(a);
+    Vec8s const rhs(b);
+    return Vec8us (lhs + rhs);
+}
+
+// vector operator - : subtract element by element, using the Vec8s operator
+static inline Vec8us operator - (Vec8us const & a, Vec8us const & b) {
+    Vec8s const lhs(a);
+    Vec8s const rhs(b);
+    return Vec8us (lhs - rhs);
+}
+
+// vector operator * : multiply element by element, using the Vec8s operator
+static inline Vec8us operator * (Vec8us const & a, Vec8us const & b) {
+    Vec8s const lhs(a);
+    Vec8s const rhs(b);
+    return Vec8us (lhs * rhs);
+}
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : shift right logical all elements
+static inline Vec8us operator >> (Vec8us const & a, uint32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);            // shift count placed in a vector register
+    return _mm_srl_epi16(a,count);
+}
+
+// vector operator >> : shift right logical all elements
+// (signed count, forwarded to the uint32_t overload)
+static inline Vec8us operator >> (Vec8us const & a, int32_t b) {
+    uint32_t const count = static_cast<uint32_t>(b);
+    return a >> count;
+}
+
+// vector operator >>= : shift right logical, the result is stored in a
+static inline Vec8us & operator >>= (Vec8us & a, int b) {
+    return a = a >> b;
+}
+
+// vector operator << : shift left all elements
+static inline Vec8us operator << (Vec8us const & a, uint32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);            // shift count placed in a vector register
+    return _mm_sll_epi16(a,count);
+}
+
+// vector operator << : shift left all elements
+// (signed count, forwarded to the uint32_t overload)
+static inline Vec8us operator << (Vec8us const & a, int32_t b) {
+    uint32_t const count = static_cast<uint32_t>(b);
+    return a << count;
+}
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec8s operator >= (Vec8us const & a, Vec8us const & b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return _mm_comge_epu16(a,b);
+#elif INSTRSET >= 5   // SSE4.1
+    // a >= b exactly when a equals the unsigned maximum of the pair
+    __m128i const larger = _mm_max_epu16(a,b);
+    return _mm_cmpeq_epi16(a,larger);
+#else  // SSE2 instruction set
+    // the wrapping and the saturating difference are equal only if a - b needs no borrow
+    __m128i const wrapped = _mm_sub_epi16(a,b);            // a-b, wraparound
+    __m128i const clamped = _mm_subs_epu16(a,b);           // a-b, saturated
+    return _mm_cmpeq_epi16(wrapped,clamped);
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec8s operator <= (Vec8us const & a, Vec8us const & b) {
+    Vec8s const result = (b >= a);                         // same test with the operands swapped
+    return result;
+}
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec8s operator > (Vec8us const & a, Vec8us const & b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return _mm_comgt_epu16(a,b);
+#else  // SSE2 instruction set
+    Vec8s const not_greater = (b >= a);                    // true where a <= b
+    return Vec8s (~not_greater);                           // invert to get a > b
+#endif
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec8s operator < (Vec8us const & a, Vec8us const & b) {
+    Vec8s const result = (b > a);                          // same test with the operands swapped
+    return result;
+}
+
+// vector operator & : bitwise and
+static inline Vec8us operator & (Vec8us const & a, Vec8us const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec8us(lhs & rhs);
+}
+// vector operator && : same as &. As an overloaded operator it evaluates both operands
+static inline Vec8us operator && (Vec8us const & a, Vec8us const & b) {
+    Vec8us const both = a & b;
+    return both;
+}
+
+// vector operator | : bitwise or
+static inline Vec8us operator | (Vec8us const & a, Vec8us const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec8us(lhs | rhs);
+}
+// vector operator || : same as |. As an overloaded operator it evaluates both operands
+static inline Vec8us operator || (Vec8us const & a, Vec8us const & b) {
+    Vec8us const either = a | b;
+    return either;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8us operator ^ (Vec8us const & a, Vec8us const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec8us(lhs ^ rhs);
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8us operator ~ (Vec8us const & a) {
+    Vec128b const bits(a);                                 // plain 128-bit view of a
+    return Vec8us(~bits);
+}
+
+// Functions for this class
+
+// Select between two operands, element by element. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8us select (Vec8s const & s, Vec8us const & a, Vec8us const & b) {
+    Vec8us const picked = selectb(s,a,b);                  // the selection itself is done by selectb
+    return picked;
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8us if_add (Vec8sb const & f, Vec8us const & a, Vec8us const & b) {
+    Vec8us const addend = Vec8us(f) & b;                   // b masked by the condition f
+    return a + addend;
+}
+
+// Horizontal add: Calculates the sum of all 8 vector elements.
+// Overflow will wrap around: the sum is truncated to 16 bits and then
+// zero extended to the 32-bit return value.
+// One of three implementations is compiled, depending on the instruction set.
+static inline uint32_t horizontal_add (Vec8us const & a) {
+#ifdef __XOP__     // AMD XOP instruction set
+    __m128i sum1  = _mm_haddq_epu16(a);                    // add groups of 4 elements into 64-bit sums
+    __m128i sum2  = _mm_shuffle_epi32(sum1,0x0E);          // high element
+    __m128i sum3  = _mm_add_epi32(sum1,sum2);              // sum
+    uint16_t sum4 = _mm_cvtsi128_si32(sum3);               // truncate to 16 bits
+    return  sum4;                                          // zero extend to 32 bits
+#elif  INSTRSET >= 4  // SSSE3
+    __m128i sum1  = _mm_hadd_epi16(a,a);                   // horizontally add 8 elements in 3 steps
+    __m128i sum2  = _mm_hadd_epi16(sum1,sum1);
+    __m128i sum3  = _mm_hadd_epi16(sum2,sum2);
+    uint16_t sum4 = (uint16_t)_mm_cvtsi128_si32(sum3);     // 16 bit sum
+    return  sum4;                                          // zero extend to 32 bits
+#else                 // SSE2
+    // Fold the vector in halves: 8 -> 4 -> 2 -> 1 partial sums
+    __m128i sum1  = _mm_shuffle_epi32(a,0x0E);             // 4 high elements
+    __m128i sum2  = _mm_add_epi16(a,sum1);                 // 4 sums
+    __m128i sum3  = _mm_shuffle_epi32(sum2,0x01);          // 2 high elements
+    __m128i sum4  = _mm_add_epi16(sum2,sum3);              // 2 sums
+    __m128i sum5  = _mm_shufflelo_epi16(sum4,0x01);        // 1 high element
+    __m128i sum6  = _mm_add_epi16(sum4,sum5);              // 1 sum
+    uint16_t sum7 = _mm_cvtsi128_si32(sum6);               // 16 bit sum
+    return  sum7;                                          // zero extend to 32 bits
+#endif
+}
+
+// Horizontal add extended: Calculates the sum of all 8 vector elements.
+// Each element is zero-extended to 32 bits before addition to avoid overflow.
+// One of three implementations is compiled, depending on the instruction set.
+static inline uint32_t horizontal_add_x (Vec8us const & a) {
+#ifdef __XOP__     // AMD XOP instruction set
+    __m128i sum1  = _mm_haddq_epu16(a);                    // add groups of 4 elements into 64-bit sums
+    __m128i sum2  = _mm_shuffle_epi32(sum1,0x0E);          // high element
+    __m128i sum3  = _mm_add_epi32(sum1,sum2);              // sum
+    return          _mm_cvtsi128_si32(sum3);               // low 32 bits of the sum
+#elif INSTRSET >= 4  // SSSE3
+    __m128i mask  = _mm_set1_epi32(0x0000FFFF);            // mask for even positions
+    __m128i aeven = _mm_and_si128(a,mask);                 // even numbered elements of a
+    __m128i aodd  = _mm_srli_epi32(a,16);                  // zero extend odd numbered elements
+    __m128i sum1  = _mm_add_epi32(aeven,aodd);             // add even and odd elements
+    __m128i sum2  = _mm_hadd_epi32(sum1,sum1);             // horizontally add 4 elements in 2 steps
+    __m128i sum3  = _mm_hadd_epi32(sum2,sum2);
+    return  _mm_cvtsi128_si32(sum3);                       // 32 bit sum
+#else                 // SSE2
+    __m128i mask  = _mm_set1_epi32(0x0000FFFF);            // mask for even positions
+    __m128i aeven = _mm_and_si128(a,mask);                 // even numbered elements of a
+    __m128i aodd  = _mm_srli_epi32(a,16);                  // zero extend odd numbered elements
+    __m128i sum1  = _mm_add_epi32(aeven,aodd);             // add even and odd elements
+    __m128i sum2  = _mm_shuffle_epi32(sum1,0x0E);          // 2 high elements
+    __m128i sum3  = _mm_add_epi32(sum1,sum2);              // 2 sums
+    __m128i sum4  = _mm_shuffle_epi32(sum3,0x01);          // 1 high element
+    __m128i sum5  = _mm_add_epi32(sum3,sum4);              // 1 sum
+    return  _mm_cvtsi128_si32(sum5);               // 32 bit sum
+#endif
+}
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec8us add_saturated(Vec8us const & a, Vec8us const & b) {
+    __m128i const sum = _mm_adds_epu16(a, b);              // saturating unsigned 16-bit add
+    return sum;
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec8us sub_saturated(Vec8us const & a, Vec8us const & b) {
+    __m128i const diff = _mm_subs_epu16(a, b);             // saturating unsigned 16-bit subtract
+    return diff;
+}
+
+// function max: a > b ? a : b (unsigned comparison, element by element)
+static inline Vec8us max(Vec8us const & a, Vec8us const & b) {
+#if INSTRSET >= 5   // SSE4.1
+    return _mm_max_epu16(a,b);
+#else  // SSE2
+    // Only a signed 16-bit max is used here: flip the top bit of every element,
+    // take the signed max, then flip the top bit back
+    __m128i const flip    = _mm_set1_epi32(0x80008000);
+    __m128i const a_flip  = _mm_xor_si128(a,flip);         // add 0x8000
+    __m128i const b_flip  = _mm_xor_si128(b,flip);         // add 0x8000
+    __m128i const biggest = _mm_max_epi16(a_flip,b_flip);  // signed max
+    return _mm_xor_si128(biggest,flip);                    // sub 0x8000
+#endif
+}
+
+// function min: a < b ? a : b (unsigned comparison, element by element)
+static inline Vec8us min(Vec8us const & a, Vec8us const & b) {
+#if INSTRSET >= 5   // SSE4.1
+    return _mm_min_epu16(a,b);
+#else  // SSE2
+    // Only a signed 16-bit min is used here: flip the top bit of every element,
+    // take the signed min, then flip the top bit back
+    __m128i const flip     = _mm_set1_epi32(0x80008000);
+    __m128i const a_flip   = _mm_xor_si128(a,flip);        // add 0x8000
+    __m128i const b_flip   = _mm_xor_si128(b,flip);        // add 0x8000
+    __m128i const smallest = _mm_min_epi16(a_flip,b_flip); // signed min
+    return _mm_xor_si128(smallest,flip);                   // sub 0x8000
+#endif
+}
+
+
+
+/*****************************************************************************
+*
+*          Vector of 4 32-bit signed integers
+*
+*****************************************************************************/
+
+class Vec4i : public Vec128b {
+public:
+    // Default constructor:
+    Vec4i() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec4i(int i) {
+        xmm = _mm_set1_epi32(i);
+    }
+    // Constructor to build from all elements, i0 is element number 0:
+    Vec4i(int32_t i0, int32_t i1, int32_t i2, int32_t i3) {
+        xmm = _mm_setr_epi32(i0, i1, i2, i3);
+    }
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec4i(__m128i const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec4i & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m128i used in intrinsics
+    operator __m128i() const {
+        return xmm;
+    }
+    // Member function to load from array (unaligned)
+    Vec4i & load(void const * p) {
+        xmm = _mm_loadu_si128((__m128i const*)p);
+        return *this;
+    }
+    // Member function to load from array (aligned)
+    Vec4i & load_a(void const * p) {
+        xmm = _mm_load_si128((__m128i const*)p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0.
+    // n = 0..4 is handled; for any other n the vector is left unchanged.
+    Vec4i & load_partial(int n, void const * p) {
+        int32_t const * q = (int32_t const*)p;             // element view of the source, kept const
+        switch (n) {
+        case 0:
+            *this = 0;  break;
+        case 1:
+            xmm = _mm_cvtsi32_si128(q[0]);  break;
+        case 2:
+            // movq: loads the low 64 bits and zeroes the upper half
+            xmm = _mm_loadl_epi64((__m128i const*)p);  break;
+        case 3:
+            xmm = _mm_setr_epi32(q[0], q[1], q[2], 0);  break;
+        case 4:
+            load(p);  break;
+        default: 
+            break;
+        }
+        return *this;
+    }
+    // Partial store. Store the first n elements to p.
+    // Nothing is written unless n is 1..4.
+    void store_partial(int n, void * p) const {
+        switch (n) {
+        case 1:
+            *(int32_t*)p = _mm_cvtsi128_si32(xmm);  break;
+        case 2:
+            // movq: stores the low 64 bits. Avoids writing through an int64_t pointer
+            _mm_storel_epi64((__m128i*)p, xmm);  break;
+        case 3:
+            _mm_storel_epi64((__m128i*)p, xmm);            // elements 0 and 1
+            // move the high half down to get element 2 into the low position
+            ((int32_t*)p)[2] = _mm_cvtsi128_si32(_mm_unpackhi_epi64(xmm, xmm));  break;
+        case 4:
+            store(p);  break;
+        default:
+            break;
+        }
+    }
+    // cut off vector to n elements. The last 4-n elements are set to zero
+    Vec4i & cutoff(int n) {
+        *this = Vec16c(xmm).cutoff(n * 4);                 // 4 bytes per element
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    // Only the low 2 bits of index are used
+    Vec4i const & insert(uint32_t index, int32_t value) {
+        static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0};
+        __m128i broad = _mm_set1_epi32(value);  // broadcast value into all elements
+        __m128i mask  = _mm_loadu_si128((__m128i const*)(maskl+4-(index & 3))); // mask with FFFFFFFF at index position
+        xmm = selectb(mask,broad,xmm);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Only the low 2 bits of index are used
+    int32_t extract(uint32_t index) const {
+        int32_t x[4];
+        store(x);
+        return x[index & 3];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int32_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Number of elements in the vector
+    static int size() {
+        return 4;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec4ib: Vector of 4 Booleans for use with Vec4i and Vec4ui
+*
+*****************************************************************************/
+class Vec4ib : public Vec4i {
+public:
+    // Default constructor:
+    Vec4ib() {
+    }
+    // Constructor to build from all elements. true is stored as -1, false as 0:
+    Vec4ib(bool x0, bool x1, bool x2, bool x3) {
+        xmm = _mm_setr_epi32(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3));
+    }
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec4ib(__m128i const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec4ib & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Constructor to broadcast scalar value:
+    Vec4ib(bool b) : Vec4i(-int32_t(b)) {
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec4ib & operator = (bool b) {
+        return *this = Vec4ib(b);
+    }
+private: // Prevent constructing from int, etc.
+    Vec4ib(int b);
+    Vec4ib & operator = (int x);
+public:
+    // Member function to change a single element in vector
+    Vec4ib & insert (int index, bool a) {
+        int32_t const element = -int32_t(a);               // -1 for true, 0 for false
+        Vec4i::insert(index, element);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    bool extract(uint32_t index) const {
+        int32_t const element = Vec4i::extract(index);
+        return element != 0;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec4ib
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec4ib operator & (Vec4ib const & a, Vec4ib const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec4ib(lhs & rhs);
+}
+// vector operator && : same as &. As an overloaded operator it evaluates both operands
+static inline Vec4ib operator && (Vec4ib const & a, Vec4ib const & b) {
+    Vec4ib const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and, the result is stored in a
+static inline Vec4ib & operator &= (Vec4ib & a, Vec4ib const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or
+static inline Vec4ib operator | (Vec4ib const & a, Vec4ib const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec4ib(lhs | rhs);
+}
+// vector operator || : same as |. As an overloaded operator it evaluates both operands
+static inline Vec4ib operator || (Vec4ib const & a, Vec4ib const & b) {
+    Vec4ib const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or, the result is stored in a
+static inline Vec4ib & operator |= (Vec4ib & a, Vec4ib const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4ib operator ^ (Vec4ib const & a, Vec4ib const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec4ib(lhs ^ rhs);
+}
+// vector operator ^= : bitwise xor, the result is stored in a
+static inline Vec4ib & operator ^= (Vec4ib & a, Vec4ib const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4ib operator ~ (Vec4ib const & a) {
+    Vec128b const bits(a);                                 // plain 128-bit view of a
+    return Vec4ib(~bits);
+}
+
+// vector operator ! : element not, implemented as the bitwise not
+static inline Vec4ib operator ! (Vec4ib const & a) {
+    Vec4ib const inverted = ~ a;
+    return inverted;
+}
+
+// vector function andnot: forwards to the Vec128b version of andnot
+static inline Vec4ib andnot (Vec4ib const & a, Vec4ib const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec4ib(andnot(lhs, rhs));
+}
+
+
+/*****************************************************************************
+*
+*          Operators for Vec4i
+*
+*****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec4i operator + (Vec4i const & a, Vec4i const & b) {
+    __m128i const sum = _mm_add_epi32(a, b);
+    return sum;
+}
+
+// vector operator += : add, the result is stored in a
+static inline Vec4i & operator += (Vec4i & a, Vec4i const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : adds 1 to every element, returns the value from before the increment
+static inline Vec4i operator ++ (Vec4i & a, int) {
+    Vec4i const previous(a);                               // keep a copy of the old value
+    a = a + Vec4i(1);
+    return previous;
+}
+
+// prefix operator ++ : adds 1 to every element, returns the updated vector
+static inline Vec4i & operator ++ (Vec4i & a) {
+    return a = a + Vec4i(1);
+}
+
+// vector operator - : subtract element by element
+static inline Vec4i operator - (Vec4i const & a, Vec4i const & b) {
+    __m128i const diff = _mm_sub_epi32(a, b);
+    return diff;
+}
+
+// vector operator - : unary minus, computed as 0 - a
+static inline Vec4i operator - (Vec4i const & a) {
+    __m128i const zero = _mm_setzero_si128();
+    return _mm_sub_epi32(zero, a);
+}
+
+// vector operator -= : subtract, the result is stored in a
+static inline Vec4i & operator -= (Vec4i & a, Vec4i const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtracts 1 from every element, returns the value from before the decrement
+static inline Vec4i operator -- (Vec4i & a, int) {
+    Vec4i const previous(a);                               // keep a copy of the old value
+    a = a - Vec4i(1);
+    return previous;
+}
+
+// prefix operator -- : subtracts 1 from every element, returns the updated vector
+static inline Vec4i & operator -- (Vec4i & a) {
+    return a = a - Vec4i(1);
+}
+
+// vector operator * : multiply element by element
+static inline Vec4i operator * (Vec4i const & a, Vec4i const & b) {
+#if INSTRSET >= 5  // SSE4.1 instruction set
+    return _mm_mullo_epi32(a, b);
+#else
+    // _mm_mul_epu32 multiplies elements 0 and 2 only, so the odd numbered
+    // elements are first copied into even positions and multiplied separately
+    __m128i const a_odd  = _mm_shuffle_epi32(a, 0xF5);            // (-,a3,-,a1)
+    __m128i const b_odd  = _mm_shuffle_epi32(b, 0xF5);            // (-,b3,-,b1)
+    __m128i const p_even = _mm_mul_epu32(a, b);                   // (-,a2*b2,-,a0*b0)
+    __m128i const p_odd  = _mm_mul_epu32(a_odd, b_odd);           // (-,a3*b3,-,a1*b1)
+    __m128i const p_low  = _mm_unpacklo_epi32(p_even,p_odd);      // (-,-,a1*b1,a0*b0)
+    __m128i const p_high = _mm_unpackhi_epi32(p_even,p_odd);      // (-,-,a3*b3,a2*b2)
+    return _mm_unpacklo_epi64(p_low,p_high);                      // (ab3,ab2,ab1,ab0)
+#endif
+}
+
+// vector operator *= : multiply, the result is stored in a
+static inline Vec4i & operator *= (Vec4i & a, Vec4i const & b) {
+    return a = a * b;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+
+// vector operator << : shift left all elements by the same count
+static inline Vec4i operator << (Vec4i const & a, int32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);            // shift count placed in a vector register
+    return _mm_sll_epi32(a,count);
+}
+
+// vector operator <<= : shift left, the result is stored in a
+static inline Vec4i & operator <<= (Vec4i & a, int32_t b) {
+    return a = a << b;
+}
+
+// vector operator >> : shift right arithmetic all elements by the same count
+static inline Vec4i operator >> (Vec4i const & a, int32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);            // shift count placed in a vector register
+    return _mm_sra_epi32(a,count);
+}
+
+// vector operator >>= : shift right arithmetic, the result is stored in a
+static inline Vec4i & operator >>= (Vec4i & a, int32_t b) {
+    return a = a >> b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec4ib operator == (Vec4i const & a, Vec4i const & b) {
+    __m128i const equal = _mm_cmpeq_epi32(a, b);
+    return equal;
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec4ib operator != (Vec4i const & a, Vec4i const & b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return _mm_comneq_epi32(a,b);
+#else  // SSE2 instruction set
+    Vec4ib const equal = (a == b);                         // true where the elements match
+    return Vec4ib(Vec4i (~equal));                         // invert to get a != b
+#endif
+}
+  
+// vector operator > : returns true for elements for which a > b (signed)
+static inline Vec4ib operator > (Vec4i const & a, Vec4i const & b) {
+    __m128i const greater = _mm_cmpgt_epi32(a, b);
+    return greater;
+}
+
+// vector operator < : returns true for elements for which a < b (signed)
+static inline Vec4ib operator < (Vec4i const & a, Vec4i const & b) {
+    Vec4ib const result = (b > a);                         // same test with the operands swapped
+    return result;
+}
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec4ib operator >= (Vec4i const & a, Vec4i const & b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return _mm_comge_epi32(a,b);
+#else  // SSE2 instruction set
+    Vec4ib const less = (b > a);                           // true where a < b
+    return Vec4ib(Vec4i (~less));                          // invert to get a >= b
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec4ib operator <= (Vec4i const & a, Vec4i const & b) {
+    Vec4ib const result = (b >= a);                        // same test with the operands swapped
+    return result;
+}
+
+// vector operator & : bitwise and
+static inline Vec4i operator & (Vec4i const & a, Vec4i const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec4i(lhs & rhs);
+}
+// vector operator && : same as &. As an overloaded operator it evaluates both operands
+static inline Vec4i operator && (Vec4i const & a, Vec4i const & b) {
+    Vec4i const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and, the result is stored in a
+static inline Vec4i & operator &= (Vec4i & a, Vec4i const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or
+static inline Vec4i operator | (Vec4i const & a, Vec4i const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec4i(lhs | rhs);
+}
+// vector operator || : same as |. As an overloaded operator it evaluates both operands
+static inline Vec4i operator || (Vec4i const & a, Vec4i const & b) {
+    Vec4i const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or, the result is stored in a
+static inline Vec4i & operator |= (Vec4i & a, Vec4i const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4i operator ^ (Vec4i const & a, Vec4i const & b) {
+    Vec128b const lhs(a);                                  // treat both operands as plain 128-bit vectors
+    Vec128b const rhs(b);
+    return Vec4i(lhs ^ rhs);
+}
+// vector operator ^= : bitwise xor, the result is stored in a
+static inline Vec4i & operator ^= (Vec4i & a, Vec4i const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4i operator ~ (Vec4i const & a) {
+    Vec128b const bits(a);                                 // plain 128-bit view of a
+    return Vec4i(~bits);
+}
+
+// vector operator ! : returns true for elements == 0
+static inline Vec4ib operator ! (Vec4i const & a) {
+    __m128i const zero = _mm_setzero_si128();
+    return _mm_cmpeq_epi32(a,zero);                        // compare every 32-bit element with 0
+}
+
+// Functions for this class
+
+// Select between two operands, element by element. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec4i select (Vec4ib const & s, Vec4i const & a, Vec4i const & b) {
+    Vec4i const picked = selectb(s,a,b);                   // the selection itself is done by selectb
+    return picked;
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4i if_add (Vec4ib const & f, Vec4i const & a, Vec4i const & b) {
+    Vec4i const mask(f);                                   // the condition as an integer vector
+    Vec4i const addend = mask & b;                         // b masked by the condition f
+    return a + addend;
+}
+
+// Horizontal add: Calculates the sum of all 4 vector elements.
+// Overflow will wrap around
+// One of three implementations is compiled, depending on the instruction set.
+static inline int32_t horizontal_add (Vec4i const & a) {
+#ifdef __XOP__       // AMD XOP instruction set
+    __m128i sum1  = _mm_haddq_epi32(a);                    // add pairs of elements into 64-bit sums
+    __m128i sum2  = _mm_shuffle_epi32(sum1,0x0E);          // high element
+    __m128i sum3  = _mm_add_epi32(sum1,sum2);              // sum
+    return          _mm_cvtsi128_si32(sum3);               // truncate to 32 bits
+#elif  INSTRSET >= 4  // SSSE3
+    __m128i sum1  = _mm_hadd_epi32(a,a);                   // horizontally add 4 elements in 2 steps
+    __m128i sum2  = _mm_hadd_epi32(sum1,sum1);
+    return          _mm_cvtsi128_si32(sum2);               // 32 bit sum
+#else                 // SSE2
+    // Fold the vector in halves: 4 -> 2 -> 1 partial sums
+    __m128i sum1  = _mm_shuffle_epi32(a,0x0E);             // 2 high elements
+    __m128i sum2  = _mm_add_epi32(a,sum1);                 // 2 sums
+    __m128i sum3  = _mm_shuffle_epi32(sum2,0x01);          // 1 high element
+    __m128i sum4  = _mm_add_epi32(sum2,sum3);              // 1 sum
+    return          _mm_cvtsi128_si32(sum4);               // 32 bit sum
+#endif
+}
+
+// Horizontal add extended: Calculates the sum of all 4 vector elements.
+// Elements are sign extended to 64 bits before adding to avoid overflow
+static inline int64_t horizontal_add_x (Vec4i const & a) {
+    // Step 1: reduce the four elements to two 64-bit partial sums in sum1
+#ifdef __XOP__     // AMD XOP instruction set
+    __m128i sum1  = _mm_haddq_epi32(a);
+#else              // SSE2
+    __m128i signs = _mm_srai_epi32(a,31);                  // sign of all elements
+    __m128i a01   = _mm_unpacklo_epi32(a,signs);           // sign-extended a0, a1
+    __m128i a23   = _mm_unpackhi_epi32(a,signs);           // sign-extended a2, a3
+    __m128i sum1  = _mm_add_epi64(a01,a23);                // add
+#endif
+    // Step 2: add the two partial sums
+    __m128i sum2  = _mm_unpackhi_epi64(sum1,sum1);         // high qword
+    __m128i sum3  = _mm_add_epi64(sum1,sum2);              // add
+    // Step 3: return the low 64 bits of sum3
+#if defined (__x86_64__)
+    return          _mm_cvtsi128_si64(sum3);               // 64 bit mode
+#else
+    // otherwise: store the low qword to memory and read it back as int64_t
+    union {
+        __m128i x;  // silly definition of _mm_storel_epi64 requires __m128i
+        int64_t i;
+    } u;
+    _mm_storel_epi64(&u.x,sum3);
+    return u.i;
+#endif
+}
+
+// function add_saturated: add element by element, signed with saturation
+// Overflow is detected from the sign bits: it happens when a and b have the
+// same sign and the sum has a different sign
+static inline Vec4i add_saturated(Vec4i const & a, Vec4i const & b) {
+    __m128i sum    = _mm_add_epi32(a, b);                  // a + b
+    __m128i axb    = _mm_xor_si128(a, b);                  // check if a and b have different sign
+    __m128i axs    = _mm_xor_si128(a, sum);                // check if a and sum have different sign
+    __m128i overf1 = _mm_andnot_si128(axb,axs);            // check if sum has wrong sign
+    __m128i overf2 = _mm_srai_epi32(overf1,31);            // -1 if overflow
+    __m128i asign  = _mm_srli_epi32(a,31);                 // 1  if a < 0
+    __m128i sat1   = _mm_srli_epi32(overf2,1);             // 7FFFFFFF if overflow
+    __m128i sat2   = _mm_add_epi32(sat1,asign);            // 7FFFFFFF if positive overflow 80000000 if negative overflow
+    return  selectb(overf2,sat2,sum);                      // sum if not overflow, else sat2
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec4i sub_saturated(Vec4i const & a, Vec4i const & b) {
+    __m128i diff   = _mm_sub_epi32(a, b);                  // a - b
+    __m128i axb    = _mm_xor_si128(a, b);                  // check if a and b have different sign
+    __m128i axs    = _mm_xor_si128(a, diff);               // check if a and diff have different sign
+    __m128i overf1 = _mm_and_si128(axb,axs);               // check if diff has wrong sign
+    __m128i overf2 = _mm_srai_epi32(overf1,31);            // -1 if overflow
+    __m128i asign  = _mm_srli_epi32(a,31);                 // 1  if a < 0
+    __m128i sat1   = _mm_srli_epi32(overf2,1);             // 7FFFFFFF if overflow
+    __m128i sat2   = _mm_add_epi32(sat1,asign);            // 7FFFFFFF if positive overflow 80000000 if negative overflow
+    return  selectb(overf2,sat2,diff);                     // diff if not overflow, else sat2
+}
+
+// max: element-wise signed maximum of a and b
+static inline Vec4i max(Vec4i const & a, Vec4i const & b) {
+#if INSTRSET < 5    // SSE2: compare, then blend
+    __m128i a_wins = _mm_cmpgt_epi32(a,b);                 // -1 where a > b
+    return selectb(a_wins,a,b);
+#else               // SSE4.1 has a dedicated instruction
+    return _mm_max_epi32(a,b);
+#endif
+}
+
+// min: element-wise signed minimum of a and b
+static inline Vec4i min(Vec4i const & a, Vec4i const & b) {
+#if INSTRSET < 5    // SSE2: compare, then blend
+    __m128i b_wins = _mm_cmpgt_epi32(a,b);                 // -1 where a > b
+    return selectb(b_wins,b,a);
+#else               // SSE4.1 has a dedicated instruction
+    return _mm_min_epi32(a,b);
+#endif
+}
+
+// abs: element-wise a >= 0 ? a : -a
+static inline Vec4i abs(Vec4i const & a) {
+#if INSTRSET < 4      // SSE2: two's complement negate where negative
+    __m128i neg  = _mm_srai_epi32(a,31);                   // -1 where a < 0, else 0
+    __m128i flip = _mm_xor_si128(a,neg);                   // ~a where negative, a otherwise
+    return         _mm_sub_epi32(flip,neg);                // subtracting -1 adds the final 1
+#else                 // SSSE3 supported
+    return _mm_sign_epi32(a,a);
+#endif
+}
+
+// abs_saturated: like abs, but a result that still has its sign bit set is reduced by one
+static inline Vec4i abs_saturated(Vec4i const & a) {
+    __m128i mag     = abs(a);                              // may still be 0x80000000
+    __m128i wrapped = _mm_srai_epi32(mag,31);              // -1 where the sign bit is set, else 0
+    return            _mm_add_epi32(mag,wrapped);          // 0x80000000 + (-1) = 0x7FFFFFFF
+}
+
+// rotate_left: rotate every element left by b bits.
+// A negative b rotates right.
+static inline Vec4i rotate_left(Vec4i const & a, int b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return _mm_rot_epi32(a,_mm_set1_epi32(b));
+#else  // SSE2: combine two opposite shifts
+    __m128i cnt_l = _mm_cvtsi32_si128(b & 0x1F);           // left count, modulo 32
+    __m128i cnt_r = _mm_cvtsi32_si128((32-b) & 0x1F);      // complementary right count
+    __m128i up    = _mm_sll_epi32(a,cnt_l);                // a << b
+    return  _mm_or_si128(up,_mm_srl_epi32(a,cnt_r));       // | a >> (32 - b)
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 4 32-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec4ui : public Vec4i {
+public:
+    // Default constructor: assigns nothing
+    Vec4ui() {
+    }
+    // Broadcast constructor: every element becomes i
+    Vec4ui(uint32_t i) {
+        xmm = _mm_set1_epi32(i);
+    }
+    // Element-wise constructor: i0 becomes element 0, i3 becomes element 3
+    Vec4ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) {
+        xmm = _mm_setr_epi32(i0, i1, i2, i3);
+    }
+    // Wrap a raw __m128i as produced by intrinsics
+    Vec4ui(__m128i const & x) {
+        xmm = x;
+    }
+    // Assign from a raw __m128i as produced by intrinsics
+    Vec4ui & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Load all elements from an array (unaligned)
+    Vec4ui & load(void const * p) {
+        xmm = _mm_loadu_si128(static_cast<__m128i const*>(p));
+        return *this;
+    }
+    // Load all elements from an array (aligned)
+    Vec4ui & load_a(void const * p) {
+        xmm = _mm_load_si128(static_cast<__m128i const*>(p));
+        return *this;
+    }
+    // Replace a single element through the Vec4i implementation.
+    // Inefficient: use load when more than one element changes
+    Vec4ui const & insert(uint32_t index, uint32_t value) {
+        this->Vec4i::insert(index, value);
+        return *this;
+    }
+    // Read a single element through the Vec4i implementation
+    uint32_t extract(uint32_t index) const {
+        return this->Vec4i::extract(index);
+    }
+    // Read a single element. Use store when reading more than one element.
+    // Operator [] is read-only; it cannot be used to write an element.
+    uint32_t operator [] (uint32_t index) const {
+        return this->extract(index);
+    }
+};
+
+// Define operators for this class
+
+// operator + : element-wise add, delegated to the Vec4i operator
+static inline Vec4ui operator + (Vec4ui const & a, Vec4ui const & b) {
+    return Vec4ui(static_cast<Vec4i const &>(a) + static_cast<Vec4i const &>(b));
+}
+
+// operator - : element-wise subtract, delegated to the Vec4i operator
+static inline Vec4ui operator - (Vec4ui const & a, Vec4ui const & b) {
+    return Vec4ui(static_cast<Vec4i const &>(a) - static_cast<Vec4i const &>(b));
+}
+
+// operator * : element-wise multiply, delegated to the Vec4i operator
+static inline Vec4ui operator * (Vec4ui const & a, Vec4ui const & b) {
+    return Vec4ui(static_cast<Vec4i const &>(a) * static_cast<Vec4i const &>(b));
+}
+
+// vector operator / : divide
+// See bottom of file
+
+// operator >> : logical right shift of every element by b bits
+static inline Vec4ui operator >> (Vec4ui const & a, uint32_t b) {
+    return Vec4ui(_mm_srl_epi32(a, _mm_cvtsi32_si128((int)b)));
+}
+
+// operator >> : logical right shift; signed count forwarded to the uint32_t overload
+static inline Vec4ui operator >> (Vec4ui const & a, int32_t b) {
+    return a >> static_cast<uint32_t>(b);
+}
+
+// operator >>= : logical right shift of a in place
+static inline Vec4ui & operator >>= (Vec4ui & a, int b) {
+    Vec4ui const shifted = a >> b;
+    return a = shifted;
+}
+
+// operator << : left shift of every element, delegated to the Vec4i operator
+static inline Vec4ui operator << (Vec4ui const & a, uint32_t b) {
+    return Vec4ui(static_cast<Vec4i const &>(a) << static_cast<int32_t>(b));
+}
+
+// operator << : left shift of every element, delegated to the Vec4i operator
+static inline Vec4ui operator << (Vec4ui const & a, int32_t b) {
+    return Vec4ui(static_cast<Vec4i const &>(a) << b);
+}
+
+// operator > : unsigned element-wise compare, true for elements where a > b
+static inline Vec4ib operator > (Vec4ui const & a, Vec4ui const & b) {
+#ifndef __XOP__  // SSE2: only a signed compare is used, so bias both operands
+    __m128i bias     = _mm_set1_epi32(0x80000000);         // toggles the top bit
+    __m128i a_biased = _mm_xor_si128(a,bias);              // unsigned order mapped to signed order
+    __m128i b_biased = _mm_xor_si128(b,bias);
+    return _mm_cmpgt_epi32(a_biased,b_biased);             // signed compare
+#else  // AMD XOP instruction set
+    return _mm_comgt_epu32(a,b);
+#endif
+}
+
+// operator < : unsigned element-wise compare, true for elements where a < b
+static inline Vec4ib operator < (Vec4ui const & a, Vec4ui const & b) {
+    return operator > (b, a);                              // a < b is b > a
+}
+
+// operator >= : unsigned element-wise compare, true for elements where a >= b
+static inline Vec4ib operator >= (Vec4ui const & a, Vec4ui const & b) {
+#if defined(__XOP__)   // AMD XOP instruction set
+    return _mm_comge_epu32(a,b);
+#elif INSTRSET >= 5    // SSE4.1: a >= b exactly when a equals the unsigned maximum
+    __m128i larger = _mm_max_epu32(a,b);
+    return _mm_cmpeq_epi32(a,larger);
+#else                  // SSE2: complement of the opposite strict compare
+    return Vec4ib(Vec4i (~(b > a)));
+#endif
+}
+
+// operator <= : unsigned element-wise compare, true for elements where a <= b
+static inline Vec4ib operator <= (Vec4ui const & a, Vec4ui const & b) {
+    return operator >= (b, a);                             // a <= b is b >= a
+}
+
+// operator & : bitwise and; operator && is an alias for it
+static inline Vec4ui operator & (Vec4ui const & a, Vec4ui const & b) {
+    return Vec4ui(Vec128b(a) & Vec128b(b));
+}
+static inline Vec4ui operator && (Vec4ui const & a, Vec4ui const & b) {
+    return operator & (a, b);
+}
+
+// operator | : bitwise or; operator || is an alias for it
+static inline Vec4ui operator | (Vec4ui const & a, Vec4ui const & b) {
+    return Vec4ui(Vec128b(a) | Vec128b(b));
+}
+static inline Vec4ui operator || (Vec4ui const & a, Vec4ui const & b) {
+    return operator | (a, b);
+}
+
+// vector operator ^ : bitwise xor, computed on the Vec128b view of both operands
+static inline Vec4ui operator ^ (Vec4ui const & a, Vec4ui const & b) {
+    return Vec4ui(Vec128b(a) ^ Vec128b(b));
+}
+
+// vector operator ~ : bitwise not, computed on the Vec128b view of the operand
+static inline Vec4ui operator ~ (Vec4ui const & a) {
+    return Vec4ui( ~ Vec128b(a));
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec4ui select (Vec4ib const & s, Vec4ui const & a, Vec4ui const & b) {
+    return selectb(s,a,b);
+}
+
+// if_add: for every element i, result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4ui if_add (Vec4ib const & f, Vec4ui const & a, Vec4ui const & b) {
+    return operator + (a, operator & (Vec4ui(f), b));      // b is and-ed with the mask f before the add
+}
+
+// horizontal_add: sum of all vector elements.
+// The result wraps around on overflow
+static inline uint32_t horizontal_add (Vec4ui const & a) {
+    return horizontal_add(static_cast<Vec4i const &>(a));  // reuse the Vec4i implementation
+}
+
+// horizontal_add_x: sum of the four 32-bit elements of a, returned as a
+// 64-bit value. Each element is zero extended first, so the sum cannot overflow.
+static inline uint64_t horizontal_add_x (Vec4ui const & a) {
+#ifdef __XOP__     // AMD XOP instruction set
+    __m128i pairs = _mm_haddq_epu32(a);
+#else              // SSE2: widen by interleaving with zero words
+    __m128i nil   = _mm_setzero_si128();                   // upper halves of the qwords
+    __m128i lo64  = _mm_unpacklo_epi32(a,nil);             // a0, a1 as unsigned qwords
+    __m128i hi64  = _mm_unpackhi_epi32(a,nil);             // a2, a3 as unsigned qwords
+    __m128i pairs = _mm_add_epi64(lo64,hi64);              // a0+a2, a1+a3
+#endif
+    __m128i upper = _mm_unpackhi_epi64(pairs,pairs);       // bring the high qword down
+    __m128i total = _mm_add_epi64(pairs,upper);            // low qword = full sum
+#if defined(_M_AMD64) || defined(_M_X64) || defined(__x86_64__) || defined(__amd64)
+    return          _mm_cvtsi128_si64(total);              // 64 bit mode: direct move
+#else
+    union {         // 32 bit mode: go through memory
+        __m128i  v; // _mm_storel_epi64 takes an __m128i pointer
+        uint64_t q;
+    } out;
+    _mm_storel_epi64(&out.v,total);
+    return out.q;
+#endif
+}
+
+// add_saturated: element-wise unsigned a + b, clamped to 0xFFFFFFFF
+static inline Vec4ui add_saturated(Vec4ui const & a, Vec4ui const & b) {
+    Vec4ui const wrapped = a + b;                          // sum, possibly wrapped around
+    Vec4ui const either  = a | b;                          // a | b
+    Vec4ui const carried(wrapped < either);                // overflow if a + b < (a | b)
+    return wrapped | carried;                              // 0xFFFFFFFF where overflowed
+}
+
+// sub_saturated: element-wise unsigned a - b, clamped to 0
+static inline Vec4ui sub_saturated(Vec4ui const & a, Vec4ui const & b) {
+    Vec4ui const wrapped = a - b;                          // difference, possibly wrapped around
+    Vec4ui const borrowed(wrapped > a);                    // underflow if a - b > a
+    return Vec4ui(_mm_andnot_si128(borrowed,wrapped));     // 0 where underflowed
+}
+
+// max: element-wise unsigned maximum of a and b
+static inline Vec4ui max(Vec4ui const & a, Vec4ui const & b) {
+#if INSTRSET < 5    // SSE2: compare, then select
+    return select(a > b, a, b);
+#else               // SSE4.1
+    return Vec4ui(_mm_max_epu32(a,b));
+#endif
+}
+
+// min: element-wise unsigned minimum of a and b
+static inline Vec4ui min(Vec4ui const & a, Vec4ui const & b) {
+#if INSTRSET < 5    // SSE2: compare, then select
+    return select(a > b, b, a);
+#else               // SSE4.1
+    return Vec4ui(_mm_min_epu32(a,b));
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 2 64-bit signed integers
+*
+*****************************************************************************/
+
+class Vec2q : public Vec128b {
+public:
+    // Default constructor: assigns nothing to the vector
+    Vec2q() {
+    }
+    // Constructor to broadcast the same value into both elements:
+    Vec2q(int64_t i) {
+#if defined (_MSC_VER) && ! defined(__INTEL_COMPILER)
+        // MS compiler has no _mm_set1_epi64x in 32 bit mode
+#if defined(__x86_64__)                                    // 64 bit mode
+#if _MSC_VER < 1700
+        __m128i x1 = _mm_cvtsi64_si128(i);                 // 64 bit load
+        xmm = _mm_unpacklo_epi64(x1,x1);                   // broadcast
+#else
+		xmm =  _mm_set1_epi64x(i);
+#endif
+#else
+        union {                                            // view the two qwords as four dwords
+            int64_t q[2];
+            int32_t r[4];
+        } u;
+        u.q[0] = u.q[1] = i;
+        xmm = _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]);
+        /*    // this will use an mm register and produce store forwarding stall:
+        union {
+            __m64 m;
+            int64_t ii;
+        } u;
+        u.ii = i;
+        xmm = _mm_set1_epi64(u.m);
+		_m_empty();        */
+
+#endif  // __x86_64__
+#else   // Other compilers
+        xmm = _mm_set1_epi64x(i);   // emmintrin.h
+#endif
+    }
+    // Constructor to build from all elements (i0 is element 0, i1 is element 1):
+    Vec2q(int64_t i0, int64_t i1) {
+#if defined (_MSC_VER) && ! defined(__INTEL_COMPILER)
+        // MS compiler has no _mm_set_epi64x in 32 bit mode
+#if defined(__x86_64__)                                    // 64 bit mode
+#if _MSC_VER < 1700
+        __m128i x0 = _mm_cvtsi64_si128(i0);                // 64 bit load
+        __m128i x1 = _mm_cvtsi64_si128(i1);                // 64 bit load
+        xmm = _mm_unpacklo_epi64(x0,x1);                   // combine
+#else
+		xmm = _mm_set_epi64x(i1, i0);
+#endif
+#else   // MS compiler in 32-bit mode
+        union {                                            // view the two qwords as four dwords
+            int64_t q[2];
+            int32_t r[4];
+        } u;
+        u.q[0] = i0;  u.q[1] = i1;
+		// this is inefficient, but other solutions are worse
+        xmm = _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]);
+#endif  // __x86_64__
+#else   // Other compilers
+        xmm = _mm_set_epi64x(i1, i0);
+#endif
+    }
+    // Constructor to convert from type __m128i used in intrinsics:
+    Vec2q(__m128i const & x) {
+        xmm = x;
+    }
+    // Assignment operator to convert from type __m128i used in intrinsics:
+    Vec2q & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m128i used in intrinsics
+    operator __m128i() const {
+        return xmm;
+    }
+    // Member function to load from array (unaligned)
+    Vec2q & load(void const * p) {
+        xmm = _mm_loadu_si128((__m128i const*)p);
+        return *this;
+    }
+    // Member function to load from array (aligned)
+    Vec2q & load_a(void const * p) {
+        xmm = _mm_load_si128((__m128i const*)p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0.
+    // Any n other than 0, 1 or 2 leaves the vector unchanged
+    Vec2q & load_partial(int n, void const * p) {
+        switch (n) {
+        case 0:
+            *this = 0;  break;
+        case 1:
+            // reads a single int64_t through p; element 1 is set to 0
+            *this = Vec2q(*(int64_t*)p, 0);  break;
+        case 2:
+            load(p);  break;
+        default: 
+            break;
+        }
+        return *this;
+    }
+    // Partial store. Store n elements. Any n other than 1 or 2 stores nothing
+    void store_partial(int n, void * p) const {
+        switch (n) {
+        case 1:
+            int64_t q[2];                                  // temporary holding both elements
+            store(q);
+            *(int64_t*)p = q[0];  break;                   // only element 0 is written to p
+        case 2:
+            store(p);  break;
+        default:
+            break;
+        }
+    }
+    // cut off vector to n elements. The last 2-n elements are set to zero
+    Vec2q & cutoff(int n) {
+        *this = Vec16c(xmm).cutoff(n * 8);                 // n elements of 8 bytes each
+        return *this;
+    }
+    // Member function to change a single element in vector. Any nonzero index selects element 1.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec2q const & insert(uint32_t index, int64_t value) {
+#if INSTRSET >= 5 && defined(__x86_64__)  // SSE4.1 supported, 64 bit mode
+        if (index == 0) {
+            xmm = _mm_insert_epi64(xmm,value,0);
+        }
+        else {
+            xmm = _mm_insert_epi64(xmm,value,1);
+        }
+
+#else               // SSE2
+#if defined(__x86_64__)                                      // 64 bit mode
+        __m128i v = _mm_cvtsi64_si128(value);                // 64 bit load
+#else
+        union {
+            __m128i m;
+            int64_t ii;
+        } u;
+        u.ii = value;
+        __m128i v = _mm_loadl_epi64(&u.m);                   // value in the low qword of v
+#endif
+        if (index == 0) {
+            v = _mm_unpacklo_epi64(v,v);     
+            xmm = _mm_unpackhi_epi64(v,xmm);                 // (value, old element 1)
+        }
+        else {  // index = 1
+            xmm = _mm_unpacklo_epi64(xmm,v);                 // (old element 0, value)
+        }
+#endif
+        return *this;
+    }
+    // Member function extract a single element from vector. Only bit 0 of index is used
+    int64_t extract(uint32_t index) const {
+        int64_t x[2];
+        store(x);
+        return x[index & 1];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int64_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    static int size() {                                    // number of elements in the vector
+        return 2;
+    }
+};
+
+/*****************************************************************************
+*
+*          Vec2qb: Vector of 2 Booleans for use with Vec2q and Vec2uq
+*
+*****************************************************************************/
+// Definition will be different for the AVX512 instruction set
+class Vec2qb : public Vec2q {
+public:
+    // Default constructor: assigns nothing
+    Vec2qb() {
+    }
+    // Element-wise constructor: true is stored as -1, false as 0
+    Vec2qb(bool x0, bool x1) {
+        xmm = Vec2q(-static_cast<int64_t>(x0), -static_cast<int64_t>(x1));
+    }
+    // Wrap a raw __m128i as produced by intrinsics
+    Vec2qb(__m128i const & x) {
+        xmm = x;
+    }
+    // Assign from a raw __m128i as produced by intrinsics
+    Vec2qb & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Broadcast constructor: both elements take the value of b
+    Vec2qb(bool b) : Vec2q(-static_cast<int64_t>(b)) {
+    }
+    // Broadcast assignment: both elements take the value of b
+    Vec2qb & operator = (bool b) {
+        *this = Vec2qb(b);
+        return *this;
+    }
+private: // Private declarations: prevent constructing or assigning from int, etc.
+    Vec2qb(int b);
+    Vec2qb & operator = (int x);
+public:
+    Vec2qb & insert (int index, bool a) {                  // replace one element; true is stored as -1
+        Vec2q::insert(index, -static_cast<int64_t>(a));
+        return *this;
+    }
+    // Read a single element: true if the stored element is nonzero
+    bool extract(uint32_t index) const {
+        return 0 != Vec2q::extract(index);
+    }
+    // Read a single element. Use store when reading more than one element.
+    // Operator [] is read-only; it cannot be used to write an element.
+    bool operator [] (uint32_t index) const {
+        return this->extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec2qb
+*
+*****************************************************************************/
+
+// operator & : bitwise and of two boolean vectors
+static inline Vec2qb operator & (Vec2qb const & a, Vec2qb const & b) {
+    return Vec2qb(static_cast<Vec128b const &>(a) & static_cast<Vec128b const &>(b));
+}
+static inline Vec2qb operator && (Vec2qb const & a, Vec2qb const & b) {
+    return operator & (a, b);                              // && is an alias for &
+}
+// operator &= : and b into a
+static inline Vec2qb & operator &= (Vec2qb & a, Vec2qb const & b) {
+    Vec2qb const both = a & b;
+    return a = both;
+}
+
+// operator | : bitwise or of two boolean vectors
+static inline Vec2qb operator | (Vec2qb const & a, Vec2qb const & b) {
+    return Vec2qb(static_cast<Vec128b const &>(a) | static_cast<Vec128b const &>(b));
+}
+static inline Vec2qb operator || (Vec2qb const & a, Vec2qb const & b) {
+    return operator | (a, b);                              // || is an alias for |
+}
+// operator |= : or b into a
+static inline Vec2qb & operator |= (Vec2qb & a, Vec2qb const & b) {
+    Vec2qb const either = a | b;
+    return a = either;
+}
+
+// operator ^ : bitwise xor of two boolean vectors
+static inline Vec2qb operator ^ (Vec2qb const & a, Vec2qb const & b) {
+    return Vec2qb(static_cast<Vec128b const &>(a) ^ static_cast<Vec128b const &>(b));
+}
+// operator ^= : xor b into a
+static inline Vec2qb & operator ^= (Vec2qb & a, Vec2qb const & b) {
+    Vec2qb const toggled = a ^ b;
+    return a = toggled;
+}
+
+// operator ~ : bitwise not of a boolean vector
+static inline Vec2qb operator ~ (Vec2qb const & a) {
+    return Vec2qb(~static_cast<Vec128b const &>(a));
+}
+
+// operator ! : element not; implemented as the bitwise not
+static inline Vec2qb operator ! (Vec2qb const & a) {
+    return operator ~ (a);
+}
+
+// andnot: forwards both operands to the Vec128b andnot and wraps the result
+static inline Vec2qb andnot (Vec2qb const & a, Vec2qb const & b) {
+    return Vec2qb(andnot(static_cast<Vec128b const &>(a), static_cast<Vec128b const &>(b)));
+}
+
+
+/*****************************************************************************
+*
+*          Operators for Vec2q
+*
+*****************************************************************************/
+
+// operator + : element-wise 64-bit add
+static inline Vec2q operator + (Vec2q const & a, Vec2q const & b) {
+    return Vec2q(_mm_add_epi64(a, b));
+}
+
+// operator += : add b into a
+static inline Vec2q & operator += (Vec2q & a, Vec2q const & b) {
+    Vec2q const total = a + b;
+    return a = total;
+}
+
+// postfix ++ : add 1 to every element, return the value from before
+static inline Vec2q operator ++ (Vec2q & a, int) {
+    Vec2q const before(a);
+    a = operator + (a, Vec2q(1));
+    return before;
+}
+
+// prefix ++ : add 1 to every element, return the updated vector
+static inline Vec2q & operator ++ (Vec2q & a) {
+    Vec2q const next = a + Vec2q(1);
+    return a = next;
+}
+
+// operator - : element-wise 64-bit subtract
+static inline Vec2q operator - (Vec2q const & a, Vec2q const & b) {
+    return Vec2q(_mm_sub_epi64(a, b));
+}
+
+// unary operator - : negate every element
+static inline Vec2q operator - (Vec2q const & a) {
+    return Vec2q(_mm_sub_epi64(_mm_setzero_si128(), a));   // 0 - a
+}
+
+// operator -= : subtract b from a
+static inline Vec2q & operator -= (Vec2q & a, Vec2q const & b) {
+    Vec2q const rest = a - b;
+    return a = rest;
+}
+
+// postfix -- : subtract 1 from every element, return the value from before
+static inline Vec2q operator -- (Vec2q & a, int) {
+    Vec2q const before(a);
+    a = operator - (a, Vec2q(1));
+    return before;
+}
+
+// prefix -- : subtract 1 from every element, return the updated vector
+static inline Vec2q & operator -- (Vec2q & a) {
+    Vec2q const next = a - Vec2q(1);
+    return a = next;
+}
+
+// vector operator * : multiply element by element; each product wraps around to 64 bits
+static inline Vec2q operator * (Vec2q const & a, Vec2q const & b) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    // instruction does not exist. Split into 32-bit multiplies
+    __m128i bswap   = _mm_shuffle_epi32(b,0xB1);           // b0H,b0L,b1H,b1L (swap H<->L)
+    __m128i prodlh  = _mm_mullo_epi32(a,bswap);            // a0Lb0H,a0Hb0L,a1Lb1H,a1Hb1L, 32 bit L*H products
+    __m128i zero    = _mm_setzero_si128();                 // 0
+    __m128i prodlh2 = _mm_hadd_epi32(prodlh,zero);         // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
+    __m128i prodlh3 = _mm_shuffle_epi32(prodlh2,0x73);     // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
+    __m128i prodll  = _mm_mul_epu32(a,b);                  // a0Lb0L,a1Lb1L, 64 bit unsigned products
+    __m128i prod    = _mm_add_epi64(prodll,prodlh3);       // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
+    return  prod;
+#else               // SSE2
+    uint64_t aa[2], bb[2];                                 // unsigned, so overflow wraps (as in the SSE4.1 path) instead of being undefined
+    a.store(aa);                                           // split into elements
+    b.store(bb);
+    return Vec2q((int64_t)(aa[0]*bb[0]), (int64_t)(aa[1]*bb[1])); // multiply elements separately
+#endif
+}
+
+// operator *= : multiply a by b in place
+static inline Vec2q & operator *= (Vec2q & a, Vec2q const & b) {
+    Vec2q const product = a * b;
+    return a = product;
+}
+
+// operator << : left shift of every element by b bits
+static inline Vec2q operator << (Vec2q const & a, int32_t b) {
+    return Vec2q(_mm_sll_epi64(a, _mm_cvtsi32_si128(b)));
+}
+
+// operator <<= : left shift of a in place
+static inline Vec2q & operator <<= (Vec2q & a, int32_t b) {
+    Vec2q const shifted = a << b;
+    return a = shifted;
+}
+
+// operator >> : arithmetic right shift of every element by b bits
+static inline Vec2q operator >> (Vec2q const & a, int32_t b) {
+    // No 64-bit arithmetic shift instruction is used; the result is built from other shifts
+    if (b > 32) {
+        __m128i cnt     = _mm_cvtsi32_si128(b-32);         // b - 32
+        __m128i fill    = _mm_srai_epi32(a,31);            // every dword replaced by its sign
+        __m128i hi_sh   = _mm_sra_epi32(a,cnt);            // a >> (b-32), signed dwords
+        __m128i lo_part = _mm_srli_epi64(hi_sh,32);        // shifted high dword moved down into the low dword
+        __m128i hi_sel  = _mm_setr_epi32(0,-1,0,-1);       // selects the high dword of each qword
+        return  selectb(hi_sel,fill,lo_part);              // high dword: sign only, low dword: shifted bits
+    }
+    else {  // b <= 32
+        __m128i cnt     = _mm_cvtsi32_si128(b);            // b
+        __m128i arith   = _mm_sra_epi32(a,cnt);            // a >> b, signed dwords
+        __m128i logic   = _mm_srl_epi64(a,cnt);            // a >> b, unsigned qwords
+        __m128i hi_sel  = _mm_setr_epi32(0,-1,0,-1);       // selects the high dword of each qword
+        return  selectb(hi_sel,arith,logic);               // high dword from arith, low dword from logic
+    }
+}
+
+// operator >>= : arithmetic right shift of a in place
+static inline Vec2q & operator >>= (Vec2q & a, int32_t b) {
+    Vec2q const shifted = a >> b;
+    return a = shifted;
+}
+
+// operator == : element-wise compare, true for elements where a == b
+static inline Vec2qb operator == (Vec2q const & a, Vec2q const & b) {
+#if INSTRSET < 5    // SSE2
+    // no 64 bit compare is used here: both 32-bit halves must compare equal
+    __m128i eq32    = _mm_cmpeq_epi32(a,b);                // per-dword equality
+    __m128i eq32x   = _mm_shuffle_epi32(eq32,0xB1);        // same, with low and high dwords swapped
+    __m128i both    = _mm_and_si128(eq32,eq32x);           // low & high
+    __m128i spread  = _mm_srai_epi32(both,31);             // sign bit extended over each dword
+    __m128i widened = _mm_shuffle_epi32(spread,0xF5);      // high dword copied over the low dword
+    return  Vec2qb(Vec2q(widened));
+#else               // SSE4.1 supported
+    return _mm_cmpeq_epi64(a, b);
+#endif
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec2qb operator != (Vec2q const & a, Vec2q const & b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return Vec2qb(_mm_comneq_epi64(a,b));                  // build the Vec2qb directly: Vec2q has no implicit conversion to Vec2qb
+#else  // SSE2 instruction set
+    return Vec2qb(Vec2q(~(a == b)));                       // a != b is the complement of a == b
+#endif
+}
+  
+// operator < : signed element-wise compare, true for elements where a < b
+static inline Vec2qb operator < (Vec2q const & a, Vec2q const & b) {
+#if INSTRSET < 6    // SSE2
+    // no 64 bit compare is used here: derive the result from a - b
+    __m128i diff     = _mm_sub_epi64(a,b);                 // a-b
+    // a < b if a and b have the same sign and diff < 0, or if (a < 0 and b >= 0).
+    // The second case corrects for overflow in the subtraction
+    __m128i mixed    = _mm_xor_si128(a,b);                 // a ^ b
+    __m128i a_only   = _mm_andnot_si128(b,a);              // a & ~b
+    __m128i same_neg = _mm_andnot_si128(mixed,diff);       // diff & ~(a ^ b)
+    __m128i verdict  = _mm_or_si128(a_only,same_neg);      // (a & ~b) | (diff & ~(a ^ b))
+    __m128i spread   = _mm_srai_epi32(verdict,31);         // sign bit extended over each dword
+    __m128i widened  = _mm_shuffle_epi32(spread,0xF5);     // high dword copied over the low dword
+    return  widened;
+#else               // SSE4.2 supported
+    return Vec2qb(Vec2q(_mm_cmpgt_epi64(b, a)));
+#endif
+}
+
+// operator > : signed element-wise compare, true for elements where a > b
+static inline Vec2qb operator > (Vec2q const & a, Vec2q const & b) {
+    return operator < (b, a);                              // a > b is b < a
+}
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec2qb operator >= (Vec2q const & a, Vec2q const & b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return Vec2qb(_mm_comge_epi64(a,b));                   // build the Vec2qb directly: Vec2q has no implicit conversion to Vec2qb
+#else  // SSE2 instruction set
+    return Vec2qb(Vec2q(~(a < b)));                        // a >= b is the complement of a < b
+#endif
+}
+
+// operator <= : signed element-wise compare, true for elements where a <= b
+static inline Vec2qb operator <= (Vec2q const & a, Vec2q const & b) {
+    return operator >= (b, a);                             // a <= b is b >= a
+}
+
+// operator & : bitwise and
+static inline Vec2q operator & (Vec2q const & a, Vec2q const & b) {
+    return Vec2q(static_cast<Vec128b const &>(a) & static_cast<Vec128b const &>(b));
+}
+static inline Vec2q operator && (Vec2q const & a, Vec2q const & b) {
+    return operator & (a, b);                              // && is an alias for &
+}
+// operator &= : and b into a
+static inline Vec2q & operator &= (Vec2q & a, Vec2q const & b) {
+    Vec2q const both = a & b;
+    return a = both;
+}
+
+// operator | : bitwise or
+static inline Vec2q operator | (Vec2q const & a, Vec2q const & b) {
+    return Vec2q(static_cast<Vec128b const &>(a) | static_cast<Vec128b const &>(b));
+}
+static inline Vec2q operator || (Vec2q const & a, Vec2q const & b) {
+    return operator | (a, b);                              // || is an alias for |
+}
+// operator |= : or b into a
+static inline Vec2q & operator |= (Vec2q & a, Vec2q const & b) {
+    Vec2q const either = a | b;
+    return a = either;
+}
+
+// operator ^ : bitwise xor
+static inline Vec2q operator ^ (Vec2q const & a, Vec2q const & b) {
+    return Vec2q(static_cast<Vec128b const &>(a) ^ static_cast<Vec128b const &>(b));
+}
+// operator ^= : xor b into a
+static inline Vec2q & operator ^= (Vec2q & a, Vec2q const & b) {
+    Vec2q const toggled = a ^ b;
+    return a = toggled;
+}
+
+// operator ~ : bitwise not
+static inline Vec2q operator ~ (Vec2q const & a) {
+    return Vec2q(~static_cast<Vec128b const &>(a));
+}
+
+// operator ! : logical not, true for every element that equals 0
+static inline Vec2qb operator ! (Vec2q const & a) {
+    return operator == (a, Vec2q(_mm_setzero_si128()));
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec2q select (Vec2qb const & s, Vec2q const & a, Vec2q const & b) {
+    return selectb(s,a,b);
+}
+
+// if_add: for every element i, result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec2q if_add (Vec2qb const & f, Vec2q const & a, Vec2q const & b) {
+    return operator + (a, operator & (Vec2q(f), b));       // b is and-ed with the mask f before the add
+}
+
+// horizontal_add: sum of both vector elements.
+// The result wraps around on overflow
+static inline int64_t horizontal_add (Vec2q const & a) {
+    __m128i upper = _mm_shuffle_epi32(a,0x0E);             // high element moved to the low position
+    __m128i total = _mm_add_epi64(a,upper);                // low qword = a0 + a1
+#if defined(__x86_64__)
+    return          _mm_cvtsi128_si64(total);              // 64 bit mode: direct move
+#else
+    union {         // 32 bit mode: go through memory
+        __m128i v;  // _mm_storel_epi64 takes an __m128i pointer
+        int64_t q;
+    } out;
+    _mm_storel_epi64(&out.v,total);
+    return out.q;
+#endif
+}
+
+// max: element-wise signed maximum, a where a > b and b otherwise
+static inline Vec2q max(Vec2q const & a, Vec2q const & b) {
+    return select(operator > (a, b), a, b);
+}
+
+// min: element-wise signed minimum, a where a < b and b otherwise
+static inline Vec2q min(Vec2q const & a, Vec2q const & b) {
+    return select(operator < (a, b), a, b);
+}
+
+// abs: element-wise a >= 0 ? a : -a
+static inline Vec2q abs(Vec2q const & a) {
+#if INSTRSET < 6      // SSE2
+    __m128i hi_sign = _mm_srai_epi32(a,31);                // sign extended over each dword
+    __m128i neg     = _mm_shuffle_epi32(hi_sign,0xF5);     // high dword's sign copied to the low dword
+#else                 // SSE4.2 supported
+    __m128i neg     = _mm_cmpgt_epi64(_mm_setzero_si128(),a); // 0 > a
+#endif
+    __m128i flip    = _mm_xor_si128(a,neg);                // ~a where negative, a otherwise
+    return            _mm_sub_epi64(flip,neg);             // subtracting -1 adds the final 1
+}
+
+// abs_saturated: like abs, but a result that still has its sign bit set is reduced by one
+static inline Vec2q abs_saturated(Vec2q const & a) {
+    __m128i mag      = abs(a);                             // may still be 0x8000000000000000
+#if INSTRSET < 6      // SSE2
+    __m128i hi_sign  = _mm_srai_epi32(mag,31);             // sign extended over each dword
+    __m128i wrapped  = _mm_shuffle_epi32(hi_sign,0xF5);    // high dword's sign copied to the low dword
+#else                 // SSE4.2 supported
+    __m128i wrapped  = _mm_cmpgt_epi64(_mm_setzero_si128(),mag); // 0 > mag
+#endif
+    return             _mm_add_epi64(mag,wrapped);         // subtract 1 if 0x8000000000000000
+}
+
+// rotate_left: rotate every element left by b bits.
+// A negative b rotates right.
+static inline Vec2q rotate_left(Vec2q const & a, int b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return _mm_rot_epi64(a,Vec2q(b));
+#else  // SSE2: combine two opposite shifts
+    __m128i cnt_l = _mm_cvtsi32_si128(b & 0x3F);           // left count, modulo 64
+    __m128i cnt_r = _mm_cvtsi32_si128((64-b) & 0x3F);      // complementary right count
+    __m128i up    = _mm_sll_epi64(a,cnt_l);                // a << b
+    return  _mm_or_si128(up,_mm_srl_epi64(a,cnt_r));       // | a >> (64 - b)
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 2 64-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec2uq : public Vec2q {
+public:
+    // Default constructor
+    Vec2uq() {
+    }
+    // Broadcast constructor: every element gets the value i
+    Vec2uq(uint64_t i) : Vec2q(i) {
+    }
+    // Element-wise constructor: i0 is element 0, i1 is element 1
+    Vec2uq(uint64_t i0, uint64_t i1) : Vec2q(i0, i1) {
+    }
+    // Conversion from the intrinsic type __m128i
+    Vec2uq(__m128i const & x) : Vec2q(x) {
+    }
+    // Assignment from the intrinsic type __m128i
+    Vec2uq & operator = (__m128i const & x) {
+        xmm = x;
+        return *this;
+    }
+    // Load the vector from memory; p need not be aligned
+    Vec2uq & load(void const * p) {
+        __m128i const * const src = static_cast<__m128i const *>(p);
+        xmm = _mm_loadu_si128(src);
+        return *this;
+    }
+    // Load the vector from memory; uses the aligned load instruction
+    Vec2uq & load_a(void const * p) {
+        __m128i const * const src = static_cast<__m128i const *>(p);
+        xmm = _mm_load_si128(src);
+        return *this;
+    }
+    // Replace a single element of the vector.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec2uq const & insert(uint32_t index, uint64_t value) {
+        Vec2q::insert(index, static_cast<int64_t>(value));
+        return *this;
+    }
+    // Read a single element of the vector as an unsigned value
+    uint64_t extract(uint32_t index) const {
+        return static_cast<uint64_t>(Vec2q::extract(index));
+    }
+    // Read a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint64_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : element-wise add, done by the signed implementation
+static inline Vec2uq operator + (Vec2uq const & a, Vec2uq const & b) {
+    Vec2q const lhs = a;
+    Vec2q const rhs = b;
+    return Vec2uq (lhs + rhs);
+}
+
+// vector operator - : element-wise subtract, done by the signed implementation
+static inline Vec2uq operator - (Vec2uq const & a, Vec2uq const & b) {
+    Vec2q const lhs = a;
+    Vec2q const rhs = b;
+    return Vec2uq (lhs - rhs);
+}
+
+// vector operator * : element-wise multiply, done by the signed implementation
+static inline Vec2uq operator * (Vec2uq const & a, Vec2uq const & b) {
+    Vec2q const lhs = a;
+    Vec2q const rhs = b;
+    return Vec2uq (lhs * rhs);
+}
+
+// vector operator >> : logical (zero-filling) right shift of all elements by b bits
+static inline Vec2uq operator >> (Vec2uq const & a, uint32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);   // shift count in the low bits of an xmm register
+    return _mm_srl_epi64(a, count);
+}
+
+// vector operator >> : logical right shift, signed count forwarded to the uint32_t overload
+static inline Vec2uq operator >> (Vec2uq const & a, int32_t b) {
+    uint32_t const count = static_cast<uint32_t>(b);
+    return a >> count;
+}
+
+// vector operator >>= : logical right shift in place, returns the updated operand
+static inline Vec2uq & operator >>= (Vec2uq & a, int b) {
+    Vec2uq const shifted = a >> b;
+    a = shifted;
+    return a;
+}
+
+// vector operator << : left shift of all elements, done by the signed implementation
+static inline Vec2uq operator << (Vec2uq const & a, uint32_t b) {
+    Vec2q const   value = static_cast<Vec2q>(a);
+    int32_t const count = static_cast<int32_t>(b);
+    return Vec2uq (value << count);
+}
+
+// vector operator << : left shift of all elements by a signed count
+static inline Vec2uq operator << (Vec2uq const & a, int32_t b) {
+    Vec2q const value = static_cast<Vec2q>(a);
+    return Vec2uq (value << b);
+}
+
+// vector operator > : per-element mask, true where a > b (unsigned compare)
+static inline Vec2qb operator > (Vec2uq const & a, Vec2uq const & b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return Vec2q(_mm_comgt_epu64(a,b));
+#else  // SSE2 instruction set
+    // SSE2 has only signed 32-bit compares. Flipping the sign bit of every
+    // dword makes the signed compare order the dwords as unsigned numbers.
+    __m128i const bias       = _mm_set1_epi32(0x80000000);
+    __m128i const a_biased   = _mm_xor_si128(a,bias);
+    __m128i const b_biased   = _mm_xor_si128(b,bias);
+    __m128i const dw_equal   = _mm_cmpeq_epi32(a,b);                  // a == b, per dword
+    __m128i const dw_greater = _mm_cmpgt_epi32(a_biased,b_biased);    // a > b, per dword
+    // bring the low-dword result up next to the high-dword result
+    __m128i const lo_greater = _mm_shuffle_epi32(dw_greater,0xA0);
+    // 64-bit a > b: high dword bigger, or high dwords equal and low dword bigger
+    __m128i const tie_break  = _mm_and_si128(dw_equal,lo_greater);
+    __m128i const hi_result  = _mm_or_si128(dw_greater,tie_break);
+    // the answer sits in the high dword of each element; copy it to the low dword
+    __m128i const mask64     = _mm_shuffle_epi32(hi_result,0xF5);
+    return  Vec2qb(Vec2q(mask64));
+#endif
+}
+
+// vector operator < : per-element mask, true where a < b (unsigned); same as b > a
+static inline Vec2qb operator < (Vec2uq const & a, Vec2uq const & b) {
+    Vec2qb const b_is_greater = b > a;
+    return b_is_greater;
+}
+
+// vector operator >= : per-element mask, true where a >= b (unsigned)
+static inline Vec2qb operator >= (Vec2uq const & a, Vec2uq const & b) {
+#ifdef __XOP__  // AMD XOP instruction set
+    return Vec2q(_mm_comge_epu64(a,b));
+#else  // SSE2 instruction set
+    // a >= b is the complement of b > a
+    Vec2qb const a_is_less = b > a;
+    return  Vec2qb(Vec2q(~a_is_less));
+#endif
+}
+
+// vector operator <= : per-element mask, true where a <= b (unsigned); same as b >= a
+static inline Vec2qb operator <= (Vec2uq const & a, Vec2uq const & b) {
+    Vec2qb const b_not_less = b >= a;
+    return b_not_less;
+}
+
+// vector operator & : bitwise and of the full 128 bits
+static inline Vec2uq operator & (Vec2uq const & a, Vec2uq const & b) {
+    Vec128b const lhs = Vec128b(a);
+    Vec128b const rhs = Vec128b(b);
+    return Vec2uq(lhs & rhs);
+}
+// vector operator && : implemented as bitwise and
+static inline Vec2uq operator && (Vec2uq const & a, Vec2uq const & b) {
+    Vec2uq const both = a & b;
+    return both;
+}
+
+// vector operator | : bitwise or of the full 128 bits
+static inline Vec2uq operator | (Vec2uq const & a, Vec2uq const & b) {
+    Vec128b const lhs = Vec128b(a);
+    Vec128b const rhs = Vec128b(b);
+    return Vec2uq(lhs | rhs);
+}
+// vector operator || : implemented as bitwise or
+static inline Vec2uq operator || (Vec2uq const & a, Vec2uq const & b) {
+    Vec2uq const either = a | b;
+    return either;
+}
+
+// vector operator ^ : bitwise xor of the full 128 bits
+static inline Vec2uq operator ^ (Vec2uq const & a, Vec2uq const & b) {
+    Vec128b const lhs = Vec128b(a);
+    Vec128b const rhs = Vec128b(b);
+    return Vec2uq(lhs ^ rhs);
+}
+
+// vector operator ~ : bitwise not of the full 128 bits
+static inline Vec2uq operator ~ (Vec2uq const & a) {
+    Vec128b const bits = Vec128b(a);
+    return Vec2uq(~bits);
+}
+
+
+// Functions for this class
+
+// Per-element choice between two operands. Pseudocode:
+// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
+// Every element of s must be 0 (false) or -1 (true); other values are not allowed.
+// (s is signed)
+static inline Vec2uq select (Vec2qb const & s, Vec2uq const & a, Vec2uq const & b) {
+    __m128i const chosen = selectb(s,a,b);   // bitwise blend controlled by s
+    return chosen;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+static inline Vec2uq if_add (Vec2qb const & f, Vec2uq const & a, Vec2uq const & b) {
+    Vec2uq const gate   = Vec2uq(f);      // all ones where f is true, zero elsewhere
+    Vec2uq const addend = gate & b;       // b where f is true, 0 elsewhere
+    return a + addend;
+}
+
+// Horizontal add: sum of all vector elements, computed by the signed implementation.
+// Overflow will wrap around
+static inline uint64_t horizontal_add (Vec2uq const & a) {
+    Vec2q const as_signed = a;
+    return horizontal_add(as_signed);
+}
+
+// function max: element-wise unsigned maximum, a > b ? a : b
+static inline Vec2uq max(Vec2uq const & a, Vec2uq const & b) {
+    Vec2qb const a_is_larger = a > b;
+    return select(a_is_larger, a, b);
+}
+
+// function min: element-wise unsigned minimum, a < b ? a : b
+// (expressed through the a > b comparison with the operands of select swapped)
+static inline Vec2uq min(Vec2uq const & a, Vec2uq const & b) {
+    Vec2qb const a_is_larger = a > b;
+    return select(a_is_larger, b, a);
+}
+
+
+/*****************************************************************************
+*
+*          Vector permute functions
+*
+******************************************************************************
+*
+* These permute functions can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select. A negative index will generate zero. An index of -256 means don't care.
+*
+* Example:
+* Vec4i a(10,11,12,13);         // a is (10,11,12,13)
+* Vec4i b, c;
+* b = permute4i<0,0,2,2>(a);    // b is (10,10,12,12)
+* c = permute4i<3,2,-1,-1>(a);  // c is (13,12, 0, 0)
+*
+* The permute functions for vectors of 8-bit integers are inefficient if 
+* the SSSE3 instruction set or later is not enabled.
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// permute2q: reorder the two 64-bit elements of a.
+// i0 and i1 are compile-time indexes (0 or 1) of the source element for
+// result element 0 and 1. Any other index value gives zero in that position,
+// except that -256 (-0x100) is handled as don't care in the combinations
+// <0,-256> and <-256,1>, where a is returned unchanged.
+template <int i0, int i1>
+static inline Vec2q permute2q(Vec2q const & a) {
+    if (i0 == 0) {
+        if (i1 == 0) {
+            return _mm_unpacklo_epi64(a, a);                   // 0,0
+        }
+        if (i1 == 1 || i1 == -0x100) {
+            return a;                                          // 0,1: unchanged
+        }
+        // 0,-1: keep the low element, clear the high element
+        // (_mm_mov_epi64(a) doesn't work with MS VS 2008)
+        return _mm_and_si128(a, constant4i<-1,-1,0,0>());
+    }
+    if (i0 == 1) {
+        if (i1 == 0) {
+            return _mm_shuffle_epi32(a, 0x4E);                 // 1,0: swap halves
+        }
+        if (i1 == 1) {
+            return _mm_unpackhi_epi64(a, a);                   // 1,1
+        }
+        return _mm_srli_si128(a, 8);                           // 1,-1
+    }
+    // remaining cases: i0 is neither 0 nor 1
+    if (i1 == 0) {
+        return _mm_slli_si128(a, 8);                           // -1,0
+    }
+    if (i1 != 1) {
+        return _mm_setzero_si128();                            // -1,-1
+    }
+    // -1,1
+    if (i0 == -0x100) return a;                                // low element is don't care
+    return _mm_and_si128(a, constant4i<0,0,-1,-1>());          // clear the low element
+}
+
+// permute2uq: unsigned version of permute2q, same index rules
+template <int i0, int i1>
+static inline Vec2uq permute2uq(Vec2uq const & a) {
+    __m128i const raw      = a;
+    Vec2q const   permuted = permute2q <i0, i1> (raw);
+    return Vec2uq (permuted);
+}
+
+/* permute4i: reorder the four 32-bit elements of a Vec4i.
+ * i0..i3 are compile-time indexes: index 0-3 selects that element of a,
+ * a negative index gives zero, and -256 means don't care.
+ * The branches below are decided at compile time from the indexes and pick
+ * a 64-bit permute, a PSHUFB (SSSE3), or a PSHUFD followed by an AND mask.
+ */
+template <int i0, int i1, int i2, int i3>
+static inline Vec4i permute4i(Vec4i const & a) {
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&3) | (i1&3)<<4 | (i2&3)<<8 | (i3&3)<<12; 
+
+    // Mask to zero out negative indexes: 0xF for index >= 0, 0 for negative index
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12;
+
+    // Mask indicating required zeroing of all indexes, with 4 bits for each, 0 for index = -1, 0xF for index >= 0 or -256
+    const int ssz = ((i0 & 0x80) ? 0 : 0xF) | ((i1 & 0x80) ? 0 : 0xF) << 4 | ((i2 & 0x80) ? 0 : 0xF) << 8 | ((i3 & 0x80) ? 0 : 0xF) << 12;
+
+    // Mask indicating 0 for don't care, 0xF for non-negative value of required zeroing
+    const int md = mz | ~ ssz;
+
+    // Test if permutation needed: some non-negative index differs from its own position
+    const bool do_shuffle = ((m1 ^ 0x00003210) & mz) != 0;
+
+    // Test if zeroing is needed: at least one index explicitly requests zero
+    const bool do_zero    = (ssz != 0xFFFF);
+
+    if (mz == 0) {
+        return _mm_setzero_si128();    // special case: all zero or don't care
+    }
+    // Test if we can do with 64-bit permute only
+    if ((m1 & 0x0101 & mz) == 0        // even indexes are even or negative
+    && (~m1 & 0x1010 & mz) == 0        // odd  indexes are odd  or negative
+    && ((m1 ^ ((m1 + 0x0101) << 4)) & 0xF0F0 & mz & (mz << 4)) == 0  // odd index == preceding even index +1 or at least one of them negative
+    && ((mz ^ (mz << 4)) & 0xF0F0 & md & md << 4) == 0) {      // each pair of indexes are both negative or both positive or one of them don't care
+        // 64-bit index of each pair: taken from whichever index of the pair is non-negative
+        const int j0 = i0 >= 0 ? i0 / 2 : (i0 & 0x80) ? i0 : i1 >= 0 ? i1/2 : i1;
+        const int j1 = i2 >= 0 ? i2 / 2 : (i2 & 0x80) ? i2 : i3 >= 0 ? i3/2 : i3;
+        return Vec4i(permute2q<j0, j1> (Vec2q(a)));    // 64 bit permute
+    }
+#if  INSTRSET >= 4  // SSSE3
+    if (do_shuffle && do_zero) {
+        // With SSSE3 we can do both with the PSHUFB instruction
+        // jN is the byte offset of the selected dword
+        const int j0 = (i0 & 3) << 2;
+        const int j1 = (i1 & 3) << 2;
+        const int j2 = (i2 & 3) << 2;
+        const int j3 = (i3 & 3) << 2;
+        // byte-level shuffle mask: four consecutive byte indexes per dword, -1 for a negative index
+        __m128i mask1 = constant4i <
+            i0 < 0 ? -1 : j0 | (j0+1)<<8 | (j0+2)<<16 | (j0+3) << 24,
+            i1 < 0 ? -1 : j1 | (j1+1)<<8 | (j1+2)<<16 | (j1+3) << 24,
+            i2 < 0 ? -1 : j2 | (j2+1)<<8 | (j2+2)<<16 | (j2+3) << 24,
+            i3 < 0 ? -1 : j3 | (j3+1)<<8 | (j3+2)<<16 | (j3+3) << 24 > ();
+        return _mm_shuffle_epi8(a,mask1);
+    }
+#endif
+    __m128i t1;                        // result of the shuffle step
+
+    if (do_shuffle) {  // permute
+        t1 = _mm_shuffle_epi32(a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6);
+    }
+    else {
+        t1 = a;
+    }
+    if (do_zero) {     // set some elements to zero
+        // all ones for each non-negative index, zero for each negative index
+        __m128i mask2 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >();
+        t1 = _mm_and_si128(t1,mask2);
+    }
+    return t1;
+}
+
+// permute4ui: unsigned version of permute4i, same index rules
+template <int i0, int i1, int i2, int i3>
+static inline Vec4ui permute4ui(Vec4ui const & a) {
+    Vec4i const permuted = permute4i <i0,i1,i2,i3> (a);
+    return Vec4ui (permuted);
+}
+/* permute8s: reorder the eight 16-bit elements of a Vec8s.
+ * i0..i7 are compile-time indexes: index 0-7 selects that element of a,
+ * a negative index requests zero, and -256 means don't care.
+ * With SSSE3 (INSTRSET >= 4) the result is a rotate (PALIGNR) or one PSHUFB.
+ * Otherwise the indexes are analysed at compile time to pick the cheapest
+ * combination of 32-bit and 16-bit shuffles, followed by an AND mask if
+ * any element must be zeroed.
+ */
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8s permute8s(Vec8s const & a) {
+    if ((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0) {
+        return _mm_setzero_si128();  // special case: all indexes negative, result is all zero
+    }
+#if  INSTRSET >= 4  // SSSE3
+
+    // special case: rotate
+    if (i0>=0 && i0 < 8 && i1==((i0+1)&7) && i2==((i0+2)&7) && i3==((i0+3)&7) && i4==((i0+4)&7) && i5==((i0+5)&7) && i6==((i0+6)&7) && i7==((i0+7)&7)) {
+        if (i0 == 0) return a;  // do nothing
+        return _mm_alignr_epi8(a, a, (i0 & 7) * 2);
+    }    
+    
+    // General case: Use PSHUFB
+    // jN holds the two byte indexes of the selected word, or 0xFFFF to produce zero
+    const int j0 = i0 < 0 ? 0xFFFF : ( (i0 & 7) * 2 | ((i0 & 7) * 2 + 1) << 8 );
+    const int j1 = i1 < 0 ? 0xFFFF : ( (i1 & 7) * 2 | ((i1 & 7) * 2 + 1) << 8 );
+    const int j2 = i2 < 0 ? 0xFFFF : ( (i2 & 7) * 2 | ((i2 & 7) * 2 + 1) << 8 );
+    const int j3 = i3 < 0 ? 0xFFFF : ( (i3 & 7) * 2 | ((i3 & 7) * 2 + 1) << 8 );
+    const int j4 = i4 < 0 ? 0xFFFF : ( (i4 & 7) * 2 | ((i4 & 7) * 2 + 1) << 8 );
+    const int j5 = i5 < 0 ? 0xFFFF : ( (i5 & 7) * 2 | ((i5 & 7) * 2 + 1) << 8 );
+    const int j6 = i6 < 0 ? 0xFFFF : ( (i6 & 7) * 2 | ((i6 & 7) * 2 + 1) << 8 );
+    const int j7 = i7 < 0 ? 0xFFFF : ( (i7 & 7) * 2 | ((i7 & 7) * 2 + 1) << 8 );
+    __m128i mask = constant4i < j0 | j1 << 16, j2 | j3 << 16, j4 | j5 << 16, j6 | j7 << 16 > ();
+    return _mm_shuffle_epi8(a,mask);
+
+#else   // SSE2 has no simple solution. Find the optimal permute method.
+    // Without proper metaprogramming features, we have to use constant expressions 
+    // and if-statements to make sure these calculations are resolved at compile time.
+    // All this should produce at most 8 instructions in the final code, depending
+    // on the template parameters.
+
+    // Temporary vectors
+    __m128i t1, t2, t3, t4, t5, t6, t7;
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<<8 | (i3&7)<<12 
+        | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28; 
+
+    // Mask to zero out negative indexes
+    const int m2 = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12
+        | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    // Test if we can do without permute
+    const bool case0 = ((m1 ^ 0x76543210) & m2) == 0; // all indexes point to their own place or negative
+
+    // Test if we can do with 32-bit permute only
+    const bool case1 = 
+        (m1 & 0x01010101 & m2) == 0        // even indexes are even or negative
+        && (~m1 & 0x10101010 & m2) == 0    // odd  indexes are odd  or negative
+        && ((m1 ^ ((m1 + 0x01010101) << 4)) & 0xF0F0F0F0 & m2 & (m2 << 4)) == 0; // odd index == preceding even index +1 or at least one of them negative
+
+    // Test if we can do with 16-bit permute only
+    const bool case2 = 
+        (((m1 & 0x44444444) ^ 0x44440000) & m2) == 0;  // indexes 0-3 point to lower 64 bits, 4-7 to higher 64 bits, or negative
+
+    if (case0) {
+        // no permute needed
+        t7 = a;
+    }
+    else if (case1) {
+        // 32 bit permute only
+        const int j0 = i0 >= 0 ? i0/2 : i1 >= 0 ? i1/2 : 0;
+        const int j1 = i2 >= 0 ? i2/2 : i3 >= 0 ? i3/2 : 0;
+        const int j2 = i4 >= 0 ? i4/2 : i5 >= 0 ? i5/2 : 0;
+        const int j3 = i6 >= 0 ? i6/2 : i7 >= 0 ? i7/2 : 0;
+        t7 = _mm_shuffle_epi32(a, (j0&3) | (j1&3)<<2 | (j2&3)<<4 | (j3&3)<<6 );
+    }
+    else if (case2) {
+        // 16 bit permute only
+        // a negative index keeps the element in its own position within the half
+        const int j0 = i0 >= 0 ? i0&3 : 0;
+        const int j1 = i1 >= 0 ? i1&3 : 1;
+        const int j2 = i2 >= 0 ? i2&3 : 2;
+        const int j3 = i3 >= 0 ? i3&3 : 3;
+        const int j4 = i4 >= 0 ? i4&3 : 0;
+        const int j5 = i5 >= 0 ? i5&3 : 1;
+        const int j6 = i6 >= 0 ? i6&3 : 2;
+        const int j7 = i7 >= 0 ? i7&3 : 3;
+        if (j0!=0 || j1!=1 || j2!=2 || j3!=3) {            
+            t1 = _mm_shufflelo_epi16(a, j0 | j1<<2 | j2<<4 | j3<<6);
+        }
+        else t1 = a;
+        if (j4!=0 || j5!=1 || j6!=2 || j7!=3) {            
+            t7 = _mm_shufflehi_epi16(t1, j4 | j5<<2 | j6<<4 | j7<<6);
+        }
+        else t7 = t1;
+    }
+    else {
+        // Need at least two permute steps
+
+        // Index to where each dword of a is needed
+        const int nn = (m1 & 0x66666666) | 0x88888888; // indicate which dwords are needed
+        const int n0 = ((((uint32_t)(nn ^ 0x00000000) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2;
+        const int n1 = ((((uint32_t)(nn ^ 0x22222222) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2;
+        const int n2 = ((((uint32_t)(nn ^ 0x44444444) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2;
+        const int n3 = ((((uint32_t)(nn ^ 0x66666666) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2;
+        // indicate which dwords are needed in low half
+        const int l0 = (n0 & 0xFFFF) != 0;
+        const int l1 = (n1 & 0xFFFF) != 0;
+        const int l2 = (n2 & 0xFFFF) != 0;
+        const int l3 = (n3 & 0xFFFF) != 0;
+        // indicate which dwords are needed in high half
+        const int h0 = (n0 & 0xFFFF0000) != 0;
+        const int h1 = (n1 & 0xFFFF0000) != 0;
+        const int h2 = (n2 & 0xFFFF0000) != 0;
+        const int h3 = (n3 & 0xFFFF0000) != 0;
+
+        // Test if we can do with two permute steps: each half needs at most two source dwords
+        const bool case3 = l0 + l1 + l2 + l3 <= 2  &&  h0 + h1 + h2 + h3 <= 2;
+
+        if (case3) {
+            // one 32-bit permute followed by one 16-bit permute in each half.
+            // Find permute indices for 32-bit permute
+            const int j0 = l0 ? 0 : l1 ? 1 : l2 ? 2 : 3;
+            const int j1 = l3 ? 3 : l2 ? 2 : l1 ? 1 : 0;
+            const int j2 = h0 ? 0 : h1 ? 1 : h2 ? 2 : 3;
+            const int j3 = h3 ? 3 : h2 ? 2 : h1 ? 1 : 0;
+
+            // Find permute indices for low 16-bit permute
+            const int r0 = i0 < 0 ? 0 : (i0>>1 == j0 ? 0 : 2) + (i0 & 1);
+            const int r1 = i1 < 0 ? 1 : (i1>>1 == j0 ? 0 : 2) + (i1 & 1);
+            const int r2 = i2 < 0 ? 2 : (i2>>1 == j1 ? 2 : 0) + (i2 & 1);
+            const int r3 = i3 < 0 ? 3 : (i3>>1 == j1 ? 2 : 0) + (i3 & 1);
+
+            // Find permute indices for high 16-bit permute
+            const int s0 = i4 < 0 ? 0 : (i4>>1 == j2 ? 0 : 2) + (i4 & 1);
+            const int s1 = i5 < 0 ? 1 : (i5>>1 == j2 ? 0 : 2) + (i5 & 1);
+            const int s2 = i6 < 0 ? 2 : (i6>>1 == j3 ? 2 : 0) + (i6 & 1);
+            const int s3 = i7 < 0 ? 3 : (i7>>1 == j3 ? 2 : 0) + (i7 & 1);
+
+            // 32-bit permute
+            t1 = _mm_shuffle_epi32 (a, j0 | j1<<2 | j2<<4 | j3<<6);
+            // 16-bit permutes
+            if (r0!=0 || r1!=1 || r2!=2 || r3!=3) {  // 16 bit permute of low  half
+                t2 = _mm_shufflelo_epi16(t1, r0 | r1<<2 | r2<<4 | r3<<6);
+            }
+            else t2 = t1;
+            if (s0!=0 || s1!=1 || s2!=2 || s3!=3) {  // 16 bit permute of high half                
+                t7 = _mm_shufflehi_epi16(t2, s0 | s1<<2 | s2<<4 | s3<<6);
+            }
+            else t7 = t2;
+        }
+        else {
+            // Worst case. We need two sets of 16-bit permutes
+            t1 = _mm_shuffle_epi32(a, 0x4E);  // swap low and high 64-bits
+
+            // Find permute indices for low 16-bit permute from swapped t1
+            const int r0 = i0 < 4 ? 0 : i0 & 3;
+            const int r1 = i1 < 4 ? 1 : i1 & 3;
+            const int r2 = i2 < 4 ? 2 : i2 & 3;
+            const int r3 = i3 < 4 ? 3 : i3 & 3;
+            // Find permute indices for high 16-bit permute from swapped t1
+            const int s0 = i4 < 0 || i4 >= 4 ? 0 : i4 & 3;
+            const int s1 = i5 < 0 || i5 >= 4 ? 1 : i5 & 3;
+            const int s2 = i6 < 0 || i6 >= 4 ? 2 : i6 & 3;
+            const int s3 = i7 < 0 || i7 >= 4 ? 3 : i7 & 3;
+            // Find permute indices for low 16-bit permute from direct a
+            const int u0 = i0 < 0 || i0 >= 4 ? 0 : i0 & 3;
+            const int u1 = i1 < 0 || i1 >= 4 ? 1 : i1 & 3;
+            const int u2 = i2 < 0 || i2 >= 4 ? 2 : i2 & 3;
+            const int u3 = i3 < 0 || i3 >= 4 ? 3 : i3 & 3;
+            // Find permute indices for high 16-bit permute from direct a
+            const int v0 = i4 < 4 ? 0 : i4 & 3;
+            const int v1 = i5 < 4 ? 1 : i5 & 3;
+            const int v2 = i6 < 4 ? 2 : i6 & 3;
+            const int v3 = i7 < 4 ? 3 : i7 & 3;
+
+            // 16-bit permutes
+            if (r0!=0 || r1!=1 || r2!=2 || r3!=3) {  // 16 bit permute of low  half of swapped t1
+                t2 = _mm_shufflelo_epi16(t1, r0 | r1<<2 | r2<<4 | r3<<6);
+            }
+            else t2 = t1;
+            if (u0!=0 || u1!=1 || u2!=2 || u3!=3) {  // 16 bit permute of low  half of direct a
+                t3 = _mm_shufflelo_epi16(a, u0 | u1<<2 | u2<<4 | u3<<6);
+            }
+            else t3 = a;
+            if (s0!=0 || s1!=1 || s2!=2 || s3!=3) {  // 16 bit permute of high half of swapped data
+                t4 = _mm_shufflehi_epi16(t2, s0 | s1<<2 | s2<<4 | s3<<6);
+            }
+            else t4 = t2;
+            if (v0!=0 || v1!=1 || v2!=2 || v3!=3) {  // 16 bit permute of high half of direct data
+                t5 = _mm_shufflehi_epi16(t3, v0 | v1<<2 | v2<<4 | v3<<6);
+            }
+            else t5 = t3;
+            // merge data from t4 and t5
+            // t6 is all ones in each word that must come from the swapped data t4
+            t6  = constant4i <
+                ((i0 & 4) ? 0xFFFF : 0) | ((i1 & 4) ? 0xFFFF0000 : 0),
+                ((i2 & 4) ? 0xFFFF : 0) | ((i3 & 4) ? 0xFFFF0000 : 0),
+                ((i4 & 4) ? 0 : 0xFFFF) | ((i5 & 4) ? 0 : 0xFFFF0000),
+                ((i6 & 4) ? 0 : 0xFFFF) | ((i7 & 4) ? 0 : 0xFFFF0000) > ();
+            t7 = selectb(t6,t4,t5);  // select between permuted data t4 and t5
+        }
+    }
+    // Set any elements to zero if required
+    // (skipped when no index is negative, or when every negative index is the don't care value -256)
+    if (m2 != -1 && ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80)) {
+        // some elements need to be set to 0
+        __m128i mask = constant4i <
+            (i0 < 0 ? 0xFFFF0000 : -1) & (i1 < 0 ? 0x0000FFFF : -1),
+            (i2 < 0 ? 0xFFFF0000 : -1) & (i3 < 0 ? 0x0000FFFF : -1),
+            (i4 < 0 ? 0xFFFF0000 : -1) & (i5 < 0 ? 0x0000FFFF : -1),
+            (i6 < 0 ? 0xFFFF0000 : -1) & (i7 < 0 ? 0x0000FFFF : -1) > ();
+        return  _mm_and_si128(t7,mask);
+    }
+    else {
+        return  t7;
+    }
+#endif
+}
+
+// permute8us: unsigned version of permute8s, same index rules
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8us permute8us(Vec8us const & a) {
+    Vec8s const permuted = permute8s <i0,i1,i2,i3,i4,i5,i6,i7> (a);
+    return Vec8us (permuted);
+}
+
+
+// permute16c: reorder the sixteen 8-bit elements of a Vec16c.
+// i0..i15 are compile-time indexes: index 0-15 selects that element of a,
+// a negative index requests zero, and -256 means don't care.
+// The function first looks for single-instruction shortcuts (no permute,
+// byte unpack, byte shift, rotate), then uses PSHUFB if SSSE3 is available,
+// then tries a 16-bit permute, and finally falls back to combining up to
+// four 16-bit permutes.
+template <int i0, int i1, int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16c permute16c(Vec16c const & a) {
+
+    __m128i temp;
+
+    // Combine all even indexes into a single bitfield, with 4 bits for each
+    const uint32_t me = (i0&15) | (i2&15)<<4 | (i4&15)<<8 | (i6&15)<<12 
+        | (i8&15)<<16 | (i10&15)<<20 | (i12&15)<<24 | (i14&15)<<28; 
+
+    // Combine all odd indexes into a single bitfield, with 4 bits for each
+    const uint32_t mo = (i1&15) | (i3&15)<<4 | (i5&15)<<8 | (i7&15)<<12 
+        | (i9&15)<<16 | (i11&15)<<20 | (i13&15)<<24 | (i15&15)<<28; 
+
+    // Mask indicating sign of all even indexes, with 4 bits for each, 0 for negative, 0xF for non-negative
+    const uint32_t se = (i0<0?0:0xF) | (i2<0?0:0xF)<<4 | (i4<0?0:0xF)<<8 | (i6<0?0:0xF)<<12
+        | (i8<0?0:0xF)<<16 | (i10<0?0:0xF)<<20 | (i12<0?0:0xF)<<24 | (i14<0?0:0xF)<<28;
+
+    // Mask indicating sign of all odd indexes, with 4 bits for each, 0 for negative, 0xF for non-negative
+    const uint32_t so = (i1<0?0:0xF) | (i3<0?0:0xF)<<4 | (i5<0?0:0xF)<<8 | (i7<0?0:0xF)<<12
+        | (i9<0?0:0xF)<<16 | (i11<0?0:0xF)<<20 | (i13<0?0:0xF)<<24 | (i15<0?0:0xF)<<28;
+
+    // Mask indicating sign of all indexes, with 2 bits for each, 0 for negative (means set to zero or don't care), 0x3 for non-negative
+    // (bits 2k and 2k+1 belong to index k, so index 15 is in the two top bits)
+    const uint32_t ss = (se & 0x33333333) | (so & 0xCCCCCCCC);
+
+    // Mask indicating required zeroing of all indexes, with 2 bits for each, 0 for index = -1, 3 for index >= 0 or -256
+    const uint32_t ssz = ((i0&0x80)?0:3) | ((i1 &0x80)?0:3)<< 2 | ((i2 &0x80)?0:3)<< 4 | ((i3 &0x80)?0:3)<< 6 | 
+                    ((i4 &0x80)?0:3)<< 8 | ((i5 &0x80)?0:3)<<10 | ((i6 &0x80)?0:3)<<12 | ((i7 &0x80)?0:3)<<14 | 
+                    ((i8 &0x80)?0:3)<<16 | ((i9 &0x80)?0:3)<<18 | ((i10&0x80)?0:3)<<20 | ((i11&0x80)?0:3)<<22 | 
+                    ((i12&0x80)?0:3)<<24 | ((i13&0x80)?0:3)<<26 | ((i14&0x80)?0:3)<<28 | ((i15&0x80)?0:3)<<30 ;
+
+    // These indexes are used only to avoid bogus compiler warnings in false branches
+    const int I0  = i0  > 0 ? (i0  & 0xF) : 0;
+    const int I15 = i15 > 0 ? (i15 & 0xF) : 0;
+
+    // special case: all zero
+    if (ss == 0) {
+        return _mm_setzero_si128();  
+    }
+
+    // remember if extra zeroing is needed
+    bool do_and_zero = (ssz != 0xFFFFFFFFu);
+
+    // check for special shortcut cases
+    int shortcut = 0;
+
+    // check if no permutation is needed (every non-negative index is in its own place)
+    if (((me ^ 0xECA86420) & se) == 0 && ((mo ^ 0xFDB97531) & so) == 0) {
+        shortcut = 1;
+    }
+    // check if we can use punpcklbw
+    else if (((me ^ 0x76543210) & se) == 0 && ((mo ^ 0x76543210) & so) == 0) {
+        shortcut = 2;
+    }
+    // check if we can use punpckhbw
+    else if (((me ^ 0xFEDCBA98) & se) == 0 && ((mo ^ 0xFEDCBA98) & so) == 0) {
+        shortcut = 3;
+    }
+
+    #if defined (_MSC_VER) && ! defined(__INTEL_COMPILER)
+    #pragma warning(disable: 4307)  // disable MS warning C4307: '+' : integral constant overflow
+    #endif
+
+    // check if we can use byte shift right
+    // The byte shift fills the top I0 positions with zero, so those positions
+    // must all have negative indexes. This has to be tested separately because
+    // the additions below wrap around: without the test a pattern such as
+    // <1,2,...,15,0> (a rotate) would be mistaken for a shift and lose a[0].
+    else if (i0 > 0 && (ss & ~(0xFFFFFFFFu >> 2*I0)) == 0 &&
+    ((me ^ (uint32_t(I0)*0x11111111u + 0xECA86420u)) & se) == 0 && 
+    ((mo ^ (uint32_t(I0)*0x11111111u + 0xFDB97531u)) & so) == 0) {
+        shortcut = 4;
+        do_and_zero = ((0xFFFFFFFFu >> 2*I0) & ~ ssz) != 0;
+    }
+    // check if we can use byte shift left
+    // The byte shift fills the low 15-I15 positions with zero, so those
+    // positions must all have negative indexes (the subtractions below can
+    // borrow and wrap around in the same way as the additions above).
+    else if (i15 >= 0 && i15 < 15 && (ss & ~(0xFFFFFFFFu << 2*(15-I15))) == 0 &&
+    ((mo ^ (uint32_t(I15*0x11111111u) - (0x02468ACEu & so))) & so) == 0 && 
+    ((me ^ (uint32_t(I15*0x11111111u) - (0x13579BDFu & se))) & se) == 0) {
+        shortcut = 5;
+        do_and_zero = ((0xFFFFFFFFu << 2*(15-I15)) & ~ ssz) != 0;
+    }
+
+#if  INSTRSET >= 4  // SSSE3 (PSHUFB available only under SSSE3)
+
+    // special case: rotate
+    if (i0>0 && i0 < 16    && i1==((i0+1)&15) && i2 ==((i0+2 )&15) && i3 ==((i0+3 )&15) && i4 ==((i0+4 )&15) && i5 ==((i0+5 )&15) && i6 ==((i0+6 )&15) && i7 ==((i0+7 )&15) 
+    && i8==((i0+8)&15) && i9==((i0+9)&15) && i10==((i0+10)&15) && i11==((i0+11)&15) && i12==((i0+12)&15) && i13==((i0+13)&15) && i14==((i0+14)&15) && i15==((i0+15)&15)) {
+        temp = _mm_alignr_epi8(a, a, i0 & 15);
+        shortcut = -1;
+    }
+    if (shortcut == 0 || do_and_zero) {
+        // general case: use PSHUFB
+        // (also used instead of shortcut + AND, because PSHUFB can zero by itself)
+        __m128i mask = constant4i< 
+            (i0  & 0xFF) | (i1  & 0xFF) << 8 | (i2  & 0xFF) << 16 | (i3  & 0xFF) << 24 ,
+            (i4  & 0xFF) | (i5  & 0xFF) << 8 | (i6  & 0xFF) << 16 | (i7  & 0xFF) << 24 ,
+            (i8  & 0xFF) | (i9  & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24 ,
+            (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24 > ();
+        temp = _mm_shuffle_epi8(a,mask);
+        shortcut = -1;
+        do_and_zero = false;
+    }
+
+#endif
+
+    // Check if we can use 16-bit permute. Even numbered indexes must be even and odd numbered
+    // indexes must be equal to the preceding index + 1, except for negative indexes.
+    if (shortcut == 0 && (me & 0x11111111 & se) == 0 && ((mo ^ 0x11111111) & 0x11111111 & so) == 0 && ((me ^ mo) & 0xEEEEEEEE & se & so) == 0) {
+        temp = permute8s <
+            i0  >= 0 ? i0 /2 : i1  >= 0 ? i1 /2 : (i0  | i1 ),
+            i2  >= 0 ? i2 /2 : i3  >= 0 ? i3 /2 : (i2  | i3 ),
+            i4  >= 0 ? i4 /2 : i5  >= 0 ? i5 /2 : (i4  | i5 ),
+            i6  >= 0 ? i6 /2 : i7  >= 0 ? i7 /2 : (i6  | i7 ),
+            i8  >= 0 ? i8 /2 : i9  >= 0 ? i9 /2 : (i8  | i9 ),
+            i10 >= 0 ? i10/2 : i11 >= 0 ? i11/2 : (i10 | i11),
+            i12 >= 0 ? i12/2 : i13 >= 0 ? i13/2 : (i12 | i13),
+            i14 >= 0 ? i14/2 : i15 >= 0 ? i15/2 : (i14 | i15) > (Vec8s(a));
+        shortcut = 100;
+        do_and_zero = (se != so && ssz != 0xFFFFFFFFu);
+    }
+  
+    // Check if we can use 16-bit permute with bytes swapped. Even numbered indexes must be odd and odd 
+    // numbered indexes must be equal to the preceding index - 1, except for negative indexes.
+    // (this case occurs when reversing byte order)
+    if (shortcut == 0 && ((me ^ 0x11111111) & 0x11111111 & se) == 0 && (mo & 0x11111111 & so) == 0 && ((me ^ mo) & 0xEEEEEEEE & se & so) == 0) {
+        Vec16c swapped = Vec16c(rotate_left(Vec8s(a), 8)); // swap odd and even bytes
+        temp = permute8s <
+            i0  >= 0 ? i0 /2 : i1  >= 0 ? i1 /2 : (i0  | i1 ),
+            i2  >= 0 ? i2 /2 : i3  >= 0 ? i3 /2 : (i2  | i3 ),
+            i4  >= 0 ? i4 /2 : i5  >= 0 ? i5 /2 : (i4  | i5 ),
+            i6  >= 0 ? i6 /2 : i7  >= 0 ? i7 /2 : (i6  | i7 ),
+            i8  >= 0 ? i8 /2 : i9  >= 0 ? i9 /2 : (i8  | i9 ),
+            i10 >= 0 ? i10/2 : i11 >= 0 ? i11/2 : (i10 | i11),
+            i12 >= 0 ? i12/2 : i13 >= 0 ? i13/2 : (i12 | i13),
+            i14 >= 0 ? i14/2 : i15 >= 0 ? i15/2 : (i14 | i15) > (Vec8s(swapped));
+        shortcut = 101;
+        do_and_zero = (se != so && ssz != 0xFFFFFFFFu);
+    }
+
+    // all shortcuts end here
+    if (shortcut) {
+        switch (shortcut) {
+        case 1:
+            temp = a;  break;
+        case 2:
+            temp = _mm_unpacklo_epi8(a,a);  break;
+        case 3:
+            temp = _mm_unpackhi_epi8(a,a);  break;
+        case 4:
+            temp = _mm_srli_si128(a, I0);  break;
+        case 5:
+            temp = _mm_slli_si128(a, 15-I15);  break;
+        default:
+            break;  // result is already in temp
+        }
+        if (do_and_zero) {
+            // additional zeroing needed
+            __m128i maskz = constant4i < 
+                (i0  < 0 ? 0 : 0xFF) | (i1  < 0 ? 0 : 0xFF00) | (i2  < 0 ? 0 : 0xFF0000) | (i3  < 0 ? 0 : 0xFF000000) ,
+                (i4  < 0 ? 0 : 0xFF) | (i5  < 0 ? 0 : 0xFF00) | (i6  < 0 ? 0 : 0xFF0000) | (i7  < 0 ? 0 : 0xFF000000) ,
+                (i8  < 0 ? 0 : 0xFF) | (i9  < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) ,
+                (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > ();
+            temp = _mm_and_si128(temp, maskz);
+        }
+        return temp;
+    }
+
+    // complicated cases: use 16-bit permute up to four times
+    const bool e2e = (~me & 0x11111111 & se) != 0;  // even bytes of source to even bytes of destination
+    const bool e2o = (~mo & 0x11111111 & so) != 0;  // even bytes of source to odd  bytes of destination
+    const bool o2e = (me  & 0x11111111 & se) != 0;  // odd  bytes of source to even bytes of destination
+    const bool o2o = (mo  & 0x11111111 & so) != 0;  // odd  bytes of source to odd  bytes of destination
+    
+    Vec16c swapped, te2e, te2o, to2e, to2o, combeven, combodd;
+
+    if (e2o || o2e) swapped = rotate_left(Vec8s(a), 8); // swap odd and even bytes
+
+    // even-to-even bytes
+    if (e2e) te2e = permute8s <(i0&1)?-1:i0/2, (i2&1)?-1:i2/2, (i4&1)?-1:i4/2, (i6&1)?-1:i6/2,
+        (i8&1)?-1:i8/2, (i10&1)?-1:i10/2, (i12&1)?-1:i12/2, (i14&1)?-1:i14/2> (Vec8s(a));                 
+    // odd-to-even bytes
+    if (o2e) to2e = permute8s <(i0&1)?i0/2:-1, (i2&1)?i2/2:-1, (i4&1)?i4/2:-1, (i6&1)?i6/2:-1,
+        (i8&1)?i8/2:-1, (i10&1)?i10/2:-1, (i12&1)?i12/2:-1, (i14&1)?i14/2:-1> (Vec8s(swapped));
+    // even-to-odd bytes
+    if (e2o) te2o = permute8s <(i1&1)?-1:i1/2, (i3&1)?-1:i3/2, (i5&1)?-1:i5/2, (i7&1)?-1:i7/2, 
+        (i9&1)?-1:i9/2, (i11&1)?-1:i11/2, (i13&1)?-1:i13/2, (i15&1)?-1:i15/2> (Vec8s(swapped));
+    // odd-to-odd bytes
+    if (o2o) to2o = permute8s <(i1&1)?i1/2:-1, (i3&1)?i3/2:-1, (i5&1)?i5/2:-1, (i7&1)?i7/2:-1,
+        (i9&1)?i9/2:-1, (i11&1)?i11/2:-1, (i13&1)?i13/2:-1, (i15&1)?i15/2:-1> (Vec8s(a));
+
+    // merge the two candidate sources for the even destination bytes
+    if (e2e && o2e) combeven = te2e | to2e;
+    else if (e2e)   combeven = te2e;
+    else if (o2e)   combeven = to2e;
+    else            combeven = _mm_setzero_si128();
+
+    // merge the two candidate sources for the odd destination bytes
+    if (e2o && o2o) combodd  = te2o | to2o;
+    else if (e2o)   combodd  = te2o;
+    else if (o2o)   combodd  = to2o;
+    else            combodd  = _mm_setzero_si128();
+
+    __m128i maske = constant4i <     // mask used even bytes
+        (i0  < 0 ? 0 : 0xFF) | (i2  < 0 ? 0 : 0xFF0000),
+        (i4  < 0 ? 0 : 0xFF) | (i6  < 0 ? 0 : 0xFF0000),
+        (i8  < 0 ? 0 : 0xFF) | (i10 < 0 ? 0 : 0xFF0000),
+        (i12 < 0 ? 0 : 0xFF) | (i14 < 0 ? 0 : 0xFF0000) > ();
+    __m128i masko = constant4i <     // mask used odd bytes
+        (i1  < 0 ? 0 : 0xFF00) | (i3  < 0 ? 0 : 0xFF000000),
+        (i5  < 0 ? 0 : 0xFF00) | (i7  < 0 ? 0 : 0xFF000000),
+        (i9  < 0 ? 0 : 0xFF00) | (i11 < 0 ? 0 : 0xFF000000),
+        (i13 < 0 ? 0 : 0xFF00) | (i15 < 0 ? 0 : 0xFF000000) > ();
+
+    return  _mm_or_si128(            // combine even and odd bytes
+        _mm_and_si128(combeven, maske),
+        _mm_and_si128(combodd, masko));
+}
+
+// Unsigned variant of the 16-byte permute: forwards the same sixteen
+// compile-time indexes to permute16c and wraps the result as Vec16uc.
+template <int i0, int i1, int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16uc permute16uc(Vec16uc const & a) {
+    return Vec16uc(
+        permute16c <i0, i1, i2,  i3,  i4,  i5,  i6,  i7,
+                    i8, i9, i10, i11, i12, i13, i14, i15> (a));
+}
+
+
+/*****************************************************************************
+*
+*          Vector blend functions
+*
+******************************************************************************
+*
+* These blend functions can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where higher indexes indicate an element from the second source
+* vector. For example, if each vector has 4 elements, then indexes 0 - 3
+* will select an element from the first vector and indexes 4 - 7 will select 
+* an element from the second vector. A negative index will generate zero.
+*
+* The blend functions for vectors of 8-bit integers are inefficient if 
+* the SSSE3 instruction set or later is not enabled.
+*
+* Example:
+* Vec4i a(100,101,102,103);         // a is (100, 101, 102, 103)
+* Vec4i b(200,201,202,203);         // b is (200, 201, 202, 203)
+* Vec4i c;
+* c = blend4i<1,4,-1,7> (a,b);      // c is (101, 200,   0, 203)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// Blend of two vectors of sixteen 8-bit integers with compile-time indexes.
+// Indexes 0-15 select a byte from a, indexes 16-31 select a byte from b, and a
+// negative index produces zero in that position. A series of cheap special
+// cases (single-source permute, byte unpack, byte align, byte blend) is tried
+// before falling back to a general two-source shuffle. All conditions depend
+// only on the template parameters.
+template <int i0, int i1, int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16c blend16c(Vec16c const & a, Vec16c const & b) {
+
+    // Combine bit 0-3 of all even indexes into a single bitfield, with 4 bits for each
+    const int me = (i0&15) | (i2&15)<<4 | (i4&15)<<8 | (i6&15)<<12 
+        | (i8&15)<<16 | (i10&15)<<20 | (i12&15)<<24 | (i14&15)<<28; 
+
+    // Combine bit 0-3 of all odd indexes into a single bitfield, with 4 bits for each
+    const int mo = (i1&15) | (i3&15)<<4 | (i5&15)<<8 | (i7&15)<<12 
+        | (i9&15)<<16 | (i11&15)<<20 | (i13&15)<<24 | (i15&15)<<28; 
+
+    // Mask indicating sign of all even indexes, with 4 bits for each, 0 for negative, 0xF for non-negative
+    const int se = (i0<0?0:0xF) | (i2<0?0:0xF)<<4 | (i4<0?0:0xF)<<8 | (i6<0?0:0xF)<<12
+        | (i8<0?0:0xF)<<16 | (i10<0?0:0xF)<<20 | (i12<0?0:0xF)<<24 | (i14<0?0:0xF)<<28;
+
+    // Mask indicating sign of all odd indexes, with 4 bits for each, 0 for negative, 0xF for non-negative
+    const int so = (i1<0?0:0xF) | (i3<0?0:0xF)<<4 | (i5<0?0:0xF)<<8 | (i7<0?0:0xF)<<12
+        | (i9<0?0:0xF)<<16 | (i11<0?0:0xF)<<20 | (i13<0?0:0xF)<<24 | (i15<0?0:0xF)<<28;
+
+    // Combine bit 4 of all even indexes into a single bitfield, with 4 bits for each
+    const int ne = (i0&16)>>4 | (i2&16) | (i4&16)<<4 | (i6&16)<<8 
+        | (i8&16)<<12 | (i10&16)<<16 | (i12&16)<<20 | (i14&16)<<24; 
+
+    // Combine bit 4 of all odd indexes into a single bitfield, with 4 bits for each
+    const int no = (i1&16)>>4 | (i3&16) | (i5&16)<<4 | (i7&16)<<8
+        | (i9&16)<<12 | (i11&16)<<16 | (i13&16)<<20 | (i15&16)<<24; 
+
+    // Check if zeroing needed (bit 7 of the OR is set when any index is negative)
+    const bool do_zero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0; // needs zeroing
+
+    // no elements from b
+    if (((ne & se) | (no & so)) == 0) {
+        return permute16c <i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15> (a);
+    }
+
+    // no elements from a
+    if ((((ne^0x11111111) & se) | ((no^0x11111111) & so)) == 0) {
+        return permute16c <i0^16, i1^16, i2^16, i3^16, i4^16, i5^16, i6^16, i7^16, i8^16, i9^16, i10^16, i11^16, i12^16, i13^16, i14^16, i15^16> (b);
+    }
+    __m128i t;
+
+    // Which source feeds the even and the odd result positions (negative indexes
+    // are ignored). The unpack instructions below are only applicable when one of
+    // these holds; any other mix of sources has to fall through to the later
+    // cases, otherwise t would be returned without ever being assigned.
+    const bool a_even_b_odd = (ne & se) == 0 && ((no ^ 0x11111111) & so) == 0;
+    const bool b_even_a_odd = (no & so) == 0 && ((ne ^ 0x11111111) & se) == 0;
+
+    // check if we can use punpcklbw
+    if ((a_even_b_odd || b_even_a_odd) && ((me ^ 0x76543210) & se) == 0 && ((mo ^ 0x76543210) & so) == 0) {
+        t = b_even_a_odd ? _mm_unpacklo_epi8(b,a) : _mm_unpacklo_epi8(a,b);
+        if (do_zero) {
+            // additional zeroing needed
+            __m128i maskz = constant4i < 
+                (i0  < 0 ? 0 : 0xFF) | (i1  < 0 ? 0 : 0xFF00) | (i2  < 0 ? 0 : 0xFF0000) | (i3  < 0 ? 0 : 0xFF000000) ,
+                (i4  < 0 ? 0 : 0xFF) | (i5  < 0 ? 0 : 0xFF00) | (i6  < 0 ? 0 : 0xFF0000) | (i7  < 0 ? 0 : 0xFF000000) ,
+                (i8  < 0 ? 0 : 0xFF) | (i9  < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) ,
+                (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > ();
+            t = _mm_and_si128(t, maskz);
+        }
+        return t;
+    }
+
+    // check if we can use punpckhbw
+    if ((a_even_b_odd || b_even_a_odd) && ((me ^ 0xFEDCBA98) & se) == 0 && ((mo ^ 0xFEDCBA98) & so) == 0) {
+        t = b_even_a_odd ? _mm_unpackhi_epi8(b,a) : _mm_unpackhi_epi8(a,b);
+        if (do_zero) {
+            // additional zeroing needed
+            __m128i maskz = constant4i < 
+                (i0  < 0 ? 0 : 0xFF) | (i1  < 0 ? 0 : 0xFF00) | (i2  < 0 ? 0 : 0xFF0000) | (i3  < 0 ? 0 : 0xFF000000) ,
+                (i4  < 0 ? 0 : 0xFF) | (i5  < 0 ? 0 : 0xFF00) | (i6  < 0 ? 0 : 0xFF0000) | (i7  < 0 ? 0 : 0xFF000000) ,
+                (i8  < 0 ? 0 : 0xFF) | (i9  < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) ,
+                (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > ();
+            t = _mm_and_si128(t, maskz);
+        }
+        return t;
+    }
+    
+#if  INSTRSET >= 4  // SSSE3
+    // special case: shift left
+    if (i0 > 0 && i0 < 16 && i1==i0+1 && i2==i0+2 && i3==i0+3 && i4==i0+4 && i5==i0+5 && i6==i0+6 && i7==i0+7 && 
+        i8==i0+8 && i9==i0+9 && i10==i0+10 && i11==i0+11 && i12==i0+12 && i13==i0+13 && i14==i0+14 && i15==i0+15) {
+        return _mm_alignr_epi8(b, a, (i0 & 15));
+    }
+
+    // special case: shift right
+    if (i0 > 15 && i0 < 32 && i1==((i0+1)&31) && i2 ==((i0+2 )&31) && i3 ==((i0+3 )&31) && i4 ==((i0+4 )&31) && i5 ==((i0+5 )&31) && i6 ==((i0+6 )&31) && i7 ==((i0+7 )&31) && 
+        i8==((i0+8 )&31)   && i9==((i0+9)&31) && i10==((i0+10)&31) && i11==((i0+11)&31) && i12==((i0+12)&31) && i13==((i0+13)&31) && i14==((i0+14)&31) && i15==((i0+15)&31)) {
+        return _mm_alignr_epi8(a, b, (i0 & 15));
+    }
+#endif
+
+#if INSTRSET >= 5   // SSE4.1 supported
+    // special case: blend without permute
+    if (((me ^ 0xECA86420) & se) == 0 && ((mo ^ 0xFDB97531) & so) == 0) {
+        // byte mask: 0xFF where the element is taken from b (bit 4 of the index set)
+        __m128i maskbl = constant4i<
+            ((i0 & 16) ? 0xFF : 0) | ((i1 & 16) ? 0xFF00 : 0) | ((i2 & 16) ? 0xFF0000 : 0) | ((i3 & 16) ? 0xFF000000 : 0) ,
+            ((i4 & 16) ? 0xFF : 0) | ((i5 & 16) ? 0xFF00 : 0) | ((i6 & 16) ? 0xFF0000 : 0) | ((i7 & 16) ? 0xFF000000 : 0) ,
+            ((i8 & 16) ? 0xFF : 0) | ((i9 & 16) ? 0xFF00 : 0) | ((i10& 16) ? 0xFF0000 : 0) | ((i11& 16) ? 0xFF000000 : 0) ,
+            ((i12& 16) ? 0xFF : 0) | ((i13& 16) ? 0xFF00 : 0) | ((i14& 16) ? 0xFF0000 : 0) | ((i15& 16) ? 0xFF000000 : 0) > ();
+        t = _mm_blendv_epi8(a, b, maskbl);
+        if (do_zero) {
+            // additional zeroing needed
+            __m128i maskz = constant4i < 
+                (i0  < 0 ? 0 : 0xFF) | (i1  < 0 ? 0 : 0xFF00) | (i2  < 0 ? 0 : 0xFF0000) | (i3  < 0 ? 0 : 0xFF000000) ,
+                (i4  < 0 ? 0 : 0xFF) | (i5  < 0 ? 0 : 0xFF00) | (i6  < 0 ? 0 : 0xFF0000) | (i7  < 0 ? 0 : 0xFF000000) ,
+                (i8  < 0 ? 0 : 0xFF) | (i9  < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) ,
+                (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > ();
+            t = _mm_and_si128(t, maskz);
+        }
+        return t;
+    }
+#endif // SSE4.1
+
+#if defined ( __XOP__ )    // Use AMD XOP instruction VPPERM
+    __m128i mask = constant4i<
+        (i0 <0 ? 0x80 : (i0 &31)) | (i1 <0 ? 0x80 : (i1 &31)) << 8 | (i2 <0 ? 0x80 : (i2 &31)) << 16 | (i3 <0 ? 0x80 : (i3 &31)) << 24,
+        (i4 <0 ? 0x80 : (i4 &31)) | (i5 <0 ? 0x80 : (i5 &31)) << 8 | (i6 <0 ? 0x80 : (i6 &31)) << 16 | (i7 <0 ? 0x80 : (i7 &31)) << 24,
+        (i8 <0 ? 0x80 : (i8 &31)) | (i9 <0 ? 0x80 : (i9 &31)) << 8 | (i10<0 ? 0x80 : (i10&31)) << 16 | (i11<0 ? 0x80 : (i11&31)) << 24,
+        (i12<0 ? 0x80 : (i12&31)) | (i13<0 ? 0x80 : (i13&31)) << 8 | (i14<0 ? 0x80 : (i14&31)) << 16 | (i15<0 ? 0x80 : (i15&31)) << 24 > ();
+    return _mm_perm_epi8(a, b, mask);
+
+#elif  INSTRSET >= 4  // SSSE3
+   
+    // general case. Use PSHUFB
+    // maska: 0xFF for positions that are negative or taken from b, so only bytes of a survive
+    __m128i maska = constant4i<
+        ((i0 & 0x90) ? 0xFF : (i0 &15)) | ((i1 & 0x90) ? 0xFF : (i1 &15)) << 8 | ((i2 & 0x90) ? 0xFF : (i2 &15)) << 16 | ((i3 & 0x90) ? 0xFF : (i3 &15)) << 24,
+        ((i4 & 0x90) ? 0xFF : (i4 &15)) | ((i5 & 0x90) ? 0xFF : (i5 &15)) << 8 | ((i6 & 0x90) ? 0xFF : (i6 &15)) << 16 | ((i7 & 0x90) ? 0xFF : (i7 &15)) << 24,
+        ((i8 & 0x90) ? 0xFF : (i8 &15)) | ((i9 & 0x90) ? 0xFF : (i9 &15)) << 8 | ((i10& 0x90) ? 0xFF : (i10&15)) << 16 | ((i11& 0x90) ? 0xFF : (i11&15)) << 24,
+        ((i12& 0x90) ? 0xFF : (i12&15)) | ((i13& 0x90) ? 0xFF : (i13&15)) << 8 | ((i14& 0x90) ? 0xFF : (i14&15)) << 16 | ((i15& 0x90) ? 0xFF : (i15&15)) << 24 > ();
+    // maskb: same, with bit 4 inverted, so only bytes of b survive
+    __m128i maskb = constant4i<
+        (((i0^0x10) & 0x90) ? 0xFF : (i0 &15)) | (((i1^0x10) & 0x90) ? 0xFF : (i1 &15)) << 8 | (((i2^0x10) & 0x90) ? 0xFF : (i2 &15)) << 16 | (((i3^0x10) & 0x90) ? 0xFF : (i3 &15)) << 24,
+        (((i4^0x10) & 0x90) ? 0xFF : (i4 &15)) | (((i5^0x10) & 0x90) ? 0xFF : (i5 &15)) << 8 | (((i6^0x10) & 0x90) ? 0xFF : (i6 &15)) << 16 | (((i7^0x10) & 0x90) ? 0xFF : (i7 &15)) << 24,
+        (((i8^0x10) & 0x90) ? 0xFF : (i8 &15)) | (((i9^0x10) & 0x90) ? 0xFF : (i9 &15)) << 8 | (((i10^0x10)& 0x90) ? 0xFF : (i10&15)) << 16 | (((i11^0x10)& 0x90) ? 0xFF : (i11&15)) << 24,
+        (((i12^0x10)& 0x90) ? 0xFF : (i12&15)) | (((i13^0x10)& 0x90) ? 0xFF : (i13&15)) << 8 | (((i14^0x10)& 0x90) ? 0xFF : (i14&15)) << 16 | (((i15^0x10)& 0x90) ? 0xFF : (i15&15)) << 24 > ();
+    __m128i a1 = _mm_shuffle_epi8(a,maska);
+    __m128i b1 = _mm_shuffle_epi8(b,maskb);
+    return       _mm_or_si128(a1,b1);
+
+#else                 // SSE2
+    // combine two permutes
+    __m128i a1 = permute16c <
+        (uint32_t)i0  < 16 ? i0  : -1,
+        (uint32_t)i1  < 16 ? i1  : -1,
+        (uint32_t)i2  < 16 ? i2  : -1,
+        (uint32_t)i3  < 16 ? i3  : -1,
+        (uint32_t)i4  < 16 ? i4  : -1,
+        (uint32_t)i5  < 16 ? i5  : -1,
+        (uint32_t)i6  < 16 ? i6  : -1,
+        (uint32_t)i7  < 16 ? i7  : -1,
+        (uint32_t)i8  < 16 ? i8  : -1,
+        (uint32_t)i9  < 16 ? i9  : -1,
+        (uint32_t)i10 < 16 ? i10 : -1,
+        (uint32_t)i11 < 16 ? i11 : -1,
+        (uint32_t)i12 < 16 ? i12 : -1,
+        (uint32_t)i13 < 16 ? i13 : -1,
+        (uint32_t)i14 < 16 ? i14 : -1,
+        (uint32_t)i15 < 16 ? i15 : -1 > (a);
+    __m128i b1 = permute16c <
+        (uint32_t)(i0 ^16) < 16 ? (i0 ^16) : -1,
+        (uint32_t)(i1 ^16) < 16 ? (i1 ^16) : -1,
+        (uint32_t)(i2 ^16) < 16 ? (i2 ^16) : -1,
+        (uint32_t)(i3 ^16) < 16 ? (i3 ^16) : -1,
+        (uint32_t)(i4 ^16) < 16 ? (i4 ^16) : -1,
+        (uint32_t)(i5 ^16) < 16 ? (i5 ^16) : -1,
+        (uint32_t)(i6 ^16) < 16 ? (i6 ^16) : -1,
+        (uint32_t)(i7 ^16) < 16 ? (i7 ^16) : -1,        
+        (uint32_t)(i8 ^16) < 16 ? (i8 ^16) : -1,
+        (uint32_t)(i9 ^16) < 16 ? (i9 ^16) : -1,
+        (uint32_t)(i10^16) < 16 ? (i10^16) : -1,
+        (uint32_t)(i11^16) < 16 ? (i11^16) : -1,
+        (uint32_t)(i12^16) < 16 ? (i12^16) : -1,
+        (uint32_t)(i13^16) < 16 ? (i13^16) : -1,
+        (uint32_t)(i14^16) < 16 ? (i14^16) : -1,
+        (uint32_t)(i15^16) < 16 ? (i15^16) : -1 > (b);
+    return   _mm_or_si128(a1,b1);
+
+#endif
+}
+
+// Unsigned variant of the 16-byte blend: forwards the same sixteen
+// compile-time indexes to blend16c and wraps the result as Vec16uc.
+template <int i0, int i1, int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16uc blend16uc(Vec16uc const & a, Vec16uc const & b) {
+    return Vec16uc(
+        blend16c <i0, i1, i2,  i3,  i4,  i5,  i6,  i7,
+                  i8, i9, i10, i11, i12, i13, i14, i15> (a, b));
+}
+
+
+// Blend of two vectors of eight 16-bit integers with compile-time indexes.
+// Indexes 0-7 select an element from a, indexes 8-15 select an element from b,
+// and a negative index produces zero. Special cases that map to a single
+// instruction are tried first; if one of them matches but some elements still
+// have to be zeroed, the intermediate result is kept in temp and masked later.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8s blend8s(Vec8s const & a, Vec8s const & b) {
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    // (bit 3 of each nibble tells whether the element comes from b)
+    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 
+        | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; 
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12
+        | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    // Some elements must be set to zero
+    const bool do_zero = (mz != -1) && ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80) != 0;
+
+    // temp contains temporary result, some zeroing needs to be done
+    bool zeroing_pending = false;
+
+    // partially finished result
+    __m128i temp;
+
+    if ((m1 & 0x88888888 & mz) == 0) {
+        // no elements from b
+        return permute8s <i0, i1, i2, i3, i4, i5, i6, i7> (a);
+    }
+
+    if (((m1^0x88888888) & 0x88888888 & mz) == 0) {
+        // no elements from a
+        return permute8s <i0&~8, i1&~8, i2&~8, i3&~8, i4&~8, i5&~8, i6&~8, i7&~8> (b);
+    }
+
+    // special case: PUNPCKLWD 
+    // 0xB3A29180 encodes the index list 0,8,1,9,2,10,3,11 (one nibble per index, i0 lowest)
+    if (((m1 ^ 0xB3A29180) & mz) == 0) {
+        temp = _mm_unpacklo_epi16(a, b);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+    // 0x3B2A1908 encodes 8,0,9,1,10,2,11,3
+    if (((m1 ^ 0x3B2A1908) & mz) == 0) {
+        temp = _mm_unpacklo_epi16(b, a);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+    // special case: PUNPCKHWD 
+    // 0xF7E6D5C4 encodes 4,12,5,13,6,14,7,15
+    if (((m1 ^ 0xF7E6D5C4) & mz) == 0) {
+        temp = _mm_unpackhi_epi16(a, b);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+    // 0x7F6E5D4C encodes 12,4,13,5,14,6,15,7
+    if (((m1 ^ 0x7F6E5D4C) & mz) == 0) {
+        temp = _mm_unpackhi_epi16(b, a);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+
+#if  INSTRSET >= 4  // SSSE3
+    // special case: shift left
+    // (consecutive indexes starting at i0 in a; byte count is twice the element offset)
+    if (i0 > 0 && i0 < 8 && ((m1 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) & mz) == 0) {
+        temp = _mm_alignr_epi8(b, a, (i0 & 7) * 2);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+
+    // special case: shift right
+    // (same pattern with the roles of a and b exchanged)
+    if (i0 > 8 && i0 < 16 && ((m1 ^ 0x88888888 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) & mz) == 0) {
+        temp = _mm_alignr_epi8(a, b, (i0 & 7) * 2);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+#endif // SSSE3
+
+#if INSTRSET >= 5   // SSE4.1 supported
+    // special case: blending without permuting
+    // (every used element stays in its own position; bit 3 of each index picks the source)
+    if ((((m1 & ~0x88888888) ^ 0x76543210) & mz) == 0) {
+        temp = _mm_blend_epi16(a, b, (i0>>3&1) | (i1>>3&1)<<1 | (i2>>3&1)<<2 | (i3>>3&1)<<3 
+            | (i4>>3&1)<<4 | (i5>>3&1)<<5 | (i6>>3&1)<<6 | (i7>>3&1)<<7);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+#endif // SSE4.1
+
+    if (zeroing_pending) {
+        // additional zeroing of temp needed
+        // (0xFFFF for each 16-bit element that is kept, 0 for each negative index)
+        __m128i maskz = constant4i < 
+            (i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000) ,
+            (i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000) ,
+            (i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000) ,
+            (i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000) > ();
+        return _mm_and_si128(temp, maskz);
+    }        
+
+    // general case
+#ifdef __XOP__     // Use AMD XOP instruction PPERM
+    // each 16-bit element is selected as the byte pair (2*i, 2*i+1); 0x80 bytes mark negative indexes
+    __m128i mask = constant4i <
+        (i0 < 0 ? 0x8080 : (i0*2 & 31) | ((i0*2 & 31)+1)<<8) | (i1 < 0 ? 0x80800000 : ((i1*2 & 31)<<16) | ((i1*2 & 31)+1)<<24),
+        (i2 < 0 ? 0x8080 : (i2*2 & 31) | ((i2*2 & 31)+1)<<8) | (i3 < 0 ? 0x80800000 : ((i3*2 & 31)<<16) | ((i3*2 & 31)+1)<<24),
+        (i4 < 0 ? 0x8080 : (i4*2 & 31) | ((i4*2 & 31)+1)<<8) | (i5 < 0 ? 0x80800000 : ((i5*2 & 31)<<16) | ((i5*2 & 31)+1)<<24),
+        (i6 < 0 ? 0x8080 : (i6*2 & 31) | ((i6*2 & 31)+1)<<8) | (i7 < 0 ? 0x80800000 : ((i7*2 & 31)<<16) | ((i7*2 & 31)+1)<<24) > ();
+    return _mm_perm_epi8(a, b, mask);
+#else  
+    // combine two permutes
+    // (each permute keeps the elements of one source and passes -1 for all others; the results are OR'ed)
+    __m128i a1 = permute8s <
+        (uint32_t)i0 < 8 ? i0 : -1,
+        (uint32_t)i1 < 8 ? i1 : -1,
+        (uint32_t)i2 < 8 ? i2 : -1,
+        (uint32_t)i3 < 8 ? i3 : -1,
+        (uint32_t)i4 < 8 ? i4 : -1,
+        (uint32_t)i5 < 8 ? i5 : -1,
+        (uint32_t)i6 < 8 ? i6 : -1,
+        (uint32_t)i7 < 8 ? i7 : -1 > (a);
+    __m128i b1 = permute8s <
+        (uint32_t)(i0^8) < 8 ? (i0^8) : -1,
+        (uint32_t)(i1^8) < 8 ? (i1^8) : -1,
+        (uint32_t)(i2^8) < 8 ? (i2^8) : -1,
+        (uint32_t)(i3^8) < 8 ? (i3^8) : -1,
+        (uint32_t)(i4^8) < 8 ? (i4^8) : -1,
+        (uint32_t)(i5^8) < 8 ? (i5^8) : -1,
+        (uint32_t)(i6^8) < 8 ? (i6^8) : -1,
+        (uint32_t)(i7^8) < 8 ? (i7^8) : -1 > (b);
+    return   _mm_or_si128(a1,b1);
+
+#endif
+}
+
+// Unsigned variant of the 8 x 16-bit blend: forwards the same compile-time
+// indexes to blend8s and wraps the result as Vec8us.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8us blend8us(Vec8us const & a, Vec8us const & b) {
+    return Vec8us(
+        blend8s <i0, i1, i2, i3, i4, i5, i6, i7> (a, b));
+}
+
+// Blend of two vectors of four 32-bit integers with compile-time indexes.
+// Indexes 0-3 select an element from a, indexes 4-7 select an element from b,
+// and a negative index produces zero. Special cases that map to a single
+// instruction are tried first; if one of them matches but some elements still
+// have to be zeroed, the intermediate result is kept in temp and masked later.
+template <int i0, int i1, int i2, int i3>
+static inline Vec4i blend4i(Vec4i const & a, Vec4i const & b) {
+
+    // Combine all the indexes into a single bitfield, with 8 bits for each
+    // (only the low 3 bits of each index are kept; bit 2 tells whether the element comes from b)
+    const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24; 
+
+    // Mask to zero out negative indexes
+    const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24;
+
+    // Some elements must be set to zero
+    const bool do_zero = (mz != -1) && ((i0 | i1 | i2 | i3) & 0x80) != 0;
+
+    // temp contains temporary result, some zeroing needs to be done
+    bool zeroing_pending = false;
+
+    // partially finished result
+    __m128i temp;
+#if defined (_MSC_VER) || defined (__clang__)
+    temp = a;  // avoid spurious warning message for temp unused
+#endif
+
+    // special case: no elements from b
+    if ((m1 & 0x04040404 & mz) == 0) {
+        return permute4i<i0,i1,i2,i3>(a);
+    }
+
+    // special case: no elements from a
+    if (((m1^0x04040404) & 0x04040404 & mz) == 0) {
+        return permute4i<i0&~4, i1&~4, i2&~4, i3&~4>(b);
+    }
+
+    // special case: PUNPCKLDQ
+    // 0x05010400 encodes the index list 0,4,1,5 (one byte per index, i0 lowest)
+    if (((m1 ^ 0x05010400) & mz) == 0) {
+        temp = _mm_unpacklo_epi32(a, b);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+    // 0x01050004 encodes 4,0,5,1
+    if (((m1 ^ 0x01050004) & mz) == 0) {
+        temp = _mm_unpacklo_epi32(b, a);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+
+    // special case: PUNPCKHDQ 
+    // 0x07030602 encodes 2,6,3,7
+    if (((m1 ^ 0x07030602) & mz) == 0) {
+        temp = _mm_unpackhi_epi32(a, b);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+    // 0x03070206 encodes 6,2,7,3
+    if (((m1 ^ 0x03070206) & mz) == 0) {
+        temp = _mm_unpackhi_epi32(b, a);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+
+#if  INSTRSET >= 4  // SSSE3
+    // special case: shift left
+    // (consecutive indexes starting at i0 in a; byte count is four times the element offset)
+    if (i0 > 0 && i0 < 4 && ((m1 ^ ((i0 & 3) * 0x01010101u + 0x03020100u)) & mz) == 0) {
+        temp = _mm_alignr_epi8(b, a, (i0 & 3) * 4);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+
+    // special case: shift right
+    // (same pattern with the roles of a and b exchanged)
+    if (i0 > 4 && i0 < 8 && ((m1 ^ 0x04040404 ^ ((i0 & 3) * 0x01010101u + 0x03020100u)) & mz) == 0) {
+        temp = _mm_alignr_epi8(a, b, (i0 & 3) * 4);
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+#endif // SSSE3
+
+#if INSTRSET >= 5   // SSE4.1 supported
+    if ((((m1 & ~0x04040404) ^ 0x03020100) & mz) == 0) {
+        // blending without permuting
+        // (the 16-bit blend mask gets two bits per 32-bit element, hence the factor 3)
+        temp = _mm_blend_epi16(a, b, ((i0>>2)&1)*3 | ((((i1>>2)&1)*3)<<2) | ((((i2>>2)&1)*3)<<4) | ((((i3>>2)&1)*3)<<6));
+        if (do_zero) zeroing_pending = true; else return temp;
+    }
+#endif // SSE4.1
+
+    if (zeroing_pending) {
+        // additional zeroing of temp needed
+        __m128i maskz = constant4i < (i0 < 0 ? 0 : -1), (i1 < 0 ? 0 : -1), (i2 < 0 ? 0 : -1), (i3 < 0 ? 0 : -1) > ();
+        return _mm_and_si128(temp, maskz);
+    }        
+
+    // general case
+#ifdef __XOP__     // Use AMD XOP instruction PPERM
+    // each 32-bit element is selected as four consecutive bytes starting at 4*i; 0x80 bytes mark negative indexes
+    __m128i mask = constant4i <
+        i0 < 0 ? 0x80808080 : (i0*4 & 31) + (((i0*4 & 31) + 1) << 8) + (((i0*4 & 31) + 2) << 16) + (((i0*4 & 31) + 3) << 24),
+        i1 < 0 ? 0x80808080 : (i1*4 & 31) + (((i1*4 & 31) + 1) << 8) + (((i1*4 & 31) + 2) << 16) + (((i1*4 & 31) + 3) << 24),
+        i2 < 0 ? 0x80808080 : (i2*4 & 31) + (((i2*4 & 31) + 1) << 8) + (((i2*4 & 31) + 2) << 16) + (((i2*4 & 31) + 3) << 24),
+        i3 < 0 ? 0x80808080 : (i3*4 & 31) + (((i3*4 & 31) + 1) << 8) + (((i3*4 & 31) + 2) << 16) + (((i3*4 & 31) + 3) << 24) > ();
+    return _mm_perm_epi8(a, b, mask);
+
+#else  // combine two permutes
+    // (each permute keeps the elements of one source and passes -1 for all others; the results are OR'ed)
+    __m128i a1 = permute4i <
+        (uint32_t)i0 < 4 ? i0 : -1,
+        (uint32_t)i1 < 4 ? i1 : -1,
+        (uint32_t)i2 < 4 ? i2 : -1,
+        (uint32_t)i3 < 4 ? i3 : -1  > (a);
+    __m128i b1 = permute4i <
+        (uint32_t)(i0^4) < 4 ? (i0^4) : -1,
+        (uint32_t)(i1^4) < 4 ? (i1^4) : -1,
+        (uint32_t)(i2^4) < 4 ? (i2^4) : -1,
+        (uint32_t)(i3^4) < 4 ? (i3^4) : -1  > (b);
+    return  _mm_or_si128(a1,b1);
+#endif
+}
+
+// Unsigned variant of the 4 x 32-bit blend: forwards the same compile-time
+// indexes to blend4i and wraps the result as Vec4ui.
+template <int i0, int i1, int i2, int i3>
+static inline Vec4ui blend4ui(Vec4ui const & a, Vec4ui const & b) {
+    return Vec4ui(
+        blend4i <i0, i1, i2, i3> (a, b));
+}
+
+// Blend of two vectors of two 64-bit integers with compile-time indexes.
+// Indexes 0-1 select an element from a, indexes 2-3 select an element from b,
+// and a negative index produces zero.
+template <int i0, int i1>
+static inline Vec2q blend2q(Vec2q const & a, Vec2q const & b) {
+
+    // Combine all the indexes into a single bitfield, with 8 bits for each
+    // (only the low 2 bits of each index are kept; bit 1 tells whether the element comes from b)
+    const int m1 = (i0&3) | (i1&3)<<8; 
+
+    // Mask to zero out negative indexes
+    const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8;
+
+    // no elements from b
+    if ((m1 & 0x0202 & mz) == 0) {
+        return permute2q <i0, i1> (a);
+    }
+    // no elements from a
+    if (((m1^0x0202) & 0x0202 & mz) == 0) {
+        return permute2q <i0 & ~2, i1 & ~2> (b);
+    }
+    // (all cases where one index is -1 or -256 would go to the above cases)
+
+    // special case: PUNPCKLQDQ 
+    if (i0 == 0 && i1 == 2) {
+        return _mm_unpacklo_epi64(a, b);
+    }
+    if (i0 == 2 && i1 == 0) {
+        return _mm_unpacklo_epi64(b, a);
+    }
+    // special case: PUNPCKHQDQ 
+    if (i0 == 1 && i1 == 3) {
+        return _mm_unpackhi_epi64(a, b);
+    }
+    if (i0 == 3 && i1 == 1) {
+        return _mm_unpackhi_epi64(b, a);
+    }
+
+#if  INSTRSET >= 4  // SSSE3
+    // special case: shift left
+    // (upper element of a followed by lower element of b)
+    if (i0 == 1 && i1 == 2) {
+        return _mm_alignr_epi8(b, a, 8);
+    }
+    // special case: shift right
+    // (upper element of b followed by lower element of a)
+    if (i0 == 3 && i1 == 0) {
+        return _mm_alignr_epi8(a, b, 8);
+    }
+#endif // SSSE3
+
+#if INSTRSET >= 5   // SSE4.1 supported
+    if (((m1 & ~0x0202) ^ 0x0100) == 0 && mz == 0xFFFF) {
+        // blending without permuting
+        // (the 16-bit blend mask gets four bits per 64-bit element, hence the factor 0xF)
+        return _mm_blend_epi16(a, b, (i0>>1 & 1) * 0xF | ((i1>>1 & 1) * 0xF) << 4 );
+    }
+#endif // SSE4.1
+
+    // general case. combine two permutes 
+    // (all cases are caught by the above special cases if SSE4.1 or higher is supported)
+    // Each permute keeps the elements of one source and passes -1 for the other; the results are OR'ed.
+    __m128i a1, b1;
+    a1 = permute2q <(uint32_t)i0 < 2 ? i0 : -1, (uint32_t)i1 < 2 ? i1 : -1 > (a);
+    b1 = permute2q <(uint32_t)(i0^2) < 2 ? (i0^2) : -1, (uint32_t)(i1^2) < 2 ? (i1^2) : -1 > (b);
+    return  _mm_or_si128(a1,b1);
+}
+
+// Unsigned variant of the 2 x 64-bit blend: passes both operands to blend2q
+// as raw __m128i values and wraps the result as Vec2uq.
+template <int i0, int i1>
+static inline Vec2uq blend2uq(Vec2uq const & a, Vec2uq const & b) {
+    return Vec2uq(
+        blend2q <i0, i1> ((__m128i)a, (__m128i)b));
+}
+
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec4i a(2,0,0,3);           // index a is (  2,   0,   0,   3)
+* Vec4i b(100,101,102,103);   // table b is (100, 101, 102, 103)
+* Vec4i c;
+* c = lookup4 (a,b);          // c is (102, 100, 100, 103)
+*
+*****************************************************************************/
+
+// Byte table lookup with variable indexes: result[j] = table[index[j]] for
+// indexes 0-15. Results for other index values differ between the two paths.
+static inline Vec16c lookup16(Vec16c const & index, Vec16c const & table) {
+#if INSTRSET >= 5  // PSHUFB path (an SSSE3 instruction, enabled here from level 5)
+    return _mm_shuffle_epi8(table, index);
+#else
+    // scalar fallback: spill both vectors and index the table byte by byte
+    uint8_t idx[16];
+    int8_t  tab[16];
+    int8_t  out[16];
+    table.store(tab);
+    index.store(idx);
+    for (int k = 0; k != 16; ++k) {
+        out[k] = tab[idx[k] & 0x0F];   // only the low 4 bits of the index are used
+    }
+    return Vec16c().load(out);
+#endif
+}
+
+// Byte table lookup in a 32-entry table given as two vectors:
+// indexes 0-15 read from table0, indexes 16-31 read from table1.
+static inline Vec16c lookup32(Vec16c const & index, Vec16c const & table0, Vec16c const & table1) {
+#ifdef __XOP__  // AMD XOP instruction set. Use VPPERM
+    return _mm_perm_epi8(table0, table1, index);
+#elif INSTRSET >= 5  // PSHUFB path (an SSSE3 instruction, enabled here from level 5)
+    // PSHUFB writes zero where bit 7 of the selector is set, so adding 0x70 turns
+    // every index that belongs to the other table into a zero-producing selector.
+    Vec16c r0 = _mm_shuffle_epi8(table0, index + 0x70);           // make negative index for values >= 16
+    Vec16c r1 = _mm_shuffle_epi8(table1, (index ^ 0x10) + 0x70);  // make negative index for values <  16
+    return r0 | r1;
+#else
+    uint8_t ii[16];
+    // tt receives both tables back to back (table1 is stored at tt+16), so it
+    // needs room for 32 bytes; a 16-byte buffer would be overrun by the second store.
+    int8_t  tt[32], rr[16];
+    table0.store(tt);  table1.store(tt+16);  index.store(ii);
+    for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x1F];
+    return Vec16c().load(rr);
+#endif
+}
+
+// Byte table lookup in an array of n 8-bit entries: result[j] = table[index[j]].
+// n is a compile-time constant. Tables of up to 32 entries are loaded into one
+// or two vectors and handled by lookup16 / lookup32; larger tables are read
+// element by element after the indexes have been limited to the table size.
+// NOTE(review): the small-table paths load whole vectors from table, so the
+// table presumably needs 16 (or 32) readable bytes even when n is smaller - confirm.
+template <int n>
+static inline Vec16c lookup(Vec16c const & index, void const * table) {
+    if (n <=  0) return 0;
+    if (n <= 16) return lookup16(index, Vec16c().load(table));
+    if (n <= 32) return lookup32(index, Vec16c().load(table), Vec16c().load((int8_t*)table + 16));
+    // n > 32. Limit index
+    // The limit is stored in 8 bits. For n > 256 the value n-1 would wrap around
+    // when narrowed (e.g. n = 300 would give 43), while every 8-bit index is
+    // already inside the table, so the limit is capped at 255.
+    const int last = n > 256 ? 255 : n - 1;
+    Vec16uc index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec16uc(index) & uint8_t(last);
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        index1 = min(Vec16uc(index), uint8_t(last));
+    }
+    uint8_t ii[16];  index1.store(ii);
+    int8_t  rr[16];
+    for (int j = 0; j < 16; j++) {
+        rr[j] = ((int8_t*)table)[ii[j]];
+    }
+    return Vec16c().load(rr);
+}
+
+// 16-bit table lookup with variable indexes: result[j] = table[index[j]] for
+// indexes 0-7. Results for other index values differ between the two paths.
+static inline Vec8s lookup8(Vec8s const & index, Vec8s const & table) {
+#if INSTRSET >= 5  // PSHUFB path (an SSSE3 instruction, enabled here from level 5)
+    // k * 0x202 + 0x100 turns each 16-bit index k into the byte pair (2k, 2k+1)
+    return _mm_shuffle_epi8(
+        table,
+        index * 0x202 + 0x100);
+#else
+    // scalar fallback: spill both vectors and index the table element by element
+    int16_t idx[8];
+    int16_t tab[8];
+    int16_t out[8];
+    table.store(tab);
+    index.store(idx);
+    for (int k = 0; k != 8; ++k) {
+        out[k] = tab[idx[k] & 0x07];   // only the low 3 bits of the index are used
+    }
+    return Vec8s().load(out);
+#endif
+}
+
+// 16-bit table lookup in a 16-entry table given as two vectors:
+// indexes 0-7 read from table0, indexes 8-15 read from table1.
+static inline Vec8s lookup16(Vec8s const & index, Vec8s const & table0, Vec8s const & table1) {
+#ifdef __XOP__  // AMD XOP instruction set. Use VPPERM
+    return _mm_perm_epi8(table0, table1, index * 0x202 + 0x100);
+#elif INSTRSET >= 5  // PSHUFB path (an SSSE3 instruction, enabled here from level 5)
+    // index * 0x202 gives the byte pair (2k, 2k); adding 0x7170 makes it (2k+0x70, 2k+0x71),
+    // which sets bit 7 (zero output) for every element that belongs to the other table.
+    Vec8s r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x202) + Vec16c(Vec8s(0x7170)));
+    Vec8s r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x202 ^ 0x1010) + Vec16c(Vec8s(0x7170)));
+    return r0 | r1;
+#else
+    // Scalar fallback. Only 8 indexes and 16 table entries are stored below, so
+    // the loop covers 8 elements and the index is masked to 0-15; anything wider
+    // would read array slots that were never written.
+    int16_t ii[8], tt[16], rr[8];
+    table0.store(tt);  table1.store(tt+8);  index.store(ii);
+    for (int j = 0; j < 8; j++) rr[j] = tt[ii[j] & 0x0F];
+    return Vec8s().load(rr);
+#endif
+}
+
+// 16-bit table lookup in an array of n entries: result[j] = table[index[j]].
+// n is a compile-time constant. Tables of up to 16 entries are loaded into one
+// or two vectors and handled by lookup8 / lookup16; larger tables are read
+// from memory after the indexes have been limited to the table size.
+// NOTE(review): the small-table paths load whole vectors from table, so the
+// table presumably needs 8 (or 16) readable entries even when n is smaller - confirm.
+template <int n>
+static inline Vec8s lookup(Vec8s const & index, void const * table) {
+    if (n <=  0) return 0;
+    if (n <=  8) return lookup8 (index, Vec8s().load(table));
+    if (n <= 16) return lookup16(index, Vec8s().load(table), Vec8s().load((int16_t*)table + 8));
+    // n > 16. Limit index
+    // The limit has to fit in a 16-bit element. For n > 65536 the value n-1 does
+    // not fit (presumably it would be truncated on conversion - confirm), while
+    // every 16-bit index is already inside the table, so the limit is capped at 0xFFFF.
+    const int last = n > 0x10000 ? 0xFFFF : n - 1;
+    Vec8us index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec8us(index) & last;
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        index1 = min(Vec8us(index), last);
+    }
+#if INSTRSET >= 8 // AVX2. Use 32-bit gather with byte scale 2
+    // NOTE(review): each gather reads 32 bits at table + 2*index, so the highest
+    // index touches 2 bytes past the last 16-bit entry - confirm the table is padded.
+    Vec8s t1 = _mm_i32gather_epi32((const int *)table, __m128i((Vec4i(index1)) & (Vec4i(0x0000FFFF))), 2);  // even positions
+    Vec8s t2 = _mm_i32gather_epi32((const int *)table, _mm_srli_epi32(index1, 16) , 2);  // odd  positions
+    // take the low 16 bits of each gathered dword, alternating between t1 and t2
+    return blend8s<0,8,2,10,4,12,6,14>(t1, t2);
+#else
+    uint16_t ii[8];  index1.store(ii);
+    return Vec8s(((int16_t*)table)[ii[0]], ((int16_t*)table)[ii[1]], ((int16_t*)table)[ii[2]], ((int16_t*)table)[ii[3]],
+                 ((int16_t*)table)[ii[4]], ((int16_t*)table)[ii[5]], ((int16_t*)table)[ii[6]], ((int16_t*)table)[ii[7]]);
+#endif
+}
+
+
+// 32-bit table lookup with variable indexes: result[j] = table[index[j]] for
+// indexes 0-3.
+static inline Vec4i lookup4(Vec4i const & index, Vec4i const & table) {
+#if INSTRSET >= 5  // PSHUFB path (an SSSE3 instruction, enabled here from level 5)
+    // k * 0x04040404 + 0x03020100 turns each index k into the bytes 4k, 4k+1, 4k+2, 4k+3
+    return _mm_shuffle_epi8(
+        table,
+        index * 0x04040404 + 0x03020100);
+#else
+    // element-wise fallback using the vectors' subscript operators
+    return Vec4i(table[index[0]],
+                 table[index[1]],
+                 table[index[2]],
+                 table[index[3]]);
+#endif
+}
+
+// 32-bit table lookup in an 8-entry table given as two vectors:
+// indexes 0-3 read from table0, indexes 4-7 read from table1.
+// The implementation is chosen at compile time: XOP byte permute, AVX2
+// 256-bit dword permute, SSSE3 byte shuffles, or a scalar loop for SSE2.
+static inline Vec4i lookup8(Vec4i const & index, Vec4i const & table0, Vec4i const & table1) {
+    // return Vec4i(lookup16(Vec8s(index * 0x20002 + 0x10000), Vec8s(table0), Vec8s(table1)));
+#ifdef __XOP__  // AMD XOP instruction set. Use VPPERM
+    // index * 0x04040404 + 0x03020100 turns each index k into the bytes 4k .. 4k+3
+    return _mm_perm_epi8(table0, table1, index * 0x04040404 + 0x03020100);
+#elif INSTRSET >= 8 // AVX2. Use VPERMD
+    __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1); // join tables into 256 bit vector
+
+    // The two workaround branches below pass the operands in swapped order for
+    // compilers whose intrinsic declaration had them reversed.
+#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
+    // bug in MS VS 11 beta: operands in wrong order
+    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01));
+#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    // Gcc 4.7.0 also has operands in wrong order
+    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01));
+#else
+    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index)));
+#endif // bug
+
+#elif INSTRSET >= 4  // SSSE3
+    // index * 0x04040404 gives the bytes (4k, 4k, 4k, 4k); adding 0x73727170 makes them
+    // 4k+0x70 .. 4k+0x73, which sets bit 7 (zero output from PSHUFB) for every
+    // element that belongs to the other table. The two partial results are OR'ed.
+    Vec4i r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x04040404) + Vec16c(Vec4i(0x73727170)));
+    Vec4i r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x04040404 ^ 0x10101010) + Vec16c(Vec4i(0x73727170)));
+    return r0 | r1;
+#else    // SSE2
+    // scalar fallback: both tables are stored back to back in tt and indexed with the low 3 bits
+    int32_t ii[4], tt[8], rr[4];
+    table0.store(tt);  table1.store(tt+4);  index.store(ii);
+    for (int j = 0; j < 4; j++) rr[j] = tt[ii[j] & 0x07];
+    return Vec4i().load(rr);
+#endif
+}
+
+// lookup16: table lookup with 16 32-bit elements held in four vectors.
+// table0 holds elements 0-3, table1 elements 4-7, table2 elements 8-11 and
+// table3 elements 12-15. Returns result[i] = table[index[i]].
+// Indexes are expected to be in the range 0-15.
+static inline Vec4i lookup16(Vec4i const & index, Vec4i const & table0, Vec4i const & table1, Vec4i const & table2, Vec4i const & table3) {
+#if INSTRSET >= 8 // AVX2. Use VPERMD
+    __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1); // join tables into 256 bit vector
+    __m256i table23 = _mm256_inserti128_si256(_mm256_castsi128_si256(table2), table3, 1); // join tables into 256 bit vector
+#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
+    // bug in MS VS 11 beta: operands in wrong order
+    __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index    ), table01));
+    __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ^ 8), table23));
+#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    // Gcc 4.7.0 also has operands in wrong order
+    __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index    ), table01));
+    __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ^ 8), table23));
+#else
+    __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index)));
+    __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table23, _mm256_castsi128_si256(index ^ 8)));
+#endif // bug
+    // r0 holds the lookup in elements 0-7 and r1 the lookup in elements 8-15.
+    // r1 must be selected from index 8 upwards; comparing with "> 8" would
+    // return table0[0] instead of table2[0] for index 8
+    return _mm_blendv_epi8(r0, r1, index > 7);
+
+#elif defined (__XOP__)  // AMD XOP instruction set. Use VPPERM
+    Vec4i r0 = _mm_perm_epi8(table0, table1, ((index    ) * 0x04040404u + 0x63626160u) & 0X9F9F9F9Fu);
+    Vec4i r1 = _mm_perm_epi8(table2, table3, ((index ^ 8) * 0x04040404u + 0x63626160u) & 0X9F9F9F9Fu);
+    return r0 | r1;
+
+#elif INSTRSET >= 5  // byte shuffle (PSHUFB)
+    // The added 0x70 sets bit 7 of the byte index whenever the element index
+    // is not in the 4-element range of the table being shuffled, which makes
+    // the shuffle produce zero; the four partial results can then be OR'ed
+    Vec16c aa = Vec16c(Vec4i(0x73727170));
+    Vec4i r0 = _mm_shuffle_epi8(table0, Vec16c((index     ) * 0x04040404) + aa);
+    Vec4i r1 = _mm_shuffle_epi8(table1, Vec16c((index ^  4) * 0x04040404) + aa);
+    Vec4i r2 = _mm_shuffle_epi8(table2, Vec16c((index ^  8) * 0x04040404) + aa);
+    Vec4i r3 = _mm_shuffle_epi8(table3, Vec16c((index ^ 12) * 0x04040404) + aa);
+    return (r0 | r1) | (r2 | r3);
+
+#else    // SSE2
+    // concatenate the tables in memory and look up one element at a time
+    int32_t ii[4], tt[16], rr[4];
+    table0.store(tt);  table1.store(tt+4);  table2.store(tt+8);  table3.store(tt+12);
+    index.store(ii);
+    for (int j = 0; j < 4; j++) rr[j] = tt[ii[j] & 0x0F];  // index is taken modulo 16
+    return Vec4i().load(rr);
+#endif
+}
+
+template <int n>
+static inline Vec4i lookup(Vec4i const & index, void const * table) {  // lookup in a table of n 32-bit integers in memory: result[i] = table[index[i]]
+    if (n <= 0) return 0;
+    if (n <= 4) return lookup4(index, Vec4i().load(table));  // loads a full vector of 4 elements from table, also when n < 4
+    if (n <= 8) return lookup8(index, Vec4i().load(table), Vec4i().load((int32_t*)table + 4));  // loads 8 elements from table, also when n < 8
+    // n > 8. Limit index
+    Vec4ui index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec4ui(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        index1 = min(Vec4ui(index), n-1);  // unsigned compare, so negative indexes are also limited to n-1
+    }
+#if INSTRSET >= 8 // AVX2. Use gather instruction
+    return _mm_i32gather_epi32((const int *)table, index1, 4);  // scale 4 = size of each table element in bytes
+#else
+    uint32_t ii[4];  index1.store(ii);  // fetch the four elements one by one
+    return Vec4i(((int32_t*)table)[ii[0]], ((int32_t*)table)[ii[1]], ((int32_t*)table)[ii[2]], ((int32_t*)table)[ii[3]]);
+#endif
+}
+
+
+// lookup2: table lookup with 2 64-bit elements: result[i] = table[index[i]].
+// Indexes are expected to be 0 or 1. The fallback path uses only the lowest
+// bit of each index so that it can never read outside the 2-element table.
+static inline Vec2q lookup2(Vec2q const & index, Vec2q const & table) {
+#if INSTRSET >= 5  // byte shuffle (PSHUFB)
+    // element index i (0 or 1) becomes byte indexes 8i .. 8i+7
+    return _mm_shuffle_epi8(table, index * 0x0808080808080808ll + 0x0706050403020100ll);
+#else
+    int64_t ii[2], tt[2];
+    table.store(tt);  index.store(ii);
+    // mask the index like the other lookup fallbacks do (modulo table size)
+    return Vec2q(tt[int(ii[0]) & 1], tt[int(ii[1]) & 1]);
+#endif
+}
+
+template <int n>
+static inline Vec2q lookup(Vec2q const & index, void const * table) {  // lookup in a table of n 64-bit integers in memory: result[i] = table[index[i]]
+    if (n <= 0) return 0;
+    // n > 0. Limit index
+    Vec2uq index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec2uq(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1.
+        // There is no 64-bit min instruction, but we can use the 32-bit unsigned min,
+        // since n is a 32-bit integer
+        index1 = Vec2uq(min(Vec2uq(index), constant4i<n-1, 0, n-1, 0>()));  // limit vector holds n-1 in the low half and 0 in the high half of each 64-bit element
+    }
+    uint32_t ii[4];  index1.store(ii);  // use only lower 32 bits of each index
+    int64_t const * tt = (int64_t const *)table;  // view the table as 64-bit elements
+    return Vec2q(tt[ii[0]], tt[ii[2]]);  // ii[0] and ii[2] are the lower halves of the two 64-bit indexes
+}
+
+
+/*****************************************************************************
+*
+*          Other permutations with variable indexes
+*
+*****************************************************************************/
+
+// Function shift_bytes_up: shift whole vector left by b bytes. Returns zero if b is outside the range 0 - 15.
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec16c shift_bytes_up(Vec16c const & a, int b) {
+    if ((uint32_t)b > 15) return _mm_setzero_si128();  // the unsigned compare also catches negative b
+#if INSTRSET >= 4    // SSSE3
+    static const char mask[32] = {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};    // 16 x -1 followed by byte indexes 0-15
+    return Vec16c(_mm_shuffle_epi8(a, Vec16c().load(mask+16-b)));  // 16-byte window = b times -1 followed by indexes 0 .. 15-b; the -1 entries produce zero bytes
+#else
+    Vec2uq a1 = Vec2uq(a);  // treat the vector as two 64-bit halves
+    if (b < 8) {    
+        a1 = (a1 << (b*8)) | (permute2uq<-1,0>(a1) >> (64 - (b*8)));  // shift both halves and carry the top bytes of the low half into the high half; presumably permute2uq<-1,0> gives (0, low half) - verify
+    }
+    else {
+        a1 = permute2uq<-1,0>(a1) << ((b-8)*8);  // only bytes from the low half remain; they end up in the high half
+    }
+    return Vec16c(a1);
+#endif
+}
+
+// Function shift_bytes_down: shift whole vector right by b bytes. Returns zero if b is outside the range 0 - 15.
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec16c shift_bytes_down(Vec16c const & a, int b) {
+    if ((uint32_t)b > 15) return _mm_setzero_si128();  // the unsigned compare also catches negative b
+#if INSTRSET >= 4    // SSSE3
+    static const char mask[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1};  // byte indexes 0-15 followed by 16 x -1
+    return Vec16c(_mm_shuffle_epi8(a, Vec16c().load(mask+b)));  // 16-byte window = indexes b .. 15 followed by b times -1; the -1 entries produce zero bytes
+#else
+    Vec2uq a1 = Vec2uq(a);  // treat the vector as two 64-bit halves
+    if (b < 8) {    
+        a1 = (a1 >> (b*8)) | (permute2uq<1,-1>(a1) << (64 - (b*8)));  // shift both halves and carry the low bytes of the high half into the low half; presumably permute2uq<1,-1> gives (high half, 0) - verify
+    }
+    else {
+        a1 = permute2uq<1,-1>(a1) >> ((b-8)*8); // only bytes from the high half remain; they end up in the low half
+    }
+    return Vec16c(a1);
+#endif
+}
+
+/*****************************************************************************
+*
+*          Gather functions with fixed indexes
+*
+*****************************************************************************/
+// Load elements from array a of 32-bit integers with compile-time constant indices i0, i1, i2, i3
+template <int i0, int i1, int i2, int i3>
+static inline Vec4i gather4i(void const * a) {
+    Static_error_check<(i0|i1|i2|i3)>=0> Negative_array_index;  // Error message if index is negative
+    const int i01min = i0 < i1 ? i0 : i1;
+    const int i23min = i2 < i3 ? i2 : i3;
+    const int imin   = i01min < i23min ? i01min : i23min;  // smallest of the four indices
+    const int i01max = i0 > i1 ? i0 : i1;
+    const int i23max = i2 > i3 ? i2 : i3;
+    const int imax   = i01max > i23max ? i01max : i23max;  // largest of the four indices
+    if (imax - imin <= 3) {
+        // load one contiguous block and permute
+        if (imax > 3) {
+            // make sure we don't read past the end of the array
+            Vec4i b = Vec4i().load((int32_t const *)a + imax-3);  // block a[imax-3 .. imax]
+            return permute4i<i0-imax+3, i1-imax+3, i2-imax+3, i3-imax+3>(b);
+        }
+        else {
+            Vec4i b = Vec4i().load((int32_t const *)a + imin);  // block a[imin .. imin+3]
+            return permute4i<i0-imin, i1-imin, i2-imin, i3-imin>(b);
+        }
+    }
+    if ((i0<imin+4 || i0>imax-4) && (i1<imin+4 || i1>imax-4) && (i2<imin+4 || i2>imax-4) && (i3<imin+4 || i3>imax-4)) {  // every index lies in the first 4 elements from imin or the last 4 up to imax
+        // load two contiguous blocks and blend
+        Vec4i b = Vec4i().load((int32_t const *)a + imin);  // block a[imin .. imin+3]
+        Vec4i c = Vec4i().load((int32_t const *)a + imax-3);  // block a[imax-3 .. imax]
+        const int j0 = i0<imin+4 ? i0-imin : 7-imax+i0;  // blend index: 0-3 selects from b, 4-7 selects from c
+        const int j1 = i1<imin+4 ? i1-imin : 7-imax+i1;
+        const int j2 = i2<imin+4 ? i2-imin : 7-imax+i2;
+        const int j3 = i3<imin+4 ? i3-imin : 7-imax+i3;
+        return blend4i<j0, j1, j2, j3>(b, c);
+    }
+    // use AVX2 gather if available
+#if INSTRSET >= 8
+    return _mm_i32gather_epi32((const int *)a, Vec4i(i0,i1,i2,i3), 4);  // scale 4 = element size in bytes
+#else
+    return lookup<imax+1>(Vec4i(i0,i1,i2,i3), a);  // general lookup in a table of imax+1 elements
+#endif
+}
+
+// Load elements from array a of 64-bit integers with compile-time constant indices i0, i1
+template <int i0, int i1>
+static inline Vec2q gather2q(void const * a) {
+    Static_error_check<(i0|i1)>=0> Negative_array_index;  // Error message if index is negative
+    const int imin = i0 < i1 ? i0 : i1;  // smaller index
+    const int imax = i0 > i1 ? i0 : i1;  // larger index
+    if (imax - imin <= 1) {
+        // load one contiguous block and permute
+        if (imax > 1) {
+            // make sure we don't read past the end of the array
+            Vec2q b = Vec2q().load((int64_t const *)a + imax-1);  // block a[imax-1 .. imax]
+            return permute2q<i0-imax+1, i1-imax+1>(b);
+        }
+        else {
+            Vec2q b = Vec2q().load((int64_t const *)a + imin);  // block a[imin .. imin+1]
+            return permute2q<i0-imin, i1-imin>(b);
+        }
+    }
+    return Vec2q(((int64_t*)a)[i0], ((int64_t*)a)[i1]);  // indices too far apart: fetch the two elements separately
+}
+
+
+/*****************************************************************************
+*
+*          Functions for conversion between integer sizes
+*
+*****************************************************************************/
+
+// Extend 8-bit integers to 16-bit integers, signed and unsigned
+
+// extend_low: sign-extend the low 8 elements of a from 8 to 16 bits
+static inline Vec8s extend_low (Vec16c const & a) {
+    const __m128i zero    = _mm_setzero_si128();
+    const __m128i signext = _mm_cmpgt_epi8(zero, a);       // 0xFF where a < 0, otherwise 0x00
+    return _mm_unpacklo_epi8(a, signext);                  // each low byte followed by its sign-extension byte
+}
+
+// extend_high: sign-extend the high 8 elements of a from 8 to 16 bits
+static inline Vec8s extend_high (Vec16c const & a) {
+    const __m128i zero    = _mm_setzero_si128();
+    const __m128i signext = _mm_cmpgt_epi8(zero, a);       // 0xFF where a < 0, otherwise 0x00
+    return _mm_unpackhi_epi8(a, signext);                  // each high byte followed by its sign-extension byte
+}
+
+// extend_low: zero-extend the low 8 elements of a from 8 to 16 bits
+static inline Vec8us extend_low (Vec16uc const & a) {
+    const __m128i zero = _mm_setzero_si128();
+    return _mm_unpacklo_epi8(a, zero);                     // each low byte followed by a zero byte
+}
+
+// extend_high: zero-extend the high 8 elements of a from 8 to 16 bits
+static inline Vec8us extend_high (Vec16uc const & a) {
+    const __m128i zero = _mm_setzero_si128();
+    return _mm_unpackhi_epi8(a, zero);                     // each high byte followed by a zero byte
+}
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// extend_low: sign-extend the low 4 elements of a from 16 to 32 bits
+static inline Vec4i extend_low (Vec8s const & a) {
+    // the arithmetic shift by 15 gives 0xFFFF for negative elements and 0 otherwise;
+    // interleaving puts that word above each low element
+    return _mm_unpacklo_epi16(a, _mm_srai_epi16(a, 15));
+}
+
+// extend_high: sign-extend the high 4 elements of a from 16 to 32 bits
+static inline Vec4i extend_high (Vec8s const & a) {
+    // the arithmetic shift by 15 gives 0xFFFF for negative elements and 0 otherwise;
+    // interleaving puts that word above each high element
+    return _mm_unpackhi_epi16(a, _mm_srai_epi16(a, 15));
+}
+
+// extend_low: zero-extend the low 4 elements of a from 16 to 32 bits
+static inline Vec4ui extend_low (Vec8us const & a) {
+    const __m128i zero = _mm_setzero_si128();
+    return _mm_unpacklo_epi16(a, zero);                    // each low word followed by a zero word
+}
+
+// extend_high: zero-extend the high 4 elements of a from 16 to 32 bits
+static inline Vec4ui extend_high (Vec8us const & a) {
+    const __m128i zero = _mm_setzero_si128();
+    return _mm_unpackhi_epi16(a, zero);                    // each high word followed by a zero word
+}
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// extend_low: sign-extend the low 2 elements of a from 32 to 64 bits
+static inline Vec2q extend_low (Vec4i const & a) {
+    // the arithmetic shift by 31 gives 0xFFFFFFFF for negative elements and 0 otherwise;
+    // interleaving puts that dword above each low element
+    return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31));
+}
+
+// extend_high: sign-extend the high 2 elements of a from 32 to 64 bits
+static inline Vec2q extend_high (Vec4i const & a) {
+    // the arithmetic shift by 31 gives 0xFFFFFFFF for negative elements and 0 otherwise;
+    // interleaving puts that dword above each high element
+    return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31));
+}
+
+// extend_low: zero-extend the low 2 elements of a from 32 to 64 bits
+static inline Vec2uq extend_low (Vec4ui const & a) {
+    const __m128i zero = _mm_setzero_si128();
+    return _mm_unpacklo_epi32(a, zero);                    // each low dword followed by a zero dword
+}
+
+// extend_high: zero-extend the high 2 elements of a from 32 to 64 bits
+static inline Vec2uq extend_high (Vec4ui const & a) {
+    const __m128i zero = _mm_setzero_si128();
+    return _mm_unpackhi_epi32(a, zero);                    // each high dword followed by a zero dword
+}
+
+// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// compress: pack two vectors of signed 16-bit integers into one vector of
+// 8-bit integers. Values that do not fit wrap around (only the low byte of
+// each element is kept)
+static inline Vec16c compress (Vec8s const & low, Vec8s const & high) {
+    const __m128i lowbyte = _mm_set1_epi32(0x00FF00FF);    // selects the low byte of every 16-bit element
+    // after masking every element is in the range 0-255, so the saturating
+    // unsigned pack simply copies the low bytes
+    return _mm_packus_epi16(_mm_and_si128(low, lowbyte), _mm_and_si128(high, lowbyte));
+}
+
+// compress_saturated: pack two vectors of signed 16-bit integers into one
+// vector of signed 8-bit integers, with signed saturation
+static inline Vec16c compress_saturated (Vec8s const & low, Vec8s const & high) {
+    const __m128i packed = _mm_packs_epi16(low, high);     // pack with signed saturation
+    return packed;
+}
+
+// compress: pack two vectors of unsigned 16-bit integers into one vector of
+// unsigned 8-bit integers. Overflow wraps around
+static inline Vec16uc compress (Vec8us const & low, Vec8us const & high) {
+    Vec8s lo16 = (Vec8s)low;                               // reuse the signed version, which
+    Vec8s hi16 = (Vec8s)high;                              // keeps only the low byte of each element
+    return Vec16uc(compress(lo16, hi16));
+}
+
+// Function compress_saturated : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Unsigned, with saturation
+static inline Vec16uc compress_saturated (Vec8us const & low, Vec8us const & high) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    __m128i maxval  = _mm_set1_epi32(0x00FF00FF);          // maximum value, 0x00FF in every 16-bit element
+    __m128i minval  = _mm_setzero_si128();                 // minimum value = 0
+    __m128i low1    = _mm_min_epu16(low,maxval);           // upper limit
+    __m128i high1   = _mm_min_epu16(high,maxval);          // upper limit
+    __m128i low2    = _mm_max_epu16(low1,minval);          // lower limit (unsigned max with 0 leaves the value unchanged)
+    __m128i high2   = _mm_max_epu16(high1,minval);         // lower limit (unsigned max with 0 leaves the value unchanged)
+    return            _mm_packus_epi16(low2,high2);        // this instruction saturates from signed 16 bit to unsigned 8 bit
+#else
+    __m128i zero    = _mm_setzero_si128();                 // 0
+    __m128i signlow = _mm_cmpgt_epi16(zero,low);           // FFFF if sign bit of low is set
+    __m128i signhi  = _mm_cmpgt_epi16(zero,high);          // FFFF if sign bit of high is set
+    __m128i slow2   = _mm_srli_epi16(signlow,8);           // 00FF if sign bit of low is set
+    __m128i shigh2  = _mm_srli_epi16(signhi,8);            // 00FF if sign bit of high is set
+    __m128i maskns  = _mm_set1_epi32(0x7FFF7FFF);          // mask for removing sign bit
+    __m128i lowns   = _mm_and_si128(low,maskns);           // low,  with sign bit removed
+    __m128i highns  = _mm_and_si128(high,maskns);          // high, with sign bit removed
+    __m128i lowo    = _mm_or_si128(lowns,slow2);           // low,  sign bit replaced by 00FF
+    __m128i higho   = _mm_or_si128(highns,shigh2);         // high, sign bit replaced by 00FF
+    return            _mm_packus_epi16(lowo,higho);        // this instruction saturates from signed 16 bit to unsigned 8 bit
+#endif
+}
+
+// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Overflow wraps around
+static inline Vec8s compress (Vec4i const & low, Vec4i const & high) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    __m128i mask  = _mm_set1_epi32(0x0000FFFF);            // mask for low words
+    __m128i lowm  = _mm_and_si128(low,mask);               // low words of low
+    __m128i highm = _mm_and_si128(high,mask);              // low words of high
+    return  _mm_packus_epi32(lowm,highm);                  // unsigned pack; masked values never saturate
+#else
+    __m128i low1  = _mm_shufflelo_epi16(low,0xD8);         // low words in place (lower 64 bits)
+    __m128i high1 = _mm_shufflelo_epi16(high,0xD8);        // low words in place (lower 64 bits)
+    __m128i low2  = _mm_shufflehi_epi16(low1,0xD8);        // low words in place (upper 64 bits)
+    __m128i high2 = _mm_shufflehi_epi16(high1,0xD8);       // low words in place (upper 64 bits)
+    __m128i low3  = _mm_shuffle_epi32(low2,0xD8);          // low dwords of low  to pos. 0 and 32
+    __m128i high3 = _mm_shuffle_epi32(high2,0xD8);         // low dwords of high to pos. 0 and 32
+    return  _mm_unpacklo_epi64(low3,high3);                // combine the lower 64 bits of low3 and high3
+#endif
+}
+
+// compress_saturated: pack two vectors of signed 32-bit integers into one
+// vector of signed 16-bit integers, with signed saturation
+static inline Vec8s compress_saturated (Vec4i const & low, Vec4i const & high) {
+    const __m128i packed = _mm_packs_epi32(low, high);     // pack with signed saturation
+    return packed;
+}
+
+// compress: pack two vectors of unsigned 32-bit integers into one vector of
+// unsigned 16-bit integers. Overflow wraps around
+static inline Vec8us compress (Vec4ui const & low, Vec4ui const & high) {
+    Vec4i lo32 = (Vec4i)low;                               // reuse the signed version, which
+    Vec4i hi32 = (Vec4i)high;                              // keeps only the low word of each element
+    return Vec8us(compress(lo32, hi32));
+}
+
+// Function compress_saturated : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, with saturation
+static inline Vec8us compress_saturated (Vec4ui const & low, Vec4ui const & high) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    __m128i maxval  = _mm_set1_epi32(0x0000FFFF);          // maximum value
+    __m128i minval  = _mm_setzero_si128();                 // minimum value = 0
+    __m128i low1    = _mm_min_epu32(low,maxval);           // upper limit
+    __m128i high1   = _mm_min_epu32(high,maxval);          // upper limit
+    __m128i low2    = _mm_max_epu32(low1,minval);          // lower limit (unsigned max with 0 leaves the value unchanged)
+    __m128i high2   = _mm_max_epu32(high1,minval);         // lower limit (unsigned max with 0 leaves the value unchanged)
+    return            _mm_packus_epi32(low2,high2);        // this instruction saturates from signed 32 bit to unsigned 16 bit
+#else
+    __m128i zero     = _mm_setzero_si128();                // 0
+    __m128i lowzero  = _mm_cmpeq_epi16(low,zero);          // for each word is zero
+    __m128i highzero = _mm_cmpeq_epi16(high,zero);         // for each word is zero
+    __m128i mone     = _mm_set1_epi32(-1);                 // FFFFFFFF
+    __m128i lownz    = _mm_xor_si128(lowzero,mone);        // for each word is nonzero
+    __m128i highnz   = _mm_xor_si128(highzero,mone);       // for each word is nonzero
+    __m128i lownz2   = _mm_srli_epi32(lownz,16);           // move the flag of the high word down to the low word
+    __m128i highnz2  = _mm_srli_epi32(highnz,16);          // move the flag of the high word down to the low word
+    __m128i lowsatur = _mm_or_si128(low,lownz2);           // low, saturated: low word becomes FFFF if high word is nonzero
+    __m128i hisatur  = _mm_or_si128(high,highnz2);         // high, saturated: low word becomes FFFF if high word is nonzero
+    return  Vec8us (compress(Vec4i(lowsatur), Vec4i(hisatur)));  // keep the low word of each element
+#endif
+}
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// compress: pack two vectors of signed 64-bit integers into one vector of
+// 32-bit integers. Overflow wraps around (only the low dword of each element
+// is kept)
+static inline Vec4i compress (Vec2q const & low, Vec2q const & high) {
+    // shuffle 0xD8 moves the low dword of both elements into the lower 64 bits;
+    // the lower 64 bits of the two shuffled vectors are then joined
+    return _mm_unpacklo_epi64(_mm_shuffle_epi32(low, 0xD8), _mm_shuffle_epi32(high, 0xD8));
+}
+
+// compress_saturated: pack two vectors of signed 64-bit integers into one
+// vector of signed 32-bit integers, with signed saturation.
+// This function is very inefficient unless the SSE4.2 instruction set is supported
+static inline Vec4i compress_saturated (Vec2q const & low, Vec2q const & high) {
+    Vec2q upper = _mm_set_epi32(0,0x7FFFFFFF,0,0x7FFFFFFF);      // largest  32-bit signed value in each 64-bit element
+    Vec2q lower = _mm_set_epi32(-1,0x80000000,-1,0x80000000);    // smallest 32-bit signed value in each 64-bit element
+    Vec2q lowclamped  = max(min(low,  upper), lower);            // clamp to the 32-bit range
+    Vec2q highclamped = max(min(high, upper), lower);            // clamp to the 32-bit range
+    return compress(lowclamped, highclamped);                    // clamped values fit, so nothing is lost
+}
+
+// compress: pack two vectors of unsigned 64-bit integers into one vector of
+// unsigned 32-bit integers. Overflow wraps around
+static inline Vec4ui compress (Vec2uq const & low, Vec2uq const & high) {
+    Vec2q lo64 = (Vec2q)low;                               // reuse the signed version, which
+    Vec2q hi64 = (Vec2q)high;                              // keeps only the low dword of each element
+    return Vec4ui(compress(lo64, hi64));
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, with saturation
+static inline Vec4ui compress_saturated (Vec2uq const & low, Vec2uq const & high) {
+    __m128i zero     = _mm_setzero_si128();                // 0
+    __m128i lowzero  = _mm_cmpeq_epi32(low,zero);          // for each dword is zero
+    __m128i highzero = _mm_cmpeq_epi32(high,zero);         // for each dword is zero
+    __m128i mone     = _mm_set1_epi32(-1);                 // FFFFFFFF
+    __m128i lownz    = _mm_xor_si128(lowzero,mone);        // for each dword is nonzero
+    __m128i highnz   = _mm_xor_si128(highzero,mone);       // for each dword is nonzero
+    __m128i lownz2   = _mm_srli_epi64(lownz,32);           // move the flag of the high dword down to the low dword
+    __m128i highnz2  = _mm_srli_epi64(highnz,32);          // move the flag of the high dword down to the low dword
+    __m128i lowsatur = _mm_or_si128(low,lownz2);           // low, saturated: low dword becomes FFFFFFFF if high dword is nonzero
+    __m128i hisatur  = _mm_or_si128(high,highnz2);         // high, saturated: low dword becomes FFFFFFFF if high dword is nonzero
+    return  Vec4ui (compress(Vec2q(lowsatur), Vec2q(hisatur)));  // keep the low dword of each element
+}
+
+/*****************************************************************************
+*
+*          Helper functions for division and bit scan
+*
+*****************************************************************************/
+
+// vml_popcnt: returns the number of bits that are set in a 32-bit integer
+#if INSTRSET >= 6   // SSE4.2
+    // The popcnt instruction is not formally part of SSE4.2, but all known
+    // processors with SSE4.2 have it
+#if defined (__GNUC__) || defined(__clang__)
+static inline uint32_t vml_popcnt (uint32_t a) __attribute__ ((pure));
+static inline uint32_t vml_popcnt (uint32_t a) {
+    uint32_t count;
+    __asm("popcnt %1, %0" : "=r"(count) : "r"(a) : );
+    return count;
+}
+#else
+static inline uint32_t vml_popcnt (uint32_t a) {
+    const uint32_t count = _mm_popcnt_u32(a);              // MS intrinsic
+    return count;
+}
+#endif // platform
+#else  // no SSE4.2
+static inline uint32_t vml_popcnt (uint32_t a) {
+    // popcnt instruction not available: add the bits in parallel in fields
+    // of 2, 4 and 8 bits, then sum the four bytes with a multiplication
+    uint32_t pairs   = a - ((a >> 1) & 0x55555555);                         // 2-bit sums
+    uint32_t nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);  // 4-bit sums
+    uint32_t bytes   = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F;             // 8-bit sums
+    return (bytes * 0x01010101) >> 24;                                      // total ends up in the top byte
+}
+#endif
+
+
+// Define bit-scan-forward function. Gives index to lowest set bit.
+// a must be nonzero: the BSF instruction does not define a result for a = 0
+#if defined (__GNUC__) || defined(__clang__)
+// The pure attribute must be declared on bit_scan_forward itself
+// (it was previously attached to bit_scan_reverse by mistake)
+static inline uint32_t bit_scan_forward (uint32_t a) __attribute__ ((pure));
+static inline uint32_t bit_scan_forward (uint32_t a) {
+    uint32_t r;
+    __asm("bsfl %1, %0" : "=r"(r) : "r"(a) : );
+    return r;
+}
+#else
+static inline uint32_t bit_scan_forward (uint32_t a) {
+    unsigned long r;
+    _BitScanForward(&r, a);                      // defined in intrin.h for MS and Intel compilers
+    return r;
+}
+#endif
+
+// bit_scan_reverse: index of the highest set bit of a, i.e. floor(log2(a))
+#if defined (__GNUC__) || defined(__clang__)
+static inline uint32_t bit_scan_reverse (uint32_t a) __attribute__ ((pure));
+static inline uint32_t bit_scan_reverse (uint32_t a) {
+    uint32_t highest;
+    __asm("bsrl %1, %0" : "=r"(highest) : "r"(a) : );
+    return highest;
+}
+#else
+static inline uint32_t bit_scan_reverse (uint32_t a) {
+    unsigned long highest;
+    _BitScanReverse(&highest, a);                // defined in intrin.h for MS and Intel compilers
+    return highest;
+}
+#endif
+
+// Same function as bit_scan_reverse, for compile-time constants: BitScanR<n>::val = floor(log2(n)).
+// We need template metaprogramming for calculating this function at compile time.
+// This may take a long time to compile because of the template recursion.
+// Todo: replace this with a constexpr function when C++14 becomes available
+template <uint32_t n> 
+struct BitScanR {
+    enum {val = (
+        n >= 0x10 ? 4 + (BitScanR<(n>>4)>::val) :          // more than 4 bits: handle the upper bits recursively
+        n  <    2 ? 0 :
+        n  <    4 ? 1 :
+        n  <    8 ? 2 : 3 )                       };
+};
+template <> struct BitScanR<0> {enum {val = 0};};          // Avoid infinite template recursion; val is 0 for n = 0
+
+#define bit_scan_reverse_const(n)  (BitScanR<n>::val)      // n must be a valid compile-time constant
+
+
+/*****************************************************************************
+*
+*          Integer division operators
+*
+******************************************************************************
+*
+* The instruction set does not support integer vector division. Instead, we
+* are using a method for fast integer division based on multiplication and
+* shift operations. This method is faster than simple integer division if the
+* same divisor is used multiple times.
+*
+* All elements in a vector are divided by the same divisor. It is not possible
+* to divide different elements of the same vector by different divisors.
+*
+* The parameters used for fast division are stored in an object of a 
+* Divisor class. This object can be created implicitly, for example in:
+*        Vec4i a, b; int c;
+*        a = b / c;
+* or explicitly as:
+*        a = b / Divisor_i(c);
+*
+* It takes more time to compute the parameters used for fast division than to
+* do the division. Therefore, it is advantageous to use the same divisor object
+* multiple times. For example, to divide 80 unsigned short integers by 10:
+*
+*        uint16_t dividends[80], quotients[80];         // numbers to work with
+*        Divisor_us div10(10);                          // make divisor object for dividing by 10
+*        Vec8us temp;                                   // temporary vector
+*        for (int i = 0; i < 80; i += 8) {              // loop for 8 elements per iteration
+*            temp.load(dividends+i);                    // load 8 elements
+*            temp /= div10;                             // divide each element by 10
+*            temp.store(quotients+i);                   // store 8 elements
+*        }
+* 
+* The parameters for fast division can also be computed at compile time. This is
+* an advantage if the divisor is known at compile time. Use the const_int or const_uint
+* macro to do this. For example, for signed integers:
+*        Vec8s a, b;
+*        a = b / const_int(10);
+* Or, for unsigned integers:
+*        Vec8us a, b;
+*        a = b / const_uint(10);
+*
+* The division of a vector of 16-bit integers is faster than division of a vector 
+* of other integer sizes.
+*
+* 
+* Mathematical formula, used for signed division with fixed or variable divisor:
+* (From T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication,
+* Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.
+* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556 )
+* x = dividend
+* d = abs(divisor)
+* w = integer word size, bits
+* L = ceil(log2(d)) = bit_scan_reverse(d-1)+1
+* L = max(L,1)
+* m = 1 + 2^(w+L-1)/d - 2^w                      [division should overflow to 0 if d = 1]
+* sh1 = L-1
+* q = x + (m*x >> w)                             [high part of signed multiplication with 2w bits]
+* q = (q >> sh1) - (x<0 ? -1 : 0)
+* if (divisor < 0) q = -q 
+* result trunc(x/d) = q
+*
+* Mathematical formula, used for unsigned division with variable divisor:
+* (Also from T. Granlund and P. L. Montgomery)
+* x = dividend
+* d = divisor
+* w = integer word size, bits
+* L = ceil(log2(d)) = bit_scan_reverse(d-1)+1
+* m = 1 + 2^w * (2^L-d) / d                      [2^L should overflow to 0 if L = w]
+* sh1 = min(L,1)
+* sh2 = max(L-1,0)
+* t = m*x >> w                                   [high part of unsigned multiplication with 2w bits]
+* result floor(x/d) = (((x-t) >> sh1) + t) >> sh2
+*
+* Mathematical formula, used for unsigned division with fixed divisor:
+* (From Terje Mathisen, unpublished)
+* x = dividend
+* d = divisor
+* w = integer word size, bits
+* b = floor(log2(d)) = bit_scan_reverse(d)
+* f = 2^(w+b) / d                                [exact division]
+* If f is an integer then d is a power of 2: go to case A
+* If the fractional part of f is < 0.5 then go to case B
+* If the fractional part of f is > 0.5 then go to case C
+* Case A:  [shift only]
+* result = x >> b
+* Case B:  [round down f and compensate by adding one to x]
+* result = ((x+1)*floor(f)) >> (w+b)             [high part of unsigned multiplication with 2w bits]
+* Case C:  [round up f, no compensation for rounding error]
+* result = (x*ceil(f)) >> (w+b)                  [high part of unsigned multiplication with 2w bits]
+*
+*
+*****************************************************************************/
+
+// encapsulate parameters for fast division on vector of 4 32-bit signed integers
+class Divisor_i {
+protected:
+    __m128i multiplier;                                    // multiplier used in fast division
+    __m128i shift1;                                        // shift count used in fast division
+    __m128i sign;                                          // sign of divisor
+public:
+    Divisor_i() {};                                        // Default constructor; does not set any of the parameters
+    Divisor_i(int32_t d) {                                 // Constructor with divisor
+        set(d);
+    }
+    Divisor_i(int m, int s1, int sgn) {                    // Constructor with precalculated multiplier, shift and sign
+        multiplier = _mm_set1_epi32(m);                    // broadcast multiplier
+        shift1     = _mm_cvtsi32_si128(s1);                // shift count in the lowest element, remaining elements zero
+        sign       = _mm_set1_epi32(sgn);                  // broadcast sign; set() stores -1 for a negative divisor and 0 otherwise
+    }
+    void set(int32_t d) {                                  // Set or change divisor, calculate parameters
+        const int32_t d1 = abs(d);                         // magnitude of divisor. NOTE(review): d = -2^31 presumably ends up in the else branch below - confirm
+        int32_t sh, m;
+        if (d1 > 1) {
+            sh = bit_scan_reverse(d1-1);                   // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1
+            m = int32_t((int64_t(1) << (32+sh)) / d1 - ((int64_t(1) << 32) - 1)); // calculate multiplier = 1 + 2^(32+sh)/d1 - 2^32
+        }
+        else {
+            m  = 1;                                        // for d1 = 1
+            sh = 0;
+            if (d == 0) m /= d;                            // provoke error here if d = 0
+            if (uint32_t(d) == 0x80000000u) {              // fix overflow for this special case (d = -2^31)
+                m  = 0x80000001;
+                sh = 30;
+            }
+        }
+        multiplier = _mm_set1_epi32(m);                    // broadcast multiplier
+        shift1     = _mm_setr_epi32(sh, 0, 0, 0);          // shift count
+        sign       = _mm_set1_epi32(d < 0 ? -1 : 0);       // sign of divisor
+    }
+    __m128i getm() const {                                 // get multiplier
+        return multiplier;
+    }
+    __m128i gets1() const {                                // get shift count
+        return shift1;
+    }
+    __m128i getsign() const {                              // get sign of divisor
+        return sign;
+    }
+};
+
+// encapsulate parameters for fast division on vector of 4 32-bit unsigned integers
+//
+// set(d) precalculates a multiplier and two shift counts for the divisor d.
+// The values are read back through getm(), gets1() and gets2() by the
+// operator / that takes a Divisor_ui.
+class Divisor_ui {
+protected:
+    __m128i multiplier;                                    // multiplier used in fast division
+    __m128i shift1;                                        // shift count 1 used in fast division
+    __m128i shift2;                                        // shift count 2 used in fast division
+public:
+    Divisor_ui() {};                                       // Default constructor. Members are not set until set() is called
+    Divisor_ui(uint32_t d) {                               // Constructor with divisor
+        set(d);
+    }
+    Divisor_ui(uint32_t m, int s1, int s2) {               // Constructor with precalculated multiplier and shifts
+        multiplier = _mm_set1_epi32(m);
+        shift1     = _mm_setr_epi32(s1, 0, 0, 0);
+        shift2     = _mm_setr_epi32(s2, 0, 0, 0);
+    }
+    void set(uint32_t d) {                                 // Set or change divisor, calculate parameters
+        uint32_t L, L2, sh1, sh2, m;
+        switch (d) {
+        case 0:
+            m = sh1 = sh2 = 1 / d;                         // provoke error for d = 0
+            break;
+        case 1:
+            m = 1; sh1 = sh2 = 0;                          // parameters for d = 1
+            break;
+        case 2:
+            m = 1; sh1 = 1; sh2 = 0;                       // parameters for d = 2
+            break;
+        default:                                           // general case for d > 2
+            L  = bit_scan_reverse(d-1)+1;                  // ceil(log2(d))
+            // 2^L, overflow to 0 if L = 32.
+            // The shifted 1 must be unsigned: L can be 31, and 1 << 31 does not fit in a signed int
+            L2 = L < 32 ? uint32_t(1) << L : 0;
+            m  = 1 + uint32_t((uint64_t(L2 - d) << 32) / d); // multiplier
+            sh1 = 1;  sh2 = L - 1;                         // shift counts
+        }
+        multiplier = _mm_set1_epi32(m);                    // broadcast multiplier to all four elements
+        shift1     = _mm_setr_epi32(sh1, 0, 0, 0);         // shift count goes in the lowest element only
+        shift2     = _mm_setr_epi32(sh2, 0, 0, 0);
+    }
+    __m128i getm() const {                                 // get multiplier
+        return multiplier;
+    }
+    __m128i gets1() const {                                // get shift count 1
+        return shift1;
+    }
+    __m128i gets2() const {                                // get shift count 2
+        return shift2;
+    }
+};
+
+
+// Precalculated multiplier, shift count and divisor sign for fast division
+// of a vector of 8 16-bit signed integers by one common divisor
+class Divisor_s {
+protected:
+    __m128i multiplier;                                    // multiplier used in fast division
+    __m128i shift1;                                        // shift count used in fast division
+    __m128i sign;                                          // sign of divisor
+public:
+    // Default constructor
+    Divisor_s() {};
+    // Constructor with divisor
+    Divisor_s(int16_t d) {
+        set(d);
+    }
+    // Constructor with precalculated multiplier, shift and sign
+    Divisor_s(int16_t m, int s1, int sgn) {
+        multiplier = _mm_set1_epi16(m);
+        shift1     = _mm_setr_epi32(s1, 0, 0, 0);
+        sign       = _mm_set1_epi32(sgn);
+    }
+    // Set or change divisor, calculate parameters
+    void set(int16_t d) {
+        const int32_t absd = abs(d);
+        int32_t shiftcount = 0;                            // defaults are the parameters for abs(d) = 1
+        int32_t mult       = 1;
+        if (absd > 1) {
+            // shift count = ceil(log2(absd))-1 = (bit_scan_reverse(absd-1)+1)-1
+            shiftcount = bit_scan_reverse(absd-1);
+            // calculate multiplier
+            mult = ((int32_t(1) << (16+shiftcount)) / absd - ((int32_t(1) << 16) - 1));
+        }
+        else {
+            if (d == 0) mult /= d;                         // provoke error here if d = 0
+            if (uint16_t(d) == 0x8000u) {                  // fix overflow for this special case
+                mult       = 0x8001;
+                shiftcount = 14;
+            }
+        }
+        multiplier = _mm_set1_epi16(int16_t(mult));        // broadcast multiplier
+        shift1     = _mm_setr_epi32(shiftcount, 0, 0, 0);  // shift count
+        sign       = _mm_set1_epi32(d < 0 ? -1 : 0);       // sign of divisor
+    }
+    // get multiplier
+    __m128i getm() const {
+        return multiplier;
+    }
+    // get shift count
+    __m128i gets1() const {
+        return shift1;
+    }
+    // get sign of divisor
+    __m128i getsign() const {
+        return sign;
+    }
+};
+
+
+// Precalculated multiplier and two shift counts for fast division of a
+// vector of 8 16-bit unsigned integers by one common divisor
+class Divisor_us {
+protected:
+    __m128i multiplier;                                    // multiplier used in fast division
+    __m128i shift1;                                        // shift count 1 used in fast division
+    __m128i shift2;                                        // shift count 2 used in fast division
+public:
+    // Default constructor
+    Divisor_us() {};
+    // Constructor with divisor
+    Divisor_us(uint16_t d) {
+        set(d);
+    }
+    // Constructor with precalculated multiplier and shifts
+    Divisor_us(uint16_t m, int s1, int s2) {
+        multiplier = _mm_set1_epi16(m);
+        shift1     = _mm_setr_epi32(s1, 0, 0, 0);
+        shift2     = _mm_setr_epi32(s2, 0, 0, 0);
+    }
+    // Set or change divisor, calculate parameters
+    void set(uint16_t d) {
+        uint16_t log2ceil, pow2, count1, count2, mult;
+        if (d == 0) {
+            mult = count1 = count2 = 1 / d;                // provoke error for d = 0
+        }
+        else if (d == 1) {
+            mult = 1;  count1 = count2 = 0;                // parameters for d = 1
+        }
+        else if (d == 2) {
+            mult = 1;  count1 = 1;  count2 = 0;            // parameters for d = 2
+        }
+        else {                                             // general case for d > 2
+            log2ceil = (uint16_t)bit_scan_reverse(d-1)+1;  // ceil(log2(d))
+            pow2     = uint16_t(1 << log2ceil);            // 2^L, overflow to 0 if L = 16
+            mult     = 1 + uint16_t((uint32_t(pow2 - d) << 16) / d); // multiplier
+            count1   = 1;                                  // shift counts
+            count2   = log2ceil - 1;
+        }
+        multiplier = _mm_set1_epi16(mult);
+        shift1     = _mm_setr_epi32(count1, 0, 0, 0);
+        shift2     = _mm_setr_epi32(count2, 0, 0, 0);
+    }
+    // get multiplier
+    __m128i getm() const {
+        return multiplier;
+    }
+    // get shift count 1
+    __m128i gets1() const {
+        return shift1;
+    }
+    // get shift count 2
+    __m128i gets2() const {
+        return shift2;
+    }
+};
+
+
+// vector operator / : divide each element by divisor
+
+// vector of 4 32-bit signed integers
+// Divides each element of a by the divisor encapsulated in d, using the
+// precalculated multiplier, shift count and sign from Divisor_i.
+// One of three implementations is selected at compile time:
+//   1. XOP instruction set
+//   2. SSE4.1 (signed 32x32->64 bit multiplication)
+//   3. otherwise: unsigned multiplication followed by a conversion to signed
+// Paths 1 and 2 are both skipped when XOP_MUL_BUG is defined.
+// NOTE: XOP_MUL_BUG is #defined inside this function body and is not #undef'ed here.
+static inline Vec4i operator / (Vec4i const & a, Divisor_i const & d) {
+#if defined (__XOP__) && defined (GCC_VERSION) && GCC_VERSION <= 40702/*??*/ && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#define XOP_MUL_BUG                                       // GCC has bug in XOP multiply
+// Bug found in GCC version 4.7.0 and 4.7.1
+#endif
+// todo: test this when GCC bug is fixed
+#if defined (__XOP__) && !defined (XOP_MUL_BUG)
+    __m128i t1  = _mm_mul_epi32(a,d.getm());               // 32x32->64 bit signed multiplication of a[0] and a[2]
+    __m128i t2  = _mm_srli_epi64(t1,32);                   // high dword of result 0 and 2
+    __m128i t3  = _mm_macchi_epi32(a,d.getm(),_mm_setzero_si128());// 32x32->64 bit signed multiplication of a[1] and a[3]
+    __m128i t5  = _mm_set_epi32(-1,0,-1,0);                // mask of dword 1 and 3
+    __m128i t7  = _mm_blendv_epi8(t2,t3,t5);               // blend two results
+    __m128i t8  = _mm_add_epi32(t7,a);                     // add
+    __m128i t9  = _mm_sra_epi32(t8,d.gets1());             // shift right arithmetic
+    __m128i t10 = _mm_srai_epi32(a,31);                    // sign of a
+    __m128i t11 = _mm_sub_epi32(t10,d.getsign());          // sign of a - sign of d
+    __m128i t12 = _mm_sub_epi32(t9,t11);                   // + 1 if a < 0, -1 if d < 0
+    return        _mm_xor_si128(t12,d.getsign());          // change sign if divisor negative
+
+#elif INSTRSET >= 5 && !defined (XOP_MUL_BUG)  // SSE4.1 supported
+    __m128i t1  = _mm_mul_epi32(a,d.getm());               // 32x32->64 bit signed multiplication of a[0] and a[2]
+    __m128i t2  = _mm_srli_epi64(t1,32);                   // high dword of result 0 and 2
+    __m128i t3  = _mm_srli_epi64(a,32);                    // get a[1] and a[3] into position for multiplication
+    __m128i t4  = _mm_mul_epi32(t3,d.getm());              // 32x32->64 bit signed multiplication of a[1] and a[3]
+    __m128i t5  = _mm_set_epi32(-1,0,-1,0);                // mask of dword 1 and 3
+    __m128i t7  = _mm_blendv_epi8(t2,t4,t5);               // blend two results
+    __m128i t8  = _mm_add_epi32(t7,a);                     // add
+    __m128i t9  = _mm_sra_epi32(t8,d.gets1());             // shift right arithmetic
+    __m128i t10 = _mm_srai_epi32(a,31);                    // sign of a
+    __m128i t11 = _mm_sub_epi32(t10,d.getsign());          // sign of a - sign of d
+    __m128i t12 = _mm_sub_epi32(t9,t11);                   // + 1 if a < 0, -1 if d < 0
+    return        _mm_xor_si128(t12,d.getsign());          // change sign if divisor negative
+#else  // not SSE4.1
+    __m128i t1  = _mm_mul_epu32(a,d.getm());               // 32x32->64 bit unsigned multiplication of a[0] and a[2]
+    __m128i t2  = _mm_srli_epi64(t1,32);                   // high dword of result 0 and 2
+    __m128i t3  = _mm_srli_epi64(a,32);                    // get a[1] and a[3] into position for multiplication
+    __m128i t4  = _mm_mul_epu32(t3,d.getm());              // 32x32->64 bit unsigned multiplication of a[1] and a[3]
+    __m128i t5  = _mm_set_epi32(-1,0,-1,0);                // mask of dword 1 and 3
+    __m128i t6  = _mm_and_si128(t4,t5);                    // high dword of result 1 and 3
+    __m128i t7  = _mm_or_si128(t2,t6);                     // combine all four results of unsigned high mul into one vector
+    // convert unsigned to signed high multiplication (from: H S Warren: Hacker's delight, 2003, p. 132)
+    __m128i u1  = _mm_srai_epi32(a,31);                    // sign of a
+    __m128i u2  = _mm_srai_epi32(d.getm(),31);             // sign of m [ m is always negative, except for abs(d) = 1 ]
+    __m128i u3  = _mm_and_si128 (d.getm(),u1);             // m * sign of a
+    __m128i u4  = _mm_and_si128 (a,u2);                    // a * sign of m
+    __m128i u5  = _mm_add_epi32 (u3,u4);                   // sum of sign corrections
+    __m128i u6  = _mm_sub_epi32 (t7,u5);                   // high multiplication result converted to signed
+    __m128i t8  = _mm_add_epi32(u6,a);                     // add a
+    __m128i t9  = _mm_sra_epi32(t8,d.gets1());             // shift right arithmetic
+    __m128i t10 = _mm_sub_epi32(u1,d.getsign());           // sign of a - sign of d
+    __m128i t11 = _mm_sub_epi32(t9,t10);                   // + 1 if a < 0, -1 if d < 0
+    return        _mm_xor_si128(t11,d.getsign());          // change sign if divisor negative
+#endif
+}
+
+// Divide each element of a vector of 4 32-bit unsigned integers by the
+// divisor encapsulated in d
+static inline Vec4ui operator / (Vec4ui const & a, Divisor_ui const & d) {
+    // unsigned high multiplication, done separately for even and odd elements
+    __m128i prod_even = _mm_mul_epu32(a,d.getm());         // 32x32->64 bit unsigned products of a[0] and a[2]
+    __m128i high_even = _mm_srli_epi64(prod_even,32);      // high dwords of products 0 and 2
+    __m128i a_odd     = _mm_srli_epi64(a,32);              // move a[1] and a[3] into position for multiplication
+    __m128i prod_odd  = _mm_mul_epu32(a_odd,d.getm());     // 32x32->64 bit unsigned products of a[1] and a[3]
+    __m128i odd_mask  = _mm_set_epi32(-1,0,-1,0);          // selects dwords 1 and 3
+#if INSTRSET >= 5   // SSE4.1 supported
+    __m128i mulhigh   = _mm_blendv_epi8(high_even,prod_odd,odd_mask);  // blend the two results
+#else
+    __m128i high_odd  = _mm_and_si128(prod_odd,odd_mask);  // high dwords of products 1 and 3
+    __m128i mulhigh   = _mm_or_si128(high_even,high_odd);  // combine all four results into one vector
+#endif
+    __m128i diff      = _mm_sub_epi32(a,mulhigh);          // a - mulhigh
+    __m128i shifted   = _mm_srl_epi32(diff,d.gets1());     // logical shift right by shift count 1
+    __m128i sum       = _mm_add_epi32(mulhigh,shifted);    // mulhigh + shifted difference
+    return              _mm_srl_epi32(sum,d.gets2());      // logical shift right by shift count 2
+}
+
+// Divide each element of a vector of 8 16-bit signed integers by the
+// divisor encapsulated in d
+static inline Vec8s operator / (Vec8s const & a, Divisor_s const & d) {
+    __m128i mulhigh  = _mm_mulhi_epi16(a, d.getm());           // signed high multiplication by multiplier
+    __m128i sum      = _mm_add_epi16(mulhigh,a);               // + a
+    __m128i shifted  = _mm_sra_epi16(sum,d.gets1());           // arithmetic shift right by shift count
+    __m128i sign_a   = _mm_srai_epi16(a,15);                   // sign of a
+    __m128i signdiff = _mm_sub_epi16(sign_a,d.getsign());      // sign of a - sign of d
+    __m128i adjusted = _mm_sub_epi16(shifted,signdiff);        // + 1 if a < 0, -1 if d < 0
+    return             _mm_xor_si128(adjusted,d.getsign());    // change sign if divisor negative
+}
+
+// Divide each element of a vector of 8 16-bit unsigned integers by the
+// divisor encapsulated in d
+static inline Vec8us operator / (Vec8us const & a, Divisor_us const & d) {
+    __m128i mulhigh = _mm_mulhi_epu16(a, d.getm());        // unsigned high multiplication by multiplier
+    __m128i diff    = _mm_sub_epi16(a,mulhigh);            // a - mulhigh
+    __m128i shifted = _mm_srl_epi16(diff,d.gets1());       // logical shift right by shift count 1
+    __m128i sum     = _mm_add_epi16(mulhigh,shifted);      // mulhigh + shifted difference
+    return            _mm_srl_epi16(sum,d.gets2());        // logical shift right by shift count 2
+}
+
+ 
+// Divide each element of a vector of 16 8-bit signed integers by the
+// divisor encapsulated in d
+static inline Vec16c operator / (Vec16c const & a, Divisor_s const & d) {
+    // extend each half to a Vec8s, divide, and compress the quotients back to 8 bits
+    return compress(extend_low(a) / d, extend_high(a) / d);
+}
+
+// Divide each element of a vector of 16 8-bit unsigned integers by the
+// divisor encapsulated in d
+static inline Vec16uc operator / (Vec16uc const & a, Divisor_us const & d) {
+    // extend each half to a Vec8us, divide, and compress the quotients back to 8 bits
+    return compress(extend_low(a) / d, extend_high(a) / d);
+}
+
+// vector operator /= : divide Vec8s in place by a Divisor_s and return a reference to it
+static inline Vec8s & operator /= (Vec8s & a, Divisor_s const & d) {
+    return a = a / d;
+}
+
+// vector operator /= : divide Vec8us in place by a Divisor_us and return a reference to it
+static inline Vec8us & operator /= (Vec8us & a, Divisor_us const & d) {
+    return a = a / d;
+}
+
+// vector operator /= : divide Vec4i in place by a Divisor_i and return a reference to it
+static inline Vec4i & operator /= (Vec4i & a, Divisor_i const & d) {
+    return a = a / d;
+}
+
+// vector operator /= : divide Vec4ui in place by a Divisor_ui and return a reference to it
+static inline Vec4ui & operator /= (Vec4ui & a, Divisor_ui const & d) {
+    return a = a / d;
+}
+
+// vector operator /= : divide Vec16c in place by a Divisor_s and return a reference to it
+static inline Vec16c & operator /= (Vec16c & a, Divisor_s const & d) {
+    return a = a / d;
+}
+
+// vector operator /= : divide Vec16uc in place by a Divisor_us and return a reference to it
+static inline Vec16uc & operator /= (Vec16uc & a, Divisor_us const & d) {
+    return a = a / d;
+}
+
+/*****************************************************************************
+*
+*          Integer division 2: divisor is a compile-time constant
+*
+*****************************************************************************/
+
+// Divide Vec4i by compile-time constant
+// d is the divisor, given as a template parameter. d = 0 is rejected at compile time.
+// Special cases handled before the general multiply-and-shift method:
+//   d = 1, d = -1, d = 0x80000000, and abs(d) a power of 2 (done with shifts).
+template <int32_t d>
+static inline Vec4i divide_by_i(Vec4i const & x) {
+    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
+    if (d ==  1) return  x;
+    if (d == -1) return -x;
+    // d = 0x80000000: result is 1 where x equals 0x80000000, otherwise 0
+    if (uint32_t(d) == 0x80000000u) return Vec4i(x == Vec4i(0x80000000)) & 1; // prevent overflow when changing sign
+    const uint32_t d1 = d > 0 ? uint32_t(d) : uint32_t(-d);          // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
+    if ((d1 & (d1-1)) == 0) {
+        // d1 is a power of 2. use shift
+        const int k = bit_scan_reverse_const(d1);
+        __m128i sign;
+        if (k > 1) sign = _mm_srai_epi32(x, k-1); else sign = x;     // k copies of sign bit
+        __m128i bias    = _mm_srli_epi32(sign, 32-k);                // bias = x >= 0 ? 0 : 2^k-1  (the k sign-bit copies moved to the low k bits)
+        __m128i xpbias  = _mm_add_epi32 (x, bias);                   // x + bias
+        __m128i q       = _mm_srai_epi32(xpbias, k);                 // (x + bias) >> k
+        if (d > 0)      return q;                                    // d > 0: return  q
+        return _mm_sub_epi32(_mm_setzero_si128(), q);                // d < 0: return -q
+    }
+    // general case
+    const int32_t sh = bit_scan_reverse_const(uint32_t(d1)-1);            // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case)
+    const int32_t mult = int(1 + (uint64_t(1) << (32+sh)) / uint32_t(d1) - (int64_t(1) << 32));   // multiplier
+    const Divisor_i div(mult, sh, d < 0 ? -1 : 0);                   // precalculated multiplier, shift count and sign of d
+    return x / div;                                                  // use the operator / that takes a Divisor_i
+}
+
+// Vec4i divided by a signed compile-time constant: a / const_int(d)
+template <int32_t d>
+static inline Vec4i operator / (Vec4i const & a, Const_int_t<d>) {
+    const Vec4i quotient = divide_by_i<d>(a);
+    return quotient;
+}
+
+// Vec4i divided by an unsigned compile-time constant: a / const_uint(d)
+template <uint32_t d>
+static inline Vec4i operator / (Vec4i const & a, Const_uint_t<d>) {
+    // Error: dividing signed by overflowing unsigned
+    Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned;
+    // d fits in a signed 32-bit integer: do a signed divide
+    const Vec4i quotient = divide_by_i<int32_t(d)>(a);
+    return quotient;
+}
+
+// vector operator /= : divide Vec4i in place by const_int(d) and return a reference to it
+template <int32_t d>
+static inline Vec4i & operator /= (Vec4i & a, Const_int_t<d> b) {
+    return a = a / b;
+}
+
+// vector operator /= : divide Vec4i in place by const_uint(d) and return a reference to it
+template <uint32_t d>
+static inline Vec4i & operator /= (Vec4i & a, Const_uint_t<d> b) {
+    return a = a / b;
+}
+
+
+// Divide Vec4ui by compile-time constant
+// d is the divisor, given as a template parameter. d = 0 is rejected at compile time.
+// d = 1 returns x unchanged; a power of 2 is done with a single shift.
+// Otherwise x is multiplied by a rounded multiplier 2^(32+b) / d and the
+// high part of the product is shifted right by b, where b = floor(log2(d)).
+template <uint32_t d>
+static inline Vec4ui divide_by_ui(Vec4ui const & x) {
+    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
+    if (d == 1) return x;                                            // divide by 1
+    const int b = bit_scan_reverse_const(d);                         // floor(log2(d))
+    if ((uint32_t(d) & (uint32_t(d)-1)) == 0) {
+        // d is a power of 2. use shift
+        return    _mm_srli_epi32(x, b);                              // x >> b
+    }
+    // general case (d > 2)
+    uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d);           // multiplier = 2^(32+b) / d
+    const uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d
+    const bool round_down = (2*rem < d);                             // check if fraction is less than 0.5
+    if (!round_down) {
+        mult = mult + 1;                                             // round up mult
+    }
+    // do 32*32->64 bit unsigned multiplication and get high part of result
+    const __m128i multv = _mm_set_epi32(0,mult,0,mult);              // zero-extend mult and broadcast
+    __m128i t1  = _mm_mul_epu32(x,multv);                            // 32x32->64 bit unsigned multiplication of x[0] and x[2]
+    if (round_down) {
+        t1      = _mm_add_epi64(t1,multv);                           // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+    __m128i t2  = _mm_srli_epi64(t1,32);                             // high dword of result 0 and 2
+    __m128i t3  = _mm_srli_epi64(x,32);                              // get x[1] and x[3] into position for multiplication
+    __m128i t4  = _mm_mul_epu32(t3,multv);                           // 32x32->64 bit unsigned multiplication of x[1] and x[3]
+    if (round_down) {
+        t4      = _mm_add_epi64(t4,multv);                           // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+    __m128i t5  = _mm_set_epi32(-1,0,-1,0);                          // mask of dword 1 and 3
+#if INSTRSET >= 5   // SSE4.1 supported
+    __m128i t7  = _mm_blendv_epi8(t2,t4,t5);                         // blend two results
+#else
+    __m128i t6  = _mm_and_si128(t4,t5);                              // high dword of result 1 and 3
+    __m128i t7  = _mm_or_si128(t2,t6);                               // combine all four results into one vector
+#endif
+    Vec4ui q    = _mm_srli_epi32(t7, b);                             // shift right by b
+    return q;                                                    // no overflow possible
+}
+
+// Vec4ui divided by an unsigned compile-time constant: a / const_uint(d)
+template <uint32_t d>
+static inline Vec4ui operator / (Vec4ui const & a, Const_uint_t<d>) {
+    const Vec4ui quotient = divide_by_ui<d>(a);
+    return quotient;
+}
+
+// Vec4ui divided by a signed compile-time constant: a / const_int(d)
+template <int32_t d>
+static inline Vec4ui operator / (Vec4ui const & a, Const_int_t<d>) {
+    // Error: dividing unsigned by negative is ambiguous
+    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;
+    // d is not negative: do an unsigned divide
+    const Vec4ui quotient = divide_by_ui<d>(a);
+    return quotient;
+}
+
+// vector operator /= : divide Vec4ui in place by const_uint(d) and return a reference to it
+template <uint32_t d>
+static inline Vec4ui & operator /= (Vec4ui & a, Const_uint_t<d> b) {
+    return a = a / b;
+}
+
+// vector operator /= : divide Vec4ui in place by const_int(d) and return a reference to it
+template <int32_t d>
+static inline Vec4ui & operator /= (Vec4ui & a, Const_int_t<d> b) {
+    return a = a / b;
+}
+
+
+// Divide Vec8s by compile-time constant
+// d is the divisor, given as a template parameter; it is truncated to 16 bits.
+// A truncated value of 0 is rejected at compile time.
+// Special cases handled before the general multiply-and-shift method:
+//   d0 = 1, d0 = -1, d0 = 0x8000, and abs(d0) a power of 2 (done with shifts).
+template <int d>
+static inline Vec8s divide_by_i(Vec8s const & x) {
+    const int16_t d0 = int16_t(d);                                   // truncate d to 16 bits
+    Static_error_check<(d0 != 0)> Dividing_by_zero;                  // Error message if dividing by zero
+    if (d0 ==  1) return  x;                                         // divide by  1
+    if (d0 == -1) return -x;                                         // divide by -1
+    // d0 = 0x8000: result is 1 where x equals 0x8000, otherwise 0
+    if (uint16_t(d0) == 0x8000u) return (x == Vec8s(0x8000)) & 1;    // prevent overflow when changing sign
+    // if (d > 0x7FFF || d < -0x8000) return 0;                      // not relevant when d truncated to 16 bits
+    const uint16_t d1 = d0 > 0 ? d0 : -d0;                           // compile-time abs(d0)
+    if ((d1 & (d1-1)) == 0) {
+        // d is a power of 2. use shift
+        const int k = bit_scan_reverse_const(uint32_t(d1));
+        __m128i sign;
+        if (k > 1) sign = _mm_srai_epi16(x, k-1); else sign = x;     // k copies of sign bit
+        __m128i bias    = _mm_srli_epi16(sign, 16-k);                // bias = x >= 0 ? 0 : 2^k-1  (the k sign-bit copies moved to the low k bits)
+        __m128i xpbias  = _mm_add_epi16 (x, bias);                   // x + bias
+        __m128i q       = _mm_srai_epi16(xpbias, k);                 // (x + bias) >> k
+        if (d0 > 0)  return q;                                       // d0 > 0: return  q
+        return _mm_sub_epi16(_mm_setzero_si128(), q);                // d0 < 0: return -q
+    }
+    // general case
+    const int L = bit_scan_reverse_const(uint16_t(d1-1)) + 1;        // ceil(log2(d)). (d < 2 handled above)
+    const int16_t mult = int16_t(1 + (1u << (15+L)) / uint32_t(d1) - 0x10000);// multiplier
+    const int shift1 = L - 1;
+    const Divisor_s div(mult, shift1, d0 > 0 ? 0 : -1);              // precalculated multiplier, shift count and sign of d0
+    return x / div;                                                  // use the operator / that takes a Divisor_s
+}
+
+// Vec8s divided by a signed compile-time constant: a / const_int(d)
+template <int d>
+static inline Vec8s operator / (Vec8s const & a, Const_int_t<d>) {
+    const Vec8s quotient = divide_by_i<d>(a);
+    return quotient;
+}
+
+// Vec8s divided by an unsigned compile-time constant: a / const_uint(d)
+template <uint32_t d>
+static inline Vec8s operator / (Vec8s const & a, Const_uint_t<d>) {
+    // Error: dividing signed by overflowing unsigned
+    Static_error_check< (d<0x8000u) > Error_overflow_dividing_signed_by_unsigned;
+    // d fits in a signed 16-bit integer: do a signed divide
+    const Vec8s quotient = divide_by_i<int(d)>(a);
+    return quotient;
+}
+
+// vector operator /= : divide Vec8s in place by const_int(d) and return a reference to it
+template <int32_t d>
+static inline Vec8s & operator /= (Vec8s & a, Const_int_t<d> b) {
+    return a = a / b;
+}
+
+// vector operator /= : divide Vec8s in place by const_uint(d) and return a reference to it
+template <uint32_t d>
+static inline Vec8s & operator /= (Vec8s & a, Const_uint_t<d> b) {
+    return a = a / b;
+}
+
+
+// Divide Vec8us by compile-time constant
+// d is the divisor, given as a template parameter; it is truncated to 16 bits.
+// A truncated value of 0 is rejected at compile time.
+// d0 = 1 returns x unchanged; a power of 2 is done with a single shift.
+// Otherwise x is multiplied by a rounded multiplier 2^(16+b) / d0 and the
+// high part of the product is shifted right by b, where b = floor(log2(d0)).
+template <uint32_t d>
+static inline Vec8us divide_by_ui(Vec8us const & x) {
+    const uint16_t d0 = uint16_t(d);                                 // truncate d to 16 bits
+    Static_error_check<(d0 != 0)> Dividing_by_zero;                  // Error message if dividing by zero
+    if (d0 == 1) return x;                                           // divide by 1
+    const int b = bit_scan_reverse_const(d0);                        // floor(log2(d))
+    if ((d0 & (d0-1)) == 0) {
+        // d is a power of 2. use shift
+        return  _mm_srli_epi16(x, b);                                // x >> b
+    }
+    // general case (d > 2)
+    uint16_t mult = uint16_t((uint32_t(1) << (b+16)) / d0);          // multiplier = 2^(16+b) / d
+    const uint32_t rem = (uint32_t(1) << (b+16)) - uint32_t(d0)*mult;// remainder 2^(16+b) % d
+    const bool round_down = (2*rem < d0);                            // check if fraction is less than 0.5
+    Vec8us x1 = x;
+    if (round_down) {
+        x1 = x1 + 1;                                                 // round down mult and compensate by adding 1 to x
+    }
+    else {
+        mult = mult + 1;                                             // round up mult. no compensation needed
+    }
+    const __m128i multv = _mm_set1_epi16(mult);                      // broadcast mult
+    __m128i xm = _mm_mulhi_epu16(x1, multv);                         // high part of 16x16->32 bit unsigned multiplication
+    Vec8us q    = _mm_srli_epi16(xm, b);                             // shift right by b
+    if (round_down) {
+        // x1 is zero only where x + 1 wrapped around; those elements get mult >> b instead of q
+        Vec8s overfl = (x1 == (Vec8us)_mm_setzero_si128());                  // check for overflow of x+1
+        return select(overfl, Vec8us(mult >> b), q);                 // deal with overflow (rarely needed)
+    }
+    else {
+        return q;                                                    // no overflow possible
+    }
+}
+
+// Vec8us divided by an unsigned compile-time constant: a / const_uint(d)
+template <uint32_t d>
+static inline Vec8us operator / (Vec8us const & a, Const_uint_t<d>) {
+    const Vec8us quotient = divide_by_ui<d>(a);
+    return quotient;
+}
+
+// Vec8us divided by a signed compile-time constant: a / const_int(d)
+template <int d>
+static inline Vec8us operator / (Vec8us const & a, Const_int_t<d>) {
+    // Error: dividing unsigned by negative is ambiguous
+    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;
+    // d is not negative: do an unsigned divide
+    const Vec8us quotient = divide_by_ui<d>(a);
+    return quotient;
+}
+
+// vector operator /= : divide Vec8us in place by const_uint(d) and return a reference to it
+template <uint32_t d>
+static inline Vec8us & operator /= (Vec8us & a, Const_uint_t<d> b) {
+    return a = a / b;
+}
+
+// vector operator /= : divide Vec8us in place by const_int(d) and return a reference to it
+template <int32_t d>
+static inline Vec8us & operator /= (Vec8us & a, Const_int_t<d> b) {
+    return a = a / b;
+}
+
+
+// Vec16c divided by a signed compile-time constant: a / const_int(d)
+template <int d>
+static inline Vec16c operator / (Vec16c const & a, Const_int_t<d>) {
+    // extend each half to a Vec8s, divide, and compress the quotients back to 8 bits
+    return compress(extend_low(a) / Const_int_t<d>(), extend_high(a) / Const_int_t<d>());
+}
+
+// Vec16c divided by an unsigned compile-time constant: a / const_uint(d)
+template <uint32_t d>
+static inline Vec16c operator / (Vec16c const & a, Const_uint_t<d>) {
+    // Error: dividing signed by overflowing unsigned
+    Static_error_check< (uint8_t(d)<0x80u) > Error_overflow_dividing_signed_by_unsigned;
+    // do a signed divide by the same constant
+    Const_int_t<d> signed_divisor = Const_int_t<d>();
+    return a / signed_divisor;
+}
+
+// vector operator /= : divide Vec16c in place by const_int(d) and return a reference to it
+template <int32_t d>
+static inline Vec16c & operator /= (Vec16c & a, Const_int_t<d> b) {
+    return a = a / b;
+}
+// vector operator /= : divide Vec16c in place by const_uint(d) and return a reference to it
+template <uint32_t d>
+static inline Vec16c & operator /= (Vec16c & a, Const_uint_t<d> b) {
+    return a = a / b;
+}
+
+// Vec16uc divided by an unsigned compile-time constant: a / const_uint(d)
+template <uint32_t d>
+static inline Vec16uc operator / (Vec16uc const & a, Const_uint_t<d>) {
+    // extend each half to a Vec8us, divide, and compress the quotients back to 8 bits
+    return compress(extend_low(a) / Const_uint_t<d>(), extend_high(a) / Const_uint_t<d>());
+}
+
+// Vec16uc divided by a signed compile-time constant: a / const_int(d)
+template <int d>
+static inline Vec16uc operator / (Vec16uc const & a, Const_int_t<d>) {
+    // Error: dividing unsigned by negative is ambiguous
+    Static_error_check< (int8_t(d)>=0) > Error_dividing_unsigned_by_negative;
+    // do an unsigned divide by the same constant
+    Const_uint_t<d> unsigned_divisor = Const_uint_t<d>();
+    return a / unsigned_divisor;
+}
+
+// vector operator /= : divide Vec16uc in place by const_uint(d) and return a reference to it
+template <uint32_t d>
+static inline Vec16uc & operator /= (Vec16uc & a, Const_uint_t<d> b) {
+    return a = a / b;
+}
+
+// vector operator /= : divide Vec16uc in place by const_int(d) and return a reference to it
+template <int32_t d>
+static inline Vec16uc & operator /= (Vec16uc & a, Const_int_t<d> b) {
+    return a = a / b;
+}
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Return the index of the first element of x that is true, or -1 if all are false
+static inline int horizontal_find_first(Vec16cb const & x) {
+    const uint32_t mask = _mm_movemask_epi8(x);          // one bit per byte element
+    if (mask != 0) {
+        const int32_t first = bit_scan_forward(mask);    // position of lowest set bit
+        return first;
+    }
+    return -1;                                           // no element is true
+}
+
+static inline int horizontal_find_first(Vec8sb const & x) {
+    const int byte_index = horizontal_find_first(Vec16cb(x));  // index counted in bytes, or -1
+    return byte_index >> 1;                                    // 2 bytes per element. must use signed shift
+}
+
+static inline int horizontal_find_first(Vec4ib const & x) {
+    const int byte_index = horizontal_find_first(Vec16cb(x));  // index counted in bytes, or -1
+    return byte_index >> 2;                                    // 4 bytes per element. must use signed shift
+}
+
+static inline int horizontal_find_first(Vec2qb const & x) {
+    const int byte_index = horizontal_find_first(Vec16cb(x));  // index counted in bytes, or -1
+    return byte_index >> 3;                                    // 8 bytes per element. must use signed shift
+}
+
+// Return the number of elements of x that are true
+static inline uint32_t horizontal_count(Vec16cb const & x) {
+    // one mask bit per byte element; count the bits that are set
+    return vml_popcnt(uint32_t(_mm_movemask_epi8(x)));
+}
+
+static inline uint32_t horizontal_count(Vec8sb const & x) {
+    const uint32_t true_bytes = horizontal_count(Vec16cb(x));  // count in bytes
+    return true_bytes >> 1;                                    // 2 bytes per element
+}
+
+static inline uint32_t horizontal_count(Vec4ib const & x) {
+    const uint32_t true_bytes = horizontal_count(Vec16cb(x));  // count in bytes
+    return true_bytes >> 2;                                    // 4 bytes per element
+}
+
+static inline uint32_t horizontal_count(Vec2qb const & x) {
+    const uint32_t true_bytes = horizontal_count(Vec16cb(x));  // count in bytes
+    return true_bytes >> 3;                                    // 8 bytes per element
+}
+
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: pack a Vec16cb boolean vector into a 16-bit integer bitfield
+static inline uint16_t to_bits(Vec16cb const & x) {
+    const int mask = _mm_movemask_epi8(x);               // one bit per byte element
+    return (uint16_t)mask;
+}
+
+// to_Vec16cb: expand a 16-bit integer bitfield into a Vec16cb boolean vector
+static inline Vec16cb to_Vec16cb(uint16_t x) {
+    // lookup-table: each of the 4 index bits becomes one byte of 0x00 or 0xFF
+    static const uint32_t table[16] = {
+        0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 
+        0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 
+        0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 
+        0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF}; 
+    uint32_t dwords[4];
+    for (int i = 0; i < 4; i++) {
+        dwords[i] = table[(x >> (4*i)) & 0xF];           // translate one 4-bit group at a time
+    }
+    return Vec16cb(Vec16c(Vec4ui(dwords[0], dwords[1], dwords[2], dwords[3])));
+}
+
+// to_bits: pack a Vec8sb boolean vector into an 8-bit integer bitfield
+static inline uint8_t to_bits(Vec8sb const & x) {
+    const __m128i bytes = _mm_packs_epi16(x, x);         // 16-bit words to bytes
+    const int     mask  = _mm_movemask_epi8(bytes);      // one bit per byte
+    return (uint8_t)mask;
+}
+
+// to_Vec8sb: expand an 8-bit integer bitfield into a Vec8sb boolean vector
+static inline Vec8sb to_Vec8sb(uint8_t x) {
+    // lookup-table: each of the 4 index bits becomes one byte of 0x00 or 0xFF
+    static const uint32_t table[16] = {
+        0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 
+        0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 
+        0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 
+        0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF}; 
+    const uint32_t low_nibble  = table[x       & 0xF];
+    const uint32_t high_nibble = table[(x>>4)  & 0xF];
+    Vec4ui bytes = Vec4ui(low_nibble, high_nibble, low_nibble, high_nibble);
+    return _mm_unpacklo_epi8(bytes, bytes);              // duplicate bytes to 16-bit words
+}
+
+#if INSTRSET < 9 || MAX_VECTOR_SIZE < 512
+// These functions are defined in Vectori512.h if AVX512 instruction set is used
+
+// to_bits: pack a Vec4ib boolean vector into an integer bitfield (low 4 bits)
+static inline uint8_t to_bits(Vec4ib const & x) {
+    const __m128i words = _mm_packs_epi32(x, x);             // 32-bit dwords to 16-bit words
+    const __m128i bytes = _mm_packs_epi16(words, words);     // 16-bit words to bytes
+    const int     mask  = _mm_movemask_epi8(bytes);          // one bit per byte
+    return mask & 0xF;                                       // keep one bit for each of the 4 elements
+}
+
+// to_Vec4ib: expand the low 4 bits of an integer bitfield into a Vec4ib boolean vector
+static inline Vec4ib to_Vec4ib(uint8_t x) {
+    // lookup-table: each of the 4 index bits becomes one byte of 0x00 or 0xFF
+    static const uint32_t table[16] = {
+        0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 
+        0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 
+        0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 
+        0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF}; 
+    const uint32_t four_bytes = table[x & 0xF];                      // 4 bytes
+    __m128i bytes  = _mm_cvtsi32_si128(four_bytes);                  // transfer to vector register
+    __m128i words  = _mm_unpacklo_epi8(bytes, bytes);                // duplicate bytes to 16-bit words
+    __m128i dwords = _mm_unpacklo_epi16(words, words);               // duplicate 16-bit words to 32-bit dwords
+    return dwords;
+}
+
+// to_bits: pack a Vec2qb boolean vector into an integer bitfield (low 2 bits)
+static inline uint8_t to_bits(Vec2qb const & x) {
+    const uint32_t mask = _mm_movemask_epi8(x);          // one bit per byte
+    const uint32_t bit0 = mask & 1;                      // byte 0 belongs to element 0
+    const uint32_t bit1 = (mask >> 7) & 2;               // byte 8 belongs to element 1
+    return bit0 | bit1;
+}
+
+// to_Vec2qb: expand the low 2 bits of an integer bitfield into a Vec2qb boolean vector
+static inline Vec2qb to_Vec2qb(uint8_t x) {
+    const int element0 = -(x&1);                         // 0 or -1 from bit 0
+    const int element1 = -((x>>1)&1);                    // 0 or -1 from bit 1
+    return Vec2qb(Vec2q(element0, element1));
+}
+
+#else  // function prototypes here only
+
+// Prototypes only: this #else branch is compiled when INSTRSET >= 9 and
+// MAX_VECTOR_SIZE >= 512. Per the comment at the top of this #if block, the
+// definitions are then supplied by Vectori512.h.
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4ib x);
+
+// to_Vec4ib: convert integer bitfield to boolean vector
+static inline Vec4ib to_Vec4ib(uint8_t x);
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec2qb x);
+
+// to_Vec2qb: convert integer bitfield to boolean vector
+static inline Vec2qb to_Vec2qb(uint8_t x);
+
+#endif  // INSTRSET < 9 || MAX_VECTOR_SIZE < 512
+
+#endif // VECTORI128_H
diff --git a/vectorclass/vectori256.h b/vectorclass/vectori256.h
new file mode 100755
index 0000000..be0c869
--- /dev/null
+++ b/vectorclass/vectori256.h
@@ -0,0 +1,5591 @@
+/****************************  vectori256.h   *******************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2014-10-16
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining integer vector classes as interface to intrinsic 
+* functions in x86 microprocessors with AVX2 and later instruction sets.
+*
+* Instructions:
+* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired 
+* instruction set, which must be at least AVX2. 
+*
+* The following vector classes are defined here:
+* Vec256b   Vector of 256  1-bit unsigned  integers or Booleans
+* Vec32c    Vector of  32  8-bit signed    integers
+* Vec32uc   Vector of  32  8-bit unsigned  integers
+* Vec32cb   Vector of  32  Booleans for use with Vec32c and Vec32uc
+* Vec16s    Vector of  16  16-bit signed   integers
+* Vec16us   Vector of  16  16-bit unsigned integers
+* Vec16sb   Vector of  16  Booleans for use with Vec16s and Vec16us
+* Vec8i     Vector of   8  32-bit signed   integers
+* Vec8ui    Vector of   8  32-bit unsigned integers
+* Vec8ib    Vector of   8  Booleans for use with Vec8i and Vec8ui
+* Vec4q     Vector of   4  64-bit signed   integers
+* Vec4uq    Vector of   4  64-bit unsigned integers
+* Vec4qb    Vector of   4  Booleans for use with Vec4q and Vec4uq
+*
+* Each vector object is represented internally in the CPU as a 256-bit register.
+* This header file defines operators and functions for these vectors.
+*
+* For example:
+* Vec8i a(1,2,3,4,5,6,7,8), b(9,10,11,12,13,14,15,16), c;
+* c = a + b;     // now c contains (10,12,14,16,18,20,22,24)
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2012 - 2013 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+
+// check combination of header files
+#if defined (VECTORI256_H)
+#if    VECTORI256_H != 2
+#error Two different versions of vectori256.h included
+#endif
+#else
+#define VECTORI256_H  2
+
+#ifdef VECTORF256_H
+#error Please put header file vectori256.h before vectorf256.h
+#endif
+
+
+#if INSTRSET < 8   // AVX2 required
+#error Wrong instruction set for vectori256.h, AVX2 required or use vectori256e.h
+#endif
+
+#include "vectori128.h"
+
+
+/*****************************************************************************
+*
+*         Join two 128-bit vectors
+*
+*****************************************************************************/
+#define set_m128ir(lo,hi) _mm256_inserti128_si256(_mm256_castsi128_si256(lo),(hi),1)
+
+
+/*****************************************************************************
+*
+*          Vector of 256 1-bit unsigned integers or Booleans
+*
+*****************************************************************************/
+class Vec256b {
+protected:
+    __m256i ymm; // Integer vector: the 256 bits held by this object
+public:
+    // Default constructor (leaves ymm uninitialized):
+    Vec256b() {
+    };
+    // Constructor to broadcast the same value into all elements
+    // Removed because of undesired implicit conversions
+    //Vec256b(int i) {
+    //    ymm = _mm256_set1_epi32(-(i & 1));}
+
+    // Constructor to build from two Vec128b (a0 = low half, a1 = high half):
+    Vec256b(Vec128b const & a0, Vec128b const & a1) {
+        ymm = set_m128ir(a0, a1);
+    }
+    // Constructor to convert from type __m256i used in intrinsics:
+    Vec256b(__m256i const & x) {
+        ymm = x;
+    };
+    // Assignment operator to convert from type __m256i used in intrinsics:
+    Vec256b & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    };
+    // Type cast operator to convert to __m256i used in intrinsics
+    operator __m256i() const {
+        return ymm;
+    }
+    // Member function to load from array (unaligned)
+    Vec256b & load(void const * p) {
+        ymm = _mm256_loadu_si256((__m256i const*)p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 32, but there is hardly any speed advantage of load_a on modern processors
+    Vec256b & load_a(void const * p) {
+        ymm = _mm256_load_si256((__m256i const*)p);
+        return *this;
+    }
+    // Member function to store into array (unaligned)
+    void store(void * p) const {
+        _mm256_storeu_si256((__m256i*)p, ymm);
+    }
+    // Member function to store into array, aligned by 32
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 32, but there is hardly any speed advantage of store_a on modern processors
+    void store_a(void * p) const {
+        _mm256_store_si256((__m256i*)p, ymm);
+    }
+    // Member function to change a single bit (index is taken modulo 256; only bit 0 of value is used)
+    // Note: This function is inefficient. Use load function if changing more than one bit
+    Vec256b const & set_bit(uint32_t index, int value) {
+        static const uint64_t m[8] = {0,0,0,0,1,0,0,0}; // read-only window source: a single 1 at qword 4
+        int wi = (index >> 6) & 3;               // qword index
+        int bi = index & 0x3F;                   // bit index within qword wi
+
+        __m256i mask = Vec256b().load(m+4-wi);   // 1 in qword number wi (window starts wi qwords before m[4])
+        mask = _mm256_sll_epi64(mask,_mm_cvtsi32_si128(bi)); // mask with bit number bi set
+        if (value & 1) {
+            ymm = _mm256_or_si256(mask,ymm);     // set the selected bit
+        }
+        else {
+            ymm = _mm256_andnot_si256(mask,ymm); // clear the selected bit
+        }
+        return *this;
+    }
+    // Member function to get a single bit (index is taken modulo 256)
+    // Note: This function is inefficient. Use store function if reading more than one bit
+    int get_bit(uint32_t index) const {
+        union {
+            __m256i x;
+            uint8_t i[32];
+        } u;
+        u.x = ymm;
+        int wi = (index >> 3) & 0x1F;            // byte index
+        int bi = index & 7;                      // bit index within byte wi
+        return (u.i[wi] >> bi) & 1;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return get_bit(index) != 0;
+    }
+    // Member functions to split into two Vec128b:
+    Vec128b get_low() const {
+        return _mm256_castsi256_si128(ymm);      // low 128 bits
+    }
+    Vec128b get_high() const {
+        return _mm256_extractf128_si256(ymm,1);  // high 128 bits
+    }
+    static int size() {
+        return 256;                              // number of 1-bit elements
+    }
+};
+
+
+// Define operators for this class
+
+// vector operator & : bitwise and
+static inline Vec256b operator & (Vec256b const & a, Vec256b const & b) {
+    return _mm256_and_si256(a, b);
+}
+static inline Vec256b operator && (Vec256b const & a, Vec256b const & b) {
+    return a & b;
+}
+
+// vector operator | : bitwise or
+static inline Vec256b operator | (Vec256b const & a, Vec256b const & b) {
+    return _mm256_or_si256(a, b);
+}
+static inline Vec256b operator || (Vec256b const & a, Vec256b const & b) {
+    return a | b;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec256b operator ^ (Vec256b const & a, Vec256b const & b) {
+    return _mm256_xor_si256(a, b);
+}
+
+// vector operator ~ : bitwise not
+static inline Vec256b operator ~ (Vec256b const & a) {
+    return _mm256_xor_si256(a, _mm256_set1_epi32(-1));
+}
+
+// vector operator &= : bitwise and
+static inline Vec256b & operator &= (Vec256b & a, Vec256b const & b) {
+    a = a & b;
+    return a;
+}
+
+// vector operator |= : bitwise or
+static inline Vec256b & operator |= (Vec256b & a, Vec256b const & b) {
+    a = a | b;
+    return a;
+}
+
+// vector operator ^= : bitwise xor
+static inline Vec256b & operator ^= (Vec256b & a, Vec256b const & b) {
+    a = a ^ b;
+    return a;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b
+static inline Vec256b andnot (Vec256b const & a, Vec256b const & b) {
+    return _mm256_andnot_si256(b, a);
+}
+
+
+/*****************************************************************************
+*
+*          Generate compile-time constant vector
+*
+*****************************************************************************/
+// Generate a constant vector of 8 integers stored in memory.
+// Can be converted to any integer vector type
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline __m256i constant8i() {
+    static const union {
+        int32_t i[8];
+        __m256i ymm;
+    } u = {{i0,i1,i2,i3,i4,i5,i6,i7}};
+    return u.ymm;
+}
+
+
+/*****************************************************************************
+*
+*          selectb function
+*
+*****************************************************************************/
+// Select between two sources, byte by byte. Used in various functions and operators
+// Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed.
+// Only bit 7 in each byte of s is checked, 
+static inline __m256i selectb (__m256i const & s, __m256i const & a, __m256i const & b) {
+    return _mm256_blendv_epi8 (b, a, s);
+}
+
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and (Vec256b const & a) {
+    return _mm256_testc_si256(a,constant8i<-1,-1,-1,-1,-1,-1,-1,-1>()) != 0;
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or (Vec256b const & a) {
+    return ! _mm256_testz_si256(a,a);
+}
+
+
+
+/*****************************************************************************
+*
+*          Vector of 32 8-bit signed integers
+*
+*****************************************************************************/
+
+class Vec32c : public Vec256b {
+public:
+    // Default constructor (leaves the vector uninitialized):
+    Vec32c(){
+    };
+    // Constructor to broadcast the same value into all elements (i is truncated to 8 bits):
+    Vec32c(int i) {
+        ymm = _mm256_set1_epi8((char)i);
+    };
+    // Constructor to build from all elements (i0 is element 0, i31 is element 31):
+    Vec32c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7,
+        int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15,        
+        int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, int8_t i23,
+        int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, int8_t i31) {
+        ymm = _mm256_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15,
+            i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31);
+    };
+    // Constructor to build from two Vec16c (a0 = low half, a1 = high half):
+    Vec32c(Vec16c const & a0, Vec16c const & a1) {
+        ymm = set_m128ir(a0, a1);
+    }
+    // Constructor to convert from type __m256i used in intrinsics:
+    Vec32c(__m256i const & x) {
+        ymm = x;
+    };
+    // Assignment operator to convert from type __m256i used in intrinsics:
+    Vec32c & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    };
+    // Type cast operator to convert to __m256i used in intrinsics
+    operator __m256i() const {
+        return ymm;
+    }
+    // Member function to load from array (unaligned)
+    Vec32c & load(void const * p) {
+        ymm = _mm256_loadu_si256((__m256i const*)p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    Vec32c & load_a(void const * p) {
+        ymm = _mm256_load_si256((__m256i const*)p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    Vec32c & load_partial(int n, void const * p) {
+        if (n <= 0) {
+            *this = 0;                           // nothing to load: all elements zero
+        }
+        else if (n <= 16) {
+            *this = Vec32c(Vec16c().load_partial(n, p), 0);  // low half partially loaded, high half zero
+        }
+        else if (n < 32) {
+            *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n-16, (char*)p+16));  // full low half, partial high half
+        }
+        else {
+            load(p);                             // n >= 32: load the whole vector
+        }
+        return *this;
+    }
+    // Partial store. Store n elements
+    void store_partial(int n, void * p) const {
+        if (n <= 0) {
+            return;                              // nothing to store
+        }
+        else if (n <= 16) {
+            get_low().store_partial(n, p);       // only part of the low half
+        }
+        else if (n < 32) {
+            get_low().store(p);                  // full low half, then part of the high half
+            get_high().store_partial(n-16, (char*)p+16);
+        }
+        else {
+            store(p);                            // n >= 32: store the whole vector
+        }
+    }
+    // Cut off vector to n elements. The last 32-n elements are set to zero; n outside 0..31 leaves the vector unchanged
+    Vec32c & cutoff(int n) {
+        if (uint32_t(n) >= 32) return *this;     // the unsigned compare also catches negative n
+        static const union {
+            int32_t i[16];
+            char    c[64];
+        } mask = {{-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0}};  // 32 bytes of 0xFF followed by 32 bytes of 0
+        *this &= Vec32c().load(mask.c+32-n);     // 32-byte window starting n bytes before the zeros: n bytes of 0xFF, then zeros
+        return *this;
+    }
+    // Member function to change a single element in vector (index is taken modulo 32)
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec32c const & insert(uint32_t index, int8_t value) {
+        static const int8_t maskl[64] = {0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
+            -1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, 0,0,0,0, 0,0,0,0};
+        __m256i broad = _mm256_set1_epi8(value);  // broadcast value into all elements
+        __m256i mask  = _mm256_loadu_si256((__m256i const*)(maskl+32-(index & 0x1F))); // mask with FF at index position
+        ymm = selectb(mask,broad,ymm);            // take value where the mask is FF, keep the old bytes elsewhere
+        return *this;
+    }
+    // Member function to extract a single element from vector (index is taken modulo 32)
+    int8_t extract(uint32_t index) const {
+        int8_t x[32];
+        store(x);
+        return x[index & 0x1F];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int8_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec16c:
+    Vec16c get_low() const {
+        return _mm256_castsi256_si128(ymm);
+    }
+    Vec16c get_high() const {
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+        return _mm256_extractf128_si256(ymm,1);    // workaround bug in MS compiler VS 11
+#else
+        return _mm256_extracti128_si256(ymm,1);
+#endif
+    }
+    static int size() {
+        return 32;                               // number of 8-bit elements
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec32cb: Vector of 32 Booleans for use with Vec32c and Vec32uc
+*
+*****************************************************************************/
+
+class Vec32cb : public Vec32c {
+public:
+    // Default constructor (leaves the vector uninitialized):
+    Vec32cb(){
+    }
+    // Constructor to build from all elements (true is stored as -1, false as 0):
+    Vec32cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
+        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15,
+        bool x16, bool x17, bool x18, bool x19, bool x20, bool x21, bool x22, bool x23,
+        bool x24, bool x25, bool x26, bool x27, bool x28, bool x29, bool x30, bool x31) :
+        Vec32c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), 
+            -int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15),
+            -int8_t(x16), -int8_t(x17), -int8_t(x18), -int8_t(x19), -int8_t(x20), -int8_t(x21), -int8_t(x22), -int8_t(x23),
+            -int8_t(x24), -int8_t(x25), -int8_t(x26), -int8_t(x27), -int8_t(x28), -int8_t(x29), -int8_t(x30), -int8_t(x31))
+        {}
+    // Constructor to convert from type __m256i used in intrinsics:
+    Vec32cb(__m256i const & x) {
+        ymm = x;
+    }
+    // Assignment operator to convert from type __m256i used in intrinsics:
+    Vec32cb & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Constructor to broadcast scalar value (true -> every element -1, false -> every element 0):
+    Vec32cb(bool b) : Vec32c(-int8_t(b)) {
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec32cb & operator = (bool b) {
+        *this = Vec32cb(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc. (declared but not defined here)
+    Vec32cb(int b);
+    Vec32cb & operator = (int x);
+public:
+    // Member functions to split into two Vec16cb:
+    Vec16cb get_low() const {
+        return Vec16cb(Vec32c::get_low());
+    }
+    Vec16cb get_high() const {
+        return Vec16cb(Vec32c::get_high());
+    }
+    Vec32cb & insert (int index, bool a) {
+        Vec32c::insert(index, -(int)a);          // true is written as -1, false as 0
+        return *this;
+    }    
+    // Member function to extract a single element from vector (any nonzero byte reads as true)
+    bool extract(uint32_t index) const {
+        return Vec32c::extract(index) != 0;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec32cb
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec32cb operator & (Vec32cb const & a, Vec32cb const & b) {
+    return Vec32cb(Vec256b(a) & Vec256b(b));
+}
+static inline Vec32cb operator && (Vec32cb const & a, Vec32cb const & b) {
+    return a & b;
+}
+// vector operator &= : bitwise and
+static inline Vec32cb & operator &= (Vec32cb & a, Vec32cb const & b) {
+    a = a & b;
+    return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec32cb operator | (Vec32cb const & a, Vec32cb const & b) {
+    return Vec32cb(Vec256b(a) | Vec256b(b));
+}
+static inline Vec32cb operator || (Vec32cb const & a, Vec32cb const & b) {
+    return a | b;
+}
+// vector operator |= : bitwise or
+static inline Vec32cb & operator |= (Vec32cb & a, Vec32cb const & b) {
+    a = a | b;
+    return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec32cb operator ^ (Vec32cb const & a, Vec32cb const & b) {
+    return Vec32cb(Vec256b(a) ^ Vec256b(b));
+}
+// vector operator ^= : bitwise xor
+static inline Vec32cb & operator ^= (Vec32cb & a, Vec32cb const & b) {
+    a = a ^ b;
+    return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32cb operator ~ (Vec32cb const & a) {
+    return Vec32cb( ~ Vec256b(a));
+}
+
+// vector operator ! : element not
+static inline Vec32cb operator ! (Vec32cb const & a) {
+    return ~ a;
+}
+
+// vector function andnot
+static inline Vec32cb andnot (Vec32cb const & a, Vec32cb const & b) {
+    return Vec32cb(andnot(Vec256b(a), Vec256b(b)));
+}
+
+
+/*****************************************************************************
+*
+*          Operators for Vec32c
+*
+*****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec32c operator + (Vec32c const & a, Vec32c const & b) {
+    return _mm256_add_epi8(a, b);
+}
+
+// vector operator += : add
+static inline Vec32c & operator += (Vec32c & a, Vec32c const & b) {
+    a = a + b;
+    return a;
+}
+
+// postfix operator ++
+static inline Vec32c operator ++ (Vec32c & a, int) {
+    Vec32c a0 = a;
+    a = a + 1;
+    return a0;
+}
+
+// prefix operator ++
+static inline Vec32c & operator ++ (Vec32c & a) {
+    a = a + 1;
+    return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec32c operator - (Vec32c const & a, Vec32c const & b) {
+    return _mm256_sub_epi8(a, b);
+}
+
+// vector operator - : unary minus
+static inline Vec32c operator - (Vec32c const & a) {
+    return _mm256_sub_epi8(_mm256_setzero_si256(), a);
+}
+
+// vector operator -= : subtract b from a element by element and store the result in a
+static inline Vec32c & operator -= (Vec32c & a, Vec32c const & b) {
+    a = a - b;
+    return a;
+}
+
+// postfix operator --
+static inline Vec32c operator -- (Vec32c & a, int) {
+    Vec32c a0 = a;
+    a = a - 1;
+    return a0;
+}
+
+// prefix operator --
+static inline Vec32c & operator -- (Vec32c & a) {
+    a = a - 1;
+    return a;
+}
+
+// vector operator * : multiply element by element (only the low 8 bits of each product are kept)
+static inline Vec32c operator * (Vec32c const & a, Vec32c const & b) {
+    // There is no 8-bit multiply in AVX2. Split into two 16-bit multiplies
+    __m256i aodd    = _mm256_srli_epi16(a,8);                 // odd numbered elements of a
+    __m256i bodd    = _mm256_srli_epi16(b,8);                 // odd numbered elements of b
+    __m256i muleven = _mm256_mullo_epi16(a,b);                // product of even numbered elements
+    __m256i mulodd  = _mm256_mullo_epi16(aodd,bodd);          // product of odd  numbered elements
+            mulodd  = _mm256_slli_epi16(mulodd,8);            // put odd numbered elements back in place
+    __m256i mask    = _mm256_set1_epi32(0x00FF00FF);          // mask for even positions
+    __m256i product = selectb(mask,muleven,mulodd);           // interleave even and odd
+    return product;
+}
+
+// vector operator *= : multiply
+static inline Vec32c & operator *= (Vec32c & a, Vec32c const & b) {
+    a = a * b;
+    return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec32c operator << (Vec32c const & a, int b) {
+    uint32_t mask = (uint32_t)0xFF >> (uint32_t)b;                // mask to remove bits that are shifted out
+    __m256i am    = _mm256_and_si256(a,_mm256_set1_epi8((char)mask));// remove bits that will overflow
+    __m256i res   = _mm256_sll_epi16(am,_mm_cvtsi32_si128(b));   // 16-bit shifts
+    return res;
+}
+
+// vector operator <<= : shift left
+static inline Vec32c & operator <<= (Vec32c & a, int b) {
+    a = a << b;
+    return a;
+}
+
+// vector operator >> : shift right arithmetic all elements
+static inline Vec32c operator >> (Vec32c const & a, int b) {
+    __m256i aeven = _mm256_slli_epi16(a,8);                            // even numbered elements of a. get sign bit in position
+            aeven = _mm256_sra_epi16(aeven,_mm_cvtsi32_si128(b+8));    // shift arithmetic, back to position
+    __m256i aodd  = _mm256_sra_epi16(a,_mm_cvtsi32_si128(b));          // shift odd numbered elements arithmetic
+    __m256i mask  = _mm256_set1_epi32(0x00FF00FF);                     // mask for even positions
+    __m256i res   = selectb(mask,aeven,aodd);                          // interleave even and odd
+    return res;
+}
+
+// vector operator >>= : shift right arithmetic and store the result in a
+static inline Vec32c & operator >>= (Vec32c & a, int b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec32cb operator == (Vec32c const & a, Vec32c const & b) {
+    return _mm256_cmpeq_epi8(a,b);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec32cb operator != (Vec32c const & a, Vec32c const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comneq_epi8(a,b);
+#else  // AVX2 instruction set
+    return Vec32cb(Vec32c(~(a == b)));
+#endif
+}
+
+// vector operator > : returns true for elements for which a > b (signed)
+static inline Vec32cb operator > (Vec32c const & a, Vec32c const & b) {
+    return _mm256_cmpgt_epi8(a,b);
+}
+
+// vector operator < : returns true for elements for which a < b (signed)
+static inline Vec32cb operator < (Vec32c const & a, Vec32c const & b) {
+    return b > a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec32cb operator >= (Vec32c const & a, Vec32c const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comge_epi8(a,b);
+#else  // AVX2 instruction set
+    return Vec32cb(Vec32c(~(b > a)));  // a >= b is the complement of b > a
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec32cb operator <= (Vec32c const & a, Vec32c const & b) {
+    return b >= a;
+}
+
+// vector operator & : bitwise and
+static inline Vec32c operator & (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(Vec256b(a) & Vec256b(b));
+}
+static inline Vec32c operator && (Vec32c const & a, Vec32c const & b) {
+    return a & b;
+}
+// vector operator &= : bitwise and
+static inline Vec32c & operator &= (Vec32c & a, Vec32c const & b) {
+    a = a & b;
+    return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec32c operator | (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(Vec256b(a) | Vec256b(b));
+}
+static inline Vec32c operator || (Vec32c const & a, Vec32c const & b) {
+    return a | b;
+}
+// vector operator |= : bitwise or
+static inline Vec32c & operator |= (Vec32c & a, Vec32c const & b) {
+    a = a | b;
+    return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec32c operator ^ (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(Vec256b(a) ^ Vec256b(b));
+}
+// vector operator ^= : bitwise xor
+static inline Vec32c & operator ^= (Vec32c & a, Vec32c const & b) {
+    a = a ^ b;
+    return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32c operator ~ (Vec32c const & a) {
+    return Vec32c( ~ Vec256b(a));
+}
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec32cb operator ! (Vec32c const & a) {
+    return _mm256_cmpeq_epi8(a,_mm256_setzero_si256());
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+static inline Vec32c select (Vec32cb const & s, Vec32c const & a, Vec32c const & b) {
+    return selectb(s,a,b);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec32c if_add (Vec32cb const & f, Vec32c const & a, Vec32c const & b) {
+    return a + (Vec32c(f) & b);
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around: the sum is truncated to 8 bits and sign-extended. NOTE(review): the return type is unsigned, so a negative sum comes back as a large uint32_t
+static inline uint32_t horizontal_add (Vec32c const & a) {
+    __m256i sum1 = _mm256_sad_epu8(a,_mm256_setzero_si256());      // byte sums, one per 64-bit quarter
+    __m256i sum2 = _mm256_shuffle_epi32(sum1,2);                    // bring the upper quarter of each 128-bit half down to dword 0
+    __m256i sum3 = _mm256_add_epi16(sum1,sum2);                     // low word of each 128-bit half = sum of that half
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i sum4 = _mm256_extractf128_si256(sum3,1);                // bug in MS VS 11
+#else
+    __m128i sum4 = _mm256_extracti128_si256(sum3,1);                // get high half
+#endif
+    __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3),sum4); // add high and low half
+    int8_t  sum6 = (int8_t)_mm_cvtsi128_si32(sum5);                  // truncate to 8 bits
+    return  sum6;                                                    // sign extend to 32 bits, then convert to the unsigned return type
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is sign-extended before addition to avoid overflow
+static inline int32_t horizontal_add_x (Vec32c const & a) {
+    __m256i aeven = _mm256_slli_epi16(a,8);                          // even numbered elements of a. get sign bit in position
+            aeven = _mm256_srai_epi16(aeven,8);                      // sign extend even numbered elements
+    __m256i aodd  = _mm256_srai_epi16(a,8);                          // sign extend odd  numbered elements
+    __m256i sum1  = _mm256_add_epi16(aeven,aodd);                    // add even and odd elements
+    __m256i sum2  = _mm256_hadd_epi16(sum1,sum1);                    // horizontally add 2x8 elements in 3 steps
+    __m256i sum3  = _mm256_hadd_epi16(sum2,sum2);
+    __m256i sum4  = _mm256_hadd_epi16(sum3,sum3);
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i sum5  = _mm256_extractf128_si256(sum4,1);                // bug in MS VS 11
+#else
+    __m128i sum5  = _mm256_extracti128_si256(sum4,1);                // get high sum
+#endif
+    __m128i sum6  = _mm_add_epi16(_mm256_castsi256_si128(sum4),sum5);// add high and low sum
+    int16_t sum7  = (int16_t)_mm_cvtsi128_si32(sum6);                // 16 bit sum
+    return  sum7;                                                    // sign extend to 32 bits
+}
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec32c add_saturated(Vec32c const & a, Vec32c const & b) {
+    return _mm256_adds_epi8(a, b);
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec32c sub_saturated(Vec32c const & a, Vec32c const & b) {
+    return _mm256_subs_epi8(a, b);
+}
+
+// function max: a > b ? a : b
+static inline Vec32c max(Vec32c const & a, Vec32c const & b) {
+    return _mm256_max_epi8(a,b);
+}
+
+// function min: a < b ? a : b
+static inline Vec32c min(Vec32c const & a, Vec32c const & b) {
+    return _mm256_min_epi8(a,b);
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec32c abs(Vec32c const & a) {
+    return _mm256_sign_epi8(a,a);
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec32c abs_saturated(Vec32c const & a) {
+    __m256i absa   = abs(a);                                         // abs(a)
+    __m256i overfl = _mm256_cmpgt_epi8(_mm256_setzero_si256(),absa); // 0 > a
+    return           _mm256_add_epi8(absa,overfl);                   // subtract 1 if 0x80
+}
+
+// function rotate_left: rotate the bits of each 8-bit element left by b (count is taken modulo 8)
+// Use negative count to rotate right
+static inline Vec32c rotate_left(Vec32c const & a, int b) {
+#ifdef __XOP2__      // Possible future 256-bit XOP extension ?
+    return _mm256_rot_epi8(a,_mm256_set1_epi8(b));
+#else  // AVX2 instruction set
+    __m128i bb        = _mm_cvtsi32_si128(b & 7);             // b modulo 8
+    __m128i mbb       = _mm_cvtsi32_si128((8-b) & 7);         // 8-b modulo 8
+    __m256i maskeven  = _mm256_set1_epi32(0x00FF00FF);        // mask for even numbered bytes
+    __m256i even      = _mm256_and_si256(a,maskeven);         // even numbered bytes of a
+    __m256i odd       = _mm256_andnot_si256(maskeven,a);      // odd numbered bytes of a
+    __m256i evenleft  = _mm256_sll_epi16(even,bb);            // even bytes of a << b
+    __m256i oddleft   = _mm256_sll_epi16(odd,bb);             // odd  bytes of a << b
+    __m256i evenright = _mm256_srl_epi16(even,mbb);           // even bytes of a >> 8-b
+    __m256i oddright  = _mm256_srl_epi16(odd,mbb);            // odd  bytes of a >> 8-b
+    __m256i evenrot   = _mm256_or_si256(evenleft,evenright);  // even bytes of a rotated
+    __m256i oddrot    = _mm256_or_si256(oddleft,oddright);    // odd  bytes of a rotated
+    __m256i allrot    = selectb(maskeven,evenrot,oddrot);     // all  bytes rotated
+    return  allrot;
+#endif
+}
+
+
+
+/*****************************************************************************
+*
+*          Vector of 32 8-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec32uc : public Vec32c {
+public:
+    // Default constructor (leaves the vector uninitialized):
+    Vec32uc(){
+    };
+    // Constructor to broadcast the same value into all elements (i is truncated to 8 bits):
+    Vec32uc(uint32_t i) {
+        ymm = _mm256_set1_epi8((char)i);
+    };
+    // Constructor to build from all elements (i0 is element 0, i31 is element 31):
+    Vec32uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7,
+        uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15,        
+        uint8_t i16, uint8_t i17, uint8_t i18, uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23,
+        uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27, uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31) {
+        ymm = _mm256_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15,
+            i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31);
+    };
+    // Constructor to build from two Vec16uc (a0 = low half, a1 = high half):
+    Vec32uc(Vec16uc const & a0, Vec16uc const & a1) {
+        ymm = set_m128ir(a0, a1);
+    }
+    // Constructor to convert from type __m256i used in intrinsics:
+    Vec32uc(__m256i const & x) {
+        ymm = x;
+    };
+    // Assignment operator to convert from type __m256i used in intrinsics:
+    Vec32uc & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    };
+    // Member function to load from array (unaligned)
+    Vec32uc & load(void const * p) {
+        ymm = _mm256_loadu_si256((__m256i const*)p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    Vec32uc & load_a(void const * p) {
+        ymm = _mm256_load_si256((__m256i const*)p);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec32uc const & insert(uint32_t index, uint8_t value) {
+        Vec32c::insert(index, value);            // delegates to the signed version; value is converted to int8_t
+        return *this;
+    }
+    // Member function to extract a single element from vector
+    uint8_t extract(uint32_t index) const {
+        return Vec32c::extract(index);           // the int8_t result is converted to uint8_t on return
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint8_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec16uc:
+    Vec16uc get_low() const {
+        return _mm256_castsi256_si128(ymm);
+    }
+    Vec16uc get_high() const {
+        return _mm256_extractf128_si256(ymm,1);  // NOTE(review): always the extractf128 form, unlike Vec32c::get_high
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : element-wise add, performed by the Vec32c operator
+static inline Vec32uc operator + (Vec32uc const & a, Vec32uc const & b) {
+    Vec32c const lhs(a);
+    Vec32c const rhs(b);
+    return Vec32uc(lhs + rhs);
+}
+
+// vector operator - : element-wise subtract, performed by the Vec32c operator
+static inline Vec32uc operator - (Vec32uc const & a, Vec32uc const & b) {
+    Vec32c const lhs(a);
+    Vec32c const rhs(b);
+    return Vec32uc(lhs - rhs);
+}
+
+// vector operator * : element-wise multiply, performed by the Vec32c operator
+static inline Vec32uc operator * (Vec32uc const & a, Vec32uc const & b) {
+    Vec32c const lhs(a);
+    Vec32c const rhs(b);
+    return Vec32uc(lhs * rhs);
+}
+
+// vector operator << : shift every 8-bit element left by b bits
+static inline Vec32uc operator << (Vec32uc const & a, uint32_t b) {
+    // The shift is done on 16-bit units, so the bits that would leave each
+    // byte are cleared first to keep them out of the neighbouring byte.
+    uint32_t keep     = (uint32_t)0xFF >> (uint32_t)b;
+    __m256i  keepmask = _mm256_set1_epi8((char)keep);
+    __m256i  trimmed  = _mm256_and_si256(a,keepmask);
+    __m128i  count    = _mm_cvtsi32_si128(b);
+    return _mm256_sll_epi16(trimmed,count);
+}
+
+// vector operator << : signed-count overload, forwards to the uint32_t version
+static inline Vec32uc operator << (Vec32uc const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a << count;
+}
+
+// vector operator >> : logical right shift of every 8-bit element by b bits
+static inline Vec32uc operator >> (Vec32uc const & a, uint32_t b) {
+    // The shift is done on 16-bit units, so the bits that would leave each
+    // byte are cleared first to keep them out of the neighbouring byte.
+    uint32_t keep     = (uint32_t)0xFF << (uint32_t)b;
+    __m256i  keepmask = _mm256_set1_epi8((char)keep);
+    __m256i  trimmed  = _mm256_and_si256(a,keepmask);
+    __m128i  count    = _mm_cvtsi32_si128(b);
+    return _mm256_srl_epi16(trimmed,count);
+}
+
+// vector operator >> : signed-count overload, forwards to the uint32_t version
+static inline Vec32uc operator >> (Vec32uc const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a >> count;
+}
+
+// vector operator >>= : shift right logical (unsigned elements), in place.
+// Stores a >> b back into a and returns a reference to a.
+static inline Vec32uc & operator >>= (Vec32uc & a, uint32_t b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator >= : unsigned compare, true for elements where a >= b
+static inline Vec32cb operator >= (Vec32uc const & a, Vec32uc const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comge_epu8(a,b);
+#else
+    // a >= b exactly when a equals the unsigned maximum of a and b
+    __m256i larger = _mm256_max_epu8(a,b);
+    return _mm256_cmpeq_epi8(larger, a);
+#endif
+}
+
+// vector operator <= : unsigned compare, implemented as b >= a
+static inline Vec32cb operator <= (Vec32uc const & a, Vec32uc const & b) {
+    Vec32cb const result = (b >= a);
+    return result;
+}
+
+// vector operator > : unsigned compare, true for elements where a > b
+static inline Vec32cb operator > (Vec32uc const & a, Vec32uc const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comgt_epu8(a,b);
+#else
+    // a > b is the complement of b >= a
+    Vec32cb const notgreater = (b >= a);
+    return Vec32cb(Vec32c(~notgreater));
+#endif
+}
+
+// vector operator < : unsigned compare, implemented as b > a
+static inline Vec32cb operator < (Vec32uc const & a, Vec32uc const & b) {
+    Vec32cb const result = (b > a);
+    return result;
+}
+
+// vector operators & and && : bitwise and (&& is an alias of &)
+static inline Vec32uc operator & (Vec32uc const & a, Vec32uc const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec32uc(lhs & rhs);
+}
+static inline Vec32uc operator && (Vec32uc const & a, Vec32uc const & b) {
+    Vec32uc const both = a & b;
+    return both;
+}
+
+// vector operators | and || : bitwise or (|| is an alias of |)
+static inline Vec32uc operator | (Vec32uc const & a, Vec32uc const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec32uc(lhs | rhs);
+}
+static inline Vec32uc operator || (Vec32uc const & a, Vec32uc const & b) {
+    Vec32uc const either = a | b;
+    return either;
+}
+
+// vector operator ^ : bitwise xor, performed by the Vec256b operator
+static inline Vec32uc operator ^ (Vec32uc const & a, Vec32uc const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec32uc(lhs ^ rhs);
+}
+
+// vector operator ~ : bitwise not, performed by the Vec256b operator
+static inline Vec32uc operator ~ (Vec32uc const & a) {
+    Vec256b const bits(a);
+    return Vec32uc(~bits);
+}
+
+// Functions for this class
+
+// Element-wise choice between two operands, in pseudocode:
+//   for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Every byte of s has to be 0 (false) or -1 (true); other values are not allowed.
+// (s is signed)
+static inline Vec32uc select (Vec32cb const & s, Vec32uc const & a, Vec32uc const & b) {
+    __m256i const picked = selectb(s,a,b);
+    return picked;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+static inline Vec32uc if_add (Vec32cb const & f, Vec32uc const & a, Vec32uc const & b) {
+    Vec32uc const mask(f);              // boolean vector reinterpreted as a bit mask
+    Vec32uc const addend = mask & b;    // b where f is true, 0 elsewhere
+    return a + addend;
+}
+
+// Horizontal add: sum of all 32 elements, truncated to 8 bits
+// (overflow wraps around) and returned zero extended.
+// (Note: horizontal_add_x(Vec32uc) is slightly faster)
+static inline uint32_t horizontal_add (Vec32uc const & a) {
+    __m256i  zero    = _mm256_setzero_si256();
+    __m256i  partial = _mm256_sad_epu8(a,zero);            // sums of groups of 8 bytes
+    __m256i  swapped = _mm256_shuffle_epi32(partial,2);    // bring the upper partial sum of each lane down
+    __m256i  lanesum = _mm256_add_epi16(partial,swapped);  // one sum per 128-bit lane
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i  upper   = _mm256_extractf128_si256(lanesum,1); // bug in MS compiler VS 11
+#else
+    __m128i  upper   = _mm256_extracti128_si256(lanesum,1);
+#endif
+    __m128i  total   = _mm_add_epi16(_mm256_castsi256_si128(lanesum),upper);
+    return (uint8_t)_mm_cvtsi128_si32(total);              // keep 8 bits, zero extend to 32
+}
+
+// Horizontal add extended: sum of all 32 elements without 8-bit truncation.
+// The elements are treated as unsigned, so the sum cannot overflow the result.
+static inline uint32_t horizontal_add_x (Vec32uc const & a) {
+    __m256i zero    = _mm256_setzero_si256();
+    __m256i partial = _mm256_sad_epu8(a,zero);             // sums of groups of 8 bytes
+    __m256i swapped = _mm256_shuffle_epi32(partial,2);     // bring the upper partial sum of each lane down
+    __m256i lanesum = _mm256_add_epi16(partial,swapped);   // one sum per 128-bit lane
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i upper   = _mm256_extractf128_si256(lanesum,1); // bug in MS compiler VS 11
+#else
+    __m128i upper   = _mm256_extracti128_si256(lanesum,1);
+#endif
+    __m128i total   = _mm_add_epi16(_mm256_castsi256_si128(lanesum),upper);
+    return _mm_cvtsi128_si32(total);
+}
+
+// function add_saturated: element-wise unsigned add with saturation
+static inline Vec32uc add_saturated(Vec32uc const & a, Vec32uc const & b) {
+    __m256i const sum = _mm256_adds_epu8(a, b);
+    return sum;
+}
+
+// function sub_saturated: element-wise unsigned subtract with saturation
+static inline Vec32uc sub_saturated(Vec32uc const & a, Vec32uc const & b) {
+    __m256i const diff = _mm256_subs_epu8(a, b);
+    return diff;
+}
+
+// function max: element-wise unsigned maximum, a > b ? a : b
+static inline Vec32uc max(Vec32uc const & a, Vec32uc const & b) {
+    __m256i const larger = _mm256_max_epu8(a,b);
+    return larger;
+}
+
+// function min: element-wise unsigned minimum, a < b ? a : b
+static inline Vec32uc min(Vec32uc const & a, Vec32uc const & b) {
+    __m256i const smaller = _mm256_min_epu8(a,b);
+    return smaller;
+}
+
+
+    
+/*****************************************************************************
+*
+*          Vector of 16 16-bit signed integers
+*
+*****************************************************************************/
+
+class Vec16s : public Vec256b {
+public:
+    // Default constructor (empty body; no element is assigned here)
+    Vec16s() {
+    }
+    // Broadcast constructor: i is cast to int16_t and copied into every element
+    Vec16s(int i) {
+        ymm = _mm256_set1_epi16((int16_t)i);
+    }
+    // Element-wise constructor: arguments are passed to _mm256_setr_epi16 in order i0..i15
+    Vec16s(int16_t i0, int16_t i1, int16_t i2,  int16_t i3,  int16_t i4,  int16_t i5,  int16_t i6,  int16_t i7,
+           int16_t i8, int16_t i9, int16_t i10, int16_t i11, int16_t i12, int16_t i13, int16_t i14, int16_t i15) {
+        ymm = _mm256_setr_epi16(
+            i0, i1, i2,  i3,  i4,  i5,  i6,  i7,
+            i8, i9, i10, i11, i12, i13, i14, i15);
+    }
+    // Constructor combining two Vec8s through set_m128ir
+    Vec16s(Vec8s const & a0, Vec8s const & a1) {
+        ymm = set_m128ir(a0, a1);
+    }
+    // Converting constructor from the intrinsic type __m256i
+    Vec16s(__m256i const & x) {
+        ymm = x;
+    }
+    // Assignment from the intrinsic type __m256i
+    Vec16s & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Conversion to the intrinsic type __m256i
+    operator __m256i() const {
+        return ymm;
+    }
+    // Load a full vector from p using the unaligned load intrinsic
+    Vec16s & load(void const * p) {
+        __m256i const * src = (__m256i const*)p;
+        ymm = _mm256_loadu_si256(src);
+        return *this;
+    }
+    // Load a full vector from p using the aligned load intrinsic (p aligned by 32)
+    Vec16s & load_a(void const * p) {
+        __m256i const * src = (__m256i const*)p;
+        ymm = _mm256_load_si256(src);
+        return *this;
+    }
+    // Partial load: read n elements from p and zero the remaining ones.
+    // n <= 0 gives an all-zero vector; n >= 16 is a full load.
+    Vec16s & load_partial(int n, void const * p) {
+        if (n >= 16) {
+            load(p);
+        }
+        else if (n > 8) {
+            // first half complete, second half holds the remaining n-8 elements
+            *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n-8, (int16_t*)p+8));
+        }
+        else if (n > 0) {
+            // everything fits in the first half
+            *this = Vec16s(Vec8s().load_partial(n, p), 0);
+        }
+        else {
+            *this = 0;
+        }
+        return *this;
+    }
+    // Partial store: write n elements to p.
+    // n <= 0 writes nothing; n >= 16 is a full store.
+    void store_partial(int n, void * p) const {
+        if (n >= 16) {
+            store(p);
+        }
+        else if (n > 8) {
+            get_low().store(p);
+            get_high().store_partial(n-8, (int16_t*)p+8);
+        }
+        else if (n > 0) {
+            get_low().store_partial(n, p);
+        }
+    }
+    // Cut the vector off after n elements; the last 16-n elements become zero.
+    // Done on the byte view, hence the count of 2*n bytes.
+    Vec16s & cutoff(int n) {
+        int const nbytes = n * 2;
+        *this = Vec32c(*this).cutoff(nbytes);
+        return *this;
+    }
+    // Change one element (index is taken modulo 16).
+    // Note: inefficient. Prefer load when more than one element changes
+    Vec16s const & insert(uint32_t index, int16_t value) {
+        // Table with a single -1 at position 16: a 16-element window starting at
+        // 16 - index therefore has the -1 exactly at position index.
+        static const int16_t m[32] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, -1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
+        uint32_t const slot   = index & 0x0F;
+        __m256i        mask   = Vec256b().load(m + 16 - slot);
+        __m256i        filled = _mm256_set1_epi16(value);
+        ymm = selectb(mask, filled, ymm);
+        return *this;
+    }
+    // Read one element (index is taken modulo 16) by storing the vector to a temporary array
+    int16_t extract(uint32_t index) const {
+        int16_t elements[16];
+        store(elements);
+        return elements[index & 0x0F];
+    }
+    // Read-only element access (same as extract). Use store to read many elements.
+    int16_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Split into two Vec8s: get_low returns the lower 128 bits ...
+    Vec8s get_low() const {
+        __m128i const lo = _mm256_castsi256_si128(ymm);
+        return lo;
+    }
+    // ... and get_high returns the upper 128 bits
+    Vec8s get_high() const {
+        __m128i const hi = _mm256_extractf128_si256(ymm,1);
+        return hi;
+    }
+    // Number of elements in the vector
+    static int size() {
+        return 16;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec16sb: Vector of 16 Booleans for use with Vec16s and Vec16us
+*
+*****************************************************************************/
+class Vec16sb : public Vec16s {
+public:
+    // Default constructor (empty body)
+    Vec16sb() {
+    }
+    // Element-wise constructor: each bool becomes 0 (false) or -1 (true)
+    Vec16sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
+        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) :
+        Vec16s(
+            -int16_t(x0),  -int16_t(x1),  -int16_t(x2),  -int16_t(x3),
+            -int16_t(x4),  -int16_t(x5),  -int16_t(x6),  -int16_t(x7),
+            -int16_t(x8),  -int16_t(x9),  -int16_t(x10), -int16_t(x11),
+            -int16_t(x12), -int16_t(x13), -int16_t(x14), -int16_t(x15))
+        {}
+    // Converting constructor from the intrinsic type __m256i
+    Vec16sb(__m256i const & x) {
+        ymm = x;
+    }
+    // Assignment from the intrinsic type __m256i
+    Vec16sb & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Broadcast constructor: all elements become 0 (false) or -1 (true)
+    Vec16sb(bool b) : Vec16s(-int16_t(b)) {
+    }
+    // Broadcast assignment of a scalar boolean
+    Vec16sb & operator = (bool b) {
+        *this = Vec16sb(b);
+        return *this;
+    }
+private: // Declared but not defined: blocks construction/assignment from int, etc.
+    Vec16sb(int b);
+    Vec16sb & operator = (int x);
+public:
+    // Lower half as a boolean vector
+    Vec8sb get_low() const {
+        return Vec8sb(Vec16s::get_low());
+    }
+    // Upper half as a boolean vector
+    Vec8sb get_high() const {
+        return Vec8sb(Vec16s::get_high());
+    }
+    // Change one element: true is stored as -1, false as 0
+    Vec16sb & insert (int index, bool a) {
+        Vec16s::insert(index, -(int)a);
+        return *this;
+    }
+    // Read one element: any nonzero stored value counts as true
+    bool extract(uint32_t index) const {
+        int16_t const raw = Vec16s::extract(index);
+        return raw != 0;
+    }
+    // Read-only element access (same as extract). Use store to read many elements.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec16sb
+*
+*****************************************************************************/
+
+// vector operators & and && : bitwise and (&& is an alias of &)
+static inline Vec16sb operator & (Vec16sb const & a, Vec16sb const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16sb(lhs & rhs);
+}
+static inline Vec16sb operator && (Vec16sb const & a, Vec16sb const & b) {
+    Vec16sb const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and, result stored back into a
+static inline Vec16sb & operator &= (Vec16sb & a, Vec16sb const & b) {
+    Vec16sb const both = a & b;
+    a = both;
+    return a;
+}
+
+// vector operators | and || : bitwise or (|| is an alias of |)
+static inline Vec16sb operator | (Vec16sb const & a, Vec16sb const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16sb(lhs | rhs);
+}
+static inline Vec16sb operator || (Vec16sb const & a, Vec16sb const & b) {
+    Vec16sb const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or, result stored back into a
+static inline Vec16sb & operator |= (Vec16sb & a, Vec16sb const & b) {
+    Vec16sb const either = a | b;
+    a = either;
+    return a;
+}
+
+// vector operator ^ : bitwise xor, performed by the Vec256b operator
+static inline Vec16sb operator ^ (Vec16sb const & a, Vec16sb const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16sb(lhs ^ rhs);
+}
+// vector operator ^= : bitwise xor, result stored back into a
+static inline Vec16sb & operator ^= (Vec16sb & a, Vec16sb const & b) {
+    Vec16sb const differing = a ^ b;
+    a = differing;
+    return a;
+}
+
+// vector operator ~ : bitwise not, performed by the Vec256b operator
+static inline Vec16sb operator ~ (Vec16sb const & a) {
+    Vec256b const bits(a);
+    return Vec16sb(~bits);
+}
+
+// vector operator ! : element not; for a boolean vector this is the bitwise not
+static inline Vec16sb operator ! (Vec16sb const & a) {
+    Vec16sb const inverted = ~a;
+    return inverted;
+}
+
+// vector function andnot: forwards to the Vec256b andnot and rewraps the result
+static inline Vec16sb andnot (Vec16sb const & a, Vec16sb const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16sb(andnot(lhs, rhs));
+}
+
+
+/*****************************************************************************
+*
+*          Operators for Vec16s
+*
+*****************************************************************************/
+
+// vector operator + : element-wise 16-bit add
+static inline Vec16s operator + (Vec16s const & a, Vec16s const & b) {
+    __m256i const sum = _mm256_add_epi16(a, b);
+    return sum;
+}
+
+// vector operator += : add b to a in place
+static inline Vec16s & operator += (Vec16s & a, Vec16s const & b) {
+    Vec16s const sum = a + b;
+    a = sum;
+    return a;
+}
+
+// postfix operator ++ : add 1 to every element, return the value before the increment
+static inline Vec16s operator ++ (Vec16s & a, int) {
+    Vec16s const before = a;
+    a = a + Vec16s(1);
+    return before;
+}
+
+// prefix operator ++ : add 1 to every element, return the updated vector
+static inline Vec16s & operator ++ (Vec16s & a) {
+    a = a + Vec16s(1);
+    return a;
+}
+
+// vector operator - : element-wise 16-bit subtract
+static inline Vec16s operator - (Vec16s const & a, Vec16s const & b) {
+    __m256i const diff = _mm256_sub_epi16(a, b);
+    return diff;
+}
+
+// vector operator - : unary minus, computed as 0 - a
+static inline Vec16s operator - (Vec16s const & a) {
+    __m256i const zero = _mm256_setzero_si256();
+    return _mm256_sub_epi16(zero, a);
+}
+
+// vector operator -= : subtract b from a in place
+static inline Vec16s & operator -= (Vec16s & a, Vec16s const & b) {
+    Vec16s const diff = a - b;
+    a = diff;
+    return a;
+}
+
+// postfix operator -- : subtract 1 from every element, return the value before the decrement
+static inline Vec16s operator -- (Vec16s & a, int) {
+    Vec16s const before = a;
+    a = a - Vec16s(1);
+    return before;
+}
+
+// prefix operator -- : subtract 1 from every element, return the updated vector
+static inline Vec16s & operator -- (Vec16s & a) {
+    a = a - Vec16s(1);
+    return a;
+}
+
+// vector operator * : element-wise multiply, keeping the low 16 bits of each product
+static inline Vec16s operator * (Vec16s const & a, Vec16s const & b) {
+    __m256i const product = _mm256_mullo_epi16(a, b);
+    return product;
+}
+
+// vector operator *= : multiply a by b in place
+static inline Vec16s & operator *= (Vec16s & a, Vec16s const & b) {
+    Vec16s const product = a * b;
+    a = product;
+    return a;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+
+// vector operator << : shift every 16-bit element left by b bits
+static inline Vec16s operator << (Vec16s const & a, int b) {
+    __m128i const count = _mm_cvtsi32_si128(b);
+    return _mm256_sll_epi16(a,count);
+}
+
+// vector operator <<= : shift left in place
+static inline Vec16s & operator <<= (Vec16s & a, int b) {
+    Vec16s const shifted = a << b;
+    a = shifted;
+    return a;
+}
+
+// vector operator >> : arithmetic (sign-preserving) right shift of every 16-bit element
+static inline Vec16s operator >> (Vec16s const & a, int b) {
+    __m128i const count = _mm_cvtsi32_si128(b);
+    return _mm256_sra_epi16(a,count);
+}
+
+// vector operator >>= : arithmetic right shift in place
+static inline Vec16s & operator >>= (Vec16s & a, int b) {
+    Vec16s const shifted = a >> b;
+    a = shifted;
+    return a;
+}
+
+// vector operator == : true for elements where a == b
+static inline Vec16sb operator == (Vec16s const & a, Vec16s const & b) {
+    __m256i const equal = _mm256_cmpeq_epi16(a, b);
+    return equal;
+}
+
+// vector operator != : true for elements where a != b
+static inline Vec16sb operator != (Vec16s const & a, Vec16s const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comneq_epi16(a,b);
+#else
+    // a != b is the complement of a == b
+    Vec16sb const equal = (a == b);
+    return Vec16sb(Vec16s(~equal));
+#endif
+}
+
+// vector operator > : signed compare, true for elements where a > b
+static inline Vec16sb operator > (Vec16s const & a, Vec16s const & b) {
+    __m256i const greater = _mm256_cmpgt_epi16(a, b);
+    return greater;
+}
+
+// vector operator < : signed compare, implemented as b > a
+static inline Vec16sb operator < (Vec16s const & a, Vec16s const & b) {
+    Vec16sb const result = (b > a);
+    return result;
+}
+
+// vector operator >= : signed compare, true for elements where a >= b
+static inline Vec16sb operator >= (Vec16s const & a, Vec16s const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comge_epi16(a,b);
+#else
+    // a >= b is the complement of b > a
+    Vec16sb const less = (b > a);
+    return Vec16sb(Vec16s(~less));
+#endif
+}
+
+// vector operator <= : signed compare, implemented as b >= a
+static inline Vec16sb operator <= (Vec16s const & a, Vec16s const & b) {
+    Vec16sb const result = (b >= a);
+    return result;
+}
+
+// vector operators & and && : bitwise and (&& is an alias of &)
+static inline Vec16s operator & (Vec16s const & a, Vec16s const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16s(lhs & rhs);
+}
+static inline Vec16s operator && (Vec16s const & a, Vec16s const & b) {
+    Vec16s const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and, result stored back into a
+static inline Vec16s & operator &= (Vec16s & a, Vec16s const & b) {
+    Vec16s const both = a & b;
+    a = both;
+    return a;
+}
+
+// vector operators | and || : bitwise or (|| is an alias of |)
+static inline Vec16s operator | (Vec16s const & a, Vec16s const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16s(lhs | rhs);
+}
+static inline Vec16s operator || (Vec16s const & a, Vec16s const & b) {
+    Vec16s const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or, result stored back into a
+static inline Vec16s & operator |= (Vec16s & a, Vec16s const & b) {
+    Vec16s const either = a | b;
+    a = either;
+    return a;
+}
+
+// vector operator ^ : bitwise xor, performed by the Vec256b operator
+static inline Vec16s operator ^ (Vec16s const & a, Vec16s const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16s(lhs ^ rhs);
+}
+// vector operator ^= : bitwise xor, result stored back into a
+static inline Vec16s & operator ^= (Vec16s & a, Vec16s const & b) {
+    Vec16s const differing = a ^ b;
+    a = differing;
+    return a;
+}
+
+// vector operator ~ : bitwise not, performed by the Vec256b operator
+static inline Vec16s operator ~ (Vec16s const & a) {
+    Vec256b const bits(a);
+    return Vec16s(~bits);
+}
+
+// vector operator ! : logical not, true for elements that are equal to 0
+static inline Vec16sb operator ! (Vec16s const & a) {
+    __m256i const zero = _mm256_setzero_si256();
+    return _mm256_cmpeq_epi16(a,zero);
+}
+
+// Functions for this class
+
+// Element-wise choice between two operands, in pseudocode:
+//   for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Every byte of s has to be 0 (false) or -1 (true); other values are not allowed.
+// (s is signed)
+static inline Vec16s select (Vec16sb const & s, Vec16s const & a, Vec16s const & b) {
+    __m256i const picked = selectb(s,a,b);
+    return picked;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+static inline Vec16s if_add (Vec16sb const & f, Vec16s const & a, Vec16s const & b) {
+    Vec16s const mask(f);              // boolean vector reinterpreted as a bit mask
+    Vec16s const addend = mask & b;    // b where f is true, 0 elsewhere
+    return a + addend;
+}
+
+// Horizontal add: sum of all 16 elements, truncated to 16 bits
+// (overflow wraps around) and returned sign extended.
+static inline int32_t horizontal_add (Vec16s const & a) {
+    // Three pairwise-add steps reduce the 8 elements of each 128-bit lane to one sum
+    __m256i pairs   = _mm256_hadd_epi16(a,a);
+    __m256i quads   = _mm256_hadd_epi16(pairs,pairs);
+    __m256i lanesum = _mm256_hadd_epi16(quads,quads);
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i upper   = _mm256_extractf128_si256(lanesum,1);                 // bug in MS compiler VS 11
+#else
+    __m128i upper   = _mm256_extracti128_si256(lanesum,1);                 // upper lane
+#endif
+    __m128i total   = _mm_add_epi16(_mm256_castsi256_si128(lanesum),upper); // lower lane + upper lane
+    return (int16_t)_mm_cvtsi128_si32(total);                              // keep 16 bits, sign extend to 32
+}
+
+// Horizontal add extended: sum of all 16 elements.
+// Every element is sign extended to 32 bits before adding, so the sum cannot overflow
+static inline int32_t horizontal_add_x (Vec16s const & a) {
+    // Even-numbered elements sit in the low half of each dword: move them up, then
+    // shift back arithmetically to sign extend them
+    __m256i evens   = _mm256_srai_epi32(_mm256_slli_epi32(a,16),16);
+    // Odd-numbered elements sit in the high half: one arithmetic shift sign extends them
+    __m256i odds    = _mm256_srai_epi32(a,16);
+    __m256i pairs   = _mm256_add_epi32(evens,odds);
+    // Two pairwise-add steps reduce the 4 dwords of each 128-bit lane to one sum
+    __m256i quads   = _mm256_hadd_epi32(pairs,pairs);
+    __m256i lanesum = _mm256_hadd_epi32(quads,quads);
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i upper   = _mm256_extractf128_si256(lanesum,1);    // bug in MS compiler VS 11
+#else
+    __m128i upper   = _mm256_extracti128_si256(lanesum,1);
+#endif
+    __m128i total   = _mm_add_epi32(_mm256_castsi256_si128(lanesum),upper);
+    return _mm_cvtsi128_si32(total);
+}
+
+// function add_saturated: element-wise signed add with saturation
+static inline Vec16s add_saturated(Vec16s const & a, Vec16s const & b) {
+    __m256i const sum = _mm256_adds_epi16(a, b);
+    return sum;
+}
+
+// function sub_saturated: element-wise signed subtract with saturation
+static inline Vec16s sub_saturated(Vec16s const & a, Vec16s const & b) {
+    __m256i const diff = _mm256_subs_epi16(a, b);
+    return diff;
+}
+
+// function max: element-wise signed maximum, a > b ? a : b
+static inline Vec16s max(Vec16s const & a, Vec16s const & b) {
+    __m256i const larger = _mm256_max_epi16(a,b);
+    return larger;
+}
+
+// function min: element-wise signed minimum, a < b ? a : b
+static inline Vec16s min(Vec16s const & a, Vec16s const & b) {
+    __m256i const smaller = _mm256_min_epi16(a,b);
+    return smaller;
+}
+
+// function abs: a >= 0 ? a : -a, obtained by applying the sign of a to a itself
+static inline Vec16s abs(Vec16s const & a) {
+    __m256i const magnitude = _mm256_sign_epi16(a,a);
+    return magnitude;
+}
+
+// function abs_saturated: same as abs, but saturates where abs overflows
+static inline Vec16s abs_saturated(Vec16s const & a) {
+    __m256i magnitude = abs(a);
+    // An element still negative after abs is 0x8000; its sign fill is -1
+    __m256i overflow  = _mm256_srai_epi16(magnitude,15);
+    // Adding the -1 turns 0x8000 into 0x7FFF; other elements get 0 added
+    return _mm256_add_epi16(magnitude,overflow);
+}
+
+// function rotate_left: rotate every 16-bit element left by b bits
+// A negative count rotates right
+static inline Vec16s rotate_left(Vec16s const & a, int b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_rot_epi16(a,_mm256_set1_epi16(b));
+#else
+    // Both counts are reduced modulo 16
+    __m128i lcount  = _mm_cvtsi32_si128(b & 0x0F);
+    __m128i rcount  = _mm_cvtsi32_si128((16-b) & 0x0F);
+    __m256i upbits  = _mm256_sll_epi16(a,lcount);   // a << b
+    __m256i lowbits = _mm256_srl_epi16(a,rcount);   // a >> (16 - b)
+    return _mm256_or_si256(upbits,lowbits);
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 16 16-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec16us : public Vec16s {
+public:
+    // Default constructor (empty body; no element is assigned here)
+    Vec16us() {
+    }
+    // Broadcast constructor: i is cast to int16_t and copied into every element
+    Vec16us(uint32_t i) {
+        ymm = _mm256_set1_epi16((int16_t)i);
+    }
+    // Element-wise constructor: arguments are passed to _mm256_setr_epi16 in order i0..i15
+    Vec16us(uint16_t i0, uint16_t i1, uint16_t i2,  uint16_t i3,  uint16_t i4,  uint16_t i5,  uint16_t i6,  uint16_t i7,
+            uint16_t i8, uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15) {
+        ymm = _mm256_setr_epi16(
+            i0, i1, i2,  i3,  i4,  i5,  i6,  i7,
+            i8, i9, i10, i11, i12, i13, i14, i15);
+    }
+    // Constructor combining two Vec8us through set_m128ir
+    Vec16us(Vec8us const & a0, Vec8us const & a1) {
+        ymm = set_m128ir(a0, a1);
+    }
+    // Converting constructor from the intrinsic type __m256i
+    Vec16us(__m256i const & x) {
+        ymm = x;
+    }
+    // Assignment from the intrinsic type __m256i
+    Vec16us & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Load a full vector from p using the unaligned load intrinsic
+    Vec16us & load(void const * p) {
+        __m256i const * src = (__m256i const*)p;
+        ymm = _mm256_loadu_si256(src);
+        return *this;
+    }
+    // Load a full vector from p using the aligned load intrinsic (p aligned by 32)
+    Vec16us & load_a(void const * p) {
+        __m256i const * src = (__m256i const*)p;
+        ymm = _mm256_load_si256(src);
+        return *this;
+    }
+    // Change one element; forwards to Vec16s::insert.
+    // Note: inefficient. Prefer load when more than one element changes
+    Vec16us const & insert(uint32_t index, uint16_t value) {
+        Vec16s::insert(index, value);
+        return *this;
+    }
+    // Read one element; forwards to Vec16s::extract and returns it as uint16_t
+    uint16_t extract(uint32_t index) const {
+        uint16_t const element = Vec16s::extract(index);
+        return element;
+    }
+    // Read-only element access (same as extract). Use store to read many elements.
+    uint16_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Split into two Vec8us: get_low returns the lower 128 bits ...
+    Vec8us get_low() const {
+        __m128i const lo = _mm256_castsi256_si128(ymm);
+        return lo;
+    }
+    // ... and get_high returns the upper 128 bits
+    Vec8us get_high() const {
+        __m128i const hi = _mm256_extractf128_si256(ymm,1);
+        return hi;
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : element-wise add, performed by the Vec16s operator
+static inline Vec16us operator + (Vec16us const & a, Vec16us const & b) {
+    Vec16s const lhs(a);
+    Vec16s const rhs(b);
+    return Vec16us(lhs + rhs);
+}
+
+// vector operator - : element-wise subtract, performed by the Vec16s operator
+static inline Vec16us operator - (Vec16us const & a, Vec16us const & b) {
+    Vec16s const lhs(a);
+    Vec16s const rhs(b);
+    return Vec16us(lhs - rhs);
+}
+
+// vector operator * : element-wise multiply, performed by the Vec16s operator
+static inline Vec16us operator * (Vec16us const & a, Vec16us const & b) {
+    Vec16s const lhs(a);
+    Vec16s const rhs(b);
+    return Vec16us(lhs * rhs);
+}
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : logical right shift of every 16-bit element by b bits
+static inline Vec16us operator >> (Vec16us const & a, uint32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);
+    return _mm256_srl_epi16(a,count);
+}
+
+// vector operator >> : signed-count overload, forwards to the uint32_t version
+static inline Vec16us operator >> (Vec16us const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a >> count;
+}
+
+// vector operator >>= : shift right logical (unsigned elements), in place.
+// Stores a >> b back into a and returns a reference to a.
+static inline Vec16us & operator >>= (Vec16us & a, uint32_t b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator << : shift every 16-bit element left by b bits
+static inline Vec16us operator << (Vec16us const & a, uint32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);
+    return _mm256_sll_epi16(a,count);
+}
+
+// vector operator << : signed-count overload, forwards to the uint32_t version
+static inline Vec16us operator << (Vec16us const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a << count;
+}
+
+// vector operator >= : unsigned compare, true for elements where a >= b
+static inline Vec16sb operator >= (Vec16us const & a, Vec16us const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comge_epu16(a,b);
+#else
+    // a >= b exactly when a equals the unsigned maximum of a and b
+    __m256i larger = _mm256_max_epu16(a,b);
+    return _mm256_cmpeq_epi16(a,larger);
+#endif
+}
+
+// vector operator <= : unsigned compare, implemented as b >= a
+static inline Vec16sb operator <= (Vec16us const & a, Vec16us const & b) {
+    Vec16sb const result = (b >= a);
+    return result;
+}
+
+// vector operator > : unsigned compare, true for elements where a > b
+static inline Vec16sb operator > (Vec16us const & a, Vec16us const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comgt_epu16(a,b);
+#else
+    // a > b is the complement of b >= a
+    Vec16sb const notgreater = (b >= a);
+    return Vec16sb(Vec16s(~notgreater));
+#endif
+}
+
+// vector operator < : unsigned compare, implemented as b > a
+static inline Vec16sb operator < (Vec16us const & a, Vec16us const & b) {
+    Vec16sb const result = (b > a);
+    return result;
+}
+
+// vector operators & and && : bitwise and (&& is an alias of &)
+static inline Vec16us operator & (Vec16us const & a, Vec16us const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16us(lhs & rhs);
+}
+static inline Vec16us operator && (Vec16us const & a, Vec16us const & b) {
+    Vec16us const both = a & b;
+    return both;
+}
+
+// vector operators | and || : bitwise or (|| is an alias of |)
+static inline Vec16us operator | (Vec16us const & a, Vec16us const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16us(lhs | rhs);
+}
+static inline Vec16us operator || (Vec16us const & a, Vec16us const & b) {
+    Vec16us const either = a | b;
+    return either;
+}
+
+// vector operator ^ : bitwise xor, performed by the Vec256b operator
+static inline Vec16us operator ^ (Vec16us const & a, Vec16us const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec16us(lhs ^ rhs);
+}
+
+// vector operator ~ : bitwise not, performed by the Vec256b operator
+static inline Vec16us operator ~ (Vec16us const & a) {
+    Vec256b const bits(a);
+    return Vec16us(~bits);
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec16us select (Vec16sb const & s, Vec16us const & a, Vec16us const & b) {
+    // Delegates to the bitwise select: a where s is set, b elsewhere
+    __m256i const picked = selectb(s,a,b);
+    return picked;
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16us if_add (Vec16sb const & f, Vec16us const & a, Vec16us const & b) {
+    // Masking b with f zeroes the addend wherever f is false
+    Vec16us const addend = Vec16us(f) & b;
+    return a + addend;
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add (Vec16us const & a) {
+    // Three pairwise-add steps leave the sum of each 128-bit lane in every
+    // 16-bit element of that lane (16-bit wrap-around).
+    __m256i sum1  = _mm256_hadd_epi16(a,a);                           // horizontally add 2x8 elements in 3 steps
+    __m256i sum2  = _mm256_hadd_epi16(sum1,sum1);
+    __m256i sum3  = _mm256_hadd_epi16(sum2,sum2);
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i sum4  = _mm256_extractf128_si256(sum3,1);                 // bug in MS compiler VS 11
+#else
+    __m128i sum4  = _mm256_extracti128_si256(sum3,1);                 // get high part
+#endif
+    __m128i sum5  = _mm_add_epi16(_mm256_castsi256_si128(sum3),sum4); // add low and high parts, 16-bit wrap-around
+    // The low 32 bits hold two copies of the 16-bit sum. Keep only the lowest
+    // element so the second copy does not leak into bits 16..31 of the result.
+    return          (uint16_t)_mm_cvtsi128_si32(sum5);
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is zero-extended before addition to avoid overflow
+static inline uint32_t horizontal_add_x (Vec16us const & a) {
+    // Widen to 32 bits first: split every 32-bit slot into its two 16-bit halves
+    __m256i const low_half = _mm256_set1_epi32(0x0000FFFF);
+    __m256i const evens    = _mm256_and_si256(a,low_half);            // even numbered elements, zero extended
+    __m256i const odds     = _mm256_srli_epi32(a,16);                 // odd numbered elements, zero extended
+    __m256i const pairs    = _mm256_add_epi32(evens,odds);            // 8 partial sums of 32 bits
+    // Two pairwise-add steps reduce each 128-bit lane to one sum
+    __m256i const quads    = _mm256_hadd_epi32(pairs,pairs);
+    __m256i const lanes    = _mm256_hadd_epi32(quads,quads);
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i const upper    = _mm256_extractf128_si256(lanes,1);       // bug in MS compiler VS 11
+#else
+    __m128i const upper    = _mm256_extracti128_si256(lanes,1);       // high lane
+#endif
+    __m128i const total    = _mm_add_epi32(_mm256_castsi256_si128(lanes),upper); // low lane + high lane
+    return _mm_cvtsi128_si32(total);
+}
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec16us add_saturated(Vec16us const & a, Vec16us const & b) {
+    // Unsigned 16-bit saturating add
+    __m256i const sum = _mm256_adds_epu16(a, b);
+    return sum;
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec16us sub_saturated(Vec16us const & a, Vec16us const & b) {
+    // Unsigned 16-bit saturating subtract
+    __m256i const diff = _mm256_subs_epu16(a, b);
+    return diff;
+}
+
+// function max: a > b ? a : b
+static inline Vec16us max(Vec16us const & a, Vec16us const & b) {
+    // Element-wise unsigned 16-bit maximum
+    __m256i const larger = _mm256_max_epu16(a,b);
+    return larger;
+}
+
+// function min: a < b ? a : b
+static inline Vec16us min(Vec16us const & a, Vec16us const & b) {
+    // Element-wise unsigned 16-bit minimum
+    __m256i const smaller = _mm256_min_epu16(a,b);
+    return smaller;
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 8 32-bit signed integers
+*
+*****************************************************************************/
+
+class Vec8i : public Vec256b {
+public:
+    // Default constructor
+    Vec8i() {
+    }
+    // Broadcast constructor: every element receives the value i
+    Vec8i(int i) {
+        ymm = _mm256_set1_epi32(i);
+    }
+    // Element-wise constructor: i0 is element 0, i7 is element 7
+    Vec8i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7) {
+        ymm = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7);
+    }
+    // Constructor joining two Vec4i into one 256-bit vector
+    Vec8i(Vec4i const & a0, Vec4i const & a1) {
+        ymm = set_m128ir(a0, a1);
+    }
+    // Conversion from the intrinsic type __m256i
+    Vec8i(__m256i const & x) {
+        ymm = x;
+    }
+    // Assignment from the intrinsic type __m256i
+    Vec8i & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Conversion to the intrinsic type __m256i
+    operator __m256i() const {
+        return ymm;
+    }
+    // Load 8 elements from an array; no alignment required
+    Vec8i & load(void const * p) {
+        ymm = _mm256_loadu_si256((__m256i const*)p);
+        return *this;
+    }
+    // Load 8 elements from an array that is aligned by 32
+    Vec8i & load_a(void const * p) {
+        ymm = _mm256_load_si256((__m256i const*)p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    Vec8i & load_partial(int n, void const * p) {
+        if (n >= 8) {
+            // everything requested: plain full load
+            load(p);
+        }
+        else if (n > 4) {
+            // full low half, partial high half
+            *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n-4, (int32_t*)p+4));
+        }
+        else if (n > 0) {
+            // partial low half, zero high half
+            *this = Vec8i(Vec4i().load_partial(n, p), 0);
+        }
+        else {
+            *this = 0;
+        }
+        return *this;
+    }
+    // Partial store. Store n elements
+    void store_partial(int n, void * p) const {
+        if (n >= 8) {
+            store(p);
+        }
+        else if (n > 4) {
+            // full low half, then the first n-4 elements of the high half
+            get_low().store(p);
+            get_high().store_partial(n-4, (int32_t*)p+4);
+        }
+        else if (n > 0) {
+            get_low().store_partial(n, p);
+        }
+        // n <= 0: nothing to store
+    }
+    // Cut off vector to n elements. The last 8-n elements are set to zero
+    Vec8i & cutoff(int n) {
+        // each element is 4 bytes, so cut the byte vector at n*4
+        *this = Vec32c(*this).cutoff(n * 4);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8i const & insert(uint32_t index, int32_t value) {
+        // A single -1 at position 8; loading 8 entries starting at
+        // position 8 - index puts that -1 at element number index.
+        static const int32_t maskl[16] = {0,0,0,0,0,0,0,0, -1,0,0,0,0,0,0,0};
+        __m256i all_value = _mm256_set1_epi32(value);                       // value in every element
+        __m256i pick      = Vec256b().load(maskl + 8 - (index & 7));        // FFFFFFFF at index position
+        ymm = selectb (pick, all_value, ymm);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    int32_t extract(uint32_t index) const {
+        int32_t elements[8];
+        store(elements);
+        return elements[index & 7];     // index wraps modulo 8
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int32_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Low 128 bits as Vec4i
+    Vec4i get_low() const {
+        return _mm256_castsi256_si128(ymm);
+    }
+    // High 128 bits as Vec4i
+    Vec4i get_high() const {
+        return _mm256_extractf128_si256(ymm,1);
+    }
+    // Number of elements in the vector
+    static int size() {
+        return 8;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec8ib: Vector of 8 Booleans for use with Vec8i and Vec8ui
+*
+*****************************************************************************/
+
+class Vec8ib : public Vec8i {
+public:
+    // Default constructor
+    Vec8ib() {
+    }
+    // Element-wise constructor: true is stored as -1 (all bits set), false as 0
+    Vec8ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) :
+        Vec8i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3), -int32_t(x4), -int32_t(x5), -int32_t(x6), -int32_t(x7))
+        {}
+    // Conversion from the intrinsic type __m256i
+    Vec8ib(__m256i const & x) {
+        ymm = x;
+    }
+    // Assignment from the intrinsic type __m256i
+    Vec8ib & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Broadcast constructor: all elements become -1 if b is true, else 0
+    Vec8ib(bool b) : Vec8i(-int32_t(b)) {
+    }
+    // Broadcast assignment of a scalar boolean
+    Vec8ib & operator = (bool b) {
+        Vec8ib const broadcast(b);
+        *this = broadcast;
+        return *this;
+    }
+private: // Declared but not defined: prevents constructing or assigning from int, etc.
+    Vec8ib(int b);
+    Vec8ib & operator = (int x);
+public:
+    // Low 128 bits as Vec4ib
+    Vec4ib get_low() const {
+        return Vec4ib(Vec8i::get_low());
+    }
+    // High 128 bits as Vec4ib
+    Vec4ib get_high() const {
+        return Vec4ib(Vec8i::get_high());
+    }
+    // Change a single element; true is stored as -1
+    Vec8ib & insert (int index, bool a) {
+        Vec8i::insert(index, -(int)a);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    bool extract(uint32_t index) const {
+        int32_t const raw = Vec8i::extract(index);
+        return raw != 0;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec8ib
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec8ib operator & (Vec8ib const & a, Vec8ib const & b) {
+    // AND on the raw 256-bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ib(lhs & rhs);
+}
+static inline Vec8ib operator && (Vec8ib const & a, Vec8ib const & b) {
+    // Same result as the bitwise operator &
+    Vec8ib const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and
+static inline Vec8ib & operator &= (Vec8ib & a, Vec8ib const & b) {
+    // In-place AND; returns the updated left operand
+    return a = a & b;
+}
+
+// vector operator | : bitwise or
+static inline Vec8ib operator | (Vec8ib const & a, Vec8ib const & b) {
+    // OR on the raw 256-bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ib(lhs | rhs);
+}
+static inline Vec8ib operator || (Vec8ib const & a, Vec8ib const & b) {
+    // Same result as the bitwise operator |
+    Vec8ib const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or
+static inline Vec8ib & operator |= (Vec8ib & a, Vec8ib const & b) {
+    // In-place OR; returns the updated left operand
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8ib operator ^ (Vec8ib const & a, Vec8ib const & b) {
+    // XOR on the raw 256-bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ib(lhs ^ rhs);
+}
+// vector operator ^= : bitwise xor
+static inline Vec8ib & operator ^= (Vec8ib & a, Vec8ib const & b) {
+    // In-place XOR; returns the updated left operand
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8ib operator ~ (Vec8ib const & a) {
+    // Invert all 256 bits through the raw bit-vector view
+    Vec256b const bits(a);
+    return Vec8ib(~bits);
+}
+
+// vector operator ! : element not
+static inline Vec8ib operator ! (Vec8ib const & a) {
+    // Implemented as the bitwise complement of the boolean vector
+    Vec8ib const inverted = ~ a;
+    return inverted;
+}
+
+// vector function andnot
+static inline Vec8ib andnot (Vec8ib const & a, Vec8ib const & b) {
+    // Forwards to the Vec256b overload of andnot on the raw bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ib(andnot(lhs, rhs));
+}
+
+
+/*****************************************************************************
+*
+*          Operators for Vec8i
+*
+*****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec8i operator + (Vec8i const & a, Vec8i const & b) {
+    // 32-bit element-wise add
+    __m256i const sum = _mm256_add_epi32(a, b);
+    return sum;
+}
+
+// vector operator += : add
+static inline Vec8i & operator += (Vec8i & a, Vec8i const & b) {
+    // In-place add; returns the updated left operand
+    return a = a + b;
+}
+
+// postfix operator ++
+static inline Vec8i operator ++ (Vec8i & a, int) {
+    // Postfix form: hand back the value from before the increment
+    Vec8i const before = a;
+    a = a + 1;
+    return before;
+}
+
+// prefix operator ++
+static inline Vec8i & operator ++ (Vec8i & a) {
+    // Prefix form: add 1 to every element and return the updated vector
+    return a = a + 1;
+}
+
+// vector operator - : subtract element by element
+static inline Vec8i operator - (Vec8i const & a, Vec8i const & b) {
+    // 32-bit element-wise subtract
+    __m256i const diff = _mm256_sub_epi32(a, b);
+    return diff;
+}
+
+// vector operator - : unary minus
+static inline Vec8i operator - (Vec8i const & a) {
+    // Negate by subtracting from zero
+    __m256i const zero    = _mm256_setzero_si256();
+    __m256i const negated = _mm256_sub_epi32(zero, a);
+    return negated;
+}
+
+// vector operator -= : subtract
+static inline Vec8i & operator -= (Vec8i & a, Vec8i const & b) {
+    // In-place subtract; returns the updated left operand
+    return a = a - b;
+}
+
+// postfix operator --
+static inline Vec8i operator -- (Vec8i & a, int) {
+    // Postfix form: hand back the value from before the decrement
+    Vec8i const before = a;
+    a = a - 1;
+    return before;
+}
+
+// prefix operator --
+static inline Vec8i & operator -- (Vec8i & a) {
+    // Prefix form: subtract 1 from every element and return the updated vector
+    return a = a - 1;
+}
+
+// vector operator * : multiply element by element
+static inline Vec8i operator * (Vec8i const & a, Vec8i const & b) {
+    // Low 32 bits of each element-wise product
+    __m256i const product = _mm256_mullo_epi32(a, b);
+    return product;
+}
+
+// vector operator *= : multiply
+static inline Vec8i & operator *= (Vec8i & a, Vec8i const & b) {
+    // In-place multiply; returns the updated left operand
+    return a = a * b;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+
+// vector operator << : shift left
+static inline Vec8i operator << (Vec8i const & a, int32_t b) {
+    // The shift count is passed in the low bits of an xmm register
+    __m128i const count   = _mm_cvtsi32_si128(b);
+    __m256i const shifted = _mm256_sll_epi32(a, count);
+    return shifted;
+}
+
+// vector operator <<= : shift left
+static inline Vec8i & operator <<= (Vec8i & a, int32_t b) {
+    // In-place shift left; returns the updated left operand
+    return a = a << b;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec8i operator >> (Vec8i const & a, int32_t b) {
+    // Arithmetic shift: the sign bit is replicated into the vacated positions
+    __m128i const count   = _mm_cvtsi32_si128(b);
+    __m256i const shifted = _mm256_sra_epi32(a, count);
+    return shifted;
+}
+
+// vector operator >>= : shift right arithmetic
+static inline Vec8i & operator >>= (Vec8i & a, int32_t b) {
+    // In-place arithmetic shift right; returns the updated left operand
+    return a = a >> b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec8ib operator == (Vec8i const & a, Vec8i const & b) {
+    // All bits set in each element where a equals b, zero elsewhere
+    __m256i const equal = _mm256_cmpeq_epi32(a, b);
+    return equal;
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec8ib operator != (Vec8i const & a, Vec8i const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comneq_epi32(a,b);
+#else  // no direct not-equal compare: invert the equality mask
+    Vec8ib const equal = (a == b);
+    return Vec8ib(Vec8i(~equal));
+#endif
+}
+  
+// vector operator > : returns true for elements for which a > b
+static inline Vec8ib operator > (Vec8i const & a, Vec8i const & b) {
+    // Signed 32-bit greater-than compare
+    __m256i const greater = _mm256_cmpgt_epi32(a, b);
+    return greater;
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec8ib operator < (Vec8i const & a, Vec8i const & b) {
+    // a < b is the mirrored form of b > a
+    Vec8ib const lt = (b > a);
+    return lt;
+}
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec8ib operator >= (Vec8i const & a, Vec8i const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comge_epi32(a,b);
+#else  // a >= b is the complement of b > a
+    Vec8ib const a_lt_b = (b > a);
+    return Vec8ib(Vec8i(~a_lt_b));
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec8ib operator <= (Vec8i const & a, Vec8i const & b) {
+    // a <= b is the mirrored form of b >= a
+    Vec8ib const le = (b >= a);
+    return le;
+}
+
+// vector operator & : bitwise and
+static inline Vec8i operator & (Vec8i const & a, Vec8i const & b) {
+    // AND on the raw 256-bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8i(lhs & rhs);
+}
+static inline Vec8i operator && (Vec8i const & a, Vec8i const & b) {
+    // Same result as the bitwise operator &
+    Vec8i const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and
+static inline Vec8i & operator &= (Vec8i & a, Vec8i const & b) {
+    // In-place AND; returns the updated left operand
+    return a = a & b;
+}
+
+// vector operator | : bitwise or
+static inline Vec8i operator | (Vec8i const & a, Vec8i const & b) {
+    // OR on the raw 256-bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8i(lhs | rhs);
+}
+static inline Vec8i operator || (Vec8i const & a, Vec8i const & b) {
+    // Same result as the bitwise operator |
+    Vec8i const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or
+static inline Vec8i & operator |= (Vec8i & a, Vec8i const & b) {
+    // In-place OR; returns the updated left operand
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8i operator ^ (Vec8i const & a, Vec8i const & b) {
+    // XOR on the raw 256-bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8i(lhs ^ rhs);
+}
+// vector operator ^= : bitwise xor
+static inline Vec8i & operator ^= (Vec8i & a, Vec8i const & b) {
+    // In-place XOR; returns the updated left operand
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8i operator ~ (Vec8i const & a) {
+    // Invert all 256 bits through the raw bit-vector view
+    Vec256b const bits(a);
+    return Vec8i(~bits);
+}
+
+// vector operator ! : returns true for elements == 0
+static inline Vec8ib operator ! (Vec8i const & a) {
+    // Compare every element against zero
+    __m256i const zero    = _mm256_setzero_si256();
+    __m256i const is_zero = _mm256_cmpeq_epi32(a, zero);
+    return is_zero;
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8i select (Vec8ib const & s, Vec8i const & a, Vec8i const & b) {
+    // Delegates to the bitwise select: a where s is set, b elsewhere
+    __m256i const picked = selectb(s,a,b);
+    return picked;
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8i if_add (Vec8ib const & f, Vec8i const & a, Vec8i const & b) {
+    // Masking b with f zeroes the addend wherever f is false
+    Vec8i const addend = Vec8i(f) & b;
+    return a + addend;
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int32_t horizontal_add (Vec8i const & a) {
+    // Two pairwise-add steps reduce each 128-bit lane (4 elements) to one sum
+    __m256i const pairs = _mm256_hadd_epi32(a,a);
+    __m256i const lanes = _mm256_hadd_epi32(pairs,pairs);
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i const upper = _mm256_extractf128_si256(lanes,1);              // bug in MS VS 11
+#else
+    __m128i const upper = _mm256_extracti128_si256(lanes,1);              // high lane
+#endif
+    __m128i const total = _mm_add_epi32(_mm256_castsi256_si128(lanes),upper); // low lane + high lane
+    return _mm_cvtsi128_si32(total);
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are sign extended before adding to avoid overflow
+// static inline int64_t horizontal_add_x (Vec8i const & a); // defined below
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec8i add_saturated(Vec8i const & a, Vec8i const & b) {
+    __m256i const wrapped   = _mm256_add_epi32(a, b);                 // a + b with wrap-around
+    __m256i const sign_ab   = _mm256_xor_si256(a, b);                 // sign bit set if a and b differ in sign
+    __m256i const sign_as   = _mm256_xor_si256(a, wrapped);           // sign bit set if a and the sum differ in sign
+    // Overflow: operands have the same sign but the sum has the other sign
+    __m256i const ovf_bit   = _mm256_andnot_si256(sign_ab,sign_as);
+    __m256i const ovf_mask  = _mm256_srai_epi32(ovf_bit,31);          // -1 if overflow, else 0
+    __m256i const a_is_neg  = _mm256_srli_epi32(a,31);                // 1 if a < 0, else 0
+    __m256i const int_max   = _mm256_srli_epi32(ovf_mask,1);          // 7FFFFFFF if overflow
+    __m256i const limit     = _mm256_add_epi32(int_max,a_is_neg);     // 7FFFFFFF for positive, 80000000 for negative overflow
+    return  selectb(ovf_mask,limit,wrapped);                          // limit where overflow, else the plain sum
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec8i sub_saturated(Vec8i const & a, Vec8i const & b) {
+    __m256i const wrapped   = _mm256_sub_epi32(a, b);                 // a - b with wrap-around
+    __m256i const sign_ab   = _mm256_xor_si256(a, b);                 // sign bit set if a and b differ in sign
+    __m256i const sign_ad   = _mm256_xor_si256(a, wrapped);           // sign bit set if a and the difference differ in sign
+    // Overflow: operands differ in sign and the difference has the other sign than a
+    __m256i const ovf_bit   = _mm256_and_si256(sign_ab,sign_ad);
+    __m256i const ovf_mask  = _mm256_srai_epi32(ovf_bit,31);          // -1 if overflow, else 0
+    __m256i const a_is_neg  = _mm256_srli_epi32(a,31);                // 1 if a < 0, else 0
+    __m256i const int_max   = _mm256_srli_epi32(ovf_mask,1);          // 7FFFFFFF if overflow
+    __m256i const limit     = _mm256_add_epi32(int_max,a_is_neg);     // 7FFFFFFF for positive, 80000000 for negative overflow
+    return  selectb(ovf_mask,limit,wrapped);                          // limit where overflow, else the plain difference
+}
+
+// function max: a > b ? a : b
+static inline Vec8i max(Vec8i const & a, Vec8i const & b) {
+    // Element-wise signed 32-bit maximum
+    __m256i const larger = _mm256_max_epi32(a,b);
+    return larger;
+}
+
+// function min: a < b ? a : b
+static inline Vec8i min(Vec8i const & a, Vec8i const & b) {
+    // Element-wise signed 32-bit minimum
+    __m256i const smaller = _mm256_min_epi32(a,b);
+    return smaller;
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec8i abs(Vec8i const & a) {
+    // Applying the sign of a to a itself negates exactly the negative elements
+    __m256i const magnitude = _mm256_sign_epi32(a,a);
+    return magnitude;
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec8i abs_saturated(Vec8i const & a) {
+    __m256i const magnitude = abs(a);                                 // still negative only for 0x80000000
+    __m256i const minus_one = _mm256_srai_epi32(magnitude,31);        // -1 where the result is still negative, else 0
+    return _mm256_add_epi32(magnitude,minus_one);                     // 0x80000000 becomes 0x7FFFFFFF
+}
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec8i rotate_left(Vec8i const & a, int b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_rot_epi32(a,_mm_set1_epi32(b));
+#else  // compose the rotation from two opposite logical shifts
+    __m128i const count_up   = _mm_cvtsi32_si128(b & 0x1F);           // b modulo 32
+    __m128i const count_down = _mm_cvtsi32_si128((32-b) & 0x1F);      // (32 - b) modulo 32
+    __m256i const high_part  = _mm256_sll_epi32(a,count_up);          // bits moved towards the top
+    __m256i const low_part   = _mm256_srl_epi32(a,count_down);        // bits wrapped around to the bottom
+    return _mm256_or_si256(high_part,low_part);
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 8 32-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec8ui : public Vec8i {
+public:
+    // Default constructor
+    Vec8ui() {
+    }
+    // Broadcast constructor: every element receives the value i
+    Vec8ui(uint32_t i) {
+        ymm = _mm256_set1_epi32(i);
+    }
+    // Element-wise constructor: i0 is element 0, i7 is element 7
+    Vec8ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7) {
+        ymm = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7);
+    }
+    // Constructor joining two Vec4ui into one 256-bit vector
+    Vec8ui(Vec4ui const & a0, Vec4ui const & a1) {
+        ymm = set_m128ir(a0, a1);
+    }
+    // Conversion from the intrinsic type __m256i
+    Vec8ui(__m256i const & x) {
+        ymm = x;
+    }
+    // Assignment from the intrinsic type __m256i
+    Vec8ui & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Load 8 elements from an array; no alignment required
+    Vec8ui & load(void const * p) {
+        ymm = _mm256_loadu_si256((__m256i const*)p);
+        return *this;
+    }
+    // Load 8 elements from an array that is aligned by 32
+    Vec8ui & load_a(void const * p) {
+        ymm = _mm256_load_si256((__m256i const*)p);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8ui const & insert(uint32_t index, uint32_t value) {
+        // the bit pattern is the same for signed and unsigned, so reuse the Vec8i version
+        Vec8i::insert(index, value);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    uint32_t extract(uint32_t index) const {
+        return Vec8i::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint32_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Low 128 bits as Vec4ui
+    Vec4ui get_low() const {
+        return _mm256_castsi256_si128(ymm);
+    }
+    // High 128 bits as Vec4ui
+    Vec4ui get_high() const {
+        return _mm256_extractf128_si256(ymm,1);
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec8ui operator + (Vec8ui const & a, Vec8ui const & b) {
+    // Wrap-around addition gives the same bits for signed and unsigned
+    Vec8i const lhs(a);
+    Vec8i const rhs(b);
+    return Vec8ui (lhs + rhs);
+}
+
+// vector operator - : subtract
+static inline Vec8ui operator - (Vec8ui const & a, Vec8ui const & b) {
+    // Wrap-around subtraction gives the same bits for signed and unsigned
+    Vec8i const lhs(a);
+    Vec8i const rhs(b);
+    return Vec8ui (lhs - rhs);
+}
+
+// vector operator * : multiply
+static inline Vec8ui operator * (Vec8ui const & a, Vec8ui const & b) {
+    // The low 32 bits of a product are the same for signed and unsigned
+    Vec8i const lhs(a);
+    Vec8i const rhs(b);
+    return Vec8ui (lhs * rhs);
+}
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : shift right logical all elements
+static inline Vec8ui operator >> (Vec8ui const & a, uint32_t b) {
+    // Logical shift: zeroes are shifted in from the top
+    __m128i const count   = _mm_cvtsi32_si128(b);
+    __m256i const shifted = _mm256_srl_epi32(a,count);
+    return shifted;
+}
+
+// vector operator >> : shift right logical all elements
+static inline Vec8ui operator >> (Vec8ui const & a, int32_t b) {
+    // Forward to the uint32_t overload
+    uint32_t const count = (uint32_t)b;
+    return a >> count;
+}
+
+// vector operator >>= : shift right logical
+static inline Vec8ui & operator >>= (Vec8ui & a, uint32_t b) {
+    // In-place logical shift right; returns the updated left operand
+    return a = a >> b;
+}
+
+// vector operator << : shift left all elements
+static inline Vec8ui operator << (Vec8ui const & a, uint32_t b) {
+    // Left shift is identical for signed and unsigned: reuse the Vec8i operator
+    Vec8i const bits = (Vec8i)a;
+    return Vec8ui (bits << (int32_t)b);
+}
+
+// vector operator << : shift left all elements
+static inline Vec8ui operator << (Vec8ui const & a, int32_t b) {
+    // Left shift is identical for signed and unsigned: reuse the Vec8i operator
+    Vec8i const bits = (Vec8i)a;
+    return Vec8ui (bits << (int32_t)b);
+}
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec8ib operator > (Vec8ui const & a, Vec8ui const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comgt_epu32(a,b);
+#else  // AVX2 instruction set
+    // Flipping the top bit of both operands maps unsigned order onto signed order
+    __m256i const top_bit  = _mm256_set1_epi32(0x80000000);
+    __m256i const a_biased = _mm256_xor_si256(a,top_bit);
+    __m256i const b_biased = _mm256_xor_si256(b,top_bit);
+    return _mm256_cmpgt_epi32(a_biased,b_biased);             // signed compare
+#endif
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec8ib operator < (Vec8ui const & a, Vec8ui const & b) {
+    // a < b is the mirrored form of b > a
+    Vec8ib const lt = (b > a);
+    return lt;
+}
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec8ib operator >= (Vec8ui const & a, Vec8ui const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comge_epu32(a,b);
+#else
+    // a >= b (unsigned) holds exactly where a equals the unsigned maximum of a and b
+    __m256i const larger = _mm256_max_epu32(a,b);
+    __m256i const is_ge  = _mm256_cmpeq_epi32(a,larger);
+    return is_ge;
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec8ib operator <= (Vec8ui const & a, Vec8ui const & b) {
+    // a <= b is the mirrored form of b >= a
+    Vec8ib const le = (b >= a);
+    return le;
+}
+
+// vector operator & : bitwise and
+static inline Vec8ui operator & (Vec8ui const & a, Vec8ui const & b) {
+    // AND on the raw 256-bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ui(lhs & rhs);
+}
+static inline Vec8ui operator && (Vec8ui const & a, Vec8ui const & b) {
+    // Same result as the bitwise operator &
+    Vec8ui const both = a & b;
+    return both;
+}
+
+// vector operator | : bitwise or
+static inline Vec8ui operator | (Vec8ui const & a, Vec8ui const & b) {
+    // OR on the raw 256-bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ui(lhs | rhs);
+}
+static inline Vec8ui operator || (Vec8ui const & a, Vec8ui const & b) {
+    // Same result as the bitwise operator |
+    Vec8ui const either = a | b;
+    return either;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec8ui operator ^ (Vec8ui const & a, Vec8ui const & b) {
+    // XOR on the raw 256-bit view
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ui(lhs ^ rhs);
+}
+
+// vector operator ~ : bitwise not
+static inline Vec8ui operator ~ (Vec8ui const & a) {
+    // Invert all 256 bits through the raw bit-vector view
+    Vec256b const bits(a);
+    return Vec8ui(~bits);
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8ui select (Vec8ib const & s, Vec8ui const & a, Vec8ui const & b) {
+    // Delegates to the bitwise select: a where s is set, b elsewhere
+    __m256i const picked = selectb(s,a,b);
+    return picked;
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8ui if_add (Vec8ib const & f, Vec8ui const & a, Vec8ui const & b) {
+    // Masking b with f zeroes the addend wherever f is false
+    Vec8ui const addend = Vec8ui(f) & b;
+    return a + addend;
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add (Vec8ui const & a) {
+    // A wrap-around sum has the same bits for signed and unsigned: reuse the Vec8i version
+    Vec8i const as_signed = (Vec8i)a;
+    return horizontal_add(as_signed);
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are zero extended before adding to avoid overflow
+// static inline uint64_t horizontal_add_x (Vec8ui const & a); // defined later
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec8ui add_saturated(Vec8ui const & a, Vec8ui const & b) {
+    Vec8ui const wrapped  = a + b;                         // sum with wrap-around
+    Vec8ui const any_bits = Vec8ui(a | b);
+    // A wrapped sum is smaller than (a | b); the compare yields an all-ones mask there
+    Vec8ui const ovf_mask = Vec8ui(wrapped < any_bits);
+    return Vec8ui (wrapped | ovf_mask);                    // 0xFFFFFFFF where the sum overflowed
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec8ui sub_saturated(Vec8ui const & a, Vec8ui const & b) {
+    Vec8ui const wrapped  = a - b;                         // difference with wrap-around
+    // A wrapped difference is larger than a; the compare yields an all-ones mask there
+    Vec8ui const unf_mask = Vec8ui(wrapped > a);
+    return _mm256_andnot_si256(unf_mask,wrapped);          // 0 where the difference underflowed
+}
+
+// function max: a > b ? a : b
+static inline Vec8ui max(Vec8ui const & a, Vec8ui const & b) {
+    // Element-wise unsigned 32-bit maximum
+    __m256i const larger = _mm256_max_epu32(a,b);
+    return larger;
+}
+
+// function min: a < b ? a : b
+static inline Vec8ui min(Vec8ui const & a, Vec8ui const & b) {
+    // Element-wise unsigned 32-bit minimum
+    __m256i const smaller = _mm256_min_epu32(a,b);
+    return smaller;
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 4 64-bit signed integers
+*
+*****************************************************************************/
+
+class Vec4q : public Vec256b {
+public:
+    // Default constructor:
+    Vec4q() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec4q(int64_t i) {
+#if defined (_MSC_VER) && ! defined (__x86_64__) && ! defined(__INTEL_COMPILER)
+        // MS compiler cannot use _mm256_set1_epi64x in 32 bit mode, and  
+        // cannot put 64-bit values into xmm register without using
+        // mmx registers, and it makes no emms
+        union {
+            int64_t q[4];
+            int32_t r[8];
+        } u;
+        u.q[0] = u.q[1] = u.q[2] = u.q[3] = i;
+        ymm = _mm256_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3], u.r[4], u.r[5], u.r[6], u.r[7]);
+#else
+        ymm = _mm256_set1_epi64x(i);
+#endif
+    }
+    // Constructor to build from all elements:
+    Vec4q(int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+#if defined (_MSC_VER) && ! defined (__x86_64__) && ! defined(__INTEL_COMPILER)
+        // MS compiler cannot put 64-bit values into xmm register without using
+        // mmx registers, and it makes no emms
+        union {
+            int64_t q[4];
+            int32_t r[8];
+        } u;
+        u.q[0] = i0;  u.q[1] = i1;  u.q[2] = i2;  u.q[3] = i3;
+        ymm = _mm256_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3], u.r[4], u.r[5], u.r[6], u.r[7]);
+#else
+        ymm = _mm256_setr_epi64x(i0, i1, i2, i3);
+#endif
+    }
+    // Constructor to build from two Vec2q:
+    Vec4q(Vec2q const & a0, Vec2q const & a1) {
+        ymm = set_m128ir(a0, a1);
+    }
+    // Constructor to convert from type __m256i used in intrinsics:
+    Vec4q(__m256i const & x) {
+        ymm = x;
+    }
+    // Assignment operator to convert from type __m256i used in intrinsics:
+    Vec4q & operator = (__m256i const & x) {
+        ymm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m256i used in intrinsics
+    operator __m256i() const {
+        return ymm;
+    }
+    // Member function to load from array (unaligned)
+    Vec4q & load(void const * p) {
+        ymm = _mm256_loadu_si256((__m256i const*)p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    Vec4q & load_a(void const * p) {
+        ymm = _mm256_load_si256((__m256i const*)p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    Vec4q & load_partial(int n, void const * p) {
+        if (n <= 0) {
+            *this = 0;
+        }
+        else if (n <= 2) {
+            *this = Vec4q(Vec2q().load_partial(n, p), 0);
+        }
+        else if (n < 4) {
+            *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n-2, (int64_t*)p+2));
+        }
+        else {
+            load(p);
+        }
+        return *this;
+    }
+    // Partial store. Writes the first n elements to p; nothing is written when n <= 0
+    void store_partial(int n, void * p) const {
+        if (n >= 4) {
+            store(p);
+        }
+        else if (n > 2) {
+            get_low().store(p);
+            get_high().store_partial(n-2, (int64_t*)p+2);
+        }
+        else if (n > 0) {
+            get_low().store_partial(n, p);
+        }
+        else {
+            return;
+        }
+    }
+    // cut off vector to n elements. The last 4-n elements are set to zero
+    Vec4q & cutoff(int n) {
+        *this = Vec32c(*this).cutoff(n * 8);   // delegates to Vec32c::cutoff with n * 8 (8 bytes per 64-bit element)
+        return *this;
+    }
+    // Member function to change a single element in vector (an index above 3 changes nothing)
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec4q const & insert(uint32_t index, int64_t value) {
+        Vec4q const filled(value);          // vector built from value; blended in below
+        switch (index) {                    // each mask picks the two dwords of one qword
+        case 0:
+            ymm = _mm256_blend_epi32(ymm, filled, 0x03);  break;
+        case 1:
+            ymm = _mm256_blend_epi32(ymm, filled, 0x0C);  break;
+        case 2:
+            ymm = _mm256_blend_epi32(ymm, filled, 0x30);  break;
+        case 3:
+            ymm = _mm256_blend_epi32(ymm, filled, 0xC0);  break;
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector (only the low 2 bits of index are used)
+    int64_t extract(uint32_t index) const {
+        int64_t elements[4];
+        store(elements);
+        return elements[index & 3];
+    }
+    // Read one element; forwards to extract(). Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int64_t operator [] (uint32_t index) const {
+        return this->extract(index);
+    }
+    // Split into two Vec2q: get_low returns the lower 128 bits
+    Vec2q get_low() const {
+        return _mm256_castsi256_si128(this->ymm);
+    }
+    Vec2q get_high() const {    // upper 128 bits
+        return _mm256_extractf128_si256(this->ymm, 1);
+    }
+    static int size() {         // number of elements in the vector
+        return 4;
+    }
+};
+
+/*****************************************************************************
+*
+*          Vec4qb: Vector of 4 Booleans for use with Vec4q and Vec4uq
+*
+*****************************************************************************/
+
+class Vec4qb : public Vec4q {
+public:
+    // Default constructor:
+    Vec4qb() {
+    }
+    // Constructor to build from all elements (each bool is stored negated: true -> -1, false -> 0):
+    Vec4qb(bool x0, bool x1, bool x2, bool x3) :
+        Vec4q(-static_cast<int64_t>(x0), -static_cast<int64_t>(x1), -static_cast<int64_t>(x2), -static_cast<int64_t>(x3)) {
+    }
+    // Constructor to convert from type __m256i used in intrinsics:
+    Vec4qb(__m256i const & x) {
+        this->ymm = x;
+    }
+    // Assignment operator to convert from type __m256i used in intrinsics:
+    Vec4qb & operator = (__m256i const & x) {
+        this->ymm = x;
+        return *this;
+    }
+    // Constructor to broadcast scalar value:
+    Vec4qb(bool b) : Vec4q(-static_cast<int64_t>(b)) {
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec4qb & operator = (bool b) {
+        *this = Vec4qb(b);
+        return *this;
+    }
+private: // Declared but not defined here: prevents constructing from int, etc.
+    Vec4qb(int b);
+    Vec4qb & operator = (int x);
+public:
+    // Member functions to split into two Vec2qb:
+    Vec2qb get_low() const {
+        return Vec2qb(Vec4q::get_low());
+    }
+    Vec2qb get_high() const {
+        return Vec2qb(Vec4q::get_high());
+    }
+    Vec4qb & insert (int index, bool a) {
+        Vec4q::insert(index, -static_cast<int64_t>(a));
+        return *this;
+    }
+    // Member function extract a single element from vector (true when the element is non-zero)
+    bool extract(uint32_t index) const {
+        return Vec4q::extract(index) != 0;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return this->extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec4qb
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec4qb operator & (Vec4qb const & a, Vec4qb const & b) {
+    return Vec4qb(Vec256b(a) & Vec256b(b));    // both operands are converted to Vec256b for the AND
+}
+static inline Vec4qb operator && (Vec4qb const & a, Vec4qb const & b) {
+    return a & b;                              // same as operator &; both operands are always evaluated
+}
+// vector operator &= : bitwise and, result stored in a
+static inline Vec4qb & operator &= (Vec4qb & a, Vec4qb const & b) {
+    // the assignment expression itself yields the reference to a
+    return a = a & b;
+}
+
+// vector operator | : bitwise or
+static inline Vec4qb operator | (Vec4qb const & a, Vec4qb const & b) {
+    return Vec4qb(Vec256b(a) | Vec256b(b));    // both operands are converted to Vec256b for the OR
+}
+static inline Vec4qb operator || (Vec4qb const & a, Vec4qb const & b) {
+    return a | b;                              // same as operator |; both operands are always evaluated
+}
+// vector operator |= : bitwise or, result stored in a
+static inline Vec4qb & operator |= (Vec4qb & a, Vec4qb const & b) {
+    // the assignment expression itself yields the reference to a
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4qb operator ^ (Vec4qb const & a, Vec4qb const & b) {
+    return Vec4qb(Vec256b(a) ^ Vec256b(b));    // both operands are converted to Vec256b for the XOR
+}
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec4qb & operator ^= (Vec4qb & a, Vec4qb const & b) {
+    // the assignment expression itself yields the reference to a
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4qb operator ~ (Vec4qb const & a) {
+    return Vec4qb( ~ Vec256b(a));              // complement is taken on the Vec256b conversion of a
+}
+
+// vector operator ! : element not (implemented as the bitwise complement)
+static inline Vec4qb operator ! (Vec4qb const & a) {
+    return ~ a;
+}
+
+// vector function andnot: forwards to andnot for Vec256b, operands passed in the same order
+static inline Vec4qb andnot (Vec4qb const & a, Vec4qb const & b) {
+    return Vec4qb(andnot(Vec256b(a), Vec256b(b)));
+}
+
+
+
+
+/*****************************************************************************
+*
+*          Operators for Vec4q
+*
+*****************************************************************************/
+
+// vector operator + : add element by element (64-bit lanes)
+static inline Vec4q operator + (Vec4q const & a, Vec4q const & b) {
+    return _mm256_add_epi64(a, b);
+}
+
+// vector operator += : add, result stored in a
+static inline Vec4q & operator += (Vec4q & a, Vec4q const & b) {
+    // the assignment expression itself yields the reference to a
+    return a = a + b;
+}
+
+// postfix operator ++ : adds 1 to every element of a, returns the value a had before
+static inline Vec4q operator ++ (Vec4q & a, int) {
+    Vec4q const before = a;
+    a = a + 1;
+    return before;
+}
+
+// prefix operator ++ : adds 1 to every element of a
+static inline Vec4q & operator ++ (Vec4q & a) {
+    // returns a itself, already incremented
+    return a = a + 1;
+}
+
+// vector operator - : subtract element by element (64-bit lanes)
+static inline Vec4q operator - (Vec4q const & a, Vec4q const & b) {
+    return _mm256_sub_epi64(a, b);
+}
+
+// vector operator - : unary minus, computed as 0 - a
+static inline Vec4q operator - (Vec4q const & a) {
+    return _mm256_sub_epi64(_mm256_setzero_si256(), a);
+}
+
+// vector operator -= : subtract, result stored in a
+static inline Vec4q & operator -= (Vec4q & a, Vec4q const & b) {
+    // the assignment expression itself yields the reference to a
+    return a = a - b;
+}
+
+// postfix operator -- : subtracts 1 from every element of a, returns the value a had before
+static inline Vec4q operator -- (Vec4q & a, int) {
+    Vec4q const before = a;
+    a = a - 1;
+    return before;
+}
+
+// prefix operator -- : subtracts 1 from every element of a
+static inline Vec4q & operator -- (Vec4q & a) {
+    // returns a itself, already decremented
+    return a = a - 1;
+}
+
+// vector operator * : multiply element by element (the H*H partial product is not computed)
+static inline Vec4q operator * (Vec4q const & a, Vec4q const & b) {
+    // instruction does not exist. Split into 32-bit multiplies (comments show one 128-bit half)
+    __m256i bswap   = _mm256_shuffle_epi32(b,0xB1);           // swap H<->L
+    __m256i prodlh  = _mm256_mullo_epi32(a,bswap);            // 32 bit L*H products
+    __m256i zero    = _mm256_setzero_si256();                 // 0
+    __m256i prodlh2 = _mm256_hadd_epi32(prodlh,zero);         // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
+    __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2,0x73);     // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
+    __m256i prodll  = _mm256_mul_epu32(a,b);                  // a0Lb0L,a1Lb1L, 64 bit unsigned products
+    __m256i prod    = _mm256_add_epi64(prodll,prodlh3);       // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
+    return  prod;
+}
+
+// vector operator *= : multiply, result stored in a
+static inline Vec4q & operator *= (Vec4q & a, Vec4q const & b) {
+    // the assignment expression itself yields the reference to a
+    return a = a * b;
+}
+
+// vector operator << : shift left, every element by the same count b
+static inline Vec4q operator << (Vec4q const & a, int32_t b) {
+    return _mm256_sll_epi64(a, _mm_cvtsi32_si128(b));
+}
+
+// vector operator <<= : shift left, result stored in a
+static inline Vec4q & operator <<= (Vec4q & a, int32_t b) {
+    // the assignment expression itself yields the reference to a
+    return a = a << b;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec4q operator >> (Vec4q const & a, int32_t b) {
+    // instruction does not exist. Split into 32-bit shifts
+    // mask for the high dword of every qword, shared by both branches
+    __m256i const hi_dwords = constant8i<0,-1,0,-1,0,-1,0,-1>();
+    if (b > 32) {
+        __m128i count   = _mm_cvtsi32_si128(b-32);             // b - 32
+        __m256i signs   = _mm256_srai_epi32(a,31);             // sign of a
+        __m256i hi_sra  = _mm256_sra_epi32(a,count);           // a >> (b-32) signed dwords
+        __m256i lo_part = _mm256_srli_epi64(hi_sra,32);        // a >> (b-32) >> 32 (second shift unsigned qword)
+        return  selectb(hi_dwords, signs, lo_part);            // high part contains only sign
+    }
+    else {  // b <= 32
+        __m128i count   = _mm_cvtsi32_si128(b);                // b
+        __m256i hi_sra  = _mm256_sra_epi32(a,count);           // a >> b signed dwords
+        __m256i lo_srl  = _mm256_srl_epi64(a,count);           // a >> b unsigned qwords
+        return  selectb(hi_dwords, hi_sra, lo_srl);            // signed high part, unsigned low part
+    }
+}
+
+// vector operator >>= : shift right arithmetic, result stored in a
+static inline Vec4q & operator >>= (Vec4q & a, int32_t b) {
+    // the assignment expression itself yields the reference to a
+    return a = a >> b;
+}
+
+// vector operator == : returns true for elements for which a == b (64-bit compare)
+static inline Vec4qb operator == (Vec4q const & a, Vec4q const & b) {
+    return _mm256_cmpeq_epi64(a, b);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec4qb operator != (Vec4q const & a, Vec4q const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comneq_epi64(a,b);
+#else  // no XOP2: a != b is computed as not (a == b)
+    return Vec4qb(Vec4q(~(a == b)));
+#endif
+}
+  
+// vector operator < : returns true for elements for which a < b (computed as b > a)
+static inline Vec4qb operator < (Vec4q const & a, Vec4q const & b) {
+    return _mm256_cmpgt_epi64(b, a);
+}
+
+// vector operator > : returns true for elements for which a > b (operator < with operands swapped)
+static inline Vec4qb operator > (Vec4q const & a, Vec4q const & b) {
+    return b < a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec4qb operator >= (Vec4q const & a, Vec4q const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comge_epi64(a,b);
+#else  // no XOP2: a >= b is computed as not (a < b)
+    return Vec4qb(Vec4q(~(a < b)));
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed); operator >= with operands swapped
+static inline Vec4qb operator <= (Vec4q const & a, Vec4q const & b) {
+    return b >= a;
+}
+
+// vector operator & : bitwise and
+static inline Vec4q operator & (Vec4q const & a, Vec4q const & b) {
+    return Vec4q(Vec256b(a) & Vec256b(b));     // both operands are converted to Vec256b for the AND
+}
+static inline Vec4q operator && (Vec4q const & a, Vec4q const & b) {
+    return a & b;                              // same as operator &; both operands are always evaluated
+}
+// vector operator &= : bitwise and, result stored in a
+static inline Vec4q & operator &= (Vec4q & a, Vec4q const & b) {
+    // the assignment expression itself yields the reference to a
+    return a = a & b;
+}
+
+// vector operator | : bitwise or
+static inline Vec4q operator | (Vec4q const & a, Vec4q const & b) {
+    return Vec4q(Vec256b(a) | Vec256b(b));     // both operands are converted to Vec256b for the OR
+}
+static inline Vec4q operator || (Vec4q const & a, Vec4q const & b) {
+    return a | b;                              // same as operator |; both operands are always evaluated
+}
+// vector operator |= : bitwise or, result stored in a
+static inline Vec4q & operator |= (Vec4q & a, Vec4q const & b) {
+    // the assignment expression itself yields the reference to a
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4q operator ^ (Vec4q const & a, Vec4q const & b) {
+    return Vec4q(Vec256b(a) ^ Vec256b(b));     // both operands are converted to Vec256b for the XOR
+}
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec4q & operator ^= (Vec4q & a, Vec4q const & b) {
+    // the assignment expression itself yields the reference to a
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec4q operator ~ (Vec4q const & a) {
+    return Vec4q( ~ Vec256b(a));               // complement is taken on the Vec256b conversion of a
+}
+
+// vector operator ! : logical not, returns true for elements == 0 (compares a with an all-zero vector)
+static inline Vec4qb operator ! (Vec4q const & a) {
+    return a == Vec4q(_mm256_setzero_si256());
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed). The work is delegated to selectb with the same argument order.
+static inline Vec4q select (Vec4qb const & s, Vec4q const & a, Vec4q const & b) {
+    return selectb(s,a,b);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4q if_add (Vec4qb const & f, Vec4q const & a, Vec4q const & b) {
+    return a + (Vec4q(f) & b);                 // f is used as a bit mask on b before the add
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline int64_t horizontal_add (Vec4q const & a) {
+    __m256i sum1  = _mm256_shuffle_epi32(a,0x0E);                     // high element of each 128-bit half moved to low position
+    __m256i sum2  = _mm256_add_epi64(a,sum1);                         // sum of the two elements of each half (in low qword)
+#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER)
+    __m128i sum3  = _mm256_extractf128_si256(sum2, 1);                // bug in MS compiler VS 11
+#else
+    __m128i sum3  = _mm256_extracti128_si256(sum2, 1);                // get high part
+#endif
+    __m128i sum4  = _mm_add_epi64(_mm256_castsi256_si128(sum2),sum3); // add low and high parts
+#if defined(__x86_64__)
+    return          _mm_cvtsi128_si64(sum4);                          // 64 bit mode
+#else
+    union {
+        __m128i x;  // _mm_storel_epi64 takes an __m128i pointer, so the low qword is stored through this union
+        uint64_t i;
+    } u;
+    _mm_storel_epi64(&u.x,sum4);
+    return u.i;
+#endif
+}
+
+// function max: a > b ? a : b  (element by element, signed compare)
+static inline Vec4q max(Vec4q const & a, Vec4q const & b) {
+    return select(a > b, a, b);
+}
+
+// function min: a < b ? a : b  (element by element, signed compare)
+static inline Vec4q min(Vec4q const & a, Vec4q const & b) {
+    return select(a < b, a, b);
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec4q abs(Vec4q const & a) {
+    __m256i const negmask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a); // 0 > a: all ones where a is negative
+    __m256i const flipped = _mm256_xor_si256(a, negmask);                  // invert bits if negative
+    return                  _mm256_sub_epi64(flipped, negmask);            // subtracting -1 adds 1 where negative
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec4q abs_saturated(Vec4q const & a) {
+    __m256i const magnitude = abs(a);                                               // abs(a)
+    __m256i const overflow  = _mm256_cmpgt_epi64(_mm256_setzero_si256(), magnitude); // 0 > abs(a)
+    return                    _mm256_add_epi64(magnitude, overflow);                // subtract 1 if 0x8000000000000000
+}
+
+// function rotate_left all elements
+// Use negative count to rotate right (both shift counts are masked with 0x3F)
+static inline Vec4q rotate_left(Vec4q const & a, int b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_rot_epi64(a,Vec4q(b));
+#else  // no XOP2: combine a left shift and a right shift
+    __m256i left  = _mm256_sll_epi64(a,_mm_cvtsi32_si128(b & 0x3F));      // a << b 
+    __m256i right = _mm256_srl_epi64(a,_mm_cvtsi32_si128((64-b) & 0x3F)); // a >> (64 - b)
+    __m256i rot   = _mm256_or_si256(left, right);                         // or
+    return  rot;
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 4 64-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec4uq : public Vec4q {
+public:
+    // Default constructor:
+    Vec4uq() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec4uq(uint64_t i) {
+        this->ymm = Vec4q(i);
+    }
+    // Constructor to build from all elements:
+    Vec4uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3) {
+        this->ymm = Vec4q(i0, i1, i2, i3);
+    }
+    // Constructor to build from two Vec2uq, combined by set_m128ir(a0, a1):
+    Vec4uq(Vec2uq const & a0, Vec2uq const & a1) {
+        this->ymm = set_m128ir(a0, a1);
+    }
+    // Constructor to convert from type __m256i used in intrinsics:
+    Vec4uq(__m256i const & x) {
+        this->ymm = x;
+    }
+    // Assignment operator to convert from type __m256i used in intrinsics:
+    Vec4uq & operator = (__m256i const & x) {
+        this->ymm = x;
+        return *this;
+    }
+    // Member function to load from array (unaligned)
+    Vec4uq & load(void const * p) {
+        this->ymm = _mm256_loadu_si256(static_cast<__m256i const*>(p));
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    Vec4uq & load_a(void const * p) {
+        this->ymm = _mm256_load_si256(static_cast<__m256i const*>(p));
+        return *this;
+    }
+    // Member function to change a single element in vector (forwards to Vec4q::insert)
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec4uq const & insert(uint32_t index, uint64_t value) {
+        Vec4q::insert(index, value);
+        return *this;
+    }
+    // Member function extract a single element from vector (forwards to Vec4q::extract)
+    uint64_t extract(uint32_t index) const {
+        return Vec4q::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint64_t operator [] (uint32_t index) const {
+        return this->extract(index);
+    }
+    // Member functions to split into two Vec2uq:
+    Vec2uq get_low() const {
+        return _mm256_castsi256_si128(this->ymm);
+    }
+    Vec2uq get_high() const {
+        return _mm256_extractf128_si256(this->ymm, 1);
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : add (reuses the Vec4q operator)
+static inline Vec4uq operator + (Vec4uq const & a, Vec4uq const & b) {
+    return Vec4uq (Vec4q(a) + Vec4q(b));
+}
+
+// vector operator - : subtract (reuses the Vec4q operator)
+static inline Vec4uq operator - (Vec4uq const & a, Vec4uq const & b) {
+    return Vec4uq (Vec4q(a) - Vec4q(b));
+}
+
+// vector operator * : multiply element by element (reuses the Vec4q operator)
+static inline Vec4uq operator * (Vec4uq const & a, Vec4uq const & b) {
+    return Vec4uq (Vec4q(a) * Vec4q(b));
+}
+
+// vector operator >> : shift right logical all elements by the same count b
+static inline Vec4uq operator >> (Vec4uq const & a, uint32_t b) {
+    return _mm256_srl_epi64(a,_mm_cvtsi32_si128(b)); 
+}
+
+// vector operator >> : shift right logical all elements (signed count, forwarded to the uint32_t overload)
+static inline Vec4uq operator >> (Vec4uq const & a, int32_t b) {
+    return a >> static_cast<uint32_t>(b);
+}
+
+// vector operator >>= : shift right logical, result stored in a
+static inline Vec4uq & operator >>= (Vec4uq & a, uint32_t b) {
+    // the assignment expression itself yields the reference to a
+    return a = a >> b;
+}
+
+// vector operator << : shift left all elements (unsigned count, done with the Vec4q operator)
+static inline Vec4uq operator << (Vec4uq const & a, uint32_t b) {
+    return Vec4uq (static_cast<Vec4q>(a) << static_cast<int32_t>(b));
+}
+
+// vector operator << : shift left all elements (signed count, done with the Vec4q operator)
+static inline Vec4uq operator << (Vec4uq const & a, int32_t b) {
+    return Vec4uq (static_cast<Vec4q>(a) << b);
+}
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec4qb operator > (Vec4uq const & a, Vec4uq const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comgt_epu64(a,b);
+#else  // no XOP2: build the 64-bit unsigned compare from 32-bit signed compares
+    __m256i sign32  = _mm256_set1_epi32(0x80000000);          // sign bit of each dword
+    __m256i aflip   = _mm256_xor_si256(a,sign32);             // a with sign bits flipped
+    __m256i bflip   = _mm256_xor_si256(b,sign32);             // b with sign bits flipped
+    __m256i equal   = _mm256_cmpeq_epi32(a,b);                // a == b, dwords
+    __m256i bigger  = _mm256_cmpgt_epi32(aflip,bflip);        // a > b, dwords (signed compare of flipped values)
+    __m256i biggerl = _mm256_shuffle_epi32(bigger,0xA0);      // a > b, low dwords copied to high dwords
+    __m256i eqbig   = _mm256_and_si256(equal,biggerl);        // high part equal and low part bigger
+    __m256i hibig   = _mm256_or_si256(bigger,eqbig);          // high part bigger or high part equal and low part bigger
+    __m256i big     = _mm256_shuffle_epi32(hibig,0xF5);       // result copied to low part
+    return  big;
+#endif
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned); operator > with operands swapped
+static inline Vec4qb operator < (Vec4uq const & a, Vec4uq const & b) {
+    return b > a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec4qb operator >= (Vec4uq const & a, Vec4uq const & b) {
+#ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+    return _mm256_comge_epu64(a,b);
+#else  // no XOP2: a >= b is computed as not (b > a)
+    return  Vec4qb(Vec4q(~(b > a)));
+#endif
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned); operator >= with operands swapped
+static inline Vec4qb operator <= (Vec4uq const & a, Vec4uq const & b) {
+    return b >= a;
+}
+
+// vector operator & : bitwise and
+static inline Vec4uq operator & (Vec4uq const & a, Vec4uq const & b) {
+    return Vec4uq(Vec256b(a) & Vec256b(b));    // both operands are converted to Vec256b for the AND
+}
+static inline Vec4uq operator && (Vec4uq const & a, Vec4uq const & b) {
+    return a & b;                              // same as operator &; both operands are always evaluated
+}
+
+// vector operator | : bitwise or
+static inline Vec4uq operator | (Vec4uq const & a, Vec4uq const & b) {
+    return Vec4uq(Vec256b(a) | Vec256b(b));    // both operands are converted to Vec256b for the OR
+}
+static inline Vec4uq operator || (Vec4uq const & a, Vec4uq const & b) {
+    return a | b;                              // same as operator |; both operands are always evaluated
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec4uq operator ^ (Vec4uq const & a, Vec4uq const & b) {
+    return Vec4uq(Vec256b(a) ^ Vec256b(b));    // both operands are converted to Vec256b for the XOR
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed). The work is delegated to selectb with the same argument order.
+static inline Vec4uq select (Vec4qb const & s, Vec4uq const & a, Vec4uq const & b) {
+    return selectb(s,a,b);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec4uq if_add (Vec4qb const & f, Vec4uq const & a, Vec4uq const & b) {
+    return a + (Vec4uq(f) & b);                // f is used as a bit mask on b before the add
+}
+
+// Horizontal add: Calculates the sum of all vector elements (uses the Vec4q version).
+// Overflow will wrap around
+static inline uint64_t horizontal_add (Vec4uq const & a) {
+    return horizontal_add(static_cast<Vec4q>(a));
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Elements are sign/zero extended before adding to avoid overflow
+static inline int64_t horizontal_add_x (Vec8i const & a) {
+    __m256i const sign_dwords = _mm256_srai_epi32(a,31);                 // sign of all elements
+    Vec4q const   lo_pairs    = _mm256_unpacklo_epi32(a,sign_dwords);    // sign-extended a0, a1, a4, a5
+    Vec4q const   hi_pairs    = _mm256_unpackhi_epi32(a,sign_dwords);    // sign-extended a2, a3, a6, a7
+    return  horizontal_add(lo_pairs + hi_pairs);
+}
+
+static inline uint64_t horizontal_add_x (Vec8ui const & a) {
+    __m256i const zeros    = _mm256_setzero_si256();                     // 0
+    __m256i const lo_pairs = _mm256_unpacklo_epi32(a,zeros);             // zero-extended a0, a1, a4, a5
+    __m256i const hi_pairs = _mm256_unpackhi_epi32(a,zeros);             // zero-extended a2, a3, a6, a7
+    return horizontal_add(Vec4q(lo_pairs) + Vec4q(hi_pairs));
+}
+
+// function max: a > b ? a : b  (element by element, unsigned compare)
+static inline Vec4uq max(Vec4uq const & a, Vec4uq const & b) {
+    return Vec4uq(select(a > b, a, b));
+}
+
+// function min: a < b ? a : b  (written as a > b ? b : a, unsigned compare)
+static inline Vec4uq min(Vec4uq const & a, Vec4uq const & b) {
+    return Vec4uq(select(a > b, b, a));
+}
+
+
+/*****************************************************************************
+*
+*          Vector permute functions
+*
+******************************************************************************
+*
+* These permute functions can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to select.
+* An index of -1 will generate zero. An index of -256 means don't care.
+*
+* Example:
+* Vec8i a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+* Vec8i b;
+* b = permute8i<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// Permute vector of 4 64-bit integers: result element k is a[ik], chosen at compile time.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3 >
+static inline Vec4q permute4q(Vec4q const & a) {
+
+    // Combine indexes into a single bitfield, with 8 bits for each
+    const int m1 = (i0 & 3) | (i1 & 3) << 8 | (i2 & 3) << 16 | (i3 & 3) << 24;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0 ? 0 : 0xFF) | (i1<0 ? 0 : 0xFF) << 8 | (i2<0 ? 0 : 0xFF) << 16 | (i3<0 ? 0 : 0xFF) << 24;
+
+    // zeroing needed (bit 7 is set for -1 but clear for the don't care value -256)
+    const bool dozero = ((i0|i1|i2|i3) & 0x80) != 0;
+
+    if (((m1 ^ 0x03020100) & mz) == 0) {
+        // no shuffling: every non-negative index already equals its own position
+        if (dozero) {
+            // zero some elements (two mask dwords per 64-bit element)
+            const __m256i maskz = constant8i <
+                i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, 
+                i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ();                    
+            return _mm256_and_si256(a, maskz);
+        }
+        return a;                                 // do nothing
+    }
+
+    if (((m1 ^ 0x02020000) & 0x02020202 & mz) == 0) {
+        // no exchange of data between low and high half
+
+        if (((m1 ^ (m1 >> 16)) & 0x0101 & mz & (mz >> 16)) == 0 && !dozero) {
+            // same pattern in low and high half. use VPSHUFD
+            const int sd = (((i0>=0)?(i0&1):(i2&1)) * 10 + 4) | (((i1>=0)?(i1&1):(i3&1)) * 10 + 4) << 4;
+            return _mm256_shuffle_epi32(a, sd);
+        }
+
+        // use VPSHUFB (a control value of -1 is used for every negative index)
+        const __m256i mm = constant8i <
+            i0 < 0 ? -1 : (i0 & 1) * 0x08080808 + 0x03020100,
+            i0 < 0 ? -1 : (i0 & 1) * 0x08080808 + 0x07060504,
+            i1 < 0 ? -1 : (i1 & 1) * 0x08080808 + 0x03020100,
+            i1 < 0 ? -1 : (i1 & 1) * 0x08080808 + 0x07060504,
+            i2 < 0 ? -1 : (i2 & 1) * 0x08080808 + 0x03020100,
+            i2 < 0 ? -1 : (i2 & 1) * 0x08080808 + 0x07060504,
+            i3 < 0 ? -1 : (i3 & 1) * 0x08080808 + 0x03020100,
+            i3 < 0 ? -1 : (i3 & 1) * 0x08080808 + 0x07060504 > ();
+        return _mm256_shuffle_epi8(a, mm);
+    }
+
+    // general case. Use VPERMQ (two selector bits per result element)
+    const int ms = (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6;        
+    __m256i t1 = _mm256_permute4x64_epi64(a, ms);
+
+    if (dozero) {
+        // zero some elements
+        const __m256i maskz = constant8i <
+            i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, 
+            i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ();                    
+        return _mm256_and_si256(t1, maskz);
+    }
+    return t1;
+}
+
+template <int i0, int i1, int i2, int i3>
+static inline Vec4uq permute4uq(Vec4uq const & a) {
+    return Vec4uq (permute4q<i0,i1,i2,i3> (a));   // unsigned version: same indexes, forwarded to permute4q
+}
+
+// Permute vector of 8 32-bit integers: result element k is a[ik], chosen at compile time.
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7 >
+static inline Vec8i permute8i(Vec8i const & a) {
+
+    // Combine indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<<8 | (i3&7)<<12
+        | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12
+        | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    // zeroing needed (bit 7 is set for -1 but clear for the don't care value -256)
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;
+
+    __m256i t1, mask;
+
+    if (((m1 ^ 0x76543210) & mz) == 0) {
+        // no shuffling: every non-negative index already equals its own position
+        if (dozero) {
+            // zero some elements
+            mask = constant8i <
+                i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, 
+                i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();                    
+            return _mm256_and_si256(a, mask);
+        }
+        return a;                                 // do nothing
+    }
+
+    // Check if we can use 64-bit permute. Even numbered indexes must be even and odd numbered
+    // indexes must be equal to the preceding index + 1, except for negative indexes.
+    if (((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ m1 >> 4) & 0x0E0E0E0E & mz & mz >> 4) == 0) {
+
+        const bool partialzero = int((i0^i1)|(i2^i3)|(i4^i5)|(i6^i7)) < 0; // part of a 64-bit block is zeroed
+        const int blank1 = partialzero ? -0x100 : -1;  // ignore or zero
+        // Index 0 is a valid source, so test with >= 0; with > 0 a pair such as (0,-1) became "don't care" and lost a[0]
+        const int n0 = i0 >= 0 ? i0 /2 : i1 >= 0 ? i1 /2 : blank1;  // indexes for 64 bit blend
+        const int n1 = i2 >= 0 ? i2 /2 : i3 >= 0 ? i3 /2 : blank1;
+        const int n2 = i4 >= 0 ? i4 /2 : i5 >= 0 ? i5 /2 : blank1;
+        const int n3 = i6 >= 0 ? i6 /2 : i7 >= 0 ? i7 /2 : blank1;
+        t1 = permute4q<n0,n1,n2,n3> (Vec4q(a));   // do 64-bit permute
+        if (blank1 == -1 || !dozero) {    
+            return  t1;
+        }
+        // need more zeroing
+        mask = constant8i <
+            i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, 
+            i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();                    
+        return _mm256_and_si256(t1, mask);
+    }
+
+    if (((m1 ^ 0x44440000) & 0x44444444 & mz) == 0) {
+        // no exchange of data between low and high half
+
+        if (((m1 ^ (m1 >> 16)) & 0x3333 & mz & (mz >> 16)) == 0 && !dozero) {
+            // same pattern in low and high half. use VPSHUFD
+            const int sd = ((i0>=0)?(i0&3):(i4&3)) | ((i1>=0)?(i1&3):(i5&3)) << 2 |
+                ((i2>=0)?(i2&3):(i6&3)) << 4 | ((i3>=0)?(i3&3):(i7&3)) << 6;
+            return _mm256_shuffle_epi32(a, sd);
+        }
+
+        // use VPSHUFB (a control value of -1 is used for every negative index)
+        mask = constant8i <
+            i0 < 0 ? -1 : (i0 & 3) * 0x04040404 + 0x03020100,
+            i1 < 0 ? -1 : (i1 & 3) * 0x04040404 + 0x03020100,
+            i2 < 0 ? -1 : (i2 & 3) * 0x04040404 + 0x03020100,
+            i3 < 0 ? -1 : (i3 & 3) * 0x04040404 + 0x03020100,
+            i4 < 0 ? -1 : (i4 & 3) * 0x04040404 + 0x03020100,
+            i5 < 0 ? -1 : (i5 & 3) * 0x04040404 + 0x03020100,
+            i6 < 0 ? -1 : (i6 & 3) * 0x04040404 + 0x03020100,
+            i7 < 0 ? -1 : (i7 & 3) * 0x04040404 + 0x03020100 > ();
+        return _mm256_shuffle_epi8(a, mask);
+    }
+
+    // general case. Use VPERMD
+    mask = constant8i <
+        i0 < 0 ? -1 : (i0 & 7), i1 < 0 ? -1 : (i1 & 7),
+        i2 < 0 ? -1 : (i2 & 7), i3 < 0 ? -1 : (i3 & 7),
+        i4 < 0 ? -1 : (i4 & 7), i5 < 0 ? -1 : (i5 & 7),
+        i6 < 0 ? -1 : (i6 & 7), i7 < 0 ? -1 : (i7 & 7) > ();
+#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
+    // bug in MS VS 11 beta: operands in wrong order. fixed in v. 11.0
+    t1 = _mm256_permutevar8x32_epi32(mask, a);   // ms
+#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    // Gcc 4.7.0 also has operands in wrong order. fixed in version 4.7.1
+    t1 = _mm256_permutevar8x32_epi32(mask, a);   // GCC
+#else
+    t1 = _mm256_permutevar8x32_epi32(a, mask);   // no-bug version
+#endif
+
+    if (dozero) {
+        // zero some elements
+        mask = constant8i <
+            i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, 
+            i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();                    
+        return _mm256_and_si256(t1, mask);
+    }
+    return t1;
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7 >
+static inline Vec8ui permute8ui(Vec8ui const & a) {
+    return Vec8ui (permute8i<i0,i1,i2,i3,i4,i5,i6,i7> (a));   // unsigned version: same indexes, forwarded to permute8i
+}
+
+// Permute vector of 16 16-bit integers.
+// Template parameter iN selects the source element (0 - 15) that is copied
+// to result element N.
+// Index -1 gives 0, index -256 means don't care.
+// The index pattern is analysed at compile time; the special cases below are
+// tried in order and the first one that matches determines the instructions.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
+static inline Vec16s permute16s(Vec16s const & a) {
+
+    // Combine indexes 0 - 7 into a single bitfield, with 4 bits for each
+    const int mlo = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 
+        | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; 
+
+    // Combine indexes 8 - 15 into a single bitfield, with 4 bits for each
+    const int mhi = (i8&0xF) | (i9&0xF)<<4 | (i10&0xF)<<8 | (i11&0xF)<<12 
+        | (i12&0xF)<<16 | (i13&0xF)<<20 | (i14&0xF)<<24 | (i15&0xF)<<28;
+
+    // Mask to zero out negative indexes 0 - 7
+    const int zlo = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12
+        | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    // Mask to zero out negative indexes 8 - 15
+    const int zhi = (i8<0?0:0xF) | (i9<0?0:0xF)<<4 | (i10<0?0:0xF)<<8 | (i11<0?0:0xF)<<12
+        | (i12<0?0:0xF)<<16 | (i13<0?0:0xF)<<20 | (i14<0?0:0xF)<<24 | (i15<0?0:0xF)<<28;
+
+    // zeroing needed: true when some index has bit 7 set (e.g. -1).
+    // A don't-care index of -256 has bit 7 clear and does not force zeroing.
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0;
+
+    __m256i t1, mask;
+
+    // special case: all zero
+    if (zlo == 0 && zhi == 0) {
+        return _mm256_setzero_si256();
+    }
+
+    // special case: rotate 128 bits (same rotation inside each 128-bit lane)
+    if (i0>=0 && i0 < 16 && i1 ==((i0+1)&7) && i2 ==((i0+2)&7) && i3 ==((i0+3)&7) && i4 ==((i0+4)&7) && i5 ==((i0+5)&7) && i6 ==((i0+6)&7) && i7 ==((i0+7)&7) 
+        && i8 ==i0 +8 && i9 ==i1 +8 && i10==i2 +8 && i11==i3 +8 && i12==i4 +8 && i13==i5 +8 && i14==i6 +8 && i15==i7 +8 ) {
+        return _mm256_alignr_epi8(a, a, (i0 & 7) * 2);
+    }
+
+    // special case: rotate 256 bits
+    if (i0>=0 && i0 < 16     && i1 ==((i0+1 )&15) && i2 ==((i0+2 )&15) && i3 ==((i0+3 )&15) && i4 ==((i0+4 )&15) && i5 ==((i0+5 )&15) && i6 ==((i0+6 )&15) && i7 ==((i0+7 )&15) 
+        && i8 ==((i0+8 )&15) && i9 ==((i0+9 )&15) && i10==((i0+10)&15) && i11==((i0+11)&15) && i12==((i0+12)&15) && i13==((i0+13)&15) && i14==((i0+14)&15) && i15==((i0+15)&15)) {
+        t1 = _mm256_permute4x64_epi64(a, 0x4E);       // a with its 128-bit halves swapped
+        // VPALIGNR works per 128-bit lane and takes the low bytes of each
+        // lane's result from its SECOND operand. Result element 0 must be
+        // a[i0], so the second operand has to be the vector whose low lane
+        // contains a[i0]: a itself when i0 < 8, the swapped copy otherwise.
+        if (i0 < 8) {
+            return _mm256_alignr_epi8(t1, a, (i0 & 7) * 2);
+        }
+        return _mm256_alignr_epi8(a, t1, (i0 & 7) * 2);
+    }
+
+    // special case: no exchange of data between 64-bit sections, and same pattern in low and high 128 bits:
+    // can use VPSHUFLW or VPSHUFHW
+    if (((mlo ^ 0x44440000) & 0xCCCCCCCC & zlo) == 0 && ((mhi ^ 0xCCCC8888) & 0xCCCCCCCC & zhi) == 0
+        && ((mlo ^ mhi) & 0x33333333 & zlo & zhi) == 0) {
+
+        // 2-bit selectors for the low 64 bits of each lane (taken from whichever lane has a valid index)
+        const int slo = (i0 >= 0 ? (i0&3) : i8 >= 0 ? (i8&3) : 0) | (i1 >= 0 ? (i1&3) : i9 >= 0 ? (i9&3) : 1) << 2 
+            | (i2 >= 0 ? (i2&3) : i10 >= 0 ? (i10&3) : 2) << 4 | (i3 >= 0 ? (i3&3) : i11 >= 0 ? (i11&3) : 3) << 6;
+
+        // 2-bit selectors for the high 64 bits of each lane
+        const int shi = (i4 >= 0 ? (i4&3) : i12 >= 0 ? (i12&3) : 0) | (i5 >= 0 ? (i5&3) : i13 >= 0 ? (i13&3) : 1) << 2 
+            | (i6 >= 0 ? (i6&3) : i14 >= 0 ? (i14&3) : 2) << 4 | (i7 >= 0 ? (i7&3) : i15 >= 0 ? (i15&3) : 3) << 6;
+
+        if (shi == 0xE4 && slo == 0xE4) {             // no permute
+            if (dozero) {
+                // zero some elements
+                const __m256i maskz = constant8i<
+                    (i0 <0?0:0xFFFF) | (i1 <0?0:0xFFFF0000),
+                    (i2 <0?0:0xFFFF) | (i3 <0?0:0xFFFF0000),
+                    (i4 <0?0:0xFFFF) | (i5 <0?0:0xFFFF0000),
+                    (i6 <0?0:0xFFFF) | (i7 <0?0:0xFFFF0000),
+                    (i8 <0?0:0xFFFF) | (i9 <0?0:0xFFFF0000),
+                    (i10<0?0:0xFFFF) | (i11<0?0:0xFFFF0000),
+                    (i12<0?0:0xFFFF) | (i13<0?0:0xFFFF0000),
+                    (i14<0?0:0xFFFF) | (i15<0?0:0xFFFF0000) > ();                    
+                return _mm256_and_si256(a, maskz);
+            }
+            return a;                                 // do nothing
+        }
+        if (shi == 0xE4 && !dozero) {
+            return _mm256_shufflelo_epi16(a, slo);    // low permute only
+        }
+        if (slo == 0xE4 && !dozero) {
+            return _mm256_shufflehi_epi16(a, shi);    // high permute only
+        }
+        // otherwise fall through to the more general cases below
+    }
+
+    // Check if we can use 32-bit permute. Even numbered indexes must be even and odd numbered
+    // indexes must be equal to the preceding index + 1, except for negative indexes.
+    if (((mlo ^ 0x10101010) & 0x11111111 & zlo) == 0 && ((mlo ^ mlo >> 4) & 0x0E0E0E0E & zlo & zlo >> 4) == 0 &&
+        ((mhi ^ 0x10101010) & 0x11111111 & zhi) == 0 && ((mhi ^ mhi >> 4) & 0x0E0E0E0E & zhi & zhi >> 4) == 0 ) {
+
+        const bool partialzero = int((i0^i1)|(i2^i3)|(i4^i5)|(i6^i7)|(i8^i9)|(i10^i11)|(i12^i13)|(i14^i15)) < 0; // part of a 32-bit block is zeroed
+        const int blank1 = partialzero ? -0x100 : -1;  // ignore or zero
+        // Indexes for the 32-bit permute. The tests must be >= 0, not > 0:
+        // index 0 is a valid source element, and a pair such as (0, -1) has
+        // to fetch 32-bit element 0 before the unwanted half is masked away.
+        const int n0 = i0  >= 0 ? i0  / 2 : i1  >= 0 ? i1  / 2 : blank1;
+        const int n1 = i2  >= 0 ? i2  / 2 : i3  >= 0 ? i3  / 2 : blank1;
+        const int n2 = i4  >= 0 ? i4  / 2 : i5  >= 0 ? i5  / 2 : blank1;
+        const int n3 = i6  >= 0 ? i6  / 2 : i7  >= 0 ? i7  / 2 : blank1;
+        const int n4 = i8  >= 0 ? i8  / 2 : i9  >= 0 ? i9  / 2 : blank1;
+        const int n5 = i10 >= 0 ? i10 / 2 : i11 >= 0 ? i11 / 2 : blank1;
+        const int n6 = i12 >= 0 ? i12 / 2 : i13 >= 0 ? i13 / 2 : blank1;
+        const int n7 = i14 >= 0 ? i14 / 2 : i15 >= 0 ? i15 / 2 : blank1;
+        // do 32-bit permute
+        t1 = permute8i<n0,n1,n2,n3,n4,n5,n6,n7> (Vec8i(a));
+        if (blank1 == -1 || !dozero) {    
+            return  t1;
+        }
+        // need more zeroing: clear the 16-bit elements whose own index is negative
+        mask = constant8i<
+            (i0 <0?0:0xFFFF) | (i1 <0?0:0xFFFF0000),
+            (i2 <0?0:0xFFFF) | (i3 <0?0:0xFFFF0000),
+            (i4 <0?0:0xFFFF) | (i5 <0?0:0xFFFF0000),
+            (i6 <0?0:0xFFFF) | (i7 <0?0:0xFFFF0000),
+            (i8 <0?0:0xFFFF) | (i9 <0?0:0xFFFF0000),
+            (i10<0?0:0xFFFF) | (i11<0?0:0xFFFF0000),
+            (i12<0?0:0xFFFF) | (i13<0?0:0xFFFF0000),
+            (i14<0?0:0xFFFF) | (i15<0?0:0xFFFF0000) > ();                    
+        return _mm256_and_si256(t1, mask);
+    }
+
+    // special case: all elements from same half
+    if ((mlo & 0x88888888 & zlo) == 0 && ((mhi ^ 0x88888888) & 0x88888888 & zhi) == 0) {
+        // byte-shuffle control: 0xFFFF zeroes the element, otherwise the two byte positions of the source element
+        mask = constant8i<
+            (i0  < 0 ? 0xFFFF : (i0  & 7) * 0x202 + 0x100) | (i1  < 0 ? 0xFFFF : (i1  & 7) * 0x202 + 0x100) << 16,
+            (i2  < 0 ? 0xFFFF : (i2  & 7) * 0x202 + 0x100) | (i3  < 0 ? 0xFFFF : (i3  & 7) * 0x202 + 0x100) << 16,
+            (i4  < 0 ? 0xFFFF : (i4  & 7) * 0x202 + 0x100) | (i5  < 0 ? 0xFFFF : (i5  & 7) * 0x202 + 0x100) << 16,
+            (i6  < 0 ? 0xFFFF : (i6  & 7) * 0x202 + 0x100) | (i7  < 0 ? 0xFFFF : (i7  & 7) * 0x202 + 0x100) << 16,
+            (i8  < 0 ? 0xFFFF : (i8  & 7) * 0x202 + 0x100) | (i9  < 0 ? 0xFFFF : (i9  & 7) * 0x202 + 0x100) << 16,
+            (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+            (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+            (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > ();
+        return _mm256_shuffle_epi8(a, mask);
+    }
+
+    // special case: all elements from low half
+    if ((mlo & 0x88888888 & zlo) == 0 && (mhi & 0x88888888 & zhi) == 0) {
+        mask = constant8i<
+            (i0  < 0 ? 0xFFFF : (i0  & 7) * 0x202 + 0x100) | (i1  < 0 ? 0xFFFF : (i1  & 7) * 0x202 + 0x100) << 16,
+            (i2  < 0 ? 0xFFFF : (i2  & 7) * 0x202 + 0x100) | (i3  < 0 ? 0xFFFF : (i3  & 7) * 0x202 + 0x100) << 16,
+            (i4  < 0 ? 0xFFFF : (i4  & 7) * 0x202 + 0x100) | (i5  < 0 ? 0xFFFF : (i5  & 7) * 0x202 + 0x100) << 16,
+            (i6  < 0 ? 0xFFFF : (i6  & 7) * 0x202 + 0x100) | (i7  < 0 ? 0xFFFF : (i7  & 7) * 0x202 + 0x100) << 16,
+            (i8  < 0 ? 0xFFFF : (i8  & 7) * 0x202 + 0x100) | (i9  < 0 ? 0xFFFF : (i9  & 7) * 0x202 + 0x100) << 16,
+            (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+            (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+            (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > ();
+        t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1);  // low, low
+        return _mm256_shuffle_epi8(t1, mask);
+    }
+
+    // special case: all elements from high half
+    if (((mlo ^ 0x88888888) & 0x88888888 & zlo) == 0 && ((mhi ^ 0x88888888) & 0x88888888 & zhi) == 0) {
+        mask = constant8i<
+            (i0  < 0 ? 0xFFFF : (i0  & 7) * 0x202 + 0x100) | (i1  < 0 ? 0xFFFF : (i1  & 7) * 0x202 + 0x100) << 16,
+            (i2  < 0 ? 0xFFFF : (i2  & 7) * 0x202 + 0x100) | (i3  < 0 ? 0xFFFF : (i3  & 7) * 0x202 + 0x100) << 16,
+            (i4  < 0 ? 0xFFFF : (i4  & 7) * 0x202 + 0x100) | (i5  < 0 ? 0xFFFF : (i5  & 7) * 0x202 + 0x100) << 16,
+            (i6  < 0 ? 0xFFFF : (i6  & 7) * 0x202 + 0x100) | (i7  < 0 ? 0xFFFF : (i7  & 7) * 0x202 + 0x100) << 16,
+            (i8  < 0 ? 0xFFFF : (i8  & 7) * 0x202 + 0x100) | (i9  < 0 ? 0xFFFF : (i9  & 7) * 0x202 + 0x100) << 16,
+            (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+            (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+            (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > ();
+        t1 = _mm256_permute4x64_epi64(a, 0xEE);  // high, high
+        return _mm256_shuffle_epi8(t1, mask);
+    }
+
+    // special case: all elements from opposite half
+    if (((mlo ^ 0x88888888) & 0x88888888 & zlo) == 0 && (mhi & 0x88888888 & zhi) == 0) {
+        mask = constant8i<
+            (i0  < 0 ? 0xFFFF : (i0  & 7) * 0x202 + 0x100) | (i1  < 0 ? 0xFFFF : (i1  & 7) * 0x202 + 0x100) << 16,
+            (i2  < 0 ? 0xFFFF : (i2  & 7) * 0x202 + 0x100) | (i3  < 0 ? 0xFFFF : (i3  & 7) * 0x202 + 0x100) << 16,
+            (i4  < 0 ? 0xFFFF : (i4  & 7) * 0x202 + 0x100) | (i5  < 0 ? 0xFFFF : (i5  & 7) * 0x202 + 0x100) << 16,
+            (i6  < 0 ? 0xFFFF : (i6  & 7) * 0x202 + 0x100) | (i7  < 0 ? 0xFFFF : (i7  & 7) * 0x202 + 0x100) << 16,
+            (i8  < 0 ? 0xFFFF : (i8  & 7) * 0x202 + 0x100) | (i9  < 0 ? 0xFFFF : (i9  & 7) * 0x202 + 0x100) << 16,
+            (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+            (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+            (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > ();
+        t1 = _mm256_permute4x64_epi64(a, 0x4E);  // high, low
+        return _mm256_shuffle_epi8(t1, mask);
+    }
+
+    // general case: elements from both halves.
+    // mmsame selects the elements that stay in their own 128-bit lane,
+    // mmopposite those that come from the other lane; every element not
+    // selected by a mask is zeroed (0xFFFF), so the two results can be OR'ed.
+    const __m256i mmsame = constant8i<
+            ((i0 ^8) < 8 ? 0xFFFF : (i0  & 7) * 0x202 + 0x100) | ((i1 ^8) < 8 ? 0xFFFF : (i1  & 7) * 0x202 + 0x100) << 16,
+            ((i2 ^8) < 8 ? 0xFFFF : (i2  & 7) * 0x202 + 0x100) | ((i3 ^8) < 8 ? 0xFFFF : (i3  & 7) * 0x202 + 0x100) << 16,
+            ((i4 ^8) < 8 ? 0xFFFF : (i4  & 7) * 0x202 + 0x100) | ((i5 ^8) < 8 ? 0xFFFF : (i5  & 7) * 0x202 + 0x100) << 16,
+            ((i6 ^8) < 8 ? 0xFFFF : (i6  & 7) * 0x202 + 0x100) | ((i7 ^8) < 8 ? 0xFFFF : (i7  & 7) * 0x202 + 0x100) << 16,
+            (i8  < 8 ? 0xFFFF : (i8  & 7) * 0x202 + 0x100) | (i9  < 8 ? 0xFFFF : (i9  & 7) * 0x202 + 0x100) << 16,
+            (i10 < 8 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 8 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+            (i12 < 8 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 8 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+            (i14 < 8 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 8 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > ();
+
+    const __m256i mmopposite = constant8i<
+            (i0  < 8 ? 0xFFFF : (i0  & 7) * 0x202 + 0x100) | (i1  < 8 ? 0xFFFF : (i1  & 7) * 0x202 + 0x100) << 16,
+            (i2  < 8 ? 0xFFFF : (i2  & 7) * 0x202 + 0x100) | (i3  < 8 ? 0xFFFF : (i3  & 7) * 0x202 + 0x100) << 16,
+            (i4  < 8 ? 0xFFFF : (i4  & 7) * 0x202 + 0x100) | (i5  < 8 ? 0xFFFF : (i5  & 7) * 0x202 + 0x100) << 16,
+            (i6  < 8 ? 0xFFFF : (i6  & 7) * 0x202 + 0x100) | (i7  < 8 ? 0xFFFF : (i7  & 7) * 0x202 + 0x100) << 16,
+            ((i8 ^8) < 8 ? 0xFFFF : (i8  & 7) * 0x202 + 0x100) | ((i9 ^8) < 8 ? 0xFFFF : (i9  & 7) * 0x202 + 0x100) << 16,
+            ((i10^8) < 8 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | ((i11^8) < 8 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16,
+            ((i12^8) < 8 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | ((i13^8) < 8 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16,
+            ((i14^8) < 8 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | ((i15^8) < 8 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > ();
+
+    __m256i topp = _mm256_permute4x64_epi64(a, 0x4E);  // high, low
+    __m256i r1   = _mm256_shuffle_epi8(topp, mmopposite);
+    __m256i r2   = _mm256_shuffle_epi8(a, mmsame);
+    return         _mm256_or_si256(r1, r2);
+}
+
+// Unsigned counterpart of permute16s: the index list is forwarded unchanged
+// and the permuted result is converted back to Vec16us.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
+static inline Vec16us permute16us(Vec16us const & a) {
+    Vec16us permuted(permute16s<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a));
+    return permuted;
+}
+
+// Permute vector of 32 8-bit integers.
+// Template parameter iN selects the source element (0 - 31) that is copied
+// to result element N. A negative index with bit 7 set (e.g. -1) gives 0;
+// a negative index with bit 7 clear (e.g. -0x100) does not force zeroing
+// and is used below as "ignore".
+// The index pattern is analysed at compile time; the special cases below are
+// tried in order and the first one that matches determines the instructions.
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15,
+          int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
+          int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
+static inline Vec32c permute32c(Vec32c const & a) {
+
+    // collect bit 4 of each index (set when the source is in the high 128-bit half)
+    const int m1 = 
+        (i0 &16)>>4  | (i1 &16)>>3  | (i2 &16)>>2  | (i3 &16)>>1  | (i4 &16)     | (i5 &16)<<1  | (i6 &16)<<2  | (i7 &16)<<3  | 
+        (i8 &16)<<4  | (i9 &16)<<5  | (i10&16)<<6  | (i11&16)<<7  | (i12&16)<<8  | (i13&16)<<9  | (i14&16)<<10 | (i15&16)<<11 | 
+        (i16&16)<<12 | (i17&16)<<13 | (i18&16)<<14 | (i19&16)<<15 | (i20&16)<<16 | (i21&16)<<17 | (i22&16)<<18 | (i23&16)<<19 | 
+        (i24&16)<<20 | (i25&16)<<21 | (i26&16)<<22 | (i27&16)<<23 | (i28&16)<<24 | (i29&16)<<25 | (i30&16)<<26 | (i31&16)<<27 ;
+
+    // check which elements to set to zero (bit N is 0 when index N is negative)
+    const int mz = ~ (
+        (i0 <0)     | (i1 <0)<<1  | (i2 <0)<<2  | (i3 <0)<<3  | (i4 <0)<<4  | (i5 <0)<<5  | (i6 <0)<<6  | (i7 <0)<<7  | 
+        (i8 <0)<<8  | (i9 <0)<<9  | (i10<0)<<10 | (i11<0)<<11 | (i12<0)<<12 | (i13<0)<<13 | (i14<0)<<14 | (i15<0)<<15 | 
+        (i16<0)<<16 | (i17<0)<<17 | (i18<0)<<18 | (i19<0)<<19 | (i20<0)<<20 | (i21<0)<<21 | (i22<0)<<22 | (i23<0)<<23 | 
+        (i24<0)<<24 | (i25<0)<<25 | (i26<0)<<26 | (i27<0)<<27 | (i28<0)<<28 | (i29<0)<<29 | (i30<0)<<30 | (i31<0)<<31 );
+
+    // Combine indexes 0-7, 8-15, 16-23, 24-31 into bitfields, with 8 bits for each
+    const uint64_t g0 = (i0 &0x1F)|(i1 &0x1F)<<8|(i2 &0x1F)<<16|(i3 &0x1F)<<24|(i4 &0x1FLL)<<32|(i5 &0x1FLL)<<40|(i6 &0x1FLL)<<48|(i7 &0x1FLL)<<56;
+    const uint64_t g1 = (i8 &0x1F)|(i9 &0x1F)<<8|(i10&0x1F)<<16|(i11&0x1F)<<24|(i12&0x1FLL)<<32|(i13&0x1FLL)<<40|(i14&0x1FLL)<<48|(i15&0x1FLL)<<56; 
+    const uint64_t g2 = (i16&0x1F)|(i17&0x1F)<<8|(i18&0x1F)<<16|(i19&0x1F)<<24|(i20&0x1FLL)<<32|(i21&0x1FLL)<<40|(i22&0x1FLL)<<48|(i23&0x1FLL)<<56; 
+    const uint64_t g3 = (i24&0x1F)|(i25&0x1F)<<8|(i26&0x1F)<<16|(i27&0x1F)<<24|(i28&0x1FLL)<<32|(i29&0x1FLL)<<40|(i30&0x1FLL)<<48|(i31&0x1FLL)<<56; 
+
+    // Masks to zero out negative indexes.
+    // Every term is 64-bit unsigned: with plain int constants the 0xFF<<24
+    // term becomes a negative int that sign-extends into the upper four bytes
+    // when OR'ed with the 64-bit terms, which sets mask bytes for indexes
+    // that are actually negative.
+    const uint64_t z0 = (i0 <0?0:0xFFULL)|(i1 <0?0:0xFFULL)<<8|(i2 <0?0:0xFFULL)<<16|(i3 <0?0:0xFFULL)<<24|(i4 <0?0:0xFFULL)<<32|(i5 <0?0:0xFFULL)<<40|(i6 <0?0:0xFFULL)<<48|(i7 <0?0:0xFFULL)<<56;
+    const uint64_t z1 = (i8 <0?0:0xFFULL)|(i9 <0?0:0xFFULL)<<8|(i10<0?0:0xFFULL)<<16|(i11<0?0:0xFFULL)<<24|(i12<0?0:0xFFULL)<<32|(i13<0?0:0xFFULL)<<40|(i14<0?0:0xFFULL)<<48|(i15<0?0:0xFFULL)<<56;
+    const uint64_t z2 = (i16<0?0:0xFFULL)|(i17<0?0:0xFFULL)<<8|(i18<0?0:0xFFULL)<<16|(i19<0?0:0xFFULL)<<24|(i20<0?0:0xFFULL)<<32|(i21<0?0:0xFFULL)<<40|(i22<0?0:0xFFULL)<<48|(i23<0?0:0xFFULL)<<56;
+    const uint64_t z3 = (i24<0?0:0xFFULL)|(i25<0?0:0xFFULL)<<8|(i26<0?0:0xFFULL)<<16|(i27<0?0:0xFFULL)<<24|(i28<0?0:0xFFULL)<<32|(i29<0?0:0xFFULL)<<40|(i30<0?0:0xFFULL)<<48|(i31<0?0:0xFFULL)<<56;
+
+    // zeroing needed: true when some index has bit 7 set (e.g. -1)
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15|i16|i17|i18|i19|i20|i21|i22|i23|i24|i25|i26|i27|i28|i29|i30|i31) & 0x80) != 0;
+
+    __m256i t1, mask;
+
+    // special case: all zero
+    if (mz == 0) return  _mm256_setzero_si256();
+
+    // special case: no permute
+    if ((i0 <0||i0 == 0) && (i1 <0||i1 == 1) && (i2 <0||i2 == 2) && (i3 <0||i3 == 3) && (i4 <0||i4 == 4) && (i5 <0||i5 == 5) && (i6 <0||i6 == 6) && (i7 <0||i7 == 7) &&
+        (i8 <0||i8 == 8) && (i9 <0||i9 == 9) && (i10<0||i10==10) && (i11<0||i11==11) && (i12<0||i12==12) && (i13<0||i13==13) && (i14<0||i14==14) && (i15<0||i15==15) &&
+        (i16<0||i16==16) && (i17<0||i17==17) && (i18<0||i18==18) && (i19<0||i19==19) && (i20<0||i20==20) && (i21<0||i21==21) && (i22<0||i22==22) && (i23<0||i23==23) &&
+        (i24<0||i24==24) && (i25<0||i25==25) && (i26<0||i26==26) && (i27<0||i27==27) && (i28<0||i28==28) && (i29<0||i29==29) && (i30<0||i30==30) && (i31<0||i31==31)) {
+        if (dozero) {
+            // zero some elements
+            mask = constant8i <
+                (i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000),
+                (i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000),
+                (i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000),
+                (i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000),
+                (i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000),
+                (i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000),
+                (i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000),
+                (i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000) > ();
+            return _mm256_and_si256(a, mask);
+        }
+        return a; // do nothing
+    }
+
+    // special case: rotate 128 bits (same rotation inside each 128-bit lane)
+    if (i0>=0 && i0 < 32     && i1 ==((i0+1 )&15) && i2 ==((i0+2 )&15) && i3 ==((i0+3 )&15) && i4 ==((i0+4 )&15) && i5 ==((i0+5 )&15) && i6 ==((i0+6 )&15) && i7 ==((i0+7 )&15) 
+        && i8 ==((i0+8 )&15) && i9 ==((i0+9 )&15) && i10==((i0+10)&15) && i11==((i0+11)&15) && i12==((i0+12)&15) && i13==((i0+13)&15) && i14==((i0+14)&15) && i15==((i0+15)&15)
+        && i16==i0 +16 && i17==i1 +16 && i18==i2 +16 && i19==i3 +16 && i20==i4 +16 && i21==i5 +16 && i22==i6 +16 && i23==i7 +16 
+        && i24==i8 +16 && i25==i9 +16 && i26==i10+16 && i27==i11+16 && i28==i12+16 && i29==i13+16 && i30==i14+16 && i31==i15+16 ) {
+        return _mm256_alignr_epi8(a, a, i0 & 15);
+    }
+
+    // special case: rotate 256 bits
+    if (i0>=0 && i0 < 32     && i1 ==((i0+1 )&31) && i2 ==((i0+2 )&31) && i3 ==((i0+3 )&31) && i4 ==((i0+4 )&31) && i5 ==((i0+5 )&31) && i6 ==((i0+6 )&31) && i7 ==((i0+7 )&31) 
+        && i8 ==((i0+8 )&31) && i9 ==((i0+9 )&31) && i10==((i0+10)&31) && i11==((i0+11)&31) && i12==((i0+12)&31) && i13==((i0+13)&31) && i14==((i0+14)&31) && i15==((i0+15)&31)
+        && i16==((i0+16)&31) && i17==((i0+17)&31) && i18==((i0+18)&31) && i19==((i0+19)&31) && i20==((i0+20)&31) && i21==((i0+21)&31) && i22==((i0+22)&31) && i23==((i0+23)&31)
+        && i24==((i0+24)&31) && i25==((i0+25)&31) && i26==((i0+26)&31) && i27==((i0+27)&31) && i28==((i0+28)&31) && i29==((i0+29)&31) && i30==((i0+30)&31) && i31==((i0+31)&31)) {
+        t1 = _mm256_permute4x64_epi64(a, 0x4E);       // a with its 128-bit halves swapped
+        // VPALIGNR works per 128-bit lane and takes the low bytes of each
+        // lane's result from its SECOND operand. Result element 0 must be
+        // a[i0], so the second operand has to be the vector whose low lane
+        // contains a[i0]: a itself when i0 < 16, the swapped copy otherwise.
+        if (i0 < 16) {
+            return _mm256_alignr_epi8(t1, a, i0 & 15);
+        }
+        return _mm256_alignr_epi8(a, t1, i0 & 15);
+    }
+
+    // Check if we can use 16-bit permute. Even numbered indexes must be even and odd numbered
+    // indexes must be equal to the preceding index + 1, except for negative indexes.
+    if (((g0 ^ 0x0100010001000100) & 0x0101010101010101 & z0) == 0 && ((g0 ^ g0 >> 8) & 0x00FE00FE00FE00FE & z0 & z0 >> 8) == 0 &&
+        ((g1 ^ 0x0100010001000100) & 0x0101010101010101 & z1) == 0 && ((g1 ^ g1 >> 8) & 0x00FE00FE00FE00FE & z1 & z1 >> 8) == 0 &&
+        ((g2 ^ 0x0100010001000100) & 0x0101010101010101 & z2) == 0 && ((g2 ^ g2 >> 8) & 0x00FE00FE00FE00FE & z2 & z2 >> 8) == 0 &&
+        ((g3 ^ 0x0100010001000100) & 0x0101010101010101 & z3) == 0 && ((g3 ^ g3 >> 8) & 0x00FE00FE00FE00FE & z3 & z3 >> 8) == 0 ) {
+
+        const bool partialzero = int((i0^i1)|(i2^i3)|(i4^i5)|(i6^i7)|(i8^i9)|(i10^i11)|(i12^i13)|(i14^i15)
+            |(i16^i17)|(i18^i19)|(i20^i21)|(i22^i23)|(i24^i25)|(i26^i27)|(i28^i29)|(i30^i31)) < 0; // part of a 16-bit block is zeroed
+        const int blank1 = partialzero ? -0x100 : -1;  // ignore or zero
+        // Indexes for the 16-bit permute. The tests must be >= 0, not > 0:
+        // index 0 is a valid source element, and a pair such as (0, -1) has
+        // to fetch 16-bit element 0 before the unwanted byte is masked away.
+        const int n0 = i0  >= 0 ? i0  / 2 : i1  >= 0 ? i1  / 2 : blank1;
+        const int n1 = i2  >= 0 ? i2  / 2 : i3  >= 0 ? i3  / 2 : blank1;
+        const int n2 = i4  >= 0 ? i4  / 2 : i5  >= 0 ? i5  / 2 : blank1;
+        const int n3 = i6  >= 0 ? i6  / 2 : i7  >= 0 ? i7  / 2 : blank1;
+        const int n4 = i8  >= 0 ? i8  / 2 : i9  >= 0 ? i9  / 2 : blank1;
+        const int n5 = i10 >= 0 ? i10 / 2 : i11 >= 0 ? i11 / 2 : blank1;
+        const int n6 = i12 >= 0 ? i12 / 2 : i13 >= 0 ? i13 / 2 : blank1;
+        const int n7 = i14 >= 0 ? i14 / 2 : i15 >= 0 ? i15 / 2 : blank1;
+        const int n8 = i16 >= 0 ? i16 / 2 : i17 >= 0 ? i17 / 2 : blank1;
+        const int n9 = i18 >= 0 ? i18 / 2 : i19 >= 0 ? i19 / 2 : blank1;
+        const int n10= i20 >= 0 ? i20 / 2 : i21 >= 0 ? i21 / 2 : blank1;
+        const int n11= i22 >= 0 ? i22 / 2 : i23 >= 0 ? i23 / 2 : blank1;
+        const int n12= i24 >= 0 ? i24 / 2 : i25 >= 0 ? i25 / 2 : blank1;
+        const int n13= i26 >= 0 ? i26 / 2 : i27 >= 0 ? i27 / 2 : blank1;
+        const int n14= i28 >= 0 ? i28 / 2 : i29 >= 0 ? i29 / 2 : blank1;
+        const int n15= i30 >= 0 ? i30 / 2 : i31 >= 0 ? i31 / 2 : blank1;
+        // do 16-bit permute
+        t1 = permute16s<n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,n15> (Vec16s(a));
+        if (blank1 == -1 || !dozero) {    
+            return  t1;
+        }
+        // need more zeroing: clear the bytes whose own index is negative
+        mask = constant8i <
+            (i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000),
+            (i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000),
+            (i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000),
+            (i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000),
+            (i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000),
+            (i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000),
+            (i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000),
+            (i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000) > ();
+        // the mask must be applied to the permuted vector t1, not to the input a
+        return _mm256_and_si256(t1, mask);
+    } 
+
+    // special case: all elements from same half
+    if (((m1 ^ 0xFFFF0000) & mz) == 0) {
+        // byte-shuffle control; & 0xEF clears bit 4 so the index is relative to the high lane
+        mask = constant8i <
+            (i0  & 0xFF) | (i1  & 0xFF) << 8 | (i2  & 0xFF) << 16 | (i3  & 0xFF) << 24,
+            (i4  & 0xFF) | (i5  & 0xFF) << 8 | (i6  & 0xFF) << 16 | (i7  & 0xFF) << 24,
+            (i8  & 0xFF) | (i9  & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24,
+            (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24,
+            (i16 & 0xEF) | (i17 & 0xEF) << 8 | (i18 & 0xEF) << 16 | (i19 & 0xEF) << 24,
+            (i20 & 0xEF) | (i21 & 0xEF) << 8 | (i22 & 0xEF) << 16 | (i23 & 0xEF) << 24,
+            (i24 & 0xEF) | (i25 & 0xEF) << 8 | (i26 & 0xEF) << 16 | (i27 & 0xEF) << 24,
+            (i28 & 0xEF) | (i29 & 0xEF) << 8 | (i30 & 0xEF) << 16 | (i31 & 0xEF) << 24 > ();
+        return _mm256_shuffle_epi8(a, mask);
+    }
+
+    // special case: all elements from low half
+    if ((m1 & mz) == 0) {
+        mask = constant8i <
+            (i0  & 0xFF) | (i1  & 0xFF) << 8 | (i2  & 0xFF) << 16 | (i3  & 0xFF) << 24,
+            (i4  & 0xFF) | (i5  & 0xFF) << 8 | (i6  & 0xFF) << 16 | (i7  & 0xFF) << 24,
+            (i8  & 0xFF) | (i9  & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24,
+            (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24,
+            (i16 & 0xFF) | (i17 & 0xFF) << 8 | (i18 & 0xFF) << 16 | (i19 & 0xFF) << 24,
+            (i20 & 0xFF) | (i21 & 0xFF) << 8 | (i22 & 0xFF) << 16 | (i23 & 0xFF) << 24,
+            (i24 & 0xFF) | (i25 & 0xFF) << 8 | (i26 & 0xFF) << 16 | (i27 & 0xFF) << 24,
+            (i28 & 0xFF) | (i29 & 0xFF) << 8 | (i30 & 0xFF) << 16 | (i31 & 0xFF) << 24 > ();
+        t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1);  // low, low
+        return _mm256_shuffle_epi8(t1, mask);
+    }
+
+    // special case: all elements from high half
+    if (((m1 ^ 0xFFFFFFFF) & mz) == 0) {
+        mask = constant8i <
+            (i0  & 0xEF) | (i1  & 0xEF) << 8 | (i2  & 0xEF) << 16 | (i3  & 0xEF) << 24,
+            (i4  & 0xEF) | (i5  & 0xEF) << 8 | (i6  & 0xEF) << 16 | (i7  & 0xEF) << 24,
+            (i8  & 0xEF) | (i9  & 0xEF) << 8 | (i10 & 0xEF) << 16 | (i11 & 0xEF) << 24,
+            (i12 & 0xEF) | (i13 & 0xEF) << 8 | (i14 & 0xEF) << 16 | (i15 & 0xEF) << 24,
+            (i16 & 0xEF) | (i17 & 0xEF) << 8 | (i18 & 0xEF) << 16 | (i19 & 0xEF) << 24,
+            (i20 & 0xEF) | (i21 & 0xEF) << 8 | (i22 & 0xEF) << 16 | (i23 & 0xEF) << 24,
+            (i24 & 0xEF) | (i25 & 0xEF) << 8 | (i26 & 0xEF) << 16 | (i27 & 0xEF) << 24,
+            (i28 & 0xEF) | (i29 & 0xEF) << 8 | (i30 & 0xEF) << 16 | (i31 & 0xEF) << 24 > ();
+        t1 = _mm256_permute4x64_epi64(a, 0xEE);  // high, high
+        return _mm256_shuffle_epi8(t1, mask);
+    }
+
+    // special case: all elements from opposite half
+    if (((m1 ^ 0x0000FFFF) & mz) == 0) {
+        mask = constant8i<
+            (i0  & 0xEF) | (i1  & 0xEF) << 8 | (i2  & 0xEF) << 16 | (i3  & 0xEF) << 24,
+            (i4  & 0xEF) | (i5  & 0xEF) << 8 | (i6  & 0xEF) << 16 | (i7  & 0xEF) << 24,
+            (i8  & 0xEF) | (i9  & 0xEF) << 8 | (i10 & 0xEF) << 16 | (i11 & 0xEF) << 24,
+            (i12 & 0xEF) | (i13 & 0xEF) << 8 | (i14 & 0xEF) << 16 | (i15 & 0xEF) << 24,
+            (i16 & 0xFF) | (i17 & 0xFF) << 8 | (i18 & 0xFF) << 16 | (i19 & 0xFF) << 24,
+            (i20 & 0xFF) | (i21 & 0xFF) << 8 | (i22 & 0xFF) << 16 | (i23 & 0xFF) << 24,
+            (i24 & 0xFF) | (i25 & 0xFF) << 8 | (i26 & 0xFF) << 16 | (i27 & 0xFF) << 24,
+            (i28 & 0xFF) | (i29 & 0xFF) << 8 | (i30 & 0xFF) << 16 | (i31 & 0xFF) << 24 > ();
+
+        t1 = _mm256_permute4x64_epi64(a, 0x4E);  // high, low
+        return _mm256_shuffle_epi8(t1, mask);
+    }
+
+    // general case: elements from both halves.
+    // mmsame selects the bytes that stay in their own 128-bit lane,
+    // mmopposite those that come from the other lane; every byte not
+    // selected by a mask is zeroed (0xFF), so the two results can be OR'ed.
+    const __m256i mmsame = constant8i <
+        ((i0 &0xF0)?0xFF:(i0 &15)) | ((i1 &0xF0)?0xFF:(i1 &15)) << 8 | ((i2 &0xF0)?0xFF:(i2 &15)) << 16 | ((i3 &0xF0)?0xFF:(i3 &15)) << 24, 
+        ((i4 &0xF0)?0xFF:(i4 &15)) | ((i5 &0xF0)?0xFF:(i5 &15)) << 8 | ((i6 &0xF0)?0xFF:(i6 &15)) << 16 | ((i7 &0xF0)?0xFF:(i7 &15)) << 24, 
+        ((i8 &0xF0)?0xFF:(i8 &15)) | ((i9 &0xF0)?0xFF:(i9 &15)) << 8 | ((i10&0xF0)?0xFF:(i10&15)) << 16 | ((i11&0xF0)?0xFF:(i11&15)) << 24, 
+        ((i12&0xF0)?0xFF:(i12&15)) | ((i13&0xF0)?0xFF:(i13&15)) << 8 | ((i14&0xF0)?0xFF:(i14&15)) << 16 | ((i15&0xF0)?0xFF:(i15&15)) << 24,
+        ((i16&0xF0)!=0x10?0xFF:(i16&15)) | ((i17&0xF0)!=0x10?0xFF:(i17&15)) << 8 | ((i18&0xF0)!=0x10?0xFF:(i18&15)) << 16 | ((i19&0xF0)!=0x10?0xFF:(i19&15)) << 24, 
+        ((i20&0xF0)!=0x10?0xFF:(i20&15)) | ((i21&0xF0)!=0x10?0xFF:(i21&15)) << 8 | ((i22&0xF0)!=0x10?0xFF:(i22&15)) << 16 | ((i23&0xF0)!=0x10?0xFF:(i23&15)) << 24, 
+        ((i24&0xF0)!=0x10?0xFF:(i24&15)) | ((i25&0xF0)!=0x10?0xFF:(i25&15)) << 8 | ((i26&0xF0)!=0x10?0xFF:(i26&15)) << 16 | ((i27&0xF0)!=0x10?0xFF:(i27&15)) << 24, 
+        ((i28&0xF0)!=0x10?0xFF:(i28&15)) | ((i29&0xF0)!=0x10?0xFF:(i29&15)) << 8 | ((i30&0xF0)!=0x10?0xFF:(i30&15)) << 16 | ((i31&0xF0)!=0x10?0xFF:(i31&15)) << 24 > ();
+
+    const __m256i mmopposite = constant8i <
+        ((i0 &0xF0)!=0x10?0xFF:(i0 &15)) | ((i1 &0xF0)!=0x10?0xFF:(i1 &15)) << 8 | ((i2 &0xF0)!=0x10?0xFF:(i2 &15)) << 16 | ((i3 &0xF0)!=0x10?0xFF:(i3 &15)) << 24, 
+        ((i4 &0xF0)!=0x10?0xFF:(i4 &15)) | ((i5 &0xF0)!=0x10?0xFF:(i5 &15)) << 8 | ((i6 &0xF0)!=0x10?0xFF:(i6 &15)) << 16 | ((i7 &0xF0)!=0x10?0xFF:(i7 &15)) << 24, 
+        ((i8 &0xF0)!=0x10?0xFF:(i8 &15)) | ((i9 &0xF0)!=0x10?0xFF:(i9 &15)) << 8 | ((i10&0xF0)!=0x10?0xFF:(i10&15)) << 16 | ((i11&0xF0)!=0x10?0xFF:(i11&15)) << 24, 
+        ((i12&0xF0)!=0x10?0xFF:(i12&15)) | ((i13&0xF0)!=0x10?0xFF:(i13&15)) << 8 | ((i14&0xF0)!=0x10?0xFF:(i14&15)) << 16 | ((i15&0xF0)!=0x10?0xFF:(i15&15)) << 24,
+        ((i16&0xF0)?0xFF:(i16&15)) | ((i17&0xF0)?0xFF:(i17&15)) << 8 | ((i18&0xF0)?0xFF:(i18&15)) << 16 | ((i19&0xF0)?0xFF:(i19&15)) << 24, 
+        ((i20&0xF0)?0xFF:(i20&15)) | ((i21&0xF0)?0xFF:(i21&15)) << 8 | ((i22&0xF0)?0xFF:(i22&15)) << 16 | ((i23&0xF0)?0xFF:(i23&15)) << 24, 
+        ((i24&0xF0)?0xFF:(i24&15)) | ((i25&0xF0)?0xFF:(i25&15)) << 8 | ((i26&0xF0)?0xFF:(i26&15)) << 16 | ((i27&0xF0)?0xFF:(i27&15)) << 24, 
+        ((i28&0xF0)?0xFF:(i28&15)) | ((i29&0xF0)?0xFF:(i29&15)) << 8 | ((i30&0xF0)?0xFF:(i30&15)) << 16 | ((i31&0xF0)?0xFF:(i31&15)) << 24 > ();
+
+    __m256i topp = _mm256_permute4x64_epi64(a, 0x4E);  // high, low
+    __m256i r1   = _mm256_shuffle_epi8(topp, mmopposite);
+    __m256i r2   = _mm256_shuffle_epi8(a, mmsame);
+    return         _mm256_or_si256(r1, r2);
+}
+
+// Unsigned counterpart of permute32c: the index list is forwarded unchanged
+// and the permuted result is converted back to Vec32uc.
+template <
+    int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+    int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15,
+    int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
+    int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
+    static inline Vec32uc permute32uc(Vec32uc const & a) {
+        Vec32uc permuted(permute32c<
+            i0, i1, i2, i3, i4, i5, i6, i7,
+            i8, i9, i10,i11,i12,i13,i14,i15,
+            i16,i17,i18,i19,i20,i21,i22,i23,
+            i24,i25,i26,i27,i28,i29,i30,i31> (a));
+        return permuted;
+}
+
+
+/*****************************************************************************
+*
+*          Vector blend functions
+*
+******************************************************************************
+*
+* These blend functions can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where higher indexes indicate an element from the second source
+* vector. For example, if each vector has 8 elements, then indexes 0 - 7
+* will select an element from the first vector and indexes 8 - 15 will select 
+* an element from the second vector. A negative index will generate zero.
+*
+* Example:
+* Vec8i a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8i b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+* Vec8i c;
+* c = blend8i<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// Blend the 64-bit elements of two vectors according to compile-time indexes.
+//
+// Template parameters i0..i3 give the source of result elements 0..3:
+//   0 - 3    : element 0 - 3 of a
+//   4 - 7    : element 0 - 3 of b
+//   -1       : the element is set to zero
+//   -0x100   : don't care, the element may receive any value
+//
+// All decisions below are made on compile-time constants, so only the
+// instructions of the branch that matches the indexes remain after optimization.
+// Returns the blended vector.
+template <int i0,  int i1,  int i2,  int i3> 
+static inline Vec4q blend4q(Vec4q const & a, Vec4q const & b) {  
+
+    // Combine indexes into a single bitfield, with 8 bits for each
+    const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0 ? 0 : 0xFF) | (i1<0 ? 0 : 0xFF) << 8 | (i2<0 ? 0 : 0xFF) << 16 | (i3<0 ? 0 : 0xFF) << 24;
+
+    // zeroing needed. An index of -0x100 means don't care
+    // (-0x100 has bit 7 clear, while -1 has it set)
+    const bool dozero = ((i0|i1|i2|i3) & 0x80) != 0;
+
+    __m256i t1, mask;
+
+    // special case: 128 bit blend/permute
+    // (each pair of indexes is an even index followed by its successor, where not negative)
+    if (((m1 ^ 0x01000100) & 0x01010101 & mz) == 0 && (((m1 + 0x00010001) ^ (m1 >> 8)) & 0x00FF00FF & mz & mz >> 8) == 0) {
+        // j0, j1: 0 = low half of a, 1 = high half of a, 2 = low half of b, 3 = high half of b, 4 = zero or don't care
+        const int j0 = i0 >= 0 ? i0 / 2 : i1 >= 0 ? i1 / 2 : 4;  // index for low 128 bits
+        const int j1 = i2 >= 0 ? i2 / 2 : i3 >= 0 ? i3 / 2 : 4;  // index for high 128 bits
+        const bool partialzero = int((i0 ^ i1) | (i2 ^ i3)) < 0; // part of a 128-bit block is zeroed
+
+        switch (j0 | j1 << 4) {
+        case 0x00:
+            t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1);  break;
+        case 0x02:
+            t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(a), 1);  break;
+        case 0x04:
+            if (dozero && !partialzero) return _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_castsi256_si128(a), 1);
+            t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1);  break;
+        case 0x12:
+            t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(b), 0);  break;
+        case 0x14:
+            if (dozero && !partialzero) return _mm256_inserti128_si256(a,_mm_setzero_si128(), 0);
+            t1 = a;  break;
+        case 0x01: case 0x10: case 0x11: // all from a
+            return permute4q <i0, i1, i2, i3> (a);
+        case 0x20:
+            t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(b), 1);  break;
+        case 0x22:
+            t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(b), 1);  break;
+        case 0x24:
+            if (dozero && !partialzero) return _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_castsi256_si128(b), 1);
+            t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(b), 1);  break;
+        case 0x30:
+            t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(a), 0);  break;
+        case 0x34:
+            if (dozero && !partialzero) return _mm256_inserti128_si256(b,_mm_setzero_si128(), 0);
+            t1 = b;  break;
+        case 0x23: case 0x32: case 0x33:  // all from b
+            return permute4q <i0^4, i1^4, i2^4, i3^4> (b);
+        case 0x40:
+            // low half of a, high half zero.
+            // _mm256_castsi128_si256 leaves the upper 128 bits undefined, so the
+            // low half is inserted into an explicitly zeroed vector instead
+            if (dozero && !partialzero) return _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_castsi256_si128(a), 0);
+            t1 = a;  break;
+        case 0x42:
+            // low half of b, high half zero (same reasoning as case 0x40)
+            if (dozero && !partialzero) return _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_castsi256_si128(b), 0);
+            t1 = b;  break;
+        case 0x44:
+            return _mm256_setzero_si256();
+        default:
+            t1 = _mm256_permute2x128_si256(a, b, (j0&0x0F) | (j1&0x0F) << 4);
+        }
+        if (dozero) {
+            // zero some elements
+            const __m256i maskz = constant8i <
+                i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, 
+                i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ();
+            return _mm256_and_si256(t1, maskz);
+        }
+        return t1;
+    }
+
+    // special case: all from a
+    if ((m1 & 0x04040404 & mz) == 0) {
+        return permute4q <i0, i1, i2, i3> (a);
+    }
+
+    // special case: all from b
+    if ((~m1 & 0x04040404 & mz) == 0) {
+        return permute4q <i0^4, i1^4, i2^4, i3^4> (b);
+    }
+
+    // special case: blend without permute
+    if (((m1 ^ 0x03020100) & 0xFBFBFBFB & mz) == 0) {
+
+        // each 64-bit element takes two 32-bit mask constants; all ones selects b
+        mask = constant8i <
+            (i0 & 4) ? -1 : 0, (i0 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, 
+            (i2 & 4) ? -1 : 0, (i2 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0 > ();
+
+        t1 = _mm256_blendv_epi8(a, b, mask);  // blend
+
+        if (dozero) {
+            // zero some elements
+            const __m256i maskz = constant8i <
+                i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, 
+                i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ();
+            return _mm256_and_si256(t1, maskz);
+        }
+        return t1;
+    } 
+
+    // special case: shift left
+    // (consecutive indexes starting in a and continuing into b, nothing zeroed)
+    if (i0 > 0 && i0 < 4 && mz == -1 && (m1 ^ ((i0 & 3) * 0x01010101 + 0x03020100)) == 0) {
+        t1 = _mm256_permute2x128_si256(a, b, 0x21);
+        if (i0 < 2) return _mm256_alignr_epi8(t1, a, (i0 & 1) * 8);
+        else        return _mm256_alignr_epi8(b, t1, (i0 & 1) * 8);
+    }
+    // special case: shift right
+    // (consecutive indexes starting in b and continuing into a, nothing zeroed)
+    if (i0 > 4 && i0 < 8 && mz == -1 && (m1 ^ 0x04040404 ^ ((i0 & 3) * 0x01010101 + 0x03020100)) == 0) {
+        t1 = _mm256_permute2x128_si256(b, a, 0x21);
+        if (i0 < 6) return _mm256_alignr_epi8(t1, b, (i0 & 1) * 8);
+        else        return _mm256_alignr_epi8(a, t1, (i0 & 1) * 8);
+    }
+
+    // general case: permute and blend and possibly zero
+    const int blank = dozero ? -1 : -0x100;  // ignore or zero
+
+    // permute and blend
+    // ta holds the elements taken from a, tb the elements taken from b;
+    // positions belonging to the other source get the blank index
+    __m256i ta = permute4q <
+        (i0 & 4) ? blank : i0, (i1 & 4) ? blank : i1, (i2 & 4) ? blank : i2, (i3 & 4) ? blank : i3 > (a);
+
+    __m256i tb = permute4q <
+        ((i0^4) & 4) ? blank : i0^4, ((i1^4) & 4) ? blank : i1^4, ((i2^4) & 4) ? blank : i2^4, ((i3^4) & 4) ? blank : i3^4 > (b);
+
+    if (blank == -1) {
+        // we have zeroed, need only to OR
+        return _mm256_or_si256(ta, tb);
+    }
+    // no zeroing, need to blend
+    mask = constant8i <
+        (i0 & 4) ? -1 : 0, (i0 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, 
+        (i2 & 4) ? -1 : 0, (i2 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0 > ();
+
+    return _mm256_blendv_epi8(ta, tb, mask);  // blend
+}
+
+// Unsigned variant of blend4q: same index rules, result returned as Vec4uq
+template <int i0, int i1, int i2, int i3> 
+static inline Vec4uq blend4uq(Vec4uq const & a, Vec4uq const & b) {
+    // blending only moves elements around, so the signed implementation is reused
+    Vec4q const blended = blend4q<i0,i1,i2,i3> (a,b);
+    return Vec4uq(blended);
+}
+
+
+// Blend the 32-bit elements of two vectors according to compile-time indexes.
+// Template parameters i0..i7 give the source of result elements 0..7:
+// 0 - 7 select an element of a, 8 - 15 select an element of b, a negative
+// index gives zero, and -0x100 means don't care.
+// All tests below are on compile-time constants; the first special case that
+// fits the indexes is used, otherwise the general permute-and-blend at the end.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 
+static inline Vec8i blend8i(Vec8i const & a, Vec8i const & b) {  
+
+    const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7;  // OR indexes
+
+    // is zeroing needed
+    const bool do_zero  = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    __m256i t1, mask;
+
+    if (mz == 0) return _mm256_setzero_si256();  // all zero
+
+    // special case: 64 bit blend/permute
+    // (each pair of indexes is an even index followed by its successor, where not negative)
+    if (((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ (m1 >> 4)) & 0x0E0E0E0E & mz & mz >> 4) == 0) {
+        // check if part of a 64-bit block is zeroed
+        const bool partialzero = int((i0^i1) | (i2^i3) | (i4^i5) | (i6^i7)) < 0; 
+        const int blank1 = partialzero ? -0x100 : -1;  // ignore if zeroing later anyway
+        // indexes for 64 bit blend
+        const int j0 = i0 >= 0 ? i0 / 2 : i1 >= 0 ? i1 / 2 : blank1;
+        const int j1 = i2 >= 0 ? i2 / 2 : i3 >= 0 ? i3 / 2 : blank1;
+        const int j2 = i4 >= 0 ? i4 / 2 : i5 >= 0 ? i5 / 2 : blank1;
+        const int j3 = i6 >= 0 ? i6 / 2 : i7 >= 0 ? i7 / 2 : blank1;
+        // 64-bit blend and permute
+        t1 = blend4q<j0,j1,j2,j3>(Vec4q(a), Vec4q(b));
+        if (partialzero && do_zero) {
+            // zero some elements
+            mask = constant8i< i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, 
+                i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();
+            return _mm256_and_si256(t1, mask);
+        }
+        return t1;
+    }
+
+    // bit 3 of an index tells whether the element comes from a (0) or b (1)
+    if ((m1 & 0x88888888 & mz) == 0) {
+        // all from a
+        return permute8i<i0, i1, i2, i3, i4, i5, i6, i7> (a);
+    }
+
+    if (((m1 ^ 0x88888888) & 0x88888888 & mz) == 0) {
+        // all from b
+        return permute8i<i0&~8, i1&~8, i2&~8, i3&~8, i4&~8, i5&~8, i6&~8, i7&~8> (b);
+    }
+
+    if ((((m1 & 0x77777777) ^ 0x76543210) & mz) == 0) {
+        // blend and zero, no permute
+        // mask element is all ones where the index has bit 3 clear; it is the first argument of select
+        mask = constant8i<(i0&8)?0:-1, (i1&8)?0:-1, (i2&8)?0:-1, (i3&8)?0:-1, (i4&8)?0:-1, (i5&8)?0:-1, (i6&8)?0:-1, (i7&8)?0:-1> ();
+        t1   = select(mask, a, b);
+        if (!do_zero) return t1;
+        // zero some elements
+        // (only negative indexes with bit 3 set are cleared; -0x100 has bit 3 clear and is left alone)
+        mask = constant8i< (i0<0&&(i0&8)) ? 0 : -1, (i1<0&&(i1&8)) ? 0 : -1, (i2<0&&(i2&8)) ? 0 : -1, (i3<0&&(i3&8)) ? 0 : -1, 
+            (i4<0&&(i4&8)) ? 0 : -1, (i5<0&&(i5&8)) ? 0 : -1, (i6<0&&(i6&8)) ? 0 : -1, (i7<0&&(i7&8)) ? 0 : -1 > ();
+        return _mm256_and_si256(t1, mask);
+    }
+
+    // special case: shift left
+    // (consecutive indexes starting at i0 in a and continuing into b, nothing zeroed)
+    if (i0 > 0 && i0 < 8 && mz == -1 && (m1 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) == 0) {
+        t1 = _mm256_permute2x128_si256(a, b, 0x21);
+        if (i0 < 4) return _mm256_alignr_epi8(t1, a, (i0 & 3) * 4);
+        else        return _mm256_alignr_epi8(b, t1, (i0 & 3) * 4);
+    }
+    // special case: shift right
+    // (consecutive indexes starting at i0 in b and continuing into a, nothing zeroed)
+    if (i0 > 8 && i0 < 16 && mz == -1 && (m1 ^ 0x88888888 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) == 0) {
+        t1 = _mm256_permute2x128_si256(b, a, 0x21);
+        if (i0 < 12) return _mm256_alignr_epi8(t1, b, (i0 & 3) * 4);
+        else         return _mm256_alignr_epi8(a, t1, (i0 & 3) * 4);
+    }
+
+    // general case: permute and blend and possibly zero
+    const int blank = do_zero ? -1 : -0x100;  // ignore or zero
+
+    // ta holds the elements taken from a (indexes 0 - 7), blank elsewhere
+    Vec8i ta = permute8i <
+        (uint32_t)i0 < 8 ? i0 : blank,
+        (uint32_t)i1 < 8 ? i1 : blank,
+        (uint32_t)i2 < 8 ? i2 : blank,
+        (uint32_t)i3 < 8 ? i3 : blank,
+        (uint32_t)i4 < 8 ? i4 : blank,
+        (uint32_t)i5 < 8 ? i5 : blank,
+        (uint32_t)i6 < 8 ? i6 : blank,
+        (uint32_t)i7 < 8 ? i7 : blank > (a);
+    // tb holds the elements taken from b (indexes 8 - 15, reduced to 0 - 7), blank elsewhere
+    Vec8i tb = permute8i <
+        (uint32_t)(i0^8) < 8 ? (i0^8) : blank,
+        (uint32_t)(i1^8) < 8 ? (i1^8) : blank,
+        (uint32_t)(i2^8) < 8 ? (i2^8) : blank,
+        (uint32_t)(i3^8) < 8 ? (i3^8) : blank,
+        (uint32_t)(i4^8) < 8 ? (i4^8) : blank,
+        (uint32_t)(i5^8) < 8 ? (i5^8) : blank,
+        (uint32_t)(i6^8) < 8 ? (i6^8) : blank,
+        (uint32_t)(i7^8) < 8 ? (i7^8) : blank > (b);
+    if (blank == -1) {    
+        // blank positions were zeroed by the permutes, need only to OR
+        return  _mm256_or_si256(ta, tb); 
+    }
+    // no zeroing, need to blend
+    // maskb collects bit 3 of each index: bit n is set when element n comes from tb
+    const int maskb = ((i0 >> 3) & 1) | ((i1 >> 2) & 2) | ((i2 >> 1) & 4) | (i3 & 8) | 
+        ((i4 << 1) & 0x10) | ((i5 << 2) & 0x20) | ((i6 << 3) & 0x40) | ((i7 << 4) & 0x80);
+    return _mm256_blend_epi32(ta, tb, maskb);  // blend
+}
+
+// Unsigned variant of blend8i: same index rules, result returned as Vec8ui
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 
+static inline Vec8ui blend8ui(Vec8ui const & a, Vec8ui const & b) {
+    // blending only moves elements around, so the signed implementation is reused
+    Vec8i const blended = blend8i<i0,i1,i2,i3,i4,i5,i6,i7> (a,b);
+    return Vec8ui(blended);
+}
+
+
+// Blend the 16-bit elements of two vectors according to compile-time indexes.
+// Template parameters i0..i15 give the source of result elements 0..15:
+// 0 - 15 select an element of a, 16 - 31 select an element of b, and a
+// negative index gives zero.
+// All tests below are on compile-time constants; the first special case that
+// fits the indexes is used, otherwise the general permute-and-blend at the end.
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16s blend16s(Vec16s const & a, Vec16s const & b) {  
+    //  #ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+
+    // collect bit 4 of each index
+    // (bit n of m1 is set when index n has bit 4 set, i.e. element n is taken from b)
+    const int m1 = 
+        (i0 &16)>>4  | (i1 &16)>>3  | (i2 &16)>>2  | (i3 &16)>>1  | (i4 &16)     | (i5 &16)<<1  | (i6 &16)<<2  | (i7 &16)<<3  | 
+        (i8 &16)<<4  | (i9 &16)<<5  | (i10&16)<<6  | (i11&16)<<7  | (i12&16)<<8  | (i13&16)<<9  | (i14&16)<<10 | (i15&16)<<11 ;
+
+    // check which elements to set to zero
+    // (bit n of mz is 1 when index n is not negative; mz == 0xFFFF means nothing is zeroed)
+    const int mz = 0x0000FFFF ^ (
+        (i0 <0)     | (i1 <0)<<1  | (i2 <0)<<2  | (i3 <0)<<3  | (i4 <0)<<4  | (i5 <0)<<5  | (i6 <0)<<6  | (i7 <0)<<7  | 
+        (i8 <0)<<8  | (i9 <0)<<9  | (i10<0)<<10 | (i11<0)<<11 | (i12<0)<<12 | (i13<0)<<13 | (i14<0)<<14 | (i15<0)<<15 );
+
+    __m256i t1, mask;
+
+    // special case: all zero
+    if (mz == 0) return  _mm256_setzero_si256();
+
+    // special case: all from a
+    if ((m1 & mz) == 0) {
+        return permute16s<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a);
+    }
+
+    // special case: all from b
+    if (((m1 ^ 0xFFFF) & mz) == 0) {
+        return permute16s<i0^16,i1^16,i2^16,i3^16,i4^16,i5^16,i6^16,i7^16,i8^16,i9^16,i10^16,i11^16,i12^16,i13^16,i14^16,i15^16 > (b);
+    }
+
+    // special case: blend without permute
+    // (every non-negative index refers to its own position in a or b)
+    if ((i0 <0||(i0 &15)== 0) && (i1 <0||(i1 &15)== 1) && (i2 <0||(i2 &15)== 2) && (i3 <0||(i3 &15)== 3) && 
+        (i4 <0||(i4 &15)== 4) && (i5 <0||(i5 &15)== 5) && (i6 <0||(i6 &15)== 6) && (i7 <0||(i7 &15)== 7) && 
+        (i8 <0||(i8 &15)== 8) && (i9 <0||(i9 &15)== 9) && (i10<0||(i10&15)==10) && (i11<0||(i11&15)==11) && 
+        (i12<0||(i12&15)==12) && (i13<0||(i13&15)==13) && (i14<0||(i14&15)==14) && (i15<0||(i15&15)==15)) {
+
+        // two 16-bit elements are packed into each 32-bit mask constant; all ones selects b
+        mask = constant8i <
+            ((i0 & 16) ? 0xFFFF : 0) | ((i1 & 16) ? 0xFFFF0000 : 0),
+            ((i2 & 16) ? 0xFFFF : 0) | ((i3 & 16) ? 0xFFFF0000 : 0),
+            ((i4 & 16) ? 0xFFFF : 0) | ((i5 & 16) ? 0xFFFF0000 : 0),
+            ((i6 & 16) ? 0xFFFF : 0) | ((i7 & 16) ? 0xFFFF0000 : 0),
+            ((i8 & 16) ? 0xFFFF : 0) | ((i9 & 16) ? 0xFFFF0000 : 0),
+            ((i10& 16) ? 0xFFFF : 0) | ((i11& 16) ? 0xFFFF0000 : 0),
+            ((i12& 16) ? 0xFFFF : 0) | ((i13& 16) ? 0xFFFF0000 : 0),
+            ((i14& 16) ? 0xFFFF : 0) | ((i15& 16) ? 0xFFFF0000 : 0) > ();
+
+        t1 = _mm256_blendv_epi8(a, b, mask);  // blend
+
+        if (mz != 0xFFFF) {
+            // zero some elements
+            mask = constant8i <
+                (i0  < 0 ? 0 : 0xFFFF) | (i1  < 0 ? 0 : 0xFFFF0000),
+                (i2  < 0 ? 0 : 0xFFFF) | (i3  < 0 ? 0 : 0xFFFF0000),
+                (i4  < 0 ? 0 : 0xFFFF) | (i5  < 0 ? 0 : 0xFFFF0000),
+                (i6  < 0 ? 0 : 0xFFFF) | (i7  < 0 ? 0 : 0xFFFF0000),
+                (i8  < 0 ? 0 : 0xFFFF) | (i9  < 0 ? 0 : 0xFFFF0000),
+                (i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000),
+                (i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000),
+                (i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000) > ();
+            return _mm256_and_si256(t1, mask);
+        }
+        return t1;
+    }
+
+    // special case: shift left
+    // slb is the shift count in elements, taken from i0 if it is positive, otherwise derived from i15.
+    // Every non-negative index n must equal slb + n.
+    const int slb = i0 > 0 ? i0 : i15 - 15;
+    if (slb > 0 && slb < 16 
+        && (i0==slb+ 0||i0<0) && (i1==slb+ 1||i1<0) && (i2 ==slb+ 2||i2 <0) && (i3 ==slb+ 3||i3 <0) && (i4 ==slb+ 4||i4 <0) && (i5 ==slb+ 5||i5 <0) && (i6 ==slb+ 6||i6 <0) && (i7 ==slb+ 7||i7 <0)
+        && (i8==slb+ 8||i8<0) && (i9==slb+ 9||i9<0) && (i10==slb+10||i10<0) && (i11==slb+11||i11<0) && (i12==slb+12||i12<0) && (i13==slb+13||i13<0) && (i14==slb+14||i14<0) && (i15==slb+15||i15<0)) {
+        t1 = _mm256_permute2x128_si256(a, b, 0x21);
+        // the alignr count is in bytes, hence the factor 2 for 16-bit elements
+        if (slb < 8) t1 = _mm256_alignr_epi8(t1, a, (slb & 7) * 2);
+        else         t1 = _mm256_alignr_epi8(b, t1, (slb & 7) * 2);
+        if (mz != 0xFFFF) {
+            // zero some elements
+            mask = constant8i <
+                (i0  < 0 ? 0 : 0xFFFF) | (i1  < 0 ? 0 : 0xFFFF0000),
+                (i2  < 0 ? 0 : 0xFFFF) | (i3  < 0 ? 0 : 0xFFFF0000),
+                (i4  < 0 ? 0 : 0xFFFF) | (i5  < 0 ? 0 : 0xFFFF0000),
+                (i6  < 0 ? 0 : 0xFFFF) | (i7  < 0 ? 0 : 0xFFFF0000),
+                (i8  < 0 ? 0 : 0xFFFF) | (i9  < 0 ? 0 : 0xFFFF0000),
+                (i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000),
+                (i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000),
+                (i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000) > ();
+            return _mm256_and_si256(t1, mask);
+        }
+        return t1;
+    }
+    // special case: shift right
+    // same test as shift left, with bit 4 of every index flipped so that b comes first and a second
+    const int srb = i0 > 0 ? (i0^16) : (i15^16) - 15;
+    if (srb > 0 && srb < 16
+        && ((i0 ^16)==srb+ 0||i0 <0) && ((i1 ^16)==srb+ 1||i1 <0) && ((i2 ^16)==srb+ 2||i2 <0) && ((i3 ^16)==srb+ 3||i3 <0) && ((i4 ^16)==srb+ 4||i4 <0) && ((i5 ^16)==srb+ 5||i5 <0) && ((i6 ^16)==srb+ 6||i6 <0) && ((i7 ^16)==srb+ 7||i7 <0)
+        && ((i8 ^16)==srb+ 8||i8 <0) && ((i9 ^16)==srb+ 9||i9 <0) && ((i10^16)==srb+10||i10<0) && ((i11^16)==srb+11||i11<0) && ((i12^16)==srb+12||i12<0) && ((i13^16)==srb+13||i13<0) && ((i14^16)==srb+14||i14<0) && ((i15^16)==srb+15||i15<0)) {
+        t1 = _mm256_permute2x128_si256(b, a, 0x21);
+        if (srb < 8) t1 = _mm256_alignr_epi8(t1, b, (srb & 7) * 2);
+        else         t1 = _mm256_alignr_epi8(a, t1, (srb & 7) * 2);
+        if (mz != 0xFFFF) {
+            // zero some elements
+            mask = constant8i <
+                (i0  < 0 ? 0 : 0xFFFF) | (i1  < 0 ? 0 : 0xFFFF0000),
+                (i2  < 0 ? 0 : 0xFFFF) | (i3  < 0 ? 0 : 0xFFFF0000),
+                (i4  < 0 ? 0 : 0xFFFF) | (i5  < 0 ? 0 : 0xFFFF0000),
+                (i6  < 0 ? 0 : 0xFFFF) | (i7  < 0 ? 0 : 0xFFFF0000),
+                (i8  < 0 ? 0 : 0xFFFF) | (i9  < 0 ? 0 : 0xFFFF0000),
+                (i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000),
+                (i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000),
+                (i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000) > ();
+            return _mm256_and_si256(t1, mask);
+        }
+        return t1;
+    }
+    
+    // general case: permute and blend and possibly zero
+    const int blank = (mz == 0xFFFF) ? -0x100 : -1;  // ignore or zero
+
+    // permute and blend
+    // ta holds the elements taken from a, tb the elements taken from b;
+    // positions belonging to the other source get the blank index
+    __m256i ta = permute16s <
+        (i0 &16)?blank:i0 , (i1 &16)?blank:i1 , (i2 &16)?blank:i2 , (i3 &16)?blank:i3 ,
+        (i4 &16)?blank:i4 , (i5 &16)?blank:i5 , (i6 &16)?blank:i6 , (i7 &16)?blank:i7 ,
+        (i8 &16)?blank:i8 , (i9 &16)?blank:i9 , (i10&16)?blank:i10, (i11&16)?blank:i11,
+        (i12&16)?blank:i12, (i13&16)?blank:i13, (i14&16)?blank:i14, (i15&16)?blank:i15 > (a);
+
+    __m256i tb = permute16s <
+        ((i0 ^16)&16)?blank:i0 ^16, ((i1 ^16)&16)?blank:i1 ^16, ((i2 ^16)&16)?blank:i2 ^16, ((i3 ^16)&16)?blank:i3 ^16, 
+        ((i4 ^16)&16)?blank:i4 ^16, ((i5 ^16)&16)?blank:i5 ^16, ((i6 ^16)&16)?blank:i6 ^16, ((i7 ^16)&16)?blank:i7 ^16, 
+        ((i8 ^16)&16)?blank:i8 ^16, ((i9 ^16)&16)?blank:i9 ^16, ((i10^16)&16)?blank:i10^16, ((i11^16)&16)?blank:i11^16,
+        ((i12^16)&16)?blank:i12^16, ((i13^16)&16)?blank:i13^16, ((i14^16)&16)?blank:i14^16, ((i15^16)&16)?blank:i15^16 > (b);
+
+    if (blank == -1) {
+        // we have zeroed, need only to OR
+        return _mm256_or_si256(ta, tb);
+    }
+    // no zeroing, need to blend
+    mask = constant8i <
+        ((i0 & 16) ? 0xFFFF : 0) | ((i1 & 16) ? 0xFFFF0000 : 0),
+        ((i2 & 16) ? 0xFFFF : 0) | ((i3 & 16) ? 0xFFFF0000 : 0),
+        ((i4 & 16) ? 0xFFFF : 0) | ((i5 & 16) ? 0xFFFF0000 : 0),
+        ((i6 & 16) ? 0xFFFF : 0) | ((i7 & 16) ? 0xFFFF0000 : 0),
+        ((i8 & 16) ? 0xFFFF : 0) | ((i9 & 16) ? 0xFFFF0000 : 0),
+        ((i10& 16) ? 0xFFFF : 0) | ((i11& 16) ? 0xFFFF0000 : 0),
+        ((i12& 16) ? 0xFFFF : 0) | ((i13& 16) ? 0xFFFF0000 : 0),
+        ((i14& 16) ? 0xFFFF : 0) | ((i15& 16) ? 0xFFFF0000 : 0) > ();
+
+    return _mm256_blendv_epi8(ta, tb, mask);  // blend
+}
+
+// Unsigned variant of blend16s: same index rules, result returned as Vec16us
+template <int i0, int i1, int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16us blend16us(Vec16us const & a, Vec16us const & b) {
+    // blending only moves elements around, so the signed implementation is reused
+    Vec16s const blended = blend16s<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a,b);
+    return Vec16us(blended);
+}
+
+// Blend the 8-bit elements of two vectors according to compile-time indexes.
+// Template parameters i0..i31 give the source of result elements 0..31:
+// 0 - 31 select an element of a, 32 - 63 select an element of b, and a
+// negative index gives zero.
+// All tests below are on compile-time constants; the first special case that
+// fits the indexes is used, otherwise the general permute-and-blend at the end.
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15,
+          int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
+          int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 > 
+static inline Vec32c blend32c(Vec32c const & a, Vec32c const & b) {  
+    //  #ifdef __XOP2__  // Possible future 256-bit XOP extension ?
+
+    // collect bit 5 of each index
+    // (bit n of m1 is set when index n has bit 5 set, i.e. element n is taken from b)
+    const int m1 = 
+        (i0 &32)>>5  | (i1 &32)>>4  | (i2 &32)>>3  | (i3 &32)>>2  | (i4 &32)>>1  | (i5 &32)     | (i6 &32)<<1  | (i7 &32)<<2  | 
+        (i8 &32)<<3  | (i9 &32)<<4  | (i10&32)<<5  | (i11&32)<<6  | (i12&32)<<7  | (i13&32)<<8  | (i14&32)<<9  | (i15&32)<<10 | 
+        (i16&32)<<11 | (i17&32)<<12 | (i18&32)<<13 | (i19&32)<<14 | (i20&32)<<15 | (i21&32)<<16 | (i22&32)<<17 | (i23&32)<<18 | 
+        (i24&32)<<19 | (i25&32)<<20 | (i26&32)<<21 | (i27&32)<<22 | (i28&32)<<23 | (i29&32)<<24 | (i30&32)<<25 | (i31&32)<<26 ;
+
+    // check which elements to set to zero
+    // (bit n of mz is 1 when index n is not negative; mz == -1 means nothing is zeroed)
+    const int mz = ~ (
+        (i0 <0)     | (i1 <0)<<1  | (i2 <0)<<2  | (i3 <0)<<3  | (i4 <0)<<4  | (i5 <0)<<5  | (i6 <0)<<6  | (i7 <0)<<7  | 
+        (i8 <0)<<8  | (i9 <0)<<9  | (i10<0)<<10 | (i11<0)<<11 | (i12<0)<<12 | (i13<0)<<13 | (i14<0)<<14 | (i15<0)<<15 | 
+        (i16<0)<<16 | (i17<0)<<17 | (i18<0)<<18 | (i19<0)<<19 | (i20<0)<<20 | (i21<0)<<21 | (i22<0)<<22 | (i23<0)<<23 | 
+        (i24<0)<<24 | (i25<0)<<25 | (i26<0)<<26 | (i27<0)<<27 | (i28<0)<<28 | (i29<0)<<29 | (i30<0)<<30 | (i31<0)<<31 );
+
+    __m256i t1, mask;
+
+    // special case: all zero
+    if (mz == 0) return  _mm256_setzero_si256();
+
+    // special case: all from a
+    if ((m1 & mz) == 0) {
+        return permute32c<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15,
+            i16,i17,i18,i19,i20,i21,i22,i23,i24,i25,i26,i27,i28,i29,i30,i31> (a);
+    }
+
+    // special case: all from b
+    if ((~m1 & mz) == 0) {
+        return permute32c<i0^32,i1^32,i2^32,i3^32,i4^32,i5^32,i6^32,i7^32,i8^32,i9^32,i10^32,i11^32,i12^32,i13^32,i14^32,i15^32,
+            i16^32,i17^32,i18^32,i19^32,i20^32,i21^32,i22^32,i23^32,i24^32,i25^32,i26^32,i27^32,i28^32,i29^32,i30^32,i31^32> (b);
+    }
+
+    // special case: blend without permute
+    // (every non-negative index refers to its own position in a or b)
+    if ((i0 <0||(i0 &31)== 0) && (i1 <0||(i1 &31)== 1) && (i2 <0||(i2 &31)== 2) && (i3 <0||(i3 &31)== 3) && 
+        (i4 <0||(i4 &31)== 4) && (i5 <0||(i5 &31)== 5) && (i6 <0||(i6 &31)== 6) && (i7 <0||(i7 &31)== 7) && 
+        (i8 <0||(i8 &31)== 8) && (i9 <0||(i9 &31)== 9) && (i10<0||(i10&31)==10) && (i11<0||(i11&31)==11) && 
+        (i12<0||(i12&31)==12) && (i13<0||(i13&31)==13) && (i14<0||(i14&31)==14) && (i15<0||(i15&31)==15) &&
+        (i16<0||(i16&31)==16) && (i17<0||(i17&31)==17) && (i18<0||(i18&31)==18) && (i19<0||(i19&31)==19) && 
+        (i20<0||(i20&31)==20) && (i21<0||(i21&31)==21) && (i22<0||(i22&31)==22) && (i23<0||(i23&31)==23) && 
+        (i24<0||(i24&31)==24) && (i25<0||(i25&31)==25) && (i26<0||(i26&31)==26) && (i27<0||(i27&31)==27) && 
+        (i28<0||(i28&31)==28) && (i29<0||(i29&31)==29) && (i30<0||(i30&31)==30) && (i31<0||(i31&31)==31) ) {
+
+        // four bytes are packed into each 32-bit mask constant: bit 5 of each index
+        // is moved to bit 7 of its byte
+        mask = constant8i <
+            ((i0 <<2)&0x80) | ((i1 <<10)&0x8000) | ((i2 <<18)&0x800000) | (uint32_t(i3 <<26)&0x80000000) ,
+            ((i4 <<2)&0x80) | ((i5 <<10)&0x8000) | ((i6 <<18)&0x800000) | (uint32_t(i7 <<26)&0x80000000) ,
+            ((i8 <<2)&0x80) | ((i9 <<10)&0x8000) | ((i10<<18)&0x800000) | (uint32_t(i11<<26)&0x80000000) ,
+            ((i12<<2)&0x80) | ((i13<<10)&0x8000) | ((i14<<18)&0x800000) | (uint32_t(i15<<26)&0x80000000) ,
+            ((i16<<2)&0x80) | ((i17<<10)&0x8000) | ((i18<<18)&0x800000) | (uint32_t(i19<<26)&0x80000000) ,
+            ((i20<<2)&0x80) | ((i21<<10)&0x8000) | ((i22<<18)&0x800000) | (uint32_t(i23<<26)&0x80000000) ,
+            ((i24<<2)&0x80) | ((i25<<10)&0x8000) | ((i26<<18)&0x800000) | (uint32_t(i27<<26)&0x80000000) ,
+            ((i28<<2)&0x80) | ((i29<<10)&0x8000) | ((i30<<18)&0x800000) | (uint32_t(i31<<26)&0x80000000) > ();
+
+        t1 = _mm256_blendv_epi8(a, b, mask);  // blend
+
+        if (mz != -1) {
+            // zero some elements
+            const __m256i maskz = constant8i <
+                (i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000),
+                (i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000),
+                (i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000),
+                (i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000),
+                (i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000),
+                (i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000),
+                (i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000),
+                (i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000) > ();
+            return _mm256_and_si256(t1, maskz);
+        }
+        return t1;
+    }
+
+    // special case: shift left
+    // slb is the shift count in elements, taken from i0 if it is positive, otherwise derived from i31.
+    // Every non-negative index n must equal slb + n.
+    const int slb = i0 > 0 ? i0 : i31 - 31;
+    if (slb > 0 && slb < 32 
+        && (i0 ==slb+ 0||i0 <0) && (i1 ==slb+ 1||i1 <0) && (i2 ==slb+ 2||i2 <0) && (i3 ==slb+ 3||i3 <0)
+        && (i4 ==slb+ 4||i4 <0) && (i5 ==slb+ 5||i5 <0) && (i6 ==slb+ 6||i6 <0) && (i7 ==slb+ 7||i7 <0)
+        && (i8 ==slb+ 8||i8 <0) && (i9 ==slb+ 9||i9 <0) && (i10==slb+10||i10<0) && (i11==slb+11||i11<0)
+        && (i12==slb+12||i12<0) && (i13==slb+13||i13<0) && (i14==slb+14||i14<0) && (i15==slb+15||i15<0)
+        && (i16==slb+16||i16<0) && (i17==slb+17||i17<0) && (i18==slb+18||i18<0) && (i19==slb+19||i19<0)
+        && (i20==slb+20||i20<0) && (i21==slb+21||i21<0) && (i22==slb+22||i22<0) && (i23==slb+23||i23<0)
+        && (i24==slb+24||i24<0) && (i25==slb+25||i25<0) && (i26==slb+26||i26<0) && (i27==slb+27||i27<0)
+        && (i28==slb+28||i28<0) && (i29==slb+29||i29<0) && (i30==slb+30||i30<0) && (i31==slb+31||i31<0)) {
+        t1 = _mm256_permute2x128_si256(a, b, 0x21);
+        if (slb < 16) t1 = _mm256_alignr_epi8(t1, a, slb & 15);
+        else          t1 = _mm256_alignr_epi8(b, t1, slb & 15);
+        if (mz != -1) {
+            // zero some elements
+            const __m256i maskz = constant8i <
+                (i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000),
+                (i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000),
+                (i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000),
+                (i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000),
+                (i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000),
+                (i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000),
+                (i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000),
+                (i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000) > ();
+            return _mm256_and_si256(t1, maskz);
+        }
+        return t1;
+    }
+    // special case: shift right
+    // same test as shift left, with bit 5 of every index flipped so that b comes first and a second
+    const int srb = i0 > 0 ? (i0^32) : (i31^32) - 31;
+    if (srb > 0 && srb < 32
+        && ((i0 ^32)==srb+ 0||i0 <0) && ((i1 ^32)==srb+ 1||i1 <0) && ((i2 ^32)==srb+ 2||i2 <0) && ((i3 ^32)==srb+ 3||i3 <0)
+        && ((i4 ^32)==srb+ 4||i4 <0) && ((i5 ^32)==srb+ 5||i5 <0) && ((i6 ^32)==srb+ 6||i6 <0) && ((i7 ^32)==srb+ 7||i7 <0)
+        && ((i8 ^32)==srb+ 8||i8 <0) && ((i9 ^32)==srb+ 9||i9 <0) && ((i10^32)==srb+10||i10<0) && ((i11^32)==srb+11||i11<0)
+        && ((i12^32)==srb+12||i12<0) && ((i13^32)==srb+13||i13<0) && ((i14^32)==srb+14||i14<0) && ((i15^32)==srb+15||i15<0)
+        && ((i16^32)==srb+16||i16<0) && ((i17^32)==srb+17||i17<0) && ((i18^32)==srb+18||i18<0) && ((i19^32)==srb+19||i19<0)
+        && ((i20^32)==srb+20||i20<0) && ((i21^32)==srb+21||i21<0) && ((i22^32)==srb+22||i22<0) && ((i23^32)==srb+23||i23<0)
+        && ((i24^32)==srb+24||i24<0) && ((i25^32)==srb+25||i25<0) && ((i26^32)==srb+26||i26<0) && ((i27^32)==srb+27||i27<0)
+        && ((i28^32)==srb+28||i28<0) && ((i29^32)==srb+29||i29<0) && ((i30^32)==srb+30||i30<0) && ((i31^32)==srb+31||i31<0)) {
+        t1 = _mm256_permute2x128_si256(b, a, 0x21);
+        if (srb < 16) t1 = _mm256_alignr_epi8(t1, b, srb & 15);
+        else          t1 = _mm256_alignr_epi8(a, t1, srb & 15);
+        if (mz != -1) {
+            // zero some elements
+            const __m256i maskz = constant8i <
+                (i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000),
+                (i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000),
+                (i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000),
+                (i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000),
+                (i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000),
+                (i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000),
+                (i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000),
+                (i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000) > ();
+            return _mm256_and_si256(t1, maskz);
+        }
+        return t1;
+    }
+
+    // general case: permute and blend and possibly zero
+    const int blank = (mz == -1) ? -0x100 : -1;  // ignore or zero
+
+    // permute and blend
+    // ta holds the elements taken from a, tb the elements taken from b;
+    // positions belonging to the other source get the blank index
+    __m256i ta = permute32c <
+        (i0 &32)?blank:i0 , (i1 &32)?blank:i1 , (i2 &32)?blank:i2 , (i3 &32)?blank:i3 , 
+        (i4 &32)?blank:i4 , (i5 &32)?blank:i5 , (i6 &32)?blank:i6 , (i7 &32)?blank:i7 , 
+        (i8 &32)?blank:i8 , (i9 &32)?blank:i9 , (i10&32)?blank:i10, (i11&32)?blank:i11,
+        (i12&32)?blank:i12, (i13&32)?blank:i13, (i14&32)?blank:i14, (i15&32)?blank:i15, 
+        (i16&32)?blank:i16, (i17&32)?blank:i17, (i18&32)?blank:i18, (i19&32)?blank:i19, 
+        (i20&32)?blank:i20, (i21&32)?blank:i21, (i22&32)?blank:i22, (i23&32)?blank:i23, 
+        (i24&32)?blank:i24, (i25&32)?blank:i25, (i26&32)?blank:i26, (i27&32)?blank:i27, 
+        (i28&32)?blank:i28, (i29&32)?blank:i29, (i30&32)?blank:i30, (i31&32)?blank:i31 > (a);
+
+    __m256i tb = permute32c <
+        ((i0 ^32)&32)?blank:i0 ^32, ((i1 ^32)&32)?blank:i1 ^32, ((i2 ^32)&32)?blank:i2 ^32, ((i3 ^32)&32)?blank:i3 ^32, 
+        ((i4 ^32)&32)?blank:i4 ^32, ((i5 ^32)&32)?blank:i5 ^32, ((i6 ^32)&32)?blank:i6 ^32, ((i7 ^32)&32)?blank:i7 ^32, 
+        ((i8 ^32)&32)?blank:i8 ^32, ((i9 ^32)&32)?blank:i9 ^32, ((i10^32)&32)?blank:i10^32, ((i11^32)&32)?blank:i11^32,
+        ((i12^32)&32)?blank:i12^32, ((i13^32)&32)?blank:i13^32, ((i14^32)&32)?blank:i14^32, ((i15^32)&32)?blank:i15^32,
+        ((i16^32)&32)?blank:i16^32, ((i17^32)&32)?blank:i17^32, ((i18^32)&32)?blank:i18^32, ((i19^32)&32)?blank:i19^32,
+        ((i20^32)&32)?blank:i20^32, ((i21^32)&32)?blank:i21^32, ((i22^32)&32)?blank:i22^32, ((i23^32)&32)?blank:i23^32,
+        ((i24^32)&32)?blank:i24^32, ((i25^32)&32)?blank:i25^32, ((i26^32)&32)?blank:i26^32, ((i27^32)&32)?blank:i27^32,
+        ((i28^32)&32)?blank:i28^32, ((i29^32)&32)?blank:i29^32, ((i30^32)&32)?blank:i30^32, ((i31^32)&32)?blank:i31^32 > (b);
+
+    if (blank == -1) {
+        // we have zeroed, need only to OR
+        return _mm256_or_si256(ta, tb);
+    }
+    // no zeroing, need to blend
+    mask = constant8i <
+        ((i0 <<2)&0x80) | ((i1 <<10)&0x8000) | ((i2 <<18)&0x800000) | (uint32_t(i3 <<26)&0x80000000) ,
+        ((i4 <<2)&0x80) | ((i5 <<10)&0x8000) | ((i6 <<18)&0x800000) | (uint32_t(i7 <<26)&0x80000000) ,
+        ((i8 <<2)&0x80) | ((i9 <<10)&0x8000) | ((i10<<18)&0x800000) | (uint32_t(i11<<26)&0x80000000) ,
+        ((i12<<2)&0x80) | ((i13<<10)&0x8000) | ((i14<<18)&0x800000) | (uint32_t(i15<<26)&0x80000000) ,
+        ((i16<<2)&0x80) | ((i17<<10)&0x8000) | ((i18<<18)&0x800000) | (uint32_t(i19<<26)&0x80000000) ,
+        ((i20<<2)&0x80) | ((i21<<10)&0x8000) | ((i22<<18)&0x800000) | (uint32_t(i23<<26)&0x80000000) ,
+        ((i24<<2)&0x80) | ((i25<<10)&0x8000) | ((i26<<18)&0x800000) | (uint32_t(i27<<26)&0x80000000) ,
+        ((i28<<2)&0x80) | ((i29<<10)&0x8000) | ((i30<<18)&0x800000) | (uint32_t(i31<<26)&0x80000000) > ();
+
+    return _mm256_blendv_epi8(ta, tb, mask);  // blend
+}
+
+// Compile-time blend of two vectors of 32 unsigned bytes.
+// The 32 index parameters are handed unchanged to blend32c; the result is
+// re-wrapped as Vec32uc.
+template <
+    int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7,
+    int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15,
+    int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
+    int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
+static inline Vec32uc blend32uc(Vec32uc const & a, Vec32uc const & b) {
+    __m256i const mixed = blend32c<
+        i0,  i1,  i2,  i3,  i4,  i5,  i6,  i7,
+        i8,  i9,  i10, i11, i12, i13, i14, i15,
+        i16, i17, i18, i19, i20, i21, i22, i23,
+        i24, i25, i26, i27, i28, i29, i30, i31 > (a, b);
+    return Vec32uc(mixed);
+}
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec8i a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+* Vec8i b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8i c;
+* c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+*
+*****************************************************************************/
+
+// Look up 32 bytes in a 32-byte table held in a vector.
+// Each byte of index selects one byte of table.
+static inline Vec32c lookup32(Vec32c const & index, Vec32c const & table) {
+#ifdef __XOP__  // AMD XOP instruction set. Use VPPERM
+    Vec16c lo_result = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_low());
+    Vec16c hi_result = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_high());
+    return Vec32c(lo_result, hi_result);
+#else
+    // VPSHUFB only selects within each 128-bit lane, so two shuffles are made:
+    // one on the table as it is and one on the table with its halves swapped.
+    // XOR-ing bit 4 of the index in one lane and then adding 0x70 sets bit 7 of
+    // the shuffle control whenever the wanted byte is not in the lane being
+    // shuffled, which makes VPSHUFB output zero there, so the two partial
+    // results can simply be OR'ed together.
+    Vec32c flip_upper   = constant8i<0,0,0,0,0x10101010,0x10101010,0x10101010,0x10101010>();
+    Vec32c flip_lower   = constant8i<0x10101010,0x10101010,0x10101010,0x10101010,0,0,0,0>();
+    Vec32c swapped      = _mm256_permute4x64_epi64(table, 0x4E);   // low and high parts swapped
+    Vec32c from_direct  = _mm256_shuffle_epi8(table,   (index ^ flip_upper) + 0x70);
+    Vec32c from_swapped = _mm256_shuffle_epi8(swapped, (index ^ flip_lower) + 0x70);
+    return from_direct | from_swapped;
+#endif
+}
+
+// Table lookup with 8-bit indexes: returns table[index[i]] for each of the
+// 32 byte elements, where table points to an array of n bytes.
+// n <= 16 and n <= 32 are handled with in-register shuffles; larger tables
+// use four 32-bit gathers with byte scale, one per byte position within
+// each dword.
+// NOTE: in the gather path every gather reads a full 32-bit word starting at
+// the indexed byte, so up to 3 bytes beyond table[n-1] may be read (and then
+// masked away).
+template <int n>
+static inline Vec32c lookup(Vec32uc const & index, void const * table) {
+    if (n <=  0) return 0;
+    if (n <= 16) {
+        Vec16c tt = Vec16c().load(table);
+        Vec16c r0 = lookup16(index.get_low(),  tt);
+        Vec16c r1 = lookup16(index.get_high(), tt);
+        return Vec32c(r0, r1);
+    }
+    if (n <= 32) return lookup32(index, Vec32c().load(table));
+    // n > 32. Limit index.
+    // An 8-bit index can never exceed 255, so the limit is capped at 255.
+    // Without the cap, uint8_t(n-1) wraps around for n > 256 and a table of
+    // e.g. 300 bytes would have every index clamped to 43.
+    const int last = n > 256 ? 255 : n - 1;
+    Vec32uc index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec32uc(index) & uint8_t(last);
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        index1 = min(Vec32uc(index), uint8_t(last));
+    }
+    Vec8ui mask0 = Vec8ui(0x000000FF);  // mask 8 bits
+    Vec32c t0 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & Vec8ui(index1)),      1); // positions 0, 4, 8,  ...
+    Vec32c t1 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & _mm256_srli_epi32(index1, 8)), 1); // positions 1, 5, 9,  ...
+    Vec32c t2 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & _mm256_srli_epi32(index1,16)), 1); // positions 2, 6, 10, ...
+    Vec32c t3 = _mm256_i32gather_epi32((const int *)table,         _mm256_srli_epi32(index1,24), 1); // positions 3, 7, 11, ...
+    // keep only the low byte of each gathered dword and move it back to its byte position
+    t0 = t0 & mask0;
+    t1 = _mm256_slli_epi32(t1 & mask0,  8);
+    t2 = _mm256_slli_epi32(t2 & mask0, 16);
+    t3 = _mm256_slli_epi32(t3,         24);
+    return (t0 | t3) | (t1 | t2);
+}
+
+// Signed-index overload: reinterprets the index bytes as unsigned and
+// forwards to the Vec32uc version.
+template <int n>
+static inline Vec32c lookup(Vec32c const & index, void const * table) {
+    Vec32uc const unsigned_index(index);
+    return lookup<n>(unsigned_index, table);
+}
+
+
+// Look up 16 16-bit elements in a 16-element table held in a vector.
+// Each 16-bit index i is expanded to the byte pair (2*i, 2*i+1) so that the
+// byte lookup fetches both halves of the wanted element.
+static inline Vec16s lookup16(Vec16s const & index, Vec16s const & table) {
+    Vec16s const byte_pairs = index * 0x202 + 0x100;   // low byte = 2*i, high byte = 2*i+1
+    Vec32c const fetched    = lookup32(Vec32c(byte_pairs), Vec32c(table));
+    return Vec16s(fetched);
+}
+
+// Table lookup with 16-bit indexes: returns table[index[i]] for each of the
+// 16 elements, where table points to an array of n 16-bit integers.
+// n <= 8 and n <= 16 are handled with in-register lookups; larger tables use
+// two 32-bit gathers with scale 2, one for the even and one for the odd
+// element positions.
+// NOTE: in the gather path every gather reads a full 32-bit word starting at
+// the indexed element, so 2 bytes beyond table[n-1] may be read (and then
+// discarded by the blend).
+template <int n>
+static inline Vec16s lookup(Vec16s const & index, void const * table) {
+    if (n <=  0) return 0;
+    if (n <=  8) {
+        Vec8s table1 = Vec8s().load(table);
+        return Vec16s(
+            lookup8 (index.get_low(),  table1),
+            lookup8 (index.get_high(), table1));
+    }
+    if (n <= 16) return lookup16(index, Vec16s().load(table));
+    // n > 16. Limit index.
+    // A 16-bit index can never exceed 65535, so the limit is capped there.
+    // Without the cap, n-1 is truncated to 16 bits when broadcast, which
+    // gives a wrong clamp value for n > 65536.
+    const int last = n > 65536 ? 65535 : n - 1;
+    Vec16us index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec16us(index) & last;
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        index1 = min(Vec16us(index), last);
+    }
+    Vec16s t1 = _mm256_i32gather_epi32((const int *)table, __m256i(Vec8ui(index1) & 0x0000FFFF), 2);  // even positions
+    Vec16s t2 = _mm256_i32gather_epi32((const int *)table, _mm256_srli_epi32(index1, 16) , 2);        // odd  positions
+    // take the low word of each dword from t1 (even) and t2 (odd) alternately
+    return blend16s<0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30>(t1, t2);
+}
+
+// Look up 8 32-bit elements in an 8-element table held in a vector (VPERMD).
+static inline Vec8i lookup8(Vec8i const & index, Vec8i const & table) {
+    __m256i const picked = _mm256_permutevar8x32_epi32(table, index);
+    return picked;
+}
+
+// Table lookup with 32-bit indexes: returns table[index[i]] for each of the
+// 8 elements, where table points to an array of n 32-bit integers.
+template <int n>
+static inline Vec8i lookup(Vec8i const & index, void const * table) {
+    if (n <= 0) return 0;
+    if (n <= 8) {
+        // whole table fits in one vector
+        Vec8i whole = Vec8i().load(table);
+        return lookup8(index, whole);
+    }
+    if (n <= 16) {
+        // table fits in two vectors: look up in both and pick by index range
+        Vec8i lower      = Vec8i().load(table);
+        Vec8i upper      = Vec8i().load((int32_t*)table + 8);
+        Vec8i from_lower = lookup8(index, lower);
+        Vec8i from_upper = lookup8(index, upper);
+        Vec8ib in_upper  = index > 7;
+        return select(in_upper, from_upper, from_lower);
+    }
+    // n > 16: bring the index into range, then gather from memory
+    Vec8ui bounded;
+    if ((n & (n-1)) != 0) {
+        // n is not a power of 2, limit to n-1
+        bounded = min(Vec8ui(index), n-1);
+    }
+    else {
+        // n is a power of 2, make index modulo n
+        bounded = Vec8ui(index) & (n-1);
+    }
+    return _mm256_i32gather_epi32((const int *)table, bounded, 4);
+}
+
+// Look up 4 64-bit elements in a 4-element table held in a vector.
+// Each 64-bit index i is expanded to the dword pair (2*i, 2*i+1) so that the
+// 32-bit lookup fetches both halves of the wanted element.
+static inline Vec4q lookup4(Vec4q const & index, Vec4q const & table) {
+    Vec4q const dword_pairs = index * 0x200000002ll + 0x100000000ll;   // low dword = 2*i, high dword = 2*i+1
+    Vec8i const fetched     = lookup8(Vec8i(dword_pairs), Vec8i(table));
+    return Vec4q(fetched);
+}
+
+// Table lookup with 64-bit indexes: returns table[index[i]] for each of the
+// 4 elements, where table points to an array of n 64-bit integers.
+template <int n>
+static inline Vec4q lookup(Vec4q const & index, int64_t const * table) {
+    if (n <= 0) return 0;
+    // n > 0: bring the index into range
+    Vec4uq bounded;
+    if ((n & (n-1)) != 0) {
+        // n is not a power of 2, limit to n-1.
+        // There is no 64-bit min instruction, but the 32-bit unsigned min can
+        // be used because n is a 32-bit integer: the low dword is limited to
+        // n-1 and the high dword is limited to 0.
+        bounded = Vec4uq(min(Vec8ui(index), constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>()));
+    }
+    else {
+        // n is a power of 2, make index modulo n
+        bounded = Vec4uq(index) & (n-1);
+    }
+// Compilers disagree on the pointer type of the 64-bit gather intrinsic:
+// Intel and MS use __int64, gcc uses long long
+#if defined (__clang__) && CLANG_VERSION < 30400
+// clang 3.3 takes const int * as in the official Intel documentation (which is wrong)
+    return _mm256_i64gather_epi64((const int *)table, bounded, 8);
+#elif defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
+// Old MS and Intel use non-standard type __int64
+    return _mm256_i64gather_epi64((const int64_t *)table, bounded, 8);
+#else
+// Gnu, Clang 3.4, MS 11.0
+    return _mm256_i64gather_epi64((const long long *)table, bounded, 8);
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Other permutations with variable indexes
+*
+*****************************************************************************/
+
+// Function shift_bytes_up: shift whole vector left by b bytes, i.e. byte i of
+// the input ends up at position i+b and the vacated low positions are zero.
+// You may use a permute function instead if b is a compile-time constant.
+// The 256-bit shift is composed from the 128-bit shift_bytes_up /
+// shift_bytes_down helpers; behaviour for b outside 0..32 depends on those
+// helpers.
+static inline Vec32c shift_bytes_up(Vec32c const & a, int b) {
+    if (b < 16) {
+        // low half: shifted in place.
+        // high half: shifted in place, plus the bytes carried over from the low half
+        return Vec32c(shift_bytes_up(a.get_low(),b), shift_bytes_up(a.get_high(),b) | shift_bytes_down(a.get_low(),16-b));
+    }
+    else {
+        // b >= 16: the low half of the result is all zero and the high half
+        // receives the LOW half of the input, shifted up by the remaining
+        // b-16 bytes (the input's high half is shifted out entirely)
+        return Vec32c(Vec16c(0), shift_bytes_up(a.get_low(),b-16));
+    }
+}
+
+// Function shift_bytes_down: shift whole vector right by b bytes, i.e. byte i
+// of the input ends up at position i-b.
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec32c shift_bytes_down(Vec32c const & a, int b) {
+    if (b >= 16) {
+        // the high half moves into the low half; the high half becomes zero
+        return Vec32c(shift_bytes_down(a.get_high(),b-16), Vec16c(0));
+    }
+    // b < 16: each half is shifted in place, and the low half additionally
+    // receives the bytes carried over from the high half
+    return Vec32c(shift_bytes_down(a.get_low(),b) | shift_bytes_up(a.get_high(),16-b), shift_bytes_down(a.get_high(),b));
+}
+
+/*****************************************************************************
+*
+*          Gather functions with fixed indexes
+*
+*****************************************************************************/
+// Load elements from array a with indices i0, i1, i2, i3, i4, i5, i6, i7.
+// Depending on how far apart the (compile-time) indexes are, this uses
+//  - one contiguous load followed by a permute,
+//  - two contiguous loads followed by a blend, or
+//  - an AVX2 gather instruction.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8i gather8i(void const * a) {
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index;  // Error message if index is negative
+    // smallest index
+    const int lo01   = i0 < i1 ? i0 : i1;
+    const int lo23   = i2 < i3 ? i2 : i3;
+    const int lo45   = i4 < i5 ? i4 : i5;
+    const int lo67   = i6 < i7 ? i6 : i7;
+    const int lo0123 = lo01 < lo23 ? lo01 : lo23;
+    const int lo4567 = lo45 < lo67 ? lo45 : lo67;
+    const int lowest = lo0123 < lo4567 ? lo0123 : lo4567;
+    // biggest index
+    const int hi01    = i0 > i1 ? i0 : i1;
+    const int hi23    = i2 > i3 ? i2 : i3;
+    const int hi45    = i4 > i5 ? i4 : i5;
+    const int hi67    = i6 > i7 ? i6 : i7;
+    const int hi0123  = hi01 > hi23 ? hi01 : hi23;
+    const int hi4567  = hi45 > hi67 ? hi45 : hi67;
+    const int highest = hi0123 > hi4567 ? hi0123 : hi4567;
+
+    if (highest - lowest <= 7) {
+        // all indexes lie within one block of 8: load it and permute
+        if (highest > 7) {
+            // let the block end at the highest index,
+            // to make sure we don't read past the end of the array
+            Vec8i block = Vec8i().load((int32_t const *)a + highest-7);
+            return permute8i<i0-highest+7, i1-highest+7, i2-highest+7, i3-highest+7, i4-highest+7, i5-highest+7, i6-highest+7, i7-highest+7>(block);
+        }
+        else {
+            Vec8i block = Vec8i().load((int32_t const *)a + lowest);
+            return permute8i<i0-lowest, i1-lowest, i2-lowest, i3-lowest, i4-lowest, i5-lowest, i6-lowest, i7-lowest>(block);
+        }
+    }
+    if ((i0<lowest+8 || i0>highest-8) && (i1<lowest+8 || i1>highest-8) && (i2<lowest+8 || i2>highest-8) && (i3<lowest+8 || i3>highest-8)
+    &&  (i4<lowest+8 || i4>highest-8) && (i5<lowest+8 || i5>highest-8) && (i6<lowest+8 || i6>highest-8) && (i7<lowest+8 || i7>highest-8)) {
+        // every index lies either in the block starting at the lowest index or
+        // in the block ending at the highest index: load both and blend
+        Vec8i first = Vec8i().load((int32_t const *)a + lowest);
+        Vec8i last  = Vec8i().load((int32_t const *)a + highest-7);
+        // blend indexes: 0-7 select from first, 8-15 select from last
+        const int k0 = i0<lowest+8 ? i0-lowest : 15-highest+i0;
+        const int k1 = i1<lowest+8 ? i1-lowest : 15-highest+i1;
+        const int k2 = i2<lowest+8 ? i2-lowest : 15-highest+i2;
+        const int k3 = i3<lowest+8 ? i3-lowest : 15-highest+i3;
+        const int k4 = i4<lowest+8 ? i4-lowest : 15-highest+i4;
+        const int k5 = i5<lowest+8 ? i5-lowest : 15-highest+i5;
+        const int k6 = i6<lowest+8 ? i6-lowest : 15-highest+i6;
+        const int k7 = i7<lowest+8 ? i7-lowest : 15-highest+i7;
+        return blend8i<k0, k1, k2, k3, k4, k5, k6, k7>(first, last);
+    }
+    // indexes are too scattered: use AVX2 gather
+    return _mm256_i32gather_epi32((const int *)a, Vec8i(i0,i1,i2,i3,i4,i5,i6,i7), 4);
+}
+
+// Load 64-bit elements from array a with indices i0, i1, i2, i3.
+// Depending on how far apart the (compile-time) indexes are, this uses
+//  - one contiguous load followed by a permute,
+//  - two contiguous loads followed by a blend, or
+//  - an AVX2 gather instruction.
+template <int i0, int i1, int i2, int i3>
+static inline Vec4q gather4q(void const * a) {
+    Static_error_check<(i0|i1|i2|i3)>=0> Negative_array_index;  // Error message if index is negative
+    // smallest index
+    const int lo01    = i0 < i1 ? i0 : i1;
+    const int lo23    = i2 < i3 ? i2 : i3;
+    const int lowest  = lo01 < lo23 ? lo01 : lo23;
+    // biggest index
+    const int hi01    = i0 > i1 ? i0 : i1;
+    const int hi23    = i2 > i3 ? i2 : i3;
+    const int highest = hi01 > hi23 ? hi01 : hi23;
+    if (highest - lowest <= 3) {
+        // all indexes lie within one block of 4: load it and permute
+        if (highest > 3) {
+            // let the block end at the highest index,
+            // to make sure we don't read past the end of the array
+            Vec4q block = Vec4q().load((int64_t const *)a + highest-3);
+            return permute4q<i0-highest+3, i1-highest+3, i2-highest+3, i3-highest+3>(block);
+        }
+        else {
+            Vec4q block = Vec4q().load((int64_t const *)a + lowest);
+            return permute4q<i0-lowest, i1-lowest, i2-lowest, i3-lowest>(block);
+        }
+    }
+    if ((i0<lowest+4 || i0>highest-4) && (i1<lowest+4 || i1>highest-4) && (i2<lowest+4 || i2>highest-4) && (i3<lowest+4 || i3>highest-4)) {
+        // every index lies either in the block starting at the lowest index or
+        // in the block ending at the highest index: load both and blend
+        Vec4q first = Vec4q().load((int64_t const *)a + lowest);
+        Vec4q last  = Vec4q().load((int64_t const *)a + highest-3);
+        // blend indexes: 0-3 select from first, 4-7 select from last
+        const int k0 = i0<lowest+4 ? i0-lowest : 7-highest+i0;
+        const int k1 = i1<lowest+4 ? i1-lowest : 7-highest+i1;
+        const int k2 = i2<lowest+4 ? i2-lowest : 7-highest+i2;
+        const int k3 = i3<lowest+4 ? i3-lowest : 7-highest+i3;
+        return blend4q<k0, k1, k2, k3>(first, last);
+    }
+    // indexes are too scattered: use AVX2 gather.
+    // Compilers disagree on the pointer type of the 64-bit gather intrinsic:
+    // Intel and MS use __int64, gcc uses long long
+#if defined (__clang__) && CLANG_VERSION < 30400
+    // clang 3.3 takes const int * as in the official Intel documentation (which is wrong)
+    return _mm256_i32gather_epi64((const int *)a, Vec4i(i0,i1,i2,i3), 8);
+#elif defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
+    // Old MS and Intel use non-standard type __int64
+    return _mm256_i32gather_epi64((const int64_t *)a, Vec4i(i0,i1,i2,i3), 8);
+#else
+    // Gnu, Clang 3.4, MS 11.0
+    return _mm256_i32gather_epi64((const long long *)a, Vec4i(i0,i1,i2,i3), 8);
+#endif
+}
+
+
+/*****************************************************************************
+*
+*          Functions for conversion between integer sizes
+*
+*****************************************************************************/
+
+// Extend 8-bit integers to 16-bit integers, signed and unsigned
+
+// Function extend_low : sign-extends the low 16 elements (bytes) to 16 bits each
+static inline Vec16s extend_low (Vec32c const & a) {
+    __m256i const zero     = _mm256_setzero_si256();
+    __m256i const spread   = permute4q<0,-256,1,-256>(Vec4q(a));   // qword 1 (bits 64-127) moved to bits 128-191
+    __m256i const negative = _mm256_cmpgt_epi8(zero, spread);      // all-ones byte where 0 > element
+    return _mm256_unpacklo_epi8(spread, negative);                 // pair every byte with its sign byte
+}
+
+// Function extend_high : sign-extends the high 16 elements (bytes) to 16 bits each
+static inline Vec16s extend_high (Vec32c const & a) {
+    __m256i const zero     = _mm256_setzero_si256();
+    __m256i const spread   = permute4q<-256,2,-256,3>(Vec4q(a));   // qword 2 (bits 128-191) moved to bits 64-127
+    __m256i const negative = _mm256_cmpgt_epi8(zero, spread);      // all-ones byte where 0 > element
+    return _mm256_unpackhi_epi8(spread, negative);                 // pair every byte with its sign byte
+}
+
+// Function extend_low : zero-extends the low 16 elements (bytes) to 16 bits each
+static inline Vec16us extend_low (Vec32uc const & a) {
+    __m256i const zero   = _mm256_setzero_si256();
+    __m256i const spread = permute4q<0,-256,1,-256>(Vec4q(a));     // qword 1 (bits 64-127) moved to bits 128-191
+    return _mm256_unpacklo_epi8(spread, zero);                     // pair every byte with a zero byte
+}
+
+// Function extend_high : zero-extends the high 16 elements (bytes) to 16 bits each
+static inline Vec16us extend_high (Vec32uc const & a) {
+    __m256i const zero   = _mm256_setzero_si256();
+    __m256i const spread = permute4q<-256,2,-256,3>(Vec4q(a));     // qword 2 (bits 128-191) moved to bits 64-127
+    return _mm256_unpackhi_epi8(spread, zero);                     // pair every byte with a zero byte
+}
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_low : sign-extends the low 8 elements (16-bit) to 32 bits each
+static inline Vec8i extend_low (Vec16s const & a) {
+    __m256i const spread   = permute4q<0,-256,1,-256>(Vec4q(a));   // qword 1 (bits 64-127) moved to bits 128-191
+    __m256i const signword = _mm256_srai_epi16(spread, 15);        // every word filled with its sign bit
+    return _mm256_unpacklo_epi16(spread, signword);                // pair every word with its sign word
+}
+
+// Function extend_high : sign-extends the high 8 elements (16-bit) to 32 bits each
+static inline Vec8i extend_high (Vec16s const & a) {
+    __m256i const spread   = permute4q<-256,2,-256,3>(Vec4q(a));   // qword 2 (bits 128-191) moved to bits 64-127
+    __m256i const signword = _mm256_srai_epi16(spread, 15);        // every word filled with its sign bit
+    return _mm256_unpackhi_epi16(spread, signword);                // pair every word with its sign word
+}
+
+// Function extend_low : zero-extends the low 8 elements (16-bit) to 32 bits each
+static inline Vec8ui extend_low (Vec16us const & a) {
+    __m256i const zero   = _mm256_setzero_si256();
+    __m256i const spread = permute4q<0,-256,1,-256>(Vec4q(a));     // qword 1 (bits 64-127) moved to bits 128-191
+    return _mm256_unpacklo_epi16(spread, zero);                    // pair every word with a zero word
+}
+
+// Function extend_high : zero-extends the high 8 elements (16-bit) to 32 bits each
+static inline Vec8ui extend_high (Vec16us const & a) {
+    __m256i const zero   = _mm256_setzero_si256();
+    __m256i const spread = permute4q<-256,2,-256,3>(Vec4q(a));     // qword 2 (bits 128-191) moved to bits 64-127
+    return _mm256_unpackhi_epi16(spread, zero);                    // pair every word with a zero word
+}
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// Function extend_low : sign-extends the low 4 elements (32-bit) to 64 bits each
+static inline Vec4q extend_low (Vec8i const & a) {
+    __m256i const spread    = permute4q<0,-256,1,-256>(Vec4q(a));  // qword 1 (bits 64-127) moved to bits 128-191
+    __m256i const signdword = _mm256_srai_epi32(spread, 31);       // every dword filled with its sign bit
+    return _mm256_unpacklo_epi32(spread, signdword);               // pair every dword with its sign dword
+}
+
+// Function extend_high : sign-extends the high 4 elements (32-bit) to 64 bits each
+static inline Vec4q extend_high (Vec8i const & a) {
+    __m256i const spread    = permute4q<-256,2,-256,3>(Vec4q(a));  // qword 2 (bits 128-191) moved to bits 64-127
+    __m256i const signdword = _mm256_srai_epi32(spread, 31);       // every dword filled with its sign bit
+    return _mm256_unpackhi_epi32(spread, signdword);               // pair every dword with its sign dword
+}
+
+// Function extend_low : zero-extends the low 4 elements (32-bit) to 64 bits each
+static inline Vec4uq extend_low (Vec8ui const & a) {
+    __m256i const zero   = _mm256_setzero_si256();
+    __m256i const spread = permute4q<0,-256,1,-256>(Vec4q(a));     // qword 1 (bits 64-127) moved to bits 128-191
+    return _mm256_unpacklo_epi32(spread, zero);                    // pair every dword with a zero dword
+}
+
+// Function extend_high : zero-extends the high 4 elements (32-bit) to 64 bits each
+static inline Vec4uq extend_high (Vec8ui const & a) {
+    __m256i const zero   = _mm256_setzero_si256();
+    __m256i const spread = permute4q<-256,2,-256,3>(Vec4q(a));     // qword 2 (bits 128-191) moved to bits 64-127
+    return _mm256_unpackhi_epi32(spread, zero);                    // pair every dword with a zero dword
+}
+
+// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers.
+// Only the low byte of each element is kept, so overflow wraps around
+static inline Vec32c compress (Vec16s const & low, Vec16s const & high) {
+    __m256i const keep_low_byte = _mm256_set1_epi32(0x00FF00FF);
+    __m256i const lo_bytes = _mm256_and_si256(low,  keep_low_byte);   // low bytes of low
+    __m256i const hi_bytes = _mm256_and_si256(high, keep_low_byte);   // low bytes of high
+    // every word is now <= 0xFF, so the saturating pack copies it unchanged
+    __m256i const packed   = _mm256_packus_epi16(lo_bytes, hi_bytes);
+    // the pack works per 128-bit lane; swap the two middle qwords to restore element order
+    return _mm256_permute4x64_epi64(packed, 0xD8);
+}
+
+// Function compress_saturated : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Signed, with saturation
+static inline Vec32c compress_saturated (Vec16s const & low, Vec16s const & high) {
+    __m256i const packed = _mm256_packs_epi16(low, high);     // per-lane pack with signed saturation
+    // swap the two middle qwords to restore element order
+    return _mm256_permute4x64_epi64(packed, 0xD8);
+}
+
+// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers
+// Unsigned, overflow wraps around.
+// Wrap-around packing does not depend on signedness, so the signed version is reused
+static inline Vec32uc compress (Vec16us const & low, Vec16us const & high) {
+    Vec16s const low_s(low);
+    Vec16s const high_s(high);
+    Vec32c const packed = compress(low_s, high_s);
+    return Vec32uc(packed);
+}
+
+// Function compress_saturated : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Unsigned, with saturation: elements above 255 become 255.
+// No lower limit is applied because an unsigned value can never be below
+// zero (an unsigned max against 0 would be a no-op).
+static inline Vec32uc compress_saturated (Vec16us const & low, Vec16us const & high) {
+    __m256i maxval  = _mm256_set1_epi32(0x00FF00FF);          // maximum value, 0x00FF in every word
+    __m256i low1    = _mm256_min_epu16(low,maxval);           // upper limit
+    __m256i high1   = _mm256_min_epu16(high,maxval);          // upper limit
+    // this instruction saturates from signed 16 bit to unsigned 8 bit;
+    // the inputs are already within 0..255, so it copies them unchanged
+    __m256i pk      = _mm256_packus_epi16(low1,high1);
+    return            _mm256_permute4x64_epi64(pk, 0xD8);     // put in right place
+}
+
+// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers.
+// Only the low word of each element is kept, so overflow wraps around
+static inline Vec16s compress (Vec8i const & low, Vec8i const & high) {
+    __m256i const keep_low_word = _mm256_set1_epi32(0x0000FFFF);
+    __m256i const lo_words = _mm256_and_si256(low,  keep_low_word);   // low words of low
+    __m256i const hi_words = _mm256_and_si256(high, keep_low_word);   // low words of high
+    // every dword is now <= 0xFFFF, so the saturating pack copies it unchanged
+    __m256i const packed   = _mm256_packus_epi32(lo_words, hi_words);
+    // the pack works per 128-bit lane; swap the two middle qwords to restore element order
+    return _mm256_permute4x64_epi64(packed, 0xD8);
+}
+
+// Function compress_saturated : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Signed with saturation
+static inline Vec16s compress_saturated (Vec8i const & low, Vec8i const & high) {
+    __m256i const packed = _mm256_packs_epi32(low, high);     // per-lane pack with signed saturation
+    // swap the two middle qwords to restore element order
+    return _mm256_permute4x64_epi64(packed, 0xD8);
+}
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, overflow wraps around.
+// Wrap-around packing does not depend on signedness, so the signed version is reused
+static inline Vec16us compress (Vec8ui const & low, Vec8ui const & high) {
+    Vec8i const low_s(low);
+    Vec8i const high_s(high);
+    Vec16s const packed = compress(low_s, high_s);
+    return Vec16us(packed);
+}
+
+// Function compress_saturated : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, with saturation: elements above 65535 become 65535.
+// No lower limit is applied because an unsigned value can never be below
+// zero (an unsigned max against 0 would be a no-op).
+static inline Vec16us compress_saturated (Vec8ui const & low, Vec8ui const & high) {
+    __m256i maxval  = _mm256_set1_epi32(0x0000FFFF);          // maximum value
+    __m256i low1    = _mm256_min_epu32(low,maxval);           // upper limit
+    __m256i high1   = _mm256_min_epu32(high,maxval);          // upper limit
+    // this instruction saturates from signed 32 bit to unsigned 16 bit;
+    // the inputs are already within 0..65535, so it copies them unchanged
+    __m256i pk      = _mm256_packus_epi32(low1,high1);
+    return            _mm256_permute4x64_epi64(pk, 0xD8);     // put in right place
+}
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers.
+// Only the low dword of each element is kept, so overflow wraps around
+static inline Vec8i compress (Vec4q const & low, Vec4q const & high) {
+    // within each 128-bit lane, bring the two low dwords together in the lower qword
+    __m256i const lo_gathered = _mm256_shuffle_epi32(low,  0xD8);
+    __m256i const hi_gathered = _mm256_shuffle_epi32(high, 0xD8);
+    // lower qword of each lane of low, followed by lower qword of each lane of high
+    __m256i const merged      = _mm256_unpacklo_epi64(lo_gathered, hi_gathered);
+    // swap the two middle qwords to restore element order
+    return _mm256_permute4x64_epi64(merged, 0xD8);
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Signed, with saturation: each element is first clamped to the range of a
+// signed 32-bit integer and then truncated
+static inline Vec8i compress_saturated (Vec4q const & a, Vec4q const & b) {
+    Vec4q upper = constant8i<0x7FFFFFFF,0,0x7FFFFFFF,0,0x7FFFFFFF,0,0x7FFFFFFF,0>();                              // 0x000000007FFFFFFF
+    Vec4q lower = constant8i<(int)0x80000000,-1,(int)0x80000000,-1,(int)0x80000000,-1,(int)0x80000000,-1>();      // 0xFFFFFFFF80000000
+    Vec4q a_clamped = max(min(a, upper), lower);
+    Vec4q b_clamped = max(min(b, upper), lower);
+    return compress(a_clamped, b_clamped);
+}
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, overflow wraps around.
+// Wrap-around packing does not depend on signedness, so the signed version is reused
+static inline Vec8ui compress (Vec4uq const & low, Vec4uq const & high) {
+    Vec4q const low_s(low);
+    Vec4q const high_s(high);
+    Vec8i const packed = compress(low_s, high_s);
+    return Vec8ui(packed);
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, with saturation: an element whose high dword is nonzero has its
+// low dword forced to 0xFFFFFFFF before the low dwords are packed
+static inline Vec8ui compress_saturated (Vec4uq const & low, Vec4uq const & high) {
+    __m256i const zero      = _mm256_setzero_si256();
+    __m256i const all_ones  = _mm256_set1_epi32(-1);                   // FFFFFFFF
+    // per dword: all-ones where the dword is nonzero (NOT of the equal-to-zero mask)
+    __m256i const lo_nz     = _mm256_xor_si256(_mm256_cmpeq_epi32(low,  zero), all_ones);
+    __m256i const hi_nz     = _mm256_xor_si256(_mm256_cmpeq_epi32(high, zero), all_ones);
+    // move the flag of each high dword down onto the corresponding low dword
+    __m256i const lo_over   = _mm256_srli_epi64(lo_nz, 32);
+    __m256i const hi_over   = _mm256_srli_epi64(hi_nz, 32);
+    // OR-ing the flag in turns an overflowing low dword into FFFFFFFF
+    __m256i const lo_satur  = _mm256_or_si256(low,  lo_over);
+    __m256i const hi_satur  = _mm256_or_si256(high, hi_over);
+    return Vec8ui (compress(Vec4q(lo_satur), Vec4q(hi_satur)));
+}
+
+
+/*****************************************************************************
+*
+*          Integer division operators
+*
+*          Please see the file vectori128.h for explanation.
+*
+*****************************************************************************/
+
+// vector operator / : divide each element by divisor
+
+// vector of 8 32-bit signed integers divided by a precomputed divisor object.
+// The multiplier, shift count and sign are read from d.
+static inline Vec8i operator / (Vec8i const & a, Divisor_i const & d) {
+    __m256i mult      = _mm256_broadcastq_epi64(d.getm());     // broadcast multiplier
+    __m256i dsign     = _mm256_broadcastq_epi64(d.getsign());  // broadcast sign of d
+    // high dword of a*mult for the even numbered elements
+    __m256i prod_even = _mm256_mul_epi32(a,mult);              // 32x32->64 bit signed multiplication of even elements of a
+    __m256i high_even = _mm256_srli_epi64(prod_even,32);       // high dword moved down to the even position
+    // high dword of a*mult for the odd numbered elements
+    __m256i a_odd     = _mm256_srli_epi64(a,32);               // odd elements of a into position for multiplication
+    __m256i prod_odd  = _mm256_mul_epi32(a_odd,mult);          // high dword of the product is already at the odd position
+    __m256i odd_mask  = constant8i<0,-1,0,-1,0,-1,0,-1> ();    // mask for odd elements
+    __m256i mulhi     = _mm256_blendv_epi8(high_even,prod_odd,odd_mask);  // combine the two results
+    __m256i sum       = _mm256_add_epi32(mulhi,a);             // add
+    __m256i shifted   = _mm256_sra_epi32(sum,d.gets1());       // shift right arithmetic
+    __m256i asign     = _mm256_srai_epi32(a,31);               // sign of a
+    __m256i adjust    = _mm256_sub_epi32(asign,dsign);         // sign of a - sign of d
+    __m256i quot      = _mm256_sub_epi32(shifted,adjust);      // + 1 if a < 0, -1 if d < 0
+    return              _mm256_xor_si256(quot,dsign);          // change sign if divisor negative
+}
+
+// vector of 8 32-bit unsigned integers divided by a precomputed divisor object.
+// The multiplier and the two shift counts are read from d.
+static inline Vec8ui operator / (Vec8ui const & a, Divisor_ui const & d) {
+    __m256i mult      = _mm256_broadcastq_epi64(d.getm());     // broadcast multiplier
+    // high dword of a*mult for the even numbered elements
+    __m256i prod_even = _mm256_mul_epu32(a,mult);              // 32x32->64 bit unsigned multiplication of even elements of a
+    __m256i high_even = _mm256_srli_epi64(prod_even,32);       // high dword moved down to the even position
+    // high dword of a*mult for the odd numbered elements
+    __m256i a_odd     = _mm256_srli_epi64(a,32);               // odd elements of a into position for multiplication
+    __m256i prod_odd  = _mm256_mul_epu32(a_odd,mult);          // high dword of the product is already at the odd position
+    __m256i odd_mask  = constant8i<0,-1,0,-1,0,-1,0,-1> ();    // mask for odd elements
+    __m256i mulhi     = _mm256_blendv_epi8(high_even,prod_odd,odd_mask);  // combine the two results
+    __m256i diff      = _mm256_sub_epi32(a,mulhi);             // subtract
+    __m256i half      = _mm256_srl_epi32(diff,d.gets1());      // shift right logical
+    __m256i sum       = _mm256_add_epi32(mulhi,half);          // add
+    return              _mm256_srl_epi32(sum,d.gets2());       // shift right logical
+}
+
+// vector of 16 16-bit signed integers divided by a precomputed divisor object.
+// The multiplier, shift count and sign are read from d.
+static inline Vec16s operator / (Vec16s const & a, Divisor_s const & d) {
+    __m256i mult    = _mm256_broadcastq_epi64(d.getm());       // broadcast multiplier
+    __m256i dsign   = _mm256_broadcastq_epi64(d.getsign());    // broadcast sign of d
+    __m256i mulhi   = _mm256_mulhi_epi16(a, mult);             // multiply high signed words
+    __m256i sum     = _mm256_add_epi16(mulhi,a);               // + a
+    __m256i shifted = _mm256_sra_epi16(sum,d.gets1());         // shift right arithmetic
+    __m256i asign   = _mm256_srai_epi16(a,15);                 // sign of a
+    __m256i adjust  = _mm256_sub_epi16(asign,dsign);           // sign of a - sign of d
+    __m256i quot    = _mm256_sub_epi16(shifted,adjust);        // + 1 if a < 0, -1 if d < 0
+    return            _mm256_xor_si256(quot,dsign);            // change sign if divisor negative
+}
+
+// vector of 16 16-bit unsigned integers divided by a precomputed divisor object.
+// The multiplier and the two shift counts are read from d.
+static inline Vec16us operator / (Vec16us const & a, Divisor_us const & d) {
+    __m256i mult  = _mm256_broadcastq_epi64(d.getm());         // broadcast multiplier
+    __m256i mulhi = _mm256_mulhi_epu16(a, mult);               // multiply high unsigned words
+    __m256i diff  = _mm256_sub_epi16(a,mulhi);                 // subtract
+    __m256i half  = _mm256_srl_epi16(diff,d.gets1());          // shift right logical
+    __m256i sum   = _mm256_add_epi16(mulhi,half);              // add
+    return          _mm256_srl_epi16(sum,d.gets2());           // shift right logical
+}
+
+// vector of 32 8-bit signed integers divided by a precomputed divisor object.
+// There is no 8-bit division path: each half is widened to 16 bits, divided,
+// and the two quotient vectors are packed back to bytes.
+static inline Vec32c operator / (Vec32c const & a, Divisor_s const & d) {
+    Vec16s quot_lo = extend_low(a) / d;
+    Vec16s quot_hi = extend_high(a) / d;
+    return compress(quot_lo, quot_hi);
+}
+
+// vector of 32 8-bit unsigned integers divided by a precomputed divisor object.
+// There is no 8-bit division path: each half is widened into a Vec16us,
+// divided, and the two quotient vectors are packed back to bytes.
+static inline Vec32uc operator / (Vec32uc const & a, Divisor_us const & d) {
+    Vec16us quot_lo = extend_low(a) / d;
+    Vec16us quot_hi = extend_high(a) / d;
+    return compress(quot_lo, quot_hi);
+}
+
+// vector operator /= : divide a by d in place and return a reference to a
+static inline Vec8i & operator /= (Vec8i & a, Divisor_i const & d) {
+    Vec8i const quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+// vector operator /= : divide a by d in place and return a reference to a
+static inline Vec8ui & operator /= (Vec8ui & a, Divisor_ui const & d) {
+    Vec8ui const quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+// vector operator /= : divide a by d in place and return a reference to a
+static inline Vec16s & operator /= (Vec16s & a, Divisor_s const & d) {
+    Vec16s const quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+
+// vector operator /= : divide a by d in place and return a reference to a
+static inline Vec16us & operator /= (Vec16us & a, Divisor_us const & d) {
+    Vec16us const quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+// vector operator /= : divide a by d in place and return a reference to a
+static inline Vec32c & operator /= (Vec32c & a, Divisor_s const & d) {
+    Vec32c const quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+// vector operator /= : divide a by d in place and return a reference to a
+static inline Vec32uc & operator /= (Vec32uc & a, Divisor_us const & d) {
+    Vec32uc const quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+
+/*****************************************************************************
+*
+*          Integer division 2: divisor is a compile-time constant
+*
+*****************************************************************************/
+
+// Divide Vec8i by compile-time constant d. Divisors +-1, 0x80000000 and powers of 2 are handled inline; all others use a Divisor_i
+template <int32_t d>
+static inline Vec8i divide_by_i(Vec8i const & x) {
+    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
+    if (d ==  1) return  x;                                          // divide by  1
+    if (d == -1) return -x;                                          // divide by -1
+    if (uint32_t(d) == 0x80000000u) return Vec8i(x == Vec8i(0x80000000)) & 1; // prevent overflow when changing sign: quotient is 1 where x == 0x80000000, otherwise 0
+    const uint32_t d1 = d > 0 ? uint32_t(d) : -uint32_t(d);          // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
+    if ((d1 & (d1-1)) == 0) {
+        // d1 is a power of 2. use shift
+        const int k = bit_scan_reverse_const(d1);                    // k = log2(d1)
+        __m256i sign;
+        if (k > 1) sign = _mm256_srai_epi32(x, k-1); else sign = x;  // k copies of sign bit
+        __m256i bias    = _mm256_srli_epi32(sign, 32-k);             // bias = x >= 0 ? 0 : 2^k - 1 (makes the shift below round toward zero)
+        __m256i xpbias  = _mm256_add_epi32 (x, bias);                // x + bias
+        __m256i q       = _mm256_srai_epi32(xpbias, k);              // (x + bias) >> k
+        if (d > 0)      return q;                                    // d > 0: return  q
+        return _mm256_sub_epi32(_mm256_setzero_si256(), q);          // d < 0: return -q
+    }
+    // general case: build the multiplier, shift count and sign for a Divisor_i at compile time
+    const int32_t sh = bit_scan_reverse_const(uint32_t(d1)-1);       // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case)
+    const int32_t mult = int(1 + (uint64_t(1) << (32+sh)) / uint32_t(d1) - (int64_t(1) << 32));   // multiplier
+    const Divisor_i div(mult, sh, d < 0 ? -1 : 0);                   // third argument: -1 if d is negative, else 0
+    return x / div;
+}
+
+// Vec8i a / const_int(d): the divisor carried by the tag type is passed on as a template argument
+template <int32_t d> static inline Vec8i operator / (Vec8i const & a, Const_int_t<d>) {
+    Vec8i quotient = divide_by_i<d>(a);
+    return quotient;
+}
+
+// Vec8i a / const_uint(d): accepted only when d fits in a signed 32-bit integer
+template <uint32_t d>
+static inline Vec8i operator / (Vec8i const & a, Const_uint_t<d>) {
+    Static_error_check< (d <= 0x7FFFFFFFu) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
+    return divide_by_i<int32_t(d)>(a);                               // divide as signed
+}
+
+// vector operator /= : divide a by a compile-time constant, in place
+template <int32_t d>
+static inline Vec8i & operator /= (Vec8i & a, Const_int_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+// vector operator /= : divide a by a compile-time unsigned constant, in place
+template <uint32_t d>
+static inline Vec8i & operator /= (Vec8i & a, Const_uint_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+
+// Divide Vec8ui by compile-time constant d: shift for powers of 2, otherwise multiply by a rounded 2^(32+b)/d and shift
+template <uint32_t d>
+static inline Vec8ui divide_by_ui(Vec8ui const & x) {
+    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
+    if (d == 1) return x;                                            // divide by 1
+    const int b = bit_scan_reverse_const(d);                         // floor(log2(d))
+    if ((uint32_t(d) & (uint32_t(d)-1)) == 0) {
+        // d is a power of 2. use shift
+        return  _mm256_srli_epi32(x, b);                             // x >> b
+    }
+    // general case (d > 2)
+    uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d);           // multiplier = 2^(32+b) / d
+    const uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d
+    const bool round_down = (2*rem < d);                             // check if fraction is less than 0.5
+    if (!round_down) {
+        mult = mult + 1;                                             // round up mult
+    }
+    // do 32*32->64 bit unsigned multiplication and get high part of result
+    const __m256i multv = _mm256_set_epi32(0,mult,0,mult,0,mult,0,mult);// zero-extend mult and broadcast
+    __m256i t1 = _mm256_mul_epu32(x,multv);                          // 32x32->64 bit unsigned multiplication of the even elements x[0], x[2], x[4], x[6]
+    if (round_down) {
+        t1      = _mm256_add_epi64(t1,multv);                        // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+    __m256i t2 = _mm256_srli_epi64(t1,32);                           // high dword of the even results, moved to the even dword positions
+    __m256i t3 = _mm256_srli_epi64(x,32);                            // get the odd elements x[1], x[3], x[5], x[7] into position for multiplication
+    __m256i t4 = _mm256_mul_epu32(t3,multv);                         // 32x32->64 bit unsigned multiplication of the odd elements
+    if (round_down) {
+        t4      = _mm256_add_epi64(t4,multv);                        // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+    __m256i t5 = _mm256_set_epi32(-1,0,-1,0,-1,0,-1,0);              // mask selecting the odd dwords 1, 3, 5, 7
+    __m256i t7 = _mm256_blendv_epi8(t2,t4,t5);                       // blend two results: even dwords from t2, odd dwords from t4
+    Vec8ui  q  = _mm256_srli_epi32(t7, b);                           // shift right by b
+    return q;                                                        // no overflow possible
+}
+
+// Vec8ui a / const_uint(d): the divisor carried by the tag type is passed on as a template argument
+template <uint32_t d> static inline Vec8ui operator / (Vec8ui const & a, Const_uint_t<d>) {
+    Vec8ui quotient = divide_by_ui<d>(a);
+    return quotient;
+}
+
+// Vec8ui a / const_int(d): accepted only when d is not negative
+template <int32_t d>
+static inline Vec8ui operator / (Vec8ui const & a, Const_int_t<d>) {
+    Static_error_check< !(d < 0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
+    return divide_by_ui<d>(a);                                       // divide as unsigned
+}
+
+// vector operator /= : divide a by a compile-time unsigned constant, in place
+template <uint32_t d>
+static inline Vec8ui & operator /= (Vec8ui & a, Const_uint_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+// vector operator /= : divide a by a compile-time constant, in place
+template <int32_t d>
+static inline Vec8ui & operator /= (Vec8ui & a, Const_int_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+
+// Divide Vec16s by compile-time constant d (truncated to 16 bits). Divisors +-1, 0x8000 and powers of 2 are handled inline; all others use a Divisor_s
+template <int d>
+static inline Vec16s divide_by_i(Vec16s const & x) {
+    const int16_t d0 = int16_t(d);                                   // truncate d to 16 bits
+    Static_error_check<(d0 != 0)> Dividing_by_zero;                  // Error message if dividing by zero
+    if (d0 ==  1) return  x;                                         // divide by  1
+    if (d0 == -1) return -x;                                         // divide by -1
+    if (uint16_t(d0) == 0x8000u) return Vec16s(x == Vec16s(0x8000)) & 1;// prevent overflow when changing sign: quotient is 1 where x == 0x8000, otherwise 0
+    const uint16_t d1 = d0 > 0 ? d0 : -d0;                           // compile-time abs(d0)
+    if ((d1 & (d1-1)) == 0) {
+        // d is a power of 2. use shift
+        const int k = bit_scan_reverse_const(uint32_t(d1));          // k = log2(d1)
+        __m256i sign;
+        if (k > 1) sign = _mm256_srai_epi16(x, k-1); else sign = x;  // k copies of sign bit
+        __m256i bias    = _mm256_srli_epi16(sign, 16-k);             // bias = x >= 0 ? 0 : 2^k - 1 (makes the shift below round toward zero)
+        __m256i xpbias  = _mm256_add_epi16 (x, bias);                // x + bias
+        __m256i q       = _mm256_srai_epi16(xpbias, k);              // (x + bias) >> k
+        if (d0 > 0)  return q;                                       // d0 > 0: return  q
+        return _mm256_sub_epi16(_mm256_setzero_si256(), q);          // d0 < 0: return -q
+    }
+    // general case: build the multiplier, shift count and sign for a Divisor_s at compile time
+    const int L = bit_scan_reverse_const(uint16_t(d1-1)) + 1;        // ceil(log2(d)). (d < 2 handled above)
+    const int16_t mult = int16_t(1 + (1u << (15+L)) / uint32_t(d1) - 0x10000);// multiplier
+    const int shift1 = L - 1;                                        // shift count handed to the Divisor_s
+    const Divisor_s div(mult, shift1, d0 > 0 ? 0 : -1);              // third argument: -1 if d0 is negative, else 0
+    return x / div;
+}
+
+// Vec16s a / const_int(d): the divisor carried by the tag type is passed on as a template argument
+template <int d> static inline Vec16s operator / (Vec16s const & a, Const_int_t<d>) {
+    Vec16s quotient = divide_by_i<d>(a);
+    return quotient;
+}
+
+// Vec16s a / const_uint(d): accepted only when d fits in a signed 16-bit integer
+template <uint32_t d>
+static inline Vec16s operator / (Vec16s const & a, Const_uint_t<d>) {
+    Static_error_check< (d <= 0x7FFFu) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
+    return divide_by_i<int(d)>(a);                                   // divide as signed
+}
+
+// vector operator /= : divide a by a compile-time constant, in place
+template <int32_t d>
+static inline Vec16s & operator /= (Vec16s & a, Const_int_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+// vector operator /= : divide a by a compile-time unsigned constant, in place
+template <uint32_t d>
+static inline Vec16s & operator /= (Vec16s & a, Const_uint_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+
+// Divide Vec16us by compile-time constant d (truncated to 16 bits): shift for powers of 2, otherwise multiply by a rounded 2^(16+b)/d and shift
+template <uint32_t d>
+static inline Vec16us divide_by_ui(Vec16us const & x) {
+    const uint16_t d0 = uint16_t(d);                                 // truncate d to 16 bits
+    Static_error_check<(d0 != 0)> Dividing_by_zero;                  // Error message if dividing by zero
+    if (d0 == 1) return x;                                           // divide by 1
+    const int b = bit_scan_reverse_const(d0);                        // floor(log2(d))
+    if ((d0 & (d0-1)) == 0) {
+        // d is a power of 2. use shift
+        return  _mm256_srli_epi16(x, b);                             // x >> b
+    }
+    // general case (d > 2)
+    uint16_t mult = uint16_t((uint32_t(1) << (b+16)) / d0);          // multiplier = 2^(16+b) / d
+    const uint32_t rem = (uint32_t(1) << (b+16)) - uint32_t(d0)*mult;// remainder 2^(16+b) % d
+    const bool round_down = (2*rem < d0);                            // check if fraction is less than 0.5
+    Vec16us x1 = x;
+    if (round_down) {
+        x1 = x1 + 1;                                                 // round down mult and compensate by adding 1 to x
+    }
+    else {
+        mult = mult + 1;                                             // round up mult. no compensation needed
+    }
+    const __m256i multv = _mm256_set1_epi16(mult);                   // broadcast mult
+    __m256i xm = _mm256_mulhi_epu16(x1, multv);                      // high part of 16x16->32 bit unsigned multiplication
+    Vec16us q    = _mm256_srli_epi16(xm, b);                         // shift right by b
+    if (round_down) {
+        Vec16sb overfl = (x1 == Vec16us(_mm256_setzero_si256()));     // check for overflow of x+1 (x1 wrapped around to 0)
+        return select(overfl, Vec16us(mult >> b), q);                // deal with overflow (rarely needed): (0x10000*mult) >> (16+b) = mult >> b
+    }
+    else {
+        return q;                                                    // no overflow possible
+    }
+}
+
+// Vec16us a / const_uint(d): the divisor carried by the tag type is passed on as a template argument
+template <uint32_t d> static inline Vec16us operator / (Vec16us const & a, Const_uint_t<d>) {
+    Vec16us quotient = divide_by_ui<d>(a);
+    return quotient;
+}
+
+// Vec16us a / const_int(d): accepted only when d is not negative
+template <int d>
+static inline Vec16us operator / (Vec16us const & a, Const_int_t<d>) {
+    Static_error_check< !(d < 0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
+    return divide_by_ui<d>(a);                                       // divide as unsigned
+}
+
+// vector operator /= : divide a by a compile-time unsigned constant, in place
+template <uint32_t d>
+static inline Vec16us & operator /= (Vec16us & a, Const_uint_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+// vector operator /= : divide a by a compile-time constant, in place
+template <int32_t d>
+static inline Vec16us & operator /= (Vec16us & a, Const_int_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+
+// Vec32c a / const_int(d)
+template <int d>
+static inline Vec32c operator / (Vec32c const & a, Const_int_t<d>) {
+    // divide each half as a Vec16s, then pack the two quotients back into one Vec32c
+    Vec16s q_lo = extend_low(a)  / Const_int_t<d>();
+    Vec16s q_hi = extend_high(a) / Const_int_t<d>();
+    return compress(q_lo, q_hi);
+}
+
+// Vec32c a / const_uint(d): accepted only when d, truncated to 8 bits, is below 0x80
+template <uint32_t d>
+static inline Vec32c operator / (Vec32c const & a, Const_uint_t<d>) {
+    Static_error_check< (uint8_t(d) <= 0x7Fu) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
+    return a / Const_int_t<d>();                                     // divide as signed
+}
+
+// vector operator /= : divide a by a compile-time constant, in place
+template <int32_t d>
+static inline Vec32c & operator /= (Vec32c & a, Const_int_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+// vector operator /= : divide a by a compile-time unsigned constant, in place
+template <uint32_t d>
+static inline Vec32c & operator /= (Vec32c & a, Const_uint_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+// Vec32uc a / const_uint(d)
+template <uint32_t d>
+static inline Vec32uc operator / (Vec32uc const & a, Const_uint_t<d>) {
+    // divide each half as a Vec16us, then pack the two quotients back into one Vec32uc
+    Vec16us q_lo = extend_low(a)  / Const_uint_t<d>();
+    Vec16us q_hi = extend_high(a) / Const_uint_t<d>();
+    return compress(q_lo, q_hi);
+}
+
+// Vec32uc a / const_int(d): accepted only when d, truncated to 8 bits, is not negative
+template <int d>
+static inline Vec32uc operator / (Vec32uc const & a, Const_int_t<d>) {
+    Static_error_check< !(int8_t(d) < 0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
+    return a / Const_uint_t<d>();                                    // divide as unsigned
+}
+
+// vector operator /= : divide a by a compile-time unsigned constant, in place
+template <uint32_t d>
+static inline Vec32uc & operator /= (Vec32uc & a, Const_uint_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+// vector operator /= : divide a by a compile-time constant, in place
+template <int32_t d>
+static inline Vec32uc & operator /= (Vec32uc & a, Const_int_t<d> b) {
+    // store the quotient in a and return a reference to it
+    return a = a / b;
+}
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Return the index of the first element that is true, or -1 if every element is false
+static inline int horizontal_find_first(Vec32cb const & x) {
+    // collect the top bit of each byte element into a 32-bit mask
+    const uint32_t mask = _mm256_movemask_epi8(x);
+    // an empty mask means no element is true; otherwise scan forward for the first set bit
+    return mask != 0 ? int32_t(bit_scan_forward(mask)) : -1;
+}
+
+static inline int horizontal_find_first(Vec16sb const & x) {       // index of first true 16-bit element, -1 if none
+    return horizontal_find_first(Vec32cb(x)) >> 1;                 // byte index / 2; relies on -1 >> 1 staying -1 (arithmetic shift)
+}
+
+static inline int horizontal_find_first(Vec8ib const & x) {        // index of first true 32-bit element, -1 if none
+    return horizontal_find_first(Vec32cb(x)) >> 2;                 // byte index / 4; relies on -1 >> 2 staying -1 (arithmetic shift)
+}
+
+static inline int horizontal_find_first(Vec4qb const & x) {        // index of first true 64-bit element, -1 if none
+    return horizontal_find_first(Vec32cb(x)) >> 3;                 // byte index / 8; relies on -1 >> 3 staying -1 (arithmetic shift)
+}
+
+// Return how many of the 32 elements are true
+static inline uint32_t horizontal_count(Vec32cb const & x) {
+    // one mask bit per byte element; count the set bits of that mask
+    return vml_popcnt(uint32_t(_mm256_movemask_epi8(x)));
+}
+
+static inline uint32_t horizontal_count(Vec16sb const & x) {
+    return horizontal_count(Vec32cb(x)) / 2u;                      // byte count -> count of 16-bit elements
+}
+
+static inline uint32_t horizontal_count(Vec8ib const & x) {
+    return horizontal_count(Vec32cb(x)) / 4u;                      // byte count -> count of 32-bit elements
+}
+
+static inline uint32_t horizontal_count(Vec4qb const & x) {
+    return horizontal_count(Vec32cb(x)) / 8u;                      // byte count -> count of 64-bit elements
+}
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: pack the 32 boolean elements into a 32-bit integer, one bit per element
+static inline uint32_t to_bits(Vec32cb const & x) {
+    return static_cast<uint32_t>(_mm256_movemask_epi8(x));
+}
+
+// to_Vec32cb: convert integer bitfield to boolean vector
+static inline Vec32cb to_Vec32cb(uint32_t x) {
+    return Vec32cb(Vec32c(to_Vec16cb(uint16_t(x)), to_Vec16cb(uint16_t(x>>16)))); // low 16 bits -> first Vec32c argument, high 16 bits -> second
+}
+
+// to_bits: pack the 16 boolean elements into a 16-bit integer, one bit per element
+static inline uint16_t to_bits(Vec16sb const & x) {
+    __m128i bytes = _mm_packs_epi16(x.get_low(), x.get_high());  // narrow the 16-bit words to bytes
+    return static_cast<uint16_t>(_mm_movemask_epi8(bytes));
+}
+
+// to_Vec16sb: convert integer bitfield to boolean vector
+static inline Vec16sb to_Vec16sb(uint16_t x) {
+    return Vec16sb(Vec16s(to_Vec8sb(uint8_t(x)), to_Vec8sb(uint8_t(x>>8)))); // low 8 bits -> first Vec16s argument, high 8 bits -> second
+}
+
+#if INSTRSET < 9 || MAX_VECTOR_SIZE < 512
+// These functions are defined in Vectori512.h if AVX512 instruction set is used
+
+// to_bits: pack the 8 boolean elements into an 8-bit integer, one bit per element
+static inline uint8_t to_bits(Vec8ib const & x) {
+    __m128i words = _mm_packs_epi32(x.get_low(), x.get_high());  // narrow the 32-bit dwords to 16-bit words
+    __m128i bytes = _mm_packs_epi16(words, words);               // narrow the 16-bit words to bytes
+    return static_cast<uint8_t>(_mm_movemask_epi8(bytes));
+}
+
+// to_Vec8ib: convert integer bitfield to boolean vector
+static inline Vec8ib to_Vec8ib(uint8_t x) {
+    return Vec8ib(Vec8i(to_Vec4ib(x), to_Vec4ib(x>>4)));  // x -> first Vec8i argument, x >> 4 (upper four bits) -> second
+}
+
+// to_bits: pack the 4 boolean elements into the low 4 bits of an integer, one bit per element
+static inline uint8_t to_bits(Vec4qb const & x) {
+    const uint32_t m = _mm256_movemask_epi8(x);           // mask bits 0, 8, 16, 24 belong to the lowest byte of each 64-bit element
+    return (m & 1) | ((m >> 7) & 2) | ((m >> 14) & 4) | ((m >> 21) & 8);
+}
+
+// to_Vec4qb: convert integer bitfield to boolean vector
+static inline Vec4qb to_Vec4qb(uint8_t x) {
+    return  Vec4qb(Vec4q(-(x&1), -((x>>1)&1), -((x>>2)&1), -((x>>3)&1)));  // each of bits 0..3 becomes 0 or -1 (all bits set)
+}
+
+#else  // function prototypes here only
+
+// to_bits: convert boolean vector to integer bitfield (prototype only; defined in Vectori512.h)
+static inline uint8_t to_bits(Vec8ib x);
+
+// to_Vec8ib: convert integer bitfield to boolean vector (prototype only; defined in Vectori512.h)
+static inline Vec8ib to_Vec8ib(uint8_t x);
+
+// to_bits: convert boolean vector to integer bitfield (prototype only; defined in Vectori512.h)
+static inline uint8_t to_bits(Vec4qb x);
+
+// to_Vec4qb: convert integer bitfield to boolean vector (prototype only; defined in Vectori512.h)
+static inline Vec4qb to_Vec4qb(uint8_t x);
+
+#endif  // INSTRSET < 9 || MAX_VECTOR_SIZE < 512
+
+
+#endif // VECTORI256_H
diff --git a/vectorclass/vectori256e.h b/vectorclass/vectori256e.h
new file mode 100755
index 0000000..8f3c2b5
--- /dev/null
+++ b/vectorclass/vectori256e.h
@@ -0,0 +1,4332 @@
+/****************************  vectori256e.h   *******************************
+* Author:        Agner Fog
+* Date created:  2012-05-30
+* Last modified: 2014-10-16
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining 256-bit integer vector classes as interface
+* to intrinsic functions. Emulated for processors without AVX2 instruction set.
+*
+* The following vector classes are defined here:
+* Vec256b   Vector of 256  1-bit unsigned  integers or Booleans
+* Vec32c    Vector of  32  8-bit signed    integers
+* Vec32uc   Vector of  32  8-bit unsigned  integers
+* Vec32cb   Vector of  32  Booleans for use with Vec32c and Vec32uc
+* Vec16s    Vector of  16  16-bit signed   integers
+* Vec16us   Vector of  16  16-bit unsigned integers
+* Vec16sb   Vector of  16  Booleans for use with Vec16s and Vec16us
+* Vec8i     Vector of   8  32-bit signed   integers
+* Vec8ui    Vector of   8  32-bit unsigned integers
+* Vec8ib    Vector of   8  Booleans for use with Vec8i and Vec8ui
+* Vec4q     Vector of   4  64-bit signed   integers
+* Vec4uq    Vector of   4  64-bit unsigned integers
+* Vec4qb    Vector of   4  Booleans for use with Vec4q and Vec4uq
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2012 - 2014 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+
+// check combination of header files
+#if defined (VECTORI256_H)
+#if    VECTORI256_H != 1
+#error Two different versions of vectori256.h included
+#endif
+#else
+#define VECTORI256_H  1
+
+#ifdef VECTORF256_H
+#error Please put header file vectori256.h or vectori256e.h before vectorf256e.h
+#endif
+
+
+#include "vectori128.h"
+
+
+/*****************************************************************************
+*
+*          base class Vec256ie
+*
+*****************************************************************************/
+// Base class standing in for __m256i when AVX2 is not supported: a pair of 128-bit halves
+class Vec256ie {
+protected:
+    __m128i y0;                         // lower 128 bits
+    __m128i y1;                         // upper 128 bits
+public:
+    Vec256ie() {}                       // default constructor, leaves both halves uninitialized
+    Vec256ie(__m128i x0, __m128i x1)    // build from a low half x0 and a high half x1
+        : y0(x0), y1(x1) {
+    }
+    __m128i get_low() const {
+        return y0;                      // lower 128 bits
+    }
+    __m128i get_high() const {
+        return y1;                      // upper 128 bits
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vector of 256 1-bit unsigned integers or Booleans
+*
+*****************************************************************************/
+
+class Vec256b : public Vec256ie {
+public:
+    // Default constructor (both halves are left uninitialized):
+    Vec256b() {
+    }
+    // A constructor broadcasting the same value into all elements was
+    // removed because of undesired implicit conversions:
+    //Vec256b(int i) {
+    //    y1 = y0 = _mm_set1_epi32(-(i & 1));}
+
+    // Constructor to build from two Vec128b (a0 = low half, a1 = high half):
+    Vec256b(Vec128b const & a0, Vec128b const & a1)
+        : Vec256ie(a0, a1) {
+    }
+    // Constructor to convert from type Vec256ie
+    Vec256b(Vec256ie const & x)
+        : Vec256ie(x.get_low(), x.get_high()) {
+    }
+    // Assignment operator to convert from type Vec256ie
+    Vec256b & operator = (Vec256ie const & x) {
+        Vec256ie::operator = (x);
+        return *this;
+    }
+    // Member function to load 32 bytes from array (unaligned)
+    Vec256b & load(void const * p) {
+        y0 = _mm_loadu_si128(static_cast<__m128i const*>(p));
+        y1 = _mm_loadu_si128(static_cast<__m128i const*>(p) + 1);
+        return *this;
+    }
+    // Member function to load 32 bytes from array, aligned by 32
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 32, but there is hardly any speed advantage of load_a on modern processors
+    Vec256b & load_a(void const * p) {
+        y0 = _mm_load_si128(static_cast<__m128i const*>(p));
+        y1 = _mm_load_si128(static_cast<__m128i const*>(p) + 1);
+        return *this;
+    }
+    // Member function to store 32 bytes into array (unaligned)
+    void store(void * p) const {
+        _mm_storeu_si128(static_cast<__m128i*>(p),     y0);
+        _mm_storeu_si128(static_cast<__m128i*>(p) + 1, y1);
+    }
+    // Member function to store 32 bytes into array, aligned by 32
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 32, but there is hardly any speed advantage of store_a on modern processors
+    void store_a(void * p) const {
+        _mm_store_si128(static_cast<__m128i*>(p),     y0);
+        _mm_store_si128(static_cast<__m128i*>(p) + 1, y1);
+    }
+    // Member function to change a single bit: indexes below 128 address the low half
+    // Note: This function is inefficient. Use load function if changing more than one bit
+    Vec256b const & set_bit(uint32_t index, int value) {
+        if (index >= 128) {
+            y1 = Vec128b(y1).set_bit(index-128, value);
+        }
+        else {
+            y0 = Vec128b(y0).set_bit(index, value);
+        }
+        return *this;
+    }
+    // Member function to get a single bit: indexes below 128 address the low half
+    // Note: This function is inefficient. Use store function if reading more than one bit
+    int get_bit(uint32_t index) const {
+        if (index >= 128) {
+            return Vec128b(y1).get_bit(index-128);
+        }
+        else {
+            return Vec128b(y0).get_bit(index);
+        }
+    }
+    // Extract a single bit as bool. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return 0 != get_bit(index);
+    }
+    // Member functions to split into two Vec128b:
+    Vec128b get_low() const {
+        return Vec128b(y0);
+    }
+    Vec128b get_high() const {
+        return Vec128b(y1);
+    }
+    static int size () {
+        return 256;                     // number of 1-bit elements
+    }
+};
+
+// Define operators for this class
+
+// vector operator & : bitwise and
+static inline Vec256b operator & (Vec256b const & a, Vec256b const & b) {
+    return Vec256b(a.get_low() & b.get_low(), a.get_high() & b.get_high()); // combine the low halves and the high halves separately
+}
+static inline Vec256b operator && (Vec256b const & a, Vec256b const & b) {
+    return a & b;                       // same as operator &; as an overloaded operator it evaluates both operands (no short-circuit)
+}
+
+// vector operator | : bitwise or
+static inline Vec256b operator | (Vec256b const & a, Vec256b const & b) {
+    return Vec256b(a.get_low() | b.get_low(), a.get_high() | b.get_high()); // combine the low halves and the high halves separately
+}
+static inline Vec256b operator || (Vec256b const & a, Vec256b const & b) {
+    return a | b;                       // same as operator |; as an overloaded operator it evaluates both operands (no short-circuit)
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec256b operator ^ (Vec256b const & a, Vec256b const & b) {
+    return Vec256b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); // combine the low halves and the high halves separately
+}
+
+// vector operator ~ : bitwise not
+static inline Vec256b operator ~ (Vec256b const & a) {
+    return Vec256b(~a.get_low(), ~a.get_high());   // invert the low half and the high half separately
+}
+
+// vector operator &= : bitwise and, in place
+static inline Vec256b & operator &= (Vec256b & a, Vec256b const & b) {
+    // store a & b in a and return a reference to it
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, in place
+static inline Vec256b & operator |= (Vec256b & a, Vec256b const & b) {
+    // store a | b in a and return a reference to it
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, in place
+static inline Vec256b & operator ^= (Vec256b & a, Vec256b const & b) {
+    // store a ^ b in a and return a reference to it
+    return a = a ^ b;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b
+static inline Vec256b andnot (Vec256b const & a, Vec256b const & b) {
+    return Vec256b(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high())); // 128-bit andnot applied to each half
+}
+
+
+/*****************************************************************************
+*
+*          Generate compile-time constant vector
+*
+*****************************************************************************/
+// Build a constant vector from 8 compile-time integers kept in static memory.
+// The result can be converted to any integer vector type
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec256ie constant8i() {
+    static const union {
+        int32_t elem[8];                // the eight constants, i0 first
+        __m128i half[2];                // the same 32 bytes viewed as two 128-bit halves
+    } data = {{i0,i1,i2,i3,i4,i5,i6,i7}};
+    return Vec256ie(data.half[0], data.half[1]);
+}
+
+
+/*****************************************************************************
+*
+*          selectb function
+*
+*****************************************************************************/
+// Select between two sources, byte by byte. Used in various functions and operators.
+// Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or 0xFF (true); only bit 7 of each byte is checked.
+static inline Vec256ie selectb (Vec256ie const & s, Vec256ie const & a, Vec256ie const & b) {
+    __m128i lo = selectb(s.get_low(),  a.get_low(),  b.get_low());   // 128-bit select on the low halves
+    __m128i hi = selectb(s.get_high(), a.get_high(), b.get_high());  // 128-bit select on the high halves
+    return Vec256ie(lo, hi);
+}
+
+
+
+/*****************************************************************************
+*
+*          Horizontal Boolean functions
+*
+*****************************************************************************/
+
+// horizontal_and. Returns true if all bits are 1
+static inline bool horizontal_and (Vec256b const & a) {
+    return horizontal_and(a.get_low() & a.get_high());   // all 256 bits are 1 exactly when all bits of (low & high) are 1
+}
+
+// horizontal_or. Returns true if at least one bit is 1
+static inline bool horizontal_or (Vec256b const & a) {
+    return horizontal_or(a.get_low() | a.get_high());    // some bit is 1 exactly when some bit of (low | high) is 1
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 32 8-bit signed integers
+*
+*****************************************************************************/
+
+class Vec32c : public Vec256b {
+public:
+    // Default constructor (elements are left uninitialized):
+    Vec32c() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec32c(int i) {
+        y0 = y1 = _mm_set1_epi8(static_cast<char>(i));
+    }
+    // Constructor to build from all elements (i0..i15 = low half, i16..i31 = high half):
+    Vec32c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7,
+        int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15,
+        int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, int8_t i23,
+        int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, int8_t i31) {
+        y0 = _mm_setr_epi8(i0,  i1,  i2,  i3,  i4,  i5,  i6,  i7,  i8,  i9,  i10, i11, i12, i13, i14, i15);
+        y1 = _mm_setr_epi8(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31);
+    }
+    // Constructor to build from two Vec16c (a0 = low half, a1 = high half):
+    Vec32c(Vec16c const & a0, Vec16c const & a1) {
+        y1 = a1;  y0 = a0;
+    }
+    // Constructor to convert from type Vec256ie
+    Vec32c(Vec256ie const & x)
+        : Vec256b(x) {
+        // both halves are copied by the Vec256b converting constructor
+    }
+    // Assignment operator to convert from type Vec256ie
+    Vec32c & operator = (Vec256ie const & x) {
+        // reuse the Vec256b assignment, which copies both halves
+        Vec256b::operator = (x);
+        return *this;
+    }
+    // Member function to load from array (unaligned)
+    Vec32c & load(void const * p) {
+        // same 32-byte unaligned load as the base class
+        Vec256b::load(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    Vec32c & load_a(void const * p) {
+        // same 32-byte aligned load as the base class
+        Vec256b::load_a(p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    Vec32c & load_partial(int n, void const * p) {
+        if (n >= 32) {
+            load(p);
+        }
+        else if (n > 16) {
+            *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n-16, (char*)p+16));
+        }
+        else if (n > 0) {
+            *this = Vec32c(Vec16c().load_partial(n, p), 0);
+        }
+        else {
+            *this = 0;
+        }
+        return *this;
+    }
+    // Partial store. Store n elements
+    void store_partial(int n, void * p) const {
+        // n <= 0 stores nothing
+        if (n >= 32) {
+            // the whole vector
+            store(p);
+        }
+        else if (n > 16) {
+            // the full low half plus part of the high half
+            get_low().store(p);
+            get_high().store_partial(n-16, (char*)p+16);
+        }
+        else if (n > 0) {
+            get_low().store_partial(n, p);
+        }
+    }
+    // cut off vector to n elements. The last 32-n elements are set to zero
+    Vec32c & cutoff(int n) {
+        if (uint32_t(n) >= 32) return *this;      // n outside 0..31: vector is left unchanged
+        static const union {
+            int32_t dwords[16];
+            char    bytes[64];
+        } ones_then_zeros = {{-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0}};
+        *this &= Vec32c().load(ones_then_zeros.bytes+32-n);  // 32-byte window: n bytes of ones, then zeros
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec32c const & insert(uint32_t index, int8_t value) {
+        if (index >= 16) {
+            y1 = Vec16c(y1).insert(index-16, value);
+        }
+        else {
+            y0 = Vec16c(y0).insert(index, value);
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector
+    int8_t extract(uint32_t index) const {
+        if (index >= 16) {
+            return Vec16c(y1).extract(index-16);
+        }
+        else {
+            return Vec16c(y0).extract(index);
+        }
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int8_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec16c:
+    Vec16c get_low() const {
+        return y0;                      // elements 0..15
+    }
+    Vec16c get_high() const {
+        return y1;                      // elements 16..31
+    }
+    static int size () {
+        return 32;                      // number of elements
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec32cb: Vector of 32 Booleans for use with Vec32c and Vec32uc
+*
+*****************************************************************************/
+
+class Vec32cb : public Vec32c {
+public:
+    // Default constructor (elements are left uninitialized):
+    Vec32cb(){}
+    // Constructor to build from all elements; true is stored as -1 (all bits set), false as 0:
+    Vec32cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
+        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15,
+        bool x16, bool x17, bool x18, bool x19, bool x20, bool x21, bool x22, bool x23,
+        bool x24, bool x25, bool x26, bool x27, bool x28, bool x29, bool x30, bool x31) :
+        Vec32c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7),
+            -int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15),
+            -int8_t(x16), -int8_t(x17), -int8_t(x18), -int8_t(x19), -int8_t(x20), -int8_t(x21), -int8_t(x22), -int8_t(x23),
+            -int8_t(x24), -int8_t(x25), -int8_t(x26), -int8_t(x27), -int8_t(x28), -int8_t(x29), -int8_t(x30), -int8_t(x31))
+        {}
+    // Constructor to convert from type Vec256ie
+    Vec32cb(Vec256ie const & x)
+        : Vec32c(x) {
+        // both halves are copied by the Vec32c converting constructor
+    }
+    // Assignment operator to convert from type Vec256ie
+    Vec32cb & operator = (Vec256ie const & x) {
+        // reuse the Vec32c assignment, which copies both halves
+        Vec32c::operator = (x);
+        return *this;
+    }
+    // Constructor to broadcast scalar value:
+    Vec32cb(bool b) : Vec32c(-int8_t(b)) {
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec32cb & operator = (bool b) {
+        // build the broadcast vector, copy it into this object and return a reference to it
+        return *this = Vec32cb(b);
+    }
+private: // Prevent constructing from int, etc.
+    Vec32cb(int b);
+    Vec32cb & operator = (int x);
+public:
+    // Member functions to split into two Vec16cb:
+    Vec16cb get_low() const {
+        return y0;                      // elements 0..15
+    }
+    Vec16cb get_high() const {
+        return y1;                      // elements 16..31
+    }
+    Vec32cb & insert (int index, bool a) {
+        Vec32c::insert(index, a ? -1 : 0);  // true is stored as -1, false as 0
+        return *this;
+    }
+    // Member function extract a single element from vector
+    bool extract(uint32_t index) const {
+        return 0 != Vec32c::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec32cb
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec32cb operator & (Vec32cb const & a, Vec32cb const & b) {
+    return Vec32cb(Vec256b(a) & Vec256b(b));
+}
+static inline Vec32cb operator && (Vec32cb const & a, Vec32cb const & b) {
+    return a & b;  // element-wise AND of two boolean vectors; unlike built-in &&, both operands are always evaluated
+}
+// vector operator &= : bitwise and
+static inline Vec32cb & operator &= (Vec32cb & a, Vec32cb const & b) {
+    a = a & b;
+    return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec32cb operator | (Vec32cb const & a, Vec32cb const & b) {
+    return Vec32cb(Vec256b(a) | Vec256b(b));
+}
+static inline Vec32cb operator || (Vec32cb const & a, Vec32cb const & b) {
+    return a | b;  // element-wise OR of two boolean vectors; unlike built-in ||, both operands are always evaluated
+}
+// vector operator |= : bitwise or
+static inline Vec32cb & operator |= (Vec32cb & a, Vec32cb const & b) {
+    a = a | b;
+    return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec32cb operator ^ (Vec32cb const & a, Vec32cb const & b) {
+    return Vec32cb(Vec256b(a) ^ Vec256b(b));
+}
+// vector operator ^= : bitwise xor
+static inline Vec32cb & operator ^= (Vec32cb & a, Vec32cb const & b) {
+    a = a ^ b;
+    return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32cb operator ~ (Vec32cb const & a) {
+    return Vec32cb( ~ Vec256b(a));
+}
+
+// vector operator ! : element not
+static inline Vec32cb operator ! (Vec32cb const & a) {
+    return ~ a;
+}
+
+// vector function andnot
+static inline Vec32cb andnot (Vec32cb const & a, Vec32cb const & b) {
+    return Vec32cb(andnot(Vec256b(a), Vec256b(b)));
+}
+
+
+/*****************************************************************************
+*
+*          Operators for Vec32c
+*
+*****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec32c operator + (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() + b.get_low(), a.get_high() + b.get_high());
+}
+
+// vector operator += : add
+static inline Vec32c & operator += (Vec32c & a, Vec32c const & b) {
+    a = a + b;
+    return a;
+}
+
+// postfix operator ++
+static inline Vec32c operator ++ (Vec32c & a, int) {
+    Vec32c a0 = a;
+    a = a + 1;
+    return a0;
+}
+
+// prefix operator ++
+static inline Vec32c & operator ++ (Vec32c & a) {
+    a = a + 1;
+    return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec32c operator - (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() - b.get_low(), a.get_high() - b.get_high());
+}
+
+// vector operator - : unary minus
+static inline Vec32c operator - (Vec32c const & a) {
+    return Vec32c(-a.get_low(), -a.get_high());
+}
+
+// vector operator -= : subtract
+static inline Vec32c & operator -= (Vec32c & a, Vec32c const & b) {
+    a = a - b;
+    return a;
+}
+
+// postfix operator --
+static inline Vec32c operator -- (Vec32c & a, int) {
+    Vec32c a0 = a;
+    a = a - 1;
+    return a0;
+}
+
+// prefix operator --
+static inline Vec32c & operator -- (Vec32c & a) {
+    a = a - 1;
+    return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec32c operator * (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() * b.get_low(), a.get_high() * b.get_high());
+}
+
+// vector operator *= : multiply
+static inline Vec32c & operator *= (Vec32c & a, Vec32c const & b) {
+    a = a * b;
+    return a;
+}
+
+// vector operator / : divide all elements by same integer
+static inline Vec32c operator / (Vec32c const & a, Divisor_s const & d) {
+    return Vec32c(a.get_low() / d, a.get_high() / d);
+}
+
+// vector operator /= : divide
+static inline Vec32c & operator /= (Vec32c & a, Divisor_s const & d) {
+    a = a / d;
+    return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec32c operator << (Vec32c const & a, int b) {
+    return Vec32c(a.get_low() << b, a.get_high() << b);
+}
+
+// vector operator <<= : shift left
+static inline Vec32c & operator <<= (Vec32c & a, int b) {
+    a = a << b;
+    return a;
+}
+
+// vector operator >> : shift right arithmetic all elements
+static inline Vec32c operator >> (Vec32c const & a, int b) {
+    return Vec32c(a.get_low() >> b, a.get_high() >> b);
+}
+
+// vector operator >>= : shift right arithmetic
+static inline Vec32c & operator >>= (Vec32c & a, int b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec32cb operator == (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec32cb operator != (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b (signed)
+static inline Vec32cb operator > (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b (signed)
+static inline Vec32cb operator < (Vec32c const & a, Vec32c const & b) {
+    return b > a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec32cb operator >= (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+static inline Vec32cb operator <= (Vec32c const & a, Vec32c const & b) {
+    return b >= a;
+}
+
+// vector operator & : bitwise and
+static inline Vec32c operator & (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec32c operator && (Vec32c const & a, Vec32c const & b) {
+    return a & b;  // NOTE: plain bitwise AND, not a per-element truth test (1 && 2 yields 0), and no short-circuit
+}
+// vector operator &= : bitwise and
+static inline Vec32c & operator &= (Vec32c & a, Vec32c const & b) {
+    a = a & b;
+    return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec32c operator | (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec32c operator || (Vec32c const & a, Vec32c const & b) {
+    return a | b;  // NOTE: plain bitwise OR (1 || 2 yields 3, not a true/-1 mask), and no short-circuit
+}
+// vector operator |= : bitwise or
+static inline Vec32c & operator |= (Vec32c & a, Vec32c const & b) {
+    a = a | b;
+    return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec32c operator ^ (Vec32c const & a, Vec32c const & b) {
+    return Vec32c(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+// vector operator ^= : bitwise xor
+static inline Vec32c & operator ^= (Vec32c & a, Vec32c const & b) {
+    a = a ^ b;
+    return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32c operator ~ (Vec32c const & a) {
+    return Vec32c(~a.get_low(), ~a.get_high());
+}
+
+// vector operator ! : logical not, returns true for elements == 0
+static inline Vec32cb operator ! (Vec32c const & a) {
+    return Vec32c(!a.get_low(), !a.get_high());
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+static inline Vec32c select (Vec32cb const & s, Vec32c const & a, Vec32c const & b) {
+    return selectb(s,a,b);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec32c if_add (Vec32cb const & f, Vec32c const & a, Vec32c const & b) {
+    return a + (Vec32c(f) & b);
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+static inline uint32_t horizontal_add (Vec32c const & a) {
+    return horizontal_add(a.get_low() + a.get_high());
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is sign-extended before addition to avoid overflow
+static inline int32_t horizontal_add_x (Vec32c const & a) {
+    return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high());
+}
+
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec32c add_saturated(Vec32c const & a, Vec32c const & b) {
+    return Vec32c(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high()));
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec32c sub_saturated(Vec32c const & a, Vec32c const & b) {
+    return Vec32c(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high()));
+}
+
+// function max: a > b ? a : b
+static inline Vec32c max(Vec32c const & a, Vec32c const & b) {
+    return Vec32c(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec32c min(Vec32c const & a, Vec32c const & b) {
+    return Vec32c(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
+}
+
+// function abs: a >= 0 ? a : -a
+static inline Vec32c abs(Vec32c const & a) {
+    return Vec32c(abs(a.get_low()), abs(a.get_high()));
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec32c abs_saturated(Vec32c const & a) {
+    return Vec32c(abs_saturated(a.get_low()), abs_saturated(a.get_high()));
+}
+
+// function rotate_left all elements
+// Use negative count to rotate right
+static inline Vec32c rotate_left(Vec32c const & a, int b) {
+    return Vec32c(rotate_left(a.get_low(),b), rotate_left(a.get_high(),b));
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 32 8-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec32uc : public Vec32c {
+public:
+    // Default constructor:
+    Vec32uc(){
+    };
+    // Constructor to broadcast the same value into all elements (i is truncated to 8 bits by the cast to char):
+    Vec32uc(uint32_t i) {
+        y1 = y0 = _mm_set1_epi8((char)i);
+    };
+    // Constructor to build from all elements (i0..i15 go to the low half y0, i16..i31 to the high half y1):
+    Vec32uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7,
+        uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15,        
+        uint8_t i16, uint8_t i17, uint8_t i18, uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23,
+        uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27, uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31) {
+        y0 = _mm_setr_epi8(i0,  i1,  i2,  i3,  i4,  i5,  i6,  i7,  i8,  i9,  i10, i11, i12, i13, i14, i15);
+        y1 = _mm_setr_epi8(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31);
+    };
+    // Constructor to build from two Vec16uc (a0 becomes the low half, a1 the high half):
+    Vec32uc(Vec16uc const & a0, Vec16uc const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256ie (copies the low and high halves unchanged)
+    Vec32uc(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    };
+    // Assignment operator to convert from type Vec256ie (copies the low and high halves unchanged)
+    Vec32uc & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    };
+    // Member function to load from array (unaligned); reads two consecutive __m128i blocks starting at p
+    Vec32uc & load(void const * p) {
+        y0 = _mm_loadu_si128((__m128i const*)p);
+        y1 = _mm_loadu_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32 (performed here as two aligned __m128i loads)
+    Vec32uc & load_a(void const * p) {
+        y0 = _mm_load_si128((__m128i const*)p);
+        y1 = _mm_load_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to change a single element in vector (forwards to Vec32c::insert)
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec32uc const & insert(uint32_t index, uint8_t value) {
+        Vec32c::insert(index, value);
+        return *this;
+    }
+    // Member function extract a single element from vector (the int8_t from Vec32c::extract is returned as uint8_t)
+    uint8_t extract(uint32_t index) const {
+        return Vec32c::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint8_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec16uc:
+    Vec16uc get_low() const {
+        return y0;
+    }
+    Vec16uc get_high() const {
+        return y1;
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : add
+static inline Vec32uc operator + (Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(a.get_low() + b.get_low(), a.get_high() + b.get_high()); 
+}
+
+// vector operator - : subtract
+static inline Vec32uc operator - (Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(a.get_low() - b.get_low(), a.get_high() - b.get_high()); 
+}
+
+// vector operator * : multiply
+static inline Vec32uc operator * (Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(a.get_low() * b.get_low(), a.get_high() * b.get_high()); 
+}
+
+// vector operator / : divide
+static inline Vec32uc operator / (Vec32uc const & a, Divisor_us const & d) {
+    return Vec32uc(a.get_low() / d, a.get_high() / d);
+}
+
+// vector operator /= : divide
+static inline Vec32uc & operator /= (Vec32uc & a, Divisor_us const & d) {
+    a = a / d;
+    return a;
+}
+
+// vector operator << : shift left all elements
+static inline Vec32uc operator << (Vec32uc const & a, uint32_t b) {
+    return Vec32uc(a.get_low() << b, a.get_high() << b); 
+}
+
+// vector operator << : shift left all elements
+static inline Vec32uc operator << (Vec32uc const & a, int32_t b) {
+    return a << (uint32_t)b;
+}
+
+// vector operator >> : shift right logical all elements
+static inline Vec32uc operator >> (Vec32uc const & a, uint32_t b) {
+    return Vec32uc(a.get_low() >> b, a.get_high() >> b); 
+}
+
+// vector operator >> : shift right logical all elements
+static inline Vec32uc operator >> (Vec32uc const & a, int32_t b) {
+    return a >> (uint32_t)b;
+}
+
+// vector operator >>= : shift right logical
+static inline Vec32uc & operator >>= (Vec32uc & a, uint32_t b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (unsigned)
+static inline Vec32cb operator >= (Vec32uc const & a, Vec32uc const & b) {
+    return Vec32c(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); 
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned)
+static inline Vec32cb operator <= (Vec32uc const & a, Vec32uc const & b) {
+    return b >= a;
+}
+
+// vector operator > : returns true for elements for which a > b (unsigned)
+static inline Vec32cb operator > (Vec32uc const & a, Vec32uc const & b) {
+    return Vec32c(a.get_low() > b.get_low(), a.get_high() > b.get_high()); 
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned)
+static inline Vec32cb operator < (Vec32uc const & a, Vec32uc const & b) {
+    return b > a;
+}
+
+// vector operator & : bitwise and
+static inline Vec32uc operator & (Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(a.get_low() & b.get_low(), a.get_high() & b.get_high());
+}
+static inline Vec32uc operator && (Vec32uc const & a, Vec32uc const & b) {
+    return a & b;  // NOTE: plain bitwise AND, not a per-element truth test (1 && 2 yields 0), and no short-circuit
+}
+
+// vector operator | : bitwise or
+static inline Vec32uc operator | (Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+static inline Vec32uc operator || (Vec32uc const & a, Vec32uc const & b) {
+    return a | b;  // NOTE: plain bitwise OR (1 || 2 yields 3, not a true/-1 mask), and no short-circuit
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec32uc operator ^ (Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
+}
+
+// vector operator ~ : bitwise not
+static inline Vec32uc operator ~ (Vec32uc const & a) {
+    return Vec32uc(~a.get_low(), ~a.get_high());
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
+// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec32uc select (Vec32cb const & s, Vec32uc const & a, Vec32uc const & b) {
+    return selectb(s,a,b);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec32uc if_add (Vec32cb const & f, Vec32uc const & a, Vec32uc const & b) {
+    return a + (Vec32uc(f) & b);
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// Overflow will wrap around
+// (Note: horizontal_add_x(Vec32uc) is slightly faster)
+static inline uint32_t horizontal_add (Vec32uc const & a) {
+    return horizontal_add(a.get_low() + a.get_high());
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements.
+// Each element is zero-extended before addition to avoid overflow
+static inline uint32_t horizontal_add_x (Vec32uc const & a) {
+    return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high());
+}
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec32uc add_saturated(Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high())); 
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec32uc sub_saturated(Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high())); 
+}
+
+// function max: a > b ? a : b
+static inline Vec32uc max(Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); 
+}
+
+// function min: a < b ? a : b
+static inline Vec32uc min(Vec32uc const & a, Vec32uc const & b) {
+    return Vec32uc(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); 
+}
+
+
+    
+/*****************************************************************************
+*
+*          Vector of 16 16-bit signed integers
+*
+*****************************************************************************/
+
+class Vec16s : public Vec256b {
+public:
+    // Default constructor:
+    Vec16s() {
+    };
+    // Constructor to broadcast the same value into all elements (i is truncated to 16 bits by the cast):
+    Vec16s(int i) {
+        y1 = y0 = _mm_set1_epi16((int16_t)i);
+    };
+    // Constructor to build from all elements (i0..i7 go to the low half y0, i8..i15 to the high half y1):
+    Vec16s(int16_t i0, int16_t i1, int16_t i2,  int16_t i3,  int16_t i4,  int16_t i5,  int16_t i6,  int16_t i7,
+           int16_t i8, int16_t i9, int16_t i10, int16_t i11, int16_t i12, int16_t i13, int16_t i14, int16_t i15) {
+        y0 = _mm_setr_epi16(i0, i1, i2,  i3,  i4,  i5,  i6,  i7);
+        y1 = _mm_setr_epi16(i8, i9, i10, i11, i12, i13, i14, i15);
+    };
+    // Constructor to build from two Vec8s (a0 becomes the low half, a1 the high half):
+    Vec16s(Vec8s const & a0, Vec8s const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256ie (copies the low and high halves unchanged)
+    Vec16s(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    };
+    // Assignment operator to convert from type Vec256ie (copies the low and high halves unchanged)
+    Vec16s & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    };
+    // Member function to load from array (unaligned); reads two consecutive __m128i blocks starting at p
+    Vec16s & load(void const * p) {
+        y0 = _mm_loadu_si128((__m128i const*)p);
+        y1 = _mm_loadu_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32 (performed here as two aligned __m128i loads)
+    Vec16s & load_a(void const * p) {
+        y0 = _mm_load_si128((__m128i const*)p);
+        y1 = _mm_load_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0 (n <= 0 gives all zeros, n >= 16 loads the whole vector)
+    Vec16s & load_partial(int n, void const * p) {
+        if (n <= 0) {
+            *this = 0;
+        }
+        else if (n <= 8) {
+            *this = Vec16s(Vec8s().load_partial(n, p), 0);
+        }
+        else if (n < 16) {
+            *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n-8, (int16_t const*)p+8));  // keep p const while skipping the 8 elements already loaded
+        }
+        else {
+            load(p);
+        }
+        return *this;
+    }
+    // Partial store. Store n elements (nothing is written for n <= 0, all 16 elements for n >= 16)
+    void store_partial(int n, void * p) const {
+        if (n <= 0) {
+            return;
+        }
+        else if (n <= 8) {
+            get_low().store_partial(n, p);
+        }
+        else if (n < 16) {
+            get_low().store(p);
+            get_high().store_partial(n-8, (int16_t*)p+8);
+        }
+        else {
+            store(p);
+        }
+    }
+    // cut off vector to n elements. The last 16-n elements are set to zero
+    Vec16s & cutoff(int n) {
+        *this = Vec32c(*this).cutoff(n * 2);  // delegate to the 8-bit vector's cutoff; n 16-bit elements occupy n*2 bytes
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec16s const & insert(uint32_t index, int16_t value) {
+        if (index < 8) {  // elements 0..7 are held in the low half y0
+            y0 = Vec8s(y0).insert(index, value);
+        }
+        else {  // every other index goes to the high half y1 as index-8; no range check is done here
+            y1 = Vec8s(y1).insert(index-8, value);
+        }
+        return *this;
+    };
+    // Member function extract a single element from vector
+    int16_t extract(uint32_t index) const {
+        if (index < 8) {  // elements 0..7 are read from the low half y0
+            return Vec8s(y0).extract(index);
+        }
+        else {  // every other index is read from the high half y1 as index-8; no range check is done here
+            return Vec8s(y1).extract(index-8);
+        }
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int16_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec8s:
+    Vec8s get_low() const {
+        return y0;
+    }
+    Vec8s get_high() const {
+        return y1;
+    }
+    static int size () {
+        return 16;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec16sb: Vector of 16 Booleans for use with Vec16s and Vec16us
+*
+*****************************************************************************/
+
+class Vec16sb : public Vec16s {
+public:
+    // Default constructor:
+    Vec16sb() {
+    }
+    // Constructor to build from all elements (each bool is stored as -int16_t(x): true -> -1, false -> 0):
+    Vec16sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
+        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) :
+        Vec16s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7), 
+            -int16_t(x8), -int16_t(x9), -int16_t(x10), -int16_t(x11), -int16_t(x12), -int16_t(x13), -int16_t(x14), -int16_t(x15))
+        {}
+    // Constructor to convert from type Vec256ie (copies the low and high halves unchanged)
+    Vec16sb(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec256ie (copies the low and high halves unchanged)
+    Vec16sb & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    }
+    // Constructor to broadcast scalar value (every element becomes -1 for true, 0 for false):
+    Vec16sb(bool b) : Vec16s(-int16_t(b)) {
+    }
+    // Assignment operator to broadcast scalar value (builds a temporary with the bool constructor and copies it):
+    Vec16sb & operator = (bool b) {
+        *this = Vec16sb(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc.: these overloads are private and have no body here
+    Vec16sb(int b);
+    Vec16sb & operator = (int x);
+public:
+    // Member functions to split into two Vec8sb:
+    Vec8sb get_low() const {
+        return y0;
+    }
+    Vec8sb get_high() const {
+        return y1;
+    }
+    Vec16sb & insert (int index, bool a) {  // Change a single element; true is stored as -1, false as 0
+        Vec16s::insert(index, -(int)a);
+        return *this;
+    }    
+    // Member function extract a single element from vector (true if the stored element is nonzero)
+    bool extract(uint32_t index) const {
+        return Vec16s::extract(index) != 0;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec16sb
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and
+static inline Vec16sb operator & (Vec16sb const & a, Vec16sb const & b) {
+    return Vec16sb(Vec256b(a) & Vec256b(b));
+}
+static inline Vec16sb operator && (Vec16sb const & a, Vec16sb const & b) {
+    return a & b;  // element-wise AND of two boolean vectors; unlike built-in &&, both operands are always evaluated
+}
+// vector operator &= : bitwise and
+static inline Vec16sb & operator &= (Vec16sb & a, Vec16sb const & b) {
+    a = a & b;
+    return a;
+}
+
+// vector operator | : bitwise or
+static inline Vec16sb operator | (Vec16sb const & a, Vec16sb const & b) {
+    return Vec16sb(Vec256b(a) | Vec256b(b));
+}
+static inline Vec16sb operator || (Vec16sb const & a, Vec16sb const & b) {
+    return a | b;  // element-wise OR of two boolean vectors; unlike built-in ||, both operands are always evaluated
+}
+// vector operator |= : bitwise or
+static inline Vec16sb & operator |= (Vec16sb & a, Vec16sb const & b) {
+    a = a | b;
+    return a;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16sb operator ^ (Vec16sb const & a, Vec16sb const & b) {
+    return Vec16sb(Vec256b(a) ^ Vec256b(b));
+}
+// vector operator ^= : bitwise xor
+static inline Vec16sb & operator ^= (Vec16sb & a, Vec16sb const & b) {
+    a = a ^ b;
+    return a;
+}
+
+// vector operator ~ : bitwise not
+static inline Vec16sb operator ~ (Vec16sb const & a) {
+    return Vec16sb( ~ Vec256b(a));
+}
+
+// vector operator ! : element not
+static inline Vec16sb operator ! (Vec16sb const & a) {
+    return ~ a;
+}
+
+// vector function andnot
+static inline Vec16sb andnot (Vec16sb const & a, Vec16sb const & b) {
+    return Vec16sb(andnot(Vec256b(a), Vec256b(b)));
+}
+
+
+/*****************************************************************************
+*
+*          Operators for Vec16s
+*
+*****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec16s operator + (Vec16s const & a, Vec16s const & b) {
+    return Vec16s(a.get_low() + b.get_low(), a.get_high() + b.get_high());
+}
+
+// vector operator += : add
+static inline Vec16s & operator += (Vec16s & a, Vec16s const & b) {
+    a = a + b;
+    return a;
+}
+
+// postfix operator ++
+static inline Vec16s operator ++ (Vec16s & a, int) {
+    Vec16s a0 = a;
+    a = a + 1;
+    return a0;
+}
+
+// prefix operator ++
+static inline Vec16s & operator ++ (Vec16s & a) {
+    a = a + 1;
+    return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec16s operator - (Vec16s const & a, Vec16s const & b) {
+    return Vec16s(a.get_low() - b.get_low(), a.get_high() - b.get_high());
+}
+
+// vector operator - : unary minus
+static inline Vec16s operator - (Vec16s const & a) {
+    return Vec16s(-a.get_low(), -a.get_high());
+}
+
+// vector operator -= : subtract
+static inline Vec16s & operator -= (Vec16s & a, Vec16s const & b) {
+    a = a - b;
+    return a;
+}
+
+// postfix operator --
+static inline Vec16s operator -- (Vec16s & a, int) {
+    Vec16s a0 = a;
+    a = a - 1;
+    return a0;
+}
+
+// prefix operator --
+static inline Vec16s & operator -- (Vec16s & a) {
+    a = a - 1;
+    return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec16s operator * (Vec16s const & a, Vec16s const & b) {
+    return Vec16s(a.get_low() * b.get_low(), a.get_high() * b.get_high());
+}
+
+// vector operator *= : multiply
+static inline Vec16s & operator *= (Vec16s & a, Vec16s const & b) {
+    a = a * b;
+    return a;
+}
+
+// vector operator / : divide all elements by same integer
+static inline Vec16s operator / (Vec16s const & a, Divisor_s const & d) {
+    return Vec16s(a.get_low() / d, a.get_high() / d);
+}
+
+// vector operator /= : divide
+static inline Vec16s & operator /= (Vec16s & a, Divisor_s const & d) {
+    a = a / d;
+    return a;
+}
+
+// vector operator << : shift left
+static inline Vec16s operator << (Vec16s const & a, int b) {
+    return Vec16s(a.get_low() << b, a.get_high() << b);
+}
+
+// vector operator <<= : shift left
+static inline Vec16s & operator <<= (Vec16s & a, int b) {
+    a = a << b;
+    return a;
+}
+
+// vector operator >> : shift right arithmetic
+static inline Vec16s operator >> (Vec16s const & a, int b) {
+    return Vec16s(a.get_low() >> b, a.get_high() >> b);
+}
+
+// vector operator >>= : shift right arithmetic
+static inline Vec16s & operator >>= (Vec16s & a, int b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16sb operator == (Vec16s const & a, Vec16s const & b) {
+    return Vec16s(a.get_low() == b.get_low(), a.get_high() == b.get_high());
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16sb operator != (Vec16s const & a, Vec16s const & b) {
+    return Vec16s(a.get_low() != b.get_low(), a.get_high() != b.get_high());
+}
+
+// vector operator > : returns true for elements for which a > b
+static inline Vec16sb operator > (Vec16s const & a, Vec16s const & b) {
+    return Vec16s(a.get_low() > b.get_low(), a.get_high() > b.get_high());
+}
+
+// vector operator < : returns true for elements for which a < b
+static inline Vec16sb operator < (Vec16s const & a, Vec16s const & b) {
+    return b > a;
+}
+
+// vector operator >= : element-wise signed compare, true where a >= b
+static inline Vec16sb operator >= (Vec16s const & a, Vec16s const & b) {
+    Vec8s const lo = (a.get_low()  >= b.get_low());
+    Vec8s const hi = (a.get_high() >= b.get_high());
+    return Vec16s(lo, hi);
+}
+
+// vector operator <= : element-wise signed compare, true where a <= b
+// (implemented as b >= a with the operands swapped)
+static inline Vec16sb operator <= (Vec16s const & a, Vec16s const & b) {
+    Vec16sb const swapped = (b >= a);
+    return swapped;
+}
+
+// vector operator & : bitwise and of a and b
+static inline Vec16s operator & (Vec16s const & a, Vec16s const & b) {
+    Vec8s const lo = a.get_low()  & b.get_low();
+    Vec8s const hi = a.get_high() & b.get_high();
+    return Vec16s(lo, hi);
+}
+// vector operator && : same result as operator &
+static inline Vec16s operator && (Vec16s const & a, Vec16s const & b) {
+    Vec16s const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and of a with b, stored in a
+static inline Vec16s & operator &= (Vec16s & a, Vec16s const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or of a and b
+static inline Vec16s operator | (Vec16s const & a, Vec16s const & b) {
+    Vec8s const lo = a.get_low()  | b.get_low();
+    Vec8s const hi = a.get_high() | b.get_high();
+    return Vec16s(lo, hi);
+}
+// vector operator || : same result as operator |
+static inline Vec16s operator || (Vec16s const & a, Vec16s const & b) {
+    Vec16s const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or of a with b, stored in a
+static inline Vec16s & operator |= (Vec16s & a, Vec16s const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise exclusive or of a and b
+static inline Vec16s operator ^ (Vec16s const & a, Vec16s const & b) {
+    Vec8s const lo = a.get_low()  ^ b.get_low();
+    Vec8s const hi = a.get_high() ^ b.get_high();
+    return Vec16s(lo, hi);
+}
+// vector operator ^= : bitwise exclusive or of a with b, stored in a
+static inline Vec16s & operator ^= (Vec16s & a, Vec16s const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not, computed on the Vec256b view of a
+static inline Vec16s operator ~ (Vec16s const & a) {
+    Vec256b const bits(a);
+    return Vec16s(~bits);
+}
+
+// vector operator ! : logical not, true for elements that are == 0
+static inline Vec16sb operator ! (Vec16s const & a) {
+    Vec8s const lo = !a.get_low();
+    Vec8s const hi = !a.get_high();
+    return Vec16s(lo, hi);
+}
+
+// Functions for this class
+
+// Select between two operands, element by element. Equivalent pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element of s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec16s select (Vec16sb const & s, Vec16s const & a, Vec16s const & b) {
+    Vec16s const chosen = selectb(s, a, b);
+    return chosen;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i.
+// b is masked with f first, so elements where f is false add zero.
+static inline Vec16s if_add (Vec16sb const & f, Vec16s const & a, Vec16s const & b) {
+    Vec16s const masked = Vec16s(f) & b;
+    return a + masked;
+}
+
+// Horizontal add: sum of all vector elements.
+// The two halves are added element-wise first; overflow will wrap around
+static inline int32_t horizontal_add (Vec16s const & a) {
+    Vec8s const folded = a.get_low() + a.get_high();
+    return horizontal_add(folded);
+}
+
+// Horizontal add extended: sum of all vector elements.
+// Elements are sign extended before adding to avoid overflow
+static inline int32_t horizontal_add_x (Vec16s const & a) {
+    Vec8s const lo = a.get_low();
+    Vec8s const hi = a.get_high();
+    return horizontal_add_x(lo) + horizontal_add_x(hi);
+}
+
+// function add_saturated: element-wise signed addition with saturation
+static inline Vec16s add_saturated(Vec16s const & a, Vec16s const & b) {
+    Vec8s const lo = add_saturated(a.get_low(),  b.get_low());
+    Vec8s const hi = add_saturated(a.get_high(), b.get_high());
+    return Vec16s(lo, hi);
+}
+
+// function sub_saturated: element-wise signed subtraction with saturation
+static inline Vec16s sub_saturated(Vec16s const & a, Vec16s const & b) {
+    Vec8s const lo = sub_saturated(a.get_low(),  b.get_low());
+    Vec8s const hi = sub_saturated(a.get_high(), b.get_high());
+    return Vec16s(lo, hi);
+}
+
+// function max: element-wise a > b ? a : b
+static inline Vec16s max(Vec16s const & a, Vec16s const & b) {
+    Vec8s const lo = max(a.get_low(),  b.get_low());
+    Vec8s const hi = max(a.get_high(), b.get_high());
+    return Vec16s(lo, hi);
+}
+
+// function min: element-wise a < b ? a : b
+static inline Vec16s min(Vec16s const & a, Vec16s const & b) {
+    Vec8s const lo = min(a.get_low(),  b.get_low());
+    Vec8s const hi = min(a.get_high(), b.get_high());
+    return Vec16s(lo, hi);
+}
+
+// function abs: element-wise a >= 0 ? a : -a
+static inline Vec16s abs(Vec16s const & a) {
+    Vec8s const lo = abs(a.get_low());
+    Vec8s const hi = abs(a.get_high());
+    return Vec16s(lo, hi);
+}
+
+// function abs_saturated: like abs, but saturates if the result overflows
+static inline Vec16s abs_saturated(Vec16s const & a) {
+    Vec8s const lo = abs_saturated(a.get_low());
+    Vec8s const hi = abs_saturated(a.get_high());
+    return Vec16s(lo, hi);
+}
+
+// function rotate_left: rotate every element left by b bits
+// Use negative count to rotate right
+static inline Vec16s rotate_left(Vec16s const & a, int b) {
+    Vec8s const lo = rotate_left(a.get_low(),  b);
+    Vec8s const hi = rotate_left(a.get_high(), b);
+    return Vec16s(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 16 16-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec16us : public Vec16s {
+public:
+    // Default constructor (empty body; sets no element here):
+    Vec16us(){
+    };
+    // Constructor to broadcast the same value into all elements (i is cast to 16 bits):
+    Vec16us(uint32_t i) {
+        y1 = y0 = _mm_set1_epi16((int16_t)i);
+    };
+    // Constructor to build from all 16 elements; i0..i7 go to the low half, i8..i15 to the high half:
+    Vec16us(uint16_t i0, uint16_t i1, uint16_t i2,  uint16_t i3,  uint16_t i4,  uint16_t i5,  uint16_t i6,  uint16_t i7,
+            uint16_t i8, uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15) {
+        y0 = _mm_setr_epi16(i0, i1, i2,  i3,  i4,  i5,  i6,  i7);
+        y1 = _mm_setr_epi16(i8, i9, i10, i11, i12, i13, i14, i15 );
+    };
+    // Constructor to build from two Vec8us (a0 = low half, a1 = high half):
+    Vec16us(Vec8us const & a0, Vec8us const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256ie
+    Vec16us(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    };
+    // Assignment operator to convert from type Vec256ie
+    Vec16us & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    };
+    // Member function to load 32 bytes from array (unaligned) as two 16-byte halves
+    Vec16us & load(void const * p) {
+        y0 = _mm_loadu_si128((__m128i const*)p);
+        y1 = _mm_loadu_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32 (done as two aligned 16-byte loads)
+    Vec16us & load_a(void const * p) {
+        y0 = _mm_load_si128((__m128i const*)p);
+        y1 = _mm_load_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to change a single element in vector (forwards to Vec16s::insert)
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec16us const & insert(uint32_t index, uint16_t value) {
+        Vec16s::insert(index, value);
+        return *this;
+    };
+    // Member function to extract a single element from vector, returned as uint16_t
+    uint16_t extract(uint32_t index) const {
+        return Vec16s::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint16_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec8us (low half y0, high half y1):
+    Vec8us get_low() const {
+        return y0;
+    }
+    Vec8us get_high() const {
+        return y1;
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : element-wise addition
+static inline Vec16us operator + (Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = a.get_low()  + b.get_low();
+    Vec8us const hi = a.get_high() + b.get_high();
+    return Vec16us(lo, hi);
+}
+
+// vector operator - : element-wise subtraction
+static inline Vec16us operator - (Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = a.get_low()  - b.get_low();
+    Vec8us const hi = a.get_high() - b.get_high();
+    return Vec16us(lo, hi);
+}
+
+// vector operator * : element-wise multiplication
+static inline Vec16us operator * (Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = a.get_low()  * b.get_low();
+    Vec8us const hi = a.get_high() * b.get_high();
+    return Vec16us(lo, hi);
+}
+
+// vector operator / : divide every element of a by the divisor d
+static inline Vec16us operator / (Vec16us const & a, Divisor_us const & d) {
+    Vec8us const lo = a.get_low()  / d;
+    Vec8us const hi = a.get_high() / d;
+    return Vec16us(lo, hi);
+}
+
+// vector operator /= : divide a by d in place and return a
+static inline Vec16us & operator /= (Vec16us & a, Divisor_us const & d) {
+    return a = a / d;
+}
+
+// vector operator >> : logical right shift of every element by b bits
+static inline Vec16us operator >> (Vec16us const & a, uint32_t b) {
+    Vec8us const lo = a.get_low()  >> b;
+    Vec8us const hi = a.get_high() >> b;
+    return Vec16us(lo, hi);
+}
+
+// vector operator >> : logical right shift, signed count
+// (the count is converted to uint32_t and passed to the unsigned overload)
+static inline Vec16us operator >> (Vec16us const & a, int b) {
+    uint32_t const count = (uint32_t)b;
+    return a >> count;
+}
+
+// vector operator >>= : shift right logical, stored in a
+static inline Vec16us & operator >>= (Vec16us & a, uint32_t b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator >>= : shift right logical, signed count
+// Without this overload a call such as a >>= 1 on a Vec16us is ambiguous
+// between (Vec16us &, uint32_t) and the (Vec16s &, int) version.
+static inline Vec16us & operator >>= (Vec16us & a, int32_t b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator << : shift every element left by b bits
+static inline Vec16us operator << (Vec16us const & a, uint32_t b) {
+    Vec8us const lo = a.get_low()  << b;
+    Vec8us const hi = a.get_high() << b;
+    return Vec16us(lo, hi);
+}
+
+// vector operator << : shift left, signed count
+// (the count is converted to uint32_t and passed to the unsigned overload)
+static inline Vec16us operator << (Vec16us const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a << count;
+}
+
+// vector operator >= : element-wise unsigned compare, true where a >= b
+static inline Vec16sb operator >= (Vec16us const & a, Vec16us const & b) {
+    Vec8s const lo = (a.get_low()  >= b.get_low());
+    Vec8s const hi = (a.get_high() >= b.get_high());
+    return Vec16s(lo, hi);
+}
+
+// vector operator <= : element-wise unsigned compare, true where a <= b
+// (implemented as b >= a with the operands swapped)
+static inline Vec16sb operator <= (Vec16us const & a, Vec16us const & b) {
+    Vec16sb const swapped = (b >= a);
+    return swapped;
+}
+
+// vector operator > : element-wise unsigned compare, true where a > b
+static inline Vec16sb operator > (Vec16us const & a, Vec16us const & b) {
+    Vec8s const lo = (a.get_low()  > b.get_low());
+    Vec8s const hi = (a.get_high() > b.get_high());
+    return Vec16s(lo, hi);
+}
+
+// vector operator < : element-wise unsigned compare, true where a < b
+// (implemented as b > a with the operands swapped)
+static inline Vec16sb operator < (Vec16us const & a, Vec16us const & b) {
+    Vec16sb const swapped = (b > a);
+    return swapped;
+}
+
+// vector operator & : bitwise and of a and b
+static inline Vec16us operator & (Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = a.get_low()  & b.get_low();
+    Vec8us const hi = a.get_high() & b.get_high();
+    return Vec16us(lo, hi);
+}
+// vector operator && : same result as operator &
+static inline Vec16us operator && (Vec16us const & a, Vec16us const & b) {
+    Vec16us const both = a & b;
+    return both;
+}
+
+// vector operator | : bitwise or of a and b
+static inline Vec16us operator | (Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = a.get_low()  | b.get_low();
+    Vec8us const hi = a.get_high() | b.get_high();
+    return Vec16us(lo, hi);
+}
+// vector operator || : same result as operator |
+static inline Vec16us operator || (Vec16us const & a, Vec16us const & b) {
+    Vec16us const either = a | b;
+    return either;
+}
+
+// vector operator ^ : bitwise exclusive or of a and b
+static inline Vec16us operator ^ (Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = a.get_low()  ^ b.get_low();
+    Vec8us const hi = a.get_high() ^ b.get_high();
+    return Vec16us(lo, hi);
+}
+
+// vector operator ~ : bitwise not, computed on the Vec256b view of a
+static inline Vec16us operator ~ (Vec16us const & a) {
+    Vec256b const bits(a);
+    return Vec16us(~bits);
+}
+
+
+// Functions for this class
+
+// Select between two operands, element by element. Equivalent pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec16us select (Vec16sb const & s, Vec16us const & a, Vec16us const & b) {
+    Vec16us const chosen = selectb(s, a, b);
+    return chosen;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i.
+// b is masked with f first, so elements where f is false add zero.
+static inline Vec16us if_add (Vec16sb const & f, Vec16us const & a, Vec16us const & b) {
+    Vec16us const masked = Vec16us(f) & b;
+    return a + masked;
+}
+
+// Horizontal add: sum of all vector elements.
+// The two halves are added element-wise first; overflow will wrap around
+static inline uint32_t horizontal_add (Vec16us const & a) {
+    Vec8us const folded = a.get_low() + a.get_high();
+    return horizontal_add(folded);
+}
+
+// Horizontal add extended: sum of all vector elements.
+// Each element is zero-extended before addition to avoid overflow
+static inline uint32_t horizontal_add_x (Vec16us const & a) {
+    Vec8us const lo = a.get_low();
+    Vec8us const hi = a.get_high();
+    return horizontal_add_x(lo) + horizontal_add_x(hi);
+}
+
+// function add_saturated: element-wise unsigned addition with saturation
+static inline Vec16us add_saturated(Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = add_saturated(a.get_low(),  b.get_low());
+    Vec8us const hi = add_saturated(a.get_high(), b.get_high());
+    return Vec16us(lo, hi);
+}
+
+// function sub_saturated: element-wise unsigned subtraction with saturation
+static inline Vec16us sub_saturated(Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = sub_saturated(a.get_low(),  b.get_low());
+    Vec8us const hi = sub_saturated(a.get_high(), b.get_high());
+    return Vec16us(lo, hi);
+}
+
+// function max: element-wise a > b ? a : b
+static inline Vec16us max(Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = max(a.get_low(),  b.get_low());
+    Vec8us const hi = max(a.get_high(), b.get_high());
+    return Vec16us(lo, hi);
+}
+
+// function min: element-wise a < b ? a : b
+static inline Vec16us min(Vec16us const & a, Vec16us const & b) {
+    Vec8us const lo = min(a.get_low(),  b.get_low());
+    Vec8us const hi = min(a.get_high(), b.get_high());
+    return Vec16us(lo, hi);
+}
+
+
+
+/*****************************************************************************
+*
+*          Vector of 8 32-bit signed integers
+*
+*****************************************************************************/
+
+// Vector of 8 32-bit signed integers, stored as two 128-bit halves:
+// y0 holds elements 0-3 and y1 holds elements 4-7.
+class Vec8i : public Vec256b {
+public:
+    // Default constructor:
+    Vec8i() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec8i(int i) {
+        y1 = y0 = _mm_set1_epi32(i);
+    }
+    // Constructor to build from all elements:
+    Vec8i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7) {
+        y0 = _mm_setr_epi32(i0, i1, i2, i3);
+        y1 = _mm_setr_epi32(i4, i5, i6, i7);
+    }
+    // Constructor to build from two Vec4i (a0 = low half, a1 = high half):
+    Vec8i(Vec4i const & a0, Vec4i const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256ie
+    Vec8i(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec256ie
+    Vec8i & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    }
+    // Member function to load from array (unaligned)
+    Vec8i & load(void const * p) {
+        y0 = _mm_loadu_si128((__m128i const*)p);
+        y1 = _mm_loadu_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    Vec8i & load_a(void const * p) {
+        y0 = _mm_load_si128((__m128i const*)p);
+        y1 = _mm_load_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    Vec8i & load_partial(int n, void const * p) {
+        if (n <= 0) {
+            *this = 0;
+        }
+        else if (n <= 4) {
+            *this = Vec8i(Vec4i().load_partial(n, p), 0);
+        }
+        else if (n < 8) {
+            // Step to element 4 through a pointer-to-const so that the
+            // const qualifier of p is not cast away.
+            *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n-4, (int32_t const*)p+4));
+        }
+        else {
+            load(p);
+        }
+        return *this;
+    }
+    // Partial store. Store n elements
+    void store_partial(int n, void * p) const {
+        if (n <= 0) {
+            return;
+        }
+        else if (n <= 4) {
+            get_low().store_partial(n, p);
+        }
+        else if (n < 8) {
+            get_low().store(p);
+            get_high().store_partial(n-4, (int32_t*)p+4);
+        }
+        else {
+            store(p);
+        }
+    }
+    // cut off vector to n elements. The last 8-n elements are set to zero
+    // (done on the byte vector, hence the count n * 4)
+    Vec8i & cutoff(int n) {
+        *this = Vec32c(*this).cutoff(n * 4);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8i const & insert(uint32_t index, int32_t value) {
+        if (index < 4) {
+            y0 = Vec4i(y0).insert(index, value);
+        }
+        else {
+            y1 = Vec4i(y1).insert(index-4, value);
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    int32_t extract(uint32_t index) const {
+        if (index < 4) {
+            return Vec4i(y0).extract(index);
+        }
+        else {
+            return Vec4i(y1).extract(index-4);
+        }
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int32_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec4i:
+    Vec4i get_low() const {
+        return y0;
+    }
+    Vec4i get_high() const {
+        return y1;
+    }
+    // Number of elements in the vector
+    static int size () {
+        return 8;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec8ib: Vector of 8 Booleans for use with Vec8i and Vec8ui
+*
+*****************************************************************************/
+
+class Vec8ib : public Vec8i {
+public:
+    // Default constructor:
+    Vec8ib() {
+    }
+    // Constructor to build from all elements; each true becomes -1 and each false becomes 0:
+    Vec8ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) :
+        Vec8i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3), -int32_t(x4), -int32_t(x5), -int32_t(x6), -int32_t(x7))
+        {}
+    // Constructor to convert from type Vec256ie
+    Vec8ib(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec256ie
+    Vec8ib & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    }
+    // Constructor to broadcast scalar value (true -> -1, false -> 0 in every element):
+    Vec8ib(bool b) : Vec8i(-int32_t(b)) {
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec8ib & operator = (bool b) {
+        *this = Vec8ib(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc. (declared only, never defined here)
+    Vec8ib(int b);
+    Vec8ib & operator = (int x);
+public:
+    // Member functions to split into two Vec4ib:
+    Vec4ib get_low() const {
+        return y0;
+    }
+    Vec4ib get_high() const {
+        return y1;
+    }
+    Vec8ib & insert (int index, bool a) { // change a single element; a is stored as -1 (true) or 0 (false)
+        Vec8i::insert(index, -(int)a);
+        return *this;
+    };
+    // Member function extract a single element from vector; any nonzero element reads as true
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    bool extract(uint32_t index) const {
+        return Vec8i::extract(index) != 0;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+/*****************************************************************************
+*
+*          Define operators for Vec8ib
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and, computed on the Vec256b views of a and b
+static inline Vec8ib operator & (Vec8ib const & a, Vec8ib const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ib(lhs & rhs);
+}
+// vector operator && : same result as operator &
+static inline Vec8ib operator && (Vec8ib const & a, Vec8ib const & b) {
+    Vec8ib const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and of a with b, stored in a
+static inline Vec8ib & operator &= (Vec8ib & a, Vec8ib const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or, computed on the Vec256b views of a and b
+static inline Vec8ib operator | (Vec8ib const & a, Vec8ib const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ib(lhs | rhs);
+}
+// vector operator || : same result as operator |
+static inline Vec8ib operator || (Vec8ib const & a, Vec8ib const & b) {
+    Vec8ib const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or of a with b, stored in a
+static inline Vec8ib & operator |= (Vec8ib & a, Vec8ib const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise exclusive or, computed on the Vec256b views of a and b
+static inline Vec8ib operator ^ (Vec8ib const & a, Vec8ib const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ib(lhs ^ rhs);
+}
+// vector operator ^= : bitwise exclusive or of a with b, stored in a
+static inline Vec8ib & operator ^= (Vec8ib & a, Vec8ib const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not, computed on the Vec256b view of a
+static inline Vec8ib operator ~ (Vec8ib const & a) {
+    Vec256b const bits(a);
+    return Vec8ib(~bits);
+}
+
+// vector operator ! : element not (same result as operator ~)
+static inline Vec8ib operator ! (Vec8ib const & a) {
+    Vec8ib const flipped = ~a;
+    return flipped;
+}
+
+// vector function andnot: forwards to andnot on the Vec256b views of a and b
+static inline Vec8ib andnot (Vec8ib const & a, Vec8ib const & b) {
+    Vec256b const lhs(a);
+    Vec256b const rhs(b);
+    return Vec8ib(andnot(lhs, rhs));
+}
+
+
+/*****************************************************************************
+*
+*          Operators for Vec8i
+*
+*****************************************************************************/
+
+// vector operator + : element-wise addition
+static inline Vec8i operator + (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = a.get_low()  + b.get_low();
+    Vec4i const hi = a.get_high() + b.get_high();
+    return Vec8i(lo, hi);
+}
+
+// vector operator += : add b to a in place and return a
+static inline Vec8i & operator += (Vec8i & a, Vec8i const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1 to every element, return the value before the increment
+static inline Vec8i operator ++ (Vec8i & a, int) {
+    Vec8i const before(a);
+    a = a + Vec8i(1);
+    return before;
+}
+
+// prefix operator ++ : add 1 to every element and return a
+static inline Vec8i & operator ++ (Vec8i & a) {
+    return a = a + Vec8i(1);
+}
+
+// vector operator - : element-wise subtraction
+static inline Vec8i operator - (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = a.get_low()  - b.get_low();
+    Vec4i const hi = a.get_high() - b.get_high();
+    return Vec8i(lo, hi);
+}
+
+// vector operator - : unary minus, negates every element
+static inline Vec8i operator - (Vec8i const & a) {
+    Vec4i const lo = -a.get_low();
+    Vec4i const hi = -a.get_high();
+    return Vec8i(lo, hi);
+}
+
+// vector operator -= : subtract b from a in place and return a
+static inline Vec8i & operator -= (Vec8i & a, Vec8i const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1 from every element, return the value before the decrement
+static inline Vec8i operator -- (Vec8i & a, int) {
+    Vec8i const before(a);
+    a = a - Vec8i(1);
+    return before;
+}
+
+// prefix operator -- : subtract 1 from every element and return a
+static inline Vec8i & operator -- (Vec8i & a) {
+    return a = a - Vec8i(1);
+}
+
+// vector operator * : element-wise multiplication
+static inline Vec8i operator * (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = a.get_low()  * b.get_low();
+    Vec4i const hi = a.get_high() * b.get_high();
+    return Vec8i(lo, hi);
+}
+
+// vector operator *= : multiply a by b in place and return a
+static inline Vec8i & operator *= (Vec8i & a, Vec8i const & b) {
+    return a = a * b;
+}
+
+// vector operator / : divide every element of a by the divisor d
+static inline Vec8i operator / (Vec8i const & a, Divisor_i const & d) {
+    Vec4i const lo = a.get_low()  / d;
+    Vec4i const hi = a.get_high() / d;
+    return Vec8i(lo, hi);
+}
+
+// vector operator /= : divide a by d in place and return a
+static inline Vec8i & operator /= (Vec8i & a, Divisor_i const & d) {
+    return a = a / d;
+}
+
+// vector operator << : shift every element left by b bits
+static inline Vec8i operator << (Vec8i const & a, int32_t b) {
+    Vec4i const lo = a.get_low()  << b;
+    Vec4i const hi = a.get_high() << b;
+    return Vec8i(lo, hi);
+}
+
+// vector operator <<= : shift a left by b bits in place and return a
+static inline Vec8i & operator <<= (Vec8i & a, int32_t b) {
+    return a = a << b;
+}
+
+// vector operator >> : arithmetic right shift of every element by b bits
+static inline Vec8i operator >> (Vec8i const & a, int32_t b) {
+    Vec4i const lo = a.get_low()  >> b;
+    Vec4i const hi = a.get_high() >> b;
+    return Vec8i(lo, hi);
+}
+
+// vector operator >>= : arithmetic right shift of a by b bits, in place
+static inline Vec8i & operator >>= (Vec8i & a, int32_t b) {
+    return a = a >> b;
+}
+
+// vector operator == : element-wise compare, true where a == b
+static inline Vec8ib operator == (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = (a.get_low()  == b.get_low());
+    Vec4i const hi = (a.get_high() == b.get_high());
+    return Vec8i(lo, hi);
+}
+
+// vector operator != : element-wise compare, true where a != b
+static inline Vec8ib operator != (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = (a.get_low()  != b.get_low());
+    Vec4i const hi = (a.get_high() != b.get_high());
+    return Vec8i(lo, hi);
+}
+  
+// vector operator > : element-wise compare, true where a > b
+static inline Vec8ib operator > (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = (a.get_low()  > b.get_low());
+    Vec4i const hi = (a.get_high() > b.get_high());
+    return Vec8i(lo, hi);
+}
+
+// vector operator < : element-wise compare, true where a < b
+// (implemented as b > a with the operands swapped)
+static inline Vec8ib operator < (Vec8i const & a, Vec8i const & b) {
+    Vec8ib const swapped = (b > a);
+    return swapped;
+}
+
+// vector operator >= : element-wise signed compare, true where a >= b
+static inline Vec8ib operator >= (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = (a.get_low()  >= b.get_low());
+    Vec4i const hi = (a.get_high() >= b.get_high());
+    return Vec8i(lo, hi);
+}
+
+// vector operator <= : element-wise signed compare, true where a <= b
+// (implemented as b >= a with the operands swapped)
+static inline Vec8ib operator <= (Vec8i const & a, Vec8i const & b) {
+    Vec8ib const swapped = (b >= a);
+    return swapped;
+}
+
+// vector operator & : bitwise and of a and b
+static inline Vec8i operator & (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = a.get_low()  & b.get_low();
+    Vec4i const hi = a.get_high() & b.get_high();
+    return Vec8i(lo, hi);
+}
+// vector operator && : same result as operator &
+static inline Vec8i operator && (Vec8i const & a, Vec8i const & b) {
+    Vec8i const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and of a with b, stored in a
+static inline Vec8i & operator &= (Vec8i & a, Vec8i const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or of a and b
+static inline Vec8i operator | (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = a.get_low()  | b.get_low();
+    Vec4i const hi = a.get_high() | b.get_high();
+    return Vec8i(lo, hi);
+}
+// vector operator || : same result as operator |
+static inline Vec8i operator || (Vec8i const & a, Vec8i const & b) {
+    Vec8i const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or of a with b, stored in a
+static inline Vec8i & operator |= (Vec8i & a, Vec8i const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise exclusive or of a and b
+static inline Vec8i operator ^ (Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = a.get_low()  ^ b.get_low();
+    Vec4i const hi = a.get_high() ^ b.get_high();
+    return Vec8i(lo, hi);
+}
+// vector operator ^= : bitwise exclusive or of a with b, stored in a
+static inline Vec8i & operator ^= (Vec8i & a, Vec8i const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not of every element
+static inline Vec8i operator ~ (Vec8i const & a) {
+    Vec4i const lo = ~a.get_low();
+    Vec4i const hi = ~a.get_high();
+    return Vec8i(lo, hi);
+}
+
+// vector operator ! : logical not, true for elements that are == 0
+static inline Vec8ib operator ! (Vec8i const & a) {
+    Vec4i const lo = !a.get_low();
+    Vec4i const hi = !a.get_high();
+    return Vec8i(lo, hi);
+}
+
+// Functions for this class
+
+// Select between two operands, element by element. Equivalent pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each element of s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8i select (Vec8ib const & s, Vec8i const & a, Vec8i const & b) {
+    Vec8i const chosen = selectb(s, a, b);
+    return chosen;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i.
+// b is masked with f first, so elements where f is false add zero.
+static inline Vec8i if_add (Vec8ib const & f, Vec8i const & a, Vec8i const & b) {
+    Vec8i const masked = Vec8i(f) & b;
+    return a + masked;
+}
+
+// Horizontal add: sum of all vector elements.
+// The two halves are added element-wise first; overflow will wrap around
+static inline int32_t horizontal_add (Vec8i const & a) {
+    Vec4i const folded = a.get_low() + a.get_high();
+    return horizontal_add(folded);
+}
+
+// Horizontal add extended: sum of all vector elements.
+// Elements are sign extended before adding to avoid overflow
+static inline int64_t horizontal_add_x (Vec8i const & a) {
+    Vec4i const lo = a.get_low();
+    Vec4i const hi = a.get_high();
+    return horizontal_add_x(lo) + horizontal_add_x(hi);
+}
+
+// function add_saturated: element-wise signed addition with saturation
+static inline Vec8i add_saturated(Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = add_saturated(a.get_low(),  b.get_low());
+    Vec4i const hi = add_saturated(a.get_high(), b.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function sub_saturated: element-wise signed subtraction with saturation
+static inline Vec8i sub_saturated(Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = sub_saturated(a.get_low(),  b.get_low());
+    Vec4i const hi = sub_saturated(a.get_high(), b.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function max: element-wise a > b ? a : b
+static inline Vec8i max(Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = max(a.get_low(),  b.get_low());
+    Vec4i const hi = max(a.get_high(), b.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function min: element-wise a < b ? a : b
+static inline Vec8i min(Vec8i const & a, Vec8i const & b) {
+    Vec4i const lo = min(a.get_low(),  b.get_low());
+    Vec4i const hi = min(a.get_high(), b.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function abs: element-wise a >= 0 ? a : -a
+static inline Vec8i abs(Vec8i const & a) {
+    Vec4i const lo = abs(a.get_low());
+    Vec4i const hi = abs(a.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function abs_saturated: like abs, but saturates if the result overflows
+static inline Vec8i abs_saturated(Vec8i const & a) {
+    Vec4i const lo = abs_saturated(a.get_low());
+    Vec4i const hi = abs_saturated(a.get_high());
+    return Vec8i(lo, hi);
+}
+
+// function rotate_left: rotate every element left by b bits
+// Use negative count to rotate right
+static inline Vec8i rotate_left(Vec8i const & a, int b) {
+    Vec4i const lo = rotate_left(a.get_low(),  b);
+    Vec4i const hi = rotate_left(a.get_high(), b);
+    return Vec8i(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 8 32-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec8ui : public Vec8i {
+public:
+    // Default constructor (empty body; sets no element here):
+    Vec8ui() {
+    };
+    // Constructor to broadcast the same value into all 8 elements:
+    Vec8ui(uint32_t i) {
+        y1 = y0 = _mm_set1_epi32(i);
+    };
+    // Constructor to build from all elements; i0..i3 go to the low half, i4..i7 to the high half:
+    Vec8ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7) {
+        y0 = _mm_setr_epi32(i0, i1, i2, i3);
+        y1 = _mm_setr_epi32(i4, i5, i6, i7);
+    };
+    // Constructor to build from two Vec4ui (a0 = low half, a1 = high half):
+    Vec8ui(Vec4ui const & a0, Vec4ui const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256ie
+    Vec8ui(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    };
+    // Assignment operator to convert from type Vec256ie
+    Vec8ui & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    };
+    // Member function to load 32 bytes from array (unaligned) as two 16-byte halves
+    Vec8ui & load(void const * p) {
+        y0 = _mm_loadu_si128((__m128i const*)p);
+        y1 = _mm_loadu_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32 (done as two aligned 16-byte loads)
+    Vec8ui & load_a(void const * p) {
+        y0 = _mm_load_si128((__m128i const*)p);
+        y1 = _mm_load_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to change a single element in vector (forwards to Vec8i::insert)
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8ui const & insert(uint32_t index, uint32_t value) {
+        Vec8i::insert(index, value);
+        return *this;
+    }
+    // Member function to extract a single element from vector, returned as uint32_t
+    uint32_t extract(uint32_t index) const {
+        return Vec8i::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint32_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec4ui (low half y0, high half y1):
+    Vec4ui get_low() const {
+        return y0;
+    }
+    Vec4ui get_high() const {
+        return y1;
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : element-wise addition, carried out by the Vec8i operator
+static inline Vec8ui operator + (Vec8ui const & a, Vec8ui const & b) {
+    Vec8i const sum = Vec8i(a) + Vec8i(b);
+    return Vec8ui(sum);
+}
+
+// vector operator - : element-wise subtraction, carried out by the Vec8i operator
+static inline Vec8ui operator - (Vec8ui const & a, Vec8ui const & b) {
+    Vec8i const difference = Vec8i(a) - Vec8i(b);
+    return Vec8ui(difference);
+}
+
+// vector operator * : element-wise multiplication, carried out by the Vec8i operator
+static inline Vec8ui operator * (Vec8ui const & a, Vec8ui const & b) {
+    Vec8i const product = Vec8i(a) * Vec8i(b);
+    return Vec8ui(product);
+}
+
+// vector operator / : divide every element of a by the divisor d
+static inline Vec8ui operator / (Vec8ui const & a, Divisor_ui const & d) {
+    Vec4ui const lo = a.get_low()  / d;
+    Vec4ui const hi = a.get_high() / d;
+    return Vec8ui(lo, hi);
+}
+
+// vector operator /= : divide a by d in place and return a
+static inline Vec8ui & operator /= (Vec8ui & a, Divisor_ui const & d) {
+    return a = a / d;
+}
+
+// vector operator >> : logical right shift of every element by b bits
+static inline Vec8ui operator >> (Vec8ui const & a, uint32_t b) {
+    Vec4ui const lo = a.get_low()  >> b;
+    Vec4ui const hi = a.get_high() >> b;
+    return Vec8ui(lo, hi);
+}
+
+// vector operator >> : logical right shift, signed count
+// (the count is converted to uint32_t and passed to the unsigned overload)
+static inline Vec8ui operator >> (Vec8ui const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a >> count;
+}
+
+// vector operator >>= : logical right shift of a by b bits, in place
+static inline Vec8ui & operator >>= (Vec8ui & a, uint32_t b) {
+    return a = a >> b;
+}
+
+// vector operator >>= : logical right shift of a by b bits, in place (signed count)
+static inline Vec8ui & operator >>= (Vec8ui & a, int32_t b) {
+    return a = a >> b;
+}
+
+// vector operator << : shift every element left by b bits,
+// carried out by the Vec8i operator
+static inline Vec8ui operator << (Vec8ui const & a, uint32_t b) {
+    Vec8i const shifted = (Vec8i)a << (int32_t)b;
+    return Vec8ui(shifted);
+}
+
+// vector operator << : shift left all elements (signed count)
+static inline Vec8ui operator << (Vec8ui const & a, int32_t b) {
+    Vec8i const shifted = (Vec8i)a << b;
+    return Vec8ui(shifted);
+}
+
+// vector operator > : per-element unsigned a > b, evaluated on each half
+static inline Vec8ib operator > (Vec8ui const & a, Vec8ui const & b) {
+    Vec4i const lo = a.get_low()  > b.get_low();
+    Vec4i const hi = a.get_high() > b.get_high();
+    return Vec8i(lo, hi);
+}
+
+// vector operator < : per-element unsigned a < b, expressed as b > a
+static inline Vec8ib operator < (Vec8ui const & a, Vec8ui const & b) {
+    Vec8ib const mask = b > a;
+    return mask;
+}
+
+// vector operator >= : per-element unsigned a >= b, evaluated on each half
+static inline Vec8ib operator >= (Vec8ui const & a, Vec8ui const & b) {
+    Vec4i const lo = a.get_low()  >= b.get_low();
+    Vec4i const hi = a.get_high() >= b.get_high();
+    return Vec8i(lo, hi);
+}
+
+// vector operator <= : per-element unsigned a <= b, expressed as b >= a
+static inline Vec8ib operator <= (Vec8ui const & a, Vec8ui const & b) {
+    Vec8ib const mask = b >= a;
+    return mask;
+}
+
+// vector operator & : bitwise and of both halves
+static inline Vec8ui operator & (Vec8ui const & a, Vec8ui const & b) {
+    Vec4ui const lo = a.get_low()  & b.get_low();
+    Vec4ui const hi = a.get_high() & b.get_high();
+    return Vec8ui(lo, hi);
+}
+// operator && : alias of the bitwise & above (both operands are always evaluated)
+static inline Vec8ui operator && (Vec8ui const & a, Vec8ui const & b) {
+    Vec8ui const both = a & b;
+    return both;
+}
+
+// vector operator | : bitwise or of both halves
+static inline Vec8ui operator | (Vec8ui const & a, Vec8ui const & b) {
+    Vec4ui const lo = a.get_low()  | b.get_low();
+    Vec4ui const hi = a.get_high() | b.get_high();
+    return Vec8ui(lo, hi);
+}
+// operator || : alias of the bitwise | above (both operands are always evaluated)
+static inline Vec8ui operator || (Vec8ui const & a, Vec8ui const & b) {
+    Vec8ui const either = a | b;
+    return either;
+}
+
+// vector operator ^ : bitwise xor of both halves
+static inline Vec8ui operator ^ (Vec8ui const & a, Vec8ui const & b) {
+    Vec4ui const lo = a.get_low()  ^ b.get_low();
+    Vec4ui const hi = a.get_high() ^ b.get_high();
+    return Vec8ui(lo, hi);
+}
+
+// vector operator ~ : bitwise complement of both halves
+static inline Vec8ui operator ~ (Vec8ui const & a) {
+    Vec4ui const lo = ~a.get_low();
+    Vec4ui const hi = ~a.get_high();
+    return Vec8ui(lo, hi);
+}
+
+// Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+// Each word in s must be either 0 (false) or -1 (true). No other values are allowed.
+// (s is signed)
+static inline Vec8ui select (Vec8ib const & s, Vec8ui const & a, Vec8ui const & b) {
+    return selectb(s,a,b);
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+// b is masked with f first, then the masked value is added to a.
+static inline Vec8ui if_add (Vec8ib const & f, Vec8ui const & a, Vec8ui const & b) {
+    Vec8ui const addend = Vec8ui(f) & b;
+    return a + addend;
+}
+
+// Horizontal add: sum of all vector elements, computed by the Vec8i overload.
+// Overflow will wrap around
+static inline uint32_t horizontal_add (Vec8ui const & a) {
+    Vec8i const as_signed = (Vec8i)a;
+    return horizontal_add(as_signed);
+}
+
+// Horizontal add extended: sum of all vector elements.
+// Each half is summed by the Vec4ui overload (elements zero extended there),
+// then the two partial sums are added.
+static inline uint64_t horizontal_add_x (Vec8ui const & a) {
+    return horizontal_add_x(a.get_high())
+         + horizontal_add_x(a.get_low());
+}
+
+// function add_saturated: unsigned element-wise add with saturation, per half
+static inline Vec8ui add_saturated(Vec8ui const & a, Vec8ui const & b) {
+    Vec4ui const lo = add_saturated(a.get_low(),  b.get_low());
+    Vec4ui const hi = add_saturated(a.get_high(), b.get_high());
+    return Vec8ui(lo, hi);
+}
+
+// function sub_saturated: unsigned element-wise subtract with saturation, per half
+static inline Vec8ui sub_saturated(Vec8ui const & a, Vec8ui const & b) {
+    Vec4ui const lo = sub_saturated(a.get_low(),  b.get_low());
+    Vec4ui const hi = sub_saturated(a.get_high(), b.get_high());
+    return Vec8ui(lo, hi);
+}
+
+// function max: element-wise larger of a and b, per half
+static inline Vec8ui max(Vec8ui const & a, Vec8ui const & b) {
+    Vec4ui const lo = max(a.get_low(),  b.get_low());
+    Vec4ui const hi = max(a.get_high(), b.get_high());
+    return Vec8ui(lo, hi);
+}
+
+// function min: element-wise smaller of a and b, per half
+static inline Vec8ui min(Vec8ui const & a, Vec8ui const & b) {
+    Vec4ui const lo = min(a.get_low(),  b.get_low());
+    Vec4ui const hi = min(a.get_high(), b.get_high());
+    return Vec8ui(lo, hi);
+}
+
+
+
+/*****************************************************************************
+*
+*          Vector of 4 64-bit signed integers
+*
+*****************************************************************************/
+
+class Vec4q : public Vec256b {
+    // Four 64-bit signed integers kept in the two 128-bit members y0 and y1:
+    // y0 holds elements 0-1, y1 holds elements 2-3.
+public:
+    // Default constructor:
+    Vec4q() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec4q(int64_t i) {
+        y0 = y1 = Vec2q(i);
+    }
+    // Constructor to build from all elements:
+    Vec4q(int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+        y0 = Vec2q(i0,i1);
+        y1 = Vec2q(i2,i3);
+    }
+    // Constructor to build from two Vec2q (a0 = elements 0-1, a1 = elements 2-3):
+    Vec4q(Vec2q const & a0, Vec2q const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256ie
+    Vec4q(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec256ie
+    Vec4q & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    }
+    // Member function to load from array (unaligned)
+    // Reads two consecutive 128-bit blocks starting at p.
+    Vec4q & load(void const * p) {
+        y0 = _mm_loadu_si128((__m128i const*)p);
+        y1 = _mm_loadu_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    // Implemented as two aligned 128-bit loads from p and p + 16 bytes.
+    Vec4q & load_a(void const * p) {
+        y0 = _mm_load_si128((__m128i const*)p);
+        y1 = _mm_load_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    // n <= 0 gives an all-zero vector; n >= 4 is a full load.
+    Vec4q & load_partial(int n, void const * p) {
+        if (n <= 0) {
+            *this = 0;
+        }
+        else if (n <= 2) {
+            // only the low half is touched; the high half becomes zero
+            *this = Vec4q(Vec2q().load_partial(n, p), 0);
+        }
+        else if (n < 4) {
+            // full low half, remaining n-2 elements go into the high half
+            *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n-2, (int64_t*)p+2));
+        }
+        else {
+            load(p);
+        }
+        return *this;
+    }
+    // Partial store. Store n elements
+    // n <= 0 stores nothing; n >= 4 stores the whole vector.
+    void store_partial(int n, void * p) const {
+        if (n <= 0) {
+            return;
+        }
+        else if (n <= 2) {
+            get_low().store_partial(n, p);
+        }
+        else if (n < 4) {
+            get_low().store(p);
+            get_high().store_partial(n-2, (int64_t*)p+2);
+        }
+        else {
+            store(p);
+        }
+    }
+    // cut off vector to n elements. The last 4-n elements are set to zero
+    // Done on the byte view of the vector: n elements = n * 8 bytes.
+    Vec4q & cutoff(int n) {
+        *this = Vec32c(*this).cutoff(n * 8);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    // Indexes 0-1 go to the low half; any other index is forwarded to the
+    // high half as index-2.
+    Vec4q const & insert(uint32_t index, int64_t value) {
+        if (index < 2) {
+            y0 = Vec2q(y0).insert(index, value);
+        }
+        else {
+            y1 = Vec2q(y1).insert(index-2, value);
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    // Indexes 0-1 read the low half; any other index is forwarded to the
+    // high half as index-2.
+    int64_t extract(uint32_t index) const {
+        if (index < 2) {
+            return Vec2q(y0).extract(index);
+        }
+        else {
+            return Vec2q(y1).extract(index-2);
+        }
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int64_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec2q:
+    Vec2q get_low() const {
+        return y0;
+    }
+    Vec2q get_high() const {
+        return y1;
+    }
+    // Number of elements in the vector
+    static int size () {
+        return 4;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec4qb: Vector of 4 Booleans for use with Vec4q and Vec4uq
+*
+*****************************************************************************/
+
+class Vec4qb : public Vec4q {
+    // Boolean vector for Vec4q/Vec4uq: each 64-bit element is stored as
+    // 0 (false) or -1 (true), produced here by negating the bool value.
+public:
+    // Default constructor:
+    Vec4qb() {
+    }
+    // Constructor to build from all elements:
+    Vec4qb(bool x0, bool x1, bool x2, bool x3) :
+        Vec4q(-int64_t(x0), -int64_t(x1), -int64_t(x2), -int64_t(x3)) {
+    }
+    // Constructor to convert from type Vec256ie
+    Vec4qb(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec256ie
+    Vec4qb & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    }
+    // Constructor to broadcast scalar value:
+    Vec4qb(bool b) : Vec4q(-int64_t(b)) {
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec4qb & operator = (bool b) {
+        *this = Vec4qb(b);
+        return *this;
+    }
+private: // Prevent constructing from int, etc.
+    // Declared but never defined: using an int here fails to compile or link.
+    Vec4qb(int b);
+    Vec4qb & operator = (int x);
+public:
+    // Member functions to split into two Vec2qb:
+    Vec2qb get_low() const {
+        return y0;
+    }
+    Vec2qb get_high() const {
+        return y1;
+    }
+    // Member function to change a single element in vector
+    // The bool is stored as 0 or -1 through Vec4q::insert.
+    Vec4qb & insert (int index, bool a) {
+        Vec4q::insert(index, -(int64_t)a);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Note: This function is inefficient. Use store function if extracting more than one element
+    // Any nonzero element reads back as true.
+    bool extract(uint32_t index) const {
+        return Vec4q::extract(index) != 0;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Define operators for Vec4qb
+*
+*****************************************************************************/
+
+// vector operator & : bitwise and, computed on the Vec256b view
+static inline Vec4qb operator & (Vec4qb const & a, Vec4qb const & b) {
+    Vec256b const bits = Vec256b(a) & Vec256b(b);
+    return Vec4qb(bits);
+}
+// operator && : alias of the bitwise & above (both operands are always evaluated)
+static inline Vec4qb operator && (Vec4qb const & a, Vec4qb const & b) {
+    Vec4qb const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and in place
+static inline Vec4qb & operator &= (Vec4qb & a, Vec4qb const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or, computed on the Vec256b view
+static inline Vec4qb operator | (Vec4qb const & a, Vec4qb const & b) {
+    Vec256b const bits = Vec256b(a) | Vec256b(b);
+    return Vec4qb(bits);
+}
+// operator || : alias of the bitwise | above (both operands are always evaluated)
+static inline Vec4qb operator || (Vec4qb const & a, Vec4qb const & b) {
+    Vec4qb const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or in place
+static inline Vec4qb & operator |= (Vec4qb & a, Vec4qb const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor, computed on the Vec256b view
+static inline Vec4qb operator ^ (Vec4qb const & a, Vec4qb const & b) {
+    Vec256b const bits = Vec256b(a) ^ Vec256b(b);
+    return Vec4qb(bits);
+}
+// vector operator ^= : bitwise xor in place
+static inline Vec4qb & operator ^= (Vec4qb & a, Vec4qb const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise complement, computed on the Vec256b view
+static inline Vec4qb operator ~ (Vec4qb const & a) {
+    Vec256b const bits = ~ Vec256b(a);
+    return Vec4qb(bits);
+}
+
+// vector operator ! : element not, implemented as the bitwise complement
+static inline Vec4qb operator ! (Vec4qb const & a) {
+    Vec4qb const inverted = ~ a;
+    return inverted;
+}
+
+// vector function andnot: forwards to the Vec256b andnot
+static inline Vec4qb andnot (Vec4qb const & a, Vec4qb const & b) {
+    Vec256b const bits = andnot(Vec256b(a), Vec256b(b));
+    return Vec4qb(bits);
+}
+
+
+/*****************************************************************************
+*
+*          Operators for Vec4q
+*
+*****************************************************************************/
+
+// vector operator + : element-wise add, one 128-bit half at a time
+static inline Vec4q operator + (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  + b.get_low();
+    Vec2q const hi = a.get_high() + b.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator += : add in place and return the updated vector
+static inline Vec4q & operator += (Vec4q & a, Vec4q const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1 to every element, return the value from before
+static inline Vec4q operator ++ (Vec4q & a, int) {
+    Vec4q const previous = a;
+    a += 1;
+    return previous;
+}
+
+// prefix operator ++ : add 1 to every element, return the updated vector
+static inline Vec4q & operator ++ (Vec4q & a) {
+    return a = a + 1;
+}
+
+// vector operator - : element-wise subtract, one 128-bit half at a time
+static inline Vec4q operator - (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  - b.get_low();
+    Vec2q const hi = a.get_high() - b.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator - : unary minus, negating each half
+static inline Vec4q operator - (Vec4q const & a) {
+    Vec2q const lo = -a.get_low();
+    Vec2q const hi = -a.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator -= : subtract in place and return the updated vector
+static inline Vec4q & operator -= (Vec4q & a, Vec4q const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1 from every element, return the value from before
+static inline Vec4q operator -- (Vec4q & a, int) {
+    Vec4q const previous = a;
+    a -= 1;
+    return previous;
+}
+
+// prefix operator -- : subtract 1 from every element, return the updated vector
+static inline Vec4q & operator -- (Vec4q & a) {
+    return a = a - 1;
+}
+
+// vector operator * : element-wise multiply, one 128-bit half at a time
+static inline Vec4q operator * (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  * b.get_low();
+    Vec2q const hi = a.get_high() * b.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator *= : multiply in place and return the updated vector
+static inline Vec4q & operator *= (Vec4q & a, Vec4q const & b) {
+    return a = a * b;
+}
+
+// vector operator << : shift left every element by b bits, per half
+static inline Vec4q operator << (Vec4q const & a, int32_t b) {
+    Vec2q const lo = a.get_low()  << b;
+    Vec2q const hi = a.get_high() << b;
+    return Vec4q(lo, hi);
+}
+
+// vector operator <<= : shift left in place
+static inline Vec4q & operator <<= (Vec4q & a, int32_t b) {
+    return a = a << b;
+}
+
+// vector operator >> : shift right arithmetic every element by b bits, per half
+static inline Vec4q operator >> (Vec4q const & a, int32_t b) {
+    Vec2q const lo = a.get_low()  >> b;
+    Vec2q const hi = a.get_high() >> b;
+    return Vec4q(lo, hi);
+}
+
+// vector operator >>= : shift right arithmetic in place
+static inline Vec4q & operator >>= (Vec4q & a, int32_t b) {
+    return a = a >> b;
+}
+
+// vector operator == : per-element a == b, compared one half at a time
+static inline Vec4qb operator == (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  == b.get_low();
+    Vec2q const hi = a.get_high() == b.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator != : per-element a != b, compared one half at a time
+static inline Vec4qb operator != (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  != b.get_low();
+    Vec2q const hi = a.get_high() != b.get_high();
+    return Vec4q(lo, hi);
+}
+  
+// vector operator < : per-element signed a < b, compared one half at a time
+static inline Vec4qb operator < (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  < b.get_low();
+    Vec2q const hi = a.get_high() < b.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator > : per-element signed a > b, expressed as b < a
+static inline Vec4qb operator > (Vec4q const & a, Vec4q const & b) {
+    Vec4qb const mask = b < a;
+    return mask;
+}
+
+// vector operator >= : per-element signed a >= b, compared one half at a time
+static inline Vec4qb operator >= (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  >= b.get_low();
+    Vec2q const hi = a.get_high() >= b.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator <= : per-element signed a <= b, expressed as b >= a
+static inline Vec4qb operator <= (Vec4q const & a, Vec4q const & b) {
+    Vec4qb const mask = b >= a;
+    return mask;
+}
+
+// vector operator & : bitwise and of both halves
+static inline Vec4q operator & (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  & b.get_low();
+    Vec2q const hi = a.get_high() & b.get_high();
+    return Vec4q(lo, hi);
+}
+// operator && : alias of the bitwise & above (both operands are always evaluated)
+static inline Vec4q operator && (Vec4q const & a, Vec4q const & b) {
+    Vec4q const both = a & b;
+    return both;
+}
+// vector operator &= : bitwise and in place
+static inline Vec4q & operator &= (Vec4q & a, Vec4q const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or of both halves
+static inline Vec4q operator | (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  | b.get_low();
+    Vec2q const hi = a.get_high() | b.get_high();
+    return Vec4q(lo, hi);
+}
+// operator || : alias of the bitwise | above (both operands are always evaluated)
+static inline Vec4q operator || (Vec4q const & a, Vec4q const & b) {
+    Vec4q const either = a | b;
+    return either;
+}
+// vector operator |= : bitwise or in place
+static inline Vec4q & operator |= (Vec4q & a, Vec4q const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor of both halves
+static inline Vec4q operator ^ (Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = a.get_low()  ^ b.get_low();
+    Vec2q const hi = a.get_high() ^ b.get_high();
+    return Vec4q(lo, hi);
+}
+// vector operator ^= : bitwise xor in place
+static inline Vec4q & operator ^= (Vec4q & a, Vec4q const & b) {
+    return a = a ^ b;
+}
+
+
+// vector operator ~ : bitwise complement of both halves
+static inline Vec4q operator ~ (Vec4q const & a) {
+    Vec2q const lo = ~a.get_low();
+    Vec2q const hi = ~a.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator ! : logical not (true for elements == 0), applied per half
+static inline Vec4qb operator ! (Vec4q const & a) {
+    Vec2q const lo = !a.get_low();
+    Vec2q const hi = !a.get_high();
+    return Vec4q(lo, hi);
+}
+
+// Functions for this class
+
+// Element-wise choice between a and b, equivalent to
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Every element of s must be 0 (false) or -1 (true); other values are not allowed.
+// The work is done by selectb.
+static inline Vec4q select (Vec4qb const & s, Vec4q const & a, Vec4q const & b) {
+    Vec4q const chosen = selectb(s, a, b);
+    return chosen;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+// b is masked with f first, then the masked value is added to a.
+static inline Vec4q if_add (Vec4qb const & f, Vec4q const & a, Vec4q const & b) {
+    Vec4q const addend = Vec4q(f) & b;
+    return a + addend;
+}
+
+// Horizontal add: sum of all vector elements.
+// The halves are added element-wise first, then the Vec2q overload sums the pair.
+// Overflow will wrap around
+static inline int64_t horizontal_add (Vec4q const & a) {
+    Vec2q const pair_sums = a.get_low() + a.get_high();
+    return horizontal_add(pair_sums);
+}
+
+// function max: element-wise larger of a and b, per half
+static inline Vec4q max(Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = max(a.get_low(),  b.get_low());
+    Vec2q const hi = max(a.get_high(), b.get_high());
+    return Vec4q(lo, hi);
+}
+
+// function min: element-wise smaller of a and b, per half
+static inline Vec4q min(Vec4q const & a, Vec4q const & b) {
+    Vec2q const lo = min(a.get_low(),  b.get_low());
+    Vec2q const hi = min(a.get_high(), b.get_high());
+    return Vec4q(lo, hi);
+}
+
+// function abs: element-wise absolute value, per half
+static inline Vec4q abs(Vec4q const & a) {
+    Vec2q const lo = abs(a.get_low());
+    Vec2q const hi = abs(a.get_high());
+    return Vec4q(lo, hi);
+}
+
+// function abs_saturated: absolute value with saturation on overflow, per half
+static inline Vec4q abs_saturated(Vec4q const & a) {
+    Vec2q const lo = abs_saturated(a.get_low());
+    Vec2q const hi = abs_saturated(a.get_high());
+    return Vec4q(lo, hi);
+}
+
+// function rotate_left: rotate every element left by b bits, per half
+// Use negative count to rotate right
+static inline Vec4q rotate_left(Vec4q const & a, int b) {
+    Vec2q const lo = rotate_left(a.get_low(),  b);
+    Vec2q const hi = rotate_left(a.get_high(), b);
+    return Vec4q(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 4 64-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec4uq : public Vec4q {
+    // Four 64-bit unsigned integers. Storage (y0 = elements 0-1, y1 = elements
+    // 2-3) is inherited from Vec4q; this class swaps in unsigned types.
+public:
+    // Default constructor:
+    Vec4uq() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec4uq(uint64_t i) {
+        y1 = y0 = Vec2q(i);
+    }
+    // Constructor to build from all elements:
+    Vec4uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3) {
+        y0 = Vec2q(i0, i1);
+        y1 = Vec2q(i2, i3);
+    }
+    // Constructor to build from two Vec2uq (a0 = elements 0-1, a1 = elements 2-3):
+    Vec4uq(Vec2uq const & a0, Vec2uq const & a1) {
+        y0 = a0;  y1 = a1;
+    }
+    // Constructor to convert from type Vec256ie
+    Vec4uq(Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec256ie
+    Vec4uq & operator = (Vec256ie const & x) {
+        y0 = x.get_low();  y1 = x.get_high();
+        return *this;
+    }
+    // Member function to load from array (unaligned)
+    // Reads two consecutive 128-bit blocks starting at p.
+    Vec4uq & load(void const * p) {
+        y0 = _mm_loadu_si128((__m128i const*)p);
+        y1 = _mm_loadu_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to load from array, aligned by 32
+    // Implemented as two aligned 128-bit loads from p and p + 16 bytes.
+    Vec4uq & load_a(void const * p) {
+        y0 = _mm_load_si128((__m128i const*)p);
+        y1 = _mm_load_si128((__m128i const*)p + 1);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec4uq const & insert(uint32_t index, uint64_t value) {
+        Vec4q::insert(index, value);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // The signed value from Vec4q::extract is converted to uint64_t on return.
+    uint64_t extract(uint32_t index) const {
+        return Vec4q::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint64_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec2uq:
+    Vec2uq get_low() const {
+        return y0;
+    }
+    Vec2uq get_high() const {
+        return y1;
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : element-wise add, carried out by the Vec4q operator
+static inline Vec4uq operator + (Vec4uq const & a, Vec4uq const & b) {
+    Vec4q const sum = Vec4q(a) + Vec4q(b);
+    return Vec4uq(sum);
+}
+
+// vector operator - : element-wise subtract, carried out by the Vec4q operator
+static inline Vec4uq operator - (Vec4uq const & a, Vec4uq const & b) {
+    Vec4q const difference = Vec4q(a) - Vec4q(b);
+    return Vec4uq(difference);
+}
+
+// vector operator * : element-wise multiply, carried out by the Vec4q operator
+static inline Vec4uq operator * (Vec4uq const & a, Vec4uq const & b) {
+    Vec4q const product = Vec4q(a) * Vec4q(b);
+    return Vec4uq(product);
+}
+
+// vector operator >> : shift right logical, applied to each 128-bit half
+static inline Vec4uq operator >> (Vec4uq const & a, uint32_t b) {
+    Vec2uq const lo = a.get_low()  >> b;
+    Vec2uq const hi = a.get_high() >> b;
+    return Vec4uq(lo, hi);
+}
+
+// vector operator >> : shift right logical; signed count forwarded to the
+// uint32_t overload
+static inline Vec4uq operator >> (Vec4uq const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a >> count;
+}
+
+// vector operator >>= : shift right logical
+static inline Vec4uq & operator >>= (Vec4uq & a, uint32_t b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator >>= : shift right logical
+// Signed-count overload, matching the uint32_t/int32_t pair that Vec8ui has.
+// Without it, "x >>= 1" on a Vec4uq has no best match: the uint32_t overload
+// above needs an int conversion, while the Vec4q overload (an arithmetic
+// shift) needs a derived-to-base conversion of x.
+static inline Vec4uq & operator >>= (Vec4uq & a, int32_t b) {
+    a = a >> b;
+    return a;
+}
+
+// vector operator << : shift left all elements, carried out by the Vec4q operator
+static inline Vec4uq operator << (Vec4uq const & a, uint32_t b) {
+    int32_t const count = (int32_t)b;
+    Vec4q const shifted = (Vec4q)a << count;
+    return Vec4uq(shifted);
+}
+
+// vector operator << : shift left all elements (signed count)
+static inline Vec4uq operator << (Vec4uq const & a, int32_t b) {
+    Vec4q const shifted = (Vec4q)a << b;
+    return Vec4uq(shifted);
+}
+
+// vector operator > : per-element unsigned a > b, evaluated on each half
+static inline Vec4qb operator > (Vec4uq const & a, Vec4uq const & b) {
+    Vec2q const lo = a.get_low()  > b.get_low();
+    Vec2q const hi = a.get_high() > b.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator < : per-element unsigned a < b, expressed as b > a
+static inline Vec4qb operator < (Vec4uq const & a, Vec4uq const & b) {
+    Vec4qb const mask = b > a;
+    return mask;
+}
+
+// vector operator >= : per-element unsigned a >= b, evaluated on each half
+static inline Vec4qb operator >= (Vec4uq const & a, Vec4uq const & b) {
+    Vec2q const lo = a.get_low()  >= b.get_low();
+    Vec2q const hi = a.get_high() >= b.get_high();
+    return Vec4q(lo, hi);
+}
+
+// vector operator <= : per-element unsigned a <= b, expressed as b >= a
+static inline Vec4qb operator <= (Vec4uq const & a, Vec4uq const & b) {
+    Vec4qb const mask = b >= a;
+    return mask;
+}
+
+// vector operator & : bitwise and of both halves
+static inline Vec4uq operator & (Vec4uq const & a, Vec4uq const & b) {
+    Vec2uq const lo = a.get_low()  & b.get_low();
+    Vec2uq const hi = a.get_high() & b.get_high();
+    return Vec4uq(lo, hi);
+}
+// operator && : alias of the bitwise & above (both operands are always evaluated)
+static inline Vec4uq operator && (Vec4uq const & a, Vec4uq const & b) {
+    Vec4uq const both = a & b;
+    return both;
+}
+
+// vector operator | : bitwise or of both halves
+// The result is built as Vec4uq directly, like the sibling & and ^ operators,
+// rather than as a Vec4q that is then converted to the return type.
+static inline Vec4uq operator | (Vec4uq const & a, Vec4uq const & b) {
+    return Vec4uq(a.get_low() | b.get_low(), a.get_high() | b.get_high());
+}
+// operator || : alias of the bitwise | above (both operands are always evaluated)
+static inline Vec4uq operator || (Vec4uq const & a, Vec4uq const & b) {
+    Vec4uq const either = a | b;
+    return either;
+}
+
+// vector operator ^ : bitwise xor of both halves
+static inline Vec4uq operator ^ (Vec4uq const & a, Vec4uq const & b) {
+    Vec2uq const lo = a.get_low()  ^ b.get_low();
+    Vec2uq const hi = a.get_high() ^ b.get_high();
+    return Vec4uq(lo, hi);
+}
+
+// vector operator ~ : bitwise complement of both halves
+static inline Vec4uq operator ~ (Vec4uq const & a) {
+    Vec2uq const lo = ~a.get_low();
+    Vec2uq const hi = ~a.get_high();
+    return Vec4uq(lo, hi);
+}
+
+// Functions for this class
+
+// Element-wise choice between a and b, equivalent to
+// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
+// Every element of s must be 0 (false) or -1 (true); other values are not allowed.
+// The work is done by selectb.
+static inline Vec4uq select (Vec4qb const & s, Vec4uq const & a, Vec4uq const & b) {
+    Vec4uq const chosen = selectb(s, a, b);
+    return chosen;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+// b is masked with f first, then the masked value is added to a.
+static inline Vec4uq if_add (Vec4qb const & f, Vec4uq const & a, Vec4uq const & b) {
+    Vec4uq const addend = Vec4uq(f) & b;
+    return a + addend;
+}
+
+// Horizontal add: sum of all vector elements, computed by the Vec4q overload.
+// Overflow will wrap around
+static inline uint64_t horizontal_add (Vec4uq const & a) {
+    Vec4q const as_signed = (Vec4q)a;
+    return horizontal_add(as_signed);
+}
+
+// function max: element-wise larger of a and b (unsigned), per half
+static inline Vec4uq max(Vec4uq const & a, Vec4uq const & b) {
+    Vec2uq const lo = max(a.get_low(),  b.get_low());
+    Vec2uq const hi = max(a.get_high(), b.get_high());
+    return Vec4uq(lo, hi);
+}
+
+// function min: element-wise smaller of a and b (unsigned), per half
+static inline Vec4uq min(Vec4uq const & a, Vec4uq const & b) {
+    Vec2uq const lo = min(a.get_low(),  b.get_low());
+    Vec2uq const hi = min(a.get_high(), b.get_high());
+    return Vec4uq(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Vector permute functions
+*
+******************************************************************************
+*
+* These permute functions can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to select.
+* An index of -1 will generate zero. An index of -256 means don't care.
+*
+* Example:
+
+* Vec8i a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+* Vec8i b;
+* b = permute8i<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+
+// Shuffle vector of 4 64-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+// Each result half is a two-source blend of the low and high halves of a.
+template <int i0, int i1, int i2, int i3 >
+static inline Vec4q permute4q(Vec4q const & a) {
+    Vec2q const lo = blend2q<i0,i1> (a.get_low(), a.get_high());
+    Vec2q const hi = blend2q<i2,i3> (a.get_low(), a.get_high());
+    return Vec4q(lo, hi);
+}
+
+// Unsigned variant: shuffles through permute4q with the same indexes.
+template <int i0, int i1, int i2, int i3>
+static inline Vec4uq permute4uq(Vec4uq const & a) {
+    Vec4q const shuffled = permute4q<i0,i1,i2,i3> (a);
+    return Vec4uq(shuffled);
+}
+
+// Shuffle vector of 8 32-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+// Each result half is a two-source blend of the low and high halves of a.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7 >
+static inline Vec8i permute8i(Vec8i const & a) {
+    Vec4i const lo = blend4i<i0,i1,i2,i3> (a.get_low(), a.get_high());
+    Vec4i const hi = blend4i<i4,i5,i6,i7> (a.get_low(), a.get_high());
+    return Vec8i(lo, hi);
+}
+
+// Unsigned variant: shuffles through permute8i with the same indexes.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7 >
+static inline Vec8ui permute8ui(Vec8ui const & a) {
+    Vec8i const shuffled = permute8i<i0,i1,i2,i3,i4,i5,i6,i7> (a);
+    return Vec8ui(shuffled);
+}
+
+// Shuffle vector of 16 16-bit integers.
+// Index -1 gives 0, index -256 means don't care.
+// Each result half is a two-source blend of the low and high halves of a.
+template <int i0, int i1, int i2,  int i3,  int i4,  int i5,  int i6,  int i7,
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
+static inline Vec16s permute16s(Vec16s const & a) {
+    Vec8s const lo = blend8s<i0,i1,i2 ,i3 ,i4 ,i5 ,i6 ,i7 > (a.get_low(), a.get_high());
+    Vec8s const hi = blend8s<i8,i9,i10,i11,i12,i13,i14,i15> (a.get_low(), a.get_high());
+    return Vec16s(lo, hi);
+}
+
+// Unsigned variant: shuffles through permute16s with the same indexes.
+template <int i0, int i1, int i2,  int i3,  int i4,  int i5,  int i6,  int i7,
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
+static inline Vec16us permute16us(Vec16us const & a) {
+    Vec16s const shuffled =
+        permute16s<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a);
+    return Vec16us(shuffled);
+}
+
+// Shuffle vector of 32 8-bit integers.
+// Each result half is a two-source blend of the low and high halves of a.
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15,
+          int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
+          int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
+static inline Vec32c permute32c(Vec32c const & a) {
+    Vec16c const lo =
+        blend16c<i0, i1, i2 ,i3 ,i4 ,i5 ,i6 ,i7, i8, i9, i10,i11,i12,i13,i14,i15> (a.get_low(), a.get_high());
+    Vec16c const hi =
+        blend16c<i16,i17,i18,i19,i20,i21,i22,i23,i24,i25,i26,i27,i28,i29,i30,i31> (a.get_low(), a.get_high());
+    return Vec32c(lo, hi);
+}
+
+// Unsigned variant: shuffles through permute32c with the same indexes.
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15,
+          int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
+          int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
+static inline Vec32uc permute32uc(Vec32uc const & a) {
+    Vec32c const shuffled =
+        permute32c<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15,
+                   i16,i17,i18,i19,i20,i21,i22,i23,i24,i25,i26,i27,i28,i29,i30,i31> (a);
+    return Vec32uc(shuffled);
+}
+
+
+/*****************************************************************************
+*
+*          Vector blend functions
+*
+******************************************************************************
+*
+* These blend functions can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where higher indexes indicate an element from the second source
+* vector. For example, if each vector has 8 elements, then indexes 0 - 7
+* will select an element from the first vector and indexes 8 - 15 will select 
+* an element from the second vector. A negative index will generate zero.
+*
+* Example:
+* Vec8i a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8i b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+* Vec8i c;
+* c = blend8i<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// helper function used below
+// Picks one 128-bit half by number: 0 = a low, 1 = a high, 2 = b low,
+// 3 = b high. Any other n gives an all-zero vector.
+template <int n>
+static inline Vec2q select4(Vec4q const & a, Vec4q const & b) {
+    if (n == 0) {
+        return a.get_low();
+    }
+    if (n == 1) {
+        return a.get_high();
+    }
+    if (n == 2) {
+        return b.get_low();
+    }
+    if (n == 3) {
+        return b.get_high();
+    }
+    return _mm_setzero_si128();
+}
+
+// blend vectors Vec4q
+// Template indexes i0-i3 name the source of each result element: 0-3 are
+// elements of a, 4-7 are elements of b. Negative indexes are handed on,
+// still negative, to permute2q (presumably producing zero or don't-care
+// there, as described in the section header above).
+template <int i0, int i1, int i2, int i3>
+static inline Vec4q blend4q(Vec4q const & a, Vec4q const & b) {
+    // jN = number of the 128-bit half that holds element iN, in select4 order
+    // (0 = a low, 1 = a high, 2 = b low, 3 = b high); negative indexes are kept.
+    const int j0 = i0 >= 0 ? i0/2 : i0;
+    const int j1 = i1 >= 0 ? i1/2 : i1;
+    const int j2 = i2 >= 0 ? i2/2 : i2;
+    const int j3 = i3 >= 0 ? i3/2 : i3;    
+    Vec2q x0, x1;
+
+    // "& -7" clears bits 1 and 2 of an index: a non-negative index is reduced
+    // to its position (0 or 1) inside the half, a negative index stays negative.
+
+    // Low half of the result (elements 0 and 1)
+    if (j0 == j1 || i0 < 0 || i1 < 0) {  // both from same
+        // one source half is enough; take it from whichever index is non-negative
+        const int k0 = j0 >= 0 ? j0 : j1;
+        x0 = permute2q<i0 & -7, i1 & -7> (select4<k0> (a,b));
+    }
+    else {
+        // two different source halves: "| 2" makes the second index refer to
+        // the second argument of blend2q
+        x0 = blend2q<i0 & -7, (i1 & -7) | 2> (select4<j0>(a,b), select4<j1>(a,b));
+    }
+    // High half of the result (elements 2 and 3), same scheme
+    if (j2 == j3 || i2 < 0 || i3 < 0) {  // both from same
+        const int k1 = j2 >= 0 ? j2 : j3;
+        x1 = permute2q<i2 & -7, i3 & -7> (select4<k1> (a,b));
+    }
+    else {
+        x1 = blend2q<i2 & -7, (i3 & -7) | 2> (select4<j2>(a,b), select4<j3>(a,b));
+    }
+    return Vec4q(x0,x1);
+}
+
+// Unsigned variant: blends through blend4q with the same indexes.
+template <int i0, int i1, int i2, int i3> 
+static inline Vec4uq blend4uq(Vec4uq const & a, Vec4uq const & b) {
+    Vec4q const blended = blend4q<i0,i1,i2,i3> (a,b);
+    return Vec4uq(blended);
+}
+
+// helper function used below
+// Picks one 128-bit half by number: 0 = a low, 1 = a high, 2 = b low,
+// 3 = b high. Any other n gives an all-zero vector.
+template <int n>
+static inline Vec4i select4(Vec8i const & a, Vec8i const & b) {
+    if (n == 0) {
+        return a.get_low();
+    }
+    if (n == 1) {
+        return a.get_high();
+    }
+    if (n == 2) {
+        return b.get_low();
+    }
+    if (n == 3) {
+        return b.get_high();
+    }
+    return _mm_setzero_si128();
+}
+
+// blend vectors Vec8i: result element k is chosen by the compile-time index ik; a non-negative index i selects 128-bit source i/4 (see select4) and element i&3 inside it; each 128-bit half of the result is built separately with the cheapest of: zero, one-source permute, two-source blend, or two blends combined
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8i blend8i(Vec8i const & a, Vec8i const & b) {
+    const int j0 = i0 >= 0 ? i0/4 : i0;    // j0-j7: number of the 128-bit source of each index; a negative index stays negative
+    const int j1 = i1 >= 0 ? i1/4 : i1;
+    const int j2 = i2 >= 0 ? i2/4 : i2;
+    const int j3 = i3 >= 0 ? i3/4 : i3;
+    const int j4 = i4 >= 0 ? i4/4 : i4;
+    const int j5 = i5 >= 0 ? i5/4 : i5;
+    const int j6 = i6 >= 0 ? i6/4 : i6;
+    const int j7 = i7 >= 0 ? i7/4 : i7;
+    Vec4i x0, x1;                          // the two 128-bit halves of the result, combined by Vec8i(x0,x1) below
+
+    const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;    // r0: first non-negative source among elements 0-3 (negative if there is none)
+    const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;    // r1: same for elements 4-7
+    const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;    // s0: first source among elements 1-2 that differs from r0, else j3
+    const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;    // s1: same for elements 5-6, else j7
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    if (r0 < 0) {                          // i0 - i3 are all negative: nothing is taken from a or b
+        x0 = _mm_setzero_si128();
+    }
+    else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) { 
+        // i0 - i3 all from same source (bits 2-3 of every non-negative index equal r0)
+        x0 = permute4i<i0 & -13, i1 & -13, i2 & -13, i3 & -13> (select4<r0> (a,b));  // & -13 clears bits 2 and 3: keeps the element number within the source and the sign
+    }
+    else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) { 
+        // i0 - i3 all from two sources (r0 and s0)
+        const int k0 =  i0 >= 0 ? i0 & 3 : i0;                              // k0-k3: element within the source, plus 4 when it comes from s0
+        const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
+        const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
+        const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
+        x0 = blend4i<k0,k1,k2,k3> (select4<r0>(a,b), select4<s0>(a,b));
+    }
+    else {
+        // i0 - i3 from three or four different sources: elements 0-1 and elements 2-3 are blended separately, then merged (-0x100 marks positions the outer blend does not use; presumably a do-not-care index - confirm in blend4i)
+        x0 = blend4i<0,1,6,7> (
+             blend4i<i0 & -13, (i1 & -13) | 4, -0x100, -0x100> (select4<j0>(a,b), select4<j1>(a,b)),
+             blend4i<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4<j2>(a,b), select4<j3>(a,b)));
+    }
+
+    if (r1 < 0) {                          // i4 - i7 are all negative: nothing is taken from a or b
+        x1 = _mm_setzero_si128();
+    }
+    else if (((m1 ^ uint32_t(r1)*0x44440000u) & 0xCCCC0000 & mz) == 0) { 
+        // i4 - i7 all from same source (same test as above on the upper four nibbles of m1)
+        x1 = permute4i<i4 & -13, i5 & -13, i6 & -13, i7 & -13> (select4<r1> (a,b));
+    }
+    else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) { 
+        // i4 - i7 all from two sources (r1 and s1)
+        const int k4 =  i4 >= 0 ? i4 & 3 : i4;                              // k4-k7: element within the source, plus 4 when it comes from s1
+        const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
+        const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
+        const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
+        x1 = blend4i<k4,k5,k6,k7> (select4<r1>(a,b), select4<s1>(a,b));
+    }
+    else {
+        // i4 - i7 from three or four different sources: elements 4-5 and elements 6-7 are blended separately, then merged
+        x1 = blend4i<0,1,6,7> (
+             blend4i<i4 & -13, (i5 & -13) | 4, -0x100, -0x100> (select4<j4>(a,b), select4<j5>(a,b)),
+             blend4i<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4<j6>(a,b), select4<j7>(a,b)));
+    }
+
+    return Vec8i(x0,x1);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>   // unsigned variant: same indexes as blend8i
+static inline Vec8ui blend8ui(Vec8ui const & a, Vec8ui const & b) {
+    return Vec8ui( blend8i<i0,i1,i2,i3,i4,i5,i6,i7> (a,b));   // forwards to blend8i and converts the result to Vec8ui
+}
+
+// helper function used below: returns one 128-bit half of a or b, chosen at compile time by n
+template <int n>
+static inline Vec8s select4(Vec16s const & a, Vec16s const & b) {
+    switch (n) {
+    case 0:                         // n = 0: low half of a
+        return a.get_low();
+    case 1:                         // n = 1: high half of a
+        return a.get_high();
+    case 2:                         // n = 2: low half of b
+        return b.get_low();
+    case 3:                         // n = 3: high half of b
+        return b.get_high();
+    }
+    return _mm_setzero_si128();     // any other n: all bits zero
+}
+
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16s blend16s(Vec16s const & a, Vec16s const & b) {   // blend vectors Vec16s: result element k is chosen by the compile-time index ik; a non-negative index i selects 128-bit source i/8 (see select4) and element i&7 inside it
+
+    const int j0  = i0  >= 0 ? i0 /8 : i0;    // j0-j15: number of the 128-bit source of each index; a negative index stays negative
+    const int j1  = i1  >= 0 ? i1 /8 : i1;
+    const int j2  = i2  >= 0 ? i2 /8 : i2;
+    const int j3  = i3  >= 0 ? i3 /8 : i3;
+    const int j4  = i4  >= 0 ? i4 /8 : i4;
+    const int j5  = i5  >= 0 ? i5 /8 : i5;
+    const int j6  = i6  >= 0 ? i6 /8 : i6;
+    const int j7  = i7  >= 0 ? i7 /8 : i7;
+    const int j8  = i8  >= 0 ? i8 /8 : i8;
+    const int j9  = i9  >= 0 ? i9 /8 : i9;
+    const int j10 = i10 >= 0 ? i10/8 : i10;
+    const int j11 = i11 >= 0 ? i11/8 : i11;
+    const int j12 = i12 >= 0 ? i12/8 : i12;
+    const int j13 = i13 >= 0 ? i13/8 : i13;
+    const int j14 = i14 >= 0 ? i14/8 : i14;
+    const int j15 = i15 >= 0 ? i15/8 : i15;
+
+    Vec8s x0, x1;                             // the two 128-bit halves of the result, combined by Vec16s(x0,x1) below
+
+    const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2  >= 0 ? j2  : j3  >= 0 ? j3  : j4  >= 0 ? j4  : j5  >= 0 ? j5  : j6  >= 0 ? j6  : j7;    // r0: first non-negative source among elements 0-7 (negative if there is none)
+    const int r1 = j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;    // r1: same for elements 8-15
+    const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2  : (j3 >= 0 && j3 != r0) ? j3 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7;    // s0: first source among elements 1-6 that differs from r0, else j7
+    const int s1 = (j9 >= 0 && j9 != r1) ? j9 : (j10>= 0 && j10!= r1) ? j10 : (j11>= 0 && j11!= r1) ? j11: (j12>= 0 && j12!= r1) ? j12: (j13>= 0 && j13!= r1) ? j13: (j14>= 0 && j14!= r1) ? j14: j15;    // s1: same for elements 9-14, else j15
+
+    if (r0 < 0) {                             // i0 - i7 are all negative: nothing is taken from a or b
+        x0 = _mm_setzero_si128();
+    }
+    else if (r0 == s0) {
+        // i0 - i7 all from same source
+        x0 = permute8s<i0&-25, i1&-25, i2&-25, i3&-25, i4&-25, i5&-25, i6&-25, i7&-25> (select4<r0> (a,b));  // & -25 clears bits 3 and 4: keeps the element number within the source and the sign
+    }
+    else if ((j2<0||j2==r0||j2==s0) && (j3<0||j3==r0||j3 == s0) && (j4<0||j4==r0||j4 == s0) && (j5<0||j5==r0||j5 == s0) && (j6<0||j6==r0||j6 == s0) && (j7<0||j7==r0||j7 == s0)) { 
+        // i0 - i7 all from two sources (r0 and s0)
+        const int k0 =  i0 >= 0 ? i0 & 7 : i0;                              // k0-k7: element within the source, plus 8 when it comes from s0
+        const int k1 = (i1 >= 0 ? i1 & 7 : i1) | (j1 == s0 ? 8 : 0);
+        const int k2 = (i2 >= 0 ? i2 & 7 : i2) | (j2 == s0 ? 8 : 0);
+        const int k3 = (i3 >= 0 ? i3 & 7 : i3) | (j3 == s0 ? 8 : 0);
+        const int k4 = (i4 >= 0 ? i4 & 7 : i4) | (j4 == s0 ? 8 : 0);
+        const int k5 = (i5 >= 0 ? i5 & 7 : i5) | (j5 == s0 ? 8 : 0);
+        const int k6 = (i6 >= 0 ? i6 & 7 : i6) | (j6 == s0 ? 8 : 0);
+        const int k7 = (i7 >= 0 ? i7 & 7 : i7) | (j7 == s0 ? 8 : 0);
+        x0 = blend8s<k0,k1,k2,k3,k4,k5,k6,k7> (select4<r0>(a,b), select4<s0>(a,b));
+    }
+    else {
+        // i0 - i7 from three or four different sources: one blend gathers the elements that come from a, one those that come from b, and a final blend merges the two
+        const int n0 = j0 >= 0 ? j0 /2*8 + 0 : j0;    // n0-n7: position k in the first intermediate (source in a) or 8+k in the second (source in b)
+        const int n1 = j1 >= 0 ? j1 /2*8 + 1 : j1;
+        const int n2 = j2 >= 0 ? j2 /2*8 + 2 : j2;
+        const int n3 = j3 >= 0 ? j3 /2*8 + 3 : j3;
+        const int n4 = j4 >= 0 ? j4 /2*8 + 4 : j4;
+        const int n5 = j5 >= 0 ? j5 /2*8 + 5 : j5;
+        const int n6 = j6 >= 0 ? j6 /2*8 + 6 : j6;
+        const int n7 = j7 >= 0 ? j7 /2*8 + 7 : j7;
+        x0 = blend8s<n0, n1, n2, n3, n4, n5, n6, n7> (
+             blend8s< j0   & 2 ? -256 : i0 &15,  j1   & 2 ? -256 : i1 &15,  j2   & 2 ? -256 : i2 &15,  j3   & 2 ? -256 : i3 &15,  j4   & 2 ? -256 : i4 &15,  j5   & 2 ? -256 : i5 &15,  j6   & 2 ? -256 : i6 &15,  j7   & 2 ? -256 : i7 &15> (a.get_low(),a.get_high()),
+             blend8s<(j0^2)& 6 ? -256 : i0 &15, (j1^2)& 6 ? -256 : i1 &15, (j2^2)& 6 ? -256 : i2 &15, (j3^2)& 6 ? -256 : i3 &15, (j4^2)& 6 ? -256 : i4 &15, (j5^2)& 6 ? -256 : i5 &15, (j6^2)& 6 ? -256 : i6 &15, (j7^2)& 6 ? -256 : i7 &15> (b.get_low(),b.get_high()));
+    }
+
+    if (r1 < 0) {                             // i8 - i15 are all negative: nothing is taken from a or b
+        x1 = _mm_setzero_si128();
+    }
+    else if (r1 == s1) {
+        // i8 - i15 all from same source
+        x1 = permute8s<i8&-25, i9&-25, i10&-25, i11&-25, i12&-25, i13&-25, i14&-25, i15&-25> (select4<r1> (a,b));
+    }
+    else if ((j10<0||j10==r1||j10==s1) && (j11<0||j11==r1||j11==s1) && (j12<0||j12==r1||j12==s1) && (j13<0||j13==r1||j13==s1) && (j14<0||j14==r1||j14==s1) && (j15<0||j15==r1||j15==s1)) { 
+        // i8 - i15 all from two sources (r1 and s1)
+        const int k8 =  i8 >= 0 ? i8 & 7 : i8;                              // k8-k15: element within the source, plus 8 when it comes from s1
+        const int k9 = (i9 >= 0 ? i9 & 7 : i9 ) | (j9 == s1 ? 8 : 0);
+        const int k10= (i10>= 0 ? i10& 7 : i10) | (j10== s1 ? 8 : 0);
+        const int k11= (i11>= 0 ? i11& 7 : i11) | (j11== s1 ? 8 : 0);
+        const int k12= (i12>= 0 ? i12& 7 : i12) | (j12== s1 ? 8 : 0);
+        const int k13= (i13>= 0 ? i13& 7 : i13) | (j13== s1 ? 8 : 0);
+        const int k14= (i14>= 0 ? i14& 7 : i14) | (j14== s1 ? 8 : 0);
+        const int k15= (i15>= 0 ? i15& 7 : i15) | (j15== s1 ? 8 : 0);
+        x1 = blend8s<k8,k9,k10,k11,k12,k13,k14,k15> (select4<r1>(a,b), select4<s1>(a,b));
+    }
+    else {
+        // i8 - i15 from three or four different sources: same three-blend scheme as for i0 - i7
+        const int n8 = j8 >= 0 ? j8 /2*8 + 0 : j8 ;
+        const int n9 = j9 >= 0 ? j9 /2*8 + 1 : j9 ;
+        const int n10= j10>= 0 ? j10/2*8 + 2 : j10;
+        const int n11= j11>= 0 ? j11/2*8 + 3 : j11;
+        const int n12= j12>= 0 ? j12/2*8 + 4 : j12;
+        const int n13= j13>= 0 ? j13/2*8 + 5 : j13;
+        const int n14= j14>= 0 ? j14/2*8 + 6 : j14;
+        const int n15= j15>= 0 ? j15/2*8 + 7 : j15;
+        x1 = blend8s<n8, n9, n10, n11, n12, n13, n14, n15> (
+             blend8s< j8   & 2 ? -256 : i8 &15,  j9   & 2 ? -256 : i9 &15,  j10   & 2 ? -256 : i10 &15,  j11   & 2 ? -256 : i11 &15,  j12   & 2 ? -256 : i12 &15,  j13   & 2 ? -256 : i13 &15,  j14   & 2 ? -256 : i14 &15,  j15   & 2 ? -256 : i15 &15> (a.get_low(),a.get_high()),
+             blend8s<(j8^2)& 6 ? -256 : i8 &15, (j9^2)& 6 ? -256 : i9 &15, (j10^2)& 6 ? -256 : i10 &15, (j11^2)& 6 ? -256 : i11 &15, (j12^2)& 6 ? -256 : i12 &15, (j13^2)& 6 ? -256 : i13 &15, (j14^2)& 6 ? -256 : i14 &15, (j15^2)& 6 ? -256 : i15 &15> (b.get_low(),b.get_high()));
+    }
+    return Vec16s(x0,x1);
+}
+
+template <int i0, int i1, int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16us blend16us(Vec16us const & a, Vec16us const & b) {   // unsigned variant: same indexes as blend16s
+    return Vec16us( blend16s<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a,b));   // forwards to blend16s and converts the result to Vec16us
+}
+
+// helper function used below: returns one 128-bit half of a or b, chosen at compile time by n
+template <int n>
+static inline Vec16c select4(Vec32c const & a, Vec32c const & b) {
+    switch (n) {
+    case 0:                         // n = 0: low half of a
+        return a.get_low();
+    case 1:                         // n = 1: high half of a
+        return a.get_high();
+    case 2:                         // n = 2: low half of b
+        return b.get_low();
+    case 3:                         // n = 3: high half of b
+        return b.get_high();
+    }
+    return _mm_setzero_si128();     // any other n: all bits zero
+}
+
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15,
+          int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
+          int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 > 
+static inline Vec32c blend32c(Vec32c const & a, Vec32c const & b) {  // blend vectors Vec32c: result byte k is chosen by the compile-time index ik; a non-negative index i selects 16-byte source i/16 (see select4) and byte i&15 inside it
+
+    // j0 - j31 indicate one of four 16-byte sources (a negative index stays negative)
+    const int j0  = i0  >= 0 ? i0 /16 : i0;
+    const int j1  = i1  >= 0 ? i1 /16 : i1;
+    const int j2  = i2  >= 0 ? i2 /16 : i2;
+    const int j3  = i3  >= 0 ? i3 /16 : i3;
+    const int j4  = i4  >= 0 ? i4 /16 : i4;
+    const int j5  = i5  >= 0 ? i5 /16 : i5;
+    const int j6  = i6  >= 0 ? i6 /16 : i6;
+    const int j7  = i7  >= 0 ? i7 /16 : i7;
+    const int j8  = i8  >= 0 ? i8 /16 : i8;
+    const int j9  = i9  >= 0 ? i9 /16 : i9;
+    const int j10 = i10 >= 0 ? i10/16 : i10;
+    const int j11 = i11 >= 0 ? i11/16 : i11;
+    const int j12 = i12 >= 0 ? i12/16 : i12;
+    const int j13 = i13 >= 0 ? i13/16 : i13;
+    const int j14 = i14 >= 0 ? i14/16 : i14;
+    const int j15 = i15 >= 0 ? i15/16 : i15;
+    const int j16 = i16 >= 0 ? i16/16 : i16;
+    const int j17 = i17 >= 0 ? i17/16 : i17;
+    const int j18 = i18 >= 0 ? i18/16 : i18;
+    const int j19 = i19 >= 0 ? i19/16 : i19;
+    const int j20 = i20 >= 0 ? i20/16 : i20;
+    const int j21 = i21 >= 0 ? i21/16 : i21;
+    const int j22 = i22 >= 0 ? i22/16 : i22;
+    const int j23 = i23 >= 0 ? i23/16 : i23;
+    const int j24 = i24 >= 0 ? i24/16 : i24;
+    const int j25 = i25 >= 0 ? i25/16 : i25;
+    const int j26 = i26 >= 0 ? i26/16 : i26;
+    const int j27 = i27 >= 0 ? i27/16 : i27;
+    const int j28 = i28 >= 0 ? i28/16 : i28;
+    const int j29 = i29 >= 0 ? i29/16 : i29;
+    const int j30 = i30 >= 0 ? i30/16 : i30;
+    const int j31 = i31 >= 0 ? i31/16 : i31;
+
+    Vec16c x0, x1;                            // the two 16-byte halves of the result, combined by Vec32c(x0,x1) below
+
+    // r0, s0 = first two sources of low  destination (i0  - i15)
+    // r1, s1 = first two sources of high destination (i16 - i31)
+    const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2  >= 0 ? j2  : j3  >= 0 ? j3  : j4  >= 0 ? j4  : j5  >= 0 ? j5  : j6  >= 0 ? j6  : j7 >= 0 ? j7 : 
+                   j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
+    const int r1 = j16>= 0 ? j16: j17>= 0 ? j17: j18 >= 0 ? j18 : j19 >= 0 ? j19 : j20 >= 0 ? j20 : j21 >= 0 ? j21 : j22 >= 0 ? j22 : j23>= 0 ? j23: 
+                   j24>= 0 ? j24: j25>= 0 ? j25: j26 >= 0 ? j26 : j27 >= 0 ? j27 : j28 >= 0 ? j28 : j29 >= 0 ? j29 : j30 >= 0 ? j30 : j31;
+    const int s0 = (j1 >=0&&j1 !=r0)?j1  : (j2 >=0&&j2 !=r0)?j2  : (j3 >=0&&j3 !=r0)?j3  : (j4 >=0&&j4 !=r0)?j4  : (j5 >=0&&j5 !=r0)?j5  : (j6 >=0&&j6 !=r0)?j6 : (j7 >=0&&j7 !=r0)?j7 : 
+                   (j8 >=0&&j8 !=r0)?j8  : (j9 >=0&&j9 !=r0)?j9  : (j10>=0&&j10!=r0)?j10 : (j11>=0&&j11!=r0)?j11 : (j12>=0&&j12!=r0)?j12 : (j13>=0&&j13!=r0)?j13: (j14>=0&&j14!=r0)?j14: j15;
+    const int s1 = (j17>=0&&j17!=r1)?j17 : (j18>=0&&j18!=r1)?j18 : (j19>=0&&j19!=r1)?j19 : (j20>=0&&j20!=r1)?j20 : (j21>=0&&j21!=r1)?j21 : (j22>=0&&j22!=r1)?j22: (j23>=0&&j23!=r1)?j23: 
+                   (j24>=0&&j24!=r1)?j24 : (j25>=0&&j25!=r1)?j25 : (j26>=0&&j26!=r1)?j26 : (j27>=0&&j27!=r1)?j27 : (j28>=0&&j28!=r1)?j28 : (j29>=0&&j29!=r1)?j29: (j30>=0&&j30!=r1)?j30: j31;
+
+    if (r0 < 0) {                             // i0 - i15 are all negative: nothing is taken from a or b
+        x0 = _mm_setzero_si128();
+    }
+    else if (r0 == s0) {
+        // i0 - i15 all from same source (& -49 clears bits 4 and 5: keeps the byte number within the source and the sign)
+        x0 = permute16c< i0&-49, i1&-49, i2 &-49, i3 &-49, i4 &-49, i5 &-49, i6 &-49, i7 &-49,
+                         i8&-49, i9&-49, i10&-49, i11&-49, i12&-49, i13&-49, i14&-49, i15&-49 >
+             (select4<r0> (a,b));
+    }
+    else if ((j2 <0||j2 ==r0||j2 ==s0) && (j3 <0||j3 ==r0||j3 ==s0) && (j4 <0||j4 ==r0||j4 ==s0) && (j5 <0||j5 ==r0||j5 ==s0) && (j6 <0||j6 ==r0||j6 ==s0) && (j7 <0||j7 ==r0||j7 ==s0) && (j8 <0||j8 ==r0||j8 ==s0) && 
+             (j9 <0||j9 ==r0||j9 ==s0) && (j10<0||j10==r0||j10==s0) && (j11<0||j11==r0||j11==s0) && (j12<0||j12==r0||j12==s0) && (j13<0||j13==r0||j13==s0) && (j14<0||j14==r0||j14==s0) && (j15<0||j15==r0||j15==s0)) {
+        // i0 - i15 all from two sources (r0 and s0); k = byte within the source, plus 16 when it comes from s0
+        const int k0 =  i0 >= 0 ? i0 & 15 : i0;
+        const int k1 = (i1 >= 0 ? i1 & 15 : i1 ) | (j1 == s0 ? 16 : 0);
+        const int k2 = (i2 >= 0 ? i2 & 15 : i2 ) | (j2 == s0 ? 16 : 0);
+        const int k3 = (i3 >= 0 ? i3 & 15 : i3 ) | (j3 == s0 ? 16 : 0);
+        const int k4 = (i4 >= 0 ? i4 & 15 : i4 ) | (j4 == s0 ? 16 : 0);
+        const int k5 = (i5 >= 0 ? i5 & 15 : i5 ) | (j5 == s0 ? 16 : 0);
+        const int k6 = (i6 >= 0 ? i6 & 15 : i6 ) | (j6 == s0 ? 16 : 0);
+        const int k7 = (i7 >= 0 ? i7 & 15 : i7 ) | (j7 == s0 ? 16 : 0);
+        const int k8 = (i8 >= 0 ? i8 & 15 : i8 ) | (j8 == s0 ? 16 : 0);
+        const int k9 = (i9 >= 0 ? i9 & 15 : i9 ) | (j9 == s0 ? 16 : 0);
+        const int k10= (i10>= 0 ? i10& 15 : i10) | (j10== s0 ? 16 : 0);
+        const int k11= (i11>= 0 ? i11& 15 : i11) | (j11== s0 ? 16 : 0);
+        const int k12= (i12>= 0 ? i12& 15 : i12) | (j12== s0 ? 16 : 0);
+        const int k13= (i13>= 0 ? i13& 15 : i13) | (j13== s0 ? 16 : 0);
+        const int k14= (i14>= 0 ? i14& 15 : i14) | (j14== s0 ? 16 : 0);
+        const int k15= (i15>= 0 ? i15& 15 : i15) | (j15== s0 ? 16 : 0);
+        x0 = blend16c<k0,k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15> (select4<r0>(a,b), select4<s0>(a,b));
+    }
+    else {
+        // i0 - i15 from three or four different sources: x0a gathers the bytes that come from a, x0b those that come from b; n = position k in x0a or 16+k in x0b
+        const int n0 = j0 >= 0 ? j0 /2*16 + 0 : j0;
+        const int n1 = j1 >= 0 ? j1 /2*16 + 1 : j1;
+        const int n2 = j2 >= 0 ? j2 /2*16 + 2 : j2;
+        const int n3 = j3 >= 0 ? j3 /2*16 + 3 : j3;
+        const int n4 = j4 >= 0 ? j4 /2*16 + 4 : j4;
+        const int n5 = j5 >= 0 ? j5 /2*16 + 5 : j5;
+        const int n6 = j6 >= 0 ? j6 /2*16 + 6 : j6;
+        const int n7 = j7 >= 0 ? j7 /2*16 + 7 : j7;
+        const int n8 = j8 >= 0 ? j8 /2*16 + 8 : j8;
+        const int n9 = j9 >= 0 ? j9 /2*16 + 9 : j9;
+        const int n10= j10>= 0 ? j10/2*16 +10 : j10;
+        const int n11= j11>= 0 ? j11/2*16 +11 : j11;
+        const int n12= j12>= 0 ? j12/2*16 +12 : j12;
+        const int n13= j13>= 0 ? j13/2*16 +13 : j13;
+        const int n14= j14>= 0 ? j14/2*16 +14 : j14;
+        const int n15= j15>= 0 ? j15/2*16 +15 : j15;
+
+        Vec16c x0a = blend16c< j0   & 2 ? -256 : i0 & 31,  j1   & 2 ? -256 : i1 & 31,  j2    & 2 ? -256 : i2 & 31,  j3    & 2 ? -256 : i3 & 31,  j4    & 2 ? -256 : i4 & 31,  j5    & 2 ? -256 : i5 & 31,  j6    & 2 ? -256 : i6 & 31,  j7    & 2 ? -256 : i7 & 31,
+                               j8   & 2 ? -256 : i8 & 31,  j9   & 2 ? -256 : i9 & 31,  j10   & 2 ? -256 : i10& 31,  j11   & 2 ? -256 : i11& 31,  j12   & 2 ? -256 : i12& 31,  j13   & 2 ? -256 : i13& 31,  j14   & 2 ? -256 : i14& 31,  j15   & 2 ? -256 : i15& 31 > (a.get_low(),a.get_high());
+        Vec16c x0b = blend16c<(j0^2)& 6 ? -256 : i0 & 31, (j1^2)& 6 ? -256 : i1 & 31, (j2 ^2)& 6 ? -256 : i2 & 31, (j3 ^2)& 6 ? -256 : i3 & 31, (j4 ^2)& 6 ? -256 : i4 & 31, (j5 ^2)& 6 ? -256 : i5 & 31, (j6 ^2)& 6 ? -256 : i6 & 31, (j7 ^2)& 6 ? -256 : i7 & 31,
+                              (j8^2)& 6 ? -256 : i8 & 31, (j9^2)& 6 ? -256 : i9 & 31, (j10^2)& 6 ? -256 : i10& 31, (j11^2)& 6 ? -256 : i11& 31, (j12^2)& 6 ? -256 : i12& 31, (j13^2)& 6 ? -256 : i13& 31, (j14^2)& 6 ? -256 : i14& 31, (j15^2)& 6 ? -256 : i15& 31 > (b.get_low(),b.get_high());
+        x0         = blend16c<n0, n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, n14, n15> (x0a, x0b);
+    }
+
+    if (r1 < 0) {                             // i16 - i31 are all negative: nothing is taken from a or b
+        x1 = _mm_setzero_si128();
+    }
+    else if (r1 == s1) {
+        // i16 - i31 all from same source
+        x1 = permute16c< i16&-49, i17&-49, i18&-49, i19&-49, i20&-49, i21&-49, i22&-49, i23&-49,
+                         i24&-49, i25&-49, i26&-49, i27&-49, i28&-49, i29&-49, i30&-49, i31&-49 >
+             (select4<r1> (a,b));
+    }
+    else if ((j18<0||j18==r1||j18==s1) && (j19<0||j19==r1||j19==s1) && (j20<0||j20==r1||j20==s1) && (j21<0||j21==r1||j21==s1) && (j22<0||j22==r1||j22==s1) && (j23<0||j23==r1||j23==s1) && (j24<0||j24==r1||j24==s1) && 
+             (j25<0||j25==r1||j25==s1) && (j26<0||j26==r1||j26==s1) && (j27<0||j27==r1||j27==s1) && (j28<0||j28==r1||j28==s1) && (j29<0||j29==r1||j29==s1) && (j30<0||j30==r1||j30==s1) && (j31<0||j31==r1||j31==s1)) {
+        // i16 - i31 all from two sources (r1 and s1); k = byte within the source, plus 16 when it comes from s1
+        const int k16=  i16>= 0 ? i16& 15 : i16;
+        const int k17= (i17>= 0 ? i17& 15 : i17) | (j17== s1 ? 16 : 0);
+        const int k18= (i18>= 0 ? i18& 15 : i18) | (j18== s1 ? 16 : 0);
+        const int k19= (i19>= 0 ? i19& 15 : i19) | (j19== s1 ? 16 : 0);
+        const int k20= (i20>= 0 ? i20& 15 : i20) | (j20== s1 ? 16 : 0);
+        const int k21= (i21>= 0 ? i21& 15 : i21) | (j21== s1 ? 16 : 0);
+        const int k22= (i22>= 0 ? i22& 15 : i22) | (j22== s1 ? 16 : 0);
+        const int k23= (i23>= 0 ? i23& 15 : i23) | (j23== s1 ? 16 : 0);
+        const int k24= (i24>= 0 ? i24& 15 : i24) | (j24== s1 ? 16 : 0);
+        const int k25= (i25>= 0 ? i25& 15 : i25) | (j25== s1 ? 16 : 0);
+        const int k26= (i26>= 0 ? i26& 15 : i26) | (j26== s1 ? 16 : 0);
+        const int k27= (i27>= 0 ? i27& 15 : i27) | (j27== s1 ? 16 : 0);
+        const int k28= (i28>= 0 ? i28& 15 : i28) | (j28== s1 ? 16 : 0);
+        const int k29= (i29>= 0 ? i29& 15 : i29) | (j29== s1 ? 16 : 0);
+        const int k30= (i30>= 0 ? i30& 15 : i30) | (j30== s1 ? 16 : 0);
+        const int k31= (i31>= 0 ? i31& 15 : i31) | (j31== s1 ? 16 : 0);
+        x1 = blend16c<k16,k17,k18,k19,k20,k21,k22,k23,k24,k25,k26,k27,k28,k29,k30,k31> (select4<r1>(a,b), select4<s1>(a,b));
+    }
+    else {
+        // i16 - i31 from three or four different sources: same three-blend scheme as for i0 - i15
+        const int n16= j16>= 0 ? j16/2*16 + 0 : j16;
+        const int n17= j17>= 0 ? j17/2*16 + 1 : j17;
+        const int n18= j18>= 0 ? j18/2*16 + 2 : j18;
+        const int n19= j19>= 0 ? j19/2*16 + 3 : j19;
+        const int n20= j20>= 0 ? j20/2*16 + 4 : j20;
+        const int n21= j21>= 0 ? j21/2*16 + 5 : j21;
+        const int n22= j22>= 0 ? j22/2*16 + 6 : j22;
+        const int n23= j23>= 0 ? j23/2*16 + 7 : j23;
+        const int n24= j24>= 0 ? j24/2*16 + 8 : j24;
+        const int n25= j25>= 0 ? j25/2*16 + 9 : j25; 
+        const int n26= j26>= 0 ? j26/2*16 +10 : j26;
+        const int n27= j27>= 0 ? j27/2*16 +11 : j27;
+        const int n28= j28>= 0 ? j28/2*16 +12 : j28;
+        const int n29= j29>= 0 ? j29/2*16 +13 : j29;
+        const int n30= j30>= 0 ? j30/2*16 +14 : j30;
+        const int n31= j31>= 0 ? j31/2*16 +15 : j31;
+        x1 = blend16c<n16, n17, n18, n19, n20, n21, n22, n23, n24, n25, n26, n27, n28, n29, n30, n31> (
+             blend16c< j16   & 2 ? -256 : i16& 31,  j17   & 2 ? -256 : i17& 31,  j18   & 2 ? -256 : i18& 31,  j19   & 2 ? -256 : i19& 31,  j20   & 2 ? -256 : i20& 31,  j21   & 2 ? -256 : i21& 31,  j22   & 2 ? -256 : i22& 31,  j23   & 2 ? -256 : i23& 31,
+                       j24   & 2 ? -256 : i24& 31,  j25   & 2 ? -256 : i25& 31,  j26   & 2 ? -256 : i26& 31,  j27   & 2 ? -256 : i27& 31,  j28   & 2 ? -256 : i28& 31,  j29   & 2 ? -256 : i29& 31,  j30   & 2 ? -256 : i30& 31,  j31   & 2 ? -256 : i31& 31 > (a.get_low(),a.get_high()),
+             blend16c<(j16^2)& 6 ? -256 : i16& 31, (j17^2)& 6 ? -256 : i17& 31, (j18^2)& 6 ? -256 : i18& 31, (j19^2)& 6 ? -256 : i19& 31, (j20^2)& 6 ? -256 : i20& 31, (j21^2)& 6 ? -256 : i21& 31, (j22^2)& 6 ? -256 : i22& 31, (j23^2)& 6 ? -256 : i23& 31,
+                      (j24^2)& 6 ? -256 : i24& 31, (j25^2)& 6 ? -256 : i25& 31, (j26^2)& 6 ? -256 : i26& 31, (j27^2)& 6 ? -256 : i27& 31, (j28^2)& 6 ? -256 : i28& 31, (j29^2)& 6 ? -256 : i29& 31, (j30^2)& 6 ? -256 : i30& 31, (j31^2)& 6 ? -256 : i31& 31 > (b.get_low(),b.get_high()));
+    }
+    return Vec32c(x0,x1);
+}
+
+template <
+    int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+    int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15,
+    int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
+    int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
+    static inline Vec32uc blend32uc(Vec32uc const & a, Vec32uc const & b) {   // unsigned variant: same indexes as blend32c
+        return Vec32uc (blend32c<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15,    
+            i16,i17,i18,i19,i20,i21,i22,i23,i24,i25,i26,i27,i28,i29,i30,i31> (a, b));   // forwards to blend32c and converts the result to Vec32uc
+}
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec8i a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+* Vec8i b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8i c;
+* c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+*
+*****************************************************************************/
+
+static inline Vec32c lookup32(Vec32c const & index, Vec32c const & table) {   // each byte of index selects one byte of the 32-byte table
+#if defined (__XOP__)   // AMD XOP instruction set. Use VPPERM
+    Vec16c t0 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_low());    // low 16 result bytes; both table halves are given as the two sources
+    Vec16c t1 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_high());   // high 16 result bytes
+    return Vec32c(t0, t1);
+#else
+    Vec16c t0 = lookup32(index.get_low() , table.get_low(), table.get_high());    // 128-bit overload: 16 indexes into a table passed as two 16-byte halves
+    Vec16c t1 = lookup32(index.get_high(), table.get_low(), table.get_high());
+    return Vec32c(t0, t1);
+#endif
+}
+
+template <int n>
+static inline Vec32c lookup(Vec32uc const & index, void const * table) {   // byte lookup in a memory table of n entries: result byte j = table[index byte j]
+    if (n <=  0) return 0;                     // no table: the result is constructed from 0
+    if (n <= 16) {                             // table is read as a single 16-byte vector
+        Vec16c tt = Vec16c().load(table);      // NOTE(review): presumably reads a full 16 bytes even when n < 16 - confirm the table is that large
+        Vec16c r0 = lookup16(index.get_low(),  tt);
+        Vec16c r1 = lookup16(index.get_high(), tt);
+        return Vec32c(r0, r1);
+    }
+    if (n <= 32) return lookup32(index, Vec32c().load(table));   // table is read as a single 32-byte vector
+    // n > 32. Limit index
+    Vec32uc index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n (for n >= 256 the mask is 0xFF and leaves a byte index unchanged)
+        index1 = Vec32uc(index) & uint8_t(n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1, capped at 255 (the largest byte index) so that the uint8_t cast cannot wrap when n > 256
+        index1 = min(Vec32uc(index), uint8_t(n > 256 ? 255 : n-1));
+    }
+    uint8_t ii[32];  index1.store(ii);         // spill the limited indexes for the scalar loop below
+    int8_t  rr[32];
+    for (int j = 0; j < 32; j++) {
+        rr[j] = ((int8_t*)table)[ii[j]];       // one scalar table read per element
+    }
+    return Vec32c().load(rr);
+}
+
+template <int n>   // signed-index overload of the byte table lookup
+static inline Vec32c lookup(Vec32c const & index, void const * table) {
+    return lookup<n>(Vec32uc(index), table);   // indexes are converted to Vec32uc and passed to the Vec32uc overload
+}
+
+
+static inline Vec16s lookup16(Vec16s const & index, Vec16s const & table) {   // each 16-bit element of index selects one of the 16 elements of table
+    Vec8s t0 = lookup16(index.get_low() , table.get_low(), table.get_high());    // 128-bit overload: 8 indexes into a table passed as two halves
+    Vec8s t1 = lookup16(index.get_high(), table.get_low(), table.get_high());
+    return Vec16s(t0, t1);
+}
+
+template <int n>
+static inline Vec16s lookup(Vec16s const & index, void const * table) {   // lookup in a memory table of n 16-bit entries: result element j = table[index element j]
+    if (n <=  0) return 0;                     // no table: the result is constructed from 0
+    if (n <=  8) {                             // table is read as a single 8-element vector and used for both halves of index
+        Vec8s table1 = Vec8s().load(table);        
+        return Vec16s(       
+            lookup8 (index.get_low(),  table1),
+            lookup8 (index.get_high(), table1));
+    }
+    if (n <= 16) return lookup16(index, Vec16s().load(table));   // table is read as a single 16-element vector
+    // n > 16. Limit index
+    Vec16us i1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        i1 = Vec16us(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        i1 = min(Vec16us(index), n-1);         // NOTE(review): for n > 65536 the limit n-1 presumably no longer fits a 16-bit element - confirm how it is converted
+    }
+    int16_t const * t = (int16_t const *)table;
+    return Vec16s(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]],
+        t[i1[8]],t[i1[9]],t[i1[10]],t[i1[11]],t[i1[12]],t[i1[13]],t[i1[14]],t[i1[15]]);   // one scalar table read per element
+}
+
+static inline Vec8i lookup8(Vec8i const & index, Vec8i const & table) {   // each 32-bit element of index selects one of the 8 elements of table
+    Vec4i t0 = lookup8(index.get_low() , table.get_low(), table.get_high());    // 128-bit overload: 4 indexes into a table passed as two halves
+    Vec4i t1 = lookup8(index.get_high(), table.get_low(), table.get_high());
+    return Vec8i(t0, t1);
+}
+
+template <int n>
+static inline Vec8i lookup(Vec8i const & index, void const * table) {   // lookup in a memory table of n 32-bit entries: result element j = table[index element j]
+    if (n <= 0) return 0;                      // no table: the result is constructed from 0
+    if (n <= 4) {                              // table is read as a single 4-element vector and used for both halves of index
+        Vec4i table1 = Vec4i().load(table);        
+        return Vec8i(       
+            lookup4 (index.get_low(),  table1),
+            lookup4 (index.get_high(), table1));
+    }
+    if (n <= 8) {                              // table is read as a single 8-element vector
+        return lookup8(index, Vec8i().load(table));
+    }
+    // n > 8. Limit index
+    Vec8ui i1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        i1 = Vec8ui(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        i1 = min(Vec8ui(index), n-1);
+    }
+    int32_t const * t = (int32_t const *)table;
+    return Vec8i(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]]);   // one scalar table read per element
+}
+
+static inline Vec4q lookup4(Vec4q const & index, Vec4q const & table) {   // 64-bit lookup done as a 32-bit lookup on the same bits
+    return lookup8(Vec8i(index * 0x200000002ll + 0x100000000ll), Vec8i(table));   // a small non-negative index i becomes the 32-bit pair (2i in the low dword, 2i+1 in the high dword), i.e. the two halves of 64-bit entry i
+}
+
+template <int n>
+static inline Vec4q lookup(Vec4q const & index, void const * table) {   // lookup in a memory table of n 64-bit entries: result element j = table[index element j]
+    if (n <= 0) return 0;                      // no table: the result is constructed from 0
+    // n > 0. Limit index
+    Vec4uq index1;
+    if ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        index1 = Vec4uq(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1.
+        // There is no 64-bit min instruction, but we can use the 32-bit unsigned min,
+        // since n is a 32-bit integer
+        index1 = Vec4uq(min(Vec8ui(index), constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>()));   // even dwords are limited to n-1, odd dwords are limited to 0
+    }
+    uint32_t ii[8];  index1.store(ii);  // use only lower 32 bits of each index
+    int64_t const * tt = (int64_t const *)table;
+    return Vec4q(tt[ii[0]], tt[ii[2]], tt[ii[4]], tt[ii[6]]);    
+}
+
+
+/*****************************************************************************
+*
+*          Other permutations with variable indexes
+*
+*****************************************************************************/
+
+// Function shift_bytes_up: shift whole vector left by b bytes.
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec32c shift_bytes_up(Vec32c const & a, int b) {
+    if (b < 16) {    // both halves are shifted up; the bytes shifted out of the low half are OR-ed into the bottom of the high half (for b = 0 this relies on the 128-bit shift_bytes_down returning zero for a count of 16 - confirm)
+        return Vec32c(shift_bytes_up(a.get_low(),b), shift_bytes_up(a.get_high(),b) | shift_bytes_down(a.get_low(),16-b));
+    }
+    else {           // b >= 16: the low half of the result is zero and the high half is the LOW half of a shifted up by the remaining b-16 bytes
+        return Vec32c(Vec16c(0), shift_bytes_up(a.get_low(),b-16));
+    }
+}
+
+// Function shift_bytes_down: shift whole vector right by b bytes
+// You may use a permute function instead if b is a compile-time constant
+static inline Vec32c shift_bytes_down(Vec32c const & a, int b) {
+    if (b < 16) {    // both halves are shifted down; the bytes shifted out of the high half are OR-ed into the top of the low half (for b = 0 this relies on the 128-bit shift_bytes_up returning zero for a count of 16 - confirm)
+        return Vec32c(shift_bytes_down(a.get_low(),b) | shift_bytes_up(a.get_high(),16-b), shift_bytes_down(a.get_high(),b));
+    }
+    else {           // b >= 16: the high half of the result is zero and the low half is the high half of a shifted down by the remaining b-16 bytes
+        return Vec32c(shift_bytes_down(a.get_high(),b-16), Vec16c(0));
+    }
+}
+
+/*****************************************************************************
+*
+*          Gather functions with fixed indexes
+*
+*****************************************************************************/
+// Load elements from array a with indices i0, i1, i2, i3, i4, i5, i6, i7 (a is read as an array of int32_t; the cheapest of three strategies is picked at compile time: one load + permute, two loads + blend, or the generic table lookup)
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8i gather8i(void const * a) {
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index;  // Error message if index is negative
+    const int i01min = i0 < i1 ? i0 : i1;    // pairwise reduction to the smallest index
+    const int i23min = i2 < i3 ? i2 : i3;
+    const int i45min = i4 < i5 ? i4 : i5;
+    const int i67min = i6 < i7 ? i6 : i7;
+    const int i0123min = i01min < i23min ? i01min : i23min;
+    const int i4567min = i45min < i67min ? i45min : i67min;
+    const int imin = i0123min < i4567min ? i0123min : i4567min;    // imin: smallest of the eight indexes
+    const int i01max = i0 > i1 ? i0 : i1;    // pairwise reduction to the largest index
+    const int i23max = i2 > i3 ? i2 : i3;
+    const int i45max = i4 > i5 ? i4 : i5;
+    const int i67max = i6 > i7 ? i6 : i7;
+    const int i0123max = i01max > i23max ? i01max : i23max;
+    const int i4567max = i45max > i67max ? i45max : i67max;
+    const int imax = i0123max > i4567max ? i0123max : i4567max;    // imax: largest of the eight indexes
+
+    if (imax - imin <= 7) {
+        // load one contiguous block and permute
+        if (imax > 7) {
+            // make sure we don't read past the end of the array
+            Vec8i b = Vec8i().load((int32_t const *)a + imax-7);    // window of 8 elements ending at imax
+            return permute8i<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7>(b);
+        }
+        else {
+            Vec8i b = Vec8i().load((int32_t const *)a + imin);      // window of 8 elements starting at imin
+            return permute8i<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin>(b);
+        }
+    }
+    if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
+    &&  (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
+        // load two contiguous blocks and blend (every index lies in the window starting at imin or in the window ending at imax)
+        Vec8i b = Vec8i().load((int32_t const *)a + imin);
+        Vec8i c = Vec8i().load((int32_t const *)a + imax-7);
+        const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;    // j0-j7: 0-7 address block b, 8-15 address block c (8 + offset from imax-7)
+        const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
+        const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
+        const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
+        const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
+        const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
+        const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
+        const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
+        return blend8i<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+    // use lookup function (table size imax+1 covers every index)
+    return lookup<imax+1>(Vec8i(i0,i1,i2,i3,i4,i5,i6,i7), a);
+}
+
+template <int i0, int i1, int i2, int i3>  // gather4q: load four 64-bit elements from array a at compile-time indices i0, i1, i2, i3
+static inline Vec4q gather4q(void const * a) {
+    Static_error_check<(i0|i1|i2|i3)>=0> Negative_array_index;  // Error message if index is negative
+    const int i01min = i0 < i1 ? i0 : i1;
+    const int i23min = i2 < i3 ? i2 : i3;
+    const int imin   = i01min < i23min ? i01min : i23min;  // smallest of the four indices
+    const int i01max = i0 > i1 ? i0 : i1;
+    const int i23max = i2 > i3 ? i2 : i3;
+    const int imax   = i01max > i23max ? i01max : i23max;  // largest of the four indices
+    if (imax - imin <= 3) {
+        // all indices lie within one window of 4 elements: load one contiguous block and permute
+        if (imax > 3) {
+            // make sure we don't read past the end of the array: the block ends at element imax
+            Vec4q b = Vec4q().load((int64_t const *)a + imax-3);
+            return permute4q<i0-imax+3, i1-imax+3, i2-imax+3, i3-imax+3>(b);
+        }
+        else {
+            Vec4q b = Vec4q().load((int64_t const *)a + imin);  // NOTE(review): block starts at imin, so it can extend beyond element imax - confirm the array is large enough
+            return permute4q<i0-imin, i1-imin, i2-imin, i3-imin>(b);
+        }
+    }
+    if ((i0<imin+4 || i0>imax-4) && (i1<imin+4 || i1>imax-4) && (i2<imin+4 || i2>imax-4) && (i3<imin+4 || i3>imax-4)) {
+        // every index is within 4 of imin or within 4 of imax: load two contiguous blocks and blend
+        Vec4q b = Vec4q().load((int64_t const *)a + imin);
+        Vec4q c = Vec4q().load((int64_t const *)a + imax-3);
+        const int j0 = i0<imin+4 ? i0-imin : 7-imax+i0;  // j is 0..3 for the block at imin, 4..7 for the block ending at imax
+        const int j1 = i1<imin+4 ? i1-imin : 7-imax+i1;
+        const int j2 = i2<imin+4 ? i2-imin : 7-imax+i2;
+        const int j3 = i3<imin+4 ? i3-imin : 7-imax+i3;
+        return blend4q<j0, j1, j2, j3>(b, c);
+    }
+    // general case: use lookup function with imax+1 as its template argument
+    return lookup<imax+1>(Vec4q(i0,i1,i2,i3), a);
+}
+
+
+
+
+/*****************************************************************************
+*
+*          Functions for conversion between integer sizes
+*
+*****************************************************************************/
+
+// Extend 8-bit integers to 16-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 16 elements of a (its lower half) from 8 to 16 bits with sign extension
+static inline Vec16s extend_low (Vec32c const & a) {
+    return Vec16s(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 16 elements of a (its upper half) from 8 to 16 bits with sign extension
+static inline Vec16s extend_high (Vec32c const & a) {
+    return Vec16s(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Function extend_low : extends the low 16 elements of a (its lower half) from 8 to 16 bits with zero extension
+static inline Vec16us extend_low (Vec32uc const & a) {
+    return Vec16us(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 16 elements of a (its upper half) from 8 to 16 bits with zero extension
+static inline Vec16us extend_high (Vec32uc const & a) {
+    return Vec16us(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 8 elements of a (its lower half) from 16 to 32 bits with sign extension
+static inline Vec8i extend_low (Vec16s const & a) {
+    return Vec8i(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 8 elements of a (its upper half) from 16 to 32 bits with sign extension
+static inline Vec8i extend_high (Vec16s const & a) {
+    return Vec8i(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Function extend_low : extends the low 8 elements of a (its lower half) from 16 to 32 bits with zero extension
+static inline Vec8ui extend_low (Vec16us const & a) {
+    return Vec8ui(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 8 elements of a (its upper half) from 16 to 32 bits with zero extension
+static inline Vec8ui extend_high (Vec16us const & a) {
+    return Vec8ui(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 4 elements of a (its lower half) from 32 to 64 bits with sign extension
+static inline Vec4q extend_low (Vec8i const & a) {
+    return Vec4q(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 4 elements of a (its upper half) from 32 to 64 bits with sign extension
+static inline Vec4q extend_high (Vec8i const & a) {
+    return Vec4q(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Function extend_low : extends the low 4 elements of a (its lower half) from 32 to 64 bits with zero extension
+static inline Vec4uq extend_low (Vec8ui const & a) {
+    return Vec4uq(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 4 elements of a (its upper half) from 32 to 64 bits with zero extension
+static inline Vec4uq extend_high (Vec8ui const & a) {
+    return Vec4uq(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Signed, overflow wraps around. low supplies the lower half of the result, high the upper half
+static inline Vec32c compress (Vec16s const & low, Vec16s const & high) {
+    return Vec32c(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
+}
+
+// Function compress_saturated : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Signed, with saturation. low supplies the lower half of the result, high the upper half
+static inline Vec32c compress_saturated (Vec16s const & low, Vec16s const & high) {
+    return Vec32c(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
+}
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Unsigned, overflow wraps around. low supplies the lower half of the result, high the upper half
+static inline Vec32uc compress (Vec16us const & low, Vec16us const & high) {
+    return Vec32uc(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
+}
+
+// Function compress_saturated : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Unsigned, with saturation. low supplies the lower half of the result, high the upper half
+static inline Vec32uc compress_saturated (Vec16us const & low, Vec16us const & high) {
+    return Vec32uc(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
+}
+
+// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Signed, overflow wraps around. low supplies the lower half of the result, high the upper half
+static inline Vec16s compress (Vec8i const & low, Vec8i const & high) {
+    return Vec16s(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
+}
+
+// Function compress_saturated : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Signed, with saturation. low supplies the lower half of the result, high the upper half
+static inline Vec16s compress_saturated (Vec8i const & low, Vec8i const & high) {
+    return Vec16s(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
+}
+
+// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, overflow wraps around. low supplies the lower half of the result, high the upper half
+static inline Vec16us compress (Vec8ui const & low, Vec8ui const & high) {
+    return Vec16us(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
+}
+
+// Function compress_saturated : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Unsigned, with saturation. low supplies the lower half of the result, high the upper half
+static inline Vec16us compress_saturated (Vec8ui const & low, Vec8ui const & high) {
+    return Vec16us(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
+}
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Signed, overflow wraps around. low supplies the lower half of the result, high the upper half
+static inline Vec8i compress (Vec4q const & low, Vec4q const & high) {
+    return Vec8i(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Signed, with saturation. low supplies the lower half of the result, high the upper half
+static inline Vec8i compress_saturated (Vec4q const & low, Vec4q const & high) {
+    return Vec8i(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
+}
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, overflow wraps around. Reuses the Vec4q version after converting both arguments to Vec4q
+static inline Vec8ui compress (Vec4uq const & low, Vec4uq const & high) {
+    return Vec8ui (compress((Vec4q)low, (Vec4q)high));
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
+// Unsigned, with saturation. low supplies the lower half of the result, high the upper half
+static inline Vec8ui compress_saturated (Vec4uq const & low, Vec4uq const & high) {
+    return Vec8ui(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
+}
+
+
+/*****************************************************************************
+*
+*          Integer division 2: divisor is a compile-time constant
+*
+*****************************************************************************/
+
+// Divide Vec8i by compile-time constant d: each half (get_low / get_high) is divided separately and the results rejoined
+template <int32_t d>
+static inline Vec8i divide_by_i(Vec8i const & a) {
+    return Vec8i( divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
+}
+
+// define Vec8i a / const_int(d) : the divisor travels in the tag type Const_int_t<d>; the tag argument itself is unused
+template <int32_t d>
+static inline Vec8i operator / (Vec8i const & a, Const_int_t<d>) {
+    return divide_by_i<d>(a);
+}
+
+// define Vec8i a / const_uint(d) : signed division by an unsigned constant; d must be below 0x80000000 so it fits in int32_t
+template <uint32_t d>
+static inline Vec8i operator / (Vec8i const & a, Const_uint_t<d>) {
+    Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
+    return divide_by_i<int32_t(d)>(a);                               // signed divide
+}
+
+// vector operator /= : divide a in place by the compile-time constant carried in b's type
+template <int32_t d>
+static inline Vec8i & operator /= (Vec8i & a, Const_int_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+// vector operator /= : divide a in place by the unsigned compile-time constant carried in b's type
+template <uint32_t d>
+static inline Vec8i & operator /= (Vec8i & a, Const_uint_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+
+// Divide Vec8ui by compile-time constant d: each half (get_low / get_high) is divided separately and the results rejoined
+template <uint32_t d>
+static inline Vec8ui divide_by_ui(Vec8ui const & a) {
+    return Vec8ui( divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
+}
+
+// define Vec8ui a / const_uint(d) : the divisor travels in the tag type Const_uint_t<d>; the tag argument itself is unused
+template <uint32_t d>
+static inline Vec8ui operator / (Vec8ui const & a, Const_uint_t<d>) {
+    return divide_by_ui<d>(a);
+}
+
+// define Vec8ui a / const_int(d) : unsigned division by a signed constant; a negative d is rejected at compile time
+template <int32_t d>
+static inline Vec8ui operator / (Vec8ui const & a, Const_int_t<d>) {
+    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
+    return divide_by_ui<d>(a);                                       // unsigned divide
+}
+
+// vector operator /= : divide a in place by the unsigned compile-time constant carried in b's type
+template <uint32_t d>
+static inline Vec8ui & operator /= (Vec8ui & a, Const_uint_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+// vector operator /= : divide a in place by the signed compile-time constant carried in b's type
+template <int32_t d>
+static inline Vec8ui & operator /= (Vec8ui & a, Const_int_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+
+// Divide Vec16s by compile-time constant d: each half (get_low / get_high) is divided separately and the results rejoined
+template <int d>
+static inline Vec16s divide_by_i(Vec16s const & a) {
+    return Vec16s( divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
+}
+
+// define Vec16s a / const_int(d) : the divisor travels in the tag type Const_int_t<d>; the tag argument itself is unused
+template <int d>
+static inline Vec16s operator / (Vec16s const & a, Const_int_t<d>) {
+    return divide_by_i<d>(a);
+}
+
+// define Vec16s a / const_uint(d) : signed division by an unsigned constant; d must be below 0x8000
+template <uint32_t d>
+static inline Vec16s operator / (Vec16s const & a, Const_uint_t<d>) {
+    Static_error_check< (d<0x8000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
+    return divide_by_i<int(d)>(a);                                   // signed divide
+}
+
+// vector operator /= : divide a in place by the signed compile-time constant carried in b's type
+template <int32_t d>
+static inline Vec16s & operator /= (Vec16s & a, Const_int_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+// vector operator /= : divide a in place by the unsigned compile-time constant carried in b's type
+template <uint32_t d>
+static inline Vec16s & operator /= (Vec16s & a, Const_uint_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+
+// Divide Vec16us by compile-time constant d: each half (get_low / get_high) is divided separately and the results rejoined
+template <uint32_t d>
+static inline Vec16us divide_by_ui(Vec16us const & a) {
+    return Vec16us( divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
+}
+
+// define Vec16us a / const_uint(d) : the divisor travels in the tag type Const_uint_t<d>; the tag argument itself is unused
+template <uint32_t d>
+static inline Vec16us operator / (Vec16us const & a, Const_uint_t<d>) {
+    return divide_by_ui<d>(a);
+}
+
+// define Vec16us a / const_int(d) : unsigned division by a signed constant; a negative d is rejected at compile time
+template <int d>
+static inline Vec16us operator / (Vec16us const & a, Const_int_t<d>) {
+    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
+    return divide_by_ui<d>(a);                                       // unsigned divide
+}
+
+// vector operator /= : divide a in place by the unsigned compile-time constant carried in b's type
+template <uint32_t d>
+static inline Vec16us & operator /= (Vec16us & a, Const_uint_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+// vector operator /= : divide a in place by the signed compile-time constant carried in b's type
+template <int32_t d>
+static inline Vec16us & operator /= (Vec16us & a, Const_int_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+
+// define Vec32c a / const_int(d) : the division is carried out on 16-bit elements
+template <int d>
+static inline Vec32c operator / (Vec32c const & a, Const_int_t<d>) {
+    // widen each half of a to Vec16s, divide there, then pack both quotients back into one Vec32c
+    Vec16s quot_lo = extend_low(a)  / Const_int_t<d>();
+    Vec16s quot_hi = extend_high(a) / Const_int_t<d>();
+    return compress(quot_lo, quot_hi);
+}
+
+// define Vec32c a / const_uint(d) : signed division. NOTE(review): the static check only inspects the low 8 bits of d (uint8_t cast)
+template <uint32_t d>
+static inline Vec32c operator / (Vec32c const & a, Const_uint_t<d>) {
+    Static_error_check< (uint8_t(d)<0x80u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
+    return a / Const_int_t<d>();                                     // signed divide
+}
+
+// vector operator /= : divide a in place by the signed compile-time constant carried in b's type
+template <int32_t d>
+static inline Vec32c & operator /= (Vec32c & a, Const_int_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+// vector operator /= : divide a in place by the unsigned compile-time constant carried in b's type
+template <uint32_t d>
+static inline Vec32c & operator /= (Vec32c & a, Const_uint_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+// define Vec32uc a / const_uint(d) : the division is carried out on 16-bit elements
+template <uint32_t d>
+static inline Vec32uc operator / (Vec32uc const & a, Const_uint_t<d>) {
+    // widen each half of a to Vec16us, divide there, then pack both quotients back into one Vec32uc
+    Vec16us quot_lo = extend_low(a)  / Const_uint_t<d>();
+    Vec16us quot_hi = extend_high(a) / Const_uint_t<d>();
+    return compress(quot_lo, quot_hi);
+}
+
+// define Vec32uc a / const_int(d) : unsigned division. NOTE(review): the static check tests d after conversion to int8_t, i.e. only its low 8 bits
+template <int d>
+static inline Vec32uc operator / (Vec32uc const & a, Const_int_t<d>) {
+    Static_error_check< (int8_t(d)>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
+    return a / Const_uint_t<d>();                                    // unsigned divide
+}
+
+// vector operator /= : divide a in place by the unsigned compile-time constant carried in b's type
+template <uint32_t d>
+static inline Vec32uc & operator /= (Vec32uc & a, Const_uint_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+// vector operator /= : divide a in place by the signed compile-time constant carried in b's type
+template <int32_t d>
+static inline Vec32uc & operator /= (Vec32uc & a, Const_int_t<d> b) {
+    // store the quotient back into a; the assignment expression yields a reference to a
+    return a = a / b;
+}
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Get index (0-31) to the first element that is true. Return -1 if all are false
+static inline int horizontal_find_first(Vec32cb const & x) {
+    int a1 = horizontal_find_first(x.get_low());   // search the lower half first
+    if (a1 >= 0) return a1;
+    int a2 = horizontal_find_first(x.get_high());  // nothing in the lower half: search the upper half
+    if (a2 < 0) return a2;                         // not found in either half: pass the negative result on
+    return a2 + 16;                                // rebase the upper-half index onto the full vector
+}
+
+static inline int horizontal_find_first(Vec16sb const & x) {
+    return horizontal_find_first(Vec32cb(x)) >> 1;  // index found in the Vec32cb view of x, divided by 2; a negative "not found" result relies on >> being an arithmetic shift
+}
+
+static inline int horizontal_find_first(Vec8ib const & x) {
+    return horizontal_find_first(Vec32cb(x)) >> 2;  // index found in the Vec32cb view of x, divided by 4; a negative "not found" result relies on >> being an arithmetic shift
+}
+
+static inline int horizontal_find_first(Vec4qb const & x) {
+    return horizontal_find_first(Vec32cb(x)) >> 3;  // index found in the Vec32cb view of x, divided by 8; a negative "not found" result relies on >> being an arithmetic shift
+}
+
+// Count the number of elements that are true: sum of the counts of the lower and upper halves
+static inline uint32_t horizontal_count(Vec32cb const & x) {
+    return horizontal_count(x.get_low()) + horizontal_count(x.get_high());
+}
+
+static inline uint32_t horizontal_count(Vec16sb const & x) {
+    return horizontal_count(Vec32cb(x)) >> 1;  // count in the Vec32cb view of x, halved; presumably each true element shows up as 2 true bytes - confirm
+}
+
+static inline uint32_t horizontal_count(Vec8ib const & x) {
+    return horizontal_count(Vec32cb(x)) >> 2;  // count in the Vec32cb view of x, divided by 4; presumably each true element shows up as 4 true bytes - confirm
+}
+
+static inline uint32_t horizontal_count(Vec4qb const & x) {
+    return horizontal_count(Vec32cb(x)) >> 3;  // count in the Vec32cb view of x, divided by 8; presumably each true element shows up as 8 true bytes - confirm
+}
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield (lower half -> bits 0-15, upper half -> bits 16-31)
+static inline uint32_t to_bits(Vec32cb const & x) {
+    return to_bits(x.get_low()) | (uint32_t)to_bits(x.get_high()) << 16;
+}
+
+// to_Vec32cb: convert integer bitfield to boolean vector (bits 0-15 -> lower half, bits 16-31 -> upper half)
+static inline Vec32cb to_Vec32cb(uint32_t x) {
+    return Vec32c(to_Vec16cb(uint16_t(x)), to_Vec16cb(uint16_t(x>>16)));
+}
+
+// to_bits: convert boolean vector to integer bitfield (lower half -> bits 0-7, upper half -> bits 8-15)
+static inline uint16_t to_bits(Vec16sb const & x) {
+    return to_bits(x.get_low()) | (uint16_t)to_bits(x.get_high()) << 8;
+}
+
+// to_Vec16sb: convert integer bitfield to boolean vector (bits 0-7 -> lower half, bits 8-15 -> upper half)
+static inline Vec16sb to_Vec16sb(uint16_t x) {
+    return Vec16s(to_Vec8sb(uint8_t(x)), to_Vec8sb(uint8_t(x>>8)));
+}
+
+// to_bits: convert boolean vector to integer bitfield (lower half -> bits 0-3, upper half -> bits 4-7)
+static inline uint8_t to_bits(Vec8ib const & x) {
+    return to_bits(x.get_low()) | (uint8_t)to_bits(x.get_high()) << 4;
+}
+
+// to_Vec8ib: convert integer bitfield to boolean vector (x -> lower half, x>>4 -> upper half)
+static inline Vec8ib to_Vec8ib(uint8_t x) {
+    return Vec8i(to_Vec4ib(x), to_Vec4ib(x>>4));
+}
+
+// to_bits: convert boolean vector to integer bitfield (lower half -> bits 0-1, upper half -> bits 2-3)
+static inline uint8_t to_bits(Vec4qb const & x) {
+    return to_bits(x.get_low()) | to_bits(x.get_high()) << 2;
+}
+
+// to_Vec4qb: convert integer bitfield to boolean vector (x -> lower half, x>>2 -> upper half)
+static inline Vec4qb to_Vec4qb(uint8_t x) {
+    return Vec4q(to_Vec2qb(x), to_Vec2qb(x>>2));
+}
+
+#endif // VECTORI256_H
diff --git a/vectorclass/vectori512.h b/vectorclass/vectori512.h
new file mode 100755
index 0000000..ad8863f
--- /dev/null
+++ b/vectorclass/vectori512.h
@@ -0,0 +1,2733 @@
+/****************************  vectori512.h   *******************************
+* Author:        Agner Fog
+* Date created:  2014-07-23
+* Last modified: 2014-10-16
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file defining integer vector classes as interface to intrinsic 
+* functions in x86 microprocessors with AVX512 and later instruction sets.
+*
+* Instructions:
+* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired 
+* instruction set, which must be at least AVX512. 
+*
+* The following vector classes are defined here:
+* Vec16i    Vector of  16  32-bit signed   integers
+* Vec16ui   Vector of  16  32-bit unsigned integers
+* Vec16ib   Vector of  16  Booleans for use with Vec16i and Vec16ui
+* Vec8q     Vector of   8  64-bit signed   integers
+* Vec8uq    Vector of   8  64-bit unsigned integers
+* Vec8qb    Vector of   8  Booleans for use with Vec8q and Vec8uq
+*
+* Each vector object is represented internally in the CPU as a 512-bit register.
+* This header file defines operators and functions for these vectors.
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+
+// check combination of header files
+#if defined (VECTORI512_H)
+#if    VECTORI512_H != 2
+#error Two different versions of vectori512.h included
+#endif
+#else
+#define VECTORI512_H  2
+
+#ifdef VECTORF512_H
+#error Please put header file vectori512.h before vectorf512.h
+#endif
+
+
+#if INSTRSET < 9   // AVX512 required
+#error Wrong instruction set for vectori512.h, AVX512 required or use vectori512e.h
+#endif
+
+#include "vectori256.h"
+
+
+// Bug fix for missing intrinsics:
+// _mm512_cmpgt_epu32_mask, _mm512_cmpgt_epu64_mask
+// all typecast intrinsics
+// Fix expected in GCC version 4.9.2 but not seen yet https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61878
+
+// questionable
+// _mm512_mask_mov_epi32 check select(). Doc at https://software.intel.com/en-us/node/513888 is wrong. Bug report filed
+
+
+#if defined (GCC_VERSION) && GCC_VERSION < 41102 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+
+static inline __m512i _mm512_castsi256_si512(__m256i x) {
+    union {                 // replacement for the missing intrinsic: both types share one storage
+        __m512i wide;
+        __m256i narrow;
+    } pun;
+    pun.narrow = x;         // the remaining bytes of wide are left unset
+    return pun.wide;
+}
+
+static inline __m256i _mm512_castsi512_si256(__m512i x) {
+    union {                 // replacement for the missing intrinsic: both types share one storage
+        __m512i wide;
+        __m256i narrow;
+    } pun;
+    pun.wide = x;           // write the full vector, read back the part overlaid by narrow
+    return pun.narrow;
+}
+
+static inline __m512i _mm512_castsi128_si512(__m128i x) {
+    union {                 // replacement for the missing intrinsic: both types share one storage
+        __m128i narrow;
+        __m512i wide;
+    } pun;
+    pun.narrow = x;         // the remaining bytes of wide are left unset
+    return pun.wide;
+}
+
+static inline __m128i _mm512_castsi512_si128(__m512i x) {
+    union {                 // replacement for the missing intrinsic: both types share one storage
+        __m512i wide;
+        __m128i narrow;
+    } pun;
+    pun.wide = x;           // write the full vector, read back the part overlaid by narrow
+    return pun.narrow;
+}
+
+#endif
+
+
+/*****************************************************************************
+*
+*          Generate compile-time constant vector
+*
+*****************************************************************************/
+// Generate a constant vector of 16 32-bit integers stored in memory.
+// Can be converted to any integer vector type
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
+int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline __m512i constant16i() {
+    static const union {    // static: one constant object per set of template arguments
+        int32_t i[16];
+        __m512i zmm;
+    } u = {{i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15}};  // initializes the int32_t array; i0 is array element 0
+    return u.zmm;           // read the same storage back as a 512-bit vector
+}
+
+
+/*****************************************************************************
+*
+*          Boolean vector base classes for AVX512
+*
+*****************************************************************************/
+
+class Vec16b {
+protected:
+    __mmask16  m16; // Boolean vector: one mask bit per element, element 0 in bit 0
+public:
+    // Default constructor: m16 is left uninitialized
+    Vec16b () {
+    }
+    // Constructor to convert from type __mmask16 used in intrinsics:
+    Vec16b (__mmask16 x) {
+        m16 = x;
+    }
+    // Constructor to build from all elements (b0 goes to bit 0, b15 to bit 15):
+    Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, 
+    bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) {
+        m16 = uint16_t(b0 | b1<<1 | b2<<2 | b3<<3 | b4<<4 | b5<<5 | b6<<6 | b7<<7 |
+              b8<<8 | b9<<9 | b10<<10 | b11<<11 | b12<<12 | b13<<13 | b14<<14 | b15<<15);
+    }
+    // Constructor to broadcast single value:
+    Vec16b(bool b) {
+        m16 = __mmask16(-int16_t(b)); // true gives -1, i.e. all mask bits set; false gives 0
+    }
+private: // Prevent constructing from int, etc.
+    Vec16b(int b);
+public:
+    // Constructor to make from two halves
+    Vec16b (Vec8ib const & x0, Vec8ib const & x1) {
+        // = Vec16i(x0,x1) != 0;  (not defined yet)
+        __m512i z = _mm512_inserti64x4(_mm512_castsi256_si512(x0), x1, 1); // x0 in the lower 256 bits, x1 inserted at position 1 (upper 256 bits)
+        m16 = _mm512_cmpneq_epi32_mask(z, _mm512_setzero_epi32()); // mask bit set for every 32-bit element that is not zero
+    }        
+    // Assignment operator to convert from type __mmask16 used in intrinsics:
+    Vec16b & operator = (__mmask16 x) {
+        m16 = x;
+        return *this;
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec16b & operator = (bool b) {
+        m16 = Vec16b(b); // reuses the broadcast constructor, converted back through operator __mmask16
+        return *this;
+    }
+private: // Prevent assigning int because of ambiguity
+    Vec16b & operator = (int x);
+public:
+    // Type cast operator to convert to __mmask16 used in intrinsics
+    operator __mmask16() const {
+        return m16;
+    }
+    // split into two halves
+    Vec8ib get_low() const {
+        return to_Vec8ib((uint8_t)m16); // mask bits 0-7
+    }
+    Vec8ib get_high() const {
+        return to_Vec8ib((uint16_t)m16 >> 8); // mask bits 8-15
+    }
+    // Member function to change a single element in vector
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec16b const & insert(uint32_t index, bool value) {
+        m16 = __mmask16(((uint16_t)m16 & ~(1 << index)) | (int)value << index); // clear bit index, then OR in the new value
+        return *this;
+    }
+    // Member function extract a single element from vector
+    bool extract(uint32_t index) const {
+        return ((uint32_t)m16 >> index) & 1; // bit index of the mask
+    }
+    // Extract a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    static int size () { // number of elements
+        return 16;
+    }
+};
+
+// Define operators for this class
+
+// vector operator & : bitwise and of the two masks. operator && is the same operation (both operands are always evaluated)
+static inline Vec16b operator & (Vec16b a, Vec16b b) {
+    return _mm512_kand(a, b);
+}
+static inline Vec16b operator && (Vec16b a, Vec16b b) {
+    return a & b;
+}
+
+// vector operator | : bitwise or of the two masks. operator || is the same operation (both operands are always evaluated)
+static inline Vec16b operator | (Vec16b a, Vec16b b) {
+    return _mm512_kor(a, b);
+}
+static inline Vec16b operator || (Vec16b a, Vec16b b) {
+    return a | b;
+}
+
+// vector operator ^ : bitwise xor of the two masks
+static inline Vec16b operator ^ (Vec16b a, Vec16b b) {
+    return _mm512_kxor(a, b);
+}
+
+// vector operator ~ : bitwise not of the mask
+static inline Vec16b operator ~ (Vec16b a) {
+    return _mm512_knot(a);
+}
+
+// vector operator ! : element not; implemented as operator ~
+static inline Vec16b operator ! (Vec16b a) {
+    return ~a;
+}
+
+// vector operator &= : bitwise and, result stored in a
+static inline Vec16b & operator &= (Vec16b & a, Vec16b b) {
+    // the assignment expression yields a reference to a
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, result stored in a
+static inline Vec16b & operator |= (Vec16b & a, Vec16b b) {
+    // the assignment expression yields a reference to a
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec16b & operator ^= (Vec16b & a, Vec16b b) {
+    // the assignment expression yields a reference to a
+    return a = a ^ b;
+}
+
+
+/*****************************************************************************
+*
+*          Functions for boolean vectors
+*
+*****************************************************************************/
+
+// function andnot: a & ~ b (the intrinsic takes the operand to be inverted first, hence the swapped arguments)
+static inline Vec16b andnot (Vec16b a, Vec16b b) {
+    return _mm512_kandn(b, a);
+}
+
+// horizontal_and. Returns true if all 16 mask bits are 1. NOTE(review): an all-true Vec8b holds only 0xFF - confirm it has its own overload
+static inline bool horizontal_and (Vec16b const & a) {
+    return (uint16_t)(__mmask16)a == 0xFFFF;
+}
+
+// horizontal_or. Returns true if at least one of the 16 mask bits is 1
+static inline bool horizontal_or (Vec16b const & a) {
+    return (uint16_t)(__mmask16)a != 0;
+}
+
+
+/*****************************************************************************
+*
+*          Vec16ib: Vector of 16 Booleans for use with Vec16i and Vec16ui
+*
+*****************************************************************************/
+
+class Vec16ib : public Vec16b {
+public:
+    // Default constructor: the inherited mask m16 is left uninitialized
+    Vec16ib () {
+    }
+    Vec16ib (Vec16b x) { // convert from the base class (through its operator __mmask16)
+        m16 = x;
+    }
+    // Constructor to build from all elements (forwards to the Vec16b constructor):
+    Vec16ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
+        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) :
+        Vec16b(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {
+    }
+    // Constructor to convert from type __mmask16 used in intrinsics:
+    Vec16ib (__mmask16 x) {
+        m16 = x;
+    }
+    // Constructor to broadcast single value (forwards to the Vec16b constructor):
+    Vec16ib(bool b) : Vec16b(b) {}
+private: // Prevent constructing from int, etc.
+    Vec16ib(int b);
+public:
+    // Constructor to make from two halves
+    Vec16ib (Vec8ib const & x0, Vec8ib const & x1) {
+        m16 = Vec16b(x0, x1); // built by the Vec16b two-halves constructor, converted back to a mask
+    }
+    // Assignment operator to convert from type __mmask16 used in intrinsics:
+    Vec16ib & operator = (__mmask16 x) {
+        m16 = x;
+        return *this;
+    }
+    // Assignment operator to broadcast scalar value:
+    Vec16ib & operator = (bool b) {
+        m16 = Vec16b(b); // reuses the Vec16b broadcast constructor
+        return *this;
+    }
+private: // Prevent assigning int because of ambiguity
+    Vec16ib & operator = (int x);
+public:
+};
+
+// Define operators for Vec16ib
+
+// vector operator & : bitwise and, computed by the Vec16b operator. operator && is the same operation (both operands are always evaluated)
+static inline Vec16ib operator & (Vec16ib a, Vec16ib b) {
+    return Vec16b(a) & Vec16b(b);
+}
+static inline Vec16ib operator && (Vec16ib a, Vec16ib b) {
+    return a & b;
+}
+
+// vector operator | : bitwise or, computed by the Vec16b operator. operator || is the same operation (both operands are always evaluated)
+static inline Vec16ib operator | (Vec16ib a, Vec16ib b) {
+    return Vec16b(a) | Vec16b(b);
+}
+static inline Vec16ib operator || (Vec16ib a, Vec16ib b) {
+    return a | b;
+}
+
+// vector operator ^ : bitwise xor, computed by the Vec16b operator
+static inline Vec16ib operator ^ (Vec16ib a, Vec16ib b) {
+    return Vec16b(a) ^ Vec16b(b);
+}
+
+// vector operator ~ : bitwise not, computed by the Vec16b operator
+static inline Vec16ib operator ~ (Vec16ib a) {
+    return ~Vec16b(a);
+}
+
+// vector operator ! : element not; implemented as operator ~
+static inline Vec16ib operator ! (Vec16ib a) {
+    return ~a;
+}
+
+// vector operator &= : bitwise and, result stored in a
+static inline Vec16ib & operator &= (Vec16ib & a, Vec16ib b) {
+    // the assignment expression yields a reference to a
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, result stored in a
+static inline Vec16ib & operator |= (Vec16ib & a, Vec16ib b) {
+    // the assignment expression yields a reference to a
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec16ib & operator ^= (Vec16ib & a, Vec16ib b) {
+    // the assignment expression yields a reference to a
+    return a = a ^ b;
+}
+
+// vector function andnot: forwards to the Vec16b version of andnot with the same argument order
+static inline Vec16ib andnot (Vec16ib a, Vec16ib b) {
+    return Vec16ib(andnot(Vec16b(a), Vec16b(b)));
+}
+
+
+/*****************************************************************************
+*
+*          Vec8b: Base class vector of 8 Booleans
+*
+*****************************************************************************/
+
+class Vec8b : public Vec16b {
+public:
+    // Default constructor: the inherited mask m16 is left uninitialized
+    Vec8b () {
+    }
+    // Constructor to convert from type __mmask8 used in intrinsics:
+    Vec8b (__mmask8 x) {
+        m16 = x;
+    }
+    // Constructor to build from all elements (b0 goes to bit 0, b7 to bit 7):
+    Vec8b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) {
+        m16 = uint16_t(b0 | b1<<1 | b2<<2 | b3<<3 | b4<<4 | b5<<5 | b6<<6 | b7<<7);
+    }
+    Vec8b (Vec16b const & x) { // convert from a 16-element mask
+        m16 = __mmask8(x); // the value passes through __mmask8 before it is stored
+    }
+    // Constructor to broadcast single value:
+    Vec8b(bool b) {
+        m16 = __mmask8(-int8_t(b)); // true gives -1 converted to __mmask8; false gives 0
+    }
+    // Assignment operator to convert from type __mmask8 used in intrinsics:
+    Vec8b & operator = (__mmask8 x) {
+        m16 = x;
+        return *this;
+    }
+private: // Prevent constructing from int etc. because of ambiguity
+    Vec8b(int b);
+    Vec8b & operator = (int x);
+public:
+    // split into two halves
+    Vec4qb get_low() const {
+        return Vec4qb(Vec4q(_mm512_castsi512_si256(_mm512_maskz_set1_epi64(__mmask16(m16), -1LL)))); // -1 in each 64-bit element whose mask bit is set, 0 elsewhere; keep the lower 256 bits
+    }
+    Vec4qb get_high() const {
+        return Vec8b(__mmask8(m16 >> 4)).get_low(); // move bits 4-7 down to bits 0-3, then reuse get_low
+    }
+    static int size () { // number of elements
+        return 8;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec8qb: Vector of 8 Booleans for use with Vec8q and Vec8qu
+*
+*****************************************************************************/
+
+// Boolean vector with 8 elements, the result type of Vec8q comparisons.
+class Vec8qb : public Vec8b {
+public:
+    // Default constructor: nothing is assigned to m16 here
+    Vec8qb () {
+    }
+    // Construct from a Vec16b. The mask is copied without the __mmask8 cast
+    // that the Vec8b(Vec16b const &) constructor applies.
+    Vec8qb (Vec16b x) {
+        m16 = x;
+    }
+    // Construct from eight separate booleans (handled by Vec8b)
+    Vec8qb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) :
+        Vec8b(x0, x1, x2, x3, x4, x5, x6, x7) {
+    }
+    // Construct from an __mmask8 as used by the intrinsics
+    Vec8qb (__mmask8 x) : Vec8b(x) {
+    }
+    // Assign from an __mmask8 as used by the intrinsics
+    Vec8qb & operator = (__mmask8 x) {
+        m16 = x;
+        return *this;
+    }
+    // Broadcast a single boolean to all elements (handled by Vec8b)
+    Vec8qb(bool b) : Vec8b(b) {}
+    // Assign a single boolean to all elements
+    Vec8qb & operator = (bool b) {
+        Vec8b const all(b);
+        m16 = all;
+        return *this;
+    }
+private:
+    // Private and only declared here: prevents construction and assignment from int etc.
+    Vec8qb(int b);
+    Vec8qb & operator = (int x);
+public:
+    // Construct from two halves: x0 supplies elements 0..3, x1 elements 4..7.
+    // An element is true when its 64-bit lane is nonzero.
+    Vec8qb (Vec4qb const & x0, Vec4qb const & x1) {
+        __m512i const lower = _mm512_castsi256_si512(x0);
+        __m512i const both  = _mm512_inserti64x4(lower, x1, 1);
+        m16 = _mm512_cmpneq_epi64_mask(both, _mm512_setzero_si512());
+    }
+};
+
+// Define operators for Vec8qb
+
+// vector operator & : bitwise and, computed on the Vec16b representation
+static inline Vec8qb operator & (Vec8qb a, Vec8qb b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs & rhs;
+}
+static inline Vec8qb operator && (Vec8qb a, Vec8qb b) {
+    Vec8qb const both = a & b;   // same as operator &
+    return both;
+}
+
+// vector operator | : bitwise or, computed on the Vec16b representation
+static inline Vec8qb operator | (Vec8qb a, Vec8qb b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs | rhs;
+}
+static inline Vec8qb operator || (Vec8qb a, Vec8qb b) {
+    Vec8qb const either = a | b;   // same as operator |
+    return either;
+}
+
+// vector operator ^ : bitwise xor, computed on the Vec16b representation
+static inline Vec8qb operator ^ (Vec8qb a, Vec8qb b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs ^ rhs;
+}
+
+// vector operator ~ : bitwise not, computed on the Vec16b representation
+static inline Vec8qb operator ~ (Vec8qb a) {
+    Vec16b const bits(a);
+    return ~bits;
+}
+
+// vector operator ! : element not (implemented with operator ~)
+static inline Vec8qb operator ! (Vec8qb a) {
+    Vec8qb const inverted = ~a;
+    return inverted;
+}
+
+// vector operator &= : bitwise and, result stored back into a
+static inline Vec8qb & operator &= (Vec8qb & a, Vec8qb b) {
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, result stored back into a
+static inline Vec8qb & operator |= (Vec8qb & a, Vec8qb b) {
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, result stored back into a
+static inline Vec8qb & operator ^= (Vec8qb & a, Vec8qb b) {
+    return a = a ^ b;
+}
+
+// to_bits: convert to integer bitfield; only the low 8 mask bits are returned
+static inline uint32_t to_bits(Vec8qb a) {
+    __mmask16 const mask = (__mmask16)a;
+    return uint8_t(mask);
+}
+
+// vector function andnot: forwards to the Vec16b version of andnot
+static inline Vec8qb andnot (Vec8qb a, Vec8qb b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return Vec8qb(andnot(lhs, rhs));
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 512 1-bit unsigned integers (base class for Vec16i)
+*
+*****************************************************************************/
+// Generic 512-bit integer vector. Wraps one __m512i and provides load/store,
+// single-bit access and splitting into two Vec256b halves.
+class Vec512b {
+protected:
+    __m512i zmm; // Integer vector
+public:
+    // Default constructor: zmm is not initialized here
+    Vec512b() {
+    }
+    // Constructor to build from two Vec256b: a0 is the low half, a1 the high half
+    Vec512b(Vec256b const & a0, Vec256b const & a1) {
+        zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
+    }
+    // Constructor to convert from type __m512i used in intrinsics:
+    Vec512b(__m512i const & x) {
+        zmm = x;
+    }
+    // Assignment operator to convert from type __m512i used in intrinsics:
+    Vec512b & operator = (__m512i const & x) {
+        zmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m512i used in intrinsics
+    operator __m512i() const {
+        return zmm;
+    }
+    // Member function to load from array (unaligned)
+    Vec512b & load(void const * p) {
+        zmm = _mm512_loadu_si512(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 64
+    // You may use load_a instead of load if you are certain that p points to an address
+    // divisible by 64, but there is hardly any speed advantage of load_a on modern processors
+    Vec512b & load_a(void const * p) {
+        zmm = _mm512_load_si512(p);
+        return *this;
+    }
+    // Member function to store into array (unaligned)
+    void store(void * p) const {
+        _mm512_storeu_si512(p, zmm);
+    }
+    // Member function to store into array, aligned by 64
+    // You may use store_a instead of store if you are certain that p points to an address
+    // divisible by 64, but there is hardly any speed advantage of store_a on modern processors
+    void store_a(void * p) const {
+        _mm512_store_si512(p, zmm);
+    }
+    // Member function to change a single bit, mainly for test purposes
+    // index is taken modulo 512; only the lowest bit of value is used
+    // Note: This function is inefficient. Use load function if changing more than one bit
+    Vec512b const & set_bit(uint32_t index, int value) {
+        // Loading 8 qwords starting at m+8-wi places the single 1 in qword wi.
+        // The table is only read, so it is declared const.
+        static const uint64_t m[16] = {0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0};
+        int wi = (index >> 6) & 7;               // qword index
+        int bi = index & 0x3F;                   // bit index within qword wi
+
+        __m512i mask = Vec512b().load(m+8-wi);   // 1 in qword number wi
+        mask = _mm512_sll_epi64(mask,_mm_cvtsi32_si128(bi)); // mask with bit number bi set
+        if (value & 1) {
+            zmm = _mm512_or_si512(mask,zmm);     // set the bit
+        }
+        else {
+            zmm = _mm512_andnot_si512(mask,zmm); // clear the bit
+        }
+        return *this;
+    }
+    // Member function to get a single bit, mainly for test purposes
+    // index is taken modulo 512
+    // Note: This function is inefficient. Use store function if reading more than one bit
+    int get_bit(uint32_t index) const {
+        // Copy the vector out through store() rather than reading an inactive
+        // union member, which is not defined behavior in C++
+        uint8_t bytes[64];
+        store(bytes);
+        int wi = (index >> 3) & 0x3F;            // byte index
+        int bi = index & 7;                      // bit index within byte wi
+        return (bytes[wi] >> bi) & 1;
+    }
+    // Member functions to split into two Vec256b:
+    Vec256b get_low() const {
+        return _mm512_castsi512_si256(zmm);
+    }
+    Vec256b get_high() const {
+        return _mm512_extracti64x4_epi64(zmm,1);
+    }
+    // Number of bits
+    static int size () {
+        return 512;
+    }
+};
+
+
+// Define operators for this class
+
+// vector operator & : bitwise and of all 512 bits
+static inline Vec512b operator & (Vec512b const & a, Vec512b const & b) {
+    __m512i const bits = _mm512_and_epi32(a, b);
+    return bits;
+}
+static inline Vec512b operator && (Vec512b const & a, Vec512b const & b) {
+    Vec512b const both = a & b;   // same as operator &
+    return both;
+}
+
+// vector operator | : bitwise or of all 512 bits
+static inline Vec512b operator | (Vec512b const & a, Vec512b const & b) {
+    __m512i const bits = _mm512_or_epi32(a, b);
+    return bits;
+}
+static inline Vec512b operator || (Vec512b const & a, Vec512b const & b) {
+    Vec512b const either = a | b;   // same as operator |
+    return either;
+}
+
+// vector operator ^ : bitwise xor of all 512 bits
+static inline Vec512b operator ^ (Vec512b const & a, Vec512b const & b) {
+    __m512i const bits = _mm512_xor_epi32(a, b);
+    return bits;
+}
+
+// vector operator ~ : bitwise not, done as xor with an all-ones vector
+static inline Vec512b operator ~ (Vec512b const & a) {
+    __m512i const all_ones = _mm512_set1_epi32(-1);
+    return _mm512_xor_epi32(a, all_ones);
+}
+
+// vector operator &= : bitwise and, result stored back into a
+static inline Vec512b & operator &= (Vec512b & a, Vec512b const & b) {
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, result stored back into a
+static inline Vec512b & operator |= (Vec512b & a, Vec512b const & b) {
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, result stored back into a
+static inline Vec512b & operator ^= (Vec512b & a, Vec512b const & b) {
+    return a = a ^ b;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b
+// (the intrinsic inverts its first operand, hence the swapped argument order)
+static inline Vec512b andnot (Vec512b const & a, Vec512b const & b) {
+    __m512i const bits = _mm512_andnot_epi32(b, a);
+    return bits;
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 16 32-bit signed integers
+*
+*****************************************************************************/
+
+// Vector of 16 signed 32-bit integers, stored in the zmm member inherited
+// from Vec512b.
+class Vec16i: public Vec512b {
+private:
+    // Mask with the lowest n bits set, with n clamped to the range 0..16.
+    // The clamp and the unsigned shift keep the shift well defined for any n.
+    static __mmask16 partial_mask(int n) {
+        if (n <= 0) return __mmask16(0);
+        if (n >= 16) return __mmask16(0xFFFF);
+        return __mmask16((1u << n) - 1u);
+    }
+public:
+    // Default constructor: zmm is not initialized here
+    Vec16i() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec16i(int i) {
+        zmm = _mm512_set1_epi32(i);
+    }
+    // Constructor to build from all elements (i0 is element 0):
+    Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7,
+        int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15) {
+        zmm = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+    }
+    // Constructor to build from two Vec8i (a0 is the low half, a1 the high half):
+    Vec16i(Vec8i const & a0, Vec8i const & a1) {
+        zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
+    }
+    // Constructor to convert from type __m512i used in intrinsics:
+    Vec16i(__m512i const & x) {
+        zmm = x;
+    }
+    // Assignment operator to convert from type __m512i used in intrinsics:
+    Vec16i & operator = (__m512i const & x) {
+        zmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m512i used in intrinsics
+    operator __m512i() const {
+        return zmm;
+    }
+    // Member function to load from array (unaligned)
+    Vec16i & load(void const * p) {
+        zmm = _mm512_loadu_si512(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 64
+    Vec16i & load_a(void const * p) {
+        zmm = _mm512_load_si512(p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    // n below 0 is treated as 0, n above 16 as 16
+    Vec16i & load_partial(int n, void const * p) {
+        zmm = _mm512_maskz_loadu_epi32(partial_mask(n), p);
+        return *this;
+    }
+    // Partial store. Store n elements
+    // n below 0 is treated as 0, n above 16 as 16
+    void store_partial(int n, void * p) const {
+        _mm512_mask_storeu_epi32(p, partial_mask(n), zmm);
+    }
+    // cut off vector to n elements. The last 16-n elements are set to zero
+    // n below 0 is treated as 0, n above 16 as 16
+    Vec16i & cutoff(int n) {
+        zmm = _mm512_maskz_mov_epi32(partial_mask(n), zmm);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // An index of 16 or more selects no element and leaves the vector unchanged
+    Vec16i const & insert(uint32_t index, int32_t value) {
+        // Guard the shift: a count of 32 or more would be undefined
+        __mmask16 const lane = index < 16 ? __mmask16(1u << index) : __mmask16(0);
+        zmm = _mm512_mask_set1_epi32(zmm, lane, value);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // index is taken modulo 16
+    int32_t extract(uint32_t index) const {
+        int32_t a[16];
+        store(a);
+        return a[index & 15];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int32_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec8i:
+    Vec8i get_low() const {
+        return _mm512_castsi512_si256(zmm);
+    }
+    Vec8i get_high() const {
+        return _mm512_extracti64x4_epi64(zmm,1);
+    }
+    // Number of elements
+    static int size () {
+        return 16;
+    }
+};
+
+
+// Define operators for Vec16i
+
+// vector operator + : element-wise 32-bit addition
+static inline Vec16i operator + (Vec16i const & a, Vec16i const & b) {
+    __m512i const sum = _mm512_add_epi32(a, b);
+    return sum;
+}
+
+// vector operator += : add b to a in place
+static inline Vec16i & operator += (Vec16i & a, Vec16i const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1 to every element, return the previous value
+static inline Vec16i operator ++ (Vec16i & a, int) {
+    Vec16i const before(a);
+    a += 1;
+    return before;
+}
+
+// prefix operator ++ : add 1 to every element, return the updated vector
+static inline Vec16i & operator ++ (Vec16i & a) {
+    return a += 1;
+}
+
+// vector operator - : element-wise 32-bit subtraction
+static inline Vec16i operator - (Vec16i const & a, Vec16i const & b) {
+    __m512i const difference = _mm512_sub_epi32(a, b);
+    return difference;
+}
+
+// vector operator - : unary minus, computed as 0 - a
+static inline Vec16i operator - (Vec16i const & a) {
+    __m512i const zero = _mm512_setzero_epi32();
+    return _mm512_sub_epi32(zero, a);
+}
+
+// vector operator -= : subtract b from a in place
+static inline Vec16i & operator -= (Vec16i & a, Vec16i const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1 from every element, return the previous value
+static inline Vec16i operator -- (Vec16i & a, int) {
+    Vec16i const before(a);
+    a -= 1;
+    return before;
+}
+
+// prefix operator -- : subtract 1 from every element, return the updated vector
+static inline Vec16i & operator -- (Vec16i & a) {
+    return a -= 1;
+}
+
+// vector operator * : element-wise multiply, keeping the low 32 bits of each product
+static inline Vec16i operator * (Vec16i const & a, Vec16i const & b) {
+    __m512i const product = _mm512_mullo_epi32(a, b);
+    return product;
+}
+
+// vector operator *= : multiply a by b in place
+static inline Vec16i & operator *= (Vec16i & a, Vec16i const & b) {
+    return a = a * b;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+
+// vector operator << : shift all elements left by the same count
+static inline Vec16i operator << (Vec16i const & a, int32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);
+    return _mm512_sll_epi32(a, count);
+}
+
+// vector operator <<= : shift left in place
+static inline Vec16i & operator <<= (Vec16i & a, int32_t b) {
+    return a = a << b;
+}
+
+// vector operator >> : arithmetic (sign-preserving) right shift of all elements
+static inline Vec16i operator >> (Vec16i const & a, int32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);
+    return _mm512_sra_epi32(a, count);
+}
+
+// vector operator >>= : arithmetic right shift in place
+static inline Vec16i & operator >>= (Vec16i & a, int32_t b) {
+    return a = a >> b;
+}
+
+// vector operator == : mask of the elements for which a == b
+static inline Vec16ib operator == (Vec16i const & a, Vec16i const & b) {
+    __mmask16 const equal = _mm512_cmpeq_epi32_mask(a, b);
+    return equal;
+}
+
+// vector operator != : mask of the elements for which a != b
+static inline Vec16ib operator != (Vec16i const & a, Vec16i const & b) {
+    __mmask16 const differ = _mm512_cmpneq_epi32_mask(a, b);
+    return differ;
+}
+  
+// vector operator > : mask of the elements for which a > b (signed)
+static inline Vec16ib operator > (Vec16i const & a, Vec16i const & b) {
+    __mmask16 const greater = _mm512_cmpgt_epi32_mask(a, b);
+    return greater;
+}
+
+// vector operator < : mask of the elements for which a < b (signed),
+// evaluated as b > a
+static inline Vec16ib operator < (Vec16i const & a, Vec16i const & b) {
+    Vec16ib const less = b > a;
+    return less;
+}
+
+// vector operator >= : mask of the elements for which a >= b (signed)
+static inline Vec16ib operator >= (Vec16i const & a, Vec16i const & b) {
+    __mmask16 const not_less = _mm512_cmpge_epi32_mask(a, b);
+    return not_less;
+}
+
+// vector operator <= : mask of the elements for which a <= b (signed),
+// evaluated as b >= a
+static inline Vec16ib operator <= (Vec16i const & a, Vec16i const & b) {
+    Vec16ib const not_greater = b >= a;
+    return not_greater;
+}
+
+// vector operator & : bitwise and
+static inline Vec16i operator & (Vec16i const & a, Vec16i const & b) {
+    __m512i const bits = _mm512_and_epi32(a, b);
+    return bits;
+}
+
+// vector operator &= : bitwise and, result stored back into a
+static inline Vec16i & operator &= (Vec16i & a, Vec16i const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or
+static inline Vec16i operator | (Vec16i const & a, Vec16i const & b) {
+    __m512i const bits = _mm512_or_epi32(a, b);
+    return bits;
+}
+
+// vector operator |= : bitwise or, result stored back into a
+static inline Vec16i & operator |= (Vec16i & a, Vec16i const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor
+static inline Vec16i operator ^ (Vec16i const & a, Vec16i const & b) {
+    __m512i const bits = _mm512_xor_epi32(a, b);
+    return bits;
+}
+
+// vector operator ^= : bitwise xor, result stored back into a
+static inline Vec16i & operator ^= (Vec16i & a, Vec16i const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not, done as xor with all-ones elements
+static inline Vec16i operator ~ (Vec16i const & a) {
+    Vec16i const all_ones(-1);
+    return a ^ all_ones;
+}
+
+// Functions for this class
+
+// Element-wise choice between two vectors, equivalent to:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// A masked move is used; _mm512_mask_blend_epi32(s, b, a) would be the
+// alternative, but the conditional move may be optimized better by the compiler.
+static inline Vec16i select (Vec16ib const & s, Vec16i const & a, Vec16i const & b) {
+    __m512i const chosen = _mm512_mask_mov_epi32(b, s, a);
+    return chosen;
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+static inline Vec16i if_add (Vec16ib const & f, Vec16i const & a, Vec16i const & b) {
+    __m512i const sum = _mm512_mask_add_epi32(a, f, a, b);
+    return sum;
+}
+
+// Horizontal add: sum of all 16 elements. Overflow will wrap around.
+// With the Intel compiler the reduce intrinsic is used; otherwise the two
+// halves are added and the Vec8i version finishes the job.
+static inline int32_t horizontal_add (Vec16i const & a) {
+#if defined(__INTEL_COMPILER)
+    return _mm512_reduce_add_epi32(a);
+#else
+    Vec8i const halves = a.get_low() + a.get_high();
+    return horizontal_add(halves);
+#endif
+}
+
+// function add_saturated: add element by element, signed with saturation
+// (is it faster to up-convert to 64 bit integers, and then downconvert the sum with saturation?)
+static inline Vec16i add_saturated(Vec16i const & a, Vec16i const & b) {
+    __m512i const total      = _mm512_add_epi32(a, b);                 // wrapped a + b
+    __m512i const sign_ab    = _mm512_xor_epi32(a, b);                 // sign bit set if a and b differ in sign
+    __m512i const sign_at    = _mm512_xor_epi32(a, total);             // sign bit set if a and total differ in sign
+    __m512i const ovf_sign   = _mm512_andnot_epi32(sign_ab, sign_at);  // sign bit set if total has the wrong sign
+    __m512i const ovf_all    = _mm512_srai_epi32(ovf_sign, 31);        // -1 if overflow, else 0
+    __mmask16 const ovf_mask = _mm512_cmpneq_epi32_mask(ovf_all, _mm512_setzero_epi32()); // same, as mask
+    __m512i const a_negative = _mm512_srli_epi32(a, 31);               // 1 if a < 0
+    __m512i const int_max    = _mm512_srli_epi32(ovf_all, 1);          // 7FFFFFFF if overflow
+    __m512i const limit      = _mm512_add_epi32(int_max, a_negative);  // 7FFFFFFF if positive overflow, 80000000 if negative
+    return _mm512_mask_blend_epi32(ovf_mask, total, limit);            // total if no overflow, else limit
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec16i sub_saturated(Vec16i const & a, Vec16i const & b) {
+    __m512i const difference = _mm512_sub_epi32(a, b);                 // wrapped a - b
+    __m512i const sign_ab    = _mm512_xor_si512(a, b);                 // sign bit set if a and b differ in sign
+    __m512i const sign_ad    = _mm512_xor_si512(a, difference);        // sign bit set if a and difference differ in sign
+    __m512i const ovf_sign   = _mm512_and_si512(sign_ab, sign_ad);     // sign bit set if difference has the wrong sign
+    __m512i const ovf_all    = _mm512_srai_epi32(ovf_sign, 31);        // -1 if overflow, else 0
+    __mmask16 const ovf_mask = _mm512_cmpneq_epi32_mask(ovf_all, _mm512_setzero_epi32()); // same, as mask
+    __m512i const a_negative = _mm512_srli_epi32(a, 31);               // 1 if a < 0
+    __m512i const int_max    = _mm512_srli_epi32(ovf_all, 1);          // 7FFFFFFF if overflow
+    __m512i const limit      = _mm512_add_epi32(int_max, a_negative);  // 7FFFFFFF if positive overflow, 80000000 if negative
+    return _mm512_mask_blend_epi32(ovf_mask, difference, limit);       // difference if no overflow, else limit
+}
+
+// function max: a > b ? a : b (signed, per element)
+static inline Vec16i max(Vec16i const & a, Vec16i const & b) {
+    __m512i const larger = _mm512_max_epi32(a,b);
+    return larger;
+}
+
+// function min: a < b ? a : b (signed, per element)
+static inline Vec16i min(Vec16i const & a, Vec16i const & b) {
+    __m512i const smaller = _mm512_min_epi32(a,b);
+    return smaller;
+}
+
+// function abs: a >= 0 ? a : -a (per element)
+static inline Vec16i abs(Vec16i const & a) {
+    __m512i const magnitude = _mm512_abs_epi32(a);
+    return magnitude;
+}
+
+// function abs_saturated: same as abs, saturate if overflow.
+// The unsigned minimum with 0x7FFFFFFF caps the result of abs at 0x7FFFFFFF.
+static inline Vec16i abs_saturated(Vec16i const & a) {
+    Vec16i const magnitude = abs(a);
+    Vec16i const limit(0x7FFFFFFF);
+    return _mm512_min_epu32(magnitude, limit);
+}
+
+// function rotate_left: rotate every element left by the same count b
+// Use negative count to rotate right
+static inline Vec16i rotate_left(Vec16i const & a, int b) {
+    Vec16i const counts(b);   // same count in every element
+    return _mm512_rolv_epi32(a, counts);
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 16 32-bit unsigned integers
+*
+*****************************************************************************/
+
+
+// Vector of 16 unsigned 32-bit integers. Storage and the members not
+// redefined here come from Vec16i.
+class Vec16ui : public Vec16i {
+public:
+    // Default constructor: zmm is not initialized here
+    Vec16ui() {
+    }
+    // Broadcast one value into all 16 elements
+    Vec16ui(uint32_t i) {
+        zmm = _mm512_set1_epi32(i);
+    }
+    // Build from 16 separate values (i0 is element 0)
+    Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7,
+        uint32_t i8, uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15) {
+        zmm = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+    }
+    // Build from two Vec8ui, going through the signed Vec8i/Vec16i types
+    Vec16ui(Vec8ui const & a0, Vec8ui const & a1) {
+        Vec16i const joined = Vec16i(Vec8i(a0), Vec8i(a1));
+        zmm = joined;
+    }
+    // Convert from the __m512i type used in intrinsics
+    Vec16ui(__m512i const & x) {
+        zmm = x;
+    }
+    // Assign from the __m512i type used in intrinsics
+    Vec16ui & operator = (__m512i const & x) {
+        zmm = x;
+        return *this;
+    }
+    // Load from array (unaligned)
+    Vec16ui & load(void const * p) {
+        Vec16i & base = *this;
+        base.load(p);
+        return *this;
+    }
+    // Load from array, aligned by 64
+    Vec16ui & load_a(void const * p) {
+        Vec16i & base = *this;
+        base.load_a(p);
+        return *this;
+    }
+    // Change a single element in the vector (forwards to Vec16i::insert)
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec16ui const & insert(uint32_t index, uint32_t value) {
+        Vec16i & base = *this;
+        base.insert(index, value);
+        return *this;
+    }
+    // Extract a single element (forwards to Vec16i::extract)
+    uint32_t extract(uint32_t index) const {
+        int32_t const raw = Vec16i::extract(index);
+        return raw;
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint32_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Split into two Vec8ui halves
+    Vec8ui get_low() const {
+        Vec8i const half = Vec16i::get_low();
+        return Vec8ui(half);
+    }
+    Vec8ui get_high() const {
+        Vec8i const half = Vec16i::get_high();
+        return Vec8ui(half);
+    }
+};
+
+// Define operators for this class
+
+// vector operator + : add, performed on the signed representation
+static inline Vec16ui operator + (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const sum = Vec16i(a) + Vec16i(b);
+    return Vec16ui(sum);
+}
+
+// vector operator - : subtract, performed on the signed representation
+static inline Vec16ui operator - (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const difference = Vec16i(a) - Vec16i(b);
+    return Vec16ui(difference);
+}
+
+// vector operator * : multiply, performed on the signed representation
+static inline Vec16ui operator * (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const product = Vec16i(a) * Vec16i(b);
+    return Vec16ui(product);
+}
+
+// vector operator / : divide
+// See bottom of file
+
+// vector operator >> : logical right shift of all elements (unsigned count)
+static inline Vec16ui operator >> (Vec16ui const & a, uint32_t b) {
+    __m128i const count = _mm_cvtsi32_si128(b);
+    return _mm512_srl_epi32(a, count);
+}
+
+// vector operator >> : logical right shift of all elements (signed count,
+// forwarded to the uint32_t overload)
+static inline Vec16ui operator >> (Vec16ui const & a, int32_t b) {
+    uint32_t const count = uint32_t(b);
+    return a >> count;
+}
+
+// vector operator >>= : logical right shift in place (unsigned count)
+static inline Vec16ui & operator >>= (Vec16ui & a, uint32_t b) {
+    return a = a >> b;
+}
+
+// vector operator >>= : logical right shift in place (signed count)
+static inline Vec16ui & operator >>= (Vec16ui & a, int32_t b) {
+    return a = a >> uint32_t(b);
+}
+
+// vector operator << : shift left all elements (unsigned count), using the Vec16i shift
+static inline Vec16ui operator << (Vec16ui const & a, uint32_t b) {
+    Vec16i const shifted = Vec16i(a) << int32_t(b);
+    return Vec16ui(shifted);
+}
+
+// vector operator << : shift left all elements (signed count), using the Vec16i shift
+static inline Vec16ui operator << (Vec16ui const & a, int32_t b) {
+    Vec16i const shifted = Vec16i(a) << int32_t(b);
+    return Vec16ui(shifted);
+}
+
+// vector operator < : mask of the elements for which a < b (unsigned)
+static inline Vec16ib operator < (Vec16ui const & a, Vec16ui const & b) {
+    __mmask16 const less = _mm512_cmplt_epu32_mask(a, b);
+    return less;
+}
+
+// vector operator > : mask of the elements for which a > b (unsigned),
+// evaluated as b < a
+static inline Vec16ib operator > (Vec16ui const & a, Vec16ui const & b) {
+    Vec16ib const greater = b < a;
+    return greater;
+}
+
+
+// vector operator >= : mask of the elements for which a >= b (unsigned)
+static inline Vec16ib operator >= (Vec16ui const & a, Vec16ui const & b) {
+    __mmask16 const not_less = _mm512_cmpge_epu32_mask(a, b);
+    return not_less;
+}
+
+// vector operator <= : mask of the elements for which a <= b (unsigned),
+// evaluated as b >= a
+static inline Vec16ib operator <= (Vec16ui const & a, Vec16ui const & b) {
+    Vec16ib const not_greater = b >= a;
+    return not_greater;
+}
+
+// vector operator & : bitwise and, using the Vec16i operator
+static inline Vec16ui operator & (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const bits = Vec16i(a) & Vec16i(b);
+    return Vec16ui(bits);
+}
+
+// vector operator | : bitwise or, using the Vec16i operator
+static inline Vec16ui operator | (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const bits = Vec16i(a) | Vec16i(b);
+    return Vec16ui(bits);
+}
+
+// vector operator ^ : bitwise xor, using the Vec16i operator
+static inline Vec16ui operator ^ (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const bits = Vec16i(a) ^ Vec16i(b);
+    return Vec16ui(bits);
+}
+
+// vector operator ~ : bitwise not, using the Vec16i operator
+static inline Vec16ui operator ~ (Vec16ui const & a) {
+    Vec16i const inverted = ~ Vec16i(a);
+    return Vec16ui(inverted);
+}
+
+// Functions for this class
+
+// Element-wise choice between two vectors, equivalent to:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+// Forwards to the Vec16i version of select.
+static inline Vec16ui select (Vec16ib const & s, Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const chosen = select(s, Vec16i(a), Vec16i(b));
+    return Vec16ui(chosen);
+}
+
+// Conditional add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+// Forwards to the Vec16i version of if_add.
+static inline Vec16ui if_add (Vec16ib const & f, Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const sum = if_add(f, Vec16i(a), Vec16i(b));
+    return Vec16ui(sum);
+}
+
+// Horizontal add: sum of all 16 elements, computed by the Vec16i version.
+// Overflow will wrap around
+static inline uint32_t horizontal_add (Vec16ui const & a) {
+    Vec16i const as_signed(a);
+    int32_t const sum = horizontal_add(as_signed);
+    return sum;
+}
+
+// horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file
+
+// function add_saturated: add element by element, unsigned with saturation
+static inline Vec16ui add_saturated(Vec16ui const & a, Vec16ui const & b) {
+    Vec16ui const total   = a + b;                        // wrapped sum
+    Vec16ib const wrapped = total < (a | b);              // overflow if (a + b) < (a | b)
+    return _mm512_mask_set1_epi32(total, wrapped, -1);    // 0xFFFFFFFF where overflow, else total
+}
+
+// function sub_saturated: subtract element by element, unsigned with saturation
+static inline Vec16ui sub_saturated(Vec16ui const & a, Vec16ui const & b) {
+    Vec16ui const difference = a - b;                     // wrapped difference
+    Vec16ib const no_borrow  = difference <= a;           // false where the subtraction underflowed
+    return _mm512_maskz_mov_epi32(no_borrow, difference); // zero where underflow, else difference
+}
+
+// function max: a > b ? a : b (unsigned, per element)
+static inline Vec16ui max(Vec16ui const & a, Vec16ui const & b) {
+    __m512i const larger = _mm512_max_epu32(a,b);
+    return larger;
+}
+
+// function min: a < b ? a : b (unsigned, per element)
+static inline Vec16ui min(Vec16ui const & a, Vec16ui const & b) {
+    __m512i const smaller = _mm512_min_epu32(a,b);
+    return smaller;
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 8 64-bit signed integers
+*
+*****************************************************************************/
+
+// Vector of 8 signed 64-bit integers, stored in the zmm member inherited
+// from Vec512b.
+class Vec8q : public Vec512b {
+private:
+    // Mask with the lowest n bits set, with n clamped to the range 0..8.
+    // The clamp and the unsigned shift keep the shift well defined for any n.
+    static __mmask8 partial_mask(int n) {
+        if (n <= 0) return __mmask8(0);
+        if (n >= 8) return __mmask8(0xFF);
+        return __mmask8((1u << n) - 1u);
+    }
+public:
+    // Default constructor: zmm is not initialized here
+    Vec8q() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec8q(int64_t i) {
+        zmm = _mm512_set1_epi64(i);
+    }
+    // Constructor to build from all elements (i0 is element 0):
+    Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7) {
+        zmm = _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
+    }
+    // Constructor to build from two Vec4q (a0 is the low half, a1 the high half):
+    Vec8q(Vec4q const & a0, Vec4q const & a1) {
+        zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
+    }
+    // Constructor to convert from type __m512i used in intrinsics:
+    Vec8q(__m512i const & x) {
+        zmm = x;
+    }
+    // Assignment operator to convert from type __m512i used in intrinsics:
+    Vec8q & operator = (__m512i const & x) {
+        zmm = x;
+        return *this;
+    }
+    // Type cast operator to convert to __m512i used in intrinsics
+    operator __m512i() const {
+        return zmm;
+    }
+    // Member function to load from array (unaligned)
+    Vec8q & load(void const * p) {
+        zmm = _mm512_loadu_si512(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 64
+    Vec8q & load_a(void const * p) {
+        zmm = _mm512_load_si512(p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    // n below 0 is treated as 0, n above 8 as 8
+    Vec8q & load_partial(int n, void const * p) {
+        zmm = _mm512_maskz_loadu_epi64(partial_mask(n), p);
+        return *this;
+    }
+    // Partial store. Store n elements
+    // n below 0 is treated as 0, n above 8 as 8
+    void store_partial(int n, void * p) const {
+        _mm512_mask_storeu_epi64(p, partial_mask(n), zmm);
+    }
+    // cut off vector to n elements. The last 8-n elements are set to zero
+    // n below 0 is treated as 0, n above 8 as 8
+    Vec8q & cutoff(int n) {
+        zmm = _mm512_maskz_mov_epi64(partial_mask(n), zmm);
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // An index of 8 or more selects no element and leaves the vector unchanged
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8q const & insert(uint32_t index, int64_t value) {
+        // Guard the shift: a count of 32 or more would be undefined
+        __mmask8 const lane = index < 8 ? __mmask8(1u << index) : __mmask8(0);
+        zmm = _mm512_mask_set1_epi64(zmm, lane, value);
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // index is taken modulo 8
+    int64_t extract(uint32_t index) const {
+        int64_t a[8];
+        store (a);
+        return a[index & 7];
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int64_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec4q:
+    Vec4q get_low() const {
+        return _mm512_castsi512_si256(zmm);
+    }
+    Vec4q get_high() const {
+        return _mm512_extracti64x4_epi64(zmm,1);
+    }
+    // Number of elements
+    static int size () {
+        return 8;
+    }
+};
+
+
+// Define operators for Vec8q
+
+// vector operator + : element-wise 64-bit addition
+static inline Vec8q operator + (Vec8q const & a, Vec8q const & b) {
+    __m512i const sum = _mm512_add_epi64(a, b);
+    return sum;
+}
+
+// vector operator += : add b to a in place
+static inline Vec8q & operator += (Vec8q & a, Vec8q const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1 to every element, return the previous value
+static inline Vec8q operator ++ (Vec8q & a, int) {
+    Vec8q const before(a);
+    a += 1;
+    return before;
+}
+
+// prefix operator ++ : add 1 to every element, return the updated vector
+static inline Vec8q & operator ++ (Vec8q & a) {
+    return a += 1;
+}
+
+// vector operator - : element-wise 64-bit subtraction
+static inline Vec8q operator - (Vec8q const & a, Vec8q const & b) {
+    __m512i const difference = _mm512_sub_epi64(a, b);
+    return difference;
+}
+
+// vector operator - : unary minus, computed as 0 - a
+static inline Vec8q operator - (Vec8q const & a) {
+    __m512i const zero = _mm512_setzero_epi32();
+    return _mm512_sub_epi64(zero, a);
+}
+
+// vector operator -= : subtract b from a in place
+static inline Vec8q & operator -= (Vec8q & a, Vec8q const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1 from every element, return the previous value
+static inline Vec8q operator -- (Vec8q & a, int) {
+    Vec8q const before(a);
+    a -= 1;
+    return before;
+}
+
+// prefix operator -- : subtract 1 from every element, return the updated vector
+static inline Vec8q & operator -- (Vec8q & a) {
+    return a -= 1;
+}
+
+// vector operator * : multiply element by element
+// When the GCC_VERSION check below holds, the product is assembled from the
+// two Vec4q halves because _mm512_mullox_epi64 is missing there.
+static inline Vec8q operator * (Vec8q const & a, Vec8q const & b) {
+#if defined (GCC_VERSION) && GCC_VERSION < 41100 && !defined(__INTEL_COMPILER) && !defined(__clang__)
+    Vec4q const low_product  = a.get_low() * b.get_low();
+    Vec4q const high_product = a.get_high() * b.get_high();
+    return Vec8q(low_product, high_product);
+#else
+    __m512i const product = _mm512_mullox_epi64(a, b);
+    return product;
+#endif
+}
+
+// vector operator *= : multiply a by b in place
+static inline Vec8q & operator *= (Vec8q & a, Vec8q const & b) {
+    return a = a * b;
+}
+
+// operator << : shift every element left by the same count b
+static inline Vec8q operator << (Vec8q const & a, int32_t b) {
+    return Vec8q(_mm512_sll_epi64(a, _mm_cvtsi32_si128(b)));
+}
+
+// operator <<= : shift a left by b in place and return a reference to a
+static inline Vec8q & operator <<= (Vec8q & a, int32_t b) {
+    Vec8q const shifted = a << b;
+    return a = shifted;
+}
+
+// operator >> : arithmetic (sign-propagating) right shift of every element by the same count b
+static inline Vec8q operator >> (Vec8q const & a, int32_t b) {
+    return Vec8q(_mm512_sra_epi64(a, _mm_cvtsi32_si128(b)));
+}
+
+// operator >>= : arithmetic right shift of a by b in place; returns a reference to a
+static inline Vec8q & operator >>= (Vec8q & a, int32_t b) {
+    Vec8q const shifted = a >> b;
+    return a = shifted;
+}
+
+// vector operator == : element-wise equality test; the mask produced by _mm512_cmpeq_epi64_mask is returned as a Vec8qb
+static inline Vec8qb operator == (Vec8q const & a, Vec8q const & b) {
+    return _mm512_cmpeq_epi64_mask(a, b);
+}
+
+// vector operator != : element-wise inequality test; the mask produced by _mm512_cmpneq_epi64_mask is returned as a Vec8qb
+static inline Vec8qb operator != (Vec8q const & a, Vec8q const & b) {
+    return _mm512_cmpneq_epi64_mask(a, b);
+}
+  
+// vector operator < : returns true for elements for which a < b (signed compare, _mm512_cmplt_epi64_mask)
+static inline Vec8qb operator < (Vec8q const & a, Vec8q const & b) {
+    return _mm512_cmplt_epi64_mask(a, b);
+}
+
+// vector operator > : returns true for elements for which a > b; implemented by swapping the operands of operator <
+static inline Vec8qb operator > (Vec8q const & a, Vec8q const & b) {
+    return b < a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (signed compare, _mm512_cmpge_epi64_mask)
+static inline Vec8qb operator >= (Vec8q const & a, Vec8q const & b) {
+    return _mm512_cmpge_epi64_mask(a, b);
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed); implemented by swapping the operands of operator >=
+static inline Vec8qb operator <= (Vec8q const & a, Vec8q const & b) {
+    return b >= a;
+}
+
+// operator & : bitwise AND of the two 512-bit vectors
+static inline Vec8q operator & (Vec8q const & a, Vec8q const & b) {
+    return Vec8q(_mm512_and_epi32(a, b));
+}
+
+// operator &= : AND b into a in place and return a reference to a
+static inline Vec8q & operator &= (Vec8q & a, Vec8q const & b) {
+    Vec8q const conjunction = a & b;
+    return a = conjunction;
+}
+
+// operator | : bitwise OR of the two 512-bit vectors
+static inline Vec8q operator | (Vec8q const & a, Vec8q const & b) {
+    return Vec8q(_mm512_or_epi32(a, b));
+}
+
+// operator |= : OR b into a in place and return a reference to a
+static inline Vec8q & operator |= (Vec8q & a, Vec8q const & b) {
+    Vec8q const disjunction = a | b;
+    return a = disjunction;
+}
+
+// operator ^ : bitwise exclusive OR of the two 512-bit vectors
+static inline Vec8q operator ^ (Vec8q const & a, Vec8q const & b) {
+    return Vec8q(_mm512_xor_epi32(a, b));
+}
+// operator ^= : XOR b into a in place and return a reference to a
+static inline Vec8q & operator ^= (Vec8q & a, Vec8q const & b) {
+    Vec8q const toggled = a ^ b;
+    return a = toggled;
+}
+
+// vector operator ~ : bitwise not, obtained by converting a to Vec16i, applying that class's operator ~, and converting back
+static inline Vec8q operator ~ (Vec8q const & a) {
+    return Vec8q(~ Vec16i(a));
+}
+
+// Functions for this class
+
+// Per-element choice between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8q select (Vec8qb const & s, Vec8q const & a, Vec8q const & b) {
+    return Vec8q(_mm512_mask_mov_epi64(b, s, a));
+    // alternative formulation: _mm512_mask_blend_epi64(s, b, a)
+}
+
+// Masked add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+static inline Vec8q if_add (Vec8qb const & f, Vec8q const & a, Vec8q const & b) {
+    return Vec8q(_mm512_mask_add_epi64(a, f, a, b));
+}
+
+// Sum of all eight elements, returned as a scalar.
+// Overflow wraps around. Without the Intel compiler the two 256-bit halves are added and reduced.
+static inline int64_t horizontal_add (Vec8q const & a) {
+#if !defined(__INTEL_COMPILER)
+    return horizontal_add(a.get_low()+a.get_high());
+#else
+    return _mm512_reduce_add_epi64(a);
+#endif
+}
+
+// Widening sum of the sixteen signed 32-bit elements of x.
+// Each half is sign extended to 64 bits before adding, so the 32-bit sum cannot overflow.
+static inline int64_t horizontal_add_x (Vec16i const & x) {
+    Vec8q lo = _mm512_cvtepi32_epi64(x.get_low());
+    Vec8q hi = _mm512_cvtepi32_epi64(x.get_high());
+    return horizontal_add(lo+hi);
+}
+
+// Widening sum of the sixteen unsigned 32-bit elements of x.
+// Each half is zero extended to 64 bits before adding, so the 32-bit sum cannot overflow.
+static inline uint64_t horizontal_add_x (Vec16ui const & x) {
+    Vec8q lo = _mm512_cvtepu32_epi64(x.get_low());
+    Vec8q hi = _mm512_cvtepu32_epi64(x.get_high());
+    return horizontal_add(lo+hi);
+}
+
+// max: element-wise signed maximum, a > b ? a : b
+static inline Vec8q max(Vec8q const & a, Vec8q const & b) {
+    return Vec8q(_mm512_max_epi64(a, b));
+}
+
+// min: element-wise signed minimum, a < b ? a : b
+static inline Vec8q min(Vec8q const & a, Vec8q const & b) {
+    return Vec8q(_mm512_min_epi64(a, b));
+}
+
+// abs: element-wise absolute value, a >= 0 ? a : -a
+static inline Vec8q abs(Vec8q const & a) {
+    return Vec8q(_mm512_abs_epi64(a));
+}
+
+// abs_saturated: like abs, but the result is capped at 0x7FFFFFFFFFFFFFFF by an unsigned minimum
+static inline Vec8q abs_saturated(Vec8q const & a) {
+    return Vec8q(_mm512_min_epu64(abs(a), Vec8q(0x7FFFFFFFFFFFFFFF)));
+}
+
+// rotate_left: rotate the bits of every element left by the same count b
+// A negative count rotates right
+static inline Vec8q rotate_left(Vec8q const & a, int b) {
+    return Vec8q(_mm512_rolv_epi64(a, Vec8q(b)));
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 8 64-bit unsigned integers
+*
+*****************************************************************************/
+
+class Vec8uq : public Vec8q {   // unsigned counterpart of Vec8q: members below forward to Vec8q or assign its zmm member
+public:
+    // Default constructor (empty body):
+    Vec8uq() {
+    }
+    // Constructor to broadcast the same value into all elements (forwards to Vec8q's broadcast constructor):
+    Vec8uq(uint64_t i) {
+        zmm = Vec8q(i);
+    }
+    // Constructor to convert from Vec8q (copies the vector register unchanged):
+    Vec8uq(Vec8q const & x) {
+        zmm = x;
+    }
+    // Constructor to convert from type __m512i used in intrinsics:
+    Vec8uq(__m512i const & x) {
+        zmm = x;
+    }
+    // Constructor to build from all elements, i0 first (forwards to the matching Vec8q constructor):
+    Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7) {
+        zmm = Vec8q(i0, i1, i2, i3, i4, i5, i6, i7);
+    }
+    // Constructor to build from two Vec4uq (a0 and a1 are passed on in that order to Vec8q's two-half constructor):
+    Vec8uq(Vec4uq const & a0, Vec4uq const & a1) {
+        zmm = Vec8q(Vec4q(a0), Vec4q(a1));
+    }
+    // Assignment operator to convert from Vec8q:
+    Vec8uq  & operator = (Vec8q const & x) {
+        zmm = x;
+        return *this;
+    }
+    // Assignment operator to convert from type __m512i used in intrinsics:
+    Vec8uq & operator = (__m512i const & x) {
+        zmm = x;
+        return *this;
+    }
+    // Member function to load from array (unaligned); forwards to Vec8q::load and returns *this
+    Vec8uq & load(void const * p) {
+        Vec8q::load(p);
+        return *this;
+    }
+    // Member function to load from an aligned array via Vec8q::load_a. NOTE(review): the original comment said "aligned by 32", presumably stale for a 512-bit vector - confirm the required alignment
+    Vec8uq & load_a(void const * p) {
+        Vec8q::load_a(p);
+        return *this;
+    }
+    // Member function to change a single element in vector; forwards to Vec8q::insert and returns *this
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8uq const & insert(uint32_t index, uint64_t value) {
+        Vec8q::insert(index, value);
+        return *this;
+    }
+    // Member function extract a single element from vector; the value from Vec8q::extract is returned as uint64_t
+    uint64_t extract(uint32_t index) const {
+        return Vec8q::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint64_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec4uq:
+    Vec4uq get_low() const {
+        return Vec4uq(Vec8q::get_low());
+    }
+    Vec4uq get_high() const {
+        return Vec4uq(Vec8q::get_high());
+    }
+};
+
+// Define operators for this class
+
+// operator + : element-wise sum, computed with the Vec8q operator on the base-class view of both operands
+static inline Vec8uq operator + (Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(static_cast<Vec8q const &>(a) + static_cast<Vec8q const &>(b));
+}
+
+// operator - : element-wise difference, computed with the Vec8q operator on the base-class view of both operands
+static inline Vec8uq operator - (Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(static_cast<Vec8q const &>(a) - static_cast<Vec8q const &>(b));
+}
+
+// operator * : element-wise product, computed with the Vec8q operator on the base-class view of both operands
+static inline Vec8uq operator * (Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(static_cast<Vec8q const &>(a) * static_cast<Vec8q const &>(b));
+}
+
+// operator >> : logical (zero-filling) right shift of every element by the same count b
+static inline Vec8uq operator >> (Vec8uq const & a, uint32_t b) {
+    return Vec8uq(_mm512_srl_epi64(a,_mm_cvtsi32_si128(b)));
+}
+
+// operator >> : logical right shift with a signed count; forwards to the uint32_t overload
+static inline Vec8uq operator >> (Vec8uq const & a, int32_t b) {
+    return a >> static_cast<uint32_t>(b);
+}
+
+// operator >>= : logical right shift of a by b in place; returns a reference to a
+static inline Vec8uq & operator >>= (Vec8uq & a, uint32_t b) {
+    Vec8uq const shifted = a >> b;
+    return a = shifted;
+}
+
+// operator >>= : logical right shift in place with a signed count, converted to uint32_t first
+static inline Vec8uq & operator >>= (Vec8uq & a, int32_t b) {
+    Vec8uq const shifted = a >> static_cast<uint32_t>(b);
+    return a = shifted;
+}
+
+// operator << : left shift of every element; forwards to the Vec8q operator with the count converted to int32_t
+static inline Vec8uq operator << (Vec8uq const & a, uint32_t b) {
+    return Vec8uq(static_cast<Vec8q const &>(a) << static_cast<int32_t>(b));
+}
+
+// operator << : left shift of every element with a signed count; forwards to the Vec8q operator
+static inline Vec8uq operator << (Vec8uq const & a, int32_t b) {
+    return Vec8uq(static_cast<Vec8q const &>(a) << b);
+}
+
+// vector operator < : returns true for elements for which a < b (unsigned compare, _mm512_cmplt_epu64_mask)
+static inline Vec8qb operator < (Vec8uq const & a, Vec8uq const & b) {
+    return _mm512_cmplt_epu64_mask(a, b);
+}
+
+// vector operator > : returns true for elements for which a > b (unsigned); implemented by swapping the operands of operator <
+static inline Vec8qb operator > (Vec8uq const & a, Vec8uq const & b) {
+    return b < a;
+}
+
+// vector operator >= : returns true for elements for which a >= b (unsigned compare, _mm512_cmpge_epu64_mask)
+static inline Vec8qb operator >= (Vec8uq const & a, Vec8uq const & b) {
+    return _mm512_cmpge_epu64_mask(a, b);
+}
+
+// vector operator <= : returns true for elements for which a <= b (unsigned); implemented by swapping the operands of operator >=
+static inline Vec8qb operator <= (Vec8uq const & a, Vec8uq const & b) {
+    return b >= a;
+}
+
+// operator & : bitwise AND, computed with the Vec8q operator on the base-class view of both operands
+static inline Vec8uq operator & (Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(static_cast<Vec8q const &>(a) & static_cast<Vec8q const &>(b));
+}
+
+// operator | : bitwise OR, computed with the Vec8q operator on the base-class view of both operands
+static inline Vec8uq operator | (Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(static_cast<Vec8q const &>(a) | static_cast<Vec8q const &>(b));
+}
+
+// operator ^ : bitwise exclusive OR, computed with the Vec8q operator on the base-class view of both operands
+static inline Vec8uq operator ^ (Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(static_cast<Vec8q const &>(a) ^ static_cast<Vec8q const &>(b));
+}
+
+// Functions for this class
+
+// Per-element choice between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8uq select (Vec8qb const & s, Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(select(s, static_cast<Vec8q const &>(a), static_cast<Vec8q const &>(b)));
+}
+
+// Masked add: result[i] = f[i] ? (a[i] + b[i]) : a[i] for every element i
+static inline Vec8uq if_add (Vec8qb const & f, Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(_mm512_mask_add_epi64(a, f, a, b));
+}
+
+// Sum of all eight elements; forwards to the Vec8q overload and returns the result as uint64_t.
+// Overflow wraps around
+static inline uint64_t horizontal_add (Vec8uq const & a) {
+    return horizontal_add(static_cast<Vec8q const &>(a));
+}
+
+// max: element-wise unsigned maximum, a > b ? a : b
+static inline Vec8uq max(Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(_mm512_max_epu64(a, b));
+}
+
+// min: element-wise unsigned minimum, a < b ? a : b
+static inline Vec8uq min(Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(_mm512_min_epu64(a, b));
+}
+
+
+/*****************************************************************************
+*
+*          Vector permute functions
+*
+******************************************************************************
+*
+* These permute functions can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to select.
+* An index of -1 will generate zero. An index of -256 means don't care.
+*
+* Example:
+* Vec8q a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+* Vec8q b;
+* b = permute8q<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// Permute vector of 8 64-bit integers: result element k is a[ik & 7] for each non-negative compile-time index ik.
+// Index -1 gives 0, index -256 means don't care (its bit 7 is clear, so it does not by itself request zeroing).
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q permute8q(Vec8q const & a) {
+
+    // Combine the low 3 bits of all indexes into a single bitfield, with one 4-bit nibble for each element
+    const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<< 8 | (i3&7)<<12 | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28;
+
+    // Nibble mask: 0xF for every non-negative index, 0 for every negative one; m2 is m1 with the negative-index nibbles cleared
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000);
+    const int m2 = m1 & mz;
+
+    // zeroing needed when any index has bit 7 set (true for -1, false for -256)
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;
+
+    // special case: every index is negative, so the result is all zero
+    if (mz == 0) return  _mm512_setzero_epi32();
+
+    // write mask for elements not zeroed: one bit per 64-bit element, set where the index is non-negative
+    const __mmask8  z = __mmask8((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7);
+    // same with 2 bits for each element, for the shuffle below that is masked per 32-bit half
+    const __mmask16 zz = __mmask16((i0>=0?3:0) | (i1>=0?0xC:0) | (i2>=0?0x30:0) | (i3>=0?0xC0:0) | (i4>=0?0x300:0) | (i5>=0?0xC00:0) | (i6>=0?0x3000:0) | (i7>=0?0xC000:0));
+
+    if (((m1 ^ 0x76543210) & mz) == 0) {
+        // no shuffling: every non-negative index equals its own position
+        if (dozero) {
+            // zero the elements whose bit in z is clear
+            return _mm512_maskz_mov_epi64(z, a);
+        }
+        return a;                                 // do nothing
+    }
+
+    if (((m1 ^ 0x66442200) & 0x66666666 & mz) == 0) {
+        // no exchange of data between the four 128-bit lanes (index bits 1-2 of every live element match its position)
+        const int pat = ((m2 | m2 >> 8 | m2 >> 16 | m2 >> 24) & 0x11) * 0x01010101;   // bit 0 of the indexes, OR-ed over the four lanes and replicated to all lanes
+        const int pmask = ((pat & 1) * 10 + 4) | ((((pat >> 4) & 1) * 10 + 4) << 4);   // per element: 4 picks 32-bit halves 0,1 of the lane, 14 picks halves 2,3
+        if (((m1 ^ pat) & mz & 0x11111111) == 0) {
+            // same permute pattern in all lanes: one in-lane 32-bit shuffle is enough
+            if (dozero) {  // permute within lanes and zero
+                return _mm512_maskz_shuffle_epi32(zz, a, (_MM_PERM_ENUM)pmask);
+            }
+            else {  // permute within lanes
+                return _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)pmask);
+            }
+        }
+        // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
+    }
+    if ((((m1 ^ 0x10101010) & 0x11111111 & mz) == 0) 
+    &&  ((m1 ^ (m1 >> 4)) & 0x06060606 & mz & (mz >> 4)) == 0) {
+        // permute lanes only. no permutation within each lane, and both live elements of a destination lane name the same source lane
+        const int m3 = m2 | (m2 >> 4);
+        const int s = ((m3 >> 1) & 3) | (((m3 >> 9) & 3) << 2) | (((m3 >> 17) & 3) << 4) | (((m3 >> 25) & 3) << 6);   // 2-bit source lane number for each destination lane
+        if (dozero) {
+            // permute lanes and zero some 64-bit elements
+            return  _mm512_maskz_shuffle_i64x2(z, a, a, (_MM_PERM_ENUM)s);
+        }
+        else {
+            // permute lanes
+            return _mm512_shuffle_i64x2(a, a, (_MM_PERM_ENUM)s);
+        }
+    }
+    // full permute needed: build an index vector holding ik & 7 in the low 32 bits of each 64-bit element
+    const __m512i pmask = constant16i<i0&7, 0, i1&7, 0, i2&7, 0, i3&7, 0, i4&7, 0, i5&7, 0, i6&7, 0, i7&7, 0>();
+    if (dozero) {
+        // full permute and zeroing
+        // Operand order follows the intrinsic's documented signature: (write mask k, index vector idx, source a)
+        return _mm512_maskz_permutexvar_epi64(z, pmask, a);
+    }
+    else {    
+        return _mm512_permutexvar_epi64(pmask, a);
+    }
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8uq permute8uq(Vec8uq const & a) {
+    return Vec8uq(permute8q<i0,i1,i2,i3,i4,i5,i6,i7>(static_cast<Vec8q const &>(a)));   // unsigned wrapper: same indexes, permuted as Vec8q
+}
+
+
+// Permute vector of 16 32-bit integers: result element k is a[ik & 15] for each non-negative compile-time index ik.
+// Index -1 gives 0, index -256 means don't care (its bit 7 is clear, so it does not by itself request zeroing).
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16i permute16i(Vec16i const & a) {
+
+    // Combine the low 4 bits of all indexes into a single 64-bit bitfield, with one nibble for each element
+    const uint64_t m1 = (i0&15) | (i1&15)<<4 | (i2&15)<< 8 | (i3&15)<<12 | (i4&15)<<16 | (i5&15)<<20 | (i6&15)<<24 | (i7&15LL)<<28   // 15LL avoids sign extension of (int32_t | int64_t)
+        | (i8&15LL)<<32 | (i9&15LL)<<36 | (i10&15LL)<<40 | (i11&15LL)<<44 | (i12&15LL)<<48 | (i13&15LL)<<52 | (i14&15LL)<<56 | (i15&15LL)<<60;
+
+    // Nibble mask: 0xF for every non-negative index, 0 for every negative one
+    const uint64_t mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000ULL) | (i8<0?0:0xF00000000) 
+        | (i9<0?0:0xF000000000) | (i10<0?0:0xF0000000000) | (i11<0?0:0xF00000000000) | (i12<0?0:0xF000000000000) | (i13<0?0:0xF0000000000000) | (i14<0?0:0xF00000000000000) | (i15<0?0:0xF000000000000000);
+
+    const uint64_t m2 = m1 & mz;   // m1 with the negative-index nibbles cleared
+
+    // zeroing needed when any index has bit 7 set (true for -1, false for -256)
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0;
+
+    // special case: every index is negative, so the result is all zero
+    if (mz == 0) return  _mm512_setzero_epi32();
+
+    // write mask for elements not zeroed: one bit per 32-bit element, set where the index is non-negative
+    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7
+        | (i8>=0)<<8 | (i9>=0)<<9 | (i10>=0)<<10 | (i11>=0)<<11 | (i12>=0)<<12 | (i13>=0)<<13 | (i14>=0)<<14 | (i15>=0)<<15);
+
+    if (((m1 ^ 0xFEDCBA9876543210) & mz) == 0) {
+        // no shuffling: every non-negative index equals its own position
+        if (dozero) {
+            // zero the elements whose bit in z is clear
+            return _mm512_maskz_mov_epi32(z, a);
+        }
+        return a;                                 // do nothing
+    }
+
+    if (((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
+        // no exchange of data between the four 128-bit lanes (index bits 2-3 of every live element match its position)
+        const uint64_t pat = ((m2 | (m2 >> 16) | (m2 >> 32) | (m2 >> 48)) & 0x3333) * 0x0001000100010001;   // bits 0-1 of the indexes, OR-ed over the four lanes and replicated to all lanes
+        const int pmask = (pat & 3) | (((pat >> 4) & 3) << 2) | (((pat >> 8) & 3) << 4) | (((pat >> 12) & 3) << 6);   // 2-bit in-lane selector for each of the 4 positions
+        if (((m1 ^ pat) & 0x3333333333333333 & mz) == 0) {
+            // same permute pattern in all lanes: one in-lane shuffle is enough
+            if (dozero) {  // permute within lanes and zero
+                return _mm512_maskz_shuffle_epi32(z, a, (_MM_PERM_ENUM)pmask);
+            }
+            else {  // permute within lanes
+                return _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)pmask);
+            }
+        }
+        // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
+    }
+    const uint64_t lane = (m2 | m2 >> 4 | m2 >> 8 | m2 >> 12) & 0x000C000C000C000C;   // bits 2-3 (source lane) of the live indexes, OR-ed over the 4 elements of each destination lane
+    if ((((m1 ^ 0x3210321032103210) & 0x3333333333333333 & mz) == 0) 
+    &&  ((m1 ^ (lane * 0x1111)) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
+        // permute lanes only. no permutation within each lane, and all live elements of a destination lane name the same source lane
+        const uint64_t s = ((lane >> 2) & 3) | (((lane >> 18) & 3) << 2) | (((lane >> 34) & 3) << 4) | (((lane >> 50) & 3) << 6);
+        if (dozero) {
+            // permute lanes and zero some 32-bit elements
+            return  _mm512_maskz_shuffle_i32x4(z, a, a, (_MM_PERM_ENUM)s);
+        }
+        else {
+            // permute lanes
+            return _mm512_shuffle_i32x4(a, a, (_MM_PERM_ENUM)s);
+        }
+    }
+    // full permute needed: build an index vector holding ik & 15 in each 32-bit element
+    const __m512i pmask = constant16i<i0&15, i1&15, i2&15, i3&15, i4&15, i5&15, i6&15, i7&15, i8&15, i9&15, i10&15, i11&15, i12&15, i13&15, i14&15, i15&15>();
+    if (dozero) {
+        // full permute and zeroing of the elements whose bit in z is clear
+        return _mm512_maskz_permutexvar_epi32(z, pmask, a);
+    }
+    else {    
+        return _mm512_permutexvar_epi32(pmask, a);
+    }
+}
+
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16ui permute16ui(Vec16ui const & a) {
+    return Vec16ui (permute16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a));   // unsigned wrapper: same indexes, permuted through permute16i
+}
+
+
+/*****************************************************************************
+*
+*          Vector blend functions
+*
+******************************************************************************
+*
+* These blend functions can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where higher indexes indicate an element from the second source
+* vector. For example, if each vector has 8 elements, then indexes 0 - 7
+* will select an element from the first vector and indexes 8 - 15 will select 
+* an element from the second vector. A negative index will generate zero.
+*
+* Example:
+* Vec8q a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8q b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+* Vec8q c;
+* c = blend8q<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 
+static inline Vec8q blend8q(Vec8q const & a, Vec8q const & b) {  // indexes 0-7 select from a, 8-15 select from b, index -1 gives zero
+
+    // Combine the low 4 bits of all indexes into a single bitfield, with one nibble for each (bit 3 of a nibble = source b)
+    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<< 8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
+
+    // Nibble mask: 0xF for every non-negative index, 0 for every negative one
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000);
+    const int m2 = m1 & mz;
+
+    // zeroing needed when any index has bit 7 set (true for -1)
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;
+
+    // write mask for elements not zeroed: one bit per 64-bit element, set where the index is non-negative
+    const __mmask8 z = __mmask8((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7);
+
+    // special case: every index is negative, so the result is all zero
+    if (mz == 0) return  _mm512_setzero_epi32();
+
+    // special case: all from a (no live index has bit 3 set), so this is a plain permute of a
+    if ((m1 & 0x88888888 & mz) == 0) {
+        return permute8q <i0, i1, i2, i3, i4, i5, i6, i7> (a);
+    }
+
+    // special case: all from b (every live index has bit 3 set); bit 3 is flipped to turn the indexes into indexes of b
+    if ((~m1 & 0x88888888 & mz) == 0) {
+        return permute8q <i0^8, i1^8, i2^8, i3^8, i4^8, i5^8, i6^8, i7^8> (b);
+    }
+
+    // special case: blend without permute (the low 3 bits of every live index equal its position)
+    if (((m1 ^ 0x76543210) & 0x77777777 & mz) == 0) {
+        __mmask8 blendmask = __mmask8((i0&8)>>3 | (i1&8)>>2 | (i2&8)>>1 | (i3&8)>>0 | (i4&8)<<1 | (i5&8)<<2 | (i6&8)<<3 | (i7&8)<<4 );   // bit k = bit 3 of index k, i.e. take element k from b
+        __m512i t = _mm512_mask_blend_epi64(blendmask, a, b);
+        if (dozero) {
+            t = _mm512_maskz_mov_epi64(z, t);
+        }
+        return t;
+    }
+    // special case: all data stay within their lane (index bits 1-2 of every live element match its position)
+    if (((m1 ^ 0x66442200) & 0x66666666 & mz) == 0) {
+        // nibble mask for elements from b (0xF where bit 3 of the index is set); the complement marks elements from a
+        const uint32_t mb = ((i0&8)?0xF:0) | ((i1&8)?0xF0:0) | ((i2&8)?0xF00:0) | ((i3&8)?0xF000:0) | ((i4&8)?0xF0000:0) | ((i5&8)?0xF00000:0) | ((i6&8)?0xF000000:0) | ((i7&8)?0xF0000000:0);
+        const uint32_t mbz = mb & mz;     // mask for nonzero elements from b
+        const uint32_t maz = ~mb & mz;    // mask for nonzero elements from a
+        const uint32_t m1a = m1 & maz;
+        const uint32_t m1b = m1 & mbz;
+        const uint32_t pata = ((m1a | m1a >> 8 | m1a >> 16 | m1a >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from a
+        const uint32_t patb = ((m1b | m1b >> 8 | m1b >> 16 | m1b >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from b
+        if (((m1 ^ pata) & 0x11111111 & maz) == 0 && ((m1 ^ patb) & 0x11111111 & mbz) == 0) {
+            // Same permute pattern in all lanes:
+            // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
+            // and we are saving 64 bytes of data cache.
+            // 1. Permute a, zero elements not from a (permute8q is expected to pick _mm512_maskz_shuffle_epi32 for this pattern)
+            __m512i ta = permute8q< (maz&0xF)?i0&7:-1, (maz&0xF0)?i1&7:-1, (maz&0xF00)?i2&7:-1, (maz&0xF000)?i3&7:-1, 
+                (maz&0xF0000)?i4&7:-1, (maz&0xF00000)?i5&7:-1, (maz&0xF000000)?i6&7:-1, (maz&0xF0000000)?i7&7:-1> (a);
+            // write mask for elements from b, two bits (one per 32-bit half) for each 64-bit element
+            const __mmask16 sb = ((mbz&0xF)?3:0) | ((mbz&0xF0)?0xC:0) | ((mbz&0xF00)?0x30:0) | ((mbz&0xF000)?0xC0:0) | ((mbz&0xF0000)?0x300:0) | ((mbz&0xF00000)?0xC00:0) | ((mbz&0xF000000)?0x3000:0) | ((mbz&0xF0000000)?0xC000:0);
+            // permute index for elements from b: per element, 4 picks 32-bit halves 0,1 of the lane, 14 picks halves 2,3
+            const int pi = ((patb & 1) * 10 + 4) | ((((patb >> 4) & 1) * 10 + 4) << 4);
+            // 2. Permute elements from b and combine with elements from a through write mask
+            return _mm512_mask_shuffle_epi32(ta, sb, b, (_MM_PERM_ENUM)pi);
+        }
+        // not same permute pattern in all lanes. use full permute
+    }
+    // general case: full two-source permute; the index vector holds ik & 0xF in the low 32 bits of each 64-bit element
+    const __m512i pmask = constant16i<i0&0xF, 0, i1&0xF, 0, i2&0xF, 0, i3&0xF, 0, i4&0xF, 0, i5&0xF, 0, i6&0xF, 0, i7&0xF, 0>();
+    if (dozero) {
+        return _mm512_maskz_permutex2var_epi64(z, a, pmask, b);
+    }
+    else {
+        return _mm512_permutex2var_epi64(a, pmask, b);
+    }
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 
+static inline Vec8uq blend8uq(Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(blend8q<i0,i1,i2,i3,i4,i5,i6,i7>(static_cast<Vec8q const &>(a), static_cast<Vec8q const &>(b)));   // unsigned wrapper around blend8q
+}
+
+
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16i blend16i(Vec16i const & a, Vec16i const & b) {  // indexes 0-15 select from a, 16-31 select from b, index -1 gives zero
+
+    // Combine the low 4 bits of all indexes into a single bitfield, with 4 bits for each indicating shuffle, but not source
+    const uint64_t m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xFLL)<<28
+        | (i8&0xFLL)<<32 | (i9&0xFLL)<<36 | (i10&0xFLL)<<40 | (i11&0xFLL)<<44 | (i12&0xFLL)<<48 | (i13&0xFLL)<<52 | (i14&0xFLL)<<56 | (i15&0xFLL)<<60;
+
+    // Nibble mask: 0xF for every non-negative index, 0 for every negative one
+    const uint64_t mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000ULL)
+        | (i8<0?0:0xF00000000) | (i9<0?0:0xF000000000) | (i10<0?0:0xF0000000000) | (i11<0?0:0xF00000000000) | (i12<0?0:0xF000000000000) | (i13<0?0:0xF0000000000000) | (i14<0?0:0xF00000000000000) | (i15<0?0:0xF000000000000000);
+    const uint64_t m2 = m1 & mz;
+
+    // collect bit 4 of each index = select source (nibble is 0xF when the element comes from b)
+    const uint64_t ms = ((i0&16)?0xF:0) | ((i1&16)?0xF0:0) | ((i2&16)?0xF00:0) | ((i3&16)?0xF000:0) | ((i4&16)?0xF0000:0) | ((i5&16)?0xF00000:0) | ((i6&16)?0xF000000:0) | ((i7&16)?0xF0000000ULL:0)
+        | ((i8&16)?0xF00000000:0) | ((i9&16)?0xF000000000:0) | ((i10&16)?0xF0000000000:0) | ((i11&16)?0xF00000000000:0) | ((i12&16)?0xF000000000000:0) | ((i13&16)?0xF0000000000000:0) | ((i14&16)?0xF00000000000000:0) | ((i15&16)?0xF000000000000000:0);
+
+    // zeroing needed when any index has bit 7 set (true for -1)
+    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0;
+
+    // write mask for elements not zeroed: one bit per 32-bit element, set where the index is non-negative
+    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7 
+        | (i8>=0)<<8 | (i9>=0)<<9 | (i10>=0)<<10 | (i11>=0)<<11 | (i12>=0)<<12 | (i13>=0)<<13 | (i14>=0)<<14 | (i15>=0)<<15);
+
+    // special case: every index is negative, so the result is all zero
+    if (mz == 0) return  _mm512_setzero_epi32();
+
+    // special case: all from a (no live index selects b), so this is a plain permute of a
+    if ((ms & mz) == 0) {
+        return permute16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a);
+    }
+
+    // special case: all from b (every live index selects b); bit 4 is flipped to turn the indexes into indexes of b
+    if ((~ms & mz) == 0) {
+        return permute16i<i0^16,i1^16,i2^16,i3^16,i4^16,i5^16,i6^16,i7^16,i8^16,i9^16,i10^16,i11^16,i12^16,i13^16,i14^16,i15^16 > (b);
+    }
+
+    // special case: blend without permute (the low 4 bits of every live index equal its position)
+    if (((m1 ^ 0xFEDCBA9876543210) & mz) == 0) {
+        __mmask16 blendmask = __mmask16((i0&16)>>4 | (i1&16)>>3 | (i2&16)>>2 | (i3&16)>>1 | (i4&16) | (i5&16)<<1 | (i6&16)<<2 | (i7&16)<<3
+            | (i8&16)<<4 | (i9&16)<<5 | (i10&16)<<6 | (i11&16)<<7 | (i12&16)<<8 | (i13&16)<<9 | (i14&16)<<10 | (i15&16)<<11);   // bit k = bit 4 of index k, i.e. take element k from b
+        __m512i t = _mm512_mask_blend_epi32(blendmask, a, b);
+        if (dozero) {
+            t = _mm512_maskz_mov_epi32(z, t);
+        }
+        return t;
+    }
+
+    // special case: all data stay within their lane (index bits 2-3 of every live element match its position)
+    if (((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
+
+        // nibble mask for elements from b; the complement marks elements from a
+        const uint64_t mb  = ms;
+        const uint64_t mbz = mb & mz;     // mask for nonzero elements from b
+        const uint64_t maz = ~mb & mz;    // mask for nonzero elements from a
+        const uint64_t m1a = m1 & maz;
+        const uint64_t m1b = m1 & mbz;
+        const uint64_t pata = ((m1a | m1a >> 16 | m1a >> 32 | m1a >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from a
+        const uint64_t patb = ((m1b | m1b >> 16 | m1b >> 32 | m1b >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from b
+        if (((m1 ^ pata) & 0x3333333333333333 & maz) == 0 && ((m1 ^ patb) & 0x3333333333333333 & mbz) == 0) {
+            // Same permute pattern in all lanes:
+            // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
+            // and we are saving 64 bytes of data cache.
+            // 1. Permute a, zero elements not from a (permute16i is expected to pick _mm512_maskz_shuffle_epi32 for this pattern)
+            __m512i ta = permute16i< (maz&0xF)?i0&15:-1, (maz&0xF0)?i1&15:-1, (maz&0xF00)?i2&15:-1, (maz&0xF000)?i3&15:-1, 
+                (maz&0xF0000)?i4&15:-1, (maz&0xF00000)?i5&15:-1, (maz&0xF000000)?i6&15:-1, (maz&0xF0000000)?i7&15:-1,
+                (maz&0xF00000000)?i8&15:-1, (maz&0xF000000000)?i9&15:-1, (maz&0xF0000000000)?i10&15:-1, (maz&0xF00000000000)?i11&15:-1, 
+                (maz&0xF000000000000)?i12&15:-1, (maz&0xF0000000000000)?i13&15:-1, (maz&0xF00000000000000)?i14&15:-1, (maz&0xF000000000000000)?i15&15:-1> (a);
+            // write mask for elements from b, one bit for each 32-bit element
+            const __mmask16 sb = ((mbz&0xF)?1:0) | ((mbz&0xF0)?0x2:0) | ((mbz&0xF00)?0x4:0) | ((mbz&0xF000)?0x8:0) | ((mbz&0xF0000)?0x10:0) | ((mbz&0xF00000)?0x20:0) | ((mbz&0xF000000)?0x40:0) | ((mbz&0xF0000000)?0x80:0) 
+                | ((mbz&0xF00000000)?0x100:0) | ((mbz&0xF000000000)?0x200:0) | ((mbz&0xF0000000000)?0x400:0) | ((mbz&0xF00000000000)?0x800:0) | ((mbz&0xF000000000000)?0x1000:0) | ((mbz&0xF0000000000000)?0x2000:0) | ((mbz&0xF00000000000000)?0x4000:0) | ((mbz&0xF000000000000000)?0x8000:0);
+            // permute index for elements from b: 2-bit in-lane selector for each of the 4 positions
+            const int pi = (patb & 3) | (((patb >> 4) & 3) << 2) | (((patb >> 8) & 3) << 4) | (((patb >> 12) & 3) << 6);
+            // 2. Permute elements from b and combine with elements from a through write mask
+            return _mm512_mask_shuffle_epi32(ta, sb, b, (_MM_PERM_ENUM)pi);
+        }
+        // not same permute pattern in all lanes. use full permute
+    }
+
+    // general case: full two-source permute; each 32-bit index keeps 5 bits (bit 4 selects b)
+    const __m512i pmask = constant16i<i0&0x1F, i1&0x1F, i2&0x1F, i3&0x1F, i4&0x1F, i5&0x1F, i6&0x1F, i7&0x1F, 
+        i8&0x1F, i9&0x1F, i10&0x1F, i11&0x1F, i12&0x1F, i13&0x1F, i14&0x1F, i15&0x1F>();
+    if (dozero) {
+        return _mm512_maskz_permutex2var_epi32(z, a, pmask, b);        
+    }
+    else {
+        return _mm512_permutex2var_epi32(a, pmask, b);
+    }
+}
+
+// Blend two Vec16ui vectors with compile-time indexes. The work is delegated
+// to blend16i with the same index list; only the vector type is converted.
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16ui blend16ui(Vec16ui const & a, Vec16ui const & b) {
+    const Vec16i sa = Vec16i(a);
+    const Vec16i sb = Vec16i(b);
+    return Vec16ui(blend16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15>(sa, sb));
+}
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec8q a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+* Vec8q b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8q c;
+* c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+*
+*****************************************************************************/
+
+// Look up 16 table elements: result[i] = table[index[i]], done with a single
+// variable 32-bit permute.
+static inline Vec16i lookup16(Vec16i const & index, Vec16i const & table) {
+    const __m512i picked = _mm512_permutexvar_epi32(index, table);
+    return picked;
+}
+
+// Look up 32-bit integers in an array of n elements using run-time indexes.
+//   n <= 0  : returns zero
+//   n <= 16 : loads one full vector from table and permutes it
+//   n <= 32 : loads two full vectors from table and permutes across both
+//   n >  32 : limits the indexes to the table and uses a gather instruction
+template <int n>
+static inline Vec16i lookup(Vec16i const & index, void const * table) {
+    if (n <= 0) {
+        return 0;
+    }
+    if (n <= 16) {
+        const Vec16i whole = Vec16i().load(table);
+        return lookup16(index, whole);
+    }
+    if (n <= 32) {
+        const Vec16i lower = Vec16i().load(table);
+        const Vec16i upper = Vec16i().load((int8_t*)table + 64);   // second vector starts 64 bytes in
+        return _mm512_permutex2var_epi32(lower, index, upper);
+    }
+    // n > 32. Keep every index inside the table before gathering
+    Vec16ui limited;
+    if ((n & (n-1)) != 0) {
+        // n is not a power of 2, limit to n-1
+        limited = min(Vec16ui(index), uint32_t(n-1));
+    }
+    else {
+        // n is a power of 2, make index modulo n
+        limited = Vec16ui(index) & (n-1);
+    }
+    return _mm512_i32gather_epi32(limited, (const int*)table, 4);
+}
+
+
+// Look up 8 table elements: result[i] = table[index[i]], done with a single
+// variable 64-bit permute.
+static inline Vec8q lookup8(Vec8q const & index, Vec8q const & table) {
+    const __m512i picked = _mm512_permutexvar_epi64(index, table);
+    return picked;
+}
+
+// Look up 64-bit integers in an array of n elements using run-time indexes.
+//   n <= 0  : returns zero
+//   n <= 8  : loads one full vector from table and permutes it
+//   n <= 16 : loads two full vectors from table and permutes across both
+//   n >  16 : limits the indexes to the table and uses a gather instruction
+template <int n>
+static inline Vec8q lookup(Vec8q const & index, void const * table) {
+    if (n <= 0) {
+        return 0;
+    }
+    if (n <= 8) {
+        const Vec8q whole = Vec8q().load(table);
+        return lookup8(index, whole);
+    }
+    if (n <= 16) {
+        const Vec8q lower = Vec8q().load(table);
+        const Vec8q upper = Vec8q().load((int8_t*)table + 64);     // second vector starts 64 bytes in
+        return _mm512_permutex2var_epi64(lower, index, upper);
+    }
+    // n > 16. Keep every index inside the table before gathering
+    Vec8uq limited;
+    if ((n & (n-1)) != 0) {
+        // n is not a power of 2, limit to n-1
+        limited = min(Vec8uq(index), uint32_t(n-1));
+    }
+    else {
+        // n is a power of 2, make index modulo n
+        limited = Vec8uq(index) & (n-1);
+    }
+    return _mm512_i64gather_epi64(limited, (const long long*)table, 8);
+}
+
+
+/*****************************************************************************
+*
+*          Gather functions with fixed indexes
+*
+*****************************************************************************/
+// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15
+// a is read as an array of 32-bit integers. All indexes are compile-time constants,
+// so the cheapest of three strategies is selected from the index range:
+//   1. all indexes within a span of 16 elements: one vector load + permute
+//   2. every index within 16 elements of the smallest or the biggest index:
+//      two vector loads + blend
+//   3. otherwise: hardware gather instruction
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
+int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16i gather16i(void const * a) {
+    // The OR of all indexes has its sign bit set if any single index is negative
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15)>=0> Negative_array_index;  // Error message if index is negative
+    // find smallest and biggest index, using only compile-time constant expressions
+    // (pairwise reduction: 16 -> 8 -> 4 -> 2 -> 1 candidates)
+    const int i01min   = i0  < i1  ? i0  : i1;
+    const int i23min   = i2  < i3  ? i2  : i3;
+    const int i45min   = i4  < i5  ? i4  : i5;
+    const int i67min   = i6  < i7  ? i6  : i7;
+    const int i89min   = i8  < i9  ? i8  : i9;
+    const int i1011min = i10 < i11 ? i10 : i11;
+    const int i1213min = i12 < i13 ? i12 : i13;
+    const int i1415min = i14 < i15 ? i14 : i15;
+    const int i0_3min   = i01min   < i23min    ? i01min   : i23min;
+    const int i4_7min   = i45min   < i67min    ? i45min   : i67min;
+    const int i8_11min  = i89min   < i1011min  ? i89min   : i1011min;
+    const int i12_15min = i1213min < i1415min  ? i1213min : i1415min;
+    const int i0_7min   = i0_3min  < i4_7min   ? i0_3min  : i4_7min;
+    const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
+    const int imin      = i0_7min  < i8_15min  ? i0_7min  : i8_15min;
+    const int i01max   = i0  > i1  ? i0  : i1;
+    const int i23max   = i2  > i3  ? i2  : i3;
+    const int i45max   = i4  > i5  ? i4  : i5;
+    const int i67max   = i6  > i7  ? i6  : i7;
+    const int i89max   = i8  > i9  ? i8  : i9;
+    const int i1011max = i10 > i11 ? i10 : i11;
+    const int i1213max = i12 > i13 ? i12 : i13;
+    const int i1415max = i14 > i15 ? i14 : i15;
+    const int i0_3max   = i01max   > i23max    ? i01max   : i23max;
+    const int i4_7max   = i45max   > i67max    ? i45max   : i67max;
+    const int i8_11max  = i89max   > i1011max  ? i89max   : i1011max;
+    const int i12_15max = i1213max > i1415max  ? i1213max : i1415max;
+    const int i0_7max   = i0_3max  > i4_7max   ? i0_3max  : i4_7max;
+    const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
+    const int imax      = i0_7max  > i8_15max  ? i0_7max  : i8_15max;
+    if (imax - imin <= 15) {
+        // load one contiguous block and permute
+        if (imax > 15) {
+            // make sure we don't read past the end of the array
+            // the block covers elements imax-15 .. imax; indexes are rebased to that start
+            Vec16i b = Vec16i().load((int32_t const *)a + imax-15);
+            return permute16i<i0-imax+15, i1-imax+15, i2-imax+15, i3-imax+15, i4-imax+15, i5-imax+15, i6-imax+15, i7-imax+15,
+                i8-imax+15, i9-imax+15, i10-imax+15, i11-imax+15, i12-imax+15, i13-imax+15, i14-imax+15, i15-imax+15> (b);
+        }
+        else {
+            // the block covers elements imin .. imin+15; indexes are rebased to imin
+            Vec16i b = Vec16i().load((int32_t const *)a + imin);
+            return permute16i<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin,
+                i8-imin, i9-imin, i10-imin, i11-imin, i12-imin, i13-imin, i14-imin, i15-imin> (b);
+        }
+    }
+    // check that every index lies in the first block (imin .. imin+15) or the last block (imax-15 .. imax)
+    if ((i0<imin+16  || i0>imax-16)  && (i1<imin+16  || i1>imax-16)  && (i2<imin+16  || i2>imax-16)  && (i3<imin+16  || i3>imax-16)
+    &&  (i4<imin+16  || i4>imax-16)  && (i5<imin+16  || i5>imax-16)  && (i6<imin+16  || i6>imax-16)  && (i7<imin+16  || i7>imax-16)    
+    &&  (i8<imin+16  || i8>imax-16)  && (i9<imin+16  || i9>imax-16)  && (i10<imin+16 || i10>imax-16) && (i11<imin+16 || i11>imax-16)
+    &&  (i12<imin+16 || i12>imax-16) && (i13<imin+16 || i13>imax-16) && (i14<imin+16 || i14>imax-16) && (i15<imin+16 || i15>imax-16) ) {
+        // load two contiguous blocks and blend
+        Vec16i b = Vec16i().load((int32_t const *)a + imin);
+        Vec16i c = Vec16i().load((int32_t const *)a + imax-15);
+        // blend indexes: 0..15 for an element of b, 16..31 for an element of c (imax maps to 31)
+        // NOTE(review): assumes blend16i takes indexes 16..31 from its second argument - confirm
+        const int j0  = i0 <imin+16 ? i0 -imin : 31-imax+i0;
+        const int j1  = i1 <imin+16 ? i1 -imin : 31-imax+i1;
+        const int j2  = i2 <imin+16 ? i2 -imin : 31-imax+i2;
+        const int j3  = i3 <imin+16 ? i3 -imin : 31-imax+i3;
+        const int j4  = i4 <imin+16 ? i4 -imin : 31-imax+i4;
+        const int j5  = i5 <imin+16 ? i5 -imin : 31-imax+i5;
+        const int j6  = i6 <imin+16 ? i6 -imin : 31-imax+i6;
+        const int j7  = i7 <imin+16 ? i7 -imin : 31-imax+i7;
+        const int j8  = i8 <imin+16 ? i8 -imin : 31-imax+i8;
+        const int j9  = i9 <imin+16 ? i9 -imin : 31-imax+i9;
+        const int j10 = i10<imin+16 ? i10-imin : 31-imax+i10;
+        const int j11 = i11<imin+16 ? i11-imin : 31-imax+i11;
+        const int j12 = i12<imin+16 ? i12-imin : 31-imax+i12;
+        const int j13 = i13<imin+16 ? i13-imin : 31-imax+i13;
+        const int j14 = i14<imin+16 ? i14-imin : 31-imax+i14;
+        const int j15 = i15<imin+16 ? i15-imin : 31-imax+i15;
+        return blend16i<j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15>(b, c);
+    }
+    // use gather instruction
+    // (scale 4 = size in bytes of each 32-bit element)
+    return _mm512_i32gather_epi32(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), (const int *)a, 4);
+}
+
+
+// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7
+// a is read as an array of 64-bit integers. All indexes are compile-time constants,
+// so the cheapest of three strategies is selected from the index range:
+//   1. all indexes within a span of 8 elements: one vector load + permute
+//   2. every index within 8 elements of the smallest or the biggest index:
+//      two vector loads + blend
+//   3. otherwise: hardware gather instruction
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q gather8q(void const * a) {
+    // The OR of all indexes has its sign bit set if any single index is negative
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index;  // Error message if index is negative
+
+    // find smallest and biggest index, using only compile-time constant expressions
+    const int i01min = i0 < i1 ? i0 : i1;
+    const int i23min = i2 < i3 ? i2 : i3;
+    const int i45min = i4 < i5 ? i4 : i5;
+    const int i67min = i6 < i7 ? i6 : i7;
+    const int i0123min = i01min < i23min ? i01min : i23min;
+    const int i4567min = i45min < i67min ? i45min : i67min;
+    const int imin = i0123min < i4567min ? i0123min : i4567min;
+    const int i01max = i0 > i1 ? i0 : i1;
+    const int i23max = i2 > i3 ? i2 : i3;
+    const int i45max = i4 > i5 ? i4 : i5;
+    const int i67max = i6 > i7 ? i6 : i7;
+    const int i0123max = i01max > i23max ? i01max : i23max;
+    const int i4567max = i45max > i67max ? i45max : i67max;
+    const int imax = i0123max > i4567max ? i0123max : i4567max;
+    if (imax - imin <= 7) {
+        // load one contiguous block and permute
+        if (imax > 7) {
+            // make sure we don't read past the end of the array
+            // the block covers elements imax-7 .. imax; indexes are rebased to that start
+            Vec8q b = Vec8q().load((int64_t const *)a + imax-7);
+            return permute8q<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7> (b);
+        }
+        else {
+            // the block covers elements imin .. imin+7; indexes are rebased to imin
+            Vec8q b = Vec8q().load((int64_t const *)a + imin);
+            return permute8q<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin> (b);
+        }
+    }
+    // check that every index lies in the first block (imin .. imin+7) or the last block (imax-7 .. imax)
+    if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
+    &&  (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
+        // load two contiguous blocks and blend
+        Vec8q b = Vec8q().load((int64_t const *)a + imin);
+        Vec8q c = Vec8q().load((int64_t const *)a + imax-7);
+        // blend indexes: 0..7 for an element of b, 8..15 for an element of c (imax maps to 15)
+        // NOTE(review): assumes blend8q takes indexes 8..15 from its second argument - confirm
+        const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;
+        const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
+        const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
+        const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
+        const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
+        const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
+        const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
+        const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
+        return blend8q<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+    // use gather instruction
+    // (scale 8 = size in bytes of each 64-bit element)
+    return _mm512_i64gather_epi64(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), (const long long *)a, 8);
+}
+
+
+/*****************************************************************************
+*
+*          Functions for conversion between integer sizes
+*
+*****************************************************************************/
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_to_int : widens the 16 signed 16-bit elements of a Vec16s
+// to 32 bits each (sign extension), giving a Vec16i
+static inline Vec16i extend_to_int (Vec16s const & a) {
+    const __m512i widened = _mm512_cvtepi16_epi32(a);
+    return widened;
+}
+
+// Function extend_to_int : widens the 16 unsigned 16-bit elements of a Vec16us
+// to 32 bits each (zero extension), giving a Vec16ui
+static inline Vec16ui extend_to_int (Vec16us const & a) {
+    const __m512i widened = _mm512_cvtepu16_epi32(a);
+    return widened;
+}
+
+// Function extend_to_int : widens the 16 signed 8-bit elements of a Vec16c
+// to 32 bits each (sign extension), giving a Vec16i
+static inline Vec16i extend_to_int (Vec16c const & a) {
+    const __m512i widened = _mm512_cvtepi8_epi32(a);
+    return widened;
+}
+
+// Function extend_to_int : widens the 16 unsigned 8-bit elements of a Vec16uc
+// to 32 bits each (zero extension), giving a Vec16ui
+static inline Vec16ui extend_to_int (Vec16uc const & a) {
+    const __m512i widened = _mm512_cvtepu8_epi32(a);
+    return widened;
+}
+
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// Function extend_low : sign-extends the low 8 elements of a to 64 bits each
+static inline Vec8q extend_low (Vec16i const & a) {
+    const __m512i widened = _mm512_cvtepi32_epi64(a.get_low());
+    return widened;
+}
+
+// Function extend_high : sign-extends the high 8 elements of a to 64 bits each
+static inline Vec8q extend_high (Vec16i const & a) {
+    const __m512i widened = _mm512_cvtepi32_epi64(a.get_high());
+    return widened;
+}
+
+// Function extend_low : zero-extends the low 8 elements of a to 64 bits each
+static inline Vec8uq extend_low (Vec16ui const & a) {
+    const __m512i widened = _mm512_cvtepu32_epi64(a.get_low());
+    return widened;
+}
+
+// Function extend_high : zero-extends the high 8 elements of a to 64 bits each
+static inline Vec8uq extend_high (Vec16ui const & a) {
+    const __m512i widened = _mm512_cvtepu32_epi64(a.get_high());
+    return widened;
+}
+
+
+// Compress 32-bit integers to 8-bit or 16-bit integers, signed and unsigned, with and without saturation
+
+// Function compress_to_int8 : converts one vector of 16 32-bit integers into
+// one vector of 16 8-bit integers
+// Overflow wraps around
+static inline Vec16c compress_to_int8 (Vec16i const & a) {
+    const __m128i narrowed = _mm512_cvtepi32_epi8(a);
+    return narrowed;
+}
+
+// Function compress_to_int16 : converts one vector of 16 32-bit integers into
+// one vector of 16 16-bit integers
+// Overflow wraps around
+static inline Vec16s compress_to_int16 (Vec16i const & a) {
+    const __m256i narrowed = _mm512_cvtepi32_epi16(a);
+    return narrowed;
+}
+
+// Function compress_to_int8_saturated : 32-bit to 8-bit conversion
+// with signed saturation
+static inline Vec16c compress_to_int8_saturated (Vec16i const & a) {
+    const __m128i narrowed = _mm512_cvtsepi32_epi8(a);
+    return narrowed;
+}
+
+// Function compress_to_int16_saturated : 32-bit to 16-bit conversion
+// with signed saturation
+static inline Vec16s compress_to_int16_saturated (Vec16i const & a) {
+    const __m256i narrowed = _mm512_cvtsepi32_epi16(a);
+    return narrowed;
+}
+
+// Function compress_to_int8_saturated : 32-bit to 8-bit conversion
+// with unsigned saturation
+static inline Vec16uc compress_to_int8_saturated (Vec16ui const & a) {
+    const __m128i narrowed = _mm512_cvtusepi32_epi8(a);
+    return narrowed;
+}
+
+// Function compress_to_int16_saturated : 32-bit to 16-bit conversion
+// with unsigned saturation
+static inline Vec16us compress_to_int16_saturated (Vec16ui const & a) {
+    const __m256i narrowed = _mm512_cvtusepi32_epi16(a);
+    return narrowed;
+}
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : narrows two vectors of 8 64-bit integers to 32 bits per
+// element and joins them into one vector of 16 32-bit integers. The elements
+// of low come first, followed by the elements of high.
+// Overflow wraps around
+static inline Vec16i compress (Vec8q const & low, Vec8q const & high) {
+    const Vec8i firstHalf  = _mm512_cvtepi64_epi32(low);
+    const Vec8i secondHalf = _mm512_cvtepi64_epi32(high);
+    return Vec16i(firstHalf, secondHalf);
+}
+
+// Function compress_saturated : narrows two vectors of 8 64-bit integers to
+// 32 bits per element and joins them into one vector of 16 32-bit integers.
+// Signed, with saturation
+static inline Vec16i compress_saturated (Vec8q const & low, Vec8q const & high) {
+    const Vec8i firstHalf  = _mm512_cvtsepi64_epi32(low);
+    const Vec8i secondHalf = _mm512_cvtsepi64_epi32(high);
+    return Vec16i(firstHalf, secondHalf);
+}
+
+// Function compress_saturated : narrows two vectors of 8 64-bit integers to
+// 32 bits per element and joins them into one vector of 16 32-bit integers.
+// Unsigned, with saturation
+static inline Vec16ui compress_saturated (Vec8uq const & low, Vec8uq const & high) {
+    const Vec8ui firstHalf  = _mm512_cvtusepi64_epi32(low);
+    const Vec8ui secondHalf = _mm512_cvtusepi64_epi32(high);
+    return Vec16ui(firstHalf, secondHalf);
+}
+
+
+/*****************************************************************************
+*
+*          Integer division operators
+*
+*          Please see the file vectori128.h for explanation.
+*
+*****************************************************************************/
+
+// vector operator / : divide each element by divisor
+
+// vector of 16 32-bit signed integers
+// Divides every element of a by the precomputed divisor d: the high dword of
+// a * multiplier is formed for even and odd elements separately, a is added,
+// the sum is shifted right and finally corrected for the signs of a and d.
+static inline Vec16i operator / (Vec16i const & a, Divisor_i const & d) {
+    const __m512i mulv     = _mm512_broadcast_i32x4(d.getm());       // broadcast multiplier
+    const __m512i dsign    = _mm512_broadcast_i32x4(d.getsign());    // broadcast sign of d
+    // 32x32->64 bit signed multiplication of the even elements of a
+    const __m512i prodEven = _mm512_mul_epi32(a, mulv);
+    // move the odd elements of a into the even positions and multiply them too
+    const __m512i oddOfA   = _mm512_srli_epi64(a, 32);
+    const __m512i prodOdd  = _mm512_mul_epi32(oddOfA, mulv);
+    // high dword of the even products, then the odd dwords taken from prodOdd
+    const __m512i hiEven   = _mm512_srli_epi64(prodEven, 32);
+    const __m512i hiAll    = _mm512_mask_mov_epi32(hiEven, __mmask16(0xAAAA), prodOdd);
+    const __m512i summed   = _mm512_add_epi32(hiAll, a);             // add
+    const __m512i shifted  = _mm512_sra_epi32(summed, d.gets1());    // shift right arithmetic
+    const __m512i asign    = _mm512_srai_epi32(a, 31);               // sign of a
+    const __m512i signDiff = _mm512_sub_epi32(asign, dsign);         // sign of a - sign of d
+    const __m512i adjusted = _mm512_sub_epi32(shifted, signDiff);    // + 1 if a < 0, -1 if d < 0
+    return _mm512_xor_si512(adjusted, dsign);                        // change sign if divisor negative
+}
+
+// vector of 16 32-bit unsigned integers
+// Divides every element of a by the precomputed divisor d: the high dword of
+// a * multiplier is formed for even and odd elements separately and then
+// combined with a through a subtract, shift, add and shift sequence.
+static inline Vec16ui operator / (Vec16ui const & a, Divisor_ui const & d) {
+    const __m512i mulv     = _mm512_broadcast_i32x4(d.getm());       // broadcast multiplier
+    // 32x32->64 bit unsigned multiplication of the even elements of a
+    const __m512i prodEven = _mm512_mul_epu32(a, mulv);
+    // move the odd elements of a into the even positions and multiply them too
+    const __m512i oddOfA   = _mm512_srli_epi64(a, 32);
+    const __m512i prodOdd  = _mm512_mul_epu32(oddOfA, mulv);
+    // high dword of the even products, then the odd dwords taken from prodOdd
+    const __m512i hiEven   = _mm512_srli_epi64(prodEven, 32);
+    const __m512i hiAll    = _mm512_mask_mov_epi32(hiEven, __mmask16(0xAAAA), prodOdd);
+    const __m512i diff     = _mm512_sub_epi32(a, hiAll);             // subtract
+    const __m512i partial  = _mm512_srl_epi32(diff, d.gets1());      // shift right logical
+    const __m512i summed   = _mm512_add_epi32(hiAll, partial);       // add
+    return _mm512_srl_epi32(summed, d.gets2());                      // shift right logical
+}
+
+// vector operator /= : divide a by d in place and return a reference to a
+static inline Vec16i & operator /= (Vec16i & a, Divisor_i const & d) {
+    const Vec16i quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+// vector operator /= : divide a by d in place and return a reference to a
+static inline Vec16ui & operator /= (Vec16ui & a, Divisor_ui const & d) {
+    const Vec16ui quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+
+/*****************************************************************************
+*
+*          Integer division 2: divisor is a compile-time constant
+*
+*****************************************************************************/
+
+// Divide Vec16i by compile-time constant
+// d = 1, d = -1, d = 0x80000000 and powers of 2 have dedicated code paths.
+// Any other d is turned into a Divisor_i whose multiplier and shift count are
+// computed from d with constant expressions.
+template <int32_t d>
+static inline Vec16i divide_by_i(Vec16i const & x) {
+    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
+    if (d ==  1) return  x;
+    if (d == -1) return -x;
+    if (uint32_t(d) == 0x80000000u) {
+        // avoid overflow of abs(d). return (x == 0x80000000) ? 1 : 0;
+        return _mm512_maskz_set1_epi32(x == Vec16i(0x80000000), 1);
+    }
+    // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
+    const uint32_t absd = d > 0 ? uint32_t(d) : -uint32_t(d);
+    if ((absd & (absd-1)) != 0) {
+        // general case: absd is not a power of 2
+        // shift = ceil(log2(absd)) - 1. (absd < 2 handled by power of 2 case)
+        const int32_t shift = bit_scan_reverse_const(uint32_t(absd)-1);
+        const int32_t multiplier = int(1 + (uint64_t(1) << (32+shift)) / uint32_t(absd) - (int64_t(1) << 32));
+        const Divisor_i divisor(multiplier, shift, d < 0 ? -1 : 0);
+        return x / divisor;
+    }
+    // absd is a power of 2. use shift
+    const int k = bit_scan_reverse_const(absd);
+    __m512i sign;
+    if (k > 1) sign = _mm512_srai_epi32(x, k-1); else sign = x;      // k copies of sign bit
+    const __m512i bias     = _mm512_srli_epi32(sign, 32-k);          // bias = x >= 0 ? 0 : 2^k - 1
+    const __m512i biased   = _mm512_add_epi32 (x, bias);             // x + bias
+    const __m512i quotient = _mm512_srai_epi32(biased, k);           // (x + bias) >> k
+    if (d < 0) {
+        return _mm512_sub_epi32(_mm512_setzero_epi32(), quotient);   // d < 0: return -quotient
+    }
+    return quotient;                                                 // d > 0: return  quotient
+}
+
+// define Vec16i a / const_int(d)
+// The divisor is carried in the type of the second argument only
+template <int32_t d>
+static inline Vec16i operator / (Vec16i const & a, Const_int_t<d>) {
+    const Vec16i quotient = divide_by_i<d>(a);
+    return quotient;
+}
+
+// define Vec16i a / const_uint(d)
+// The unsigned constant is converted to int32_t and a signed division is done
+template <uint32_t d>
+static inline Vec16i operator / (Vec16i const & a, Const_uint_t<d>) {
+    Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
+    const Vec16i quotient = divide_by_i<int32_t(d)>(a);              // signed divide
+    return quotient;
+}
+
+// vector operator /= : divide a by the compile-time constant in place
+template <int32_t d>
+static inline Vec16i & operator /= (Vec16i & a, Const_int_t<d> b) {
+    const Vec16i quotient = a / b;
+    a = quotient;
+    return a;
+}
+
+// vector operator /= : divide a by the compile-time unsigned constant in place
+template <uint32_t d>
+static inline Vec16i & operator /= (Vec16i & a, Const_uint_t<d> b) {
+    const Vec16i quotient = a / b;
+    a = quotient;
+    return a;
+}
+
+
+// Divide Vec16ui by compile-time constant
+// d = 1 returns x unchanged and a power of 2 becomes a logical shift.
+// Otherwise every element is multiplied by a rounded 32-bit reciprocal
+// 2^(32+b) / d, the high dword of the 64-bit product is kept, and the result
+// is shifted right by b = floor(log2(d)).
+template <uint32_t d>
+static inline Vec16ui divide_by_ui(Vec16ui const & x) {
+    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
+    if (d == 1) return x;                                            // divide by 1
+    const int b = bit_scan_reverse_const(d);                         // floor(log2(d))
+    if ((uint32_t(d) & (uint32_t(d)-1)) == 0) {
+        // d is a power of 2. use shift
+        return  _mm512_srli_epi32(x, b);                             // x >> b
+    }
+    // general case (d > 2)
+    uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d);           // multiplier = 2^(32+b) / d
+    const uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d
+    const bool round_down = (2*rem < d);                             // check if fraction is less than 0.5
+    if (!round_down) {
+        mult = mult + 1;                                             // round up mult
+    }
+    // do 32*32->64 bit unsigned multiplication and get high part of result
+    // The multiplier must be zero-extended to 64 bits in every lane: the 64-bit
+    // additions below would otherwise also add mult to the high dword that is kept.
+    const __m512i multv = _mm512_set1_epi64((long long)(uint64_t(mult)));  // zero-extend mult and broadcast
+    __m512i t1 = _mm512_mul_epu32(x,multv);                          // 32x32->64 bit unsigned multiplication of even elements
+    if (round_down) {
+        t1      = _mm512_add_epi64(t1,multv);                        // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+    __m512i t2 = _mm512_srli_epi64(t1,32);                           // high dword of even index results
+    __m512i t3 = _mm512_srli_epi64(x,32);                            // get odd elements into position for multiplication
+    __m512i t4 = _mm512_mul_epu32(t3,multv);                         // 32x32->64 bit unsigned multiplication of odd elements
+    if (round_down) {
+        t4      = _mm512_add_epi64(t4,multv);                        // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+    // blend two results: all 8 odd dwords (mask bits 1,3,...,15) come from t4
+    __m512i t7 = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);
+    Vec16ui q  = _mm512_srli_epi32(t7, b);                           // shift right by b
+    return q;                                                        // no overflow possible
+}
+
+// define Vec16ui a / const_uint(d)
+// The divisor is carried in the type of the second argument only
+template <uint32_t d>
+static inline Vec16ui operator / (Vec16ui const & a, Const_uint_t<d>) {
+    const Vec16ui quotient = divide_by_ui<d>(a);
+    return quotient;
+}
+
+// define Vec16ui a / const_int(d)
+// A negative constant is rejected at compile time; otherwise an unsigned division is done
+template <int32_t d>
+static inline Vec16ui operator / (Vec16ui const & a, Const_int_t<d>) {
+    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
+    const Vec16ui quotient = divide_by_ui<d>(a);                     // unsigned divide
+    return quotient;
+}
+
+// vector operator /= : divide a by the compile-time unsigned constant in place
+template <uint32_t d>
+static inline Vec16ui & operator /= (Vec16ui & a, Const_uint_t<d> b) {
+    const Vec16ui quotient = a / b;
+    a = quotient;
+    return a;
+}
+
+// vector operator /= : divide a by the compile-time constant in place
+template <int32_t d>
+static inline Vec16ui & operator /= (Vec16ui & a, Const_int_t<d> b) {
+    const Vec16ui quotient = a / b;
+    a = quotient;
+    return a;
+}
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false
+// The boolean vector is read as a 16-bit mask and scanned from bit 0 upwards.
+static inline int horizontal_find_first(Vec16ib const & x) {
+    const uint32_t bits = uint16_t(__mmask16(x));
+    if (bits == 0) {
+        return -1;
+    }
+    return bit_scan_forward(bits);
+}
+
+// Get index to the first element that is true. Return -1 if all are false
+// The boolean vector is read as an 8-bit mask and scanned from bit 0 upwards.
+static inline int horizontal_find_first(Vec8qb const & x) {
+    const uint32_t bits = uint8_t(__mmask8(x));
+    if (bits == 0) {
+        return -1;
+    }
+    return bit_scan_forward(bits);
+}
+
+// Count the elements of x that are true: population count of the 16-bit mask
+static inline uint32_t horizontal_count(Vec16ib const & x) {
+    const uint32_t bits = uint32_t(uint16_t(__mmask16(x)));
+    return vml_popcnt(bits);
+}
+
+// Count the elements of x that are true.
+// A Vec8qb has only 8 elements, so the mask is truncated to its low 8 bits
+// before counting, in the same way as horizontal_find_first(Vec8qb). Counting
+// all 16 mask bits would include upper bits that do not belong to any element.
+static inline uint32_t horizontal_count(Vec8qb const & x) {
+    return vml_popcnt(uint32_t(uint8_t(__mmask8(x))));
+}
+
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: convert boolean vector to integer bitfield
+// Only elements 0-3 are tested (write mask 0xF). The test flags the elements
+// that are zero, so the four bits are inverted before returning.
+static inline uint8_t to_bits(Vec4ib x) {
+    const __m512i   wide  = _mm512_castsi128_si512(x);
+    const __mmask16 zeros = _mm512_mask_testn_epi32_mask(0xF, wide, wide);
+    return uint8_t(zeros) ^ 0xF;
+}
+
+// to_Vec4ib: convert integer bitfield to boolean vector
+// Elements whose bit in x is set become -1, the others 0; the low 128 bits are returned.
+static inline Vec4ib to_Vec4ib(uint8_t x) {
+    const __m512i wide = _mm512_maskz_set1_epi32(__mmask16(x), -1);
+    return _mm512_castsi512_si128(wide);
+}
+
+// to_bits: convert boolean vector to integer bitfield
+// Only elements 0-1 are tested (write mask 0x3). The test flags the elements
+// that are zero, so the two bits are inverted before returning.
+static inline uint8_t to_bits(Vec2qb x) {
+    const __m512i   wide  = _mm512_castsi128_si512(x);
+    const __mmask16 zeros = _mm512_mask_testn_epi64_mask(0x3, wide, wide);
+    return uint8_t(zeros) ^ 0x3;
+}
+
+// to_Vec2qb: convert integer bitfield to boolean vector
+// Elements whose bit in x is set become -1, the others 0; the low 128 bits are returned.
+static inline Vec2qb to_Vec2qb(uint8_t x) {
+    const __m512i wide = _mm512_maskz_set1_epi64(__mmask16(x), -1LL);
+    return _mm512_castsi512_si128(wide);
+}
+
+// to_bits: convert boolean vector to integer bitfield
+// Only elements 0-7 are tested (write mask 0xFF). The test flags the elements
+// that are zero, so the eight bits are inverted before returning.
+static inline uint8_t to_bits(Vec8ib x) {
+    const __m512i   wide  = _mm512_castsi256_si512(x);
+    const __mmask16 zeros = _mm512_mask_testn_epi32_mask(0xFF, wide, wide);
+    return ~ uint8_t(zeros);
+}
+
+// to_Vec8ib: convert integer bitfield to boolean vector
+// Elements whose bit in x is set become -1, the others 0; the low 256 bits are returned.
+static inline Vec8ib to_Vec8ib(uint8_t x) {
+    const __m512i wide = _mm512_maskz_set1_epi32(__mmask16(x), -1);
+    return _mm512_castsi512_si256(wide);
+}
+
+// to_bits: convert boolean vector to integer bitfield
+// Only elements 0-3 are tested (write mask 0xF). The test flags the elements
+// that are zero, so the four bits are inverted before returning.
+static inline uint8_t to_bits(Vec4qb x) {
+    const __m512i   wide  = _mm512_castsi256_si512(x);
+    const __mmask16 zeros = _mm512_mask_testn_epi64_mask(0xF, wide, wide);
+    return uint8_t(zeros) ^ 0xF;
+}
+
+// to_Vec4qb: convert integer bitfield to boolean vector
+// Elements whose bit in x is set become -1, the others 0; the low 256 bits are returned.
+static inline Vec4qb to_Vec4qb(uint8_t x) {
+    const __m512i wide = _mm512_maskz_set1_epi64(__mmask16(x), -1LL);
+    return _mm512_castsi512_si256(wide);
+}
+
+
+// to_bits: convert to integer bitfield
+// The boolean vector is converted to its 16-bit mask, one bit per element.
+static inline uint16_t to_bits(Vec16b a) {
+    const __mmask16 mask = (__mmask16)a;
+    return (uint16_t)mask;
+}
+
+// to_Vec16b: convert integer bitfield to boolean vector
+// The 16 bits of x are used directly as the mask, one bit per element.
+static inline Vec16b to_Vec16b(uint16_t x) {
+    const __mmask16 mask = (__mmask16)x;
+    return mask;
+}
+
+// to_Vec16ib: convert integer bitfield to boolean vector
+// Builds the mask with to_Vec16b and converts it to Vec16ib.
+static inline Vec16ib to_Vec16ib(uint16_t x) {
+    const Vec16b mask = to_Vec16b(x);
+    return mask;
+}
+
+// to_Vec8qb: convert integer bitfield to boolean vector
+// The 8 bits of x are used directly as the mask, one bit per element.
+static inline Vec8qb to_Vec8qb(uint8_t x) {
+    const __mmask8 mask = (__mmask8)x;
+    return mask;
+}
+
+#endif // VECTORI512_H
diff --git a/vectorclass/vectori512e.h b/vectorclass/vectori512e.h
new file mode 100755
index 0000000..de7dac6
--- /dev/null
+++ b/vectorclass/vectori512e.h
@@ -0,0 +1,2545 @@
+/****************************  vectori512e.h   *******************************
+* Author:        Agner Fog
+* Date created:  2014-07-23
+* Last modified: 2014-10-16
+* Version:       1.16
+* Project:       vector classes
+* Description:
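+* Header file defining 512-bit integer vector classes for x86 microprocessors.
+* This version emulates each 512-bit vector with two 256-bit halves, for use
+* when the AVX512 instruction set is not supported.
+*
+* Instructions:
+* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired 
+* instruction set. This emulation header is intended for instruction sets
+* below AVX512; vectori512.h is the counterpart for AVX512 and later.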
+*
+* The following vector classes are defined here:
+* Vec16i    Vector of  16  32-bit signed   integers
+* Vec16ui   Vector of  16  32-bit unsigned integers
+* Vec16ib   Vector of  16  Booleans for use with Vec16i and Vec16ui
+* Vec8q     Vector of   8  64-bit signed   integers
+* Vec8uq    Vector of   8  64-bit unsigned integers
+* Vec8qb    Vector of   8  Booleans for use with Vec8q and Vec8uq
+*
+* Each vector object is represented internally as two 256-bit halves (low and high).
+* This header file defines operators and functions for these vectors.
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+*****************************************************************************/
+
+// check combination of header files
+#if defined (VECTORI512_H)
+#if    VECTORI512_H != 1
+#error Two different versions of vectori512.h included
+#endif
+#else
+#define VECTORI512_H  1
+
+
+/*****************************************************************************
+*
+*          base class Vec512ie
+*
+*****************************************************************************/
+// base class to replace __m512i when AVX512 is not supported:
+// the 512 bits are kept as two 256-bit halves
+class Vec512ie {
+protected:
+    Vec256b z0;                         // low half
+    Vec256b z1;                         // high half
+public:
+    // default constructor
+    Vec512ie() {
+    }
+    // constructor to build from two Vec8i (x0 = low half, x1 = high half)
+    Vec512ie(Vec8i const & x0, Vec8i const & x1) : z0(x0), z1(x1) {
+    }
+    // get low half
+    Vec8i get_low() const {
+        return Vec8i(z0);
+    }
+    // get high half
+    Vec8i get_high() const {
+        return Vec8i(z1);
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vector of 512 1-bit unsigned integers or Booleans
+*
+*****************************************************************************/
+class Vec512b : public Vec512ie {
+public:
+    // Default constructor:
+    Vec512b() {
+    }
+    // Constructor to build from two Vec256b (a0 = low half, a1 = high half):
+    Vec512b(Vec256b const & a0, Vec256b const & a1) {
+        z0 = a0;  z1 = a1;
+    }
+    // Constructor to convert from type Vec512ie
+    Vec512b(Vec512ie const & x) {
+        z0 = x.get_low();  z1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec512ie
+    Vec512b & operator = (Vec512ie const & x) {
+        z0 = x.get_low();  z1 = x.get_high();
+        return *this;
+    }
+    // Member function to load from array (unaligned).
+    // The high half is read starting 8 int32_t elements past p.
+    Vec512b & load(void const * p) {
+        z0 = Vec8i().load(p);
+        z1 = Vec8i().load((int32_t const*)p+8);    // cast keeps the const qualifier of p
+        return *this;
+    }
+    // Member function to load from array, aligned by 64.
+    // The high half is read starting 8 int32_t elements past p.
+    Vec512b & load_a(void const * p) {
+        z0 = Vec8i().load_a(p);
+        z1 = Vec8i().load_a((int32_t const*)p+8);  // cast keeps the const qualifier of p
+        return *this;
+    }
+    // Member function to store into array (unaligned).
+    // The high half is written starting 8 int32_t elements past p.
+    void store(void * p) const {
+        Vec8i(z0).store(p);
+        Vec8i(z1).store((int32_t*)p+8);
+    }
+    // Member function to store into array, aligned by 64.
+    // The high half is written starting 8 int32_t elements past p.
+    void store_a(void * p) const {
+        Vec8i(z0).store_a(p);
+        Vec8i(z1).store_a((int32_t*)p+8);
+    }
+    // Member function to change a single bit
+    // Bits 0..255 live in the low half; higher indexes are forwarded to the
+    // high half with 256 subtracted.
+    // Note: This function is inefficient. Use load function if changing more than one bit
+    Vec512b const & set_bit(uint32_t index, int value) {
+        if (index < 256) {
+            z0 = Vec8i(z0).set_bit(index, value);
+        }
+        else {
+            z1 = Vec8i(z1).set_bit(index-256, value);
+        }
+        return *this;
+    }
+    // Member function to get a single bit
+    // Bits 0..255 live in the low half; higher indexes are forwarded to the
+    // high half with 256 subtracted.
+    // Note: This function is inefficient. Use store function if reading more than one bit
+    int get_bit(uint32_t index) const {
+        if (index < 256) {
+            return Vec8i(z0).get_bit(index);
+        }
+        else {
+            return Vec8i(z1).get_bit(index-256);
+        }
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return get_bit(index) != 0;
+    }
+    // Member functions to split into two Vec256b:
+    Vec256b get_low() const {
+        return z0;
+    }
+    Vec256b get_high() const {
+        return z1;
+    }
+    // Number of elements (bits) in the vector
+    static int size () {
+        return 512;
+    }
+};
+
+// Define operators for this class
+
+// vector operator & : bitwise and, computed separately on each 256-bit half
+static inline Vec512b operator & (Vec512b const & a, Vec512b const & b) {
+    Vec256b const lo = a.get_low()  & b.get_low();
+    Vec256b const hi = a.get_high() & b.get_high();
+    return Vec512b(lo, hi);
+}
+// vector operator && : same result as operator &
+static inline Vec512b operator && (Vec512b const & a, Vec512b const & b) {
+    return a & b;
+}
+
+// vector operator | : bitwise or, computed separately on each 256-bit half
+static inline Vec512b operator | (Vec512b const & a, Vec512b const & b) {
+    Vec256b const lo = a.get_low()  | b.get_low();
+    Vec256b const hi = a.get_high() | b.get_high();
+    return Vec512b(lo, hi);
+}
+// vector operator || : same result as operator |
+static inline Vec512b operator || (Vec512b const & a, Vec512b const & b) {
+    return a | b;
+}
+
+// vector operator ^ : bitwise xor, computed separately on each 256-bit half
+static inline Vec512b operator ^ (Vec512b const & a, Vec512b const & b) {
+    Vec256b const lo = a.get_low()  ^ b.get_low();
+    Vec256b const hi = a.get_high() ^ b.get_high();
+    return Vec512b(lo, hi);
+}
+
+// vector operator ~ : bitwise not, computed separately on each 256-bit half
+static inline Vec512b operator ~ (Vec512b const & a) {
+    Vec256b const lo = ~a.get_low();
+    Vec256b const hi = ~a.get_high();
+    return Vec512b(lo, hi);
+}
+
+// vector operator &= : bitwise and, result stored in a
+static inline Vec512b & operator &= (Vec512b & a, Vec512b const & b) {
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, result stored in a
+static inline Vec512b & operator |= (Vec512b & a, Vec512b const & b) {
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec512b & operator ^= (Vec512b & a, Vec512b const & b) {
+    return a = a ^ b;
+}
+
+// Define functions for this class
+
+// function andnot: a & ~ b, computed separately on each 256-bit half
+static inline Vec512b andnot (Vec512b const & a, Vec512b const & b) {
+    Vec256b const lo = andnot(a.get_low(),  b.get_low());
+    Vec256b const hi = andnot(a.get_high(), b.get_high());
+    return Vec512b(lo, hi);
+}
+
+
+
+/*****************************************************************************
+*
+*          Generate compile-time constant vector
+*
+*****************************************************************************/
+// Generate a constant vector of 16 integers stored in memory.
+// The 16 template arguments i0..i15 become elements 0..15.
+// Can be converted to any integer vector type
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec512ie constant16i() {
+    // The union is initialized through its int32_t array and read back as
+    // two Vec256b halves (y[0] = low half, y[1] = high half)
+    static const union {
+        int32_t i[16];
+        Vec256b y[2];  // note: requires C++0x or later. Use option -std=c++0x
+    } u = {{i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15}};
+    return Vec512ie(u.y[0], u.y[1]);
+}
+
+
+/*****************************************************************************
+*
+*          Boolean vector base classes for AVX512
+*
+*****************************************************************************/
+
+class Vec16b : public Vec512b {
+public:
+    // Default constructor
+    Vec16b () {
+    }
+    // Build from 16 individual booleans, b0 being element 0.
+    // Each bool is negated as an int, so true is stored as -1 and false as 0.
+    Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, 
+    bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) {
+        Vec8i const lower(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7);
+        Vec8i const upper(-(int)b8, -(int)b9, -(int)b10, -(int)b11, -(int)b12, -(int)b13, -(int)b14, -(int)b15);
+        *this = Vec512b(lower, upper);
+    }
+    // Convert from type Vec512b
+    Vec16b (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Build from two boolean halves (x0 = low, x1 = high)
+    Vec16b (Vec8ib const & x0, Vec8ib const & x1) {
+        z0 = x0;
+        z1 = x1;
+    }
+    // Build from two integer halves (x0 = low, x1 = high)
+    Vec16b (Vec8i const & x0, Vec8i const & x1) {
+        z0 = x0;
+        z1 = x1;
+    }
+    // Broadcast a single boolean to all elements (true -> -1, false -> 0)
+    Vec16b(bool b) {
+        z1 = Vec8i(-int32_t(b));
+        z0 = z1;
+    }
+    // Assign a single boolean to all elements (true -> -1, false -> 0)
+    Vec16b & operator = (bool b) {
+        z1 = Vec8i(-int32_t(b));
+        z0 = z1;
+        return *this;
+    }
+private:
+    // Prevent constructing from int, etc. because of ambiguity
+    Vec16b(int b);
+    // Prevent assigning int because of ambiguity
+    Vec16b & operator = (int x);
+public:
+    // Low half as a vector of 8 booleans
+    Vec8ib get_low() const {
+        return Vec8ib(z0);
+    }
+    // High half as a vector of 8 booleans
+    Vec8ib get_high() const {
+        return Vec8ib(z1);
+    }
+    // Assignment operator to convert from type Vec512b
+    Vec16b & operator = (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+        return *this;
+    }
+    // Change a single element. Indexes below 8 go to the low half, all
+    // others to the high half with 8 subtracted.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec16b const & insert(uint32_t index, bool value) {
+        if (index >= 8) {
+            z1 = Vec8ib(z1).insert(index-8, value);
+        }
+        else {
+            z0 = Vec8ib(z0).insert(index, value);
+        }
+        return *this;
+    }
+    // Read a single element. Indexes below 8 come from the low half, all
+    // others from the high half with 8 subtracted.
+    bool extract(uint32_t index) const {
+        return (index < 8) ? Vec8ib(z0).extract(index)
+                           : Vec8ib(z1).extract(index-8);
+    }
+    // Extract a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Number of elements in the vector
+    static int size () {
+        return 16;
+    }
+};
+
+// Define operators for this class
+
+// vector operator & : bitwise and, computed separately on each 8-element half
+static inline Vec16b operator & (Vec16b const & a, Vec16b const & b) {
+    Vec8ib const lo = a.get_low()  & b.get_low();
+    Vec8ib const hi = a.get_high() & b.get_high();
+    return Vec16b(lo, hi);
+}
+// vector operator && : same result as operator &
+static inline Vec16b operator && (Vec16b const & a, Vec16b const & b) {
+    return a & b;
+}
+
+// vector operator | : bitwise or, computed separately on each 8-element half
+static inline Vec16b operator | (Vec16b const & a, Vec16b const & b) {
+    Vec8ib const lo = a.get_low()  | b.get_low();
+    Vec8ib const hi = a.get_high() | b.get_high();
+    return Vec16b(lo, hi);
+}
+// vector operator || : same result as operator |
+static inline Vec16b operator || (Vec16b const & a, Vec16b const & b) {
+    return a | b;
+}
+
+// vector operator ^ : bitwise xor, computed separately on each 8-element half
+static inline Vec16b operator ^ (Vec16b const & a, Vec16b const & b) {
+    Vec8ib const lo = a.get_low()  ^ b.get_low();
+    Vec8ib const hi = a.get_high() ^ b.get_high();
+    return Vec16b(lo, hi);
+}
+
+// vector operator ~ : bitwise not, computed separately on each 8-element half
+static inline Vec16b operator ~ (Vec16b const & a) {
+    Vec8ib const lo = ~(a.get_low());
+    Vec8ib const hi = ~(a.get_high());
+    return Vec16b(lo, hi);
+}
+
+// vector operator ! : element not, implemented as the bitwise complement
+static inline Vec16b operator ! (Vec16b const & a) {
+    Vec16b const inverted = ~a;
+    return inverted;
+}
+
+// vector operator &= : bitwise and, result stored in a
+static inline Vec16b & operator &= (Vec16b & a, Vec16b const & b) {
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, result stored in a
+static inline Vec16b & operator |= (Vec16b & a, Vec16b const & b) {
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec16b & operator ^= (Vec16b & a, Vec16b const & b) {
+    return a = a ^ b;
+}
+
+/*****************************************************************************
+*
+*          Functions for boolean vectors
+*
+*****************************************************************************/
+
+// function andnot: a & ~ b, computed separately on each 8-element half
+static inline Vec16b andnot (Vec16b const & a, Vec16b const & b) {
+    Vec8ib const lo(andnot(a.get_low(),  b.get_low()));
+    Vec8ib const hi(andnot(a.get_high(), b.get_high()));
+    return Vec16b(lo, hi);
+}
+
+// horizontal_and. Returns true if all bits are 1.
+// The two halves are and-ed together first, then reduced once.
+static inline bool horizontal_and (Vec16b const & a) {
+    Vec8ib const combined = a.get_low() & a.get_high();
+    return horizontal_and(combined);
+}
+
+// horizontal_or. Returns true if at least one bit is 1.
+// The two halves are or-ed together first, then reduced once.
+static inline bool horizontal_or (Vec16b const & a) {
+    Vec8ib const combined = a.get_low() | a.get_high();
+    return horizontal_or(combined);
+}
+
+
+/*****************************************************************************
+*
+*          Vec16ib: Vector of 16 Booleans for use with Vec16i and Vec16ui
+*
+*****************************************************************************/
+
+class Vec16ib : public Vec16b {
+public:
+    // Default constructor
+    Vec16ib () {
+    }
+    // Convert from the generic 16-element boolean vector
+    Vec16ib (Vec16b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Build from 16 individual booleans, x0 being element 0
+    Vec16ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
+        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) {
+        z0 = Vec8ib(x0, x1, x2, x3, x4, x5, x6, x7);
+        z1 = Vec8ib(x8, x9, x10, x11, x12, x13, x14, x15);
+    }
+    // Convert from type Vec512b
+    Vec16ib (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Build from two boolean halves (x0 = low, x1 = high)
+    Vec16ib (Vec8ib const & x0, Vec8ib const & x1) {
+        z0 = x0;
+        z1 = x1;
+    }
+    // Assignment operator to convert from type Vec512b
+    Vec16ib & operator = (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+        return *this;
+    }
+    // Broadcast a scalar boolean to all elements (delegates to Vec16b)
+    Vec16ib(bool b) : Vec16b(b) {
+    }
+    // Assign a scalar boolean to all elements
+    Vec16ib & operator = (bool b) {
+        return *this = Vec16b(b);
+    }
+private:
+    // Prevent constructing from int, etc.
+    Vec16ib(int b);
+    // Prevent assigning int
+    Vec16ib & operator = (int x);
+public:
+};
+
+// Define operators for Vec16ib
+
+// vector operator & : bitwise and, forwarded to the Vec16b operator
+static inline Vec16ib operator & (Vec16ib const & a, Vec16ib const & b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs & rhs;
+}
+// vector operator && : same result as operator &
+static inline Vec16ib operator && (Vec16ib const & a, Vec16ib const & b) {
+    return a & b;
+}
+
+// vector operator | : bitwise or, forwarded to the Vec16b operator
+static inline Vec16ib operator | (Vec16ib const & a, Vec16ib const & b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs | rhs;
+}
+// vector operator || : same result as operator |
+static inline Vec16ib operator || (Vec16ib const & a, Vec16ib const & b) {
+    return a | b;
+}
+
+// vector operator ^ : bitwise xor, forwarded to the Vec16b operator
+static inline Vec16ib operator ^ (Vec16ib const & a, Vec16ib const & b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs ^ rhs;
+}
+
+// vector operator ~ : bitwise not, forwarded to the Vec16b operator
+static inline Vec16ib operator ~ (Vec16ib const & a) {
+    Vec16b const operand(a);
+    return ~operand;
+}
+
+// vector operator ! : element not, implemented as the bitwise complement
+static inline Vec16ib operator ! (Vec16ib const & a) {
+    Vec16ib const inverted = ~a;
+    return inverted;
+}
+
+// vector operator &= : bitwise and, result stored in a
+static inline Vec16ib & operator &= (Vec16ib & a, Vec16ib const & b) {
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, result stored in a
+static inline Vec16ib & operator |= (Vec16ib & a, Vec16ib const & b) {
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec16ib & operator ^= (Vec16ib & a, Vec16ib const & b) {
+    return a = a ^ b;
+}
+
+// vector function andnot, forwarded to the Vec16b version
+static inline Vec16ib andnot (Vec16ib const & a, Vec16ib const & b) {
+    Vec16b const masked = andnot(Vec16b(a), Vec16b(b));
+    return Vec16ib(masked);
+}
+
+
+/*****************************************************************************
+*
+*          Vec8b: Base class vector of 8 Booleans
+*
+*****************************************************************************/
+
+class Vec8b : public Vec16b {
+public:
+    // Default constructor
+    Vec8b () {
+    }
+    // Convert from type Vec16b
+    Vec8b (Vec16b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Convert from type Vec512b
+    Vec8b (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Build from two halves of four booleans each (x0 = low, x1 = high)
+    Vec8b (Vec4qb const & x0, Vec4qb const & x1) {
+        z0 = x0;
+        z1 = x1;
+    }
+    // Broadcast a single boolean to all elements (true -> -1, false -> 0)
+    Vec8b(bool b) {
+        z1 = Vec8i(-int32_t(b));
+        z0 = z1;
+    }
+    // Assign a single boolean to all elements (true -> -1, false -> 0)
+    Vec8b & operator = (bool b) {
+        z1 = Vec8i(-int32_t(b));
+        z0 = z1;
+        return *this;
+    }
+private:
+    // Prevent constructing from int, etc. because of ambiguity
+    Vec8b(int b);
+    // Prevent assigning int because of ambiguity
+    Vec8b & operator = (int x);
+public:
+    // Low half as a vector of 4 booleans
+    Vec4qb get_low() const {
+        return Vec4qb(z0);
+    }
+    // High half as a vector of 4 booleans
+    Vec4qb get_high() const {
+        return Vec4qb(z1);
+    }
+    // Assignment operator to convert from type Vec512b
+    Vec8b & operator = (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+        return *this;
+    }
+    // Change a single element. Indexes below 4 go to the low half, all
+    // others to the high half with 4 subtracted.
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8b const & insert(uint32_t index, bool value) {
+        if (index >= 4) {
+            z1 = Vec4qb(z1).insert(index-4, value);
+        }
+        else {
+            z0 = Vec4qb(z0).insert(index, value);
+        }
+        return *this;
+    }
+    // Read a single element. Indexes below 4 come from the low half, all
+    // others from the high half with 4 subtracted.
+    bool extract(uint32_t index) const {
+        return (index < 4) ? Vec4qb(Vec4q(z0)).extract(index)
+                           : Vec4qb(Vec4q(z1)).extract(index-4);
+    }
+    // Extract a single element. Operator [] can only read an element, not write.
+    bool operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Number of elements in the vector
+    static int size () {
+        return 8;
+    }
+};
+
+
+/*****************************************************************************
+*
+*          Vec8qb: Vector of 8 Booleans for use with Vec8q and Vec8uq
+*
+*****************************************************************************/
+
+class Vec8qb : public Vec8b {
+public:
+    // Default constructor
+    Vec8qb () {
+    }
+    // Convert from type Vec16b
+    Vec8qb (Vec16b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Build from 8 individual booleans, x0 being element 0
+    Vec8qb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) {
+        z0 = Vec4qb(x0, x1, x2, x3);
+        z1 = Vec4qb(x4, x5, x6, x7);
+    }
+    // Convert from type Vec512b
+    Vec8qb (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Build from two halves of four booleans each (x0 = low, x1 = high)
+    Vec8qb (Vec4qb const & x0, Vec4qb const & x1) {
+        z0 = x0;
+        z1 = x1;
+    }
+    // Assignment operator to convert from type Vec512b
+    Vec8qb & operator = (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+        return *this;
+    }
+    // Broadcast a scalar boolean to all elements (delegates to Vec8b)
+    Vec8qb(bool b) : Vec8b(b) {
+    }
+    // Assign a scalar boolean to all elements
+    Vec8qb & operator = (bool b) {
+        return *this = Vec8b(b);
+    }
+private:
+    // Prevent constructing from int, etc. because of ambiguity
+    Vec8qb(int b);
+    // Prevent assigning int because of ambiguity
+    Vec8qb & operator = (int x);
+public:
+};
+
+// Define operators for Vec8qb
+
+// vector operator & : bitwise and, forwarded to the Vec16b operator
+static inline Vec8qb operator & (Vec8qb const & a, Vec8qb const & b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs & rhs;
+}
+// vector operator && : same result as operator &
+static inline Vec8qb operator && (Vec8qb const & a, Vec8qb const & b) {
+    return a & b;
+}
+
+// vector operator | : bitwise or, forwarded to the Vec16b operator
+static inline Vec8qb operator | (Vec8qb const & a, Vec8qb const & b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs | rhs;
+}
+// vector operator || : same result as operator |
+static inline Vec8qb operator || (Vec8qb const & a, Vec8qb const & b) {
+    return a | b;
+}
+
+// vector operator ^ : bitwise xor, forwarded to the Vec16b operator
+static inline Vec8qb operator ^ (Vec8qb const & a, Vec8qb const & b) {
+    Vec16b const lhs(a);
+    Vec16b const rhs(b);
+    return lhs ^ rhs;
+}
+
+// vector operator ~ : bitwise not, forwarded to the Vec16b operator
+static inline Vec8qb operator ~ (Vec8qb const & a) {
+    Vec16b const operand(a);
+    return ~operand;
+}
+
+// vector operator ! : element not, implemented as the bitwise complement
+static inline Vec8qb operator ! (Vec8qb const & a) {
+    Vec8qb const inverted = ~a;
+    return inverted;
+}
+
+// vector operator &= : bitwise and, result stored in a
+static inline Vec8qb & operator &= (Vec8qb & a, Vec8qb const & b) {
+    return a = a & b;
+}
+
+// vector operator |= : bitwise or, result stored in a
+static inline Vec8qb & operator |= (Vec8qb & a, Vec8qb const & b) {
+    return a = a | b;
+}
+
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec8qb & operator ^= (Vec8qb & a, Vec8qb const & b) {
+    return a = a ^ b;
+}
+
+// vector function andnot, forwarded to the Vec16b version
+static inline Vec8qb andnot (Vec8qb const & a, Vec8qb const & b) {
+    Vec16b const masked = andnot(Vec16b(a), Vec16b(b));
+    return Vec8qb(masked);
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 16 32-bit signed integers
+*
+*****************************************************************************/
+
+class Vec16i: public Vec512b {
+public:
+    // Default constructor:
+    Vec16i() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec16i(int i) {
+        z0 = z1 = Vec8i(i);
+    }
+    // Constructor to build from all elements (i0 is element 0):
+    Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7,
+    int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15) {
+        z0 = Vec8i(i0, i1, i2, i3, i4, i5, i6, i7);
+        z1 = Vec8i(i8, i9, i10, i11, i12, i13, i14, i15);
+    }
+    // Constructor to build from two Vec8i (a0 = low half, a1 = high half):
+    Vec16i(Vec8i const & a0, Vec8i const & a1) {
+        *this = Vec512b(a0, a1);
+    }
+    // Constructor to convert from type Vec512b
+    Vec16i(Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Assignment operator to convert from type Vec512b
+    Vec16i & operator = (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+        return *this;
+    }
+    // Member function to load from array (unaligned)
+    Vec16i & load(void const * p) {
+        Vec512b::load(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 64
+    Vec16i & load_a(void const * p) {
+        Vec512b::load_a(p);
+        return *this;
+    }
+    // Partial load. Load n elements and set the rest to 0
+    // For n < 8 only the low half is read and the high half is zeroed;
+    // otherwise the low half is loaded in full and the remaining n-8
+    // elements are read starting 8 int32_t elements past p.
+    Vec16i & load_partial(int n, void const * p) {
+        if (n < 8) {
+            z0 = Vec8i().load_partial(n, p);
+            z1 = Vec8i(0);
+        }
+        else {
+            z0 = Vec8i().load(p);
+            z1 = Vec8i().load_partial(n - 8, (int32_t const *)p + 8);  // cast keeps the const qualifier of p
+        }
+        return *this;
+    }
+    // Partial store. Store n elements
+    // For n < 8 only part of the low half is written; otherwise the low
+    // half is stored in full followed by n-8 elements of the high half.
+    void store_partial(int n, void * p) const {
+        if (n < 8) {
+            Vec8i(get_low()).store_partial(n, p);
+        }
+        else {
+            Vec8i(get_low()).store(p);
+            Vec8i(get_high()).store_partial(n - 8, (int32_t *)p + 8);
+        }
+    }
+    // cut off vector to n elements. The last 16-n elements are set to zero
+    Vec16i & cutoff(int n) {
+        if (n < 8) {
+            z0 = Vec8i(z0).cutoff(n);
+            z1 = Vec8i(0);
+        }
+        else {
+            // the low half is kept entirely; only the high half is trimmed
+            z1 = Vec8i(z1).cutoff(n - 8);
+        }
+        return *this;
+    }
+    // Member function to change a single element in vector
+    // Indexes below 8 go to the low half, all others to the high half
+    // with 8 subtracted.
+    Vec16i const & insert(uint32_t index, int32_t value) {
+        if (index < 8) {
+            z0 = Vec8i(z0).insert(index, value);
+        }
+        else {
+            z1 = Vec8i(z1).insert(index - 8, value);
+        }
+        return *this;
+    }
+    // Member function extract a single element from vector
+    // Indexes below 8 come from the low half, all others from the high
+    // half with 8 subtracted.
+    int32_t extract(uint32_t index) const {
+        if (index < 8) {
+            return Vec8i(z0).extract(index);
+        }
+        else {
+            return Vec8i(z1).extract(index - 8);
+        }
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    int32_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec8i:
+    Vec8i get_low() const {
+        return Vec8i(z0);
+    }
+    Vec8i get_high() const {
+        return Vec8i(z1);
+    }
+    // Number of elements in the vector
+    static int size () {
+        return 16;
+    }
+};
+
+
+// Define operators for Vec16i
+
+// vector operator + : add element by element, one 8-element half at a time
+static inline Vec16i operator + (Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = a.get_low()  + b.get_low();
+    Vec8i const hi = a.get_high() + b.get_high();
+    return Vec16i(lo, hi);
+}
+
+// vector operator += : add, result stored in a
+static inline Vec16i & operator += (Vec16i & a, Vec16i const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1 to every element, return the value from before the increment
+static inline Vec16i operator ++ (Vec16i & a, int) {
+    Vec16i const previous = a;
+    a = a + 1;
+    return previous;
+}
+
+// prefix operator ++ : add 1 to every element, return the updated vector
+static inline Vec16i & operator ++ (Vec16i & a) {
+    return a = a + 1;
+}
+
+// vector operator - : subtract element by element, one 8-element half at a time
+static inline Vec16i operator - (Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = a.get_low()  - b.get_low();
+    Vec8i const hi = a.get_high() - b.get_high();
+    return Vec16i(lo, hi);
+}
+
+// vector operator - : unary minus, one 8-element half at a time
+static inline Vec16i operator - (Vec16i const & a) {
+    Vec8i const lo = -a.get_low();
+    Vec8i const hi = -a.get_high();
+    return Vec16i(lo, hi);
+}
+
+// vector operator -= : subtract, result stored in a
+static inline Vec16i & operator -= (Vec16i & a, Vec16i const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1 from every element, return the value from before the decrement
+static inline Vec16i operator -- (Vec16i & a, int) {
+    Vec16i const previous = a;
+    a = a - 1;
+    return previous;
+}
+
+// prefix operator -- : subtract 1 from every element, return the updated vector
+static inline Vec16i & operator -- (Vec16i & a) {
+    return a = a - 1;
+}
+
+// vector operator * : multiply element by element, one 8-element half at a time
+static inline Vec16i operator * (Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = a.get_low()  * b.get_low();
+    Vec8i const hi = a.get_high() * b.get_high();
+    return Vec16i(lo, hi);
+}
+
+// vector operator *= : multiply, result stored in a
+static inline Vec16i & operator *= (Vec16i & a, Vec16i const & b) {
+    return a = a * b;
+}
+
+// vector operator / : divide all elements by same integer
+// See bottom of file
+
+
+// vector operator << : shift all elements left by the same count b
+static inline Vec16i operator << (Vec16i const & a, int32_t b) {
+    Vec8i const lo = a.get_low()  << b;
+    Vec8i const hi = a.get_high() << b;
+    return Vec16i(lo, hi);
+}
+
+// vector operator <<= : shift left, result stored in a
+static inline Vec16i & operator <<= (Vec16i & a, int32_t b) {
+    return a = a << b;
+}
+
+// vector operator >> : shift all elements right (arithmetic) by the same count b
+static inline Vec16i operator >> (Vec16i const & a, int32_t b) {
+    Vec8i const lo = a.get_low()  >> b;
+    Vec8i const hi = a.get_high() >> b;
+    return Vec16i(lo, hi);
+}
+
+// vector operator >>= : shift right arithmetic, result stored in a
+static inline Vec16i & operator >>= (Vec16i & a, int32_t b) {
+    return a = a >> b;
+}
+
+// vector operator == : returns true for elements for which a == b
+static inline Vec16ib operator == (Vec16i const & a, Vec16i const & b) {
+    Vec8ib const lo = a.get_low()  == b.get_low();
+    Vec8ib const hi = a.get_high() == b.get_high();
+    return Vec16ib(lo, hi);
+}
+
+// vector operator != : returns true for elements for which a != b
+static inline Vec16ib operator != (Vec16i const & a, Vec16i const & b) {
+    Vec8ib const lo = a.get_low()  != b.get_low();
+    Vec8ib const hi = a.get_high() != b.get_high();
+    return Vec16ib(lo, hi);
+}
+  
+// vector operator > : returns true for elements for which a > b
+static inline Vec16ib operator > (Vec16i const & a, Vec16i const & b) {
+    Vec8ib const lo = a.get_low()  > b.get_low();
+    Vec8ib const hi = a.get_high() > b.get_high();
+    return Vec16ib(lo, hi);
+}
+
+// vector operator < : returns true for elements for which a < b
+// (evaluated as b > a with the operands swapped)
+static inline Vec16ib operator < (Vec16i const & a, Vec16i const & b) {
+    Vec16ib const swapped = b > a;
+    return swapped;
+}
+
+// vector operator >= : returns true for elements for which a >= b (signed)
+static inline Vec16ib operator >= (Vec16i const & a, Vec16i const & b) {
+    Vec8ib const lo = a.get_low()  >= b.get_low();
+    Vec8ib const hi = a.get_high() >= b.get_high();
+    return Vec16ib(lo, hi);
+}
+
+// vector operator <= : returns true for elements for which a <= b (signed)
+// (evaluated as b >= a with the operands swapped)
+static inline Vec16ib operator <= (Vec16i const & a, Vec16i const & b) {
+    Vec16ib const swapped = b >= a;
+    return swapped;
+}
+
+// vector operator & : bitwise and, one 8-element half at a time
+static inline Vec16i operator & (Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = a.get_low()  & b.get_low();
+    Vec8i const hi = a.get_high() & b.get_high();
+    return Vec16i(lo, hi);
+}
+
+// vector operator &= : bitwise and, result stored in a
+static inline Vec16i & operator &= (Vec16i & a, Vec16i const & b) {
+    return a = a & b;
+}
+
+// vector operator | : bitwise or, one 8-element half at a time
+static inline Vec16i operator | (Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = a.get_low()  | b.get_low();
+    Vec8i const hi = a.get_high() | b.get_high();
+    return Vec16i(lo, hi);
+}
+
+// vector operator |= : bitwise or, result stored in a
+static inline Vec16i & operator |= (Vec16i & a, Vec16i const & b) {
+    return a = a | b;
+}
+
+// vector operator ^ : bitwise xor, one 8-element half at a time
+static inline Vec16i operator ^ (Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = a.get_low()  ^ b.get_low();
+    Vec8i const hi = a.get_high() ^ b.get_high();
+    return Vec16i(lo, hi);
+}
+
+// vector operator ^= : bitwise xor, result stored in a
+static inline Vec16i & operator ^= (Vec16i & a, Vec16i const & b) {
+    return a = a ^ b;
+}
+
+// vector operator ~ : bitwise not, one 8-element half at a time
+static inline Vec16i operator ~ (Vec16i const & a) {
+    Vec8i const lo = ~(a.get_low());
+    Vec8i const hi = ~(a.get_high());
+    return Vec16i(lo, hi);
+}
+
+// Functions for this class
+
+// Select between two operands, element by element. Equivalent pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec16i select (Vec16ib const & s, Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = select(s.get_low(),  a.get_low(),  b.get_low());
+    Vec8i const hi = select(s.get_high(), a.get_high(), b.get_high());
+    return Vec16i(lo, hi);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16i if_add (Vec16ib const & f, Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = if_add(f.get_low(),  a.get_low(),  b.get_low());
+    Vec8i const hi = if_add(f.get_high(), a.get_high(), b.get_high());
+    return Vec16i(lo, hi);
+}
+
+// Horizontal add: Calculates the sum of all vector elements.
+// The two halves are added element-wise first, then reduced once.
+// Overflow will wrap around
+static inline int32_t horizontal_add (Vec16i const & a) {
+    Vec8i const pairwise = a.get_low() + a.get_high();
+    return horizontal_add(pairwise);
+}
+
+// function add_saturated: add element by element, signed with saturation
+static inline Vec16i add_saturated(Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = add_saturated(a.get_low(),  b.get_low());
+    Vec8i const hi = add_saturated(a.get_high(), b.get_high());
+    return Vec16i(lo, hi);
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec16i sub_saturated(Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = sub_saturated(a.get_low(),  b.get_low());
+    Vec8i const hi = sub_saturated(a.get_high(), b.get_high());
+    return Vec16i(lo, hi);
+}
+
+// function max: a > b ? a : b, element by element
+static inline Vec16i max(Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = max(a.get_low(),  b.get_low());
+    Vec8i const hi = max(a.get_high(), b.get_high());
+    return Vec16i(lo, hi);
+}
+
+// function min: a < b ? a : b, element by element
+static inline Vec16i min(Vec16i const & a, Vec16i const & b) {
+    Vec8i const lo = min(a.get_low(),  b.get_low());
+    Vec8i const hi = min(a.get_high(), b.get_high());
+    return Vec16i(lo, hi);
+}
+
+// function abs: a >= 0 ? a : -a, element by element
+static inline Vec16i abs(Vec16i const & a) {
+    Vec8i const lo = abs(a.get_low());
+    Vec8i const hi = abs(a.get_high());
+    return Vec16i(lo, hi);
+}
+
+// function abs_saturated: same as abs, saturate if overflow
+static inline Vec16i abs_saturated(Vec16i const & a) {
+    Vec8i const lo = abs_saturated(a.get_low());
+    Vec8i const hi = abs_saturated(a.get_high());
+    return Vec16i(lo, hi);
+}
+
+// function rotate_left: rotate the bits of every element left by b
+// Use negative count to rotate right
+static inline Vec16i rotate_left(Vec16i const & a, int b) {
+    Vec8i const lo = rotate_left(a.get_low(),  b);
+    Vec8i const hi = rotate_left(a.get_high(), b);
+    return Vec16i(lo, hi);
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 16 32-bit unsigned integers
+*
+*****************************************************************************/
+
+// Vec16ui: 16 unsigned 32-bit integers, layered on Vec16i.
+// Storage is two halves: z0 holds elements 0-7 and z1 holds elements 8-15.
+class Vec16ui : public Vec16i {
+public:
+    // Default constructor: performs no explicit initialization here
+    Vec16ui() {
+    }
+    // Broadcast constructor: every element receives the value i
+    Vec16ui(uint32_t i) {
+        Vec8ui const filled(i);
+        z1 = filled;
+        z0 = z1;
+    }
+    // Element-wise constructor: i0 becomes element 0, ..., i15 becomes element 15
+    Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7,
+    uint32_t i8, uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15) {
+        z0 = Vec8ui(i0, i1, i2, i3, i4, i5, i6, i7);
+        z1 = Vec8ui(i8, i9, i10, i11, i12, i13, i14, i15);
+    }
+    // Construct from a low half a0 and a high half a1
+    Vec16ui(Vec8ui const & a0, Vec8ui const & a1) {
+        z0 = a0;
+        z1 = a1;
+    }
+    // Converting constructor from the generic type Vec512b
+    Vec16ui(Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Converting assignment from the generic type Vec512b
+    Vec16ui & operator = (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+        return *this;
+    }
+    // Load from array (unaligned); forwards to Vec16i::load
+    Vec16ui & load(void const * p) {
+        Vec16i::load(p);
+        return *this;
+    }
+    // Load from array, aligned by 64; forwards to Vec16i::load_a
+    Vec16ui & load_a(void const * p) {
+        Vec16i::load_a(p);
+        return *this;
+    }
+    // Change a single element; forwards to Vec16i::insert.
+    // Note: inefficient. Use a load function when changing more than one element
+    Vec16ui const & insert(uint32_t index, uint32_t value) {
+        Vec16i::insert(index, value);
+        return *this;
+    }
+    // Read a single element; the value from Vec16i::extract is returned as unsigned
+    uint32_t extract(uint32_t index) const {
+        return Vec16i::extract(index);
+    }
+    // Read-only element access. Use a store function when extracting more than one element.
+    uint32_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Split into two Vec8ui: the low half (elements 0-7) ...
+    Vec8ui get_low() const {
+        return Vec8ui(Vec16i::get_low());
+    }
+    // ... and the high half (elements 8-15)
+    Vec8ui get_high() const {
+        return Vec8ui(Vec16i::get_high());
+    }
+};
+
+// Define operators for this class
+
+// operator + : element-wise addition, carried out on Vec16i copies of the operands
+static inline Vec16ui operator + (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const lhs(a);
+    Vec16i const rhs(b);
+    return Vec16ui(lhs + rhs);
+}
+
+// operator - : element-wise subtraction, carried out on Vec16i copies of the operands
+static inline Vec16ui operator - (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const lhs(a);
+    Vec16i const rhs(b);
+    return Vec16ui(lhs - rhs);
+}
+
+// operator * : element-wise multiplication, carried out on Vec16i copies of the operands
+static inline Vec16ui operator * (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const lhs(a);
+    Vec16i const rhs(b);
+    return Vec16ui(lhs * rhs);
+}
+
+// vector operator / : divide
+// See bottom of file
+
+// operator >> : logical right shift of every element by b bits (unsigned count)
+static inline Vec16ui operator >> (Vec16ui const & a, uint32_t b) {
+    Vec8ui const lower = a.get_low();
+    Vec8ui const upper = a.get_high();
+    return Vec16ui(lower >> b, upper >> b);
+}
+
+// operator >> : logical right shift of every element; the signed count is
+// converted to uint32_t and forwarded to the unsigned-count overload
+static inline Vec16ui operator >> (Vec16ui const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a >> count;
+}
+
+// operator >>= : logical right shift in place (unsigned count); returns a
+static inline Vec16ui & operator >>= (Vec16ui & a, uint32_t b) {
+    return a = a >> b;
+}
+
+// operator >>= : logical right shift in place; the signed count is converted to uint32_t first
+static inline Vec16ui & operator >>= (Vec16ui & a, int32_t b) {
+    uint32_t const count = uint32_t(b);
+    return a = a >> count;
+}
+
+// operator << : left shift of every element (unsigned count), done via the Vec16i shift
+static inline Vec16ui operator << (Vec16ui const & a, uint32_t b) {
+    Vec16i const bits(a);
+    return Vec16ui(bits << (int32_t)b);
+}
+
+// operator << : left shift of every element (signed count), done via the Vec16i shift
+static inline Vec16ui operator << (Vec16ui const & a, int32_t b) {
+    Vec16i const bits(a);
+    return Vec16ui(bits << (int32_t)b);
+}
+
+// operator < : element-wise unsigned comparison, true where a[i] < b[i]
+static inline Vec16ib operator < (Vec16ui const & a, Vec16ui const & b) {
+    Vec8ui const a_lo = a.get_low(),  b_lo = b.get_low();
+    Vec8ui const a_hi = a.get_high(), b_hi = b.get_high();
+    return Vec16ib(a_lo < b_lo, a_hi < b_hi);
+}
+
+// operator > : element-wise unsigned comparison, true where a[i] > b[i];
+// implemented as b < a
+static inline Vec16ib operator > (Vec16ui const & a, Vec16ui const & b) {
+    Vec16ib const greater = (b < a);
+    return greater;
+}
+
+// operator >= : element-wise unsigned comparison, true where a[i] >= b[i]
+static inline Vec16ib operator >= (Vec16ui const & a, Vec16ui const & b) {
+    Vec8ui const a_lo = a.get_low(),  b_lo = b.get_low();
+    Vec8ui const a_hi = a.get_high(), b_hi = b.get_high();
+    return Vec16ib(a_lo >= b_lo, a_hi >= b_hi);
+}
+
+// operator <= : element-wise unsigned comparison, true where a[i] <= b[i];
+// implemented as b >= a
+static inline Vec16ib operator <= (Vec16ui const & a, Vec16ui const & b) {
+    Vec16ib const not_greater = (b >= a);
+    return not_greater;
+}
+
+// operator & : bitwise AND, carried out on Vec16i copies of the operands
+static inline Vec16ui operator & (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const lhs(a);
+    Vec16i const rhs(b);
+    return Vec16ui(lhs & rhs);
+}
+
+// operator | : bitwise OR, carried out on Vec16i copies of the operands
+static inline Vec16ui operator | (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const lhs(a);
+    Vec16i const rhs(b);
+    return Vec16ui(lhs | rhs);
+}
+
+// operator ^ : bitwise XOR, carried out on Vec16i copies of the operands
+static inline Vec16ui operator ^ (Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const lhs(a);
+    Vec16i const rhs(b);
+    return Vec16ui(lhs ^ rhs);
+}
+
+// operator ~ : bitwise NOT, carried out on a Vec16i copy of the operand
+static inline Vec16ui operator ~ (Vec16ui const & a) {
+    Vec16i const bits(a);
+    return Vec16ui(~bits);
+}
+
+// Functions for this class
+
+// select: per-element choice between two operands. Pseudocode:
+// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec16ui select (Vec16ib const & s, Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const if_true(a);
+    Vec16i const if_false(b);
+    return Vec16ui(select(s, if_true, if_false));
+}
+
+// if_add: conditional add. For every element i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec16ui if_add (Vec16ib const & f, Vec16ui const & a, Vec16ui const & b) {
+    Vec16i const base(a);
+    Vec16i const addend(b);
+    return Vec16ui(if_add(f, base, addend));
+}
+
+// horizontal_add: sum of all 16 elements, computed by the Vec16i version
+// and returned as unsigned. Overflow wraps around.
+static inline uint32_t horizontal_add (Vec16ui const & a) {
+    int32_t const signed_sum = horizontal_add(Vec16i(a));
+    return (uint32_t)signed_sum;
+}
+
+// horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file
+
+// add_saturated: element-wise unsigned addition with saturation,
+// applied separately to the low and the high half.
+static inline Vec16ui add_saturated(Vec16ui const & a, Vec16ui const & b) {
+    return Vec16ui(
+        add_saturated(a.get_low(),  b.get_low()),
+        add_saturated(a.get_high(), b.get_high()));
+}
+
+// sub_saturated: element-wise unsigned subtraction (a - b) with saturation,
+// applied separately to the low and the high half.
+static inline Vec16ui sub_saturated(Vec16ui const & a, Vec16ui const & b) {
+    return Vec16ui(
+        sub_saturated(a.get_low(),  b.get_low()),
+        sub_saturated(a.get_high(), b.get_high()));
+}
+
+// max: element-wise unsigned maximum, result[i] = a[i] > b[i] ? a[i] : b[i]
+static inline Vec16ui max(Vec16ui const & a, Vec16ui const & b) {
+    return Vec16ui(
+        max(a.get_low(),  b.get_low()),
+        max(a.get_high(), b.get_high()));
+}
+
+// min: element-wise unsigned minimum, result[i] = a[i] < b[i] ? a[i] : b[i]
+static inline Vec16ui min(Vec16ui const & a, Vec16ui const & b) {
+    return Vec16ui(
+        min(a.get_low(),  b.get_low()),
+        min(a.get_high(), b.get_high()));
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 8 64-bit signed integers
+*
+*****************************************************************************/
+
+// Vec8q: eight signed 64-bit integers built on Vec512b.
+// Storage is two halves: z0 holds elements 0-3 and z1 holds elements 4-7.
+class Vec8q : public Vec512b {
+public:
+    // Default constructor: performs no explicit initialization here
+    Vec8q() {
+    }
+    // Broadcast constructor: every element receives the value i
+    Vec8q(int64_t i) {
+        Vec4q const filled(i);
+        z1 = filled;
+        z0 = z1;
+    }
+    // Element-wise constructor: i0 becomes element 0, ..., i7 becomes element 7
+    Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7) {
+        z0 = Vec4q(i0, i1, i2, i3);
+        z1 = Vec4q(i4, i5, i6, i7);
+    }
+    // Construct from a low half a0 and a high half a1
+    Vec8q(Vec4q const & a0, Vec4q const & a1) {
+        z0 = a0;
+        z1 = a1;
+    }
+    // Converting constructor from the generic type Vec512b
+    Vec8q(Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Converting assignment from the generic type Vec512b
+    Vec8q & operator = (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+        return *this;
+    }
+    // Load eight elements from array (unaligned): four into each half
+    Vec8q & load(void const * p) {
+        int64_t const * const src = (int64_t const *)p;
+        z0 = Vec4q().load(src);
+        z1 = Vec4q().load(src + 4);
+        return *this;
+    }
+    // Load eight elements from array, aligned by 64
+    Vec8q & load_a(void const * p) {
+        int64_t const * const src = (int64_t const *)p;
+        z0 = Vec4q().load_a(src);
+        z1 = Vec4q().load_a(src + 4);
+        return *this;
+    }
+    // Partial load: load n elements and set the rest to 0
+    Vec8q & load_partial(int n, void const * p) {
+        if (n >= 4) {
+            // low half is read in full; the remaining n - 4 elements go to the high half
+            z0 = Vec4q().load(p);
+            z1 = Vec4q().load_partial(n - 4, (int64_t const *)p + 4);
+        }
+        else {
+            z0 = Vec4q().load_partial(n, p);
+            z1 = Vec4q(0);
+        }
+        return *this;
+    }
+    // Partial store: store n elements
+    void store_partial(int n, void * p) const {
+        if (n < 4) {
+            get_low().store_partial(n, p);
+            return;
+        }
+        // low half is written in full; the remaining n - 4 elements come from the high half
+        get_low().store(p);
+        get_high().store_partial(n - 4, (int64_t *)p + 4);
+    }
+    // Cut off vector to n elements: the last 8-n elements are set to zero
+    Vec8q & cutoff(int n) {
+        if (n >= 4) {
+            z1 = Vec4q(z1).cutoff(n - 4);
+        }
+        else {
+            z0 = Vec4q(z0).cutoff(n);
+            z1 = Vec4q(0);
+        }
+        return *this;
+    }
+    // Change a single element.
+    // Note: inefficient. Use a load function when changing more than one element
+    Vec8q const & insert(uint32_t index, int64_t value) {
+        if (index >= 4) {
+            z1 = Vec4q(z1).insert(index - 4, value);
+        }
+        else {
+            z0 = Vec4q(z0).insert(index, value);
+        }
+        return *this;
+    }
+    // Read a single element: indexes 0-3 come from z0, higher indexes from z1
+    int64_t extract(uint32_t index) const {
+        if (index < 4) {
+            return Vec4q(z0).extract(index);
+        }
+        return Vec4q(z1).extract(index - 4);
+    }
+    // Read-only element access. Use a store function when extracting more than one element.
+    int64_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Split into two Vec4q: the low half (elements 0-3) ...
+    Vec4q get_low() const {
+        return Vec4q(z0);
+    }
+    // ... and the high half (elements 4-7)
+    Vec4q get_high() const {
+        return Vec4q(z1);
+    }
+    // Number of elements in the vector
+    static int size () {
+        return 8;
+    }
+};
+
+
+// Define operators for Vec8q
+
+// operator + : element-wise addition, one half at a time
+static inline Vec8q operator + (Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        a.get_low()  + b.get_low(),
+        a.get_high() + b.get_high());
+}
+
+// operator += : add b to a in place; returns a
+static inline Vec8q & operator += (Vec8q & a, Vec8q const & b) {
+    return a = a + b;
+}
+
+// postfix operator ++ : add 1 to every element, return the value held before the increment
+static inline Vec8q operator ++ (Vec8q & a, int) {
+    Vec8q const previous(a);
+    a = a + 1;
+    return previous;
+}
+
+// prefix operator ++ : add 1 to every element, return the updated vector
+static inline Vec8q & operator ++ (Vec8q & a) {
+    return a = a + 1;
+}
+
+// operator - : element-wise subtraction, one half at a time
+static inline Vec8q operator - (Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        a.get_low()  - b.get_low(),
+        a.get_high() - b.get_high());
+}
+
+// unary operator - : negate every element
+static inline Vec8q operator - (Vec8q const & a) {
+    Vec4q const lower = a.get_low();
+    Vec4q const upper = a.get_high();
+    return Vec8q(-lower, -upper);
+}
+
+// operator -= : subtract b from a in place; returns a
+static inline Vec8q & operator -= (Vec8q & a, Vec8q const & b) {
+    return a = a - b;
+}
+
+// postfix operator -- : subtract 1 from every element, return the value held before the decrement
+static inline Vec8q operator -- (Vec8q & a, int) {
+    Vec8q const previous(a);
+    a = a - 1;
+    return previous;
+}
+
+// prefix operator -- : subtract 1 from every element, return the updated vector
+static inline Vec8q & operator -- (Vec8q & a) {
+    return a = a - 1;
+}
+
+// operator * : element-wise multiplication, one half at a time
+static inline Vec8q operator * (Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        a.get_low()  * b.get_low(),
+        a.get_high() * b.get_high());
+}
+
+// operator *= : multiply a by b in place; returns a
+static inline Vec8q & operator *= (Vec8q & a, Vec8q const & b) {
+    return a = a * b;
+}
+
+// operator << : shift every element left by b bits
+static inline Vec8q operator << (Vec8q const & a, int32_t b) {
+    Vec4q const lower = a.get_low();
+    Vec4q const upper = a.get_high();
+    return Vec8q(lower << b, upper << b);
+}
+
+// operator <<= : shift left in place; returns a
+static inline Vec8q & operator <<= (Vec8q & a, int32_t b) {
+    return a = a << b;
+}
+
+// operator >> : arithmetic right shift of every element by b bits
+static inline Vec8q operator >> (Vec8q const & a, int32_t b) {
+    Vec4q const lower = a.get_low();
+    Vec4q const upper = a.get_high();
+    return Vec8q(lower >> b, upper >> b);
+}
+
+// operator >>= : arithmetic right shift in place; returns a
+static inline Vec8q & operator >>= (Vec8q & a, int32_t b) {
+    return a = a >> b;
+}
+
+// operator == : element-wise comparison, true where a[i] == b[i]
+static inline Vec8qb operator == (Vec8q const & a, Vec8q const & b) {
+    Vec4q const a_lo = a.get_low(),  b_lo = b.get_low();
+    Vec4q const a_hi = a.get_high(), b_hi = b.get_high();
+    return Vec8qb(a_lo == b_lo, a_hi == b_hi);
+}
+
+// operator != : element-wise comparison, true where a[i] != b[i]
+static inline Vec8qb operator != (Vec8q const & a, Vec8q const & b) {
+    Vec4q const a_lo = a.get_low(),  b_lo = b.get_low();
+    Vec4q const a_hi = a.get_high(), b_hi = b.get_high();
+    return Vec8qb(a_lo != b_lo, a_hi != b_hi);
+}
+  
+// operator < : element-wise signed comparison, true where a[i] < b[i]
+static inline Vec8qb operator < (Vec8q const & a, Vec8q const & b) {
+    Vec4q const a_lo = a.get_low(),  b_lo = b.get_low();
+    Vec4q const a_hi = a.get_high(), b_hi = b.get_high();
+    return Vec8qb(a_lo < b_lo, a_hi < b_hi);
+}
+
+// operator > : element-wise signed comparison, true where a[i] > b[i];
+// implemented as b < a
+static inline Vec8qb operator > (Vec8q const & a, Vec8q const & b) {
+    Vec8qb const greater = (b < a);
+    return greater;
+}
+
+// operator >= : element-wise signed comparison, true where a[i] >= b[i]
+static inline Vec8qb operator >= (Vec8q const & a, Vec8q const & b) {
+    Vec4q const a_lo = a.get_low(),  b_lo = b.get_low();
+    Vec4q const a_hi = a.get_high(), b_hi = b.get_high();
+    return Vec8qb(a_lo >= b_lo, a_hi >= b_hi);
+}
+
+// operator <= : element-wise signed comparison, true where a[i] <= b[i];
+// implemented as b >= a
+static inline Vec8qb operator <= (Vec8q const & a, Vec8q const & b) {
+    Vec8qb const not_greater = (b >= a);
+    return not_greater;
+}
+
+// operator & : bitwise AND, one half at a time
+static inline Vec8q operator & (Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        a.get_low()  & b.get_low(),
+        a.get_high() & b.get_high());
+}
+
+// operator &= : bitwise AND in place; returns a
+static inline Vec8q & operator &= (Vec8q & a, Vec8q const & b) {
+    return a = a & b;
+}
+
+// operator | : bitwise OR, one half at a time
+static inline Vec8q operator | (Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        a.get_low()  | b.get_low(),
+        a.get_high() | b.get_high());
+}
+
+// operator |= : bitwise OR in place; returns a
+static inline Vec8q & operator |= (Vec8q & a, Vec8q const & b) {
+    return a = a | b;
+}
+
+// operator ^ : bitwise XOR, one half at a time
+static inline Vec8q operator ^ (Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        a.get_low()  ^ b.get_low(),
+        a.get_high() ^ b.get_high());
+}
+// operator ^= : bitwise XOR in place; returns a
+static inline Vec8q & operator ^= (Vec8q & a, Vec8q const & b) {
+    return a = a ^ b;
+}
+
+// operator ~ : bitwise NOT of every element
+static inline Vec8q operator ~ (Vec8q const & a) {
+    Vec4q const lower = a.get_low();
+    Vec4q const upper = a.get_high();
+    return Vec8q(~lower, ~upper);
+}
+
+// Functions for this class
+
+// select: per-element choice between two operands. Pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8q select (Vec8qb const & s, Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        select(s.get_low(),  a.get_low(),  b.get_low()),
+        select(s.get_high(), a.get_high(), b.get_high()));
+}
+
+// if_add: conditional add. For every element i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8q if_add (Vec8qb const & f, Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        if_add(f.get_low(),  a.get_low(),  b.get_low()),
+        if_add(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// horizontal_add: sum of all 8 elements of a.
+// The two 4-element halves are added together first; overflow wraps around.
+static inline int64_t horizontal_add (Vec8q const & a) {
+    Vec4q const lower = a.get_low();
+    Vec4q const upper = a.get_high();
+    return horizontal_add(lower + upper);
+}
+
+// horizontal_add_x: extended horizontal add of 16 signed 32-bit elements.
+// Each half is summed by the half-vector horizontal_add_x (elements are sign
+// extended before adding to avoid overflow) and the two partial sums are added.
+static inline int64_t horizontal_add_x (Vec16i const & x) {
+    Vec8i const lower = x.get_low();
+    Vec8i const upper = x.get_high();
+    return horizontal_add_x(lower) + horizontal_add_x(upper);
+}
+
+// horizontal_add_x: extended horizontal add of 16 unsigned 32-bit elements.
+// Each half is summed by the half-vector horizontal_add_x (elements are zero
+// extended before adding to avoid overflow) and the two partial sums are added.
+static inline uint64_t horizontal_add_x (Vec16ui const & x) {
+    Vec8ui const lower = x.get_low();
+    Vec8ui const upper = x.get_high();
+    return horizontal_add_x(lower) + horizontal_add_x(upper);
+}
+
+// max: element-wise signed maximum, result[i] = a[i] > b[i] ? a[i] : b[i]
+static inline Vec8q max(Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        max(a.get_low(),  b.get_low()),
+        max(a.get_high(), b.get_high()));
+}
+
+// min: element-wise signed minimum, result[i] = a[i] < b[i] ? a[i] : b[i]
+static inline Vec8q min(Vec8q const & a, Vec8q const & b) {
+    return Vec8q(
+        min(a.get_low(),  b.get_low()),
+        min(a.get_high(), b.get_high()));
+}
+
+// abs: element-wise absolute value, result[i] = a[i] >= 0 ? a[i] : -a[i]
+static inline Vec8q abs(Vec8q const & a) {
+    return Vec8q(
+        abs(a.get_low()),
+        abs(a.get_high()));
+}
+
+// abs_saturated: like abs, but delegates to the saturating half-vector version
+// so that an overflowing element saturates
+static inline Vec8q abs_saturated(Vec8q const & a) {
+    return Vec8q(
+        abs_saturated(a.get_low()),
+        abs_saturated(a.get_high()));
+}
+
+// rotate_left: rotate every element left by b bits.
+// Pass a negative count to rotate right.
+static inline Vec8q rotate_left(Vec8q const & a, int b) {
+    return Vec8q(
+        rotate_left(a.get_low(),  b),
+        rotate_left(a.get_high(), b));
+}
+
+
+/*****************************************************************************
+*
+*          Vector of 8 64-bit unsigned integers
+*
+*****************************************************************************/
+
+// Vec8uq: eight unsigned 64-bit integers, layered on Vec8q.
+// Storage is two halves: z0 holds elements 0-3 and z1 holds elements 4-7.
+class Vec8uq : public Vec8q {
+public:
+    // Default constructor: performs no explicit initialization here
+    Vec8uq() {
+    }
+    // Constructor to broadcast the same value into all elements:
+    Vec8uq(uint64_t i) {
+        z0 = z1 = Vec4uq(i);
+    }
+    // Constructor to convert from Vec8q:
+    Vec8uq(Vec8q const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Constructor to convert from type Vec512b
+    Vec8uq(Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+    }
+    // Constructor to build from all elements: i0 becomes element 0, ..., i7 becomes element 7
+    Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7) {
+        z0 = Vec4q(i0, i1, i2, i3);
+        // The high half must go to z1. Assigning it to z0 again would overwrite
+        // elements 0-3 with i4..i7 and leave elements 4-7 unset.
+        z1 = Vec4q(i4, i5, i6, i7);
+    }
+    // Constructor to build from two Vec4uq: a0 is the low half, a1 the high half
+    Vec8uq(Vec4uq const & a0, Vec4uq const & a1) {
+        z0 = a0;
+        z1 = a1;
+    }
+    // Assignment operator to convert from Vec8q:
+    Vec8uq  & operator = (Vec8q const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+        return *this;
+    }
+    // Assignment operator to convert from type Vec512b
+    Vec8uq & operator = (Vec512b const & x) {
+        z0 = x.get_low();
+        z1 = x.get_high();
+        return *this;
+    }
+    // Member function to load from array (unaligned); forwards to Vec8q::load
+    Vec8uq & load(void const * p) {
+        Vec8q::load(p);
+        return *this;
+    }
+    // Member function to load from array, aligned by 64; forwards to Vec8q::load_a
+    Vec8uq & load_a(void const * p) {
+        Vec8q::load_a(p);
+        return *this;
+    }
+    // Member function to change a single element in vector; forwards to Vec8q::insert
+    // Note: This function is inefficient. Use load function if changing more than one element
+    Vec8uq const & insert(uint32_t index, uint64_t value) {
+        Vec8q::insert(index, value);
+        return *this;
+    }
+    // Member function extract a single element from vector;
+    // the value from Vec8q::extract is returned as unsigned
+    uint64_t extract(uint32_t index) const {
+        return Vec8q::extract(index);
+    }
+    // Extract a single element. Use store function if extracting more than one element.
+    // Operator [] can only read an element, not write.
+    uint64_t operator [] (uint32_t index) const {
+        return extract(index);
+    }
+    // Member functions to split into two Vec4uq:
+    // the low half (elements 0-3) ...
+    Vec4uq get_low() const {
+        return Vec4uq(Vec8q::get_low());
+    }
+    // ... and the high half (elements 4-7)
+    Vec4uq get_high() const {
+        return Vec4uq(Vec8q::get_high());
+    }
+};
+
+// Define operators for this class
+
+// operator + : element-wise addition, carried out on Vec8q copies of the operands
+static inline Vec8uq operator + (Vec8uq const & a, Vec8uq const & b) {
+    Vec8q const lhs(a);
+    Vec8q const rhs(b);
+    return Vec8uq(lhs + rhs);
+}
+
+// operator - : element-wise subtraction, carried out on Vec8q copies of the operands
+static inline Vec8uq operator - (Vec8uq const & a, Vec8uq const & b) {
+    Vec8q const lhs(a);
+    Vec8q const rhs(b);
+    return Vec8uq(lhs - rhs);
+}
+
+// operator * : element-wise multiplication, carried out on Vec8q copies of the operands
+static inline Vec8uq operator * (Vec8uq const & a, Vec8uq const & b) {
+    Vec8q const lhs(a);
+    Vec8q const rhs(b);
+    return Vec8uq(lhs * rhs);
+}
+
+// operator >> : logical right shift of every element by b bits (unsigned count)
+static inline Vec8uq operator >> (Vec8uq const & a, uint32_t b) {
+    Vec4uq const lower = a.get_low();
+    Vec4uq const upper = a.get_high();
+    return Vec8uq(lower >> b, upper >> b);
+}
+
+// operator >> : logical right shift of every element; the signed count is
+// converted to uint32_t and forwarded to the unsigned-count overload
+static inline Vec8uq operator >> (Vec8uq const & a, int32_t b) {
+    uint32_t const count = (uint32_t)b;
+    return a >> count;
+}
+
+// operator >>= : logical right shift in place (unsigned count); returns a
+static inline Vec8uq & operator >>= (Vec8uq & a, uint32_t b) {
+    return a = a >> b;
+}
+
+// operator >>= : logical right shift in place; the signed count is converted to uint32_t first
+static inline Vec8uq & operator >>= (Vec8uq & a, int32_t b) {
+    uint32_t const count = uint32_t(b);
+    return a = a >> count;
+}
+
+// operator << : left shift of every element (unsigned count), done via the Vec8q shift
+static inline Vec8uq operator << (Vec8uq const & a, uint32_t b) {
+    Vec8q const bits(a);
+    return Vec8uq(bits << (int32_t)b);
+}
+
+// operator << : left shift of every element (signed count), done via the Vec8q shift
+static inline Vec8uq operator << (Vec8uq const & a, int32_t b) {
+    Vec8q const bits(a);
+    return Vec8uq(bits << b);
+}
+
+// operator < : element-wise unsigned comparison, true where a[i] < b[i]
+static inline Vec8qb operator < (Vec8uq const & a, Vec8uq const & b) {
+    Vec4uq const a_lo = a.get_low(),  b_lo = b.get_low();
+    Vec4uq const a_hi = a.get_high(), b_hi = b.get_high();
+    return Vec8qb(a_lo < b_lo, a_hi < b_hi);
+}
+
+// operator > : element-wise unsigned comparison, true where a[i] > b[i];
+// implemented as b < a
+static inline Vec8qb operator > (Vec8uq const & a, Vec8uq const & b) {
+    Vec8qb const greater = (b < a);
+    return greater;
+}
+
+// operator >= : element-wise unsigned comparison, true where a[i] >= b[i]
+static inline Vec8qb operator >= (Vec8uq const & a, Vec8uq const & b) {
+    Vec4uq const a_lo = a.get_low(),  b_lo = b.get_low();
+    Vec4uq const a_hi = a.get_high(), b_hi = b.get_high();
+    return Vec8qb(a_lo >= b_lo, a_hi >= b_hi);
+}
+
+// operator <= : element-wise unsigned comparison, true where a[i] <= b[i];
+// implemented as b >= a
+static inline Vec8qb operator <= (Vec8uq const & a, Vec8uq const & b) {
+    Vec8qb const not_greater = (b >= a);
+    return not_greater;
+}
+
+// operator & : bitwise AND, carried out on Vec8q copies of the operands
+static inline Vec8uq operator & (Vec8uq const & a, Vec8uq const & b) {
+    Vec8q const lhs(a);
+    Vec8q const rhs(b);
+    return Vec8uq(lhs & rhs);
+}
+
+// operator | : bitwise OR, carried out on Vec8q copies of the operands
+static inline Vec8uq operator | (Vec8uq const & a, Vec8uq const & b) {
+    Vec8q const lhs(a);
+    Vec8q const rhs(b);
+    return Vec8uq(lhs | rhs);
+}
+
+// operator ^ : bitwise XOR, carried out on Vec8q copies of the operands
+static inline Vec8uq operator ^ (Vec8uq const & a, Vec8uq const & b) {
+    Vec8q const lhs(a);
+    Vec8q const rhs(b);
+    return Vec8uq(lhs ^ rhs);
+}
+
+// Functions for this class
+
+// select: per-element choice between two operands. Pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8uq select (Vec8qb const & s, Vec8uq const & a, Vec8uq const & b) {
+    Vec8q const if_true(a);
+    Vec8q const if_false(b);
+    return Vec8uq(select(s, if_true, if_false));
+}
+
+// if_add: conditional add. For every element i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8uq if_add (Vec8qb const & f, Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(
+        if_add(f.get_low(),  a.get_low(),  b.get_low()),
+        if_add(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// horizontal_add: sum of all 8 elements, computed by the Vec8q version
+// and returned as unsigned. Overflow wraps around.
+static inline uint64_t horizontal_add (Vec8uq const & a) {
+    int64_t const signed_sum = horizontal_add(Vec8q(a));
+    return (uint64_t)signed_sum;
+}
+
+// max: element-wise unsigned maximum, result[i] = a[i] > b[i] ? a[i] : b[i]
+static inline Vec8uq max(Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(
+        max(a.get_low(),  b.get_low()),
+        max(a.get_high(), b.get_high()));
+}
+
+// min: element-wise unsigned minimum, result[i] = a[i] < b[i] ? a[i] : b[i]
+static inline Vec8uq min(Vec8uq const & a, Vec8uq const & b) {
+    return Vec8uq(
+        min(a.get_low(),  b.get_low()),
+        min(a.get_high(), b.get_high()));
+}
+
+
+/*****************************************************************************
+*
+*          Vector permute functions
+*
+******************************************************************************
+*
+* These permute functions can reorder the elements of a vector and optionally
+* set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to select.
+* An index of -1 will generate zero. An index of -256 means don't care.
+*
+* Example:
+* Vec8q a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
+* Vec8q b;
+* b = permute8q<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+// permute8q: reorder the 8 elements of a according to the template indexes.
+// Index -1 gives 0, index -256 means don't care.
+// Each output half is a 4-element blend whose two sources are the low and high halves of a.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q permute8q(Vec8q const & a) {
+    Vec4q const lower = a.get_low();
+    Vec4q const upper = a.get_high();
+    return Vec8q(blend4q<i0,i1,i2,i3> (lower, upper),
+                 blend4q<i4,i5,i6,i7> (lower, upper));
+}
+
+// permute8uq: unsigned counterpart of permute8q; forwards to it and converts the result
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8uq permute8uq(Vec8uq const & a) {
+    Vec8q const shuffled = permute8q<i0,i1,i2,i3,i4,i5,i6,i7> (a);
+    return Vec8uq (shuffled);
+}
+
+
+// permute16i: reorder the 16 elements of a according to the template indexes.
+// Index -1 gives 0, index -256 means don't care.
+// Each output half is an 8-element blend whose two sources are the low and high halves of a.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16i permute16i(Vec16i const & a) {
+    Vec8i const lower = a.get_low();
+    Vec8i const upper = a.get_high();
+    return Vec16i(blend8i<i0,i1,i2 ,i3 ,i4 ,i5 ,i6 ,i7 > (lower, upper),
+                  blend8i<i8,i9,i10,i11,i12,i13,i14,i15> (lower, upper));
+}
+
+// permute16ui: unsigned counterpart of permute16i; forwards to it and converts the result
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16ui permute16ui(Vec16ui const & a) {
+    Vec16i const shuffled = permute16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a);
+    return Vec16ui (shuffled);
+}
+
+
+/*****************************************************************************
+*
+*          Vector blend functions
+*
+******************************************************************************
+*
+* These blend functions can mix elements from two different vectors and
+* optionally set some elements to zero. 
+*
+* The indexes are inserted as template parameters in <>. These indexes must be
+* constants. Each template parameter is an index to the element you want to 
+* select, where higher indexes indicate an element from the second source
+* vector. For example, if each vector has 8 elements, then indexes 0 - 7
+* will select an element from the first vector and indexes 8 - 15 will select 
+* an element from the second vector. A negative index will generate zero.
+*
+* Example:
+* Vec8q a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8q b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
+* Vec8q c;
+* c = blend8q<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
+*
+* A lot of the code here is metaprogramming aiming to find the instructions
+* that best fit the template parameters and instruction set. The metacode
+* will be reduced out to leave only a few vector instructions in release
+* mode with optimization on.
+*****************************************************************************/
+
+
+// select4: helper for blend8q. Picks one 4-element half by compile-time index n:
+// 0 = low half of a, 1 = high half of a, 2 = low half of b, 3 = high half of b.
+// Any other n yields Vec4q(0).
+template <int n>
+static inline Vec4q select4(Vec8q const & a, Vec8q const & b) {
+    if (n == 0) {
+        return a.get_low();
+    }
+    if (n == 1) {
+        return a.get_high();
+    }
+    if (n == 2) {
+        return b.get_low();
+    }
+    if (n == 3) {
+        return b.get_high();
+    }
+    return Vec4q(0);
+}
+
+// blend vectors Vec8q
+// Builds an 8-element result from two 8-element sources a and b. Indexes 0-7
+// pick from a, 8-15 pick from b, negative indexes are passed on as negative
+// (see the file-level blend description: negative generates zero).
+// The result is assembled as two Vec4q halves (x0 from i0-i3, x1 from i4-i7);
+// each half is produced with the cheapest of three strategies depending on how
+// many distinct 4-element source halves its indexes touch.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 
+static inline Vec8q blend8q(Vec8q const & a, Vec8q const & b) {  
+    // jN = which 4-element source half index iN refers to, in select4 numbering
+    // (0 = a low, 1 = a high, 2 = b low, 3 = b high); negative indexes are kept as-is
+    const int j0 = i0 >= 0 ? i0/4 : i0;
+    const int j1 = i1 >= 0 ? i1/4 : i1;
+    const int j2 = i2 >= 0 ? i2/4 : i2;
+    const int j3 = i3 >= 0 ? i3/4 : i3;
+    const int j4 = i4 >= 0 ? i4/4 : i4;
+    const int j5 = i5 >= 0 ? i5/4 : i5;
+    const int j6 = i6 >= 0 ? i6/4 : i6;
+    const int j7 = i7 >= 0 ? i7/4 : i7;
+    Vec4q x0, x1;
+
+    // r0/r1: first non-negative source half used by the low/high output half
+    // (falls through to j3/j7, which may be negative, when none is found earlier)
+    const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
+    const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
+    // s0/s1: first source half that differs from r0/r1 (candidate second source)
+    const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
+    const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
+
+    // Combine all the indexes into a single bitfield, with 4 bits for each
+    // NOTE(review): the <<28 terms can shift a value >= 8 into the sign bit of int — confirm this is acceptable for the supported compilers
+    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
+
+    // Mask to zero out negative indexes
+    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
+
+    // ---- low half of the result: elements i0 - i3 ----
+    if (r0 < 0) {
+        // r0 is negative only when j0..j3 are all negative: no source element is used
+        x0 =  Vec4q(0);
+    }
+    // Bits 2-3 of each 4-bit field hold the source half (index/4). r0*0x4444 puts
+    // r0 into those bits of the four low fields, so the XOR is zero there exactly
+    // when every non-negative index (selected by mz) comes from source half r0.
+    else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) { 
+        // i0 - i3 all from same source
+        // & -13 clears bits 2-3, leaving the position within the source half;
+        // negative indexes stay negative
+        x0 = permute4q<i0 & -13, i1 & -13, i2 & -13, i3 & -13> (select4<r0> (a,b));
+    }
+    else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) { 
+        // i0 - i3 all from two sources
+        // kN: position within the half (bits 0-1), plus 4 when the element comes
+        // from the second blend4q operand (source half s0)
+        const int k0 =  i0 >= 0 ? i0 & 3 : i0;
+        const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
+        const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
+        const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
+        x0 = blend4q<k0,k1,k2,k3> (select4<r0>(a,b), select4<s0>(a,b));
+    }
+    else {
+        // i0 - i3 from three or four different sources
+        // Two pairwise blends (elements 0,1 and elements 2,3; -0x100 fills the unused
+        // positions) are merged by an outer blend taking 0,1 from the first and 2,3 from the second
+        x0 = blend4q<0,1,6,7> (
+             blend4q<i0 & -13, (i1 & -13) | 4, -0x100, -0x100> (select4<j0>(a,b), select4<j1>(a,b)),
+             blend4q<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4<j2>(a,b), select4<j3>(a,b)));
+    }
+
+    // ---- high half of the result: elements i4 - i7 (same three strategies) ----
+    if (r1 < 0) {
+        x1 =  Vec4q(0);
+    }
+    // Same single-source test on the four upper 4-bit fields. The multiplication is
+    // done in uint32_t because 3*0x44440000 does not fit in a signed 32-bit int.
+    else if (((m1 ^ uint32_t(r1)*0x44440000u) & 0xCCCC0000 & mz) == 0) { 
+        // i4 - i7 all from same source
+        x1 = permute4q<i4 & -13, i5 & -13, i6 & -13, i7 & -13> (select4<r1> (a,b));
+    }
+    else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) { 
+        // i4 - i7 all from two sources
+        const int k4 =  i4 >= 0 ? i4 & 3 : i4;
+        const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
+        const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
+        const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
+        x1 = blend4q<k4,k5,k6,k7> (select4<r1>(a,b), select4<s1>(a,b));
+    }
+    else {
+        // i4 - i7 from three or four different sources
+        x1 = blend4q<0,1,6,7> (
+             blend4q<i4 & -13, (i5 & -13) | 4, -0x100, -0x100> (select4<j4>(a,b), select4<j5>(a,b)),
+             blend4q<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4<j6>(a,b), select4<j7>(a,b)));
+    }
+
+    // Join the two halves: x0 = elements 0-3, x1 = elements 4-7
+    return Vec8q(x0,x1);
+}
+
+// blend8uq: unsigned counterpart of blend8q; forwards to it and converts the result
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> 
+static inline Vec8uq blend8uq(Vec8uq const & a, Vec8uq const & b) {
+    Vec8q const mixed = blend8q<i0,i1,i2,i3,i4,i5,i6,i7> (a,b);
+    return Vec8uq(mixed);
+}
+
+
+// Helper for blend16i: picks one 256-bit half out of the vector pair (a, b).
+//   n = 0: low half of a      n = 1: high half of a
+//   n = 2: low half of b      n = 3: high half of b
+// Any other n yields a vector of zeros.
+template <int n>
+static inline Vec8i select4(Vec16i const & a, Vec16i const & b) {
+    if (n == 0) return a.get_low();
+    if (n == 1) return a.get_high();
+    if (n == 2) return b.get_low();
+    if (n == 3) return b.get_high();
+    return Vec8i(0);
+}
+
+// Blend two Vec16i vectors using sixteen compile-time indexes.
+// Output element k is chosen by index ik. ik/8 is mapped to a 256-bit half through select4
+// (0 = a low, 1 = a high, 2 = b low, 3 = b high), so 0..15 address a and 16..31 address b.
+// The lower (i0-i7) and upper (i8-i15) output halves are built independently, each with the
+// cheapest applicable strategy: zero, one-source permute, two-source blend, or a blend of
+// two blends. Negative indexes are passed on to the 256-bit permute8i/blend8i helpers; if all
+// eight indexes of an output half are negative, that half is set to zero.
+// NOTE(review): -1 presumably means "set to zero" and -256 "don't care", following the
+// 256-bit helpers -- confirm against vectori256.h.
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7, 
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 > 
+static inline Vec16i blend16i(Vec16i const & a, Vec16i const & b) {
+
+    // jk = number (0-3) of the 256-bit source half for element k, or the negative index itself
+    const int j0  = i0  >= 0 ? i0 /8 : i0;
+    const int j1  = i1  >= 0 ? i1 /8 : i1;
+    const int j2  = i2  >= 0 ? i2 /8 : i2;
+    const int j3  = i3  >= 0 ? i3 /8 : i3;
+    const int j4  = i4  >= 0 ? i4 /8 : i4;
+    const int j5  = i5  >= 0 ? i5 /8 : i5;
+    const int j6  = i6  >= 0 ? i6 /8 : i6;
+    const int j7  = i7  >= 0 ? i7 /8 : i7;
+    const int j8  = i8  >= 0 ? i8 /8 : i8;
+    const int j9  = i9  >= 0 ? i9 /8 : i9;
+    const int j10 = i10 >= 0 ? i10/8 : i10;
+    const int j11 = i11 >= 0 ? i11/8 : i11;
+    const int j12 = i12 >= 0 ? i12/8 : i12;
+    const int j13 = i13 >= 0 ? i13/8 : i13;
+    const int j14 = i14 >= 0 ? i14/8 : i14;
+    const int j15 = i15 >= 0 ? i15/8 : i15;
+
+    // x0 = lower output half (elements 0-7), x1 = upper output half (elements 8-15)
+    Vec8i x0, x1;
+
+    // r0, r1: first non-negative source half used by the lower / upper output half
+    //         (negative if every index of that half is negative)
+    // s0, s1: first source half, searching from the second element on, that is non-negative
+    //         and differs from r0 / r1; falls back to the last element's j value
+    const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2  >= 0 ? j2  : j3  >= 0 ? j3  : j4  >= 0 ? j4  : j5  >= 0 ? j5  : j6  >= 0 ? j6  : j7;
+    const int r1 = j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
+    const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2  : (j3 >= 0 && j3 != r0) ? j3 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7;
+    const int s1 = (j9 >= 0 && j9 != r1) ? j9 : (j10>= 0 && j10!= r1) ? j10 : (j11>= 0 && j11!= r1) ? j11: (j12>= 0 && j12!= r1) ? j12: (j13>= 0 && j13!= r1) ? j13: (j14>= 0 && j14!= r1) ? j14: j15;
+
+    if (r0 < 0) {
+        // all of i0 - i7 are negative
+        x0 = Vec8i(0);
+    }
+    else if (r0 == s0) {
+        // i0 - i7 all from same source
+        // (& -25 clears bits 3 and 4, i.e. reduces a non-negative index to 0-7 within the half)
+        x0 = permute8i<i0&-25, i1&-25, i2&-25, i3&-25, i4&-25, i5&-25, i6&-25, i7&-25> (select4<r0> (a,b));
+    }
+    else if ((j2<0||j2==r0||j2==s0) && (j3<0||j3==r0||j3==s0) && (j4<0||j4==r0||j4==s0) && (j5<0||j5==r0||j5==s0) && (j6<0||j6==r0||j6==s0) && (j7<0||j7==r0||j7==s0)) {
+        // i0 - i7 all from two sources
+        // kN = position within the source half, plus 8 when the element comes from source s0
+        const int k0 =  i0 >= 0 ? (i0 & 7) : i0;
+        const int k1 = (i1 >= 0 ? (i1 & 7) : i1) | (j1 == s0 ? 8 : 0);
+        const int k2 = (i2 >= 0 ? (i2 & 7) : i2) | (j2 == s0 ? 8 : 0);
+        const int k3 = (i3 >= 0 ? (i3 & 7) : i3) | (j3 == s0 ? 8 : 0);
+        const int k4 = (i4 >= 0 ? (i4 & 7) : i4) | (j4 == s0 ? 8 : 0);
+        const int k5 = (i5 >= 0 ? (i5 & 7) : i5) | (j5 == s0 ? 8 : 0);
+        const int k6 = (i6 >= 0 ? (i6 & 7) : i6) | (j6 == s0 ? 8 : 0);
+        const int k7 = (i7 >= 0 ? (i7 & 7) : i7) | (j7 == s0 ? 8 : 0);
+        x0 = blend8i<k0,k1,k2,k3,k4,k5,k6,k7> (select4<r0>(a,b), select4<s0>(a,b));
+    }
+    else {
+        // i0 - i7 from three or four different sources
+        // The first inner blend gathers the elements that come from a, the second those that
+        // come from b. nN then takes position N from the first (+0) or the second (+8) result.
+        const int n0 = j0 >= 0 ? j0 /2*8 + 0 : j0;
+        const int n1 = j1 >= 0 ? j1 /2*8 + 1 : j1;
+        const int n2 = j2 >= 0 ? j2 /2*8 + 2 : j2;
+        const int n3 = j3 >= 0 ? j3 /2*8 + 3 : j3;
+        const int n4 = j4 >= 0 ? j4 /2*8 + 4 : j4;
+        const int n5 = j5 >= 0 ? j5 /2*8 + 5 : j5;
+        const int n6 = j6 >= 0 ? j6 /2*8 + 6 : j6;
+        const int n7 = j7 >= 0 ? j7 /2*8 + 7 : j7;
+        x0 = blend8i<n0, n1, n2, n3, n4, n5, n6, n7> (
+             blend8i< j0   & 2 ? -256 : i0 &15,  j1   & 2 ? -256 : i1 &15,  j2   & 2 ? -256 : i2 &15,  j3   & 2 ? -256 : i3 &15,  j4   & 2 ? -256 : i4 &15,  j5   & 2 ? -256 : i5 &15,  j6   & 2 ? -256 : i6 &15,  j7   & 2 ? -256 : i7 &15> (a.get_low(),a.get_high()),
+             blend8i<(j0^2)& 6 ? -256 : i0 &15, (j1^2)& 6 ? -256 : i1 &15, (j2^2)& 6 ? -256 : i2 &15, (j3^2)& 6 ? -256 : i3 &15, (j4^2)& 6 ? -256 : i4 &15, (j5^2)& 6 ? -256 : i5 &15, (j6^2)& 6 ? -256 : i6 &15, (j7^2)& 6 ? -256 : i7 &15> (b.get_low(),b.get_high()));
+    }
+
+    // upper output half: same four strategies applied to i8 - i15
+    if (r1 < 0) {
+        // all of i8 - i15 are negative
+        x1 = Vec8i(0);
+    }
+    else if (r1 == s1) {
+        // i8 - i15 all from same source
+        x1 = permute8i<i8&-25, i9&-25, i10&-25, i11&-25, i12&-25, i13&-25, i14&-25, i15&-25> (select4<r1> (a,b));
+    }
+    else if ((j10<0||j10==r1||j10==s1) && (j11<0||j11==r1||j11==s1) && (j12<0||j12==r1||j12==s1) && (j13<0||j13==r1||j13==s1) && (j14<0||j14==r1||j14==s1) && (j15<0||j15==r1||j15==s1)) {
+        // i8 - i15 all from two sources
+        const int k8 =  i8 >= 0 ? (i8 & 7) : i8;
+        const int k9 = (i9 >= 0 ? (i9 & 7) : i9 ) | (j9 == s1 ? 8 : 0);
+        const int k10= (i10>= 0 ? (i10& 7) : i10) | (j10== s1 ? 8 : 0);
+        const int k11= (i11>= 0 ? (i11& 7) : i11) | (j11== s1 ? 8 : 0);
+        const int k12= (i12>= 0 ? (i12& 7) : i12) | (j12== s1 ? 8 : 0);
+        const int k13= (i13>= 0 ? (i13& 7) : i13) | (j13== s1 ? 8 : 0);
+        const int k14= (i14>= 0 ? (i14& 7) : i14) | (j14== s1 ? 8 : 0);
+        const int k15= (i15>= 0 ? (i15& 7) : i15) | (j15== s1 ? 8 : 0);
+        x1 = blend8i<k8,k9,k10,k11,k12,k13,k14,k15> (select4<r1>(a,b), select4<s1>(a,b));
+    }
+    else {
+        // i8 - i15 from three or four different sources
+        const int n8 = j8 >= 0 ? j8 /2*8 + 0 : j8 ;
+        const int n9 = j9 >= 0 ? j9 /2*8 + 1 : j9 ;
+        const int n10= j10>= 0 ? j10/2*8 + 2 : j10;
+        const int n11= j11>= 0 ? j11/2*8 + 3 : j11;
+        const int n12= j12>= 0 ? j12/2*8 + 4 : j12;
+        const int n13= j13>= 0 ? j13/2*8 + 5 : j13;
+        const int n14= j14>= 0 ? j14/2*8 + 6 : j14;
+        const int n15= j15>= 0 ? j15/2*8 + 7 : j15;
+        x1 = blend8i<n8, n9, n10, n11, n12, n13, n14, n15> (
+             blend8i< j8   & 2 ? -256 : i8 &15,  j9   & 2 ? -256 : i9 &15,  j10   & 2 ? -256 : i10 &15,  j11   & 2 ? -256 : i11 &15,  j12   & 2 ? -256 : i12 &15,  j13   & 2 ? -256 : i13 &15,  j14   & 2 ? -256 : i14 &15,  j15   & 2 ? -256 : i15 &15> (a.get_low(),a.get_high()),
+             blend8i<(j8^2)& 6 ? -256 : i8 &15, (j9^2)& 6 ? -256 : i9 &15, (j10^2)& 6 ? -256 : i10 &15, (j11^2)& 6 ? -256 : i11 &15, (j12^2)& 6 ? -256 : i12 &15, (j13^2)& 6 ? -256 : i13 &15, (j14^2)& 6 ? -256 : i14 &15, (j15^2)& 6 ? -256 : i15 &15> (b.get_low(),b.get_high()));
+    }
+    return Vec16i(x0,x1);
+}
+
+// Unsigned counterpart of blend16i: views both operands as Vec16i, blends them with the
+// same sixteen compile-time indexes, and converts the result to Vec16ui.
+template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7,
+          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 >
+static inline Vec16ui blend16ui(Vec16ui const & a, Vec16ui const & b) {
+    const Vec16i signed_a(a);
+    const Vec16i signed_b(b);
+    return Vec16ui(blend16i<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15>(signed_a, signed_b));
+}
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+*
+* This can be used for several purposes:
+*  - table lookup
+*  - permute or blend with variable indexes
+*  - blend from more than two sources
+*  - gather non-contiguous data
+*
+* An index out of range may produce any value - the actual value produced is
+* implementation dependent and may be different for different instruction
+* sets. An index out of range does not produce an error message or exception.
+*
+* Example:
+* Vec8q a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
+* Vec8q b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
+* Vec8q c;
+* c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
+*
+*****************************************************************************/
+
+// Table lookup where the table is itself a 16-element vector.
+// The table is stored to a temporary array, and each half of index is then
+// looked up in that array with the 8-element lookup<16>.
+static inline Vec16i lookup16(Vec16i const & index, Vec16i const & table) {
+    int32_t entries[16];
+    table.store(entries);
+    const Vec8i low_result  = lookup<16>(index.get_low(),  entries);
+    const Vec8i high_result = lookup<16>(index.get_high(), entries);
+    return Vec16i(low_result, high_result);
+}
+
+// Table lookup in an array of n 32-bit integers, using the elements of index as subscripts.
+//   n <= 0 : returns a vector converted from 0
+//   n <= 8 : loads one Vec8i from table and applies lookup8 to each half of index
+//   n <= 16: loads one Vec16i from table and applies lookup16
+//   n > 16 : index is treated as unsigned and reduced modulo n when n is a power of 2,
+//            otherwise clamped to n-1; the elements are then fetched one by one
+// Note: the n <= 8 and n <= 16 paths load a full vector from table even when n is smaller.
+template <int n>
+static inline Vec16i lookup(Vec16i const & index, void const * table) {
+    if (n <= 0) return 0;
+    if (n <= 8) {
+        const Vec8i small_table = Vec8i().load(table);
+        return Vec16i(lookup8(index.get_low(),  small_table),
+                      lookup8(index.get_high(), small_table));
+    }
+    if (n <= 16) return lookup16(index, Vec16i().load(table));
+
+    // n > 16: bring every index into the range 0 .. n-1
+    const bool n_is_power_of_2 = (n & (n-1)) == 0;
+    Vec16ui limited;
+    if (n_is_power_of_2) {
+        limited = Vec16ui(index) & (n-1);      // index modulo n
+    }
+    else {
+        limited = min(Vec16ui(index), n-1);    // clamp to last table entry
+    }
+    int32_t const * entries = (int32_t const *)table;
+    int32_t picked[16];
+    for (int k = 0; k < 16; k++) {
+        picked[k] = entries[limited[k]];
+    }
+    return Vec16i().load(picked);
+}
+
+// Table lookup where the table is itself an 8-element vector of 64-bit integers.
+// The table is stored to a temporary array, and each half of index is then
+// looked up in that array with the 4-element lookup<8>.
+static inline Vec8q lookup8(Vec8q const & index, Vec8q const & table) {
+    int64_t entries[8];
+    table.store(entries);
+    const Vec4q low_result  = lookup<8>(index.get_low(),  entries);
+    const Vec4q high_result = lookup<8>(index.get_high(), entries);
+    return Vec8q(low_result, high_result);
+}
+
+// Table lookup in an array of n 64-bit integers, using the elements of index as subscripts.
+//   n <= 0 : returns a vector converted from 0
+//   n <= 4 : loads one Vec4q from table and applies lookup4 to each half of index
+//   n <= 8 : loads one Vec8q from table and applies lookup8
+//   n > 8  : index is treated as unsigned and reduced modulo n when n is a power of 2,
+//            otherwise clamped to n-1; the elements are then fetched one by one
+// Note: the n <= 4 and n <= 8 paths load a full vector from table even when n is smaller.
+template <int n>
+static inline Vec8q lookup(Vec8q const & index, void const * table) {
+    if (n <= 0) return 0;
+    if (n <= 4) {
+        const Vec4q small_table = Vec4q().load(table);
+        return Vec8q(lookup4(index.get_low(),  small_table),
+                     lookup4(index.get_high(), small_table));
+    }
+    if (n <= 8) return lookup8(index, Vec8q().load(table));
+
+    // n > 8: bring every index into the range 0 .. n-1
+    const bool n_is_power_of_2 = (n & (n-1)) == 0;
+    Vec8uq limited;
+    if (n_is_power_of_2) {
+        limited = Vec8uq(index) & (n-1);       // index modulo n
+    }
+    else {
+        limited = min(Vec8uq(index), n-1);     // clamp to last table entry
+    }
+    int64_t const * entries = (int64_t const *)table;
+    int64_t picked[8];
+    for (int k = 0; k < 8; k++) {
+        picked[k] = entries[limited[k]];
+    }
+    return Vec8q().load(picked);
+}
+
+
+/*****************************************************************************
+*
+*          Gather functions with fixed indexes
+*
+*****************************************************************************/
+// Load sixteen 32-bit integers from array a at the compile-time indexes i0 - i15
+// (element k of the result is a[ik]). All indexes must be non-negative; a negative
+// index triggers the compile-time check below.
+// Strategy, chosen at compile time from the smallest (imin) and biggest (imax) index:
+//  1. imax - imin <= 15: load one contiguous block of 16 elements and permute it.
+//  2. every index lies within the first 16 elements from imin or the last 16 up to imax:
+//     load those two blocks and blend them.
+//  3. otherwise: fall back to lookup<imax+1> with the indexes as a vector.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, 
+int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16i gather16i(void const * a) {
+    // the bitwise OR of all indexes is negative if and only if at least one index is negative
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15)>=0> Negative_array_index;  // Error message if index is negative
+    // find smallest and biggest index, using only compile-time constant expressions
+    // (pairwise reduction tree: 16 -> 8 -> 4 -> 2 -> 1 candidates)
+    const int i01min   = i0  < i1  ? i0  : i1;
+    const int i23min   = i2  < i3  ? i2  : i3;
+    const int i45min   = i4  < i5  ? i4  : i5;
+    const int i67min   = i6  < i7  ? i6  : i7;
+    const int i89min   = i8  < i9  ? i8  : i9;
+    const int i1011min = i10 < i11 ? i10 : i11;
+    const int i1213min = i12 < i13 ? i12 : i13;
+    const int i1415min = i14 < i15 ? i14 : i15;
+    const int i0_3min   = i01min   < i23min    ? i01min   : i23min;
+    const int i4_7min   = i45min   < i67min    ? i45min   : i67min;
+    const int i8_11min  = i89min   < i1011min  ? i89min   : i1011min;
+    const int i12_15min = i1213min < i1415min  ? i1213min : i1415min;
+    const int i0_7min   = i0_3min  < i4_7min   ? i0_3min  : i4_7min;
+    const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
+    const int imin      = i0_7min  < i8_15min  ? i0_7min  : i8_15min;
+    const int i01max   = i0  > i1  ? i0  : i1;
+    const int i23max   = i2  > i3  ? i2  : i3;
+    const int i45max   = i4  > i5  ? i4  : i5;
+    const int i67max   = i6  > i7  ? i6  : i7;
+    const int i89max   = i8  > i9  ? i8  : i9;
+    const int i1011max = i10 > i11 ? i10 : i11;
+    const int i1213max = i12 > i13 ? i12 : i13;
+    const int i1415max = i14 > i15 ? i14 : i15;
+    const int i0_3max   = i01max   > i23max    ? i01max   : i23max;
+    const int i4_7max   = i45max   > i67max    ? i45max   : i67max;
+    const int i8_11max  = i89max   > i1011max  ? i89max   : i1011max;
+    const int i12_15max = i1213max > i1415max  ? i1213max : i1415max;
+    const int i0_7max   = i0_3max  > i4_7max   ? i0_3max  : i4_7max;
+    const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
+    const int imax      = i0_7max  > i8_15max  ? i0_7max  : i8_15max;
+    if (imax - imin <= 15) {
+        // load one contiguous block and permute
+        if (imax > 15) {
+            // make sure we don't read past the end of the array
+            // (the block is a[imax-15 .. imax], so each index is rebased by -(imax-15))
+            Vec16i b = Vec16i().load((int32_t const *)a + imax-15);
+            return permute16i<i0-imax+15, i1-imax+15, i2-imax+15, i3-imax+15, i4-imax+15, i5-imax+15, i6-imax+15, i7-imax+15,
+                i8-imax+15, i9-imax+15, i10-imax+15, i11-imax+15, i12-imax+15, i13-imax+15, i14-imax+15, i15-imax+15> (b);
+        }
+        else {
+            // the block is a[imin .. imin+15], so each index is rebased by -imin
+            Vec16i b = Vec16i().load((int32_t const *)a + imin);
+            return permute16i<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin,
+                i8-imin, i9-imin, i10-imin, i11-imin, i12-imin, i13-imin, i14-imin, i15-imin> (b);
+        }
+    }
+    if ((i0<imin+16  || i0>imax-16)  && (i1<imin+16  || i1>imax-16)  && (i2<imin+16  || i2>imax-16)  && (i3<imin+16  || i3>imax-16)
+    &&  (i4<imin+16  || i4>imax-16)  && (i5<imin+16  || i5>imax-16)  && (i6<imin+16  || i6>imax-16)  && (i7<imin+16  || i7>imax-16)    
+    &&  (i8<imin+16  || i8>imax-16)  && (i9<imin+16  || i9>imax-16)  && (i10<imin+16 || i10>imax-16) && (i11<imin+16 || i11>imax-16)
+    &&  (i12<imin+16 || i12>imax-16) && (i13<imin+16 || i13>imax-16) && (i14<imin+16 || i14>imax-16) && (i15<imin+16 || i15>imax-16) ) {
+        // load two contiguous blocks and blend
+        // b = a[imin .. imin+15], c = a[imax-15 .. imax]
+        Vec16i b = Vec16i().load((int32_t const *)a + imin);
+        Vec16i c = Vec16i().load((int32_t const *)a + imax-15);
+        // jk = blend index: position in b, or 16 + position in c (= 16 + ik - (imax-15))
+        const int j0  = i0 <imin+16 ? i0 -imin : 31-imax+i0;
+        const int j1  = i1 <imin+16 ? i1 -imin : 31-imax+i1;
+        const int j2  = i2 <imin+16 ? i2 -imin : 31-imax+i2;
+        const int j3  = i3 <imin+16 ? i3 -imin : 31-imax+i3;
+        const int j4  = i4 <imin+16 ? i4 -imin : 31-imax+i4;
+        const int j5  = i5 <imin+16 ? i5 -imin : 31-imax+i5;
+        const int j6  = i6 <imin+16 ? i6 -imin : 31-imax+i6;
+        const int j7  = i7 <imin+16 ? i7 -imin : 31-imax+i7;
+        const int j8  = i8 <imin+16 ? i8 -imin : 31-imax+i8;
+        const int j9  = i9 <imin+16 ? i9 -imin : 31-imax+i9;
+        const int j10 = i10<imin+16 ? i10-imin : 31-imax+i10;
+        const int j11 = i11<imin+16 ? i11-imin : 31-imax+i11;
+        const int j12 = i12<imin+16 ? i12-imin : 31-imax+i12;
+        const int j13 = i13<imin+16 ? i13-imin : 31-imax+i13;
+        const int j14 = i14<imin+16 ? i14-imin : 31-imax+i14;
+        const int j15 = i15<imin+16 ? i15-imin : 31-imax+i15;
+        return blend16i<j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15>(b, c);
+    }
+    // use lookup function
+    // (table size imax+1 covers every requested index)
+    return lookup<imax+1>(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), a);
+}
+
+
+// Load eight 64-bit integers from array a at the compile-time indexes i0 - i7
+// (element k of the result is a[ik]). All indexes must be non-negative; a negative
+// index triggers the compile-time check below.
+// Strategy, chosen at compile time from the smallest (imin) and biggest (imax) index:
+//  1. imax - imin <= 7: load one contiguous block of 8 elements and permute it.
+//  2. every index lies within the first 8 elements from imin or the last 8 up to imax:
+//     load those two blocks and blend them.
+//  3. otherwise: fall back to lookup<imax+1> with the indexes as a vector.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q gather8q(void const * a) {
+    // the bitwise OR of all indexes is negative if and only if at least one index is negative
+    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index;  // Error message if index is negative
+
+    // find smallest and biggest index, using only compile-time constant expressions
+    const int i01min = i0 < i1 ? i0 : i1;
+    const int i23min = i2 < i3 ? i2 : i3;
+    const int i45min = i4 < i5 ? i4 : i5;
+    const int i67min = i6 < i7 ? i6 : i7;
+    const int i0123min = i01min < i23min ? i01min : i23min;
+    const int i4567min = i45min < i67min ? i45min : i67min;
+    const int imin = i0123min < i4567min ? i0123min : i4567min;
+    const int i01max = i0 > i1 ? i0 : i1;
+    const int i23max = i2 > i3 ? i2 : i3;
+    const int i45max = i4 > i5 ? i4 : i5;
+    const int i67max = i6 > i7 ? i6 : i7;
+    const int i0123max = i01max > i23max ? i01max : i23max;
+    const int i4567max = i45max > i67max ? i45max : i67max;
+    const int imax = i0123max > i4567max ? i0123max : i4567max;
+    if (imax - imin <= 7) {
+        // load one contiguous block and permute
+        if (imax > 7) {
+            // make sure we don't read past the end of the array
+            // (the block is a[imax-7 .. imax], so each index is rebased by -(imax-7))
+            Vec8q b = Vec8q().load((int64_t const *)a + imax-7);
+            return permute8q<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7> (b);
+        }
+        else {
+            // the block is a[imin .. imin+7], so each index is rebased by -imin
+            Vec8q b = Vec8q().load((int64_t const *)a + imin);
+            return permute8q<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin> (b);
+        }
+    }
+    if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
+    &&  (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
+        // load two contiguous blocks and blend
+        // b = a[imin .. imin+7], c = a[imax-7 .. imax]
+        Vec8q b = Vec8q().load((int64_t const *)a + imin);
+        Vec8q c = Vec8q().load((int64_t const *)a + imax-7);
+        // jk = blend index: position in b, or 8 + position in c (= 8 + ik - (imax-7))
+        const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;
+        const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
+        const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
+        const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
+        const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
+        const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
+        const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
+        const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
+        return blend8q<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+    // use lookup function
+    // (table size imax+1 covers every requested index)
+    return lookup<imax+1>(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), a);
+}
+
+
+/*****************************************************************************
+*
+*          Functions for conversion between integer sizes
+*
+*****************************************************************************/
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// extend_to_int (Vec16s): widens sixteen signed 16-bit elements to 32 bits with sign
+// extension. The low and high halves are widened separately and joined.
+static inline Vec16i extend_to_int (Vec16s const & a) {
+    const Vec16i widened(extend_low(a), extend_high(a));
+    return widened;
+}
+
+// extend_to_int (Vec16us): widens sixteen unsigned 16-bit elements to 32 bits with zero
+// extension. The halves are joined as a Vec16i, which is converted to Vec16ui on return.
+static inline Vec16ui extend_to_int (Vec16us const & a) {
+    const Vec16i widened(extend_low(a), extend_high(a));
+    return widened;
+}
+
+// extend_to_int (Vec16c): widens sixteen signed 8-bit elements to 32 bits with sign
+// extension, in two steps: first to a Vec16s, then through extend_to_int(Vec16s).
+static inline Vec16i extend_to_int (Vec16c const & a) {
+    const Vec16s as_int16(extend_low(a), extend_high(a));
+    return extend_to_int(as_int16);
+}
+
+// extend_to_int (Vec16uc): widens sixteen unsigned 8-bit elements to 32 bits with zero
+// extension, in two steps: first to 16-bit elements, then through extend_to_int.
+// The intermediate vector is a signed Vec16s, so the second step is the sign-extending
+// overload. NOTE(review): this equals zero extension provided extend_low/extend_high of
+// Vec16uc produce values 0..255 (presumably they zero-extend) -- confirm in vectori128.h.
+static inline Vec16ui extend_to_int (Vec16uc const & a) {
+    const Vec16s as_int16(extend_low(a), extend_high(a));
+    return extend_to_int(as_int16);
+}
+
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// extend_low (Vec16i): widens elements 0-7 to 64 bits with sign extension.
+// Works on the low 256-bit half of a, widening its own two halves separately.
+static inline Vec8q extend_low (Vec16i const & a) {
+    const Vec8i source = a.get_low();
+    const Vec8q widened(extend_low(source), extend_high(source));
+    return widened;
+}
+
+// extend_high (Vec16i): widens elements 8-15 to 64 bits with sign extension.
+// Works on the high 256-bit half of a, widening its own two halves separately.
+static inline Vec8q extend_high (Vec16i const & a) {
+    const Vec8i source = a.get_high();
+    const Vec8q widened(extend_low(source), extend_high(source));
+    return widened;
+}
+
+// extend_low (Vec16ui): widens elements 0-7 to 64 bits with zero extension.
+// The two widened quarters are joined as a Vec8q, which is converted to Vec8uq on return.
+static inline Vec8uq extend_low (Vec16ui const & a) {
+    const Vec8q widened(extend_low(a.get_low()), extend_high(a.get_low()));
+    return widened;
+}
+
+// extend_high (Vec16ui): widens elements 8-15 to 64 bits with zero extension.
+// The two widened quarters are joined as a Vec8q, which is converted to Vec8uq on return.
+static inline Vec8uq extend_high (Vec16ui const & a) {
+    const Vec8q widened(extend_low(a.get_high()), extend_high(a.get_high()));
+    return widened;
+}
+
+
+// Compress 32-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress_to_int8 : packs one vector of sixteen 32-bit integers into one
+// vector of sixteen 8-bit integers, in two steps (32 -> 16 bits, then 16 -> 8 bits).
+// Overflow wraps around
+static inline Vec16c compress_to_int8 (Vec16i const & a) {
+    const Vec16s halfwords = compress(a.get_low(), a.get_high());
+    return compress(halfwords.get_low(), halfwords.get_high());
+}
+
+// Function compress_to_int16 : packs one vector of sixteen 32-bit integers into one
+// vector of sixteen 16-bit integers by compressing its two halves together.
+static inline Vec16s compress_to_int16 (Vec16i const & a) {
+    const Vec16s packed = compress(a.get_low(), a.get_high());
+    return packed;
+}
+
+// compress_to_int8_saturated (Vec16i): packs sixteen 32-bit integers into sixteen 8-bit
+// integers with signed saturation, in two steps (32 -> 16 bits, then 16 -> 8 bits).
+static inline Vec16c compress_to_int8_saturated (Vec16i const & a) {
+    const Vec16s halfwords = compress_saturated(a.get_low(), a.get_high());
+    return compress_saturated(halfwords.get_low(), halfwords.get_high());
+}
+
+// compress_to_int16_saturated (Vec16i): packs sixteen 32-bit integers into sixteen 16-bit
+// integers by passing the two halves of a to the signed compress_saturated.
+static inline Vec16s compress_to_int16_saturated (Vec16i const & a) {
+    const Vec16s packed = compress_saturated(a.get_low(), a.get_high());
+    return packed;
+}
+
+// compress_to_int8_saturated (Vec16ui): packs sixteen unsigned 32-bit integers into sixteen
+// unsigned 8-bit integers with unsigned saturation, in two steps (32 -> 16, then 16 -> 8 bits).
+static inline Vec16uc compress_to_int8_saturated (Vec16ui const & a) {
+    const Vec16us halfwords = compress_saturated(a.get_low(), a.get_high());
+    return compress_saturated(halfwords.get_low(), halfwords.get_high());
+}
+
+// compress_to_int16_saturated (Vec16ui): packs sixteen unsigned 32-bit integers into sixteen
+// unsigned 16-bit integers by passing the two halves of a to compress_saturated.
+static inline Vec16us compress_to_int16_saturated (Vec16ui const & a) {
+    const Vec16us packed = compress_saturated(a.get_low(), a.get_high());
+    return packed;
+}
+
+// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers.
+// low supplies result elements 0-7 and high supplies elements 8-15.
+// Overflow wraps around
+static inline Vec16i compress (Vec8q const & low, Vec8q const & high) {
+    const Vec16i packed(compress(low.get_low(),  low.get_high()),
+                        compress(high.get_low(), high.get_high()));
+    return packed;
+}
+
+// Function compress_saturated : packs two vectors of 64-bit integers into one vector of
+// 32-bit integers. low supplies result elements 0-7 and high supplies elements 8-15.
+// Signed, with saturation
+static inline Vec16i compress_saturated (Vec8q const & low, Vec8q const & high) {
+    const Vec16i packed(compress_saturated(low.get_low(),  low.get_high()),
+                        compress_saturated(high.get_low(), high.get_high()));
+    return packed;
+}
+
+// Function compress_saturated : packs two vectors of unsigned 64-bit integers into one vector
+// of unsigned 32-bit integers. low supplies result elements 0-7, high supplies elements 8-15.
+// Unsigned, with saturation
+static inline Vec16ui compress_saturated (Vec8uq const & low, Vec8uq const & high) {
+    const Vec16ui packed(compress_saturated(low.get_low(),  low.get_high()),
+                         compress_saturated(high.get_low(), high.get_high()));
+    return packed;
+}
+
+
+/*****************************************************************************
+*
+*          Integer division operators
+*
+*          Please see the file vectori128.h for explanation.
+*
+*****************************************************************************/
+
+// vector operator / : divide each element by divisor
+
+// vector operator / : divide all sixteen signed elements by the same integer.
+// The 256-bit halves are divided separately by the precomputed divisor d.
+static inline Vec16i operator / (Vec16i const & a, Divisor_i const & d) {
+    const Vec16i quotient(a.get_low() / d, a.get_high() / d);
+    return quotient;
+}
+
+// vector operator /= : divide a in place by the divisor d and return a reference to a
+static inline Vec16i & operator /= (Vec16i & a, Divisor_i const & d) {
+    const Vec16i quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+// vector operator / : divide all sixteen unsigned elements by the same integer.
+// The 256-bit halves are divided separately by the precomputed divisor d.
+static inline Vec16ui operator / (Vec16ui const & a, Divisor_ui const & d) {
+    const Vec16ui quotient(a.get_low() / d, a.get_high() / d);
+    return quotient;
+}
+
+// vector operator /= : divide a in place by the divisor d and return a reference to a
+static inline Vec16ui & operator /= (Vec16ui & a, Divisor_ui const & d) {
+    const Vec16ui quotient = a / d;
+    a = quotient;
+    return a;
+}
+
+
+/*****************************************************************************
+*
+*          Integer division 2: divisor is a compile-time constant
+*
+*****************************************************************************/
+
+// Divide each element of a Vec16i by the compile-time constant d.
+// Each 256-bit half is handled by the 8-element divide_by_i<d>.
+template <int32_t d>
+static inline Vec16i divide_by_i(Vec16i const & a) {
+    const Vec16i quotient(divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
+    return quotient;
+}
+
+// Vec16i a / const_int(d): signed division by a compile-time constant.
+// The Const_int_t argument only carries d in its type; its value is not used.
+template <int32_t d>
+static inline Vec16i operator / (Vec16i const & a, Const_int_t<d>) {
+    const Vec16i quotient = divide_by_i<d>(a);
+    return quotient;
+}
+
+// Vec16i a / const_uint(d): signed division by an unsigned compile-time constant.
+// d must be below 0x80000000 so that it can be represented as int32_t; otherwise the
+// static check below fails at compile time.
+template <uint32_t d>
+static inline Vec16i operator / (Vec16i const & a, Const_uint_t<d>) {
+    Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
+    const Vec16i quotient = divide_by_i<int32_t(d)>(a);              // signed divide
+    return quotient;
+}
+
+// vector operator /= : divide a in place by const_int(d) and return a reference to a
+template <int32_t d>
+static inline Vec16i & operator /= (Vec16i & a, Const_int_t<d> b) {
+    const Vec16i quotient = a / b;
+    a = quotient;
+    return a;
+}
+
+// vector operator /= : divide a in place by const_uint(d) and return a reference to a
+template <uint32_t d>
+static inline Vec16i & operator /= (Vec16i & a, Const_uint_t<d> b) {
+    const Vec16i quotient = a / b;
+    a = quotient;
+    return a;
+}
+
+// Divide each element of a Vec16ui by the compile-time constant d.
+// Each 256-bit half is handled by the 8-element divide_by_ui<d>.
+template <uint32_t d>
+static inline Vec16ui divide_by_ui(Vec16ui const & a) {
+    const Vec16ui quotient(divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
+    return quotient;
+}
+
+// Vec16ui a / const_uint(d): unsigned division by a compile-time constant.
+// The Const_uint_t argument only carries d in its type; its value is not used.
+template <uint32_t d>
+static inline Vec16ui operator / (Vec16ui const & a, Const_uint_t<d>) {
+    const Vec16ui quotient = divide_by_ui<d>(a);
+    return quotient;
+}
+
+// Vec16ui a / const_int(d): unsigned division by a signed compile-time constant.
+// d must be non-negative; a negative d fails the static check below at compile time.
+template <int32_t d>
+static inline Vec16ui operator / (Vec16ui const & a, Const_int_t<d>) {
+    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
+    const Vec16ui quotient = divide_by_ui<d>(a);                     // unsigned divide
+    return quotient;
+}
+
+// vector operator /= : divide a in place by const_uint(d) and return a reference to a
+template <uint32_t d>
+static inline Vec16ui & operator /= (Vec16ui & a, Const_uint_t<d> b) {
+    const Vec16ui quotient = a / b;
+    a = quotient;
+    return a;
+}
+
+// vector operator /= : divide a in place by const_int(d) and return a reference to a
+template <int32_t d>
+static inline Vec16ui & operator /= (Vec16ui & a, Const_int_t<d> b) {
+    const Vec16ui quotient = a / b;
+    a = quotient;
+    return a;
+}
+
+
+/*****************************************************************************
+*
+*          Horizontal scan functions
+*
+*****************************************************************************/
+
+// Get index to the first element that is true. Return -1 if all are false.
+// The low half is searched first; a hit in the high half is offset by 8. If neither
+// half has a hit, the (negative) result of the high-half search is returned unchanged.
+static inline int horizontal_find_first(Vec16ib const & x) {
+    const int in_low = horizontal_find_first(x.get_low());
+    if (in_low >= 0) return in_low;
+    const int in_high = horizontal_find_first(x.get_high());
+    return in_high < 0 ? in_high : in_high + 8;
+}
+
+// Get index to the first element of a Vec8qb that is true.
+// The low half is searched first; a hit in the high half is offset by 4. If neither
+// half has a hit, the (negative) result of the high-half search is returned unchanged.
+static inline int horizontal_find_first(Vec8qb const & x) {
+    const int in_low = horizontal_find_first(x.get_low());
+    if (in_low >= 0) return in_low;
+    const int in_high = horizontal_find_first(x.get_high());
+    return in_high < 0 ? in_high : in_high + 4;
+}
+
+// Count the number of true elements: sum of the counts of the two halves
+static inline uint32_t horizontal_count(Vec16ib const & x) {
+    const uint32_t low_count  = horizontal_count(x.get_low());
+    const uint32_t high_count = horizontal_count(x.get_high());
+    return low_count + high_count;
+}
+
+// Count the number of true elements of a Vec8qb: sum of the counts of the two halves
+static inline uint32_t horizontal_count(Vec8qb const & x) {
+    const uint32_t low_count  = horizontal_count(x.get_low());
+    const uint32_t high_count = horizontal_count(x.get_high());
+    return low_count + high_count;
+}
+
+
+/*****************************************************************************
+*
+*          Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: convert a Vec16b to an integer bitfield.
+// The bits of the low half go into bits 0-7, those of the high half into bits 8-15.
+static inline uint16_t to_bits(Vec16b const & a) {
+    const uint16_t low_bits  = to_bits(a.get_low());
+    const uint16_t high_bits = to_bits(a.get_high());
+    return uint16_t(low_bits | (high_bits << 8));
+}
+
+// to_bits: convert a Vec16ib to an integer bitfield.
+// The bits of the low half go into bits 0-7, those of the high half into bits 8-15.
+static inline uint16_t to_bits(Vec16ib const & a) {
+    const uint16_t low_bits  = to_bits(a.get_low());
+    const uint16_t high_bits = to_bits(a.get_high());
+    return uint16_t(low_bits | (high_bits << 8));
+}
+
+// to_Vec16ib: convert integer bitfield to boolean vector.
+// The low byte of x is expanded into the low half, the high byte into the high half.
+static inline Vec16ib to_Vec16ib(uint16_t const & x) {
+    const uint8_t low_byte  = uint8_t(x);
+    const uint8_t high_byte = uint8_t(x>>8);
+    const Vec16i combined(to_Vec8ib(low_byte), to_Vec8ib(high_byte));
+    return combined;
+}
+
+// to_bits: convert a Vec8b to an integer bitfield.
+// The bits of the low half go into bits 0-3, those of the high half into bits 4-7.
+static inline uint8_t to_bits(Vec8b const & a) {
+    const int low_bits  = to_bits(a.get_low());
+    const int high_bits = to_bits(a.get_high());
+    return uint8_t(low_bits | (high_bits << 4));
+}
+
+// to_Vec8qb: convert integer bitfield to boolean vector.
+// x is expanded into the low half and x shifted right by 4 into the high half.
+static inline Vec8qb to_Vec8qb(uint8_t x) {
+    const uint8_t high_nibble = uint8_t(x>>4);
+    const Vec8q combined(to_Vec4qb(x), to_Vec4qb(high_nibble));
+    return combined;
+}
+
+#endif // VECTORI512_H
diff --git a/vectorclass/vectormath_common.h b/vectorclass/vectormath_common.h
new file mode 100755
index 0000000..edcbd13
--- /dev/null
+++ b/vectorclass/vectormath_common.h
@@ -0,0 +1,310 @@
+/***************************  vectormath_common.h   ****************************
+* Author:        Agner Fog
+* Date created:  2014-04-18
+* Last modified: 2014-10-16
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file containing common code for inline version of mathematical functions.
+*
+* Theory, methods and inspiration based partially on these sources:
+* > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
+*   Ellis Horwood, 1989.
+* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and
+*   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
+* > Cephes math library by Stephen L. Moshier 1992,
+*   http://www.netlib.org/cephes/
+*
+* Calculation methods:
+* Some functions are using Pade approximations f(x) = P(x)/Q(x)
+* Most single precision functions are using Taylor expansions
+*
+* For detailed instructions, see VectorClass.pdf
+*
+* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+******************************************************************************/
+
+#ifndef VECTORMATH_COMMON_H
+#define VECTORMATH_COMMON_H  1
+
+#ifdef VECTORMATH_LIB_H
+#error conflicting header files: vectormath_lib.h for external math functions, other vectormath_xxx.h for inline math functions
+#endif
+
+#include <math.h>
+#include "vectorclass.h"
+
+
+
+/******************************************************************************
+               define mathematical constants
+******************************************************************************/
+#define VM_PI       3.14159265358979323846           // pi
+#define VM_PI_2     1.57079632679489661923           // pi / 2
+#define VM_PI_4     0.785398163397448309616          // pi / 4
+#define VM_SQRT2    1.41421356237309504880           // sqrt(2)
+#define VM_LOG2E    1.44269504088896340736           // 1/log(2)
+#define VM_LOG10E   0.434294481903251827651          // 1/log(10)
+#define VM_LN2      0.693147180559945309417          // log(2)
+#define VM_LN10     2.30258509299404568402           // log(10)
+#define VM_SMALLEST_NORMAL  2.2250738585072014E-308  // smallest normal number, double
+#define VM_SMALLEST_NORMALF 1.17549435E-38f          // smallest normal number, float
+
+
+/******************************************************************************
+      templates for producing infinite and nan in desired vector type
+******************************************************************************/
+template <class VTYPE>
+static inline VTYPE infinite_vec();   // declaration only; each vector type gets an explicit specialization
+
+template <>
+inline Vec2d infinite_vec<Vec2d>() {
+    return infinite2d();
+}
+
+template <>
+inline Vec4f infinite_vec<Vec4f>() {
+    return infinite4f();
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+template <>
+inline Vec4d infinite_vec<Vec4d>() {
+    return infinite4d();
+}
+
+template <>
+inline Vec8f infinite_vec<Vec8f>() {
+    return infinite8f();
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+template <>
+inline Vec8d infinite_vec<Vec8d>() {
+    return infinite8d();
+}
+
+template <>
+inline Vec16f infinite_vec<Vec16f>() {
+    return infinite16f();
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// template for producing quiet NAN; n is forwarded unchanged to the per-type helper (nan2d, nan4f, ...)
+template <class VTYPE>
+static inline VTYPE nan_vec(int n = 0x100);   // declaration only; each vector type gets an explicit specialization
+
+template <>
+inline Vec2d nan_vec<Vec2d>(int n) {
+    return nan2d(n);
+}
+
+template <>
+inline Vec4f nan_vec<Vec4f>(int n) {
+    return nan4f(n);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+template <>
+inline Vec4d nan_vec<Vec4d>(int n) {
+    return nan4d(n);
+}
+
+template <>
+inline Vec8f nan_vec<Vec8f>(int n) {
+    return nan8f(n);
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+template <>
+inline Vec8d nan_vec<Vec8d>(int n) {
+    return nan8d(n);
+}
+
+template <>
+inline Vec16f nan_vec<Vec16f>(int n) {
+    return nan16f(n);
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+// Define NAN trace values
+#define NAN_LOG 0x101  // logarithm for x<0
+#define NAN_POW 0x102  // negative number raised to non-integer power
+#define NAN_HYP 0x104  // acosh for x<1 and atanh for abs(x)>1
+
+
+/******************************************************************************
+                  templates for polynomials
+Using Estrin's scheme to make shorter dependency chains and use FMA, starting
+longest dependency chains first.
+******************************************************************************/
+
+// VTYPE is the type of x and of the result, CTYPE is the type of the coefficients
+template <class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_2(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2) {
+    // calculates polynomial c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x * x;
+    // evaluated as x2 * c2 + (x * c1 + c0) by two nested mul_add calls
+    return mul_add(x2, c2, mul_add(x, c1, c0));
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_3(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
+    // calculates polynomial c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x * x;
+    //return (c2 + c3*x)*x2 + (c1*x + c0);
+    return mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0));
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_4(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
+    // calculates polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x * x;
+    VTYPE x4 = x2 * x2;
+    //return (c2+c3*x)*x2 + ((c0+c1*x) + c4*x4);
+    return mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0) + c4*x4);
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_4n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
+    // calculates polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x * x;
+    VTYPE x4 = x2 * x2;
+    //return (c2+c3*x)*x2 + ((c0+c1*x) + x4);
+    return mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0) + x4);
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_5(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
+    // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x * x;
+    VTYPE x4 = x2 * x2;
+    //return (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x));
+    return mul_add(mul_add(c3,x,c2), x2, mul_add(mul_add(c5,x,c4), x4, mul_add(c1,x,c0)));
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_5n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
+    // calculates polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x * x;
+    VTYPE x4 = x2 * x2;
+    //return (c2+c3*x)*x2 + ((c4+x)*x4 + (c0+c1*x));
+    return mul_add( mul_add(c3,x,c2), x2, mul_add(c4+x,x4,mul_add(c1,x,c0)) );
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_6(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) {
+    // calculates polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x * x;
+    VTYPE x4 = x2 * x2;
+    //return  (c4+c5*x+c6*x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
+    return mul_add(mul_add(c6,x2,mul_add(c5,x,c4)), x4, mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0)));
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_6n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
+    // calculates polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x * x;
+    VTYPE x4 = x2 * x2;
+    //return  (c4+c5*x+x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
+    return mul_add(mul_add(c5,x,c4+x2), x4, mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0)));
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_7(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) {
+    // calculates polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x * x;
+    VTYPE x4 = x2 * x2;
+    //return  ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x));
+    return mul_add(mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4, mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0)));
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_8(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) {
+    // calculates polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x  * x;
+    VTYPE x4 = x2 * x2;
+    VTYPE x8 = x4 * x4;
+    //return  ((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8*x8 + (c2+c3*x)*x2 + (c0+c1*x));
+    return mul_add(mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4,
+           mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0)+c8*x8));
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_9(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) {
+    // calculates polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x  * x;
+    VTYPE x4 = x2 * x2;
+    VTYPE x8 = x4 * x4;
+    //return  (c8+c9*x)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x)));
+    return mul_add(mul_add(c9,x,c8), x8, mul_add(
+        mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4,
+        mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0))));
+}
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_10(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) {
+    // calculates polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x  * x;
+    VTYPE x4 = x2 * x2;
+    VTYPE x8 = x4 * x4;
+    //return  (c8+c9*x+c10*x2)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x)));
+    return mul_add(mul_add(x2,c10,mul_add(c9,x,c8)), x8,
+                   mul_add(mul_add(mul_add(c7,x,c6),x2,mul_add(c5,x,c4)), x4,
+                           mul_add(mul_add(c3,x,c2),x2,mul_add(c1,x,c0))));
+} 
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_13(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
+    // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0
+    // VTYPE may be a vector type, CTYPE is a scalar type
+    VTYPE x2 = x  * x;
+    VTYPE x4 = x2 * x2;
+    VTYPE x8 = x4 * x4;
+    return mul_add(                                                         // (high part)*x8 + (low part)
+             mul_add(
+               mul_add(c13,x,c12), x4,                                      // high part: (c12+c13*x)*x4
+                 mul_add(mul_add(c11,x,c10), x2, mul_add(c9,x,c8))), x8,    //   + ((c10+c11*x)*x2 + (c8+c9*x))
+             mul_add(
+               mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4,         // low part: ((c6+c7*x)*x2 + (c4+c5*x))*x4
+               mul_add(mul_add(c3,x,c2), x2, mul_add(c1,x,c0))));           //   + ((c2+c3*x)*x2 + (c0+c1*x))
+}
+
+
+template<class VTYPE, class CTYPE> 
+static inline VTYPE polynomial_13m(VTYPE const & x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
+    // calculates polynomial c13*x^13 + c12*x^12 + ... + c2*x^2 + x + 0, i.e. c1 is fixed to 1 and c0 to 0
+    // VTYPE may be a vector type, CTYPE is a scalar type; note that the coefficient list starts at c2
+    VTYPE x2 = x  * x;
+    VTYPE x4 = x2 * x2;
+    VTYPE x8 = x4 * x4;
+    // return  ((c8+c9*x) + (c10+c11*x)*x2 + (c12+c13*x)*x4)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + x));
+    return mul_add(
+        mul_add(mul_add(c13,x,c12), x4, mul_add(mul_add(c11,x,c10), x2, mul_add(c9,x,c8))), x8,
+        mul_add( mul_add(mul_add(c7,x,c6), x2, mul_add(c5,x,c4)), x4, mul_add(mul_add(c3,x,c2),x2,x)));
+}
+
+#endif
diff --git a/vectorclass/vectormath_exp.h b/vectorclass/vectormath_exp.h
new file mode 100755
index 0000000..4669b87
--- /dev/null
+++ b/vectorclass/vectormath_exp.h
@@ -0,0 +1,1995 @@
+/****************************  vectormath_exp.h   ******************************
+* Author:        Agner Fog
+* Date created:  2014-04-18
+* Last modified: 2014-12-18
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file containing inline vector functions of logarithms, exponential 
+* and power functions:
+* exp         exponential function
+* expm1       exponential function minus 1
+* log         natural logarithm
+* log1p       natural logarithm of 1+x
+* cbrt        cube root
+* pow         raise vector elements to power
+* pow_ratio   raise vector elements to rational power
+*
+* Theory, methods and inspiration based partially on these sources:
+* > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
+*   Ellis Horwood, 1989.
+* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and
+*   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
+* > Cephes math library by Stephen L. Moshier 1992,
+*   http://www.netlib.org/cephes/
+*
+* For detailed instructions, see vectormath_common.h and VectorClass.pdf
+*
+* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+******************************************************************************/
+
+#ifndef VECTORMATH_EXP_H
+#define VECTORMATH_EXP_H  1 
+
+#include "vectormath_common.h"  
+
+
+/******************************************************************************
+*                 Exponential functions
+******************************************************************************/
+
+// Helper functions, used internally:
+
+// This function calculates pow(2,n) where n must be an integer. Does not check for overflow or underflow
+static inline Vec2d vm_pow2n (Vec2d const & n) {
+    const double exponent_bias = 1023.0;               // exponent bias of a double
+    const double two_pow_52    = 4503599627370496.0;   // 2^52
+    // adding (bias + 2^52) leaves the integer n + bias in the lowest mantissa bits
+    Vec2d biased = n + (exponent_bias + two_pow_52);
+    Vec2q bits = reinterpret_i(biased);                // same bits viewed as 64-bit integers
+    // move n + bias up 52 places into the exponent field and view the bits as double again
+    return reinterpret_d(bits << 52);
+}
+
+static inline Vec4f vm_pow2n (Vec4f const & n) {
+    const float exponent_bias = 127.0;                 // exponent bias of a float
+    const float two_pow_23    =  8388608.0;            // 2^23
+    // adding (bias + 2^23) leaves the integer n + bias in the lowest mantissa bits
+    Vec4f biased = n + (exponent_bias + two_pow_23);
+    Vec4i bits = reinterpret_i(biased);                // same bits viewed as 32-bit integers
+    // move n + bias up 23 places into the exponent field and view the bits as float again
+    return reinterpret_f(bits << 23);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec4d vm_pow2n (Vec4d const & n) {
+    const double exponent_bias = 1023.0;               // exponent bias of a double
+    const double two_pow_52    = 4503599627370496.0;   // 2^52
+    // adding (bias + 2^52) leaves the integer n + bias in the lowest mantissa bits
+    Vec4d biased = n + (exponent_bias + two_pow_52);
+    Vec4q bits = reinterpret_i(biased);                // same bits viewed as 64-bit integers
+    // move n + bias up 52 places into the exponent field and view the bits as double again
+    return reinterpret_d(bits << 52);
+}
+
+static inline Vec8f vm_pow2n (Vec8f const & n) {
+    const float exponent_bias = 127.0;                 // exponent bias of a float
+    const float two_pow_23    =  8388608.0;            // 2^23
+    // adding (bias + 2^23) leaves the integer n + bias in the lowest mantissa bits
+    Vec8f biased = n + (exponent_bias + two_pow_23);
+    Vec8i bits = reinterpret_i(biased);                // same bits viewed as 32-bit integers
+    // move n + bias up 23 places into the exponent field and view the bits as float again
+    return reinterpret_f(bits << 23);
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec8d vm_pow2n (Vec8d const & n) {
+    const double exponent_bias = 1023.0;               // exponent bias of a double
+    const double two_pow_52    = 4503599627370496.0;   // 2^52
+    // adding (bias + 2^52) leaves the integer n + bias in the lowest mantissa bits
+    Vec8d biased = n + (exponent_bias + two_pow_52);
+    Vec8q bits = Vec8q(reinterpret_i(biased));         // same bits viewed as 64-bit integers
+    // move n + bias up 52 places into the exponent field and view the bits as double again
+    return Vec8d(reinterpret_d(bits << 52));
+}
+
+static inline Vec16f vm_pow2n (Vec16f const & n) {
+    const float exponent_bias = 127.0;                 // exponent bias of a float
+    const float two_pow_23    =  8388608.0;            // 2^23
+    // adding (bias + 2^23) leaves the integer n + bias in the lowest mantissa bits
+    Vec16f biased = n + (exponent_bias + two_pow_23);
+    Vec16i bits = Vec16i(reinterpret_i(biased));       // same bits viewed as 32-bit integers
+    // move n + bias up 23 places into the exponent field and view the bits as float again
+    return Vec16f(reinterpret_f(bits << 23));
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for exp function, double precision
+// The limit of abs(x) is defined by max_x below
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: double vector type
+// BTYPE: boolean vector type
+// M1: 0 for exp, 1 for expm1
+// BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)
+
+#if 1  // choose method
+
+// Taylor expansion: the argument is reduced by a multiple r of ln(2) (or its base-2 / base-10 equivalent) and the result is scaled by 2^r
+template<class VTYPE, class BTYPE, int M1, int BA> 
+static inline VTYPE exp_d(VTYPE const & initial_x) {    
+
+    // Taylor coefficients, 1/n!
+    // Not using minimax approximation because we prioritize precision close to x = 0
+    const double p2  = 1./2.;
+    const double p3  = 1./6.;
+    const double p4  = 1./24.;
+    const double p5  = 1./120.; 
+    const double p6  = 1./720.; 
+    const double p7  = 1./5040.; 
+    const double p8  = 1./40320.; 
+    const double p9  = 1./362880.; 
+    const double p10 = 1./3628800.; 
+    const double p11 = 1./39916800.; 
+    const double p12 = 1./479001600.; 
+    const double p13 = 1./6227020800.; 
+
+    // maximum abs(x), value depends on BA, defined below
+    // The lower limit of x is slightly more restrictive than the upper limit.
+    // We are specifying the lower limit, except for BA = 1 because it is not used for negative x
+    double max_x;
+
+    // data vectors
+    VTYPE x, r, z, n2;
+    BTYPE inrange;                               // boolean vector
+
+    if (BA <= 1) { // exp(x) and 0.5*exp(x)
+        max_x = BA == 0 ? 708.39 : 709.7; // lower limit for 0.5*exp(x) is -707.6, but we are using 0.5*exp(x) only for positive x in hyperbolic functions
+        const double ln2d_hi = 0.693145751953125;
+        const double ln2d_lo = 1.42860682030941723212E-6;
+        x  = initial_x;
+        r  = round(initial_x*VM_LOG2E);          //  r = nearest integer to x / ln(2)
+        // subtraction in two steps for higher precision
+        x = nmul_add(r, ln2d_hi, x);             //  x -= r * ln2d_hi;
+        x = nmul_add(r, ln2d_lo, x);             //  x -= r * ln2d_lo;
+    }
+    else if (BA == 2) { // pow(2,x)
+        max_x = 1022.0;
+        r  = round(initial_x);
+        x  = initial_x - r;
+        x *= VM_LN2;                             //  2^x = exp(x * ln(2))
+    }
+    else if (BA == 10) { // pow(10,x)
+        max_x = 307.65;
+        const double log10_2_hi = 0.30102999554947019;     // log10(2) in two parts
+        const double log10_2_lo = 1.1451100899212592E-10;
+        x  = initial_x;
+        r  = round(initial_x*(VM_LOG2E*VM_LN10)); // r = nearest integer to x * log2(10)
+        // subtraction in two steps for higher precision
+        x  = nmul_add(r, log10_2_hi, x);         //  x -= r * log10_2_hi;
+        x  = nmul_add(r, log10_2_lo, x);         //  x -= r * log10_2_lo;
+        x *= VM_LN10;                            //  10^x = exp(x * ln(10))
+    }
+    else  {  // undefined value of BA
+        return 0.;
+    }
+
+    z = polynomial_13m(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); // series without the constant term: approximates exp(x) - 1 for the reduced x
+
+    if (BA == 1) r--;  // 0.5 * exp(x): halve the result by lowering the power of 2 by one
+
+    // multiply by power of 2 
+    n2 = vm_pow2n(r);
+
+    if (M1 == 0) {
+        // exp
+        z = (z + 1.0) * n2;
+    }
+    else {
+        // expm1
+        z = mul_add(z, n2, n2 - 1.0);            // z = z * n2 + (n2 - 1.0);
+    }
+
+    // check for overflow
+    inrange  = abs(initial_x) < max_x;
+    // check for INF and NAN
+    inrange &= is_finite(initial_x);
+
+    if (horizontal_and(inrange)) {
+        // fast normal path
+        return z;
+    }
+    else {
+        // overflow, underflow and NAN
+        r = select(sign_bit(initial_x), 0.-M1, infinite_vec<VTYPE>()); // limit value: 0 (exp) or -1 (expm1) for negative x, infinity otherwise
+        z = select(inrange, z, r);                                     // keep z where x is in range, use the limit value elsewhere
+        z = select(is_nan(initial_x), initial_x, z);                   // NAN goes through
+        return z;
+    }
+}
+
+#else
+
+// Pade expansion uses less code and fewer registers, but is slower
+template<class VTYPE, class BTYPE, int M1, int BA> 
+static inline VTYPE exp_d(VTYPE const & initial_x) {
+
+    // define constants
+    const double ln2p1   = 0.693145751953125;
+    const double ln2p2   = 1.42860682030941723212E-6;
+    const double log2e   = VM_LOG2E;
+    const double max_exp = 708.39;               // NOTE(review): BA is never read in this variant, so only plain exp / expm1 is computed - confirm before enabling
+    // coefficients of pade polynomials
+    const double P0exp = 9.99999999999999999910E-1;
+    const double P1exp = 3.02994407707441961300E-2;
+    const double P2exp = 1.26177193074810590878E-4;
+    const double Q0exp = 2.00000000000000000009E0;
+    const double Q1exp = 2.27265548208155028766E-1;
+    const double Q2exp = 2.52448340349684104192E-3;
+    const double Q3exp = 3.00198505138664455042E-6;
+
+    VTYPE x, r, xx, px, qx, y, n2;               // data vectors
+    BTYPE inrange;                               // boolean vector
+
+    x = initial_x;
+    r = round(initial_x*log2e);
+
+    // subtraction in one step would give loss of precision
+    x -= r * ln2p1;
+    x -= r * ln2p2;
+
+    xx = x * x;
+
+    // px = x * P(x^2).
+    px = polynomial_2(xx, P0exp, P1exp, P2exp) * x;
+
+    // Evaluate Q(x^2).
+    qx = polynomial_3(xx, Q0exp, Q1exp, Q2exp, Q3exp);
+
+    // e^x = 1 + 2*P(x^2)/( Q(x^2) - P(x^2) )
+    y = (2.0 * px) / (qx - px);
+
+    // Get 2^n in double.
+    // n  = round_to_int64_limited(r);
+    // n2 = exp2(n);
+    n2 = vm_pow2n(r);  // this is faster
+
+    if (M1 == 0) {
+        // exp
+        y = (y + 1.0) * n2;
+    }
+    else {
+        // expm1
+        y = y * n2 + (n2 - 1.0);
+    }
+
+    // overflow
+    inrange  = abs(initial_x) < max_exp;
+    // check for INF and NAN
+    inrange &= is_finite(initial_x);
+
+    if (horizontal_and(inrange)) {
+        // fast normal path
+        return y;
+    }
+    else {
+        // overflow, underflow and NAN
+        r = select(sign_bit(initial_x), 0.-M1, infinite_vec<VTYPE>()); // limit value: 0 (exp) or -1 (expm1) for negative x, infinity otherwise
+        y = select(inrange, y, r);                                     // keep y where x is in range, use the limit value elsewhere
+        y = select(is_nan(initial_x), initial_x, y);                   // NAN goes through
+        return y;
+    }
+}
+#endif
+
+// instances of exp_d template
+static inline Vec2d exp(Vec2d const & x) {
+    return exp_d<Vec2d, Vec2db, 0, 0>(x);        // M1 = 0, BA = 0: exp(x)
+}
+
+static inline Vec2d expm1(Vec2d const & x) {
+    return exp_d<Vec2d, Vec2db, 1, 0>(x);        // M1 = 1, BA = 0: exp(x) - 1
+}
+
+static inline Vec2d exp2(Vec2d const & x) {
+    return exp_d<Vec2d, Vec2db, 0, 2>(x);        // M1 = 0, BA = 2: pow(2,x)
+}
+
+static inline Vec2d exp10(Vec2d const & x) {
+    return exp_d<Vec2d, Vec2db, 0, 10>(x);       // M1 = 0, BA = 10: pow(10,x)
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec4d exp(Vec4d const & x) {
+    return exp_d<Vec4d, Vec4db, 0, 0>(x);        // M1 = 0, BA = 0: exp(x)
+}
+
+static inline Vec4d expm1(Vec4d const & x) {
+    return exp_d<Vec4d, Vec4db, 1, 0>(x);        // M1 = 1, BA = 0: exp(x) - 1
+}
+
+static inline Vec4d exp2(Vec4d const & x) {
+    return exp_d<Vec4d, Vec4db, 0, 2>(x);        // M1 = 0, BA = 2: pow(2,x)
+}
+
+static inline Vec4d exp10(Vec4d const & x) {
+    return exp_d<Vec4d, Vec4db, 0, 10>(x);       // M1 = 0, BA = 10: pow(10,x)
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec8d exp(Vec8d const & x) {
+    return exp_d<Vec8d, Vec8db, 0, 0>(x);        // M1 = 0, BA = 0: exp(x)
+}
+
+static inline Vec8d expm1(Vec8d const & x) {
+    return exp_d<Vec8d, Vec8db, 1, 0>(x);        // M1 = 1, BA = 0: exp(x) - 1
+}
+
+static inline Vec8d exp2(Vec8d const & x) {
+    return exp_d<Vec8d, Vec8db, 0, 2>(x);        // M1 = 0, BA = 2: pow(2,x)
+}
+
+static inline Vec8d exp10(Vec8d const & x) {
+    return exp_d<Vec8d, Vec8db, 0, 10>(x);       // M1 = 0, BA = 10: pow(10,x)
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+// Template for exp function, single precision
+// The limit of abs(x) is defined by max_x below
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: float vector type
+// BTYPE: boolean vector type
+// M1: 0 for exp, 1 for expm1
+// BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)
+
+template<class VTYPE, class BTYPE, int M1, int BA> 
+static inline VTYPE exp_f(VTYPE const & initial_x) {
+
+    // Taylor coefficients 1/2! ... 1/7!; P0expf belongs to x^2 because the polynomial is multiplied by x^2 below
+    const float P0expf   =  1.f/2.f;
+    const float P1expf   =  1.f/6.f;
+    const float P2expf   =  1.f/24.f;
+    const float P3expf   =  1.f/120.f; 
+    const float P4expf   =  1.f/720.f; 
+    const float P5expf   =  1.f/5040.f; 
+
+    VTYPE x, r, x2, z, n2;                       // data vectors        
+    BTYPE inrange;                               // boolean vector
+
+    // maximum abs(x), value depends on BA, defined below
+    // The lower limit of x is slightly more restrictive than the upper limit.
+    // We are specifying the lower limit, except for BA = 1 because it is not used for negative x
+    float max_x;
+
+    if (BA <= 1) { // exp(x) and 0.5*exp(x)
+        const float ln2f_hi  =  0.693359375f;
+        const float ln2f_lo  = -2.12194440e-4f;
+        max_x = (BA == 0) ? 87.3f : 89.0f;
+
+        x = initial_x;
+        r = round(initial_x*float(VM_LOG2E));    //  r = nearest integer to x / ln(2)
+        x = nmul_add(r, VTYPE(ln2f_hi), x);      //  x -= r * ln2f_hi;
+        x = nmul_add(r, VTYPE(ln2f_lo), x);      //  x -= r * ln2f_lo;
+    }
+    else if (BA == 2) {                          // pow(2,x)
+        max_x = 126.f;
+        r = round(initial_x);
+        x = initial_x - r;
+        x = x * (float)VM_LN2;                   //  2^x = exp(x * ln(2))
+    }
+    else if (BA == 10) {                         // pow(10,x)
+        max_x = 37.9f;
+        const float log10_2_hi = 0.301025391f;   // log10(2) in two parts
+        const float log10_2_lo = 4.60503907E-6f;
+        x = initial_x;
+        r = round(initial_x*float(VM_LOG2E*VM_LN10)); // r = nearest integer to x * log2(10)
+        x = nmul_add(r, VTYPE(log10_2_hi), x);   //  x -= r * log10_2_hi;
+        x = nmul_add(r, VTYPE(log10_2_lo), x);   //  x -= r * log10_2_lo;
+        x = x * (float)VM_LN10;                  //  10^x = exp(x * ln(10))
+    }
+    else  {  // undefined value of BA
+        return 0.;
+    }
+
+    x2 = x * x;
+    z = polynomial_5(x,P0expf,P1expf,P2expf,P3expf,P4expf,P5expf);    
+    z = mul_add(z, x2, x);                       // z *= x2;  z += x;  (series without the constant term)
+
+    if (BA == 1) r--;                            // 0.5 * exp(x): halve the result by lowering the power of 2 by one
+
+    // multiply by power of 2 
+    n2 = vm_pow2n(r);
+
+    if (M1 == 0) {
+        // exp
+        z = (z + 1.0f) * n2;
+    }
+    else {
+        // expm1
+        z = mul_add(z, n2, n2 - 1.0f);           //  z = z * n2 + (n2 - 1.0f);
+    }
+
+    // check for overflow
+    inrange  = abs(initial_x) < max_x;
+    // check for INF and NAN
+    inrange &= is_finite(initial_x);
+
+    if (horizontal_and(inrange)) {
+        // fast normal path
+        return z;
+    }
+    else {
+        // overflow, underflow and NAN
+        r = select(sign_bit(initial_x), 0.f-M1, infinite_vec<VTYPE>()); // limit value: 0 (exp) or -1 (expm1) for negative x, infinity otherwise
+        z = select(inrange, z, r);                                      // keep z where x is in range, use the limit value elsewhere
+        z = select(is_nan(initial_x), initial_x, z);                    // NAN goes through
+        return z;
+    }
+}
+
+// instances of exp_f template
+static inline Vec4f exp(Vec4f const & x) {
+    return exp_f<Vec4f, Vec4fb, 0, 0>(x);        // M1 = 0, BA = 0: exp(x)
+}
+
+static inline Vec4f expm1(Vec4f const & x) {
+    return exp_f<Vec4f, Vec4fb, 1, 0>(x);        // M1 = 1, BA = 0: exp(x) - 1
+}
+
+static inline Vec4f exp2(Vec4f const & x) {
+    return exp_f<Vec4f, Vec4fb, 0, 2>(x);        // M1 = 0, BA = 2: pow(2,x)
+}
+
+static inline Vec4f exp10(Vec4f const & x) {
+    return exp_f<Vec4f, Vec4fb, 0, 10>(x);       // M1 = 0, BA = 10: pow(10,x)
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f exp(Vec8f const & x) {
+    return exp_f<Vec8f, Vec8fb, 0, 0>(x);        // M1 = 0, BA = 0: exp(x)
+}
+
+static inline Vec8f expm1(Vec8f const & x) {
+    return exp_f<Vec8f, Vec8fb, 1, 0>(x);        // M1 = 1, BA = 0: exp(x) - 1
+}
+
+static inline Vec8f exp2(Vec8f const & x) {
+    return exp_f<Vec8f, Vec8fb, 0, 2>(x);        // M1 = 0, BA = 2: pow(2,x)
+}
+
+static inline Vec8f exp10(Vec8f const & x) {
+    return exp_f<Vec8f, Vec8fb, 0, 10>(x);       // M1 = 0, BA = 10: pow(10,x)
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f exp(Vec16f const & x) {
+    return exp_f<Vec16f, Vec16fb, 0, 0>(x);      // M1 = 0, BA = 0: exp(x)
+}
+
+static inline Vec16f expm1(Vec16f const & x) {
+    return exp_f<Vec16f, Vec16fb, 1, 0>(x);      // M1 = 1, BA = 0: exp(x) - 1
+}
+
+static inline Vec16f exp2(Vec16f const & x) {
+    return exp_f<Vec16f, Vec16fb, 0, 2>(x);      // M1 = 0, BA = 2: pow(2,x)
+}
+
+static inline Vec16f exp10(Vec16f const & x) {
+    return exp_f<Vec16f, Vec16fb, 0, 10>(x);     // M1 = 0, BA = 10: pow(10,x)
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+/******************************************************************************
+*                 Logarithm functions
+******************************************************************************/
+
+// Helper functions: fraction_2(x) = fraction(x)*0.5
+
+// Modified fraction function:
+// Extract the fraction part of a floating point number, and divide by 2
+// The fraction function is defined in vectorf128.h etc.
+// fraction_2(x) = fraction(x)*0.5
+// This version gives half the fraction without extra delay
+// Does not work for x = 0
+static inline Vec4f fraction_2(Vec4f const & a) {
+    Vec4ui t1 = _mm_castps_si128(a);   // reinterpret as 32-bit integer
+    Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F000000); // keep mantissa bits, set exponent field to bias - 1 (0x7E): value in [0.5, 1)
+    return _mm_castsi128_ps(t2);
+}
+
+static inline Vec2d fraction_2(Vec2d const & a) {
+    Vec2uq t1 = _mm_castpd_si128(a);   // reinterpret as 64-bit integer
+    Vec2uq t2 = Vec2uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll); // keep mantissa bits, set exponent field to bias - 1 (0x3FE): value in [0.5, 1)
+    return _mm_castsi128_pd(t2);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f fraction_2(Vec8f const & a) {
+#if defined (VECTORI256_H) && VECTORI256_H > 2  // 256 bit integer vectors are available, AVX2. NOTE(review): the Vec4d overload tests VECTORI256_H > 1 - confirm which threshold is intended
+    Vec8ui t1 = _mm256_castps_si256(a);   // reinterpret as 32-bit integer
+    Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F000000; // keep mantissa bits, set exponent field to bias - 1 (0x7E): value in [0.5, 1)
+    return _mm256_castsi256_ps(t2);
+#else
+    return Vec8f(fraction_2(a.get_low()), fraction_2(a.get_high()));   // process the two 128-bit halves separately
+#endif
+}
+
+static inline Vec4d fraction_2(Vec4d const & a) {
+#if VECTORI256_H > 1  // AVX2
+    Vec4uq t1 = _mm256_castpd_si256(a);   // reinterpret as 64-bit integer
+    Vec4uq t2 = Vec4uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll); // keep mantissa bits, set exponent field to bias - 1 (0x3FE): value in [0.5, 1)
+    return _mm256_castsi256_pd(t2);
+#else
+    return Vec4d(fraction_2(a.get_low()), fraction_2(a.get_high()));   // process the two 128-bit halves separately
+#endif
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f fraction_2(Vec16f const & a) {
+#if INSTRSET >= 9                    // 512 bit integer vectors are available, AVX512
+    return _mm512_getmant_ps(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);
+    //return Vec16f(_mm512_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5f;
+#else
+    return Vec16f(fraction_2(a.get_low()), fraction_2(a.get_high()));
+#endif
+}
+
+static inline Vec8d fraction_2(Vec8d const & a) {   // 8-double version of fraction_2
+#if INSTRSET >= 9                    // 512 bit integer vectors are available, AVX512
+    return _mm512_getmant_pd(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero);   // one instruction: mantissa normalized to the interval [0.5, 1), sign forced to zero
+    //return Vec8d(_mm512_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5;
+#else
+    return Vec8d(fraction_2(a.get_low()), fraction_2(a.get_high()));       // otherwise apply the 256-bit version to each half
+#endif
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Helper functions: exponent_f(x) = exponent(x) as floating point number
+
+union vm_ufi {                // overlays a float and a 32-bit unsigned integer; exponent_f uses it to read the bit pattern of a float constant
+    float f;
+    uint32_t i;
+};
+
+union vm_udi {                // overlays a double and a 64-bit unsigned integer; exponent_f uses it to read the bit pattern of a double constant
+    double d;
+    uint64_t i;
+};
+
+// extract exponent of a positive number x as a floating point number (the sign bit is not masked off before the shift, hence "positive")
+static inline Vec4f exponent_f(Vec4f const & x) {
+    const float pow2_23 =  8388608.0f;           // 2^23
+    const float bias = 127.f;                    // bias in exponent
+    const vm_ufi upow2_23 = {pow2_23};           // union used to read the bit pattern of 2^23
+    Vec4ui a = reinterpret_i(x);                 // bit-cast x to integer
+    Vec4ui b = a >> 23;                          // shift down exponent to low bits
+    Vec4ui c = b | Vec4ui(upow2_23.i);           // insert new exponent: the old biased exponent now sits in the mantissa of 2^23
+    Vec4f  d = reinterpret_f(c);                 // bit-cast back to float; d = 2^23 + biased exponent
+    Vec4f  e = d - (pow2_23 + bias);             // subtract magic number and bias
+    return e;
+}
+
+static inline Vec2d exponent_f(Vec2d const & x) {   // exponent of a positive x as a double (sign bit is not masked off)
+    const double pow2_52 = 4503599627370496.0;   // 2^52
+    const double bias = 1023.0;                  // bias in exponent
+    const vm_udi upow2_52 = {pow2_52};           // union used to read the bit pattern of 2^52
+
+    Vec2uq a = reinterpret_i(x);                 // bit-cast x to integer
+    Vec2uq b = a >> 52;                          // shift down exponent to low bits
+    Vec2uq c = b | Vec2uq(upow2_52.i);           // insert new exponent: the old biased exponent now sits in the mantissa of 2^52
+    Vec2d  d = reinterpret_d(c);                 // bit-cast back to double; d = 2^52 + biased exponent
+    Vec2d  e = d - (pow2_52 + bias);             // subtract magic number and bias
+    return e;
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f exponent_f(Vec8f const & x) {   // exponent of a positive x as a float (sign bit is not masked off)
+    const float pow2_23 =  8388608.0f;           // 2^23
+    const float bias = 127.f;                    // bias in exponent
+    const vm_ufi upow2_23 = {pow2_23};           // union used to read the bit pattern of 2^23
+    Vec8ui a = reinterpret_i(x);                 // bit-cast x to integer
+    Vec8ui b = a >> 23;                          // shift down exponent to low bits
+    Vec8ui c = b | Vec8ui(upow2_23.i);           // insert new exponent: the old biased exponent now sits in the mantissa of 2^23
+    Vec8f  d = reinterpret_f(c);                 // bit-cast back to float; d = 2^23 + biased exponent
+    Vec8f  e = d - (pow2_23 + bias);             // subtract magic number and bias
+    return e;
+} 
+
+// extract exponent of a positive number x as a floating point number (the sign bit is not masked off before the shift, hence "positive")
+static inline Vec4d exponent_f(Vec4d const & x) {
+    const double pow2_52 = 4503599627370496.0;   // 2^52
+    const double bias = 1023.0;                  // bias in exponent
+    const vm_udi upow2_52 = {pow2_52};           // union used to read the bit pattern of 2^52
+
+    Vec4uq a = reinterpret_i(x);                 // bit-cast x to integer
+    Vec4uq b = a >> 52;                          // shift down exponent to low bits
+    Vec4uq c = b | Vec4uq(upow2_52.i);           // insert new exponent: the old biased exponent now sits in the mantissa of 2^52
+    Vec4d  d = reinterpret_d(c);                 // bit-cast back to double; d = 2^52 + biased exponent
+    Vec4d  e = d - (pow2_52 + bias);             // subtract magic number and bias
+    return e;
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f exponent_f(Vec16f const & x) {   // exponent of x as a float, 16-float version
+#if INSTRSET >= 9                                // AVX512
+    return _mm512_getexp_ps(x);                  // dedicated exponent-extraction instruction
+#else
+    return Vec16f(exponent_f(x.get_low()), exponent_f(x.get_high()));   // otherwise apply the 256-bit version to each half
+#endif
+} 
+
+// extract exponent of a positive number x as a floating point number, 8-double version
+static inline Vec8d exponent_f(Vec8d const & x) {
+#if INSTRSET >= 9                                // AVX512
+    return _mm512_getexp_pd(x);                  // dedicated exponent-extraction instruction
+#else
+    return Vec8d(exponent_f(x.get_low()), exponent_f(x.get_high()));    // otherwise apply the 256-bit version to each half
+#endif
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// log function, double precision: natural logarithm of initial_x (M1 = 0) or of initial_x + 1 (M1 = 1)
+// template parameters:
+// VTYPE: f.p. vector type
+// BTYPE: boolean vector type
+// M1: 0 for log, 1 for log1p
+template<class VTYPE, class BTYPE, int M1> 
+static inline VTYPE log_d(VTYPE const & initial_x) {
+
+    // define constants: ln(2) split into a high and a low part, and the coefficients of the rational approximation px/qx
+    const double ln2_hi =  0.693359375;
+    const double ln2_lo = -2.121944400546905827679E-4;
+    const double P0log  =  7.70838733755885391666E0;
+    const double P1log  =  1.79368678507819816313E1;
+    const double P2log  =  1.44989225341610930846E1;
+    const double P3log  =  4.70579119878881725854E0;
+    const double P4log  =  4.97494994976747001425E-1;
+    const double P5log  =  1.01875663804580931796E-4;
+    const double Q0log  =  2.31251620126765340583E1;
+    const double Q1log  =  7.11544750618563894466E1;
+    const double Q2log  =  8.29875266912776603211E1;
+    const double Q3log  =  4.52279145837532221105E1;
+    const double Q4log  =  1.12873587189167450590E1;
+
+    VTYPE x1, x, x2, px, qx, res, fe;            // data vectors
+    BTYPE blend, overflow, underflow;            // boolean vectors
+
+    if (M1 == 0) {
+        x1 = initial_x;                          // log(x)
+    }
+    else {
+        x1 = initial_x + 1.0;                    // log(x+1)
+    }
+    // separate mantissa from exponent 
+    // VTYPE x  = fraction(x1) * 0.5;
+    x  = fraction_2(x1);                         // mantissa in [0.5, 1)
+    fe = exponent_f(x1);                         // exponent as floating point number
+
+    blend = x > VM_SQRT2*0.5;                    // true where the mantissa is above sqrt(2)/2
+    x  = if_add(!blend, x, x);                   // conditional add: x is doubled where blend is false
+    fe = if_add(blend, fe, 1.);                  // conditional add: exponent is incremented where blend is true
+
+    if (M1 == 0) {
+        // log(x). Expand around 1.0
+        x -= 1.0;
+    }
+    else {
+        // log(x+1). Avoid loss of precision when adding 1 and later subtracting 1 if exponent = 0
+        x = select(fe==0., initial_x, x - 1.0);
+    }
+
+    // rational form: res = x^3 * P(x) / Q(x)
+    px  = polynomial_5 (x, P0log, P1log, P2log, P3log, P4log, P5log);
+    x2  = x * x;
+    px *= x * x2;
+    qx  = polynomial_5n(x, Q0log, Q1log, Q2log, Q3log, Q4log);
+    res = px / qx ;
+
+    // add exponent: the low part of fe*ln(2) is added first and the high part last
+    res  = mul_add(fe, ln2_lo, res);             // res += fe * ln2_lo;
+    res += nmul_add(x2, 0.5, x);                 // res += x  - 0.5 * x2;
+    res  = mul_add(fe, ln2_hi, res);             // res += fe * ln2_hi;
+
+    overflow  = !is_finite(x1);
+    underflow = x1 < VM_SMALLEST_NORMAL;         // denormals are not supported by this function
+
+    if (!horizontal_or(overflow | underflow)) {
+        // normal path: no element needs special handling
+        return res;
+    }
+    else {
+        // overflow and underflow; later selects override earlier ones
+        res = select(underflow, nan_vec<VTYPE>(NAN_LOG), res);                   // x1  < 0 gives NAN
+        res = select(x1 == 0. || is_subnormal(x1), -infinite_vec<VTYPE>(), res); // x1 == 0 or denormal gives -INF
+        res = select(overflow,  x1, res);                                        // INF or NAN goes through
+        res = select(is_inf(x1)&sign_bit(x1), nan_vec<VTYPE>(NAN_LOG), res);     // -INF gives NAN
+        return res;
+    }
+}
+
+
+static inline Vec2d log(Vec2d const & x) {
+    return log_d<Vec2d, Vec2db, 0>(x);           // M1 = 0: log(x)
+}
+
+static inline Vec2d log1p(Vec2d const & x) {
+    return log_d<Vec2d, Vec2db, 1>(x);           // M1 = 1: log(x+1)
+}
+
+static inline Vec2d log2(Vec2d const & x) {
+    return VM_LOG2E * log_d<Vec2d, Vec2db, 0>(x);    // log(x) scaled by the constant VM_LOG2E
+}
+
+static inline Vec2d log10(Vec2d const & x) {
+    return VM_LOG10E * log_d<Vec2d, Vec2db, 0>(x);   // log(x) scaled by the constant VM_LOG10E
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec4d log(Vec4d const & x) {
+    return log_d<Vec4d, Vec4db, 0>(x);           // M1 = 0: log(x)
+}
+
+static inline Vec4d log1p(Vec4d const & x) {
+    return log_d<Vec4d, Vec4db, 1>(x);           // M1 = 1: log(x+1)
+}
+
+static inline Vec4d log2(Vec4d const & x) {
+    return VM_LOG2E * log_d<Vec4d, Vec4db, 0>(x);    // log(x) scaled by the constant VM_LOG2E
+}
+
+static inline Vec4d log10(Vec4d const & x) {
+    return VM_LOG10E * log_d<Vec4d, Vec4db, 0>(x);   // log(x) scaled by the constant VM_LOG10E
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec8d log(Vec8d const & x) {
+    return log_d<Vec8d, Vec8db, 0>(x);           // M1 = 0: log(x)
+}
+
+static inline Vec8d log1p(Vec8d const & x) {
+    return log_d<Vec8d, Vec8db, 1>(x);           // M1 = 1: log(x+1)
+}
+
+static inline Vec8d log2(Vec8d const & x) {
+    return VM_LOG2E * log_d<Vec8d, Vec8db, 0>(x);    // log(x) scaled by the constant VM_LOG2E
+}
+
+static inline Vec8d log10(Vec8d const & x) {
+    return VM_LOG10E * log_d<Vec8d, Vec8db, 0>(x);   // log(x) scaled by the constant VM_LOG10E
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+
+// log function, single precision: natural logarithm of initial_x (M1 = 0) or of initial_x + 1 (M1 = 1)
+// template parameters:
+// VTYPE: f.p. vector type
+// ITYPE: integer vector type with same element size
+// BTYPE: boolean vector type
+// BTYPEI: boolean vector type for ITYPE
+// M1: 0 for log, 1 for log1p
+template<class VTYPE, class ITYPE, class BTYPE, class BTYPEI, int M1> 
+static inline VTYPE log_f(VTYPE const & initial_x) {
+
+    // define constants: ln(2) split into a high and a low part, and the polynomial coefficients
+    const float ln2f_hi =  0.693359375f;
+    const float ln2f_lo = -2.12194440E-4f;
+    const float P0logf  =  3.3333331174E-1f;
+    const float P1logf  = -2.4999993993E-1f;
+    const float P2logf  =  2.0000714765E-1f;
+    const float P3logf  = -1.6668057665E-1f;
+    const float P4logf  =  1.4249322787E-1f;
+    const float P5logf  = -1.2420140846E-1f;
+    const float P6logf  =  1.1676998740E-1f;
+    const float P7logf  = -1.1514610310E-1f;
+    const float P8logf  =  7.0376836292E-2f;
+
+    VTYPE x1, x, res, x2, fe;                    // data vectors
+    ITYPE e;                                     // integer vector
+    BTYPE blend, overflow, underflow;            // boolean vectors
+
+    if (M1 == 0) {
+        x1 = initial_x;                          // log(x)
+    }
+    else {
+        x1 = initial_x + 1.0f;                   // log(x+1)
+    }
+
+    // separate mantissa from exponent 
+    x = fraction_2(x1);                          // mantissa in [0.5, 1)
+    e = exponent(x1);                            // exponent as integer vector
+
+    blend = x > float(VM_SQRT2*0.5);             // true where the mantissa is above sqrt(2)/2
+    x  = if_add(!blend, x, x);                   // conditional add: x is doubled where blend is false
+    e  = if_add(BTYPEI(blend),  e, ITYPE(1));    // conditional add: exponent is incremented where blend is true
+    fe = to_float(e);                            // exponent as floating point vector
+
+    if (M1 == 0) {
+        // log(x). Expand around 1.0
+        x -= 1.0f;
+    }
+    else {
+        // log(x+1). Avoid loss of precision when adding 1 and later subtracting 1 if exponent = 0
+        x = select(BTYPE(e==0), initial_x, x - 1.0f);
+    }
+
+    // Taylor expansion: res = x^3 * P(x)
+    res = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
+    x2  = x*x;
+    res *= x2*x;
+
+    // add exponent: the low part of fe*ln(2) is added first and the high part last
+    res  = mul_add(fe, ln2f_lo, res);            // res += ln2f_lo  * fe;
+    res += nmul_add(x2, 0.5f, x);                // res += x - 0.5f * x2;
+    res  = mul_add(fe, ln2f_hi, res);            // res += ln2f_hi  * fe;
+
+    overflow  = !is_finite(x1);
+    underflow = x1 < VM_SMALLEST_NORMALF;        // denormals are not supported by this function
+
+    if (!horizontal_or(overflow | underflow)) {
+        // normal path: no element needs special handling
+        return res;
+    }
+    else {
+        // overflow and underflow; later selects override earlier ones
+        res = select(underflow, nan_vec<VTYPE>(NAN_LOG), res);                    // x1 < 0 gives NAN
+        res = select(x1 == 0.f || is_subnormal(x1), -infinite_vec<VTYPE>(), res); // x1 == 0 or denormal gives -INF
+        res = select(overflow,  x1, res);                                         // INF or NAN goes through
+        res = select(is_inf(x1)&sign_bit(x1), nan_vec<VTYPE>(NAN_LOG), res);      // -INF gives NAN
+        return res;
+    }
+}
+
+static inline Vec4f log(Vec4f const & x) {
+    return log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x);    // M1 = 0: log(x)
+}
+
+static inline Vec4f log1p(Vec4f const & x) {
+    return log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 1>(x);    // M1 = 1: log(x+1)
+}
+
+static inline Vec4f log2(Vec4f const & x) {
+    return float(VM_LOG2E) * log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x);    // log(x) scaled by VM_LOG2E, converted to float first
+}
+
+static inline Vec4f log10(Vec4f const & x) {
+    return float(VM_LOG10E) * log_f<Vec4f, Vec4i, Vec4fb, Vec4ib, 0>(x);   // log(x) scaled by VM_LOG10E, converted to float first
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f log(Vec8f const & x) {
+    return log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x);    // M1 = 0: log(x)
+}
+
+static inline Vec8f log1p(Vec8f const & x) {
+    return log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 1>(x);    // M1 = 1: log(x+1)
+}
+
+static inline Vec8f log2(Vec8f const & x) {
+    return float(VM_LOG2E) * log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x);    // log(x) scaled by VM_LOG2E, converted to float first
+}
+
+static inline Vec8f log10(Vec8f const & x) {
+    return float(VM_LOG10E) * log_f<Vec8f, Vec8i, Vec8fb, Vec8ib, 0>(x);   // log(x) scaled by VM_LOG10E, converted to float first
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f log(Vec16f const & x) {
+    return log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x);    // M1 = 0: log(x)
+}
+
+static inline Vec16f log1p(Vec16f const & x) {
+    return log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 1>(x);    // M1 = 1: log(x+1)
+}
+
+static inline Vec16f log2(Vec16f const & x) {
+    return float(VM_LOG2E) * log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x);    // log(x) scaled by VM_LOG2E, converted to float first
+}
+
+static inline Vec16f log10(Vec16f const & x) {
+    return float(VM_LOG10E) * log_f<Vec16f, Vec16i, Vec16fb, Vec16ib, 0>(x);   // log(x) scaled by VM_LOG10E, converted to float first
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+/******************************************************************************
+*           Cube root and reciprocal cube root
+******************************************************************************/
+
+// cube root template, double precision: iterates on a = x^(-1/3) and derives the requested result from it
+// template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  uint32_t integer vector type with same total number of bits
+// ITYPE2: uint64_t integer vector type with same total number of bits
+// BTYPE:  boolean vector type
+// CR:     -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared
+template<class VTYPE, class ITYPE, class ITYPE2, class BTYPE, int CR> 
+static inline VTYPE cbrt_d(VTYPE const & x) {
+    const int iter = 7;     // iteration count of x^(-1/3) loop, including the final high-precision step
+    int i;
+    VTYPE  xa, xa3, a, a2;
+    ITYPE  m1, m2;
+    BTYPE  underflow;
+    ITYPE2 q1(0x5540000000000000ULL);            // exponent bias
+    ITYPE2 q2(0x0005555500000000ULL);            // exponent multiplier for 1/3
+    ITYPE2 q3(0x0010000000000000ULL);            // denormal limit
+    const double one_third  = 1./3.;
+    const double four_third = 4./3.;
+
+    xa  = abs(x);                                // work on the magnitude; the sign is handled at the end
+    xa3 = one_third*xa;
+
+    // multiply exponent by -1/3 using 32-bit integer arithmetic on the bit pattern; this gives the starting value a
+    m1 = reinterpret_i(xa);
+    m2 = ITYPE(q1) - (m1 >> 20) * ITYPE(q2);
+    a  = reinterpret_d(m2);
+    underflow = BTYPE(ITYPE2(m1) < q3);          // true if denormal or zero
+
+    // Newton Raphson iteration
+    for (i = 0; i < iter-1; i++) {
+        a2 = a * a;
+        a = nmul_add(xa3, a2*a2, four_third*a);  // a = four_third*a - xa3*a2*a2;
+    }
+    // last iteration with better precision
+    a2 = a * a;    
+    a = mul_add(one_third, nmul_add(xa, a2*a2, a), a); // a = a + one_third*(a - xa*a2*a2);
+
+    if (CR == -1) {  // reciprocal cube root
+        // (note: gives wrong sign when input is INF)
+        // generate INF if underflow
+        a = select(underflow, infinite_vec<VTYPE>(), a);
+        // get sign
+        a = sign_combine(a, x);
+    }
+    else if (CR == 1) {     // cube root
+        a = a * a * x;      // x^(-2/3) * x; using x rather than xa keeps the sign of x
+        // generate 0 if underflow
+        a = select(underflow, 0., a);
+    }
+    else if (CR == 2) {     // cube root squared
+        // (note: gives wrong sign when input is INF)
+        a = a * xa;         // x^(-1/3) * |x|
+        // generate 0 if underflow
+        a = select(underflow, 0., a);
+    }
+    return a;
+}
+
+// template instances for cbrt and reciprocal_cbrt
+
+// cube root (CR = 1)
+static inline Vec2d cbrt(Vec2d const & x) {
+    return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, 1> (x);
+}
+
+// reciprocal cube root (CR = -1)
+static inline Vec2d reciprocal_cbrt(Vec2d const & x) {
+    return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, -1> (x);
+}
+
+// cube root squared (CR = 2)
+static inline Vec2d square_cbrt(Vec2d const & x) {
+    return cbrt_d<Vec2d, Vec4ui, Vec2uq, Vec2db, 2> (x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec4d cbrt(Vec4d const & x) {
+    return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, 1> (x);     // CR = 1: cube root
+}
+
+static inline Vec4d reciprocal_cbrt(Vec4d const & x) {
+    return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, -1> (x);    // CR = -1: reciprocal cube root
+}
+
+static inline Vec4d square_cbrt(Vec4d const & x) {
+    return cbrt_d<Vec4d, Vec8ui, Vec4uq, Vec4db, 2> (x);     // CR = 2: cube root squared
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec8d cbrt(Vec8d const & x) {
+    return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, 1> (x);    // CR = 1: cube root
+}
+
+static inline Vec8d reciprocal_cbrt(Vec8d const & x) {
+    return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, -1> (x);   // CR = -1: reciprocal cube root
+}
+
+static inline Vec8d square_cbrt(Vec8d const & x) {
+    return cbrt_d<Vec8d, Vec16ui, Vec8uq, Vec8db, 2> (x);    // CR = 2: cube root squared
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// cube root template, single precision: iterates on a = x^(-1/3) and derives the requested result from it
+// template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  uint32_t integer vector type
+// BTYPE:  boolean vector type
+// CR:     -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared
+template<class VTYPE, class ITYPE, class BTYPE, int CR> 
+static inline VTYPE cbrt_f(VTYPE const & x) {
+
+    const int iter = 6;                          // iteration count of x^(-1/3) loop, including the final high-precision step
+    int i;
+    VTYPE  xa, xa3, a, a2;
+    ITYPE  m1, m2;
+    BTYPE  underflow;
+    ITYPE  q1(0x54800000U);                      // exponent bias
+    ITYPE  q2(0x002AAAAAU);                      // exponent multiplier for 1/3
+    ITYPE  q3(0x00800000U);                      // denormal limit
+    const  float one_third  = float(1./3.);
+    const  float four_third = float(4./3.);
+
+    xa  = abs(x);                                // work on the magnitude; the sign is handled at the end
+    xa3 = one_third*xa;
+
+    // multiply exponent by -1/3 using integer arithmetic on the bit pattern; this gives the starting value a
+    m1 = reinterpret_i(xa);
+    m2 = q1 - (m1 >> 23) * q2;
+    a  = reinterpret_f(m2);
+
+    underflow = BTYPE(m1 < q3);                  // true if denormal or zero
+
+    // Newton Raphson iteration
+    for (i = 0; i < iter-1; i++) {
+        a2 = a*a;        
+        a = nmul_add(xa3, a2*a2, four_third*a);  // a = four_third*a - xa3*a2*a2;
+    }
+    // last iteration with better precision
+    a2 = a*a;    
+    a = mul_add(one_third, nmul_add(xa, a2*a2, a), a); //a = a + one_third*(a - xa*a2*a2);
+
+    if (CR == -1) {                              // reciprocal cube root
+        // generate INF if underflow
+        a = select(underflow, infinite_vec<VTYPE>(), a);
+        // get sign
+        a = sign_combine(a, x);
+    }
+    else if (CR == 1) {                          // cube root
+        a = a * a * x;                           // x^(-2/3) * x; using x rather than xa keeps the sign of x
+        // generate 0 if underflow
+        a = select(underflow, 0., a);
+    }
+    else if (CR == 2) {                          // cube root squared
+        a = a * xa;                              // x^(-1/3) * |x|
+        // generate 0 if underflow
+        a = select(underflow, 0., a);
+    }
+    return a;
+}
+
+// template instances for cbrt and reciprocal_cbrt
+
+// cube root (CR = 1)
+static inline Vec4f cbrt(Vec4f const & x) {
+    return cbrt_f<Vec4f, Vec4ui, Vec4fb, 1> (x);
+}
+
+// reciprocal cube root (CR = -1)
+static inline Vec4f reciprocal_cbrt(Vec4f const & x) {
+    return cbrt_f<Vec4f, Vec4ui, Vec4fb, -1> (x);
+}
+
+// cube root squared (CR = 2)
+static inline Vec4f square_cbrt(Vec4f const & x) {
+    return cbrt_f<Vec4f, Vec4ui, Vec4fb, 2> (x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+static inline Vec8f cbrt(Vec8f const & x) {
+    return cbrt_f<Vec8f, Vec8ui, Vec8fb, 1> (x);     // CR = 1: cube root
+}
+
+static inline Vec8f reciprocal_cbrt(Vec8f const & x) {
+    return cbrt_f<Vec8f, Vec8ui, Vec8fb, -1> (x);    // CR = -1: reciprocal cube root
+}
+
+static inline Vec8f square_cbrt(Vec8f const & x) {
+    return cbrt_f<Vec8f, Vec8ui, Vec8fb, 2> (x);     // CR = 2: cube root squared
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+static inline Vec16f cbrt(Vec16f const & x) {
+    return cbrt_f<Vec16f, Vec16ui, Vec16fb, 1> (x);      // CR = 1: cube root
+}
+
+static inline Vec16f reciprocal_cbrt(Vec16f const & x) {
+    return cbrt_f<Vec16f, Vec16ui, Vec16fb, -1> (x);     // CR = -1: reciprocal cube root
+}
+
+static inline Vec16f square_cbrt(Vec16f const & x) {
+    return cbrt_f<Vec16f, Vec16ui, Vec16fb, 2> (x);      // CR = 2: cube root squared
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// ****************************************************************************
+//                pow template, double precision
+// ****************************************************************************
+// Calculate x to the power of y.
+
+// Precision is important here because rounding errors get multiplied by y.
+// The logarithm is calculated with extra precision, and the exponent is 
+// calculated separately.
+// The logarithm is calculated by Pade approximation with 6'th degree 
+// polynomials. A 7'th degree would be preferred for best precision by high y.
+// The alternative method: log(x) = z + z^3*R(z)/S(z), where z = 2(x-1)/(x+1)
+// did not give better precision.
+
+// Template parameters:
+// VTYPE:  data vector type
+// ITYPE:  signed integer vector type
+// BTYPE:  boolean vector type
+template <class VTYPE, class ITYPE, class BTYPE>
+static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) {
+
+    // define constants
+    const double ln2d_hi = 0.693145751953125;           // log(2) in extra precision, high bits
+    const double ln2d_lo = 1.42860682030941723212E-6;   // low bits of log(2)
+    const double log2e   = VM_LOG2E;                    // 1/log(2)
+    const double pow2_52 = 4503599627370496.0;          // 2^52
+
+    // coefficients for Pade polynomials
+    const double P0logl =  2.0039553499201281259648E1;
+    const double P1logl =  5.7112963590585538103336E1;
+    const double P2logl =  6.0949667980987787057556E1;
+    const double P3logl =  2.9911919328553073277375E1;
+    const double P4logl =  6.5787325942061044846969E0;
+    const double P5logl =  4.9854102823193375972212E-1;
+    const double P6logl =  4.5270000862445199635215E-5;
+    const double Q0logl =  6.0118660497603843919306E1;
+    const double Q1logl =  2.1642788614495947685003E2;
+    const double Q2logl =  3.0909872225312059774938E2;
+    const double Q3logl =  2.2176239823732856465394E2;
+    const double Q4logl =  8.3047565967967209469434E1;
+    const double Q5logl =  1.5062909083469192043167E1;
+
+    // Taylor coefficients for exp function, 1/n!
+    const double p2  = 1./2.;
+    const double p3  = 1./6.;
+    const double p4  = 1./24.;
+    const double p5  = 1./120.; 
+    const double p6  = 1./720.; 
+    const double p7  = 1./5040.; 
+    const double p8  = 1./40320.; 
+    const double p9  = 1./362880.; 
+    const double p10 = 1./3628800.; 
+    const double p11 = 1./39916800.; 
+    const double p12 = 1./479001600.; 
+    const double p13 = 1./6227020800.; 
+
+    // data vectors
+    VTYPE x, x1, x2;
+    VTYPE px, qx, ef, yr, v, z, z1;
+    VTYPE lg, lg1, lg2;
+    VTYPE lgerr, x2err;
+    VTYPE e1, e2, e3, ee;
+    // integer vectors
+    ITYPE ei, ej, yodd;
+    // boolean vectors
+    BTYPE blend, xzero, xnegative;
+    BTYPE overflow, underflow, xfinite, yfinite, efinite;
+
+    // remove sign
+    x1 = abs(x0);
+
+    // Separate mantissa from exponent 
+    // This gives the mantissa * 0.5
+    x  = fraction_2(x1);
+
+    // reduce range: x is doubled where it is not above sqrt(2)/2
+    blend = x > VM_SQRT2*0.5;
+    x  = if_add(!blend, x, x);                   // conditional add
+
+    // Pade approximation
+    // Higher precision than in log function. Still higher precision wanted
+    x -= 1.0;
+    x2 = x*x;
+    px = polynomial_6  (x, P0logl, P1logl, P2logl, P3logl, P4logl, P5logl, P6logl);
+    px *= x * x2;
+    qx = polynomial_6n (x, Q0logl, Q1logl, Q2logl, Q3logl, Q4logl, Q5logl);
+    lg1 = px / qx;
+ 
+    // extract exponent
+    ef = exponent_f(x1);
+    ef = if_add(blend, ef, 1.);                  // conditional add: exponent is incremented where x was not doubled
+
+    // multiply exponent by y
+    // nearest integer e1 goes into exponent of result, remainder yr is added to log
+    e1 = round(ef * y);
+    yr = mul_sub_x(ef, y, e1);                   // calculate remainder yr. precision very important here
+
+    // add initial terms to Pade expansion
+    lg = nmul_add(0.5, x2, x) + lg1;             // lg = (x - 0.5 * x2) + lg1;
+    // calculate rounding errors in lg
+    // rounding error in multiplication 0.5*x*x
+    x2err = mul_sub_x(0.5*x, x, 0.5*x2);
+    // rounding error in additions and subtractions
+    lgerr = mul_add(0.5, x2, lg - x) - lg1;      // lgerr = ((lg - x) + 0.5 * x2) - lg1;
+
+    // extract something for the exponent
+    e2 = round(lg * y * VM_LOG2E);
+    // subtract this from lg, with extra precision
+    v = mul_sub_x(lg, y, e2 * ln2d_hi);
+    v = nmul_add(e2, ln2d_lo, v);                // v -= e2 * ln2d_lo;
+
+    // add remainder from ef * y
+    v = mul_add(yr, VM_LN2, v);                  // v += yr * VM_LN2;
+
+    // correct for previous rounding errors
+    v = nmul_add(lgerr + x2err, y, v);           // v -= (lgerr + x2err) * y;
+
+    // exp function
+
+    // extract something for the exponent if possible
+    x = v;
+    e3 = round(x*log2e);
+    // high precision multiplication not needed here because abs(e3) <= 1
+    x = nmul_add(e3, VM_LN2, x);                 // x -= e3 * VM_LN2;
+
+    z = polynomial_13m(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13);
+    z = z + 1.0;
+
+    // contributions to exponent
+    ee = e1 + e2 + e3;
+    ei = round_to_int64_limited(ee);
+    // biased exponent of result:
+    ej = ei + (ITYPE(reinterpret_i(z)) >> 52);
+    // check exponent for overflow and underflow
+    overflow  = BTYPE(ej >= 0x07FF) | (ee >  3000.);
+    underflow = BTYPE(ej <= 0x0000) | (ee < -3000.);
+
+    // add exponent by integer addition
+    z = reinterpret_d(ITYPE(reinterpret_i(z)) + (ei << 52));
+
+    // check for special cases
+    xfinite   = is_finite(x0);
+    yfinite   = is_finite(y);
+    efinite   = is_finite(ee);
+    xzero     = is_zero_or_subnormal(x0);
+    xnegative = x0  < 0.;
+
+    // check for overflow and underflow
+    if (horizontal_or(overflow | underflow)) {
+        // handle errors: underflow gives 0, overflow gives INF
+        z = select(underflow, VTYPE(0.), z);
+        z = select(overflow, infinite_vec<VTYPE>(), z);
+    }
+
+    // check for x == 0: result is INF for y < 0, 1 for y == 0, otherwise 0
+    z = select(xzero, select(y < 0., infinite_vec<VTYPE>(), select(y == 0., VTYPE(1.), VTYPE(0.))), z);
+
+    // check for x < 0. y must be integer
+    if (horizontal_or(xnegative)) {
+        // test if y odd
+        yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63;     // convert y to integer and shift bit 0 to position of sign bit
+        z1 = z | (x0 & VTYPE(reinterpret_d(yodd)));              // apply sign if y odd
+        z1 = select(y == round(y), z1, nan_vec<VTYPE>(NAN_POW)); // NAN if y not integer
+        z = select(xnegative, z1, z);
+    }
+
+    // check for range errors
+    if (horizontal_and(xfinite & yfinite & efinite)) {
+        // fast return if no special cases
+        return z;
+    }
+    // handle special error cases
+    z = select(yfinite & efinite, z, select(x1 == 1., VTYPE(1.), select((x1 > 1.) ^ sign_bit(y), infinite_vec<VTYPE>(), 0.)));
+    yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63; // same as above
+    z = select(xfinite, z, select(y == 0., VTYPE(1.), select(y < 0., VTYPE(0.), infinite_vec<VTYPE>() | ( VTYPE(reinterpret_d(yodd)) & x0))));
+    z = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z));   // NAN in x0 or y is passed through to the result
+    return z;
+}; // NOTE(review): the ';' after the function body is redundant
+
+
+//This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined:
+//template <typename TT> static Vec2d pow(Vec2d const & a, TT n);
+
+// instantiations of pow_template_d:
+template <>
+inline Vec2d pow<Vec2d const &>(Vec2d const & x, Vec2d const & y) {
+    return pow_template_d<Vec2d, Vec2q, Vec2db>(x, y);           // vector x, vector y
+}
+
+template <>
+inline Vec2d pow<double>(Vec2d const & x, double y) {
+    return pow_template_d<Vec2d, Vec2q, Vec2db>(x, y);           // scalar y is implicitly converted to Vec2d for the call
+}
+template <>
+inline Vec2d pow<float>(Vec2d const & x, float y) {
+    return pow_template_d<Vec2d, Vec2q, Vec2db>(x, (double)y);   // float y is widened to double, then implicitly converted to Vec2d
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+template <>
+inline Vec4d pow<Vec4d const &>(Vec4d const & x, Vec4d const & y) {
+    return pow_template_d<Vec4d, Vec4q, Vec4db>(x, y);           // vector x, vector y
+}
+
+template <>
+inline Vec4d pow<double>(Vec4d const & x, double y) {
+    return pow_template_d<Vec4d, Vec4q, Vec4db>(x, y);           // scalar y is implicitly converted to Vec4d for the call
+}
+
+template <>
+inline Vec4d pow<float>(Vec4d const & x, float y) {
+    return pow_template_d<Vec4d, Vec4q, Vec4db>(x, (double)y);   // float y is widened to double, then implicitly converted to Vec4d
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+template <>
+inline Vec8d pow<Vec8d const &>(Vec8d const & x, Vec8d const & y) {
+    return pow_template_d<Vec8d, Vec8q, Vec8db>(x, y);           // vector x, vector y
+}
+
+template <>
+inline Vec8d pow<double>(Vec8d const & x, double y) {
+    return pow_template_d<Vec8d, Vec8q, Vec8db>(x, y);           // scalar y is implicitly converted to Vec8d for the call
+}
+
+template <>
+inline Vec8d pow<float>(Vec8d const & x, float y) {
+    return pow_template_d<Vec8d, Vec8q, Vec8db>(x, (double)y);   // float y is widened to double, then implicitly converted to Vec8d
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+
+// ****************************************************************************
+//                pow template, single precision
+// ****************************************************************************
+
+// Template parameters:
+// VTYPE:  data vector type
+// ITYPE:  signed integer vector type
+// BTYPE:  boolean vector type
+// Calculate x to the power of y
+template <class VTYPE, class ITYPE, class BTYPE>
+static inline VTYPE pow_template_f(VTYPE const & x0, VTYPE const & y) {
+
+    // define constants
+    const float ln2f_hi  =  0.693359375f;
+    const float ln2f_lo  = -2.12194440e-4f;
+    //const float max_expf =  87.3f;
+    const float log2e    =  float(VM_LOG2E);     // 1/log(2)
+    const float pow2_23  =  8388608.0f;          // 2^23
+
+    const float P0logf  =  3.3333331174E-1f;
+    const float P1logf  = -2.4999993993E-1f;
+    const float P2logf  =  2.0000714765E-1f;
+    const float P3logf  = -1.6668057665E-1f;
+    const float P4logf  =  1.4249322787E-1f;
+    const float P5logf  = -1.2420140846E-1f;
+    const float P6logf  =  1.1676998740E-1f;
+    const float P7logf  = -1.1514610310E-1f;
+    const float P8logf  =  7.0376836292E-2f;
+
+    // Taylor coefficients for exp function, 1/n!
+    const float p2expf   =  1.f/2.f;
+    const float p3expf   =  1.f/6.f;
+    const float p4expf   =  1.f/24.f;
+    const float p5expf   =  1.f/120.f; 
+    const float p6expf   =  1.f/720.f; 
+    const float p7expf   =  1.f/5040.f; 
+
+    // data vectors
+    VTYPE x, x1, x2;
+    VTYPE ef, yr, v, z, z1;
+    VTYPE lg, lg1;
+    VTYPE lgerr, x2err;
+    VTYPE e1, e2, e3, ee;
+    // integer vectors
+    ITYPE ei, ej, yodd;
+    // boolean vectors
+    BTYPE blend, xzero, xnegative;
+    BTYPE overflow, underflow, xfinite, yfinite, efinite;
+
+    // remove sign
+    x1 = abs(x0);
+
+    // Separate mantissa from exponent 
+    // This gives the mantissa * 0.5
+    x  = fraction_2(x1);
+
+    // reduce range of x = +/- sqrt(2)/2
+    blend = x > float(VM_SQRT2 * 0.5);
+    x  = if_add(!blend, x, x);                   // conditional add
+
+    // Taylor expansion, high precision
+    x   -= 1.0f;
+    x2   = x * x;
+    lg1  = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf);
+    lg1 *= x2 * x; 
+ 
+    // extract exponent
+    ef = exponent_f(x1);
+    ef = if_add(blend, ef, 1.0f);                // conditional add
+
+    // multiply exponent by y
+    // nearest integer e1 goes into exponent of result, remainder yr is added to log
+    e1 = round(ef * y);
+    yr = mul_sub_x(ef, y, e1);                   // calculate remainder yr. precision very important here
+
+    // add initial terms to expansion
+    lg = nmul_add(0.5f, x2, x) + lg1;            // lg = (x - 0.5f * x2) + lg1;
+
+    // calculate rounding errors in lg
+    // rounding error in multiplication 0.5*x*x
+    x2err = mul_sub_x(0.5f*x, x, 0.5f * x2);
+    // rounding error in additions and subtractions
+    lgerr = mul_add(0.5f, x2, lg - x) - lg1;     // lgerr = ((lg - x) + 0.5f * x2) - lg1;
+
+    // extract something for the exponent
+    e2 = round(lg * y * float(VM_LOG2E));
+    // subtract this from lg, with extra precision
+    v = mul_sub_x(lg, y, e2 * ln2f_hi);
+    v = nmul_add(e2, ln2f_lo, v);                // v -= e2 * ln2f_lo;
+
+    // correct for previous rounding errors
+    v -= mul_sub(lgerr + x2err, y, yr * float(VM_LN2)); // v -= (lgerr + x2err) * y - yr * float(VM_LN2) ;
+
+    // exp function
+
+    // extract something for the exponent if possible
+    x = v;
+    e3 = round(x*log2e);
+    // high precision multiplication not needed here because abs(e3) <= 1
+    x = nmul_add(e3, float(VM_LN2), x);          // x -= e3 * float(VM_LN2);
+
+    // Taylor polynomial
+    x2  = x  * x;
+    z = polynomial_5(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf)*x2 + x + 1.0f;
+
+    // contributions to exponent
+    ee = e1 + e2 + e3;
+    ei = round_to_int(ee);
+    // biased exponent of result:
+    ej = ei + (ITYPE(reinterpret_i(z)) >> 23);
+    // check exponent for overflow and underflow
+    overflow  = BTYPE(ej >= 0x0FF) | (ee >  300.f);
+    underflow = BTYPE(ej <= 0x000) | (ee < -300.f);
+
+    // add exponent by integer addition
+    z = reinterpret_f(ITYPE(reinterpret_i(z)) + (ei << 23)); // the extra 0x10000 is shifted out here
+
+    // check for special cases
+    xfinite   = is_finite(x0);
+    yfinite   = is_finite(y);
+    efinite   = is_finite(ee);
+    xzero     = is_zero_or_subnormal(x0);
+    xnegative = x0  < 0.f;
+
+    // check for overflow and underflow
+    if (horizontal_or(overflow | underflow)) {
+        // handle errors
+        z = select(underflow, VTYPE(0.f), z);
+        z = select(overflow, infinite_vec<VTYPE>(), z);
+    }
+
+    // check for x == 0
+    z = select(xzero, select(y < 0.f, infinite_vec<VTYPE>(), select(y == 0.f, VTYPE(1.), VTYPE(0.f))), z);
+
+    // check for x < 0. y must be integer
+    if (horizontal_or(xnegative)) {
+        // test if y odd
+        yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31;     // convert y to integer and shift bit 0 to position of sign bit
+        z1 = z | (x0 & VTYPE(reinterpret_f(yodd)));              // apply sign if y odd
+        z1 = select(y == round(y), z1, nan_vec<VTYPE>(NAN_POW)); // NAN if y not integer
+        z = select(xnegative, z1, z);
+    }
+
+    // check for range errors
+    if (horizontal_and(xfinite & yfinite & efinite)) {
+        // fast return if no special cases
+        return z;
+    }
+    // handle special error cases
+    z = select(yfinite & efinite, z, select(x1 == 1.f, VTYPE(1.f), select((x1 > 1.f) ^ sign_bit(y), infinite_vec<VTYPE>(), 0.f)));
+    yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31; // same as above
+    z = select(xfinite, z, select(y == 0.f, VTYPE(1.f), select(y < 0.f, VTYPE(0.f), infinite_vec<VTYPE>() | (VTYPE(reinterpret_f(yodd)) & x0))));
+    z = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z));
+    return z;
+}
+
+//This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined:
+//template <typename TT> static Vec4f pow(Vec4f const & a, TT n);
+
+template <>                                      // Vec4f base, exponent supplied as a Vec4f
+inline Vec4f pow<Vec4f const &>(Vec4f const & x, Vec4f const & y) {
+    return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, y);
+}
+
+template <>                                      // Vec4f base, scalar float exponent
+inline Vec4f pow<float>(Vec4f const & x, float y) {
+    return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, y);
+}
+
+template <>                                      // Vec4f base, scalar double exponent (narrowed to float first)
+inline Vec4f pow<double>(Vec4f const & x, double y) {
+    return pow_template_f<Vec4f, Vec4i, Vec4fb>(x, (float)y);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+template <>                                      // Vec8f base, exponent supplied as a Vec8f
+inline Vec8f pow<Vec8f const &>(Vec8f const & x, Vec8f const & y) {
+    return pow_template_f<Vec8f, Vec8i,  Vec8fb>(x, y);
+}
+
+template <>                                      // Vec8f base, scalar float exponent
+inline Vec8f pow<float>(Vec8f const & x, float y) {
+    return pow_template_f<Vec8f, Vec8i,  Vec8fb>(x, y);
+}
+template <>                                      // Vec8f base, scalar double exponent (narrowed to float first)
+inline Vec8f pow<double>(Vec8f const & x, double y) {
+    return pow_template_f<Vec8f, Vec8i,  Vec8fb>(x, (float)y);
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+
+inline Vec16f pow(Vec16f const & x, Vec16f const & y) {   // plain overload: exponent supplied as a Vec16f
+    return pow_template_f<Vec16f, Vec16i,  Vec16fb>(x, y);
+}
+
+inline Vec16f pow(Vec16f const & x, float y) {             // plain overload: scalar float exponent
+    return pow_template_f<Vec16f, Vec16i,  Vec16fb>(x, y);
+}
+
+inline Vec16f pow(Vec16f const & x, double y) {            // plain overload: scalar double exponent (narrowed to float first)
+    return pow_template_f<Vec16f, Vec16i,  Vec16fb>(x, (float)y);
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// *************************************************************
+//             power function with rational exponent
+// *************************************************************
+// Power function with rational exponent: x^(a/b)
+// Template must be defined as class to allow partial template specialization
+template <int a, int b>
+class Power_rational {
+public:
+    // Generic case: one pow() overload per vector type, each computing x^(a/b) through ::pow
+    Vec4f pow(Vec4f const & x) {
+        Vec4f y = x;
+        // negative x allowed when b odd or a even
+        // (if a is even then either b is odd or a/b can be reduced, 
+        // but we can check a even anyway at no cost to be sure)
+        if (a == 0) return 1.f;                         // exponent 0: result is 1 for every x
+        if ((b | ~a) & 1) y = abs(y);                   // b odd or a even: compute on abs(x)
+        y = ::pow(y, float(double(a)/double(b)));       // a/b evaluated in double, then narrowed to float
+        if (a & b & 1) y = sign_combine(y, x);          // apply sign if a and b both odd
+        if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y); // a and b of equal sign: return 0 where x == 0
+        return y;
+    }
+    Vec2d pow(Vec2d const & x) {                        // same scheme, double precision
+        Vec2d y = x;
+        if (a == 0) return 1.;
+        if ((b | ~a) & 1) y = abs(y);
+        y = ::pow(y, double((long double)a/(long double)b)); // a/b evaluated in long double, then narrowed
+        if (a & b & 1) y = sign_combine(y, x);
+        if ((a ^ b) >= 0) y = select(x == 0., 0., y);
+        return y;
+    }
+#if MAX_VECTOR_SIZE >= 256
+    Vec8f pow(Vec8f const & x) {                        // same scheme as the Vec4f overload
+        Vec8f y = x;
+        if (a == 0) return 1.f;
+        if ((b | ~a) & 1) y = abs(y);
+        y = ::pow(y, float(double(a)/double(b)));
+        if (a & b & 1) y = sign_combine(y, x);
+        if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y);
+        return y;
+    }
+    Vec4d pow(Vec4d const & x) {                        // same scheme as the Vec2d overload
+        Vec4d y = x;
+        if (a == 0) return 1.;
+        if ((b | ~a) & 1) y = abs(y);
+        y = ::pow(y, double((long double)a/(long double)b));
+        if (a & b & 1) y = sign_combine(y, x);
+        if ((a ^ b) >= 0) y = select(x == 0., 0., y);
+        return y;
+    }
+#endif // MAX_VECTOR_SIZE >= 256
+#if MAX_VECTOR_SIZE >= 512
+    Vec16f pow(Vec16f const & x) {                      // same scheme as the Vec4f overload
+        Vec16f y = x;
+        if (a == 0) return 1.f;
+        if ((b | ~a) & 1) y = abs(y);
+        y = ::pow(y, float(double(a)/double(b)));
+        if (a & b & 1) y = sign_combine(y, x);
+        if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y);
+        return y;
+    }
+    Vec8d pow(Vec8d const & x) {                        // same scheme as the Vec2d overload
+        Vec8d y = x;
+        if (a == 0) return 1.;
+        if ((b | ~a) & 1) y = abs(y);
+        y = ::pow(y, double((long double)a/(long double)b));
+        if (a & b & 1) y = sign_combine(y, x);
+        if ((a ^ b) >= 0) y = select(x == 0., 0., y);
+        return y;
+    }
+#endif // MAX_VECTOR_SIZE >= 512
+};
+
+// partial specialization for b = 0: the exponent a/0 is undefined, so x is ignored and nan_vec(NAN_LOG) is returned
+template<int a>
+class Power_rational<a,0> {
+public:
+    template<class VTYPE>
+    VTYPE pow(VTYPE const & x) {return nan_vec<VTYPE>(NAN_LOG);}
+};
+
+// partial specialization for b = 1: integer exponent, forwarded to pow_n<a>
+template<int a>
+class Power_rational<a,1> {
+public:
+    template<class VTYPE>
+    VTYPE pow(VTYPE const & x) {return pow_n<a>(x);}
+};
+
+// partial specialization for b = 2: x^(a/2) = x^floor(a/2), times sqrt(x) when a is odd
+template<int a>
+class Power_rational<a,2> {
+public:
+    template<class VTYPE>
+    VTYPE pow(VTYPE const & x) {
+        VTYPE y = pow_n<(a > 0 ? a/2 : (a-1)/2)>(x);   // integer part; (a-1)/2 rounds a negative odd a downwards
+        if (a & 1) y *= sqrt(x);                       // remaining half power for odd a
+        return y;
+    }
+};
+
+// full specialization for a = 1, b = 2: x^(1/2) is a plain square root
+template<>
+class Power_rational<1,2> {
+public:
+    template<class VTYPE>
+    VTYPE pow(VTYPE const & x) {        
+        return sqrt(x);
+    }
+};
+
+// full specialization for a = -1, b = 2: x^(-1/2) computed as 1 / sqrt(x)
+template<>
+class Power_rational<-1,2> {
+public:
+    template<class VTYPE>
+    VTYPE pow(VTYPE const & x) {        
+        // (this is faster than iteration method on modern CPUs)
+        return VTYPE(1.f) / sqrt(x);
+    }
+};
+
+// partial specialization for b = 3: cube-root helpers combined with an integer power of x
+template<int a>
+class Power_rational<a,3> {
+public:
+    template<class VTYPE>
+    VTYPE pow(VTYPE const & x) {
+        VTYPE t;
+        switch (a % 3) {                        // the remainder has the sign of a
+        case -2:
+            t = reciprocal_cbrt(x);
+            t *= t;                             // x^(-2/3)
+            if (a == -2) return t;
+            t = t / pow_n<(-a-2)/3>(x);         // divide by the integer part of the exponent
+            break;
+        case -1:
+            t = reciprocal_cbrt(x);             // x^(-1/3)
+            if (a == -1) return t;
+            t = t / pow_n<(-a-1)/3>(x);
+            break;
+        case  0: default:                       // default label: t is assigned on every path, as in the b = 4, 6, 8 versions
+            t = pow_n<a/3>(x);                  // a divisible by 3: pure integer power
+            break;
+        case  1:
+            t = cbrt(x);                        // x^(1/3)
+            if (a == 1) return t;
+            t = t * pow_n<a/3>(x);
+            break;
+        case  2:
+            t = square_cbrt(x);                 // x^(2/3)
+            if (a == 2) return t;
+            t = t * pow_n<a/3>(x);
+            break;
+        }
+        return t;
+    }
+};
+
+// partial specialization for b = 4: repeated square roots combined with an integer power of x
+template<int a>
+class Power_rational<a,4> {
+public:
+    template<class VTYPE>
+    VTYPE pow(VTYPE const & x) {
+        VTYPE t, s1, s2;
+        s1 = sqrt(x);                           // x^(1/2)
+        if (a & 1) s2 = sqrt(s1);               // x^(1/4), computed only for odd a
+        switch (a % 4) {                        // the remainder has the sign of a
+        case -3:
+            t = s2 / pow_n<1+(-a)/4>(x);        // x^(1/4) divided by one integer power more
+            break;
+        case -2:
+            t = s1 / pow_n<1+(-a)/4>(x);        // x^(1/2) divided by one integer power more
+            break;
+        case -1:
+            if (a != -1) s2 *= pow_n<(-a)/4>(x);
+            t = VTYPE(1.f) / s2;                // reciprocal of x^(1/4 + integer part)
+            break;
+        case  0: default:
+            t = pow_n<a/4>(x);                  // a divisible by 4: pure integer power
+            break;
+        case  1:
+            t = s2;                             // x^(1/4)
+            if (a != 1) t *= pow_n<a/4>(x);
+            break;
+        case  2:
+            t = s1;                             // x^(2/4)
+            if (a != 2) t *= pow_n<a/4>(x);
+            break;
+        case  3:
+            t = s1 * s2;                        // x^(3/4)
+            if (a != 3) t *= pow_n<a/4>(x);
+            break;
+        }
+        return t;
+    }
+};
+
+// partial specialization for b = 6: cube roots and square roots combined with an integer power of x
+template<int a>
+class Power_rational<a,6> {
+public:
+    template<class VTYPE>
+    VTYPE pow(VTYPE const & x) {
+        VTYPE t;                                // only one temporary is needed here
+        switch (a % 6) {                        // the remainder has the sign of a
+        case -5:
+            t = reciprocal_cbrt(x);             // x^(-1/3)
+            t = t * t * sqrt(t);                // x^(-2/3) * x^(-1/6) = x^(-5/6)
+            if (a != -5) t /= pow_n<(-a)/6>(x);
+            break;
+        case -4:
+            t = reciprocal_cbrt(x);
+            t *= t;                             // x^(-4/6)
+            if (a != -4) t /= pow_n<(-a)/6>(x);
+            break;
+        case -3:
+            t = pow_n<a/6>(x);                  // integer part (a/6 truncates towards zero)
+            t /= sqrt(x);                       // remaining x^(-3/6)
+            break;
+        case -2:
+            t = reciprocal_cbrt(x);             // x^(-2/6)
+            if (a != -2) t /= pow_n<(-a)/6>(x);
+            break;
+        case -1:
+            t = sqrt(reciprocal_cbrt(x));       // x^(-1/6)
+            if (a != -1) t /= pow_n<(-a)/6>(x);
+            break;
+        case  0: default:
+            t = pow_n<a/6>(x);                  // a divisible by 6: pure integer power
+            break;
+        case  1:
+            t = sqrt(cbrt(x));                  // x^(1/6)
+            if (a != 1) t *= pow_n<a/6>(x);
+            break;
+        case  2:
+            t = cbrt(x);                        // x^(2/6)
+            if (a != 2) t *= pow_n<a/6>(x);
+            break;
+        case  3:
+            t = sqrt(x);                        // x^(3/6)
+            if (a != 3) t *= pow_n<a/6>(x);
+            break;
+        case  4:
+            t = square_cbrt(x);                 // x^(4/6)
+            if (a != 4) t *= pow_n<a/6>(x);
+            break;
+        case  5:
+            t = cbrt(x);                        // x^(1/3)
+            t = t * t * sqrt(t);                // x^(2/3) * x^(1/6) = x^(5/6)
+            if (a != 5) t *= pow_n<a/6>(x);
+            break;
+        }
+        return t;
+    }
+};
+
+// partial specialization for b = 8: up to three nested square roots combined with an integer power of x
+template<int a>
+class Power_rational<a,8> {
+public:
+    template<class VTYPE>
+    VTYPE pow(VTYPE const & x) {
+        VTYPE t, s1, s2, s3;
+        s1 = sqrt(x);               // x^(1/2)
+        if (a & 3) s2 = sqrt(s1);   // x^(1/4)
+        if (a & 1) s3 = sqrt(s2);   // x^(1/8)
+        switch (a % 8) {            // the remainder has the sign of a
+        case -7:
+            t = s3 / pow_n<1+(-a)/8>(x);         // x^(1/8) divided by one integer power more
+            break;
+        case -6:
+            t = s2 / pow_n<1+(-a)/8>(x);         // x^(2/8) divided by one integer power more
+            break;
+        case -5:
+            t = s3 * (s2 / pow_n<1+(-a)/8>(x));  // x^(3/8) divided by one integer power more
+            break;
+        case -4:
+            t = s1 / pow_n<1+(-a)/8>(x);         // x^(4/8) divided by one integer power more
+            break;
+        case -3:
+            t = s3 * (s1 / pow_n<1+(-a)/8>(x));  // x^(5/8) divided by one integer power more
+            break;
+        case -2:
+            if (a != -2) s2 *= pow_n<(-a)/8>(x);
+            t = VTYPE(1.f) / s2;                 // reciprocal of x^(2/8 + integer part)
+            break;
+        case -1:
+            if (a != -1) s3 *= pow_n<(-a)/8>(x);
+            t = VTYPE(1.f) / s3;                 // reciprocal of x^(1/8 + integer part)
+            break;
+        case  0: default:
+            t = pow_n<a/8>(x);                   // a divisible by 8: pure integer power
+            break;
+        case  1:
+            t = s3;                              // x^(1/8)
+            if (a != 1) t *= pow_n<a/8>(x);
+            break;
+        case  2:
+            t = s2;                              // x^(2/8)
+            if (a != 2) t *= pow_n<a/8>(x);
+            break;
+        case  3:
+            t = s2 * s3;                         // x^(3/8)
+            if (a != 3) t *= pow_n<a/8>(x);
+            break;
+        case  4:
+            t = s1;                              // x^(4/8)
+            if (a != 4) t *= pow_n<a/8>(x);
+            break;
+        case  5:
+            t = s1 * s3;                         // x^(5/8)
+            if (a != 5) t *= pow_n<a/8>(x);
+            break;
+        case  6:
+            t = s1 * s2;                         // x^(6/8)
+            if (a != 6) t *= pow_n<a/8>(x);
+            break;
+        case  7:
+            t = s2 * s3;                         // x^(3/8)
+            if (a != 7) s1 *= pow_n<a/8>(x);     // integer part is folded into s1 here
+            t *= s1;                             // x^(7/8 + integer part)
+            break;
+
+        }
+        return t;
+    }
+};
+
+// macro to call Power_rational<a,b>().pow(x); a and b are template arguments, and a negative b is normalized by negating both
+#define pow_ratio(x, a, b) (Power_rational<(b)<0 ? -(a):(a), (b)<0 ? -(b):(b)> ().pow(x))
+
+
+/******************************************************************************
+*                 Detect NAN codes
+*
+* These functions return the code hidden in a NAN. The sign bit is ignored
+******************************************************************************/
+
+static inline Vec4i nan_code(Vec4f const & x) {
+    Vec4i  bits = reinterpret_i(x);                       // bit pattern of each float element
+    Vec4ib exp_max = (bits & 0x7F800000) == 0x7F800000;   // exponent field all ones: NAN or INF
+    return bits & 0x007FFFFF & Vec4i(exp_max);            // mantissa bits of those elements, 0 elsewhere
+}
+
+// Extracts the code stored in the mantissa of a NAN (Vec2d version). The sign bit is ignored
+static inline Vec2q nan_code(Vec2d const & x) {
+    Vec2q  bits = reinterpret_i(x);
+    Vec2q const exp_mask  = 0x7FF0000000000000;
+    Vec2q const frac_mask = 0x000FFFFFFFFFFFFF;
+    Vec2qb exp_max = (bits & exp_mask) == exp_mask;   // exponent field all ones: NAN or INF
+    return bits & frac_mask & Vec2q(exp_max);         // mantissa bits of those elements, 0 elsewhere
+}
+
+#if MAX_VECTOR_SIZE >= 256
+
+// Extracts the code stored in the mantissa of a NAN (Vec8f version). The sign bit is ignored
+static inline Vec8i nan_code(Vec8f const & x) {
+    Vec8i  bits = reinterpret_i(x);
+    Vec8ib exp_max = (bits & 0x7F800000) == 0x7F800000;   // exponent field all ones: NAN or INF
+    return bits & 0x007FFFFF & Vec8i(exp_max);            // mantissa bits of those elements, 0 elsewhere
+}
+
+// Extracts the code stored in the mantissa of a NAN (Vec4d version). The sign bit is ignored
+static inline Vec4q nan_code(Vec4d const & x) {
+    Vec4q  bits = reinterpret_i(x);
+    Vec4q const exp_mask  = 0x7FF0000000000000;
+    Vec4q const frac_mask = 0x000FFFFFFFFFFFFF;
+    Vec4qb exp_max = (bits & exp_mask) == exp_mask;   // exponent field all ones: NAN or INF
+    return bits & frac_mask & Vec4q(exp_max);         // mantissa bits of those elements, 0 elsewhere
+}
+
+#endif // MAX_VECTOR_SIZE >= 256 
+#if MAX_VECTOR_SIZE >= 512
+
+// Extracts the code stored in the mantissa of a NAN (Vec16f version). The sign bit is ignored
+static inline Vec16i nan_code(Vec16f const & x) {
+    Vec16i  bits = Vec16i(reinterpret_i(x));
+    Vec16ib exp_max = (bits & 0x7F800000) == 0x7F800000;  // exponent field all ones: NAN or INF
+    return bits & 0x007FFFFF & Vec16i(exp_max);           // mantissa bits of those elements, 0 elsewhere
+}
+
+// Extracts the code stored in the mantissa of a NAN (Vec8d version). The sign bit is ignored
+static inline Vec8q nan_code(Vec8d const & x) {
+    Vec8q  bits = Vec8q(reinterpret_i(x));
+    Vec8q const exp_mask  = 0x7FF0000000000000;
+    Vec8q const frac_mask = 0x000FFFFFFFFFFFFF;
+    Vec8qb exp_max = (bits & exp_mask) == exp_mask;   // exponent field all ones: NAN or INF
+    return bits & frac_mask & Vec8q(exp_max);         // mantissa bits of those elements, 0 elsewhere
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+#endif  // VECTORMATH_EXP_H
diff --git a/vectorclass/vectormath_hyp.h b/vectorclass/vectormath_hyp.h
new file mode 100755
index 0000000..27ca7a9
--- /dev/null
+++ b/vectorclass/vectormath_hyp.h
@@ -0,0 +1,736 @@
+/****************************  vectormath_hyp.h   ******************************
+* Author:        Agner Fog
+* Date created:  2014-07-09
+* Last modified: 2014-10-16
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file containing inline vector functions of hyperbolic and inverse 
+* hyperbolic functions:
+* sinh        hyperbolic sine
+* cosh        hyperbolic cosine
+* tanh        hyperbolic tangent
+* asinh       inverse hyperbolic sine
+* acosh       inverse hyperbolic cosine
+* atanh       inverse hyperbolic tangent
+*
+* Theory, methods and inspiration based partially on these sources:
+* > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
+*   Ellis Horwood, 1989.
+* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and
+*   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
+* > Cephes math library by Stephen L. Moshier 1992,
+*   http://www.netlib.org/cephes/
+*
+* For detailed instructions, see vectormath_common.h and VectorClass.pdf
+*
+* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+******************************************************************************/
+
+#ifndef VECTORMATH_HYP_H
+#define VECTORMATH_HYP_H  1 
+
+#include "vectormath_exp.h"  
+
+
+/******************************************************************************
+*                 Hyperbolic functions
+******************************************************************************/
+
+// Template for sinh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: double vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE sinh_d(VTYPE const & x0) {    
+// The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
+
+    // Coefficients of the rational approximation: numerator p0..p3, denominator q0..q3
+    const double p0 = -3.51754964808151394800E5;
+    const double p1 = -1.15614435765005216044E4;
+    const double p2 = -1.63725857525983828727E2;
+    const double p3 = -7.89474443963537015605E-1; 
+
+    const double q0 = -2.11052978884890840399E6;
+    const double q1 =  3.61578279834431989373E4;
+    const double q2 = -2.77711081420602794433E2;
+    const double q3 =  1.0; 
+
+    // data vectors
+    VTYPE x, x2, y1, y2;
+    BTYPE x_small;                               // boolean vector
+
+    x = abs(x0);                                 // compute on abs(x); the sign is put back at the end
+    x_small = x <= 1.0;                          // use Pade approximation if abs(x) <= 1
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        x2 = x*x;
+        y1 = polynomial_3(x2, p0, p1, p2, p3) / polynomial_3(x2, q0, q1, q2, q3);
+        y1 = mul_add(y1, x*x2, x);               // y1 = x + x2*(x*y1);
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        y2 =  exp_d<VTYPE, BTYPE, 0, 1>(x);      //   0.5 * exp(x)
+        y2 -= 0.25 / y2;                         // - 0.5 * exp(-x)
+    }
+    y1 = select(x_small, y1, y2);                // choose method
+    y1 = sign_combine(y1, x0);                   // get original sign
+
+    return y1;
+}
+
+// instances of sinh_d template: sinh overloads for the double vector types
+static inline Vec2d sinh(Vec2d const & x) {
+    return sinh_d<Vec2d, Vec2db>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d sinh(Vec4d const & x) {
+    return sinh_d<Vec4d, Vec4db>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d sinh(Vec8d const & x) {
+    return sinh_d<Vec8d, Vec8db>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for sinh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: float vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE sinh_f(VTYPE const & x0) {    
+// The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
+
+    // Coefficients of the polynomial in x^2 used for small abs(x)
+    const float r0 = 1.66667160211E-1f;
+    const float r1 = 8.33028376239E-3f;
+    const float r2 = 2.03721912945E-4f;
+
+    // data vectors
+    VTYPE x, x2, y1, y2;
+    BTYPE x_small;                               // boolean vector
+
+    x = abs(x0);                                 // compute on abs(x); the sign is put back at the end
+    x_small = x <= 1.0f;                         // use polynomial approximation if abs(x) <= 1
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        x2 = x*x;
+        y1 = polynomial_2(x2, r0, r1, r2);
+        y1 = mul_add(y1, x2*x, x);               // y1 = x + x2*(x*y1);
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        y2 =  exp_f<VTYPE, BTYPE, 0, 1>(x);      //   0.5 * exp(x)
+        y2 -= 0.25f / y2;                        // - 0.5 * exp(-x)
+    }
+    y1 = select(x_small, y1, y2);                // choose method
+    y1 = sign_combine(y1, x0);                   // get original sign
+
+    return y1;
+}
+
+// instances of sinh_f template: sinh overloads for the float vector types
+static inline Vec4f sinh(Vec4f const & x) {
+    return sinh_f<Vec4f, Vec4fb>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f sinh(Vec8f const & x) {
+    return sinh_f<Vec8f, Vec8fb>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f sinh(Vec16f const & x) {
+    return sinh_f<Vec16f, Vec16fb>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for cosh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: double vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE cosh_d(VTYPE const & x0) {    
+// The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
+
+    // data vectors
+    VTYPE x, y;
+
+    x  = abs(x0);                                // the result depends on abs(x) only; no sign is restored
+    y  = exp_d<VTYPE, BTYPE, 0, 1>(x);           //   0.5 * exp(x)
+    y += 0.25 / y;                               // + 0.5 * exp(-x)
+    return y;
+}
+
+// instances of cosh_d template: cosh overloads for the double vector types
+static inline Vec2d cosh(Vec2d const & x) {
+    return cosh_d<Vec2d, Vec2db>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d cosh(Vec4d const & x) {
+    return cosh_d<Vec4d, Vec4db>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d cosh(Vec8d const & x) {
+    return cosh_d<Vec8d, Vec8db>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for cosh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: float vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE cosh_f(VTYPE const & x0) {    
+// The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
+
+    // data vectors
+    VTYPE x, y;
+
+    x  = abs(x0);                                // the result depends on abs(x) only; no sign is restored
+    y  = exp_f<VTYPE, BTYPE, 0, 1>(x);           //   0.5 * exp(x)
+    y += 0.25f / y;                              // + 0.5 * exp(-x)
+    return y;
+}
+
+// instances of cosh_f template: cosh overloads for the float vector types
+static inline Vec4f cosh(Vec4f const & x) {
+    return cosh_f<Vec4f, Vec4fb>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f cosh(Vec8f const & x) {
+    return cosh_f<Vec8f, Vec8fb>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f cosh(Vec16f const & x) {
+    return cosh_f<Vec16f, Vec16fb>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for tanh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: double vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE tanh_d(VTYPE const & x0) {    
+
+    // Coefficients of the rational approximation: numerator p0..p2, denominator q0..q3
+    const double p0 = -1.61468768441708447952E3;
+    const double p1 = -9.92877231001918586564E1;
+    const double p2 = -9.64399179425052238628E-1;
+    
+    const double q0 =  4.84406305325125486048E3;
+    const double q1 =  2.23548839060100448583E3;
+    const double q2 =  1.12811678491632931402E2;
+    const double q3 =  1.0; 
+
+    // data vectors
+    VTYPE x, x2, y1, y2;
+    BTYPE x_small, x_big;                        // boolean vectors
+
+    x = abs(x0);                                 // compute on abs(x); the sign is put back at the end
+    x_small = x <= 0.625;                        // use Pade approximation if abs(x) <= 5/8
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        x2 = x*x;
+        y1 = polynomial_2(x2, p0, p1, p2) / polynomial_3(x2, q0, q1, q2, q3);
+        y1 = mul_add(y1, x2*x, x);               // y1 = x + x2*(x*y1);
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        y2 = exp(x+x);                           // exp(2*x)
+        y2 = 1.0 - 2.0 / (y2 + 1.0);             // tanh(x)
+    }
+    x_big = x > 350.;                            // beyond this the result is replaced by 1.0 below
+    y1 = select(x_small, y1, y2);                // choose method
+    y1 = select(x_big,  1.0, y1);                // avoid overflow
+    y1 = sign_combine(y1, x0);                   // get original sign
+
+    return y1;
+}
+
+// instances of tanh_d template: tanh overloads for the double vector types
+static inline Vec2d tanh(Vec2d const & x) {
+    return tanh_d<Vec2d, Vec2db>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d tanh(Vec4d const & x) {
+    return tanh_d<Vec4d, Vec4db>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d tanh(Vec8d const & x) {
+    return tanh_d<Vec8d, Vec8db>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for tanh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: float vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE tanh_f(VTYPE const & x0) {    
+// NOTE(review): the original "limit of abs(x) is 89.0" remark looks copied from sinh_f; here abs(x) > 44.4 just yields +/-1 - confirm.
+
+    // Coefficients of the polynomial in x^2 used for small abs(x)
+    const float r0 = -3.33332819422E-1f;
+    const float r1 =  1.33314422036E-1f;
+    const float r2 = -5.37397155531E-2f;
+    const float r3 =  2.06390887954E-2f;
+    const float r4 = -5.70498872745E-3f;
+
+    // data vectors
+    VTYPE x, x2, y1, y2;
+    BTYPE x_small, x_big;                        // boolean vectors
+
+    x = abs(x0);                                 // compute on abs(x); the sign is put back at the end
+    x_small = x <= 0.625f;                       // use polynomial approximation if abs(x) <= 5/8
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        x2 = x*x;
+        y1 = polynomial_4(x2, r0, r1, r2, r3, r4);
+        y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        y2 = exp(x+x);                           // exp(2*x)
+        y2 = 1.0f - 2.0f / (y2 + 1.0f);          // tanh(x)
+    }
+    x_big = x > 44.4f;                           // beyond this the result is replaced by 1.0f below
+    y1 = select(x_small, y1, y2);                // choose method
+    y1 = select(x_big,  1.0f, y1);               // avoid overflow
+    y1 = sign_combine(y1, x0);                   // get original sign
+
+    return y1;
+}
+
+// instances of tanh_f template: tanh overloads for the float vector types
+static inline Vec4f tanh(Vec4f const & x) {
+    return tanh_f<Vec4f, Vec4fb>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f tanh(Vec8f const & x) {
+    return tanh_f<Vec8f, Vec8fb>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f tanh(Vec16f const & x) {
+    return tanh_f<Vec16f, Vec16fb>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+
+/******************************************************************************
+*                 Inverse hyperbolic functions
+******************************************************************************/
+
+// Template for asinh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: double vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE asinh_d(VTYPE const & x0) {    
+
+    // Coefficients of the rational approximation: numerator p0..p4, denominator q0..q4
+    const double p0 = -5.56682227230859640450E0;
+    const double p1 = -9.09030533308377316566E0;
+    const double p2 = -4.37390226194356683570E0;
+    const double p3 = -5.91750212056387121207E-1; 
+    const double p4 = -4.33231683752342103572E-3;
+
+    const double q0 =  3.34009336338516356383E1;
+    const double q1 =  6.95722521337257608734E1;
+    const double q2 =  4.86042483805291788324E1;
+    const double q3 =  1.28757002067426453537E1;
+    const double q4 =  1.0;
+
+    // data vectors
+    VTYPE x, x2, y1, y2;
+    BTYPE x_small, x_huge;                       // boolean vectors
+
+    x2 = x0 * x0;
+    x  = abs(x0);                                // compute on abs(x); the sign is put back at the end
+    x_small = x <= 0.533;                        // use Pade approximation if abs(x) <= 0.533 (nominally 0.5)
+                                                 // both methods give the highest error close to 0.5. this limit is adjusted for minimum error
+    x_huge  = x > 1.E20;                         // simple approximation, avoid overflow
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_4(x2, q0, q1, q2, q3, q4);
+        y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        y2 = log(x + sqrt(x2 + 1.0));
+        if (horizontal_or(x_huge)) {
+            // At least one element needs huge method to avoid overflow
+            y2 = select(x_huge, log(x) + VM_LN2, y2);   // log(x) + ln(2) = log(2*x), without forming x*x
+        }
+    }
+    y1 = select(x_small, y1, y2);                // choose method
+    y1 = sign_combine(y1, x0);                   // get original sign
+
+    return y1;
+}
+
+// instances of asinh_d template: asinh overloads for the double vector types
+static inline Vec2d asinh(Vec2d const & x) {
+    return asinh_d<Vec2d, Vec2db>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d asinh(Vec4d const & x) {
+    return asinh_d<Vec4d, Vec4db>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d asinh(Vec8d const & x) {
+    return asinh_d<Vec8d, Vec8db>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for asinh function, single precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: float vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE asinh_f(VTYPE const & x0) {    
+
+    // Coefficients of the polynomial in x^2 used for small abs(x)
+    const float r0 = -1.6666288134E-1f;
+    const float r1 =  7.4847586088E-2f;
+    const float r2 = -4.2699340972E-2f;
+    const float r3 =  2.0122003309E-2f;
+
+    // data vectors
+    VTYPE x, x2, y1, y2;
+    BTYPE x_small, x_huge;                       // boolean vectors
+
+    x2 = x0 * x0;
+    x  = abs(x0);                                // compute on abs(x); the sign is put back at the end
+    x_small = x <= 0.51f;                        // use polynomial approximation if abs(x) <= 0.51 (nominally 0.5)
+    x_huge  = x > 1.E10f;                        // simple approximation, avoid overflow
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        y1 = polynomial_3(x2, r0, r1, r2, r3);
+        y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        y2 = log(x + sqrt(x2 + 1.0f));
+        if (horizontal_or(x_huge)) {
+            // At least one element needs huge method to avoid overflow
+            y2 = select(x_huge, log(x) + (float)VM_LN2, y2);   // log(x) + ln(2) = log(2*x), without forming x*x
+        }
+    }
+    y1 = select(x_small, y1, y2);                // choose method
+    y1 = sign_combine(y1, x0);                   // get original sign
+
+    return y1;
+}
+
+// instances of asinh_f template: asinh overloads for the float vector types
+static inline Vec4f asinh(Vec4f const & x) {
+    return asinh_f<Vec4f, Vec4fb>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f asinh(Vec8f const & x) {
+    return asinh_f<Vec8f, Vec8fb>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f asinh(Vec16f const & x) {
+    return asinh_f<Vec16f, Vec16fb>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for acosh function, double precision
+// This function does not produce denormals
+// Template parameters:
+// VTYPE: double vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE acosh_d(VTYPE const & x0) {    
+
+    // Coefficients of the rational approximation in (x - 1): numerator p0..p4, denominator q0..q5
+    const double p0 = 1.10855947270161294369E5;
+    const double p1 = 1.08102874834699867335E5;
+    const double p2 = 3.43989375926195455866E4;
+    const double p3 = 3.94726656571334401102E3; 
+    const double p4 = 1.18801130533544501356E2;
+
+    const double q0 = 7.83869920495893927727E4;
+    const double q1 = 8.29725251988426222434E4;
+    const double q2 = 2.97683430363289370382E4;
+    const double q3 = 4.15352677227719831579E3;
+    const double q4 = 1.86145380837903397292E2;
+    const double q5 = 1.0;
+
+    // data vectors
+    VTYPE x1, y1, y2;
+    BTYPE x_small, x_huge, undef;                // boolean vectors
+
+    x1      = x0 - 1.0;
+    undef   = x0 < 1.0;                          // result is NAN
+    x_small = x1 < 0.49;                         // use Pade approximation if x-1 < 0.49 (this includes the undefined range x < 1)
+    x_huge  = x1 > 1.E20;                        // simple approximation, avoid overflow
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        y1 = sqrt(x1) * (polynomial_4(x1, p0, p1, p2, p3, p4) / polynomial_5(x1, q0, q1, q2, q3, q4, q5));
+        // x < 1 generates NAN
+        y1 = select(undef, nan_vec<VTYPE>(NAN_HYP), y1);
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0)));
+        if (horizontal_or(x_huge)) {
+            // At least one element needs huge method to avoid overflow
+            y2 = select(x_huge, log(x0) + VM_LN2, y2);  // log(x) + ln(2) = log(2*x), without forming x*x
+        }
+    }
+    y1 = select(x_small, y1, y2);                // choose method
+    return y1;
+}
+
+// acosh overloads for double vectors; each one forwards to acosh_d with the
+// boolean vector type that matches the data vector
+static inline Vec2d acosh(Vec2d const & x) {
+    Vec2d result = acosh_d<Vec2d, Vec2db>(x);
+    return result;
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d acosh(Vec4d const & x) {
+    Vec4d result = acosh_d<Vec4d, Vec4db>(x);
+    return result;
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d acosh(Vec8d const & x) {
+    Vec8d result = acosh_d<Vec8d, Vec8db>(x);
+    return result;
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for acosh function, single precision
+// This function does not produce denormals
+// Elements with x0 < 1 return nan_vec<VTYPE>(NAN_HYP)
+// Template parameters:
+// VTYPE: float vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE acosh_f(VTYPE const & x0) {    
+
+    // Coefficients of the polynomial used for small x0-1
+    const float r0 =  1.4142135263E0f;
+    const float r1 = -1.1784741703E-1f;
+    const float r2 =  2.6454905019E-2f;
+    const float r3 = -7.5272886713E-3f;
+    const float r4 =  1.7596881071E-3f;
+
+    // data vectors
+    VTYPE x1, y1, y2;
+    BTYPE x_small, x_huge, undef;                // boolean vectors
+
+    x1      = x0 - 1.0f;
+    undef   = x0 < 1.0f;                         // result is NAN
+    x_small = x1 < 0.49f;                        // use polynomial approximation if x-1 < 0.49 (this includes all x < 1)
+    x_huge  = x1 > 1.E10f;                       // simple approximation, avoid overflow
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        y1 = sqrt(x1) * polynomial_4(x1, r0, r1, r2, r3, r4);
+        // x < 1 generates NAN
+        y1 = select(undef, nan_vec<VTYPE>(NAN_HYP), y1);
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        // float literal, like every other constant in this single precision template
+        y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0f)));
+        if (horizontal_or(x_huge)) {
+            // At least one element needs huge method to avoid overflow
+            y2 = select(x_huge, log(x0) + (float)VM_LN2, y2);
+        }
+    }
+    // whichever of y1 / y2 was skipped above is not selected in any element,
+    // because x_small is then uniformly false / true
+    y1 = select(x_small, y1, y2);                // choose method
+    return y1;
+}
+
+// acosh overloads for float vectors; each one forwards to acosh_f with the
+// boolean vector type that matches the data vector
+static inline Vec4f acosh(Vec4f const & x) {
+    Vec4f result = acosh_f<Vec4f, Vec4fb>(x);
+    return result;
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f acosh(Vec8f const & x) {
+    Vec8f result = acosh_f<Vec8f, Vec8fb>(x);
+    return result;
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f acosh(Vec16f const & x) {
+    Vec16f result = acosh_f<Vec16f, Vec16fb>(x);
+    return result;
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for atanh function, double precision
+// This function does not produce denormals
+// abs(x0) == 1 gives infinity and abs(x0) > 1 gives nan_vec<VTYPE>(NAN_HYP);
+// the sign of x0 is applied to the result in every case
+// Template parameters:
+// VTYPE: double vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE atanh_d(VTYPE const & x0) {    
+
+    // Coefficients
+    // p0..p4: numerator of the rational approximation used for abs(x) < 0.5
+    const double p0 = -3.09092539379866942570E1;
+    const double p1 =  6.54566728676544377376E1;
+    const double p2 = -4.61252884198732692637E1;
+    const double p3 =  1.20426861384072379242E1;
+    const double p4 = -8.54074331929669305196E-1;
+
+    // q0..q5: denominator of the rational approximation
+    const double q0 = -9.27277618139601130017E1;
+    const double q1 =  2.52006675691344555838E2;
+    const double q2 = -2.49839401325893582852E2;
+    const double q3 =  1.08938092147140262656E2;
+    const double q4 = -1.95638849376911654834E1;
+    const double q5 =  1.0;
+
+    // data vectors
+    VTYPE x, x2, y1, y2, y3;
+    BTYPE x_small;                               // boolean vector
+
+    x  = abs(x0);
+    x_small = x < 0.5;                           // use Pade approximation if abs(x) < 0.5
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        x2 = x * x;
+        y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_5(x2, q0, q1, q2, q3, q4, q5);
+        y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        y2 = log((1.0+x)/(1.0-x)) * 0.5;
+        // check if out of range
+        // y3 holds the replacement value: infinity at abs(x) == 1, NAN beyond
+        y3 = select(x == 1.0, infinite_vec<VTYPE>(), nan_vec<VTYPE>(NAN_HYP));
+        y2 = select(x >= 1.0, y3, y2);
+    }
+    // whichever of y1 / y2 was skipped above is not selected in any element,
+    // because x_small is then uniformly false / true
+    y1 = select(x_small, y1, y2);                // choose method
+    y1 = sign_combine(y1, x0);                   // get original sign
+
+    return y1;
+}
+
+// atanh overloads for double vectors; each one forwards to atanh_d with the
+// boolean vector type that matches the data vector
+static inline Vec2d atanh(Vec2d const & x) {
+    Vec2d result = atanh_d<Vec2d, Vec2db>(x);
+    return result;
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d atanh(Vec4d const & x) {
+    Vec4d result = atanh_d<Vec4d, Vec4db>(x);
+    return result;
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d atanh(Vec8d const & x) {
+    Vec8d result = atanh_d<Vec8d, Vec8db>(x);
+    return result;
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// Template for atanh function, single precision
+// This function does not produce denormals
+// abs(x0) == 1 gives infinity and abs(x0) > 1 gives nan_vec<VTYPE>(NAN_HYP);
+// the sign of x0 is applied to the result in every case
+// Template parameters:
+// VTYPE: float vector type
+// BTYPE: boolean vector type 
+template<class VTYPE, class BTYPE> 
+static inline VTYPE atanh_f(VTYPE const & x0) {    
+
+    // Coefficients of the polynomial used for abs(x) < 0.5
+    const float r0 = 3.33337300303E-1f;
+    const float r1 = 1.99782164500E-1f;
+    const float r2 = 1.46691431730E-1f;
+    const float r3 = 8.24370301058E-2f;
+    const float r4 = 1.81740078349E-1f;
+
+    // data vectors
+    VTYPE x, x2, y1, y2, y3;
+    BTYPE x_small;                               // boolean vector
+
+    x  = abs(x0);
+    x_small = x < 0.5f;                          // use polynomial approximation if abs(x) < 0.5
+
+    if (horizontal_or(x_small)) {
+        // At least one element needs small method
+        x2 = x * x;
+        y1 = polynomial_4(x2, r0, r1, r2, r3, r4);
+        y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1
+    }
+    if (!horizontal_and(x_small)) {
+        // At least one element needs big method
+        y2 = log((1.0f+x)/(1.0f-x)) * 0.5f;
+        // check if out of range
+        // y3 holds the replacement value: infinity at abs(x) == 1, NAN beyond
+        y3 = select(x == 1.0f, infinite_vec<VTYPE>(), nan_vec<VTYPE>(NAN_HYP));
+        y2 = select(x >= 1.0f, y3, y2);
+    }
+    // whichever of y1 / y2 was skipped above is not selected in any element,
+    // because x_small is then uniformly false / true
+    y1 = select(x_small, y1, y2);                // choose method
+    y1 = sign_combine(y1, x0);                   // get original sign
+
+    return y1;
+}
+
+// atanh overloads for float vectors; each one forwards to atanh_f with the
+// boolean vector type that matches the data vector
+static inline Vec4f atanh(Vec4f const & x) {
+    Vec4f result = atanh_f<Vec4f, Vec4fb>(x);
+    return result;
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f atanh(Vec8f const & x) {
+    Vec8f result = atanh_f<Vec8f, Vec8fb>(x);
+    return result;
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f atanh(Vec16f const & x) {
+    Vec16f result = atanh_f<Vec16f, Vec16fb>(x);
+    return result;
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+#endif
diff --git a/vectorclass/vectormath_lib.h b/vectorclass/vectormath_lib.h
new file mode 100755
index 0000000..edea799
--- /dev/null
+++ b/vectorclass/vectormath_lib.h
@@ -0,0 +1,2107 @@
+/****************************  vectormath_lib.h   *****************************
+| Author:        Agner Fog
+| Date created:  2012-05-30
+| Last modified: 2014-04-23
+| Version:       1.16
+| Project:       vector classes
+| Description:
+| Header file defining mathematical functions on floating point vectors
+| May use Intel SVML library or AMD LIBM library
+|
+| Instructions:
+| Define VECTORMATH to one of the following values:
+|   0:  Use ordinary math library (slow)
+|   1:  Use AMD LIBM library
+|   2:  Use Intel SVML library with any compiler
+|   3:  Use Intel SVML library with Intel compiler
+|
+| For detailed instructions, see VectorClass.pdf
+|
+| (c) Copyright 2012-2014 GNU General Public License http://www.gnu.org/licenses
+\*****************************************************************************/
+
+// check combination of header files
+#ifndef VECTORMATH_LIB_H
+#define VECTORMATH_LIB_H
+
+#include "vectorf128.h"
+
+#ifndef VECTORMATH
+#ifdef __INTEL_COMPILER
+#define VECTORMATH 3
+#else
+#define VECTORMATH 0
+#endif // __INTEL_COMPILER
+#endif // VECTORMATH
+
+/*****************************************************************************
+*
+*      VECTORMATH = 0. Use ordinary library (scalar)
+*
+*****************************************************************************/
+#if VECTORMATH == 0
+#include <math.h>
+
+#ifndef VECTORMATH_COMMON_H
+// exponential and power functions
+// exponential function of each float element, computed with scalar expf
+static inline Vec4f exp (Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = expf(lane[0]);
+    float const r1 = expf(lane[1]);
+    float const r2 = expf(lane[2]);
+    float const r3 = expf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+// exponential function of each double element, computed with scalar exp
+static inline Vec2d exp (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = exp(lane[0]);
+    double const r1 = exp(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// There is no certain way to know which functions are available, but at least some (Gnu)
+// compilers have defines to specify this
+#ifdef HAVE_EXPM1
+// expm1 of each float element, computed one element at a time
+static inline Vec4f expm1 (Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = expm1(lane[0]);
+    float const r1 = expm1(lane[1]);
+    float const r2 = expm1(lane[2]);
+    float const r3 = expm1(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+// expm1 of each double element, computed one element at a time
+static inline Vec2d expm1 (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = expm1(lane[0]);
+    double const r1 = expm1(lane[1]);
+    return Vec2d(r0, r1);
+}
+#endif
+
+#ifdef HAVE_EXP2
+// pow(2,x) of each float element through the scalar exp2
+static inline Vec4f exp2 (Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = exp2(lane[0]);
+    float const r1 = exp2(lane[1]);
+    float const r2 = exp2(lane[2]);
+    float const r3 = exp2(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+// pow(2,x) of each double element through the scalar exp2
+static inline Vec2d exp2 (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = exp2(lane[0]);
+    double const r1 = exp2(lane[1]);
+    return Vec2d(r0, r1);
+}
+#else
+// No scalar exp2 announced: pow(2,x) is computed as exp(x * log(2))
+static inline Vec4f exp2 (Vec4f const & x) {
+    Vec4f const ln2(0.693147180559945309417f);   // log(2)
+    return exp(x * ln2);
+}
+static inline Vec2d exp2 (Vec2d const & x) {
+    Vec2d const ln2(0.693147180559945309417);    // log(2)
+    return exp(x * ln2);
+}
+#endif
+
+// pow(10,x), computed as exp(x * log(10))
+static inline Vec4f exp10 (Vec4f const & x) {
+    Vec4f const ln10(2.30258509299404568402f);   // log(10)
+    return exp(x * ln10);
+}
+static inline Vec2d exp10 (Vec2d const & x) {
+    Vec2d const ln10(2.30258509299404568402);    // log(10)
+    return exp(x * ln10);
+}
+
+// a to the power of b for each pair of float elements, computed with scalar powf
+static inline Vec4f pow (Vec4f const & a, Vec4f const & b) {
+    float base[4], expo[4];
+    a.store(base);
+    b.store(expo);
+    float const r0 = powf(base[0], expo[0]);
+    float const r1 = powf(base[1], expo[1]);
+    float const r2 = powf(base[2], expo[2]);
+    float const r3 = powf(base[3], expo[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+// a to the power of b for each pair of double elements, computed with scalar pow
+static inline Vec2d pow (Vec2d const & a, Vec2d const & b) {
+    double base[4], expo[4];
+    a.store(base);
+    b.store(expo);
+    double const r0 = pow(base[0], expo[0]);
+    double const r1 = pow(base[1], expo[1]);
+    return Vec2d(r0, r1);
+}
+
+// natural logarithm of each float element. Uses the single precision logf,
+// like the other float fallbacks in this section (expf, powf, log10f, sinf)
+static inline Vec4f log (Vec4f const & x) {
+    float xx[4];
+    x.store(xx);
+    return Vec4f(logf(xx[0]), logf(xx[1]), logf(xx[2]), logf(xx[3]));
+}
+// natural logarithm of each double element
+static inline Vec2d log (Vec2d const & x) {
+    double xx[2];                                // two elements, same buffer size as cexp(Vec2d) uses
+    x.store(xx);
+    return Vec2d(log(xx[0]), log(xx[1]));
+}
+
+#ifdef HAVE_LOG1P
+// log1p of each float element, computed one element at a time
+static inline Vec4f log1p (Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = log1p(lane[0]);
+    float const r1 = log1p(lane[1]);
+    float const r2 = log1p(lane[2]);
+    float const r3 = log1p(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+// log1p of each double element, computed one element at a time
+static inline Vec2d log1p (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = log1p(lane[0]);
+    double const r1 = log1p(lane[1]);
+    return Vec2d(r0, r1);
+}
+#endif
+
+// logarithm base 2, computed as log(x) * log2(e)
+static inline Vec4f log2 (Vec4f const & x) {
+    Vec4f const log2e(1.44269504088896340736f);  // log2(e)
+    return log(x) * log2e;
+}
+static inline Vec2d log2 (Vec2d const & x) {
+    Vec2d const log2e(1.44269504088896340736);   // log2(e)
+    return log(x) * log2e;
+}
+
+// logarithm base 10 of each float element, computed with scalar log10f
+static inline Vec4f log10 (Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = log10f(lane[0]);
+    float const r1 = log10f(lane[1]);
+    float const r2 = log10f(lane[2]);
+    float const r3 = log10f(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+// logarithm base 10 of each double element, computed with scalar log10
+static inline Vec2d log10 (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = log10(lane[0]);
+    double const r1 = log10(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// trigonometric functions, one scalar library call per element
+
+// sine
+static inline Vec4f sin(Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = sinf(lane[0]);
+    float const r1 = sinf(lane[1]);
+    float const r2 = sinf(lane[2]);
+    float const r3 = sinf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d sin (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = sin(lane[0]);
+    double const r1 = sin(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// cosine
+static inline Vec4f cos(Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = cosf(lane[0]);
+    float const r1 = cosf(lane[1]);
+    float const r2 = cosf(lane[2]);
+    float const r3 = cosf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d cos (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = cos(lane[0]);
+    double const r1 = cos(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// sine and cosine together: cos(x) is written to *pcos first, then sin(x) is returned
+static inline Vec4f sincos (Vec4f * pcos, Vec4f const & x) {
+    Vec4f const cosine = cos(x);
+    *pcos = cosine;
+    return sin(x);
+}
+static inline Vec2d sincos (Vec2d * pcos, Vec2d const & x) {
+    Vec2d const cosine = cos(x);
+    *pcos = cosine;
+    return sin(x);
+}
+
+// tangent
+static inline Vec4f tan(Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = tanf(lane[0]);
+    float const r1 = tanf(lane[1]);
+    float const r2 = tanf(lane[2]);
+    float const r3 = tanf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d tan (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = tan(lane[0]);
+    double const r1 = tan(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// inverse trigonometric functions, one scalar library call per element
+
+// inverse sine
+static inline Vec4f asin(Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = asinf(lane[0]);
+    float const r1 = asinf(lane[1]);
+    float const r2 = asinf(lane[2]);
+    float const r3 = asinf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d asin (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = asin(lane[0]);
+    double const r1 = asin(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// inverse cosine
+static inline Vec4f acos(Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = acosf(lane[0]);
+    float const r1 = acosf(lane[1]);
+    float const r2 = acosf(lane[2]);
+    float const r3 = acosf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d acos (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = acos(lane[0]);
+    double const r1 = acos(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// inverse tangent
+static inline Vec4f atan(Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = atanf(lane[0]);
+    float const r1 = atanf(lane[1]);
+    float const r2 = atanf(lane[2]);
+    float const r3 = atanf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d atan (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = atan(lane[0]);
+    double const r1 = atan(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// inverse tangent of a/b
+static inline Vec4f atan2 (Vec4f const & a, Vec4f const & b) {
+    float num[4], den[4];
+    a.store(num);
+    b.store(den);
+    float const r0 = atan2f(num[0], den[0]);
+    float const r1 = atan2f(num[1], den[1]);
+    float const r2 = atan2f(num[2], den[2]);
+    float const r3 = atan2f(num[3], den[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d atan2 (Vec2d const & a, Vec2d const & b) {
+    double num[4], den[4];
+    a.store(num);
+    b.store(den);
+    double const r0 = atan2(num[0], den[0]);
+    double const r1 = atan2(num[1], den[1]);
+    return Vec2d(r0, r1);
+}
+#endif // VECTORMATH_COMMON_H
+
+// hyperbolic functions, one scalar library call per element
+
+// hyperbolic sine
+static inline Vec4f sinh(Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = sinhf(lane[0]);
+    float const r1 = sinhf(lane[1]);
+    float const r2 = sinhf(lane[2]);
+    float const r3 = sinhf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d sinh (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = sinh(lane[0]);
+    double const r1 = sinh(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// hyperbolic cosine
+static inline Vec4f cosh(Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = coshf(lane[0]);
+    float const r1 = coshf(lane[1]);
+    float const r2 = coshf(lane[2]);
+    float const r3 = coshf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d cosh (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = cosh(lane[0]);
+    double const r1 = cosh(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// hyperbolic tangent
+static inline Vec4f tanh(Vec4f const & x) {
+    float lane[4];
+    x.store(lane);
+    float const r0 = tanhf(lane[0]);
+    float const r1 = tanhf(lane[1]);
+    float const r2 = tanhf(lane[2]);
+    float const r3 = tanhf(lane[3]);
+    return Vec4f(r0, r1, r2, r3);
+}
+static inline Vec2d tanh (Vec2d const & x) {
+    double lane[4];
+    x.store(lane);
+    double const r0 = tanh(lane[0]);
+    double const r1 = tanh(lane[1]);
+    return Vec2d(r0, r1);
+}
+
+// error function
+#ifdef HAVE_ERF
+static inline Vec4f erf(Vec4f const & x) {
+    float xx[4];
+    x.store(xx);
+    return Vec4f(erf(xx[0]), erf(xx[1]), erf(xx[2]), erf(xx[3]));
+}
+static inline Vec2d erf (Vec2d const & x) {
+    double xx[4];
+    x.store(xx);
+    return Vec2d(erf(xx[0]), erf(xx[1]));
+}
+#endif
+
+#ifdef HAVE_ERFC
+static inline Vec4f erfc(Vec4f const & x) {
+    float xx[4];
+    x.store(xx);
+    return Vec4f(erfc(xx[0]), erfc(xx[1]), erfc(xx[2]), erfc(xx[3]));
+}
+static inline Vec2d erfc (Vec2d const & x) {
+    double xx[4];
+    x.store(xx);
+    return Vec2d(erfc(xx[0]), erfc(xx[1]));
+}
+#endif
+
+// complex exponential function for two complex numbers packed as (re0, im0, re1, im1):
+// real part in even numbered elements, imaginary part in odd numbered elements
+static inline Vec4f cexp (Vec4f const & x) {
+    float c[4];
+    x.store(c);
+    // unit vectors (cos(im), sin(im)) for both numbers
+    Vec4f const phase(cosf(c[1]), sinf(c[1]), cosf(c[3]), sinf(c[3]));
+    // exp(re) of each number, repeated for its real and imaginary element
+    float const mag0 = expf(c[0]);
+    float const mag1 = expf(c[2]);
+    Vec4f const scale(mag0, mag0, mag1, mag1);
+    return phase * scale;
+}
+
+// complex exponential function for one complex number stored as (re, im)
+static inline Vec2d cexp (Vec2d const & x) {
+    double c[2];
+    x.store(c);
+    Vec2d phase(cos(c[1]), sin(c[1]));           // (cos(im), sin(im))
+    double const mag = exp(c[0]);                // exp(re)
+    return phase * mag;
+}
+
+#if defined (VECTORF256_H)  // 256 bit vectors defined
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions for 256-bit vectors.
+// Each function applies the 128-bit overload to the lower and upper half.
+
+// exponential function
+static inline Vec8f exp (Vec8f const & x) {
+    Vec4f const lo = exp(x.get_low());
+    Vec4f const hi = exp(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d exp (Vec4d const & x) {
+    Vec2d const lo = exp(x.get_low());
+    Vec2d const hi = exp(x.get_high());
+    return Vec4d(lo, hi);
+}
+#ifdef HAVE_EXPM1
+// exp(x)-1
+static inline Vec8f expm1 (Vec8f const & x) {
+    Vec4f const lo = expm1(x.get_low());
+    Vec4f const hi = expm1(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d expm1 (Vec4d const & x) {
+    Vec2d const lo = expm1(x.get_low());
+    Vec2d const hi = expm1(x.get_high());
+    return Vec4d(lo, hi);
+}
+#endif
+
+// pow(2,x)
+static inline Vec8f exp2 (Vec8f const & x) {
+    Vec4f const lo = exp2(x.get_low());
+    Vec4f const hi = exp2(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d exp2 (Vec4d const & x) {
+    Vec2d const lo = exp2(x.get_low());
+    Vec2d const hi = exp2(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// pow(10,x)
+static inline Vec8f exp10 (Vec8f const & x) {
+    Vec4f const lo = exp10(x.get_low());
+    Vec4f const hi = exp10(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d exp10 (Vec4d const & x) {
+    Vec2d const lo = exp10(x.get_low());
+    Vec2d const hi = exp10(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// pow(a,b) = a to the power of b
+static inline Vec8f pow (Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = pow(a.get_low(),  b.get_low());
+    Vec4f const hi = pow(a.get_high(), b.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d pow (Vec4d const & a, Vec4d const & b) {
+    Vec2d const lo = pow(a.get_low(),  b.get_low());
+    Vec2d const hi = pow(a.get_high(), b.get_high());
+    return Vec4d(lo, hi);
+}
+
+// logarithms for 256-bit vectors.
+// Each function applies the 128-bit overload to the lower and upper half.
+
+// natural logarithm
+static inline Vec8f log (Vec8f const & x) {
+    Vec4f const lo = log(x.get_low());
+    Vec4f const hi = log(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d log (Vec4d const & x) {
+    Vec2d const lo = log(x.get_low());
+    Vec2d const hi = log(x.get_high());
+    return Vec4d(lo, hi);
+}
+#ifdef HAVE_LOG1P
+// log(1+x). Avoids loss of precision if 1+x is close to 1
+static inline Vec8f log1p (Vec8f const & x) {
+    Vec4f const lo = log1p(x.get_low());
+    Vec4f const hi = log1p(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d log1p (Vec4d const & x) {
+    Vec2d const lo = log1p(x.get_low());
+    Vec2d const hi = log1p(x.get_high());
+    return Vec4d(lo, hi);
+}
+#endif
+
+// logarithm base 2
+static inline Vec8f log2 (Vec8f const & x) {
+    Vec4f const lo = log2(x.get_low());
+    Vec4f const hi = log2(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d log2 (Vec4d const & x) {
+    Vec2d const lo = log2(x.get_low());
+    Vec2d const hi = log2(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// logarithm base 10
+static inline Vec8f log10 (Vec8f const & x) {
+    Vec4f const lo = log10(x.get_low());
+    Vec4f const hi = log10(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d log10 (Vec4d const & x) {
+    Vec2d const lo = log10(x.get_low());
+    Vec2d const hi = log10(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// trigonometric functions (angles in radians) for 256-bit vectors.
+// Each function applies the 128-bit overload to the lower and upper half.
+
+// sine
+static inline Vec8f sin (Vec8f const & x) {
+    Vec4f const lo = sin(x.get_low());
+    Vec4f const hi = sin(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d sin (Vec4d const & x) {
+    Vec2d const lo = sin(x.get_low());
+    Vec2d const hi = sin(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// cosine
+static inline Vec8f cos (Vec8f const & x) {
+    Vec4f const lo = cos(x.get_low());
+    Vec4f const hi = cos(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d cos (Vec4d const & x) {
+    Vec2d const lo = cos(x.get_low());
+    Vec2d const hi = cos(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// sine and cosine. cos(x) is written to *pcos first, then sin(x) is computed and returned
+static inline Vec8f sincos (Vec8f * pcos, Vec8f const & x) {
+    Vec4f const cos_lo = cos(x.get_low());
+    Vec4f const cos_hi = cos(x.get_high());
+    *pcos = Vec8f(cos_lo, cos_hi);
+    Vec4f const sin_lo = sin(x.get_low());
+    Vec4f const sin_hi = sin(x.get_high());
+    return Vec8f(sin_lo, sin_hi);
+}
+static inline Vec4d sincos (Vec4d * pcos, Vec4d const & x) {
+    Vec2d const cos_lo = cos(x.get_low());
+    Vec2d const cos_hi = cos(x.get_high());
+    *pcos = Vec4d(cos_lo, cos_hi);
+    Vec2d const sin_lo = sin(x.get_low());
+    Vec2d const sin_hi = sin(x.get_high());
+    return Vec4d(sin_lo, sin_hi);
+}
+
+// tangent
+static inline Vec8f tan (Vec8f const & x) {
+    Vec4f const lo = tan(x.get_low());
+    Vec4f const hi = tan(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d tan (Vec4d const & x) {
+    Vec2d const lo = tan(x.get_low());
+    Vec2d const hi = tan(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// inverse trigonometric functions for 256-bit vectors.
+// Each function applies the 128-bit overload to the lower and upper half.
+
+// inverse sine
+static inline Vec8f asin (Vec8f const & x) {
+    Vec4f const lo = asin(x.get_low());
+    Vec4f const hi = asin(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d asin (Vec4d const & x) {
+    Vec2d const lo = asin(x.get_low());
+    Vec2d const hi = asin(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// inverse cosine
+static inline Vec8f acos (Vec8f const & x) {
+    Vec4f const lo = acos(x.get_low());
+    Vec4f const hi = acos(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d acos (Vec4d const & x) {
+    Vec2d const lo = acos(x.get_low());
+    Vec2d const hi = acos(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// inverse tangent
+static inline Vec8f atan (Vec8f const & x) {
+    Vec4f const lo = atan(x.get_low());
+    Vec4f const hi = atan(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d atan (Vec4d const & x) {
+    Vec2d const lo = atan(x.get_low());
+    Vec2d const hi = atan(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// inverse tangent of a/b for 256-bit vectors.
+// The 128-bit halves are handled by the atan2 overloads defined above for
+// Vec4f / Vec2d. There is no two-argument atan for those types, so the halves
+// must be passed to atan2.
+static inline Vec8f atan2 (Vec8f const & a, Vec8f const & b) {   // inverse tangent of a/b
+    return Vec8f(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high()));
+}
+static inline Vec4d atan2 (Vec4d const & a, Vec4d const & b) {   // inverse tangent of a/b
+    return Vec4d(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high()));
+}
+
+// Two-argument atan, kept so that the name used here before still resolves.
+// Same result as atan2(a, b).
+static inline Vec8f atan (Vec8f const & a, Vec8f const & b) {   // inverse tangent of a/b
+    return atan2(a, b);
+}
+static inline Vec4d atan (Vec4d const & a, Vec4d const & b) {   // inverse tangent of a/b
+    return atan2(a, b);
+}
+#endif // VECTORMATH_COMMON_H
+
+// hyperbolic functions for 256-bit vectors.
+// Each function applies the 128-bit overload to the lower and upper half.
+
+// hyperbolic sine
+static inline Vec8f sinh (Vec8f const & x) {
+    Vec4f const lo = sinh(x.get_low());
+    Vec4f const hi = sinh(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d sinh (Vec4d const & x) {
+    Vec2d const lo = sinh(x.get_low());
+    Vec2d const hi = sinh(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// hyperbolic cosine
+static inline Vec8f cosh (Vec8f const & x) {
+    Vec4f const lo = cosh(x.get_low());
+    Vec4f const hi = cosh(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d cosh (Vec4d const & x) {
+    Vec2d const lo = cosh(x.get_low());
+    Vec2d const hi = cosh(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// hyperbolic tangent
+static inline Vec8f tanh (Vec8f const & x) {
+    Vec4f const lo = tanh(x.get_low());
+    Vec4f const hi = tanh(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d tanh (Vec4d const & x) {
+    Vec2d const lo = tanh(x.get_low());
+    Vec2d const hi = tanh(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// error function
+#ifdef HAVE_ERF
+static inline Vec8f erf (Vec8f const & x) {   // error function
+    return Vec8f(erf(x.get_low()), erf(x.get_high()));
+}
+static inline Vec4d erf (Vec4d const & x) {   // error function
+    return Vec4d(erf(x.get_low()), erf(x.get_high()));
+}
+#endif
+#ifdef HAVE_ERFC
+static inline Vec8f erfc (Vec8f const & x) {   // error function complement
+    return Vec8f(erfc(x.get_low()), erfc(x.get_high()));
+}
+static inline Vec4d erfc (Vec4d const & x) {   // error function complement
+    return Vec4d(erfc(x.get_low()), erfc(x.get_high()));
+}
+#endif
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+// The lower and upper half are passed to the 128-bit cexp separately.
+static inline Vec8f cexp (Vec8f const & x) {
+    Vec4f const lo = cexp(x.get_low());
+    Vec4f const hi = cexp(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d cexp (Vec4d const & x) {
+    Vec2d const lo = cexp(x.get_low());
+    Vec2d const hi = cexp(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+#endif // VECTORF256_H == 1
+
+
+/*****************************************************************************
+*
+*      VECTORMATH = 1. Use AMD LIBM library
+*
+*****************************************************************************/
+#elif VECTORMATH == 1
+//#include <amdlibm.h> 
+#include "amdlibm.h" // if header file is in current directory
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions, forwarded to the AMD LIBM vector routines
+
+// exponential function
+static inline Vec4f exp (Vec4f const & x) {
+    Vec4f r = amd_vrs4_expf(x);
+    return r;
+}
+static inline Vec2d exp (Vec2d const & x) {
+    Vec2d r = amd_vrd2_exp(x);
+    return r;
+}
+
+// exp(x)-1. Avoids loss of precision if x is close to 0
+static inline Vec4f expm1 (Vec4f const & x) {
+    Vec4f r = amd_vrs4_expm1f(x);
+    return r;
+}
+static inline Vec2d expm1 (Vec2d const & x) {
+    Vec2d r = amd_vrd2_expm1(x);
+    return r;
+}
+
+// pow(2,x)
+static inline Vec4f exp2 (Vec4f const & x) {
+    Vec4f r = amd_vrs4_exp2f(x);
+    return r;
+}
+static inline Vec2d exp2 (Vec2d const & x) {
+    Vec2d r = amd_vrd2_exp2(x);
+    return r;
+}
+
+// pow(10,x)
+static inline Vec4f exp10 (Vec4f const & x) {
+    Vec4f r = amd_vrs4_exp10f(x);
+    return r;
+}
+static inline Vec2d exp10 (Vec2d const & x) {
+    Vec2d r = amd_vrd2_exp10(x);
+    return r;
+}
+
+// pow(a,b) = a to the power of b
+static inline Vec4f pow (Vec4f const & a, Vec4f const & b) {
+    Vec4f r = amd_vrs4_powf(a,b);
+    return r;
+}
+static inline Vec2d pow (Vec2d const & a, Vec2d const & b) {
+    Vec2d r = amd_vrd2_pow(a,b);
+    return r;
+}
+
+// pow(x,1/3)
+static inline Vec4f cbrt (Vec4f const & x) {
+    Vec4f r = amd_vrs4_cbrtf(x);
+    return r;
+}
+static inline Vec2d cbrt (Vec2d const & x) {
+    Vec2d r = amd_vrd2_cbrt(x);
+    return r;
+}
+
+// logarithms, forwarded to the AMD LIBM vector routines
+
+// natural logarithm
+static inline Vec4f log (Vec4f const & x) {
+    Vec4f r = amd_vrs4_logf(x);
+    return r;
+}
+static inline Vec2d log (Vec2d const & x) {
+    Vec2d r = amd_vrd2_log(x);
+    return r;
+}
+
+// log(1+x). Avoids loss of precision if 1+x is close to 1
+static inline Vec4f log1p (Vec4f const & x) {
+    Vec4f r = amd_vrs4_log1pf(x);
+    return r;
+}
+static inline Vec2d log1p (Vec2d const & x) {
+    Vec2d r = amd_vrd2_log1p(x);
+    return r;
+}
+
+// logarithm base 2
+static inline Vec4f log2 (Vec4f const & x) {
+    Vec4f r = amd_vrs4_log2f(x);
+    return r;
+}
+static inline Vec2d log2 (Vec2d const & x) {
+    Vec2d r = amd_vrd2_log2(x);
+    return r;
+}
+
+// logarithm base 10
+static inline Vec4f log10 (Vec4f const & x) {
+    Vec4f r = amd_vrs4_log10f(x);
+    return r;
+}
+static inline Vec2d log10 (Vec2d const & x) {
+    Vec2d r = amd_vrd2_log10(x);
+    return r;
+}
+
+// trigonometric functions (angles in radians), forwarded to the AMD LIBM vector routines
+
+// sine
+static inline Vec4f sin (Vec4f const & x) {
+    Vec4f r = amd_vrs4_sinf(x);
+    return r;
+}
+static inline Vec2d sin (Vec2d const & x) {
+    Vec2d r = amd_vrd2_sin(x);
+    return r;
+}
+
+// cosine
+static inline Vec4f cos (Vec4f const & x) {
+    Vec4f r = amd_vrs4_cosf(x);
+    return r;
+}
+static inline Vec2d cos (Vec2d const & x) {
+    Vec2d r = amd_vrd2_cos(x);
+    return r;
+}
+
+// sine and cosine. sin(x) returned, cos(x) in pcos.
+// The library writes the cosine through pcos reinterpreted as a raw SSE vector pointer.
+static inline Vec4f sincos (Vec4f * pcos, Vec4f const & x) {
+    __m128 * const cos_out = (__m128*)pcos;
+    __m128 sine;
+    amd_vrs4_sincosf(x, &sine, cos_out);
+    return sine;
+}
+static inline Vec2d sincos (Vec2d * pcos, Vec2d const & x) {
+    __m128d * const cos_out = (__m128d*)pcos;
+    __m128d sine;
+    amd_vrd2_sincos(x, &sine, cos_out);
+    return sine;
+}
+
+// tangent
+static inline Vec4f tan (Vec4f const & x) {
+    Vec4f r = amd_vrs4_tanf(x);
+    return r;
+}
+static inline Vec2d tan (Vec2d const & x) {
+    Vec2d r = amd_vrd2_tan(x);
+    return r;
+}
+
+// inverse trigonometric functions not supported
+
+#endif // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions not supported
+
+// error function not supported
+
+// complex exponential function not supported
+
+#ifdef VECTORF256_H
+
+// Emulate 256 bit vector functions with two 128-bit vectors
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions for 256-bit vectors.
+// Each function applies the 128-bit overload to the lower and upper half.
+
+// exponential function
+static inline Vec8f exp (Vec8f const & x) {
+    Vec4f const lo = exp(x.get_low());
+    Vec4f const hi = exp(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d exp (Vec4d const & x) {
+    Vec2d const lo = exp(x.get_low());
+    Vec2d const hi = exp(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// exp(x)-1. Avoids loss of precision if x is close to 0
+static inline Vec8f expm1 (Vec8f const & x) {
+    Vec4f const lo = expm1(x.get_low());
+    Vec4f const hi = expm1(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d expm1 (Vec4d const & x) {
+    Vec2d const lo = expm1(x.get_low());
+    Vec2d const hi = expm1(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// pow(2,x)
+static inline Vec8f exp2 (Vec8f const & x) {
+    Vec4f const lo = exp2(x.get_low());
+    Vec4f const hi = exp2(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d exp2 (Vec4d const & x) {
+    Vec2d const lo = exp2(x.get_low());
+    Vec2d const hi = exp2(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// pow(10,x)
+static inline Vec8f exp10 (Vec8f const & x) {
+    Vec4f const lo = exp10(x.get_low());
+    Vec4f const hi = exp10(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d exp10 (Vec4d const & x) {
+    Vec2d const lo = exp10(x.get_low());
+    Vec2d const hi = exp10(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// pow(a,b) = a to the power of b
+static inline Vec8f pow (Vec8f const & a, Vec8f const & b) {
+    Vec4f const lo = pow(a.get_low(),  b.get_low());
+    Vec4f const hi = pow(a.get_high(), b.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d pow (Vec4d const & a, Vec4d const & b) {
+    Vec2d const lo = pow(a.get_low(),  b.get_low());
+    Vec2d const hi = pow(a.get_high(), b.get_high());
+    return Vec4d(lo, hi);
+}
+
+// pow(x,1/3)
+static inline Vec8f cbrt (Vec8f const & x) {
+    Vec4f const lo = cbrt(x.get_low());
+    Vec4f const hi = cbrt(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d cbrt (Vec4d const & x) {
+    Vec2d const lo = cbrt(x.get_low());
+    Vec2d const hi = cbrt(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+
+// logarithms for 256-bit vectors.
+// Each function applies the 128-bit overload to the lower and upper half.
+
+// natural logarithm
+static inline Vec8f log (Vec8f const & x) {
+    Vec4f const lo = log(x.get_low());
+    Vec4f const hi = log(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d log (Vec4d const & x) {
+    Vec2d const lo = log(x.get_low());
+    Vec2d const hi = log(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// log(1+x). Avoids loss of precision if 1+x is close to 1
+static inline Vec8f log1p (Vec8f const & x) {
+    Vec4f const lo = log1p(x.get_low());
+    Vec4f const hi = log1p(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d log1p (Vec4d const & x) {
+    Vec2d const lo = log1p(x.get_low());
+    Vec2d const hi = log1p(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// logarithm base 2
+static inline Vec8f log2 (Vec8f const & x) {
+    Vec4f const lo = log2(x.get_low());
+    Vec4f const hi = log2(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d log2 (Vec4d const & x) {
+    Vec2d const lo = log2(x.get_low());
+    Vec2d const hi = log2(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// logarithm base 10
+static inline Vec8f log10 (Vec8f const & x) {
+    Vec4f const lo = log10(x.get_low());
+    Vec4f const hi = log10(x.get_high());
+    return Vec8f(lo, hi);
+}
+static inline Vec4d log10 (Vec4d const & x) {
+    Vec2d const lo = log10(x.get_low());
+    Vec2d const hi = log10(x.get_high());
+    return Vec4d(lo, hi);
+}
+
+// trigonometric functions (angles in radians)
+static inline Vec8f sin (Vec8f const & x) {   // sine
+    return Vec8f(sin(x.get_low()), sin(x.get_high()));
+}
+static inline Vec4d sin (Vec4d const & x) {   // sine
+    return Vec4d(sin(x.get_low()), sin(x.get_high()));
+}
+
+// cosine (angle in radians), Vec8f overload
+// (cos of x.get_low() and of x.get_high(), recombined)
+static inline Vec8f cos (Vec8f const & x) { return Vec8f(cos(x.get_low()), cos(x.get_high())); }
+// cosine (angle in radians), Vec4d overload
+// (cos of x.get_low() and of x.get_high(), recombined)
+static inline Vec4d cos (Vec4d const & x) { return Vec4d(cos(x.get_low()), cos(x.get_high())); }
+
+static inline Vec8f sincos (Vec8f * pcos, Vec8f const & x) {   // returns sin(x); cos(x) is written to *pcos
+    Vec4f cos_lo, cos_hi;                                   // filled in by the two calls below
+    Vec4f const sin_lo = sincos(&cos_lo, x.get_low());
+    Vec4f const sin_hi = sincos(&cos_hi, x.get_high());
+    *pcos = Vec8f(cos_lo, cos_hi);
+    return Vec8f(sin_lo, sin_hi);
+}
+static inline Vec4d sincos (Vec4d * pcos, Vec4d const & x) {   // returns sin(x); cos(x) is written to *pcos
+    Vec2d cos_lo, cos_hi;                                   // filled in by the two calls below
+    Vec2d const sin_lo = sincos(&cos_lo, x.get_low());
+    Vec2d const sin_hi = sincos(&cos_hi, x.get_high());
+    *pcos = Vec4d(cos_lo, cos_hi);
+    return Vec4d(sin_lo, sin_hi);
+}
+
+// tangent (angle in radians), Vec8f overload
+// (tan of x.get_low() and of x.get_high(), recombined)
+static inline Vec8f tan (Vec8f const & x) { return Vec8f(tan(x.get_low()), tan(x.get_high())); }
+// tangent (angle in radians), Vec4d overload
+// (tan of x.get_low() and of x.get_high(), recombined)
+static inline Vec4d tan (Vec4d const & x) { return Vec4d(tan(x.get_low()), tan(x.get_high())); }
+
+#endif // VECTORMATH_COMMON_H
+
+#endif // VECTORF256_H == 1
+
+
+/*****************************************************************************
+*
+*      VECTORMATH = 2. Use Intel SVML library with any compiler
+*
+*****************************************************************************/
+#elif VECTORMATH == 2 
+
+extern "C" {   // SVML entry points for 128-bit vectors: names ending in f4 take __m128, names ending in 2 take __m128d
+extern __m128  __svml_expf4       (__m128);
+extern __m128d __svml_exp2        (__m128d);   // exp on __m128d (called by exp(Vec2d)), not exp2
+extern __m128  __svml_expm1f4     (__m128);
+extern __m128d __svml_expm12      (__m128d);
+extern __m128  __svml_exp2f4      (__m128);    // exp2 on __m128
+extern __m128d __svml_exp22       (__m128d);   // exp2 on __m128d
+extern __m128  __svml_exp10f4     (__m128);
+extern __m128d __svml_exp102      (__m128d);
+extern __m128  __svml_powf4       (__m128,  __m128);
+extern __m128d __svml_pow2        (__m128d, __m128d);
+extern __m128  __svml_cbrtf4      (__m128);
+extern __m128d __svml_cbrt2       (__m128d);
+extern __m128  __svml_invsqrtf4   (__m128);
+extern __m128d __svml_invsqrt2    (__m128d);
+extern __m128  __svml_logf4       (__m128);
+extern __m128d __svml_log2        (__m128d);   // natural log on __m128d (called by log(Vec2d)), not log2
+extern __m128  __svml_log1pf4     (__m128);
+extern __m128d __svml_log1p2      (__m128d);
+extern __m128  __svml_log2f4      (__m128);    // log2 on __m128
+extern __m128d __svml_log22       (__m128d);   // log2 on __m128d
+extern __m128  __svml_log10f4     (__m128);
+extern __m128d __svml_log102      (__m128d);
+extern __m128  __svml_sinf4       (__m128);
+extern __m128d __svml_sin2        (__m128d);
+extern __m128  __svml_cosf4       (__m128);
+extern __m128d __svml_cos2        (__m128d);
+extern __m128  __svml_sincosf4    (__m128);  // sin is the return value; cos returned in xmm1
+extern __m128d __svml_sincos2     (__m128d); // sin is the return value; cos returned in xmm1
+extern __m128  __svml_tanf4       (__m128);
+extern __m128d __svml_tan2        (__m128d);
+extern __m128  __svml_asinf4      (__m128);
+extern __m128d __svml_asin2       (__m128d);
+extern __m128  __svml_acosf4      (__m128);
+extern __m128d __svml_acos2       (__m128d);
+extern __m128  __svml_atanf4      (__m128);
+extern __m128d __svml_atan2       (__m128d);   // one-argument atan on __m128d (called by atan(Vec2d)), not atan2
+extern __m128  __svml_atan2f4     (__m128,  __m128);    // two-argument atan2 on __m128
+extern __m128d __svml_atan22      (__m128d, __m128d);   // two-argument atan2 on __m128d
+extern __m128  __svml_sinhf4      (__m128);
+extern __m128d __svml_sinh2       (__m128d);
+extern __m128  __svml_coshf4      (__m128);
+extern __m128d __svml_cosh2       (__m128d);
+extern __m128  __svml_tanhf4      (__m128);
+extern __m128d __svml_tanh2       (__m128d);
+extern __m128  __svml_asinhf4     (__m128);
+extern __m128d __svml_asinh2      (__m128d);
+extern __m128  __svml_acoshf4     (__m128);
+extern __m128d __svml_acosh2      (__m128d);
+extern __m128  __svml_atanhf4     (__m128);
+extern __m128d __svml_atanh2      (__m128d);
+extern __m128  __svml_erff4       (__m128);
+extern __m128d __svml_erf2        (__m128d);
+extern __m128  __svml_erfcf4      (__m128);
+extern __m128d __svml_erfc2       (__m128d);
+extern __m128  __svml_erfinvf4    (__m128);
+extern __m128d __svml_erfinv2     (__m128d);
+extern __m128  __svml_cdfnorminvf4(__m128);
+extern __m128d __svml_cdfnorminv2 (__m128d);
+extern __m128  __svml_cdfnormf4   (__m128);
+extern __m128d __svml_cdfnorm2    (__m128d);
+extern __m128  __svml_cexpf4      (__m128);
+extern __m128d __svml_cexp2       (__m128d);
+}
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+// exponential function, Vec4f overload
+// (forwards to SVML __svml_expf4)
+static inline Vec4f exp (Vec4f const & x) { return __svml_expf4(x); }
+// exponential function, Vec2d overload
+// (forwards to SVML __svml_exp2)
+static inline Vec2d exp (Vec2d const & x) { return __svml_exp2(x); }
+
+// exp(x)-1 without the loss of precision that occurs when x is close to 0, Vec4f overload
+// (forwards to SVML __svml_expm1f4)
+static inline Vec4f expm1 (Vec4f const & x) { return __svml_expm1f4(x); }
+// exp(x)-1 without the loss of precision that occurs when x is close to 0, Vec2d overload
+// (forwards to SVML __svml_expm12)
+static inline Vec2d expm1 (Vec2d const & x) { return __svml_expm12(x); }
+
+// pow(2,x), Vec4f overload
+// (forwards to SVML __svml_exp2f4)
+static inline Vec4f exp2 (Vec4f const & x) { return __svml_exp2f4(x); }
+// pow(2,x), Vec2d overload
+// (forwards to SVML __svml_exp22)
+static inline Vec2d exp2 (Vec2d const & x) { return __svml_exp22(x); }
+
+// pow(10,x), Vec4f overload
+// (forwards to SVML __svml_exp10f4)
+static inline Vec4f exp10 (Vec4f const & x) { return __svml_exp10f4(x); }
+// pow(10,x), Vec2d overload
+// (forwards to SVML __svml_exp102)
+static inline Vec2d exp10 (Vec2d const & x) { return __svml_exp102(x); }
+
+// pow(a,b) = a to the power of b, Vec4f overload
+// (forwards to SVML __svml_powf4)
+static inline Vec4f pow (Vec4f const & a, Vec4f const & b) { return __svml_powf4(a,b); }
+// pow(a,b) = a to the power of b, Vec2d overload
+// (forwards to SVML __svml_pow2)
+static inline Vec2d pow (Vec2d const & a, Vec2d const & b) { return __svml_pow2(a,b); }
+
+// cube root, pow(x,1/3), Vec4f overload
+// (forwards to SVML __svml_cbrtf4)
+static inline Vec4f cbrt (Vec4f const & x) { return __svml_cbrtf4(x); }
+// cube root, pow(x,1/3), Vec2d overload
+// (forwards to SVML __svml_cbrt2)
+static inline Vec2d cbrt (Vec2d const & x) { return __svml_cbrt2(x); }
+
+// logarithms
+// natural logarithm, Vec4f overload
+// (forwards to SVML __svml_logf4)
+static inline Vec4f log (Vec4f const & x) { return __svml_logf4(x); }
+// natural logarithm, Vec2d overload
+// (forwards to SVML __svml_log2, which despite its name is the natural log)
+static inline Vec2d log (Vec2d const & x) { return __svml_log2(x); }
+
+// log(1+x) without loss of precision when 1+x is close to 1, Vec4f overload
+// (forwards to SVML __svml_log1pf4)
+static inline Vec4f log1p (Vec4f const & x) { return __svml_log1pf4(x); }
+// log(1+x) without loss of precision when 1+x is close to 1, Vec2d overload
+// (forwards to SVML __svml_log1p2)
+static inline Vec2d log1p (Vec2d const & x) { return __svml_log1p2(x); }
+
+// logarithm base 2, Vec4f overload
+// (forwards to SVML __svml_log2f4)
+static inline Vec4f log2 (Vec4f const & x) { return __svml_log2f4(x); }
+// logarithm base 2, Vec2d overload
+// (forwards to SVML __svml_log22)
+static inline Vec2d log2 (Vec2d const & x) { return __svml_log22(x); }
+
+// logarithm base 10, Vec4f overload
+// (forwards to SVML __svml_log10f4)
+static inline Vec4f log10 (Vec4f const & x) { return __svml_log10f4(x); }
+// logarithm base 10, Vec2d overload
+// (forwards to SVML __svml_log102)
+static inline Vec2d log10 (Vec2d const & x) { return __svml_log102(x); }
+
+// trigonometric functions (angles in radians)
+// sine (angle in radians), Vec4f overload
+// (forwards to SVML __svml_sinf4)
+static inline Vec4f sin (Vec4f const & x) { return __svml_sinf4(x); }
+// sine (angle in radians), Vec2d overload
+// (forwards to SVML __svml_sin2)
+static inline Vec2d sin (Vec2d const & x) { return __svml_sin2(x); }
+
+// cosine (angle in radians), Vec4f overload
+// (forwards to SVML __svml_cosf4)
+static inline Vec4f cos (Vec4f const & x) { return __svml_cosf4(x); }
+// cosine (angle in radians), Vec2d overload
+// (forwards to SVML __svml_cos2)
+static inline Vec2d cos (Vec2d const & x) { return __svml_cos2(x); }
+
+#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER)
+// no inline assembly in 64 bit MS compiler. NOTE(review): MSVC predefines _M_X64, not __x86_64__; presumably __x86_64__ is defined elsewhere in the library -- confirm
+static inline Vec4f sincos (Vec4f * pcos, Vec4f const & x) {   // sine and cosine. sin(x) returned, cos(x) stored in *pcos
+    __m128 r_sin, r_cos;
+    r_sin = __svml_sincosf4(x);   // sine is the return value; per the extern declaration the cosine is left in xmm1
+#if defined(__unix__) || defined(__GNUC__)
+    // NOTE(review): the asm below names no xmm1 operand, so it depends on xmm1 being untouched since the call above
+     __asm__ __volatile__ ( "movaps %%xmm1, %0":"=m"(r_cos));
+#else // Windows
+    _asm movaps r_cos, xmm1;
+#endif
+    *pcos = r_cos;
+    return r_sin;
+}
+static inline Vec2d sincos (Vec2d * pcos, Vec2d const & x) {   // sine and cosine. sin(x) returned, cos(x) stored in *pcos
+    __m128d r_sin, r_cos;
+    r_sin = __svml_sincos2(x);   // sine is the return value; per the extern declaration the cosine is left in xmm1
+#if defined(__unix__) || defined(__GNUC__)
+     __asm__ __volatile__ ( "movaps %%xmm1, %0":"=m"(r_cos));   // same unstated xmm1 dependency as the Vec4f overload; the Windows branch uses movapd
+#else // Windows
+    _asm movapd r_cos, xmm1;
+#endif
+    *pcos = r_cos;
+    return r_sin;
+}
+#endif // inline assembly available
+
+// tangent (angle in radians), Vec4f overload
+// (forwards to SVML __svml_tanf4)
+static inline Vec4f tan (Vec4f const & x) { return __svml_tanf4(x); }
+// tangent (angle in radians), Vec2d overload
+// (forwards to SVML __svml_tan2)
+static inline Vec2d tan (Vec2d const & x) { return __svml_tan2(x); }
+
+// inverse trigonometric functions
+// inverse sine, Vec4f overload
+// (forwards to SVML __svml_asinf4)
+static inline Vec4f asin (Vec4f const & x) { return __svml_asinf4(x); }
+// inverse sine, Vec2d overload
+// (forwards to SVML __svml_asin2)
+static inline Vec2d asin (Vec2d const & x) { return __svml_asin2(x); }
+
+// inverse cosine, Vec4f overload
+// (forwards to SVML __svml_acosf4)
+static inline Vec4f acos (Vec4f const & x) { return __svml_acosf4(x); }
+// inverse cosine, Vec2d overload
+// (forwards to SVML __svml_acos2)
+static inline Vec2d acos (Vec2d const & x) { return __svml_acos2(x); }
+
+// inverse tangent, Vec4f overload
+// (forwards to SVML __svml_atanf4)
+static inline Vec4f atan (Vec4f const & x) { return __svml_atanf4(x); }
+// inverse tangent, Vec2d overload
+// (forwards to SVML __svml_atan2, which despite its name takes one argument)
+static inline Vec2d atan (Vec2d const & x) { return __svml_atan2(x); }
+
+// inverse tangent of a/b, Vec4f overload
+// (forwards to SVML __svml_atan2f4)
+static inline Vec4f atan2 (Vec4f const & a, Vec4f const & b) { return __svml_atan2f4(a,b); }
+// inverse tangent of a/b, Vec2d overload
+// (forwards to SVML __svml_atan22)
+static inline Vec2d atan2 (Vec2d const & a, Vec2d const & b) { return __svml_atan22(a,b); }
+
+#endif // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+// hyperbolic sine, Vec4f overload
+// (forwards to SVML __svml_sinhf4)
+static inline Vec4f sinh (Vec4f const & x) { return __svml_sinhf4(x); }
+// hyperbolic sine, Vec2d overload
+// (forwards to SVML __svml_sinh2)
+static inline Vec2d sinh (Vec2d const & x) { return __svml_sinh2(x); }
+
+// hyperbolic cosine, Vec4f overload
+// (forwards to SVML __svml_coshf4)
+static inline Vec4f cosh (Vec4f const & x) { return __svml_coshf4(x); }
+// hyperbolic cosine, Vec2d overload
+// (forwards to SVML __svml_cosh2)
+static inline Vec2d cosh (Vec2d const & x) { return __svml_cosh2(x); }
+
+// hyperbolic tangent, Vec4f overload
+// (forwards to SVML __svml_tanhf4)
+static inline Vec4f tanh (Vec4f const & x) { return __svml_tanhf4(x); }
+// hyperbolic tangent, Vec2d overload
+// (forwards to SVML __svml_tanh2)
+static inline Vec2d tanh (Vec2d const & x) { return __svml_tanh2(x); }
+
+// inverse hyperbolic sine, Vec4f overload
+// (forwards to SVML __svml_asinhf4)
+static inline Vec4f asinh (Vec4f const & x) { return __svml_asinhf4(x); }
+// inverse hyperbolic sine, Vec2d overload
+// (forwards to SVML __svml_asinh2)
+static inline Vec2d asinh (Vec2d const & x) { return __svml_asinh2(x); }
+
+// inverse hyperbolic cosine, Vec4f overload
+// (forwards to SVML __svml_acoshf4)
+static inline Vec4f acosh (Vec4f const & x) { return __svml_acoshf4(x); }
+// inverse hyperbolic cosine, Vec2d overload
+// (forwards to SVML __svml_acosh2)
+static inline Vec2d acosh (Vec2d const & x) { return __svml_acosh2(x); }
+
+// inverse hyperbolic tangent, Vec4f overload
+// (forwards to SVML __svml_atanhf4)
+static inline Vec4f atanh (Vec4f const & x) { return __svml_atanhf4(x); }
+// inverse hyperbolic tangent, Vec2d overload
+// (forwards to SVML __svml_atanh2)
+static inline Vec2d atanh (Vec2d const & x) { return __svml_atanh2(x); }
+
+// error function
+static inline Vec4f erf (Vec4f const & x) {   // error function
+    return __svml_erff4(x);
+}
+static inline Vec2d erf (Vec2d const & x) {   // error function
+    return __svml_erf2(x);
+}
+
+static inline Vec4f erfc (Vec4f const & x) {   // error function complement
+    return __svml_erfcf4(x);
+}
+static inline Vec2d erfc (Vec2d const & x) {   // error function complement
+    return __svml_erfc2(x);
+}
+
+static inline Vec4f erfinv (Vec4f const & x) {   // inverse error function
+    return __svml_erfinvf4(x);
+}
+static inline Vec2d erfinv (Vec2d const & x) {   // inverse error function
+    return __svml_erfinv2(x);
+}
+
+// cumulative normal distribution function, Vec4f overload
+// (forwards to SVML __svml_cdfnormf4)
+static inline Vec4f cdfnorm (Vec4f const & x) { return __svml_cdfnormf4(x); }
+// cumulative normal distribution function, Vec2d overload
+// (forwards to SVML __svml_cdfnorm2)
+static inline Vec2d cdfnorm (Vec2d const & x) { return __svml_cdfnorm2(x); }
+
+// inverse cumulative normal distribution function, Vec4f overload
+// (forwards to SVML __svml_cdfnorminvf4)
+static inline Vec4f cdfnorminv (Vec4f const & x) { return __svml_cdfnorminvf4(x); }
+// inverse cumulative normal distribution function, Vec2d overload
+// (forwards to SVML __svml_cdfnorminv2)
+static inline Vec2d cdfnorminv (Vec2d const & x) { return __svml_cdfnorminv2(x); }
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+// complex exponential function, Vec4f overload
+// (forwards to SVML __svml_cexpf4)
+static inline Vec4f cexp (Vec4f const & x) { return __svml_cexpf4(x); }
+// complex exponential function, Vec2d overload
+// (forwards to SVML __svml_cexp2)
+static inline Vec2d cexp (Vec2d const & x) { return __svml_cexp2(x); }
+
+
+#if defined (VECTORF256_H) && VECTORF256_H >= 2  
+// AVX gives 256 bit vectors
+
+extern "C" {   // SVML entry points for 256-bit vectors: names ending in f8 take __m256, names ending in 4 take __m256d
+extern __m256  __svml_expf8       (__m256);
+extern __m256d __svml_exp4        (__m256d);
+extern __m256  __svml_expm1f8     (__m256);
+extern __m256d __svml_expm14      (__m256d);
+extern __m256  __svml_exp2f8      (__m256);    // exp2 on __m256
+extern __m256d __svml_exp24       (__m256d);   // exp2 on __m256d
+extern __m256  __svml_exp10f8     (__m256);
+extern __m256d __svml_exp104      (__m256d);
+extern __m256  __svml_powf8       (__m256,  __m256);
+extern __m256d __svml_pow4        (__m256d, __m256d);
+extern __m256  __svml_cbrtf8      (__m256);
+extern __m256d __svml_cbrt4       (__m256d);
+extern __m256  __svml_invsqrtf8   (__m256);
+extern __m256d __svml_invsqrt4    (__m256d);
+extern __m256  __svml_logf8       (__m256);
+extern __m256d __svml_log4        (__m256d);
+extern __m256  __svml_log1pf8     (__m256);
+extern __m256d __svml_log1p4      (__m256d);
+extern __m256  __svml_log2f8      (__m256);    // log2 on __m256
+extern __m256d __svml_log24       (__m256d);   // log2 on __m256d
+extern __m256  __svml_log10f8     (__m256);
+extern __m256d __svml_log104      (__m256d);
+extern __m256  __svml_sinf8       (__m256);
+extern __m256d __svml_sin4        (__m256d);
+extern __m256  __svml_cosf8       (__m256);
+extern __m256d __svml_cos4        (__m256d);
+extern __m256  __svml_sincosf8    (__m256);  // sin is the return value; cos returned in ymm1
+extern __m256d __svml_sincos4     (__m256d); // sin is the return value; cos returned in ymm1
+extern __m256  __svml_tanf8       (__m256);
+extern __m256d __svml_tan4        (__m256d);
+extern __m256  __svml_asinf8      (__m256);
+extern __m256d __svml_asin4       (__m256d);
+extern __m256  __svml_acosf8      (__m256);
+extern __m256d __svml_acos4       (__m256d);
+extern __m256  __svml_atanf8      (__m256);
+extern __m256d __svml_atan4       (__m256d);
+extern __m256  __svml_atan2f8     (__m256, __m256);      // two-argument atan2 on __m256
+extern __m256d __svml_atan24      (__m256d, __m256d);    // two-argument atan2 on __m256d
+extern __m256  __svml_sinhf8      (__m256);
+extern __m256d __svml_sinh4       (__m256d);
+extern __m256  __svml_coshf8      (__m256);
+extern __m256d __svml_cosh4       (__m256d);
+extern __m256  __svml_tanhf8      (__m256);
+extern __m256d __svml_tanh4       (__m256d);
+extern __m256  __svml_asinhf8     (__m256);
+extern __m256d __svml_asinh4      (__m256d);
+extern __m256  __svml_acoshf8     (__m256);
+extern __m256d __svml_acosh4      (__m256d);
+extern __m256  __svml_atanhf8     (__m256);
+extern __m256d __svml_atanh4      (__m256d);
+extern __m256  __svml_erff8       (__m256);
+extern __m256d __svml_erf4        (__m256d);
+extern __m256  __svml_erfcf8      (__m256);
+extern __m256d __svml_erfc4       (__m256d);
+extern __m256  __svml_erfinvf8    (__m256);
+extern __m256d __svml_erfinv4     (__m256d);
+extern __m256  __svml_cdfnorminvf8(__m256);
+extern __m256d __svml_cdfnorminv4 (__m256d);
+extern __m256  __svml_cdfnormf8   (__m256);
+extern __m256d __svml_cdfnorm4    (__m256d);
+//extern __m256  __svml_cexpf8      (__m256); // missing in current version of SVML (jan 2012); the 256-bit cexp wrappers use the 128-bit cexp instead
+//extern __m256d __svml_cexp4       (__m256d); // likewise left undeclared
+}
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+// exponential function, Vec8f overload
+// (forwards to SVML __svml_expf8)
+static inline Vec8f exp (Vec8f const & x) { return __svml_expf8(x); }
+// exponential function, Vec4d overload
+// (forwards to SVML __svml_exp4)
+static inline Vec4d exp (Vec4d const & x) { return __svml_exp4(x); }
+
+// exp(x)-1 without the loss of precision that occurs when x is close to 0, Vec8f overload
+// (forwards to SVML __svml_expm1f8)
+static inline Vec8f expm1 (Vec8f const & x) { return __svml_expm1f8(x); }
+// exp(x)-1 without the loss of precision that occurs when x is close to 0, Vec4d overload
+// (forwards to SVML __svml_expm14)
+static inline Vec4d expm1 (Vec4d const & x) { return __svml_expm14(x); }
+
+// pow(2,x), Vec8f overload
+// (forwards to SVML __svml_exp2f8)
+static inline Vec8f exp2 (Vec8f const & x) { return __svml_exp2f8(x); }
+// pow(2,x), Vec4d overload
+// (forwards to SVML __svml_exp24)
+static inline Vec4d exp2 (Vec4d const & x) { return __svml_exp24(x); }
+
+// pow(10,x), Vec8f overload
+// (forwards to SVML __svml_exp10f8)
+static inline Vec8f exp10 (Vec8f const & x) { return __svml_exp10f8(x); }
+// pow(10,x), Vec4d overload
+// (forwards to SVML __svml_exp104)
+static inline Vec4d exp10 (Vec4d const & x) { return __svml_exp104(x); }
+
+// pow(a,b) = a to the power of b, Vec8f overload
+// (forwards to SVML __svml_powf8)
+static inline Vec8f pow (Vec8f const & a, Vec8f const & b) { return __svml_powf8(a,b); }
+// pow(a,b) = a to the power of b, Vec4d overload
+// (forwards to SVML __svml_pow4)
+static inline Vec4d pow (Vec4d const & a, Vec4d const & b) { return __svml_pow4(a,b); }
+
+// cube root, pow(x,1/3), Vec8f overload
+// (forwards to SVML __svml_cbrtf8)
+static inline Vec8f cbrt (Vec8f const & x) { return __svml_cbrtf8(x); }
+// cube root, pow(x,1/3), Vec4d overload
+// (forwards to SVML __svml_cbrt4)
+static inline Vec4d cbrt (Vec4d const & x) { return __svml_cbrt4(x); }
+
+// logarithms
+// natural logarithm, Vec8f overload
+// (forwards to SVML __svml_logf8)
+static inline Vec8f log (Vec8f const & x) { return __svml_logf8(x); }
+// natural logarithm, Vec4d overload
+// (forwards to SVML __svml_log4)
+static inline Vec4d log (Vec4d const & x) { return __svml_log4(x); }
+
+// log(1+x) without loss of precision when 1+x is close to 1, Vec8f overload
+// (forwards to SVML __svml_log1pf8)
+static inline Vec8f log1p (Vec8f const & x) { return __svml_log1pf8(x); }
+// log(1+x) without loss of precision when 1+x is close to 1, Vec4d overload
+// (forwards to SVML __svml_log1p4)
+static inline Vec4d log1p (Vec4d const & x) { return __svml_log1p4(x); }
+
+// logarithm base 2, Vec8f overload
+// (forwards to SVML __svml_log2f8)
+static inline Vec8f log2 (Vec8f const & x) { return __svml_log2f8(x); }
+// logarithm base 2, Vec4d overload
+// (forwards to SVML __svml_log24)
+static inline Vec4d log2 (Vec4d const & x) { return __svml_log24(x); }
+
+// logarithm base 10, Vec8f overload
+// (forwards to SVML __svml_log10f8)
+static inline Vec8f log10 (Vec8f const & x) { return __svml_log10f8(x); }
+// logarithm base 10, Vec4d overload
+// (forwards to SVML __svml_log104)
+static inline Vec4d log10 (Vec4d const & x) { return __svml_log104(x); }
+
+// trigonometric functions (angles in radians)
+// sine (angle in radians), Vec8f overload
+// (forwards to SVML __svml_sinf8)
+static inline Vec8f sin (Vec8f const & x) { return __svml_sinf8(x); }
+// sine (angle in radians), Vec4d overload
+// (forwards to SVML __svml_sin4)
+static inline Vec4d sin (Vec4d const & x) { return __svml_sin4(x); }
+
+// cosine (angle in radians), Vec8f overload
+// (forwards to SVML __svml_cosf8)
+static inline Vec8f cos (Vec8f const & x) { return __svml_cosf8(x); }
+// cosine (angle in radians), Vec4d overload
+// (forwards to SVML __svml_cos4)
+static inline Vec4d cos (Vec4d const & x) { return __svml_cos4(x); }
+
+#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER)
+// no inline assembly in 64 bit MS compiler. NOTE(review): MSVC predefines _M_X64, not __x86_64__; presumably __x86_64__ is defined elsewhere in the library -- confirm
+static inline Vec8f sincos (Vec8f * pcos, Vec8f const & x) {   // sine and cosine. sin(x) returned, cos(x) stored in *pcos
+    __m256 r_sin, r_cos;
+    r_sin = __svml_sincosf8(x);   // sine is the return value; per the extern declaration the cosine is left in ymm1
+#if defined(__unix__) || defined(__GNUC__)
+    __asm__ __volatile__ ( "vmovaps %%ymm1, %0":"=m"(r_cos));   // NOTE(review): no ymm1 operand is named, so this depends on ymm1 being untouched since the call
+#else // Windows
+    _asm vmovaps r_cos, ymm1;
+#endif
+    *pcos = r_cos;
+    return r_sin;
+}
+static inline Vec4d sincos (Vec4d * pcos, Vec4d const & x) {   // sine and cosine. sin(x) returned, cos(x) stored in *pcos
+    __m256d r_sin, r_cos;
+    r_sin = __svml_sincos4(x);   // sine is the return value; per the extern declaration the cosine is left in ymm1
+#if defined(__unix__) || defined(__GNUC__)
+     __asm__ __volatile__ ( "vmovaps %%ymm1, %0":"=m"(r_cos));   // same unstated ymm1 dependency as the Vec8f overload; the Windows branch uses vmovapd
+#else // Windows
+    _asm vmovapd r_cos, ymm1;
+#endif
+    *pcos = r_cos;
+    return r_sin;
+}
+#endif // inline assembly available
+
+// tangent (angle in radians), Vec8f overload
+// (forwards to SVML __svml_tanf8)
+static inline Vec8f tan (Vec8f const & x) { return __svml_tanf8(x); }
+// tangent (angle in radians), Vec4d overload
+// (forwards to SVML __svml_tan4)
+static inline Vec4d tan (Vec4d const & x) { return __svml_tan4(x); }
+
+// inverse trigonometric functions
+// inverse sine, Vec8f overload
+// (forwards to SVML __svml_asinf8)
+static inline Vec8f asin (Vec8f const & x) { return __svml_asinf8(x); }
+// inverse sine, Vec4d overload
+// (forwards to SVML __svml_asin4)
+static inline Vec4d asin (Vec4d const & x) { return __svml_asin4(x); }
+
+// inverse cosine, Vec8f overload
+// (forwards to SVML __svml_acosf8)
+static inline Vec8f acos (Vec8f const & x) { return __svml_acosf8(x); }
+// inverse cosine, Vec4d overload
+// (forwards to SVML __svml_acos4)
+static inline Vec4d acos (Vec4d const & x) { return __svml_acos4(x); }
+
+// inverse tangent, Vec8f overload
+// (forwards to SVML __svml_atanf8)
+static inline Vec8f atan (Vec8f const & x) { return __svml_atanf8(x); }
+// inverse tangent, Vec4d overload
+// (forwards to SVML __svml_atan4)
+static inline Vec4d atan (Vec4d const & x) { return __svml_atan4(x); }
+
+// inverse tangent of a/b, Vec8f overload
+// (forwards to SVML __svml_atan2f8)
+static inline Vec8f atan2 (Vec8f const & a, Vec8f const & b) { return __svml_atan2f8(a,b); }
+// inverse tangent of a/b, Vec4d overload
+// (forwards to SVML __svml_atan24)
+static inline Vec4d atan2 (Vec4d const & a, Vec4d const & b) { return __svml_atan24(a,b); }
+
+#endif // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+// hyperbolic sine, Vec8f overload
+// (forwards to SVML __svml_sinhf8)
+static inline Vec8f sinh (Vec8f const & x) { return __svml_sinhf8(x); }
+// hyperbolic sine, Vec4d overload
+// (forwards to SVML __svml_sinh4)
+static inline Vec4d sinh (Vec4d const & x) { return __svml_sinh4(x); }
+
+// hyperbolic cosine, Vec8f overload
+// (forwards to SVML __svml_coshf8)
+static inline Vec8f cosh (Vec8f const & x) { return __svml_coshf8(x); }
+// hyperbolic cosine, Vec4d overload
+// (forwards to SVML __svml_cosh4)
+static inline Vec4d cosh (Vec4d const & x) { return __svml_cosh4(x); }
+
+// hyperbolic tangent, Vec8f overload
+// (forwards to SVML __svml_tanhf8)
+static inline Vec8f tanh (Vec8f const & x) { return __svml_tanhf8(x); }
+// hyperbolic tangent, Vec4d overload
+// (forwards to SVML __svml_tanh4)
+static inline Vec4d tanh (Vec4d const & x) { return __svml_tanh4(x); }
+
+// inverse hyperbolic sine, Vec8f overload
+// (forwards to SVML __svml_asinhf8)
+static inline Vec8f asinh (Vec8f const & x) { return __svml_asinhf8(x); }
+// inverse hyperbolic sine, Vec4d overload
+// (forwards to SVML __svml_asinh4)
+static inline Vec4d asinh (Vec4d const & x) { return __svml_asinh4(x); }
+
+// inverse hyperbolic cosine, Vec8f overload
+// (forwards to SVML __svml_acoshf8)
+static inline Vec8f acosh (Vec8f const & x) { return __svml_acoshf8(x); }
+// inverse hyperbolic cosine, Vec4d overload
+// (forwards to SVML __svml_acosh4)
+static inline Vec4d acosh (Vec4d const & x) { return __svml_acosh4(x); }
+
+// inverse hyperbolic tangent, Vec8f overload
+// (forwards to SVML __svml_atanhf8)
+static inline Vec8f atanh (Vec8f const & x) { return __svml_atanhf8(x); }
+// inverse hyperbolic tangent, Vec4d overload
+// (forwards to SVML __svml_atanh4)
+static inline Vec4d atanh (Vec4d const & x) { return __svml_atanh4(x); }
+
+// error function
+static inline Vec8f erf (Vec8f const & x) {   // error function
+    return __svml_erff8(x);
+}
+static inline Vec4d erf (Vec4d const & x) {   // error function
+    return __svml_erf4(x);
+}
+
+static inline Vec8f erfc (Vec8f const & x) {   // error function complement
+    return __svml_erfcf8(x);
+}
+static inline Vec4d erfc (Vec4d const & x) {   // error function complement
+    return __svml_erfc4(x);
+}
+
+static inline Vec8f erfinv (Vec8f const & x) {   // inverse error function
+    return __svml_erfinvf8(x);
+}
+static inline Vec4d erfinv (Vec4d const & x) {   // inverse error function
+    return __svml_erfinv4(x);
+}
+
+// cumulative normal distribution function, Vec8f overload
+// (forwards to SVML __svml_cdfnormf8)
+static inline Vec8f cdfnorm (Vec8f const & x) { return __svml_cdfnormf8(x); }
+// cumulative normal distribution function, Vec4d overload
+// (forwards to SVML __svml_cdfnorm4)
+static inline Vec4d cdfnorm (Vec4d const & x) { return __svml_cdfnorm4(x); }
+
+// inverse cumulative normal distribution function, Vec8f overload
+// (forwards to SVML __svml_cdfnorminvf8)
+static inline Vec8f cdfnorminv (Vec8f const & x) { return __svml_cdfnorminvf8(x); }
+// inverse cumulative normal distribution function, Vec4d overload
+// (forwards to SVML __svml_cdfnorminv4)
+static inline Vec4d cdfnorminv (Vec4d const & x) { return __svml_cdfnorminv4(x); }
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+// 256-bit version missing in current version of SVML (jan 2012). Use 128 bit version
+// complex exponential function, Vec8f overload
+// (cexp of x.get_low() and of x.get_high(), recombined)
+static inline Vec8f cexp (Vec8f const & x) { return Vec8f(cexp(x.get_low()), cexp(x.get_high())); }
+// complex exponential function, Vec4d overload
+// (cexp of x.get_low() and of x.get_high(), recombined)
+static inline Vec4d cexp (Vec4d const & x) { return Vec4d(cexp(x.get_low()), cexp(x.get_high())); }
+
+#endif // VECTORF256_H >= 2
+
+
+/*****************************************************************************
+*
+*      VECTORMATH = 3. Use Intel SVML library with Intel compiler
+*
+*****************************************************************************/
+#elif VECTORMATH == 3 
+#include <ia32intrin.h>    // intel svml functions defined in Intel version of immintrin.h
+
+// 128 bit vectors
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+// exponential function, Vec4f overload
+// (forwards to the intrinsic _mm_exp_ps)
+static inline Vec4f exp (Vec4f const & x) { return _mm_exp_ps(x); }
+// exponential function, Vec2d overload
+// (forwards to the intrinsic _mm_exp_pd)
+static inline Vec2d exp (Vec2d const & x) { return _mm_exp_pd(x); }
+
+// exp(x)-1 without the loss of precision that occurs when x is close to 0, Vec4f overload
+// (forwards to the intrinsic _mm_expm1_ps)
+static inline Vec4f expm1 (Vec4f const & x) { return _mm_expm1_ps(x); }
+// exp(x)-1 without the loss of precision that occurs when x is close to 0, Vec2d overload
+// (forwards to the intrinsic _mm_expm1_pd)
+static inline Vec2d expm1 (Vec2d const & x) { return _mm_expm1_pd(x); }
+
+// pow(2,x), Vec4f overload
+// (forwards to the intrinsic _mm_exp2_ps)
+static inline Vec4f exp2 (Vec4f const & x) { return _mm_exp2_ps(x); }
+// pow(2,x), Vec2d overload
+// (forwards to the intrinsic _mm_exp2_pd)
+static inline Vec2d exp2 (Vec2d const & x) { return _mm_exp2_pd(x); }
+
+// pow(10,x), Vec4f overload
+// (forwards to the intrinsic _mm_exp10_ps)
+static inline Vec4f exp10 (Vec4f const & x) { return _mm_exp10_ps(x); }
+// pow(10,x), Vec2d overload
+// (forwards to the intrinsic _mm_exp10_pd)
+static inline Vec2d exp10 (Vec2d const & x) { return _mm_exp10_pd(x); }
+
+// pow(a,b) = a to the power of b, Vec4f overload
+// (forwards to the intrinsic _mm_pow_ps)
+static inline Vec4f pow (Vec4f const & a, Vec4f const & b) { return _mm_pow_ps(a,b); }
+// pow(a,b) = a to the power of b, Vec2d overload
+// (forwards to the intrinsic _mm_pow_pd)
+static inline Vec2d pow (Vec2d const & a, Vec2d const & b) { return _mm_pow_pd(a,b); }
+
+// cube root, pow(x,1/3), Vec4f overload
+// (forwards to the intrinsic _mm_cbrt_ps)
+static inline Vec4f cbrt (Vec4f const & x) { return _mm_cbrt_ps(x); }
+// cube root, pow(x,1/3), Vec2d overload
+// (forwards to the intrinsic _mm_cbrt_pd)
+static inline Vec2d cbrt (Vec2d const & x) { return _mm_cbrt_pd(x); }
+
+// logarithms
+// natural logarithm, Vec4f overload
+// (forwards to the intrinsic _mm_log_ps)
+static inline Vec4f log (Vec4f const & x) { return _mm_log_ps(x); }
+// natural logarithm, Vec2d overload
+// (forwards to the intrinsic _mm_log_pd)
+static inline Vec2d log (Vec2d const & x) { return _mm_log_pd(x); }
+
+// log(1+x) without loss of precision when 1+x is close to 1, Vec4f overload
+// (forwards to the intrinsic _mm_log1p_ps)
+static inline Vec4f log1p (Vec4f const & x) { return _mm_log1p_ps(x); }
+// log(1+x) without loss of precision when 1+x is close to 1, Vec2d overload
+// (forwards to the intrinsic _mm_log1p_pd)
+static inline Vec2d log1p (Vec2d const & x) { return _mm_log1p_pd(x); }
+
+// logarithm base 2, Vec4f overload
+// (forwards to the intrinsic _mm_log2_ps)
+static inline Vec4f log2 (Vec4f const & x) { return _mm_log2_ps(x); }
+// logarithm base 2, Vec2d overload
+// (forwards to the intrinsic _mm_log2_pd)
+static inline Vec2d log2 (Vec2d const & x) { return _mm_log2_pd(x); }
+
+// logarithm base 10, Vec4f overload
+// (forwards to the intrinsic _mm_log10_ps)
+static inline Vec4f log10 (Vec4f const & x) { return _mm_log10_ps(x); }
+// logarithm base 10, Vec2d overload
+// (forwards to the intrinsic _mm_log10_pd)
+static inline Vec2d log10 (Vec2d const & x) { return _mm_log10_pd(x); }
+
+// trigonometric functions
+// sine, Vec4f overload
+// (forwards to the intrinsic _mm_sin_ps)
+static inline Vec4f sin (Vec4f const & x) { return _mm_sin_ps(x); }
+// sine, Vec2d overload
+// (forwards to the intrinsic _mm_sin_pd)
+static inline Vec2d sin (Vec2d const & x) { return _mm_sin_pd(x); }
+
+// cosine, Vec4f overload
+// (forwards to the intrinsic _mm_cos_ps)
+static inline Vec4f cos (Vec4f const & x) { return _mm_cos_ps(x); }
+// cosine, Vec2d overload
+// (forwards to the intrinsic _mm_cos_pd)
+static inline Vec2d cos (Vec2d const & x) { return _mm_cos_pd(x); }
+
+static inline Vec4f sincos (Vec4f * pcos, Vec4f const & x) {   // returns sin(x); cos(x) is written to *pcos
+    __m128 cosine;                                    // receives the result the intrinsic stores through its pointer argument
+    __m128 const sine = _mm_sincos_ps(&cosine, x);
+    *pcos = cosine;
+    return sine;
+}
+static inline Vec2d sincos (Vec2d * pcos, Vec2d const & x) {   // returns sin(x); cos(x) is written to *pcos
+    __m128d cosine;                                   // receives the result the intrinsic stores through its pointer argument
+    __m128d const sine = _mm_sincos_pd(&cosine, x);
+    *pcos = cosine;
+    return sine;
+}
+
+// tangent, Vec4f overload
+// (forwards to the intrinsic _mm_tan_ps)
+static inline Vec4f tan (Vec4f const & x) { return _mm_tan_ps(x); }
+// tangent, Vec2d overload
+// (forwards to the intrinsic _mm_tan_pd)
+static inline Vec2d tan (Vec2d const & x) { return _mm_tan_pd(x); }
+
+// inverse trigonometric functions
+// inverse sine, Vec4f overload
+// (forwards to the intrinsic _mm_asin_ps)
+static inline Vec4f asin (Vec4f const & x) { return _mm_asin_ps(x); }
+// inverse sine, Vec2d overload
+// (forwards to the intrinsic _mm_asin_pd)
+static inline Vec2d asin (Vec2d const & x) { return _mm_asin_pd(x); }
+
+// inverse cosine, Vec4f overload
+// (forwards to the intrinsic _mm_acos_ps)
+static inline Vec4f acos (Vec4f const & x) { return _mm_acos_ps(x); }
+// inverse cosine, Vec2d overload
+// (forwards to the intrinsic _mm_acos_pd)
+static inline Vec2d acos (Vec2d const & x) { return _mm_acos_pd(x); }
+
+// inverse tangent, Vec4f overload
+// (forwards to the intrinsic _mm_atan_ps)
+static inline Vec4f atan (Vec4f const & x) { return _mm_atan_ps(x); }
+// inverse tangent, Vec2d overload
+// (forwards to the intrinsic _mm_atan_pd)
+static inline Vec2d atan (Vec2d const & x) { return _mm_atan_pd(x); }
+
+// inverse tangent of a/b, Vec4f overload
+// (forwards to the intrinsic _mm_atan2_ps)
+static inline Vec4f atan2 (Vec4f const & a, Vec4f const & b) { return _mm_atan2_ps(a,b); }
+// inverse tangent of a/b, Vec2d overload
+// (forwards to the intrinsic _mm_atan2_pd)
+static inline Vec2d atan2 (Vec2d const & a, Vec2d const & b) { return _mm_atan2_pd(a,b); }
+
+#endif // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+// hyperbolic sine, Vec4f overload
+// (forwards to the intrinsic _mm_sinh_ps)
+static inline Vec4f sinh (Vec4f const & x) { return _mm_sinh_ps(x); }
+// hyperbolic sine, Vec2d overload
+// (forwards to the intrinsic _mm_sinh_pd)
+static inline Vec2d sinh (Vec2d const & x) { return _mm_sinh_pd(x); }
+
+// hyperbolic cosine, Vec4f overload
+// (forwards to the intrinsic _mm_cosh_ps)
+static inline Vec4f cosh (Vec4f const & x) { return _mm_cosh_ps(x); }
+// hyperbolic cosine, Vec2d overload
+// (forwards to the intrinsic _mm_cosh_pd)
+static inline Vec2d cosh (Vec2d const & x) { return _mm_cosh_pd(x); }
+
+// hyperbolic tangent, Vec4f overload
+// (forwards to the intrinsic _mm_tanh_ps)
+static inline Vec4f tanh (Vec4f const & x) { return _mm_tanh_ps(x); }
+// hyperbolic tangent, Vec2d overload
+// (forwards to the intrinsic _mm_tanh_pd)
+static inline Vec2d tanh (Vec2d const & x) { return _mm_tanh_pd(x); }
+
+// inverse hyperbolic sine, Vec4f overload
+// (forwards to the intrinsic _mm_asinh_ps)
+static inline Vec4f asinh (Vec4f const & x) { return _mm_asinh_ps(x); }
+// inverse hyperbolic sine, Vec2d overload
+// (forwards to the intrinsic _mm_asinh_pd)
+static inline Vec2d asinh (Vec2d const & x) { return _mm_asinh_pd(x); }
+
+// inverse hyperbolic cosine, Vec4f overload
+// (forwards to the intrinsic _mm_acosh_ps)
+static inline Vec4f acosh (Vec4f const & x) { return _mm_acosh_ps(x); }
+// inverse hyperbolic cosine, Vec2d overload
+// (forwards to the intrinsic _mm_acosh_pd)
+static inline Vec2d acosh (Vec2d const & x) { return _mm_acosh_pd(x); }
+
+// inverse hyperbolic tangent, Vec4f overload
+// (forwards to the intrinsic _mm_atanh_ps)
+static inline Vec4f atanh (Vec4f const & x) { return _mm_atanh_ps(x); }
+// inverse hyperbolic tangent, Vec2d overload
+// (forwards to the intrinsic _mm_atanh_pd)
+static inline Vec2d atanh (Vec2d const & x) { return _mm_atanh_pd(x); }
+
+// error function
+static inline Vec4f erf (Vec4f const & x) {   // error function
+    return _mm_erf_ps(x);
+}
+static inline Vec2d erf (Vec2d const & x) {   // error function
+    return _mm_erf_pd(x);
+}
+
+static inline Vec4f erfc (Vec4f const & x) {   // error function complement
+    return _mm_erfc_ps(x);
+}
+static inline Vec2d erfc (Vec2d const & x) {   // error function complement
+    return _mm_erfc_pd(x);
+}
+
+static inline Vec4f erfinv (Vec4f const & x) {   // inverse error function
+    return _mm_erfinv_ps(x);
+}
+static inline Vec2d erfinv (Vec2d const & x) {   // inverse error function
+    return _mm_erfinv_pd(x);
+}
+
+// cdfnorm is not in immintrin.h, so the SVML entry points are declared by hand
+extern "C" {
+extern __m128  __svml_cdfnormf4(__m128);
+extern __m128d __svml_cdfnorm2 (__m128d);
+}
+// cumulative normal distribution function, Vec4f overload
+// (forwards to SVML __svml_cdfnormf4)
+static inline Vec4f cdfnorm (Vec4f const & x) { return __svml_cdfnormf4(x); }
+// cumulative normal distribution function, Vec2d overload
+// (forwards to SVML __svml_cdfnorm2)
+static inline Vec2d cdfnorm (Vec2d const & x) { return __svml_cdfnorm2(x); }
+
+// inverse cumulative normal distribution function, Vec4f overload
+// (forwards to the intrinsic _mm_cdfnorminv_ps)
+static inline Vec4f cdfnorminv (Vec4f const & x) { return _mm_cdfnorminv_ps(x); }
+// inverse cumulative normal distribution function, Vec2d overload
+// (forwards to the intrinsic _mm_cdfnorminv_pd)
+static inline Vec2d cdfnorminv (Vec2d const & x) { return _mm_cdfnorminv_pd(x); }
+
+// complex functions
+extern "C" {   // cexp is not in immintrin.h, so the SVML entry points are declared by hand
+extern __m128  __svml_cexpf2(__m128);    // declared, but not called by the two wrappers below
+extern __m128  __svml_cexpf4(__m128);
+extern __m128d __svml_cexp2 (__m128d);
+}
+
+// complex exponential function, Vec4f overload
+// (forwards to SVML __svml_cexpf4)
+static inline Vec4f cexp (Vec4f const & x) { return __svml_cexpf4(x); }
+// complex exponential function, Vec2d overload
+// (forwards to SVML __svml_cexp2)
+static inline Vec2d cexp (Vec2d const & x) { return __svml_cexp2(x); }
+
+#if defined (VECTORF256_H) && VECTORF256_H >= 2
+
+// 256 bit vectors
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+// exponential function, Vec8f overload
+// (forwards to the intrinsic _mm256_exp_ps)
+static inline Vec8f exp (Vec8f const & x) { return _mm256_exp_ps(x); }
+// exponential function, Vec4d overload
+// (forwards to the intrinsic _mm256_exp_pd)
+static inline Vec4d exp (Vec4d const & x) { return _mm256_exp_pd(x); }
+
+// exp(x)-1 without the loss of precision that occurs when x is close to 0, Vec8f overload
+// (forwards to the intrinsic _mm256_expm1_ps)
+static inline Vec8f expm1 (Vec8f const & x) { return _mm256_expm1_ps(x); }
+// exp(x)-1 without the loss of precision that occurs when x is close to 0, Vec4d overload
+// (forwards to the intrinsic _mm256_expm1_pd)
+static inline Vec4d expm1 (Vec4d const & x) { return _mm256_expm1_pd(x); }
+
+// pow(2,x), Vec8f overload
+// (forwards to the intrinsic _mm256_exp2_ps)
+static inline Vec8f exp2 (Vec8f const & x) { return _mm256_exp2_ps(x); }
+// pow(2,x), Vec4d overload
+// (forwards to the intrinsic _mm256_exp2_pd)
+static inline Vec4d exp2 (Vec4d const & x) { return _mm256_exp2_pd(x); }
+
+// pow(10,x), Vec8f overload
+// (forwards to the intrinsic _mm256_exp10_ps)
+static inline Vec8f exp10 (Vec8f const & x) { return _mm256_exp10_ps(x); }
+// pow(10,x), Vec4d overload
+// (forwards to the intrinsic _mm256_exp10_pd)
+static inline Vec4d exp10 (Vec4d const & x) { return _mm256_exp10_pd(x); }
+
+// pow(a,b) = a to the power of b, Vec8f overload
+// (forwards to the intrinsic _mm256_pow_ps)
+static inline Vec8f pow (Vec8f const & a, Vec8f const & b) { return _mm256_pow_ps(a,b); }
+// pow(a,b) = a to the power of b, Vec4d overload
+// (forwards to the intrinsic _mm256_pow_pd)
+static inline Vec4d pow (Vec4d const & a, Vec4d const & b) { return _mm256_pow_pd(a,b); }
+
+// cube root, pow(x,1/3), Vec8f overload
+// (forwards to the intrinsic _mm256_cbrt_ps)
+static inline Vec8f cbrt (Vec8f const & x) { return _mm256_cbrt_ps(x); }
+// cube root, pow(x,1/3), Vec4d overload
+// (forwards to the intrinsic _mm256_cbrt_pd)
+static inline Vec4d cbrt (Vec4d const & x) { return _mm256_cbrt_pd(x); }
+
+// logarithms
+// natural logarithm, Vec8f overload
+// (forwards to the intrinsic _mm256_log_ps)
+static inline Vec8f log (Vec8f const & x) { return _mm256_log_ps(x); }
+// natural logarithm, Vec4d overload
+// (forwards to the intrinsic _mm256_log_pd)
+static inline Vec4d log (Vec4d const & x) { return _mm256_log_pd(x); }
+
+static inline Vec8f log1p (Vec8f const & x) {   // log(1+x). Avoids loss of precision if 1+x is close to 1, i.e. x is close to 0
+    return _mm256_log1p_ps(x);
+}
+static inline Vec4d log1p (Vec4d const & x) {   // log(1+x). Avoids loss of precision if 1+x is close to 1, i.e. x is close to 0
+    return _mm256_log1p_pd(x);
+}
+
+static inline Vec8f log2 (Vec8f const & x) {   // logarithm base 2; thin wrapper over _mm256_log2_ps
+    return _mm256_log2_ps(x);
+}
+static inline Vec4d log2 (Vec4d const & x) {   // logarithm base 2; thin wrapper over _mm256_log2_pd
+    return _mm256_log2_pd(x);
+}
+
+static inline Vec8f log10 (Vec8f const & x) {   // logarithm base 10; thin wrapper over _mm256_log10_ps
+    return _mm256_log10_ps(x);
+}
+static inline Vec4d log10 (Vec4d const & x) {   // logarithm base 10; thin wrapper over _mm256_log10_pd
+    return _mm256_log10_pd(x);
+}
+
+// trigonometric functions
+static inline Vec8f sin (Vec8f const & x) {   // sine; thin wrapper over _mm256_sin_ps
+    return _mm256_sin_ps(x);
+}
+static inline Vec4d sin (Vec4d const & x) {   // sine; thin wrapper over _mm256_sin_pd
+    return _mm256_sin_pd(x);
+}
+
+static inline Vec8f cos (Vec8f const & x) {   // cosine; thin wrapper over _mm256_cos_ps
+    return _mm256_cos_ps(x);
+}
+static inline Vec4d cos (Vec4d const & x) {   // cosine; thin wrapper over _mm256_cos_pd
+    return _mm256_cos_pd(x);
+}
+
+static inline Vec8f sincos (Vec8f * pcos, Vec8f const & x) {   // returns sin(x); cos(x) is stored through pcos
+    __m256 cosine;                                        // written by the intrinsic through its first argument
+    __m256 const sine = _mm256_sincos_ps(&cosine, x);
+    *pcos = cosine;
+    return sine;
+}
+static inline Vec4d sincos (Vec4d * pcos, Vec4d const & x) {   // returns sin(x); cos(x) is stored through pcos
+    __m256d cosine;                                       // written by the intrinsic through its first argument
+    __m256d const sine = _mm256_sincos_pd(&cosine, x);
+    *pcos = cosine;
+    return sine;
+}
+
+static inline Vec8f tan (Vec8f const & x) {   // tangent; thin wrapper over _mm256_tan_ps
+    return _mm256_tan_ps(x);
+}
+static inline Vec4d tan (Vec4d const & x) {   // tangent; thin wrapper over _mm256_tan_pd
+    return _mm256_tan_pd(x);
+}
+
+// inverse trigonometric functions
+static inline Vec8f asin (Vec8f const & x) {   // inverse sine; thin wrapper over _mm256_asin_ps
+    return _mm256_asin_ps(x);
+}
+static inline Vec4d asin (Vec4d const & x) {   // inverse sine; thin wrapper over _mm256_asin_pd
+    return _mm256_asin_pd(x);
+}
+
+static inline Vec8f acos (Vec8f const & x) {   // inverse cosine; thin wrapper over _mm256_acos_ps
+    return _mm256_acos_ps(x);
+}
+static inline Vec4d acos (Vec4d const & x) {   // inverse cosine; thin wrapper over _mm256_acos_pd
+    return _mm256_acos_pd(x);
+}
+
+static inline Vec8f atan (Vec8f const & x) {   // inverse tangent; thin wrapper over _mm256_atan_ps
+    return _mm256_atan_ps(x);
+}
+static inline Vec4d atan (Vec4d const & x) {   // inverse tangent; thin wrapper over _mm256_atan_pd
+    return _mm256_atan_pd(x);
+}
+
+static inline Vec8f atan2 (Vec8f const & a, Vec8f const & b) {   // inverse tangent of a/b; thin wrapper over _mm256_atan2_ps
+    return _mm256_atan2_ps(a,b);
+}
+static inline Vec4d atan2 (Vec4d const & a, Vec4d const & b) {   // inverse tangent of a/b; thin wrapper over _mm256_atan2_pd
+    return _mm256_atan2_pd(a,b);
+}
+
+#endif // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+static inline Vec8f sinh (Vec8f const & x) {   // hyperbolic sine; thin wrapper over _mm256_sinh_ps
+    return _mm256_sinh_ps(x);
+}
+static inline Vec4d sinh (Vec4d const & x) {   // hyperbolic sine; thin wrapper over _mm256_sinh_pd
+    return _mm256_sinh_pd(x);
+}
+
+static inline Vec8f cosh (Vec8f const & x) {   // hyperbolic cosine; thin wrapper over _mm256_cosh_ps
+    return _mm256_cosh_ps(x);
+}
+static inline Vec4d cosh (Vec4d const & x) {   // hyperbolic cosine; thin wrapper over _mm256_cosh_pd
+    return _mm256_cosh_pd(x);
+}
+
+static inline Vec8f tanh (Vec8f const & x) {   // hyperbolic tangent; thin wrapper over _mm256_tanh_ps
+    return _mm256_tanh_ps(x);
+}
+static inline Vec4d tanh (Vec4d const & x) {   // hyperbolic tangent; thin wrapper over _mm256_tanh_pd
+    return _mm256_tanh_pd(x);
+}
+
+static inline Vec8f asinh (Vec8f const & x) {   // inverse hyperbolic sine; thin wrapper over _mm256_asinh_ps
+    return _mm256_asinh_ps(x);
+}
+static inline Vec4d asinh (Vec4d const & x) {   // inverse hyperbolic sine; thin wrapper over _mm256_asinh_pd
+    return _mm256_asinh_pd(x);
+}
+
+static inline Vec8f acosh (Vec8f const & x) {   // inverse hyperbolic cosine; thin wrapper over _mm256_acosh_ps
+    return _mm256_acosh_ps(x);
+}
+static inline Vec4d acosh (Vec4d const & x) {   // inverse hyperbolic cosine; thin wrapper over _mm256_acosh_pd
+    return _mm256_acosh_pd(x);
+}
+
+static inline Vec8f atanh (Vec8f const & x) {   // inverse hyperbolic tangent; thin wrapper over _mm256_atanh_ps
+    return _mm256_atanh_ps(x);
+}
+static inline Vec4d atanh (Vec4d const & x) {   // inverse hyperbolic tangent; thin wrapper over _mm256_atanh_pd
+    return _mm256_atanh_pd(x);
+}
+
+// error function
+static inline Vec8f erf (Vec8f const & x) {   // error function; thin wrapper over _mm256_erf_ps
+    return _mm256_erf_ps(x);
+}
+static inline Vec4d erf (Vec4d const & x) {   // error function; thin wrapper over _mm256_erf_pd
+    return _mm256_erf_pd(x);
+}
+
+static inline Vec8f erfc (Vec8f const & x) {   // error function complement; thin wrapper over _mm256_erfc_ps
+    return _mm256_erfc_ps(x);
+}
+static inline Vec4d erfc (Vec4d const & x) {   // error function complement; thin wrapper over _mm256_erfc_pd
+    return _mm256_erfc_pd(x);
+}
+
+static inline Vec8f erfinv (Vec8f const & x) {   // inverse error function; thin wrapper over _mm256_erfinv_ps
+    return _mm256_erfinv_ps(x);
+}
+static inline Vec4d erfinv (Vec4d const & x) {   // inverse error function; thin wrapper over _mm256_erfinv_pd
+    return _mm256_erfinv_pd(x);
+}
+
+extern "C" {
+extern __m256 __svml_cdfnormf8(__m256);  // not in immintrin.h, so declared here by hand
+extern __m256d __svml_cdfnorm4(__m256d); // not in immintrin.h, so declared here by hand
+}
+static inline Vec8f cdfnorm (Vec8f const & x) {   // cumulative normal distribution function; forwards to __svml_cdfnormf8
+    return __svml_cdfnormf8(x);
+}
+static inline Vec4d cdfnorm (Vec4d const & x) {   // cumulative normal distribution function; forwards to __svml_cdfnorm4
+    return __svml_cdfnorm4(x);
+}
+
+static inline Vec8f cdfnorminv (Vec8f const & x) {   // inverse cumulative normal distribution function; thin wrapper over _mm256_cdfnorminv_ps
+    return _mm256_cdfnorminv_ps(x);
+}
+static inline Vec4d cdfnorminv (Vec4d const & x) {   // inverse cumulative normal distribution function; thin wrapper over _mm256_cdfnorminv_pd
+    return _mm256_cdfnorminv_pd(x);
+}
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+static inline Vec8f cexp (Vec8f const & x) {   // complex exponential function; each 128-bit half goes through the Vec4f overload
+    return Vec8f(cexp(x.get_low()), cexp(x.get_high()));
+}
+static inline Vec4d cexp (Vec4d const & x) {   // complex exponential function; each 128-bit half goes through the Vec2d overload
+    return Vec4d(cexp(x.get_low()), cexp(x.get_high()));
+}
+
+#endif // VECTORF256_H >= 2
+
+#else
+#error unknown value of VECTORMATH
+#endif // VECTORMATH
+
+
+#if defined (VECTORF256_H) && VECTORF256_H == 1 && (VECTORMATH == 2 || VECTORMATH == 3)
+/*****************************************************************************
+*
+*      VECTORF256_H == 1. 256 bit vectors emulated as two 128-bit vectors,
+*      SVML library
+*
+*****************************************************************************/
+
+#ifndef VECTORMATH_COMMON_H
+
+// exponential and power functions
+static inline Vec8f exp (Vec8f const & x) {   // exponential function, evaluated on each 128-bit half
+    return Vec8f(exp(x.get_low()), exp(x.get_high()));
+}
+static inline Vec4d exp (Vec4d const & x) {   // exponential function, evaluated on each 128-bit half
+    return Vec4d(exp(x.get_low()), exp(x.get_high()));
+}
+
+static inline Vec8f expm1 (Vec8f const & x) {   // exp(x)-1. Avoids loss of precision if x is close to 0
+    return Vec8f(expm1(x.get_low()), expm1(x.get_high()));
+}
+static inline Vec4d expm1 (Vec4d const & x) {   // exp(x)-1. Avoids loss of precision if x is close to 0
+    return Vec4d(expm1(x.get_low()), expm1(x.get_high()));
+}
+
+static inline Vec8f exp2 (Vec8f const & x) {   // pow(2,x), evaluated on each 128-bit half
+    return Vec8f(exp2(x.get_low()), exp2(x.get_high()));
+}
+static inline Vec4d exp2 (Vec4d const & x) {   // pow(2,x), evaluated on each 128-bit half
+    return Vec4d(exp2(x.get_low()), exp2(x.get_high()));
+}
+
+static inline Vec8f exp10 (Vec8f const & x) {   // pow(10,x), evaluated on each 128-bit half
+    return Vec8f(exp10(x.get_low()), exp10(x.get_high()));
+}
+static inline Vec4d exp10 (Vec4d const & x) {   // pow(10,x), evaluated on each 128-bit half
+    return Vec4d(exp10(x.get_low()), exp10(x.get_high()));
+}
+
+static inline Vec8f pow (Vec8f const & a, Vec8f const & b) {   // pow(a,b) = a to the power of b, evaluated on matching 128-bit halves of a and b
+    return Vec8f(pow(a.get_low(),b.get_low()), pow(a.get_high(),b.get_high()));
+}
+static inline Vec4d pow (Vec4d const & a, Vec4d const & b) {   // pow(a,b) = a to the power of b, evaluated on matching 128-bit halves of a and b
+    return Vec4d(pow(a.get_low(),b.get_low()), pow(a.get_high(),b.get_high()));
+}
+
+static inline Vec8f cbrt (Vec8f const & x) {   // cube root, pow(x,1/3), evaluated on each 128-bit half
+    return Vec8f(cbrt(x.get_low()), cbrt(x.get_high()));
+}
+static inline Vec4d cbrt (Vec4d const & x) {   // cube root, pow(x,1/3), evaluated on each 128-bit half
+    return Vec4d(cbrt(x.get_low()), cbrt(x.get_high()));
+}
+
+// logarithms
+static inline Vec8f log (Vec8f const & x) {   // natural logarithm, evaluated on each 128-bit half
+    return Vec8f(log(x.get_low()), log(x.get_high()));
+}
+static inline Vec4d log (Vec4d const & x) {   // natural logarithm, evaluated on each 128-bit half
+    return Vec4d(log(x.get_low()), log(x.get_high()));
+}
+
+static inline Vec8f log1p (Vec8f const & x) {   // log(1+x). Avoids loss of precision if 1+x is close to 1, i.e. x is close to 0
+    return Vec8f(log1p(x.get_low()), log1p(x.get_high()));
+}
+static inline Vec4d log1p (Vec4d const & x) {   // log(1+x). Avoids loss of precision if 1+x is close to 1, i.e. x is close to 0
+    return Vec4d(log1p(x.get_low()), log1p(x.get_high()));
+}
+
+static inline Vec8f log2 (Vec8f const & x) {   // logarithm base 2, evaluated on each 128-bit half
+    return Vec8f(log2(x.get_low()), log2(x.get_high()));
+}
+static inline Vec4d log2 (Vec4d const & x) {   // logarithm base 2, evaluated on each 128-bit half
+    return Vec4d(log2(x.get_low()), log2(x.get_high()));
+}
+
+static inline Vec8f log10 (Vec8f const & x) {   // logarithm base 10, evaluated on each 128-bit half
+    return Vec8f(log10(x.get_low()), log10(x.get_high()));
+}
+static inline Vec4d log10 (Vec4d const & x) {   // logarithm base 10, evaluated on each 128-bit half
+    return Vec4d(log10(x.get_low()), log10(x.get_high()));
+}
+
+// trigonometric functions (angles in radians)
+static inline Vec8f sin (Vec8f const & x) {   // sine, evaluated on each 128-bit half
+    return Vec8f(sin(x.get_low()), sin(x.get_high()));
+}
+static inline Vec4d sin (Vec4d const & x) {   // sine, evaluated on each 128-bit half
+    return Vec4d(sin(x.get_low()), sin(x.get_high()));
+}
+
+static inline Vec8f cos (Vec8f const & x) {   // cosine, evaluated on each 128-bit half
+    return Vec8f(cos(x.get_low()), cos(x.get_high()));
+}
+static inline Vec4d cos (Vec4d const & x) {   // cosine, evaluated on each 128-bit half
+    return Vec4d(cos(x.get_low()), cos(x.get_high()));
+}
+
+#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER)
+// no inline assembly in 64 bit MS compiler
+static inline Vec8f sincos (Vec8f * pcos, Vec8f const & x) {   // returns sin(x); cos(x) is stored through pcos
+    Vec4f cos_lo, cos_hi;                                 // cosines of the low and high 128-bit halves
+    Vec4f const sin_lo = sincos(&cos_lo, x.get_low());
+    Vec4f const sin_hi = sincos(&cos_hi, x.get_high());
+    *pcos = Vec8f(cos_lo, cos_hi);
+    return Vec8f(sin_lo, sin_hi);
+}
+static inline Vec4d sincos (Vec4d * pcos, Vec4d const & x) {   // returns sin(x); cos(x) is stored through pcos
+    Vec2d cos_lo, cos_hi;                                 // cosines of the low and high 128-bit halves
+    Vec2d const sin_lo = sincos(&cos_lo, x.get_low());
+    Vec2d const sin_hi = sincos(&cos_hi, x.get_high());
+    *pcos = Vec4d(cos_lo, cos_hi);
+    return Vec4d(sin_lo, sin_hi);
+}
+#endif // inline assembly available
+
+static inline Vec8f tan (Vec8f const & x) {   // tangent, evaluated on each 128-bit half
+    return Vec8f(tan(x.get_low()), tan(x.get_high()));
+}
+static inline Vec4d tan (Vec4d const & x) {   // tangent, evaluated on each 128-bit half
+    return Vec4d(tan(x.get_low()), tan(x.get_high()));
+}
+
+// inverse trigonometric functions
+static inline Vec8f asin (Vec8f const & x) {   // inverse sine, evaluated on each 128-bit half
+    return Vec8f(asin(x.get_low()), asin(x.get_high()));
+}
+static inline Vec4d asin (Vec4d const & x) {   // inverse sine, evaluated on each 128-bit half
+    return Vec4d(asin(x.get_low()), asin(x.get_high()));
+}
+
+static inline Vec8f acos (Vec8f const & x) {   // inverse cosine, evaluated on each 128-bit half
+    return Vec8f(acos(x.get_low()), acos(x.get_high()));
+}
+static inline Vec4d acos (Vec4d const & x) {   // inverse cosine, evaluated on each 128-bit half
+    return Vec4d(acos(x.get_low()), acos(x.get_high()));
+}
+
+static inline Vec8f atan (Vec8f const & x) {   // inverse tangent, evaluated on each 128-bit half
+    return Vec8f(atan(x.get_low()), atan(x.get_high()));
+}
+static inline Vec4d atan (Vec4d const & x) {   // inverse tangent, evaluated on each 128-bit half
+    return Vec4d(atan(x.get_low()), atan(x.get_high()));
+}
+
+static inline Vec8f atan2 (Vec8f const & a, Vec8f const & b) {   // inverse tangent of a/b, evaluated on matching 128-bit halves of a and b
+    return Vec8f(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high()));
+}
+static inline Vec4d atan2 (Vec4d const & a, Vec4d const & b) {   // inverse tangent of a/b, evaluated on matching 128-bit halves of a and b
+    return Vec4d(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high()));
+}
+
+#endif // VECTORMATH_COMMON_H
+
+// hyperbolic functions and inverse hyperbolic functions
+static inline Vec8f sinh (Vec8f const & x) {   // hyperbolic sine, evaluated on each 128-bit half
+    return Vec8f(sinh(x.get_low()), sinh(x.get_high()));
+}
+static inline Vec4d sinh (Vec4d const & x) {   // hyperbolic sine, evaluated on each 128-bit half
+    return Vec4d(sinh(x.get_low()), sinh(x.get_high()));
+}
+
+static inline Vec8f cosh (Vec8f const & x) {   // hyperbolic cosine, evaluated on each 128-bit half
+    return Vec8f(cosh(x.get_low()), cosh(x.get_high()));
+}
+static inline Vec4d cosh (Vec4d const & x) {   // hyperbolic cosine, evaluated on each 128-bit half
+    return Vec4d(cosh(x.get_low()), cosh(x.get_high()));
+}
+
+static inline Vec8f tanh (Vec8f const & x) {   // hyperbolic tangent, evaluated on each 128-bit half
+    return Vec8f(tanh(x.get_low()), tanh(x.get_high()));
+}
+static inline Vec4d tanh (Vec4d const & x) {   // hyperbolic tangent, evaluated on each 128-bit half
+    return Vec4d(tanh(x.get_low()), tanh(x.get_high()));
+}
+
+static inline Vec8f asinh (Vec8f const & x) {   // inverse hyperbolic sine, evaluated on each 128-bit half
+    return Vec8f(asinh(x.get_low()), asinh(x.get_high()));
+}
+static inline Vec4d asinh (Vec4d const & x) {   // inverse hyperbolic sine, evaluated on each 128-bit half
+    return Vec4d(asinh(x.get_low()), asinh(x.get_high()));
+}
+
+static inline Vec8f acosh (Vec8f const & x) {   // inverse hyperbolic cosine, evaluated on each 128-bit half
+    return Vec8f(acosh(x.get_low()), acosh(x.get_high()));
+}
+static inline Vec4d acosh (Vec4d const & x) {   // inverse hyperbolic cosine, evaluated on each 128-bit half
+    return Vec4d(acosh(x.get_low()), acosh(x.get_high()));
+}
+
+static inline Vec8f atanh (Vec8f const & x) {   // inverse hyperbolic tangent, evaluated on each 128-bit half
+    return Vec8f(atanh(x.get_low()), atanh(x.get_high()));
+}
+static inline Vec4d atanh (Vec4d const & x) {   // inverse hyperbolic tangent, evaluated on each 128-bit half
+    return Vec4d(atanh(x.get_low()), atanh(x.get_high()));
+}
+
+// error function
+static inline Vec8f erf (Vec8f const & x) {   // error function, evaluated on each 128-bit half
+    return Vec8f(erf(x.get_low()), erf(x.get_high()));
+}
+static inline Vec4d erf (Vec4d const & x) {   // error function, evaluated on each 128-bit half
+    return Vec4d(erf(x.get_low()), erf(x.get_high()));
+}
+
+static inline Vec8f erfc (Vec8f const & x) {   // error function complement, evaluated on each 128-bit half
+    return Vec8f(erfc(x.get_low()), erfc(x.get_high()));
+}
+static inline Vec4d erfc (Vec4d const & x) {   // error function complement, evaluated on each 128-bit half
+    return Vec4d(erfc(x.get_low()), erfc(x.get_high()));
+}
+
+static inline Vec8f erfinv (Vec8f const & x) {   // inverse error function, evaluated on each 128-bit half
+    return Vec8f(erfinv(x.get_low()), erfinv(x.get_high()));
+}
+static inline Vec4d erfinv (Vec4d const & x) {   // inverse error function, evaluated on each 128-bit half
+    return Vec4d(erfinv(x.get_low()), erfinv(x.get_high()));
+}
+
+static inline Vec8f cdfnorm (Vec8f const & x) {   // cumulative normal distribution function, evaluated on each 128-bit half
+    return Vec8f(cdfnorm(x.get_low()), cdfnorm(x.get_high()));
+}
+static inline Vec4d cdfnorm (Vec4d const & x) {   // cumulative normal distribution function, evaluated on each 128-bit half
+    return Vec4d(cdfnorm(x.get_low()), cdfnorm(x.get_high()));
+}
+
+static inline Vec8f cdfnorminv (Vec8f const & x) {   // inverse cumulative normal distribution function, evaluated on each 128-bit half
+    return Vec8f(cdfnorminv(x.get_low()), cdfnorminv(x.get_high()));
+}
+static inline Vec4d cdfnorminv (Vec4d const & x) {   // inverse cumulative normal distribution function, evaluated on each 128-bit half
+    return Vec4d(cdfnorminv(x.get_low()), cdfnorminv(x.get_high()));
+}
+
+// complex exponential function (real part in even numbered elements, imaginary part in odd numbered elements)
+static inline Vec8f cexp (Vec8f const & x) {   // complex exponential function; each 128-bit half goes through the Vec4f overload
+    return Vec8f(cexp(x.get_low()), cexp(x.get_high()));
+}
+static inline Vec4d cexp (Vec4d const & x) {   // complex exponential function; each 128-bit half goes through the Vec2d overload
+    return Vec4d(cexp(x.get_low()), cexp(x.get_high()));
+}
+
+#endif // VECTORF256_H == 1
+
+#endif // VECTORMATH_LIB_H
diff --git a/vectorclass/vectormath_trig.h b/vectorclass/vectormath_trig.h
new file mode 100755
index 0000000..ecbceaa
--- /dev/null
+++ b/vectorclass/vectormath_trig.h
@@ -0,0 +1,1041 @@
+/****************************  vectormath_trig.h   ******************************
+* Author:        Agner Fog
+* Date created:  2014-04-18
+* Last modified: 2014-10-22
+* Version:       1.16
+* Project:       vector classes
+* Description:
+* Header file containing inline version of trigonometric functions 
+* and inverse trigonometric functions
+* sin, cos, sincos, tan
+* asin, acos, atan, atan2
+*
+* Theory, methods and inspiration based partially on these sources:
+* > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
+*   Ellis Horwood, 1989.
+* > VDT library developed at CERN by Danilo Piparo, Thomas Hauth and
+*   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
+* > Cephes math library by Stephen L. Moshier 1992,
+*   http://www.netlib.org/cephes/
+*
+* For detailed instructions, see vectormath_common.h and VectorClass.pdf
+*
+* (c) Copyright 2014 GNU General Public License http://www.gnu.org/licenses
+******************************************************************************/
+
+#ifndef VECTORMATH_TRIG_H
+#define VECTORMATH_TRIG_H  1
+
+#include "vectormath_common.h"
+
+// Different overloaded functions for template resolution.
+// These are used to fix the problem that the quadrant index uses
+// a vector of 32-bit integers which doesn't fit the size of the
+// 64-bit double precision vector:
+// VTYPE | ITYPE | ITYPEH
+// -----------------------
+// Vec2d | Vec2q | Vec4i
+// Vec4d | Vec4q | Vec4i
+// Vec8d | Vec8q | Vec8i
+
+// define overloaded truncate functions: double vector -> 32-bit integer vector (the ITYPEH column above)
+static inline Vec4i vm_truncate_low_to_int(Vec2d const & x) {
+    return truncate_to_int(x,x);   // x is passed twice to fill a Vec4i; presumably only the low two lanes matter to callers
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4i vm_truncate_low_to_int(Vec4d const & x) {
+    return truncate_to_int(x);     // four doubles map onto the four ints of a Vec4i
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8i vm_truncate_low_to_int(Vec8d const & x) {
+    return truncate_to_int(x);     // eight doubles map onto the eight ints of a Vec8i
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// define int -> double conversions (ITYPEH -> VTYPE); only the explicit specializations below have a body
+template<class VTYPE, class ITYPE>
+static inline VTYPE vm_half_int_vector_to_double(ITYPE const & x);
+
+template<>
+inline Vec2d vm_half_int_vector_to_double<Vec2d, Vec4i>(Vec4i const & x) {
+    return to_double_low(x);   // presumably converts only the two low ints, matching the two lanes of Vec2d
+}
+
+#if MAX_VECTOR_SIZE >= 256
+template<>
+inline Vec4d vm_half_int_vector_to_double<Vec4d, Vec4i>(Vec4i const & x) {
+    return to_double(x);       // four ints -> four doubles
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+template<>
+inline Vec8d vm_half_int_vector_to_double<Vec8d, Vec8i>(Vec8i const & x) {
+    return to_double(x);       // eight ints -> eight doubles
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// define int32_t to int64_t conversions (ITYPEH -> ITYPE); only the explicit specializations below have a body
+template<class ITYPE, class ITYPEH>
+static inline ITYPE vm_half_int_vector_to_full(ITYPEH const & x);
+
+template<>
+inline Vec2q vm_half_int_vector_to_full<Vec2q,Vec4i>(Vec4i const & x) {
+    return extend_low(x);               // presumably widens the two low 32-bit lanes to 64 bits
+}
+
+#if MAX_VECTOR_SIZE >= 256
+template<>
+inline Vec4q vm_half_int_vector_to_full<Vec4q,Vec4i>(Vec4i const & x) {
+    return extend_low(Vec8i(x,x));      // x is duplicated into a Vec8i so that extend_low can return a Vec4q
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+template<>
+inline Vec8q vm_half_int_vector_to_full<Vec8q,Vec8i>(Vec8i const & x) {
+    return extend_low(Vec16i(x,x));     // x is duplicated into a Vec16i so that extend_low can return a Vec8q
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+
+// *************************************************************
+//             sincos template, double precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  integer vector type with same element size
+// ITYPEH: integer vector type with half the element size
+// BTYPE:  boolean vector type
+// SC:     1 = sin, 2 = cos, 3 = sincos
+// Parameters:
+// xx = input x (radians)
+// cosret = receives cos(x); only dereferenced if SC = 3, so the sin and cos wrappers pass 0
+template<class VTYPE, class ITYPE, class ITYPEH, class BTYPE, int SC> 
+static inline VTYPE sincos_d(VTYPE * cosret, VTYPE const & xx) {
+
+    // define constants
+    const double ONEOPIO4 = 4./VM_PI;
+
+    const double P0sin =-1.66666666666666307295E-1;
+    const double P1sin = 8.33333333332211858878E-3;
+    const double P2sin =-1.98412698295895385996E-4;
+    const double P3sin = 2.75573136213857245213E-6;
+    const double P4sin =-2.50507477628578072866E-8;
+    const double P5sin = 1.58962301576546568060E-10;
+
+    const double P0cos = 4.16666666666665929218E-2;
+    const double P1cos =-1.38888888888730564116E-3;
+    const double P2cos = 2.48015872888517045348E-5;
+    const double P3cos =-2.75573141792967388112E-7;
+    const double P4cos = 2.08757008419747316778E-9;
+    const double P5cos =-1.13585365213876817300E-11;
+
+    const double DP1 = 7.853981554508209228515625E-1;
+    const double DP2 = 7.94662735614792836714E-9;
+    const double DP3 = 3.06161699786838294307E-17;
+    /*
+    const double DP1sc = 7.85398125648498535156E-1;
+    const double DP2sc = 3.77489470793079817668E-8;
+    const double DP3sc = 2.69515142907905952645E-15;
+    */
+    VTYPE xa, x, y, x2, s, c, sin1, cos1;        // data vectors
+    ITYPEH q;                                    // integer vectors, 32 bit
+    ITYPE qq, signsin, signcos;                  // integer vectors, 64 bit
+    BTYPE swap, overflow;                        // boolean vectors
+
+    xa = abs(xx);
+
+    // Find quadrant
+    //      0 -   pi/4 => 0
+    //   pi/4 - 3*pi/4 => 2
+    // 3*pi/4 - 5*pi/4 => 4
+    // 5*pi/4 - 7*pi/4 => 6
+    // 7*pi/4 - 8*pi/4 => 8
+
+    // truncate to integer (magic number conversion is not faster here)
+    q = vm_truncate_low_to_int(xa * ONEOPIO4);
+    q = (q + 1) & ~1;                            // round odd values up so q is always even, as in the table above
+
+    y = vm_half_int_vector_to_double<VTYPE>(q);  // quadrant, as double
+
+    // Reduce by extended precision modular arithmetic
+    x = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa)));    // x = ((xa - y * DP1) - y * DP2) - y * DP3;
+
+    // Expansion of sin and cos, valid for -pi/4 <= x <= pi/4
+    x2 = x * x;
+    s = polynomial_5(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin);
+    c = polynomial_5(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos);
+    s = mul_add(x * x2, s, x);                                       // s = x + (x * x2) * s;
+    c = mul_add(x2 * x2, c, nmul_add(x2, 0.5, 1.0));                 // c = 1.0 - x2 * 0.5 + (x2 * x2) * c;
+
+    // correct for quadrant
+    qq = vm_half_int_vector_to_full<ITYPE,ITYPEH>(q);
+    swap = BTYPE((qq & 2) != 0);                 // bit 1 of q set: the sin and cos expansions trade places below
+
+    // check for overflow: finite inputs whose quadrant index came out negative get sin = 0, cos = 1
+    if (horizontal_or(q < 0)) {
+        overflow = (y < 0) & is_finite(xa);      // INF/NAN lanes are excluded and keep their computed s and c
+        s = select(overflow, 0., s);
+        c = select(overflow, 1., c);
+    }
+
+    if (SC & 1) {  // calculate sin
+        sin1 = select(swap, c, s);
+        signsin = ((qq << 61) ^ ITYPE(reinterpret_i(xx))) & ITYPE(1ULL << 63);   // bit 2 of qq moved to bit 63, XORed with the sign bit of xx
+        sin1 ^= reinterpret_d(signsin);
+    }
+    if (SC & 2) {  // calculate cos
+        cos1 = select(swap, s, c);
+        signcos = ((qq + 2) << 61) & (1ULL << 63);                               // bit 2 of (qq + 2) moved to bit 63; sign of xx is not used
+        cos1 ^= reinterpret_d(signcos);
+    }
+    if (SC == 3) {  // calculate both. cos returned through pointer
+        *cosret = cos1;
+    }
+    if (SC & 1) return sin1; else return cos1;   // SC = 1 or 3 returns sin, SC = 2 returns cos
+}
+
+// instantiations of sincos_d template:
+
+static inline Vec2d sin(Vec2d const & x) {   // sine, x in radians
+    return sincos_d<Vec2d, Vec2q, Vec4i, Vec2db, 1>(0, x);   // SC = 1: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec2d cos(Vec2d const & x) {   // cosine, x in radians
+    return sincos_d<Vec2d, Vec2q, Vec4i, Vec2db, 2>(0, x);   // SC = 2: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec2d sincos(Vec2d * cosret, Vec2d const & x) {   // returns sin(x); cos(x) is written to *cosret
+    return sincos_d<Vec2d, Vec2q, Vec4i, Vec2db, 3>(cosret, x);   // SC = 3: cosret is dereferenced
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d sin(Vec4d const & x) {   // sine, x in radians
+    return sincos_d<Vec4d, Vec4q, Vec4i, Vec4db, 1>(0, x);   // SC = 1: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec4d cos(Vec4d const & x) {   // cosine, x in radians
+    return sincos_d<Vec4d, Vec4q, Vec4i, Vec4db, 2>(0, x);   // SC = 2: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec4d sincos(Vec4d * cosret, Vec4d const & x) {   // returns sin(x); cos(x) is written to *cosret
+    return sincos_d<Vec4d, Vec4q, Vec4i, Vec4db, 3>(cosret, x);   // SC = 3: cosret is dereferenced
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d sin(Vec8d const & x) {   // sine, x in radians
+    return sincos_d<Vec8d, Vec8q, Vec8i, Vec8db, 1>(0, x);   // SC = 1: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec8d cos(Vec8d const & x) {   // cosine, x in radians
+    return sincos_d<Vec8d, Vec8q, Vec8i, Vec8db, 2>(0, x);   // SC = 2: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec8d sincos(Vec8d * cosret, Vec8d const & x) {   // returns sin(x); cos(x) is written to *cosret
+    return sincos_d<Vec8d, Vec8q, Vec8i, Vec8db, 3>(cosret, x);   // SC = 3: cosret is dereferenced
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// *************************************************************
+//             sincos template, single precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  integer vector type with same element size
+// BTYPE:  boolean vector type
+// SC:     1 = sin, 2 = cos, 3 = sincos, 4 = tan (returned as sin/cos)
+// Parameters:
+// xx = input x (radians)
+// cosret = receives cos(x); only dereferenced if SC = 3, so the sin, cos and tan wrappers pass 0
+template<class VTYPE, class ITYPE, class BTYPE, int SC> 
+static inline VTYPE sincos_f(VTYPE * cosret, VTYPE const & xx) {
+
+    // define constants
+    const float ONEOPIO4f = (float)(4./VM_PI);
+
+    const float DP1F = 0.78515625f;
+    const float DP2F = 2.4187564849853515625E-4f;
+    const float DP3F = 3.77489497744594108E-8f; 
+
+    const float P0sinf = -1.6666654611E-1f;
+    const float P1sinf =  8.3321608736E-3f;
+    const float P2sinf = -1.9515295891E-4f;
+
+    const float P0cosf =  4.166664568298827E-2f;
+    const float P1cosf = -1.388731625493765E-3f;
+    const float P2cosf =  2.443315711809948E-5f;
+
+    VTYPE xa, x, y, x2, s, c, sin1, cos1;  // data vectors
+    ITYPE q, signsin, signcos;             // integer vectors
+    BTYPE swap, overflow;                  // boolean vectors
+
+    xa = abs(xx);
+
+    // Find quadrant
+    //      0 -   pi/4 => 0
+    //   pi/4 - 3*pi/4 => 2
+    // 3*pi/4 - 5*pi/4 => 4
+    // 5*pi/4 - 7*pi/4 => 6
+    // 7*pi/4 - 8*pi/4 => 8
+    q = truncate_to_int(xa * ONEOPIO4f);
+    q = (q + 1) & ~1;                      // round odd values up so q is always even, as in the table above
+
+    y = to_float(q);  // quadrant, as float
+
+    // Reduce by extended precision modular arithmetic
+    x = nmul_add(y, DP3F, nmul_add(y, DP2F, nmul_add(y, DP1F, xa))); // x = ((xa - y * DP1F) - y * DP2F) - y * DP3F;
+
+    // A two-step reduction saves time at the cost of precision for very big x:
+    //x = (xa - y * DP1F) - y * (DP2F+DP3F);
+
+    // Taylor expansion of sin and cos, valid for -pi/4 <= x <= pi/4
+    x2 = x * x;
+    s = polynomial_2(x2, P0sinf, P1sinf, P2sinf) * (x*x2)  + x;
+    c = polynomial_2(x2, P0cosf, P1cosf, P2cosf) * (x2*x2) + nmul_add(0.5f, x2, 1.0f);
+
+    // correct for quadrant
+    swap = BTYPE((q & 2) != 0);            // bit 1 of q set: the sin and cos expansions trade places below
+
+    // check for overflow. Only finite lanes are flagged, so an INF/NAN lane never depends on its neighbours
+    overflow = BTYPE(q < 0) & is_finite(xa);  // q = 0x80000000 if overflow
+    if (horizontal_or(overflow)) {
+        s = select(overflow, 0.f, s);
+        c = select(overflow, 1.f, c);
+    }
+
+    if (SC & 5) {  // calculate sin (needed for sin, sincos and tan)
+        sin1 = select(swap, c, s);
+        signsin = ((q << 29) ^ ITYPE(reinterpret_i(xx))) & ITYPE(1 << 31);   // bit 2 of q moved to bit 31, XORed with the sign bit of xx
+        sin1 ^= reinterpret_f(signsin);
+    }
+    if (SC & 6) {  // calculate cos (needed for cos, sincos and tan)
+        cos1 = select(swap, s, c);
+        signcos = ((q + 2) << 29) & (1 << 31);                               // bit 2 of (q + 2) moved to bit 31; sign of xx is not used
+        cos1 ^= reinterpret_f(signcos);
+    }
+    if      (SC == 1) return sin1;
+    else if (SC == 2) return cos1;
+    else if (SC == 3) {  // calculate both. cos returned through pointer
+        *cosret = cos1;
+        return sin1;
+    }
+    else /*if (SC == 4)*/ return sin1 / cos1;
+}
+
+// instantiations of sincos_f template:
+
+static inline Vec4f sin(Vec4f const & x) {   // sine, x in radians
+    return sincos_f<Vec4f, Vec4i, Vec4fb, 1>(0, x);   // SC = 1: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec4f cos(Vec4f const & x) {   // cosine, x in radians
+    return sincos_f<Vec4f, Vec4i, Vec4fb, 2>(0, x);   // SC = 2: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec4f sincos(Vec4f * cosret, Vec4f const & x) {   // returns sin(x); cos(x) is written to *cosret
+    return sincos_f<Vec4f, Vec4i, Vec4fb, 3>(cosret, x);   // SC = 3: cosret is dereferenced
+}
+
+static inline Vec4f tan(Vec4f const & x) {   // tangent, x in radians
+    return sincos_f<Vec4f, Vec4i, Vec4fb, 4>(0, x);   // SC = 4: result is sin/cos; cosret is never dereferenced
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f sin(Vec8f const & x) {   // sine, x in radians
+    return sincos_f<Vec8f, Vec8i, Vec8fb, 1>(0, x);   // SC = 1: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec8f cos(Vec8f const & x) {   // cosine, x in radians
+    return sincos_f<Vec8f, Vec8i, Vec8fb, 2>(0, x);   // SC = 2: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec8f sincos(Vec8f * cosret, Vec8f const & x) {   // returns sin(x); cos(x) is written to *cosret
+    return sincos_f<Vec8f, Vec8i, Vec8fb, 3>(cosret, x);   // SC = 3: cosret is dereferenced
+}
+
+static inline Vec8f tan(Vec8f const & x) {   // tangent, x in radians
+    return sincos_f<Vec8f, Vec8i, Vec8fb, 4>(0, x);   // SC = 4: result is sin/cos; cosret is never dereferenced
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f sin(Vec16f const & x) {   // sine, x in radians
+    return sincos_f<Vec16f, Vec16i, Vec16fb, 1>(0, x);   // SC = 1: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec16f cos(Vec16f const & x) {   // cosine, x in radians
+    return sincos_f<Vec16f, Vec16i, Vec16fb, 2>(0, x);   // SC = 2: cosret is never dereferenced, so 0 is passed
+}
+
+static inline Vec16f sincos(Vec16f * cosret, Vec16f const & x) {   // returns sin(x); cos(x) is written to *cosret
+    return sincos_f<Vec16f, Vec16i, Vec16fb, 3>(cosret, x);   // SC = 3: cosret is dereferenced
+}
+
+static inline Vec16f tan(Vec16f const & x) {   // tangent, x in radians
+    return sincos_f<Vec16f, Vec16i, Vec16fb, 4>(0, x);   // SC = 4: result is sin/cos; cosret is never dereferenced
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// *************************************************************
+//             tan template, double precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  integer vector type with same element size
+// ITYPEH: integer vector type with half the element size
+// BTYPE:  boolean vector type
+// Parameters:
+// x = input x (radians). The return value is tan(x)
+template<class VTYPE, class ITYPE, class ITYPEH, class BTYPE> 
+static inline VTYPE tan_d(VTYPE const & x) {
+
+    // define constants
+    const double ONEOPIO4 = 4./VM_PI;
+
+    const double DP1 = 7.853981554508209228515625E-1;
+    const double DP2 = 7.94662735614792836714E-9;
+    const double DP3 = 3.06161699786838294307E-17;
+
+    const double P2tan=-1.30936939181383777646E4;
+    const double P1tan=1.15351664838587416140E6;
+    const double P0tan=-1.79565251976484877988E7;
+
+    const double Q3tan = 1.36812963470692954678E4;
+    const double Q2tan = -1.32089234440210967447E6;
+    const double Q1tan = 2.50083801823357915839E7;
+    const double Q0tan = -5.38695755929454629881E7;
+
+    VTYPE xa, y, z, zz, px, qx, tn, recip;  // data vectors
+    ITYPEH q;                               // integer vector, 32 bit
+    ITYPE qq;                               // integer vector, 64 bit
+    BTYPE doinvert, xzero, overflow;        // boolean vectors
+
+    xa = abs(x);
+
+    // Find quadrant
+    //      0 -   pi/4 => 0
+    //   pi/4 - 3*pi/4 => 2
+    // 3*pi/4 - 5*pi/4 => 4
+    // 5*pi/4 - 7*pi/4 => 6
+    // 7*pi/4 - 8*pi/4 => 8
+
+    q = vm_truncate_low_to_int(xa * ONEOPIO4);
+    q = (q + 1) & ~1;                            // round odd values up so q is always even, as in the table above
+
+    y = vm_half_int_vector_to_double<VTYPE>(q);  // quadrant, as double
+
+    // Reduce by extended precision modular arithmetic    
+    z = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa)));    //z = ((xa - y * DP1) - y * DP2) - y * DP3;
+
+    // Pade expansion of tan, valid for -pi/4 <= x <= pi/4
+    zz = z * z;
+    px = polynomial_2 (zz, P0tan, P1tan, P2tan);
+    qx = polynomial_4n(zz, Q0tan, Q1tan, Q2tan, Q3tan);
+
+    // qx cannot be 0 for x <= pi/4
+    tn = mul_add(px / qx, z * zz, z);            // tn = z + z * zz * px / qx;
+
+    // if (q&2) tn = -1/tn
+    qq = vm_half_int_vector_to_full<ITYPE,ITYPEH>(q);
+    doinvert = BTYPE((qq & 2) != 0);
+    xzero = (xa == 0.);
+    // avoid division by 0. We will not be using recip anyway if xa == 0.
+    // tn never becomes exactly 0 when x = pi/2 so we only have to make 
+    // a special case for x == 0.
+    recip = (-1.) / select(xzero, VTYPE(-1.), tn);   // lanes with xa == 0 divide by -1 instead of by tn
+    tn = select(doinvert, recip, tn);
+    tn = sign_combine(tn, x);       // get original sign
+
+    // check for overflow: finite inputs whose quadrant index came out negative return 0
+    if (horizontal_or(q < 0)) {
+        overflow = (y < 0) & is_finite(xa);      // INF/NAN lanes are excluded and keep their computed tn
+        tn = select(overflow, 0., tn);
+    }
+
+    return tn;
+}
+
+// instantiations of tan_d template:
+
+static inline Vec2d tan(Vec2d const & x) {   // tangent, x in radians
+    return tan_d<Vec2d, Vec2q, Vec4i, Vec2db>(x);   // Vec2q / Vec4i are the ITYPE / ITYPEH companions of Vec2d
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d tan(Vec4d const & x) {   // tangent, x in radians
+    return tan_d<Vec4d, Vec4q, Vec4i, Vec4db>(x);   // Vec4q / Vec4i are the ITYPE / ITYPEH companions of Vec4d
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d tan(Vec8d const & x) {   // tangent, x in radians
+    return tan_d<Vec8d, Vec8q, Vec8i, Vec8db>(x);   // Vec8q / Vec8i are the ITYPE / ITYPEH companions of Vec8d
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+/*
+This is removed for the single precision version. 
+It is faster to use tan(x) = sin(x)/cos(x)
+
+// *************************************************************
+//             tan template, single precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// ITYPE:  integer vector type with same element size
+// BTYPE:  boolean vector type
+// Parameters:
+// x = input x (radians)
+// cosret = return pointer (only if SC = 3)
+template<class VTYPE, class ITYPE, class BTYPE> 
+static inline VTYPE tan_f(VTYPE const & x) {
+
+    // define constants
+    const float ONEOPIO4f = (float)(4./VM_PI);
+
+    const float DP1F = 0.78515625f;
+    const float DP2F = 2.4187564849853515625E-4f;
+    const float DP3F = 3.77489497744594108E-8f;
+
+    const float P5tanf = 9.38540185543E-3f;
+    const float P4tanf = 3.11992232697E-3f;
+    const float P3tanf = 2.44301354525E-2f;
+    const float P2tanf = 5.34112807005E-2f;
+    const float P1tanf = 1.33387994085E-1f;
+    const float P0tanf = 3.33331568548E-1f;
+
+    VTYPE xa, y, z, zz, tn, recip;   // data vectors
+    ITYPE q;                         // integer vector
+    BTYPE doinvert, xzero;           // boolean vectors
+
+    xa = abs(x);
+
+    // Find quadrant
+    //      0 -   pi/4 => 0
+    //   pi/4 - 3*pi/4 => 2
+    // 3*pi/4 - 5*pi/4 => 4
+    // 5*pi/4 - 7*pi/4 => 6
+    // 7*pi/4 - 8*pi/4 => 8
+    q = truncate_to_int(xa * ONEOPIO4f);
+    q = (q + 1) & ~1;
+
+    y = to_float(q);             // quadrant, as float
+
+    // Reduce by extended precision modular arithmetic
+    z = ((xa - y * DP1F) - y * DP2F) - y * DP3F;
+    //z = (xa - y * DP1F) - y * (DP2F + DP3F);
+    zz = z * z;
+
+    // Taylor expansion
+    tn = polynomial_5(zz, P0tanf, P1tanf, P2tanf, P3tanf, P4tanf, P5tanf) * (zz * z) + z;
+
+    // if (q&2) tn = -1/tn
+    doinvert = (q & 2) != 0;
+    xzero = (xa == 0.f);
+    // avoid division by 0. We will not be using recip anyway if xa == 0.
+    // tn never becomes exactly 0 when x = pi/2 so we only have to make 
+    // a special case for x == 0.
+    recip = (-1.f) / select(xzero, VTYPE(-1.f), tn);
+    tn = select(doinvert, recip, tn);
+    tn = sign_combine(tn, x);          // get original sign
+
+    return tn;
+}
+
+// instantiations of tan_f template:
+
+static inline Vec4f tan(Vec4f const & x) {
+    return tan_f<Vec4f, Vec4i, Vec4fb>(x);
+} 
+
+static inline Vec8f tan(Vec8f const & x) {
+    return tan_f<Vec8f, Vec8i, Vec8fb>(x);
+}
+*/
+
+// *************************************************************
+//             asin/acos template, double precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// BTYPE:  boolean vector type
+// AC: 0 = asin, 1 = acos
+// Parameters:
+// x = input x. Elements with abs(x) > 1 give NAN (from the sqrt in the big branch)
+template<class VTYPE, class BTYPE, int AC> 
+static inline VTYPE asin_d(VTYPE const & x) {
+
+    // define constants: R/S coefficients serve abs(x) >= 0.625, P/Q serve abs(x) < 0.625
+    const double R4asin =  2.967721961301243206100E-3;
+    const double R3asin = -5.634242780008963776856E-1;
+    const double R2asin =  6.968710824104713396794E0;
+    const double R1asin = -2.556901049652824852289E1;
+    const double R0asin =  2.853665548261061424989E1;
+
+    const double S3asin = -2.194779531642920639778E1;
+    const double S2asin =  1.470656354026814941758E2;
+    const double S1asin = -3.838770957603691357202E2;
+    const double S0asin =  3.424398657913078477438E2;
+
+    const double P5asin =  4.253011369004428248960E-3;
+    const double P4asin = -6.019598008014123785661E-1;
+    const double P3asin =  5.444622390564711410273E0;
+    const double P2asin = -1.626247967210700244449E1;
+    const double P1asin =  1.956261983317594739197E1;
+    const double P0asin = -8.198089802484824371615E0;
+
+    const double Q4asin = -1.474091372988853791896E1;
+    const double Q3asin =  7.049610280856842141659E1;
+    const double Q2asin = -1.471791292232726029859E2;
+    const double Q1asin =  1.395105614657485689735E2;
+    const double Q0asin = -4.918853881490881290097E1;
+
+    VTYPE xa, xb, x1, x2, x3, x4, x5, px, qx, rx, sx, vx, wx, y1, yb, z, z1, z2;
+    BTYPE big;
+    bool dobig, dosmall;
+
+    xa  = abs(x);
+    big = xa >= 0.625;
+
+    /*
+    Outline of the scalar algorithm that the vector code below follows.
+    Small: xa < 0.625
+    ------------------
+    x = xa * xa;
+    px = PX(x);
+    qx = QX(x);
+    y1 = x*px/qx;    
+    y1 = xa * y1 + xa;
+
+    Big: xa >= 0.625
+    ------------------
+    x = 1.0 - xa;
+    rx = RX(x);
+    sx = SX(x);
+    y1 = x * rx/sx;
+    x3 = sqrt(x+x);
+    y3 = x3 * y1 - MOREBITS;   (no MOREBITS term appears in the code below)
+    z = pi/2 - x3 - y3
+    */
+
+    // select a common x for all polynomials
+    // This allows sharing of powers of x through common subexpression elimination
+    x1 = select(big, 1.0 - xa, xa * xa); 
+
+    // calculate powers of x1 outside branches to make sure they are only calculated once
+    x2 = x1 * x1;
+    x4 = x2 * x2;
+    x5 = x4 * x1;
+    x3 = x2 * x1;
+
+    dosmall = !horizontal_and(big);   // at least one element is small
+    dobig   =  horizontal_or(big) ;   // at least one element is big
+
+    // calculate polynomials (reuse powers of x)
+    if (dosmall) {
+        // px = polynomial_5 (x1, P0asin, P1asin, P2asin, P3asin, P4asin, P5asin);
+        // qx = polynomial_5n(x1, Q0asin, Q1asin, Q2asin, Q3asin, Q4asin);
+        px = mul_add(x3,P3asin,P0asin) + mul_add(x4,P4asin,x1*P1asin) + mul_add(x5,P5asin,x2*P2asin);
+        qx = mul_add(x4,Q4asin,x5) + mul_add(x3,Q3asin,x1*Q1asin) + mul_add(x2,Q2asin,Q0asin);
+    }
+    if (dobig) {
+        // rx = polynomial_4 (x1, R0asin, R1asin, R2asin, R3asin, R4asin);
+        // sx = polynomial_4n(x1, S0asin, S1asin, S2asin, S3asin);
+        rx = mul_add(x3,R3asin,x2*R2asin) + mul_add(x4,R4asin,mul_add(x1,R1asin,R0asin));
+        sx = mul_add(x3,S3asin,x4) + mul_add(x2,S2asin,mul_add(x1,S1asin,S0asin));
+    }
+
+    // select and divide outside branches to avoid dividing twice
+    vx = select(big, rx, px);                    // big lanes take rx, the others px
+    wx = select(big, sx, qx);                    // big lanes take sx, the others qx
+    y1 = vx/wx * x1;
+
+    // results for big
+    if (dobig) {                                 // avoid square root if all are small
+        xb = sqrt(x1+x1);                        // this produces NAN if xa > 1 so we don't need a special case for xa > 1
+        z1 = mul_add(xb, y1, xb);                // yb = xb * y1; z1 = xb + yb;
+    }
+
+    // results for small        
+    z2 = mul_add(xa, y1, xa);                    // z2 = xa * y1 + xa;
+
+    // correct for sign
+    if (AC) {  // acos
+        z1 = select(x < 0., VM_PI - z1, z1);
+        z2 = VM_PI_2 - sign_combine(z2, x);
+        z = select(big, z1, z2);                 // z1 is only used in lanes where big is set
+    }
+    else {     // asin
+        z1 = VM_PI_2 - z1;
+        z = select(big, z1, z2);                 // z1 is only used in lanes where big is set
+        z = sign_combine(z, x);
+    }
+    return z;
+}
+
+// Instantiations of the asin_d template. The last template argument selects
+// the function: 0 = asin, 1 = acos.
+static inline Vec2d asin(Vec2d const & x) {
+    return asin_d<Vec2d, Vec2db, 0>(x);
+}
+
+static inline Vec2d acos(Vec2d const & x) {
+    return asin_d<Vec2d, Vec2db, 1>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d asin(Vec4d const & x) { 
+    return asin_d<Vec4d, Vec4db, 0>(x);
+}
+
+static inline Vec4d acos(Vec4d const & x) { 
+    return asin_d<Vec4d, Vec4db, 1>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d asin(Vec8d const & x) { 
+    return asin_d<Vec8d, Vec8db, 0>(x);
+}
+
+static inline Vec8d acos(Vec8d const & x) { 
+    return asin_d<Vec8d, Vec8db, 1>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// *************************************************************
+//             asin/acos template, single precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// BTYPE:  boolean vector type
+// AC: 0 = asin, 1 = acos
+// Parameters:
+// x = input x
+template<class VTYPE, class BTYPE, int AC> 
+static inline VTYPE asin_f(VTYPE const & x) {
+
+    // define constants (coefficients of the polynomial evaluated below)
+    const float P4asinf = 4.2163199048E-2f;
+    const float P3asinf = 2.4181311049E-2f;
+    const float P2asinf = 4.5470025998E-2f;
+    const float P1asinf = 7.4953002686E-2f;
+    const float P0asinf = 1.6666752422E-1f;
+
+    VTYPE xa, x1, x2, x3, x4, xb, z, z1, z2;
+    BTYPE big;
+
+    xa  = abs(x);
+    big = xa > 0.5f;                             // lanes that use the sqrt based path
+
+    x1 = 0.5f * (1.0f - xa);
+    x2 = xa * xa;        
+    x3 = select(big, x1, x2);                    // polynomial argument, chosen per lane
+
+    //if (horizontal_or(big))                    // test disabled: the sqrt is always evaluated
+    {
+        xb = sqrt(x1);
+    }
+    x4 = select(big, xb, xa);
+
+    z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
+    z = mul_add(z, x3*x4, x4);                   // z = z * (x3*x4) + x4;
+    z1 = z + z;
+
+    // correct for sign
+    if (AC) {  // acos
+        z1 = select(x < 0., float(VM_PI) - z1, z1);
+        z2 = float(VM_PI_2) - sign_combine(z, x);
+        z  = select(big, z1, z2);                // big lanes take z1, the others z2
+    }
+    else {     // asin
+        z1 = float(VM_PI_2) - z1;
+        z  = select(big, z1, z);                 // big lanes take z1, the others keep z
+        z  = sign_combine(z, x);
+    }
+
+    return z;
+}
+
+// Instantiations of the asin_f template. The last template argument selects
+// the function: 0 = asin, 1 = acos.
+static inline Vec4f asin(Vec4f const & x) {
+    return asin_f<Vec4f, Vec4fb, 0>(x);
+}
+
+static inline Vec4f acos(Vec4f const & x) {
+    return asin_f<Vec4f, Vec4fb, 1>(x);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f asin(Vec8f const & x) { 
+    return asin_f<Vec8f, Vec8fb, 0>(x);
+}
+static inline Vec8f acos(Vec8f const & x) { 
+    return asin_f<Vec8f, Vec8fb, 1>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f asin(Vec16f const & x) { 
+    return asin_f<Vec16f, Vec16fb, 0>(x);
+}
+static inline Vec16f acos(Vec16f const & x) { 
+    return asin_f<Vec16f, Vec16fb, 1>(x);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+// *************************************************************
+//             atan template, double precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// BTYPE:  boolean vector type
+// T2:     0 = atan, 1 = atan2
+// Parameters:
+// y, x. calculate atan(y/x) for T2 = 1, atan(y) for T2 = 0 (x is then not read)
+// result is between -pi/2 and +pi/2 when x > 0
+// result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
+// atan2(0,0) gives 0: the NAN from 0/0 is replaced before returning
+template<class VTYPE, class BTYPE, int T2> 
+static inline VTYPE atan_d(VTYPE const & y, VTYPE const & x) {
+
+    // define constants
+    //const double ONEOPIO4 = 4./VM_PI;
+    const double MOREBITS = 6.123233995736765886130E-17;
+    const double MOREBITSO2 = MOREBITS * 0.5;
+    const double T3PO8 = VM_SQRT2 + 1.; // 2.41421356237309504880;
+
+	const double P4atan = -8.750608600031904122785E-1;
+	const double P3atan = -1.615753718733365076637E1;
+	const double P2atan = -7.500855792314704667340E1;
+	const double P1atan = -1.228866684490136173410E2;
+	const double P0atan = -6.485021904942025371773E1;
+
+	const double Q4atan = 2.485846490142306297962E1;
+	const double Q3atan = 1.650270098316988542046E2;
+	const double Q2atan = 4.328810604912902668951E2;
+	const double Q1atan = 4.853903996359136964868E2;
+	const double Q0atan = 1.945506571482613964425E2;
+
+    VTYPE t, x1, x2, y1, y2, s, fac, a, b, z, zz, px, qx, re;  // data vectors
+    BTYPE swapxy, notbig, notsmal;                             // boolean vectors
+
+    if (T2) {  // atan2(y,x)
+        // move in first octant
+        x1 = abs(x);
+        y1 = abs(y);
+        swapxy = (y1 > x1);
+        // swap x and y if y1 > x1
+        x2 = select(swapxy, y1, x1);
+        y2 = select(swapxy, x1, y1);        
+        t  = y2 / x2;                  // x = y = 0 gives NAN here
+    }
+    else {    // atan(y)
+        t = abs(y);
+    }
+
+    // small:  t < 0.66
+    // medium: 0.66 <= t <= 2.4142 (1+sqrt(2))
+    // big:    t > 2.4142
+    notbig  = t <= T3PO8;  // t <= 2.4142
+    notsmal = t >= 0.66;   // t >= 0.66
+
+    // s is the offset added to the result: 0 (small), pi/4 (medium), pi/2 (big)
+    s = select(notbig, VTYPE(VM_PI_4), VTYPE(VM_PI_2));
+    s = notsmal & s;                   // select(notsmal, s, 0.);
+    fac = select(notbig, VTYPE(MOREBITSO2), VTYPE(MOREBITS));
+    fac = notsmal & fac;  //select(notsmal, fac, 0.);
+
+    // small:  z = t / 1.0;
+    // medium: z = (t-1.0) / (t+1.0);
+    // big:    z = -1.0 / t;
+    a = notbig & t;   // select(notbig, t, 0.);
+    a = if_add(notsmal, a, -1.);
+    b = notbig & VTYPE(1.); //  select(notbig, 1., 0.);
+    b = if_add(notsmal, b, t);
+    z = a / b;      // division by 0 will not occur unless x and y are both 0
+
+    zz = z * z;
+
+    px = polynomial_4 (zz, P0atan, P1atan, P2atan, P3atan, P4atan);
+    qx = polynomial_5n(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);
+
+    re = mul_add(px / qx, z * zz, z);            // re = (px / qx) * (z * zz) + z;
+    re += s + fac;
+
+    if (T2) {  // atan2(y,x)
+        // move back in place
+        re = select(swapxy, VM_PI_2 - re, re);
+        re = select(x < 0., VM_PI   - re, re);
+        re = select((x | y) == 0., 0., re);      // atan2(0,0) = 0 by convention
+    }
+    // get sign bit
+    re = sign_combine(re, y);
+
+    return re;
+}
+
+// Instantiations of the atan_d template. The last template argument selects
+// the function: 1 = atan2(y,x), 0 = atan(y), which passes a placeholder x of 0.
+static inline Vec2d atan2(Vec2d const & y, Vec2d const & x) {
+    return atan_d<Vec2d, Vec2db, 1>(y, x);
+}
+
+static inline Vec2d atan(Vec2d const & y) {
+    return atan_d<Vec2d, Vec2db, 0>(y, 0.);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec4d atan2(Vec4d const & y, Vec4d const & x) {
+    return atan_d<Vec4d, Vec4db, 1>(y, x);
+}
+
+static inline Vec4d atan(Vec4d const & y) {
+    return atan_d<Vec4d, Vec4db, 0>(y, 0.);
+}
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec8d atan2(Vec8d const & y, Vec8d const & x) {
+    return atan_d<Vec8d, Vec8db, 1>(y, x);
+}
+
+static inline Vec8d atan(Vec8d const & y) {
+    return atan_d<Vec8d, Vec8db, 0>(y, 0.);
+}
+#endif // MAX_VECTOR_SIZE >= 512
+
+
+
+// *************************************************************
+//             atan template, single precision
+// *************************************************************
+// Template parameters:
+// VTYPE:  f.p. vector type
+// BTYPE:  boolean vector type
+// T2:     0 = atan, 1 = atan2
+// Parameters:
+// y, x. calculate atan(y/x) for T2 = 1, atan(y) for T2 = 0 (x is then not read)
+// result is between -pi/2 and +pi/2 when x > 0
+// result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
+// atan2(0,0) gives 0: the NAN from 0/0 is replaced before returning
+template<class VTYPE, class BTYPE, int T2> 
+static inline VTYPE atan_f(VTYPE const & y, VTYPE const & x) {
+
+    // define constants
+    const float P3atanf =  8.05374449538E-2f;
+    const float P2atanf = -1.38776856032E-1f;
+    const float P1atanf =  1.99777106478E-1f;
+    const float P0atanf = -3.33329491539E-1f;
+
+    VTYPE t, x1, x2, y1, y2, s, a, b, z, zz, re;   // data vectors
+    BTYPE swapxy, notbig, notsmal;                 // boolean vectors
+
+    if (T2) {  // atan2(y,x)
+        // move in first octant
+        x1 = abs(x);
+        y1 = abs(y);
+        swapxy = (y1 > x1);
+        // swap x and y if y1 > x1
+        x2 = select(swapxy, y1, x1);
+        y2 = select(swapxy, x1, y1);
+
+        // x = y = 0 is not protected here: it produces NAN, which is replaced by 0 at the end
+        t  = y2 / x2;
+    }
+    else {    // atan(y)
+        t = abs(y);
+    }
+
+    // small:  t < 0.4142
+    // medium: 0.4142 <= t <= 2.4142
+    // big:    t > 2.4142  (not for atan2)
+    if (!T2) {  // atan(y)
+        notsmal = t >= float(VM_SQRT2-1.);       // t >= tan  pi/8
+        notbig  = t <= float(VM_SQRT2+1.);       // t <= tan 3pi/8
+
+        s = select(notbig, VTYPE(float(VM_PI_4)), VTYPE(float(VM_PI_2)));
+        s = notsmal & s;      // select(notsmal, s, 0.);
+
+        // small:  z = t / 1.0;
+        // medium: z = (t-1.0) / (t+1.0);
+        // big:    z = -1.0 / t;
+        a = notbig & t; // select(notbig, t, 0.);
+        a = if_add(notsmal, a, -1.f);
+        b = notbig & VTYPE(1.f); //  select(notbig, 1., 0.);
+        b = if_add(notsmal, b, t);
+        z = a / b;      // division by 0 will not occur unless x and y are both 0
+    }
+    else {  // atan2(y,x)
+        // small:  z = t / 1.0;
+        // medium: z = (t-1.0) / (t+1.0);
+        notsmal = t >= float(VM_SQRT2-1.); 
+        a = if_add(notsmal, t, -1.f);
+        b = if_add(notsmal, 1.f, t);
+        s = notsmal & VTYPE(float(VM_PI_4));
+        z = a / b;
+    }
+
+    zz = z * z;
+
+    // Taylor expansion
+    re = polynomial_3(zz, P0atanf, P1atanf, P2atanf, P3atanf);
+    re = mul_add(re, zz * z, z) + s;
+
+    if (T2) {  // atan2(y,x)
+        // move back in place
+        re = select(swapxy, float(VM_PI_2) - re, re);
+        re = select(x < 0., float(VM_PI)   - re, re);
+        re = select((x | y) == 0.f, 0.f, re);    // atan2(0,0) = 0 by convention
+    }
+    // get sign bit
+    re = sign_combine(re, y);
+
+    return re;
+}
+
+// Instantiations of the atan_f template. The last template argument selects
+// the function: 1 = atan2(y,x), 0 = atan(y), which passes a placeholder x of 0.
+static inline Vec4f atan2(Vec4f const & y, Vec4f const & x) {
+    return atan_f<Vec4f, Vec4fb, 1>(y, x);
+}
+
+static inline Vec4f atan(Vec4f const & y) {
+    return atan_f<Vec4f, Vec4fb, 0>(y, 0.);
+}
+
+#if MAX_VECTOR_SIZE >= 256
+static inline Vec8f atan2(Vec8f const & y, Vec8f const & x) {
+    return atan_f<Vec8f, Vec8fb, 1>(y, x);
+}
+
+static inline Vec8f atan(Vec8f const & y) {
+    return atan_f<Vec8f, Vec8fb, 0>(y, 0.);
+}
+
+#endif // MAX_VECTOR_SIZE >= 256
+
+#if MAX_VECTOR_SIZE >= 512
+static inline Vec16f atan2(Vec16f const & y, Vec16f const & x) {
+    return atan_f<Vec16f, Vec16fb, 1>(y, x);
+}
+
+static inline Vec16f atan(Vec16f const & y) {
+    return atan_f<Vec16f, Vec16fb, 0>(y, 0.);
+}
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+#endif
diff --git a/whtest/CMakeLists.txt b/whtest/CMakeLists.txt
new file mode 100644
index 0000000..178300f
--- /dev/null
+++ b/whtest/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Build script for the whtest library target.
+include_directories("${PROJECT_BINARY_DIR}")
+
+add_library(whtest 
+eigen.c  
+eigen_sym.c  
+random.c  
+weisslambda.c  
+weisslambda_sub.c  
+whtest.c  
+whtest_sub.c)
+# The install rules below are commented out, so nothing is installed for this target.
+#install (TARGETS whtest DESTINATION bin)
+#install (FILES  eigen.h  eigen_sym.h  random.h  tools.h  weisslambda_sub.h  whtest.h  whtest_sub.h DESTINATION include)
\ No newline at end of file
diff --git a/whtest/eigen.c b/whtest/eigen.c
new file mode 100644
index 0000000..d714aa2
--- /dev/null
+++ b/whtest/eigen.c
@@ -0,0 +1,926 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, BUI Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+#include "eigen.h"
+
+
+/* Almost everything below is taken from Yang's Paml package */
+/* Thanks to Ziheng Yang */
+
+int abyx (double a, double x[], int n);          /* x[i] *= a for i < n */
+int xtoy (double x[], double y[], int n);        /* y[i] = x[i] for i < n */
+
+/**
+	matrix multiplication, return C = A*B
+*/
+void matAbyBisC (double A[], double B[], int n, double C[]);
+
+int matinv( double x[], int n, int m, double space[]);
+int eigen(int job, double A[], int n, double rr[], double ri[],
+          double vr[], double vi[], double w[]);
+void balance(double mat[], int n, int *low, int *hi, double scale[]);
+void unbalance(int n, double vr[], double vi[], int low, int hi,
+               double scale[]);
+int realeig(int job, double mat[], int n,int low, int hi, double valr[],
+            double vali[], double vr[], double vi[]);
+void elemhess(int job, double mat[], int n, int low, int hi, 
+            double vr[], double vi[], int work[]);
+
+typedef struct { double re, im; } complex;       /* real and imaginary part */
+#define csize(a) (fabs(a.re)+fabs(a.im))         /* |re| + |im| of a complex value */
+
+complex compl (double re,double im);
+/*complex conj (complex a);*/
+complex cplus (complex a, complex b);
+complex cminus (complex a, complex b);
+complex cby (complex a, complex b);
+complex cdiv (complex a,complex b);
+/*complex cexp (complex a);*/
+complex cfactor (complex x, double a);
+int cxtoy (complex x[], complex y[], int n);
+int cmatby (complex a[], complex b[], complex c[], int n,int m,int k);
+int cmatout (FILE * fout, complex x[], int n, int m);
+int cmatinv( complex x[], int n, int m, double space[]);
+   
+int abyx (double a, double x[], int n)
+{ int idx = 0; while (idx < n) { x[idx] *= a; idx++; }  return 0; }  /* scale x[0..n-1] by a */
+int xtoy (double x[], double y[], int n)
+{ int idx = 0; while (idx < n) { y[idx] = x[idx]; idx++; }  return 0; }  /* copy x[0..n-1] into y */
+
+void matAbyBisC (double A[], double B[], int n, double C[])
+{
+	/* C = A*B for n x n matrices stored row by row in flat arrays. */
+	int row, col, mid, cell;
+
+	/* Clear every entry of C before any product term is added. */
+	for (cell = 0; cell < n*n; cell++) C[cell] = 0;
+
+	for (row = 0; row < n; row++) {
+		double *a_row = A + row*n;
+		double *c_row = C + row*n;
+
+		for (col = 0; col < n; col++)
+			for (mid = 0; mid < n; mid++)
+				c_row[col] += a_row[mid] * B[mid*n+col];
+	}
+}
+
+int matinv( double x[], int n, int m, double space[])
+{
+/* In-place Gauss-Jordan inversion with row pivoting of x[n*m], m>=n (row-major,
+   row stride m).  The leading n x n block is replaced by its inverse and the
+   extra columns n..m-1 receive the same row operations.  space must offer
+   room for n ints.  Returns 0 on success, -1 if a pivot is smaller than ee. */
+   int i,j,k;
+   int *irow=(int*) space;        /* irow[i]: row chosen as pivot at step i */
+   double ee=1.0e-20, t,t1,xmax;
+   for (i=0; i<n; i++)  {
+      xmax = 0.;
+      for (j=i; j<n; j++) {
+         if (xmax < fabs(x[j*m+i]))  {
+            xmax = fabs( x[j*m+i] );
+            irow[i] = j;
+         }
+      }
+      /* xmax is now the largest magnitude in column i, rows i..n-1 */
+      if (xmax < ee)   {
+         printf("\nDet becomes zero at %3d!\t\n", i+1);
+         return(-1);
+      }
+      if (irow[i] != i) {         /* move the pivot row into position i */
+         for (j=0; j<m; j++) {
+            t = x[i*m+j];
+            x[i*m+j] = x[irow[i] * m + j];
+            x[ irow[i] * m + j] = t;
+         }
+      }
+      t = 1./x[i*m+i];
+      for (j=0; j<n; j++) {
+         if (j == i) continue;
+         t1 = t*x[j*m+i];
+         for (k=0; k<m; k++)  x[j*m+k] -= t1*x[i*m+k];   /* all m columns, not only the first n */
+         x[j*m+i] = -t1;
+      }
+      for (j=0; j<m; j++)   x[i*m+j] *= t;
+      x[i*m+i] = t;
+   }                            /* i  */
+   for (i=n-1; i>=0; i--) {     /* undo the row exchanges as column exchanges */
+      if (irow[i] == i) continue;
+      for (j=0; j<n; j++)  {
+         t = x[j*m+i];
+         x[j*m+i] = x[ j*m + irow[i] ];
+         x[ j*m + irow[i] ] = t;
+      }
+   }
+   return (0);
+}
+
+/***********************************************************
+*  This eigen() works for eigenvalue/vector analysis
+*         for real general square matrix A
+*         A will be destroyed
+*         rr,ri are vectors containing eigenvalues
+*         vr,vi are matrices containing (right) eigenvectors
+*
+*              A*[vr+vi*i] = [vr+vi*i] * diag{rr+ri*i}
+*
+*  Algorithm: Handbook for Automatic Computation, vol 2
+*             by Wilkinson and Reinsch, 1971
+*             most of source codes were taken from a public domain
+*             software called MATCALC.
+*  Credits:   to the authors of MATCALC
+*
+*  return     -1 not converged
+*              0 no complex eigenvalues/vectors
+*              1 complex eigenvalues/vectors
+*  Tianlin Wang at University of Illinois
+*  Thu May  6 15:22:31 CDT 1993
+***************************************************************/
+
+#define FOR(i,n) for(i=0; i<n; i++)      /* loop header: i runs from 0 to n-1 */
+#define FPN(file) fputc('\n', file)      /* write one newline to file */
+#define min(a,b) ((a)<(b)?(a):(b))       /* the selected argument is evaluated twice */
+#define max(a,b) ((a)>(b)?(a):(b))       /* the selected argument is evaluated twice */
+
+#define BASE        2    /* base of floating point arithmetic */
+#define DIGITS     53    /* no. of digits to the base BASE in the fraction */
+#define MAXITER    30    /* max. no. of iterations to converge */
+
+#define pos(i,j,n)      ((i)*(n)+(j))    /* flat index of element (i,j) with row length n */
+
+int eigen(int job, double A[], int n, double rr[], double ri[], 
+          double vr[], double vi[], double work[])
+{
+/*  work[n*2]: balance() and unbalance() use work as their scale[] array;
+    the storage starting at work+n is handed to elemhess() as int work[]. */
+    int low, hi, cur, scan, row, best, has_complex = 0;
+    double held, tiny = sqrt(pow((double)BASE,(double)(1-DIGITS)));
+
+    balance(A, n, &low, &hi, work);
+    elemhess(job, A, n, low, hi, vr, vi, (int*)(work+n));
+    if (realeig(job, A, n, low, hi, rr, ri, vr, vi) == -1) return -1;
+    if (job != 0) unbalance(n, vr, vi, low, hi, work);
+
+/* selection sort (added by Z. Yang): largest rr first; ri and the vr/vi columns follow */
+    for (cur = 0; cur < n; cur++) {
+        best = cur;
+        for (scan = cur+1; scan < n; scan++)
+            if (rr[best] < rr[scan]) best = scan;
+        held = rr[best];  rr[best] = rr[cur];  rr[cur] = held;
+        held = ri[best];  ri[best] = ri[cur];  ri[cur] = held;
+        for (row = 0; row < n; row++) {
+            held = vr[row*n+best];  vr[row*n+best] = vr[row*n+cur];  vr[row*n+cur] = held;
+            held = vi[row*n+best];  vi[row*n+best] = vi[row*n+cur];  vi[row*n+cur] = held;
+        }
+        if (fabs(ri[cur]) > tiny) has_complex = 1;
+    }
+    return has_complex;
+}
+
+/* complex functions
+*/
+
+complex compl (double re,double im)
+{
+    /* Assemble a complex value from its real and imaginary parts. */
+    complex value;
+
+    value.im = im;
+    value.re = re;
+    return value;
+}
+
+/*complex conj (complex a)
+{
+    a.im = -a.im;
+    return(a);
+}*/
+
+#define csize(a) (fabs(a.re)+fabs(a.im))   /* |re| + |im|; repeats the definition made before the prototypes */
+
+complex cplus (complex a, complex b)
+{
+   /* a + b, component by component; the by-value copy of a is the accumulator */
+   a.re += b.re;
+   a.im += b.im;
+   return a;
+}
+
+complex cminus (complex a, complex b)
+{
+   /* a - b, component by component; the by-value copy of a is the accumulator */
+   a.re -= b.re;
+   a.im -= b.im;
+   return a;
+}
+
+complex cby (complex a, complex b)
+{
+   complex prod;   /* (a.re + i*a.im) * (b.re + i*b.im) */
+   prod.im = a.re*b.im+a.im*b.re;
+   prod.re = a.re*b.re-a.im*b.im;
+   return prod;
+}
+
+complex cdiv (complex a,complex b)
+{
+    /* Quotient a/b.  The components of b are combined through their ratio
+       (smaller over larger), so b.re*b.re + b.im*b.im is never formed. */
+    complex quot;
+    double r, scale;
+    if (fabs(b.re) <= fabs(b.im)) {          /* divide through by b.im */
+        r = b.re / b.im;
+        scale = b.im * (1 + r * r);
+        quot.re = (a.re * r + a.im) / scale;
+        quot.im = (a.im * r - a.re) / scale;
+        return quot;
+    }
+    r = b.im / b.re;                         /* divide through by b.re */
+    scale = b.re * (1 + r * r);
+    quot.re = (a.re + a.im * r) / scale;
+    quot.im = (a.im - a.re * r) / scale;
+    return quot;
+}
+
+/*complex cexp (complex a)
+{
+   complex c;
+   c.re = exp(a.re);
+   if (fabs(a.im)==0) c.im = 0; 
+   else  { c.im = c.re*sin(a.im); c.re*=cos(a.im); }
+   return (c);
+}*/
+
+complex cfactor (complex x, double a)
+{
+   /* multiply both components of x by the real factor a */
+   x.re = a*x.re;
+   x.im = a*x.im;
+   return x;
+}
+
+int cxtoy (complex x[], complex y[], int n)
+{
+   int idx = 0;                      /* copy x[0..n-1] into y, front to back */
+   while (idx < n) { y[idx] = x[idx]; idx++; }
+   return 0;
+}
+
+int cmatby (complex a[], complex b[], complex c[], int n,int m,int k)
+/* c = a*b for row-major complex matrices a[n*m], b[m*k], c[n*k];
+   always returns 0 */
+{
+   int row, col, mid;
+
+   for (row = 0; row < n; row++)  for (col = 0; col < k; col++) {
+       complex sum = compl(0,0);
+       for (mid = 0; mid < m; mid++)
+           sum = cplus(sum, cby(a[row*m+mid], b[mid*k+col]));
+       c[row*k+col] = sum;
+   }
+   return 0;
+}
+
+int cmatout (FILE * fout, complex x[], int n, int m)
+{
+   int row, col;   /* output: one newline, then one text line per matrix row */
+   for (fputc('\n', fout), row = 0; row < n; fputc('\n', fout), row++)
+      for (col = 0; col < m; col++) fprintf(fout, "%7.3f%7.3f  ", x[row*m+col].re, x[row*m+col].im);
+   return 0;
+}
+
+int cmatinv( complex x[], int n, int m, double space[])
+{
+/* In-place Gauss-Jordan inversion of a complex matrix x[n*m], m>=n, stored
+   row-major with row stride m.  Returns 0, or -1 if the best pivot is below ee. */
+   int step, row, col, *pivot=(int*) space;   /* pivot[step]: row picked at that step */
+   double best, ee=1e-20;                     /* ee: smallest accepted pivot size */
+   complex inv, mult, tmp;
+
+   for (step = 0; step < n; step++) {
+       /* pivot search: largest |re|+|im| in column step, rows step..n-1 */
+       best = 0.;
+       for (row = step; row < n; row++) {
+          if (best < csize (x[row*m+step])) {
+               best = csize (x[row*m+step]);
+               pivot[step] = row;
+          }
+       }
+       if (best < ee) {
+           printf("\nDet goes to zero at %8d!\t\n", step+1);
+           return -1;
+       }
+       if (pivot[step] != step) {             /* bring the pivot row into place */
+           for (col = 0; col < m; col++) {
+                tmp = x[step*m+col];
+                x[step*m+col] = x[pivot[step]*m+col];
+                x[pivot[step]*m+col] = tmp;
+           }
+       }
+       inv = cdiv (compl(1,0), x[step*m+step]);
+       for (row = 0; row < n; row++) {
+           if (row == step) continue;
+           mult = cby (inv, x[row*m+step]);
+           for (col = 0; col < m; col++)  x[row*m+col] = cminus (x[row*m+col], cby(mult,x[step*m+col]));
+           x[row*m+step] = cfactor (mult, -1);
+       }
+       for (col = 0; col < m; col++)  x[step*m+col] = cby (x[step*m+col], inv);
+       x[step*m+step] = inv;
+   }
+   for (step = n-1; step >= 0; step--) {      /* undo the row swaps as column swaps */
+        if (pivot[step] == step) continue;
+        for (row = 0; row < n; row++) {
+           tmp = x[row*m+step];
+           x[row*m+step] = x[row*m+pivot[step]];
+           x[row*m+pivot[step]] = tmp;
+        }
+   }
+   return 0;
+}
+
+
+void balance(double mat[], int n,int *low, int *hi, double scale[])
+{
+/* Balance mat (n x n, indexed through pos()) in place before the eigenvalue
+   computation.  *low, *hi and scale[] are outputs. */
+    double c,f,g,r,s;
+    int i,j,k,l,done;
+        /* search for rows isolating an eigenvalue and push them down */
+    for (k = n - 1; k >= 0; k--) {
+        for (j = k; j >= 0; j--) {
+            for (i = 0; i <= k; i++) {
+                if (i != j && fabs(mat[pos(j,i,n)]) != 0) break;
+            }
+
+            if (i > k) {                 /* row j has no nonzero off-diagonal entry in columns 0..k */
+                scale[k] = j;            /* record which index was exchanged with k */
+
+                if (j != k) {
+                    for (i = 0; i <= k; i++) {      /* exchange columns j and k */
+                       c = mat[pos(i,j,n)];
+                       mat[pos(i,j,n)] = mat[pos(i,k,n)];
+                       mat[pos(i,k,n)] = c;
+                    }
+
+                    for (i = 0; i < n; i++) {       /* exchange rows j and k */
+                       c = mat[pos(j,i,n)];
+                       mat[pos(j,i,n)] = mat[pos(k,i,n)];
+                       mat[pos(k,i,n)] = c;
+                    }
+                }
+                break;
+            }
+        }
+        if (j < 0) break;                /* no isolating row was found for this k */
+    }
+
+    /* search for columns isolating an eigenvalue and push them left */
+
+    for (l = 0; l <= k; l++) {
+        for (j = l; j <= k; j++) {
+            for (i = l; i <= k; i++) {
+                if (i != j && fabs(mat[pos(i,j,n)]) != 0) break;
+            }
+            if (i > k) {                 /* column j has no nonzero off-diagonal entry in rows l..k */
+                scale[l] = j;            /* record which index was exchanged with l */
+                if (j != l) {
+                    for (i = 0; i <= k; i++) {      /* exchange columns j and l */
+                       c = mat[pos(i,j,n)];
+                       mat[pos(i,j,n)] = mat[pos(i,l,n)];
+                       mat[pos(i,l,n)] = c;
+                    }
+
+                    for (i = l; i < n; i++) {       /* exchange rows j and l */
+                       c = mat[pos(j,i,n)];
+                       mat[pos(j,i,n)] = mat[pos(l,i,n)];
+                       mat[pos(l,i,n)] = c;
+                    }
+                }
+
+                break;
+            }
+        }
+
+        if (j > k) break;                /* no isolating column was found for this l */
+    }
+
+    *hi = k;
+    *low = l;
+
+    /* balance the submatrix in rows l through k */
+
+    for (i = l; i <= k; i++) {
+        scale[i] = 1;                    /* inside l..k scale[] holds a factor, not an index */
+    }
+
+    do {
+        for (done = 1,i = l; i <= k; i++) {
+            for (c = 0,r = 0,j = l; j <= k; j++) {
+                if (j != i) {
+                    c += fabs(mat[pos(j,i,n)]);     /* off-diagonal sum of column i */
+                    r += fabs(mat[pos(i,j,n)]);     /* off-diagonal sum of row i */
+                }
+            }
+
+            if (c != 0 && r != 0) {
+                g = r / BASE;
+                f = 1;
+                s = c + r;
+
+                while (c < g) {          /* f stays a power of BASE */
+                    f *= BASE;
+                    c *= BASE * BASE;
+                }
+
+                g = r * BASE;
+
+                while (c >= g) {
+                    f /= BASE;
+                    c /= BASE * BASE;
+                }
+
+                if ((c + r) / f < 0.95 * s) {       /* apply f only if it lowers the sum below 95% of s */
+                    done = 0;
+                    g = 1 / f;
+                    scale[i] *= f;
+
+                    for (j = l; j < n; j++) {
+                        mat[pos(i,j,n)] *= g;       /* row i is multiplied by 1/f */
+                    }
+
+                    for (j = 0; j <= k; j++) {
+                        mat[pos(j,i,n)] *= f;       /* column i is multiplied by f */
+                    }
+                }
+            }
+        }
+    } while (!done);                     /* repeat until a full pass changes nothing */
+}
+
+
+/*
+ * Undo the balancing on the eigenvectors held in vr/vi (indexed through
+ * pos() with row length n), so that they belong to the original matrix.
+ * scale[] carries factors for rows low..hi and row indices elsewhere.
+ */
+void unbalance(int n,double vr[],double vi[], int low, int hi, double scale[])
+{
+    int row, col, other;
+    double held, *p, *q;
+
+    /* rows low..hi: multiply every entry by the row's scale factor */
+    for (row = low; row <= hi; row++)
+        for (col = 0; col < n; col++) {
+            vr[pos(row,col,n)] *= scale[row];
+            vi[pos(row,col,n)] *= scale[row];
+        }
+
+    /* rows low-1 down to 0: scale[row] names the row to exchange with
+       (no exchange when it names the row itself) */
+    for (row = low - 1; row >= 0; row--) {
+        other = (int)scale[row];
+        if (other == row) continue;
+        for (col = 0; col < n; col++) {
+            p = &vr[pos(row,col,n)];    q = &vr[pos(other,col,n)];
+            held = *p;  *p = *q;  *q = held;
+
+            p = &vi[pos(row,col,n)];    q = &vi[pos(other,col,n)];
+            held = *p;  *p = *q;  *q = held;
+        }
+    }
+
+    /* rows hi+1 up to n-1: same exchange rule, walked in ascending order */
+    for (row = hi + 1; row < n; row++) {
+        other = (int)scale[row];
+        if (other == row) continue;
+        for (col = 0; col < n; col++) {
+            p = &vr[pos(row,col,n)];    q = &vr[pos(other,col,n)];
+            held = *p;  *p = *q;  *q = held;
+
+            p = &vi[pos(row,col,n)];    q = &vi[pos(other,col,n)];
+            held = *p;  *p = *q;  *q = held;
+        }
+    }
+}
+
+/*
+ * Reduce the submatrix in rows and columns low through hi of real matrix mat to
+ * Hessenberg form by elementary similarity transformations
+ */
+void elemhess(int job,double mat[],int n,int low,int hi, double vr[],
+              double vi[], int work[])
+{
+/* work[n]: work[m] records the pivot row chosen while clearing column m-1 */
+    int i,j,m;
+    double x,y;
+
+    for (m = low + 1; m < hi; m++) {
+        for (x = 0,i = m,j = m; j <= hi; j++) { /* largest |entry| of column m-1 in rows m..hi */
+            if (fabs(mat[pos(j,m-1,n)]) > fabs(x)) {
+                x = mat[pos(j,m-1,n)];
+                i = j;
+            }
+        }
+        /* x is the pivot value, i its row; swap rows and columns i and m if they differ */
+        if ((work[m] = i) != m) {
+            for (j = m - 1; j < n; j++) {
+               y = mat[pos(i,j,n)];
+               mat[pos(i,j,n)] = mat[pos(m,j,n)];
+               mat[pos(m,j,n)] = y;
+            }
+
+            for (j = 0; j <= hi; j++) {
+               y = mat[pos(j,i,n)];
+               mat[pos(j,i,n)] = mat[pos(j,m,n)];
+               mat[pos(j,m,n)] = y;
+            }
+        }
+        /* eliminate the entries of column m-1 below row m (skipped when the pivot is 0) */
+        if (x != 0) {
+            for (i = m + 1; i <= hi; i++) {
+                if ((y = mat[pos(i,m-1,n)]) != 0) {
+                    y = mat[pos(i,m-1,n)] = y / x; /* multiplier is stored in the eliminated slot */
+
+                    for (j = m; j < n; j++) {
+                        mat[pos(i,j,n)] -= y * mat[pos(m,j,n)];
+                    }
+
+                    for (j = 0; j <= hi; j++) {
+                        mat[pos(j,m,n)] += y * mat[pos(j,i,n)];
+                    }
+                }
+            }
+        }
+    }
+    if (job) { /* vr starts as the identity and vi as all zeros */
+       for (i=0; i<n; i++) {
+          for (j=0; j<n; j++) {
+             vr[pos(i,j,n)] = 0.0; vi[pos(i,j,n)] = 0.0;
+          }
+          vr[pos(i,i,n)] = 1.0;
+       }
+       /* replay the steps in reverse order from the saved multipliers and pivot rows */
+       for (m = hi - 1; m > low; m--) {
+          for (i = m + 1; i <= hi; i++) {
+             vr[pos(i,m,n)] = mat[pos(i,m-1,n)];
+          }
+
+         if ((i = work[m]) != m) {
+            for (j = m; j <= hi; j++) {
+               vr[pos(m,j,n)] = vr[pos(i,j,n)];
+               vr[pos(i,j,n)] = 0.0;
+            }
+            vr[pos(i,m,n)] = 1.0;
+         }
+      }
+   }
+}
+
+/*
+ * Calculate eigenvalues and eigenvectors of a real upper Hessenberg matrix
+ * Eigenvalues go to valr[] (real part) and vali[] (imaginary part)
+ * Return 0 on success, -1 if a root is not isolated within MAXITER iterations
+ */
+int realeig(int job,double mat[],int n,int low, int hi, double valr[],
+      double vali[], double vr[],double vi[])
+{
+   complex v;
+   double p=0,q=0,r=0,s=0,t,w,x,y,z=0,ra,sa,norm,eps;
+   int niter,en,i,j,k,l,m;
+   double precision  = pow((double)BASE,(double)(1-DIGITS));
+
+   eps = precision;
+   for (i=0; i<n; i++) {
+      valr[i]=0.0;
+      vali[i]=0.0;
+   }
+      /* store isolated roots and calculate norm */
+   for (norm = 0,i = 0; i < n; i++) {
+      for (j = max(0,i-1); j < n; j++) {
+         norm += fabs(mat[pos(i,j,n)]);
+      }
+      if (i < low || i > hi) valr[i] = mat[pos(i,i,n)];
+   }
+   t = 0;
+   en = hi;
+
+   while (en >= low) {
+      niter = 0;                    /* iteration count restarts for every root */
+      for (;;) {
+
+       /* look for single small subdiagonal element */
+
+         for (l = en; l > low; l--) {
+            s = fabs(mat[pos(l-1,l-1,n)]) + fabs(mat[pos(l,l,n)]);
+            if (s == 0) s = norm;
+            if (fabs(mat[pos(l,l-1,n)]) <= eps * s) break;
+         }
+
+         /* form shift */
+
+         x = mat[pos(en,en,n)];
+
+         if (l == en) {             /* one root found */
+            valr[en] = x + t;
+            if (job) mat[pos(en,en,n)] = x + t;
+            en--;
+            break;
+         }
+
+         y = mat[pos(en-1,en-1,n)];
+         w = mat[pos(en,en-1,n)] * mat[pos(en-1,en,n)];
+
+         if (l == en - 1) {                /* two roots found */
+            p = (y - x) / 2;
+            q = p * p + w;
+            z = sqrt(fabs(q));
+            x += t;
+            if (job) {
+               mat[pos(en,en,n)] = x;
+               mat[pos(en-1,en-1,n)] = y + t;
+            }
+            if (q < 0) {                /* complex pair */
+               valr[en-1] = x+p;
+               vali[en-1] = z;
+               valr[en] = x+p;
+               vali[en] = -z;
+            }
+            else {                      /* real pair */
+               z = (p < 0) ? p - z : p + z;
+               valr[en-1] = x + z;
+               valr[en] = (z == 0) ? x + z : x - w / z;
+               if (job) {
+                  x = mat[pos(en,en-1,n)];
+                  s = fabs(x) + fabs(z);
+                  p = x / s;
+                  q = z / s;
+                  r = sqrt(p*p+q*q);
+                  p /= r;
+                  q /= r;
+                  for (j = en - 1; j < n; j++) {
+                     z = mat[pos(en-1,j,n)];
+                     mat[pos(en-1,j,n)] = q * z + p *
+                     mat[pos(en,j,n)];
+                     mat[pos(en,j,n)] = q * mat[pos(en,j,n)] - p*z;
+                  }
+                  for (i = 0; i <= en; i++) {
+                     z = mat[pos(i,en-1,n)];
+                     mat[pos(i,en-1,n)] = q * z + p * mat[pos(i,en,n)];
+                     mat[pos(i,en,n)] = q * mat[pos(i,en,n)] - p*z;
+                  }
+                  for (i = low; i <= hi; i++) {
+                     z = vr[pos(i,en-1,n)];
+                     vr[pos(i,en-1,n)] = q*z + p*vr[pos(i,en,n)];
+                     vr[pos(i,en,n)] = q*vr[pos(i,en,n)] - p*z;
+                  }
+               }
+            }
+            en -= 2;
+            break;
+         }
+         if (niter == MAXITER) return(-1);   /* no convergence for this root */
+         if (niter != 0 && niter % 10 == 0) {   /* every 10th iteration: ad hoc shift */
+            t += x;
+            for (i = low; i <= en; i++) mat[pos(i,i,n)] -= x;
+            s = fabs(mat[pos(en,en-1,n)]) + fabs(mat[pos(en-1,en-2,n)]);
+            x = y = 0.75 * s;
+            w = -0.4375 * s * s;
+         }
+         niter++;
+           /* look for two consecutive small subdiagonal elements */
+         for (m = en - 2; m >= l; m--) {
+            z = mat[pos(m,m,n)];
+            r = x - z;
+            s = y - z;
+            p = (r * s - w) / mat[pos(m+1,m,n)] + mat[pos(m,m+1,n)];
+            q = mat[pos(m+1,m+1,n)] - z - r - s;
+            r = mat[pos(m+2,m+1,n)];
+            s = fabs(p) + fabs(q) + fabs(r);
+            p /= s;
+            q /= s;
+            r /= s;
+            if (m == l || fabs(mat[pos(m,m-1,n)]) * (fabs(q)+fabs(r)) <=
+                eps * (fabs(mat[pos(m-1,m-1,n)]) + fabs(z) +
+                fabs(mat[pos(m+1,m+1,n)])) * fabs(p)) break;
+         }
+         for (i = m + 2; i <= en; i++) mat[pos(i,i-2,n)] = 0;
+         for (i = m + 3; i <= en; i++) mat[pos(i,i-3,n)] = 0;
+             /* double QR step involving rows l to en and columns m to en */
+         for (k = m; k < en; k++) {
+            if (k != m) {
+               p = mat[pos(k,k-1,n)];
+               q = mat[pos(k+1,k-1,n)];
+               r = (k == en - 1) ? 0 : mat[pos(k+2,k-1,n)];
+               if ((x = fabs(p) + fabs(q) + fabs(r)) == 0) continue;
+               p /= x;
+               q /= x;
+               r /= x;
+            }
+            s = sqrt(p*p+q*q+r*r);
+            if (p < 0) s = -s;
+            if (k != m) {
+               mat[pos(k,k-1,n)] = -s * x;
+            }
+            else if (l != m) {
+               mat[pos(k,k-1,n)] = -mat[pos(k,k-1,n)];
+            }
+            p += s;
+            x = p / s;
+            y = q / s;
+            z = r / s;
+            q /= p;
+            r /= p;
+                /* row modification */
+            for (j = k; j <= (!job ? en : n-1); j++){
+               p = mat[pos(k,j,n)] + q * mat[pos(k+1,j,n)];
+               if (k != en - 1) {
+                  p += r * mat[pos(k+2,j,n)];
+                  mat[pos(k+2,j,n)] -= p * z;
+               }
+               mat[pos(k+1,j,n)] -= p * y;
+               mat[pos(k,j,n)] -= p * x;
+            }
+            j = min(en,k+3);
+              /* column modification */
+            for (i = (!job ? l : 0); i <= j; i++) {
+               p = x * mat[pos(i,k,n)] + y * mat[pos(i,k+1,n)];
+               if (k != en - 1) {
+                  p += z * mat[pos(i,k+2,n)];
+                  mat[pos(i,k+2,n)] -= p*r;
+               }
+               mat[pos(i,k+1,n)] -= p*q;
+               mat[pos(i,k,n)] -= p;
+            }
+            if (job) {             /* accumulate transformations */
+               for (i = low; i <= hi; i++) {
+                  p = x * vr[pos(i,k,n)] + y * vr[pos(i,k+1,n)];
+                  if (k != en - 1) {
+                     p += z * vr[pos(i,k+2,n)];
+                     vr[pos(i,k+2,n)] -= p*r;
+                  }
+                  vr[pos(i,k+1,n)] -= p*q;
+                  vr[pos(i,k,n)] -= p;
+               }
+            }
+         }
+      }
+   }
+
+   if (!job) return(0);             /* eigenvalues only: vr and vi are not touched */
+   if (norm != 0) {
+       /* back substitute to find vectors of upper triangular form */
+      for (en = n-1; en >= 0; en--) {
+         p = valr[en];
+         if ((q = vali[en]) < 0) {            /* complex vector */
+            m = en - 1;
+            if (fabs(mat[pos(en,en-1,n)]) > fabs(mat[pos(en-1,en,n)])) {
+               mat[pos(en-1,en-1,n)] = q / mat[pos(en,en-1,n)];
+               mat[pos(en-1,en,n)] = (p - mat[pos(en,en,n)]) /
+                     mat[pos(en,en-1,n)];
+            }
+            else {
+               v = cdiv(compl(0.0,-mat[pos(en-1,en,n)]),
+                    compl(mat[pos(en-1,en-1,n)]-p,q));
+               mat[pos(en-1,en-1,n)] = v.re;
+               mat[pos(en-1,en,n)] = v.im;
+            }
+            mat[pos(en,en-1,n)] = 0;
+            mat[pos(en,en,n)] = 1;
+            for (i = en - 2; i >= 0; i--) {
+               w = mat[pos(i,i,n)] - p;
+               ra = 0;
+               sa = mat[pos(i,en,n)];
+               for (j = m; j < en; j++) {
+                  ra += mat[pos(i,j,n)] * mat[pos(j,en-1,n)];
+                  sa += mat[pos(i,j,n)] * mat[pos(j,en,n)];
+               }
+               if (vali[i] < 0) {
+                  z = w;
+                  r = ra;
+                  s = sa;
+               }
+               else {
+                  m = i;
+                  if (vali[i] == 0) {
+                     v = cdiv(compl(-ra,-sa),compl(w,q));
+                     mat[pos(i,en-1,n)] = v.re;
+                     mat[pos(i,en,n)] = v.im;
+                  }
+                  else {                      /* solve complex equations */
+                     x = mat[pos(i,i+1,n)];
+                     y = mat[pos(i+1,i,n)];
+                     v.re = (valr[i]- p)*(valr[i]-p) + vali[i]*vali[i] - q*q;
+                     v.im = (valr[i] - p)*2*q;
+                     if ((fabs(v.re) + fabs(v.im)) == 0) {
+                        v.re = eps * norm * (fabs(w) +
+                                fabs(q) + fabs(x) + fabs(y) + fabs(z));
+                     }
+                     v = cdiv(compl(x*r-z*ra+q*sa,x*s-z*sa-q*ra),v);
+                     mat[pos(i,en-1,n)] = v.re;
+                     mat[pos(i,en,n)] = v.im;
+                     if (fabs(x) > fabs(z) + fabs(q)) {
+                        mat[pos(i+1,en-1,n)] = 
+                             (-ra - w * mat[pos(i,en-1,n)] +
+                             q * mat[pos(i,en,n)]) / x;
+                        mat[pos(i+1,en,n)] = (-sa - w * mat[pos(i,en,n)] -
+                             q * mat[pos(i,en-1,n)]) / x;
+                     }
+                     else {
+                        v = cdiv(compl(-r-y*mat[pos(i,en-1,n)],
+                             -s-y*mat[pos(i,en,n)]),compl(z,q));
+                        mat[pos(i+1,en-1,n)] = v.re;
+                        mat[pos(i+1,en,n)] = v.im;
+                     }
+                  }
+               }
+            }
+         }
+         else if (q == 0) {                             /* real vector */
+            m = en;
+            mat[pos(en,en,n)] = 1;
+            for (i = en - 1; i >= 0; i--) {
+               w = mat[pos(i,i,n)] - p;
+               r = mat[pos(i,en,n)];
+               for (j = m; j < en; j++) {
+                  r += mat[pos(i,j,n)] * mat[pos(j,en,n)];
+               }
+               if (vali[i] < 0) {
+                  z = w;
+                  s = r;
+               }
+               else {
+                  m = i;
+                  if (vali[i] == 0) {
+                     if ((t = w) == 0) t = eps * norm;
+                     mat[pos(i,en,n)] = -r / t;
+                  }
+                  else {            /* solve real equations */
+                     x = mat[pos(i,i+1,n)];
+                     y = mat[pos(i+1,i,n)];
+                     q = (valr[i] - p) * (valr[i] - p) + vali[i]*vali[i];
+                     t = (x * s - z * r) / q;
+                     mat[pos(i,en,n)] = t;
+                     if (fabs(x) <= fabs(z)) {
+                        mat[pos(i+1,en,n)] = (-s - y * t) / z;
+                     }
+                     else {
+                        mat[pos(i+1,en,n)] = (-r - w * t) / x;
+                     }
+                  }
+               }
+            }
+         }
+      }
+             /* vectors of isolated roots */
+      for (i = 0; i < n; i++) {
+         if (i < low || i > hi) {
+            for (j = i; j < n; j++) {
+               vr[pos(i,j,n)] = mat[pos(i,j,n)];
+            }
+         }
+      }
+       /* multiply by transformation matrix */
+
+      for (j = n-1; j >= low; j--) {
+         m = min(j,hi);
+         for (i = low; i <= hi; i++) {
+            for (z = 0,k = low; k <= m; k++) {
+               z += vr[pos(i,k,n)] * mat[pos(k,j,n)];
+            }
+            vr[pos(i,j,n)] = z;
+         }
+      }
+   }
+    /* rearrange complex eigenvectors */
+   for (j = 0; j < n; j++) {
+      if (vali[j] != 0) {
+         for (i = 0; i < n; i++) {
+            vi[pos(i,j,n)] = vr[pos(i,j+1,n)];
+            vr[pos(i,j+1,n)] = vr[pos(i,j,n)];
+            vi[pos(i,j+1,n)] = -vi[pos(i,j,n)];
+         }
+         j++;                       /* column j+1 was handled together with column j */
+      }
+   }
+   return(0);
+}
diff --git a/whtest/eigen.h b/whtest/eigen.h
new file mode 100644
index 0000000..7dceac7
--- /dev/null
+++ b/whtest/eigen.h
@@ -0,0 +1,33 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, BUI Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+#ifndef EIGEN_H
+#define EIGEN_H
+
+int xtoy (double x[], double y[], int n);
+int matinv( double x[], int n, int m, double space[]);
+
+void matAbyBisC (double A[], double B[], int n, double C[]);
+
+int eigen(int job, double A[], int n, double rr[], double ri[],
+          double vr[], double vi[], double w[]);
+
+#endif 
diff --git a/whtest/eigen_sym.c b/whtest/eigen_sym.c
new file mode 100644
index 0000000..2a9ce74
--- /dev/null
+++ b/whtest/eigen_sym.c
@@ -0,0 +1,316 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, Bui Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include "eigen_sym.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "whtools.h"
+
+/* Keep only the states whose frequency exceeds ZERO: their frequencies go to
+   new_forg, and (only if a state was dropped) mat is compacted into new_mat */
+void eliminateZero(DMat20 mat, DVec20 forg, int num, 
+	double **new_mat, DVec20 new_forg, int *new_num) {
+	int row, col, kept = 0, kept_row = 0, kept_col;
+	for (row = 0; row < num; row++)
+		if (forg[row] > ZERO) new_forg[kept++] = forg[row];
+	*new_num = kept;
+	if (kept == num) return;	/* nothing dropped: new_mat is left as it was */
+	for (row = 0; row < num; row++) {
+		if (!(forg[row] > ZERO)) continue;
+		kept_col = 0;
+		for (col = 0; col < num; col++) {
+			if (forg[col] > ZERO) new_mat[kept_row][kept_col++] = mat[row][col];
+		}
+		kept_row++;
+	}
+}
+
+/* Multiply every below-diagonal a[r][c] by sqrt(stateFrq[r])/sqrt(stateFrq[c]) and mirror it to a[c][r] */
+void transformHMatrix(double **a, DVec20 stateFrq, DVec20 stateFrq_sqrt, int num_state) {
+	int r, c;
+	for (r = 0; r < num_state; r++)
+		stateFrq_sqrt[r] = sqrt(stateFrq[r]);
+	for (r = 1; r < num_state; r++)
+		for (c = 0; c < r; c++) {
+			a[r][c] *= (stateFrq_sqrt[r] / stateFrq_sqrt[c]);
+			a[c][r] = a[r][c];
+		}
+}
+
+void tred2(double **a, int n, double *d, double *e)
+{
+	int l,k,j,i;
+	double scale,hh,h,g,f;
+	/* Callers use this to make a tridiagonal; NOTE(review): looks like Numerical Recipes tred2 (Householder) - confirm */
+	for (i=n-1;i>0;i--) {
+		l=i-1;
+		h=scale=0.0;
+		if (l > 0) {
+			for (k=0;k<=l;k++)
+				scale += fabs(a[i][k]);	/* sum of |a[i][0..l]| */
+			if (scale == 0.0)	/* row i is all zero in columns 0..l */
+				e[i]=a[i][l];
+			else {
+				for (k=0;k<=l;k++) {
+					a[i][k] /= scale;	/* normalise row i by scale */
+					h += a[i][k]*a[i][k];	/* h accumulates the squared entries */
+				}
+				f=a[i][l];
+				g=(f >= 0.0 ? -sqrt(h) : sqrt(h));	/* g gets the sign opposite to f */
+				e[i]=scale*g;
+				h -= f*g;
+				a[i][l]=f-g;
+				f=0.0;
+				for (j=0;j<=l;j++) {
+					a[j][i]=a[i][j]/h;
+					g=0.0;
+					for (k=0;k<=j;k++)
+						g += a[j][k]*a[i][k];
+					for (k=j+1;k<=l;k++)
+						g += a[k][j]*a[i][k];
+					e[j]=g/h;
+					f += e[j]*a[i][j];
+				}
+				hh=f/(h+h);
+				for (j=0;j<=l;j++) {
+					f=a[i][j];
+					e[j]=g=e[j]-hh*f;
+					for (k=0;k<=j;k++)
+						a[j][k] -= (f*e[k]+g*a[i][k]);
+				}
+			}
+		} else
+			e[i]=a[i][l];
+		d[i]=h;	/* d[i] temporarily holds h (0.0 when row i was not reduced) */
+	}
+	d[0]=0.0;
+	e[0]=0.0;
+	/* Contents of this loop can be omitted if eigenvectors not
+			wanted except for statement d[i]=a[i][i]; */
+	for (i=0;i<n;i++) {
+		l=i;
+		if (d[i] != 0.0) {	/* only rows that were reduced above */
+			for (j=0;j<l;j++) {
+				g=0.0;
+				for (k=0;k<l;k++)
+					g += a[i][k]*a[k][j];
+				for (k=0;k<l;k++)
+					a[k][j] -= g*a[k][i];
+			}
+		}
+		d[i]=a[i][i];	/* final value of d[i]: the diagonal entry */
+		a[i][i]=1.0;
+		for (j=0;j<l;j++) a[j][i]=a[i][j]=0.0;
+	}
+}
+
+#define sqr(a) (((a)==0.0)? 0.0 : (a)*(a))
+/* sqrt(a*a + b*b), evaluated as big*sqrt(1 + (small/big)^2); 0.0 when both are zero */
+double pythag(double a, double b)
+{
+	double big = fabs(a), small = fabs(b), ratio;
+	if (!(big > small)) { ratio = big; big = small; small = ratio; }
+	if (big == 0.0) return 0.0;
+	ratio = small/big;
+	return big*sqrt(1.0+sqr(ratio));
+}
+
+
+#define NRANSI
+#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
+/* Iterates on d[] and e[] in place and rotates pairs of columns of z along the way */
+void tqli(double *d, double *e, int n, double **z)
+{
+	int m,l,iter,i,k;
+	double s,r,p,g,f,dd,c,b;
+	/* NOTE(review): matches the Numerical Recipes tqli (QL with implicit shifts) - confirm */
+	for (i=1;i<n;i++) e[i-1]=e[i];	/* shift e[] down by one index */
+	e[n-1]=0.0;
+	for (l=0;l<n;l++) {
+		iter=0;
+		do {
+			for (m=l;m<n-1;m++) {	/* find the first e[m] that is negligible next to its diagonal neighbours */
+				dd=fabs(d[m])+fabs(d[m+1]);
+				if ((double)(fabs(e[m])+dd) == dd) break;
+			}
+			if (m != l) {
+				if (iter++ == 30) {	/* iteration budget for this l is used up */
+					printf("ERROR: Too many iterations in tqli\n");
+					Finalize(1);	/* not defined in this file; presumably declared in whtools.h */
+				}
+				g=(d[l+1]-d[l])/(2.0*e[l]);
+				r=pythag(g,1.0);
+				g=d[m]-d[l]+e[l]/(g+SIGN(r,g));
+				s=c=1.0;
+				p=0.0;
+				for (i=m-1;i>=l;i--) {
+					f=s*e[i];
+					b=c*e[i];
+					e[i+1]=(r=pythag(f,g));
+					if (r == 0.0) {
+						d[i+1] -= p;
+						e[m]=0.0;
+						break;
+					}
+					s=f/r;
+					c=g/r;
+					g=d[i+1]-p;
+					r=(d[i]-g)*s+2.0*c*b;
+					d[i+1]=g+(p=s*r);
+					g=c*r-b;
+					for (k=0;k<n;k++) {	/* rotate columns i and i+1 of z */
+						f=z[k][i+1];
+						z[k][i+1]=s*z[k][i]+c*f;
+						z[k][i]=c*z[k][i]-s*f;
+					}
+				}
+				if (r == 0.0 && i >= l) continue;	/* the loop above ended through its break */
+				d[l] -= p;
+				e[l]=g;
+				e[m]=0.0;
+			}
+		} while (m != l);
+	}
+}
+#undef SIGN
+#undef NRANSI
+
+/* Eigenvalues of the n x n matrix mat (row-major, read only) are written to eval[0..n-1].
+   Returns 0 on success, 1 if the work buffers cannot be allocated. */
+int eigen_sym_core(double *mat, int n, double *eval) {
+	double **a;
+	double *off_diag;
+	int i, j, rows = 0, status = 1;
+
+	a = malloc(n * sizeof *a);
+	off_diag = malloc(n * sizeof *off_diag);	/* n doubles; was sized with sizeof(double*) */
+	if (a == NULL || off_diag == NULL) goto done;
+	for (; rows < n; rows++) {	/* rows counts the rows allocated so far */
+		a[rows] = malloc(n * sizeof **a);
+		if (a[rows] == NULL) goto done;
+		for (j = 0; j < n; j++)
+			a[rows][j] = mat[rows*n+j];
+	}
+	tred2(a, n, eval, off_diag);	/* make this matrix tridiagonal */
+	tqli(eval, off_diag, n, a);	/* compute eigenvalues and eigenvectors */
+	status = 0;
+done:
+	for (i = rows-1; i >= 0; i--)
+		free(a[i]);
+	free(a);
+	free(off_diag);
+	return status;
+}
+
+/* Eigen-decompose Pi_vec^(-1) * H_mat; parameters are documented in eigen_sym.h.
+   Returns 0 on success, 1 if memory runs out or the self-check below fails. */
+int eigen_sym(DMat20 H_mat, DVec20 Pi_vec, int num_state, 
+	DVec20 eval, DMat20 evec, DMat20 inv_evec) {
+	DVec20 forg, new_forg, forg_sqrt, off_diag, eval_new;
+	DMat20 b;
+	int i, j, k, error, new_num, inew, jnew;
+	double zero;
+	double **a;
+
+	/* allocate the work matrix, stopping at the first failed request */
+	a = malloc(num_state * sizeof *a);
+	error = (a == NULL);
+	for (i = 0; !error && i < num_state; i++)
+		error = ((a[i] = calloc(num_state, sizeof **a)) == NULL);
+	if (error) {
+		printf("ERROR: out of memory in eigen_sym\n");
+		while (--i >= 0) free(a[i]);	/* rows 0..i-1 were requested */
+		free(a);
+		return 1;
+	}
+
+	/* a = Pi_vec^(-1) * H_mat; b keeps an unmodified copy for the self-check */
+	for (i = 0; i < num_state; i++)
+		for (j = 0; j < num_state; j++) {
+			a[i][j] = H_mat[i][j] / Pi_vec[i];
+			b[i][j] = a[i][j];
+		}
+	for (i = 0; i < num_state; i++) forg[i] = Pi_vec[i];
+
+	eliminateZero(b, forg, num_state, a, new_forg, &new_num);
+	transformHMatrix(a, new_forg, forg_sqrt, new_num); 
+
+	/* make this matrix tridiagonal */
+	tred2(a, new_num, eval_new, off_diag);
+	/* compute eigenvalues and eigenvectors */
+	tqli(eval_new, off_diag, new_num, a);
+
+	/* now get back eigen */
+	for (i = num_state-1,inew = new_num-1; i >= 0; i--)
+		eval[i] = (forg[i] > ZERO) ? eval_new[inew--] : 0;
+
+	/* calculate the actual eigenvectors of H and its inverse matrix */
+	for (i = num_state-1,inew = new_num-1; i >= 0; i--)
+		if (forg[i] > ZERO) {
+			for (j = num_state-1, jnew = new_num-1; j >= 0; j--) 
+				if (forg[j] > ZERO) {
+					evec[i][j] = a[inew][jnew] / forg_sqrt[inew];
+					inv_evec[i][j] = a[jnew][inew] * forg_sqrt[jnew];
+					jnew--;
+				} else {
+					evec[i][j] = (i == j);
+					inv_evec[i][j] = (i == j);
+				}
+ 			inew--;
+		} else 
+		for (j=0; j < num_state; j++) {
+			evec[i][j] = (i==j);
+			inv_evec[i][j] = (i==j);
+		}
+
+	/* check eigenvalue equation */
+	error = 0;
+	for (j = 0; j < num_state; j++) {
+		for (i = 0, zero = 0.0; i < num_state; i++) {
+			for (k = 0; k < num_state; k++) zero += b[i][k] * evec[k][j];
+			zero -= eval[j] * evec[i][j];
+			if (fabs(zero) > 1.0e-5) {
+				error = 1;
+				printf("zero = %f\n", zero);
+			}
+		}
+
+		for (i = 0, zero = 0.0; i < num_state; i++) {
+			for (k = 0; k < num_state; k++) zero += evec[i][k] * inv_evec[k][j];
+			if (i == j) zero -= 1.0;
+			if (fabs(zero) > 1.0e-5) {
+				error = 1;
+				printf("zero = %f\n", zero);
+			}
+		}
+	}
+
+	for (i = num_state-1; i >= 0; i--)
+		free(a[i]);
+	free(a);
+
+	if (error) {
+		printf("\nWARNING: Eigensystem doesn't satisfy eigenvalue equation!\n");
+		return 1;
+	}
+	return 0;
+}
+
diff --git a/whtest/eigen_sym.h b/whtest/eigen_sym.h
new file mode 100644
index 0000000..4e0f49e
--- /dev/null
+++ b/whtest/eigen_sym.h
@@ -0,0 +1,48 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, Bui Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+#ifndef EIGEN_SYM_H
+#define EIGEN_SYM_H
+
+#define NUM_STATE 4
+#define ZERO 0.000001
+
+
+typedef double DVec20[NUM_STATE];
+typedef DVec20 DMat20[NUM_STATE];
+
+
+
+/**
+	Compute eigenvalues and eigenvectors of the matrix Pi_vec^(-1) * H_mat,
+	where H_mat is a symmetric matrix. Returns 0 on success, 1 on failure.
+	@param H_mat (IN)
+	@param Pi_vec (IN)
+	@param num_state (IN) size of matrix
+	@param eval (OUT) eigenvalues
+	@param evec (OUT) eigenvectors
+	@param inv_evec (OUT) inverse matrix of eigenvectors
+*/
+int eigen_sym(DMat20 H_mat, DVec20 Pi_vec, int num_state, 
+	DVec20 eval, DMat20 evec, DMat20 inv_evec);
+int eigen_sym_core(double *mat, int n, double *eval);
+
+#endif
diff --git a/whtest/random.c b/whtest/random.c
new file mode 100644
index 0000000..c1f21d1
--- /dev/null
+++ b/whtest/random.c
@@ -0,0 +1,356 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, Bui Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "random.h"
+#include "whtools.h"
+
+unsigned int kiss(void);
+
+/*
+ * Draw one standard exponential deviate from the uniforms supplied by
+ * ranDum().
+ * NOTE(review): the layout looks like exp_rand() from R's nmath library
+ * (rgamma() in this file is marked "Taken from R") -- confirm.
+ */
+double sexp(void)
+{
+	/* q[k-1] = sum(log(2.0)**j/j!) j=1,..,k; the 16 entries stop at the */
+	/* first partial sum that is 1.0 within standard precision */
+	static const double q[] =
+	{
+		0.6931471805599453,
+		0.9333736875190459,
+		0.9888777961838675,
+		0.9984959252914960,
+		0.9998292811061389,
+		0.9999833164100727,
+		0.9999985691438767,
+		0.9999998906925558,
+		0.9999999924734159,
+		0.9999999995283275,
+		0.9999999999728814,
+		0.9999999999985598,
+		0.9999999999999289,
+		0.9999999999999968,
+		0.9999999999999999,
+		1.0000000000000000
+	};
+	double offset = 0.0;	/* multiples of q[0] collected while doubling */
+	double u, draw, smallest;
+	int i = 0;
+
+	/* double u until it exceeds 1.0 */
+	u = ranDum();
+	while (!((u += u) > 1.0))
+		offset += q[0];
+	u -= 1.0;
+
+	if (u <= q[0])
+		return offset + u;
+
+	/* smallest of i+1 uniforms, i being the first index with u <= q[i] */
+	smallest = ranDum();
+	do {
+		draw = ranDum();
+		if (draw < smallest)
+			smallest = draw;
+		i++;
+	} while (u > q[i]);
+
+	return offset + smallest * q[0];
+}
+/******************************************************************/
+/*  variables for the kiss random number generator                */
+/******************************************************************/
+unsigned int k,m,x,y,z,w,carry,r;
+/******************************************************************/
+/*  kickstart the kiss random number generator; every state word  */
+/*  it uses, carry included, is set so equal seeds repeat a stream */
+/******************************************************************/
+void start_kiss(int seed)
+{
+#  ifdef PARALLEL
+	int n;
+#  endif
+	/* carry must not keep the value left over from an earlier stream */
+     x=seed;y=102;z=12;w=34535;carry=0;
+     x = x * 69069 + 1;
+     y ^= y << 13;
+     y ^= y >> 17;
+     y ^= y << 5;
+     k = (z >> 2) + (w >> 3) + (carry >> 2);
+     m = w + w + z + carry;
+     z = w;
+     w = m;
+     carry = k >> 30;
+#  ifdef PARALLEL
+	/* kiss() is called mpi_myrank times so the ranks start at different points */
+	for (n=0; n<mpi_myrank; n++)
+		kiss();
+#  endif
+}
+/* Load the generator variables from vals[0..7], in the order kiss_state() stores them. */
+void restart_kiss(unsigned int *vals)
+{
+	k = *vals++;
+	m = *vals++;
+	x = *vals++;
+	y = *vals++;
+	z = *vals++;
+	w = *vals++;
+	carry = *vals++;
+	r = *vals;
+}
+/* Store the generator variables into vals[0..7], in the order restart_kiss() reads them. */
+void kiss_state(unsigned int *vals)
+{
+	*vals++ = k;
+	*vals++ = m;
+	*vals++ = x;
+	*vals++ = y;
+	*vals++ = z;
+	*vals++ = w;
+	*vals++ = carry;
+	*vals = r;
+}
+/******************************************************************/
+/*   One step of the Keep It Simple Stupid random number generator
+     from George Marsaglia's DIEHARD cdrom; the result is x+y+z    */
+/******************************************************************/
+unsigned int single_kiss(void)
+{
+	x = 69069 * x + 1;		/* congruential part */
+	y = y ^ (y << 13);		/* shift-and-xor part, three shifts */
+	y = y ^ (y >> 17);
+	y = y ^ (y << 5);
+	k = (z >> 2) + (w >> 3) + (carry >> 2);
+	m = 2 * w + z + carry;
+	z = w;
+	w = m;
+	carry = k >> 30;
+	return x + y + z;
+}
+
+/* Next generator value; under PARALLEL, mpi_size-1 values are discarded first. */
+unsigned int kiss(void) {
+#ifdef PARALLEL
+	int skip;
+	for (skip = mpi_size; skip > 1; skip--)
+		single_kiss();
+#endif
+	return single_kiss();
+}
+
+/* Uniform deviate: the kiss() value v is mapped to (v + 0.5) / 4294967296. */
+double dkiss(void)
+{
+    return (0.5 + (double)kiss()) / 4294967296.0;
+}
+/*************************************************************************/
+/* Standard normal deviate, polar form of the Box-Muller method: deviates */
+/* come in pairs, one is returned at once, the other on the next call.    */
+double normal(void)
+{
+	static int have_spare = 0;
+	static double spare;
+	double scale, radius2, p, q;
+
+	if (have_spare != 0) {
+		have_spare = 0;
+		return spare;
+	}
+	do {
+		p = 2.0*ranDum( )-1.0;
+		q = 2.0*ranDum( )-1.0;
+		radius2 = p*p+q*q;
+	} while (radius2 >= 1.0 || radius2 == 0);
+	scale = sqrt(-2.0*log(radius2)/radius2);
+	spare = p * scale;
+	have_spare = 1;
+	return q*scale;
+}
+/* Polynomial coefficients read by rgamma(), then the two values of a it remembers */
+static const double a1 = 0.3333333;
+static const double a2 = -0.250003;
+static const double a3 = 0.2000062;
+static const double a4 = -0.1662921;
+static const double a5 = 0.1423657;
+static const double a6 = -0.1367177;
+static const double a7 = 0.1233795;
+static const double e1 = 1.0;
+static const double e2 = 0.4999897;
+static const double e3 = 0.166829;
+static const double e4 = 0.0407753;
+static const double e5 = 0.010293;
+static const double q1 = 0.04166669;
+static const double q2 = 0.02083148;
+static const double q3 = 0.00801191;
+static const double q4 = 0.00144121;
+static const double q5 = -7.388e-5;
+static const double q6 = 2.4511e-4;
+static const double q7 = 2.424e-4;
+static const double sqrt32 = 5.656854;
+static double aa = 0.;		/* a for which rgamma() Step 1 last ran */
+static double aaa = 0.;		/* a for which rgamma() Step 4 last ran */
+/*
+ * Gamma deviate for shape a, multiplied by scale (code taken from R).
+ * a < 1 uses an acceptance-rejection loop fed by dkiss() and sexp();
+ * a >= 1 runs the numbered steps below, which keep intermediate values
+ * in statics between calls (aa and aaa record the a they belong to).
+ */
+double rgamma(double a, double scale)
+#define repeat for(;;)
+{
+	static double b, c, d, e, p, q, r, s, t, u, v, w, x;
+	static double q0, s2, si;
+	double ret_val, bound;
+
+	if (a < 1.0) {
+		/* alternate method for parameters a below 1; its bound lives in a */
+		/* local so the b cached by Step 4 stays valid for later a >= 1 calls */
+		/* 0.36787944117144232159 = exp(-1) */
+		aa = 0.0;
+		bound = 1.0 + 0.36787944117144232159 * a;
+		repeat {
+			p = bound * dkiss();
+			if (p >= 1.0) {
+				ret_val = -log((bound - p) / a);
+				if (sexp() >= (1.0 - a) * log(ret_val))
+					break;
+			} else {
+				ret_val = exp(log(p) / a);
+				if (sexp() >= ret_val)
+					break;
+			}
+		}
+		return scale * ret_val;
+	}
+	/* Step 1: Recalculations of s2, s, d if a has changed */
+	if (a != aa) {
+		aa = a;
+		s2 = a - 0.5;
+		s = sqrt(s2);
+		d = sqrt32 - s * 12.0;
+	}
+	/* Step 2: t = standard normal deviate, */
+	/* x = (s,1/2)-normal deviate. */
+	/* immediate acceptance (i) */
+
+	t = normal();
+	x = s + 0.5 * t;
+	ret_val = x * x;
+	if (t >= 0.0)
+		return scale * ret_val;
+
+	/* Step 3: u = 0,1 - uniform sample. squeeze acceptance (s) */
+	u = dkiss();
+	if (d * u <= t * t * t) {
+		return scale * ret_val;
+	}
+	/* Step 4: recalculations of q0, b, si, c if necessary */
+	if (a != aaa) {
+		aaa = a;
+		r = 1.0 / a;
+		q0 = ((((((q7 * r + q6) * r + q5) * r + q4)
+			* r + q3) * r + q2) * r + q1) * r;
+		/* Approximation depending on size of parameter a */
+		/* The constants in the expressions for b, si and */
+		/* c were established by numerical experiments */
+
+		if (a <= 3.686) {
+			b = 0.463 + s + 0.178 * s2;
+			si = 1.235;
+			c = 0.195 / s - 0.079 + 0.16 * s;
+		} else if (a <= 13.022) {
+			b = 1.654 + 0.0076 * s2;
+			si = 1.68 / s + 0.275;
+			c = 0.062 / s + 0.024;
+		} else {
+			b = 1.77;
+			si = 0.75;
+			c = 0.1515 / s;
+		}
+	}
+	/* Step 5: no quotient test if x not positive */
+	if (x > 0.0) {
+		/* Step 6: calculation of v and quotient q */
+		v = t / (s + s);
+		if (fabs(v) <= 0.25)
+			q = q0 + 0.5 * t * t * ((((((a7 * v + a6)
+			* v + a5) * v + a4) * v + a3)
+			* v + a2) * v + a1) * v;
+		else
+			q = q0 - s * t + 0.25 * t * t + (s2 + s2)
+			* log(1.0 + v);
+
+		/* Step 7: quotient acceptance (q) */
+		if (log(1.0 - u) <= q)
+			return scale * ret_val;
+	}
+	/* Step 8: e = standard exponential deviate */
+	/* u= 0,1 -uniform deviate */
+	/* t=(b,si)-double exponential (laplace) sample */
+	repeat {
+		e = sexp();
+		u = dkiss();
+		u = u + u - 1.0;
+		if (u < 0.0)
+			t = b - si * e;
+		else
+			t = b + si * e;
+		/* Step  9:  rejection if t < tau(1) = -0.71874483771719 */
+		if (t >= -0.71874483771719) {
+			/* Step 10:  calculation of v and quotient q */
+			v = t / (s + s);
+			if (fabs(v) <= 0.25)
+				q = q0 + 0.5 * t * t * ((((((a7 * v + a6)
+				* v + a5) * v + a4) * v + a3)
+				* v + a2) * v + a1) * v;
+			else
+				q = q0 - s * t + 0.25 * t * t + (s2 + s2)
+				* log(1.0 + v);
+			/* Step 11:  hat acceptance (h) */
+			/* (if q not positive go to step 8) */
+			if (q > 0.0) {
+				if (q <= 0.5)
+					w = ((((e5 * q + e4) * q + e3)
+					* q + e2) * q + e1) * q;
+				else
+					w = exp(q) - 1.0;
+				/* if t is rejected */
+				/* sample again at step 8 */
+				if (c * fabs(u) <= w * exp(e - 0.5 * t * t))
+					break;
+			}
+		}
+	}
+	x = s + 0.5 * t;
+	return scale * x * x;
+}
+
+
+
+
+/*******************************************************************/
diff --git a/whtest/random.h b/whtest/random.h
new file mode 100644
index 0000000..146a49e
--- /dev/null
+++ b/whtest/random.h
@@ -0,0 +1,37 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, Bui Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef RANDOM_H
+#define RANDOM_H
+/* alternative names for two of the functions declared below */
+#define ranDum dkiss
+#define starTup start_kiss
+/* kiss generator (random.c): uniform draw, seeding, state restore and save; */
+/* vals must point at 8 unsigned ints (random.c accesses vals[0..7]) */
+double dkiss(void);
+void start_kiss(int seed);
+void restart_kiss(unsigned int *vals);
+void kiss_state(unsigned int *vals);
+
+double rgamma(double a, double scale); 
+#endif
+
+
+
diff --git a/whtest/weisslambda.c b/whtest/weisslambda.c
new file mode 100644
index 0000000..e28ba43
--- /dev/null
+++ b/whtest/weisslambda.c
@@ -0,0 +1,67 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, Bui Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include "weisslambda_sub.h"
+
+/* following is outdated */
+
+/*
+int performDeltaTest ()
+{
+
+	FILE *fps;
+
+	printf("Performing the test...\n");
+
+	ausgabe_null = (char*)&ausgabe_1;
+	ausgabe_data = (char*)&ausgabe_0;
+
+
+	printf ( "alpha  %f ",alpha );
+
+	printf ( "%s\n",ausgabe_data );
+
+
+	fps = fopen ( "p_values","a" );
+
+	fprintf ( fps,"file name\t%s\n\n",datei_name );
+
+	ReadDataSets();
+
+	fclose ( fps );
+
+
+	ComputeWeissLambdafromData();
+
+	ComputeWeissLambdafromSimulation();
+
+
+	fprintf ( stderr,"\n" );
+}
+
+*/
+
+
diff --git a/whtest/weisslambda_sub.c b/whtest/weisslambda_sub.c
new file mode 100644
index 0000000..5059f59
--- /dev/null
+++ b/whtest/weisslambda_sub.c
@@ -0,0 +1,369 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, Bui Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "weisslambda_sub.h"
+#include "eigen.h"
+#include "eigen_sym.h"
+
+/*
+int simulation, nr_basen, taxa, paare;
+
+char datei_name[15];
+char ausgabe_null[20];
+char ausgabe_data[20];
+char lambdawerte[20];
+*/
+
+/*
+char *ausgabe_null;
+char *ausgabe_data;
+int paare;
+
+double **dataQ;
+
+double **nullbetweenQ;
+
+double WeissLambdaData;
+*/
+
+/******************************************************/
+
+
+void Compute_SSbetween_Matrix ( double **data, int s, double SSbetween[] );
+
+double ComputeWeissLambda ( double WeissMatrix[] );
+
+/******************************************************/
+/******************************************************/
+#define NR_END 1
+#define FREE_ARG char*
+int *ivector ( long nl, long nh )
+/* allocate an int vector with subscript range v[nl..nh]; exits with a message instead of returning NULL */
+{
+	int *v = malloc ( ( size_t ) ( nh-nl+1+NR_END ) * sizeof *v );
+	if ( v == NULL ) {
+		fputs ( "allocation failure in ivector()\n", stderr );
+		exit ( EXIT_FAILURE );
+	}
+	return v-nl+NR_END;
+}
+void free_ivector ( int *v, long nl, long nh )
+/* release a vector that ivector() returned for lower subscript nl; nh is not used */
+{
+	free ( ( void * ) ( v+nl-NR_END ) );
+}
+
+#define NRANSI
+
+#define SWAP(a,b) do { temp=(a);(a)=(b);(b)=temp; } while (0)
+#define M 7
+/* Each partition pushes the larger part, so the part processed next at */
+/* least halves; 2 entries per level for a 64-bit n stay below 128. */
+#define NSTACK 128
+
+void sort ( unsigned long n, double arr[] )
+/* sort arr[1..n] (subscripts start at 1) into ascending order, in place */
+{
+	unsigned long i,ir=n,j,k,l=1;
+	unsigned long istack[NSTACK+1];
+	int jstack=0;
+	double a,temp;
+
+	/* with n == 0 the unsigned ir-l below would wrap and reach outside arr */
+	if ( n < 2 ) return;
+	for ( ;; )
+	{
+		if ( ir-l < M )
+		{
+			/* short range: straight insertion */
+			for ( j=l+1;j<=ir;j++ )
+			{
+				a=arr[j];
+				for ( i=j-1;i>=1;i-- )
+				{
+					if ( arr[i] <= a ) break;
+					arr[i+1]=arr[i];
+				}
+				arr[i+1]=a;
+			}
+			if ( jstack == 0 ) break;
+			ir=istack[jstack--];
+			l=istack[jstack--];
+		}
+		else
+		{
+			/* median of arr[l], arr[l+1], arr[ir] becomes the pivot in arr[l] */
+			k= ( l+ir ) >> 1;
+			SWAP ( arr[k],arr[l+1] );
+			if ( arr[l+1] > arr[ir] )
+				SWAP ( arr[l+1],arr[ir] );
+			if ( arr[l] > arr[ir] )
+				SWAP ( arr[l],arr[ir] );
+			if ( arr[l+1] > arr[l] )
+				SWAP ( arr[l+1],arr[l] );
+			i=l+1;
+			j=ir;
+			a=arr[l];
+			for ( ;; )
+			{
+				do i++; while ( arr[i] < a );
+				do j--; while ( arr[j] > a );
+				if ( j < i ) break;
+				SWAP ( arr[i],arr[j] );
+			}
+			arr[l]=arr[j];
+			arr[j]=a;
+			jstack += 2;
+			/* push the larger part, continue with the smaller one */
+			if ( ir-i+1 >= j-l )
+			{
+				istack[jstack]=ir;
+				istack[jstack-1]=i;
+				ir=j-1;
+			}
+			else
+			{
+				istack[jstack]=j-1;
+				istack[jstack-1]=l;
+				l=i;
+			}
+		}
+	}
+}
+#undef M
+#undef NSTACK
+#undef SWAP
+#undef NRANSI
+
+/******************************************************/
+/******************************************************/
+
+
+
+/*****************************************************************/
+
+void printSSbetween (double SS[]) {
+	/* print the 144 values of SS as a 12x12 table, one row per output line */
+	int cell;
+	for (cell = 0; cell < 144; cell++) {
+		printf("%+7.5f ", SS[cell]);
+		if (cell % 12 == 11) printf("\n");
+	}
+}
+
+int CountValidPairs(double **q_mats) {
+	/* number of rows, among the first taxa*(taxa-1)/2 of q_mats, whose first entry is not 0 */
+	const int total = ( int ) ( ( taxa-1. ) *taxa/2. );
+	int valid = 0;
+	int row;
+	for ( row = 0; row < total; row++ )
+		valid += (q_mats[row][0] != 0.0);
+	return valid;
+}
+
+
+/**
+	compute the delta statistic from the per-pair rows in q_16.
+	Every row q_16[pair] is read at indices 0..15.  The entries whose index
+	is a multiple of 5 (0, 5, 10, 15: the diagonal if a row is a flattened
+	4x4 matrix) are skipped, which leaves 12 values per pair.
+	The program exits with a message when memory cannot be obtained.
+	@param q_16 (IN) taxa*(taxa-1)/2 rows of 16 values each
+	@return what ComputeWeissLambda() yields for the 12x12 matrix that
+		Compute_SSbetween_Matrix() builds from the shortened rows
+*/
+double ComputeWeissLambdaQ16(double **q_16) {
+	double SSbetween[144];
+	double delta;
+	double **q_12;
+	double *cells;
+	int i, j, pair_id;
+
+	int paare = taxa * (taxa-1) / 2;
+
+	/* one table of row pointers plus one zeroed block that holds all rows */
+	q_12 = malloc ( paare * sizeof *q_12 );
+	cells = calloc ( ( size_t ) paare * 12, sizeof *cells );
+	if ( paare > 0 && ( q_12 == NULL || cells == NULL ) )
+	{
+		fprintf ( stderr, "\nout of memory in ComputeWeissLambdaQ16\n" );
+		exit ( EXIT_FAILURE );
+	}
+
+	for (pair_id = 0; pair_id < paare; pair_id++)
+	{
+		q_12[pair_id] = cells + ( size_t ) pair_id * 12;
+		for (i = 0, j = 0; i < 16; i++)
+			if (i % 5 != 0) q_12[pair_id][j++] = q_16[pair_id][i];
+	}
+
+	Compute_SSbetween_Matrix ( q_12, 0, SSbetween );
+
+	free ( cells );
+	free ( q_12 );
+
+	delta = ComputeWeissLambda(SSbetween);
+	return delta;
+}
+
+/***************************************************************/
+
+/***************************************************************/
+
+
+
+
+
+
+/***************************************************************/
+
+/*
+ * Fill SSbetween (12x12, kept row by row in 144 doubles) with
+ *   ( sum_i row_i[k]*row_i[l]  -  N * mean[k] * mean[l] ) / ( N - 1 )
+ * where row_i runs over data[s*paare + i], i = 0 .. paare-1, skipping rows
+ * whose first entry is 0; N counts the rows used, mean[] holds their
+ * column averages, and paare is taxa*(taxa-1)/2.
+ */
+void Compute_SSbetween_Matrix ( double **data, int s, double SSbetween[] )
+{
+	const int paare = taxa*(taxa-1)/2;
+	const int first_row = s*paare;
+	double mean[12];
+	int used = 0;		/* rows that passed the first-entry test */
+	int i, k, l;
+
+	for ( k = 0; k < 12; k++ )
+		mean[k] = 0;
+
+	for ( k = 0; k < 144; k++ )
+		SSbetween[k] = 0;
+
+	/* accumulate column sums and raw cross products */
+	for ( i = 0; i < paare; i++ )
+	{
+		const double *row = data[first_row+i];
+
+		if ( row[0] == 0.0 )
+			continue;	/* this pair is discarded */
+		used++;
+		for ( k = 0; k < 12; k++ )
+		{
+			mean[k] += row[k];
+			for ( l = 0; l < 12; l++ )
+				SSbetween[12*k+l] += row[k]*row[l];
+		}
+	}
+
+	/* column sums become column means */
+	for ( k = 0; k < 12; k++ )
+		mean[k] /= ( double ) used;
+
+	/* take out the contribution of the means */
+	for ( k = 0; k < 12; k++ )
+		for ( l = 0; l < 12; l++ )
+			SSbetween[12*k+l] = SSbetween[12*k+l] - used * mean[k] * mean[l];
+
+	/* divide by N - 1 */
+	for ( k = 0; k < 144; k++ )
+		SSbetween[k] /= ( used-1. );
+}
+
+
+/***************************************************************/
+
+/*
+ * Delta statistic of a 12x12 matrix passed as 144 doubles.
+ *
+ * eigen_sym_core() is run on a private copy, so WeissMatrix itself is
+ * left untouched.  With EigenWert[] being the 12 values it delivers, the
+ * result is
+ *     1 + sum ( log ( 1 + EigenWert[k] ) ),   k = 0 .. 11
+ * (the accumulator starts at 1, not at 0).
+ *
+ * 0 is returned, after a message on stderr, when eigen_sym_core() gives a
+ * non-zero status, or when EigenWert[0] > 100000 or EigenWert[11] < -0.1.
+ * NOTE(review): checking only the first and the last entry presumably
+ * relies on eigen_sym_core() delivering sorted eigenvalues -- confirm.
+ */
+double ComputeWeissLambda ( double WeissMatrix[] )
+{
+	double EigenWert[12];
+	double W[144];
+	double product_log_lambda = 1;
+	int status;
+	int k;
+
+	/* the eigenvalue routine gets a copy, never the caller's array */
+	memcpy ( W, WeissMatrix, sizeof W );
+
+	/* disabled alternative, kept for reference (needs T1[12], U[144], V[144], T2[144]):
+	status = eigen ( 1, W, 12, EigenWert, T1, U, V, T2 ); */
+	status = eigen_sym_core ( W, 12, EigenWert );
+	if ( status != 0 )
+	{
+		fprintf ( stderr, "\ncomplex roots in WilksMatrix\n" );
+		return 0;
+	}
+
+	/* refuse eigenvalues outside the accepted range */
+	if ( EigenWert[0] > 100000 || EigenWert[11] < -0.1 )
+	{
+		fprintf ( stderr, "\nnumerical problems in eigenvalues of WeissMatrix\n" );
+		return 0;
+	}
+
+	for ( k = 0; k < 12; k++ )
+	{
+		product_log_lambda += log ( 1.+ EigenWert[k] );
+	}
+
+	/* Other summaries that were computed inside the loop at some point
+	   and are disabled now:
+
+	lambda_summe += EigenWert[k];
+	log_lambda_sum += log ( EigenWert[0] + EigenWert[k] );
+	product_lambda *= ( 1.+ EigenWert[k] );
+	productinvlambda /= ( 1.+ EigenWert[k] );
+
+	   with lambda_summe = 0, log_lambda_sum = 0, product_lambda = 1 and
+	   productinvlambda = 1 as start values. */
+
+	/* Disabled output of the statistic to the file named by lambdawerte:
+
+	fps = fopen(lambdawerte,"a");
+	fprintf(fps,"%f\n",product_log_lambda);
+	fclose( fps );
+	*/
+
+	return product_log_lambda;
+}
+
+
+/***************************************************************/
+
+
diff --git a/whtest/weisslambda_sub.h b/whtest/weisslambda_sub.h
new file mode 100644
index 0000000..8ee2bff
--- /dev/null
+++ b/whtest/weisslambda_sub.h
@@ -0,0 +1,53 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, Bui Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef WEISSLAMBDA_SUB_H
+#define WEISSLAMBDA_SUB_H
+/* settings shared by the whtest sources; the definitions are in whtest.c */
+extern int simulation, nr_basen, taxa;
+extern char datei_name[100];
+
+extern double alpha;
+/*
+extern char ausgabe_0[200];
+extern char ausgabe_1[200];
+extern char ausgabe_2[200];
+extern char *ausgabe_null;
+extern char *ausgabe_data;
+*/
+
+
+/*********************************/
+
+void ReadDataSize();
+
+void AllocateMemory();
+
+void ComputeWilksLambdafromData();
+/* the three functions below are implemented in weisslambda_sub.c */
+double ComputeWeissLambdaQ16(double **q_16);
+int CountValidPairs(double **q_mats);
+/* sorts arr[1..n] (subscripts start at 1) into ascending order */
+void sort ( unsigned long n, double arr[] );
+
+
+/************************************/
+
+#endif
diff --git a/whtest/whtest.c b/whtest/whtest.c
new file mode 100644
index 0000000..4d137a4
--- /dev/null
+++ b/whtest/whtest.c
@@ -0,0 +1,658 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, BUI Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+/*#include <iqtree_config.h>*/
+#include "../timeutil.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include "weisslambda_sub.h"
+
+#include "whtest_sub.h"
+#include "random.h"
+#include "whtools.h"
+/*
+#ifdef WIN32
+#include <sys/timeb.h>
+#include <sys/types.h>
+#include <winsock.h>
+void gettimeofday(struct timeval* t, void* timezone)
+{       struct _timeb timebuffer;
+        _ftime( &timebuffer );
+        t->tv_sec=timebuffer.time;
+        t->tv_usec=1000*timebuffer.millitm;
+}
+#else
+  #include <sys/time.h>
+  #ifndef HAVE_GETTIMEOFDAY
+	void gettimeofday(struct timeval* t, void* timezone) {
+		time_t cur_time;
+		time(&cur_time);
+		t->tv_sec = cur_time;
+		t->tv_usec = 0;
+	}
+  #endif
+#endif
+*/
+
+#ifdef PARALLEL
+int mpi_myrank;
+int mpi_size;
+int mpi_master_rank = 0;
+
+long p_randn;
+long p_rand;
+#endif /*PARALLEL*/
+
+int isMasterProc() {
+#ifdef PARALLEL
+	return mpi_master_rank == mpi_myrank;
+#else
+	return 1;	/* built without PARALLEL: the one process counts as master */
+#endif
+}
+
+int isSlaveProc() {
+#ifdef PARALLEL
+	return mpi_master_rank != mpi_myrank;
+#else
+	return 0;	/* built without PARALLEL: there is no slave process */
+#endif
+}
+
+/*
+int isFirstSlaveProc() {
+#ifdef PARALLEL
+	if (mpi_size == 1)
+		return 1;
+	return mpi_myrank == mpi_master_rank+1;
+#else
+	return 1;
+#endif
+}*/
+
+
+void Finalize(int exit_code) {
+	/* shuts MPI down when built with PARALLEL, reports success on the master, then exits */
+#ifdef PARALLEL
+	MPI_Finalize();
+#endif
+	if (isMasterProc() && exit_code == 0)
+		printf("\nFinished successfully.\n");
+	exit(exit_code);
+}
+
+
+
+int simulation, current_sim, nr_basen, taxa;
+/* run options; parseArg() sets all of them except beta from the command line */
+int random_seed = -1;
+int check_times = 10;
+double p_value_cutoff;
+double alpha, beta;
+/*int verbose_mode = 0;*/
+int write_sim_result = 0;
+int write_dist_matrix = 0;
+int fix_distance = 0;
+/* values handed out by WHT_getResults() */
+double delta_data;
+double *delta_sim;
+double p_wert;
+/* name of the input alignment file */
+char datei_name[100];
+/* output file names, built from datei_name (the three commented out are retired) */
+/*
+char ausgabe_0[200];
+char ausgabe_1[200];
+char ausgabe_2[200];
+*/
+char ausgabe_report[200];
+char ausgabe_sim_result[200];
+char ausgabe_dist[200];
+char ausgabe_nj_tree[200];
+/* distances supplied through WHT_setParams(); SetMLDistance() copies them row by row */
+double *ml_distance = NULL;
+
+void WHT_setAlignmentSize(int ntax, int nsite) {
+	nr_basen = nsite;	/* number of sites */
+	taxa = ntax;		/* number of sequences */
+}
+
+void WHT_allocateMemory() {
+	AllocateMemory();	/* plain forwarder; AllocateMemory() is declared in weisslambda_sub.h */
+}
+
+void WHT_setSequenceSite(int seqid, int siteid, char c) {
+	/* codes above 4 are stored as 4 */
+	seqData[seqid][siteid] = (c > 4) ? 4 : c;
+}
+
+void WHT_setSequenceName(int seqid, const char *name) {
+	strcpy(baum[seqid].bezeichnung, name);	/* unbounded copy -- NOTE(review): confirm bezeichnung can hold every name passed in */
+}
+
+/* Store the run settings and build the output file names <filename>.whtest,
+   .whsim, .whdist and .nj.  The names live in fixed-size arrays, so text
+   that does not fit is cut off rather than written past the array end;
+   dist is kept by reference, not copied. */
+void WHT_setParams(int nsim, double gamma_shape, char *filename, double *dist) {
+	simulation = nsim;
+	alpha = gamma_shape;
+	current_sim = 0;
+	p_value_cutoff = 1.0;
+
+	if ( strlen ( filename ) >= sizeof datei_name )
+		fprintf ( stderr, "WARNING: file name too long, shortened: %s\n", filename );
+	snprintf ( datei_name, sizeof datei_name, "%s", filename );
+
+	snprintf ( ausgabe_report, sizeof ausgabe_report, "%s.whtest", datei_name );
+	snprintf ( ausgabe_sim_result, sizeof ausgabe_sim_result, "%s.whsim", datei_name );
+	snprintf ( ausgabe_dist, sizeof ausgabe_dist, "%s.whdist", datei_name );
+	snprintf ( ausgabe_nj_tree, sizeof ausgabe_nj_tree, "%s.nj", datei_name );
+
+	ml_distance = dist;
+	write_dist_matrix = 1;
+	write_sim_result = 1;
+}
+
+void WHT_getResults(double *delta, double *delta_quantile, double *p_value) {
+	const int pos = (int)floor(0.95*simulation);	/* entry of delta_sim that is reported */
+	*delta = delta_data;
+	*delta_quantile = delta_sim[pos]; *p_value = p_wert;
+}
+
+
+void SetMLDistance() {
+	int row;	/* row r of the flat ml_distance array starts at offset r*taxa */
+	for (row = 0; row < taxa; row++)
+		memcpy(distance[row], &ml_distance[row*taxa], taxa*sizeof(double));
+}
+
+void usage(char *prog_name) {
+	if (isMasterProc())	/* only the master prints; every process exits with code 1 */
+		printf("Usage: %s <alignment> [OPTIONS]\n"
+			"  <alignment>         alignment file name, in standard PHYLIP format\n"
+			"OPTIONS:\n"
+			"  -h                  print usage\n"
+			"  -s <SIMULATION>     #simulations to assess significance, default is 1000\n"
+			"  -a <ALPHA>          gamma shape parameter, default is 100 (equal site-rates)\n"
+			"  -t <CUTOFF>         stop the simulations when p-value exceeds the cutoff\n"
+			"  -i <N>              check p-value N times during simulation, default 10\n"
+			"  -seed <#>           use <#> as random number seed\n"
+			"  -wsim               write simulation results to file .whtest.sim\n"
+			"  -wdist              write distance matrix to file .whtest.dist\n"
+			"\n", prog_name);
+	Finalize(1);
+}
+
+/* Value of the option at argv[*arg_i]: moves *arg_i to the next word and returns it.
+   An option that is the last word ends the run through Finalize(1). */
+static char *optionValue(int argc, char **argv, int *arg_i) {
+	if (*arg_i + 1 >= argc) {
+		if (isMasterProc())
+			printf("Option %s needs a value, run with '-h' for help\n", argv[*arg_i]);
+		Finalize(1);
+	}
+	*arg_i += 1;
+	return argv[*arg_i];
+}
+
+/* Print the banner, set the defaults, read the command line into the global
+   settings and check their ranges; any problem ends the run via Finalize(1). */
+void parseArg( int argc,char **argv ) {
+	int i, arg_i;
+	if (isMasterProc()) {
+		printf("\nWELCOME TO WH-TEST\n\n");
+		printf("G. Weiss and A. von Haeseler (2003) Testing substitution models\n");
+		printf("within a phylogenetic tree. Mol. Biol. Evol, 20(4):572-578\n\n");
+#ifdef PARALLEL
+		printf("You are running MPI parallel version with %d processes\n\n", mpi_size);
+#endif
+		printf("Program was called with:\n");
+		for ( i = 0; i < argc; i++ )
+			printf ( "%s ",argv[i] );
+		printf ( "\n\n" );
+	}
+
+	simulation = 1000;
+	current_sim = 0;
+	alpha = 100;
+	datei_name[0] = 0;
+	p_value_cutoff = 1.0;
+
+	for (arg_i = 1; arg_i < argc; arg_i++) {
+		if (strcmp(argv[arg_i], "-h") == 0) {
+			usage(argv[0]);
+		} else if (strcmp(argv[arg_i], "-s") == 0) {
+			simulation = atoi ( optionValue ( argc, argv, &arg_i ) );
+		} else if (strcmp(argv[arg_i], "-t") == 0) {
+			p_value_cutoff = atof ( optionValue ( argc, argv, &arg_i ) );
+		} else if (strcmp(argv[arg_i], "-a") == 0) {
+			alpha = atof ( optionValue ( argc, argv, &arg_i ) );
+		} else if (strcmp(argv[arg_i], "-seed") == 0) {
+			random_seed = atoi ( optionValue ( argc, argv, &arg_i ) );
+		} else if (strcmp(argv[arg_i], "-i") == 0) {
+			check_times = atoi ( optionValue ( argc, argv, &arg_i ) );
+		} else if (strcmp(argv[arg_i], "-v") == 0) {
+			/* accepted, but verbose mode is disabled (verbose_mode = 1) */
+		} else if (strcmp(argv[arg_i], "-wsim") == 0) {
+			write_sim_result = 1;
+		} else if (strcmp(argv[arg_i], "-wdist") == 0) {
+			write_dist_matrix = 1;
+		} else if (strcmp(argv[arg_i], "-fdist") == 0) {
+			fix_distance = 1;
+		} else if (argv[arg_i][0] != '-') {
+			if (strlen(argv[arg_i]) >= sizeof datei_name) {
+				if (isMasterProc())
+					printf("ERROR: file name too long: %s\n", argv[arg_i]);
+				Finalize(1);
+			}
+			strcpy ( datei_name, argv[arg_i] );
+			strcpy ( ausgabe_report, datei_name );
+			strcat ( ausgabe_report, ".whtest" );
+			strcpy ( ausgabe_sim_result, ausgabe_report );
+			strcat ( ausgabe_sim_result, ".sim" );
+			strcpy ( ausgabe_dist, ausgabe_report );
+			strcat ( ausgabe_dist, ".dist" );
+		} else {
+			if (isMasterProc())
+				printf("Unrecognized %s option, run with '-h' for help\n", argv[arg_i]);
+			Finalize(1);
+		}
+	}
+
+	if (datei_name[0] == 0) {
+		printf("ERROR: Missing input alignment file.\n\n");
+		usage(argv[0]);
+	}
+
+	if (simulation <= 0 || simulation > 10000) {
+		if (isMasterProc())
+			fprintf ( stderr,"wrong #simulations: %d\nbetween 1 and 10000 please\n", simulation);
+		Finalize( 1 );
+	}
+
+	if (alpha < 0.01 || alpha > 100.0) {
+		if (isMasterProc())
+			fprintf ( stderr,"wrong alpha: %f\nbetween 0.01 and 100 please\n", alpha);
+		Finalize ( 1 );
+	}
+
+	if (check_times < 0) {
+		if (isMasterProc())
+			fprintf ( stderr,"wrong time interval: %d\npositive number please\n", check_times);
+		Finalize(1);
+	}
+
+	if (isMasterProc()) {
+		printf("Input file: %s\n", datei_name);
+		printf("Number of simulations: %d\n", simulation);
+		printf("Gamma shape alpha: %f\n", alpha);
+	}
+}
+
+/* Create the report file and write the run header to it. */
+void StartReport() {
+	FILE *fps = fopen( ausgabe_report, "w" );
+	if (!fps) {	/* the original passed the NULL stream on to fprintf() */
+		printf ( "\nERROR: Cannot write to file %s!\n", ausgabe_report );
+		return;
+	}
+	fprintf(fps, "WH-TEST\n\nG. Weiss and A. von Haeseler (2003) Testing substitution models\n"
+		"within a phylogenetic tree. Mol. Biol. Evol, 20(4):572-578\n\n");
+	fprintf(fps, "Input file name: %s\nNumber of simulations: %d\n", datei_name, simulation);
+	fprintf(fps, "Gamma shape parameter: %f\nRandom number seed: %d\n\n", alpha, random_seed);
+	fprintf(fps, "SEQUENCE ALIGNMENT\n\nInput data: %d sequences with %d nucleotide sites\n\n", taxa, nr_basen);
+	fclose(fps);
+}
+
+/* Print the run time since begin_time and append a time stamp to the report file. */
+void FinishReport(time_t begin_time) {
+	FILE *fps = fopen( ausgabe_report, "a" );
+	char *finishedDate_;
+	int prog_time;
+	int nHour_, nMin_, nSec_;
+	time_t end_time;
+
+	time(&end_time);
+	finishedDate_ = ctime(&end_time);	/* ctime() text ends with '\n' */
+	prog_time = (int) difftime (end_time, begin_time);
+	nHour_ = prog_time / 3600;
+	nMin_ = (prog_time  - nHour_ * 3600) / 60;
+	nSec_ = prog_time  - nMin_ * 60 - nHour_ * 3600;
+	printf("Runtime: %dh:%dm:%ds\n\n", nHour_, nMin_, nSec_);
+
+	if (!fps) {	/* the original passed the NULL stream on to fprintf() */
+		printf ( "\nERROR: Cannot write to file %s!\n", ausgabe_report );
+		return;
+	}
+	fprintf(fps, "\nTIME STAMP\n\n");
+	fprintf(fps, "Date and time: %s", finishedDate_);
+	fprintf(fps, "Runtime: %dh:%dm:%ds\n", nHour_, nMin_, nSec_);
+	fclose(fps);
+}
+
+/* Append the homogeneity-test results (delta, 0.95 quantile, p-value) to the report file. */
+void ReportResults(double delta_data, double delta_95quantile, double p_value) {
+	FILE *fps = fopen( ausgabe_report, "a" );
+	if (!fps) {	/* the original passed the NULL stream on to fprintf() */
+		printf ( "\nERROR: Cannot write to file %s!\n", ausgabe_report );
+		return;
+	}
+	fprintf(fps, "\nTEST OF HOMOGENEITY ASSUMPTION OVER BRANCHES\n\n");
+	fprintf(fps, "Delta of data:                       %f\n", delta_data);
+	fprintf(fps, ".95 quantile of Delta distribution:  %f\n", delta_95quantile);
+	fprintf(fps, "Number of simulations performed:     %d\n", current_sim);
+	/* fewer simulations than requested: the value is reported with a '>' prefix */
+	if (current_sim == simulation)
+		fprintf(fps, "p-value:                             %f\n", p_value);
+	else
+		fprintf(fps, "p-value:                             >%f\n", p_value);
+	fprintf(fps, "\n");
+	if (p_value < 0.05)
+		fprintf(fps, "WH-test rejected the assumption of a single model among branches of the tree\n");
+	else
+		fprintf(fps, "WH-test DID NOT reject the assumption of a single model among branches of the tree\n");
+	fclose(fps);
+}
+/* Run the whole WH-test. With argc == 0 no command line is parsed and no alignment file is read here. Always returns 0. */
+int WHTest_run ( int argc,char **argv ) {
+
+	int i;
+	/*double *global_sim = NULL;*/
+	int count_sim;	/* simulations completed by this process */
+	int cur_point;	/* index of the next check_point[] entry */
+	int *check_point = NULL;	/* simulation counts at which progress is printed */
+
+	double prev_p_wert = 0, own_p_wert;	/* own_p_wert counts this process's delta_sim >= delta_data */
+	int *valid_pairs;	/* CountValidPairs() result of each simulation */
+	/*int *global_pairs = NULL;*/
+	FILE *delta_file = NULL;
+	time_t begin_time;
+	struct timeval tv;
+	int work_single;
+#ifdef PARALLEL
+	int *displs, *rcounts;
+	double mpi_prog_time, mpi_sim_time;
+#endif
+
+	int start_sim, end_sim;	/* this process simulates indices [start_sim, end_sim) */
+
+	p_wert = 0.0;
+
+	/*knoten *baum;*/
+
+#ifdef PARALLEL
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &mpi_myrank);
+	MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
+#endif
+
+	/* start to count the running time */
+	time(&begin_time);
+
+	if (argc>0) parseArg(argc, argv);
+	/* a negative random_seed is replaced by one derived from the current time */
+	if (isMasterProc()) {
+#ifndef HAVE_GETTIMEOFDAY
+		if (random_seed < 0) {
+			printf("WARNING: Random seed may not be well initialized since gettimeofday() is not available.\n");
+			printf("         You can use option -seed <NUMBER> to specify your own seed number.\n");
+		}
+#endif
+		gettimeofday(&tv, NULL);
+		srand((unsigned) (tv.tv_sec+tv.tv_usec));
+		if (random_seed < 0)
+			random_seed = rand();
+		if (argc > 0)
+		printf("Random number seed: %d\n\n", random_seed);
+	}
+
+#ifdef PARALLEL
+	MPI_Bcast(&random_seed, 1, MPI_INT, mpi_master_rank, MPI_COMM_WORLD);	/* all processes continue with the master's seed */
+#endif
+
+	start_kiss ( random_seed );
+
+	beta = 1./alpha;
+
+	if (argc > 0) {
+		ReadDataSize ( datei_name );
+		AllocateMemory();
+	}
+	delta_sim = ( double* ) calloc ( simulation, sizeof ( double) );	/* zero-filled: 0.0 is later read as "slot not simulated" */
+	valid_pairs = ( int* ) calloc ( simulation, sizeof ( int) );
+	if (check_times > 0)
+		check_point = (int *) malloc(check_times * sizeof(int));
+	/*global_sim = ( double* ) calloc ( simulation, sizeof ( double) );
+	global_pairs = ( int* ) calloc ( simulation, sizeof ( int) );*/
+#ifdef PARALLEL
+	displs = (int*) malloc(mpi_size * sizeof(int));
+	rcounts = (int*) malloc(mpi_size * sizeof(int));
+#endif
+
+	if (isMasterProc() && argc > 0)
+		printf("Input data set (%s) contains %d sequences of length %d\n", datei_name, taxa, nr_basen);
+	if (argc > 0) ReadData ( datei_name );
+
+	/* only the master process creates the report file */
+	if (isMasterProc())
+		printf("\n");
+	if (isMasterProc())
+		StartReport();
+	
+#ifdef PARALLEL
+	mpi_prog_time = MPI_Wtime();
+#endif
+	/* statistics of the input alignment */
+	Compute_Hij();
+	Compute_Qij_tij();
+
+	/*if (isMasterProc())
+		printf("Computing average of Q matrices\n");*/
+	Compute_q_hat_pairwise();
+	/* delta of the input data; each simulated delta is compared against it below */
+	delta_data = ComputeWeissLambdaQ16(q_matrizen); 
+
+	if (fix_distance) 
+		FixDistance();
+	if (isMasterProc() && write_dist_matrix)
+		Save_Distance(ausgabe_dist, distance);
+
+	if (ml_distance) SetMLDistance();
+
+	if (isMasterProc())
+		printf("Computing neighbor-joining tree\n");
+
+
+	ComputeNeighborJoiningTree();
+
+	if (isMasterProc()) {
+		Save_Tree ( baum + ( 2*taxa-2 ) );	/* last of the 2*taxa-1 nodes in baum[] */
+		printf("\nStart %d simulations\n", simulation);
+	}
+	/* give each process its block of simulation indices (all of them without PARALLEL) */
+#ifdef PARALLEL
+	mpi_sim_time = MPI_Wtime();
+	work_single = (simulation+mpi_size-1) / mpi_size;
+	start_sim = work_single * mpi_myrank;
+	end_sim = work_single * (mpi_myrank+1);
+	if (end_sim > simulation) end_sim = simulation;
+	work_single = end_sim - start_sim;
+	for (i = 0; i < mpi_size; i++) {
+		displs[i] = work_single * i;
+		rcounts[i] = work_single;
+		if (i == mpi_size-1) rcounts[i] = simulation - displs[i];
+		/*if (isMasterProc())
+			printf(" %d ", rcounts[i]);*/
+	}
+#else
+	work_single = simulation;
+	start_sim = 0;
+	end_sim = simulation;
+#endif
+	/* spread the check_times progress reports evenly over this process's block */
+	for (i = 0; i < check_times; i++) {
+		check_point[i] = work_single*(i+1) / check_times;
+		if (i == check_times-1)
+			check_point[i] = end_sim-start_sim;
+	}
+	/* simulation loop; left early once p_wert exceeds p_value_cutoff */
+	for ( i = start_sim, count_sim = 0, own_p_wert = 0.0, cur_point = 0; i < end_sim; i++) {
+		Simulate_Sequences_q_hat();
+		Compute_Hij();
+		Compute_Qij_tij();
+		delta_sim[i] = ComputeWeissLambdaQ16(q_matrizen);
+		valid_pairs[i] = CountValidPairs(q_matrizen);
+		count_sim++;
+		current_sim = count_sim;
+		if (delta_sim[i] >= delta_data) own_p_wert += 1.0;
+		p_wert = own_p_wert / simulation;	/* divided by the requested total, not by count_sim */
+		if (check_point && count_sim == check_point[cur_point]) {
+			cur_point++;
+#ifdef PARALLEL
+			MPI_Allreduce(&own_p_wert, &p_wert, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+			p_wert /= simulation;
+			MPI_Reduce(&count_sim, &current_sim, 1, MPI_INT, MPI_SUM, mpi_master_rank, MPI_COMM_WORLD);
+#endif
+			if (isMasterProc()) {
+				printf("%5d done", current_sim);
+				printf(", current p-value: %5.3f\n", p_wert);
+				if (p_wert > 0.05 && prev_p_wert <= 0.05) {
+					printf("NOTE: Homogeneity assumption is NOT rejected (p-value > 0.05)\n");
+				}
+				prev_p_wert = p_wert;
+			}
+		}
+		if (p_wert > p_value_cutoff) 
+			break;
+	}
+
+	/* with more than one process: gather all results on the master and recount there */
+#ifdef PARALLEL
+	/*printf("Proc %d done.\n", mpi_myrank);
+	MPI_Barrier(MPI_COMM_WORLD);*/
+	if (mpi_size > 1) {
+			MPI_Gatherv(delta_sim + start_sim, end_sim - start_sim, MPI_DOUBLE, 
+				delta_sim, rcounts, displs, MPI_DOUBLE, mpi_master_rank, MPI_COMM_WORLD);
+			MPI_Gatherv(valid_pairs + start_sim, end_sim - start_sim, MPI_INT, 
+				valid_pairs, rcounts, displs, MPI_INT, mpi_master_rank, MPI_COMM_WORLD);
+			for (i = 0, current_sim = 0, p_wert = 0.0; i < simulation; i++) {
+				if (delta_sim[i] >= delta_data) p_wert += 1.0;
+				if (delta_sim[i] != 0.0) current_sim++;	/* a delta of exactly 0.0 is counted as "not simulated" */
+			}
+			p_wert /= simulation;
+/*
+		} else {
+			MPI_Reduce(&own_p_wert, &p_wert, 1, MPI_DOUBLE, MPI_SUM, mpi_master_rank, MPI_COMM_WORLD);
+			p_wert /= simulation;
+			MPI_Reduce(&count_sim, &current_sim, 1, MPI_INT, MPI_SUM, mpi_master_rank, MPI_COMM_WORLD);
+		}*/
+	}
+	/*printf("Process %d did %d simulations\n", mpi_myrank, count_sim);*/
+#endif
+
+	if (isMasterProc()) {
+		printf("%d simulations done\n", current_sim);
+	}
+
+	/* optional dump of every simulated delta; slots still at 0.0 are left out */
+	if (isMasterProc() && write_sim_result) {
+		delta_file = fopen(ausgabe_sim_result, "w");
+		if (!delta_file) {
+			printf ( "\nERROR: Cannot write to file %s!\n", ausgabe_sim_result );
+		} else {
+			fprintf(delta_file, "Sim.    Delta   Valid_Qs\n");
+			for (i = 0, count_sim = 1; i < simulation; i++)
+				if (delta_sim[i] != 0.0) {
+					fprintf(delta_file, "%d\t%f\t%d\n", count_sim++, delta_sim[i], valid_pairs[i]);
+				}
+			fclose(delta_file);
+		}
+	}
+
+	/* final summary on screen and in the report (master only) */
+	if (isMasterProc()) {
+
+#ifdef PARALLEL
+		/*if (verbose_mode) {
+			printf("Simulation time: %f\n", MPI_Wtime() - mpi_sim_time);
+		}*/
+#endif
+
+		sort ( simulation, delta_sim-1);	/* NOTE(review): looks like a 1-based (Numerical Recipes style) sort - confirm */
+		printf("\nDelta of input data: %f\n", delta_data);
+		printf("0.95 quantile:       %f\n", delta_sim[(int)floor(0.95*simulation)]);
+
+		if (current_sim == simulation)
+			printf("P-value:             %f\n\n",p_wert);
+		else
+			printf("P-value:            >%f\n\n",p_wert);
+
+	if (p_wert < 0.05) {
+		printf("RESULT: Model homogeneity is rejected (p-value cutoff 0.05)\n");
+	} else {
+		printf("RESULT: Model homogeneity is NOT rejected (p-value cutoff 0.05)\n");
+	}
+
+		ReportResults(delta_data, delta_sim[(int)floor(0.95*simulation)], p_wert);
+	if (argc > 0) {
+		printf("All results written to disk:\n");
+		printf("     WH-test report file:     %s\n", ausgabe_report);
+		if (write_sim_result)
+			printf("     Simulation results:      %s\n", ausgabe_sim_result);
+		if (write_dist_matrix)
+			printf("     Pairwise distances:      %s\n", ausgabe_dist);
+	}
+
+		FinishReport(begin_time);
+#ifdef PARALLEL	
+		/*if (verbose_mode) {
+			printf("Total time: %f\n", MPI_Wtime() - mpi_prog_time);
+		}*/
+#endif
+	}
+
+#ifdef PARALLEL
+	free(rcounts);
+	free(displs);
+#endif
+	if (check_point) free(check_point);
+	free(valid_pairs);
+	free(delta_sim);
+	FreeMemory();
+
+#ifdef PARALLEL
+	MPI_Finalize();
+#endif
+	if (isMasterProc() && argc > 0)
+		printf("Finished successfully.\n");
+	return 0;
+}
+
+
diff --git a/whtest/whtest.h b/whtest/whtest.h
new file mode 100644
index 0000000..f155162
--- /dev/null
+++ b/whtest/whtest.h
@@ -0,0 +1,39 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef WHTEST_H
+#define WHTEST_H
+
+/* Run the WH-test; with argc == 0 no command line is parsed and no alignment file is read. Returns 0. */
+int WHTest_run ( int argc, char **argv );
+/* NOTE(review): the WHT_* functions are defined outside this excerpt; the notes below are inferred from names and signatures only - confirm. */
+void WHT_setAlignmentSize(int ntax, int nsite);
+/* presumably to be called after WHT_setAlignmentSize() - confirm */
+void WHT_allocateMemory();
+/* presumably stores character c at site siteid of sequence seqid - confirm */
+void WHT_setSequenceSite(int seqid, int siteid, char c);
+/* presumably sets the name of sequence seqid - confirm */
+void WHT_setSequenceName(int seqid, const char *name);
+/* presumably nsim = number of simulations, gamma_shape = alpha - confirm */
+void WHT_setParams(int nsim, double gamma_shape, char *filename, double *dist);
+/* presumably hands back delta, its 0.95 quantile and the p-value through the pointers - confirm */
+void WHT_getResults(double *delta, double *delta_quantile, double *p_value);
+
+#endif
diff --git a/whtest/whtest_sub.c b/whtest/whtest_sub.c
new file mode 100644
index 0000000..d52b57b
--- /dev/null
+++ b/whtest/whtest_sub.c
@@ -0,0 +1,1176 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, Bui Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include <ctype.h>
+
+#include "whtest_sub.h"
+#include "eigen.h"
+#include "eigen_sym.h"
+#include "random.h"
+#include "whtools.h"
+
+
+knoten *baum;	/* 2*taxa-1 tree nodes, allocated in AllocateMemory() */
+int **seqData;	/* [2*taxa-1][nr_basen] site codes */
+double ****H;	/* [taxa][taxa][5][5] divergence matrices */
+double **q_matrizen;	/* [taxa*(taxa-1)/2][16] pairwise rate matrices */
+double **distance;	/* [taxa][taxa] pairwise distances */
+double *alpha_rate;	/* [nr_basen] rate heterogeneity */
+double q_hat_eigen[4], U_q_hat[16], V_q_hat[16], statPi[4];	/* NOTE(review): filled outside this excerpt; FixDistance() reads statPi as cumulative frequencies - confirm */
+
+/*************************************************/
+
+/* Read the header of datafile (number of taxa, number of sites) into taxa and nr_basen. */
+void ReadDataSize ( char *datafile )
+{
+	FILE *ifp;
+
+	if ( ( ifp = fopen ( datafile, "r" ) ) == NULL ) {
+		printf ( "\nERROR: Missing input file %s!\n", datafile );
+		Finalize ( 1 );
+		return;	/* never touch the NULL stream, even if Finalize() returns */
+	}
+
+	if ( fscanf ( ifp, "%d", &taxa ) != 1 ) {
+		printf ( "\nERROR: Missing number of taxa!\n" );
+		Finalize ( 1 );
+	}
+
+	if ( fscanf ( ifp, "%d", &nr_basen ) != 1 ) {
+		printf ( "\nERROR: Missing number of sites!\n" );
+		Finalize ( 1 );
+	}
+
+	if ( taxa <= 0 || nr_basen <= 0 ) {
+		printf ( "\nERROR: Invalid alignment size (%d taxa, %d sites)!\n", taxa, nr_basen );
+		Finalize ( 1 );
+	}
+
+	/* The former "skip rest of line" loop is gone: it could not see EOF (char
+	   vs. int) and its effect was discarded by the fclose() below anyway. */
+	fclose ( ifp );
+}
+
+/*************************************************/
+
+
+/*
+ * Allocate all global work arrays for taxa sequences of nr_basen sites:
+ *   seqData     (2*taxa-1) x nr_basen   site codes
+ *   baum        2*taxa-1                tree nodes
+ *   distance    taxa x taxa             pairwise distances
+ *   H           taxa x taxa x 5 x 5     divergence tables
+ *   q_matrizen  taxa*(taxa-1)/2 x 16    pairwise rate matrices
+ *   alpha_rate  nr_basen                rate heterogeneity
+ * taxa and nr_basen must already be set; allocation results are not checked.
+ */
+void AllocateMemory()
+{
+	const int n_nodes = 2*taxa-1;
+	/* same floating-point expression as the loop bound in FreeMemory() */
+	const int n_pairs = ( int ) ( taxa* ( taxa-1. ) /2. );
+	int a, b, r;
+
+	/* sequence data */
+	seqData = ( int ** ) malloc ( n_nodes * sizeof ( int * ) );
+	for ( a = 0; a < n_nodes; a++ )
+		seqData[a] = ( int * ) calloc ( nr_basen, sizeof ( int ) );
+
+	/* tree structure */
+	baum = ( knoten * ) malloc ( n_nodes * sizeof ( knoten ) );
+
+	/* distance matrix */
+	distance = ( double ** ) malloc ( taxa * sizeof ( double * ) );
+	for ( a = 0; a < taxa; a++ )
+		distance[a] = ( double * ) calloc ( taxa, sizeof ( double ) );
+
+	/* divergence matrices */
+	H = ( double **** ) malloc ( taxa * sizeof ( double *** ) );
+	for ( a = 0; a < taxa; a++ ) {
+		H[a] = ( double *** ) malloc ( taxa * sizeof ( double ** ) );
+
+		for ( b = 0; b < taxa; b++ ) {
+			H[a][b] = ( double ** ) malloc ( 5 * sizeof ( double * ) );
+			for ( r = 0; r < 5; r++ )
+				H[a][b][r] = ( double * ) calloc ( 5, sizeof ( double ) );
+		}
+	}
+
+	/* pairwise rate matrices */
+	q_matrizen = ( double ** ) malloc ( taxa* ( taxa-1 ) /2 *sizeof ( double * ) );
+	for ( a = 0; a < n_pairs; a++ )
+		q_matrizen[a] = ( double * ) calloc ( 16, sizeof ( double ) );
+
+	/* rate heterogeneity */
+	alpha_rate = ( double * ) calloc ( nr_basen, sizeof ( double ) );
+}
+
+/*
+ * Release everything obtained by AllocateMemory(), in the reverse order of
+ * allocation and with every index loop running from the last element down.
+ * taxa must still hold the value that was used when allocating.
+ */
+void FreeMemory() {
+	const int n_pairs = ( int ) ( taxa* ( taxa-1. ) /2. );
+	const int n_nodes = 2*taxa-1;
+	int a, b, r;
+
+	/* rate heterogeneity */
+	free(alpha_rate);
+
+	/* pairwise rate matrices */
+	for ( a = n_pairs; a-- > 0; )
+		free(q_matrizen[a]);
+	free(q_matrizen);
+
+	/* divergence matrices, innermost rows first */
+	for ( a = taxa; a-- > 0; ) {
+		for ( b = taxa; b-- > 0; ) {
+			for ( r = 5; r-- > 0; )
+				free(H[a][b][r]);
+			free(H[a][b]);
+		}
+		free(H[a]);
+	}
+
+	free(H);
+
+	/* distance matrix */
+	for ( a = taxa; a-- > 0; )
+		free(distance[a]);
+	free(distance);
+
+	/* tree structure (carries a TODO in the original) */
+	free(baum);
+
+	/* sequence data */
+	for ( a = n_nodes; a-- > 0; )
+		free(seqData[a]);
+
+	free(seqData);
+
+}
+
+
+/*************************************************/
+
+
+/*
+ * Read the alignment body of datafile: per sequence a name (at most 10
+ * characters, ended by a blank or newline) into baum[].bezeichnung and
+ * nr_basen sites into seqData, coded A/0 -> 0, C/1 -> 1, G/2 -> 2, T/3 -> 3
+ * and everything else 4.  White space between sites is skipped.
+ */
+void ReadData ( char *datafile )
+{
+	int i, j;
+	int c;	/* int rather than char so that EOF can be recognised */
+	FILE *ifp;
+
+	if ( ( ifp = fopen ( datafile, "r" ) ) == NULL ) {
+		if (isMasterProc())
+			printf ( "\nERROR: Missing input file!\n" );
+		Finalize ( 1 );	/* the original went on to read the NULL stream */
+		return;
+	}
+
+	do
+		c = fgetc ( ifp ); 	/* skip 1st line */
+	while ( c != '\n' && c != EOF );
+
+	for ( i = 0; i < taxa; i++ ) {
+		/* NOTE(review): a name of 10 or more characters gets no '\0' here;
+		   whether that is safe depends on the size of bezeichnung - confirm */
+		for ( j = 0; j < 10; j++ ) {
+			c = fgetc ( ifp );
+			if ( c == EOF || c == '\n' || c == ' ' ) {
+				baum[i].bezeichnung[j] = '\0';
+				break;
+			}
+			baum[i].bezeichnung[j] = ( char ) c;
+		}
+
+		if (isMasterProc())
+			printf("%3i\t%s\n", i+1, baum[i].bezeichnung);
+
+		/* stop at EOF instead of looping forever on a truncated file */
+		j = 0;
+		while ( j < nr_basen && ( c = fgetc ( ifp ) ) != EOF ) {
+			if ( isspace ( c ) )
+				continue;
+			c = toupper ( c );
+			if ( c == 'A' || c == '0' )		seqData[i][j] = 0;
+			else if ( c == 'C' || c == '1' )	seqData[i][j] = 1;
+			else if ( c == 'G' || c == '2' )	seqData[i][j] = 2;
+			else if ( c == 'T' || c == '3' )	seqData[i][j] = 3;
+			else if ( c == '-' || c == 'N' )	seqData[i][j] = 4;
+			else {
+				if (isMasterProc())
+					fprintf ( stderr,"\nERROR: wrong BASE in datafile!   %c\n", c );
+				/* stored as 4 (like N): the former code 5 indexed past the
+				   5x5 tables filled by Compute_Hij() */
+				seqData[i][j] = 4;
+			}
+			j++;
+		}
+
+		if (j != nr_basen) {
+			if (isMasterProc())
+				printf("ERROR: %s has only %i characters\n", baum[i].bezeichnung, j);
+			Finalize(1);
+			fclose ( ifp );
+			return;
+		}
+
+		do
+			c = fgetc ( ifp ); 	/* skip rest of line */
+		while ( c != '\n' && c != EOF );
+	}
+
+	fclose ( ifp );
+}
+
+/*********************************************************************/
+#define JMAX 20
+
+/* Newton-Raphson from the midpoint of [x1, x2]; funcd(x, &f, &df) supplies value and slope. Returns the root once a step is below xacc, 0.0 after JMAX steps. */
+float rtnewt(void (*funcd)(float, float *, float *), float x1, float x2,
+	float xacc)
+{
+	float slope, step, value;
+	float root = 0.5*(x1+x2);
+	int iter = 0;
+	while (iter++ < JMAX) {
+		funcd(root, &value, &slope);
+		step = value/slope;
+		root -= step;
+		if ((x1-root)*(root-x2) < 0.0)
+			printf("Jumped out of brackets in rtnewt");
+		if (fabs(step) < xacc)
+			return root;
+	}
+	printf("Maximum number of iterations exceeded in rtnewt");
+	return 0.0;
+}
+
+/* Re-estimate every pairwise distance by Newton-Raphson from H, q_hat_eigen, U_q_hat, V_q_hat and statPi; overwrites distance[][]. */
+void FixDistance() {
+	int i, j, k, m;
+	double T1[16], pi[4], coeff[4], coeff_eigen[4], identity;
+	double f, df, rtn, x1 = 0.000001, x2 = 10.0, dx, expf;
+	printf("Computing corrected distance matrix based on averaged Q\n");
+	/* pi and coeff do not depend on (i,j); they are recomputed for every pair */
+	for (i = 0; i < taxa-1; i++) 
+		for (j = i+1; j < taxa; j++) {
+			/* state frequencies pi: differences of consecutive statPi entries */
+			pi[0] = statPi[0];
+			for (k = 1; k < 4; k++) {
+				pi[k] = statPi[k] - statPi[k-1];
+			}
+			matAbyBisC ( U_q_hat, V_q_hat, 4, T1 );	/* T1 is not read again in this function */
+			for (k = 0; k < 4; k++) 
+				for (m = 0, coeff[k] = 0.0; m < 4; m++) 
+					coeff[k] += pi[m] * U_q_hat[k*4+m] * V_q_hat[m*4+k];
+
+			/* identity: sum of the diagonal entries of H[i][j] */
+			for (k = 0, identity = 0.0; k < 4; k++) {
+				identity += H[i][j][k][k];
+				coeff_eigen[k] = coeff[k] * q_hat_eigen[k];
+			}
+			/* with this transformation, we need to solve the equation f(t) = 0 with
+			  f(t) = sum_k {coeff[k]*exp(eigen[k]*t)} - identity
+			  the derivative:
+			  f'(t) = sum_k {eigen[k]*coeff[k]*exp(eigen[k]*t)}
+			  In the following we use Newton-Raphson to find the root of f(t)
+			  */
+
+			/* first guess of the distance */
+			if (distance[i][j] < 10.0) 
+				rtn = distance[i][j];
+			else 
+				rtn = -(3.0/4.0) * log(1.0 - 4.0/3.0 *(1.0 - identity)); /* Jukes-Cantor corrected distance */
+			for (k = 1; k <= JMAX; k++) {
+				/* compute f(x) and f'(x) at x = rtn */
+				for (m = 0, f = -identity, df = 0.0; m < 4; m++) {
+					expf = exp(q_hat_eigen[m] * rtn);
+					f += coeff[m] * expf;
+					df += coeff_eigen[m] * expf;
+				}
+				dx=f/df;
+				rtn -= dx;
+				if ((x1-rtn)*(rtn-x2) < 0.0)	/* leaving [x1, x2] is only reported; the iteration goes on */
+					printf("Jumped out of brackets in rtnewt");
+				if (fabs(dx) < 0.0001) break;
+			}
+			distance[i][j] = rtn;	/* also stored when JMAX steps passed without reaching the tolerance */
+			distance[j][i] = rtn;
+		}
+}
+#undef JMAX
+
+/*********************************************************************/
+void FixDistance_old() {
+	int i, j, k;
+	double T1[16], T2[16];
+	/* Recompute distance[i][j] from q_hat_eigen, U_q_hat and V_q_hat for every pair whose distance is >= 100. NOTE(review): the _old suffix suggests FixDistance() supersedes this - confirm. */
+	for (i = 0; i < taxa-1; i++) 
+		for (j = i+1; j < taxa; j++) 
+			if (distance[i][j] >= 100) {
+				for ( k = 0; k < 16; k++ )
+					T2[k] = 0;
+	
+				if ( alpha > 10 )	{
+					/* no rate heterogeneity */
+	
+					T2[0] = q_hat_eigen[0];
+					T2[5] = q_hat_eigen[1];
+					T2[10] = q_hat_eigen[2];
+					T2[15] = q_hat_eigen[3];
+				} else 	{
+					/* rate heterogeneity */
+					/* something wrong here! */
+				T2[0] = alpha* ( 1.- exp(-q_hat_eigen[0]/alpha ) );
+				T2[5] = alpha* ( 1.- exp(-q_hat_eigen[1]/alpha ) );
+				T2[10] = alpha* ( 1.-exp(-q_hat_eigen[2]/alpha ) );
+				T2[15] = alpha* ( 1.-exp(-q_hat_eigen[3]/alpha ) );
+				}
+	
+				/* T2 = U * diag(eigenvalues) * V */
+	
+				matAbyBisC ( U_q_hat, T2, 4, T1 );
+				matAbyBisC ( T1, V_q_hat, 4, T2 );
+	
+				/* normalise by t = -sum (pi_i * r_ii), Q has rate 1 */
+	
+				distance[i][j] = 0;
+				for ( k = 0; k < 4; k++ )
+					distance[i][j] -= H[i][j][k][4]*T2[k*5];	/* T2[k*5]: diagonal entry k of the 4x4 matrix */
+	
+				distance[j][i] = distance[i][j];
+/*
+				if (isMasterProc())
+					printf("Fix distance (%s,%s) -> %f\n", baum[i].bezeichnung, baum[j].bezeichnung, distance[i][j]);*/
+				if (distance[i][j] > 100) {
+					if (isMasterProc())
+						printf("ERROR: too large distance, try higher alpha please\n");
+					Finalize(1);
+				}
+			}
+}
+
+/*********************************************************************/
+
+/* Write the taxa x taxa matrix dist, one name-labelled row per taxon, to distfile. */
+void Save_Distance(char *distfile, double **dist) {
+	FILE *fps;
+	int i, j;
+
+	if ((fps = fopen(distfile, "w")) == NULL) {
+		printf ( "\nERROR: Cannot write to file %s!\n", distfile );
+		return;	/* the original fell through and wrote to the NULL stream */
+	}
+
+	fprintf(fps, "%d\n", taxa);
+	for (i = 0; i < taxa; i++) {
+		fprintf(fps, "%-10s", baum[i].bezeichnung);
+		for (j = 0; j < taxa; j++) 
+			fprintf(fps, " %f", dist[i][j]);
+		fprintf(fps, "\n");
+	}
+	fclose(fps);
+}
+
+
+/*********************************************************************/
+
+
+/*
+ * Rebuild H[a][b] for every ordered pair of taxa from seqData.
+ * Afterwards the 4x4 part holds the symmetrised proportions of the site
+ * patterns, row/column 4 the base frequencies, and H[a][b][4][4] the sum of
+ * the 4x4 counts before scaling.
+ */
+void Compute_Hij()
+{
+	int a, b, r, c, site;
+	double **tab, total;
+
+	for ( a = 0; a < taxa; a++ ) {
+		for ( b = 0; b < taxa; b++ ) {
+			tab = H[a][b];
+
+			/* start from an empty 5x5 table */
+			for ( r = 0; r < 5; r++ )
+				for ( c = 0; c < 5; c++ )
+					tab[r][c] = 0;
+
+			/* symmetrised counts, since reversibility is assumed */
+			for ( site = 0; site < nr_basen; site++ ) {
+				tab[seqData[a][site]][seqData[b][site]] += 1.;
+				tab[seqData[b][site]][seqData[a][site]] += 1.;
+			}
+
+			/* row/column 4: base frequencies over the four nucleotides */
+			for ( r = 0; r < 4; r++ ) {
+				tab[r][4] = tab[r][0]+tab[r][1]+tab[r][2]+tab[r][3];
+				tab[4][r] = tab[r][4];
+			}
+			total = tab[0][4]+tab[1][4]+tab[2][4]+tab[3][4];
+			tab[4][4] = total;
+
+			/* normalise; a pair without any site where both sequences carry
+			   a nucleotide has total == 0 and the divisions are not guarded */
+			for ( r = 0; r < 4; r++ ) {
+				for ( c = 0; c < 4; c++ )
+					tab[r][c] /= total;
+				tab[r][4] /= total;
+				tab[4][r] /= total;
+			}
+		}
+	}
+}
+
+
+/******************************************************************************/
+/*
+void Write_Qij ( int a )
+{
+
+	FILE *fps;
+
+	int i, k, l;
+
+	if ( a == 0 )
+	{
+		fps = fopen ( ausgabe_0, "w" );
+	}
+
+	else if ( a == 1 )
+	{
+		fps = fopen ( ausgabe_1, "a" );
+	}
+
+	else return;
+
+
+	for ( i = 0; i < taxa* ( taxa-1 ) /2; i++ )
+	{
+		for ( k = 0; k < 4; k++ )
+		{
+			for ( l = 0; l < 4; l++ )
+			{
+
+				if ( k != l )
+					fprintf ( fps,"%f\t",q_matrizen[i][k*4+l] );
+			}
+		}
+
+		fprintf ( fps,"\n" );
+	}
+
+	fclose ( fps );
+
+}
+*/
+
+
+/******************************************************************************/
+
+/* For every pair i<j: eigen-decompose H[i][j], then derive distance[i][j] and the rate matrix q_matrizen[index_paar]. Pairs failing the eigenvalue check keep distance 100 and an all-zero matrix. */
+void Compute_Qij_tij()
+{
+
+	int e, i, j, k, l, index_paar;
+
+	double /*P[16],*/ EigenWert[4], T1[16], U[16], V[16], T2[16];
+	DMat20 HMat, EigenVec, EigenVecInv;
+	DVec20 PiVec;
+
+	for ( i = 0; i < taxa; i++ )
+		distance[i][i] = 0.0;
+
+	for ( i = 0; i < taxa-1; i++ ) {
+		for ( j = i+1; j < taxa; j++ ) {
+			distance[i][j] = 100;
+			distance[j][i] = 100;
+			index_paar = ( int ) ( i* ( taxa- ( i+3. ) /2. ) +j-1 );	/* position of (i,j) when the pairs i<j are numbered row by row */
+			for ( k = 0; k < 16; k++ )
+				q_matrizen[index_paar][k] = 0;
+
+			for ( k = 0; k < 4; k++ ) {
+				PiVec[k] = H[i][j][k][4];
+				for ( l = 0; l < 4; l++ ) {		/* P_ij(t) = Pi^(-1)*H_ij */
+					/*P[4*k+l] = H[i][j][k][l] / H[i][j][k][4];*/
+					HMat[k][l] = H[i][j][k][l];
+				}
+			}
+
+			/*if ( ( e=eigen ( 1, P, 4, EigenWert, T1, U, V, T2 ) ) !=0 )*/
+			if ( ( e=eigen_sym (HMat, PiVec, 4, EigenWert, EigenVec, EigenVecInv ) ) !=0 ) 	{
+				fprintf ( stderr, "\ncomplex roots in Eigen\n" );
+				return;	/* leaves all remaining pairs unprocessed */
+			} 
+			if ( EigenWert[0] <= 0.0001 || EigenWert[1] <= 0.0001 || EigenWert[2] <= 0.0001 ||  EigenWert[3] <= 0.0001 ||
+					EigenWert[0] > 1.01 || EigenWert[1] > 1.01 || EigenWert[2] > 1.01 ||  EigenWert[3] > 1.01 )
+			{/*
+				fprintf ( stderr, "\nbad numerics in estimation of Eigenvalues (%f, %f, %f, %f) of P(t) %d,%d\n",EigenWert[0], EigenWert[1], EigenWert[2], EigenWert[3],i+1,j+1 );
+				fprintf(stderr, "H = %f %f %f %f\n", H[i][j][0][0], H[i][j][0][1], H[i][j][0][2], H[i][j][0][3]);
+				fprintf(stderr, "    %f %f %f %f\n", H[i][j][1][0], H[i][j][1][1], H[i][j][1][2], H[i][j][1][3]);
+				fprintf(stderr, "    %f %f %f %f\n", H[i][j][2][0], H[i][j][2][1], H[i][j][2][2], H[i][j][2][3]);
+				fprintf(stderr, "    %f %f %f %f\n", H[i][j][3][0], H[i][j][3][1], H[i][j][3][2], H[i][j][3][3]);
+				fprintf(stderr, "Pt= %f %f %f %f\n", P[0], P[1], P[2], P[3]);
+				fprintf(stderr, "    %f %f %f %f\n", P[4], P[5], P[6], P[7]);
+				fprintf(stderr, "    %f %f %f %f\n", P[8], P[9], P[10], P[11]);
+				fprintf(stderr, "    %f %f %f %f\n", P[12], P[13], P[14], P[15]);
+				fprintf(stderr, "Pi= %f %f %f %f\n", H[i][j][0][4], H[i][j][1][4], H[i][j][2][4], H[i][j][3][4]);
+				*/
+				continue;
+			}
+
+			for ( k = 0; k < 4; k++ )	{
+				for ( l = 0; l < 4; l++ ) {
+					U[k*4+l] = EigenVec[k][l];
+					V[k*4+l] = EigenVecInv[k][l];
+				}
+			}
+
+			/*xtoy ( U, V, 16 );
+			matinv ( V, 4, 4, T1 );*/
+
+			/* compute the rate matrix */
+
+			/* T2 = diag(eigenvalues) */
+
+			for ( k = 0; k < 16; k++ )
+				T2[k] = 0;
+
+			if ( alpha > 10 )	{
+				/* no rate heterogeneity */
+
+				T2[0] = log ( EigenWert[0] );
+				T2[5] = log ( EigenWert[1] );
+				T2[10] = log ( EigenWert[2] );
+				T2[15] = log ( EigenWert[3] );
+			} else 	{
+				/* rate heterogeneity */
+
+				T2[0] = alpha* ( 1.-pow ( EigenWert[0],-1./alpha ) );
+				T2[5] = alpha* ( 1.-pow ( EigenWert[1],-1./alpha ) );
+				T2[10] = alpha* ( 1.-pow ( EigenWert[2],-1./alpha ) );
+				T2[15] = alpha* ( 1.-pow ( EigenWert[3],-1./alpha ) );
+			}
+
+			/* T2 = U * diag(eigenvalues) * V */
+
+			matAbyBisC ( U, T2, 4, T1 );
+			matAbyBisC ( T1, V, 4, T2 );
+
+			/* normalise by t = -sum (pi_i * r_ii), Q has rate 1 */
+
+			distance[i][j] = 0;
+			for ( k = 0; k < 4; k++ )
+				distance[i][j] -= H[i][j][k][4]*T2[k*5];
+
+			/* fix ZERO distance */
+			if (fabs(distance[i][j]) < 0.00001) {
+				if (distance[i][j] >= 0.0)
+					distance[i][j] = 0.00001;
+				else
+					distance[i][j] = -0.00001;
+			}
+
+			distance[j][i] = distance[i][j];
+
+			/* fix TOO LARGE distance */
+			if (distance[i][j] > 100) {
+				/*printf("Distance saturated, please try higher alpha\n");*/
+				continue;	/* the large distance stays stored; the rate matrix stays all zero */
+			}
+
+
+			/* scale by the distance, which was kept away from zero above */
+			for ( k = 0; k < 16; k++ )
+				q_matrizen[index_paar][k] = T2[k]/distance[i][j];
+
+			/*
+
+			int neg_rate = 0;
+			for ( k = 0; k < 16; k++ )
+				if ((k %5 != 0 && q_matrizen[index_paar][k] < 0.0)) neg_rate = 1;
+			if (neg_rate) {
+				printf("Negative non-diagonal entry of Q %d,%d\n", i+1, j+1);
+				for ( k = 0; k < 16; k++ ) {
+					printf("%+f ", q_matrizen[index_paar][k]);
+					if (k % 4 == 3) printf("\n");
+				}
+				
+			}*/
+
+			
+		}
+	}
+
+
+
+}
+/*********************************************************************/
+
+/*
+ * Print the subtree rooted at P to fp1 in Newick form: "(left,right):length"
+ * for a node with two children, "name:length" for a node with none.  A node
+ * with exactly one child produces no output.
+ */
+void Write_Tree (FILE *fp1, knoten *P )
+{
+	const int has_left = ( P->left != 0 );
+	const int has_right = ( P->right != 0 );
+
+	if ( has_left != has_right )
+		return;
+
+	if ( !has_left ) {
+		fprintf ( fp1,"%s:%f",P->bezeichnung,P->edge_length );
+		return;
+	}
+
+	fprintf ( fp1,"(" );
+	Write_Tree (fp1, P->left );
+	fprintf ( fp1,"," );
+	Write_Tree (fp1, P->right );
+	fprintf ( fp1,"):%f",P->edge_length );
+}
+
+/************************************************************/
+
+/*
+ * Save the tree whose two top-level subtrees hang below P:
+ *   - appended to the report file (ausgabe_report) under the heading
+ *     "NEIGHBOR-JOINING TREE", followed by an empty line;
+ *   - written on its own to ausgabe_nj_tree, replacing any previous content.
+ * The edge length stored in P itself is not written.
+ * Finalize(1) is called when either file cannot be opened.
+ */
+void Save_Tree ( knoten *P )
+{
+	FILE *out;
+
+	/* 1. report file (append) */
+	out = fopen ( ausgabe_report,"a" );
+	if ( out == 0 ) {
+		fprintf ( stderr,"\nERROR writing file %s\n", ausgabe_report );
+		Finalize ( 1 );
+	}
+
+	fprintf(out, "\nNEIGHBOR-JOINING TREE\n\n");
+
+	/* the top-level pair is written without an edge length */
+	fprintf ( out,"(" );
+	Write_Tree ( out, P->left );
+	fprintf ( out,"," );
+	Write_Tree ( out, P->right );
+	fprintf ( out,");\n\n" );
+
+	fclose ( out );
+
+	/* 2. stand-alone tree file (overwrite) */
+	out = fopen ( ausgabe_nj_tree,"w" );
+	if ( out == 0 ) {
+		fprintf ( stderr,"\nERROR writing file %s\n", ausgabe_nj_tree );
+		Finalize ( 1 );
+	}
+
+	/* same tree text as above, but ended by a single newline */
+	fprintf ( out,"(" );
+	Write_Tree ( out, P->left );
+	fprintf ( out,"," );
+	Write_Tree ( out, P->right );
+	fprintf ( out,");\n" );
+
+	fclose ( out );
+}
+
+
+/*********************************************************************/
+/******************************************************************************/
+
+/*
+ * Build a rooted neighbor-joining tree from the global matrix `distance`
+ * and store it in the global node array `baum`.
+ *
+ * Node layout in baum: indices 0 .. taxa-1 are the leaves, every join takes
+ * the next free index, and the virtual root connecting the last two
+ * clusters ends up at index 2*taxa-2.  All 2*taxa-1 nodes are reset first,
+ * so baum must already be allocated with that size.
+ *
+ * Negative branch-length estimates are clamped to 0; the sibling branch
+ * then receives the whole distance between the two joined clusters.
+ *
+ * NOTE(review): at least two taxa are required; with fewer, the final join
+ * reads cluster_index[1] out of range.
+ */
+void ComputeNeighborJoiningTree()
+{
+
+	int i, j, c, p1 = 0, p2 = 1, nr_nodes;
+	int have_minimum;
+	int *cluster_index;
+	double **nj_matrix, nj_distance, *hilfsvektor;
+	double current_minimum = 0;
+
+	/* cluster_index[k]: baum index of the cluster stored in matrix slot k */
+	cluster_index = ( int * ) malloc ( taxa * sizeof ( int ) );
+	/* hilfsvektor: scratch row for the distances to a freshly joined cluster */
+	hilfsvektor = ( double * ) calloc ( taxa, sizeof ( double ) );
+	nj_matrix = ( double ** ) malloc ( taxa * sizeof ( double * ) );
+
+	if ( cluster_index == NULL || hilfsvektor == NULL || nj_matrix == NULL ) {
+		fprintf ( stderr,"\nERROR: out of memory while computing the NJ tree\n" );
+		Finalize ( 1 );
+	}
+
+	/* initialize nj_matrix; the extra column `taxa` holds the row sums */
+
+	for ( i = 0; i < taxa; i++ ) {
+		cluster_index[i] = i;
+		nj_matrix[i] = ( double * ) calloc ( taxa+1, sizeof ( double ) );
+		if ( nj_matrix[i] == NULL ) {
+			fprintf ( stderr,"\nERROR: out of memory while computing the NJ tree\n" );
+			Finalize ( 1 );
+		}
+	}
+
+	for ( i = 0; i < taxa-1; i++ )	{
+		for ( j = i+1; j < taxa; j++ )
+			nj_matrix[i][j] = nj_matrix[j][i] = distance[i][j];
+	}
+
+
+	/* initialize tree */
+
+	for ( i = 0; i < 2*taxa-1; i++ )	{
+		baum[i].label = baum[i].ixlabel = i;
+		baum[i].left = NULL;
+		baum[i].right = NULL;
+		baum[i].up = NULL;
+		baum[i].edge_length = 0;
+	}
+
+
+	/*  build tree  */
+
+	c = nr_nodes = taxa;
+
+	while ( nr_nodes > 2 )	{
+		/* row sums over the currently active clusters */
+		for ( i = 0; i < nr_nodes; i++ )	{
+			nj_matrix[i][taxa] = 0;
+
+			for ( j = 0; j < nr_nodes; j++ )
+				nj_matrix[i][taxa] += nj_matrix[i][j];
+		}
+
+		/*
+		 * Pick the pair with the smallest NJ criterion.  The search is
+		 * seeded with the first pair rather than with 0: when every
+		 * criterion value is >= 0 (e.g. all distances are zero) a seed of
+		 * 0 would leave p1/p2 at stale values and join a cluster with
+		 * itself.  For ordinary input the selected pair is unchanged.
+		 */
+		have_minimum = 0;
+
+		for ( i = 0; i < nr_nodes - 1; i++ )	{
+			for ( j = i + 1; j < nr_nodes; j++ )	{
+				nj_distance = nj_matrix[i][j] -
+				              ( nj_matrix[i][taxa] + nj_matrix[j][taxa] ) / ( nr_nodes-2. );
+
+				if ( !have_minimum || nj_distance < current_minimum ) {
+					p1 = i;
+					p2 = j;
+					current_minimum = nj_distance;
+					have_minimum = 1;
+				}
+			}
+		}
+
+
+		/* create the new inner node c with children p1 and p2 */
+
+		baum[c].left = baum + cluster_index[p1];
+		baum[c].right = baum + cluster_index[p2];
+
+		baum[cluster_index[p1]].up = baum + c;
+		baum[cluster_index[p2]].up = baum + c;
+
+		baum[cluster_index[p1]].edge_length = 
+			( nj_matrix[p1][p2] + (nj_matrix[p1][taxa] - nj_matrix[p2][taxa] ) / ( nr_nodes-2. ) ) /2.;
+
+
+		if ( baum[cluster_index[p1]].edge_length < 0 )	{
+			baum[cluster_index[p1]].edge_length = 0;
+			baum[cluster_index[p2]].edge_length = nj_matrix[p1][p2];
+		}	else
+			baum[cluster_index[p2]].edge_length = nj_matrix[p1][p2] -
+			                                      baum[cluster_index[p1]].edge_length;
+
+		if ( baum[cluster_index[p2]].edge_length < 0 )	{
+			baum[cluster_index[p2]].edge_length = 0;
+			baum[cluster_index[p1]].edge_length = nj_matrix[p1][p2];
+		}
+
+		/*
+		 * Update the matrix: slot p1 becomes the new cluster, the last
+		 * active slot is moved into p2 and then cleared.  For j == p1 and
+		 * j == p2 the formula below already evaluates to 0 because the
+		 * matrix is symmetric with a zero diagonal.
+		 */
+		for ( j = 0; j < nr_nodes; j++ )
+			hilfsvektor[j] = ( nj_matrix[p1][j] + nj_matrix[p2][j] - nj_matrix[p1][p2] ) / 2.;
+
+		for ( j = 0; j < nr_nodes; j++ )
+			nj_matrix[j][p1] = nj_matrix[p1][j] = hilfsvektor[j];
+
+		for ( j = 0; j < nr_nodes-1; j++ )	{
+			nj_matrix[p2][j] = nj_matrix[nr_nodes-1][j];
+			nj_matrix[j][p2] = nj_matrix[p2][j];
+		}
+
+		nj_matrix[p2][p2] = 0;
+
+		for ( j = 0; j < nr_nodes-1; j++ )	{
+			nj_matrix[nr_nodes-1][j] = nj_matrix[j][nr_nodes-1] = 0;
+		}
+
+		for ( j = 0; j < taxa; j++ )
+			nj_matrix[j][taxa] = 0;
+
+		cluster_index[p1] = c;
+		cluster_index[p2] = cluster_index[nr_nodes-1];
+		nr_nodes--;
+		c++;
+
+	}
+
+	/* connect the last two clusters through a virtual root node; each of
+	   them gets half of the remaining distance */
+
+	p1 = 0; p2 = 1;
+	baum[c].left = baum + cluster_index[p1];
+	baum[c].right = baum + cluster_index[p2];
+
+	baum[cluster_index[p1]].up = baum + c;
+	baum[cluster_index[p2]].up = baum + c;
+
+	baum[cluster_index[p1]].edge_length = nj_matrix[p1][p2]/2;
+	baum[cluster_index[p2]].edge_length = nj_matrix[p1][p2]/2;
+
+	/* release memory */
+	for ( i = taxa-1; i >= 0; i-- )
+		free(nj_matrix[i]);
+	free(nj_matrix);
+	free(hilfsvektor);
+	free(cluster_index);
+}
+
+/******************************************************************************/
+
+/*
+ * Estimate the rate matrix of the homogeneous model as the average of the
+ * pairwise matrices in q_matrizen, decompose it, derive the base
+ * composition, and append a summary to the report file.
+ *
+ * Globals written: q_hat_eigen, U_q_hat, V_q_hat (via eigen/xtoy/matinv) and
+ * statPi, which is stored in CUMULATIVE form (statPi[k] is the sum of the
+ * frequencies 0..k).
+ *
+ * Only entries with q_matrizen[i][0] != 0 are counted in the divisor of the
+ * average.  The function returns early, without writing the report, when
+ * eigen() reports failure.  It calls Finalize(1) when no pairwise matrix is
+ * usable, when an eigenvalue exceeds 0.01, when the report file cannot be
+ * opened, or when the averaged matrix is not scaled to one substitution per
+ * site (tolerance 1e-3).
+ */
+void Compute_q_hat_pairwise()
+{
+
+	int i, j, k, num_q;
+	int num_pairs = ( taxa-1 ) * taxa / 2;
+	double Q[16], T1[16], T2[16];
+	double q0_matrix[16];	/* averaged 4x4 matrix, row-major */
+	double rate[6];
+	FILE *fps;
+	double rate_sum;
+
+	num_q = 0;
+	for ( i = 0; i < num_pairs; i++ )
+		if (q_matrizen[i][0] != 0.0) num_q ++;
+
+	/* without this guard the average below would be 0/0 */
+	if ( num_q == 0 ) {
+		if (isMasterProc())
+			fprintf ( stderr,"\nno valid pairwise Q matrix to estimate q0_matrix\n" );
+		Finalize ( 1 );
+		return;
+	}
+
+	for ( k = 0; k < 16; k++ ) {
+		q0_matrix[k] = 0.0;
+
+		for ( i = 0; i < num_pairs; i++ )
+			q0_matrix[k] += q_matrizen[i][k];
+
+		q0_matrix[k] /= num_q;
+	}
+
+	/* spectral decomposition of q0_matrix (eigen() works on a copy) */
+
+	for ( k = 0; k < 16; k++ )
+		Q[k] = q0_matrix[k];
+
+	if ( ( k=eigen ( 1, Q, 4, q_hat_eigen, T1, U_q_hat, V_q_hat, T2 ) ) !=0 ) {
+		if (isMasterProc())
+			fprintf ( stderr,"\nno spectral decomposition for q0_matrix\n" );
+		return;
+	} else {
+		if ( q_hat_eigen[0] > 0.01 || q_hat_eigen[1] > 0.01 || q_hat_eigen[2] > 0.01 ||  q_hat_eigen[3] > 0.01 ) {
+			if (isMasterProc()) {
+				fprintf ( stderr,"\n%f\t%f\t%f\t%f\n",q_hat_eigen[0],q_hat_eigen[1],q_hat_eigen[2],q_hat_eigen[3] );
+				fprintf ( stderr, "\nbad numerics in estimation of Eigenvalues of NULL-Qmatrix\n" );
+			}
+			Finalize ( 1 );
+		} else {
+			/* presumably: V_q_hat = inverse of U_q_hat -- confirm against xtoy/matinv */
+			xtoy ( U_q_hat, V_q_hat, 16 );
+			matinv ( V_q_hat, 4, 4, T1 );
+			/* normalise the first row of V_q_hat, then accumulate */
+			for ( k = 0; k < 4; k++ )
+				statPi[k] = V_q_hat[k]/ ( V_q_hat[0]+V_q_hat[1]+V_q_hat[2]+V_q_hat[3] );
+			for ( k = 1; k < 4; k++ )
+				statPi[k] += statPi[k-1];
+		}
+	}
+
+	/* same failure handling as Save_Tree instead of writing through NULL */
+	if ( ( fps = fopen ( ausgabe_report, "a" ) ) == NULL ) {
+		fprintf ( stderr,"\nERROR writing file %s\n", ausgabe_report );
+		Finalize ( 1 );
+		return;
+	}
+
+	fprintf(fps, "\nSUBSTITUTION PROCESS OF HOMOGENEOUS MODEL\n\n");
+
+	fprintf ( fps,"Q matrix:\n" );
+
+	for ( k = 0; k < 16; k++ ) {
+		if ( k%4 == 0 )
+			fprintf ( fps,"\n" );
+
+		fprintf ( fps,"%f\t",q0_matrix[k] );
+	}
+
+	/* statPi is cumulative, so single frequencies are differences */
+	fprintf ( fps,"\n\nBase composition:\n\n%f\t",statPi[0] );
+
+	for ( k = 1; k < 4; k++ )
+		fprintf ( fps,"%f\t",statPi[k]-statPi[k-1] );
+	fprintf ( fps,"\n" );
+
+	/* print the six upper-triangle rates q[i][j]/pi[j], relative to the last one */
+	fprintf(fps, "\nRate:\n\n");
+	for (i = 0, k = 0; i < 3; i++)
+		for (j=i+1; j < 4; j++, k++) {
+			rate[k] = q0_matrix[i*4+j] / (statPi[j] - statPi[j-1]);
+		}
+	for (k=0; k < 6; k++)
+		fprintf(fps, "%f\n", rate[k]/rate[5]);
+	fprintf ( fps,"\n" );
+
+	fclose ( fps );
+
+	/* check that the scaling is 1 total subst per site */
+	rate_sum = 0.0;
+	for (i = 0; i < 4; i++) {
+		rate_sum -= q0_matrix[i*4+i] * ((i==0) ? statPi[0] : (statPi[i]-statPi[i-1]));
+	}
+	if (fabs(rate_sum - 1.0) > 1e-3) {
+		if (isMasterProc())
+			fprintf ( stderr,"\nq0_matrix not scaled to 1 total subst. per site (%f)\n", rate_sum );
+		Finalize(1);
+	}
+}
+
+
+
+
+/******************************************************************************/
+
+/******************************************************************************/
+/*   subroutine to Simulate_Sequences_q_hat() **/
+
+/*
+ * Fill P_matrix (4x4, row-major) with U * diag(exp(eigenvalue*length*rate)) * V
+ * and turn every row into its running sum, so that a row can be sampled
+ * directly with one uniform random number.
+ */
+static void CumulativeTransitionMatrix ( double U[], double V[], double QEigenWert[],
+                                         double length, double rate, double P_matrix[] )
+{
+	int k, k1;
+	double T1[16], T2[16];
+
+	for ( k = 0; k < 16; k++ )
+		T2[k] = 0;
+
+	T2[0] = exp ( QEigenWert[0]*length*rate );
+	T2[5] = exp ( QEigenWert[1]*length*rate );
+	T2[10] = exp ( QEigenWert[2]*length*rate );
+	T2[15] = exp ( QEigenWert[3]*length*rate );
+
+	matAbyBisC ( U, T2, 4, T1 );
+	matAbyBisC ( T1, V, 4, P_matrix );
+
+	for ( k = 0; k < 4; k++ ) {
+		for ( k1 = 1; k1 < 4; k1++ )
+			P_matrix[k*4+k1] += P_matrix[k*4+k1-1];
+	}
+}
+
+/*
+ * Derive the sequence of `child` (row child->ixlabel of sim_sequences) from
+ * the sequence of its parent (row parent_ix).  A branch whose length is not
+ * positive copies the parent sequence unchanged.
+ */
+static void EvolveChild ( int parent_ix, knoten *child, int **sim_sequences,
+                          double U[], double V[], double QEigenWert[], double *alpha_rate )
+{
+	int b, j, l;
+	int child_ix = child->ixlabel;
+	double x, P_matrix[16];
+
+	if ( !( child->edge_length > 0 ) ) {
+		for ( j = 0; j < nr_basen; j++ )
+			sim_sequences[child_ix][j] = sim_sequences[parent_ix][j];
+		return;
+	}
+
+	/* alpha > 10 is treated as "no rate heterogeneity": one matrix for all
+	   sites (multiplying by 1.0 leaves the exponent unchanged) */
+	if ( alpha > 10 )
+		CumulativeTransitionMatrix ( U, V, QEigenWert, child->edge_length, 1.0, P_matrix );
+
+	for ( j = 0; j < nr_basen; j++ ) {
+		/* otherwise every site has its own rate and hence its own matrix */
+		if ( !( alpha > 10 ) )
+			CumulativeTransitionMatrix ( U, V, QEigenWert, child->edge_length, alpha_rate[j], P_matrix );
+
+		b = sim_sequences[parent_ix][j];
+		l = 0;
+		x = dkiss();
+
+		/* l < 3 keeps the state in range when x exceeds the last
+		   cumulative entry */
+		while ( x > P_matrix[b*4+l] && l < 3)
+			l++;
+
+		sim_sequences[child_ix][j] = l;
+	}
+}
+
+/*
+ * Recursively simulate sequences down the tree below K.
+ *
+ * K              inner node whose own sequence (row K->ixlabel of
+ *                sim_sequences) is already filled in
+ * sim_sequences  one row of nr_basen states (0..3) per tree node
+ * U, V           matrices from the spectral decomposition of the rate matrix
+ * QEigenWert     the four eigenvalues belonging to U and V
+ * alpha_rate     per-site rates, only read when the global alpha is <= 10
+ *
+ * The left child is completed (including its whole subtree) before the
+ * right child, so the sequence of random numbers drawn from dkiss() is the
+ * same as in a plain depth-first traversal.
+ */
+void EvolveSequences ( knoten *K, int **sim_sequences, double U[], double V[], double QEigenWert[], double *alpha_rate )
+{
+	int i = K->ixlabel;
+
+	/* evolve left child */
+	EvolveChild ( i, K->left, sim_sequences, U, V, QEigenWert, alpha_rate );
+
+	if ( K->left->ixlabel > taxa-1 ) 		/*  K->left is not a leaf */
+		EvolveSequences ( K->left, sim_sequences, U, V, QEigenWert, alpha_rate );
+
+	/* evolve right child */
+	EvolveChild ( i, K->right, sim_sequences, U, V, QEigenWert, alpha_rate );
+
+	if ( K->right->ixlabel > taxa-1 ) 		/*  K->right is not a leaf */
+		EvolveSequences ( K->right, sim_sequences, U, V, QEigenWert, alpha_rate );
+
+}
+
+
+
+/******************************************************************************/
+
+
+/*
+ * Simulate one data set along the tree in baum: draw per-site rates if
+ * needed, draw the root sequence from the cumulative base composition in
+ * statPi, then let EvolveSequences fill the rows of seqData for all other
+ * nodes.
+ */
+void Simulate_Sequences_q_hat()
+{
+
+	int site, state;
+	double u;
+
+	/* heterogeneous rates are only assigned when alpha is not above 10;
+	   a larger alpha is treated as homogeneous */
+	if ( !( alpha > 10 ) ) {
+		for ( site = 0; site < nr_basen; site++ )
+			alpha_rate[site] = rgamma ( alpha,beta );
+	}
+
+	/* root sequence lives in row 2*taxa-2 */
+	for ( site = 0; site < nr_basen; site++ ) {
+		u = dkiss();
+
+		/* first state whose cumulative frequency reaches u, capped at 3 */
+		for ( state = 0; state < 3 && u > statPi[state]; state++ )
+			;
+
+		seqData[2*taxa-2][site] = state;
+	}
+
+	/* propagate from the root to every other node */
+	EvolveSequences ( baum+ ( 2*taxa-2 ), seqData, U_q_hat, V_q_hat, q_hat_eigen, alpha_rate );
+
+}
diff --git a/whtest/whtest_sub.h b/whtest/whtest_sub.h
new file mode 100644
index 0000000..e29f8c1
--- /dev/null
+++ b/whtest/whtest_sub.h
@@ -0,0 +1,85 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, Bui Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef WHTEST_SUB_H
+#define WHTEST_SUB_H
+
+/* One node of the binary tree kept in the global array baum. */
+typedef struct _knoten { 
+
+	struct _knoten *left;	/* first child, NULL while the node has none */
+	struct _knoten *right;	/* second child, NULL while the node has none */
+	struct _knoten *up;	/* parent node, NULL while unset */
+
+	double	edge_length;	/* length of the branch leading to `up` */
+	int	label;		/* set to the node's index in baum when the tree is reset */
+	int	ixlabel;	/* same initial value; used as row index into the sequence table */
+	char	bezeichnung[100];	/* node name as a C string (written as "name:length" in tree output) */
+
+} knoten;
+
+
+/* Tree nodes: 2*taxa-1 entries, leaves first, root at index 2*taxa-2. */
+extern knoten *baum;
+/* One row of nr_basen states per tree node; row 2*taxa-2 is the root. */
+extern int **seqData;
+
+/* nr_basen: number of sites per sequence; taxa: number of sequences.
+   NOTE(review): `simulation` is presumably the number of simulated data
+   sets -- confirm against whtest.c. */
+extern int simulation, nr_basen, taxa;
+
+/* Arguments passed to rgamma() when per-site rates are drawn; an alpha
+   above 10 is treated as rate homogeneity by the simulation code. */
+extern double alpha, beta;
+
+extern char datei_name[100];
+/* Report file; results are appended to it. */
+extern char ausgabe_report[200];
+/* File that receives the neighbor-joining tree. */
+extern char ausgabe_nj_tree[200];
+
+/* (taxa-1)*taxa/2 pairwise 4x4 matrices, 16 doubles each. */
+extern double **q_matrizen;
+
+/* Pairwise distances; distance[i][j] is read for i < j. */
+extern double **distance;
+
+
+/*********************************/
+
+void ReadDataSize ( char *datafile );
+void ReadData ( char *datafile );
+
+void AllocateMemory();
+void FreeMemory();
+
+void Compute_Hij();
+
+void Compute_Qij_tij();
+
+void Write_Qij(int a);
+
+/* Build the neighbor-joining tree from `distance` into `baum`. */
+void ComputeNeighborJoiningTree();
+
+/* Write the tree rooted at P to the report file and to the tree file. */
+void Save_Tree( knoten *P );
+
+void FixDistance();
+void Save_Distance(char *distfile, double **dist);
+
+/*void Compute_Qm();*/
+
+/* Simulate sequences for every node of `baum` into seqData. */
+void Simulate_Sequences_q_hat();
+/* Average the pairwise matrices in q_matrizen and report the result. */
+void Compute_q_hat_pairwise();
+
+/*void Compute_q_hat();*/
+
+
+/************************************/
+
+#endif
diff --git a/whtest/whtools.h b/whtest/whtools.h
new file mode 100644
index 0000000..03b620a
--- /dev/null
+++ b/whtest/whtools.h
@@ -0,0 +1,40 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by Gunter Weiss, BUI Quang Minh, Arndt von Haeseler   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef TOOLS_H
+#define TOOLS_H
+
+/* Non-zero on the master process; callers use it to gate diagnostic output
+   so that it is printed once.  NOTE(review): behaviour in a non-PARALLEL
+   build is not visible from this header -- confirm. */
+int isMasterProc();
+int isSlaveProc();
+/*int isFirstSlaveProc();*/
+/* Called by the test code on fatal errors with a non-zero exit_code.
+   NOTE(review): presumably shuts down MPI (if any) and exits -- confirm. */
+void Finalize(int exit_code);
+
+#ifdef PARALLEL
+#	include <mpi.h>
+/* MPI state, only declared when built with PARALLEL */
+extern int mpi_myrank;
+extern int mpi_size;
+extern int mpi_master_rank;
+
+extern long p_randn;
+extern long p_rand;
+#endif /*PARALLEL*/
+
+#endif
+
diff --git a/whtest_wrapper.cpp b/whtest_wrapper.cpp
new file mode 100644
index 0000000..e21e2a0
--- /dev/null
+++ b/whtest_wrapper.cpp
@@ -0,0 +1,73 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+#include "whtest_wrapper.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "whtest/whtest.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+/**
+ * Append a heap-allocated copy of arg to an argument vector.
+ * @param argc number of entries already in argv; incremented by one
+ * @param argv argument vector with room for at least argc+1 entries
+ * @param arg NUL-terminated string to copy
+ * The copy is allocated with new[]; releasing it is up to the caller.
+ */
+void addArg(int &argc, char **argv, char *arg) {
+	char *copy = new char[strlen(arg)+1];
+	strcpy(copy, arg);
+	argv[argc++] = copy;
+}
+
+/**
+ * Legacy entry point: run the test through its command-line interface,
+ * i.e. as if invoked as "WHTest <alignment file> -a <gamma shape>".
+ * @param params supplies the alignment file name (params.aln_file)
+ * @param tree supplies the gamma shape of the site-rate model
+ * @return the value returned by WHTest_run
+ */
+int WHTest_old(Params &params, PhyloTree &tree) {
+	int argc = 0;
+	char *argv[10];
+	char tmp[100];
+	addArg(argc, argv, (char*)"WHTest");
+	addArg(argc, argv, params.aln_file);
+	addArg(argc, argv, (char*)"-a");
+	// "%f" of a very large double needs more than 100 characters, so the
+	// write has to be bounded
+	snprintf(tmp, sizeof(tmp), "%f", tree.getModelFactory()->site_rate->getGammaShape());
+	addArg(argc, argv, tmp);
+	int retval = WHTest_run(argc, argv);
+	// addArg allocates every entry with new[]; release them again.
+	// NOTE(review): assumes WHTest_run does not keep pointers into argv -- confirm.
+	for (int i = 0; i < argc; i++)
+		delete[] argv[i];
+	return retval;
+}
+
+/**
+ * Run the test on the alignment attached to tree, using the WHT_* in-memory
+ * interface instead of command-line arguments.
+ * @param params supplies the number of simulations and the output prefix;
+ *        receives delta, its quantile and the p-value
+ * @param tree supplies the alignment and the gamma shape
+ * @return the value returned by WHTest_run
+ */
+int WHTest(Params &params, IQTree &tree) {
+
+	const int num_seqs = tree.aln->getNSeq();
+	const int num_sites = tree.aln->getNSite(); 
+
+	WHT_setAlignmentSize(num_seqs, num_sites);
+	WHT_allocateMemory();
+
+	// copy the alignment state by state; the state of a site is looked up
+	// through the pattern that site belongs to
+	for (int seq = 0; seq < num_seqs; seq++) {
+		for (int site = 0; site < num_sites; site++) {
+			WHT_setSequenceSite(seq, site, (*tree.aln)[tree.aln->getPatternID(site)][seq]);
+		}
+	}
+
+	for (int seq = 0; seq < num_seqs; seq++) {
+		WHT_setSequenceName(seq, tree.aln->getSeqName(seq).c_str());
+	}
+
+	// a shape of exactly 0 is replaced by 100
+	// (presumably 0 means "no rate heterogeneity" -- confirm)
+	double gamma_shape = tree.getModelFactory()->site_rate->getGammaShape();
+	if (gamma_shape == 0) {
+		gamma_shape = 100.0;
+	}
+
+	// no distance matrix is handed over (last argument is NULL)
+	WHT_setParams(params.whtest_simulations, gamma_shape, params.out_prefix, NULL);
+
+	const int status = WHTest_run(0, NULL);
+	WHT_getResults(&params.whtest_delta, &params.whtest_delta_quantile, &params.whtest_p_value);
+	return status;
+}
diff --git a/whtest_wrapper.h b/whtest_wrapper.h
new file mode 100644
index 0000000..5fd28b6
--- /dev/null
+++ b/whtest_wrapper.h
@@ -0,0 +1,29 @@
+/***************************************************************************
+ *   Copyright (C) 2009 by BUI Quang Minh   *
+ *   minh.bui at univie.ac.at   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#ifndef WHTEST_WRAPPER_H
+#define WHTEST_WRAPPER_H
+
+#include "iqtree.h"
+
+/**
+ * Run the test on the alignment attached to tree.
+ * @param params test settings; also receives the resulting delta, delta
+ *        quantile and p-value
+ * @param tree tree object providing the alignment and the rate model
+ * @return status code of the underlying test run
+ */
+int WHTest(Params &params, IQTree &tree);
+
+
+#endif

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/iqtree.git